You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
70 lines
2.4 KiB
70 lines
2.4 KiB
#' Extract a table from a Word document
#'
#' Given a document read with \code{read_docx} and a table to extract (optionally
#' indicating whether there was a header or not and if cell whitespace trimming is
#' desired) extract the contents of the table to a \code{data.frame}.
#'
#' Every cell is read as text. Columns are initially named \code{V1}, \code{V2},
#' ... by position; when \code{header} is \code{TRUE} the first row is turned
#' into column names with \code{make.names} and dropped from the data.
#'
#' An error is raised if \code{tbl_number} is not a single, non-missing value
#' between 1 and the number of tables reported by \code{docx_tbl_count}.
#'
#' @md
#' @param docx \code{docx} object read with \code{read_docx}
#' @param tbl_number which table to extract (defaults to \code{1})
#' @param header assume first row of table is a header row? (default; \code{TRUE})
#' @param preserve preserve line breaks within a cell? Default: `FALSE`. NOTE: This overrides `trim`.
#' @param trim trim leading/trailing whitespace (if any) in cells? (default: \code{TRUE})
#' @return \code{data.frame} (carrying the classes \code{tbl_df}, \code{tbl}
#'   and \code{data.frame})
#' @seealso \code{\link{docx_extract_all}}, \code{\link{docx_extract_tbl}},
#'  \code{\link{assign_colnames}}
#' @export
#' @examples
#' doc3 <- read_docx(system.file("examples/data3.docx", package="docxtractr"))
#' docx_extract_tbl(doc3, 3)
#'
#' intracell_whitespace <- read_docx(system.file("examples/preserve.docx", package="docxtractr"))
#' docx_extract_tbl(intracell_whitespace, 2, preserve=FALSE)
#' docx_extract_tbl(intracell_whitespace, 2, preserve=TRUE)
docx_extract_tbl <- function(docx, tbl_number=1, header=TRUE, preserve=FALSE, trim=TRUE) {

  ensure_docx(docx)

  # Scalar condition: `||` short-circuits, so the range comparisons are only
  # evaluated once we know `tbl_number` is a single, non-missing value.
  if (length(tbl_number) != 1 || is.na(tbl_number) ||
      tbl_number < 1 || tbl_number > docx_tbl_count(docx)) {
    stop("'tbl_number' is invalid.", call.=FALSE)
  }

  # Documented contract: `preserve` overrides `trim`.
  if (preserve) trim <- FALSE

  ns <- docx$ns
  tbl <- docx$tbls[[tbl_number]]

  rows <- xml2::xml_find_all(tbl, "./w:tr", ns=ns)

  # One named list per table row; map_df() binds them into a data frame.
  dat <- purrr::map_df(rows, function(row) {

    row_cells <- xml2::xml_find_all(row, "./w:tc", ns=ns)

    if (preserve) {
      # Keep paragraph boundaries inside a cell by joining the text of each
      # <w:p> child with a newline.
      vals <- purrr::map(row_cells, function(cell) {
        paras <- xml2::xml_text(xml2::xml_find_all(cell, "./w:p", ns=ns))
        paste0(paras, collapse="\n")
      })
    } else {
      vals <- xml2::xml_text(row_cells, trim=trim)
    }

    # seq_along() (not 1:length()) so a row without cells yields zero names
    # instead of a length-mismatch error in `names<-`.
    names(vals) <- sprintf("V%d", seq_along(vals))

    as.list(vals)

  })

  if (header) {
    # Promote the first row to syntactically valid column names, then drop it.
    hopeful_names <- make.names(dat[1, ])
    colnames(dat) <- hopeful_names
    dat <- dat[-1, ]
  } else {
    # NOTE(review): has_header() is defined elsewhere; a non-NA result is
    # treated here as "the document marks a header row" -- confirm.
    hdr <- has_header(tbl, rows, ns)
    if (!is.na(hdr)) {
      message("NOTE: header=FALSE but table has a marked header row in the Word document")
    }
  }

  rownames(dat) <- NULL

  class(dat) <- c("tbl_df", "tbl", "data.frame")

  dat

}
|
|
|