You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

113 lines
3.1 KiB

#' Returns a description of all the tables in the Word document
#'
#' This function will attempt to discern the structure of each of the tables
#' in \code{docx} and print this information
#'
#' @param docx \code{docx} object read with \code{read_docx}
#' @export
#' @examples
#' complx <- read_docx(system.file("examples/complex.docx", package="docxtractr"))
#' docx_tbl_count(complx)
#' docx_describe_tbls(complx)
docx_describe_tbls <- function(docx) {
ensure_docx(docx)
if (!docx_tbl_count(docx) > 0) {
message("No tables in document")
return(invisible(NULL))
}
ns <- docx$ns
tbls <- docx$tbls
cat(sprintf("Word document [%s]\n\n", docx$path))
for (i in 1:length(tbls)) {
tbl <- tbls[[i]]
cells <- xml2::xml_find_all(tbl, "./w:tr/w:tc", ns=ns)
rows <- xml2::xml_find_all(tbl, "./w:tr", ns=ns)
cell_count_by_row <- purrr::map_int(rows, ~{ length(xml2::xml_find_all(.x, "./w:tc", ns)) })
row_counts <- paste0(unique(cell_count_by_row), collapse=", ")
max_cell_count <- max(cell_count_by_row)
cat(sprintf("Table %d\n total cells: %d\n row count : %d\n",
i, length(cells), length(rows)))
# simplistic test for whether table is uniform rows x cells == cell count
if ((max_cell_count * length(rows)) == length(cells)) {
cat(" uniform : likely!\n")
} else {
cat(sprintf(
" uniform : unlikely => found differing cell counts (%s) across some rows\n",
row_counts))
}
# microsoft has a tag for some table structure info. examine it to
# see if the creator of the header made the first row special which
# will likely mean it's a header candidate
hdr <- has_header(tbl, rows, ns)
if (is.na(hdr)) {
cat(" has header : unlikely\n")
} else {
cat(sprintf(" has header : likely! => possibly [%s]\n", hdr))
}
cat("\n")
}
}
#' Returns information about the comments in the Word document
#'
#' @param docx \code{docx} object read with \code{read_docx}
#' @export
#' @examples
#' cmnts <- read_docx(system.file("examples/comments.docx", package="docxtractr"))
#' docx_cmnt_count(cmnts)
#' docx_describe_cmnts(cmnts)
docx_describe_cmnts <- function(docx) {
ensure_docx(docx)
if (!docx_cmnt_count(docx) > 0) {
message("No comments in document")
return(invisible(NULL))
}
ns <- docx$ns
cmnts <- docx$cmnts
cat(sprintf("Word document [%s]\n\n", docx$path))
cat(sprintf("Found %d comments.\n", length(cmnts)))
purrr::map_df(xml_attrs(cmnts), function(x) {
as.data.frame(
t(cbind.data.frame(x, stringsAsFactors=FALSE)),
stringsAsFactors = FALSE
)
}) -> meta
cmnt_df <- dplyr::bind_cols(meta,
cbind.data.frame(comment_text=xml2::xml_text(cmnts),
stringsAsFactors=FALSE))
aut_df <- dplyr::count(cmnt_df, author)
aut_df <- dplyr::arrange(aut_df, -n)
print(select(aut_df, author, `# Comments`=n))
}
#' Display information about the document
#'
#' @param x \code{docx} object
#' @param ... ignored
#' @export
print.docx <- function(x, ...) {
docx_describe_tbls(x)
docx_describe_cmnts(x)
}