You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
113 lines
3.1 KiB
113 lines
3.1 KiB
#' Returns a description of all the tables in the Word document
#'
#' This function will attempt to discern the structure of each of the tables
#' in \code{docx} and print this information. For every table it reports the
#' total number of cells, the number of rows, whether the rows appear to have
#' a uniform number of cells and whether the first row looks like a header.
#'
#' @param docx \code{docx} object read with \code{read_docx}
#' @return Called for its printed output; returns \code{NULL} invisibly.
#' @export
#' @examples
#' complx <- read_docx(system.file("examples/complex.docx", package="docxtractr"))
#' docx_tbl_count(complx)
#' docx_describe_tbls(complx)
docx_describe_tbls <- function(docx) {

  ensure_docx(docx)
  if (!(docx_tbl_count(docx) > 0)) {
    message("No tables in document")
    return(invisible(NULL))
  }

  ns <- docx$ns
  tbls <- docx$tbls

  cat(sprintf("Word document [%s]\n\n", docx$path))

  # seq_along() instead of 1:length() so an empty `tbls` never yields c(1, 0)
  for (i in seq_along(tbls)) {

    tbl <- tbls[[i]]

    # only rows/cells that are direct children of this table are counted
    cells <- xml2::xml_find_all(tbl, "./w:tr/w:tc", ns = ns)
    rows <- xml2::xml_find_all(tbl, "./w:tr", ns = ns)

    cell_count_by_row <- purrr::map_int(rows, function(row) {
      length(xml2::xml_find_all(row, "./w:tc", ns = ns))
    })
    row_counts <- paste0(unique(cell_count_by_row), collapse = ", ")

    # max() of an empty vector is -Inf (with a warning); multiplied by zero
    # rows that becomes NaN, the comparison below becomes NA and `if` stops
    # with an error. Treat a table without direct rows as zero cells per row.
    max_cell_count <- if (length(cell_count_by_row) > 0) {
      max(cell_count_by_row)
    } else {
      0L
    }

    cat(sprintf("Table %d\n total cells: %d\n row count : %d\n",
                i, length(cells), length(rows)))

    # simplistic test for whether table is uniform rows x cells == cell count
    if ((max_cell_count * length(rows)) == length(cells)) {
      cat(" uniform : likely!\n")
    } else {
      cat(sprintf(
        " uniform : unlikely => found differing cell counts (%s) across some rows\n",
        row_counts))
    }

    # microsoft has a tag for some table structure info. examine it to
    # see if the creator of the header made the first row special which
    # will likely mean it's a header candidate.
    # With no rows there is no first row to inspect, so skip the lookup.
    hdr <- if (length(rows) > 0) has_header(tbl, rows, ns) else NA
    if (is.na(hdr)) {
      cat(" has header : unlikely\n")
    } else {
      cat(sprintf(" has header : likely! => possibly [%s]\n", hdr))
    }

    cat("\n")

  }

  invisible(NULL)

}
|
|
|
|
#' Returns information about the comments in the Word document
#'
#' Prints the document path, the total number of comments and a table of
#' comment counts per \code{author} attribute, most prolific author first.
#'
#' @param docx \code{docx} object read with \code{read_docx}
#' @return Invisibly returns \code{NULL} when the document has no comments;
#'   otherwise the value of the final \code{print()} call on the per-author
#'   summary.
#' @export
#' @examples
#' cmnts <- read_docx(system.file("examples/comments.docx", package="docxtractr"))
#' docx_cmnt_count(cmnts)
#' docx_describe_cmnts(cmnts)
docx_describe_cmnts <- function(docx) {

  ensure_docx(docx)
  if (!(docx_cmnt_count(docx) > 0)) {
    message("No comments in document")
    return(invisible(NULL))
  }

  cmnts <- docx$cmnts

  cat(sprintf("Word document [%s]\n\n", docx$path))

  cat(sprintf("Found %d comments.\n", length(cmnts)))

  # one row per comment, one column per XML attribute of the comment node;
  # the named attribute vector is turned into a 1-row data frame via t()
  meta <- purrr::map_df(xml2::xml_attrs(cmnts), function(x) {
    as.data.frame(
      t(cbind.data.frame(x, stringsAsFactors = FALSE)),
      stringsAsFactors = FALSE
    )
  })

  cmnt_df <- dplyr::bind_cols(
    meta,
    cbind.data.frame(comment_text = xml2::xml_text(cmnts),
                     stringsAsFactors = FALSE)
  )

  # assumes every comment node carries an `author` attribute -- TODO confirm
  aut_df <- dplyr::count(cmnt_df, author)
  aut_df <- dplyr::arrange(aut_df, -n)

  print(dplyr::select(aut_df, author, `# Comments` = n))

}
|
|
|
|
#' Display information about the document
#'
#' Prints the table description (\code{docx_describe_tbls}) followed by the
#' comment description (\code{docx_describe_cmnts}).
#'
#' @param x \code{docx} object
#' @param ... ignored
#' @return \code{x}, invisibly.
#' @export
print.docx <- function(x, ...) {
  docx_describe_tbls(x)
  docx_describe_cmnts(x)
  # print methods conventionally hand back their argument invisibly so the
  # object can still be assigned or piped after printing
  invisible(x)
}
|
|
|