You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

155 lines
4.8 KiB

#' Read in a Word document for table extraction
#'
#' Local file path or URL pointing to a \code{.docx} file. Can also take
#' \code{.doc} file as input if \code{LibreOffice} is installed
#' (see \url{https://www.libreoffice.org/} for more info and to download).
#'
#' @md
#' @param path path to the Word document
#' @param track_changes if not `NULL` (the default) then must be one of
#' "`accept`" or "`reject`" which will, respectively, accept all or
#' reject all changes. NOTE: this functionality relies on the
#' `pandoc` utility being available on the system `PATH`. Both
#' system `PATH` and the `RSTUDIO_PANDOC` (RStudio ships with
#' a copy of `pandoc`) environment variables will be checked.
#' If no `pandoc` binary is found then a warning will be issued
#' and the document will be read without integrating or ignoring
#' any tracked changes. The original Word document *will not be modified*
#' and this feature *only works* with `docx` files.
#' @export
#' @examples
#' doc <- read_docx(system.file("examples/data.docx", package="docxtractr"))
#' class(doc)
#'
#' doc <- read_docx(
#' system.file("examples/trackchanges.docx", package="docxtractr"),
#' track_changes = "accept"
#' )
#'
#' \dontrun{
#' # from a URL
#' budget <- read_docx(
#' "http://rud.is/dl/1.DOCX")
#' }
read_docx <- function(path, track_changes=NULL) {
stopifnot(is.character(path))
if (!is.null(track_changes)) {
track_changes <- match.arg(track_changes, c("accept", "reject"))
}
# make temporary things for us to work with
tmpd <- tempdir()
tmpf <- tempfile(tmpdir=tmpd, fileext=".zip")
# Check to see if input is a .doc file
is_input_doc <- is_doc(path)
# If input is a .doc file, create a temp .doc file
if (is_input_doc) {
lo_assert()
tmpf_doc <- tempfile(tmpdir = tmpd, fileext = ".doc")
tmpf_docx <- gsub("\\.doc$", ".docx", tmpf_doc)
} else {
tmpf_doc <- NULL
tmpf_docx <- NULL
}
on.exit({ #cleanup
unlink(tmpf)
unlink(tmpf_doc)
unlink(tmpf_docx)
unlink(sprintf("%s/docdata", tmpd), recursive=TRUE)
})
if (is_url(path)) {
if (is_input_doc) {
# If input is a url pointing to a .doc file, write file to disk
res <- httr::GET(path, write_disk(tmpf_doc))
httr::stop_for_status(res)
# Save .doc file as a .docx file using LibreOffice command-line tools.
convert_doc_to_docx(tmpd, tmpf_doc)
# copy output of LibreOffice to zip (not entirely necessary)
file_copy(tmpf_docx, tmpf)
} else {
# If input is a url pointing to a .docx file, write file to disk
res <- httr::GET(path, write_disk(tmpf))
httr::stop_for_status(res)
}
} else {
path <- path.expand(path)
if (!file.exists(path)) stop(sprintf("Cannot find '%s'", path), call.=FALSE)
# If input is a .doc file, save it as a .docx file using LibreOffice
# command-line tools.
if (is_input_doc) {
file_copy(path, tmpf_doc)
convert_doc_to_docx(tmpd, tmpf_doc)
# copy output of LibreOffice to zip (not entirely necessary)
file_copy(tmpf_docx, tmpf)
} else {
# Otherwise, if input is a .docx file, just copy docx to zip
# (not entirely necessary)
file_copy(path, tmpf)
}
}
if (!is.null(track_changes)) {
pandoc_bin <- Sys.which("pandoc")
if (pandoc_bin == "") {
pandoc_bin <- Sys.getenv("RSTUDIO_PANDOC")
if (pandoc_bin == "") {
warning(
"Track changes option was used but no pandoc binary was found. ",
"Please ensure that the directory containing pandoc is available ",
"on the system PATH and restart the R session before trying again. ",
"Reading in document *without* tracking any changes."
)
}
}
if (pandoc_bin != "") {
system2(
command = pandoc_bin,
args = c(
"-f", "docx",
"-t", "docx",
"-o", tmpf,
sprintf("--track-changes=%s", track_changes),
tmpf
)
)
}
}
# unzip it
unzip(tmpf, exdir=sprintf("%s/docdata", tmpd))
# read the actual XML document
doc <- xml2::read_xml(sprintf("%s/docdata/word/document.xml", tmpd))
# extract the namespace
ns <- xml2::xml_ns(doc)
# get the tables
tbls <- xml2::xml_find_all(doc, ".//w:tbl", ns=ns)
if (file.exists(sprintf("%s/docdata/word/comments.xml", tmpd))) {
docmnt <- read_xml(sprintf("%s/docdata/word/comments.xml", tmpd))
# get the comments
cmnts <- xml2::xml_find_all(docmnt, ".//w:comment", ns=ns)
} else {
cmnts <- xml2::xml_find_all(doc, ".//w:comment", ns=ns)
}
# make an object for other functions to work with
docx <- list(docx=doc, ns=ns, tbls=tbls, cmnts=cmnts, path=path)
# special class helps us work with these things
class(docx) <- "docx"
docx
}