You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

163 lines
5.3 KiB

# used by functions to make sure they are working with a well-formed docx object
ensure_docx <- function(docx) {
if (!inherits(docx, "docx")) stop("Must pass in a 'docx' object", call.=FALSE)
if (!(all(purrr::map_lgl(c("docx", "ns", "tbls", "path"), exists, where=docx))))
stop("'docx' object missing necessary components", call.=FALSE)
}
# test if a w:tbl has a header row
has_header <- function(tbl, rows, ns) {
# microsoft has a tag for some table structure info. examine it to
# see if the creator of the header made the first row special which
# will likely mean it's a header candidate
look <- try(xml2::xml_find_first(tbl, "./w:tblPr/w:tblLook", ns), silent=TRUE)
if (inherits(look, "try-error")) {
return(NA)
} else {
look_attr <- xml2::xml_attrs(look)
if ("firstRow" %in% names(look_attr)) {
if (look_attr["firstRow"] == "0") {
return(NA)
} else {
return(paste0(xml2::xml_text(xml_find_all(rows[[1]], "./w:tc", ns)), collapse=", "))
}
} else {
return(NA)
}
}
}
is_url <- function(path) { grepl("^(http|ftp)s?://", path) }
is_docx <- function(path) { tolower(tools::file_ext(path)) == "docx" }
is_pptx <- function(path) { tolower(tools::file_ext(path)) == "pptx" }
is_doc <- function(path) { tolower(tools::file_ext(path)) == "doc" }
# Copy a file to a new location, throw an error if the copy fails.
file_copy <- function(from, to) {
fc <- file.copy(from, to)
if (!fc) stop(sprintf("file copy failure for file %s", from), call.=FALSE)
}
# Save a .doc file as a new .docx file, using the LibreOffice command line
# tools.
convert_doc_to_docx <- function(docx_dir, doc_file) {
lo_path <- getOption("path_to_libreoffice")
if (is.null(lo_path)) {
stop(lo_path_missing, call. = FALSE)
}
if (Sys.info()["sysname"] == "Windows") {
convert_win(lo_path, docx_dir, doc_file)
} else {
convert_osx(lo_path, docx_dir, doc_file)
}
}
# .docx to .doc convertion for Windows
convert_win <- function(lo_path, docx_dir, doc_file,
convert_to = 'docx:"MS Word 2007 XML"') {
cmd <- sprintf('"%s" --convert-to %s -headless -outdir "%s" "%s"',
lo_path,
convert_to,
docx_dir,
doc_file)
system(cmd, show.output.on.console = FALSE)
}
# .docx to .doc convertion for OSX
convert_osx <- function(lo_path, docx_dir, doc_file,
convert_to = 'docx:"MS Word 2007 XML"') {
cmd <- sprintf('"%s" --convert-to %s --headless --outdir "%s" "%s"',
lo_path,
convert_to,
docx_dir,
doc_file)
res <- system(cmd, intern = TRUE)
}
#' Point to Local soffice.exe File
#'
#' Function to set an option that points to the local LibreOffice file
#' \code{soffice.exe}.
#'
#' @param path path to the LibreOffice soffice file
#'
#' @details For a list of possible file path locations for \code{soffice.exe},
#' see \url{https://github.com/hrbrmstr/docxtractr/issues/5#issuecomment-233181976}
#'
#' @return Returns nothing, function sets the option variable
#' \code{path_to_libreoffice}.
#' @export
#'
#' @examples \dontrun{
#' set_libreoffice_path("local/path/to/soffice.exe")
#' }
set_libreoffice_path <- function(path) {
stopifnot(is.character(path))
if (!file.exists(path)) stop(sprintf("Cannot find '%s'", path), call.=FALSE)
options("path_to_libreoffice" = path)
}
# Assert that LibreOffice file "soffice" exists locally.
# Check env variable "path_to_libreoffice". If it's NULL, call lo_find(), which
# will try to determine the local path to LibreOffice file "soffice". If
# lo_find() is successful, the path to "soffice" will be assigned to env
# variable "path_to_libreoffice", otherwise an error is thrown.
lo_assert <- function() {
lo_path <- getOption("path_to_libreoffice")
if (is.null(lo_path)) {
lo_path <- lo_find()
set_libreoffice_path(lo_path)
}
}
# Returns the local path to LibreOffice file "soffice". Search is performed by
# looking in the known file locations for the current OS. If OS is not Linux,
# OSX, or Windows, an error is thrown. If path to "soffice" is not found, an
# error is thrown.
lo_find <- function() {
user_os <- Sys.info()["sysname"]
if (!user_os %in% names(lo_paths_to_check)) {
stop(lo_path_missing, call. = FALSE)
}
lo_path <- NULL
for (path in lo_paths_to_check[[user_os]]) {
if (file.exists(path)) {
lo_path <- path
break
}
}
if (is.null(lo_path)) {
stop(lo_path_missing, call. = FALSE)
}
lo_path
}
# List obj containing known locations of LibreOffice file "soffice".
lo_paths_to_check <- list(
"Linux" = c("/usr/bin/soffice",
"/usr/local/bin/soffice"),
"Darwin" = c("/Applications/LibreOffice.app/Contents/MacOS/soffice",
"~/Applications/LibreOffice.app/Contents/MacOS/soffice"),
"Windows" = c("C:\\Program Files\\LibreOffice\\program\\soffice.exe",
"C:\\progra~1\\libreo~1\\program\\soffice.exe")
)
# Error message thrown if LibreOffice file "soffice" cannot be found.
lo_path_missing <- paste(
"LibreOffice software required to read '.doc' files.",
"Cannot determine file path to LibreOffice.",
"To download LibreOffice, visit: https://www.libreoffice.org/ \n",
"If you've already downloaded the software, use function",
"'set_libreoffice_path()' to point R to your local 'soffice.exe' file"
)