You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
101 lines
3.6 KiB
101 lines
3.6 KiB
#' "Print" to PDF
#'
#' Runs headless Chrome with `--print-to-pdf` against `url` from inside the R
#' session temporary directory and copies the resulting `output.pdf` to the
#' requested destination.
#'
#' @section Working around headless Chrome & OS security restrictions:
#' Security restrictions on various operating systems and OS configurations can cause
#' headless Chrome execution to fail. As a result, headless Chrome operations should
#' use a special directory for `decapitated` package operations. You can pass this
#' in as `work_dir`. If `work_dir` is `NULL` a `.rdecapdata` directory will be
#' created in your home directory and used for the data, crash dumps and utility
#' directories for Chrome operations.\cr
#' \cr
#' `tempdir()` does not always meet these requirements (after testing on various
#' macOS 10.13 systems) as Chrome does some interesting attribute setting for
#' some of its file operations.
#' \cr
#' If you pass in a `work_dir`, it must be one that does not violate OS security
#' restrictions or headless Chrome will not function.
#'
#' @md
#' @note The default Chrome filename is `output.pdf`
#' @param url URL to read from
#' @param path path (with optional output filename) for the generated PDF. A
#'        value that does not end in `.pdf` is treated as a directory and the
#'        file is named `output.pdf`. If `NULL`, the current working directory
#'        is used. The target directory must already exist. If `overwrite` is
#'        `FALSE`, the function will ensure a uniquely-named file is created
#'        by appending an incrementing four-digit number to the file name
#'        (before the extension) when the plain name is already taken.
#' @param overwrite overwrite existing file? Default: `TRUE`
#' @param prime if `TRUE` preliminary URL retrieval requests will be sent to "prime" the
#'        headless Chrome cache. This seems to be necessary primarily on recent versions of macOS.
#'        If numeric, that number of "prime" requests will be sent ahead of the capture request.
#'        If `FALSE` no priming requests will be sent.
#' @param work_dir See special Section.
#' @param chrome_bin the path to Chrome (auto-set from `HEADLESS_CHROME` environment variable)
#' @return output filename (invisibly)
#' @export
#' @examples
#' chrome_dump_pdf("https://www.r-project.org/")
chrome_dump_pdf <- function(url, path = NULL, overwrite = TRUE, prime = TRUE,
                            work_dir = NULL,
                            chrome_bin = Sys.getenv("HEADLESS_CHROME")) {

  # Fail early with a readable message instead of a cryptic process-launch error.
  if (length(chrome_bin) != 1L || is.na(chrome_bin) || !nzchar(chrome_bin)) {
    stop(
      "`chrome_bin` is empty; set the HEADLESS_CHROME environment variable ",
      "or pass the path to Chrome explicitly.",
      call. = FALSE
    )
  }

  curwd <- getwd()
  on.exit(setwd(curwd), add = TRUE)

  path <- if (is.null(path)) "." else path[1]
  path <- path.expand(path)

  # Anything not ending in ".pdf" is treated as a target directory.
  if (grepl("\\.pdf$", path)) {
    fil_nam <- basename(path)
    dir_nam <- dirname(path)
  } else {
    fil_nam <- "output.pdf"
    dir_nam <- path
  }

  if (!dir.exists(dir_nam)) {
    stop("Output directory does not exist: ", dir_nam, call. = FALSE)
  }

  # Resolve the *directory* (which exists) while we are still in the caller's
  # working directory. Normalizing the full file path is not enough: for a
  # not-yet-existing file normalizePath() can hand back the relative path
  # unchanged, which would then be resolved against tempdir() after setwd().
  dir_nam <- normalizePath(dir_nam)

  fil_ext <- tools::file_ext(fil_nam)
  fil_pre <- tools::file_path_sans_ext(fil_nam)

  td <- tempdir()
  tmp_pdf <- file.path(td, "output.pdf")

  # Drop any PDF left behind by an earlier call so a failed run can never be
  # mistaken for a successful one.
  unlink(tmp_pdf)

  # Chrome writes `output.pdf` into the process working directory.
  setwd(td)

  work_dir <- if (is.null(work_dir)) .get_app_dir() else work_dir

  args <- c(
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "--allow-no-sandbox-job",
    sprintf("--user-data-dir=%s", work_dir),
    sprintf("--crash-dumps-dir=%s", work_dir),
    sprintf("--utility-allowed-dir=%s", work_dir),
    "--print-to-pdf",
    url
  )

  # NOTE(review): the return value was never used by this function; the call
  # is retained in case it is relied on to validate the Chrome install --
  # confirm and remove if it is a pure query.
  chrome_version(quiet = TRUE)

  .prime_url(url, as.numeric(prime), work_dir, chrome_bin)

  res <- processx::run(
    command = chrome_bin,
    args = args,
    error_on_status = FALSE,
    echo_cmd = FALSE,
    echo = FALSE
  )

  if (!file.exists(tmp_pdf)) {
    stop(
      "Headless Chrome did not produce a PDF (exit status: ", res$status, ")",
      if (length(res$stderr) == 1L && !is.na(res$stderr) && nzchar(res$stderr)) {
        paste0("\n", res$stderr)
      },
      call. = FALSE
    )
  }

  first_fil <- file.path(dir_nam, sprintf("%s.%s", fil_pre, fil_ext))
  out_fil <- first_fil

  if (!overwrite) {

    # Build the candidate names in a single sprintf() call so that a literal
    # "%" in the directory or file stem is never interpreted as a format
    # directive.
    moar_fils <- file.path(
      dir_nam,
      sprintf("%s%04d.%s", fil_pre, 0:9999, fil_ext)
    )
    fils <- c(first_fil, moar_fils)

    out_fil <- fils[which(!file.exists(fils))[1]]
    if (is.na(out_fil)) stop("Cannot create unique filename")

  }

  if (!file.copy(tmp_pdf, out_fil, overwrite = overwrite)) {
    stop("Could not write PDF to ", out_fil, call. = FALSE)
  }

  invisible(out_fil)

}
|