From 1e50ea4a3b267f67857fb93dc40335b8649a20fa Mon Sep 17 00:00:00 2001 From: boB Rudis Date: Fri, 1 Dec 2017 11:05:53 -0500 Subject: [PATCH] Made file creating a bit more bulletproof and added 'priming' --- DESCRIPTION | 17 +++++++++-------- LICENSE | 2 ++ R/aaa.r | 0 R/chrome-pdf.r | 13 +++++++++++-- R/chrome-shot.r | 14 ++++++++++++-- R/prime-url.r | 22 ++++++++++++++++++++++ R/read-html.r | 11 ++++++++++- R/version.r | 8 ++++++-- man/chrome_dump_pdf.Rd | 5 +++++ man/chrome_read_html.Rd | 7 ++++++- man/chrome_shot.Rd | 5 +++++ man/chrome_version.Rd | 7 ++++++- 12 files changed, 94 insertions(+), 17 deletions(-) create mode 100644 LICENSE delete mode 100644 R/aaa.r create mode 100644 R/prime-url.r diff --git a/DESCRIPTION b/DESCRIPTION index 3efeddf..5590f19 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,19 +1,20 @@ Package: decapitated Type: Package Title: Headless 'Chrome' Orchestration -Version: 0.1.0 -Date: 2017-05-02 -Author: Bob Rudis (bob@rud.is) +Version: 0.2.0 +Authors@R: c( + person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"), + comment = c(ORCID = "0000-0001-5670-2640")), + person("Chanyub", "Park", email="mrchypark@gmail.com", role=c("ctb")) + ) Maintainer: Bob Rudis -Contributor: Chanyub Park Description: The 'Chrome' browser has a headless mode which can be instrumented programmatically. Tools are provided to perform headless - 'Chrome' instrumentation on the command-line and will eventually provide support - for the 'DevTools' instrumentation 'API' or the forthcoming 'phantomjs'-like higher-level - 'API' being promised by the development team. + 'Chrome' instrumentation, including retrieving the + 'javascript'-executed 'DOM' contents, 'PDF' output for, or screen shot of a 'URL'. URL: https://github.com/hrbrmstr/decapitated BugReports: https://github.com/hrbrmstr/decapitated/issues -License: AGPL +License: MIT + file LICENSE SystemRequirements: Chrome 59+ on macOS/Linux, 60+ on Windows Suggests: testthat, diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..48ed424 --- /dev/null +++ b/LICENSE @@ -0,0 +1,2 @@ +YEAR: 2017 +COPYRIGHT HOLDER: Bob Rudis diff --git a/R/aaa.r b/R/aaa.r deleted file mode 100644 index e69de29..0000000 diff --git a/R/chrome-pdf.r b/R/chrome-pdf.r index d89a692..33652cc 100644 --- a/R/chrome-pdf.r +++ b/R/chrome-pdf.r @@ -8,6 +8,10 @@ #' placed in the current working directory by incrementing trailing numbers before #' the end of it. #' @param overwrite overwrite existing file? Default: `TRUE` +#' @param prime if `TRUE` preliminary URL retrieval requests will be sent to "prime" the +#' headless Chrome cache. This seems to be necessary primarily on recent versions of macOS. +#' If numeric, that number of "prime" requests will be sent ahead of the capture request. +#' If `FALSE` no priming requests will be sent. #' @param chrome_bin the path to Chrome (auto-set from `HEADLESS_CHROME` environment variable) #' @return output fileame (invisibly) #' @export @@ -18,9 +22,9 @@ chrome_dump_pdf <- function(url, path=NULL, overwrite=TRUE, chrome_bin=Sys.geten curwd <- getwd() on.exit(setwd(curwd), add = TRUE) - if (is.null(path)) path <- "." + path <- if (is.null(path)) "." else path[1] - path <- normalizePath(path.expand(path[1])) + path <- suppressWarnings(normalizePath(path.expand(path))) if (!grepl("\\.pdf$", path)) { fil_nam <- "output.pdf" @@ -46,6 +50,11 @@ chrome_dump_pdf <- function(url, path=NULL, overwrite=TRUE, chrome_bin=Sys.geten args <- c(args, sprintf("--utility-allowed-dir=%s", .get_app_dir())) args <- c(args, "--print-to-pdf", url) + vers <- chrome_version(quiet=TRUE) + + if (is.logical(prime) & prime) .prime_url(url, 1, chrome_bin) + if (is.numeric(prime) & (prime>0)) .prime_url(url, prime, chrome_bin) + processx::run( command = chrome_bin, args = args, diff --git a/R/chrome-shot.r b/R/chrome-shot.r index ae01b6d..d897d55 100644 --- a/R/chrome-shot.r +++ b/R/chrome-shot.r @@ -14,6 +14,10 @@ #' placed in the current working directory by incrementing trailing numbers before #' the end of it. #' @param overwrite overwrite existing file? Default: `TRUE` +#' @param prime if `TRUE` preliminary URL retrieval requests will be sent to "prime" the +#' headless Chrome cache. This seems to be necessary primarily on recent versions of macOS. +#' If numeric, that number of "prime" requests will be sent ahead of the capture request. +#' If `FALSE` no priming requests will be sent. #' @param chrome_bin the path to Chrome (auto-set from `HEADLESS_CHROME` environment variable) #' @return `magick` #' @export @@ -25,9 +29,9 @@ chrome_shot <- function(url, width=NULL, height=NULL, path=NULL, overwrite=TRUE, curwd <- getwd() on.exit(setwd(curwd), add = TRUE) - if (is.null(path)) path <- "." + path <- if (is.null(path)) "." else path[1] - path <- normalizePath(path.expand(path[1])) + path <- suppressWarnings(normalizePath(path.expand(path))) if (!grepl("\\.pdf$", path)) { fil_nam <- "screenshot.png" @@ -57,6 +61,11 @@ chrome_shot <- function(url, width=NULL, height=NULL, path=NULL, overwrite=TRUE, args <- c(args, sprintf("--window-size=%s,%s", height, width)) } + vers <- chrome_version(quiet=TRUE) + + if (is.logical(prime) & prime) .prime_url(url, 1, chrome_bin) + if (is.numeric(prime) & (prime>0)) .prime_url(url, prime, chrome_bin) + processx::run( command = chrome_bin, args = args, @@ -65,6 +74,7 @@ chrome_shot <- function(url, width=NULL, height=NULL, path=NULL, overwrite=TRUE, echo = FALSE ) -> res + message(res$stderr) first_fil <- file.path(dir_nam, sprintf("%s.%s", fil_pre, fil_ext)) out_fil <- first_fil diff --git a/R/prime-url.r b/R/prime-url.r new file mode 100644 index 0000000..eda1396 --- /dev/null +++ b/R/prime-url.r @@ -0,0 +1,22 @@ +.prime_url <- function(url, prime_ct = 1, chrome_bin=Sys.getenv("HEADLESS_CHROME")) { + + args <- c("--headless") + args <- c(args, "--disable-gpu") + args <- c(args, "--no-sandbox") + args <- c(args, "--allow-no-sandbox-job") + args <- c(args, sprintf("--user-data-dir=%s", .get_app_dir())) + args <- c(args, sprintf("--crash-dumps-dir=%s", .get_app_dir())) + args <- c(args, sprintf("--utility-allowed-dir=%s", .get_app_dir())) + args <- c(args, "--dump-dom", url) + + for (i in 1:prime_ct) { + processx::run( + command = chrome_bin, + args = args, + error_on_status = FALSE, + echo_cmd = FALSE, + echo = FALSE + ) -> res + } + +} diff --git a/R/read-html.r b/R/read-html.r index dc3b1b7..c3ac10d 100644 --- a/R/read-html.r +++ b/R/read-html.r @@ -4,11 +4,15 @@ #' @note This only grabs the `` `innerHTML` contents #' @param url URL to read from #' @param render if `TRUE` then return an `xml_document`, else the raw HTML (invisibly) +#' @param prime if `TRUE` preliminary URL retrieval requests will be sent to "prime" the +#' headless Chrome cache. This seems to be necessary primarily on recent versions of macOS. +#' If numeric, that number of "prime" requests will be sent ahead of the capture request. +#' If `FALSE` no priming requests will be sent. #' @param chrome_bin the path to Chrome (auto-set from `HEADLESS_CHROME` environment variable) #' @export #' @examples #' chrome_read_html("https://www.r-project.org/") -chrome_read_html <- function(url, render=TRUE, chrome_bin=Sys.getenv("HEADLESS_CHROME")) { +chrome_read_html <- function(url, render=TRUE, prime=TRUE, chrome_bin=Sys.getenv("HEADLESS_CHROME")) { args <- c("--headless") args <- c(args, "--disable-gpu") @@ -19,6 +23,11 @@ chrome_read_html <- function(url, render=TRUE, chrome_bin=Sys.getenv("HEADLESS_C args <- c(args, sprintf("--utility-allowed-dir=%s", .get_app_dir())) args <- c(args, "--dump-dom", url) + vers <- chrome_version(quiet=TRUE) + + if (is.logical(prime) & prime) .prime_url(url, 1, chrome_bin) + if (is.numeric(prime) & (prime>0)) .prime_url(url, prime, chrome_bin) + processx::run( command = chrome_bin, args = args, diff --git a/R/version.r b/R/version.r index 78bb4e4..45f9b71 100644 --- a/R/version.r +++ b/R/version.r @@ -1,10 +1,14 @@ #' Get Chrome version #' #' @md +#' @param quiet if `TRUE`, no messages are displayed #' @param chrome_bin the path to Chrome (auto-set from `HEADLESS_CHROME` environment variable) +#' @return the Chrome version string (invisibly) #' @export #' @examples #' chrome_version() -chrome_version <- function(chrome_bin=Sys.getenv("HEADLESS_CHROME")) { - system2(chrome_bin, "--version") +chrome_version <- function(quiet = FALSE, chrome_bin=Sys.getenv("HEADLESS_CHROME")) { + res <- processx::run(chrome_bin, "--version") + if (!quiet) message(res$stdout) + return(invisible(trimws(res$stdout))) } \ No newline at end of file diff --git a/man/chrome_dump_pdf.Rd b/man/chrome_dump_pdf.Rd index 45619ba..140929a 100644 --- a/man/chrome_dump_pdf.Rd +++ b/man/chrome_dump_pdf.Rd @@ -18,6 +18,11 @@ the end of it.} \item{overwrite}{overwrite existing file? Default: \code{TRUE}} \item{chrome_bin}{the path to Chrome (auto-set from \code{HEADLESS_CHROME} environment variable)} + +\item{prime}{if \code{TRUE} preliminary URL retrieval requests will be sent to "prime" the +headless Chrome cache. This seems to be necessary primarily on recent versions of macOS. +If numeric, that number of "prime" requests will be sent ahead of the capture request. +If \code{FALSE} no priming requests will be sent.} } \value{ output fileame (invisibly) diff --git a/man/chrome_read_html.Rd b/man/chrome_read_html.Rd index 4e40e27..458eb33 100644 --- a/man/chrome_read_html.Rd +++ b/man/chrome_read_html.Rd @@ -4,7 +4,7 @@ \alias{chrome_read_html} \title{Read a URL via headless Chrome and return the raw or rendered \code{} \code{innerHTML} DOM elements} \usage{ -chrome_read_html(url, render = TRUE, +chrome_read_html(url, render = TRUE, prime = TRUE, chrome_bin = Sys.getenv("HEADLESS_CHROME")) } \arguments{ @@ -12,6 +12,11 @@ chrome_read_html(url, render = TRUE, \item{render}{if \code{TRUE} then return an \code{xml_document}, else the raw HTML (invisibly)} +\item{prime}{if \code{TRUE} preliminary URL retrieval requests will be sent to "prime" the +headless Chrome cache. This seems to be necessary primarily on recent versions of macOS. +If numeric, that number of "prime" requests will be sent ahead of the capture request. +If \code{FALSE} no priming requests will be sent.} + \item{chrome_bin}{the path to Chrome (auto-set from \code{HEADLESS_CHROME} environment variable)} } \description{ diff --git a/man/chrome_shot.Rd b/man/chrome_shot.Rd index 16db8f1..f533c94 100644 --- a/man/chrome_shot.Rd +++ b/man/chrome_shot.Rd @@ -20,6 +20,11 @@ the end of it.} \item{overwrite}{overwrite existing file? Default: \code{TRUE}} \item{chrome_bin}{the path to Chrome (auto-set from \code{HEADLESS_CHROME} environment variable)} + +\item{prime}{if \code{TRUE} preliminary URL retrieval requests will be sent to "prime" the +headless Chrome cache. This seems to be necessary primarily on recent versions of macOS. +If numeric, that number of "prime" requests will be sent ahead of the capture request. +If \code{FALSE} no priming requests will be sent.} } \value{ \code{magick} diff --git a/man/chrome_version.Rd b/man/chrome_version.Rd index 749697d..76d5672 100644 --- a/man/chrome_version.Rd +++ b/man/chrome_version.Rd @@ -4,11 +4,16 @@ \alias{chrome_version} \title{Get Chrome version} \usage{ -chrome_version(chrome_bin = Sys.getenv("HEADLESS_CHROME")) +chrome_version(quiet = FALSE, chrome_bin = Sys.getenv("HEADLESS_CHROME")) } \arguments{ +\item{quiet}{if \code{TRUE}, no messages are displayed} + \item{chrome_bin}{the path to Chrome (auto-set from \code{HEADLESS_CHROME} environment variable)} } +\value{ +the Chrome version string (invisibly) +} \description{ Get Chrome version }