Browse Source

Made file creating a bit more bulletproof and added 'priming'

master
boB Rudis 6 years ago
parent
commit
1e50ea4a3b
No known key found for this signature in database GPG Key ID: 2A514A4997464560
  1. 17
      DESCRIPTION
  2. 2
      LICENSE
  3. 0
      R/aaa.r
  4. 13
      R/chrome-pdf.r
  5. 14
      R/chrome-shot.r
  6. 22
      R/prime-url.r
  7. 11
      R/read-html.r
  8. 8
      R/version.r
  9. 5
      man/chrome_dump_pdf.Rd
  10. 7
      man/chrome_read_html.Rd
  11. 5
      man/chrome_shot.Rd
  12. 7
      man/chrome_version.Rd

17
DESCRIPTION

@ -1,19 +1,20 @@
Package: decapitated
Type: Package
Title: Headless 'Chrome' Orchestration
Version: 0.1.0
Date: 2017-05-02
Author: Bob Rudis (bob@rud.is)
Version: 0.2.0
Authors@R: c(
person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"),
comment = c(ORCID = "0000-0001-5670-2640")),
person("Chanyub", "Park", email="mrchypark@gmail.com", role=c("ctb"))
)
Maintainer: Bob Rudis <bob@rud.is>
Contributor: Chanyub Park <mrchypark@gmail.com>
Description: The 'Chrome' browser <https://www.google.com/chrome/> has a headless mode
which can be instrumented programmatically. Tools are provided to perform headless
'Chrome' instrumentation on the command-line and will eventually provide support
for the 'DevTools' instrumentation 'API' or the forthcoming 'phantomjs'-like higher-level
'API' being promised by the development team.
'Chrome' instrumentation, including retrieving the
'javascript'-executed 'DOM' contents, 'PDF' output for, or screen shot of a 'URL'.
URL: https://github.com/hrbrmstr/decapitated
BugReports: https://github.com/hrbrmstr/decapitated/issues
License: AGPL
License: MIT + file LICENSE
SystemRequirements: Chrome 59+ on macOS/Linux, 60+ on Windows
Suggests:
testthat,

2
LICENSE

@ -0,0 +1,2 @@
YEAR: 2017
COPYRIGHT HOLDER: Bob Rudis

0
R/aaa.r

13
R/chrome-pdf.r

@ -8,6 +8,10 @@
#' placed in the current working directory by incrementing trailing numbers before
#' the end of it.
#' @param overwrite overwrite existing file? Default: `TRUE`
#' @param prime if `TRUE` preliminary URL retrieval requests will be sent to "prime" the
#' headless Chrome cache. This seems to be necessary primarily on recent versions of macOS.
#' If numeric, that number of "prime" requests will be sent ahead of the capture request.
#' If `FALSE` no priming requests will be sent.
#' @param chrome_bin the path to Chrome (auto-set from `HEADLESS_CHROME` environment variable)
#' @return output filename (invisibly)
#' @export
@ -18,9 +22,9 @@ chrome_dump_pdf <- function(url, path=NULL, overwrite=TRUE, chrome_bin=Sys.geten
curwd <- getwd()
on.exit(setwd(curwd), add = TRUE)
if (is.null(path)) path <- "."
path <- if (is.null(path)) "." else path[1]
path <- normalizePath(path.expand(path[1]))
path <- suppressWarnings(normalizePath(path.expand(path)))
if (!grepl("\\.pdf$", path)) {
fil_nam <- "output.pdf"
@ -46,6 +50,11 @@ chrome_dump_pdf <- function(url, path=NULL, overwrite=TRUE, chrome_bin=Sys.geten
args <- c(args, sprintf("--utility-allowed-dir=%s", .get_app_dir()))
args <- c(args, "--print-to-pdf", url)
vers <- chrome_version(quiet=TRUE)
if (is.logical(prime) & prime) .prime_url(url, 1, chrome_bin)
if (is.numeric(prime) & (prime>0)) .prime_url(url, prime, chrome_bin)
processx::run(
command = chrome_bin,
args = args,

14
R/chrome-shot.r

@ -14,6 +14,10 @@
#' placed in the current working directory by incrementing trailing numbers before
#' the end of it.
#' @param overwrite overwrite existing file? Default: `TRUE`
#' @param prime if `TRUE` preliminary URL retrieval requests will be sent to "prime" the
#' headless Chrome cache. This seems to be necessary primarily on recent versions of macOS.
#' If numeric, that number of "prime" requests will be sent ahead of the capture request.
#' If `FALSE` no priming requests will be sent.
#' @param chrome_bin the path to Chrome (auto-set from `HEADLESS_CHROME` environment variable)
#' @return `magick`
#' @export
@ -25,9 +29,9 @@ chrome_shot <- function(url, width=NULL, height=NULL, path=NULL, overwrite=TRUE,
curwd <- getwd()
on.exit(setwd(curwd), add = TRUE)
if (is.null(path)) path <- "."
path <- if (is.null(path)) "." else path[1]
path <- normalizePath(path.expand(path[1]))
path <- suppressWarnings(normalizePath(path.expand(path)))
if (!grepl("\\.pdf$", path)) {
fil_nam <- "screenshot.png"
@ -57,6 +61,11 @@ chrome_shot <- function(url, width=NULL, height=NULL, path=NULL, overwrite=TRUE,
args <- c(args, sprintf("--window-size=%s,%s", height, width))
}
vers <- chrome_version(quiet=TRUE)
if (is.logical(prime) & prime) .prime_url(url, 1, chrome_bin)
if (is.numeric(prime) & (prime>0)) .prime_url(url, prime, chrome_bin)
processx::run(
command = chrome_bin,
args = args,
@ -65,6 +74,7 @@ chrome_shot <- function(url, width=NULL, height=NULL, path=NULL, overwrite=TRUE,
echo = FALSE
) -> res
message(res$stderr)
first_fil <- file.path(dir_nam, sprintf("%s.%s", fil_pre, fil_ext))
out_fil <- first_fil

22
R/prime-url.r

@ -0,0 +1,22 @@
# Send "priming" requests for a URL through headless Chrome.
#
# Runs `chrome_bin` with `--headless --dump-dom` against `url` a given number
# of times and discards the output of every run. Non-zero exit statuses are
# ignored (`error_on_status = FALSE`), so priming is best-effort.
#
# Arguments:
#   url        the URL to request
#   prime_ct   number of priming requests to send. Zero or negative values
#              send no requests; positive fractional values are truncated,
#              but always send at least one request.
#   chrome_bin path to the Chrome binary (defaults to the `HEADLESS_CHROME`
#              environment variable)
#
# Returns `NULL`, invisibly.
.prime_url <- function(url, prime_ct = 1, chrome_bin = Sys.getenv("HEADLESS_CHROME")) {

  n_requests <- suppressWarnings(as.numeric(prime_ct[1]))
  if (length(n_requests) == 0L || !is.finite(n_requests)) {
    stop("'prime_ct' must be a single finite number", call. = FALSE)
  }

  # `1:prime_ct` counts *down* for values below 1 (e.g. `1:0` is `c(1, 0)`),
  # which would send two requests when none were asked for.
  if (n_requests <= 0) return(invisible(NULL))

  # NOTE(review): assumes .get_app_dir() returns the same directory on every
  # call, so a single lookup is equivalent to the three separate ones -- confirm.
  app_dir <- .get_app_dir()

  args <- c(
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "--allow-no-sandbox-job",
    sprintf("--user-data-dir=%s", app_dir),
    sprintf("--crash-dumps-dir=%s", app_dir),
    sprintf("--utility-allowed-dir=%s", app_dir),
    "--dump-dom", url
  )

  # Keep the historical behaviour for values in (0, 1): `1:0.5` ran once.
  for (i in seq_len(max(1L, as.integer(n_requests)))) {
    processx::run(
      command = chrome_bin,
      args = args,
      error_on_status = FALSE,
      echo_cmd = FALSE,
      echo = FALSE
    )
  }

  invisible(NULL)

}

11
R/read-html.r

@ -4,11 +4,15 @@
#' @note This only grabs the `<body>` `innerHTML` contents
#' @param url URL to read from
#' @param render if `TRUE` then return an `xml_document`, else the raw HTML (invisibly)
#' @param prime if `TRUE` preliminary URL retrieval requests will be sent to "prime" the
#' headless Chrome cache. This seems to be necessary primarily on recent versions of macOS.
#' If numeric, that number of "prime" requests will be sent ahead of the capture request.
#' If `FALSE` no priming requests will be sent.
#' @param chrome_bin the path to Chrome (auto-set from `HEADLESS_CHROME` environment variable)
#' @export
#' @examples
#' chrome_read_html("https://www.r-project.org/")
chrome_read_html <- function(url, render=TRUE, chrome_bin=Sys.getenv("HEADLESS_CHROME")) {
chrome_read_html <- function(url, render=TRUE, prime=TRUE, chrome_bin=Sys.getenv("HEADLESS_CHROME")) {
args <- c("--headless")
args <- c(args, "--disable-gpu")
@ -19,6 +23,11 @@ chrome_read_html <- function(url, render=TRUE, chrome_bin=Sys.getenv("HEADLESS_C
args <- c(args, sprintf("--utility-allowed-dir=%s", .get_app_dir()))
args <- c(args, "--dump-dom", url)
vers <- chrome_version(quiet=TRUE)
if (is.logical(prime) & prime) .prime_url(url, 1, chrome_bin)
if (is.numeric(prime) & (prime>0)) .prime_url(url, prime, chrome_bin)
processx::run(
command = chrome_bin,
args = args,

8
R/version.r

@ -1,10 +1,14 @@
#' Get Chrome version
#'
#' @md
#' @param quiet if `TRUE`, no messages are displayed
#' @param chrome_bin the path to Chrome (auto-set from `HEADLESS_CHROME` environment variable)
#' @return the Chrome version string (invisibly)
#' @export
#' @examples
#' chrome_version()
chrome_version <- function(chrome_bin=Sys.getenv("HEADLESS_CHROME")) {
system2(chrome_bin, "--version")
chrome_version <- function(quiet = FALSE, chrome_bin=Sys.getenv("HEADLESS_CHROME")) {
res <- processx::run(chrome_bin, "--version")
if (!quiet) message(res$stdout)
return(invisible(trimws(res$stdout)))
}

5
man/chrome_dump_pdf.Rd

@ -18,6 +18,11 @@ the end of it.}
\item{overwrite}{overwrite existing file? Default: \code{TRUE}}
\item{chrome_bin}{the path to Chrome (auto-set from \code{HEADLESS_CHROME} environment variable)}
\item{prime}{if \code{TRUE} preliminary URL retrieval requests will be sent to "prime" the
headless Chrome cache. This seems to be necessary primarily on recent versions of macOS.
If numeric, that number of "prime" requests will be sent ahead of the capture request.
If \code{FALSE} no priming requests will be sent.}
}
\value{
output filename (invisibly)

7
man/chrome_read_html.Rd

@ -4,7 +4,7 @@
\alias{chrome_read_html}
\title{Read a URL via headless Chrome and return the raw or rendered \code{<body>} \code{innerHTML} DOM elements}
\usage{
chrome_read_html(url, render = TRUE,
chrome_read_html(url, render = TRUE, prime = TRUE,
chrome_bin = Sys.getenv("HEADLESS_CHROME"))
}
\arguments{
@ -12,6 +12,11 @@ chrome_read_html(url, render = TRUE,
\item{render}{if \code{TRUE} then return an \code{xml_document}, else the raw HTML (invisibly)}
\item{prime}{if \code{TRUE} preliminary URL retrieval requests will be sent to "prime" the
headless Chrome cache. This seems to be necessary primarily on recent versions of macOS.
If numeric, that number of "prime" requests will be sent ahead of the capture request.
If \code{FALSE} no priming requests will be sent.}
\item{chrome_bin}{the path to Chrome (auto-set from \code{HEADLESS_CHROME} environment variable)}
}
\description{

5
man/chrome_shot.Rd

@ -20,6 +20,11 @@ the end of it.}
\item{overwrite}{overwrite existing file? Default: \code{TRUE}}
\item{chrome_bin}{the path to Chrome (auto-set from \code{HEADLESS_CHROME} environment variable)}
\item{prime}{if \code{TRUE} preliminary URL retrieval requests will be sent to "prime" the
headless Chrome cache. This seems to be necessary primarily on recent versions of macOS.
If numeric, that number of "prime" requests will be sent ahead of the capture request.
If \code{FALSE} no priming requests will be sent.}
}
\value{
\code{magick}

7
man/chrome_version.Rd

@ -4,11 +4,16 @@
\alias{chrome_version}
\title{Get Chrome version}
\usage{
chrome_version(chrome_bin = Sys.getenv("HEADLESS_CHROME"))
chrome_version(quiet = FALSE, chrome_bin = Sys.getenv("HEADLESS_CHROME"))
}
\arguments{
\item{quiet}{if \code{TRUE}, no messages are displayed}
\item{chrome_bin}{the path to Chrome (auto-set from \code{HEADLESS_CHROME} environment variable)}
}
\value{
the Chrome version string (invisibly)
}
\description{
Get Chrome version
}

Loading…
Cancel
Save