Browse Source

major improvements all around

master
boB Rudis 6 years ago
parent
commit
269dd1d866
No known key found for this signature in database GPG Key ID: 2A514A4997464560
  1. 5
      DESCRIPTION
  2. 3
      NAMESPACE
  3. 2
      NEWS.md
  4. 74
      R/chrome-pdf.r
  5. 86
      R/chrome-shot.r
  6. 10
      R/decapitated-package.R
  7. 74
      R/read-html.r
  8. 8
      R/util.r
  9. 17
      README.Rmd
  10. 58
      README.md
  11. 0
      location.href
  12. 17
      man/chrome_dump_pdf.Rd
  13. 9
      man/chrome_read_html.Rd
  14. 15
      man/chrome_shot.Rd
  15. 8
      man/decapitated.Rd
  16. BIN
      output.pdf
  17. BIN
      screenshot.png
  18. 2
      tests/test-all.R
  19. 6
      tests/testthat/test-decapitated.R

5
DESCRIPTION

@ -22,5 +22,8 @@ Depends:
R (>= 3.2.0)
Imports:
xml2,
magick
magick,
processx,
tools,
utils
RoxygenNote: 6.0.1

3
NAMESPACE

@ -7,4 +7,7 @@ export(chrome_version)
export(get_chrome_env)
export(set_chrome_env)
import(magick)
import(processx)
import(tools)
import(utils)
import(xml2)

2
NEWS.md

@ -1,6 +1,8 @@
0.2.0
* Re-design of how the Chrome binary is set
* env var functions to help with ^^
* switch to using processx
* options for naming & placing PDF & screenshot files
0.1.0
* Initial release

74
R/chrome-pdf.r

@ -0,0 +1,74 @@
#' "Print" to PDF
#'
#' Runs headless Chrome with `--print-to-pdf` in a private scratch directory and
#' copies the resulting file to the requested location.
#'
#' @md
#' @note The default Chrome filename is `output.pdf`
#' @param url URL to read from
#' @param path path (with optional output filename) for the generated PDF. A value
#'        that does not end in `.pdf` is treated as a directory and the file is
#'        named `output.pdf`. `NULL` means the current working directory. The
#'        target directory must already exist.
#' @param overwrite overwrite existing file? Default: `TRUE`. If `FALSE` and the
#'        target already exists, a uniquely-named file is created by appending a
#'        four-digit counter to the file name, before the extension.
#' @param chrome_bin the path to Chrome (auto-set from `HEADLESS_CHROME` environment variable)
#' @return output filename (invisibly)
#' @export
#' @examples
#' chrome_dump_pdf("https://www.r-project.org/")
chrome_dump_pdf <- function(url, path=NULL, overwrite=TRUE, chrome_bin=Sys.getenv("HEADLESS_CHROME")) {

  if (is.null(path)) path <- "."
  path <- path.expand(path[1])

  if (grepl("\\.pdf$", path)) {
    fil_nam <- basename(path)
    dir_nam <- dirname(path)
  } else {
    fil_nam <- "output.pdf"
    dir_nam <- path
  }

  # Resolve only the directory, and do it while still in the caller's working
  # directory: normalizePath() cannot resolve a file that does not exist yet,
  # and a relative path would otherwise end up pointing into the scratch dir.
  dir_nam <- normalizePath(dir_nam, mustWork = TRUE)

  fil_ext <- tools::file_ext(fil_nam)
  fil_pre <- tools::file_path_sans_ext(fil_nam)

  # Chrome writes `output.pdf` into its working directory, so run it from a
  # per-call scratch directory; a leftover file from an earlier run can then
  # never be mistaken for this run's output.
  work_dir <- tempfile("chrome-pdf-")
  dir.create(work_dir)
  curwd <- getwd()
  on.exit(setwd(curwd), add = TRUE)
  on.exit(unlink(work_dir, recursive = TRUE), add = TRUE)
  setwd(work_dir)

  app_dir <- .get_app_dir()

  args <- c(
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "--allow-no-sandbox-job",
    sprintf("--user-data-dir=%s", app_dir),
    sprintf("--crash-dumps-dir=%s", app_dir),
    sprintf("--utility-allowed-dir=%s", app_dir),
    "--print-to-pdf", url
  )

  res <- processx::run(
    command = chrome_bin,
    args = args,
    error_on_status = FALSE,
    echo_cmd = FALSE,
    echo = FALSE
  )

  out_fil <- file.path(dir_nam, fil_nam)

  if (!overwrite) {
    # The directory and file stem are passed as sprintf() *arguments*, never as
    # part of the format string, so a literal `%` in the name is harmless.
    numbered <- file.path(dir_nam, sprintf("%s%04d.%s", fil_pre, 0:9999, fil_ext))
    candidates <- c(out_fil, numbered)
    out_fil <- candidates[which(!file.exists(candidates))[1]]
    if (is.na(out_fil)) stop("Cannot create unique filename")
  }

  chrome_out <- file.path(work_dir, "output.pdf")

  if (file.exists(chrome_out)) {
    file.copy(chrome_out, out_fil, overwrite = overwrite)
  } else {
    warning(
      sprintf("Chrome did not produce a PDF (exit status: %s)", res$status),
      call. = FALSE
    )
  }

  invisible(out_fil)

}

86
R/chrome-shot.r

@ -0,0 +1,86 @@
#' Capture a screenshot
#'
#' Runs headless Chrome with `--screenshot` in a private scratch directory,
#' copies the resulting PNG to the requested location and reads it back in.
#'
#' A `magick` image object is returned.
#'
#' @md
#' @note The default Chrome filename is `screenshot.png`
#' @param url URL to read from
#' @param width,height screen size to emulate (only applied when both are supplied)
#' @param path path (with optional output filename) for the generated screenshot.
#'        A value that does not end in `.png` is treated as a directory and the
#'        file is named `screenshot.png`. `NULL` means the current working
#'        directory. The target directory must already exist.
#' @param overwrite overwrite existing file? Default: `TRUE`. If `FALSE` and the
#'        target already exists, a uniquely-named file is created by appending a
#'        four-digit counter to the file name, before the extension.
#' @param chrome_bin the path to Chrome (auto-set from `HEADLESS_CHROME` environment variable)
#' @return `magick` image object, or `NULL` (invisibly) if no screenshot was produced
#' @export
#' @examples
#' chrome_shot("https://www.r-project.org/logo/Rlogo.svg")
chrome_shot <- function(url, width=NULL, height=NULL, path=NULL, overwrite=TRUE,
                        chrome_bin=Sys.getenv("HEADLESS_CHROME")) {

  if (is.null(path)) path <- "."
  path <- path.expand(path[1])

  # Screenshots are PNG files, so that is the extension marking `path` as a
  # file name rather than a directory.
  if (grepl("\\.png$", path)) {
    fil_nam <- basename(path)
    dir_nam <- dirname(path)
  } else {
    fil_nam <- "screenshot.png"
    dir_nam <- path
  }

  # Resolve only the directory, and do it while still in the caller's working
  # directory: normalizePath() cannot resolve a file that does not exist yet,
  # and a relative path would otherwise end up pointing into the scratch dir.
  dir_nam <- normalizePath(dir_nam, mustWork = TRUE)

  fil_ext <- tools::file_ext(fil_nam)
  fil_pre <- tools::file_path_sans_ext(fil_nam)

  # Chrome writes `screenshot.png` into its working directory, so run it from
  # a per-call scratch directory; a leftover file from an earlier run can then
  # never be mistaken for this run's output.
  work_dir <- tempfile("chrome-shot-")
  dir.create(work_dir)
  curwd <- getwd()
  on.exit(setwd(curwd), add = TRUE)
  on.exit(unlink(work_dir, recursive = TRUE), add = TRUE)
  setwd(work_dir)

  app_dir <- .get_app_dir()

  args <- c(
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "--allow-no-sandbox-job",
    sprintf("--user-data-dir=%s", app_dir),
    sprintf("--crash-dumps-dir=%s", app_dir),
    sprintf("--utility-allowed-dir=%s", app_dir),
    "--screenshot"
  )

  if (!is.null(width) && !is.null(height)) {
    # Chrome expects the window size as "width,height".
    args <- c(args, sprintf("--window-size=%s,%s", width, height))
  }

  args <- c(args, url)

  res <- processx::run(
    command = chrome_bin,
    args = args,
    error_on_status = FALSE,
    echo_cmd = FALSE,
    echo = FALSE
  )

  chrome_out <- file.path(work_dir, "screenshot.png")

  if (!file.exists(chrome_out)) {
    warning(
      sprintf("Chrome did not produce a screenshot (exit status: %s)", res$status),
      call. = FALSE
    )
    return(invisible(NULL))
  }

  out_fil <- file.path(dir_nam, fil_nam)

  if (!overwrite) {
    # The directory and file stem are passed as sprintf() *arguments*, never as
    # part of the format string, so a literal `%` in the name is harmless.
    numbered <- file.path(dir_nam, sprintf("%s%04d.%s", fil_pre, 0:9999, fil_ext))
    candidates <- c(out_fil, numbered)
    out_fil <- candidates[which(!file.exists(candidates))[1]]
    if (is.na(out_fil)) stop("Cannot create unique filename")
  }

  file.copy(chrome_out, out_fil, overwrite = overwrite)

  if (file.exists(out_fil)) magick::image_read(out_fil)

}

10
R/decapitated-package.R

@ -2,14 +2,12 @@
#'
#' The 'Chrome' browser <https://www.google.com/chrome/> has a headless mode
#' which can be instrumented programmatically. Tools are provided to perform headless
#' 'Chrome' instrumentation on the command-line and will eventually provide support
#' for the 'DevTools' instrumentation 'API' or the forthcoming 'phantomjs'-like higher-level
#' 'API' being promised by the development team.
#' 'Chrome' instrumentation on the command-line, including retrieving the javascript-executed
#' web page, PDF output or screen shot of a URL.
#'
#' @section Important:
#'
#' This pkg will eventually do much under the covers to find the location of the Chrome binary
#' on all operating systems. For now, you'll need to set an envrionment variable `HEADLESS_CHROME` to one of these two values:
#' You'll need to set an environment variable `HEADLESS_CHROME` to one of these two values:
#'
#' - Windows(32bit): `C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe`
#' - Windows(64bit): `C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe`
@ -24,5 +22,5 @@
#' @name decapitated
#' @docType package
#' @author Bob Rudis (bob@@rud.is)
#' @import xml2 magick
#' @import xml2 magick processx tools utils
NULL

74
R/read-html.r

@ -1,62 +1,32 @@
#' Read a URL via headless Chrome and return the renderd `<body>` `innerHTML` DOM elements
#' Read a URL via headless Chrome and return the raw or rendered `<body>` `innerHTML` DOM elements
#'
#' @md
#' @note This only grabs the `<body>` `innerHTML` contents
#' @param url URL to read from
#' @param render if `TRUE` then return an `xml_document`, else the raw HTML (invisibly)
#' @param chrome_bin the path to Chrome (auto-set from `HEADLESS_CHROME` environment variable)
#' @export
#' @examples
#' chrome_read_html("https://www.r-project.org/")
chrome_read_html <- function(url, chrome_bin=Sys.getenv("HEADLESS_CHROME")) {
# Shell-quote the URL before handing it to system2()
url <- shQuote(url)
# Run headless Chrome with --dump-dom and capture its standard output
# (stdout=TRUE) as a character vector
tmp <- system2(chrome_bin, c("--headless", "--no-sandbox", "--disable-gpu", "--dump-dom", url), stdout=TRUE)
# Re-join the captured lines into one string and parse it as an HTML document
xml2::read_html(paste0(tmp, collapse="\n"))
}
#' "Print" to PDF
#'
#' @md
#' @note this is a quick version of the function and will overwrite `output.pdf` if it exists in CWD
#' @param url URL to read from
#' @param chrome_bin the path to Chrome (auto-set from `HEADLESS_CHROME` environment variable)
#' @export
#' @examples
#' chrome_dump_pdf("https://www.r-project.org/")
chrome_dump_pdf <- function(url, chrome_bin=Sys.getenv("HEADLESS_CHROME")) {
# Shell-quote the URL before handing it to system2()
url <- shQuote(url)
# Run headless Chrome with --print-to-pdf; no output path is passed, so the
# file name and location are left to Chrome (see the @note above). The value
# of system2() is only stored in `tmp` and not otherwise used.
tmp <- system2(chrome_bin, c("--headless", "--no-sandbox", "--disable-gpu", "--print-to-pdf", url))
}
#' Capture a screenshot
#'
#' For the moment, the capture file is in the current working directory and named
#' `screenshot.png`. This will change, soon.
#'
#' A `magick` image object is returned.
#'
#' @md
#' @note this is a quick version of the function and will overwrite `screenshot.png` if it exists in CWD
#' @param url URL to read from
#' @param width,height screen size to emulate
#' @param chrome_bin the path to Chrome (auto-set from `HEADLESS_CHROME` environment variable)
#' @return `magick`
#' @export
#' @examples
#' chrome_shot("https://www.r-project.org/logo/Rlogo.svg")
chrome_shot <- function(url, width=NULL, height=NULL, chrome_bin=Sys.getenv("HEADLESS_CHROME")) {
args <- c("--headless", "--no-sandbox", "--disable-gpu", "--screenshot")
url <- shQuote(url)
if (!is.null(width) & !is.null(height)) {
args <- c(args, sprintf("--window-size=%s,%s", height, width))
}
args <- c(args, url)
tmp <- system2(chrome_bin, args)
magick::image_read("screenshot.png")
chrome_read_html <- function(url, render=TRUE, chrome_bin=Sys.getenv("HEADLESS_CHROME")) {

  # Command-line switches for a headless run that dumps the DOM to stdout;
  # Chrome's data, crash-dump and utility directories all point at the
  # package's application data directory.
  chrome_args <- c(
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "--allow-no-sandbox-job",
    sprintf("--user-data-dir=%s", .get_app_dir()),
    sprintf("--crash-dumps-dir=%s", .get_app_dir()),
    sprintf("--utility-allowed-dir=%s", .get_app_dir()),
    "--dump-dom",
    url
  )

  # A non-zero exit status is not turned into an R error here.
  chrome_run <- processx::run(
    command = chrome_bin,
    args = chrome_args,
    error_on_status = FALSE,
    echo_cmd = FALSE,
    echo = FALSE
  )

  dom <- chrome_run$stdout

  # Raw mode: hand back the captured text without printing it.
  if (!render) return(invisible(dom))

  # Rendered mode: parse the captured text into an xml2 document.
  xml2::read_html(dom)

}

8
R/util.r

@ -0,0 +1,8 @@
# Return the path of the package's application data directory
# (`.rdecapdata` under the directory named by the HOME environment variable),
# creating it, with a message, when it does not exist yet.
.get_app_dir <- function() {

  app_dir <- file.path(Sys.getenv("HOME"), ".rdecapdata")

  # Nothing to do when the directory is already there.
  if (dir.exists(app_dir)) return(app_dir)

  message(sprintf("Creating application data directory [%s]...", app_dir))
  dir.create(app_dir, recursive=TRUE)

  app_dir

}

17
README.Rmd

@ -10,16 +10,11 @@ Headless 'Chrome' Orchestration
The 'Chrome' browser <https://www.google.com/chrome/> has a headless mode
which can be instrumented programmatically. Tools are provided to perform headless
'Chrome' instrumentation on the command-line and will eventually provide support
for the 'DevTools' instrumentation 'API' or the forthcoming 'phantomjs'-like higher-level
'API' being promised by the development team.
'Chrome' instrumentation on the command-line, including retrieving the javascript-executed web page, PDF output or screen shot of a URL.
### IMPORTANT
macOS High Sierra and Headless Chrome dinna work so good together.
This pkg will eventually do much under the covers to find the location of the Chrome binary
on all operating systems. For now, you'll need to set an envrionment variable `HEADLESS_CHROME` to one of these values:
You'll need to set an environment variable `HEADLESS_CHROME` to one of these values:
- Windows(32bit): `C:/Program Files/Google/Chrome/Application/chrome.exe`
- Windows(64bit): `C:/Program Files (x86)/Google/Chrome/Application/chrome.exe`
@ -28,14 +23,14 @@ on all operating systems. For now, you'll need to set an envrionment variable `H
A guess is made (but not verified yet) if `HEADLESS_CHROME` is non-existent.
Use `~/.Renviron` to store this value for the time being.
It's best to use `~/.Renviron` to store this value for the time being.
## What's in the tin?
The following functions are implemented:
- `chrome_dump_pdf`: "Print" to PDF
- `chrome_read_html`: Read a URL via headless Chrome and return the renderd '<body>' 'innerHTML' DOM elements
- `chrome_read_html`: Read a URL via headless Chrome and return the raw or rendered '<body>' 'innerHTML' DOM elements
- `chrome_shot`: Capture a screenshot
- `chrome_version`: Get Chrome version
- `get_chrome_env`: get an environment variable 'HEADLESS_CHROME'
@ -66,15 +61,13 @@ chrome_read_html("http://httpbin.org/")
```{r eval=FALSE, message=FALSE, warning=FALSE, error=FALSE}
chrome_dump_pdf("http://httpbin.org/")
## [0502/094321.911089:INFO:headless_shell.cc(436)] Written to file output.pdf.
```
```{r message=FALSE, warning=FALSE, error=FALSE, eval=FALSE}
chrome_shot("http://httpbin.org/")
## [0502/094257.370837:INFO:headless_shell.cc(436)] Written to file screenshot.png.
## format width height colorspace filesize
## 1 PNG 1600 1200 sRGB 238967
## 1 PNG 1600 1200 sRGB 215680
```
![screenshot.png](screenshot.png)

58
README.md

@ -1,37 +1,47 @@
# decapitated
Headless 'Chrome' Orchestration
Headless ‘Chrome’ Orchestration
## Description
The 'Chrome' browser <https://www.google.com/chrome/> has a headless mode which can be instrumented programmatically. Tools are provided to perform headless 'Chrome' instrumentation on the command-line and will eventually provide support for the 'DevTools' instrumentation 'API' or the forthcoming 'phantomjs'-like higher-level 'API' being promised by the development team.
The ‘Chrome’ browser <https://www.google.com/chrome/> has a headless
mode which can be instrumented programmatically. Tools are provided to
perform headless ‘Chrome’ instrumentation on the command-line, including
retrieving the javascript-executed web page, PDF output or screen shot
of a URL.
## IMPORTANT
### IMPORTANT
macOS High Sierra and Headless Chrome dinna work so good together.
You’ll need to set an environment variable `HEADLESS_CHROME` to one of
these values:
This pkg will eventually do much under the covers to find the location of the Chrome binary on all operating systems. For now, you'll need to set an envrionment variable `HEADLESS_CHROME` to one of these two values:
- Windows(32bit): `C:/Program
Files/Google/Chrome/Application/chrome.exe`
- Windows(64bit): `C:/Program Files
(x86)/Google/Chrome/Application/chrome.exe`
- macOS: `/Applications/Google\ Chrome.app/Contents/MacOS/Google\
Chrome`
- Linux: `/usr/bin/google-chrome`
- Windows(32bit): `C:/Program Files/Google/Chrome/Application/chrome.exe`
- Windows(64bit): `C:/Program Files (x86)/Google/Chrome/Application/chrome.exe`
- macOS: `/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome`
- Linux: `/usr/bin/google-chrome`
A guess is made (but not verified yet) if `HEADLESS_CHROME` is
non-existent.
A guess is made (but not verified yet) if `HEADLESS_CHROME` is non-existent.
It’s best to use `~/.Renviron` to store this value for the time being.
Use `~/.Renviron` to store this value for the time being.
## What’s in the tin?
The following functions are implemented:
- `chrome_dump_pdf`: "Print" to PDF
- `chrome_read_html`: Read a URL via headless Chrome and return the renderd '
- `chrome_dump_pdf`: “Print” to PDF
- `chrome_read_html`: Read a URL via headless Chrome and return the
raw or rendered ’
<body>
' 'innerHTML' DOM elements
- `chrome_shot`: Capture a screenshot
- `chrome_version`: Get Chrome version
- `get_chrome_env`: get an envrionment variable 'HEADLESS_CHROME'
- `set_chrome_env`: set an envrionment variable 'HEADLESS_CHROME'
‘’innerHTML’ DOM elements
- `chrome_shot`: Capture a screenshot
- `chrome_version`: Get Chrome version
- `get_chrome_env`: get an environment variable ‘HEADLESS\_CHROME’
- `set_chrome_env`: set an environment variable ‘HEADLESS\_CHROME’
## Installation
@ -48,7 +58,7 @@ library(decapitated)
packageVersion("decapitated")
```
## [1] '0.2.0'
## [1] '0.1.0'
``` r
chrome_version()
@ -58,20 +68,18 @@ chrome_read_html("http://httpbin.org/")
## {xml_document}
## <html>
## [1] <body id="manpage"></body>
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\n<meta http-equiv="content-type" valu ...
## [2] <body id="manpage">\n<a href="http://github.com/kennethreitz/httpbin"><img style="position: absolute; top: 0; rig ...
``` r
chrome_dump_pdf("http://httpbin.org/")
## [0502/094321.911089:INFO:headless_shell.cc(436)] Written to file output.pdf.
```
``` r
chrome_shot("http://httpbin.org/")
## [0502/094257.370837:INFO:headless_shell.cc(436)] Written to file screenshot.png.
## format width height colorspace filesize
## 1 PNG 1600 1200 sRGB 238967
## 1 PNG 1600 1200 sRGB 215680
```
![](screenshot.png)
![screenshot.png](screenshot.png)

0
location.href

17
man/chrome_dump_pdf.Rd

@ -1,21 +1,32 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/read-html.r
% Please edit documentation in R/chrome-pdf.r
\name{chrome_dump_pdf}
\alias{chrome_dump_pdf}
\title{"Print" to PDF}
\usage{
chrome_dump_pdf(url, chrome_bin = Sys.getenv("HEADLESS_CHROME"))
chrome_dump_pdf(url, path = NULL, overwrite = TRUE,
chrome_bin = Sys.getenv("HEADLESS_CHROME"))
}
\arguments{
\item{url}{URL to read from}
\item{path}{path (with optional output filename) for the generated PDF. If \code{NULL},
the file is placed in the current working directory. If \code{overwrite} is \code{FALSE},
the function will ensure a uniquely-named file is created by appending an incrementing
number to the file name, before the extension.}
\item{overwrite}{overwrite existing file? Default: \code{TRUE}}
\item{chrome_bin}{the path to Chrome (auto-set from \code{HEADLESS_CHROME} environment variable)}
}
\value{
output filename (invisibly)
}
\description{
"Print" to PDF
}
\note{
this is a quick version of the function and will overwrite \code{output.pdf} if it exists in CWD
The default Chrome filename is \code{output.pdf}
}
\examples{
chrome_dump_pdf("https://www.r-project.org/")

9
man/chrome_read_html.Rd

@ -2,17 +2,20 @@
% Please edit documentation in R/read-html.r
\name{chrome_read_html}
\alias{chrome_read_html}
\title{Read a URL via headless Chrome and return the renderd \code{<body>} \code{innerHTML} DOM elements}
\title{Read a URL via headless Chrome and return the raw or rendered \code{<body>} \code{innerHTML} DOM elements}
\usage{
chrome_read_html(url, chrome_bin = Sys.getenv("HEADLESS_CHROME"))
chrome_read_html(url, render = TRUE,
chrome_bin = Sys.getenv("HEADLESS_CHROME"))
}
\arguments{
\item{url}{URL to read from}
\item{render}{if \code{TRUE} then return an \code{xml_document}, else the raw HTML (invisibly)}
\item{chrome_bin}{the path to Chrome (auto-set from \code{HEADLESS_CHROME} environment variable)}
}
\description{
Read a URL via headless Chrome and return the renderd \code{<body>} \code{innerHTML} DOM elements
Read a URL via headless Chrome and return the raw or rendered \code{<body>} \code{innerHTML} DOM elements
}
\note{
This only grabs the \code{<body>} \code{innerHTML} contents

15
man/chrome_shot.Rd

@ -1,17 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/read-html.r
% Please edit documentation in R/chrome-shot.r
\name{chrome_shot}
\alias{chrome_shot}
\title{Capture a screenshot}
\usage{
chrome_shot(url, width = NULL, height = NULL,
chrome_bin = Sys.getenv("HEADLESS_CHROME"))
chrome_shot(url, width = NULL, height = NULL, path = NULL,
overwrite = TRUE, chrome_bin = Sys.getenv("HEADLESS_CHROME"))
}
\arguments{
\item{url}{URL to read from}
\item{width, height}{screen size to emulate}
\item{path}{path (with optional output filename) for the generated screenshot. If \code{NULL},
the file is placed in the current working directory. If \code{overwrite} is \code{FALSE},
the function will ensure a uniquely-named file is created by appending an incrementing
number to the file name, before the extension.}
\item{overwrite}{overwrite existing file? Default: \code{TRUE}}
\item{chrome_bin}{the path to Chrome (auto-set from \code{HEADLESS_CHROME} environment variable)}
}
\value{
@ -25,7 +32,7 @@ For the moment, the capture file is in the current working directory and named
A \code{magick} image object is returned.
}
\note{
this is a quick version of the function and will overwrite \code{screenshot.png} if it exists in CWD
The default Chrome filename is \code{screenshot.png}
}
\examples{
chrome_shot("https://www.r-project.org/logo/Rlogo.svg")

8
man/decapitated.Rd

@ -8,15 +8,13 @@
\description{
The 'Chrome' browser \url{https://www.google.com/chrome/} has a headless mode
which can be instrumented programmatically. Tools are provided to perform headless
'Chrome' instrumentation on the command-line and will eventually provide support
for the 'DevTools' instrumentation 'API' or the forthcoming 'phantomjs'-like higher-level
'API' being promised by the development team.
'Chrome' instrumentation on the command-line, including retrieving the javascript-executed
web page, PDF output or screen shot of a URL.
}
\section{Important}{
This pkg will eventually do much under the covers to find the location of the Chrome binary
on all operating systems. For now, you'll need to set an envrionment variable \code{HEADLESS_CHROME} to one of these two values:
You'll need to set an environment variable \code{HEADLESS_CHROME} to one of these two values:
\itemize{
\item Windows(32bit): \code{C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe}
\item Windows(64bit): \code{C:\\Program Files (x86)/Google\\Chrome\\Application\\chrome.exe}

BIN
output.pdf

Binary file not shown.

BIN
screenshot.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 233 KiB

After

Width:  |  Height:  |  Size: 211 KiB

2
tests/test-all.R

@ -1,2 +0,0 @@
library(testthat)
test_check("decapitated")

6
tests/testthat/test-decapitated.R

@ -1,6 +0,0 @@
context("basic functionality")
test_that("we can do something", {
#expect_that(some_function(), is_a("data.frame"))
})
Loading…
Cancel
Save