diff --git a/DESCRIPTION b/DESCRIPTION index 6523e59..15320c1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: splashr Type: Package Title: Tools to Work with the 'Splash' JavaScript Rendering Service -Version: 0.2.0 +Version: 0.3.0 Date: 2017-02-14 Encoding: UTF-8 Author: Bob Rudis (bob@rud.is) @@ -29,6 +29,10 @@ Imports: xml2, jsonlite, magick, - HARtools + stringi, + clipr, + HARtools, + openssl, + lubridate RoxygenNote: 6.0.0 Remotes: wch/harbor diff --git a/NAMESPACE b/NAMESPACE index 3e8d2d7..c596c37 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,15 +6,21 @@ export("%>%") export(HARviewer) export(HARviewerOutput) export(as_har) +export(as_req) +export(as_request) export(execute_lua) export(get_body_size) export(get_content_size) export(get_content_type) export(get_headers_size) +export(get_request_type) +export(get_request_url) +export(get_response_body) export(install_splash) export(is_binary) export(is_content_type) export(is_css) +export(is_get) export(is_gif) export(is_html) export(is_javascript) @@ -22,6 +28,7 @@ export(is_jpeg) export(is_json) export(is_plain) export(is_png) +export(is_post) export(is_svg) export(is_xhr) export(is_xml) @@ -55,6 +62,12 @@ importFrom(HARtools,HARviewer) importFrom(HARtools,HARviewerOutput) importFrom(HARtools,renderHARviewer) importFrom(HARtools,writeHAR) +importFrom(clipr,read_clip) importFrom(jsonlite,fromJSON) +importFrom(lubridate,ymd_hms) +importFrom(openssl,base64_decode) +importFrom(stringi,stri_detect_regex) +importFrom(stringi,stri_split_fixed) +importFrom(stringi,stri_split_regex) importFrom(xml2,read_html) importFrom(xml2,url_parse) diff --git a/NEWS.md b/NEWS.md index fddf97e..bec1a40 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +0.3.0 + +* added `as_req()` +* added `as_request()` + 0.2.0 * added `execute`() diff --git a/R/as_req.r b/R/as_req.r new file mode 100644 index 0000000..25c9b69 --- /dev/null +++ b/R/as_req.r @@ -0,0 +1,93 @@ +#' Create an httr function from an HAR request +#' +#' @md +#' @param entry HAR entry +#' @param quiet quiet +#' @param add_clip add clip +#' @export +as_req <- function(entry, quiet=TRUE, add_clip=TRUE) { + + req <- entry$request + + req$headers <- purrr::map(req$headers, "value") %>% + setNames(map_chr(req$headers, "name")) + + ml <- getOption("deparse.max.lines") + options(deparse.max.lines=10000) + + template <- "httr::VERB(verb = '%s', url = '%s' %s%s%s%s%s%s)" + + hdrs <- enc <- bdy <- ckies <- auth <- verbos <- cfg <- "" + + if (length(req$headers) > 0) { + + # try to determine encoding + ct_idx <- which(grepl("content-type", names(req$headers), ignore.case=TRUE)) + if (length(ct_idx) > 0) { + # retrieve & delete the content type + ct <- req$headers[[ct_idx]] + req$headers[[ct_idx]] <- NULL + + if (stringi::stri_detect_regex(ct, "multipart")) { + enc <- ", encode = 'multipart'" + } else if (stringi::stri_detect_regex(ct, "form")) { + enc <- ", encode = 'form'" + } else if (stringi::stri_detect_regex(ct, "json")) { + enc <- ", encode = 'json'" + } else { + enc <- "" + } + } + + hdrs <- paste0(capture.output(dput(req$headers, control=NULL)), + collapse="") + hdrs <- sub("^list", ", httr::add_headers", hdrs) + + } + + if (length(req$data) > 0) { + bdy_bits <- paste0(capture.output(dput(parse_query(req$data), control=NULL)), + collapse="") + bdy <- sprintf(", body = %s", bdy_bits) + } + + if (length(req$url_parts$username) > 0) { + auth <- sprintf(", httr::authenticate(user='%s', password='%s')", + req$url_parts$username, req$url_parts$password) + } + + if (length(req$verbose) > 0) { + verbos <- ", httr::verbose()" + } + + if (length(req$cookies) > 0) { + ckies <- paste0(capture.output(dput(req$cookies, control=NULL)), + collapse="") + ckies <- sub("^list", ", httr::set_cookies", ckies) + } + + REQ_URL <- req$url + + out <- sprintf(template, toupper(req$method), REQ_URL, auth, verbos, hdrs, ckies, bdy, enc) + + # this does a half-decent job formatting the R function text + fil <- tempfile(fileext=".R") + on.exit(unlink(fil)) + formatR::tidy_source(text=out, width.cutoff=30, indent=4, file=fil) + tmp <- paste0(readLines(fil), collapse="\n") + + if (add_clip) clipr::write_clip(tmp) + + if (!quiet) cat(tmp, "\n") + + # make a bona fide R function + f <- function() {} + formals(f) <- NULL + environment(f) <- parent.frame() + body(f) <- as.expression(parse(text=tmp)) + + options(deparse.max.lines=ml) + + return(f) + +} diff --git a/R/as_request.r b/R/as_request.r new file mode 100644 index 0000000..eab99b9 --- /dev/null +++ b/R/as_request.r @@ -0,0 +1,36 @@ +#' Return a HAR entry response as an httr::response object +#' +#' @param har_entry a HAR object (should contain a response body to be most useful) +#' @export +#' @examples \dontrun{ +#' URL <- "http://www.svs.cl/portal/principal/605/w3-propertyvalue-18554.html" +#' +#' splash_local %>% +#' splash_response_body(TRUE) %>% +#' splash_go(URL) %>% +#' splash_wait(2) %>% +#' splash_har() -> har +#' +#' keep(har$log$entries, is_xhr) %>% +#' map(as_request) %>% +#' map(httr::content, as="parsed") +#' } +as_request <- function(har_entry) { + + if (length(har_entry$response$content$text) > 0) { + content_body <- openssl::base64_decode(har_entry$response$content$text) + } else { + content_body <- NULL + } + + structure(list( + url = har_entry$request$url, + status_code = har_entry$response$status, + date = lubridate::ymd_hms(har_entry$startedDateTime), + headers = setNames(map(har_entry$response$headers, "value"), + map(har_entry$response$headers, "name")) %>% + insensitive(), + content = content_body + ), class="response") + +} diff --git a/R/content.r b/R/content.r new file mode 100644 index 0000000..c72d65b --- /dev/null +++ b/R/content.r @@ -0,0 +1,26 @@ +#' Retrieve size of content | body | headers +#' +#' @param har_resp_obj HAR response object +#' @export +get_content_size <- function(har_resp_obj) { + csize <- har_resp_obj$response$content$size + if (is.null(csize)) return(NA_real_) + return(as.numeric(csize)) +} + +#' @rdname get_content_size +#' @export +get_body_size <- function(har_resp_obj) { + bsize <- har_resp_obj$response$bodySize + if (is.null(bsize)) return(NA_real_) + return(as.numeric(bsize)) +} + +#' @rdname get_content_size +#' @export +get_headers_size <- function(har_resp_obj) { + hsize <- har_resp_obj$response$headersSize + if (is.null(hsize)) return(NA_real_) + return(as.numeric(hsize)) +} + diff --git a/R/helpers.r b/R/helpers.r index a0a34a1..8c87f00 100644 --- a/R/helpers.r +++ b/R/helpers.r @@ -1,3 +1,15 @@ +#' Retrieve the body content of a HAR entry +#' +#' @md +#' @param har_resp_obj HAR response object +#' @return A `raw` vector of the content or `NULL` +#' @export +get_response_body <- function(har_resp_obj) { + resp <- har_resp_obj$response$content$text + if (resp == "") return(NULL) + openssl::base64_decode(resp) +} + #' Retrieve or test content type of a HAR request object #' #' @param har_resp_obj HAR response object @@ -46,23 +58,23 @@ is_javascript <- function(har_resp_obj) { #' @rdname get_content_type #' @export -is_html <- function(har_resp_obj) { is_content_type(har_resp_obj, type="text/html") } +is_html <- function(har_resp_obj) { is_content_type(har_resp_obj, type="text/html") } #' @rdname get_content_type #' @export -is_jpeg <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/jpeg") } +is_jpeg <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/jpeg") } #' @rdname get_content_type #' @export -is_png <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/png") } +is_png <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/png") } #' @rdname get_content_type #' @export -is_svg <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/svg+xml") } +is_svg <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/svg+xml") } #' @rdname get_content_type #' @export -is_gif <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/gif") } +is_gif <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/gif") } #' @rdname get_content_type #' @export @@ -82,29 +94,30 @@ is_xhr <- function(x) { } -#' Retrieve size of content | body | headers +#' Retrieve request URL #' #' @param har_resp_obj HAR response object #' @export -get_content_size <- function(har_resp_obj) { - csize <- har_resp_obj$response$content$size - if (is.null(csize)) return(NA_real_) - return(as.numeric(csize)) +get_request_url <- function(har_resp_obj) { + utype <- har_resp_obj$request$url + if (utype == "") return(NA_character_) + utype } -#' @rdname get_content_size +#' Retrieve or test request type +#' +#' @param har_resp_obj HAR response object #' @export -get_body_size <- function(har_resp_obj) { - bsize <- har_resp_obj$response$bodySize - if (is.null(bsize)) return(NA_real_) - return(as.numeric(bsize)) +get_request_type <- function(har_resp_obj) { + rtype <- har_resp_obj$request$method + if (rtype == "") return(NA_character_) + rtype } -#' @rdname get_content_size +#' @rdname get_request_type #' @export -get_headers_size <- function(har_resp_obj) { - hsize <- har_resp_obj$response$headersSize - if (is.null(hsize)) return(NA_real_) - return(as.numeric(hsize)) -} +is_get <- function(har_resp_obj) { get_requet_type(har_resp_obj) == "GET" } +#' @rdname get_request_type +#' @export +is_post <- function(har_resp_obj) { get_requet_type(har_resp_obj) == "POST" } diff --git a/R/splashr-package.R b/R/splashr-package.R index 0c3c17f..cf009c9 100644 --- a/R/splashr-package.R +++ b/R/splashr-package.R @@ -15,9 +15,14 @@ #' @docType package #' @author Bob Rudis (bob@@rud.is) #' @import purrr httr magick harbor +#' @importFrom stringi stri_split_regex stri_split_fixed stri_detect_regex #' @importFrom HARtools writeHAR HARviewer renderHARviewer HARviewerOutput #' @importFrom xml2 read_html url_parse #' @importFrom jsonlite fromJSON +#' @importFrom openssl base64_decode +#' @importFrom clipr read_clip +#' @importFrom lubridate ymd_hms + NULL #' splashr exported operators diff --git a/README.Rmd b/README.Rmd index 16df239..cd6b574 100644 --- a/README.Rmd +++ b/README.Rmd @@ -49,10 +49,7 @@ The following functions are implemented: - `start_splash`: Start a Splash server Docker container - `stop_splash`: Stop a running a Splash server Docker container -Mini-DSL (domain-specific language). These can be used to create a "script" without actually -scripting in Lua. They are a less-powerful/configurable set of calls than what you -can make with a full Lua function but the idea is to have it take care of very common but -simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG image of a site: +Mini-DSL (domain-specific language). These can be used to create a "script" without actually scripting in Lua. They are a less-powerful/configurable set of calls than what you can make with a full Lua function but the idea is to have it take care of very common but simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG image of a site: - `splash_plugins`: Enable or disable browser plugins (e.g. Flash). - `splash_images`: Enable/disable images @@ -63,6 +60,11 @@ simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG - `splash_html`: Return a HTML snapshot of a current page. - `splash_png`: Return a screenshot of a current page in PNG format. +`httr` helpers. These help turn various bits of `splashr` objects into `httr`-ish things: + +- `as_req`: Turn a HAR response entry into a working `httr` function you can use to make a request with +- `as_request`: Turn a HAR response entry into an `httr` `response`-like object (i.e. you can use `httr::content()` on it) + Helpers: - `get_body_size`: Retrieve size of content | body | headers diff --git a/README.md b/README.md index dd0cd52..5acb24e 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ All you need for this package to work is a running Splash instance. You provide > 'Splash' is a javascript rendering service. It’s a lightweight web browser with an 'HTTP' API, implemented in Python using 'Twisted'and 'QT' [and provides some of the core functionality of the 'RSelenium' or 'seleniumPipes' R packages but with a Java-free footprint]. The (twisted) 'QT' reactor is used to make the sever fully asynchronous allowing to take advantage of 'webkit' concurrency via QT main loop. Some of Splash features include the ability to process multiple webpages in parallel; retrieving HTML results and/or take screenshots; disabling images or use Adblock Plus rules to make rendering faster; executing custom JavaScript in page context; getting detailed rendering info in HAR format. -The following functions are implemented: +The following functions are implemented: - `render_html`: Return the HTML of the javascript-rendered page. - `render_file`: Return the HTML or image (png) of the javascript-rendered page in a local file @@ -46,10 +46,7 @@ The following functions are implemented: - `start_splash`: Start a Splash server Docker container - `stop_splash`: Stop a running a Splash server Docker container -Mini-DSL (domain-specific language). These can be used to create a "script" without actually -scripting in Lua. They are a less-powerful/configurable set of calls than what you -can make with a full Lua function but the idea is to have it take care of very common but -simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG image of a site: +Mini-DSL (domain-specific language). These can be used to create a "script" without actually scripting in Lua. They are a less-powerful/configurable set of calls than what you can make with a full Lua function but the idea is to have it take care of very common but simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG image of a site: - `splash_plugins`: Enable or disable browser plugins (e.g. Flash). - `splash_images`: Enable/disable images @@ -60,7 +57,12 @@ simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG - `splash_html`: Return a HTML snapshot of a current page. - `splash_png`: Return a screenshot of a current page in PNG format. -Helpers: +`httr` helpers. These help turn various bits of `splashr` objects into `httr`-ish things: + +- `as_req`: Turn a HAR response entry into a working `httr` function you can use to make a request with +- `as_request`: Turn a HAR response entry into an `httr` `response`-like object (i.e. you can use `httr::content()` on it) + +Helpers: - `get_body_size`: Retrieve size of content | body | headers - `get_content_sie`: Retrieve size of content | body | headers @@ -119,7 +121,7 @@ library(tidyverse) packageVersion("splashr") ``` - ## [1] '0.2.0' + ## [1] '0.3.0' ``` r splash("splash", 8050L) %>% @@ -292,7 +294,7 @@ library(testthat) date() ``` - ## [1] "Tue Feb 14 09:02:35 2017" + ## [1] "Tue Feb 15 09:02:35 2017" ``` r test_dir("tests/") diff --git a/man/as_req.Rd b/man/as_req.Rd new file mode 100644 index 0000000..0d3e90a --- /dev/null +++ b/man/as_req.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/as_req.r +\name{as_req} +\alias{as_req} +\title{Create an httr function from an HAR request} +\usage{ +as_req(entry, quiet = TRUE, add_clip = TRUE) +} +\arguments{ +\item{entry}{HAR entry} + +\item{quiet}{quiet} + +\item{add_clip}{add clip} +} +\description{ +Create an httr function from an HAR request +} diff --git a/man/as_request.Rd b/man/as_request.Rd new file mode 100644 index 0000000..d9ffa64 --- /dev/null +++ b/man/as_request.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/as_request.r +\name{as_request} +\alias{as_request} +\title{Return a HAR entry response as an httr::response object} +\usage{ +as_request(har_entry) +} +\arguments{ +\item{har_entry}{a HAR object (should contain a response body to be most useful)} +} +\description{ +Return a HAR entry response as an httr::response object +} +\examples{ +\dontrun{ +URL <- "http://www.svs.cl/portal/principal/605/w3-propertyvalue-18554.html" + +splash_local \%>\% + splash_response_body(TRUE) \%>\% + splash_go(URL) \%>\% + splash_wait(2) \%>\% + splash_har() -> har + +keep(har$log$entries, is_xhr) \%>\% + map(as_request) \%>\% + map(httr::content, as="parsed") +} +} diff --git a/man/get_content_size.Rd b/man/get_content_size.Rd index b101942..e63943f 100644 --- a/man/get_content_size.Rd +++ b/man/get_content_size.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helpers.r +% Please edit documentation in R/content.r \name{get_content_size} \alias{get_content_size} \alias{get_body_size} diff --git a/man/get_request_type.Rd b/man/get_request_type.Rd new file mode 100644 index 0000000..242a44c --- /dev/null +++ b/man/get_request_type.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helpers.r +\name{get_request_type} +\alias{get_request_type} +\alias{is_get} +\alias{is_post} +\title{Retrieve or test request type} +\usage{ +get_request_type(har_resp_obj) + +is_get(har_resp_obj) + +is_post(har_resp_obj) +} +\arguments{ +\item{har_resp_obj}{HAR response object} +} +\description{ +Retrieve or test request type +} diff --git a/man/get_request_url.Rd b/man/get_request_url.Rd new file mode 100644 index 0000000..6865177 --- /dev/null +++ b/man/get_request_url.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helpers.r +\name{get_request_url} +\alias{get_request_url} +\title{Retrieve request URL} +\usage{ +get_request_url(har_resp_obj) +} +\arguments{ +\item{har_resp_obj}{HAR response object} +} +\description{ +Retrieve request URL +} diff --git a/man/get_response_body.Rd b/man/get_response_body.Rd new file mode 100644 index 0000000..efc0ed1 --- /dev/null +++ b/man/get_response_body.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helpers.r +\name{get_response_body} +\alias{get_response_body} +\title{Retrieve the body content of a HAR entry} +\usage{ +get_response_body(har_resp_obj) +} +\arguments{ +\item{har_resp_obj}{HAR response object} +} +\value{ +A \code{raw} vector of the content or \code{NULL} +} +\description{ +Retrieve the body content of a HAR entry +}