Browse Source

make httr functions and response objects from HAR entries

master
boB Rudis 3 years ago
parent
commit
4ef5b7a1aa
16 changed files with 333 additions and 36 deletions
  1. +6
    -2
      DESCRIPTION
  2. +13
    -0
      NAMESPACE
  3. +5
    -0
      NEWS.md
  4. +93
    -0
      R/as_req.r
  5. +36
    -0
      R/as_request.r
  6. +26
    -0
      R/content.r
  7. +34
    -21
      R/helpers.r
  8. +5
    -0
      R/splashr-package.R
  9. +6
    -4
      README.Rmd
  10. +10
    -8
      README.md
  11. +18
    -0
      man/as_req.Rd
  12. +29
    -0
      man/as_request.Rd
  13. +1
    -1
      man/get_content_size.Rd
  14. +20
    -0
      man/get_request_type.Rd
  15. +14
    -0
      man/get_request_url.Rd
  16. +17
    -0
      man/get_response_body.Rd

+ 6
- 2
DESCRIPTION View File

@@ -1,7 +1,7 @@
Package: splashr
Type: Package
Title: Tools to Work with the 'Splash' JavaScript Rendering Service
Version: 0.2.0
Version: 0.3.0
Date: 2017-02-14
Encoding: UTF-8
Author: Bob Rudis (bob@rud.is)
@@ -29,6 +29,10 @@ Imports:
xml2,
jsonlite,
magick,
HARtools
stringi,
clipr,
HARtools,
openssl,
lubridate
RoxygenNote: 6.0.0
Remotes: wch/harbor

+ 13
- 0
NAMESPACE View File

@@ -6,15 +6,21 @@ export("%>%")
export(HARviewer)
export(HARviewerOutput)
export(as_har)
export(as_req)
export(as_request)
export(execute_lua)
export(get_body_size)
export(get_content_size)
export(get_content_type)
export(get_headers_size)
export(get_request_type)
export(get_request_url)
export(get_response_body)
export(install_splash)
export(is_binary)
export(is_content_type)
export(is_css)
export(is_get)
export(is_gif)
export(is_html)
export(is_javascript)
@@ -22,6 +28,7 @@ export(is_jpeg)
export(is_json)
export(is_plain)
export(is_png)
export(is_post)
export(is_svg)
export(is_xhr)
export(is_xml)
@@ -55,6 +62,12 @@ importFrom(HARtools,HARviewer)
importFrom(HARtools,HARviewerOutput)
importFrom(HARtools,renderHARviewer)
importFrom(HARtools,writeHAR)
importFrom(clipr,read_clip)
importFrom(jsonlite,fromJSON)
importFrom(lubridate,ymd_hms)
importFrom(openssl,base64_decode)
importFrom(stringi,stri_detect_regex)
importFrom(stringi,stri_split_fixed)
importFrom(stringi,stri_split_regex)
importFrom(xml2,read_html)
importFrom(xml2,url_parse)

+ 5
- 0
NEWS.md View File

@@ -1,3 +1,8 @@
0.3.0

* added `as_req()`
* added `as_request()`

0.2.0

* added `execute`()


+ 93
- 0
R/as_req.r View File

@@ -0,0 +1,93 @@
#' Create an httr function from an HAR request
#'
#' @md
#' @param entry HAR entry
#' @param quiet quiet
#' @param add_clip add clip
#' @export
as_req <- function(entry, quiet=TRUE, add_clip=TRUE) {

req <- entry$request

req$headers <- purrr::map(req$headers, "value") %>%
setNames(map_chr(req$headers, "name"))

ml <- getOption("deparse.max.lines")
options(deparse.max.lines=10000)

template <- "httr::VERB(verb = '%s', url = '%s' %s%s%s%s%s%s)"

hdrs <- enc <- bdy <- ckies <- auth <- verbos <- cfg <- ""

if (length(req$headers) > 0) {

# try to determine encoding
ct_idx <- which(grepl("content-type", names(req$headers), ignore.case=TRUE))
if (length(ct_idx) > 0) {
# retrieve & delete the content type
ct <- req$headers[[ct_idx]]
req$headers[[ct_idx]] <- NULL

if (stringi::stri_detect_regex(ct, "multipart")) {
enc <- ", encode = 'multipart'"
} else if (stringi::stri_detect_regex(ct, "form")) {
enc <- ", encode = 'form'"
} else if (stringi::stri_detect_regex(ct, "json")) {
enc <- ", encode = 'json'"
} else {
enc <- ""
}
}

hdrs <- paste0(capture.output(dput(req$headers, control=NULL)),
collapse="")
hdrs <- sub("^list", ", httr::add_headers", hdrs)

}

if (length(req$data) > 0) {
bdy_bits <- paste0(capture.output(dput(parse_query(req$data), control=NULL)),
collapse="")
bdy <- sprintf(", body = %s", bdy_bits)
}

if (length(req$url_parts$username) > 0) {
auth <- sprintf(", httr::authenticate(user='%s', password='%s')",
req$url_parts$username, req$url_parts$password)
}

if (length(req$verbose) > 0) {
verbos <- ", httr::verbose()"
}

if (length(req$cookies) > 0) {
ckies <- paste0(capture.output(dput(req$cookies, control=NULL)),
collapse="")
ckies <- sub("^list", ", httr::set_cookies", ckies)
}

REQ_URL <- req$url

out <- sprintf(template, toupper(req$method), REQ_URL, auth, verbos, hdrs, ckies, bdy, enc)

# this does a half-decent job formatting the R function text
fil <- tempfile(fileext=".R")
on.exit(unlink(fil))
formatR::tidy_source(text=out, width.cutoff=30, indent=4, file=fil)
tmp <- paste0(readLines(fil), collapse="\n")

if (add_clip) clipr::write_clip(tmp)

if (!quiet) cat(tmp, "\n")

# make a bona fide R function
f <- function() {}
formals(f) <- NULL
environment(f) <- parent.frame()
body(f) <- as.expression(parse(text=tmp))

options(deparse.max.lines=ml)

return(f)

}

+ 36
- 0
R/as_request.r View File

@@ -0,0 +1,36 @@
#' Return a HAR entry response as an httr::response object
#'
#' @param har_entry a HAR object (should contain a response body to be most useful)
#' @export
#' @examples \dontrun{
#' URL <- "http://www.svs.cl/portal/principal/605/w3-propertyvalue-18554.html"
#'
#' splash_local %>%
#' splash_response_body(TRUE) %>%
#' splash_go(URL) %>%
#' splash_wait(2) %>%
#' splash_har() -> har
#'
#' keep(har$log$entries, is_xhr) %>%
#' map(as_request) %>%
#' map(httr::content, as="parsed")
#' }
as_request <- function(har_entry) {

if (length(har_entry$response$content$text) > 0) {
content_body <- openssl::base64_decode(har_entry$response$content$text)
} else {
content_body <- NULL
}

structure(list(
url = har_entry$request$url,
status_code = har_entry$response$status,
date = lubridate::ymd_hms(har_entry$startedDateTime),
headers = setNames(map(har_entry$response$headers, "value"),
map(har_entry$response$headers, "name")) %>%
insensitive(),
content = content_body
), class="response")

}

+ 26
- 0
R/content.r View File

@@ -0,0 +1,26 @@
#' Retrieve size of content | body | headers
#'
#' @param har_resp_obj HAR response object
#' @export
get_content_size <- function(har_resp_obj) {
csize <- har_resp_obj$response$content$size
if (is.null(csize)) return(NA_real_)
return(as.numeric(csize))
}

#' @rdname get_content_size
#' @export
get_body_size <- function(har_resp_obj) {
bsize <- har_resp_obj$response$bodySize
if (is.null(bsize)) return(NA_real_)
return(as.numeric(bsize))
}

#' @rdname get_content_size
#' @export
get_headers_size <- function(har_resp_obj) {
hsize <- har_resp_obj$response$headersSize
if (is.null(hsize)) return(NA_real_)
return(as.numeric(hsize))
}


+ 34
- 21
R/helpers.r View File

@@ -1,3 +1,15 @@
#' Retrieve the body content of a HAR entry
#'
#' @md
#' @param har_resp_obj HAR response object
#' @return A `raw` vector of the content or `NULL`
#' @export
get_response_body <- function(har_resp_obj) {
resp <- har_resp_obj$response$content$text
if (resp == "") return(NULL)
openssl::base64_decode(resp)
}

#' Retrieve or test content type of a HAR request object
#'
#' @param har_resp_obj HAR response object
@@ -46,23 +58,23 @@ is_javascript <- function(har_resp_obj) {

#' @rdname get_content_type
#' @export
is_html <- function(har_resp_obj) { is_content_type(har_resp_obj, type="text/html") }
is_html <- function(har_resp_obj) { is_content_type(har_resp_obj, type="text/html") }

#' @rdname get_content_type
#' @export
is_jpeg <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/jpeg") }
is_jpeg <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/jpeg") }

#' @rdname get_content_type
#' @export
is_png <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/png") }
is_png <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/png") }

#' @rdname get_content_type
#' @export
is_svg <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/svg+xml") }
is_svg <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/svg+xml") }

#' @rdname get_content_type
#' @export
is_gif <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/gif") }
is_gif <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/gif") }

#' @rdname get_content_type
#' @export
@@ -82,29 +94,30 @@ is_xhr <- function(x) {

}

#' Retrieve size of content | body | headers
#' Retrieve request URL
#'
#' @param har_resp_obj HAR response object
#' @export
get_content_size <- function(har_resp_obj) {
csize <- har_resp_obj$response$content$size
if (is.null(csize)) return(NA_real_)
return(as.numeric(csize))
get_request_url <- function(har_resp_obj) {
utype <- har_resp_obj$request$url
if (utype == "") return(NA_character_)
utype
}

#' @rdname get_content_size
#' Retrieve or test request type
#'
#' @param har_resp_obj HAR response object
#' @export
get_body_size <- function(har_resp_obj) {
bsize <- har_resp_obj$response$bodySize
if (is.null(bsize)) return(NA_real_)
return(as.numeric(bsize))
get_request_type <- function(har_resp_obj) {
rtype <- har_resp_obj$request$method
if (rtype == "") return(NA_character_)
rtype
}

#' @rdname get_content_size
#' @rdname get_request_type
#' @export
get_headers_size <- function(har_resp_obj) {
hsize <- har_resp_obj$response$headersSize
if (is.null(hsize)) return(NA_real_)
return(as.numeric(hsize))
}
is_get <- function(har_resp_obj) { get_requet_type(har_resp_obj) == "GET" }

#' @rdname get_request_type
#' @export
is_post <- function(har_resp_obj) { get_requet_type(har_resp_obj) == "POST" }

+ 5
- 0
R/splashr-package.R View File

@@ -15,9 +15,14 @@
#' @docType package
#' @author Bob Rudis (bob@@rud.is)
#' @import purrr httr magick harbor
#' @importFrom stringi stri_split_regex stri_split_fixed stri_detect_regex
#' @importFrom HARtools writeHAR HARviewer renderHARviewer HARviewerOutput
#' @importFrom xml2 read_html url_parse
#' @importFrom jsonlite fromJSON
#' @importFrom openssl base64_decode
#' @importFrom clipr read_clip
#' @importFrom lubridate ymd_hms

NULL

#' splashr exported operators


+ 6
- 4
README.Rmd View File

@@ -49,10 +49,7 @@ The following functions are implemented:
- `start_splash`: Start a Splash server Docker container
- `stop_splash`: Stop a running a Splash server Docker container

Mini-DSL (domain-specific language). These can be used to create a "script" without actually
scripting in Lua. They are a less-powerful/configurable set of calls than what you
can make with a full Lua function but the idea is to have it take care of very common but
simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG image of a site:
Mini-DSL (domain-specific language). These can be used to create a "script" without actually scripting in Lua. They are a less-powerful/configurable set of calls than what you can make with a full Lua function but the idea is to have it take care of very common but simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG image of a site:

- `splash_plugins`: Enable or disable browser plugins (e.g. Flash).
- `splash_images`: Enable/disable images
@@ -63,6 +60,11 @@ simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG
- `splash_html`: Return a HTML snapshot of a current page.
- `splash_png`: Return a screenshot of a current page in PNG format.

`httr` helpers. These help turn various bits of `splashr` objects into `httr`-ish things:

- `as_req`: Turn a HAR response entry into a working `httr` function you can use to make a request with
- `as_request`: Turn a HAR response entry into an `httr` `response`-like object (i.e. you can use `httr::content()` on it)

Helpers:

- `get_body_size`: Retrieve size of content | body | headers


+ 10
- 8
README.md View File

@@ -33,7 +33,7 @@ All you need for this package to work is a running Splash instance. You provide

> 'Splash' <https://github.com/scrapinghub/splash> is a javascript rendering service. It’s a lightweight web browser with an 'HTTP' API, implemented in Python using 'Twisted'and 'QT' [and provides some of the core functionality of the 'RSelenium' or 'seleniumPipes' R packages but with a Java-free footprint]. The (twisted) 'QT' reactor is used to make the sever fully asynchronous allowing to take advantage of 'webkit' concurrency via QT main loop. Some of Splash features include the ability to process multiple webpages in parallel; retrieving HTML results and/or take screenshots; disabling images or use Adblock Plus rules to make rendering faster; executing custom JavaScript in page context; getting detailed rendering info in HAR format.

The following functions are implemented:
The following functions are implemented:

- `render_html`: Return the HTML of the javascript-rendered page.
- `render_file`: Return the HTML or image (png) of the javascript-rendered page in a local file
@@ -46,10 +46,7 @@ The following functions are implemented:
- `start_splash`: Start a Splash server Docker container
- `stop_splash`: Stop a running a Splash server Docker container

Mini-DSL (domain-specific language). These can be used to create a "script" without actually
scripting in Lua. They are a less-powerful/configurable set of calls than what you
can make with a full Lua function but the idea is to have it take care of very common but
simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG image of a site:
Mini-DSL (domain-specific language). These can be used to create a "script" without actually scripting in Lua. They are a less-powerful/configurable set of calls than what you can make with a full Lua function but the idea is to have it take care of very common but simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG image of a site:

- `splash_plugins`: Enable or disable browser plugins (e.g. Flash).
- `splash_images`: Enable/disable images
@@ -60,7 +57,12 @@ simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG
- `splash_html`: Return a HTML snapshot of a current page.
- `splash_png`: Return a screenshot of a current page in PNG format.

Helpers:
`httr` helpers. These help turn various bits of `splashr` objects into `httr`-ish things:

- `as_req`: Turn a HAR response entry into a working `httr` function you can use to make a request with
- `as_request`: Turn a HAR response entry into an `httr` `response`-like object (i.e. you can use `httr::content()` on it)

Helpers:

- `get_body_size`: Retrieve size of content | body | headers
- `get_content_sie`: Retrieve size of content | body | headers
@@ -119,7 +121,7 @@ library(tidyverse)
packageVersion("splashr")
```

## [1] '0.2.0'
## [1] '0.3.0'

``` r
splash("splash", 8050L) %>%
@@ -292,7 +294,7 @@ library(testthat)
date()
```

## [1] "Tue Feb 14 09:02:35 2017"
## [1] "Tue Feb 15 09:02:35 2017"

``` r
test_dir("tests/")


+ 18
- 0
man/as_req.Rd View File

@@ -0,0 +1,18 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/as_req.r
\name{as_req}
\alias{as_req}
\title{Create an httr function from an HAR request}
\usage{
as_req(entry, quiet = TRUE, add_clip = TRUE)
}
\arguments{
\item{entry}{HAR entry}

\item{quiet}{quiet}

\item{add_clip}{add clip}
}
\description{
Create an httr function from an HAR request
}

+ 29
- 0
man/as_request.Rd View File

@@ -0,0 +1,29 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/as_request.r
\name{as_request}
\alias{as_request}
\title{Return a HAR entry response as an httr::response object}
\usage{
as_request(har_entry)
}
\arguments{
\item{har_entry}{a HAR object (should contain a response body to be most useful)}
}
\description{
Return a HAR entry response as an httr::response object
}
\examples{
\dontrun{
URL <- "http://www.svs.cl/portal/principal/605/w3-propertyvalue-18554.html"

splash_local \%>\%
splash_response_body(TRUE) \%>\%
splash_go(URL) \%>\%
splash_wait(2) \%>\%
splash_har() -> har

keep(har$log$entries, is_xhr) \%>\%
map(as_request) \%>\%
map(httr::content, as="parsed")
}
}

+ 1
- 1
man/get_content_size.Rd View File

@@ -1,5 +1,5 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/helpers.r
% Please edit documentation in R/content.r
\name{get_content_size}
\alias{get_content_size}
\alias{get_body_size}


+ 20
- 0
man/get_request_type.Rd View File

@@ -0,0 +1,20 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/helpers.r
\name{get_request_type}
\alias{get_request_type}
\alias{is_get}
\alias{is_post}
\title{Retrieve or test request type}
\usage{
get_request_type(har_resp_obj)

is_get(har_resp_obj)

is_post(har_resp_obj)
}
\arguments{
\item{har_resp_obj}{HAR response object}
}
\description{
Retrieve or test request type
}

+ 14
- 0
man/get_request_url.Rd View File

@@ -0,0 +1,14 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/helpers.r
\name{get_request_url}
\alias{get_request_url}
\title{Retrieve request URL}
\usage{
get_request_url(har_resp_obj)
}
\arguments{
\item{har_resp_obj}{HAR response object}
}
\description{
Retrieve request URL
}

+ 17
- 0
man/get_response_body.Rd View File

@@ -0,0 +1,17 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/helpers.r
\name{get_response_body}
\alias{get_response_body}
\title{Retrieve the body content of a HAR entry}
\usage{
get_response_body(har_resp_obj)
}
\arguments{
\item{har_resp_obj}{HAR response object}
}
\value{
A \code{raw} vector of the content or \code{NULL}
}
\description{
Retrieve the body content of a HAR entry
}

Loading…
Cancel
Save