瀏覽代碼

make httr functions and response objects from HAR entries

master
boB Rudis 7 年前
父節點
當前提交
4ef5b7a1aa
  1. 8
      DESCRIPTION
  2. 13
      NAMESPACE
  3. 5
      NEWS.md
  4. 93
      R/as_req.r
  5. 36
      R/as_request.r
  6. 26
      R/content.r
  7. 55
      R/helpers.r
  8. 5
      R/splashr-package.R
  9. 10
      README.Rmd
  10. 18
      README.md
  11. 18
      man/as_req.Rd
  12. 29
      man/as_request.Rd
  13. 2
      man/get_content_size.Rd
  14. 20
      man/get_request_type.Rd
  15. 14
      man/get_request_url.Rd
  16. 17
      man/get_response_body.Rd

8
DESCRIPTION

@ -1,7 +1,7 @@
Package: splashr
Type: Package
Title: Tools to Work with the 'Splash' JavaScript Rendering Service
Version: 0.2.0
Version: 0.3.0
Date: 2017-02-14
Encoding: UTF-8
Author: Bob Rudis (bob@rud.is)
@ -29,6 +29,10 @@ Imports:
xml2,
jsonlite,
magick,
HARtools
stringi,
clipr,
HARtools,
openssl,
lubridate
RoxygenNote: 6.0.0
Remotes: wch/harbor

13
NAMESPACE

@ -6,15 +6,21 @@ export("%>%")
export(HARviewer)
export(HARviewerOutput)
export(as_har)
export(as_req)
export(as_request)
export(execute_lua)
export(get_body_size)
export(get_content_size)
export(get_content_type)
export(get_headers_size)
export(get_request_type)
export(get_request_url)
export(get_response_body)
export(install_splash)
export(is_binary)
export(is_content_type)
export(is_css)
export(is_get)
export(is_gif)
export(is_html)
export(is_javascript)
@ -22,6 +28,7 @@ export(is_jpeg)
export(is_json)
export(is_plain)
export(is_png)
export(is_post)
export(is_svg)
export(is_xhr)
export(is_xml)
@ -55,6 +62,12 @@ importFrom(HARtools,HARviewer)
importFrom(HARtools,HARviewerOutput)
importFrom(HARtools,renderHARviewer)
importFrom(HARtools,writeHAR)
importFrom(clipr,read_clip)
importFrom(jsonlite,fromJSON)
importFrom(lubridate,ymd_hms)
importFrom(openssl,base64_decode)
importFrom(stringi,stri_detect_regex)
importFrom(stringi,stri_split_fixed)
importFrom(stringi,stri_split_regex)
importFrom(xml2,read_html)
importFrom(xml2,url_parse)

5
NEWS.md

@ -1,3 +1,8 @@
0.3.0
* added `as_req()`
* added `as_request()`
0.2.0
* added `execute`()

93
R/as_req.r

@ -0,0 +1,93 @@
#' Create an httr function from an HAR request
#'
#' @md
#' @param entry HAR entry
#' @param quiet quiet
#' @param add_clip add clip
#' @export
as_req <- function(entry, quiet=TRUE, add_clip=TRUE) {
req <- entry$request
req$headers <- purrr::map(req$headers, "value") %>%
setNames(map_chr(req$headers, "name"))
ml <- getOption("deparse.max.lines")
options(deparse.max.lines=10000)
template <- "httr::VERB(verb = '%s', url = '%s' %s%s%s%s%s%s)"
hdrs <- enc <- bdy <- ckies <- auth <- verbos <- cfg <- ""
if (length(req$headers) > 0) {
# try to determine encoding
ct_idx <- which(grepl("content-type", names(req$headers), ignore.case=TRUE))
if (length(ct_idx) > 0) {
# retrieve & delete the content type
ct <- req$headers[[ct_idx]]
req$headers[[ct_idx]] <- NULL
if (stringi::stri_detect_regex(ct, "multipart")) {
enc <- ", encode = 'multipart'"
} else if (stringi::stri_detect_regex(ct, "form")) {
enc <- ", encode = 'form'"
} else if (stringi::stri_detect_regex(ct, "json")) {
enc <- ", encode = 'json'"
} else {
enc <- ""
}
}
hdrs <- paste0(capture.output(dput(req$headers, control=NULL)),
collapse="")
hdrs <- sub("^list", ", httr::add_headers", hdrs)
}
if (length(req$data) > 0) {
bdy_bits <- paste0(capture.output(dput(parse_query(req$data), control=NULL)),
collapse="")
bdy <- sprintf(", body = %s", bdy_bits)
}
if (length(req$url_parts$username) > 0) {
auth <- sprintf(", httr::authenticate(user='%s', password='%s')",
req$url_parts$username, req$url_parts$password)
}
if (length(req$verbose) > 0) {
verbos <- ", httr::verbose()"
}
if (length(req$cookies) > 0) {
ckies <- paste0(capture.output(dput(req$cookies, control=NULL)),
collapse="")
ckies <- sub("^list", ", httr::set_cookies", ckies)
}
REQ_URL <- req$url
out <- sprintf(template, toupper(req$method), REQ_URL, auth, verbos, hdrs, ckies, bdy, enc)
# this does a half-decent job formatting the R function text
fil <- tempfile(fileext=".R")
on.exit(unlink(fil))
formatR::tidy_source(text=out, width.cutoff=30, indent=4, file=fil)
tmp <- paste0(readLines(fil), collapse="\n")
if (add_clip) clipr::write_clip(tmp)
if (!quiet) cat(tmp, "\n")
# make a bona fide R function
f <- function() {}
formals(f) <- NULL
environment(f) <- parent.frame()
body(f) <- as.expression(parse(text=tmp))
options(deparse.max.lines=ml)
return(f)
}

36
R/as_request.r

@ -0,0 +1,36 @@
#' Return a HAR entry response as an httr::response object
#'
#' @param har_entry a HAR object (should contain a response body to be most useful)
#' @export
#' @examples \dontrun{
#' URL <- "http://www.svs.cl/portal/principal/605/w3-propertyvalue-18554.html"
#'
#' splash_local %>%
#' splash_response_body(TRUE) %>%
#' splash_go(URL) %>%
#' splash_wait(2) %>%
#' splash_har() -> har
#'
#' keep(har$log$entries, is_xhr) %>%
#' map(as_request) %>%
#' map(httr::content, as="parsed")
#' }
as_request <- function(har_entry) {
if (length(har_entry$response$content$text) > 0) {
content_body <- openssl::base64_decode(har_entry$response$content$text)
} else {
content_body <- NULL
}
structure(list(
url = har_entry$request$url,
status_code = har_entry$response$status,
date = lubridate::ymd_hms(har_entry$startedDateTime),
headers = setNames(map(har_entry$response$headers, "value"),
map(har_entry$response$headers, "name")) %>%
insensitive(),
content = content_body
), class="response")
}

26
R/content.r

@ -0,0 +1,26 @@
#' Retrieve size of content | body | headers
#'
#' @param har_resp_obj HAR response object
#' @export
get_content_size <- function(har_resp_obj) {
csize <- har_resp_obj$response$content$size
if (is.null(csize)) return(NA_real_)
return(as.numeric(csize))
}
#' @rdname get_content_size
#' @export
get_body_size <- function(har_resp_obj) {
bsize <- har_resp_obj$response$bodySize
if (is.null(bsize)) return(NA_real_)
return(as.numeric(bsize))
}
#' @rdname get_content_size
#' @export
get_headers_size <- function(har_resp_obj) {
hsize <- har_resp_obj$response$headersSize
if (is.null(hsize)) return(NA_real_)
return(as.numeric(hsize))
}

55
R/helpers.r

@ -1,3 +1,15 @@
#' Retrieve the body content of a HAR entry
#'
#' @md
#' @param har_resp_obj HAR response object
#' @return A `raw` vector of the content or `NULL`
#' @export
get_response_body <- function(har_resp_obj) {
resp <- har_resp_obj$response$content$text
if (resp == "") return(NULL)
openssl::base64_decode(resp)
}
#' Retrieve or test content type of a HAR request object
#'
#' @param har_resp_obj HAR response object
@ -46,23 +58,23 @@ is_javascript <- function(har_resp_obj) {
#' @rdname get_content_type
#' @export
is_html <- function(har_resp_obj) { is_content_type(har_resp_obj, type="text/html") }
is_html <- function(har_resp_obj) { is_content_type(har_resp_obj, type="text/html") }
#' @rdname get_content_type
#' @export
is_jpeg <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/jpeg") }
is_jpeg <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/jpeg") }
#' @rdname get_content_type
#' @export
is_png <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/png") }
is_png <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/png") }
#' @rdname get_content_type
#' @export
is_svg <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/svg+xml") }
is_svg <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/svg+xml") }
#' @rdname get_content_type
#' @export
is_gif <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/gif") }
is_gif <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/gif") }
#' @rdname get_content_type
#' @export
@ -82,29 +94,30 @@ is_xhr <- function(x) {
}
#' Retrieve size of content | body | headers
#' Retrieve request URL
#'
#' @param har_resp_obj HAR response object
#' @export
get_content_size <- function(har_resp_obj) {
csize <- har_resp_obj$response$content$size
if (is.null(csize)) return(NA_real_)
return(as.numeric(csize))
get_request_url <- function(har_resp_obj) {
utype <- har_resp_obj$request$url
if (utype == "") return(NA_character_)
utype
}
#' @rdname get_content_size
#' Retrieve or test request type
#'
#' @param har_resp_obj HAR response object
#' @export
get_body_size <- function(har_resp_obj) {
bsize <- har_resp_obj$response$bodySize
if (is.null(bsize)) return(NA_real_)
return(as.numeric(bsize))
get_request_type <- function(har_resp_obj) {
rtype <- har_resp_obj$request$method
if (rtype == "") return(NA_character_)
rtype
}
#' @rdname get_content_size
#' @rdname get_request_type
#' @export
get_headers_size <- function(har_resp_obj) {
hsize <- har_resp_obj$response$headersSize
if (is.null(hsize)) return(NA_real_)
return(as.numeric(hsize))
}
is_get <- function(har_resp_obj) { get_requet_type(har_resp_obj) == "GET" }
#' @rdname get_request_type
#' @export
is_post <- function(har_resp_obj) { get_requet_type(har_resp_obj) == "POST" }

5
R/splashr-package.R

@ -15,9 +15,14 @@
#' @docType package
#' @author Bob Rudis (bob@@rud.is)
#' @import purrr httr magick harbor
#' @importFrom stringi stri_split_regex stri_split_fixed stri_detect_regex
#' @importFrom HARtools writeHAR HARviewer renderHARviewer HARviewerOutput
#' @importFrom xml2 read_html url_parse
#' @importFrom jsonlite fromJSON
#' @importFrom openssl base64_decode
#' @importFrom clipr read_clip
#' @importFrom lubridate ymd_hms
NULL
#' splashr exported operators

10
README.Rmd

@ -49,10 +49,7 @@ The following functions are implemented:
- `start_splash`: Start a Splash server Docker container
- `stop_splash`: Stop a running a Splash server Docker container
Mini-DSL (domain-specific language). These can be used to create a "script" without actually
scripting in Lua. They are a less-powerful/configurable set of calls than what you
can make with a full Lua function but the idea is to have it take care of very common but
simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG image of a site:
Mini-DSL (domain-specific language). These can be used to create a "script" without actually scripting in Lua. They are a less-powerful/configurable set of calls than what you can make with a full Lua function but the idea is to have it take care of very common but simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG image of a site:
- `splash_plugins`: Enable or disable browser plugins (e.g. Flash).
- `splash_images`: Enable/disable images
@ -63,6 +60,11 @@ simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG
- `splash_html`: Return a HTML snapshot of a current page.
- `splash_png`: Return a screenshot of a current page in PNG format.
`httr` helpers. These help turn various bits of `splashr` objects into `httr`-ish things:
- `as_req`: Turn a HAR response entry into a working `httr` function you can use to make a request with
- `as_request`: Turn a HAR response entry into an `httr` `response`-like object (i.e. you can use `httr::content()` on it)
Helpers:
- `get_body_size`: Retrieve size of content | body | headers

18
README.md

@ -33,7 +33,7 @@ All you need for this package to work is a running Splash instance. You provide
> 'Splash' <https://github.com/scrapinghub/splash> is a javascript rendering service. It’s a lightweight web browser with an 'HTTP' API, implemented in Python using 'Twisted'and 'QT' [and provides some of the core functionality of the 'RSelenium' or 'seleniumPipes' R packages but with a Java-free footprint]. The (twisted) 'QT' reactor is used to make the sever fully asynchronous allowing to take advantage of 'webkit' concurrency via QT main loop. Some of Splash features include the ability to process multiple webpages in parallel; retrieving HTML results and/or take screenshots; disabling images or use Adblock Plus rules to make rendering faster; executing custom JavaScript in page context; getting detailed rendering info in HAR format.
The following functions are implemented:
The following functions are implemented:
- `render_html`: Return the HTML of the javascript-rendered page.
- `render_file`: Return the HTML or image (png) of the javascript-rendered page in a local file
@ -46,10 +46,7 @@ The following functions are implemented:
- `start_splash`: Start a Splash server Docker container
- `stop_splash`: Stop a running a Splash server Docker container
Mini-DSL (domain-specific language). These can be used to create a "script" without actually
scripting in Lua. They are a less-powerful/configurable set of calls than what you
can make with a full Lua function but the idea is to have it take care of very common but
simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG image of a site:
Mini-DSL (domain-specific language). These can be used to create a "script" without actually scripting in Lua. They are a less-powerful/configurable set of calls than what you can make with a full Lua function but the idea is to have it take care of very common but simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG image of a site:
- `splash_plugins`: Enable or disable browser plugins (e.g. Flash).
- `splash_images`: Enable/disable images
@ -60,7 +57,12 @@ simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG
- `splash_html`: Return a HTML snapshot of a current page.
- `splash_png`: Return a screenshot of a current page in PNG format.
Helpers:
`httr` helpers. These help turn various bits of `splashr` objects into `httr`-ish things:
- `as_req`: Turn a HAR response entry into a working `httr` function you can use to make a request with
- `as_request`: Turn a HAR response entry into an `httr` `response`-like object (i.e. you can use `httr::content()` on it)
Helpers:
- `get_body_size`: Retrieve size of content | body | headers
- `get_content_sie`: Retrieve size of content | body | headers
@ -119,7 +121,7 @@ library(tidyverse)
packageVersion("splashr")
```
## [1] '0.2.0'
## [1] '0.3.0'
``` r
splash("splash", 8050L) %>%
@ -292,7 +294,7 @@ library(testthat)
date()
```
## [1] "Tue Feb 14 09:02:35 2017"
## [1] "Tue Feb 15 09:02:35 2017"
``` r
test_dir("tests/")

18
man/as_req.Rd

@ -0,0 +1,18 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/as_req.r
\name{as_req}
\alias{as_req}
\title{Create an httr function from an HAR request}
\usage{
as_req(entry, quiet = TRUE, add_clip = TRUE)
}
\arguments{
\item{entry}{HAR entry}
\item{quiet}{quiet}
\item{add_clip}{add clip}
}
\description{
Create an httr function from an HAR request
}

29
man/as_request.Rd

@ -0,0 +1,29 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/as_request.r
\name{as_request}
\alias{as_request}
\title{Return a HAR entry response as an httr::response object}
\usage{
as_request(har_entry)
}
\arguments{
\item{har_entry}{a HAR object (should contain a response body to be most useful)}
}
\description{
Return a HAR entry response as an httr::response object
}
\examples{
\dontrun{
URL <- "http://www.svs.cl/portal/principal/605/w3-propertyvalue-18554.html"
splash_local \%>\%
splash_response_body(TRUE) \%>\%
splash_go(URL) \%>\%
splash_wait(2) \%>\%
splash_har() -> har
keep(har$log$entries, is_xhr) \%>\%
map(as_request) \%>\%
map(httr::content, as="parsed")
}
}

2
man/get_content_size.Rd

@ -1,5 +1,5 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/helpers.r
% Please edit documentation in R/content.r
\name{get_content_size}
\alias{get_content_size}
\alias{get_body_size}

20
man/get_request_type.Rd

@ -0,0 +1,20 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/helpers.r
\name{get_request_type}
\alias{get_request_type}
\alias{is_get}
\alias{is_post}
\title{Retrieve or test request type}
\usage{
get_request_type(har_resp_obj)
is_get(har_resp_obj)
is_post(har_resp_obj)
}
\arguments{
\item{har_resp_obj}{HAR response object}
}
\description{
Retrieve or test request type
}

14
man/get_request_url.Rd

@ -0,0 +1,14 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/helpers.r
\name{get_request_url}
\alias{get_request_url}
\title{Retrieve request URL}
\usage{
get_request_url(har_resp_obj)
}
\arguments{
\item{har_resp_obj}{HAR response object}
}
\description{
Retrieve request URL
}

17
man/get_response_body.Rd

@ -0,0 +1,17 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/helpers.r
\name{get_response_body}
\alias{get_response_body}
\title{Retrieve the body content of a HAR entry}
\usage{
get_response_body(har_resp_obj)
}
\arguments{
\item{har_resp_obj}{HAR response object}
}
\value{
A \code{raw} vector of the content or \code{NULL}
}
\description{
Retrieve the body content of a HAR entry
}
載入中…
取消
儲存