From 372ed67fda3db5e66375ceb056470957b3249d16 Mon Sep 17 00:00:00 2001 From: boB Rudis Date: Thu, 16 Feb 2017 22:17:24 -0500 Subject: [PATCH] new functions --- DESCRIPTION | 3 +- NAMESPACE | 17 ++++++++++ R/as_request.r | 3 ++ R/dsl.r | 81 +++++++++++++++++++++++++++++++++++++++++++++ R/splashr-package.R | 1 + R/splashr.r | 43 +++++++++++++++++++++++- README.Rmd | 1 + README.md | 1 + man/as_request.Rd | 3 ++ man/splash_go.Rd | 1 + man/splash_har.Rd | 1 + man/splash_history.Rd | 14 ++++++++ man/splash_html.Rd | 1 + man/splash_images.Rd | 1 + man/splash_perf_stats.Rd | 14 ++++++++ man/splash_plugins.Rd | 1 + man/splash_png.Rd | 1 + man/splash_response_body.Rd | 1 + man/splash_user_agent.Rd | 70 +++++++++++++++++++++++++++++++++++++++ man/splash_version.Rd | 14 ++++++++ man/splash_wait.Rd | 1 + 21 files changed, 271 insertions(+), 2 deletions(-) create mode 100644 man/splash_history.Rd create mode 100644 man/splash_perf_stats.Rd create mode 100644 man/splash_user_agent.Rd create mode 100644 man/splash_version.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 15320c1..36f410c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,6 +33,7 @@ Imports: clipr, HARtools, openssl, - lubridate + lubridate, + scales RoxygenNote: 6.0.0 Remotes: wch/harbor diff --git a/NAMESPACE b/NAMESPACE index c596c37..e0c6281 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -44,15 +44,31 @@ export(splash_active) export(splash_debug) export(splash_go) export(splash_har) +export(splash_history) export(splash_html) export(splash_images) export(splash_local) +export(splash_perf_stats) export(splash_plugins) export(splash_png) export(splash_response_body) +export(splash_user_agent) +export(splash_version) export(splash_wait) export(start_splash) export(stop_splash) +export(ua_ios_safari) +export(ua_linux_chrome) +export(ua_linux_firefox) +export(ua_macos_chrome) +export(ua_macos_safari) +export(ua_splashr) +export(ua_win10_chrome) +export(ua_win10_firefox) +export(ua_win10_ie11) +export(ua_win7_chrome) 
+export(ua_win7_firefox) +export(ua_win7_ie11) export(writeHAR) import(harbor) import(httr) @@ -66,6 +82,7 @@ importFrom(clipr,read_clip) importFrom(jsonlite,fromJSON) importFrom(lubridate,ymd_hms) importFrom(openssl,base64_decode) +importFrom(scales,comma) importFrom(stringi,stri_detect_regex) importFrom(stringi,stri_split_fixed) importFrom(stringi,stri_split_regex) diff --git a/R/as_request.r b/R/as_request.r index eab99b9..3e8f087 100644 --- a/R/as_request.r +++ b/R/as_request.r @@ -3,10 +3,13 @@ #' @param har_entry a HAR object (should contain a response body to be most useful) #' @export #' @examples \dontrun{ +#' library(purrr) +#' #' URL <- "http://www.svs.cl/portal/principal/605/w3-propertyvalue-18554.html" #' #' splash_local %>% #' splash_response_body(TRUE) %>% +#' splash_user_agent(ua_macos_chrome) %>% #' splash_go(URL) %>% #' splash_wait(2) %>% #' splash_har() -> har diff --git a/R/dsl.r b/R/dsl.r index 0f5198a..b107c35 100644 --- a/R/dsl.r +++ b/R/dsl.r @@ -20,6 +20,7 @@ end #' @examples \dontrun{ #' splash_local %>% #' splash_response_body(TRUE) %>% +#' splash_user_agent(ua_macos_chrome) %>% #' splash_go("https://rud.is/b") %>% #' splash_wait(2) %>% #' splash_har() -> rud_har @@ -40,6 +41,7 @@ splash_response_body <- function(splash_obj, enable=FALSE) { #' @examples \dontrun{ #' splash_local %>% #' splash_plugins(TRUE) %>% +#' splash_user_agent(ua_macos_chrome) %>% #' splash_go("https://rud.is/b") %>% #' splash_wait(2) %>% #' splash_har() -> rud_har @@ -63,6 +65,7 @@ splash_plugins <- function(splash_obj, enable=FALSE) { #' @examples \dontrun{ #' splash_local %>% #' splash_images(TRUE) %>% +#' splash_user_agent(ua_macos_chrome) %>% #' splash_go("https://rud.is/b") %>% #' splash_wait(2) %>% #' splash_har() -> rud_har @@ -84,6 +87,7 @@ splash_images <- function(splash_obj, enable=TRUE) { #' @examples \dontrun{ #' splash_local %>% #' splash_response_body(TRUE) %>% +#' splash_user_agent(ua_macos_chrome) %>% #' splash_go("https://rud.is/b") %>% #' 
splash_wait(2) %>% #' splash_har() -> rud_har @@ -106,6 +110,7 @@ splash_go <- function(splash_obj, url) { #' @examples \dontrun{ #' splash_local %>% #' splash_response_body(TRUE) %>% +#' splash_user_agent(ua_macos_chrome) %>% #' splash_go("https://rud.is/b") %>% #' splash_wait(2) %>% #' splash_har() -> rud_har @@ -126,6 +131,7 @@ splash_wait <- function(splash_obj, time=2) { #' @examples \dontrun{ #' splash_local %>% #' splash_response_body(TRUE) %>% +#' splash_user_agent(ua_macos_chrome) %>% #' splash_go("https://rud.is/b") %>% #' splash_wait(2) %>% #' splash_har() -> rud_har @@ -153,6 +159,7 @@ splash_har <- function(splash_obj) { #' @examples \dontrun{ #' splash_local %>% #' splash_response_body(TRUE) %>% +#' splash_user_agent(ua_macos_chrome) %>% #' splash_go("https://rud.is/b") %>% #' splash_wait(2) %>% #' splash_html() -> rud_pg @@ -182,6 +189,7 @@ splash_html <- function(splash_obj, raw_html=FALSE) { #' @export #' @examples \dontrun{ #' splash_local %>% +#' splash_user_agent(ua_macos_chrome) %>% #' splash_go("https://rud.is/b") %>% #' splash_wait(2) %>% #' splash_png() @@ -198,3 +206,76 @@ splash_png <- function(splash_obj) { } +#' Overwrite the User-Agent header for all further requests. +#' +#' There are a few built-in user agents, all beginning with `ua_`. +#' +#' @md +#' @param splash_obj splashr object +#' @param user_agent 1 element character vector, defaults to `splashr/#.#.#`. 
+#' @export +#' @examples \dontrun{ +#' library(rvest) +#' +#' URL <- "https://httpbin.org/user-agent" +#' +#' splash_local %>% +#' splash_response_body(TRUE) %>% +#' splash_user_agent(ua_macos_chrome) %>% +#' splash_go(URL) %>% +#' splash_html() %>% +#' html_text("body") %>% +#' jsonlite::fromJSON() +#' } +splash_user_agent <- function(splash_obj, user_agent=ua_splashr) { + splash_obj$calls <- c(splash_obj$calls, sprintf('splash:set_user_agent("%s")', user_agent)) + splash_obj +} + +#' @rdname splash_user_agent +#' @export +ua_splashr <- sprintf("splashr/%s", packageVersion("splashr")) + +#' @rdname splash_user_agent +#' @export +ua_win10_chrome <- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36" + +#' @rdname splash_user_agent +#' @export +ua_win10_firefox <- "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0" + +#' @rdname splash_user_agent +#' @export +ua_win10_ie11 <- "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko" + +#' @rdname splash_user_agent +#' @export +ua_win7_chrome <- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36" + +#' @rdname splash_user_agent +#' @export +ua_win7_firefox <- "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0" + +#' @rdname splash_user_agent +#' @export +ua_win7_ie11 <- "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko" + +#' @rdname splash_user_agent +#' @export +ua_macos_chrome <- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36" + +#' @rdname splash_user_agent +#' @export +ua_macos_safari <- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/602.3.12 (KHTML, like Gecko) Version/10.0.2 Safari/602.3.12" + +#' @rdname splash_user_agent +#' @export +ua_linux_chrome <- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like 
Gecko) Chrome/55.0.2883.87 Safari/537.36" + +#' @rdname splash_user_agent +#' @export +ua_linux_firefox <- "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:51.0) Gecko/20100101 Firefox/51.0" + +#' @rdname splash_user_agent +#' @export +ua_ios_safari <- "Mozilla/5.0 (iPad; CPU OS 10_2 like Mac OS X) AppleWebKit/602.3.12 (KHTML, like Gecko) Version/10.0 Mobile/14C92 Safari/602.1" diff --git a/R/splashr-package.R b/R/splashr-package.R index cf009c9..3aba8e2 100644 --- a/R/splashr-package.R +++ b/R/splashr-package.R @@ -22,6 +22,7 @@ #' @importFrom openssl base64_decode #' @importFrom clipr read_clip #' @importFrom lubridate ymd_hms +#' @importFrom scales comma NULL diff --git a/R/splashr.r b/R/splashr.r index 055df64..082e191 100644 --- a/R/splashr.r +++ b/R/splashr.r @@ -33,7 +33,8 @@ splash_active <- function(splash_obj) { out$url <- splash_url(splash_obj) - message(sprintf("Status of splash instance on [%s]: %s. Max RSS: %s\n", out$url, out$status, out$maxrss)) + message(sprintf("Status of splash instance on [%s]: %s. 
Max RSS: %s Mb\n", + out$url, out$status, scales::comma(out$maxrss/1024/1024))) if ("status" %in% names(out)) return(out$status == "ok") @@ -41,6 +42,46 @@ splash_active <- function(splash_obj) { } +#' Get Splash version information +#' +#' @param splash_obj A splash connection object +#' @export +splash_version <- function(splash_obj) { + execute_lua(splash_obj, ' +function main(splash) + return splash:get_version() +end +') -> res + jsonlite::fromJSON(rawToChar(res)) +} + +#' Get information about requests/responses for the pages loaded +#' +#' @param splash_obj A splash connection object +#' @export +splash_history <- function(splash_obj) { + execute_lua(splash_obj, ' +function main(splash) + return splash:history() +end +') -> res + jsonlite::fromJSON(rawToChar(res)) +} + + +#' Get Splash performance-related statistics +#' +#' @param splash_obj A splash connection object +#' @export +splash_perf_stats <- function(splash_obj) { + execute_lua(splash_obj, ' +function main(splash) + return splash:get_perf_stats() +end +') -> res + jsonlite::fromJSON(rawToChar(res)) +} + #' Retrieve debug-level info for a Splash server #' #' @param splash_obj A splash connection object diff --git a/README.Rmd b/README.Rmd index cd6b574..6bf7f24 100644 --- a/README.Rmd +++ b/README.Rmd @@ -59,6 +59,7 @@ Mini-DSL (domain-specific language). These can be used to create a "script" with - `splash_har`: Return information about Splash interaction with a website in HAR format. - `splash_html`: Return a HTML snapshot of a current page. - `splash_png`: Return a screenshot of a current page in PNG format. +- `splash_user_agent`: Overwrite the User-Agent header for all further requests. NOTE: There are many "helper" user agent strings to go with `splash_user_agent`. Look for objects in `splashr` starting with `ua_`. `httr` helpers. 
These help turn various bits of `splashr` objects into `httr`-ish things: diff --git a/README.md b/README.md index 5acb24e..e7467d7 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,7 @@ Mini-DSL (domain-specific language). These can be used to create a "script" with - `splash_har`: Return information about Splash interaction with a website in HAR format. - `splash_html`: Return a HTML snapshot of a current page. - `splash_png`: Return a screenshot of a current page in PNG format. +- `splash_user_agent`: Overwrite the User-Agent header for all further requests. NOTE: There are many "helper" user agent strings to go with `splash_user_agent`. Look for objects in `splashr` starting with `ua_`. `httr` helpers. These help turn various bits of `splashr` objects into `httr`-ish things: diff --git a/man/as_request.Rd b/man/as_request.Rd index d9ffa64..ebd7f88 100644 --- a/man/as_request.Rd +++ b/man/as_request.Rd @@ -14,10 +14,13 @@ Return a HAR entry response as an httr::response object } \examples{ \dontrun{ +library(purrr) + URL <- "http://www.svs.cl/portal/principal/605/w3-propertyvalue-18554.html" splash_local \%>\% splash_response_body(TRUE) \%>\% + splash_user_agent(ua_macos_chrome) \%>\% splash_go(URL) \%>\% splash_wait(2) \%>\% splash_har() -> har diff --git a/man/splash_go.Rd b/man/splash_go.Rd index d6214d0..9746d32 100644 --- a/man/splash_go.Rd +++ b/man/splash_go.Rd @@ -19,6 +19,7 @@ until page loads. 
\dontrun{ splash_local \%>\% splash_response_body(TRUE) \%>\% + splash_user_agent(ua_macos_chrome) \%>\% splash_go("https://rud.is/b") \%>\% splash_wait(2) \%>\% splash_har() -> rud_har diff --git a/man/splash_har.Rd b/man/splash_har.Rd index 6876ec1..1872095 100644 --- a/man/splash_har.Rd +++ b/man/splash_har.Rd @@ -17,6 +17,7 @@ a DSL script chain as this will execute the script and return the HAR content \dontrun{ splash_local \%>\% splash_response_body(TRUE) \%>\% + splash_user_agent(ua_macos_chrome) \%>\% splash_go("https://rud.is/b") \%>\% splash_wait(2) \%>\% splash_har() -> rud_har diff --git a/man/splash_history.Rd b/man/splash_history.Rd new file mode 100644 index 0000000..608041b --- /dev/null +++ b/man/splash_history.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splashr.r +\name{splash_history} +\alias{splash_history} +\title{Get information about requests/responses for the pages loaded} +\usage{ +splash_history(splash_obj) +} +\arguments{ +\item{splash_obj}{A splash connection object} +} +\description{ +Get information about requests/responses for the pages loaded +} diff --git a/man/splash_html.Rd b/man/splash_html.Rd index f9c83ef..8133c42 100644 --- a/man/splash_html.Rd +++ b/man/splash_html.Rd @@ -19,6 +19,7 @@ a DSL script chain as this will execute the script and return the HTML content \dontrun{ splash_local \%>\% splash_response_body(TRUE) \%>\% + splash_user_agent(ua_macos_chrome) \%>\% splash_go("https://rud.is/b") \%>\% splash_wait(2) \%>\% splash_html() -> rud_pg diff --git a/man/splash_images.Rd b/man/splash_images.Rd index 6dbfed3..0f6b292 100644 --- a/man/splash_images.Rd +++ b/man/splash_images.Rd @@ -21,6 +21,7 @@ positions of DOM elements, and scripts may read and use them. 
\dontrun{ splash_local \%>\% splash_images(TRUE) \%>\% + splash_user_agent(ua_macos_chrome) \%>\% splash_go("https://rud.is/b") \%>\% splash_wait(2) \%>\% splash_har() -> rud_har diff --git a/man/splash_perf_stats.Rd b/man/splash_perf_stats.Rd new file mode 100644 index 0000000..c343f76 --- /dev/null +++ b/man/splash_perf_stats.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splashr.r +\name{splash_perf_stats} +\alias{splash_perf_stats} +\title{Get Splash performance-related statistics} +\usage{ +splash_perf_stats(splash_obj) +} +\arguments{ +\item{splash_obj}{A splash connection object} +} +\description{ +Get Splash performance-related statistics +} diff --git a/man/splash_plugins.Rd b/man/splash_plugins.Rd index 8b77559..1c7aadc 100644 --- a/man/splash_plugins.Rd +++ b/man/splash_plugins.Rd @@ -18,6 +18,7 @@ Plugins are disabled by default. \dontrun{ splash_local \%>\% splash_plugins(TRUE) \%>\% + splash_user_agent(ua_macos_chrome) \%>\% splash_go("https://rud.is/b") \%>\% splash_wait(2) \%>\% splash_har() -> rud_har diff --git a/man/splash_png.Rd b/man/splash_png.Rd index a9f9f70..cd53cdc 100644 --- a/man/splash_png.Rd +++ b/man/splash_png.Rd @@ -19,6 +19,7 @@ a DSL script chain as this will execute the script and return the PNG content \examples{ \dontrun{ splash_local \%>\% + splash_user_agent(ua_macos_chrome) \%>\% splash_go("https://rud.is/b") \%>\% splash_wait(2) \%>\% splash_png() diff --git a/man/splash_response_body.Rd b/man/splash_response_body.Rd index ffcabde..3b33f2e 100644 --- a/man/splash_response_body.Rd +++ b/man/splash_response_body.Rd @@ -18,6 +18,7 @@ By default Splash doesn’t keep bodies of each response in memory, for efficien \dontrun{ splash_local \%>\% splash_response_body(TRUE) \%>\% + splash_user_agent(ua_macos_chrome) \%>\% splash_go("https://rud.is/b") \%>\% splash_wait(2) \%>\% splash_har() -> rud_har diff --git a/man/splash_user_agent.Rd b/man/splash_user_agent.Rd new file mode 
100644 index 0000000..2cdda1a --- /dev/null +++ b/man/splash_user_agent.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dsl.r +\docType{data} +\name{splash_user_agent} +\alias{splash_user_agent} +\alias{ua_splashr} +\alias{ua_win10_chrome} +\alias{ua_win10_firefox} +\alias{ua_win10_ie11} +\alias{ua_win7_chrome} +\alias{ua_win7_firefox} +\alias{ua_win7_ie11} +\alias{ua_macos_chrome} +\alias{ua_macos_safari} +\alias{ua_linux_chrome} +\alias{ua_linux_firefox} +\alias{ua_ios_safari} +\title{Overwrite the User-Agent header for all further requests.} +\format{An object of class \code{character} of length 1.} +\usage{ +splash_user_agent(splash_obj, user_agent = ua_splashr) + +ua_splashr + +ua_win10_chrome + +ua_win10_firefox + +ua_win10_ie11 + +ua_win7_chrome + +ua_win7_firefox + +ua_win7_ie11 + +ua_macos_chrome + +ua_macos_safari + +ua_linux_chrome + +ua_linux_firefox + +ua_ios_safari +} +\arguments{ +\item{splash_obj}{splashr object} + +\item{user_agent}{1 element character vector, defaults to \code{splashr/#.#.#}.} +} +\description{ +There are a few built-in user agents, all beginning with \code{ua_}. 
+} +\examples{ +\dontrun{ +library(rvest) + +URL <- "https://httpbin.org/user-agent" + +splash_local \%>\% + splash_response_body(TRUE) \%>\% + splash_user_agent(ua_macos_chrome) \%>\% + splash_go(URL) \%>\% + splash_html() \%>\% + html_text("body") \%>\% + jsonlite::fromJSON() +} +} +\keyword{datasets} diff --git a/man/splash_version.Rd b/man/splash_version.Rd new file mode 100644 index 0000000..60cad60 --- /dev/null +++ b/man/splash_version.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splashr.r +\name{splash_version} +\alias{splash_version} +\title{Get Splash version information} +\usage{ +splash_version(splash_obj) +} +\arguments{ +\item{splash_obj}{A splash connection object} +} +\description{ +Get Splash version information +} diff --git a/man/splash_wait.Rd b/man/splash_wait.Rd index 2930a40..3390129 100644 --- a/man/splash_wait.Rd +++ b/man/splash_wait.Rd @@ -18,6 +18,7 @@ When script is waiting WebKit continues processing the webpage \dontrun{ splash_local \%>\% splash_response_body(TRUE) \%>\% + splash_user_agent(ua_macos_chrome) \%>\% splash_go("https://rud.is/b") \%>\% splash_wait(2) \%>\% splash_har() -> rud_har