Browse Source

mini-DSL

master
boB Rudis 3 years ago
parent
commit
d1d6a699af
16 changed files with 451 additions and 10 deletions
  1. +8
    -0
      NAMESPACE
  2. +200
    -0
      R/dsl.r
  3. +2
    -2
      R/render-png.r
  4. +15
    -1
      README.Rmd
  5. +17
    -3
      README.md
  6. +1
    -1
      man/render_jpeg.Rd
  7. +1
    -1
      man/render_json.Rd
  8. +2
    -2
      man/render_png.Rd
  9. +26
    -0
      man/splash_go.Rd
  10. +24
    -0
      man/splash_har.Rd
  11. +26
    -0
      man/splash_html.Rd
  12. +28
    -0
      man/splash_images.Rd
  13. +25
    -0
      man/splash_plugins.Rd
  14. +26
    -0
      man/splash_png.Rd
  15. +25
    -0
      man/splash_response_body.Rd
  16. +25
    -0
      man/splash_wait.Rd

+ 8
- 0
NAMESPACE View File

@@ -35,7 +35,15 @@ export(render_png)
export(splash)
export(splash_active)
export(splash_debug)
export(splash_go)
export(splash_har)
export(splash_html)
export(splash_images)
export(splash_local)
export(splash_plugins)
export(splash_png)
export(splash_response_body)
export(splash_wait)
export(start_splash)
export(stop_splash)
export(writeHAR)


+ 200
- 0
R/dsl.r View File

@@ -0,0 +1,200 @@
# Assemble the Lua `main(splash)` function from the statements queued on a
# splashr object. Each element of `splash_obj$calls` becomes one line of the
# function body; the finished script is returned as a single string.
make_splash_call <- function(splash_obj) {
  # Prefix every queued statement, then join them one per line.
  body_lines <- sprintf(" %s", splash_obj$calls)
  lua_body <- paste(body_lines, collapse = "\n")

  sprintf("\nfunction main(splash)\n%s\nend\n", lua_body)
}

#' Enable or disable response content tracking.
#'
#' Splash does not keep the body of each response in memory by default, for
#' efficiency reasons. This appends a \code{splash.response_body_enabled}
#' assignment to the queued script so that behaviour can be switched on or off.
#'
#' @param splash_obj splashr object
#' @param enable logical; \code{TRUE} to track response bodies
#'   (default \code{FALSE})
#' @return \code{splash_obj} with the new statement appended to its \code{calls}
#' @export
#' @examples \dontrun{
#' splash_local %>%
#'   splash_response_body(TRUE) %>%
#'   splash_go("https://rud.is/b") %>%
#'   splash_wait(2) %>%
#'   splash_har() -> rud_har
#' }
splash_response_body <- function(splash_obj, enable = FALSE) {
  # Translate the R logical into the matching Lua boolean literal.
  if (enable) {
    lua_flag <- "true"
  } else {
    lua_flag <- "false"
  }
  statement <- sprintf('splash.response_body_enabled = %s', lua_flag)
  splash_obj$calls <- append(splash_obj$calls, statement)
  splash_obj
}

#' Enable or disable browser plugins (e.g. Flash).
#'
#' Plugins are disabled by default. This appends a
#' \code{splash.plugins_enabled} assignment to the queued script.
#'
#' @param splash_obj splashr object
#' @param enable logical; \code{TRUE} to enable plugins (default \code{FALSE})
#' @return \code{splash_obj} with the new statement appended to its \code{calls}
#' @export
#' @examples \dontrun{
#' splash_local %>%
#'   splash_plugins(TRUE) %>%
#'   splash_go("https://rud.is/b") %>%
#'   splash_wait(2) %>%
#'   splash_har() -> rud_har
#' }
splash_plugins <- function(splash_obj, enable = FALSE) {
  # Translate the R logical into the matching Lua boolean literal.
  if (enable) {
    lua_flag <- "true"
  } else {
    lua_flag <- "false"
  }
  statement <- sprintf('splash.plugins_enabled = %s', lua_flag)
  splash_obj$calls <- append(splash_obj$calls, statement)
  splash_obj
}

#' Enable/disable images
#'
#' Images are enabled by default. Disabling them can save a lot of network
#' traffic (usually around half) and make rendering faster. Note that this
#' option can affect the JavaScript code inside the page: disabling images may
#' change the sizes and positions of DOM elements, and scripts may read and
#' use them.
#'
#' @param splash_obj splashr object
#' @param enable logical; \code{TRUE} to load images (default \code{TRUE})
#' @return \code{splash_obj} with the new statement appended to its \code{calls}
#' @export
#' @examples \dontrun{
#' splash_local %>%
#'   splash_images(TRUE) %>%
#'   splash_go("https://rud.is/b") %>%
#'   splash_wait(2) %>%
#'   splash_har() -> rud_har
#' }
splash_images <- function(splash_obj, enable = TRUE) {
  # Translate the R logical into the matching Lua boolean literal.
  if (enable) {
    lua_flag <- "true"
  } else {
    lua_flag <- "false"
  }
  statement <- sprintf('splash.images_enabled = %s', lua_flag)
  splash_obj$calls <- append(splash_obj$calls, statement)
  splash_obj
}

#' Go to a URL.
#'
#' This is similar to entering a URL in a browser address bar, pressing Enter
#' and waiting until the page loads. Two statements are appended to the queued
#' script: an assignment of the address to the Lua variable \code{url} and a
#' \code{splash:go(url)} call.
#'
#' Backslashes, double quotes and line breaks in \code{url} are escaped so the
#' value is always emitted as a single, well-formed Lua string literal.
#'
#' @param splash_obj splashr object
#' @param url URL to load
#' @return \code{splash_obj} with the navigation statements appended to its
#'   \code{calls}
#' @export
#' @examples \dontrun{
#' splash_local %>%
#'   splash_response_body(TRUE) %>%
#'   splash_go("https://rud.is/b") %>%
#'   splash_wait(2) %>%
#'   splash_har() -> rud_har
#' }
splash_go <- function(splash_obj, url) {
  # The URL is embedded in a double-quoted Lua string, so characters that would
  # terminate or corrupt that literal must be escaped. Backslashes go first so
  # the escapes added afterwards are not themselves doubled.
  lua_url <- gsub("\\", "\\\\", url, fixed = TRUE)
  lua_url <- gsub("\"", "\\\"", lua_url, fixed = TRUE)
  lua_url <- gsub("\n", "\\n", lua_url, fixed = TRUE)
  lua_url <- gsub("\r", "\\r", lua_url, fixed = TRUE)

  splash_obj$calls <- c(
    splash_obj$calls,
    sprintf('url = "%s"', lua_url),
    "splash:go(url)"
  )
  splash_obj
}

#' Wait for a period of time
#'
#' While the script is waiting, WebKit continues processing the webpage. This
#' appends a `splash:wait()` call to the queued script.
#'
#' @md
#' @param splash_obj splashr object
#' @param time number of seconds to wait
#' @return `splash_obj` with the wait statement appended to its `calls`
#' @export
#' @examples \dontrun{
#' splash_local %>%
#'   splash_response_body(TRUE) %>%
#'   splash_go("https://rud.is/b") %>%
#'   splash_wait(2) %>%
#'   splash_har() -> rud_har
#' }
splash_wait <- function(splash_obj, time = 2) {
  wait_stmt <- sprintf('splash:wait(%s)', time)
  splash_obj$calls <- append(splash_obj$calls, wait_stmt)
  splash_obj
}

#' Return information about Splash interaction with a website in HAR format.
#'
#' Similar to [render_har] but used in a script context. This should be the
#' LAST element in a DSL script chain: it appends the `return` statement,
#' builds the Lua script from the queued calls, executes it and returns the
#' HAR content.
#'
#' @md
#' @param splash_obj splashr object
#' @export
#' @examples \dontrun{
#' splash_local %>%
#'   splash_response_body(TRUE) %>%
#'   splash_go("https://rud.is/b") %>%
#'   splash_wait(2) %>%
#'   splash_har() -> rud_har
#' }
splash_har <- function(splash_obj) {
  # Terminate the script with the value to hand back, then run it.
  splash_obj$calls <- c(splash_obj$calls, 'return(splash:har())')
  lua_script <- make_splash_call(splash_obj)
  as_har(execute_lua(splash_obj, lua_script))
}

#' Return a HTML snapshot of a current page.
#'
#' Similar to [render_html] but used in a script context. This should be the
#' LAST element in a DSL script chain: it appends the `return` statement,
#' builds the Lua script from the queued calls, executes it and returns the
#' HTML content.
#'
#' @md
#' @param splash_obj splashr object
#' @param raw_html if `TRUE` then return the script result as-is instead of
#'   parsing it into an XML document with `xml2::read_html()`.
#' @export
#' @examples \dontrun{
#' splash_local %>%
#'   splash_response_body(TRUE) %>%
#'   splash_go("https://rud.is/b") %>%
#'   splash_wait(2) %>%
#'   splash_html() -> rud_pg
#' }
splash_html <- function(splash_obj, raw_html = FALSE) {
  # Terminate the script with the value to hand back, then run it.
  splash_obj$calls <- c(splash_obj$calls, 'return(splash:html())')
  page <- execute_lua(splash_obj, make_splash_call(splash_obj))

  # Caller asked for the unparsed result.
  if (raw_html) {
    return(page)
  }

  xml2::read_html(page)
}

#' Return a screenshot of a current page in PNG format.
#'
#' Similar to [render_png] but used in a script context. This should be the
#' LAST element in a DSL script chain: it appends the `return` statement
#' (requesting `render_all=true`), builds the Lua script from the queued
#' calls, executes it and reads the result as an image.
#'
#' @md
#' @param splash_obj splashr object
#' @return a [magick] image object
#' @export
#' @examples \dontrun{
#' splash_local %>%
#'   splash_go("https://rud.is/b") %>%
#'   splash_wait(2) %>%
#'   splash_png()
#' }
splash_png <- function(splash_obj) {
  # Terminate the script with the value to hand back, then run it.
  splash_obj$calls <- c(splash_obj$calls, 'return splash:png{render_all=true}')
  lua_script <- make_splash_call(splash_obj)
  magick::image_read(execute_lua(splash_obj, lua_script))
}


+ 2
- 2
R/render-png.r View File

@@ -2,12 +2,12 @@
#'
#' @md
#' @param width,height Resize the rendered image to the given width/height (in pixels) keeping the aspect ratio.
#' @param render_all If `TRUE` extend the viewport to include the whole webpage (possibly very tall) before rendering. Default is `FASLE`
#' @param render_all If `TRUE` extend the viewport to include the whole webpage (possibly very tall) before rendering.
#' @return a [magick] image object
#' @references [Splash docs](http://splash.readthedocs.io/en/stable/index.html)
#' @inheritParams render_html
#' @export
render_png <- function(splash_obj, url, base_url, width=1024, height=768, render_all=FALSE,
render_png <- function(splash_obj, url, base_url, width=1024, height=768, render_all=TRUE,
timeout=30, resource_timeout, wait=0,
proxy, js, js_src, filters, allowed_domains, allowed_content_types,
forbidden_content_types, viewport="1024x768", images, headers, body,


+ 15
- 1
README.Rmd View File

@@ -34,7 +34,7 @@ All you need for this package to work is a running Splash instance. You provide

### About Splash

>'Splash' <https://github.com/scrapinghub/splash> is a javascript rendering service. It’s a lightweight web browser with an 'HTTP' API, implemented in Python using 'Twisted'and 'QT' and provides some of the core functionality of the 'RSelenium' or 'seleniumPipes' R packages but with a Java-free footprint. The (twisted) 'QT' reactor is used to make the sever fully asynchronous allowing to take advantage of 'webkit' concurrency via QT main loop. Some of Splash features include the ability to process multiple webpages in parallel; retrieving HTML results and/or take screenshots; disabling images or use Adblock Plus rules to make rendering faster; executing custom JavaScript in page context; getting detailed rendering info in HAR format.
>'Splash' <https://github.com/scrapinghub/splash> is a javascript rendering service. It’s a lightweight web browser with an 'HTTP' API, implemented in Python using 'Twisted' and 'QT' [and provides some of the core functionality of the 'RSelenium' or 'seleniumPipes' R packages but with a Java-free footprint]. The (twisted) 'QT' reactor is used to make the server fully asynchronous allowing to take advantage of 'webkit' concurrency via QT main loop. Some of Splash features include the ability to process multiple webpages in parallel; retrieving HTML results and/or take screenshots; disabling images or use Adblock Plus rules to make rendering faster; executing custom JavaScript in page context; getting detailed rendering info in HAR format.
The following functions are implemented:

@@ -49,6 +49,20 @@ The following functions are implemented:
- `start_splash`: Start a Splash server Docker container
- `stop_splash`: Stop a running a Splash server Docker container

Mini-DSL (domain-specific language). These can be used to create a "script" without actually
scripting in Lua. They are a less-powerful/configurable set of calls than what you
can make with a full Lua function but the idea is to have it take care of very common but
simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG image of a site:

- `splash_plugins`: Enable or disable browser plugins (e.g. Flash).
- `splash_images`: Enable/disable images
- `splash_response_body`: Enable or disable response content tracking.
- `splash_go`: Go to an URL.
- `splash_wait`: Wait for a period of time
- `splash_har`: Return information about Splash interaction with a website in HAR format.
- `splash_html`: Return a HTML snapshot of a current page.
- `splash_png`: Return a screenshot of a current page in PNG format.

Helpers:

- `get_body_size`: Retrieve size of content | body | headers


+ 17
- 3
README.md View File

@@ -31,9 +31,9 @@ All you need for this package to work is a running Splash instance. You provide

### About Splash

> 'Splash' <https://github.com/scrapinghub/splash> is a javascript rendering service. It’s a lightweight web browser with an 'HTTP' API, implemented in Python using 'Twisted'and 'QT' and provides some of the core functionality of the 'RSelenium' or 'seleniumPipes' R packages but with a Java-free footprint. The (twisted) 'QT' reactor is used to make the sever fully asynchronous allowing to take advantage of 'webkit' concurrency via QT main loop. Some of Splash features include the ability to process multiple webpages in parallel; retrieving HTML results and/or take screenshots; disabling images or use Adblock Plus rules to make rendering faster; executing custom JavaScript in page context; getting detailed rendering info in HAR format.
> 'Splash' <https://github.com/scrapinghub/splash> is a javascript rendering service. It’s a lightweight web browser with an 'HTTP' API, implemented in Python using 'Twisted' and 'QT' [and provides some of the core functionality of the 'RSelenium' or 'seleniumPipes' R packages but with a Java-free footprint]. The (twisted) 'QT' reactor is used to make the server fully asynchronous allowing to take advantage of 'webkit' concurrency via QT main loop. Some of Splash features include the ability to process multiple webpages in parallel; retrieving HTML results and/or take screenshots; disabling images or use Adblock Plus rules to make rendering faster; executing custom JavaScript in page context; getting detailed rendering info in HAR format.

The following functions are implemented:
The following functions are implemented:

- `render_html`: Return the HTML of the javascript-rendered page.
- `render_file`: Return the HTML or image (png) of the javascript-rendered page in a local file
@@ -46,7 +46,21 @@ The following functions are implemented:
- `start_splash`: Start a Splash server Docker container
- `stop_splash`: Stop a running a Splash server Docker container

Helpers:
Mini-DSL (domain-specific language). These can be used to create a "script" without actually
scripting in Lua. They are a less-powerful/configurable set of calls than what you
can make with a full Lua function but the idea is to have it take care of very common but
simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG image of a site:

- `splash_plugins`: Enable or disable browser plugins (e.g. Flash).
- `splash_images`: Enable/disable images
- `splash_response_body`: Enable or disable response content tracking.
- `splash_go`: Go to an URL.
- `splash_wait`: Wait for a period of time
- `splash_har`: Return information about Splash interaction with a website in HAR format.
- `splash_html`: Return a HTML snapshot of a current page.
- `splash_png`: Return a screenshot of a current page in PNG format.

Helpers:

- `get_body_size`: Retrieve size of content | body | headers
- `get_content_size`: Retrieve size of content | body | headers


+ 1
- 1
man/render_jpeg.Rd View File

@@ -29,7 +29,7 @@ render_jpeg(splash_obj, url, base_url = NULL, quality = 75, width = 1024,

\item{wait}{Time (in seconds) to wait for updates after page is loaded (defaults to 0).}

\item{render_all}{If \code{TRUE} extend the viewport to include the whole webpage (possibly very tall) before rendering. Default is \code{FASLE}}
\item{render_all}{If \code{TRUE} extend the viewport to include the whole webpage (possibly very tall) before rendering.}

\item{proxy}{Proxy profile name or proxy URL.}



+ 1
- 1
man/render_json.Rd View File

@@ -31,7 +31,7 @@ render_json(splash_obj, url, base_url = NULL, quality = 75, width = 1024,

\item{wait}{Time (in seconds) to wait for updates after page is loaded (defaults to 0).}

\item{render_all}{If \code{TRUE} extend the viewport to include the whole webpage (possibly very tall) before rendering. Default is \code{FASLE}}
\item{render_all}{If \code{TRUE} extend the viewport to include the whole webpage (possibly very tall) before rendering.}

\item{proxy}{Proxy profile name or proxy URL.}



+ 2
- 2
man/render_png.Rd View File

@@ -5,7 +5,7 @@
\title{Return a image (in PNG format) of the javascript-rendered page.}
\usage{
render_png(splash_obj, url, base_url, width = 1024, height = 768,
render_all = FALSE, timeout = 30, resource_timeout, wait = 0, proxy, js,
render_all = TRUE, timeout = 30, resource_timeout, wait = 0, proxy, js,
js_src, filters, allowed_domains, allowed_content_types,
forbidden_content_types, viewport = "1024x768", images, headers, body,
http_method, save_args, load_args)
@@ -19,7 +19,7 @@ render_png(splash_obj, url, base_url, width = 1024, height = 768,

\item{width, height}{Resize the rendered image to the given width/height (in pixels) keeping the aspect ratio.}

\item{render_all}{If \code{TRUE} extend the viewport to include the whole webpage (possibly very tall) before rendering. Default is \code{FASLE}}
\item{render_all}{If \code{TRUE} extend the viewport to include the whole webpage (possibly very tall) before rendering.}

\item{timeout}{A timeout (in seconds) for the render (defaults to 30).}



+ 26
- 0
man/splash_go.Rd View File

@@ -0,0 +1,26 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dsl.r
\name{splash_go}
\alias{splash_go}
\title{Go to an URL.}
\usage{
splash_go(splash_obj, url)
}
\arguments{
\item{splash_obj}{splashr object}

\item{url}{- URL to load;}
}
\description{
This is similar to entering an URL in a browser address bar, pressing Enter and waiting
until page loads.
}
\examples{
\dontrun{
splash_local \%>\%
splash_response_body(TRUE) \%>\%
splash_go("https://rud.is/b") \%>\%
splash_wait(2) \%>\%
splash_har() -> rud_har
}
}

+ 24
- 0
man/splash_har.Rd View File

@@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dsl.r
\name{splash_har}
\alias{splash_har}
\title{Return information about Splash interaction with a website in HAR format.}
\usage{
splash_har(splash_obj)
}
\arguments{
\item{splash_obj}{splashr object}
}
\description{
Similar to \link{render_har} but used in a script context. Should be the LAST element in
a DSL script chain as this will execute the script and return the HAR content
}
\examples{
\dontrun{
splash_local \%>\%
splash_response_body(TRUE) \%>\%
splash_go("https://rud.is/b") \%>\%
splash_wait(2) \%>\%
splash_har() -> rud_har
}
}

+ 26
- 0
man/splash_html.Rd View File

@@ -0,0 +1,26 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dsl.r
\name{splash_html}
\alias{splash_html}
\title{Return a HTML snapshot of a current page.}
\usage{
splash_html(splash_obj, raw_html = FALSE)
}
\arguments{
\item{splash_obj}{splashr object}

\item{raw_html}{if \code{TRUE} then return a character vector vs an XML document.}
}
\description{
Similar to \link{render_html} but used in a script context. Should be the LAST element in
a DSL script chain as this will execute the script and return the HTML content
}
\examples{
\dontrun{
splash_local \%>\%
splash_response_body(TRUE) \%>\%
splash_go("https://rud.is/b") \%>\%
splash_wait(2) \%>\%
splash_html() -> rud_pg
}
}

+ 28
- 0
man/splash_images.Rd View File

@@ -0,0 +1,28 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dsl.r
\name{splash_images}
\alias{splash_images}
\title{Enable/disable images}
\usage{
splash_images(splash_obj, enable = TRUE)
}
\arguments{
\item{splash_obj}{splashr object}

\item{enable}{logical}
}
\description{
By default, images are enabled. Disabling of the images can save a lot of network
traffic (usually around ~50\%) and make rendering faster. Note that this option can
affect the JavaScript code inside page: disabling of the images may change sizes and
positions of DOM elements, and scripts may read and use them.
}
\examples{
\dontrun{
splash_local \%>\%
splash_images(TRUE) \%>\%
splash_go("https://rud.is/b") \%>\%
splash_wait(2) \%>\%
splash_har() -> rud_har
}
}

+ 25
- 0
man/splash_plugins.Rd View File

@@ -0,0 +1,25 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dsl.r
\name{splash_plugins}
\alias{splash_plugins}
\title{Enable or disable browser plugins (e.g. Flash).}
\usage{
splash_plugins(splash_obj, enable = FALSE)
}
\arguments{
\item{splash_obj}{splashr object}

\item{enable}{logical}
}
\description{
Plugins are disabled by default.
}
\examples{
\dontrun{
splash_local \%>\%
splash_plugins(TRUE) \%>\%
splash_go("https://rud.is/b") \%>\%
splash_wait(2) \%>\%
splash_har() -> rud_har
}
}

+ 26
- 0
man/splash_png.Rd View File

@@ -0,0 +1,26 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dsl.r
\name{splash_png}
\alias{splash_png}
\title{Return a screenshot of a current page in PNG format.}
\usage{
splash_png(splash_obj)
}
\arguments{
\item{splash_obj}{splashr object}
}
\value{
a \link{magick} image object
}
\description{
Similar to \link{render_png} but used in a script context. Should be the LAST element in
a DSL script chain as this will execute the script and return the PNG content
}
\examples{
\dontrun{
splash_local \%>\%
splash_go("https://rud.is/b") \%>\%
splash_wait(2) \%>\%
splash_png()
}
}

+ 25
- 0
man/splash_response_body.Rd View File

@@ -0,0 +1,25 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dsl.r
\name{splash_response_body}
\alias{splash_response_body}
\title{Enable or disable response content tracking.}
\usage{
splash_response_body(splash_obj, enable = FALSE)
}
\arguments{
\item{splash_obj}{splashr object}

\item{enable}{logical}
}
\description{
By default Splash doesn’t keep bodies of each response in memory, for efficiency reasons.
}
\examples{
\dontrun{
splash_local \%>\%
splash_response_body(TRUE) \%>\%
splash_go("https://rud.is/b") \%>\%
splash_wait(2) \%>\%
splash_har() -> rud_har
}
}

+ 25
- 0
man/splash_wait.Rd View File

@@ -0,0 +1,25 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dsl.r
\name{splash_wait}
\alias{splash_wait}
\title{Wait for a period time}
\usage{
splash_wait(splash_obj, time = 2)
}
\arguments{
\item{splash_obj}{splashr object}

\item{time}{number of seconds to wait}
}
\description{
When script is waiting WebKit continues processing the webpage
}
\examples{
\dontrun{
splash_local \%>\%
splash_response_body(TRUE) \%>\%
splash_go("https://rud.is/b") \%>\%
splash_wait(2) \%>\%
splash_har() -> rud_har
}
}

Loading…
Cancel
Save