Browse Source

render_json

master
boB Rudis 3 years ago
parent
commit
d257e164a8
16 changed files with 221 additions and 11 deletions
  1. +3
    -0
      NAMESPACE
  2. +9
    -0
      R/render-har.r
  3. +9
    -6
      R/render-jpg.r
  4. +78
    -0
      R/render-json.r
  5. +1
    -0
      R/render-png.r
  6. +1
    -0
      R/splashr-package.R
  7. +1
    -1
      README.Rmd
  8. +3
    -3
      README.md
  9. BIN
      README_files/figure-markdown_github/unnamed-chunk-5-1.png
  10. BIN
      img/cap.jpg
  11. BIN
      img/cap.png
  12. +7
    -0
      man/render_har.Rd
  13. +3
    -0
      man/render_jpeg.Rd
  14. +102
    -0
      man/render_json.Rd
  15. +3
    -0
      man/render_png.Rd
  16. +1
    -1
      man/splashr.Rd

+ 3
- 0
NAMESPACE View File

@@ -1,11 +1,14 @@
# Generated by roxygen2: do not edit by hand

S3method(print,splash_debug)
S3method(print,splash_har)
S3method(print,splash_json)
S3method(print,splash_status)
export("%>%")
export(render_har)
export(render_html)
export(render_jpeg)
export(render_json)
export(render_png)
export(splash)
export(splash_active)


+ 9
- 0
R/render-har.r View File

@@ -6,6 +6,9 @@
#' @md
#' @param response_body When `TRUE`, response content is included in the HAR records
#' @inheritParams render_html
#' @return a huge `list`
#' @note a custom `print` method is defined to stop your console from being
#' overwhelmed with data. Use [str] to inspect various portions of the result.
#' @references [Splash docs](http://splash.readthedocs.io/en/stable/index.html)
#' @export
render_har <- function(splash_obj, url, base_url, response_body=FALSE, timeout=30, resource_timeout, wait=0,
@@ -44,3 +47,9 @@ render_har <- function(splash_obj, url, base_url, response_body=FALSE, timeout=3
out

}

#' Compact print method for `splash_har` objects
#'
#' Prints a one-line placeholder instead of the full (very large) HAR list.
#'
#' @param x a `splash_har` object as returned by `render_har()`
#' @param ... ignored
#' @return `x`, invisibly
#' @export
print.splash_har <- function(x, ...) {
  # Terminate the banner with a newline so the console prompt (or whatever is
  # printed next) does not end up glued to the end of it.
  cat("<splashr render_har() object>\n")
  invisible(x)
}

+ 9
- 6
R/render-jpg.r View File

@@ -4,13 +4,15 @@
#' @param quality JPEG quality parameter in range from 0 to 100. Default is quality=75.
#' @inheritParams render_html
#' @inheritParams render_png
#' @return a [magick] image object
#' @references [Splash docs](http://splash.readthedocs.io/en/stable/index.html)
#' @export
render_jpeg <- function(splash_obj, url, base_url=NULL, quality=75, width=1024, height=768,
timeout=30, resource_timeout, wait=0, render_all=FALSE,
proxy, js, js_src, filters, allowed_domains, allowed_content_types,
forbidden_content_types, viewport="1024x768", images, headers, body,
http_method, save_args, load_args) {
render_jpeg <- render_jpg <- function(
splash_obj, url, base_url=NULL, quality=75, width=1024, height=768,
timeout=30, resource_timeout, wait=0, render_all=FALSE,
proxy, js, js_src, filters, allowed_domains, allowed_content_types,
forbidden_content_types, viewport="1024x768", images, headers, body,
http_method, save_args, load_args) {

params <- list(url=url, timeout=timeout, wait=wait, viewport=viewport,
quality=quality, width=width, height=height, render_all=as.numeric(render_all))
@@ -37,4 +39,5 @@ render_jpeg <- function(splash_obj, url, base_url=NULL, quality=75, width=1024,

magick::image_read(httr::content(res, as="raw"))

}
}


+ 78
- 0
R/render-json.r View File

@@ -0,0 +1,78 @@
#' Return a json-encoded dictionary with information about javascript-rendered webpage.
#'
#' It can include HTML, PNG and other information, based on arguments passed.
#'
#' @md
#' @rdname render_json
#' @param html Whether to include HTML in output.
#' @param png Whether to include PNG in output.
#' @param jpeg Whether to include JPEG in output.
#' @param iframes Whether to include information about child frames in output.
#' @param script Whether to include the result of the custom executed javascript final
#'        statement in output
#' @param console Whether to include the executed javascript console messages in output.
#' @param history Whether to include the history of requests/responses for webpage main frame.
#'        Use it to get HTTP status codes and headers. Only information about "main"
#'        requests/responses is returned (i.e. information about related resources
#'        like images and AJAX queries is not returned). To get information about all
#'        requests and responses use `har` parameter.
#' @param har Whether to include HAR in output. If `TRUE` the result will contain the same
#'        data as [render_har] provides under `har` list entry. Whether response
#'        content is included in the HAR records is controlled by `response_body`.
#' @param response_body Used with `har` parameter. When `TRUE` (the default for this
#'        function), response content is included in the HAR records.
#' @return a huge `list` (the parsed JSON response) with class `splash_json` prepended
#' @inheritParams render_jpeg
#' @note All "whether to include..." parameters default to `TRUE` except for `png` and
#'       `jpeg`. A custom `print` method is defined to stop your console from being
#'       overwhelmed with data. Use [str] to inspect various portions of the result.
#' @references [Splash docs](http://splash.readthedocs.io/en/stable/index.html)
#' @export
render_json <- function(splash_obj, url, base_url=NULL, quality=75, width=1024, height=768,
                        timeout=30, resource_timeout, wait=0, render_all=FALSE,
                        proxy, js, js_src, filters, allowed_domains, allowed_content_types,
                        forbidden_content_types, viewport="1024x768", images, headers, body,
                        http_method, save_args, load_args, html=TRUE, png=FALSE, jpeg=FALSE,
                        iframes=TRUE, script=TRUE, console=TRUE, history=TRUE, har=TRUE,
                        response_body=TRUE) {

  # Parameters that are always sent. Logical switches are converted to 0/1
  # with as.numeric() before being placed in the query string.
  params <- list(url=url, timeout=timeout, wait=wait, viewport=viewport,
                 quality=quality, width=width, height=height, render_all=as.numeric(render_all),
                 html=as.numeric(html), png=as.numeric(png), jpeg=as.numeric(jpeg),
                 iframes=as.numeric(iframes), script=as.numeric(script),
                 console=as.numeric(console), history=as.numeric(history), har=as.numeric(har),
                 response_body=as.numeric(response_body))

  # Optional parameters are only added when the caller supplied them.
  if (!missing(base_url)) params$base_url <- base_url
  if (!missing(resource_timeout)) params$resource_timeout <- resource_timeout
  # Fixed: this previously assigned to `proxy$base_url`, so the proxy setting
  # never made it into the request parameters.
  if (!missing(proxy)) params$proxy <- proxy
  if (!missing(js)) params$js <- js
  if (!missing(js_src)) params$js_src <- js_src
  if (!missing(filters)) params$filters <- filters
  if (!missing(allowed_domains)) params$allowed_domains <- allowed_domains
  if (!missing(allowed_content_types)) params$allowed_content_types <- allowed_content_types
  if (!missing(forbidden_content_types)) params$forbidden_content_types <- forbidden_content_types
  if (!missing(images)) params$images <- images
  if (!missing(headers)) params$headers <- headers
  if (!missing(body)) params$body <- body
  if (!missing(http_method)) params$http_method <- http_method
  if (!missing(save_args)) params$save_args <- save_args
  if (!missing(load_args)) params$load_args <- load_args

  res <- httr::GET(splash_url(splash_obj), path="render.json", encode="json", query=params)

  # Turn HTTP error statuses into R errors before trying to parse the body.
  httr::stop_for_status(res)

  out <- httr::content(res, as="text", encoding="UTF-8")
  out <- jsonlite::fromJSON(out)

  # Tag the result so print.splash_json() is dispatched instead of dumping
  # the whole list to the console.
  class(out) <- c("splash_json", class(out))

  out

}

#' Compact print method for `splash_json` objects
#'
#' Prints a one-line placeholder instead of the full (very large) result list.
#'
#' @param x a `splash_json` object as returned by `render_json()`
#' @param ... ignored
#' @return `x`, invisibly
#' @export
print.splash_json <- function(x, ...) {
  # Terminate the banner with a newline so the console prompt (or whatever is
  # printed next) does not end up glued to the end of it.
  cat("<splashr render_json() object>\n")
  invisible(x)
}

+ 1
- 0
R/render-png.r View File

@@ -3,6 +3,7 @@
#' @md
#' @param width,height Resize the rendered image to the given width/height (in pixels) keeping the aspect ratio.
#' @param render_all If `TRUE` extend the viewport to include the whole webpage (possibly very tall) before rendering. Default is `FALSE`
#' @return a [magick] image object
#' @references [Splash docs](http://splash.readthedocs.io/en/stable/index.html)
#' @inheritParams render_html
#' @export


+ 1
- 0
R/splashr-package.R View File

@@ -10,6 +10,7 @@
#' disabling images or use Adblock Plus rules to make rendering faster; executing custom
#' JavaScript in page context; getting detailed rendering info in HAR format.
#'
#' @md
#' @name splashr
#' @docType package
#' @author Bob Rudis (bob@@rud.is)


+ 1
- 1
README.Rmd View File

@@ -37,7 +37,7 @@ The following functions are implemented:

Suggest more in a feature req!

- Implement `render.json`
- <strike>Implement `render.json`</strike>
- Implement `execute` (you can script Splash!)
- _Possibly_ writing R function wrappers to start Splash which would also support enabling javascript profiles, request filters and proxy profiles from within R directly, possibly using [`harbor`](https://github.com/wch/harbor)
- Testing results with all combinations of parameters


+ 3
- 3
README.md View File

@@ -34,7 +34,7 @@ The following functions are implemented:

Suggest more in a feature req!

- Implement `render.json`
- <strike>Implement `render.json`</strike>
- Implement `execute` (you can script Splash!)
- *Possibly* writing R function wrappers to start Splash which would also support enabling javascript profiles, request filters and proxy profiles from within R directly, possibly using [`harbor`](https://github.com/wch/harbor)
- Testing results with all combinations of parameters
@@ -80,7 +80,7 @@ splash("splash", 8050L) %>%
## List of 7
## $ active : list()
## $ argcache: int 0
## $ fds : int 18
## $ fds : int 17
## $ leaks :List of 4
## ..$ Deferred : int 50
## ..$ LuaRuntime: int 1
@@ -169,7 +169,7 @@ library(testthat)
date()
```

## [1] "Sat Feb 4 08:02:49 2017"
## [1] "Sat Feb 4 09:18:19 2017"

``` r
test_dir("tests/")


BIN
README_files/figure-markdown_github/unnamed-chunk-5-1.png View File

Before After
Width: 2112  |  Height: 1248  |  Size: 357KB Width: 2112  |  Height: 1248  |  Size: 317KB

BIN
img/cap.jpg View File

Before After
Width: 1024  |  Height: 768  |  Size: 123KB Width: 1024  |  Height: 768  |  Size: 124KB

BIN
img/cap.png View File

Before After
Width: 1024  |  Height: 768  |  Size: 433KB Width: 1024  |  Height: 768  |  Size: 433KB

+ 7
- 0
man/render_har.Rd View File

@@ -52,10 +52,17 @@ render_har(splash_obj, url, base_url, response_body = FALSE, timeout = 30,

\item{load_args}{Parameter values to load from cache}
}
\value{
a huge \code{list}
}
\description{
It includes information about requests made, responses received, timings, headers, etc and
is incredibly detailed, full of information on every component loaded.
}
\note{
a custom \code{print} method is defined to stop your console from being
overwhelmed with data. Use \link{str} to inspect various portions of the result.
}
\references{
\href{http://splash.readthedocs.io/en/stable/index.html}{Splash docs}
}

+ 3
- 0
man/render_jpeg.Rd View File

@@ -59,6 +59,9 @@ render_jpeg(splash_obj, url, base_url = NULL, quality = 75, width = 1024,

\item{load_args}{Parameter values to load from cache}
}
\value{
a \link{magick} image object
}
\description{
Return an image (in JPEG format) of the javascript-rendered page.
}


+ 102
- 0
man/render_json.Rd View File

@@ -0,0 +1,102 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/render-json.r
\name{render_json}
\alias{render_json}
\title{Return a json-encoded dictionary with information about javascript-rendered webpage.}
\usage{
render_json(splash_obj, url, base_url = NULL, quality = 75, width = 1024,
height = 768, timeout = 30, resource_timeout, wait = 0,
render_all = FALSE, proxy, js, js_src, filters, allowed_domains,
allowed_content_types, forbidden_content_types, viewport = "1024x768",
images, headers, body, http_method, save_args, load_args, html = TRUE,
png = FALSE, jpeg = FALSE, iframes = TRUE, script = TRUE,
console = TRUE, history = TRUE, har = TRUE, response_body = TRUE)
}
\arguments{
\item{splash_obj}{Object created by a call to \link{splash}}

\item{url}{The URL to render (required)}

\item{base_url}{The base url to render the page with.}

\item{quality}{JPEG quality parameter in range from 0 to 100. Default is quality=75.}

\item{width}{Resize the rendered image to the given width/height (in pixels) keeping the aspect ratio.}

\item{height}{Resize the rendered image to the given width/height (in pixels) keeping the aspect ratio.}

\item{timeout}{A timeout (in seconds) for the render (defaults to 30).}

\item{resource_timeout}{A timeout (in seconds) for individual network requests.}

\item{wait}{Time (in seconds) to wait for updates after page is loaded (defaults to 0).}

\item{render_all}{If \code{TRUE} extend the viewport to include the whole webpage (possibly very tall) before rendering. Default is \code{FALSE}}

\item{proxy}{Proxy profile name or proxy URL.}

\item{js}{Javascript profile name.}

\item{js_src}{JavaScript code to be executed in page context.}

\item{filters}{Comma-separated list of request filter names.}

\item{allowed_domains}{Comma-separated list of allowed domain names. If present, Splash won’t load anything neither from domains not in this list nor from subdomains of domains not in this list.}

\item{allowed_content_types}{Comma-separated list of allowed content types. If present, Splash will abort any request if the response’s content type doesn’t match any of the content types in this list. Wildcards are supported.}

\item{forbidden_content_types}{Comma-separated list of forbidden content types. If present, Splash will abort any request if the response’s content type matches any of the content types in this list. Wildcards are supported.}

\item{viewport}{View width and height (in pixels) of the browser viewport to render the web page. Format is “<width>x<height>”, e.g. 800x600. Default value is 1024x768.}

\item{images}{Whether to download images.}

\item{headers}{HTTP headers to set for the first outgoing request.}

\item{body}{Body of HTTP POST request to be sent if method is POST.}

\item{http_method}{HTTP method of outgoing Splash request.}

\item{save_args}{A list of argument names to put in cache.}

\item{load_args}{Parameter values to load from cache}

\item{html}{Whether to include HTML in output.}

\item{png}{Whether to include PNG in output.}

\item{jpeg}{Whether to include JPEG in output.}

\item{iframes}{Whether to include information about child frames in output.}

\item{script}{Whether to include the result of the custom executed javascript final
statement in output}

\item{console}{Whether to include the executed javascript console messages in output.}

\item{history}{Whether to include the history of requests/responses for webpage main frame.
Use it to get HTTP status codes and headers. Only information about "main"
requests/responses is returned (i.e. information about related resources
like images and AJAX queries is not returned). To get information about all
requests and responses use \code{har} parameter.}

\item{har}{Whether to include HAR in output. If \code{TRUE} the result will contain the same
data as \link{render_har} provides under \code{har} list entry. By default, response
content is not included. To enable it use \code{response_body} parameter.}

\item{response_body}{Used with \code{har} parameter.}
}
\value{
a huge \code{list}
}
\description{
It can include HTML, PNG and other information, based on arguments passed.
}
\note{
All "whether to include..." parameters are default \code{TRUE} except for \code{png} and
\code{jpeg} and a custom \code{print} method is defined to stop your console from being
overwhelmed with data. Use \link{str} to inspect various portions of the result.
}
\references{
\href{http://splash.readthedocs.io/en/stable/index.html}{Splash docs}
}

+ 3
- 0
man/render_png.Rd View File

@@ -55,6 +55,9 @@ render_png(splash_obj, url, base_url, width = 1024, height = 768,

\item{load_args}{Parameter values to load from cache}
}
\value{
a \link{magick} image object
}
\description{
Return an image (in PNG format) of the javascript-rendered page.
}


+ 1
- 1
man/splashr.Rd View File

@@ -6,7 +6,7 @@
\alias{splashr-package}
\title{Tools to Work with the 'Splash' JavaScript Rendering Service}
\description{
'Splash' <https://github.com/scrapinghub/splash> is a javascript rendering service.
'Splash' \url{https://github.com/scrapinghub/splash} is a javascript rendering service.
It’s a lightweight web browser with an 'HTTP' API, implemented in Python using
'Twisted' and 'QT' and provides some of the core functionality of the 'RSelenium' or
'seleniumPipes' R packages but with a Java-free footprint. The (twisted) 'QT' reactor is


Loading…
Cancel
Save