Browse Source

initial commit

master
boB Rudis 7 years ago
commit
5c2b43b154
  1. 8
      .Rbuildignore
  2. 7
      .gitignore
  3. 24
      .travis.yml
  4. 31
      DESCRIPTION
  5. 16
      NAMESPACE
  6. 2
      NEWS.md
  7. 41
      R/render-html.r
  8. 23
      R/render-jpg.r
  9. 23
      R/render-png.r
  10. 33
      R/splashr-package.R
  11. 64
      R/splashr.r
  12. 107
      README.Rmd
  13. 134
      README.md
  14. BIN
      img/cap.jpg
  15. BIN
      img/cap.png
  16. 56
      man/render_html.Rd
  17. 65
      man/render_jpeg.Rd
  18. 61
      man/render_png.Rd
  19. 16
      man/splash.Rd
  20. 18
      man/splash_active.Rd
  21. 18
      man/splash_debug.Rd
  22. 11
      man/splashr-exports.Rd
  23. 21
      man/splashr.Rd
  24. 21
      splashr.Rproj
  25. 2
      tests/test-all.R
  26. 6
      tests/testthat/test-splash.R

8
.Rbuildignore

@ -0,0 +1,8 @@
^.*\.Rproj$
^\.Rproj\.user$
^\.travis\.yml$
^README\.*Rmd$
^README\.*html$
^NOTES\.*Rmd$
^NOTES\.*html$
^img$

7
.gitignore

@ -0,0 +1,7 @@
.Rproj.user
.Rhistory
.RData
.Rproj
src/*.o
src/*.so
src/*.dll

24
.travis.yml

@ -0,0 +1,24 @@
language: r
warnings_are_errors: true
sudo: required
r:
- oldrel
- release
- devel
apt_packages:
- libv8-dev
- xclip
env:
global:
- CRAN: http://cran.rstudio.com
notifications:
email:
- bob@rud.is
irc:
channels:
- "104.236.112.222#builds"
nick: travisci

31
DESCRIPTION

@ -0,0 +1,31 @@
Package: splashr
Type: Package
Title: Tools to Work with the 'Splash' JavaScript Rendering Service
Version: 0.1.0
Date: 2017-02-03
Encoding: UTF-8
Author: Bob Rudis (bob@rud.is)
Maintainer: Bob Rudis <bob@rud.is>
Description: 'Splash' <https://github.com/scrapinghub/splash> is a javascript rendering service.
It’s a lightweight web browser with an 'HTTP' API, implemented in Python using 'Twisted'
and 'QT' and provides some of the core functionality of the 'RSelenium' or 'seleniumPipes'
R packages but with a Java-free footprint. The (twisted) 'QT' reactor is used to make the
server fully asynchronous allowing to take advantage of 'webkit' concurrency via QT main loop.
Some of Splash features include the ability to process multiple webpages in parallel;
retrieving HTML results and/or take screenshots; disabling images or use Adblock Plus rules
to make rendering faster; executing custom JavaScript in page context; getting detailed
rendering info in HAR format.
URL: http://github.com/hrbrmstr/splashr
BugReports: https://github.com/hrbrmstr/splashr/issues
License: AGPL
Suggests:
testthat
Depends:
R (>= 3.2.0)
Imports:
purrr,
httr,
xml2,
jsonlite,
magick
RoxygenNote: 6.0.0

16
NAMESPACE

@ -0,0 +1,16 @@
# Generated by roxygen2: do not edit by hand
S3method(print,splash_debug)
S3method(print,splash_status)
export("%>%")
export(render_html)
export(render_jpeg)
export(render_png)
export(splash)
export(splash_active)
export(splash_debug)
import(httr)
import(magick)
import(purrr)
importFrom(jsonlite,fromJSON)
importFrom(xml2,read_html)

2
NEWS.md

@ -0,0 +1,2 @@
0.1.0
* Initial release

41
R/render-html.r

@ -0,0 +1,41 @@
#' Return the HTML of the javascript-rendered page.
#'
#' Similar to `rvest::read_html`.
#'
#' Only `url`, `timeout`, `resource_timeout`, `wait` and `viewport` are
#' currently sent to the Splash `render.html` endpoint. All other arguments
#' are accepted for forward compatibility but are not yet forwarded to the
#' server.
#'
#' @md
#' @param splash_obj Object created by a call to [splash]
#' @param url The URL to render (required)
#' @param base_url TBD The base url to render the page with.
#' @param timeout TBD A timeout (in seconds) for the render (defaults to 30).
#' @param resource_timeout A timeout (in seconds) for individual network requests.
#' @param wait Time (in seconds) to wait for updates after page is loaded (defaults to 0).
#' @param proxy TBD Proxy profile name or proxy URL.
#' @param js TBD Javascript profile name.
#' @param js_src TBD JavaScript code to be executed in page context.
#' @param filters TBD Comma-separated list of request filter names.
#' @param allowed_domains TBD Comma-separated list of allowed domain names. If present, Splash won’t load anything neither from domains not in this list nor from subdomains of domains not in this list.
#' @param allowed_content_types TBD Comma-separated list of allowed content types. If present, Splash will abort any request if the response’s content type doesn’t match any of the content types in this list. Wildcards are supported.
#' @param forbidden_content_types TBD Comma-separated list of forbidden content types. If present, Splash will abort any request if the response’s content type matches any of the content types in this list. Wildcards are supported.
#' @param viewport View width and height (in pixels) of the browser viewport to render the web page. Format is “<width>x<height>”, e.g. 800x600. Default value is 1024x768.
#' @param images TBD Whether to download images.
#' @param headers TBD HTTP headers to set for the first outgoing request.
#' @param body TBD Body of HTTP POST request to be sent if method is POST.
#' @param http_method TBD HTTP method of outgoing Splash request.
#' @param save_args TBD A list of argument names to put in cache.
#' @param load_args TBD Parameter values to load from cache
#' @return The rendered page parsed with `xml2::read_html()`.
#' @export
render_html <- function(splash_obj, url, base_url, timeout=30, resource_timeout=NULL, wait=0,
                        proxy, js, js_src, filters, allowed_domains="", allowed_content_types="",
                        forbidden_content_types="", viewport="1024x768", images, headers, body,
                        http_method, save_args, load_args) {

  params <- list(url=url, timeout=timeout, wait=wait, viewport=viewport)

  # resource_timeout was previously accepted but silently dropped; only send
  # it when the caller actually supplied a value.
  if (!is.null(resource_timeout)) params$resource_timeout <- resource_timeout

  # All render options travel in the query string of the GET request.
  res <- httr::GET(splash_url(splash_obj), path="render.html", query=params)

  # Turn HTTP error statuses into R errors before attempting to parse.
  httr::stop_for_status(res)

  xml2::read_html(httr::content(res, as="text", encoding="UTF-8"))

}

23
R/render-jpg.r

@ -0,0 +1,23 @@
#' Return an image (in JPEG format) of the javascript-rendered page.
#'
#' Only `url`, `timeout`, `resource_timeout`, `wait`, `viewport`, `quality`,
#' `width`, `height` and `render_all` are currently sent to the Splash
#' `render.jpeg` endpoint. All other arguments are accepted for forward
#' compatibility but are not yet forwarded to the server.
#'
#' @md
#' @param quality JPEG quality parameter in range from 0 to 100. Default is quality=75.
#' @inheritParams render_html
#' @inheritParams render_png
#' @return The rendered image as read by `magick::image_read()`.
#' @export
render_jpeg <- function(splash_obj, url, base_url=NULL, quality=75, width=1024, height=768,
                        timeout=30, resource_timeout=NULL, wait=0, render_all=FALSE,
                        proxy, js, js_src, filters, allowed_domains="", allowed_content_types="",
                        forbidden_content_types="", viewport="1024x768", images, headers, body,
                        http_method, save_args, load_args) {

  # render_all is sent as 0/1 rather than as an R logical.
  params <- list(url=url, timeout=timeout, wait=wait, viewport=viewport,
                 quality=quality, width=width, height=height,
                 render_all=as.numeric(render_all))

  # resource_timeout was previously accepted but silently dropped; only send
  # it when the caller actually supplied a value.
  if (!is.null(resource_timeout)) params$resource_timeout <- resource_timeout

  # All render options travel in the query string of the GET request.
  res <- httr::GET(splash_url(splash_obj), path="render.jpeg", query=params)

  # Turn HTTP error statuses into R errors before decoding the image bytes.
  httr::stop_for_status(res)

  magick::image_read(httr::content(res, as="raw"))

}

23
R/render-png.r

@ -0,0 +1,23 @@
#' Return an image (in PNG format) of the javascript-rendered page.
#'
#' Only `url`, `timeout`, `resource_timeout`, `wait`, `viewport`, `width`,
#' `height` and `render_all` are currently sent to the Splash `render.png`
#' endpoint. All other arguments are accepted for forward compatibility but
#' are not yet forwarded to the server.
#'
#' @md
#' @param width,height Resize the rendered image to the given width/height (in pixels) keeping the aspect ratio.
#' @param render_all If `TRUE` extend the viewport to include the whole webpage (possibly very tall) before rendering. Default is `FALSE`
#' @inheritParams render_html
#' @return The rendered image as read by `magick::image_read()`.
#' @export
render_png <- function(splash_obj, url, base_url=NULL, width=1024, height=768, render_all=FALSE,
                       timeout=30, resource_timeout=NULL, wait=0,
                       proxy, js, js_src, filters, allowed_domains="", allowed_content_types="",
                       forbidden_content_types="", viewport="1024x768", images, headers, body,
                       http_method, save_args, load_args) {

  # render_all is sent as 0/1 rather than as an R logical.
  params <- list(url=url, timeout=timeout, wait=wait, viewport=viewport,
                 width=width, height=height,
                 render_all=as.numeric(render_all))

  # resource_timeout was previously accepted but silently dropped; only send
  # it when the caller actually supplied a value.
  if (!is.null(resource_timeout)) params$resource_timeout <- resource_timeout

  # All render options travel in the query string of the GET request.
  res <- httr::GET(splash_url(splash_obj), path="render.png", query=params)

  # Turn HTTP error statuses into R errors before decoding the image bytes.
  httr::stop_for_status(res)

  magick::image_read(httr::content(res, as="raw"))

}

33
R/splashr-package.R

@ -0,0 +1,33 @@
#' Tools to Work with the 'Splash' JavaScript Rendering Service
#'
#' 'Splash' <https://github.com/scrapinghub/splash> is a javascript rendering service.
#' It’s a lightweight web browser with an 'HTTP' API, implemented in Python using
#' 'Twisted' and 'QT' and provides some of the core functionality of the 'RSelenium' or
#' 'seleniumPipes' R packages but with a Java-free footprint. The (twisted) 'QT' reactor is
#' used to make the server fully asynchronous allowing to take advantage of 'webkit'
#' concurrency via QT main loop. Some of Splash features include the ability to process
#' multiple webpages in parallel; retrieving HTML results and/or take screenshots;
#' disabling images or use Adblock Plus rules to make rendering faster; executing custom
#' JavaScript in page context; getting detailed rendering info in HAR format.
#'
#' @name splashr
#' @docType package
#' @author Bob Rudis (bob@@rud.is)
#' @import purrr httr magick
#' @importFrom xml2 read_html
#' @importFrom jsonlite fromJSON
NULL
#' splashr exported operators
#'
#' The following functions are imported and then re-exported
#' from the splashr package to enable use of the magrittr
#' pipe operator with no additional library calls
#'
#' @name splashr-exports
NULL
#' @name %>%
#' @export
#' @rdname splashr-exports
NULL

64
R/splashr.r

@ -0,0 +1,64 @@
# Internal helper: build the base "http://host:port" URL for a Splash
# connection object (a list carrying `host` and `port` elements).
splash_url <- function(splash_obj) {
  server_host <- splash_obj$host
  server_port <- splash_obj$port
  sprintf("http://%s:%s", server_host, server_port)
}
#' Configure parameters for connecting to a Splash server
#'
#' No network request is made; the host and port are simply bundled into a
#' list that the other functions in this package accept as their first
#' argument.
#'
#' @param host host or IP address
#' @param port port the server is running on (default is 8050)
#' @return A list with elements \code{host} and \code{port}.
#' @export
splash <- function(host, port=8050L) {
  conn <- list(host, port)
  names(conn) <- c("host", "port")
  conn
}
#' Test if a Splash server is up
#'
#' Requests the server's '_ping' path, raises an error on an HTTP error
#' status, and returns the parsed JSON reply tagged with the server URL.
#'
#' @param splash_obj A splash connection object
#' @export
splash_active <- function(splash_obj) {

  server_url <- splash_url(splash_obj)

  ping_res <- httr::GET(server_url, path="_ping")
  httr::stop_for_status(ping_res)

  ping_txt <- httr::content(ping_res, as="text", encoding="UTF-8")
  out <- jsonlite::fromJSON(ping_txt)

  # Remember which server answered so the print method can report it.
  out$url <- server_url
  class(out) <- c("splash_status", class(out))

  out

}
#' @rdname splash_active
#' @keywords internal
#' @export
print.splash_status <- function(x, ...) {
  # One-line summary: server URL, reported status and max RSS.
  status_line <- sprintf(
    "Status of splash instance on [%s]: %s. Max RSS: %s\n",
    x$url, x$status, x$maxrss
  )
  cat(status_line)
  invisible(x)
}
#' Retrieve debug-level info for a Splash server
#'
#' Requests the server's '_debug' path, raises an error on an HTTP error
#' status, and returns the parsed JSON reply tagged with the server URL.
#'
#' @param splash_obj A splash connection object
#' @export
splash_debug <- function(splash_obj) {

  server_url <- splash_url(splash_obj)

  debug_res <- httr::GET(server_url, path="_debug")
  httr::stop_for_status(debug_res)

  debug_txt <- httr::content(debug_res, as="text", encoding="UTF-8")
  out <- jsonlite::fromJSON(debug_txt)

  # Remember which server answered alongside the debug fields.
  out$url <- server_url
  class(out) <- c("splash_debug", class(out))

  out

}
#' @rdname splash_debug
#' @keywords internal
#' @export
print.splash_debug <- function(x, ...) {
  # str() writes the structure itself and returns NULL invisibly, so wrapping
  # it in print() emitted a stray "NULL" line after the listing.
  str(x)
  invisible(x)
}

107
README.Rmd

@ -0,0 +1,107 @@
---
output: rmarkdown::github_document
---
`splashr` : Tools to Work with the 'Splash' JavaScript Rendering Service
**Ridiculously basic functionality working at the moment. More coming soon**
TL;DR: This package works with Splash rendering servers which are really just a REST API & `lua` scripting interface to a QT browser. It's an alternative to the Selenium ecosystem and does not do everything Selenium can, but if you're just trying to get a page back that needs javascript rendering, this is a nice alternative.
You can also get it running with two commands:
sudo docker pull scrapinghub/splash
sudo docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 scrapinghub/splash
(Do whatever you Windows ppl do with Docker on your systems to make ^^ work.)
All you need for this package to work is a running Splash instance. You provide the host/port for it and it's scrape-tastic from there.
### About Splash
>'Splash' <https://github.com/scrapinghub/splash> is a javascript rendering service. It’s a lightweight web browser with an 'HTTP' API, implemented in Python using 'Twisted' and 'QT' and provides some of the core functionality of the 'RSelenium' or 'seleniumPipes' R packages but with a Java-free footprint. The (twisted) 'QT' reactor is used to make the server fully asynchronous allowing to take advantage of 'webkit' concurrency via QT main loop. Some of Splash features include the ability to process multiple webpages in parallel; retrieving HTML results and/or take screenshots; disabling images or use Adblock Plus rules to make rendering faster; executing custom JavaScript in page context; getting detailed rendering info in HAR format.
The following functions are implemented:
- `render_html`: Return the HTML of the javascript-rendered page.
- `render_jpeg`: Return an image (in JPEG format) of the javascript-rendered page.
- `render_png`: Return an image (in PNG format) of the javascript-rendered page.
- `splash`: Configure parameters for connecting to a Splash server
- `splashr`: Tools to Work with the 'Splash' JavaScript Rendering Service
### Installation
```{r eval=FALSE}
devtools::install_github("hrbrmstr/splashr")
```
```{r message=FALSE, warning=FALSE, error=FALSE}
options(width=120)
```
### Usage
```{r message=FALSE, warning=FALSE, error=FALSE}
library(splashr)
library(magick)
library(rvest)
# current version
packageVersion("splashr")
splash("splash", 8050L) %>%
splash_active()
splash("splash", 8050L) %>%
splash_debug()
```
Notice the difference between a rendered HTML scrape and a non-rendered one:
```{r}
splash("splash", 8050L) %>%
render_html("http://marvel.com/universe/Captain_America_(Steve_Rogers)")
read_html("http://marvel.com/universe/Captain_America_(Steve_Rogers)")
```
Web page snapshots are easy-peasy too:
```{r eval=FALSE}
splash("splash", 8050L) %>%
render_png("http://marvel.com/universe/Captain_America_(Steve_Rogers)")
```
```{r eval=TRUE, include=FALSE}
splash("splash", 8050L) %>%
render_png("http://marvel.com/universe/Captain_America_(Steve_Rogers)") %>%
image_write("img/cap.png")
```
![](img/cap.png)
```{r eval=FALSE}
splash("splash", 8050L) %>%
render_jpeg("http://marvel.com/universe/Captain_America_(Steve_Rogers)")
```
```{r eval=TRUE, include=FALSE}
splash("splash", 8050L) %>%
render_jpeg("http://marvel.com/universe/Captain_America_(Steve_Rogers)") %>%
image_write("img/cap.jpg")
```
![](img/cap.jpg)
### Test Results
```{r message=FALSE, warning=FALSE, error=FALSE}
library(splashr)
library(testthat)
date()
test_dir("tests/")
```
```{r eval = FALSE, include = FALSE}

134
README.md

@ -0,0 +1,134 @@
`splashr` : Tools to Work with the 'Splash' JavaScript Rendering Service
**Ridiculously basic functionality working at the moment. More coming soon**
TL;DR: This package works with Splash rendering servers which are really just a REST API & `lua` scripting interface to a QT browser. It's an alternative to the Selenium ecosystem and does not do everything Selenium can, but if you're just trying to get a page back that needs javascript rendering, this is a nice alternative.
You can also get it running with two commands:
sudo docker pull scrapinghub/splash
sudo docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 scrapinghub/splash
(Do whatever you Windows ppl do with Docker on your systems to make ^^ work.)
All you need for this package to work is a running Splash instance. You provide the host/port for it and it's scrape-tastic from there.
### About Splash
> 'Splash' <https://github.com/scrapinghub/splash> is a javascript rendering service. It’s a lightweight web browser with an 'HTTP' API, implemented in Python using 'Twisted' and 'QT' and provides some of the core functionality of the 'RSelenium' or 'seleniumPipes' R packages but with a Java-free footprint. The (twisted) 'QT' reactor is used to make the server fully asynchronous allowing to take advantage of 'webkit' concurrency via QT main loop. Some of Splash features include the ability to process multiple webpages in parallel; retrieving HTML results and/or take screenshots; disabling images or use Adblock Plus rules to make rendering faster; executing custom JavaScript in page context; getting detailed rendering info in HAR format.
The following functions are implemented:
- `render_html`: Return the HTML of the javascript-rendered page.
- `render_jpeg`: Return an image (in JPEG format) of the javascript-rendered page.
- `render_png`: Return an image (in PNG format) of the javascript-rendered page.
- `splash`: Configure parameters for connecting to a Splash server
- `splashr`: Tools to Work with the 'Splash' JavaScript Rendering Service
### Installation
``` r
devtools::install_github("hrbrmstr/splashr")
```
``` r
options(width=120)
```
### Usage
``` r
library(splashr)
library(magick)
library(rvest)
# current version
packageVersion("splashr")
```
## [1] '0.1.0'
``` r
splash("splash", 8050L) %>%
splash_active()
```
## Status of splash instance on [http://splash:8050]: ok. Max RSS: 349298688
``` r
splash("splash", 8050L) %>%
splash_debug()
```
## List of 7
## $ active : list()
## $ argcache: int 0
## $ fds : int 18
## $ leaks :List of 4
## ..$ Deferred : int 50
## ..$ LuaRuntime: int 1
## ..$ QTimer : int 1
## ..$ Request : int 1
## $ maxrss : int 341112
## $ qsize : int 0
## $ url : chr "http://splash:8050"
## - attr(*, "class")= chr [1:2] "splash_debug" "list"
## NULL
Notice the difference between a rendered HTML scrape and a non-rendered one:
``` r
splash("splash", 8050L) %>%
render_html("http://marvel.com/universe/Captain_America_(Steve_Rogers)")
```
## {xml_document}
## <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
## [1] <head>\n<script src="http://widget-cdn.rpxnow.com/manifest/login?version=1.114.1_widgets_244" type="text/javascri ...
## [2] <body id="index-index" class="index-index" onload="findLinks('myLink');">\n\n\t<div id="page_frame" style="overfl ...
``` r
read_html("http://marvel.com/universe/Captain_America_(Steve_Rogers)")
```
## {xml_document}
## <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="X-UA-Compatible" content="IE=Edge">\n<link href="https://plus.google.com/10852333737344 ...
## [2] <body id="index-index" class="index-index" onload="findLinks('myLink');">\n\n\t<div id="page_frame" style="overfl ...
Web page snapshots are easy-peasy too:
``` r
splash("splash", 8050L) %>%
render_png("http://marvel.com/universe/Captain_America_(Steve_Rogers)")
```
![](img/cap.png)
``` r
splash("splash", 8050L) %>%
render_jpeg("http://marvel.com/universe/Captain_America_(Steve_Rogers)")
```
![](img/cap.jpg)
### Test Results
``` r
library(splashr)
library(testthat)
date()
```
## [1] "Fri Feb 3 14:58:40 2017"
``` r
test_dir("tests/")
```
## testthat results ========================================================================================================
## OK: 0 SKIPPED: 0 FAILED: 0
##
## DONE ===================================================================================================================

BIN
img/cap.jpg

Binary file not shown.

After

Width:  |  Height:  |  Size: 125 KiB

BIN
img/cap.png

Binary file not shown.

After

Width:  |  Height:  |  Size: 438 KiB

56
man/render_html.Rd

@ -0,0 +1,56 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/render-html.r
\name{render_html}
\alias{render_html}
\title{Return the HTML of the javascript-rendered page.}
\usage{
render_html(splash_obj, url, base_url, timeout = 30,
resource_timeout = NULL, wait = 0, proxy, js, js_src, filters,
allowed_domains = "", allowed_content_types = "",
forbidden_content_types = "", viewport = "1024x768", images, headers,
body, http_method, save_args, load_args)
}
\arguments{
\item{splash_obj}{Object created by a call to \link{splash}}
\item{url}{The URL to render (required)}
\item{base_url}{TBD The base url to render the page with.}
\item{timeout}{TBD A timeout (in seconds) for the render (defaults to 30).}
\item{resource_timeout}{A timeout (in seconds) for individual network requests.}
\item{wait}{Time (in seconds) to wait for updates after page is loaded (defaults to 0).}
\item{proxy}{TBD Proxy profile name or proxy URL.}
\item{js}{TBD Javascript profile name.}
\item{js_src}{TBD JavaScript code to be executed in page context.}
\item{filters}{TBD Comma-separated list of request filter names.}
\item{allowed_domains}{TBD Comma-separated list of allowed domain names. If present, Splash won’t load anything neither from domains not in this list nor from subdomains of domains not in this list.}
\item{allowed_content_types}{TBD Comma-separated list of allowed content types. If present, Splash will abort any request if the response’s content type doesn’t match any of the content types in this list. Wildcards are supported.}
\item{forbidden_content_types}{TBD Comma-separated list of forbidden content types. If present, Splash will abort any request if the response’s content type matches any of the content types in this list. Wildcards are supported.}
\item{viewport}{View width and height (in pixels) of the browser viewport to render the web page. Format is “<width>x<height>”, e.g. 800x600. Default value is 1024x768.}
\item{images}{TBD Whether to download images.}
\item{headers}{TBD HTTP headers to set for the first outgoing request.}
\item{body}{TBD Body of HTTP POST request to be sent if method is POST.}
\item{http_method}{TBD HTTP method of outgoing Splash request.}
\item{save_args}{TBD A list of argument names to put in cache.}
\item{load_args}{TBD Parameter values to load from cache}
}
\description{
Similar to \code{rvest::read_html}.
}

65
man/render_jpeg.Rd

@ -0,0 +1,65 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/render-jpg.r
\name{render_jpeg}
\alias{render_jpeg}
\title{Return an image (in JPEG format) of the javascript-rendered page.}
\usage{
render_jpeg(splash_obj, url, base_url = NULL, quality = 75, width = 1024,
height = 768, timeout = 30, resource_timeout = NULL, wait = 0,
render_all = FALSE, proxy, js, js_src, filters, allowed_domains = "",
allowed_content_types = "", forbidden_content_types = "",
viewport = "1024x768", images, headers, body, http_method, save_args,
load_args)
}
\arguments{
\item{splash_obj}{Object created by a call to \link{splash}}
\item{url}{The URL to render (required)}
\item{base_url}{TBD The base url to render the page with.}
\item{quality}{JPEG quality parameter in range from 0 to 100. Default is quality=75.}
\item{width}{Resize the rendered image to the given width/height (in pixels) keeping the aspect ratio.}
\item{height}{Resize the rendered image to the given width/height (in pixels) keeping the aspect ratio.}
\item{timeout}{TBD A timeout (in seconds) for the render (defaults to 30).}
\item{resource_timeout}{A timeout (in seconds) for individual network requests.}
\item{wait}{Time (in seconds) to wait for updates after page is loaded (defaults to 0).}
\item{render_all}{If \code{TRUE} extend the viewport to include the whole webpage (possibly very tall) before rendering. Default is \code{FALSE}}
\item{proxy}{TBD Proxy profile name or proxy URL.}
\item{js}{TBD Javascript profile name.}
\item{js_src}{TBD JavaScript code to be executed in page context.}
\item{filters}{TBD Comma-separated list of request filter names.}
\item{allowed_domains}{TBD Comma-separated list of allowed domain names. If present, Splash won’t load anything neither from domains not in this list nor from subdomains of domains not in this list.}
\item{allowed_content_types}{TBD Comma-separated list of allowed content types. If present, Splash will abort any request if the response’s content type doesn’t match any of the content types in this list. Wildcards are supported.}
\item{forbidden_content_types}{TBD Comma-separated list of forbidden content types. If present, Splash will abort any request if the response’s content type matches any of the content types in this list. Wildcards are supported.}
\item{viewport}{View width and height (in pixels) of the browser viewport to render the web page. Format is “<width>x<height>”, e.g. 800x600. Default value is 1024x768.}
\item{images}{TBD Whether to download images.}
\item{headers}{TBD HTTP headers to set for the first outgoing request.}
\item{body}{TBD Body of HTTP POST request to be sent if method is POST.}
\item{http_method}{TBD HTTP method of outgoing Splash request.}
\item{save_args}{TBD A list of argument names to put in cache.}
\item{load_args}{TBD Parameter values to load from cache}
}
\description{
Return an image (in JPEG format) of the javascript-rendered page.
}

61
man/render_png.Rd

@ -0,0 +1,61 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/render-png.r
\name{render_png}
\alias{render_png}
\title{Return an image (in PNG format) of the javascript-rendered page.}
\usage{
render_png(splash_obj, url, base_url = NULL, width = 1024, height = 768,
render_all = FALSE, timeout = 30, resource_timeout = NULL, wait = 0,
proxy, js, js_src, filters, allowed_domains = "",
allowed_content_types = "", forbidden_content_types = "",
viewport = "1024x768", images, headers, body, http_method, save_args,
load_args)
}
\arguments{
\item{splash_obj}{Object created by a call to \link{splash}}
\item{url}{The URL to render (required)}
\item{base_url}{TBD The base url to render the page with.}
\item{width, height}{Resize the rendered image to the given width/height (in pixels) keeping the aspect ratio.}
\item{render_all}{If \code{TRUE} extend the viewport to include the whole webpage (possibly very tall) before rendering. Default is \code{FALSE}}
\item{timeout}{TBD A timeout (in seconds) for the render (defaults to 30).}
\item{resource_timeout}{A timeout (in seconds) for individual network requests.}
\item{wait}{Time (in seconds) to wait for updates after page is loaded (defaults to 0).}
\item{proxy}{TBD Proxy profile name or proxy URL.}
\item{js}{TBD Javascript profile name.}
\item{js_src}{TBD JavaScript code to be executed in page context.}
\item{filters}{TBD Comma-separated list of request filter names.}
\item{allowed_domains}{TBD Comma-separated list of allowed domain names. If present, Splash won’t load anything neither from domains not in this list nor from subdomains of domains not in this list.}
\item{allowed_content_types}{TBD Comma-separated list of allowed content types. If present, Splash will abort any request if the response’s content type doesn’t match any of the content types in this list. Wildcards are supported.}
\item{forbidden_content_types}{TBD Comma-separated list of forbidden content types. If present, Splash will abort any request if the response’s content type matches any of the content types in this list. Wildcards are supported.}
\item{viewport}{View width and height (in pixels) of the browser viewport to render the web page. Format is “<width>x<height>”, e.g. 800x600. Default value is 1024x768.}
\item{images}{TBD Whether to download images.}
\item{headers}{TBD HTTP headers to set for the first outgoing request.}
\item{body}{TBD Body of HTTP POST request to be sent if method is POST.}
\item{http_method}{TBD HTTP method of outgoing Splash request.}
\item{save_args}{TBD A list of argument names to put in cache.}
\item{load_args}{TBD Parameter values to load from cache}
}
\description{
Return an image (in PNG format) of the javascript-rendered page.
}

16
man/splash.Rd

@ -0,0 +1,16 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/splashr.r
\name{splash}
\alias{splash}
\title{Configure parameters for connecting to a Splash server}
\usage{
splash(host, port = 8050L)
}
\arguments{
\item{host}{host or IP address}
\item{port}{port the server is running on (default is 8050)}
}
\description{
Configure parameters for connecting to a Splash server
}

18
man/splash_active.Rd

@ -0,0 +1,18 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/splashr.r
\name{splash_active}
\alias{splash_active}
\alias{print.splash_status}
\title{Test if a Splash server is up}
\usage{
splash_active(splash_obj)
\method{print}{splash_status}(x, ...)
}
\arguments{
\item{splash_obj}{A splash connection object}
}
\description{
Test if a Splash server is up
}
\keyword{internal}

18
man/splash_debug.Rd

@ -0,0 +1,18 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/splashr.r
\name{splash_debug}
\alias{splash_debug}
\alias{print.splash_debug}
\title{Retrieve debug-level info for a Splash server}
\usage{
splash_debug(splash_obj)
\method{print}{splash_debug}(x, ...)
}
\arguments{
\item{splash_obj}{A splash connection object}
}
\description{
Retrieve debug-level info for a Splash server
}
\keyword{internal}

11
man/splashr-exports.Rd

@ -0,0 +1,11 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/splashr-package.R
\name{splashr-exports}
\alias{splashr-exports}
\alias{\%>\%}
\title{splashr exported operators}
\description{
The following functions are imported and then re-exported
from the splashr package to enable use of the magrittr
pipe operator with no additional library calls
}

21
man/splashr.Rd

@ -0,0 +1,21 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/splashr-package.R
\docType{package}
\name{splashr}
\alias{splashr}
\alias{splashr-package}
\title{Tools to Work with the 'Splash' JavaScript Rendering Service}
\description{
'Splash' <https://github.com/scrapinghub/splash> is a javascript rendering service.
It’s a lightweight web browser with an 'HTTP' API, implemented in Python using
'Twisted' and 'QT' and provides some of the core functionality of the 'RSelenium' or
'seleniumPipes' R packages but with a Java-free footprint. The (twisted) 'QT' reactor is
used to make the server fully asynchronous allowing to take advantage of 'webkit'
concurrency via QT main loop. Some of Splash features include the ability to process
multiple webpages in parallel; retrieving HTML results and/or take screenshots;
disabling images or use Adblock Plus rules to make rendering faster; executing custom
JavaScript in page context; getting detailed rendering info in HAR format.
}
\author{
Bob Rudis (bob@rud.is)
}

21
splashr.Rproj

@ -0,0 +1,21 @@
Version: 1.0
RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX
StripTrailingWhitespace: Yes
BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageBuildArgs: --resave-data
PackageRoxygenize: rd,collate,namespace

2
tests/test-all.R

@ -0,0 +1,2 @@
# Test runner script: attaches testthat and hands the "splashr" package
# name to test_check() to run the package's test suite.
library(testthat)
test_check("splashr")

6
tests/testthat/test-splash.R

@ -0,0 +1,6 @@
context("basic functionality")

# The previous placeholder test had no expectations, so the suite reported
# "OK: 0". This check runs fully offline: splash() only bundles its arguments
# and never contacts a server.
test_that("splash() builds a connection object without a running server", {
  conn <- splash("localhost")
  expect_true(is.list(conn))
  expect_equal(conn$host, "localhost")
  expect_equal(conn$port, 8050L)

  custom <- splash("10.0.0.1", 9000L)
  expect_equal(custom$host, "10.0.0.1")
  expect_equal(custom$port, 9000L)
})
Loading…
Cancel
Save