Browse Source

initial commit

boB Rudis 2 years ago
commit
5c2b43b154
26 changed files with 808 additions and 0 deletions
  1. 8
    0
      .Rbuildignore
  2. 7
    0
      .gitignore
  3. 24
    0
      .travis.yml
  4. 31
    0
      DESCRIPTION
  5. 16
    0
      NAMESPACE
  6. 2
    0
      NEWS.md
  7. 41
    0
      R/render-html.r
  8. 23
    0
      R/render-jpg.r
  9. 23
    0
      R/render-png.r
  10. 33
    0
      R/splashr-package.R
  11. 64
    0
      R/splashr.r
  12. 107
    0
      README.Rmd
  13. 134
    0
      README.md
  14. BIN
      img/cap.jpg
  15. BIN
      img/cap.png
  16. 56
    0
      man/render_html.Rd
  17. 65
    0
      man/render_jpeg.Rd
  18. 61
    0
      man/render_png.Rd
  19. 16
    0
      man/splash.Rd
  20. 18
    0
      man/splash_active.Rd
  21. 18
    0
      man/splash_debug.Rd
  22. 11
    0
      man/splashr-exports.Rd
  23. 21
    0
      man/splashr.Rd
  24. 21
    0
      splashr.Rproj
  25. 2
    0
      tests/test-all.R
  26. 6
    0
      tests/testthat/test-splash.R

+ 8
- 0
.Rbuildignore View File

@@ -0,0 +1,8 @@
1
+^.*\.Rproj$
2
+^\.Rproj\.user$
3
+^\.travis\.yml$
4
+^README\.*Rmd$
5
+^README\.*html$
6
+^NOTES\.*Rmd$
7
+^NOTES\.*html$
8
+^img$

+ 7
- 0
.gitignore View File

@@ -0,0 +1,7 @@
1
+.Rproj.user
2
+.Rhistory
3
+.RData
4
+.Rproj
5
+src/*.o
6
+src/*.so
7
+src/*.dll

+ 24
- 0
.travis.yml View File

@@ -0,0 +1,24 @@
1
+language: r
2
+warnings_are_errors: true
3
+sudo: required
4
+
5
+r:
6
+ - oldrel
7
+ - release
8
+ - devel
9
+
10
+apt_packages:
11
+  - libv8-dev
12
+  - xclip
13
+
14
+env:
15
+ global:
16
+   - CRAN: http://cran.rstudio.com
17
+
18
+notifications:
19
+  email:
20
+    - bob@rud.is
21
+  irc:
22
+    channels:
23
+      - "104.236.112.222#builds"
24
+    nick: travisci

+ 31
- 0
DESCRIPTION View File

@@ -0,0 +1,31 @@
1
+Package: splashr
2
+Type: Package
3
+Title: Tools to Work with the 'Splash' JavaScript Rendering Service
4
+Version: 0.1.0
5
+Date: 2017-02-03
6
+Encoding: UTF-8
7
+Author: Bob Rudis (bob@rud.is)
8
+Maintainer: Bob Rudis <bob@rud.is>
9
+Description: 'Splash' <https://github.com/scrapinghub/splash> is a javascript rendering service.
10
+    It’s a lightweight web browser with an 'HTTP' API, implemented in Python using 'Twisted' 
11
+    and 'QT' and provides some of the core functionality of the 'RSelenium' or 'seleniumPipes'
12
+    R packages but with a Java-free footprint. The (twisted) 'QT' reactor is used to make the 
13
+    server fully asynchronous allowing to take advantage of 'webkit' concurrency via QT main loop. 
14
+    Some of Splash features include the ability to process multiple webpages in parallel; 
15
+    retrieving HTML results and/or take screenshots; disabling images or use Adblock Plus rules 
16
+    to make rendering faster; executing custom JavaScript in page context; getting detailed
17
+    rendering info in HAR format.
18
+URL: http://github.com/hrbrmstr/splashr
19
+BugReports: https://github.com/hrbrmstr/splashr/issues
20
+License: AGPL
21
+Suggests:
22
+    testthat
23
+Depends:
24
+    R (>= 3.2.0)
25
+Imports:
26
+    purrr,
27
+    httr,
28
+    xml2,
29
+    jsonlite,
30
+    magick
31
+RoxygenNote: 6.0.0

+ 16
- 0
NAMESPACE View File

@@ -0,0 +1,16 @@
1
+# Generated by roxygen2: do not edit by hand
2
+
3
+S3method(print,splash_debug)
4
+S3method(print,splash_status)
5
+export("%>%")
6
+export(render_html)
7
+export(render_jpeg)
8
+export(render_png)
9
+export(splash)
10
+export(splash_active)
11
+export(splash_debug)
12
+import(httr)
13
+import(magick)
14
+import(purrr)
15
+importFrom(jsonlite,fromJSON)
16
+importFrom(xml2,read_html)

+ 2
- 0
NEWS.md View File

@@ -0,0 +1,2 @@
1
+0.1.0 
2
+* Initial release

+ 41
- 0
R/render-html.r View File

@@ -0,0 +1,41 @@
1
+#' Return the HTML of the javascript-rendered page.
2
+#'
3
+#' Similar to `rvest::read_html`.
4
+#'
5
+#' @md
6
+#' @param splash_obj Object created by a call to [splash]
7
+#' @param url The URL to render (required)
8
+#' @param base_url TBD The base url to render the page with.
9
+#' @param timeout TBD A timeout (in seconds) for the render (defaults to 30).
10
+#' @param resource_timeout A timeout (in seconds) for individual network requests.
11
+#' @param wait Time (in seconds) to wait for updates after page is loaded (defaults to 0).
12
+#' @param proxy TBD Proxy profile name or proxy URL.
13
+#' @param js TBD Javascript profile name.
14
+#' @param js_src TBD JavaScript code to be executed in page context.
15
+#' @param filters TBD Comma-separated list of request filter names.
16
+#' @param allowed_domains TBD Comma-separated list of allowed domain names. If present, Splash won’t load anything neither from domains not in this list nor from subdomains of domains not in this list.
17
+#' @param allowed_content_types TBD Comma-separated list of allowed content types. If present, Splash will abort any request if the response’s content type doesn’t match any of the content types in this list. Wildcards are supported.
18
+#' @param forbidden_content_types TBD Comma-separated list of forbidden content types. If present, Splash will abort any request if the response’s content type matches any of the content types in this list. Wildcards are supported.
19
+#' @param viewport View width and height (in pixels) of the browser viewport to render the web page. Format is “<width>x<height>”, e.g. 800x600. Default value is 1024x768.
20
+#' @param images TBD Whether to download images.
21
+#' @param headers TBD HTTP headers to set for the first outgoing request.
22
+#' @param body TBD Body of HTTP POST request to be sent if method is POST.
23
+#' @param http_method TBD HTTP method of outgoing Splash request.
24
+#' @param save_args TBD A list of argument names to put in cache.
25
+#' @param load_args TBD Parameter values to load from cache
26
+#' @export
27
+render_html <- function(splash_obj, url, base_url, timeout=30, resource_timeout=NULL, wait=0,
28
+                        proxy, js, js_src, filters, allowed_domains="", allowed_content_types="",
29
+                        forbidden_content_types="", viewport="1024x768", images, headers, body,
30
+                        http_method, save_args, load_args) {
31
+
32
+  res <- httr::GET(splash_url(splash_obj), path="render.html",
33
+                   encode="json",
34
+                   query=list(url=url, timeout=timeout, wait=wait, viewport=viewport))
35
+
36
+  httr::stop_for_status(res)
37
+
38
+  httr::content(res, as="text", encoding="UTF-8") %>%
39
+    xml2::read_html()
40
+
41
+}

+ 23
- 0
R/render-jpg.r View File

@@ -0,0 +1,23 @@
1
+#' Return an image (in JPEG format) of the javascript-rendered page.
2
+#'
3
+#' @md
4
+#' @param quality JPEG quality parameter in range from 0 to 100. Default is quality=75.
5
+#' @inheritParams render_html
6
+#' @inheritParams render_png
7
+#' @export
8
+render_jpeg <- function(splash_obj, url, base_url=NULL, quality=75, width=1024, height=768,
9
+                        timeout=30, resource_timeout=NULL, wait=0, render_all=FALSE,
10
+                        proxy, js, js_src, filters, allowed_domains="", allowed_content_types="",
11
+                        forbidden_content_types="", viewport="1024x768", images, headers, body,
12
+                        http_method, save_args, load_args) {
13
+
14
+  res <- httr::GET(splash_url(splash_obj), path="render.jpeg",
15
+                   encode="json",
16
+                   query=list(url=url, timeout=timeout, wait=wait, viewport=viewport,
17
+                              quality=quality, width=width, height=height, render_all=as.numeric(render_all)))
18
+
19
+  httr::stop_for_status(res)
20
+
21
+  magick::image_read(httr::content(res, as="raw"))
22
+
23
+}

+ 23
- 0
R/render-png.r View File

@@ -0,0 +1,23 @@
1
+#' Return an image (in PNG format) of the javascript-rendered page.
2
+#'
3
+#' @md
4
+#' @param width,height Resize the rendered image to the given width/height (in pixels) keeping the aspect ratio.
5
+#' @param render_all If `TRUE` extend the viewport to include the whole webpage (possibly very tall) before rendering. Default is `FALSE`
6
+#' @inheritParams render_html
7
+#' @export
8
+render_png <- function(splash_obj, url, base_url=NULL, width=1024, height=768, render_all=FALSE,
9
+                       timeout=30, resource_timeout=NULL, wait=0,
10
+                       proxy, js, js_src, filters, allowed_domains="", allowed_content_types="",
11
+                       forbidden_content_types="", viewport="1024x768", images, headers, body,
12
+                       http_method, save_args, load_args) {
13
+
14
+  res <- httr::GET(splash_url(splash_obj), path="render.png",
15
+                   encode="json",
16
+                   query=list(url=url, timeout=timeout, wait=wait, viewport=viewport,
17
+                              width=width, height=height, render_all=as.numeric(render_all)))
18
+
19
+  httr::stop_for_status(res)
20
+
21
+  magick::image_read(httr::content(res, as="raw"))
22
+
23
+}

+ 33
- 0
R/splashr-package.R View File

@@ -0,0 +1,33 @@
1
+#' Tools to Work with the 'Splash' JavaScript Rendering Service
2
+#'
3
+#' 'Splash' <https://github.com/scrapinghub/splash> is a javascript rendering service.
4
+#' It’s a lightweight web browser with an 'HTTP' API, implemented in Python using
5
+#' 'Twisted' and 'QT' and provides some of the core functionality of the 'RSelenium' or
6
+#' 'seleniumPipes' R packages but with a Java-free footprint. The (twisted) 'QT' reactor is
7
+#' used to make the server fully asynchronous allowing to take advantage of 'webkit'
8
+#' concurrency via QT main loop. Some of Splash features include the ability to process
9
+#' multiple webpages in parallel; retrieving HTML results and/or take screenshots;
10
+#' disabling images or use Adblock Plus rules to make rendering faster; executing custom
11
+#' JavaScript in page context; getting detailed rendering info in HAR format.
12
+#'
13
+#' @name splashr
14
+#' @docType package
15
+#' @author Bob Rudis (bob@@rud.is)
16
+#' @import purrr httr magick
17
+#' @importFrom xml2 read_html
18
+#' @importFrom jsonlite fromJSON
19
+NULL
20
+
21
+#' splashr exported operators
22
+#'
23
+#' The following functions are imported and then re-exported
24
+#' from the splashr package to enable use of the magrittr
25
+#' pipe operator with no additional library calls
26
+#'
27
+#' @name splashr-exports
28
+NULL
29
+
30
+#' @name %>%
31
+#' @export
32
+#' @rdname splashr-exports
33
+NULL

+ 64
- 0
R/splashr.r View File

@@ -0,0 +1,64 @@
1
+splash_url <- function(splash_obj) { sprintf("http://%s:%s", splash_obj$host, splash_obj$port) }
2
+
3
+#' Configure parameters for connecting to a Splash server
4
+#'
5
+#' @param host host or IP address
6
+#' @param port port the server is running on (default is 8050)
7
+#' @export
8
+splash <- function(host, port=8050L) {
9
+  list(host=host, port=port)
10
+}
11
+
12
+#' Test if a Splash server is up
13
+#'
14
+#' @param splash_obj A splash connection object
15
+#' @export
16
+splash_active <- function(splash_obj) {
17
+
18
+  httr::GET(splash_url(splash_obj), path="_ping") %>%
19
+    httr::stop_for_status() %>%
20
+    httr::content(as="text", encoding="UTF-8") %>%
21
+    jsonlite::fromJSON() -> out
22
+
23
+  out$url <- splash_url(splash_obj)
24
+
25
+  class(out) <- c("splash_status", class(out))
26
+
27
+  out
28
+
29
+}
30
+
31
+#' @rdname splash_active
32
+#' @keywords internal
33
+#' @export
34
+print.splash_status <- function(x, ...) {
35
+  cat(sprintf("Status of splash instance on [%s]: %s. Max RSS: %s\n", x$url, x$status, x$maxrss))
36
+  invisible(x)
37
+}
38
+
39
+#' Retrieve debug-level info for a Splash server
40
+#'
41
+#' @param splash_obj A splash connection object
42
+#' @export
43
+splash_debug <- function(splash_obj) {
44
+
45
+  httr::GET(splash_url(splash_obj), path="_debug") %>%
46
+    httr::stop_for_status() %>%
47
+    httr::content(as="text", encoding="UTF-8") %>%
48
+    jsonlite::fromJSON() -> out
49
+
50
+  out$url <- splash_url(splash_obj)
51
+
52
+  class(out) <- c("splash_debug", class(out))
53
+
54
+  out
55
+
56
+}
57
+
58
+#' @rdname splash_debug
59
+#' @keywords internal
60
+#' @export
61
+print.splash_debug <- function(x, ...) {
62
+  print(str(x))
63
+  invisible(x)
64
+}

+ 107
- 0
README.Rmd View File

@@ -0,0 +1,107 @@
1
+---
2
+output: rmarkdown::github_document
3
+---
4
+
5
+`splashr` : Tools to Work with the 'Splash' JavaScript Rendering Service
6
+
7
+**Ridiculously basic functionality working at the moment. More coming soon**
8
+
9
+TL;DR: This package works with Splash rendering servers which are really just a REST API & `lua` scripting interface to a QT browser. It's an alternative to the Selenium ecosystem and does not do everything Selenium can, but if you're just trying to get a page back that needs javascript rendering, this is a nice alternative. 
10
+
11
+You can also get it running with two commands:
12
+
13
+    sudo docker pull scrapinghub/splash
14
+    sudo docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 scrapinghub/splash
15
+    
16
+(Do whatever you Windows ppl do with Docker on your systems to make ^^ work.)
17
+
18
+All you need for this package to work is a running Splash instance. You provide the host/port for it and it's scrape-tastic from there.
19
+
20
+### About Splash
21
+
22
+>'Splash' <https://github.com/scrapinghub/splash> is a javascript rendering service. It’s a lightweight web browser with an 'HTTP' API, implemented in Python using 'Twisted' and 'QT' and provides some of the core functionality of the 'RSelenium' or 'seleniumPipes' R packages but with a Java-free footprint. The (twisted) 'QT' reactor is used to make the server fully asynchronous allowing to take advantage of 'webkit' concurrency via QT main loop. Some of Splash features include the ability to process multiple webpages in parallel; retrieving HTML results and/or take screenshots; disabling images or use Adblock Plus rules to make rendering faster; executing custom JavaScript in page context; getting detailed rendering info in HAR format.
23
+    
24
+The following functions are implemented:
25
+
26
+- `render_html`:	Return the HTML of the javascript-rendered page.
27
+- `render_jpeg`:	Return an image (in JPEG format) of the javascript-rendered page.
28
+- `render_png`:	Return an image (in PNG format) of the javascript-rendered page.
29
+- `splash`:	Configure parameters for connecting to a Splash server
30
+- `splashr`:	Tools to Work with the 'Splash' JavaScript Rendering Service
31
+
32
+### Installation
33
+
34
+```{r eval=FALSE}
35
+devtools::install_github("hrbrmstr/splashr")
36
+```
37
+
38
+```{r message=FALSE, warning=FALSE, error=FALSE}
39
+options(width=120)
40
+```
41
+
42
+### Usage
43
+
44
+```{r message=FALSE, warning=FALSE, error=FALSE}
45
+library(splashr)
46
+library(magick)
47
+library(rvest)
48
+
49
+# current version
50
+packageVersion("splashr")
51
+
52
+splash("splash", 8050L) %>%
53
+  splash_active()
54
+
55
+splash("splash", 8050L) %>%
56
+  splash_debug()
57
+```
58
+
59
+Notice the difference between a rendered HTML scrape and a non-rendered one:
60
+
61
+```{r}
62
+splash("splash", 8050L) %>%
63
+  render_html("http://marvel.com/universe/Captain_America_(Steve_Rogers)")
64
+
65
+read_html("http://marvel.com/universe/Captain_America_(Steve_Rogers)")
66
+```
67
+
68
+Web page snapshots are easy-peasy too:
69
+
70
+```{r eval=FALSE}
71
+splash("splash", 8050L) %>%
72
+  render_png("http://marvel.com/universe/Captain_America_(Steve_Rogers)")
73
+```
74
+
75
+```{r eval=TRUE, include=FALSE}
76
+splash("splash", 8050L) %>%
77
+  render_png("http://marvel.com/universe/Captain_America_(Steve_Rogers)") %>% 
78
+  image_write("img/cap.png")
79
+```
80
+
81
+![](img/cap.png)
82
+
83
+```{r eval=FALSE}
84
+splash("splash", 8050L) %>%
85
+  render_jpeg("http://marvel.com/universe/Captain_America_(Steve_Rogers)") 
86
+```
87
+
88
+```{r eval=TRUE, include=FALSE}
89
+splash("splash", 8050L) %>%
90
+  render_jpeg("http://marvel.com/universe/Captain_America_(Steve_Rogers)") %>% 
91
+  image_write("img/cap.jpg")
92
+```
93
+
94
+![](img/cap.jpg)
95
+
96
+### Test Results
97
+
98
+```{r message=FALSE, warning=FALSE, error=FALSE}
99
+library(splashr)
100
+library(testthat)
101
+
102
+date()
103
+
104
+test_dir("tests/")
105
+```
106
+
107
+```{r eval = FALSE, include = FALSE}

+ 134
- 0
README.md View File

@@ -0,0 +1,134 @@
1
+
2
+`splashr` : Tools to Work with the 'Splash' JavaScript Rendering Service
3
+
4
+**Ridiculously basic functionality working at the moment. More coming soon**
5
+
6
+TL;DR: This package works with Splash rendering servers which are really just a REST API & `lua` scripting interface to a QT browser. It's an alternative to the Selenium ecosystem and does not do everything Selenium can, but if you're just trying to get a page back that needs javascript rendering, this is a nice alternative.
7
+
8
+You can also get it running with two commands:
9
+
10
+    sudo docker pull scrapinghub/splash
11
+    sudo docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 scrapinghub/splash
12
+
13
+(Do whatever you Windows ppl do with Docker on your systems to make ^^ work.)
14
+
15
+All you need for this package to work is a running Splash instance. You provide the host/port for it and it's scrape-tastic from there.
16
+
17
+### About Splash
18
+
19
+> 'Splash' <https://github.com/scrapinghub/splash> is a javascript rendering service. It’s a lightweight web browser with an 'HTTP' API, implemented in Python using 'Twisted' and 'QT' and provides some of the core functionality of the 'RSelenium' or 'seleniumPipes' R packages but with a Java-free footprint. The (twisted) 'QT' reactor is used to make the server fully asynchronous allowing to take advantage of 'webkit' concurrency via QT main loop. Some of Splash features include the ability to process multiple webpages in parallel; retrieving HTML results and/or take screenshots; disabling images or use Adblock Plus rules to make rendering faster; executing custom JavaScript in page context; getting detailed rendering info in HAR format.
20
+
21
+The following functions are implemented:
22
+
23
+-   `render_html`: Return the HTML of the javascript-rendered page.
24
+-   `render_jpeg`: Return an image (in JPEG format) of the javascript-rendered page.
25
+-   `render_png`: Return an image (in PNG format) of the javascript-rendered page.
26
+-   `splash`: Configure parameters for connecting to a Splash server
27
+-   `splashr`: Tools to Work with the 'Splash' JavaScript Rendering Service
28
+
29
+### Installation
30
+
31
+``` r
32
+devtools::install_github("hrbrmstr/splashr")
33
+```
34
+
35
+``` r
36
+options(width=120)
37
+```
38
+
39
+### Usage
40
+
41
+``` r
42
+library(splashr)
43
+library(magick)
44
+library(rvest)
45
+
46
+# current version
47
+packageVersion("splashr")
48
+```
49
+
50
+    ## [1] '0.1.0'
51
+
52
+``` r
53
+splash("splash", 8050L) %>%
54
+  splash_active()
55
+```
56
+
57
+    ## Status of splash instance on [http://splash:8050]: ok. Max RSS: 349298688
58
+
59
+``` r
60
+splash("splash", 8050L) %>%
61
+  splash_debug()
62
+```
63
+
64
+    ## List of 7
65
+    ##  $ active  : list()
66
+    ##  $ argcache: int 0
67
+    ##  $ fds     : int 18
68
+    ##  $ leaks   :List of 4
69
+    ##   ..$ Deferred  : int 50
70
+    ##   ..$ LuaRuntime: int 1
71
+    ##   ..$ QTimer    : int 1
72
+    ##   ..$ Request   : int 1
73
+    ##  $ maxrss  : int 341112
74
+    ##  $ qsize   : int 0
75
+    ##  $ url     : chr "http://splash:8050"
76
+    ##  - attr(*, "class")= chr [1:2] "splash_debug" "list"
77
+    ## NULL
78
+
79
+Notice the difference between a rendered HTML scrape and a non-rendered one:
80
+
81
+``` r
82
+splash("splash", 8050L) %>%
83
+  render_html("http://marvel.com/universe/Captain_America_(Steve_Rogers)")
84
+```
85
+
86
+    ## {xml_document}
87
+    ## <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
88
+    ## [1] <head>\n<script src="http://widget-cdn.rpxnow.com/manifest/login?version=1.114.1_widgets_244" type="text/javascri ...
89
+    ## [2] <body id="index-index" class="index-index" onload="findLinks('myLink');">\n\n\t<div id="page_frame" style="overfl ...
90
+
91
+``` r
92
+read_html("http://marvel.com/universe/Captain_America_(Steve_Rogers)")
93
+```
94
+
95
+    ## {xml_document}
96
+    ## <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
97
+    ## [1] <head>\n<meta http-equiv="X-UA-Compatible" content="IE=Edge">\n<link href="https://plus.google.com/10852333737344 ...
98
+    ## [2] <body id="index-index" class="index-index" onload="findLinks('myLink');">\n\n\t<div id="page_frame" style="overfl ...
99
+
100
+Web page snapshots are easy-peasy too:
101
+
102
+``` r
103
+splash("splash", 8050L) %>%
104
+  render_png("http://marvel.com/universe/Captain_America_(Steve_Rogers)")
105
+```
106
+
107
+![](img/cap.png)
108
+
109
+``` r
110
+splash("splash", 8050L) %>%
111
+  render_jpeg("http://marvel.com/universe/Captain_America_(Steve_Rogers)") 
112
+```
113
+
114
+![](img/cap.jpg)
115
+
116
+### Test Results
117
+
118
+``` r
119
+library(splashr)
120
+library(testthat)
121
+
122
+date()
123
+```
124
+
125
+    ## [1] "Fri Feb  3 14:58:40 2017"
126
+
127
+``` r
128
+test_dir("tests/")
129
+```
130
+
131
+    ## testthat results ========================================================================================================
132
+    ## OK: 0 SKIPPED: 0 FAILED: 0
133
+    ## 
134
+    ## DONE ===================================================================================================================

BIN
img/cap.jpg View File


BIN
img/cap.png View File


+ 56
- 0
man/render_html.Rd View File

@@ -0,0 +1,56 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/render-html.r
3
+\name{render_html}
4
+\alias{render_html}
5
+\title{Return the HTML of the javascript-rendered page.}
6
+\usage{
7
+render_html(splash_obj, url, base_url, timeout = 30,
8
+  resource_timeout = NULL, wait = 0, proxy, js, js_src, filters,
9
+  allowed_domains = "", allowed_content_types = "",
10
+  forbidden_content_types = "", viewport = "1024x768", images, headers,
11
+  body, http_method, save_args, load_args)
12
+}
13
+\arguments{
14
+\item{splash_obj}{Object created by a call to \link{splash}}
15
+
16
+\item{url}{The URL to render (required)}
17
+
18
+\item{base_url}{TBD The base url to render the page with.}
19
+
20
+\item{timeout}{TBD A timeout (in seconds) for the render (defaults to 30).}
21
+
22
+\item{resource_timeout}{A timeout (in seconds) for individual network requests.}
23
+
24
+\item{wait}{Time (in seconds) to wait for updates after page is loaded (defaults to 0).}
25
+
26
+\item{proxy}{TBD Proxy profile name or proxy URL.}
27
+
28
+\item{js}{TBD Javascript profile name.}
29
+
30
+\item{js_src}{TBD JavaScript code to be executed in page context.}
31
+
32
+\item{filters}{TBD Comma-separated list of request filter names.}
33
+
34
+\item{allowed_domains}{TBD Comma-separated list of allowed domain names. If present, Splash won’t load anything neither from domains not in this list nor from subdomains of domains not in this list.}
35
+
36
+\item{allowed_content_types}{TBD Comma-separated list of allowed content types. If present, Splash will abort any request if the response’s content type doesn’t match any of the content types in this list. Wildcards are supported.}
37
+
38
+\item{forbidden_content_types}{TBD Comma-separated list of forbidden content types. If present, Splash will abort any request if the response’s content type matches any of the content types in this list. Wildcards are supported.}
39
+
40
+\item{viewport}{View width and height (in pixels) of the browser viewport to render the web page. Format is “<width>x<height>”, e.g. 800x600. Default value is 1024x768.}
41
+
42
+\item{images}{TBD Whether to download images.}
43
+
44
+\item{headers}{TBD HTTP headers to set for the first outgoing request.}
45
+
46
+\item{body}{TBD Body of HTTP POST request to be sent if method is POST.}
47
+
48
+\item{http_method}{TBD HTTP method of outgoing Splash request.}
49
+
50
+\item{save_args}{TBD A list of argument names to put in cache.}
51
+
52
+\item{load_args}{TBD Parameter values to load from cache}
53
+}
54
+\description{
55
+Similar to \code{rvest::read_html}.
56
+}

+ 65
- 0
man/render_jpeg.Rd View File

@@ -0,0 +1,65 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/render-jpg.r
3
+\name{render_jpeg}
4
+\alias{render_jpeg}
5
+\title{Return an image (in JPEG format) of the javascript-rendered page.}
6
+\usage{
7
+render_jpeg(splash_obj, url, base_url = NULL, quality = 75, width = 1024,
8
+  height = 768, timeout = 30, resource_timeout = NULL, wait = 0,
9
+  render_all = FALSE, proxy, js, js_src, filters, allowed_domains = "",
10
+  allowed_content_types = "", forbidden_content_types = "",
11
+  viewport = "1024x768", images, headers, body, http_method, save_args,
12
+  load_args)
13
+}
14
+\arguments{
15
+\item{splash_obj}{Object created by a call to \link{splash}}
16
+
17
+\item{url}{The URL to render (required)}
18
+
19
+\item{base_url}{TBD The base url to render the page with.}
20
+
21
+\item{quality}{JPEG quality parameter in range from 0 to 100. Default is quality=75.}
22
+
23
+\item{width}{Resize the rendered image to the given width/height (in pixels) keeping the aspect ratio.}
24
+
25
+\item{height}{Resize the rendered image to the given width/height (in pixels) keeping the aspect ratio.}
26
+
27
+\item{timeout}{TBD A timeout (in seconds) for the render (defaults to 30).}
28
+
29
+\item{resource_timeout}{A timeout (in seconds) for individual network requests.}
30
+
31
+\item{wait}{Time (in seconds) to wait for updates after page is loaded (defaults to 0).}
32
+
33
+\item{render_all}{If \code{TRUE} extend the viewport to include the whole webpage (possibly very tall) before rendering. Default is \code{FALSE}}
34
+
35
+\item{proxy}{TBD Proxy profile name or proxy URL.}
36
+
37
+\item{js}{TBD Javascript profile name.}
38
+
39
+\item{js_src}{TBD JavaScript code to be executed in page context.}
40
+
41
+\item{filters}{TBD Comma-separated list of request filter names.}
42
+
43
+\item{allowed_domains}{TBD Comma-separated list of allowed domain names. If present, Splash won’t load anything neither from domains not in this list nor from subdomains of domains not in this list.}
44
+
45
+\item{allowed_content_types}{TBD Comma-separated list of allowed content types. If present, Splash will abort any request if the response’s content type doesn’t match any of the content types in this list. Wildcards are supported.}
46
+
47
+\item{forbidden_content_types}{TBD Comma-separated list of forbidden content types. If present, Splash will abort any request if the response’s content type matches any of the content types in this list. Wildcards are supported.}
48
+
49
+\item{viewport}{View width and height (in pixels) of the browser viewport to render the web page. Format is “<width>x<height>”, e.g. 800x600. Default value is 1024x768.}
50
+
51
+\item{images}{TBD Whether to download images.}
52
+
53
+\item{headers}{TBD HTTP headers to set for the first outgoing request.}
54
+
55
+\item{body}{TBD Body of HTTP POST request to be sent if method is POST.}
56
+
57
+\item{http_method}{TBD HTTP method of outgoing Splash request.}
58
+
59
+\item{save_args}{TBD A list of argument names to put in cache.}
60
+
61
+\item{load_args}{TBD Parameter values to load from cache}
62
+}
63
+\description{
64
+Return an image (in JPEG format) of the javascript-rendered page.
65
+}

+ 61
- 0
man/render_png.Rd View File

@@ -0,0 +1,61 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/render-png.r
3
+\name{render_png}
4
+\alias{render_png}
5
+\title{Return an image (in PNG format) of the javascript-rendered page.}
6
+\usage{
7
+render_png(splash_obj, url, base_url = NULL, width = 1024, height = 768,
8
+  render_all = FALSE, timeout = 30, resource_timeout = NULL, wait = 0,
9
+  proxy, js, js_src, filters, allowed_domains = "",
10
+  allowed_content_types = "", forbidden_content_types = "",
11
+  viewport = "1024x768", images, headers, body, http_method, save_args,
12
+  load_args)
13
+}
14
+\arguments{
15
+\item{splash_obj}{Object created by a call to \link{splash}}
16
+
17
+\item{url}{The URL to render (required)}
18
+
19
+\item{base_url}{TBD The base url to render the page with.}
20
+
21
+\item{width, height}{Resize the rendered image to the given width/height (in pixels) keeping the aspect ratio.}
22
+
23
+\item{render_all}{If \code{TRUE} extend the viewport to include the whole webpage (possibly very tall) before rendering. Default is \code{FALSE}}
24
+
25
+\item{timeout}{TBD A timeout (in seconds) for the render (defaults to 30).}
26
+
27
+\item{resource_timeout}{A timeout (in seconds) for individual network requests.}
28
+
29
+\item{wait}{Time (in seconds) to wait for updates after page is loaded (defaults to 0).}
30
+
31
+\item{proxy}{TBD Proxy profile name or proxy URL.}
32
+
33
+\item{js}{TBD Javascript profile name.}
34
+
35
+\item{js_src}{TBD JavaScript code to be executed in page context.}
36
+
37
+\item{filters}{TBD Comma-separated list of request filter names.}
38
+
39
+\item{allowed_domains}{TBD Comma-separated list of allowed domain names. If present, Splash won’t load anything neither from domains not in this list nor from subdomains of domains not in this list.}
40
+
41
+\item{allowed_content_types}{TBD Comma-separated list of allowed content types. If present, Splash will abort any request if the response’s content type doesn’t match any of the content types in this list. Wildcards are supported.}
42
+
43
+\item{forbidden_content_types}{TBD Comma-separated list of forbidden content types. If present, Splash will abort any request if the response’s content type matches any of the content types in this list. Wildcards are supported.}
44
+
45
+\item{viewport}{View width and height (in pixels) of the browser viewport to render the web page. Format is “<width>x<height>”, e.g. 800x600. Default value is 1024x768.}
46
+
47
+\item{images}{TBD Whether to download images.}
48
+
49
+\item{headers}{TBD HTTP headers to set for the first outgoing request.}
50
+
51
+\item{body}{TBD Body of HTTP POST request to be sent if method is POST.}
52
+
53
+\item{http_method}{TBD HTTP method of outgoing Splash request.}
54
+
55
+\item{save_args}{TBD A list of argument names to put in cache.}
56
+
57
+\item{load_args}{TBD Parameter values to load from cache}
58
+}
59
+\description{
60
+Return an image (in PNG format) of the javascript-rendered page.
61
+}

+ 16
- 0
man/splash.Rd View File

@@ -0,0 +1,16 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/splashr.r
3
+\name{splash}
4
+\alias{splash}
5
+\title{Configure parameters for connecting to a Splash server}
6
+\usage{
7
+splash(host, port = 8050L)
8
+}
9
+\arguments{
10
+\item{host}{host or IP address}
11
+
12
+\item{port}{port the server is running on (default is 8050)}
13
+}
14
+\description{
15
+Configure parameters for connecting to a Splash server
16
+}

+ 18
- 0
man/splash_active.Rd View File

@@ -0,0 +1,18 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/splashr.r
3
+\name{splash_active}
4
+\alias{splash_active}
5
+\alias{print.splash_status}
6
+\title{Test if a Splash server is up}
7
+\usage{
8
+splash_active(splash_obj)
9
+
10
+\method{print}{splash_status}(x, ...)
11
+}
12
+\arguments{
13
+\item{splash_obj}{A splash connection object}
14
+}
15
+\description{
16
+Test if a Splash server is up
17
+}
18
+\keyword{internal}

+ 18
- 0
man/splash_debug.Rd View File

@@ -0,0 +1,18 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/splashr.r
3
+\name{splash_debug}
4
+\alias{splash_debug}
5
+\alias{print.splash_debug}
6
+\title{Retrieve debug-level info for a Splash server}
7
+\usage{
8
+splash_debug(splash_obj)
9
+
10
+\method{print}{splash_debug}(x, ...)
11
+}
12
+\arguments{
13
+\item{splash_obj}{A splash connection object}
14
+}
15
+\description{
16
+Retrieve debug-level info for a Splash server
17
+}
18
+\keyword{internal}

+ 11
- 0
man/splashr-exports.Rd View File

@@ -0,0 +1,11 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/splashr-package.R
3
+\name{splashr-exports}
4
+\alias{splashr-exports}
5
+\alias{\%>\%}
6
+\title{splashr exported operators}
7
+\description{
8
+The following functions are imported and then re-exported
9
+from the splashr package to enable use of the magrittr
10
+pipe operator with no additional library calls
11
+}

+ 21
- 0
man/splashr.Rd View File

@@ -0,0 +1,21 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/splashr-package.R
3
+\docType{package}
4
+\name{splashr}
5
+\alias{splashr}
6
+\alias{splashr-package}
7
+\title{Tools to Work with the 'Splash' JavaScript Rendering Service}
8
+\description{
9
+'Splash' <https://github.com/scrapinghub/splash> is a javascript rendering service.
10
+It’s a lightweight web browser with an 'HTTP' API, implemented in Python using
11
+'Twisted' and 'QT' and provides some of the core functionality of the 'RSelenium' or
12
+'seleniumPipes' R packages but with a Java-free footprint. The (twisted) 'QT' reactor is
13
+used to make the server fully asynchronous allowing to take advantage of 'webkit'
14
+concurrency via QT main loop. Some of Splash features include the ability to process
15
+multiple webpages in parallel; retrieving HTML results and/or take screenshots;
16
+disabling images or use Adblock Plus rules to make rendering faster; executing custom
17
+JavaScript in page context; getting detailed rendering info in HAR format.
18
+}
19
+\author{
20
+Bob Rudis (bob@rud.is)
21
+}

+ 21
- 0
splashr.Rproj View File

@@ -0,0 +1,21 @@
1
+Version: 1.0
2
+
3
+RestoreWorkspace: Default
4
+SaveWorkspace: Default
5
+AlwaysSaveHistory: Default
6
+
7
+EnableCodeIndexing: Yes
8
+UseSpacesForTab: Yes
9
+NumSpacesForTab: 2
10
+Encoding: UTF-8
11
+
12
+RnwWeave: Sweave
13
+LaTeX: pdfLaTeX
14
+
15
+StripTrailingWhitespace: Yes
16
+
17
+BuildType: Package
18
+PackageUseDevtools: Yes
19
+PackageInstallArgs: --no-multiarch --with-keep.source
20
+PackageBuildArgs: --resave-data
21
+PackageRoxygenize: rd,collate,namespace

+ 2
- 0
tests/test-all.R View File

@@ -0,0 +1,2 @@
1
+library(testthat)
2
+test_check("splashr")

+ 6
- 0
tests/testthat/test-splash.R View File

@@ -0,0 +1,6 @@
1
+context("basic functionality")
2
+test_that("we can do something", {
3
+
4
+  #expect_that(some_function(), is_a("data.frame"))
5
+
6
+})

Loading…
Cancel
Save