Browse Source

working on vignettes

boB Rudis 2 years ago
parent
commit
fc77b059c2
No known key found for this signature in database

+ 1
- 0
.gitignore View File

@@ -5,3 +5,4 @@
5 5
 src/*.o
6 6
 src/*.so
7 7
 src/*.dll
8
+inst/doc

+ 4
- 1
DESCRIPTION View File

@@ -21,7 +21,9 @@ Suggests:
21 21
     tibble,
22 22
     jpeg,
23 23
     png,
24
-    covr
24
+    covr,
25
+    knitr,
26
+    rmarkdown
25 27
 Depends:
26 28
     R (>= 3.2.0)
27 29
 Imports:
@@ -42,3 +44,4 @@ Imports:
42 44
     HARtools,
43 45
     lubridate
44 46
 RoxygenNote: 6.0.1
47
+VignetteBuilder: knitr

+ 1
- 1
NAMESPACE View File

@@ -35,8 +35,8 @@ export(is_post)
35 35
 export(is_svg)
36 36
 export(is_xhr)
37 37
 export(is_xml)
38
+export(killall_splash)
38 39
 export(renderHARviewer)
39
-export(render_file)
40 40
 export(render_har)
41 41
 export(render_html)
42 42
 export(render_jpeg)

+ 8
- 2
R/as_req.r View File

@@ -1,9 +1,15 @@
1 1
 #' Create an httr verb request function from an HAR request
2 2
 #'
3
+#' This function is very useful if you used `splashr` to find XHR requests in a dynamic
4
+#' page and want to be able to make a call directly to that XHR resource. Once you
5
+#' identify the proper HAR entry, pass it to this function and a fully working function
6
+#' that makes an `httr::VERB()` request will be created and returned. The text of the function
7
+#' will also be put onto the clipboard if `add_clip` is `TRUE`.
8
+#'
3 9
 #' @md
4 10
 #' @param entry HAR entry
5
-#' @param quiet quiet
6
-#' @param add_clip add clip
11
+#' @param quiet quiet (no messages)
12
+#' @param add_clip add clip (paste the function text to the clipboard)
7 13
 #' @export
8 14
 as_httr_req <- function(entry, quiet=TRUE, add_clip=TRUE) {
9 15
 

+ 23
- 0
R/docker-splash.r View File

@@ -65,6 +65,29 @@ stop_splash <- function(splash_container) {
65 65
   splash_container$remove()
66 66
 }
67 67
 
68
+#' Prune all dead and running Splash Docker containers
69
+#'
70
+#' _This is a destructive function._ It will stop **any** Docker container that
71
+#' is based on an image matching "`scrapinghub/splash`". It's best used when you
72
+#' had a session forcefully interrupted and had been using the R helper functions
73
+#' to start/stop the Splash Docker container. You may want to consider using the
74
+#' Docker command-line interface to perform this work manually.
75
+#'
76
+#' @export
77
+killall_splash <- function() {
78
+
79
+  client <- docker::docker$from_env()
80
+  x <- client$containers$list(all = TRUE)
81
+
82
+  for (cntnr in x) {
83
+    if (grepl("scrapinghub/splash", cntnr$image$tags[1])) {
84
+      message(sprintf("Pruning: %s...", cntnr$id))
85
+      if (cntnr$status == "running") cntnr$stop()
86
+      cntnr$remove()
87
+    }
88
+  }
89
+}
90
+
68 91
 
69 92
 # @param add_tempdir This is `FALSE` initially since you could try to run
70 93
 #   the splash image on a remote system. It has to be a local one for this to work.

+ 5
- 2
R/helpers.r View File

@@ -5,10 +5,13 @@
5 5
 #' @family splash_har_helpers
6 6
 #' @return A `raw` vector of the content or `NULL`
7 7
 #' @export
8
-get_response_body <- function(har_resp_obj) {
8
+get_response_body <- function(har_resp_obj, type=c("raw", "text")) {
9
+  type <- match.arg(type, c("raw", "text"))
9 10
   resp <- har_resp_obj$response$content$text
10 11
   if (resp == "") return(NULL)
11
-  openssl::base64_decode(resp)
12
+  tmp <- openssl::base64_decode(resp)
13
+  if (type == "text") tmp <- readBin(tmp, "character")
14
+  tmp
12 15
 }
13 16
 
14 17
 #' Retrieve or test content type of a HAR request object

+ 1
- 1
R/render-html.r View File

@@ -1,6 +1,6 @@
1 1
 #' Return the HTML of the javascript-rendered page.
2 2
 #'
3
-#' Similar to `rvest::read_html`.
3
+#' Similar (i.e. a dynamic equivalent) to `rvest::read_html`.
4 4
 #'
5 5
 #' @md
6 6
 #' @param splash_obj Object created by a call to [splash()]

+ 3
- 0
R/render-png.r View File

@@ -8,6 +8,9 @@
8 8
 #' @references [Splash docs](http://splash.readthedocs.io/en/stable/index.html)
9 9
 #' @inheritParams render_html
10 10
 #' @export
11
+#' @examples \dontrun{
12
+#' render_png(url = "https://httpbin.org/")
13
+#' }
11 14
 render_png <- function(
12 15
   splash_obj = splash_local, url, base_url=NULL, width, height,
13 16
   timeout=30, resource_timeout, wait=0, render_all=TRUE,

+ 36
- 36
R/render_file.R View File

@@ -1,37 +1,37 @@
1
-#' Return the HTML or image (png) of the javascript-rendered page in a local file
2
-#'
3
-#' The suggested use-case for this is rendering a widget
4
-#'
5
-#' TODO Test if container is running
6
-#' TODO Enable passing in of an htmlwidget and use saveWidget
7
-#'
8
-#' @md
9
-#' @param splash_obj Object created by a call to [splash()]
10
-#' @param file_path Absolute path to a filename on the local host. **This only works with a locally running Splash instance started with [start_splash]().**
11
-#' @param wait seconds to wait
12
-#' @param output either `html` or `png` to get the page content or an image capture
13
-#' @param viewport View width and height (in pixels) of the browser viewport to render the web page. Format is "`<width>x<height>`". e.g. 800x600. Default value is 1024x768.
14
-#' @param ... other params to [render_html()] or [render_png()]
15
-#' @family splash_renderers
16
-#' @return An XML document or `magick` object
17
-#' @export
18
-render_file <- function(splash_obj = splash_local, file_path, output=c("html", "png"), wait=0, viewport="1024x768", ...) {
19
-
20
-  wait <- check_wait(wait)
21
-
22
-  output <- match.arg(output, c("html", "png"))
23
-
24
-  file.copy(file_path, .pkgenv$temp_dir)
25
-
26
-  fil <- basename(file_path)
27
-
28
-  URL <- sprintf("http://localhost:9999/%s", fil)
29
-
30
-  if (output == "html") {
31
-    render_html(splash_obj, URL, wait=wait, viewport=viewport, ...)
32
-  } else {
33
-    render_png(splash_obj, URL, wait=wait, viewport=viewport, ...)
34
-  }
35
-
36
-}
1
+# Return the HTML or image (png) of the javascript-rendered page in a local file
2
+#
3
+# The suggested use-case for this is rendering a widget
4
+#
5
+# TODO Test if container is running
6
+# TODO Enable passing in of an htmlwidget and use saveWidget
7
+#
8
+# @md
9
+# @param splash_obj Object created by a call to [splash()]
10
+# @param file_path Absolute path to a filename on the local host. **This only works with a locally running Splash instance started with [start_splash]().**
11
+# @param wait seconds to wait
12
+# @param output either `html` or `png` to get the page content or an image capture
13
+# @param viewport View width and height (in pixels) of the browser viewport to render the web page. Format is "`<width>x<height>`". e.g. 800x600. Default value is 1024x768.
14
+# @param ... other params to [render_html()] or [render_png()]
15
+# @family splash_renderers
16
+# @return An XML document or `magick` object
17
+# @export
18
+# render_file <- function(splash_obj = splash_local, file_path, output=c("html", "png"), wait=0, viewport="1024x768", ...) {
19
+#
20
+#   wait <- check_wait(wait)
21
+#
22
+#   output <- match.arg(output, c("html", "png"))
23
+#
24
+#   file.copy(file_path, .pkgenv$temp_dir)
25
+#
26
+#   fil <- basename(file_path)
27
+#
28
+#   URL <- sprintf("http://localhost:9999/%s", fil)
29
+#
30
+#   if (output == "html") {
31
+#     render_html(splash_obj, URL, wait=wait, viewport=viewport, ...)
32
+#   } else {
33
+#     render_png(splash_obj, URL, wait=wait, viewport=viewport, ...)
34
+#   }
35
+#
36
+# }
37 37
 

+ 3
- 1
R/splashr-package.R View File

@@ -3,7 +3,9 @@
3 3
 #' 'Splash' <https://github.com/scrapinghub/splash> is a 'JavaScript' rendering service.
4 4
 #'  It’s a lightweight web browser with an 'HTTP' API, implemented in 'Python' using 'Twisted'
5 5
 #'  and 'QT' and provides some of the core functionality of the 'RSelenium' or 'seleniumPipes'
6
-#'  R pacakges in a lightweight footprint. Some of 'Splash' features include the ability to process
6
+#'  R packages in a lightweight footprint.
7
+#'
8
+#'  Some of 'Splash' features include the ability to process
7 9
 #'  multiple webpages in parallel; retrieving 'HTML' results and/or take screenshots; disabling
8 10
 #'  images or use 'Adblock Plus' rules to make rendering faster; executing custom 'JavaScript' in
9 11
 #'  page context; getting detailed rendering info in 'HAR' format.

+ 23
- 0
R/splashr.r View File

@@ -5,6 +5,9 @@ splash_url <- function(splash_obj) { sprintf("http://%s:%s", splash_obj$host, sp
5 5
 #' @param host host or IP address
6 6
 #' @param port port the server is running on (default is 8050)
7 7
 #' @export
8
+#' @examples \dontrun{
9
+#' sp <- splash()
10
+#' }
8 11
 splash <- function(host, port=8050L) {
9 12
   list(host=host, port=port)
10 13
 }
@@ -22,6 +25,10 @@ s_GET <- purrr::safely(GET)
22 25
 #' @family splash_info_functions
23 26
 #' @return `TRUE` if Splash server is running, otherwise `FALSE`
24 27
 #' @export
28
+#' @examples \dontrun{
29
+#' sp <- splash()
30
+#' splash_active(sp)
31
+#' }
25 32
 splash_active <- function(splash_obj = splash_local) {
26 33
 
27 34
   res <- s_GET(splash_url(splash_obj), path="_ping")
@@ -48,6 +55,10 @@ splash_active <- function(splash_obj = splash_local) {
48 55
 #' @param splash_obj A splash connection object
49 56
 #' @family splash_info_functions
50 57
 #' @export
58
+#' @examples \dontrun{
59
+#' sp <- splash()
60
+#' splash_version(sp)
61
+#' }
51 62
 splash_version <- function(splash_obj = splash_local) {
52 63
   execute_lua(splash_obj, '
53 64
 function main(splash)
@@ -62,6 +73,10 @@ end
62 73
 #' @param splash_obj A splash connection object
63 74
 #' @family splash_info_functions
64 75
 #' @export
76
+#' @examples \dontrun{
77
+#' sp <- splash()
78
+#' splash_history(sp)
79
+#' }
65 80
 splash_history <- function(splash_obj = splash_local) {
66 81
   execute_lua(splash_obj, '
67 82
 function main(splash)
@@ -77,6 +92,10 @@ end
77 92
 #' @param splash_obj A splash connection object
78 93
 #' @family splash_info_functions
79 94
 #' @export
95
+#' @examples \dontrun{
96
+#' sp <- splash()
97
+#' splash_perf_stats(sp)
98
+#' }
80 99
 splash_perf_stats <- function(splash_obj = splash_local) {
81 100
   execute_lua(splash_obj, '
82 101
 function main(splash)
@@ -91,6 +110,10 @@ end
91 110
 #' @param splash_obj A splash connection object
92 111
 #' @family splash_info_functions
93 112
 #' @export
113
+#' @examples \dontrun{
114
+#' sp <- splash()
115
+#' splash_debug(sp)
116
+#' }
94 117
 splash_debug <- function(splash_obj = splash_local) {
95 118
 
96 119
   httr::GET(splash_url(splash_obj), path="_debug") %>%

+ 17
- 0
R/utils.r View File

@@ -0,0 +1,17 @@
1
+#' Convert a Base64 encoded string into an R object
2
+#'
3
+#' A simple wrapper around calls to `openssl::base64_decode()` and
4
+#' `jsonlite::fromJSON()`.
5
+#'
6
+#' @md
7
+#' @param x a string
8
+#' @param flatten flatten JSON structures upon conversion?
9
+#' @param ... passed on to `jsonlite::fromJSON()`
10
+#' @export
11
+json_fromb64 <- function(x, flatten=TRUE, ...) {
12
+
13
+  tmp <- openssl::base64_decode(x)
14
+  tmp <- readBin(tmp, "character")
15
+  jsonlite::fromJSON(tmp, flatten=flatten, ...)
16
+
17
+}

+ 7
- 3
man/as_httr_req.Rd View File

@@ -9,10 +9,14 @@ as_httr_req(entry, quiet = TRUE, add_clip = TRUE)
9 9
 \arguments{
10 10
 \item{entry}{HAR entry}
11 11
 
12
-\item{quiet}{quiet}
12
+\item{quiet}{quiet (no messages)}
13 13
 
14
-\item{add_clip}{add clip}
14
+\item{add_clip}{add clip (paste the function text to the clipboard)}
15 15
 }
16 16
 \description{
17
-Create an httr verb request function from an HAR request
17
+This function is very useful if you used \code{splashr} to find XHR requests in a dynamic
18
+page and want to be able to make a call directly to that XHR resource. Once you
19
+identify the proper HAR entry, pass it to this function and a fully working function
20
+that makes an \code{httr::VERB()} request will be created and returned. The text of the function
21
+will also be put onto the clipboard if \code{add_clip} is \code{TRUE}.
18 22
 }

+ 3
- 4
man/execute_lua.Rd View File

@@ -54,8 +54,7 @@ member_scores
54 54
 }
55 55
 }
56 56
 \seealso{
57
-Other splash_renderers: \code{\link{render_file}},
58
-  \code{\link{render_har}}, \code{\link{render_html}},
59
-  \code{\link{render_jpeg}}, \code{\link{render_json}},
60
-  \code{\link{render_png}}
57
+Other splash_renderers: \code{\link{render_har}},
58
+  \code{\link{render_html}}, \code{\link{render_jpeg}},
59
+  \code{\link{render_json}}, \code{\link{render_png}}
61 60
 }

+ 15
- 0
man/killall_splash.Rd View File

@@ -0,0 +1,15 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/docker-splash.r
3
+\name{killall_splash}
4
+\alias{killall_splash}
5
+\title{Prune all dead and running Splash Docker containers}
6
+\usage{
7
+killall_splash()
8
+}
9
+\description{
10
+_This is a destructive function._ It will stop **any** Docker container that
11
+is based on an image matching "`scrapinghub/splash`". It's best used when you
12
+had a session forcefully interrupted and had been using the R helper functions
13
+to start/stop the Splash Docker container. You may want to consider using the
14
+Docker command-line interface to perform this work manually.
15
+}

+ 0
- 38
man/render_file.Rd View File

@@ -1,38 +0,0 @@
1
-% Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/render_file.R
3
-\name{render_file}
4
-\alias{render_file}
5
-\title{Return the HTML or image (png) of the javascript-rendered page in a local file}
6
-\usage{
7
-render_file(splash_obj = splash_local, file_path, output = c("html", "png"),
8
-  wait = 0, viewport = "1024x768", ...)
9
-}
10
-\arguments{
11
-\item{splash_obj}{Object created by a call to \code{\link[=splash]{splash()}}}
12
-
13
-\item{file_path}{Absolute path to a filename on the local host. \strong{This only works with a locally running Splash instance started with \url{start_splash}.}}
14
-
15
-\item{output}{either \code{html} or \code{png} to get the page content or an image capture}
16
-
17
-\item{wait}{seconds to wait}
18
-
19
-\item{viewport}{View width and height (in pixels) of the browser viewport to render the web page. Format is "\code{<width>x<height>}". e.g. 800x600. Default value is 1024x768.}
20
-
21
-\item{...}{other params to \code{\link[=render_html]{render_html()}} or \code{\link[=render_png]{render_png()}}}
22
-}
23
-\value{
24
-An XML document or \code{magick} object
25
-}
26
-\description{
27
-The suggested use-case for this is rendering a widget
28
-}
29
-\details{
30
-TODO Test if container is running
31
-TODO Enable passing in of an htmlwidget and use saveWidget
32
-}
33
-\seealso{
34
-Other splash_renderers: \code{\link{execute_lua}},
35
-  \code{\link{render_har}}, \code{\link{render_html}},
36
-  \code{\link{render_jpeg}}, \code{\link{render_json}},
37
-  \code{\link{render_png}}
38
-}

+ 2
- 3
man/render_har.Rd View File

@@ -74,7 +74,6 @@ is incredibly detailed, full of information on every component loaded.
74 74
 }
75 75
 \seealso{
76 76
 Other splash_renderers: \code{\link{execute_lua}},
77
-  \code{\link{render_file}}, \code{\link{render_html}},
78
-  \code{\link{render_jpeg}}, \code{\link{render_json}},
79
-  \code{\link{render_png}}
77
+  \code{\link{render_html}}, \code{\link{render_jpeg}},
78
+  \code{\link{render_json}}, \code{\link{render_png}}
80 79
 }

+ 3
- 4
man/render_html.Rd View File

@@ -68,14 +68,13 @@ to be processed first by \code{xml2}. If you choose \code{raw_html=TRUE} you'll
68 68
 character vector.
69 69
 }
70 70
 \description{
71
-Similar to \code{rvest::read_html}.
71
+Similar (i.e. a dynamic equivalent) to \code{rvest::read_html}.
72 72
 }
73 73
 \references{
74 74
 \href{http://splash.readthedocs.io/en/stable/index.html}{Splash docs}
75 75
 }
76 76
 \seealso{
77 77
 Other splash_renderers: \code{\link{execute_lua}},
78
-  \code{\link{render_file}}, \code{\link{render_har}},
79
-  \code{\link{render_jpeg}}, \code{\link{render_json}},
80
-  \code{\link{render_png}}
78
+  \code{\link{render_har}}, \code{\link{render_jpeg}},
79
+  \code{\link{render_json}}, \code{\link{render_png}}
81 80
 }

+ 2
- 3
man/render_jpeg.Rd View File

@@ -79,7 +79,6 @@ Return a image (in JPEG format) of the javascript-rendered page.
79 79
 }
80 80
 \seealso{
81 81
 Other splash_renderers: \code{\link{execute_lua}},
82
-  \code{\link{render_file}}, \code{\link{render_har}},
83
-  \code{\link{render_html}}, \code{\link{render_json}},
84
-  \code{\link{render_png}}
82
+  \code{\link{render_har}}, \code{\link{render_html}},
83
+  \code{\link{render_json}}, \code{\link{render_png}}
85 84
 }

+ 2
- 3
man/render_json.Rd View File

@@ -111,7 +111,6 @@ overwhelmed with data. Use \code{\link[=str]{str()}} to inspect various portions
111 111
 }
112 112
 \seealso{
113 113
 Other splash_renderers: \code{\link{execute_lua}},
114
-  \code{\link{render_file}}, \code{\link{render_har}},
115
-  \code{\link{render_html}}, \code{\link{render_jpeg}},
116
-  \code{\link{render_png}}
114
+  \code{\link{render_har}}, \code{\link{render_html}},
115
+  \code{\link{render_jpeg}}, \code{\link{render_png}}
117 116
 }

+ 7
- 3
man/render_png.Rd View File

@@ -70,12 +70,16 @@ a \link{magick} image object
70 70
 \description{
71 71
 Return a image (in PNG format) of the javascript-rendered page.
72 72
 }
73
+\examples{
74
+\dontrun{
75
+render_png(url = "https://httpbin.org/")
76
+}
77
+}
73 78
 \references{
74 79
 \href{http://splash.readthedocs.io/en/stable/index.html}{Splash docs}
75 80
 }
76 81
 \seealso{
77 82
 Other splash_renderers: \code{\link{execute_lua}},
78
-  \code{\link{render_file}}, \code{\link{render_har}},
79
-  \code{\link{render_html}}, \code{\link{render_jpeg}},
80
-  \code{\link{render_json}}
83
+  \code{\link{render_har}}, \code{\link{render_html}},
84
+  \code{\link{render_jpeg}}, \code{\link{render_json}}
81 85
 }

+ 5
- 0
man/splash.Rd View File

@@ -19,4 +19,9 @@ splash_local
19 19
 \description{
20 20
 Configure parameters for connecting to a Splash server
21 21
 }
22
+\examples{
23
+\dontrun{
24
+sp <- splash()
25
+}
26
+}
22 27
 \keyword{datasets}

+ 6
- 0
man/splash_active.Rd View File

@@ -15,6 +15,12 @@ splash_active(splash_obj = splash_local)
15 15
 \description{
16 16
 Test if a Splash server is up
17 17
 }
18
+\examples{
19
+\dontrun{
20
+sp <- splash()
21
+splash_active(sp)
22
+}
23
+}
18 24
 \seealso{
19 25
 Other splash_info_functions: \code{\link{splash_debug}},
20 26
   \code{\link{splash_history}},

+ 6
- 0
man/splash_debug.Rd View File

@@ -15,6 +15,12 @@ splash_debug(splash_obj = splash_local)
15 15
 \description{
16 16
 Retrieve debug-level info for a Splash server
17 17
 }
18
+\examples{
19
+\dontrun{
20
+sp <- splash()
21
+splash_debug(sp)
22
+}
23
+}
18 24
 \seealso{
19 25
 Other splash_info_functions: \code{\link{splash_active}},
20 26
   \code{\link{splash_history}},

+ 6
- 0
man/splash_history.Rd View File

@@ -12,6 +12,12 @@ splash_history(splash_obj = splash_local)
12 12
 \description{
13 13
 Get information about requests/responses for the pages loaded
14 14
 }
15
+\examples{
16
+\dontrun{
17
+sp <- splash()
18
+splash_history(sp)
19
+}
20
+}
15 21
 \seealso{
16 22
 Other splash_info_functions: \code{\link{splash_active}},
17 23
   \code{\link{splash_debug}},

+ 6
- 0
man/splash_perf_stats.Rd View File

@@ -12,6 +12,12 @@ splash_perf_stats(splash_obj = splash_local)
12 12
 \description{
13 13
 Get Splash performance-related statistics
14 14
 }
15
+\examples{
16
+\dontrun{
17
+sp <- splash()
18
+splash_perf_stats(sp)
19
+}
20
+}
15 21
 \seealso{
16 22
 Other splash_info_functions: \code{\link{splash_active}},
17 23
   \code{\link{splash_debug}}, \code{\link{splash_history}},

+ 6
- 0
man/splash_version.Rd View File

@@ -12,6 +12,12 @@ splash_version(splash_obj = splash_local)
12 12
 \description{
13 13
 Get Splash version information
14 14
 }
15
+\examples{
16
+\dontrun{
17
+sp <- splash()
18
+splash_version(sp)
19
+}
20
+}
15 21
 \seealso{
16 22
 Other splash_info_functions: \code{\link{splash_active}},
17 23
   \code{\link{splash_debug}}, \code{\link{splash_history}},

BIN
vignettes/figures/splashr01.png View File


+ 304
- 0
vignettes/intro_to_splashr.Rmd View File

@@ -0,0 +1,304 @@
1
+---
2
+title: "Introduction to splashr"
3
+author: "Bob Rudis"
4
+date: "`r Sys.Date()`"
5
+output: 
6
+  rmarkdown::html_vignette:
7
+    toc: true
8
+vignette: >
9
+  %\VignetteIndexEntry{Introduction to splashr}
10
+  %\VignetteEngine{knitr::rmarkdown}
11
+  %\VignetteEncoding{UTF-8}
12
+---
13
+
14
+Capturing information/content from internet resources can be a tricky endeavour. Along with the many legal + ethical issues there are an increasing number of sites that render content dynamically, either through `XMLHttpRequests` (XHR) or on-page JavaScript (JS) rendering of
15
+in-page content. There are also many sites that make it difficult to fill-in form data programmatically.
16
+
17
+There are ways to capture these types of resources in R. One way is via the [`RSelenium`](https://CRAN.R-project.org/package=RSelenium) ecosystem of packages. Another is with packages such as [`webshot`](https://CRAN.R-project.org/package=webshot). One can also write custom [`phantomjs`](http://phantomjs.org/) scripts and post-process the HTML output.
18
+
19
+The `splashr` package provides tooling around another web-scraping ecosystem: [Splash](https://scrapinghub.com/splash). A Splash environment is fundamentally a headless web browser based on the QT WebKit library. Unlike the Selenium ecosystem, Splash is not based on the [WebDriver](https://www.w3.org/TR/webdriver/) protocol, but has a custom HTTP API that provides both similar and different idioms for accessing and manipulating web content.
20
+
21
+## Getting Started
22
+
23
+Before you can use `splashr` you will need access to a Splash environment. You can either:
24
+
25
+- [pay for instances](https://app.scrapinghub.com/account/signup/);
26
+- [get a Splash server running locally by hand](https://github.com/scrapinghub/splash), or
27
+- use Splash in a [Docker](https://www.docker.com/) container.
28
+
29
+The package and this document are going to steer you into using Docker containers. Docker is free for macOS, Windows and Linux systems, plus most major cloud computing providers have support for Docker containers. If you don't have Docker installed, then your first step should be to get Docker going and [verify your setup](https://docs.docker.com/get-started/).
30
+
31
+Once you have Docker working, you can follow the [Splash installation guidance](https://splash.readthedocs.io/en/stable/install.html) to manually obtain, start and stop Splash docker containers. _There must be a running, accessible Splash instance for `splashr` to work_.
32
+
33
+If you're comfortable getting a working Python environment set up on your system, you can also use the Splash Docker helper functions that come with this package:
34
+
35
+- `install_splash()` will perform the same operation as `docker pull ...`
36
+- `start_splash()` will perform the same operation as `docker run ...`, and
37
+- `stop_splash()` will stop and remove the container object returned by `start_splash()`
38
+
39
+Follow the vignettes in the [`docker`](https://CRAN.R-project.org/package=docker) package to get the `docker` package up and running.
40
+
41
+The remainder of this document assumes that you have a Splash instance up and running on your localhost.
42
+
43
+## Scraping Basics --- `render_` functions
44
+
45
+Splash (and, hence, `splashr`) has a feature-rich API that ranges from quick-and-easy to complex-detailed-and-powerful. We'll start with some easy basics. First make sure Splash is running:
46
+
47
+```
48
+library(splashr)
49
+
50
+splash_active()
51
+## Status of splash instance on [http://localhost:8050]: ok. Max RSS: 74.42578 Mb
52
+## 
53
+## [1] TRUE
54
+```
55
+
56
+The first action we'll perform may surprise you. We're going to take a screenshot of the <https://analytics.usa.gov/> site. Why that site? First, the Terms of Service allow for scraping. Second, it has a great deal of dynamic content. And, third, we can validate our scraping findings with a direct data download (which will be an exercise left to the reader).
57
+
58
+Enough words. Let's see what this site looks like!
59
+
60
+```
61
+library(magick)
62
+
63
+render_png(url = "https://analytics.usa.gov/", wait = 5)
64
+##   format width height colorspace filesize
65
+## 1    PNG  1024   2761       sRGB   531597
66
+```
67
+
68
+<img style="max-width:100%" width="100%" src="figures/splashr01.png"/>
69
+
70
+Let's decompose what we just did:
71
+
72
+1. We called the `render_png()` function. The job of this function is to --- by default --- take a "screenshot" of the fully rendered page content at a specified URL.
73
+1. We passed in the `url = ` parameter. The default first parameter is a `splashr` object created by the `splash()` function. However, since it's highly likely most folks will be running a Splash server locally with the default configuration, most `splashr` functions will use an inherent, "`splash_local`" object if you're willing to use named parameters for all other parameter values. 
74
+1. We passed in a `wait = ` parameter, asking the Splash server to wait for a few seconds to give the content time to render. This is an important consideration which we'll go into later in this document.
75
+1. `splashr` passed on our command to the running Splash instance and the Splash server sent back a PNG file which the `splashr` package read in with the help of the `magick` package. If you're operating in RStudio you'll see the above image in the viewer. Alternatively, you can do:
76
+
77
+```
78
+image_browse(render_png(url = "https://analytics.usa.gov/", wait = 5))
79
+```
80
+
81
+to see the image if you're in another R environment. NOTE: web page screenshots can be captured in PNG or JPEG format by choosing the appropriate `render_` function.
82
+
83
+Now that we've validated that we're getting the content we want, we can do something a bit more useful, like retrieve the HTML content of the page:
84
+
85
+```
86
+pg <- render_html(url = "https://analytics.usa.gov/")
87
+pg
88
+## {xml_document}
89
+## <html lang="en">
90
+## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\n<!--\n\n    Hi! Welcome to our source code.\n\n    This d ...
91
+## [2] <body>\n    <!-- Google Tag Manager (noscript) -->\n<noscript>&lt;iframe src="https://www.googletagmanager.com/ns.html?id=GTM-MQSGZS"\ ...
92
+```
93
+
94
+The `render_html()` function behaves a great deal like the `xml2::read_html()` function except that it's just retrieving the current web page [HTML DOM](https://www.w3schools.com/js/js_htmldom.asp). What do we mean by that? Well, unlike `httr::GET()` or `xml2::read_html()`, the Splash environment is a bona-fide browser environment, just like Chrome, Safari or Firefox. It's always running (until you shut down the Splash server). That means any active JS on the page can be modifying the content (like ticking a time counter or updating stock prices, etc). We didn't specify a `wait = ` delay this time, but it's generally a good idea to do that for very dynamic sites. This particular site seems to update the various tables and charts every 10 seconds to show "live" stats.
95
+
96
+We can work with that `pg` content just like we would with `rvest` / `xml2`. Let's look at the visitor total from the past 90 days:
97
+
98
+```
99
+library(rvest)
100
+
101
+html_text(html_nodes(pg, "span#total_visitors"))
102
+## [1] "2.37 billion"
103
+```
104
+
105
+If we tried to read that value with plain, ol' `read_html` here's what we'd get:
106
+
107
+```
108
+pg2 <- read_html("https://analytics.usa.gov/")
109
+html_text(html_nodes(pg2, "span#total_visitors"))
110
+## [1] "..."
111
+```
112
+
113
+Not exactly helpful.
114
+
115
+So, with just a small example, we've seen that it's pretty simple to pull dynamic content out of a web site with just a few more steps than `read_html()` requires.
116
+
117
+But, we can do even more with these `render_` functions.
118
+
119
+## Your Own Private 'Developer Tools'
120
+
121
+Anyone performing scraping operations likely knows about each browser's "developer tools" environment. If you're not familiar with them you can get a quick primer [on their secrets](http://devtoolsecrets.com/) before continuing with this vignette.
122
+
123
+The devtools inspector lets you see --- amongst other items -- network resources that were pulled down with the web page. So, while `read_html()` just gets the individual HTML file for a web site, its Splash devtools counterpart --- `render_har()` --- is pulling every image, JS file, CSS sheet, etc that can be rendered in QT WebKit. We can see what the USA.Gov Analytics site is making us load with it:
124
+
125
+```
126
+har <- render_har(url = "https://analytics.usa.gov/")
127
+har
128
+## --------HAR VERSION-------- 
129
+## HAR specification version: 1.2 
130
+## --------HAR CREATOR-------- 
131
+## Created by: Splash 
132
+## version: 3.0 
133
+## --------HAR BROWSER-------- 
134
+## Browser: QWebKit 
135
+## version: 602.1 
136
+## --------HAR PAGES-------- 
137
+## Page id: 1 , Page title: analytics.usa.gov | The US government's web traffic. 
138
+## --------HAR ENTRIES-------- 
139
+## Number of entries: 29 
140
+## REQUESTS: 
141
+## Page: 1 
142
+## Number of entries: 29 
143
+##   -  https://analytics.usa.gov/ 
144
+##   -  https://analytics.usa.gov/css/vendor/css/uswds.v0.9.1.css 
145
+##   -  https://analytics.usa.gov/css/public_analytics.css 
146
+##   -  https://analytics.usa.gov/js/vendor/d3.v3.min.js 
147
+##   -  https://analytics.usa.gov/js/vendor/q.min.js 
148
+##      ........ 
149
+##   -  https://analytics.usa.gov/data/live/top-downloads-yesterday.json 
150
+##   -  https://analytics.usa.gov/css/vendor/fonts/sourcesanspro-bold-webfont.woff2 
151
+##   -  https://analytics.usa.gov/css/vendor/fonts/sourcesanspro-regular-webfont.woff2 
152
+##   -  https://analytics.usa.gov/css/vendor/fonts/sourcesanspro-light-webfont.woff2 
153
+##   -  https://analytics.usa.gov/css/vendor/fonts/sourcesanspro-italic-webfont.woff2  
154
+```
155
+
156
+A "HAR" is an HTTP Archive and `splashr` works with the R [`HARtools`](https://CRAN.R-project.org/package=HARtools) package to provide access to the elements loaded with a Splash QT WebKit page request. We can see all of them if we perform a manual inspection:
157
+
158
+```
159
+for (e in har$log$entries) cat(e$request$url, "\n")
160
+## https://analytics.usa.gov/ 
161
+## https://analytics.usa.gov/css/vendor/css/uswds.v0.9.1.css 
162
+## https://analytics.usa.gov/css/public_analytics.css 
163
+## https://analytics.usa.gov/js/vendor/d3.v3.min.js 
164
+## https://analytics.usa.gov/js/vendor/q.min.js 
165
+## https://analytics.usa.gov/css/google-fonts.css 
166
+## https://analytics.usa.gov/js/vendor/uswds.v0.9.1.js 
167
+## https://analytics.usa.gov/js/index.js 
168
+## https://www.googletagmanager.com/gtm.js?id=GTM-MQSGZS 
169
+## https://www.google-analytics.com/analytics.js 
170
+## https://analytics.usa.gov/css/img/arrow-down.svg 
171
+## https://analytics.usa.gov/data/live/realtime.json 
172
+## https://analytics.usa.gov/data/live/today.json 
173
+## https://analytics.usa.gov/data/live/devices.json 
174
+## https://analytics.usa.gov/data/live/browsers.json 
175
+## https://analytics.usa.gov/data/live/ie.json 
176
+## https://analytics.usa.gov/data/live/os.json 
177
+## https://analytics.usa.gov/data/live/windows.json 
178
+## https://analytics.usa.gov/data/live/top-cities-realtime.json 
179
+## https://analytics.usa.gov/data/live/top-countries-realtime.json 
180
+## https://analytics.usa.gov/data/live/top-countries-realtime.json 
181
+## https://analytics.usa.gov/data/live/top-pages-realtime.json 
182
+## https://analytics.usa.gov/data/live/top-domains-7-days.json 
183
+## https://analytics.usa.gov/data/live/top-domains-30-days.json 
184
+## https://analytics.usa.gov/data/live/top-downloads-yesterday.json 
185
+## https://analytics.usa.gov/css/vendor/fonts/sourcesanspro-bold-webfont.woff2 
186
+## https://analytics.usa.gov/css/vendor/fonts/sourcesanspro-regular-webfont.woff2 
187
+## https://analytics.usa.gov/css/vendor/fonts/sourcesanspro-light-webfont.woff2 
188
+## https://analytics.usa.gov/css/vendor/fonts/sourcesanspro-italic-webfont.woff2 
189
+```
190
+
191
+With just a visual inspection, we can see there are JSON files being loaded at some point that likely contain some of the data we're after. The content for each of them can be available to us in the HAR object if we specify the `response_body = TRUE` parameter:
192
+
193
+```
194
+har <- render_har(url = "https://analytics.usa.gov/", wait = 5, response_body = TRUE)
195
+for (e in har$log$entries) {
196
+  cat(sprintf("%s => [%s] is %s bytes\n", 
197
+              e$request$url, e$response$content$mimeType, 
198
+              scales::comma(e$response$content$size)))
199
+}
200
+## https://analytics.usa.gov/ => [text/html] is 19,718 bytes
201
+## https://analytics.usa.gov/css/vendor/css/uswds.v0.9.1.css => [text/css] is 64,676 bytes
202
+## https://analytics.usa.gov/css/public_analytics.css => [text/css] is 13,932 bytes
203
+## https://analytics.usa.gov/js/vendor/d3.v3.min.js => [application/x-javascript] is 150,760 bytes
204
+## https://analytics.usa.gov/js/vendor/q.min.js => [application/x-javascript] is 41,625 bytes
205
+## https://analytics.usa.gov/css/google-fonts.css => [text/css] is 112,171 bytes
206
+## https://analytics.usa.gov/js/vendor/uswds.v0.9.1.js => [application/x-javascript] is 741,447 bytes
207
+## https://analytics.usa.gov/js/index.js => [application/x-javascript] is 29,868 bytes
208
+## https://www.googletagmanager.com/gtm.js?id=GTM-MQSGZS => [] is 0 bytes
209
+## https://www.google-analytics.com/analytics.js => [] is 0 bytes
210
+## https://analytics.usa.gov/css/img/arrow-down.svg => [image/svg+xml] is 780 bytes
211
+## https://analytics.usa.gov/css/vendor/fonts/sourcesanspro-bold-webfont.woff2 => [font/woff2] is 23,368 bytes
212
+## https://analytics.usa.gov/css/vendor/fonts/sourcesanspro-regular-webfont.woff2 => [font/woff2] is 23,684 bytes
213
+## https://analytics.usa.gov/css/vendor/fonts/sourcesanspro-light-webfont.woff2 => [font/woff2] is 23,608 bytes
214
+## https://analytics.usa.gov/css/vendor/fonts/sourcesanspro-italic-webfont.woff2 => [font/woff2] is 17,472 bytes
215
+## https://analytics.usa.gov/data/live/realtime.json => [application/json] is 357 bytes
216
+## https://analytics.usa.gov/data/live/today.json => [application/json] is 2,467 bytes
217
+## https://analytics.usa.gov/data/live/devices.json => [application/json] is 625 bytes
218
+## https://analytics.usa.gov/data/live/browsers.json => [application/json] is 4,697 bytes
219
+## https://analytics.usa.gov/data/live/ie.json => [application/json] is 944 bytes
220
+## https://analytics.usa.gov/data/live/os.json => [application/json] is 1,378 bytes
221
+## https://analytics.usa.gov/data/live/windows.json => [application/json] is 978 bytes
222
+## https://analytics.usa.gov/data/live/top-cities-realtime.json => [application/json] is 604,096 bytes
223
+## https://analytics.usa.gov/data/live/top-countries-realtime.json => [application/json] is 15,179 bytes
224
+## https://analytics.usa.gov/data/live/top-pages-realtime.json => [application/json] is 3,565 bytes
225
+## https://analytics.usa.gov/data/live/top-domains-7-days.json => [application/json] is 1,979 bytes
226
+## https://analytics.usa.gov/data/live/top-domains-30-days.json => [application/json] is 5,915 bytes
227
+## https://analytics.usa.gov/data/live/top-downloads-yesterday.json => [application/json] is 25,751 bytes
228
+```
229
+
230
+I happen to know that the `devices.json` file has the visitor counts and we can retrieve it from the HAR object directly with some helpers:
231
+
232
+```
233
+har_entries(har)[[18]] %>% 
234
+  get_response_body("text") %>% 
235
+  jsonlite::fromJSON() %>% 
236
+  str()
237
+## List of 5
238
+##  $ name    : chr "devices"
239
+##  $ query   :List of 8
240
+##   ..$ start-date   : chr "90daysAgo"
241
+##   ..$ end-date     : chr "yesterday"
242
+##   ..$ dimensions   : chr "ga:date,ga:deviceCategory"
243
+##   ..$ metrics      : chr "ga:sessions"
244
+##   ..$ sort         : chr "ga:date"
245
+##   ..$ start-index  : int 1
246
+##   ..$ max-results  : int 10000
247
+##   ..$ samplingLevel: chr "HIGHER_PRECISION"
248
+##  $ meta    :List of 2
249
+##   ..$ name       : chr "Devices"
250
+##   ..$ description: chr "90 days of desktop/mobile/tablet visits for all sites."
251
+##  $ totals  :List of 2
252
+##   ..$ visits : num 2.37e+09
253
+##   ..$ devices:List of 3
254
+##   .. ..$ desktop: int 1303660363
255
+##   .. ..$ mobile : int 924913139
256
+##   .. ..$ tablet : int 137183761
257
+##  $ taken_at: chr "2017-08-27T10:00:02.175Z"
258
+```
259
+
260
+Now, if we wanted to make that request on our own, we could fiddle with the various `list` element details to build our own `httr` function, or we could make use of another helper to automagically build an `httr` function for us:
261
+
262
+```
263
+library(httr)
264
+
265
+req <- as_httr_req(har_entries(har)[[18]])
266
+req() %>% 
267
+  content(as="parsed") %>% 
268
+  str()
269
+## Output is the same as previous block
270
+```
271
+
272
+The text of the function is also put on the clipboard by default, so you can paste it right into a script or package for use later on:
273
+
274
+```
275
+httr::VERB(verb = "GET", url = "https://analytics.usa.gov/data/live/devices.json", 
276
+     httr::add_headers(`User-Agent` = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/602.1 (KHTML, like Gecko) splash Version/9.0 Safari/602.1", 
277
+         Accept = "application/json,*/*", 
278
+         Referer = "https://analytics.usa.gov/"))
279
+```
280
+
281
+## The Full Monty
282
+
283
+One final `render_` function is the `render_json()` function. Let's see what it does before explaining it:
284
+
285
+```
286
+json <- render_json(url = "https://analytics.usa.gov/", wait = 5, png = TRUE, response_body = TRUE)
287
+
288
+str(json, 1)
289
+## List of 10
290
+##  $ frameName   : chr ""
291
+##  $ requestedUrl: chr "https://analytics.usa.gov/"
292
+##  $ geometry    :List of 4
293
+##  $ png         : chr "iVBORw0KGgoAAAANSUhEUgAABAAAAAMACAYAAAC6uhUNAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAgAElEQVR4AeydBZxUVRvGX7pTEBURBCWkVEQ"| __truncated__
294
+##  $ html        : chr "<!DOCTYPE html><html lang=\"en\"><!-- Initalize title and data source variables --><head>\n  <!--\n\n    Hi! We"| __truncated__
295
+##  $ title       : chr "analytics.usa.gov | The US government's web traffic."
296
+##  $ history     :List of 1
297
+##  $ url         : chr "https://analytics.usa.gov/"
298
+##  $ childFrames : list()
299
+##  $ har         :List of 1
300
+##   ..- attr(*, "class")= chr [1:2] "har" "list"
301
+##  - attr(*, "class")= chr [1:2] "splash_json" "list"
302
+```
303
+
304
+The function name corresponds to the [Splash HTTP API call](https://splash.readthedocs.io/en/stable/api.html). It is actually returning JSON => a JSON object holding pretty much everything associated with the page. Think of it as a one-stop-shop function if you want a screenshot, page content and HAR resources with just one call.

Loading…
Cancel
Save