Browse Source

make httr functions and response objects from HAR entries

boB Rudis 3 years ago
parent
commit
4ef5b7a1aa
16 changed files with 333 additions and 36 deletions
  1. 6
    2
      DESCRIPTION
  2. 13
    0
      NAMESPACE
  3. 5
    0
      NEWS.md
  4. 93
    0
      R/as_req.r
  5. 36
    0
      R/as_request.r
  6. 26
    0
      R/content.r
  7. 34
    21
      R/helpers.r
  8. 5
    0
      R/splashr-package.R
  9. 6
    4
      README.Rmd
  10. 10
    8
      README.md
  11. 18
    0
      man/as_req.Rd
  12. 29
    0
      man/as_request.Rd
  13. 1
    1
      man/get_content_size.Rd
  14. 20
    0
      man/get_request_type.Rd
  15. 14
    0
      man/get_request_url.Rd
  16. 17
    0
      man/get_response_body.Rd

+ 6
- 2
DESCRIPTION View File

@@ -1,7 +1,7 @@
1 1
 Package: splashr
2 2
 Type: Package
3 3
 Title: Tools to Work with the 'Splash' JavaScript Rendering Service
4
-Version: 0.2.0
4
+Version: 0.3.0
5 5
 Date: 2017-02-14
6 6
 Encoding: UTF-8
7 7
 Author: Bob Rudis (bob@rud.is)
@@ -29,6 +29,10 @@ Imports:
29 29
     xml2,
30 30
     jsonlite,
31 31
     magick,
32
-    HARtools
32
+    stringi,
33
+    clipr,
34
+    HARtools,
35
+    openssl,
36
+    lubridate
33 37
 RoxygenNote: 6.0.0
34 38
 Remotes: wch/harbor

+ 13
- 0
NAMESPACE View File

@@ -6,15 +6,21 @@ export("%>%")
6 6
 export(HARviewer)
7 7
 export(HARviewerOutput)
8 8
 export(as_har)
9
+export(as_req)
10
+export(as_request)
9 11
 export(execute_lua)
10 12
 export(get_body_size)
11 13
 export(get_content_size)
12 14
 export(get_content_type)
13 15
 export(get_headers_size)
16
+export(get_request_type)
17
+export(get_request_url)
18
+export(get_response_body)
14 19
 export(install_splash)
15 20
 export(is_binary)
16 21
 export(is_content_type)
17 22
 export(is_css)
23
+export(is_get)
18 24
 export(is_gif)
19 25
 export(is_html)
20 26
 export(is_javascript)
@@ -22,6 +28,7 @@ export(is_jpeg)
22 28
 export(is_json)
23 29
 export(is_plain)
24 30
 export(is_png)
31
+export(is_post)
25 32
 export(is_svg)
26 33
 export(is_xhr)
27 34
 export(is_xml)
@@ -55,6 +62,12 @@ importFrom(HARtools,HARviewer)
55 62
 importFrom(HARtools,HARviewerOutput)
56 63
 importFrom(HARtools,renderHARviewer)
57 64
 importFrom(HARtools,writeHAR)
65
+importFrom(clipr,read_clip)
58 66
 importFrom(jsonlite,fromJSON)
67
+importFrom(lubridate,ymd_hms)
68
+importFrom(openssl,base64_decode)
69
+importFrom(stringi,stri_detect_regex)
70
+importFrom(stringi,stri_split_fixed)
71
+importFrom(stringi,stri_split_regex)
59 72
 importFrom(xml2,read_html)
60 73
 importFrom(xml2,url_parse)

+ 5
- 0
NEWS.md View File

@@ -1,3 +1,8 @@
1
+0.3.0
2
+
3
+* added `as_req()`
4
+* added `as_request()`
5
+
1 6
 0.2.0
2 7
 
3 8
 * added `execute()`

+ 93
- 0
R/as_req.r View File

@@ -0,0 +1,93 @@
1
+#' Create an httr function from an HAR request
2
+#'
3
+#' @md
4
+#' @param entry HAR entry
5
+#' @param quiet if `FALSE`, print the generated `httr` call to the console; the default (`TRUE`) prints nothing
6
+#' @param add_clip if `TRUE` (the default), copy the generated `httr` call to the system clipboard
7
+#' @export
8
+as_req <- function(entry, quiet=TRUE, add_clip=TRUE) {
9
+
10
+  req <- entry$request
11
+
12
+  req$headers <- purrr::map(req$headers, "value") %>%
13
+    setNames(map_chr(req$headers, "name"))
14
+
15
+  ml <- getOption("deparse.max.lines")
16
+  options(deparse.max.lines=10000)
17
+
18
+  template <- "httr::VERB(verb = '%s', url = '%s' %s%s%s%s%s%s)"
19
+
20
+  hdrs <- enc <- bdy <- ckies <- auth <- verbos <- cfg <- ""
21
+
22
+  if (length(req$headers) > 0) {
23
+
24
+    # try to determine encoding
25
+    ct_idx <- which(grepl("content-type", names(req$headers), ignore.case=TRUE))
26
+    if (length(ct_idx) > 0) {
27
+      # retrieve & delete the content type
28
+      ct <- req$headers[[ct_idx]]
29
+      req$headers[[ct_idx]] <- NULL
30
+
31
+      if (stringi::stri_detect_regex(ct, "multipart")) {
32
+        enc <- ", encode = 'multipart'"
33
+      } else if (stringi::stri_detect_regex(ct, "form")) {
34
+        enc <- ", encode = 'form'"
35
+      } else if (stringi::stri_detect_regex(ct, "json")) {
36
+        enc <- ", encode = 'json'"
37
+      } else {
38
+        enc <- ""
39
+      }
40
+    }
41
+
42
+    hdrs <- paste0(capture.output(dput(req$headers,  control=NULL)),
43
+                   collapse="")
44
+    hdrs <- sub("^list", ", httr::add_headers", hdrs)
45
+
46
+  }
47
+
48
+  if (length(req$data) > 0) {
49
+    bdy_bits <- paste0(capture.output(dput(parse_query(req$data), control=NULL)),
50
+                       collapse="")
51
+    bdy <- sprintf(", body = %s", bdy_bits)
52
+  }
53
+
54
+  if (length(req$url_parts$username) > 0) {
55
+    auth <- sprintf(", httr::authenticate(user='%s', password='%s')",
56
+                    req$url_parts$username, req$url_parts$password)
57
+  }
58
+
59
+  if (length(req$verbose) > 0) {
60
+    verbos <- ", httr::verbose()"
61
+  }
62
+
63
+  if (length(req$cookies) > 0) {
64
+    ckies <- paste0(capture.output(dput(req$cookies, control=NULL)),
65
+                    collapse="")
66
+    ckies <- sub("^list", ", httr::set_cookies", ckies)
67
+  }
68
+
69
+  REQ_URL <- req$url
70
+
71
+  out <- sprintf(template, toupper(req$method), REQ_URL, auth, verbos, hdrs, ckies, bdy, enc)
72
+
73
+  # this does a half-decent job formatting the R function text
74
+  fil <- tempfile(fileext=".R")
75
+  on.exit(unlink(fil))
76
+  formatR::tidy_source(text=out, width.cutoff=30, indent=4, file=fil)
77
+  tmp <- paste0(readLines(fil), collapse="\n")
78
+
79
+  if (add_clip) clipr::write_clip(tmp)
80
+
81
+  if (!quiet) cat(tmp, "\n")
82
+
83
+  # make a bona fide R function
84
+  f <- function() {}
85
+  formals(f) <- NULL
86
+  environment(f) <- parent.frame()
87
+  body(f) <- as.expression(parse(text=tmp))
88
+
89
+  options(deparse.max.lines=ml)
90
+
91
+  return(f)
92
+
93
+}

+ 36
- 0
R/as_request.r View File

@@ -0,0 +1,36 @@
1
+#' Return a HAR entry response as an httr::response object
2
+#'
3
+#' @param har_entry a HAR object (should contain a response body to be most useful)
4
+#' @export
5
+#' @examples \dontrun{
6
+#' URL <- "http://www.svs.cl/portal/principal/605/w3-propertyvalue-18554.html"
7
+#'
8
+#' splash_local %>%
9
+#'   splash_response_body(TRUE) %>%
10
+#'   splash_go(URL) %>%
11
+#'   splash_wait(2) %>%
12
+#'   splash_har() -> har
13
+#'
14
+#' keep(har$log$entries, is_xhr) %>%
15
+#'   map(as_request) %>%
16
+#'   map(httr::content, as="parsed")
17
+#' }
18
+as_request <- function(har_entry) {
19
+
20
+  if (length(har_entry$response$content$text) > 0) {
21
+    content_body <- openssl::base64_decode(har_entry$response$content$text)
22
+  } else {
23
+    content_body <- NULL
24
+  }
25
+
26
+  structure(list(
27
+    url = har_entry$request$url,
28
+    status_code = har_entry$response$status,
29
+    date = lubridate::ymd_hms(har_entry$startedDateTime),
30
+    headers = setNames(map(har_entry$response$headers, "value"),
31
+                       map(har_entry$response$headers, "name")) %>%
32
+      insensitive(),
33
+    content = content_body
34
+  ), class="response")
35
+
36
+}

+ 26
- 0
R/content.r View File

@@ -0,0 +1,26 @@
1
+#' Retrieve size of content |  body | headers
2
+#'
3
+#' @param har_resp_obj HAR response object
4
+#' @export
5
+get_content_size <- function(har_resp_obj) {
6
+  csize <- har_resp_obj$response$content$size
7
+  if (is.null(csize)) return(NA_real_)
8
+  return(as.numeric(csize))
9
+}
10
+
11
+#' @rdname get_content_size
12
+#' @export
13
+get_body_size <- function(har_resp_obj) {
14
+  bsize <- har_resp_obj$response$bodySize
15
+  if (is.null(bsize)) return(NA_real_)
16
+  return(as.numeric(bsize))
17
+}
18
+
19
+#' @rdname get_content_size
20
+#' @export
21
+get_headers_size <- function(har_resp_obj) {
22
+  hsize <- har_resp_obj$response$headersSize
23
+  if (is.null(hsize)) return(NA_real_)
24
+  return(as.numeric(hsize))
25
+}
26
+

+ 34
- 21
R/helpers.r View File

@@ -1,3 +1,15 @@
1
+#' Retrieve the body content of a HAR entry
2
+#'
3
+#' @md
4
+#' @param har_resp_obj HAR response object
5
+#' @return A `raw` vector of the content or `NULL`
6
+#' @export
7
+get_response_body <- function(har_resp_obj) {
8
+  resp <- har_resp_obj$response$content$text
9
+  if (resp == "") return(NULL)
10
+  openssl::base64_decode(resp)
11
+}
12
+
1 13
 #' Retrieve or test content type of a HAR request object
2 14
 #'
3 15
 #' @param har_resp_obj HAR response object
@@ -46,23 +58,23 @@ is_javascript <- function(har_resp_obj) {
46 58
 
47 59
 #' @rdname get_content_type
48 60
 #' @export
49
-is_html <- function(har_resp_obj) {  is_content_type(har_resp_obj, type="text/html") }
61
+is_html <- function(har_resp_obj) { is_content_type(har_resp_obj, type="text/html") }
50 62
 
51 63
 #' @rdname get_content_type
52 64
 #' @export
53
-is_jpeg <- function(har_resp_obj) {  is_content_type(har_resp_obj, type="image/jpeg") }
65
+is_jpeg <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/jpeg") }
54 66
 
55 67
 #' @rdname get_content_type
56 68
 #' @export
57
-is_png <- function(har_resp_obj) {  is_content_type(har_resp_obj, type="image/png") }
69
+is_png <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/png") }
58 70
 
59 71
 #' @rdname get_content_type
60 72
 #' @export
61
-is_svg <- function(har_resp_obj) {  is_content_type(har_resp_obj, type="image/svg+xml") }
73
+is_svg <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/svg+xml") }
62 74
 
63 75
 #' @rdname get_content_type
64 76
 #' @export
65
-is_gif <- function(har_resp_obj) {  is_content_type(har_resp_obj, type="image/gif") }
77
+is_gif <- function(har_resp_obj) { is_content_type(har_resp_obj, type="image/gif") }
66 78
 
67 79
 #' @rdname get_content_type
68 80
 #' @export
@@ -82,29 +94,30 @@ is_xhr <- function(x) {
82 94
 
83 95
 }
84 96
 
85
-#' Retrieve size of content |  body | headers
97
+#' Retrieve request URL
86 98
 #'
87 99
 #' @param har_resp_obj HAR response object
88 100
 #' @export
89
-get_content_size <- function(har_resp_obj) {
90
-  csize <- har_resp_obj$response$content$size
91
-  if (is.null(csize)) return(NA_real_)
92
-  return(as.numeric(csize))
101
+get_request_url <- function(har_resp_obj) {
102
+  utype <- har_resp_obj$request$url
103
+  if (utype == "") return(NA_character_)
104
+  utype
93 105
 }
94 106
 
95
-#' @rdname get_content_size
107
+#' Retrieve or test request type
108
+#'
109
+#' @param har_resp_obj HAR response object
96 110
 #' @export
97
-get_body_size <- function(har_resp_obj) {
98
-  bsize <- har_resp_obj$response$bodySize
99
-  if (is.null(bsize)) return(NA_real_)
100
-  return(as.numeric(bsize))
111
+get_request_type <- function(har_resp_obj) {
112
+  rtype <- har_resp_obj$request$method
113
+  if (rtype == "") return(NA_character_)
114
+  rtype
101 115
 }
102 116
 
103
-#' @rdname get_content_size
117
+#' @rdname get_request_type
104 118
 #' @export
105
-get_headers_size <- function(har_resp_obj) {
106
-  hsize <- har_resp_obj$response$headersSize
107
-  if (is.null(hsize)) return(NA_real_)
108
-  return(as.numeric(hsize))
109
-}
119
+is_get <- function(har_resp_obj) {
+  # Compares the entry's request method (via get_request_type()) to "GET".
+  # get_request_type() yields NA_character_ for an empty method, in which
+  # case this comparison is NA rather than FALSE.
+  get_request_type(har_resp_obj) == "GET"
+}
110 120
 
121
+#' @rdname get_request_type
122
+#' @export
123
+is_post <- function(har_resp_obj) {
+  # Compares the entry's request method (via get_request_type()) to "POST".
+  # get_request_type() yields NA_character_ for an empty method, in which
+  # case this comparison is NA rather than FALSE.
+  get_request_type(har_resp_obj) == "POST"
+}

+ 5
- 0
R/splashr-package.R View File

@@ -15,9 +15,14 @@
15 15
 #' @docType package
16 16
 #' @author Bob Rudis (bob@@rud.is)
17 17
 #' @import purrr httr magick harbor
18
+#' @importFrom stringi stri_split_regex stri_split_fixed stri_detect_regex
18 19
 #' @importFrom HARtools writeHAR HARviewer renderHARviewer HARviewerOutput
19 20
 #' @importFrom xml2 read_html url_parse
20 21
 #' @importFrom jsonlite fromJSON
22
+#' @importFrom openssl base64_decode
23
+#' @importFrom clipr read_clip
24
+#' @importFrom lubridate ymd_hms
25
+
21 26
 NULL
22 27
 
23 28
 #' splashr exported operators

+ 6
- 4
README.Rmd View File

@@ -49,10 +49,7 @@ The following functions are implemented:
49 49
 - `start_splash`:	Start a Splash server Docker container
50 50
 - `stop_splash`:	Stop a running Splash server Docker container
51 51
 
52
-Mini-DSL (domain-specific language). These can be used to create a "script" without actually
53
-scripting in Lua. They are a less-powerful/configurable set of calls than what you
54
-can make with a full Lua function but the idea is to have it take care of very common but
55
-simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG image of a site:
52
+Mini-DSL (domain-specific language). These can be used to create a "script" without actually scripting in Lua. They are a less-powerful/configurable set of calls than what you can make with a full Lua function but the idea is to have it take care of very common but simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG image of a site:
56 53
 
57 54
 - `splash_plugins`:	Enable or disable browser plugins (e.g. Flash).
58 55
 - `splash_images`:	Enable/disable images
@@ -63,6 +60,11 @@ simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG
63 60
 - `splash_html`:	Return a HTML snapshot of a current page.
64 61
 - `splash_png`:	Return a screenshot of a current page in PNG format.
65 62
 
63
+`httr` helpers. These help turn various bits of `splashr` objects into `httr`-ish things:
64
+
65
+- `as_req`:  Turn a HAR response entry into a working `httr` function you can use to make a request with
66
+- `as_request`:  Turn a HAR response entry into an `httr` `response`-like object (i.e. you can use `httr::content()` on it)
67
+
66 68
 Helpers:
67 69
 
68 70
 - `get_body_size`:	Retrieve size of content | body | headers

+ 10
- 8
README.md View File

@@ -33,7 +33,7 @@ All you need for this package to work is a running Splash instance. You provide
33 33
 
34 34
 > 'Splash' <https://github.com/scrapinghub/splash> is a javascript rendering service. It’s a lightweight web browser with an 'HTTP' API, implemented in Python using 'Twisted' and 'QT' [and provides some of the core functionality of the 'RSelenium' or 'seleniumPipes' R packages but with a Java-free footprint]. The (twisted) 'QT' reactor is used to make the server fully asynchronous allowing to take advantage of 'webkit' concurrency via QT main loop. Some of Splash features include the ability to process multiple webpages in parallel; retrieving HTML results and/or take screenshots; disabling images or use Adblock Plus rules to make rendering faster; executing custom JavaScript in page context; getting detailed rendering info in HAR format.
35 35
 
36
-The following functions are implemented: 
36
+The following functions are implemented:
37 37
 
38 38
 -   `render_html`: Return the HTML of the javascript-rendered page.
39 39
 -   `render_file`: Return the HTML or image (png) of the javascript-rendered page in a local file
@@ -46,10 +46,7 @@ The following functions are implemented:
46 46
 -   `start_splash`: Start a Splash server Docker container
47 47
 -   `stop_splash`: Stop a running Splash server Docker container
48 48
 
49
-Mini-DSL (domain-specific language). These can be used to create a "script" without actually
50
-scripting in Lua. They are a less-powerful/configurable set of calls than what you
51
-can make with a full Lua function but the idea is to have it take care of very common but
52
-simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG image of a site:
49
+Mini-DSL (domain-specific language). These can be used to create a "script" without actually scripting in Lua. They are a less-powerful/configurable set of calls than what you can make with a full Lua function but the idea is to have it take care of very common but simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG image of a site:
53 50
 
54 51
 -   `splash_plugins`:	Enable or disable browser plugins (e.g. Flash).
55 52
 -   `splash_images`:	Enable/disable images
@@ -60,7 +57,12 @@ simple use-cases, like waiting a period of time before capturing a HAR/HTML/PNG
60 57
 -   `splash_html`:	Return a HTML snapshot of a current page.
61 58
 -   `splash_png`:	Return a screenshot of a current page in PNG format.
62 59
 
63
-Helpers: 
60
+`httr` helpers. These help turn various bits of `splashr` objects into `httr`-ish things:
61
+
62
+-   `as_req`:  Turn a HAR response entry into a working `httr` function you can use to make a request with
63
+-   `as_request`:  Turn a HAR response entry into an `httr` `response`-like object (i.e. you can use `httr::content()` on it)
64
+
65
+Helpers:
64 66
 
65 67
 -   `get_body_size`:	Retrieve size of content | body | headers
66 68
 -   `get_content_size`:	Retrieve size of content | body | headers
@@ -119,7 +121,7 @@ library(tidyverse)
119 121
 packageVersion("splashr")
120 122
 ```
121 123
 
122
-    ## [1] '0.2.0'
124
+    ## [1] '0.3.0'
123 125
 
124 126
 ``` r
125 127
 splash("splash", 8050L) %>%
@@ -292,7 +294,7 @@ library(testthat)
292 294
 date()
293 295
 ```
294 296
 
295
-    ## [1] "Tue Feb 14 09:02:35 2017"
297
+    ## [1] "Tue Feb 15 09:02:35 2017"
296 298
 
297 299
 ``` r
298 300
 test_dir("tests/")

+ 18
- 0
man/as_req.Rd View File

@@ -0,0 +1,18 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/as_req.r
3
+\name{as_req}
4
+\alias{as_req}
5
+\title{Create an httr function from an HAR request}
6
+\usage{
7
+as_req(entry, quiet = TRUE, add_clip = TRUE)
8
+}
9
+\arguments{
10
+\item{entry}{HAR entry}
11
+
12
+\item{quiet}{quiet}
13
+
14
+\item{add_clip}{add clip}
15
+}
16
+\description{
17
+Create an httr function from an HAR request
18
+}

+ 29
- 0
man/as_request.Rd View File

@@ -0,0 +1,29 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/as_request.r
3
+\name{as_request}
4
+\alias{as_request}
5
+\title{Return a HAR entry response as an httr::response object}
6
+\usage{
7
+as_request(har_entry)
8
+}
9
+\arguments{
10
+\item{har_entry}{a HAR object (should contain a response body to be most useful)}
11
+}
12
+\description{
13
+Return a HAR entry response as an httr::response object
14
+}
15
+\examples{
16
+\dontrun{
17
+URL <- "http://www.svs.cl/portal/principal/605/w3-propertyvalue-18554.html"
18
+
19
+splash_local \%>\%
20
+  splash_response_body(TRUE) \%>\%
21
+  splash_go(URL) \%>\%
22
+  splash_wait(2) \%>\%
23
+  splash_har() -> har
24
+
25
+keep(har$log$entries, is_xhr) \%>\%
26
+  map(as_request) \%>\%
27
+  map(httr::content, as="parsed")
28
+}
29
+}

+ 1
- 1
man/get_content_size.Rd View File

@@ -1,5 +1,5 @@
1 1
 % Generated by roxygen2: do not edit by hand
2
-% Please edit documentation in R/helpers.r
2
+% Please edit documentation in R/content.r
3 3
 \name{get_content_size}
4 4
 \alias{get_content_size}
5 5
 \alias{get_body_size}

+ 20
- 0
man/get_request_type.Rd View File

@@ -0,0 +1,20 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/helpers.r
3
+\name{get_request_type}
4
+\alias{get_request_type}
5
+\alias{is_get}
6
+\alias{is_post}
7
+\title{Retrieve or test request type}
8
+\usage{
9
+get_request_type(har_resp_obj)
10
+
11
+is_get(har_resp_obj)
12
+
13
+is_post(har_resp_obj)
14
+}
15
+\arguments{
16
+\item{har_resp_obj}{HAR response object}
17
+}
18
+\description{
19
+Retrieve or test request type
20
+}

+ 14
- 0
man/get_request_url.Rd View File

@@ -0,0 +1,14 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/helpers.r
3
+\name{get_request_url}
4
+\alias{get_request_url}
5
+\title{Retrieve request URL}
6
+\usage{
7
+get_request_url(har_resp_obj)
8
+}
9
+\arguments{
10
+\item{har_resp_obj}{HAR response object}
11
+}
12
+\description{
13
+Retrieve request URL
14
+}

+ 17
- 0
man/get_response_body.Rd View File

@@ -0,0 +1,17 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/helpers.r
3
+\name{get_response_body}
4
+\alias{get_response_body}
5
+\title{Retrieve the body content of a HAR entry}
6
+\usage{
7
+get_response_body(har_resp_obj)
8
+}
9
+\arguments{
10
+\item{har_resp_obj}{HAR response object}
11
+}
12
+\value{
13
+A \code{raw} vector of the content or \code{NULL}
14
+}
15
+\description{
16
+Retrieve the body content of a HAR entry
17
+}

Loading…
Cancel
Save