Browse Source

added wc_inspect() for dev tools-ish HAR retrieval

master
boB Rudis 5 years ago
parent
commit
b9f3321efa
No known key found for this signature in database GPG Key ID: 1D7529BE14E2BBA9
  1. 1
      .Rbuildignore
  2. 6
      DESCRIPTION
  3. 1
      NAMESPACE
  4. 4
      R/utils-infix-helpers.R
  5. 52
      R/wc-inspect.R
  6. 22
      README.Rmd
  7. 63
      README.md
  8. BIN
      inst/java/htmlunit-1.0-SNAPSHOT.jar
  9. 14
      java/htmlunit/Makefile
  10. BIN
      java/htmlunit/deps/commons-codec-1.11.jar
  11. BIN
      java/htmlunit/deps/commons-io-2.6.jar
  12. BIN
      java/htmlunit/deps/commons-lang3-3.9.jar
  13. BIN
      java/htmlunit/deps/commons-logging-1.2.jar
  14. BIN
      java/htmlunit/deps/commons-net-3.6.jar
  15. BIN
      java/htmlunit/deps/commons-text-1.6.jar
  16. BIN
      java/htmlunit/deps/htmlunit-2.35.0.jar
  17. BIN
      java/htmlunit/deps/htmlunit-core-js-2.35.0.jar
  18. BIN
      java/htmlunit/deps/htmlunit-cssparser-1.4.0.jar
  19. BIN
      java/htmlunit/deps/httpclient-4.5.8.jar
  20. BIN
      java/htmlunit/deps/httpcore-4.4.11.jar
  21. BIN
      java/htmlunit/deps/httpmime-4.5.8.jar
  22. BIN
      java/htmlunit/deps/jetty-client-9.4.16.v20190411.jar
  23. BIN
      java/htmlunit/deps/jetty-http-9.4.16.v20190411.jar
  24. BIN
      java/htmlunit/deps/jetty-io-9.4.16.v20190411.jar
  25. BIN
      java/htmlunit/deps/jetty-util-9.4.16.v20190411.jar
  26. BIN
      java/htmlunit/deps/jetty-xml-9.4.16.v20190411.jar
  27. BIN
      java/htmlunit/deps/neko-htmlunit-2.35.0.jar
  28. BIN
      java/htmlunit/deps/serializer-2.7.2.jar
  29. BIN
      java/htmlunit/deps/websocket-api-9.4.16.v20190411.jar
  30. BIN
      java/htmlunit/deps/websocket-client-9.4.16.v20190411.jar
  31. BIN
      java/htmlunit/deps/websocket-common-9.4.16.v20190411.jar
  32. BIN
      java/htmlunit/deps/xalan-2.7.2.jar
  33. BIN
      java/htmlunit/deps/xercesImpl-2.12.0.jar
  34. BIN
      java/htmlunit/deps/xml-apis-1.4.01.jar
  35. 31
      java/htmlunit/pom.xml
  36. 40
      java/htmlunit/src/main/java/is/rud/htmlunit/App.java
  37. BIN
      java/htmlunit/target/classes/is/rud/htmlunit/App$1.class
  38. BIN
      java/htmlunit/target/classes/is/rud/htmlunit/App.class
  39. BIN
      java/htmlunit/target/htmlunit-1.0-SNAPSHOT.jar
  40. 5
      java/htmlunit/target/maven-archiver/pom.properties
  41. 2
      java/htmlunit/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst
  42. 1
      java/htmlunit/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst
  43. 0
      java/htmlunit/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/createdFiles.lst
  44. 1
      java/htmlunit/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/inputFiles.lst
  45. 16
      man/wc_inspect.Rd

1
.Rbuildignore

@ -11,4 +11,5 @@
^doc$
^tmp$
^notes$
^java$
^\.gitlab-ci\.yml$

6
DESCRIPTION

@ -1,8 +1,8 @@
Package: htmlunit
Type: Package
Title: Tools to Scrape Dynamic Web Content via the 'HtmlUnit' Java Library
Version: 0.1.0
Date: 2018-12-16
Version: 0.2.0
Date: 2019-04-29
Authors@R: c(
person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"),
comment = c(ORCID = "0000-0001-5670-2640")),
@ -36,4 +36,4 @@ Depends:
xml2
Roxygen: list(markdown = TRUE)
RoxygenNote: 6.1.1
Remotes: hrbrmstr/htmlunitjars
Remotes: https://gitlab.com/hrbrmstr/htmlunitjars.git

1
NAMESPACE

@ -19,6 +19,7 @@ export(wc_html_name)
export(wc_html_nodes)
export(wc_html_text)
export(wc_img_dl)
export(wc_inspect)
export(wc_load_time)
export(wc_render)
export(wc_resize)

4
R/utils-infix-helpers.R

@ -0,0 +1,4 @@
# Fallback for zero-length values: yields `x` unless it has length zero,
# in which case `y` is returned instead.
`%l0%` <- function(x, y) {
  if (length(x) > 0) {
    return(x)
  }
  y
}
# NULL-coalescing operator: yields `x` unless it is NULL, in which case `y`
# is returned instead.
`%||%` <- function(x, y) {
  if (!is.null(x)) x else y
}
# Attribute accessor: returns the attribute of `x` called `name`, using
# exact (non-partial) name matching; NULL when no such attribute exists.
`%@%` <- function(x, name) {
  attr(x, which = name, exact = TRUE)
}
# Negated `%in%`: TRUE for each element of `x` that has no match in `table`.
`%nin%` <- function(x, table) {
  !(x %in% table)
}

52
R/wc-inspect.R

@ -0,0 +1,52 @@
#' Perform a "Developer Tools"-like Network Inspection of a URL
#'
#' Loads `url` through the bundled 'HtmlUnit' helper class and returns one
#' row for every web response it collected while fetching the page and
#' waiting for background JavaScript, much like the "Network" pane of a
#' browser's developer tools.
#'
#' @md
#' @param url URL to fetch
#' @param js_delay (ms) How long to wait for JavaScript to execute/XHRs to load? (Default: 5000)
#' @return A data frame (classed as a tibble) with one row per response and
#'   the columns `method`, `url`, `status_code`, `message`, `content`,
#'   `content_length`, `content_type`, `load_time` and `headers`. `headers`
#'   is a list column; each element is a `name`/`value` data frame of that
#'   response's headers.
#' @export
wc_inspect <- function(url, js_delay = 5000L) {

  tbl_classes <- c("tbl_df", "tbl", "data.frame")

  app <- J("is.rud.htmlunit.App")
  responses <- as.list(app$getRequestsFor(url, .jlong(js_delay)))

  rows <- lapply(responses, function(resp) {

    req <- resp$getWebRequest()

    hdr_rows <- lapply(as.list(resp$getResponseHeaders()), function(hdr) {
      data.frame(
        name = hdr$getName() %||% NA_character_,
        value = hdr$getValue() %||% NA_character_,
        stringsAsFactors = FALSE
      )
    })

    # Row-binding an empty list gives a zero-column frame, so the empty
    # header table is built explicitly to keep the name/value schema.
    hdrs <- if (length(hdr_rows) > 0) {
      do.call(rbind.data.frame, hdr_rows)
    } else {
      data.frame(
        name = character(0),
        value = character(0),
        stringsAsFactors = FALSE
      )
    }
    class(hdrs) <- tbl_classes

    data.frame(
      method = req$getHttpMethod()$toString() %||% NA_character_,
      url = req$getUrl()$toString() %||% NA_character_,
      status_code = resp$getStatusCode() %||% NA_integer_,
      message = resp$getStatusMessage() %||% NA_character_,
      content = resp$getContentAsString() %||% NA_character_,
      content_length = as.double(resp$getContentLength() %||% NA_real_),
      content_type = resp$getContentType() %||% NA_character_,
      load_time = as.double(resp$getLoadTime() %||% NA_real_),
      headers = I(list(hdrs)), # list column: one header table per response
      stringsAsFactors = FALSE
    )

  })

  # Same reasoning as for the headers: keep the documented columns even
  # when no responses were recorded.
  out <- if (length(rows) > 0) {
    do.call(rbind.data.frame, rows)
  } else {
    data.frame(
      method = character(0),
      url = character(0),
      status_code = integer(0),
      message = character(0),
      content = character(0),
      content_length = double(0),
      content_type = character(0),
      load_time = double(0),
      headers = I(list()),
      stringsAsFactors = FALSE
    )
  }
  class(out) <- tbl_classes

  out

}

22
README.Rmd

@ -67,11 +67,14 @@ The following functions are implemented:
- `hu_read_html`: Read HTML from a URL with Browser Emulation & in a JavaScript Context
### Content++
- `wc_inspect`: Perform a "Developer Tools"-like Network Inspection of a URL
## Installation
```{r eval=FALSE}
devtools::install_github("hrbrmstr/htmlunitjars")
devtools::install_github("hrbrmstr/htmlunit")
install.packages(c("htmlunitjars", "htmlunit"), repos = "https://cinc.rud.is", type="source")
```
```{r message=FALSE, warning=FALSE, error=FALSE, include=FALSE}
@ -82,6 +85,7 @@ options(width=120)
```{r message=FALSE, warning=FALSE, error=FALSE}
library(htmlunit)
library(tidyverse) # for some data ops; not req'd for pkg
# current version
packageVersion("htmlunit")
@ -112,6 +116,20 @@ html_table(pg)
All without needing a separate Selenium or Splash server instance.
### Content++
We can also get a HAR-like content + metadata dump:
```{r}
(xdf <- wc_inspect("https://rud.is/b"))
group_by(xdf, content_type) %>%
summarise(
total_size = sum(content_length),
total_load_time = sum(load_time)/1000
)
```
### DSL
```{r}

63
README.md

@ -93,21 +93,27 @@ The following functions are implemented:
- `hu_read_html`: Read HTML from a URL with Browser Emulation & in a
JavaScript Context
### Content++
- `wc_inspect`: Perform a “Developer Tools”-like Network Inspection of
a
URL
## Installation
``` r
devtools::install_github("hrbrmstr/htmlunitjars")
devtools::install_github("hrbrmstr/htmlunit")
install.packages(c("htmlunitjars", "htmlunit"), repos = "https://cinc.rud.is", type="source")
```
## Usage
``` r
library(htmlunit)
library(tidyverse) # for some data ops; not req'd for pkg
# current version
packageVersion("htmlunit")
## [1] '0.1.0'
## [1] '0.2.0'
```
Something `xml2::read_html()` cannot do, read the table from
@ -141,13 +147,49 @@ html_table(pg)
All without needing a separate Selenium or Splash server instance.
### Content++
We can also get a HAR-like content + metadata dump:
``` r
(xdf <- wc_inspect("https://rud.is/b"))
## # A tibble: 55 x 9
## method url status_code message content content_length content_type load_time headers
## <chr> <chr> <int> <chr> <chr> <dbl> <chr> <dbl> <I(list>
## 1 GET https://rud.is/b 301 Moved Pe… "<html>\r\n<head><ti 162 text/html 113 <tibble
## 2 GET https://rud.is/b/ 200 OK "<!-- This page is c… 10974 text/html 29 <tibble…
## 3 GET https://rud.is/b/… 200 OK "// Source: wp-inclu… 4426 application/… 29 <tibble
## 4 GET https://rud.is/b/… 200 OK ".wp-block-audio fig… 4320 text/css 21 <tibble
## 5 GET https://rud.is/b/… 200 OK "/* http://prismjs.c… 1601 text/css 19 <tibble
## 6 GET https://rud.is/b/… 200 OK ".wp_syntax {\n\tcol… 820 text/css 18 <tibble
## 7 GET https://rud.is/b/… 200 OK "@media print{body{b… 338 text/css 18 <tibble
## 8 GET https://rud.is/b/… 200 OK ".row-fluid{width:10… 2491 text/css 19 <tibble
## 9 GET https://rud.is/b/… 200 OK "/*! normalize.css v… 850 text/css 21 <tibble
## 10 GET https://rud.is/b/… 200 OK "@font-face{font-fam… 1965 text/css 20 <tibble
## # … with 45 more rows
group_by(xdf, content_type) %>%
summarise(
total_size = sum(content_length),
total_load_time = sum(load_time)/1000
)
## # A tibble: 5 x 3
## content_type total_size total_load_time
## <chr> <dbl> <dbl>
## 1 application/javascript 146930 0.965
## 2 application/x-javascript 9959 0.226
## 3 image/webp 33686 0.225
## 4 text/css 43913 0.348
## 5 text/html 11136 0.142
```
### DSL
``` r
wc <- web_client()
wc %>% wc_browser_info()
## < Netscape / 5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36 / en-US >
## < Netscape / 5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36 / en-US >
wc <- web_client()
@ -177,7 +219,7 @@ wc %>%
wc_html_nodes(xpath=".//a") %>%
sapply(wc_html_attr, "href") %>%
head(10)
## [1] "#skiptarget" "/" "/phone" "/topics"
## [1] "#skiptarget" "/" "/phone" "/#tpcs"
## [5] "/branches-of-government" "/budget" "/statistics" "/history"
## [9] "/flag" "/life-in-the-us"
```
@ -227,10 +269,13 @@ wc %>%
### htmlunit Metrics
| Lang | \# Files | (%) | LoC | (%) | Blank lines | (%) | \# Lines | (%) |
| :--- | -------: | ---: | --: | ---: | ----------: | ---: | -------: | ---: |
| R | 12 | 0.92 | 314 | 0.91 | 182 | 0.79 | 364 | 0.82 |
| Rmd | 1 | 0.08 | 32 | 0.09 | 49 | 0.21 | 81 | 0.18 |
| Lang | \# Files | (%) | LoC | (%) | Blank lines | (%) | \# Lines | (%) |
| :---- | -------: | ---: | --: | ---: | ----------: | ---: | -------: | ---: |
| R | 14 | 0.78 | 351 | 0.77 | 193 | 0.73 | 372 | 0.81 |
| Rmd | 1 | 0.06 | 38 | 0.08 | 55 | 0.21 | 87 | 0.19 |
| Maven | 1 | 0.06 | 30 | 0.07 | 0 | 0.00 | 1 | 0.00 |
| Java | 1 | 0.06 | 28 | 0.06 | 11 | 0.04 | 1 | 0.00 |
| make | 1 | 0.06 | 10 | 0.02 | 4 | 0.02 | 0 | 0.00 |
## Code of Conduct

BIN
inst/java/htmlunit-1.0-SNAPSHOT.jar

Binary file not shown.

14
java/htmlunit/Makefile

@ -0,0 +1,14 @@
# Helper targets for building the Java shim used by the R package.
# Every target here is a command, not a file, so all of them are phony.
.PHONY: pkg clean deps new

# Build the jar and copy it into the R package's inst/java directory.
pkg:
	mvn --quiet package
	cp target/htmlunit-1.0-SNAPSHOT.jar ../../inst/java

# Remove Maven build output.
clean:
	mvn clean

# Copy the project's dependency jars into ./deps.
deps:
	mvn dependency:copy-dependencies -DoutputDirectory=deps

# Generate a fresh project skeleton from the Maven quickstart archetype.
new:
	mvn archetype:generate -DgroupId=is.rud.htmlunit -DartifactId=htmlunit -DarchetypeArtifactId=maven-archetype-quickstart -DinteractiveMode=false

BIN
java/htmlunit/deps/commons-codec-1.11.jar

Binary file not shown.

BIN
java/htmlunit/deps/commons-io-2.6.jar

Binary file not shown.

BIN
java/htmlunit/deps/commons-lang3-3.9.jar

Binary file not shown.

BIN
java/htmlunit/deps/commons-logging-1.2.jar

Binary file not shown.

BIN
java/htmlunit/deps/commons-net-3.6.jar

Binary file not shown.

BIN
java/htmlunit/deps/commons-text-1.6.jar

Binary file not shown.

BIN
java/htmlunit/deps/htmlunit-2.35.0.jar

Binary file not shown.

BIN
java/htmlunit/deps/htmlunit-core-js-2.35.0.jar

Binary file not shown.

BIN
java/htmlunit/deps/htmlunit-cssparser-1.4.0.jar

Binary file not shown.

BIN
java/htmlunit/deps/httpclient-4.5.8.jar

Binary file not shown.

BIN
java/htmlunit/deps/httpcore-4.4.11.jar

Binary file not shown.

BIN
java/htmlunit/deps/httpmime-4.5.8.jar

Binary file not shown.

BIN
java/htmlunit/deps/jetty-client-9.4.16.v20190411.jar

Binary file not shown.

BIN
java/htmlunit/deps/jetty-http-9.4.16.v20190411.jar

Binary file not shown.

BIN
java/htmlunit/deps/jetty-io-9.4.16.v20190411.jar

Binary file not shown.

BIN
java/htmlunit/deps/jetty-util-9.4.16.v20190411.jar

Binary file not shown.

BIN
java/htmlunit/deps/jetty-xml-9.4.16.v20190411.jar

Binary file not shown.

BIN
java/htmlunit/deps/neko-htmlunit-2.35.0.jar

Binary file not shown.

BIN
java/htmlunit/deps/serializer-2.7.2.jar

Binary file not shown.

BIN
java/htmlunit/deps/websocket-api-9.4.16.v20190411.jar

Binary file not shown.

BIN
java/htmlunit/deps/websocket-client-9.4.16.v20190411.jar

Binary file not shown.

BIN
java/htmlunit/deps/websocket-common-9.4.16.v20190411.jar

Binary file not shown.

BIN
java/htmlunit/deps/xalan-2.7.2.jar

Binary file not shown.

BIN
java/htmlunit/deps/xercesImpl-2.12.0.jar

Binary file not shown.

BIN
java/htmlunit/deps/xml-apis-1.4.01.jar

Binary file not shown.

31
java/htmlunit/pom.xml

@ -0,0 +1,31 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<!-- Maven build descriptor for the Java helper class in the is.rud.htmlunit package. -->
<modelVersion>4.0.0</modelVersion>
<!-- Artifact coordinates: artifactId plus version give the jar name htmlunit-1.0-SNAPSHOT.jar. -->
<groupId>is.rud.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<packaging>jar</packaging>
<version>1.0-SNAPSHOT</version>
<name>htmlunit</name>
<url>http://maven.apache.org</url>
<build>
<plugins>
<!-- Compile with Java 1.7 source and target levels. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.7</source>
<target>1.7</target>
</configuration>
</plugin>
</plugins>
</build>
<!-- The only declared dependency is HtmlUnit 2.35.0. -->
<dependencies>
<!-- https://mvnrepository.com/artifact/net.sourceforge.htmlunit/htmlunit -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.35.0</version>
</dependency>
</dependencies>
</project>

40
java/htmlunit/src/main/java/is/rud/htmlunit/App.java

@ -0,0 +1,40 @@
package is.rud.htmlunit;
import com.gargoylesoftware.htmlunit.*;
import com.gargoylesoftware.htmlunit.util.*;
import java.util.*;
import java.lang.*;
import java.io.*;
/**
 * Small HtmlUnit helper that records every web response produced while
 * loading a page.
 */
public class App {

  /**
   * Loads {@code url} with a Chrome-emulating {@link WebClient} (CSS and
   * image downloads enabled, script errors not thrown, 30000 ms timeout),
   * waits for background JavaScript, and returns the responses seen.
   *
   * @param url     address of the page to load
   * @param jsDelay value passed to {@code waitForBackgroundJavaScript} after the page load (ms)
   * @return a snapshot of the responses collected, in the order they were recorded
   * @throws IOException if the page load or one of the wrapped requests fails with an I/O error
   */
  public static List<WebResponse> getRequestsFor(String url, long jsDelay) throws IOException {

    // NOTE(review): the client is never closed. Closing it here may discard
    // response content the caller still reads afterwards - confirm before
    // adding close()/try-with-resources.
    final WebClient webClient = new WebClient(BrowserVersion.CHROME);

    WebClientOptions wco = webClient.getOptions();

    wco.setThrowExceptionOnScriptError(false);
    wco.setCssEnabled(true);
    wco.setDownloadImages(true);
    wco.setTimeout(30000);

    // getResponse() below may be invoked from HtmlUnit's background
    // JavaScript threads (e.g. asynchronous XHRs), so the collector must
    // tolerate concurrent appends.
    final List<WebResponse> responses =
      Collections.synchronizedList(new ArrayList<WebResponse>());

    // The wrapper is intentionally not assigned: per the HtmlUnit API the
    // WebConnectionWrapper(WebClient) constructor installs the new wrapper
    // as the client's connection, so every request passes through it.
    new WebConnectionWrapper(webClient) {
      @Override
      public WebResponse getResponse(final WebRequest request) throws IOException {
        final WebResponse response = super.getResponse(request);
        responses.add(response);
        return response;
      }
    };

    webClient.getPage(url);
    webClient.waitForBackgroundJavaScript(jsDelay);

    // Return a private copy so that late background requests cannot modify
    // the list while the caller iterates over it.
    synchronized (responses) {
      return new ArrayList<WebResponse>(responses);
    }

  }

}

BIN
java/htmlunit/target/classes/is/rud/htmlunit/App$1.class

Binary file not shown.

BIN
java/htmlunit/target/classes/is/rud/htmlunit/App.class

Binary file not shown.

BIN
java/htmlunit/target/htmlunit-1.0-SNAPSHOT.jar

Binary file not shown.

5
java/htmlunit/target/maven-archiver/pom.properties

@ -0,0 +1,5 @@
#Generated by Maven
#Mon Apr 29 10:10:01 EDT 2019
version=1.0-SNAPSHOT
groupId=is.rud.htmlunit
artifactId=htmlunit

2
java/htmlunit/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst

@ -0,0 +1,2 @@
is/rud/htmlunit/App$1.class
is/rud/htmlunit/App.class

1
java/htmlunit/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst

@ -0,0 +1 @@
/Users/hrbrmstr/packages/htmlunit/java/htmlunit/src/main/java/is/rud/htmlunit/App.java

0
java/htmlunit/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/createdFiles.lst

1
java/htmlunit/target/maven-status/maven-compiler-plugin/testCompile/default-testCompile/inputFiles.lst

@ -0,0 +1 @@
/Users/hrbrmstr/packages/htmlunit/java/htmlunit/src/test/java/is/rud/htmlunit/AppTest.java

16
man/wc_inspect.Rd

@ -0,0 +1,16 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/wc-inspect.R
\name{wc_inspect}
\alias{wc_inspect}
\title{Perform a "Developer Tools"-like Network Inspection of a URL}
\usage{
wc_inspect(url, js_delay = 5000L)
}
\arguments{
\item{url}{URL to fetch}
\item{js_delay}{(ms) How long to wait for JavaScript to execute/XHRs to load? (Default: 5000)}
}
\description{
Retrieves \emph{all} content loaded
}
Loading…
Cancel
Save