Browse Source

2.43.0

master
boB Rudis 4 years ago
parent
commit
b9dd94a108
No known key found for this signature in database GPG Key ID: 1D7529BE14E2BBA9
  1. 3
      NEWS.md
  2. 13
      R/hu-read-html.R
  3. 14
      R/wc-inspect.R
  4. 7
      R/web-client.R
  5. 12
      R/zzz.R
  6. 69
      README.md
  7. BIN
      inst/java/htmlunit-1.0-SNAPSHOT.jar
  8. 12
      java/htmlunit/Makefile
  9. 17
      java/htmlunit/src/main/java/is/rud/htmlunit/RDefaultCssErrorHandler.java
  10. 13
      java/htmlunit/src/main/java/is/rud/htmlunit/RIncorrectnessListener.java
  11. 14
      java/htmlunit/src/main/java/is/rud/htmlunit/Zapp.java
  12. BIN
      java/htmlunit/target/classes/is/rud/htmlunit/RDefaultCssErrorHandler.class
  13. BIN
      java/htmlunit/target/classes/is/rud/htmlunit/RIncorrectnessListener.class
  14. BIN
      java/htmlunit/target/classes/is/rud/htmlunit/Zapp$1.class
  15. BIN
      java/htmlunit/target/classes/is/rud/htmlunit/Zapp.class
  16. BIN
      java/htmlunit/target/htmlunit-1.0-SNAPSHOT.jar
  17. 2
      java/htmlunit/target/maven-archiver/pom.properties
  18. 2
      java/htmlunit/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst
  19. 2
      java/htmlunit/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst
  20. 2
      man/hu_read_html.Rd
  21. 12
      man/wc_inspect.Rd
  22. 4
      man/web_client.Rd

3
NEWS.md

@ -1,6 +1,7 @@
0.5.0
* Updated for 2.43.0 jars
* Added `timeout` to `wc_inspect()`
* Added support for Microsoft Edge browser
* Added `timeout`, `css`, and `images` parameters to `wc_inspect()`
0.4.0
* Switched to {tinytest}

13
R/hu-read-html.R

@ -45,7 +45,7 @@
#' hu_read_html(test_url)
#' }
hu_read_html <- function(url,
emulate = c("best", "chrome", "firefox", "ie"),
emulate = c("best", "chrome", "firefox", "ie", "edge"),
ret = c("html_document", "text"),
js_delay = 2000L,
timeout = 30000L,
@ -54,7 +54,7 @@ hu_read_html <- function(url,
download_images = FALSE,
options = c("RECOVER", "NOERROR", "NOBLANKS")) {
emulate <- match.arg(emulate, c("best", "chrome", "firefox", "ie"))
emulate <- match.arg(emulate, c("best", "chrome", "firefox", "ie", "edge"))
ret <- match.arg(ret, c("html_document", "text"))
available_browsers <- J("com.gargoylesoftware.htmlunit.BrowserVersion")
@ -63,12 +63,19 @@ hu_read_html <- function(url,
emulate,
best = available_browsers$BEST_SUPPORTED,
chrome = available_browsers$CHROME,
firefox = available_browsers$FIREFOX_60,
firefox = available_browsers$FIREFOX,
edge = available_browsers$EDGE,
ie = available_browsers$INTERNET_EXPLORER
) -> use_browser
wc <- new(J("com.gargoylesoftware.htmlunit.WebClient"), use_browser)
cssErrorHandler <- .jnew("is.rud.htmlunit.RDefaultCssErrorHandler")
wc$setCssErrorHandler(cssErrorHandler)
incorrectListenerHandler <- .jnew("is.rud.htmlunit.RIncorrectnessListener")
wc$setIncorrectnessListener(incorrectListenerHandler)
res <- wc$waitForBackgroundJavaScriptStartingBefore(.jlong(as.integer(js_delay)))
wc_opts <- wc$getOptions()

14
R/wc-inspect.R

@ -5,16 +5,24 @@
#' @md
#' @param url URL to fetch
#' @param js_delay (ms) How long to wait for JavaScript to execute/XHRs to load? (Default: 5000)
#' @param timeout Sets the timeout (milliseconds) of the webc onnection. Set to zero for an infinite wait.
#' @param timeout Sets the timeout (milliseconds) of the web connection. Set to zero for an infinite wait.
#' Defaults to `30000`. Note: The timeout is used twice. The first is for making the socket
#' connection, the second is for data retrieval. If the time is critical you must allow for twice
#' the time specified here.
#' @param css,images enable CSS/download images? (default `FALSE`)
#' @export
wc_inspect <- function(url, js_delay = 5000L, timeout = 30000L) {
wc_inspect <- function(url, js_delay = 5000L, timeout = 30000L, css = FALSE, images = FALSE) {
app <- J("is.rud.htmlunit.Zapp")
res <- app$getRequestsFor(url, .jlong(js_delay), .jint(timeout))
app$getRequestsFor(
url,
.jlong(js_delay),
as.integer(timeout),
.jnew("java/lang/Boolean", css),
.jnew("java/lang/Boolean", images)
) -> res
res <- as.list(res)
lapply(res, function(.x) {

7
R/web-client.R

@ -14,17 +14,18 @@
#' @examples
#' w <- web_client()
#' wc_browser_info(w)
web_client <- function(emulate = c("best", "chrome", "firefox", "ie"),
web_client <- function(emulate = c("best", "chrome", "firefox", "ie", "edge"),
proxy_host = NULL, proxy_port = NULL) {
emulate <- match.arg(emulate, c("best", "chrome", "firefox", "ie"))
emulate <- match.arg(emulate, c("best", "chrome", "firefox", "ie", "edge"))
available_browsers <- J("com.gargoylesoftware.htmlunit.BrowserVersion")
switch(
emulate,
best = available_browsers$BEST_SUPPORTED,
chrome = available_browsers$CHROME,
firefox = available_browsers$FIREFOX_60,
firefox = available_browsers$FIREFOX,
edge = available_browsers$EDGE,
ie = available_browsers$INTERNET_EXPLORER
) -> use_browser

12
R/zzz.R

@ -0,0 +1,12 @@
# Reset the JVM's `java.util.logging` configuration.
#
# Looks up the `java.util.logging.LogManager` class through rJava, obtains
# the log manager instance from it and calls that instance's `reset()` method.
#
# Takes no arguments and returns `NULL` invisibly.
stop_logging <- function() {
  log_manager_class <- rJava::J("java.util.logging.LogManager")
  log_manager <- log_manager_class$getLogManager()
  log_manager$reset()
  invisible(NULL)
}
# Package load hook.
#
# Sets up rJava for this package, extends the Java class path and then resets
# Java logging via `stop_logging()`.
#
# libname: library directory passed in by R when the namespace is loaded
# pkgname: name of the package being loaded
.onLoad <- function(libname, pkgname) {
  # Hand the package over to rJava, asking for all of its jars ("*").
  rJava::.jpackage(pkgname, jars = "*", lib.loc = libname)

  # Additionally put every entry found in `inst/java` below the *current
  # working directory* on the class path.
  # NOTE(review): presumably a convenience for loading straight from the
  # source tree; the result depends on getwd() at load time -- confirm this
  # is still wanted for installed use.
  local_java_dir <- file.path(getwd(), "inst/java")
  local_java_entries <- dir(local_java_dir, full.names = TRUE)
  rJava::.jaddClassPath(local_java_entries)

  stop_logging()
}

69
README.md

@ -11,7 +11,7 @@ Status](https://travis-ci.org/hrbrmstr/htmlunit.svg?branch=master)](https://trav
[![Coverage
Status](https://codecov.io/gh/hrbrmstr/htmlunit/branch/master/graph/badge.svg)](https://codecov.io/gh/hrbrmstr/htmlunit)
![Minimal R
Version](https://img.shields.io/badge/R%3E%3D-3.2.0-blue.svg)
Version](https://img.shields.io/badge/R%3E%3D-3.6.0-blue.svg)
![License](https://img.shields.io/badge/License-Apache-blue.svg)
# htmlunit
@ -132,7 +132,7 @@ library(tidyverse) # for some data ops; not req'd for pkg
# current version
packageVersion("htmlunit")
## [1] '0.4.0'
## [1] '0.5.0'
```
Something `xml2::read_html()` cannot do, read the table from
@ -178,41 +178,36 @@ colnames(xdf)
## [7] "content_type" "load_time" "headers"
select(xdf, method, url, status_code, content_length, load_time)
## # A tibble: 59 x 5
## # A tibble: 36 x 5
## method url status_code content_length load_time
## <chr> <chr> <int> <dbl> <dbl>
## 1 GET https://rstudio.com/ 200 13531 625
## 2 GET https://use.fontawesome.com/releases/v5.0.6/css/all.css 200 8699 376
## 3 GET https://d33wubrfki0l68.cloudfront.net/bundles/c5ddb3e999592179708beea702… 200 53046 563
## 4 GET https://cdn.rawgit.com/noelboss/featherlight/1.7.13/release/featherlight… 200 763 376
## 5 GET https://d33wubrfki0l68.cloudfront.net/css/4a0f49009a213e6e2207c6f66893f0… 200 505 73
## 6 GET https://gitcdn.github.io/bootstrap-toggle/2.2.2/css/bootstrap-toggle.min… 200 548 258
## 7 GET https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-aweso… 200 6663 247
## 8 GET https://metadata-static-files.sfo2.cdn.digitaloceanspaces.com/pixel/lp.js 200 3876 364
## 9 GET https://snap.licdn.com/li.lms-analytics/insight.min.js 200 1576 455
## 10 GET https://connect.facebook.net/en_US/fbevents.js 200 31766 412
## # … with 49 more rows
## 1 GET https://rstudio.com/ 200 14621 495
## 2 GET https://metadata-static-files.sfo2.cdn.digitaloceanspaces.com/pixel/lp.js 200 3576 221
## 3 GET https://snap.licdn.com/li.lms-analytics/insight.min.js 200 1576 162
## 4 GET https://connect.facebook.net/en_US/fbevents.js 200 34269 138
## 5 GET https://connect.facebook.net/signals/config/151855192184380?v=2.9.23&r=s… 200 134841 66
## 6 GET https://munchkin.marketo.net/munchkin-beta.js 200 752 230
## 7 GET https://munchkin.marketo.net/159/munchkin.js 200 4810 27
## 8 GET https://x.clearbitjs.com/v1/pk_60c5aa2221e3c03eca10fb6876aa6df7/clearbit… 200 86568 483
## 9 GET https://cdn.segment.com/analytics.js/v1/gO0uTGfCkO4DQpfkRim9mBsjdKrehtnu… 200 62860 243
## 10 GET https://static.hotjar.com/c/hotjar-1446157.js?sv=6 200 1708 212
## # … with 26 more rows
group_by(xdf, content_type) %>%
summarise(
total_size = sum(content_length),
total_load_time = sum(load_time)/1000
)
## # A tibble: 12 x 3
## content_type total_size total_load_time
## <chr> <dbl> <dbl>
## 1 "" 0 1.02
## 2 "application/javascript" 443531 3.61
## 3 "application/json" 4176 3.10
## 4 "application/x-javascript" 161004 1.69
## 5 "image/gif" 131 0.561
## 6 "image/jpeg" 59772 0.105
## 7 "image/png" 40634 0.234
## 8 "image/svg+xml" 10869 0.303
## 9 "text/css" 121175 2.81
## 10 "text/html" 14425 1.3
## 11 "text/javascript" 174172 1.42
## 12 "text/plain" 28 0.354
## # A tibble: 7 x 3
## content_type total_size total_load_time
## <chr> <dbl> <dbl>
## 1 application/javascript 431338 2.58
## 2 application/json 4118 1.37
## 3 application/x-javascript 176248 0.623
## 4 image/gif 35 0.232
## 5 text/html 16640 1.36
## 6 text/javascript 254971 0.996
## 7 text/plain 28 0.189
```
### DSL
@ -221,7 +216,7 @@ group_by(xdf, content_type) %>%
wc <- web_client(emulate = "chrome")
wc %>% wc_browser_info()
## < Netscape / 5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36 / en-US >
## < Netscape / 5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 / en-US >
wc <- web_client()
@ -268,6 +263,7 @@ wc %>%
## An official website of the United States government Here's how you know
##
##
## Main Navigation
## Search
## Search
## Search
@ -275,18 +271,19 @@ wc %>%
## All Topics and Services
## Benefits, Grants, Loans
## Government Agencies and Elected Officials
## Jobs and Unemplo
```
### htmlunit Metrics
| Lang | \# Files | (%) | LoC | (%) | Blank lines | (%) | \# Lines | (%) |
| :---- | -------: | ---: | --: | ---: | ----------: | ---: | -------: | ---: |
| R | 13 | 0.76 | 320 | 0.75 | 182 | 0.73 | 372 | 0.83 |
| Rmd | 1 | 0.06 | 41 | 0.10 | 52 | 0.21 | 75 | 0.17 |
| Maven | 1 | 0.06 | 30 | 0.07 | 0 | 0.00 | 1 | 0.00 |
| Java | 1 | 0.06 | 28 | 0.07 | 12 | 0.05 | 0 | 0.00 |
| make | 1 | 0.06 | 10 | 0.02 | 4 | 0.02 | 0 | 0.00 |
| R | 14 | 0.70 | 341 | 0.72 | 188 | 0.70 | 377 | 0.82 |
| Java | 3 | 0.15 | 52 | 0.11 | 23 | 0.09 | 3 | 0.01 |
| Rmd | 1 | 0.05 | 41 | 0.09 | 52 | 0.19 | 75 | 0.16 |
| Maven | 1 | 0.05 | 30 | 0.06 | 0 | 0.00 | 1 | 0.00 |
| make | 1 | 0.05 | 10 | 0.02 | 4 | 0.01 | 4 | 0.01 |
clock Package Metrics for htmlunit
## Code of Conduct

BIN
inst/java/htmlunit-1.0-SNAPSHOT.jar

Binary file not shown.

12
java/htmlunit/Makefile

@ -1,14 +1,18 @@
.PHONY: clean pkg deps run
pkg:
JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk-11.0.1.jdk/Contents/Home mvn --quiet package
# JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk-11.0.1.jdk/Contents/Home mvn --quiet package
JAVA_HOME=/Library/Java/JavaVirtualMachines/openjdk-11.0.2.jdk/Contents/Home mvn --quiet package
cp target/htmlunit-1.0-SNAPSHOT.jar ../../inst/java
clean:
JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk-11.0.1.jdk/Contents/Home mvn clean
# JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk-11.0.1.jdk/Contents/Home mvn clean
JAVA_HOME=/Library/Java/JavaVirtualMachines/openjdk-11.0.2.jdk/Contents/Home mvn clean
deps:
JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk-11.0.1.jdk/Contents/Home mvn dependency:copy-dependencies -DoutputDirectory=deps
# JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk-11.0.1.jdk/Contents/Home mvn dependency:copy-dependencies -DoutputDirectory=deps
JAVA_HOME=/Library/Java/JavaVirtualMachines/openjdk-11.0.2.jdk/Contents/Home mvn dependency:copy-dependencies -DoutputDirectory=deps
new:
JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk-11.0.1.jdk/Contents/Home mvn archetype:generate -DgroupId=is.rud.htmlunit -DartifactId=htmlunit -DarchetypeArtifactId=maven-archetype-quickstart -DinteractiveMode=false
# JAVA_HOME=/Library/Java/JavaVirtualMachines/jdk-11.0.1.jdk/Contents/Home mvn archetype:generate -DgroupId=is.rud.htmlunit -DartifactId=htmlunit -DarchetypeArtifactId=maven-archetype-quickstart -DinteractiveMode=false
JAVA_HOME=/Library/Java/JavaVirtualMachines/openjdk-11.0.2.jdk/Contents/Home mvn archetype:generate -DgroupId=is.rud.htmlunit -DartifactId=htmlunit -DarchetypeArtifactId=maven-archetype-quickstart -DinteractiveMode=false

17
java/htmlunit/src/main/java/is/rud/htmlunit/RDefaultCssErrorHandler.java

@ -0,0 +1,17 @@
package is.rud.htmlunit;
/**
 * CSS error handler whose callbacks all have empty bodies.
 *
 * <p>Every warning, error and fatal error passed to this handler is dropped
 * without producing any output or raising an exception.
 *
 * <p>The class is {@link java.io.Serializable}; it carries no state.
 */
public class RDefaultCssErrorHandler
    implements com.gargoylesoftware.css.parser.CSSErrorHandler, java.io.Serializable {

  /**
   * Explicit serialization version, so the serialized form does not depend on
   * a compiler-derived default value.
   */
  private static final long serialVersionUID = 1L;

  /**
   * Ignores the reported CSS error.
   *
   * @param exception the parse problem being reported; not inspected
   */
  @Override
  public void error(final com.gargoylesoftware.css.parser.CSSParseException exception) {
    // intentionally empty
  }

  /**
   * Ignores the reported fatal CSS error.
   *
   * @param exception the parse problem being reported; not inspected
   */
  @Override
  public void fatalError(final com.gargoylesoftware.css.parser.CSSParseException exception) {
    // intentionally empty
  }

  /**
   * Ignores the reported CSS warning.
   *
   * @param exception the parse problem being reported; not inspected
   */
  @Override
  public void warning(final com.gargoylesoftware.css.parser.CSSParseException exception) {
    // intentionally empty
  }
}

13
java/htmlunit/src/main/java/is/rud/htmlunit/RIncorrectnessListener.java

@ -0,0 +1,13 @@
package is.rud.htmlunit;
/**
 * Incorrectness listener whose single callback has an empty body.
 *
 * <p>Every notification passed to this listener is dropped without producing
 * any output or raising an exception.
 *
 * <p>The class is {@link java.io.Serializable}; it carries no state.
 */
public class RIncorrectnessListener
    implements com.gargoylesoftware.htmlunit.IncorrectnessListener, java.io.Serializable {

  /**
   * Explicit serialization version, so the serialized form does not depend on
   * a compiler-derived default value.
   */
  private static final long serialVersionUID = 1L;

  /**
   * {@inheritDoc}
   *
   * <p>This implementation ignores both arguments and does nothing.
   */
  @Override
  public void notify(final java.lang.String message, final java.lang.Object origin) {
    // intentionally empty
  }
}

14
java/htmlunit/src/main/java/is/rud/htmlunit/Zapp.java

@ -2,20 +2,28 @@ package is.rud.htmlunit;
import com.gargoylesoftware.htmlunit.*;
import com.gargoylesoftware.htmlunit.util.*;
import java.util.*;
import java.lang.*;
import java.io.*;
public class Zapp {
public static List<WebResponse> getRequestsFor(String url, long jsDelay, int timeout) throws IOException {
private static com.gargoylesoftware.htmlunit.IncorrectnessListener incorrectnessListener_ = new RIncorrectnessListener();
private static com.gargoylesoftware.css.parser.CSSErrorHandler cssErrorHandler_ = new RDefaultCssErrorHandler();
public static List<WebResponse> getRequestsFor(String url, long jsDelay, int timeout, Boolean css, Boolean images) throws IOException {
final WebClient webClient = new WebClient(BrowserVersion.CHROME);
webClient.setCssErrorHandler(cssErrorHandler_);
webClient.setIncorrectnessListener(incorrectnessListener_);
WebClientOptions wco = webClient.getOptions();
wco.setThrowExceptionOnScriptError(false);
wco.setCssEnabled(true);
wco.setDownloadImages(true);
wco.setCssEnabled(css);
wco.setDownloadImages(images);
wco.setTimeout(timeout);
final List<WebResponse> list = new ArrayList<>();

BIN
java/htmlunit/target/classes/is/rud/htmlunit/RDefaultCssErrorHandler.class

Binary file not shown.

BIN
java/htmlunit/target/classes/is/rud/htmlunit/RIncorrectnessListener.class

Binary file not shown.

BIN
java/htmlunit/target/classes/is/rud/htmlunit/Zapp$1.class

Binary file not shown.

BIN
java/htmlunit/target/classes/is/rud/htmlunit/Zapp.class

Binary file not shown.

BIN
java/htmlunit/target/htmlunit-1.0-SNAPSHOT.jar

Binary file not shown.

2
java/htmlunit/target/maven-archiver/pom.properties

@ -1,5 +1,5 @@
#Generated by Maven
#Tue Mar 10 08:03:25 EDT 2020
#Wed Aug 19 08:51:02 EDT 2020
groupId=is.rud.htmlunit
artifactId=htmlunit
version=1.0-SNAPSHOT

2
java/htmlunit/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst

@ -1,2 +1,4 @@
is/rud/htmlunit/RDefaultCssErrorHandler.class
is/rud/htmlunit/RIncorrectnessListener.class
is/rud/htmlunit/Zapp.class
is/rud/htmlunit/Zapp$1.class

2
java/htmlunit/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst

@ -1 +1,3 @@
/Users/hrbrmstr/packages/htmlunit/java/htmlunit/src/main/java/is/rud/htmlunit/RDefaultCssErrorHandler.java
/Users/hrbrmstr/packages/htmlunit/java/htmlunit/src/main/java/is/rud/htmlunit/RIncorrectnessListener.java
/Users/hrbrmstr/packages/htmlunit/java/htmlunit/src/main/java/is/rud/htmlunit/Zapp.java

2
man/hu_read_html.Rd

@ -6,7 +6,7 @@
\usage{
hu_read_html(
url,
emulate = c("best", "chrome", "firefox", "ie"),
emulate = c("best", "chrome", "firefox", "ie", "edge"),
ret = c("html_document", "text"),
js_delay = 2000L,
timeout = 30000L,

12
man/wc_inspect.Rd

@ -4,17 +4,25 @@
\alias{wc_inspect}
\title{Perform a "Developer Tools"-like Network Inspection of a URL}
\usage{
wc_inspect(url, js_delay = 5000L, timeout = 30000L)
wc_inspect(
url,
js_delay = 5000L,
timeout = 30000L,
css = FALSE,
images = FALSE
)
}
\arguments{
\item{url}{URL to fetch}
\item{js_delay}{(ms) How long to wait for JavaScript to execute/XHRs to load? (Default: 5000)}
\item{timeout}{Sets the timeout (milliseconds) of the webc onnection. Set to zero for an infinite wait.
\item{timeout}{Sets the timeout (milliseconds) of the web connection. Set to zero for an infinite wait.
Defaults to \code{30000}. Note: The timeout is used twice. The first is for making the socket
connection, the second is for data retrieval. If the time is critical you must allow for twice
the time specified here.}
\item{css, images}{enable CSS/download images? (default \code{FALSE})}
}
\description{
Retrieves \emph{all} content loaded

4
man/web_client.Rd

@ -6,13 +6,13 @@
\title{Create a new HtmlUnit WebClient instance}
\usage{
web_client(
emulate = c("best", "chrome", "firefox", "ie"),
emulate = c("best", "chrome", "firefox", "ie", "edge"),
proxy_host = NULL,
proxy_port = NULL
)
webclient(
emulate = c("best", "chrome", "firefox", "ie"),
emulate = c("best", "chrome", "firefox", "ie", "edge"),
proxy_host = NULL,
proxy_port = NULL
)

Loading…
Cancel
Save