No known key found for this signature in database
GPG Key ID: 1D7529BE14E2BBA9
27 changed files with
26 additions and
13 deletions
-
DESCRIPTION
-
NEWS.md
-
R/wc-inspect.R
-
BIN
inst/java/htmlunit-1.0-SNAPSHOT.jar
-
BIN
java/htmlunit/deps/commons-io-2.7.jar
-
BIN
java/htmlunit/deps/commons-lang3-3.11.jar
-
BIN
java/htmlunit/deps/commons-net-3.7.jar
-
BIN
java/htmlunit/deps/commons-text-1.9.jar
-
BIN
java/htmlunit/deps/htmlunit-2.43.0.jar
-
BIN
java/htmlunit/deps/htmlunit-core-js-2.43.0.jar
-
BIN
java/htmlunit/deps/jetty-client-9.4.31.v20200723.jar
-
BIN
java/htmlunit/deps/jetty-http-9.4.31.v20200723.jar
-
BIN
java/htmlunit/deps/jetty-io-9.4.31.v20200723.jar
-
BIN
java/htmlunit/deps/jetty-util-9.4.31.v20200723.jar
-
BIN
java/htmlunit/deps/jetty-xml-9.4.31.v20200723.jar
-
BIN
java/htmlunit/deps/neko-htmlunit-2.43.0.jar
-
BIN
java/htmlunit/deps/salvation-2.7.2.jar
-
BIN
java/htmlunit/deps/websocket-api-9.4.31.v20200723.jar
-
BIN
java/htmlunit/deps/websocket-client-9.4.31.v20200723.jar
-
BIN
java/htmlunit/deps/websocket-common-9.4.31.v20200723.jar
-
java/htmlunit/pom.xml
-
java/htmlunit/src/main/java/is/rud/htmlunit/Zapp.java
-
BIN
java/htmlunit/target/classes/is/rud/htmlunit/Zapp$1.class
-
BIN
java/htmlunit/target/classes/is/rud/htmlunit/Zapp.class
-
BIN
java/htmlunit/target/htmlunit-1.0-SNAPSHOT.jar
-
man/hu_read_html.Rd
-
man/wc_inspect.Rd
|
|
@ -1,8 +1,8 @@ |
|
|
|
Package: htmlunit |
|
|
|
Type: Package |
|
|
|
Title: Tools to Scrape Dynamic Web Content via the 'HtmlUnit' Java Library |
|
|
|
Version: 0.4.0 |
|
|
|
Date: 2020-05-09 |
|
|
|
Version: 0.5.0 |
|
|
|
Date: 2020-07-18 |
|
|
|
Authors@R: c( |
|
|
|
person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"), |
|
|
|
comment = c(ORCID = "0000-0001-5670-2640")), |
|
|
@ -28,11 +28,11 @@ Imports: |
|
|
|
Suggests: |
|
|
|
covr, tinytest |
|
|
|
Depends: |
|
|
|
R (>= 3.2.0), |
|
|
|
R (>= 3.6.0), |
|
|
|
rJava, |
|
|
|
htmlunitjars (>= 2.40.0), |
|
|
|
htmlunitjars (>= 2.43.0), |
|
|
|
rvest, |
|
|
|
xml2 |
|
|
|
Roxygen: list(markdown = TRUE) |
|
|
|
RoxygenNote: 7.1.0 |
|
|
|
RoxygenNote: 7.1.1 |
|
|
|
Remotes: gitlab::hrbrmstr/htmlunitjars |
|
|
|
|
|
@ -1,3 +1,7 @@ |
|
|
|
0.5.0 |
|
|
|
* Updated for 2.43.0 jars |
|
|
|
* Added `timeout` to `wc_inspect()` |
|
|
|
|
|
|
|
0.4.0 |
|
|
|
* Switched to {tinytest} |
|
|
|
* Updated for 2.40.0 jars |
|
|
|
|
|
@ -5,12 +5,16 @@ |
|
|
|
#' @md |
|
|
|
#' @param url URL to fetch |
|
|
|
#' @param js_delay (ms) How long to wait for JavaScript to execute/XHRs to load? (Default: 5000) |
|
|
|
#' @param timeout Sets the timeout (milliseconds) of the web connection. Set to zero for an infinite wait. |
|
|
|
#' Defaults to `30000`. Note: The timeout is used twice. The first is for making the socket |
|
|
|
#' connection, the second is for data retrieval. If the time is critical you must allow for twice |
|
|
|
#' the time specified here. |
|
|
|
#' @export |
|
|
|
wc_inspect <- function(url, js_delay = 5000L) { |
|
|
|
wc_inspect <- function(url, js_delay = 5000L, timeout = 30000L) { |
|
|
|
|
|
|
|
app <- J("is.rud.htmlunit.Zapp") |
|
|
|
|
|
|
|
res <- app$getRequestsFor(url, .jlong(js_delay)) |
|
|
|
res <- app$getRequestsFor(url, .jlong(js_delay), .jint(timeout)) |
|
|
|
res <- as.list(res) |
|
|
|
|
|
|
|
lapply(res, function(.x) { |
|
|
|
|
|
@ -25,7 +25,7 @@ |
|
|
|
<dependency> |
|
|
|
<groupId>net.sourceforge.htmlunit</groupId> |
|
|
|
<artifactId>htmlunit</artifactId> |
|
|
|
<version>2.40.0</version> |
|
|
|
<version>2.43.0</version> |
|
|
|
</dependency> |
|
|
|
</dependencies> |
|
|
|
</project> |
|
|
|
|
|
@ -8,7 +8,7 @@ import java.io.*; |
|
|
|
|
|
|
|
public class Zapp { |
|
|
|
|
|
|
|
public static List<WebResponse> getRequestsFor(String url, long jsDelay) throws IOException { |
|
|
|
public static List<WebResponse> getRequestsFor(String url, long jsDelay, int timeout) throws IOException { |
|
|
|
|
|
|
|
final WebClient webClient = new WebClient(BrowserVersion.CHROME); |
|
|
|
|
|
|
@ -16,7 +16,7 @@ public class Zapp { |
|
|
|
wco.setThrowExceptionOnScriptError(false); |
|
|
|
wco.setCssEnabled(true); |
|
|
|
wco.setDownloadImages(true); |
|
|
|
wco.setTimeout(30000); |
|
|
|
wco.setTimeout(timeout); |
|
|
|
|
|
|
|
final List<WebResponse> list = new ArrayList<>(); |
|
|
|
|
|
|
|
|
|
@ -22,7 +22,7 @@ hu_read_html( |
|
|
|
\item{emulate}{browser to emulate; one of "\code{best}", "\code{chrome}", "\code{firefox}", "\code{ie}"} |
|
|
|
|
|
|
|
\item{ret}{what to return; if \code{html_document} (the default) then the HTML created |
|
|
|
by the \code{HtmlUnit} emulated browser context is passed to \code{\link[xml2:read_html]{xml2::read_html()}} |
|
|
|
by the \code{HtmlUnit} emulated browser context is passed to \code{\link[xml2:read_xml]{xml2::read_html()}} |
|
|
|
and an \code{xml2} \code{html_document}/\code{xml_document} is returned. Note that this causes |
|
|
|
further HTML processing by \code{xml2}/\code{libxml2} so is not \emph{exactly} what |
|
|
|
\code{HtmlUnit} generated. If you want the HTML code (text) without any further |
|
|
@ -47,7 +47,7 @@ function is a high-level wrapper designed to do a read of HTML, |
|
|
|
it is recommended that you leave this the default \code{FALSE} to save |
|
|
|
time/bandwidth.} |
|
|
|
|
|
|
|
\item{options}{options to pass to \code{\link[xml2:read_html]{xml2::read_html()}} if \code{ret} == \code{html_document}.} |
|
|
|
\item{options}{options to pass to \code{\link[xml2:read_xml]{xml2::read_html()}} if \code{ret} == \code{html_document}.} |
|
|
|
} |
|
|
|
\value{ |
|
|
|
an \code{xml2} \code{html_document}/\code{xml_document} if \code{ret} == \code{html_document} else |
|
|
|
|
|
@ -4,12 +4,17 @@ |
|
|
|
\alias{wc_inspect} |
|
|
|
\title{Perform a "Developer Tools"-like Network Inspection of a URL} |
|
|
|
\usage{ |
|
|
|
wc_inspect(url, js_delay = 5000L) |
|
|
|
wc_inspect(url, js_delay = 5000L, timeout = 30000L) |
|
|
|
} |
|
|
|
\arguments{ |
|
|
|
\item{url}{URL to fetch} |
|
|
|
|
|
|
|
\item{js_delay}{(ms) How long to wait for JavaScript to execute/XHRs to load? (Default: 5000)} |
|
|
|
|
|
|
|
\item{timeout}{Sets the timeout (milliseconds) of the web connection. Set to zero for an infinite wait. |
|
|
|
Defaults to \code{30000}. Note: The timeout is used twice. The first is for making the socket |
|
|
|
connection, the second is for data retrieval. If the time is critical you must allow for twice |
|
|
|
the time specified here.} |
|
|
|
} |
|
|
|
\description{ |
|
|
|
Retrieves \emph{all} content loaded |
|
|
|