diff --git a/DESCRIPTION b/DESCRIPTION index 72dc0b6..98acc37 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: htmlunit Type: Package Title: Tools to Scrape Dynamic Web Content via the 'HtmlUnit' Java Library -Version: 0.4.0 -Date: 2020-05-09 +Version: 0.5.0 +Date: 2020-07-18 Authors@R: c( person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-5670-2640")), @@ -28,11 +28,11 @@ Imports: Suggests: covr, tinytest Depends: - R (>= 3.2.0), + R (>= 3.6.0), rJava, - htmlunitjars (>= 2.40.0), + htmlunitjars (>= 2.43.0), rvest, xml2 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.1.0 +RoxygenNote: 7.1.1 Remotes: gitlab::hrbrmstr/htmlunitjars diff --git a/NEWS.md b/NEWS.md index 43641de..caa6949 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +0.5.0 +* Updated for 2.43.0 jars +* Added `timeout` to `wc_inspect()` + 0.4.0 * Switched to {tinytest} * Updated for 2.40.0 jars diff --git a/R/wc-inspect.R b/R/wc-inspect.R index 40e0e63..4429c1a 100644 --- a/R/wc-inspect.R +++ b/R/wc-inspect.R @@ -5,12 +5,16 @@ #' @md #' @param url URL to fetch #' @param js_delay (ms) How long to wait for JavaScript to execute/XHRs to load? (Default: 5000) +#' @param timeout Sets the timeout (milliseconds) of the web connection. Set to zero for an infinite wait. +#' Defaults to `30000`. Note: The timeout is used twice. The first is for making the socket +#' connection, the second is for data retrieval. If the time is critical you must allow for twice +#' the time specified here. 
#' @export -wc_inspect <- function(url, js_delay = 5000L) { +wc_inspect <- function(url, js_delay = 5000L, timeout = 30000L) { app <- J("is.rud.htmlunit.Zapp") - res <- app$getRequestsFor(url, .jlong(js_delay)) + res <- app$getRequestsFor(url, .jlong(js_delay), .jint(timeout)) res <- as.list(res) lapply(res, function(.x) { diff --git a/inst/java/htmlunit-1.0-SNAPSHOT.jar b/inst/java/htmlunit-1.0-SNAPSHOT.jar index a85d537..f3a0cf0 100644 Binary files a/inst/java/htmlunit-1.0-SNAPSHOT.jar and b/inst/java/htmlunit-1.0-SNAPSHOT.jar differ diff --git a/java/htmlunit/deps/commons-io-2.7.jar b/java/htmlunit/deps/commons-io-2.7.jar new file mode 100644 index 0000000..5889458 Binary files /dev/null and b/java/htmlunit/deps/commons-io-2.7.jar differ diff --git a/java/htmlunit/deps/commons-lang3-3.11.jar b/java/htmlunit/deps/commons-lang3-3.11.jar new file mode 100644 index 0000000..bbaa8a6 Binary files /dev/null and b/java/htmlunit/deps/commons-lang3-3.11.jar differ diff --git a/java/htmlunit/deps/commons-net-3.7.jar b/java/htmlunit/deps/commons-net-3.7.jar new file mode 100644 index 0000000..7d7bb5f Binary files /dev/null and b/java/htmlunit/deps/commons-net-3.7.jar differ diff --git a/java/htmlunit/deps/commons-text-1.9.jar b/java/htmlunit/deps/commons-text-1.9.jar new file mode 100644 index 0000000..cc0c690 Binary files /dev/null and b/java/htmlunit/deps/commons-text-1.9.jar differ diff --git a/java/htmlunit/deps/htmlunit-2.43.0.jar b/java/htmlunit/deps/htmlunit-2.43.0.jar new file mode 100644 index 0000000..3c8c449 Binary files /dev/null and b/java/htmlunit/deps/htmlunit-2.43.0.jar differ diff --git a/java/htmlunit/deps/htmlunit-core-js-2.43.0.jar b/java/htmlunit/deps/htmlunit-core-js-2.43.0.jar new file mode 100644 index 0000000..491cb35 Binary files /dev/null and b/java/htmlunit/deps/htmlunit-core-js-2.43.0.jar differ diff --git a/java/htmlunit/deps/jetty-client-9.4.31.v20200723.jar b/java/htmlunit/deps/jetty-client-9.4.31.v20200723.jar new file mode 100644 index 
0000000..8393c73 Binary files /dev/null and b/java/htmlunit/deps/jetty-client-9.4.31.v20200723.jar differ diff --git a/java/htmlunit/deps/jetty-http-9.4.31.v20200723.jar b/java/htmlunit/deps/jetty-http-9.4.31.v20200723.jar new file mode 100644 index 0000000..edd09e2 Binary files /dev/null and b/java/htmlunit/deps/jetty-http-9.4.31.v20200723.jar differ diff --git a/java/htmlunit/deps/jetty-io-9.4.31.v20200723.jar b/java/htmlunit/deps/jetty-io-9.4.31.v20200723.jar new file mode 100644 index 0000000..aaa26bd Binary files /dev/null and b/java/htmlunit/deps/jetty-io-9.4.31.v20200723.jar differ diff --git a/java/htmlunit/deps/jetty-util-9.4.31.v20200723.jar b/java/htmlunit/deps/jetty-util-9.4.31.v20200723.jar new file mode 100644 index 0000000..97e9836 Binary files /dev/null and b/java/htmlunit/deps/jetty-util-9.4.31.v20200723.jar differ diff --git a/java/htmlunit/deps/jetty-xml-9.4.31.v20200723.jar b/java/htmlunit/deps/jetty-xml-9.4.31.v20200723.jar new file mode 100644 index 0000000..568668f Binary files /dev/null and b/java/htmlunit/deps/jetty-xml-9.4.31.v20200723.jar differ diff --git a/java/htmlunit/deps/neko-htmlunit-2.43.0.jar b/java/htmlunit/deps/neko-htmlunit-2.43.0.jar new file mode 100644 index 0000000..95fa4b5 Binary files /dev/null and b/java/htmlunit/deps/neko-htmlunit-2.43.0.jar differ diff --git a/java/htmlunit/deps/salvation-2.7.2.jar b/java/htmlunit/deps/salvation-2.7.2.jar new file mode 100644 index 0000000..1759a3d Binary files /dev/null and b/java/htmlunit/deps/salvation-2.7.2.jar differ diff --git a/java/htmlunit/deps/websocket-api-9.4.31.v20200723.jar b/java/htmlunit/deps/websocket-api-9.4.31.v20200723.jar new file mode 100644 index 0000000..230e9a2 Binary files /dev/null and b/java/htmlunit/deps/websocket-api-9.4.31.v20200723.jar differ diff --git a/java/htmlunit/deps/websocket-client-9.4.31.v20200723.jar b/java/htmlunit/deps/websocket-client-9.4.31.v20200723.jar new file mode 100644 index 0000000..dc07417 Binary files /dev/null and 
b/java/htmlunit/deps/websocket-client-9.4.31.v20200723.jar differ diff --git a/java/htmlunit/deps/websocket-common-9.4.31.v20200723.jar b/java/htmlunit/deps/websocket-common-9.4.31.v20200723.jar new file mode 100644 index 0000000..9fbebb0 Binary files /dev/null and b/java/htmlunit/deps/websocket-common-9.4.31.v20200723.jar differ diff --git a/java/htmlunit/pom.xml b/java/htmlunit/pom.xml index 0a5e444..3e2dad3 100644 --- a/java/htmlunit/pom.xml +++ b/java/htmlunit/pom.xml @@ -25,7 +25,7 @@ net.sourceforge.htmlunit htmlunit - 2.40.0 + 2.43.0 diff --git a/java/htmlunit/src/main/java/is/rud/htmlunit/Zapp.java b/java/htmlunit/src/main/java/is/rud/htmlunit/Zapp.java index 6547afa..d0f3ff6 100644 --- a/java/htmlunit/src/main/java/is/rud/htmlunit/Zapp.java +++ b/java/htmlunit/src/main/java/is/rud/htmlunit/Zapp.java @@ -8,7 +8,7 @@ import java.io.*; public class Zapp { - public static List getRequestsFor(String url, long jsDelay) throws IOException { + public static List getRequestsFor(String url, long jsDelay, int timeout) throws IOException { final WebClient webClient = new WebClient(BrowserVersion.CHROME); @@ -16,7 +16,7 @@ public class Zapp { wco.setThrowExceptionOnScriptError(false); wco.setCssEnabled(true); wco.setDownloadImages(true); - wco.setTimeout(30000); + wco.setTimeout(timeout); final List list = new ArrayList<>(); diff --git a/java/htmlunit/target/classes/is/rud/htmlunit/Zapp$1.class b/java/htmlunit/target/classes/is/rud/htmlunit/Zapp$1.class index 1f61b17..f2f0e05 100644 Binary files a/java/htmlunit/target/classes/is/rud/htmlunit/Zapp$1.class and b/java/htmlunit/target/classes/is/rud/htmlunit/Zapp$1.class differ diff --git a/java/htmlunit/target/classes/is/rud/htmlunit/Zapp.class b/java/htmlunit/target/classes/is/rud/htmlunit/Zapp.class index 6eea53d..2c47824 100644 Binary files a/java/htmlunit/target/classes/is/rud/htmlunit/Zapp.class and b/java/htmlunit/target/classes/is/rud/htmlunit/Zapp.class differ diff --git 
a/java/htmlunit/target/htmlunit-1.0-SNAPSHOT.jar b/java/htmlunit/target/htmlunit-1.0-SNAPSHOT.jar index a85d537..f3a0cf0 100644 Binary files a/java/htmlunit/target/htmlunit-1.0-SNAPSHOT.jar and b/java/htmlunit/target/htmlunit-1.0-SNAPSHOT.jar differ diff --git a/man/hu_read_html.Rd b/man/hu_read_html.Rd index da4ff76..01a02a3 100644 --- a/man/hu_read_html.Rd +++ b/man/hu_read_html.Rd @@ -22,7 +22,7 @@ hu_read_html( \item{emulate}{browser to emulate; one of "\code{best}", "\code{chrome}", "\code{firefox}", "\code{ie}"} \item{ret}{what to return; if \code{html_document} (the default) then the HTML created -by the \code{HtmlUnit} emulated browser context is passed to \code{\link[xml2:read_html]{xml2::read_html()}} +by the \code{HtmlUnit} emulated browser context is passed to \code{\link[xml2:read_xml]{xml2::read_html()}} and an \code{xml2} \code{html_document}/\code{xml_document} is returned. Note that this causes further HTML processing by \code{xml2}/\code{libxml2} so is not \emph{exactly} what \code{HtmlUnit} generated. 
If you want the HTML code (text) without any further @@ -47,7 +47,7 @@ function is a high-level wrapper designed to do a read of HTML, it is recommended that you leave this the default \code{FALSE} to save time/bandwidth.} -\item{options}{options to pass to \code{\link[xml2:read_html]{xml2::read_html()}} if \code{ret} == \code{html_document}.} +\item{options}{options to pass to \code{\link[xml2:read_xml]{xml2::read_html()}} if \code{ret} == \code{html_document}.} } \value{ an \code{xml2} \code{html_document}/\code{xml_document} if \code{ret} == \code{html_document} else diff --git a/man/wc_inspect.Rd b/man/wc_inspect.Rd index 90019af..c3c142b 100644 --- a/man/wc_inspect.Rd +++ b/man/wc_inspect.Rd @@ -4,12 +4,17 @@ \alias{wc_inspect} \title{Perform a "Developer Tools"-like Network Inspection of a URL} \usage{ -wc_inspect(url, js_delay = 5000L) +wc_inspect(url, js_delay = 5000L, timeout = 30000L) } \arguments{ \item{url}{URL to fetch} \item{js_delay}{(ms) How long to wait for JavaScript to execute/XHRs to load? (Default: 5000)} + +\item{timeout}{Sets the timeout (milliseconds) of the web connection. Set to zero for an infinite wait. +Defaults to \code{30000}. Note: The timeout is used twice. The first is for making the socket +connection, the second is for data retrieval. If the time is critical you must allow for twice +the time specified here.} } \description{ Retrieves \emph{all} content loaded