Tools to Scrape Dynamic Web Content via the ‘HtmlUnit’ Java Library
#' Read HTML from a URL with Browser Emulation & in a JavaScript Context
#'
#' Use a JavaScript-enabled browser context to read and render HTML from a URL.
#'
#' For the code in the examples, this is the site that is being scraped:
#'
#' \if{html}{
#' \figure{test-url-table.png}{options: width="100\%" alt="Figure: test-url-table.png"}
#' }
#'
#' \if{latex}{
#' \figure{test-url-table.png}{options: width=10cm}
#' }
#'
#' Note that the page has a table of values, but that table is rendered via
#' JavaScript, so a plain HTML fetch will not contain it.
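#'
#' One quick way to see why the emulated browser context matters (a sketch; it
#' assumes the test page above is still live and that the `HtmlUnit` jars are
#' available on the classpath):
#'
#' ```r
#' test_url <- "https://hrbrmstr.github.io/htmlunitjars/index.html"
#'
#' # a static fetch never executes the page's JavaScript, so no table cells:
#' length(xml2::xml_find_all(xml2::read_html(test_url), ".//td"))
#'
#' # the HtmlUnit-rendered page should contain the table cells:
#' length(xml2::xml_find_all(hu_read_html(test_url), ".//td"))
#' ```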
#'
#' @param url URL to retrieve
#' @param emulate browser to emulate; one of "`best`", "`chrome`", "`firefox`", "`ie`", "`edge`"
#' @param ret what to return; if `html_document` (the default) then the HTML created
#' by the `HtmlUnit` emulated browser context is passed to [xml2::read_html()]
#' and an `xml2` `html_document`/`xml_document` is returned. Note that this causes
#' further HTML processing by `xml2`/`libxml2`, so the result is not _exactly_ what
#' `HtmlUnit` generated. If you want the HTML (text) without any further
#' processing, use `text` (see the examples).
#' @param js_delay time (ms) to allow loaded JavaScript to execute; default is 2 seconds (2000 ms)
#' @param timeout overall timeout (ms); `0` == infinite wait (not recommended). Note that the
#' timeout is used twice: first when making the socket connection,
#' then again for data retrieval. If timing is critical you must
#' allow for twice the time specified here. Default: 30s (30000 ms)
#' @param ignore_ssl_errors Should SSL/TLS errors be ignored? The default (`TRUE`) is
#' a temporary workaround for how `HtmlUnit` seems to handle virtual-hosted sites
#' with multiple vhosts and multiple certificates. You can try `FALSE`
#' initially and revert to `TRUE` if you encounter issues.
#' @param enable_dnt Enable the "Do Not Track" header. Default: `FALSE`.
#' @param download_images Download images as the page is loaded? Since this
#' function is a high-level wrapper designed to read HTML,
#' it is recommended that you leave this at the default (`FALSE`) to save
#' time/bandwidth.
#' @param options options to pass to [xml2::read_html()] if `ret` == `html_document`.
#' @return an `xml2` `html_document`/`xml_document` if `ret` == `html_document`, otherwise
#' the HTML document text generated by `HtmlUnit`.
#' @export
#' @examples \dontrun{
#' test_url <- "https://hrbrmstr.github.io/htmlunitjars/index.html"
#' hu_read_html(test_url)
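#'
#' # retrieve the raw HTML text instead of an `xml2` document:
#' hu_read_html(test_url, ret = "text")
#'
#' # give slow pages more time (5s) for their JavaScript to run, then parse
#' # the rendered table (this last step assumes the rvest package is installed):
#' rvest::html_table(hu_read_html(test_url, js_delay = 5000L))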
#' }
hu_read_html <- function(url,
                         emulate = c("best", "chrome", "firefox", "ie", "edge"),
                         ret = c("html_document", "text"),
                         js_delay = 2000L,
                         timeout = 30000L,
                         ignore_ssl_errors = TRUE,
                         enable_dnt = FALSE,
                         download_images = FALSE,
                         options = c("RECOVER", "NOERROR", "NOBLANKS")) {

  emulate <- match.arg(emulate, c("best", "chrome", "firefox", "ie", "edge"))
  ret <- match.arg(ret, c("html_document", "text"))

  # map the `emulate` choice to an HtmlUnit BrowserVersion
  available_browsers <- J("com.gargoylesoftware.htmlunit.BrowserVersion")

  switch(
    emulate,
    best = available_browsers$BEST_SUPPORTED,
    chrome = available_browsers$CHROME,
    firefox = available_browsers$FIREFOX,
    edge = available_browsers$EDGE,
    ie = available_browsers$INTERNET_EXPLORER
  ) -> use_browser

  wc <- new(J("com.gargoylesoftware.htmlunit.WebClient"), use_browser)

  # install this package's custom CSS error & incorrectness listeners
  cssErrorHandler <- .jnew("is.rud.htmlunit.RDefaultCssErrorHandler")
  wc$setCssErrorHandler(cssErrorHandler)

  incorrectListenerHandler <- .jnew("is.rud.htmlunit.RIncorrectnessListener")
  wc$setIncorrectnessListener(incorrectListenerHandler)

  wc_opts <- wc$getOptions()
  wc_opts$setThrowExceptionOnFailingStatusCode(FALSE)
  wc_opts$setThrowExceptionOnScriptError(FALSE)
  wc_opts$setTimeout(as.integer(timeout))

  if (ignore_ssl_errors) wc_opts$setUseInsecureSSL(TRUE)
  if (enable_dnt) wc_opts$setDoNotTrackEnabled(TRUE)
  if (download_images) wc_opts$setDownloadImages(TRUE)

  pg <- wc$getPage(url)

  # wait (up to js_delay ms) for background JavaScript kicked off by the page
  # load to finish; this has to happen *after* getPage() since there are no
  # background jobs to wait on before a page is loaded
  wc$waitForBackgroundJavaScriptStartingBefore(.jlong(as.integer(js_delay)))

  # response metadata is available via pg$getWebResponse() if you need it
  # (e.g. pg$getWebResponse()$getContentAsString() for the raw response body)

  if (ret == "html_document") return(xml2::read_html(pg$asXml(), options = options))

  pg$asText()
}