mirror of https://git.sr.ht/~hrbrmstr/htmlunit
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
100 lines
4.3 KiB
100 lines
4.3 KiB
#' Read HTML from a URL with Browser Emulation & in a JavaScript Context
#'
#' Use a JavaScript-enabled browser context to read and render HTML from a URL.
#'
#' For the code in the examples, this is the site that is being scraped:
#'
#' \if{html}{
#' \figure{test-url-table.png}{options: width="100\%" alt="Figure: test-url-table.png"}
#' }
#'
#' \if{latex}{
#' \figure{test-url-table.png}{options: width=10cm}
#' }
#'
#' Note that it has a table of values but it is rendered via JavaScript.
#'
#' @param url URL to retrieve
#' @param emulate browser to emulate; one of "`best`", "`chrome`", "`firefox`",
#'        "`ie`", "`edge`"
#' @param ret what to return; if `html_document` (the default) then the HTML created
#'        by the `HtmlUnit` emulated browser context is passed to [xml2::read_html()]
#'        and an `xml2` `html_document`/`xml_document` is returned. Note that this causes
#'        further HTML processing by `xml2`/`libxml2` so is not _exactly_ what
#'        `HtmlUnit` generated. If you want the HTML code (text) without any further
#'        processing then use `text` as the value.
#' @param js_delay maximum time (ms) to wait for background JavaScript started by
#'        the loaded page to finish executing; default is 2 seconds (2000 ms)
#' @param timeout overall timeout (ms); `0` == infinite wait (not recommended); note: the
#'        timeout is used twice: first in making the socket connection,
#'        second for data retrieval. If the time is critical you must
#'        allow for twice the time specified here. Default 30s (30000 ms)
#' @param ignore_ssl_errors Should SSL/TLS errors be ignored. The default (`TRUE`) is
#'        a current hack due to how `HtmlUnit` seems to handle virtual hosted sites
#'        with multiple vhosts and multiple certificates. You can try it with `FALSE`
#'        initially and revert back to `TRUE` if you encounter issues.
#' @param enable_dnt Enable the "Do Not Track" header. Default: `FALSE`.
#' @param download_images Download images as the page is loaded? Since this
#'        function is a high-level wrapper designed to do a read of HTML,
#'        it is recommended that you leave this the default `FALSE` to save
#'        time/bandwidth.
#' @param options options to pass to [xml2::read_html()] if `ret` == `html_document`.
#' @return an `xml2` `html_document`/`xml_document` if `ret` == `html_document` else
#'         the HTML document text generated by `HtmlUnit`.
#' @export
#' @examples \dontrun{
#' test_url <- "https://hrbrmstr.github.io/htmlunitjars/index.html"
#' hu_read_html(test_url)
#' }
hu_read_html <- function(url,
                         emulate = c("best", "chrome", "firefox", "ie", "edge"),
                         ret = c("html_document", "text"),
                         js_delay = 2000L,
                         timeout = 30000L,
                         ignore_ssl_errors = TRUE,
                         enable_dnt = FALSE,
                         download_images = FALSE,
                         options = c("RECOVER", "NOERROR", "NOBLANKS")) {

  # Choices are taken from the formals, so the accepted values live in one place.
  emulate <- match.arg(emulate)
  ret <- match.arg(ret)

  available_browsers <- J("com.gargoylesoftware.htmlunit.BrowserVersion")

  # Map the user-facing browser name onto the HtmlUnit BrowserVersion constant.
  use_browser <- switch(
    emulate,
    best = available_browsers$BEST_SUPPORTED,
    chrome = available_browsers$CHROME,
    firefox = available_browsers$FIREFOX,
    edge = available_browsers$EDGE,
    ie = available_browsers$INTERNET_EXPLORER
  )

  wc <- new(J("com.gargoylesoftware.htmlunit.WebClient"), use_browser)

  # Release the client's windows and JavaScript executor when we leave, on
  # success or error. on.exit handlers run after the return value has been
  # evaluated, so the page is still usable below.
  on.exit(wc$close(), add = TRUE)

  # Install the package's own CSS-error and incorrectness handlers in place of
  # HtmlUnit's defaults.
  css_error_handler <- .jnew("is.rud.htmlunit.RDefaultCssErrorHandler")
  wc$setCssErrorHandler(css_error_handler)

  incorrectness_listener <- .jnew("is.rud.htmlunit.RIncorrectnessListener")
  wc$setIncorrectnessListener(incorrectness_listener)

  wc_opts <- wc$getOptions()
  wc_opts$setThrowExceptionOnFailingStatusCode(FALSE)
  wc_opts$setThrowExceptionOnScriptError(FALSE)
  wc_opts$setTimeout(as.integer(timeout))

  if (ignore_ssl_errors) wc_opts$setUseInsecureSSL(TRUE)
  if (enable_dnt) wc_opts$setDoNotTrackEnabled(TRUE)
  if (download_images) wc_opts$setDownloadImages(TRUE)

  pg <- wc$getPage(url)

  # Wait for background JavaScript only *after* the page has been loaded;
  # before the load there are no jobs to wait for and `js_delay` would have
  # no effect.
  wc$waitForBackgroundJavaScriptStartingBefore(.jlong(as.integer(js_delay)))

  if (ret == "html_document") {
    xml2::read_html(pg$asXml(), options = options)
  } else {
    pg$asText()
  }

}
|
|
|
|
|