Browse Source

more DSL

master
boB Rudis 5 years ago
parent
commit
eddc3d8d22
No known key found for this signature in database GPG Key ID: 1D7529BE14E2BBA9
  1. 5
      DESCRIPTION
  2. 4
      NAMESPACE
  3. 93
      R/wc-html-nodes.R
  4. 2
      R/wc-options.R
  5. 5
      R/web-client.R
  6. 54
      README.Rmd
  7. 115
      README.md
  8. 34
      man/hu_read_html.Rd
  9. 6
      man/print.browserinfo.Rd
  10. 6
      man/print.webclient.Rd
  11. 8
      man/wc_browser_info.Rd
  12. 6
      man/wc_content_length.Rd
  13. 6
      man/wc_content_type.Rd
  14. 8
      man/wc_css.Rd
  15. 8
      man/wc_dnt.Rd
  16. 8
      man/wc_geo.Rd
  17. 6
      man/wc_go.Rd
  18. 6
      man/wc_headers.Rd
  19. 35
      man/wc_html_nodes.Rd
  20. 43
      man/wc_html_text.Rd
  21. 8
      man/wc_img_dl.Rd
  22. 6
      man/wc_load_time.Rd
  23. 27
      man/wc_render.Rd
  24. 6
      man/wc_resize.Rd
  25. 6
      man/wc_status.Rd
  26. 6
      man/wc_timeout.Rd
  27. 6
      man/wc_title.Rd
  28. 6
      man/wc_url.Rd
  29. 8
      man/wc_use_insecure_ssl.Rd
  30. 4
      man/wc_wait.Rd
  31. 10
      man/web_client.Rd

5
DESCRIPTION

@ -22,6 +22,8 @@ URL: https://gitlab.com/hrbrmstr/htmlunit
BugReports: https://gitlab.com/hrbrmstr/htmlunit/issues
Encoding: UTF-8
License: Apache License 2.0 | file LICENSE
Imports:
magrittr
Suggests:
testthat,
covr
@ -30,7 +32,6 @@ Depends:
rJava,
htmlunitjars,
rvest
Roxygen: list(markdown = TRUE)
RoxygenNote: 6.1.1
Remotes: hrbrmstr/htmlunitjars
Imports:
magrittr

4
NAMESPACE

@ -12,6 +12,10 @@ export(wc_dnt)
export(wc_geo)
export(wc_go)
export(wc_headers)
export(wc_html_attr)
export(wc_html_name)
export(wc_html_nodes)
export(wc_html_text)
export(wc_img_dl)
export(wc_load_time)
export(wc_render)

93
R/wc-html-nodes.R

@ -0,0 +1,93 @@
#' Select nodes from web client active page html content
#'
#' Fetches the page enclosed by the web client's current window and runs
#' either a CSS selector (via the page's `querySelectorAll()`) or an XPath
#' expression (via the page's `getByXPath()`) against it.
#'
#' @md
#' @param wc_obj a `webclient` object
#' @param css,xpath Nodes to select. Supply one of css or xpath depending on
#'        whether you want to use a css or xpath 1.0 selector. Whichever one
#'        is supplied must be a length-1 character vector.
#' @return whatever the page's `querySelectorAll()` (for `css`) or
#'         `getByXPath()` (for `xpath`) call returns, or `NULL` if there is
#'         no active page
#' @export
#' @examples \dontrun{
#' wc <- web_client()
#'
#' wc %>% wc_go("https://usa.gov/")
#'
#' wc %>%
#'   wc_html_nodes("a") %>%
#'   sapply(wc_html_text)
#'
#' wc %>%
#'   wc_html_nodes(xpath=".//a") %>%
#'   sapply(wc_html_text)
#'
#' wc %>%
#'   wc_html_nodes(xpath=".//a") %>%
#'   sapply(wc_html_attr, "href")
#' }
wc_html_nodes <- function(wc_obj, css, xpath) {

  pg <- wc_obj$wc$getCurrentWindow()$getEnclosedPage()

  # No active page: nothing to select from. This check deliberately comes
  # before argument validation, so a page-less client always yields NULL.
  if (.jnull() == pg) return(NULL)

  if (missing(css) && missing(xpath)) {
    stop("Please supply one of css or xpath", call. = FALSE)
  }

  if (!missing(css) && !missing(xpath)) {
    stop("Please supply css or xpath, not both", call. = FALSE)
  }

  if (!missing(css)) {

    # `!` binds tighter than `&&`, so the whole "is a single string" test
    # has to be parenthesised before it is negated.
    if (!(is.character(css) && length(css) == 1)) {
      stop("`css` must be a string", call. = FALSE)
    }

    out <- pg$querySelectorAll(css)

  } else {

    if (!(is.character(xpath) && length(xpath) == 1)) {
      stop("`xpath` must be a string", call. = FALSE)
    }

    out <- pg$getByXPath(xpath)

  }

  out

}
#' Extract attributes, text and tag name from webclient page html content
#'
#' `wc_html_text()` calls the node's `getTextContent()` method,
#' `wc_html_attr()` calls its `getAttribute()` method and `wc_html_name()`
#' calls its `getNodeName()` method.
#'
#' @md
#' @param dom_node a webclient page DOM node (likely produced by [wc_html_nodes()])
#' @param trim if `TRUE` will trim leading/trailing white space
#' @return the value returned by the corresponding DOM node method; for
#'         `wc_html_text()` it is passed through [trimws()] when `trim` is `TRUE`
#' @export
#' @examples \dontrun{
#' wc <- web_client()
#'
#' wc %>% wc_go("https://usa.gov/")
#'
#' wc %>%
#'   wc_html_nodes("a") %>%
#'   sapply(wc_html_text)
#'
#' wc %>%
#'   wc_html_nodes(xpath=".//a") %>%
#'   sapply(wc_html_text)
#'
#' wc %>%
#'   wc_html_nodes(xpath=".//a") %>%
#'   sapply(wc_html_attr, "href")
#' }
wc_html_text <- function(dom_node, trim = FALSE) {

  node_text <- dom_node$getTextContent()

  # Untrimmed text is handed back exactly as the node reported it.
  if (!trim) return(node_text)

  trimws(node_text)

}
#' @rdname wc_html_text
#' @param attr name of attribute to retrieve
#' @export
wc_html_attr <- function(dom_node, attr) {

  # Delegate to the DOM node's own getAttribute() method.
  attr_value <- dom_node$getAttribute(attr)

  attr_value

}
#' @rdname wc_html_text
#' @export
wc_html_name <- function(dom_node) {

  # Delegate to the DOM node's own getNodeName() method.
  node_name <- dom_node$getNodeName()

  node_name

}

2
R/wc-options.R

@ -53,7 +53,7 @@ wc_dnt <- function(wc_obj, enable) {
#' @note The caller does not have to assign the output of this function to a
#' variable as the browser state is managed internally by HtmlUnit.
#' @param wc_obj a `webclient` object
#' @param enable if `TRUE` enable image downloading (which is the HtmlUnit default)
#' @param enable if `TRUE` enable image downloading (the default is not to download images)
#' @return the `webclient` object (invisibly)
#' @family wc_opts
#' @export

5
R/web-client.R

@ -29,6 +29,7 @@ web_client <- function(emulate = c("best", "chrome", "firefox", "ie"),
wc_opts <- wc$getOptions()
wc_opts$setThrowExceptionOnFailingStatusCode(FALSE)
wc_opts$setThrowExceptionOnScriptError(FALSE)
wc_opts$setDownloadImages(FALSE)
list(
wc = wc,
@ -41,6 +42,10 @@ web_client <- function(emulate = c("best", "chrome", "firefox", "ie"),
}
#' @rdname web_client
#' @export
webclient <- web_client
#' Visit a URL
#'
#' @note The caller does not have to assign the output of this function to a

54
README.Rmd

@ -27,25 +27,29 @@ package.
The following functions are implemented:
### Standard
### DSL
- `hu_read_html`: Read HTML from a URL with Browser Emulation & in a JavaScript Context
- `web_client`/`webclient`: Create a new HtmlUnit WebClient instance<br/><br/>
### DSL
- `wc_go`: Visit a URL<br/>
- `web_client`: Create a new HtmlUnit WebClient instance
- `wc_html_nodes`: Select nodes from web client active page html content
- `wc_html_text`: Extract attributes, text and tag name from webclient page html content<br/><br/>
- `wc_html_attr`: Extract attributes, text and tag name from webclient page html content
- `wc_html_name`: Extract attributes, text and tag name from webclient page html content
- `wc_headers`: Return response headers of the last web request for current page
- `wc_browser_info`: Retrieve information about the browser used to create the 'webclient'
- `wc_content_length`: Return content length of the last web request for current page
- `wc_content_type`: Return content type of web request for current page
- `wc_content_type`: Return content type of web request for current page<br/><br/>
- `wc_render`: Retrieve current page contents<br/><br/>
- `wc_css`: Enable/Disable CSS support
- `wc_dnt`: Enable/Disable Do-Not-Track
- `wc_geo`: Enable/Disable Geolocation
- `wc_go`: Visit a URL
- `wc_headers`: Return response headers of the last web request for current page
- `wc_img_dl`: Enable/Disable Image Downloading
- `wc_load_time`: Return load time of the last web request for current page
- `wc_render`: Retrieve current page contents
- `wc_resize`: Resize the virtual browser window
- `wc_status`: Return status code of web request for current page
- `wc_timeout`: Change default request timeout
@ -54,6 +58,10 @@ The following functions are implemented:
- `wc_use_insecure_ssl`: Enable/Disable Ignoring SSL Validation Issues
- `wc_wait`: Block HtmlUnit final rendering until all background JavaScript tasks have finished executing
### Just the Content (pls)
- `hu_read_html`: Read HTML from a URL with Browser Emulation & in a JavaScript Context
## Installation
```{r eval=FALSE}
@ -106,11 +114,33 @@ wc <- web_client()
wc %>% wc_browser_info()
wc %>% wc_go(test_url)
wc <- web_client()
wc %>% wc_go("https://usa.gov/")
# if you want to use purrr::map_ functions the result of wc_html_nodes() needs to be passed to as.list()
wc %>% wc_render("html")
wc %>%
wc_html_nodes("a") %>%
sapply(wc_html_text, trim = TRUE) %>%
head(10)
wc %>% wc_render("parsed")
wc %>%
wc_html_nodes(xpath=".//a") %>%
sapply(wc_html_text, trim = TRUE) %>%
head(10)
wc %>% wc_render("text")
wc %>%
wc_html_nodes(xpath=".//a") %>%
sapply(wc_html_attr, "href") %>%
head(10)
```
Handy function to get rendered plain text for text mining:
```{r}
wc %>%
wc_render("text") %>%
substr(1, 300) %>%
cat()
```

115
README.md

@ -23,14 +23,27 @@ provided by the exposed ‘Java’ libraries in the
The following functions are implemented:
### Standard
### DSL
- `hu_read_html`: Read HTML from a URL with Browser Emulation & in a
JavaScript Context
- `web_client`/`webclient`: Create a new HtmlUnit WebClient
instance<br/><br/>
### DSL
- `wc_go`: Visit a URL<br/>
- `wc_html_nodes`: Select nodes from web client active page html
content
- `wc_html_text`: Extract attributes, text and tag name from webclient
page html content<br/><br/>
- `web_client`: Create a new HtmlUnit WebClient instance
- `wc_html_attr`: Extract attributes, text and tag name from webclient
page html content
- `wc_html_name`: Extract attributes, text and tag name from webclient
page html content
- `wc_headers`: Return response headers of the last web request for
current page
- `wc_browser_info`: Retrieve information about the browser used to
create the ‘webclient’
@ -39,7 +52,9 @@ The following functions are implemented:
for current page
- `wc_content_type`: Return content type of web request for current
page
page<br/><br/>
- `wc_render`: Retrieve current page contents<br/><br/>
- `wc_css`: Enable/Disable CSS support
@ -47,18 +62,11 @@ The following functions are implemented:
- `wc_geo`: Enable/Disable Geolocation
- `wc_go`: Visit a URL
- `wc_headers`: Return response headers of the last web request for
current page
- `wc_img_dl`: Enable/Disable Image Downloading
- `wc_load_time`: Return load time of the last web request for current
page
- `wc_render`: Retrieve current page contents
- `wc_resize`: Resize the virtual browser window
- `wc_status`: Return status code of web request for current page
@ -74,6 +82,11 @@ The following functions are implemented:
- `wc_wait`: Block HtmlUnit final rendering until all background
JavaScript tasks have finished executing
### Just the Content (pls)
- `hu_read_html`: Read HTML from a URL with Browser Emulation & in a
JavaScript Context
## Installation
``` r
@ -130,17 +143,73 @@ wc <- web_client()
wc %>% wc_browser_info()
## < Netscape / 5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36 / en-US >
wc %>% wc_go(test_url)
wc <- web_client()
wc %>% wc_render("html")
## [1] "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n<html>\r\n <head>\r\n <meta charset=\"utf-8\"/>\r\n <meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge\"/>\r\n <title>\r\n </title>\r\n <meta name=\"description\" content=\"\"/>\r\n <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\"/>\r\n <link rel=\"stylesheet\" href=\"\"/>\r\n </head>\r\n <body>\r\n <script>\r\n//<![CDATA[\r\n\n\n function createTable(tableData) {\n var table = document.createElement('table');\n var row = {};\n var cell = {};\n \n tableData.forEach(function(rowData) {\n row = table.insertRow(-1);\n rowData.forEach(function(cellData) {\n cell = row.insertCell();\n cell.textContent = cellData;\n });\n });\n document.body.appendChild(table);\n }\n \n createTable([\n [\"One\", \"Two\"], \n [\"Three\", \"Four\"], \n [\"Five\", \"Six\"]\n ]);\n\n \r\n//]]>\r\n </script>\r\n <table>\r\n <tbody>\r\n <tr>\r\n <td>\r\n One\r\n </td>\r\n <td>\r\n Two\r\n </td>\r\n </tr>\r\n <tr>\r\n <td>\r\n Three\r\n </td>\r\n <td>\r\n Four\r\n </td>\r\n </tr>\r\n <tr>\r\n <td>\r\n Five\r\n </td>\r\n <td>\r\n Six\r\n </td>\r\n </tr>\r\n </tbody>\r\n </table>\r\n </body>\r\n</html>\r\n"
wc %>% wc_go("https://usa.gov/")
# if you want to use purrr::map_ functions the result of wc_html_nodes() needs to be passed to as.list()
wc %>%
wc_html_nodes("a") %>%
sapply(wc_html_text, trim = TRUE) %>%
head(10)
## [1] "Skip to main content" "" "1-844-USA-GOV1" "All Topics and Services"
## [5] "Benefits" "Help with Bills" "Grants and Loans" "Food Assistance"
## [9] "Social Security Questions" "Affordable Rental Housing"
wc %>%
wc_html_nodes(xpath=".//a") %>%
sapply(wc_html_text, trim = TRUE) %>%
head(10)
## [1] "Skip to main content" "" "1-844-USA-GOV1" "All Topics and Services"
## [5] "Benefits" "Help with Bills" "Grants and Loans" "Food Assistance"
## [9] "Social Security Questions" "Affordable Rental Housing"
wc %>%
wc_html_nodes(xpath=".//a") %>%
sapply(wc_html_attr, "href") %>%
head(10)
## [1] "#skiptarget" "/" "/phone" "/topics"
## [5] "/benefits" "/help-with-bills" "/grants" "/food-help"
## [9] "/about-social-security" "/finding-home"
```
wc %>% wc_render("parsed")
## {xml_document}
## <html>
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\n<meta charset="utf-8">\n<meta http-e ...
## [2] <body>\r\n <script>\r\n//<![CDATA[\r\n\n\n function createTable(tableData) {\n var table = documen ...
Handy function to get rendered plain text for text mining:
wc %>% wc_render("text")
## [1] "One\tTwo\nThree\tFour\nFive\tSix"
``` r
wc %>%
wc_render("text") %>%
substr(1, 300) %>%
cat()
## Official Guide to Government Information and Services | USAGov
## Skip to main content
##
##
## An official website of the United States government Here's how you know
##
##
##
##
##
##
##
##
##
##
## Search
## Search
##
##
## Search
## 1-844-USA-GOV1
##
##
##
## All Topics and Services
##
##
## Benefits, Grants, Loans
##
##
## Government Agencies and Elected Offic
```

34
man/hu_read_html.Rd

@ -13,39 +13,39 @@ hu_read_html(url, emulate = c("best", "chrome", "firefox", "ie"),
\arguments{
\item{url}{URL to retrieve}
\item{emulate}{browser to emulate; one of "`best`", "`chrome`", "`firefox`", "`ie`"}
\item{emulate}{browser to emulate; one of "\code{best}", "\code{chrome}", "\code{firefox}", "\code{ie}"}
\item{ret}{what to return; if `html_document` (the default) then the HTML created
by the `HtmlUnit` emulated browser context is passed to [xml2::read_html()]
and an `xml2` `html_document`/`xml_document` is returned. Note that this causes
further HTML processing by `xml2`/`libxml2` so is not _exactly_ what
`HtmlUnit` generated. If you want the HTML code (text) without any further
processing then use `text` as the value.}
\item{ret}{what to return; if \code{html_document} (the default) then the HTML created
by the \code{HtmlUnit} emulated browser context is passed to \code{\link[xml2:read_html]{xml2::read_html()}}
and an \code{xml2} \code{html_document}/\code{xml_document} is returned. Note that this causes
further HTML processing by \code{xml2}/\code{libxml2} so is not \emph{exactly} what
\code{HtmlUnit} generated. If you want the HTML code (text) without any further
processing then use \code{text} as the value.}
\item{js_delay}{time (ms) to let loaded javascript to execute; default is 2 seconds (2000 ms)}
\item{timeout}{overall timeout (ms); `0` == infinite wait (not recommended); note: the
\item{timeout}{overall timeout (ms); \code{0} == infinite wait (not recommended); note: the
timeout is used twice: first in making the socket connection,
second for data retrieval. If the time is critical you must
allow for twice the time specified here. Default 30s (30000 ms)}
\item{ignore_ssl_errors}{Should SSL/TLS errors be ignored. The default (`TRUE`) is
a current hack due to how `HtmlUnit` seems to handle virtual hosted sites
with multiple vhosts and multiple certificates. You can try it with `FALSE`
initially and revert back to `TRUE` if you encounter issues.}
\item{ignore_ssl_errors}{Should SSL/TLS errors be ignored. The default (\code{TRUE}) is
a current hack due to how \code{HtmlUnit} seems to handle virtual hosted sites
with multiple vhosts and multiple certificates. You can try it with \code{FALSE}
initially and revert back to \code{TRUE} if you encounter issues.}
\item{enable_dnt}{Enable the "Do Not Track" header. Default: `FALSE`.}
\item{enable_dnt}{Enable the "Do Not Track" header. Default: \code{FALSE}.}
\item{download_images}{Download images as the page is loaded? Since this
function is a high-level wrapper designed to do a read of HTML,
it is recommended that you leave this the default `FALSE` to save
it is recommended that you leave this the default \code{FALSE} to save
time/bandwidth.}
\item{options}{options to pass to [xml2::read_html()] if `ret` == `html_document`.}
\item{options}{options to pass to \code{\link[xml2:read_html]{xml2::read_html()}} if \code{ret} == \code{html_document}.}
}
\value{
an `xml2` `html_document`/`xml_document` if `ret` == `html_document` else
the HTML document text generated by `HtmlUnit`.
an \code{xml2} \code{html_document}/\code{xml_document} if \code{ret} == \code{html_document} else
the HTML document text generated by \code{HtmlUnit}.
}
\description{
Use a JavaScript-enabled browser context to read and render HTML from a URL.

6
man/print.browserinfo.Rd

@ -2,16 +2,16 @@
% Please edit documentation in R/web-client.R
\name{print.browserinfo}
\alias{print.browserinfo}
\title{Print method for `browserinfo` objects}
\title{Print method for \code{browserinfo} objects}
\usage{
\method{print}{browserinfo}(x, ...)
}
\arguments{
\item{x}{`browserinfo` object}
\item{x}{\code{browserinfo} object}
\item{...}{unused}
}
\description{
Print method for `browserinfo` objects
Print method for \code{browserinfo} objects
}
\keyword{internal}

6
man/print.webclient.Rd

@ -2,16 +2,16 @@
% Please edit documentation in R/web-client.R
\name{print.webclient}
\alias{print.webclient}
\title{Print method for `webclient` objects}
\title{Print method for \code{webclient} objects}
\usage{
\method{print}{webclient}(x, ...)
}
\arguments{
\item{x}{`webclient` object}
\item{x}{\code{webclient} object}
\item{...}{unused}
}
\description{
Print method for `webclient` objects
Print method for \code{webclient} objects
}
\keyword{internal}

8
man/wc_browser_info.Rd

@ -2,20 +2,20 @@
% Please edit documentation in R/web-client.R
\name{wc_browser_info}
\alias{wc_browser_info}
\title{Retreive information about the browser used to create the `webclient`}
\title{Retreive information about the browser used to create the \code{webclient}}
\usage{
wc_browser_info(wc_obj)
}
\arguments{
\item{wc_obj}{a `webclient` object}
\item{wc_obj}{a \code{webclient} object}
}
\value{
the browser version
}
\description{
Retreive information about the browser used to create the `webclient`
Retreive information about the browser used to create the \code{webclient}
}
\note{
This is an information retrieval function that does not return
the `wc_obj` so must be the last function call in a `webclient` pipe.
the \code{wc_obj} so must be the last function call in a \code{webclient} pipe.
}

6
man/wc_content_length.Rd

@ -7,15 +7,15 @@
wc_content_length(wc_obj)
}
\arguments{
\item{wc_obj}{a `webclient` object}
\item{wc_obj}{a \code{webclient} object}
}
\value{
the content length (in bytes) of the web request or `NULL` if no active page
the content length (in bytes) of the web request or \code{NULL} if no active page
}
\description{
Return content length of the last web request for current page
}
\note{
This is an information retrieval function that does not return
the `wc_obj` so must be the last function call in a `webclient` pipe.
the \code{wc_obj} so must be the last function call in a \code{webclient} pipe.
}

6
man/wc_content_type.Rd

@ -7,15 +7,15 @@
wc_content_type(wc_obj)
}
\arguments{
\item{wc_obj}{a `webclient` object}
\item{wc_obj}{a \code{webclient} object}
}
\value{
the content type of the web request or `NULL` if no active page
the content type of the web request or \code{NULL} if no active page
}
\description{
Return content type of web request for current page
}
\note{
This is an information retrieval function that does not return
the `wc_obj` so must be the last function call in a `webclient` pipe.
the \code{wc_obj} so must be the last function call in a \code{webclient} pipe.
}

8
man/wc_css.Rd

@ -7,19 +7,19 @@
wc_css(wc_obj, enable)
}
\arguments{
\item{wc_obj}{a `webclient` object}
\item{wc_obj}{a \code{webclient} object}
\item{enable}{if `TRUE` enable CSS support (which is the HtmlUnit default)}
\item{enable}{if \code{TRUE} enable CSS support (which is the HtmlUnit default)}
}
\value{
the `webclient` object (invisibly)
the \code{webclient} object (invisibly)
}
\description{
Enable/Disable CSS support
}
\note{
The caller does not have to assign the output of this function to a
variable as the browser state is managed internally by HtmlUnit.
variable as the browser state is managed internally by HtmlUnit.
}
\seealso{
Other wc_opts: \code{\link{wc_dnt}}, \code{\link{wc_geo}},

8
man/wc_dnt.Rd

@ -7,19 +7,19 @@
wc_dnt(wc_obj, enable)
}
\arguments{
\item{wc_obj}{a `webclient` object}
\item{wc_obj}{a \code{webclient} object}
\item{enable}{if `TRUE` enable Do-Not-Track support (which is the HtmlUnit default)}
\item{enable}{if \code{TRUE} enable Do-Not-Track support (which is the HtmlUnit default)}
}
\value{
the `webclient` object (invisibly)
the \code{webclient} object (invisibly)
}
\description{
Enable/Disable Do-Not-Track
}
\note{
The caller does not have to assign the output of this function to a
variable as the browser state is managed internally by HtmlUnit.
variable as the browser state is managed internally by HtmlUnit.
}
\seealso{
Other wc_opts: \code{\link{wc_css}}, \code{\link{wc_geo}},

8
man/wc_geo.Rd

@ -7,19 +7,19 @@
wc_geo(wc_obj, enable)
}
\arguments{
\item{wc_obj}{a `webclient` object}
\item{wc_obj}{a \code{webclient} object}
\item{enable}{if `TRUE` enable geolocation (which is the HtmlUnit default)}
\item{enable}{if \code{TRUE} enable geolocation (which is the HtmlUnit default)}
}
\value{
the `webclient` object (invisibly)
the \code{webclient} object (invisibly)
}
\description{
Enable/Disable Geolocation
}
\note{
The caller does not have to assign the output of this function to a
variable as the browser state is managed internally by HtmlUnit.
variable as the browser state is managed internally by HtmlUnit.
}
\seealso{
Other wc_opts: \code{\link{wc_css}}, \code{\link{wc_dnt}},

6
man/wc_go.Rd

@ -7,17 +7,17 @@
wc_go(wc_obj, url)
}
\arguments{
\item{wc_obj}{a `webclient` object}
\item{wc_obj}{a \code{webclient} object}
\item{url}{URL to retrieve}
}
\value{
the `webclient` object (invisibly)
the \code{webclient} object (invisibly)
}
\description{
Visit a URL
}
\note{
The caller does not have to assign the output of this function to a
variable as the browser state is managed internally by HtmlUnit.
variable as the browser state is managed internally by HtmlUnit.
}

6
man/wc_headers.Rd

@ -7,15 +7,15 @@
wc_headers(wc_obj)
}
\arguments{
\item{wc_obj}{a `webclient` object}
\item{wc_obj}{a \code{webclient} object}
}
\value{
the response headers of the web request as a data frame or `NULL` if no active page
the response headers of the web request as a data frame or \code{NULL} if no active page
}
\description{
Return response headers of the last web request for current page
}
\note{
This is an information retrieval function that does not return
the `wc_obj` so must be the last function call in a `webclient` pipe.
the \code{wc_obj} so must be the last function call in a \code{webclient} pipe.
}

35
man/wc_html_nodes.Rd

@ -0,0 +1,35 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/wc-html-nodes.R
\name{wc_html_nodes}
\alias{wc_html_nodes}
\title{Select nodes from web client active page html content}
\usage{
wc_html_nodes(wc_obj, css, xpath)
}
\arguments{
\item{wc_obj}{a \code{webclient} object}
\item{css, xpath}{Nodes to select. Supply one of css or xpath depending on whether you want to use a css or xpath 1.0 selector.}
}
\description{
Select nodes from web client active page html content
}
\examples{
\dontrun{
wc <- web_client()
wc \%>\% wc_go("https://usa.gov/")
wc \%>\%
wc_html_nodes("a") \%>\%
sapply(wc_html_text)
wc \%>\%
wc_html_nodes(xpath=".//a") \%>\%
sapply(wc_html_text)
wc \%>\%
wc_html_nodes(xpath=".//a") \%>\%
sapply(wc_html_attr, "href")
}
}

43
man/wc_html_text.Rd

@ -0,0 +1,43 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/wc-html-nodes.R
\name{wc_html_text}
\alias{wc_html_text}
\alias{wc_html_attr}
\alias{wc_html_name}
\title{Extract attributes, text and tag name from webclient page html content}
\usage{
wc_html_text(dom_node, trim = FALSE)
wc_html_attr(dom_node, attr)
wc_html_name(dom_node)
}
\arguments{
\item{dom_node}{a webclient page DOM node (likely produced by \code{\link[=wc_html_nodes]{wc_html_nodes()}})}
\item{trim}{if \code{TRUE} will trim leading/trailing white space}
\item{attr}{name of attribute to retrieve}
}
\description{
Extract attributes, text and tag name from webclient page html content
}
\examples{
\dontrun{
wc <- web_client()
wc \%>\% wc_go("https://usa.gov/")
wc \%>\%
wc_html_nodes("a") \%>\%
sapply(wc_html_text)
wc \%>\%
wc_html_nodes(xpath=".//a") \%>\%
sapply(wc_html_text)
wc \%>\%
wc_html_nodes(xpath=".//a") \%>\%
sapply(wc_html_attr, "href")
}
}

8
man/wc_img_dl.Rd

@ -7,19 +7,19 @@
wc_img_dl(wc_obj, enable)
}
\arguments{
\item{wc_obj}{a `webclient` object}
\item{wc_obj}{a \code{webclient} object}
\item{enable}{if `TRUE` enable image downloading (which is the HtmlUnit default)}
\item{enable}{if \code{TRUE} enable image downloading (the default is not to download images)}
}
\value{
the `webclient` object (invisibly)
the \code{webclient} object (invisibly)
}
\description{
Enable/Disable Image Downloading
}
\note{
The caller does not have to assign the output of this function to a
variable as the browser state is managed internally by HtmlUnit.
variable as the browser state is managed internally by HtmlUnit.
}
\seealso{
Other wc_opts: \code{\link{wc_css}}, \code{\link{wc_dnt}},

6
man/wc_load_time.Rd

@ -7,15 +7,15 @@
wc_load_time(wc_obj)
}
\arguments{
\item{wc_obj}{a `webclient` object}
\item{wc_obj}{a \code{webclient} object}
}
\value{
the load time (in ms) of the web request or `NULL` if no active page
the load time (in ms) of the web request or \code{NULL} if no active page
}
\description{
Return load time of the last web request for current page
}
\note{
This is an information retrieval function that does not return
the `wc_obj` so must be the last function call in a `webclient` pipe.
the \code{wc_obj} so must be the last function call in a \code{webclient} pipe.
}

27
man/wc_render.Rd

@ -7,15 +7,15 @@
wc_render(wc_obj, what = c("parsed", "html", "text"))
}
\arguments{
\item{wc_obj}{a `webclient` object}
\item{wc_obj}{a \code{webclient} object}
\item{what}{what to return (see Details); NOTE that if there is no active
page this function returns `NULL`.}
page this function returns \code{NULL}.}
}
\value{
if `what` is `parsed`, an `xml2` `html_document`; if `html`,
the character HTML representation of the page; if `text`
the rendered text of the document as viewed by a human.
if \code{what} is \code{parsed}, an \code{xml2} \code{html_document}; if \code{html},
the character HTML representation of the page; if \code{text}
the rendered text of the document as viewed by a human.
}
\description{
If there is a page in the active browser context, return the contents of
@ -23,15 +23,16 @@ the page.
}
\details{
The page contents can be returned as one of:
- Parsed HTML (i.e. an `xml2` `html_document`)
- A string representation of the HTML document. NOTE: The charset used is the
current page encoding.
- A textual representation of this page that represents what would be visible
to the user if this page was shown in a web browser. This is useful for,
say, text mining.
\itemize{
\item Parsed HTML (i.e. an \code{xml2} \code{html_document})
\item A string representation of the HTML document. NOTE: The charset used is the
current page encoding.
\item A textual representation of this page that represents what would be visible
to the user if this page was shown in a web browser. This is useful for,
say, text mining.
}
}
\note{
This is an information retrieval function that does not return
the `wc_obj` so must be the last function call in a `webclient` pipe.
the \code{wc_obj} so must be the last function call in a \code{webclient} pipe.
}

6
man/wc_resize.Rd

@ -7,19 +7,19 @@
wc_resize(wc_obj, h, w)
}
\arguments{
\item{wc_obj}{a `webclient` object}
\item{wc_obj}{a \code{webclient} object}
\item{h, w}{height and width (pixels)}
}
\value{
the `webclient` object (invisibly)
the \code{webclient} object (invisibly)
}
\description{
Resize the virtual browser window
}
\note{
The caller does not have to assign the output of this function to a
variable as the browser state is managed internally by HtmlUnit.
variable as the browser state is managed internally by HtmlUnit.
}
\seealso{
Other wc_opts: \code{\link{wc_css}}, \code{\link{wc_dnt}},

6
man/wc_status.Rd

@ -7,15 +7,15 @@
wc_status(wc_obj)
}
\arguments{
\item{wc_obj}{a `webclient` object}
\item{wc_obj}{a \code{webclient} object}
}
\value{
the HTTP status code and message of the web request or `NULL` if no active page
the HTTP status code and message of the web request or \code{NULL} if no active page
}
\description{
Return status code of web request for current page
}
\note{
This is an information retrieval function that does not return
the `wc_obj` so must be the last function call in a `webclient` pipe.
the \code{wc_obj} so must be the last function call in a \code{webclient} pipe.
}

6
man/wc_timeout.Rd

@ -7,21 +7,21 @@
wc_timeout(wc_obj, timeout)
}
\arguments{
\item{wc_obj}{a `webclient` object}
\item{wc_obj}{a \code{webclient} object}
\item{timeout}{timeout (ms); The timeout is used twice. The first is for making
the socket connection, the second is for data retrieval. If the
time is critical you must allow for twice the time specified here.}
}
\value{
the `webclient` object (invisibly)
the \code{webclient} object (invisibly)
}
\description{
Change default request timeout
}
\note{
The caller does not have to assign the output of this function to a
variable as the browser state is managed internally by HtmlUnit.
variable as the browser state is managed internally by HtmlUnit.
}
\seealso{
Other wc_opts: \code{\link{wc_css}}, \code{\link{wc_dnt}},

6
man/wc_title.Rd

@ -7,15 +7,15 @@
wc_title(wc_obj)
}
\arguments{
\item{wc_obj}{a `webclient` object}
\item{wc_obj}{a \code{webclient} object}
}
\value{
page title of the current page or `NULL` if no active page
page title of the current page or \code{NULL} if no active page
}
\description{
Return page title for current page
}
\note{
This is an information retrieval function that does not return
the `wc_obj` so must be the last function call in a `webclient` pipe.
the \code{wc_obj} so must be the last function call in a \code{webclient} pipe.
}

6
man/wc_url.Rd

@ -7,15 +7,15 @@
wc_url(wc_obj)
}
\arguments{
\item{wc_obj}{a `webclient` object}
\item{wc_obj}{a \code{webclient} object}
}
\value{
the load time (in ms) of the web request or `NULL` if no active page
the load time (in ms) of the web request or \code{NULL} if no active page
}
\description{
Return load time of the last web request for current page
}
\note{
This is an information retrieval function that does not return
the `wc_obj` so must be the last function call in a `webclient` pipe.
the \code{wc_obj} so must be the last function call in a \code{webclient} pipe.
}

8
man/wc_use_insecure_ssl.Rd

@ -7,20 +7,20 @@
wc_use_insecure_ssl(wc_obj, enable)
}
\arguments{
\item{wc_obj}{a `webclient` object}
\item{wc_obj}{a \code{webclient} object}
\item{enable}{if `TRUE` the client will accept connections to any host,
\item{enable}{if \code{TRUE} the client will accept connections to any host,
regardless of whether they have valid certificates or not}
}
\value{
the `webclient` object (invisibly)
the \code{webclient} object (invisibly)
}
\description{
Enable/Disable Ignoring SSL Validation Issues
}
\note{
The caller does not have to assign the output of this function to a
variable as the browser state is managed internally by HtmlUnit.
variable as the browser state is managed internally by HtmlUnit.
}
\seealso{
Other wc_opts: \code{\link{wc_css}}, \code{\link{wc_dnt}},

4
man/wc_wait.Rd

@ -7,7 +7,7 @@
wc_wait(wc_obj, js_delay = 2000L)
}
\arguments{
\item{wc_obj}{a `webclient` object}
\item{wc_obj}{a \code{webclient} object}
\item{js_delay}{number of ms to wait/block}
}
@ -16,7 +16,7 @@ Block HtlUnit final rendering blocks until all background JavaScript tasks have
}
\note{
The caller does not have to assign the output of this function to a
variable as the browser state is managed internally by HtmlUnit.
variable as the browser state is managed internally by HtmlUnit.
}
\seealso{
Other wc_opts: \code{\link{wc_css}}, \code{\link{wc_dnt}},

10
man/web_client.Rd

@ -8,19 +8,19 @@ web_client(emulate = c("best", "chrome", "firefox", "ie"),
proxy_host = NULL, proxy_port = NULL)
}
\arguments{
\item{emulate}{browser to emulate; one of "`best`", "`chrome`", "`firefox`", "`ie`"}
\item{emulate}{browser to emulate; one of "\code{best}", "\code{chrome}", "\code{firefox}", "\code{ie}"}
\item{proxy_host, proxy_port}{the server/port that will act as proxy (default
`NULL` = no proxy)}
\code{NULL} = no proxy)}
}
\value{
`webclient` object
\code{webclient} object
}
\description{
A new HtmlUnit web client (virtual browser) will be created and a `webclient`
A new HtmlUnit web client (virtual browser) will be created and a \code{webclient}
object will be returned.
}
\details{
This is part of the `htmlunit` DSL interface.
This is part of the \code{htmlunit} DSL interface.
}
\concept{dsl}

Loading…
Cancel
Save