No known key found for this signature in database
GPG Key ID: 2A514A4997464560
2 changed files with
17 additions and
4 deletions
-
R/clean.r
-
man/clean_text.Rd
|
|
@ -8,6 +8,9 @@ |
|
|
|
#' @md |
|
|
|
#' @param doc atomic character vector (i.e. plain text) or an `html_document` |
|
|
|
#' @return atomic character vector of cleaned text |
|
|
|
#' @note The XSLT can be a bit aggressive for some URLs, so this function first |
|
|
|
#' applies the XSLT and checks for an empty result. If the result is empty, |
|
|
|
#' it falls back to a plain text conversion using only `rvest::html_text()`. |
|
|
|
#' @export |
|
|
|
clean_text <- function(doc) { |
|
|
|
|
|
|
@ -15,10 +18,15 @@ clean_text <- function(doc) { |
|
|
|
|
|
|
|
cleaner <- xml2::read_xml(system.file("xslt/justthetext.xslt", package="hgr")) |
|
|
|
|
|
|
|
doc <- xslt::xml_xslt(doc, cleaner) |
|
|
|
doc <- rvest::html_text(doc) |
|
|
|
doc <- trimws(doc) |
|
|
|
doc_tmp <- xslt::xml_xslt(doc, cleaner) |
|
|
|
doc_tmp <- rvest::html_text(doc_tmp) |
|
|
|
doc_tmp <- trimws(doc_tmp) |
|
|
|
|
|
|
|
doc |
|
|
|
if (nchar(doc_tmp) == 0) { |
|
|
|
doc_tmp <- rvest::html_text(doc) |
|
|
|
doc_tmp <- trimws(doc_tmp) |
|
|
|
} |
|
|
|
|
|
|
|
doc_tmp |
|
|
|
|
|
|
|
} |
|
|
@ -18,3 +18,8 @@ by \code{just_the_facts()}. It can be run on any \code{html_document} or atomic |
|
|
|
(which it will parse into an \code{html_document}) and it will return an atomic character |
|
|
|
vector of only plain text (i.e. it will remove all tags). |
|
|
|
} |
|
|
|
\note{ |
|
|
|
The XSLT can be a bit aggressive for some URLs, so this function first
|
|
|
applies the XSLT and checks for an empty result. If the result is empty,
|
|
|
it falls back to a plain text conversion using only \code{rvest::html_text()}.
|
|
|
} |
|
|
|