Browse Source

tweaked `clean_text()`

master
boB Rudis 7 years ago
parent
commit
941e6f5bd6
No known key found for this signature in database GPG Key ID: 2A514A4997464560
  1. 16
      R/clean.r
  2. 5
      man/clean_text.Rd

16
R/clean.r

@ -8,6 +8,9 @@
#' @md
#' @param doc atomic character vector (i.e. plain text) or an `html_document`
#' @return atomic character vector of cleaned text
#' @note The XSLT can be a bit aggressive for some URLs, so this function first
#' tries the XSLT and tests for an empty result. If the result is empty, it
#' falls back to a plain text conversion using just `rvest::html_text()`.
#' @export
clean_text <- function(doc) {
@ -15,10 +18,15 @@ clean_text <- function(doc) {
cleaner <- xml2::read_xml(system.file("xslt/justthetext.xslt", package="hgr"))
doc <- xslt::xml_xslt(doc, cleaner)
doc <- rvest::html_text(doc)
doc <- trimws(doc)
doc_tmp <- xslt::xml_xslt(doc, cleaner)
doc_tmp <- rvest::html_text(doc_tmp)
doc_tmp <- trimws(doc_tmp)
doc
if (nchar(doc_tmp) == 0) {
doc_tmp <- rvest::html_text(doc)
doc_tmp <- trimws(doc_tmp)
}
doc_tmp
}

5
man/clean_text.Rd

@ -18,3 +18,8 @@ by \code{just_the_facts()}. It can be run on any \code{html_document} or atomic
(which it will parse into an \code{html_document}) and it will return an atomic character
vector of only plain text (i.e. it will remove all tags).
}
\note{
The XSLT can be a bit aggressive for some URLs, so this function first
tries the XSLT and tests for an empty result. If the result is empty, it
falls back to a plain text conversion using just \code{rvest::html_text()}.
}

Loading…
Cancel
Save