Bob Rudis
7 years ago
No known key found for this signature in database
GPG Key ID: 1D7529BE14E2BBA9
4 changed files with
16 additions and
7 deletions
-
R/tidy.r
-
README.md
-
man/tidy_html.Rd
-
src/htmltidy.cpp
|
|
@ -45,7 +45,8 @@ |
|
|
|
#' @param verbose output document errors? (default: \code{FALSE}) |
|
|
|
#' @note If document parsing errors are severe enough, \code{tidy_html()} may not be able |
|
|
|
#' to clean the document and will display the errors (this output can be captured with |
|
|
|
#' \code{sink()} or \code{capture.output()}) along with a warning and return \code{NA}. |
|
|
|
#' \code{sink()} or \code{capture.output()}) along with a warning and return a "best effort" |
|
|
|
#' cleaned version of the document. |
|
|
|
#' @return Tidied HTML/XHTML content. The object type will be the same as that of the input type |
|
|
|
#' except when it is a \code{connection}, then a character vector will be returned. |
|
|
|
#' @references \url{http://api.html-tidy.org/tidy/quickref_5.1.25.html} & |
|
|
|
|
|
@ -214,7 +214,7 @@ sum(map_int(book, nchar)) |
|
|
|
## [1] 207501 |
|
|
|
system.time(tidy_book <- tidy_html(book)) |
|
|
|
## user system elapsed |
|
|
|
## 0.021 0.001 0.023 |
|
|
|
## 0.022 0.002 0.024 |
|
|
|
``` |
|
|
|
|
|
|
|
(It's usually between 20 & 25 milliseconds to process those 202 kilobytes of HTML.) Not too shabby. |
|
|
|
|
|
@ -86,7 +86,8 @@ It may also be advantageous to remove all comments with \code{TidyHideComments}. |
|
|
|
\note{ |
|
|
|
If document parsing errors are severe enough, \code{tidy_html()} may not be able |
|
|
|
to clean the document and will display the errors (this output can be captured with |
|
|
|
\code{sink()} or \code{capture.output()}) along with a warning and return \code{NULL}. |
|
|
|
\code{sink()} or \code{capture.output()}) along with a warning and return a "best effort" |
|
|
|
cleaned version of the document. |
|
|
|
} |
|
|
|
\examples{ |
|
|
|
opts <- list( |
|
|
|
|
|
@ -176,6 +176,9 @@ Rcpp::CharacterVector tidy_html_int(std::string source, Rcpp::List options, |
|
|
|
if (ok == no) Rcpp::stop("Error setting TidyHTML options"); |
|
|
|
} |
|
|
|
|
|
|
|
ok = tidyOptSetBool(tdoc, TidyForceOutput, yes); |
|
|
|
if (ok == no) Rcpp::stop("Error setting TidyHTML options"); |
|
|
|
|
|
|
|
rc = tidySetErrorBuffer(tdoc, &errbuf); |
|
|
|
max_rc = (rc > max_rc) ? rc : max_rc; |
|
|
|
|
|
|
@ -210,10 +213,14 @@ Rcpp::CharacterVector tidy_html_int(std::string source, Rcpp::List options, |
|
|
|
show_errors = true; |
|
|
|
} |
|
|
|
|
|
|
|
if (show_errors & (errbuf.allocated > 0)) { |
|
|
|
Rcpp::Rcout << std::string(reinterpret_cast<const char*>(errbuf.bp)) << std::endl; |
|
|
|
if (max_rc > 1) show_errors = true; |
|
|
|
|
|
|
|
if (show_errors) { |
|
|
|
if (errbuf.allocated > 0) { |
|
|
|
Rcpp::Rcout << std::string(reinterpret_cast<const char*>(errbuf.bp)) << std::endl; |
|
|
|
} |
|
|
|
if (max_rc > 1) { |
|
|
|
Rcpp::warning("\nSevere errors were generated during document evaluation.\nReturing original document"); |
|
|
|
Rcpp::warning("\nSevere errors were generated during document evaluation.\n"); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
@ -222,6 +229,6 @@ Rcpp::CharacterVector tidy_html_int(std::string source, Rcpp::List options, |
|
|
|
|
|
|
|
tidyRelease(tdoc); |
|
|
|
|
|
|
|
return((max_rc > 1) ? NA_STRING : Rcpp::wrap(ret)); |
|
|
|
return(Rcpp::wrap(ret)); |
|
|
|
|
|
|
|
} |
|
|
|