diff --git a/DESCRIPTION b/DESCRIPTION index 8fe0a6c..874df0c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: htmltidy Title: Clean Up or Pretty Print Gnarly HTML and XHTML -Version: 0.2.0 +Version: 0.3.0 Authors@R: c( person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre")), person("Dave", "Dave", email = "dsr@w3.org", role = c("ctb", "aut"), diff --git a/NEWS.md b/NEWS.md index a7f07cb..402e2ab 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,10 @@ -# htmltidy 0.2.0.9000 +# htmltidy 0.3.0 + +* Better error handling (fixed crashing bug) +* New option to display document errors + + +# htmltidy 0.2.0 * Bundled tidy-html5 library with the package * Windows compatibility @@ -7,7 +13,7 @@ * Modified tests -# htmltidy 0.1.0.9000 +# htmltidy 0.1.0 * Added a `NEWS.md` file to track changes to the package. * Added Debian & Ubuntu compatibility diff --git a/R/RcppExports.R b/R/RcppExports.R index d17e027..9484152 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,7 +1,7 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -tidy_html_int <- function(source, options) { - .Call('htmltidy_tidy_html_int', PACKAGE = 'htmltidy', source, options) +tidy_html_int <- function(source, options, show_errors) { + .Call('htmltidy_tidy_html_int', PACKAGE = 'htmltidy', source, options, show_errors) } diff --git a/R/tidy.r b/R/tidy.r index 5d8fe7a..3e4b651 100644 --- a/R/tidy.r +++ b/R/tidy.r @@ -42,6 +42,10 @@ #' @param content accepts a character vector, raw vector or parsed content from the \code{xml2} #' or \code{XML} packages. #' @param options named list of options +#' @param verbose output document errors? 
(default: \code{FALSE}) +#' @note If document parsing errors are severe enough, \code{tidy_html()} will not be able +#' to clean the document and will display the errors (this output can be captured with +#' \code{sink()} or \code{capture.output()}) along with a warning and return \code{NA}. #' @return Tidied HTML/XHTML content. The object type will be the same as that of the input type #' except when it is a \code{connection}, then a character vector will be returned. #' @references \url{http://api.html-tidy.org/tidy/quickref_5.1.25.html} & @@ -77,62 +81,68 @@ #' #' # but, you could also just do: #' cat(tidy_html(url("http://rud.is/test/untidy.html"))) -tidy_html <- function(content, options=list(TidyXhtmlOut=TRUE)) { +tidy_html <- function(content, options=list(TidyXhtmlOut=TRUE), verbose=FALSE) { UseMethod("tidy_html") } #' @export #' @rdname tidy_html -tidy_html.default <- function(content, options=list(TidyXhtmlOut=TRUE)) { +tidy_html.default <- function(content, options=list(TidyXhtmlOut=TRUE), + verbose=FALSE) { content <- paste0(content, collapse="") .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', - source=content, options=options) + source=content, options=options, show_errors=verbose) } #' @export #' @rdname tidy_html -tidy_html.character <- function(content, options=list(TidyXhtmlOut=TRUE)) { +tidy_html.character <- function(content, options=list(TidyXhtmlOut=TRUE), + verbose=FALSE) { content <- paste0(content, collapse="") .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', - source=content, options=options) + source=content, options=options, show_errors=verbose) } #' @export #' @rdname tidy_html -tidy_html.raw <- function(content, options=list(TidyXhtmlOut=TRUE)) { +tidy_html.raw <- function(content, options=list(TidyXhtmlOut=TRUE), + verbose=FALSE) { content <- content[1] content <- iconv(readBin(content, character()), to="UTF-8") out <- .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', - source=content, options=options) + source=content, 
options=options, show_errors=verbose) charToRaw(out) } #' @export #' @rdname tidy_html -tidy_html.xml_document <- function(content, options=list(TidyXhtmlOut=TRUE)) { +tidy_html.xml_document <- function(content, options=list(TidyXhtmlOut=TRUE), + verbose=FALSE) { content <- toString(content) out <- .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', - source=content, options=options) + source=content, options=options, show_errors=verbose) xml2::read_html(out) } #' @export #' @rdname tidy_html -tidy_html.HTMLInternalDocument <- function(content, options=list(TidyXhtmlOut=TRUE)) { +tidy_html.HTMLInternalDocument <- function(content, options=list(TidyXhtmlOut=TRUE), + verbose=FALSE) { content <- XML::saveXML(content) out <- .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', - source=content, options=options) + source=content, options=options, show_errors=verbose) XML::htmlParse(out) } #' @export #' @rdname tidy_html -tidy_html.connection <- function(content, options=list(TidyXhtmlOut=TRUE)) { +tidy_html.connection <- function(content, options=list(TidyXhtmlOut=TRUE), + verbose=FALSE) { html <- paste0(readLines(content, warn=FALSE), collapse="") close(content) .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', - source=html, options=options) + source=html, options=options, show_errors=verbose) } diff --git a/README.Rmd b/README.Rmd index c316cd3..acdd410 100644 --- a/README.Rmd +++ b/README.Rmd @@ -91,6 +91,12 @@ tidy_html(content(res, as="parsed", encoding="UTF-8")) tidy_html(htmlParse("http://rud.is/test/untidy.html")) ``` +And, show the markup errors: + +```{r message=FALSE, warning=FALSE} +invisible(tidy_html(url("http://rud.is/test/untidy.html"), verbose=TRUE)) +``` + ### Testing Options ```{r message=FALSE, warning=FALSE} diff --git a/README.md b/README.md index 4c392cb..8e62636 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ library(htmltidy) # current verison packageVersion("htmltidy") -## [1] '0.2.0' +## [1] '0.3.0' library(XML) library(xml2) @@ 
-146,6 +146,23 @@ tidy_html(htmlParse("http://rud.is/test/untidy.html")) ## ``` +And, show the markup errors: + +``` r +invisible(tidy_html(url("http://rud.is/test/untidy.html"), verbose=TRUE)) +## line 1 column 1 - Warning: missing declaration +## line 1 column 68 - Warning: nested emphasis +## line 1 column 138 - Warning: missing before
+## line 1 column 68 - Warning: missing before
+## line 1 column 164 - Warning: inserting implicit +## line 1 column 164 - Warning: missing +## line 1 column 159 - Warning: missing
+## line 1 column 1 - Warning: inserting missing 'title' element +## line 1 column 164 - Warning: anchor "sp" already defined +## Info: Document content looks like XHTML5 +## Tidy found 9 warnings and 0 errors! +``` + ### Testing Options ``` r @@ -197,7 +214,7 @@ sum(map_int(book, nchar)) ## [1] 207501 system.time(tidy_book <- tidy_html(book)) ## user system elapsed -## 0.021 0.000 0.021 +## 0.021 0.001 0.023 ``` (It's usually between 20 & 25 milliseconds to process those 202 kilobytes of HTML.) Not too shabby. diff --git a/man/tidy_html.Rd b/man/tidy_html.Rd index 13783c2..842953f 100644 --- a/man/tidy_html.Rd +++ b/man/tidy_html.Rd @@ -10,26 +10,33 @@ \alias{tidy_html.xml_document} \title{Tidy or "Pretty Print" HTML/XHTML Documents} \usage{ -tidy_html(content, options = list(TidyXhtmlOut = TRUE)) +tidy_html(content, options = list(TidyXhtmlOut = TRUE), verbose = FALSE) -\method{tidy_html}{default}(content, options = list(TidyXhtmlOut = TRUE)) +\method{tidy_html}{default}(content, options = list(TidyXhtmlOut = TRUE), + verbose = FALSE) -\method{tidy_html}{character}(content, options = list(TidyXhtmlOut = TRUE)) +\method{tidy_html}{character}(content, options = list(TidyXhtmlOut = TRUE), + verbose = FALSE) -\method{tidy_html}{raw}(content, options = list(TidyXhtmlOut = TRUE)) +\method{tidy_html}{raw}(content, options = list(TidyXhtmlOut = TRUE), + verbose = FALSE) -\method{tidy_html}{xml_document}(content, options = list(TidyXhtmlOut = TRUE)) +\method{tidy_html}{xml_document}(content, options = list(TidyXhtmlOut = TRUE), + verbose = FALSE) \method{tidy_html}{HTMLInternalDocument}(content, options = list(TidyXhtmlOut - = TRUE)) + = TRUE), verbose = FALSE) -\method{tidy_html}{connection}(content, options = list(TidyXhtmlOut = TRUE)) +\method{tidy_html}{connection}(content, options = list(TidyXhtmlOut = TRUE), + verbose = FALSE) } \arguments{ \item{content}{accepts a character vector, raw vector or parsed content from the \code{xml2} or \code{XML} packages.} 
\item{options}{named list of options} + +\item{verbose}{output document errors? (default: \code{FALSE})} } \value{ Tidied HTML/XHTML content. The object type will be the same as that of the input type @@ -76,6 +83,11 @@ You can clean up Microsoft Word (2000) and Google Docs HTML via logical settings It may also be advantageous to remove all comments with \code{TidyHideComments}. } +\note{ +If document parsing errors are severe enough, \code{tidy_html()} will not be able + to clean the document and will display the errors (this output can be captured with + \code{sink()} or \code{capture.output()}) along with a warning and return \code{NA}. +} \examples{ opts <- list( TidyDocType="html5", diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 37745f7..73c0573 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -6,14 +6,15 @@ using namespace Rcpp; // tidy_html_int -std::string tidy_html_int(std::string source, Rcpp::List options); -RcppExport SEXP htmltidy_tidy_html_int(SEXP sourceSEXP, SEXP optionsSEXP) { +Rcpp::CharacterVector tidy_html_int(std::string source, Rcpp::List options, bool show_errors); +RcppExport SEXP htmltidy_tidy_html_int(SEXP sourceSEXP, SEXP optionsSEXP, SEXP show_errorsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< std::string >::type source(sourceSEXP); Rcpp::traits::input_parameter< Rcpp::List >::type options(optionsSEXP); - rcpp_result_gen = Rcpp::wrap(tidy_html_int(source, options)); + Rcpp::traits::input_parameter< bool >::type show_errors(show_errorsSEXP); + rcpp_result_gen = Rcpp::wrap(tidy_html_int(source, options, show_errors)); return rcpp_result_gen; END_RCPP } diff --git a/src/htmltidy.cpp b/src/htmltidy.cpp index be57b98..ef0ac22 100644 --- a/src/htmltidy.cpp +++ b/src/htmltidy.cpp @@ -6,11 +6,12 @@ // NOTE: cannot do "using namespace Rcpp;" b/c of annoying warnings about the ambiguity of 'yes'. 
//[[Rcpp::export]] -std::string tidy_html_int(std::string source, Rcpp::List options) { +Rcpp::CharacterVector tidy_html_int(std::string source, Rcpp::List options, + bool show_errors) { TidyBuffer output = {0}; TidyBuffer errbuf = {0}; - int rc = -1; + int rc = -1, max_rc = -1; Bool ok; TidyDoc tdoc = tidyCreate(); @@ -176,31 +177,51 @@ std::string tidy_html_int(std::string source, Rcpp::List options) { } rc = tidySetErrorBuffer(tdoc, &errbuf); + max_rc = (rc > max_rc) ? rc : max_rc; if (rc<0) Rcpp::stop("Error setting TidyHTML error buffer"); rc = tidyParseString(tdoc, source.c_str()); + max_rc = (rc > max_rc) ? rc : max_rc; if (rc<0) Rcpp::stop("Error parsing source document"); rc = tidyCleanAndRepair(tdoc); + max_rc = (rc > max_rc) ? rc : max_rc; if (rc<0) Rcpp::stop("Error tidying source document"); rc = tidyRunDiagnostics(tdoc); + max_rc = (rc > max_rc) ? rc : max_rc; if (rc<0) Rcpp::stop("Error generating tidy diagnostics"); rc = tidySaveBuffer(tdoc, &output); + max_rc = (rc > max_rc) ? rc : max_rc; if (rc<0) Rcpp::stop("Error converting parsed document to character vector"); - std::string ret = std::string(reinterpret_cast(output.bp)); + std::string ret; + + if (output.bp) { + ret = std::string(reinterpret_cast(output.bp)); + } else { + ret = source; + show_errors = true; + } + + if (show_errors & (errbuf.allocated > 0)) { + Rcpp::Rcout << std::string(reinterpret_cast(errbuf.bp)) << std::endl; + if (max_rc > 1) { + Rcpp::warning("\nSevere errors were generated during document evaluation.\nReturing original document"); + } + } + + if (output.allocated > 0) tidyBufFree(&output); + if (errbuf.allocated > 0) tidyBufFree(&errbuf); - tidyBufFree(&output); - tidyBufFree(&errbuf); tidyRelease(tdoc); - return(ret); + return((max_rc > 1) ? NA_STRING : Rcpp::wrap(ret)); }