Browse Source

Fixes #1

master
Bob Rudis 5 years ago
parent
commit
d96ae2c99c
No known key found for this signature in database GPG Key ID: 1D7529BE14E2BBA9
  1. 2
      DESCRIPTION
  2. 10
      NEWS.md
  3. 4
      R/RcppExports.R
  4. 36
      R/tidy.r
  5. 6
      README.Rmd
  6. 21
      README.md
  7. 26
      man/tidy_html.Rd
  8. 7
      src/RcppExports.cpp
  9. 33
      src/htmltidy.cpp

2
DESCRIPTION

@ -1,6 +1,6 @@
Package: htmltidy
Title: Clean Up or Pretty Print Gnarly HTML and XHTML
Version: 0.2.0
Version: 0.3.0
Authors@R: c(
person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre")),
person("Dave", "Dave", email = "dsr@w3.org", role = c("ctb", "aut"),

10
NEWS.md

@ -1,4 +1,10 @@
# htmltidy 0.2.0.9000
# htmltidy 0.3.0
* Better error handling (fixed crashing bug)
* New option to display document errors
# htmltidy 0.2.0
* Bundled tidy-html5 library with the package
* Windows compatibility
@ -7,7 +13,7 @@
* Modified tests
# htmltidy 0.1.0.9000
# htmltidy 0.1.0
* Added a `NEWS.md` file to track changes to the package.
* Added Debian & Ubuntu compatibility

4
R/RcppExports.R

@ -1,7 +1,7 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
tidy_html_int <- function(source, options) {
.Call('htmltidy_tidy_html_int', PACKAGE = 'htmltidy', source, options)
tidy_html_int <- function(source, options, show_errors) {
.Call('htmltidy_tidy_html_int', PACKAGE = 'htmltidy', source, options, show_errors)
}

36
R/tidy.r

@ -42,6 +42,10 @@
#' @param content accepts a character vector, raw vector or parsed content from the \code{xml2}
#' or \code{XML} packages.
#' @param options named list of options
#' @param verbose output document errors? (default: \code{FALSE})
#' @note If document parsing errors are severe enough, \code{tidy_html()} will not be able
#' to clean the document and will display the errors (this output can be captured with
#' \code{sink()} or \code{capture.output()}) along with a warning and return \code{NA}.
#' @return Tidied HTML/XHTML content. The object type will be the same as that of the input type
#' except when it is a \code{connection}, then a character vector will be returned.
#' @references \url{http://api.html-tidy.org/tidy/quickref_5.1.25.html} &
@ -77,62 +81,68 @@
#'
#' # but, you could also just do:
#' cat(tidy_html(url("http://rud.is/test/untidy.html")))
tidy_html <- function(content, options=list(TidyXhtmlOut=TRUE)) {
tidy_html <- function(content, options=list(TidyXhtmlOut=TRUE), verbose=FALSE) {
UseMethod("tidy_html")
}
#' @export
#' @rdname tidy_html
tidy_html.default <- function(content, options=list(TidyXhtmlOut=TRUE)) {
tidy_html.default <- function(content, options=list(TidyXhtmlOut=TRUE),
verbose=FALSE) {
content <- paste0(content, collapse="")
.Call('htmltidy_tidy_html_int', PACKAGE='htmltidy',
source=content, options=options)
source=content, options=options, show_errors=verbose)
}
#' @export
#' @rdname tidy_html
tidy_html.character <- function(content, options=list(TidyXhtmlOut=TRUE)) {
tidy_html.character <- function(content, options=list(TidyXhtmlOut=TRUE),
verbose=FALSE) {
content <- paste0(content, collapse="")
.Call('htmltidy_tidy_html_int', PACKAGE='htmltidy',
source=content, options=options)
source=content, options=options, show_errors=verbose)
}
#' @export
#' @rdname tidy_html
tidy_html.raw <- function(content, options=list(TidyXhtmlOut=TRUE)) {
tidy_html.raw <- function(content, options=list(TidyXhtmlOut=TRUE),
verbose=FALSE) {
content <- content[1]
content <- iconv(readBin(content, character()), to="UTF-8")
out <- .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy',
source=content, options=options)
source=content, options=options, show_errors=verbose)
charToRaw(out)
}
#' @export
#' @rdname tidy_html
tidy_html.xml_document <- function(content, options=list(TidyXhtmlOut=TRUE)) {
tidy_html.xml_document <- function(content, options=list(TidyXhtmlOut=TRUE),
verbose=FALSE) {
content <- toString(content)
out <- .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy',
source=content, options=options)
source=content, options=options, show_errors=verbose)
xml2::read_html(out)
}
#' @export
#' @rdname tidy_html
tidy_html.HTMLInternalDocument <- function(content, options=list(TidyXhtmlOut=TRUE)) {
tidy_html.HTMLInternalDocument <- function(content, options=list(TidyXhtmlOut=TRUE),
verbose=FALSE) {
content <- XML::saveXML(content)
out <- .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy',
source=content, options=options)
source=content, options=options, show_errors=verbose)
XML::htmlParse(out)
}
#' @export
#' @rdname tidy_html
tidy_html.connection <- function(content, options=list(TidyXhtmlOut=TRUE)) {
tidy_html.connection <- function(content, options=list(TidyXhtmlOut=TRUE),
verbose=FALSE) {
html <- paste0(readLines(content, warn=FALSE), collapse="")
close(content)
.Call('htmltidy_tidy_html_int', PACKAGE='htmltidy',
source=html, options=options)
source=html, options=options, show_errors=verbose)
}

6
README.Rmd

@ -91,6 +91,12 @@ tidy_html(content(res, as="parsed", encoding="UTF-8"))
tidy_html(htmlParse("http://rud.is/test/untidy.html"))
```
And, show the markup errors:
```{r message=FALSE, warning=FALSE}
invisible(tidy_html(url("http://rud.is/test/untidy.html"), verbose=TRUE))
```
### Testing Options
```{r message=FALSE, warning=FALSE}

21
README.md

@ -25,7 +25,7 @@ library(htmltidy)
# current version
packageVersion("htmltidy")
## [1] '0.2.0'
## [1] '0.3.0'
library(XML)
library(xml2)
@ -146,6 +146,23 @@ tidy_html(htmlParse("http://rud.is/test/untidy.html"))
##
```
And, show the markup errors:
``` r
invisible(tidy_html(url("http://rud.is/test/untidy.html"), verbose=TRUE))
## line 1 column 1 - Warning: missing <!DOCTYPE> declaration
## line 1 column 68 - Warning: nested emphasis <b>
## line 1 column 138 - Warning: missing </span> before <div>
## line 1 column 68 - Warning: missing </b> before <div>
## line 1 column 164 - Warning: inserting implicit <span>
## line 1 column 164 - Warning: missing </span>
## line 1 column 159 - Warning: missing </div>
## line 1 column 1 - Warning: inserting missing 'title' element
## line 1 column 164 - Warning: <span> anchor "sp" already defined
## Info: Document content looks like XHTML5
## Tidy found 9 warnings and 0 errors!
```
### Testing Options
``` r
@ -197,7 +214,7 @@ sum(map_int(book, nchar))
## [1] 207501
system.time(tidy_book <- tidy_html(book))
## user system elapsed
## 0.021 0.000 0.021
## 0.021 0.001 0.023
```
(It's usually between 20 & 25 milliseconds to process those 202 kilobytes of HTML.) Not too shabby.

26
man/tidy_html.Rd

@ -10,26 +10,33 @@
\alias{tidy_html.xml_document}
\title{Tidy or "Pretty Print" HTML/XHTML Documents}
\usage{
tidy_html(content, options = list(TidyXhtmlOut = TRUE))
tidy_html(content, options = list(TidyXhtmlOut = TRUE), verbose = FALSE)
\method{tidy_html}{default}(content, options = list(TidyXhtmlOut = TRUE))
\method{tidy_html}{default}(content, options = list(TidyXhtmlOut = TRUE),
verbose = FALSE)
\method{tidy_html}{character}(content, options = list(TidyXhtmlOut = TRUE))
\method{tidy_html}{character}(content, options = list(TidyXhtmlOut = TRUE),
verbose = FALSE)
\method{tidy_html}{raw}(content, options = list(TidyXhtmlOut = TRUE))
\method{tidy_html}{raw}(content, options = list(TidyXhtmlOut = TRUE),
verbose = FALSE)
\method{tidy_html}{xml_document}(content, options = list(TidyXhtmlOut = TRUE))
\method{tidy_html}{xml_document}(content, options = list(TidyXhtmlOut = TRUE),
verbose = FALSE)
\method{tidy_html}{HTMLInternalDocument}(content, options = list(TidyXhtmlOut
= TRUE))
= TRUE), verbose = FALSE)
\method{tidy_html}{connection}(content, options = list(TidyXhtmlOut = TRUE))
\method{tidy_html}{connection}(content, options = list(TidyXhtmlOut = TRUE),
verbose = FALSE)
}
\arguments{
\item{content}{accepts a character vector, raw vector or parsed content from the \code{xml2}
or \code{XML} packages.}
\item{options}{named list of options}
\item{verbose}{output document errors? (default: \code{FALSE})}
}
\value{
Tidied HTML/XHTML content. The object type will be the same as that of the input type
@ -76,6 +83,11 @@ You can clean up Microsoft Word (2000) and Google Docs HTML via logical settings
It may also be advantageous to remove all comments with \code{TidyHideComments}.
}
\note{
If document parsing errors are severe enough, \code{tidy_html()} will not be able
to clean the document and will display the errors (this output can be captured with
\code{sink()} or \code{capture.output()}) along with a warning and return \code{NA}.
}
\examples{
opts <- list(
TidyDocType="html5",

7
src/RcppExports.cpp

@ -6,14 +6,15 @@
using namespace Rcpp;
// tidy_html_int
std::string tidy_html_int(std::string source, Rcpp::List options);
RcppExport SEXP htmltidy_tidy_html_int(SEXP sourceSEXP, SEXP optionsSEXP) {
Rcpp::CharacterVector tidy_html_int(std::string source, Rcpp::List options, bool show_errors);
RcppExport SEXP htmltidy_tidy_html_int(SEXP sourceSEXP, SEXP optionsSEXP, SEXP show_errorsSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< std::string >::type source(sourceSEXP);
Rcpp::traits::input_parameter< Rcpp::List >::type options(optionsSEXP);
rcpp_result_gen = Rcpp::wrap(tidy_html_int(source, options));
Rcpp::traits::input_parameter< bool >::type show_errors(show_errorsSEXP);
rcpp_result_gen = Rcpp::wrap(tidy_html_int(source, options, show_errors));
return rcpp_result_gen;
END_RCPP
}

33
src/htmltidy.cpp

@ -6,11 +6,12 @@
// NOTE: cannot do "using namespace Rcpp;" b/c of annoying warnings about the ambiguity of 'yes'.
//[[Rcpp::export]]
std::string tidy_html_int(std::string source, Rcpp::List options) {
Rcpp::CharacterVector tidy_html_int(std::string source, Rcpp::List options,
bool show_errors) {
TidyBuffer output = {0};
TidyBuffer errbuf = {0};
int rc = -1;
int rc = -1, max_rc = -1;
Bool ok;
TidyDoc tdoc = tidyCreate();
@ -176,31 +177,51 @@ std::string tidy_html_int(std::string source, Rcpp::List options) {
}
rc = tidySetErrorBuffer(tdoc, &errbuf);
max_rc = (rc > max_rc) ? rc : max_rc;
if (rc<0) Rcpp::stop("Error setting TidyHTML error buffer");
rc = tidyParseString(tdoc, source.c_str());
max_rc = (rc > max_rc) ? rc : max_rc;
if (rc<0) Rcpp::stop("Error parsing source document");
rc = tidyCleanAndRepair(tdoc);
max_rc = (rc > max_rc) ? rc : max_rc;
if (rc<0) Rcpp::stop("Error tidying source document");
rc = tidyRunDiagnostics(tdoc);
max_rc = (rc > max_rc) ? rc : max_rc;
if (rc<0) Rcpp::stop("Error generating tidy diagnostics");
rc = tidySaveBuffer(tdoc, &output);
max_rc = (rc > max_rc) ? rc : max_rc;
if (rc<0) Rcpp::stop("Error converting parsed document to character vector");
std::string ret = std::string(reinterpret_cast<const char*>(output.bp));
std::string ret;
if (output.bp) {
ret = std::string(reinterpret_cast<const char*>(output.bp));
} else {
ret = source;
show_errors = true;
}
if (show_errors & (errbuf.allocated > 0)) {
Rcpp::Rcout << std::string(reinterpret_cast<const char*>(errbuf.bp)) << std::endl;
if (max_rc > 1) {
Rcpp::warning("\nSevere errors were generated during document evaluation.\nReturing original document");
}
}
if (output.allocated > 0) tidyBufFree(&output);
if (errbuf.allocated > 0) tidyBufFree(&errbuf);
tidyBufFree(&output);
tidyBufFree(&errbuf);
tidyRelease(tdoc);
return(ret);
return((max_rc > 1) ? NA_STRING : Rcpp::wrap(ret));
}

Loading…
Cancel
Save