From 3518bf4cf4d95e87fafa4100ad0c62e3ae9ecd70 Mon Sep 17 00:00:00 2001 From: Bob Rudis Date: Sat, 10 Sep 2016 11:47:19 -0400 Subject: [PATCH] README --- DESCRIPTION | 17 ++++++------ NAMESPACE | 11 ++++++++ NEWS.md | 2 ++ R/tidy.r | 53 ++++++++++++++++++++++++++++++++++---- R/xml.r | 64 ++++++++++++++++++++++++++++++++++++++++++++++ README.Rmd | 51 +++++++++++++++++++++++++++++++++--- README.md | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++----- man/tidy_html.Rd | 23 ++++++++++++++--- man/tidy_xml.Rd | 50 ++++++++++++++++++++++++++++++++++++ src/htmltidy.cpp | 19 ++------------ 10 files changed, 323 insertions(+), 45 deletions(-) create mode 100644 R/xml.r create mode 100644 man/tidy_xml.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 4b987a0..24bb684 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: htmltidy -Title: Clean Up Gnarly HTML/XML +Title: Clean Up Gnarly HTML and XHTML Version: 0.2.0.9000 Authors@R: c( person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre")), @@ -7,18 +7,19 @@ Authors@R: c( comment="HTML Tidy library") ) Maintainer: Bob Rudis -Description: HTML and XML documents can be beautiful and pristine. They can also be - wretched, evil, malformed hellspawn. Now, you can tidy up that HTML and XML before - processing it with your favorite angle-bracket parsing tools. +Description: HTML documents can be beautiful and pristine. They can also be + wretched, evil, malformed demon-spawn. Now, you can tidy up that HTML and XHTML + before processing it with your favorite angle-bracket crunching tools. Depends: - R (>= 3.3.0) + R (>= 3.2.0) License: AGPL + file LICENSE LazyData: true NeedsCompilation: yes Suggests: - testthat, - xml2 + testthat LinkingTo: Rcpp Imports: - Rcpp + Rcpp, + xml2, + XML RoxygenNote: 5.0.1 diff --git a/NAMESPACE b/NAMESPACE index 6cc908c..d36408f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,16 @@ # Generated by roxygen2: do not edit by hand +S3method(tidy_html,HTMLInternalDocument) +S3method(tidy_html,character) +S3method(tidy_html,default) +S3method(tidy_html,raw) +S3method(tidy_html,xml_document) +S3method(tidy_xml,XMLInternalDocument) +S3method(tidy_xml,character) +S3method(tidy_xml,default) +S3method(tidy_xml,raw) +S3method(tidy_xml,xml_document) export(tidy_html) +export(tidy_xml) importFrom(Rcpp,sourceCpp) useDynLib(htmltidy) diff --git a/NEWS.md b/NEWS.md index 54fd144..856f365 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,8 @@ # htmltidy 0.2.0.9000 * Bundled tidy-html5 library with the package +* Windows compatibility +* Options handling * Modified tests diff --git a/R/tidy.r b/R/tidy.r index 6ebefd3..1b2d774 100644 --- a/R/tidy.r +++ b/R/tidy.r @@ -1,4 +1,4 @@ -#' Tidy HTML/XML/XHTML Documents +#' Tidy HTML/XHTML Documents #' #' Currently supported options: \code{TidyAltText}, \code{TidyBodyOnly}, #' \code{TidyBreakBeforeBR}, \code{TidyCoerceEndTags}, \code{TidyCoerceEndTags}, @@ -9,15 +9,58 @@ #' \code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark}, #' \code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyTabSize}, #' \code{TidyUpperCaseAttrs}, \code{TidyUpperCaseTags}, \code{TidyWord2000}, -#' \code{TidyWrapLen}, \code{TidyXhtmlOut}, \code{TidyXmlDecl}, \code{TidyXmlOut}, -#' \code{TidyXmlTags}. +#' \code{TidyWrapLen}, \code{TidyXhtmlOut} #' #' @param content atomic character or raw vector of content to tidy #' @param options named list of options -#' @return Atomic character vector of tidy HTML/XML/XHTML content +#' @return tidied HTML/XHTML content #' @references \url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h} #' (for definitions of the options supported above). #' @export tidy_html <- function(content, options=list(TidyXhtmlOut=TRUE)) { - .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', source=content, options=options) + UseMethod("tidy_html") +} + +#' @export +#' @rdname tidy_html +tidy_html.default <- function(content, options=list(TidyXhtmlOut=TRUE)) { + content <- content[1] + .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', + source=content, options=options) +} + +#' @export +#' @rdname tidy_html +tidy_html.character <- function(content, options=list(TidyXhtmlOut=TRUE)) { + content <- content[1] + .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', + source=content, options=options) +} + +#' @export +#' @rdname tidy_html +tidy_html.raw <- function(content, options=list(TidyXhtmlOut=TRUE)) { + content <- content[1] + content <- iconv(readBin(content, character()), to="UTF-8") + out <- .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', + source=content, options=options) + charToRaw(out) +} + +#' @export +#' @rdname tidy_html +tidy_html.xml_document <- function(content, options=list(TidyXhtmlOut=TRUE)) { + content <- toString(content) + out <- .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', + source=content, options=options) + xml2::read_html(out) +} + +#' @export +#' @rdname tidy_html +tidy_html.HTMLInternalDocument <- function(content, options=list(TidyXhtmlOut=TRUE)) { + content <- saveXML(content) + out <- .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', + source=content, options=options) + XML::htmlParse(out) } diff --git a/R/xml.r b/R/xml.r new file mode 100644 index 0000000..e89c795 --- /dev/null +++ b/R/xml.r @@ -0,0 +1,64 @@ +#' Tidy XML Documents +#' +#' Currently supported options: \code{TidyAltText}, \code{TidyBodyOnly}, +#' \code{TidyBreakBeforeBR}, \code{TidyCoerceEndTags}, \code{TidyCoerceEndTags}, +#' \code{TidyDoctype}, \code{TidyDropEmptyElems}, \code{TidyDropEmptyParas}, +#' \code{TidyFixBackslash}, \code{TidyFixComments}, \code{TidyHideComments}, +#' \code{TidyHtmlOut}, \code{TidyIndentContent}, \code{TidyIndentSpaces}, +#' \code{TidyJoinClasses}, \code{TidyJoinStyles}, \code{TidyLogicalEmphasis}, +#' \code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark}, +#' \code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyTabSize}, +#' \code{TidyUpperCaseAttrs}, \code{TidyUpperCaseTags}, \code{TidyWord2000}, +#' \code{TidyWrapLen}, \code{TidyXhtmlOut}, \code{TidyXmlDecl}, \code{TidyXmlOut}, +#' \code{TidyXmlTags}. +#' +#' @param content atomic character or raw vector of content to tidy +#' @param options named list of options +#' @return tidied XML content +#' @references \url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h} +#' (for definitions of the options supported above). +#' @export +tidy_xml <- function(content, options=list(TidyXmlOut=TRUE)) { + UseMethod("tidy_xml") +} + +#' @export +#' @rdname tidy_xml +tidy_xml.default <- function(content, options=list(TidyXmlOut=TRUE)) { + .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', + source=content, options=options) +} + +#' @export +#' @rdname tidy_xml +tidy_xml.character <- function(content, options=list(TidyXmlOut=TRUE)) { + .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', + source=content, options=options) +} + +#' @export +#' @rdname tidy_xml +tidy_xml.raw <- function(content, options=list(TidyXmlOut=TRUE)) { + content <- iconv(readBin(content, character()), to="UTF-8") + out <- .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', + source=content, options=options) + charToRaw(out) +} + +#' @export +#' @rdname tidy_xml +tidy_xml.xml_document <- function(content, options=list(TidyXmlOut=TRUE)) { + content <- toString(content) + out <- .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', + source=content, options=options) + xml2::read_xml(out) +} + +#' @export +#' @rdname tidy_xml +tidy_xml.XMLInternalDocument <- function(content, options=list(TidyXmlOut=TRUE)) { + content <- saveXML(content) + out <- .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', + source=content, options=options) + XML::xmlParse(out) +} diff --git a/README.Rmd b/README.Rmd index cb68ffd..3696846 100644 --- a/README.Rmd +++ b/README.Rmd @@ -17,7 +17,7 @@ knitr::opts_chunk$set( ) ``` -`htmltidy` — Clean up gnarly HTML/XML +`htmltidy` — Clean up gnarly HTML/XHTML Inspired by [this SO question](http://stackoverflow.com/questions/37061873/identify-a-weblink-in-bold-in-r) and because there's a great deal of cruddy HTML out there that needs fixing to use properly when scraping data. @@ -25,7 +25,7 @@ It relies on a locally included version of [`libtidy`](http://www.html-tidy.org/ The following functions are implemented: -- `tidy_html` : Clean up gnarly HTML/XML +- `tidy_html` : Clean up gnarly HTML/XHTML ### Installation @@ -39,13 +39,56 @@ options(width=120) ### Usage -```{r} +```{r message=FALSE, warning=FALSE} library(htmltidy) # current verison packageVersion("htmltidy") -cat(tidy_html("

google >

")) +library(XML) +library(xml2) +library(httr) + +res <- GET("http://rud.is") + +head(tidy_html(res$content), 256) + +head(tidy_html(content(res, as="raw")), 256) + +(class(tidy_html(content(res, as="text", encoding="UTF-8")))) # output is too long to show + +tidy_html(content(res, as="parsed", encoding="UTF-8")) # same as tidy_html(read_html("http://rud.is")) + +(class(tidy_html(htmlParse("http://rud.is")))) # output is too long to show +``` + +### Testing Options + +```{r message=FALSE, warning=FALSE} + +opts <- list(TidyDocType="html5", + TidyMakeClean=TRUE, + TidyHideComments=TRUE, + TidyIndentContent=FALSE, + TidyWrapLen=200) + +txt <- " + + + + +

Test

+ + + + +" + +cat(tidy_html(txt, option=opts)) + ``` ### Code of Conduct diff --git a/README.md b/README.md index ec99399..ae6e4ed 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![Travis-CI Build Status](https://travis-ci.org/hrbrmstr/htmltidy.svg?branch=master)](https://travis-ci.org/hrbrmstr/htmltidy) -`htmltidy` — Clean up gnarly HTML/XML +`htmltidy` — Clean up gnarly HTML/XHTML Inspired by [this SO question](http://stackoverflow.com/questions/37061873/identify-a-weblink-in-bold-in-r) and because there's a great deal of cruddy HTML out there that needs fixing to use properly when scraping data. @@ -10,7 +10,7 @@ It relies on a locally included version of [`libtidy`](http://www.html-tidy.org/ The following functions are implemented: -- `tidy_html` : Clean up gnarly HTML/XML +- `tidy_html` : Clean up gnarly HTML/XHTML ### Installation @@ -27,16 +27,80 @@ library(htmltidy) packageVersion("htmltidy") #> [1] '0.2.0.9000' -cat(tidy_html("

google >

")) -#> +library(XML) +library(xml2) +library(httr) + +res <- GET("http://rud.is") + +head(tidy_html(res$content), 256) +#> [1] 3c 21 44 4f 43 54 59 50 45 20 68 74 6d 6c 20 50 55 42 4c 49 43 20 22 2d 2f 2f 57 33 43 2f 2f 44 54 44 20 58 48 54 +#> [39] 4d 4c 20 31 2e 30 20 54 72 61 6e 73 69 74 69 6f 6e 61 6c 2f 2f 45 4e 22 0a 20 20 20 20 22 68 74 74 70 3a 2f 2f 77 +#> [77] 77 77 2e 77 33 2e 6f 72 67 2f 54 52 2f 78 68 74 6d 6c 31 2f 44 54 44 2f 78 68 74 6d 6c 31 2d 74 72 61 6e 73 69 74 +#> [115] 69 6f 6e 61 6c 2e 64 74 64 22 3e 0a 3c 68 74 6d 6c 20 78 6d 6c 6e 73 3d 22 68 74 74 70 3a 2f 2f 77 77 77 2e 77 33 +#> [153] 2e 6f 72 67 2f 31 39 39 39 2f 78 68 74 6d 6c 22 3e 0a 3c 68 65 61 64 3e 0a 3c 6d 65 74 61 20 6e 61 6d 65 3d 22 67 +#> [191] 65 6e 65 72 61 74 6f 72 22 20 63 6f 6e 74 65 6e 74 3d 0a 22 48 54 4d 4c 20 54 69 64 79 20 66 6f 72 20 48 54 4d 4c +#> [229] 35 20 66 6f 72 20 52 20 76 65 72 73 69 6f 6e 20 35 2e 30 2e 30 22 20 2f 3e 0a 3c 6d + +head(tidy_html(content(res, as="raw")), 256) +#> [1] 3c 21 44 4f 43 54 59 50 45 20 68 74 6d 6c 20 50 55 42 4c 49 43 20 22 2d 2f 2f 57 33 43 2f 2f 44 54 44 20 58 48 54 +#> [39] 4d 4c 20 31 2e 30 20 54 72 61 6e 73 69 74 69 6f 6e 61 6c 2f 2f 45 4e 22 0a 20 20 20 20 22 68 74 74 70 3a 2f 2f 77 +#> [77] 77 77 2e 77 33 2e 6f 72 67 2f 54 52 2f 78 68 74 6d 6c 31 2f 44 54 44 2f 78 68 74 6d 6c 31 2d 74 72 61 6e 73 69 74 +#> [115] 69 6f 6e 61 6c 2e 64 74 64 22 3e 0a 3c 68 74 6d 6c 20 78 6d 6c 6e 73 3d 22 68 74 74 70 3a 2f 2f 77 77 77 2e 77 33 +#> [153] 2e 6f 72 67 2f 31 39 39 39 2f 78 68 74 6d 6c 22 3e 0a 3c 68 65 61 64 3e 0a 3c 6d 65 74 61 20 6e 61 6d 65 3d 22 67 +#> [191] 65 6e 65 72 61 74 6f 72 22 20 63 6f 6e 74 65 6e 74 3d 0a 22 48 54 4d 4c 20 54 69 64 79 20 66 6f 72 20 48 54 4d 4c +#> [229] 35 20 66 6f 72 20 52 20 76 65 72 73 69 6f 6e 20 35 2e 30 2e 30 22 20 2f 3e 0a 3c 6d + +(class(tidy_html(content(res, as="text", encoding="UTF-8")))) # output is too long to show +#> [1] "character" + +tidy_html(content(res, as="parsed", encoding="UTF-8")) # same as tidy_html(read_html("http://rud.is")) +#> {xml_document} #> +#> [1] \n \n Welcome to rud.is.

\n> You are in a maze of twisty little passages ... + +(class(tidy_html(htmlParse("http://rud.is")))) # output is too long to show +#> [1] "HTMLInternalDocument" "HTMLInternalDocument" "XMLInternalDocument" "XMLAbstractDocument" +``` + +### Testing Options + +``` r + +opts <- list(TidyDocType="html5", + TidyMakeClean=TRUE, + TidyHideComments=TRUE, + TidyIndentContent=FALSE, + TidyWrapLen=200) + +txt <- " + + + + +

Test

+ + + + +" + +cat(tidy_html(txt, option=opts)) +#> +#> #> -#> "HTML Tidy for HTML5 for R version 5.0.0" /> +#> +#> #> #> #> -#>

google >

+#>

Test

#> #> ``` diff --git a/man/tidy_html.Rd b/man/tidy_html.Rd index 278cd45..81fa9fe 100644 --- a/man/tidy_html.Rd +++ b/man/tidy_html.Rd @@ -2,9 +2,25 @@ % Please edit documentation in R/tidy.r \name{tidy_html} \alias{tidy_html} -\title{Tidy HTML/XML/XHTML Documents} +\alias{tidy_html.HTMLInternalDocument} +\alias{tidy_html.character} +\alias{tidy_html.default} +\alias{tidy_html.raw} +\alias{tidy_html.xml_document} +\title{Tidy HTML/XHTML Documents} \usage{ tidy_html(content, options = list(TidyXhtmlOut = TRUE)) + +\method{tidy_html}{default}(content, options = list(TidyXhtmlOut = TRUE)) + +\method{tidy_html}{character}(content, options = list(TidyXhtmlOut = TRUE)) + +\method{tidy_html}{raw}(content, options = list(TidyXhtmlOut = TRUE)) + +\method{tidy_html}{xml_document}(content, options = list(TidyXhtmlOut = TRUE)) + +\method{tidy_html}{HTMLInternalDocument}(content, options = list(TidyXhtmlOut + = TRUE)) } \arguments{ \item{content}{atomic character or raw vector of content to tidy} @@ -12,7 +28,7 @@ tidy_html(content, options = list(TidyXhtmlOut = TRUE)) \item{options}{named list of options} } \value{ -Atomic character vector of tidy HTML/XML/XHTML content +tidied HTML/XHTML content } \description{ Currently supported options: \code{TidyAltText}, \code{TidyBodyOnly}, @@ -24,8 +40,7 @@ Currently supported options: \code{TidyAltText}, \code{TidyBodyOnly}, \code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark}, \code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyTabSize}, \code{TidyUpperCaseAttrs}, \code{TidyUpperCaseTags}, \code{TidyWord2000}, -\code{TidyWrapLen}, \code{TidyXhtmlOut}, \code{TidyXmlDecl}, \code{TidyXmlOut}, -\code{TidyXmlTags}. +\code{TidyWrapLen}, \code{TidyXhtmlOut} } \references{ \url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h} diff --git a/man/tidy_xml.Rd b/man/tidy_xml.Rd new file mode 100644 index 0000000..584dbdc --- /dev/null +++ b/man/tidy_xml.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xml.r +\name{tidy_xml} +\alias{tidy_xml} +\alias{tidy_xml.XMLInternalDocument} +\alias{tidy_xml.character} +\alias{tidy_xml.default} +\alias{tidy_xml.raw} +\alias{tidy_xml.xml_document} +\title{Tidy XML Documents} +\usage{ +tidy_xml(content, options = list(TidyXmlOut = TRUE)) + +\method{tidy_xml}{default}(content, options = list(TidyXmlOut = TRUE)) + +\method{tidy_xml}{character}(content, options = list(TidyXmlOut = TRUE)) + +\method{tidy_xml}{raw}(content, options = list(TidyXmlOut = TRUE)) + +\method{tidy_xml}{xml_document}(content, options = list(TidyXmlOut = TRUE)) + +\method{tidy_xml}{XMLInternalDocument}(content, options = list(TidyXmlOut = + TRUE)) +} +\arguments{ +\item{content}{atomic character or raw vector of content to tidy} + +\item{options}{named list of options} +} +\value{ +tidied XML content +} +\description{ +Currently supported options: \code{TidyAltText}, \code{TidyBodyOnly}, +\code{TidyBreakBeforeBR}, \code{TidyCoerceEndTags}, \code{TidyCoerceEndTags}, +\code{TidyDoctype}, \code{TidyDropEmptyElems}, \code{TidyDropEmptyParas}, +\code{TidyFixBackslash}, \code{TidyFixComments}, \code{TidyHideComments}, +\code{TidyHtmlOut}, \code{TidyIndentContent}, \code{TidyIndentSpaces}, +\code{TidyJoinClasses}, \code{TidyJoinStyles}, \code{TidyLogicalEmphasis}, +\code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark}, +\code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyTabSize}, +\code{TidyUpperCaseAttrs}, \code{TidyUpperCaseTags}, \code{TidyWord2000}, +\code{TidyWrapLen}, \code{TidyXhtmlOut}, \code{TidyXmlDecl}, \code{TidyXmlOut}, +\code{TidyXmlTags}. +} +\references{ +\url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h} + (for definitions of the options supported above). +} + diff --git a/src/htmltidy.cpp b/src/htmltidy.cpp index 3e2ec27..6aceb9c 100644 --- a/src/htmltidy.cpp +++ b/src/htmltidy.cpp @@ -21,31 +21,16 @@ std::string tidy_html_int(std::string source, Rcpp::List options) { if (ok == no) Rcpp::stop("Error setting TidyHTML options"); } - if (options.containsElementNamed("TidyXmlOut")) { - ok = tidyOptSetBool(tdoc, TidyXmlOut, options["TidyXmlOut"] ? yes : no); - if (ok == no) Rcpp::stop("Error setting TidyHTML options"); - } - if (options.containsElementNamed("TidyHtmlOut")) { ok = tidyOptSetBool(tdoc, TidyHtmlOut, options["TidyHtmlOut"] ? yes : no); if (ok == no) Rcpp::stop("Error setting TidyHTML options"); } - if (options.containsElementNamed("TidyXmlTags")) { - ok = tidyOptSetBool(tdoc, TidyXmlTags, options["TidyXmlTags"] ? yes : no); - if (ok == no) Rcpp::stop("Error setting TidyHTML options"); - } - if (options.containsElementNamed("TidyOmitOptionalTags")) { ok = tidyOptSetBool(tdoc, TidyOmitOptionalTags, options["TidyOmitOptionalTags"] ? yes : no); if (ok == no) Rcpp::stop("Error setting TidyHTML options"); } - if (options.containsElementNamed("TidyXmlDecl")) { - ok = tidyOptSetBool(tdoc, TidyXmlDecl, options["TidyXmlDecl"] ? yes : no); - if (ok == no) Rcpp::stop("Error setting TidyHTML options"); - } - if (options.containsElementNamed("TidyBreakBeforeBR")) { ok = tidyOptSetBool(tdoc, TidyBreakBeforeBR, options["TidyBreakBeforeBR"] ? yes : no); if (ok == no) Rcpp::stop("Error setting TidyHTML options"); @@ -87,7 +72,7 @@ std::string tidy_html_int(std::string source, Rcpp::List options) { } if (options.containsElementNamed("TidyHideComments")) { - ok = tidyOptSetBool(tdoc, TidyBodyOnly, options["TidyHideComments"] ? yes : no); + ok = tidyOptSetBool(tdoc, TidyHideComments, options["TidyHideComments"] ? yes : no); if (ok == no) Rcpp::stop("Error setting TidyHTML options"); } @@ -137,7 +122,7 @@ std::string tidy_html_int(std::string source, Rcpp::List options) { } if (options.containsElementNamed("TidyMakeClean")) { - ok = tidyOptSetValue(tdoc, TidyMakeClean, Rcpp::as(options["TidyMakeClean"]).c_str()); + ok = tidyOptSetBool(tdoc, TidyMakeClean, options["TidyMakeClean"] ? yes : no); if (ok == no) Rcpp::stop("Error setting TidyHTML options"); }