Browse Source

README

master
Bob Rudis 8 years ago
parent
commit
e6fce82ec1
No known key found for this signature in database GPG Key ID: 1D7529BE14E2BBA9
  1. 17
      DESCRIPTION
  2. 11
      NAMESPACE
  3. 2
      NEWS.md
  4. 53
      R/tidy.r
  5. 64
      R/xml.r
  6. 51
      README.Rmd
  7. 78
      README.md
  8. 23
      man/tidy_html.Rd
  9. 50
      man/tidy_xml.Rd
  10. 19
      src/htmltidy.cpp

17
DESCRIPTION

@ -1,5 +1,5 @@
Package: htmltidy
Title: Clean Up Gnarly HTML/XML
Title: Clean Up Gnarly HTML and XHTML
Version: 0.2.0.9000
Authors@R: c(
person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre")),
@ -7,18 +7,19 @@ Authors@R: c(
comment="HTML Tidy library")
)
Maintainer: Bob Rudis <bob@rud.is>
Description: HTML and XML documents can be beautiful and pristine. They can also be
wretched, evil, malformed hellspawn. Now, you can tidy up that HTML and XML before
processing it with your favorite angle-bracket parsing tools.
Description: HTML documents can be beautiful and pristine. They can also be
wretched, evil, malformed demon-spawn. Now, you can tidy up that HTML and XHTML
before processing it with your favorite angle-bracket crunching tools.
Depends:
R (>= 3.3.0)
R (>= 3.2.0)
License: AGPL + file LICENSE
LazyData: true
NeedsCompilation: yes
Suggests:
testthat,
xml2
testthat
LinkingTo: Rcpp
Imports:
Rcpp
Rcpp,
xml2,
XML
RoxygenNote: 5.0.1

11
NAMESPACE

@ -1,5 +1,16 @@
# Generated by roxygen2: do not edit by hand
S3method(tidy_html,HTMLInternalDocument)
S3method(tidy_html,character)
S3method(tidy_html,default)
S3method(tidy_html,raw)
S3method(tidy_html,xml_document)
S3method(tidy_xml,XMLInternalDocument)
S3method(tidy_xml,character)
S3method(tidy_xml,default)
S3method(tidy_xml,raw)
S3method(tidy_xml,xml_document)
export(tidy_html)
export(tidy_xml)
importFrom(Rcpp,sourceCpp)
useDynLib(htmltidy)

2
NEWS.md

@ -1,6 +1,8 @@
# htmltidy 0.2.0.9000
* Bundled tidy-html5 library with the package
* Windows compatibility
* Options handling
* Modified tests

53
R/tidy.r

@ -1,4 +1,4 @@
#' Tidy HTML/XML/XHTML Documents
#' Tidy HTML/XHTML Documents
#'
#' Currently supported options: \code{TidyAltText}, \code{TidyBodyOnly},
#' \code{TidyBreakBeforeBR}, \code{TidyCoerceEndTags}, \code{TidyCoerceEndTags},
@ -9,15 +9,58 @@
#' \code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark},
#' \code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyTabSize},
#' \code{TidyUpperCaseAttrs}, \code{TidyUpperCaseTags}, \code{TidyWord2000},
#' \code{TidyWrapLen}, \code{TidyXhtmlOut}, \code{TidyXmlDecl}, \code{TidyXmlOut},
#' \code{TidyXmlTags}.
#' \code{TidyWrapLen}, \code{TidyXhtmlOut}
#'
#' @param content atomic character or raw vector of content to tidy
#' @param options named list of options
#' @return Atomic character vector of tidy HTML/XML/XHTML content
#' @return tidied HTML/XHTML content
#' @references \url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h}
#' (for definitions of the options supported above).
#' @export
tidy_html <- function(content, options=list(TidyXhtmlOut=TRUE)) {
  # S3 generic: only dispatch on the class of `content`. The class-specific
  # methods convert the input to a string and call the compiled routine, so
  # the generic must not call it itself (doing so would hand raw vectors or
  # parsed documents to native code before any conversion has happened).
  UseMethod("tidy_html")
}
#' @export
#' @rdname tidy_html
tidy_html.default <- function(content, options=list(TidyXhtmlOut=TRUE)) {
  # Fallback method: only the first element of `content` is passed on to the
  # compiled tidy routine, together with the caller's option list.
  .Call(
    "htmltidy_tidy_html_int",
    PACKAGE = "htmltidy",
    source = content[1],
    options = options
  )
}
#' @export
#' @rdname tidy_html
tidy_html.character <- function(content, options=list(TidyXhtmlOut=TRUE)) {
  # Character input: tidy the first element only; any further elements of
  # `content` are ignored.
  first_doc <- content[1]
  tidied <- .Call("htmltidy_tidy_html_int", PACKAGE = "htmltidy",
                  source = first_doc, options = options)
  tidied
}
#' @export
#' @rdname tidy_html
tidy_html.raw <- function(content, options=list(TidyXhtmlOut=TRUE)) {
  # Raw input: decode the bytes to a string, tidy it, and return the result
  # as a raw vector so the output type mirrors the input type.
  #
  # The whole raw vector is decoded. Subsetting with `content[1]` (as the
  # character method does) would keep only the first *byte* of the document.
  content <- iconv(readBin(content, character()), to = "UTF-8")
  out <- .Call("htmltidy_tidy_html_int", PACKAGE = "htmltidy",
               source = content, options = options)
  charToRaw(out)
}
#' @export
#' @rdname tidy_html
tidy_html.xml_document <- function(content, options=list(TidyXhtmlOut=TRUE)) {
  # xml2 document input: turn the document into text, tidy that text, then
  # parse the tidied markup again with xml2::read_html().
  markup <- toString(content)
  tidied <- .Call(
    "htmltidy_tidy_html_int",
    PACKAGE = "htmltidy",
    source = markup,
    options = options
  )
  xml2::read_html(tidied)
}
#' @export
#' @rdname tidy_html
tidy_html.HTMLInternalDocument <- function(content, options=list(TidyXhtmlOut=TRUE)) {
  # XML-package document input: serialise the document to text, tidy it, and
  # parse the tidied markup back with XML::htmlParse().
  #
  # saveXML() is namespace-qualified because the package does not import it;
  # an unqualified call only works when the user has attached XML themselves.
  content <- XML::saveXML(content)
  out <- .Call("htmltidy_tidy_html_int", PACKAGE = "htmltidy",
               source = content, options = options)
  XML::htmlParse(out)
}

64
R/xml.r

@ -0,0 +1,64 @@
#' Tidy XML Documents
#'
#' Currently supported options: \code{TidyAltText}, \code{TidyBodyOnly},
#' \code{TidyBreakBeforeBR}, \code{TidyCoerceEndTags}, \code{TidyCoerceEndTags},
#' \code{TidyDoctype}, \code{TidyDropEmptyElems}, \code{TidyDropEmptyParas},
#' \code{TidyFixBackslash}, \code{TidyFixComments}, \code{TidyHideComments},
#' \code{TidyHtmlOut}, \code{TidyIndentContent}, \code{TidyIndentSpaces},
#' \code{TidyJoinClasses}, \code{TidyJoinStyles}, \code{TidyLogicalEmphasis},
#' \code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark},
#' \code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyTabSize},
#' \code{TidyUpperCaseAttrs}, \code{TidyUpperCaseTags}, \code{TidyWord2000},
#' \code{TidyWrapLen}, \code{TidyXhtmlOut}, \code{TidyXmlDecl}, \code{TidyXmlOut},
#' \code{TidyXmlTags}.
#'
#' @param content atomic character or raw vector of content to tidy
#' @param options named list of options
#' @return tidied XML content
#' @references \url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h}
#' (for definitions of the options supported above).
#' @export
tidy_xml <- function(content, options=list(TidyXmlOut=TRUE)) {
  # S3 generic: the method is chosen from the class of `content`.
  UseMethod("tidy_xml", content)
}
#' @export
#' @rdname tidy_xml
tidy_xml.default <- function(content, options=list(TidyXmlOut=TRUE)) {
  # Fallback method: `content` is handed to the compiled tidy routine as-is,
  # together with the caller's option list.
  tidied <- .Call(
    "htmltidy_tidy_html_int",
    PACKAGE = "htmltidy",
    source = content,
    options = options
  )
  tidied
}
#' @export
#' @rdname tidy_xml
tidy_xml.character <- function(content, options=list(TidyXmlOut=TRUE)) {
  # Character input needs no conversion: pass it straight to the compiled
  # tidy routine and return whatever that routine produces.
  .Call("htmltidy_tidy_html_int",
        PACKAGE = "htmltidy",
        source = content,
        options = options)
}
#' @export
#' @rdname tidy_xml
tidy_xml.raw <- function(content, options=list(TidyXmlOut=TRUE)) {
  # Raw input: read the bytes as a string, convert it to UTF-8, tidy it, and
  # return the tidied text as a raw vector again.
  as_text <- readBin(content, character())
  utf8_text <- iconv(as_text, to = "UTF-8")
  tidied <- .Call("htmltidy_tidy_html_int", PACKAGE = "htmltidy",
                  source = utf8_text, options = options)
  charToRaw(tidied)
}
#' @export
#' @rdname tidy_xml
tidy_xml.xml_document <- function(content, options=list(TidyXmlOut=TRUE)) {
  # xml2 document input: turn the document into text, tidy that text, then
  # parse the tidied markup again with xml2::read_xml().
  markup <- toString(content)
  tidied <- .Call(
    "htmltidy_tidy_html_int",
    PACKAGE = "htmltidy",
    source = markup,
    options = options
  )
  xml2::read_xml(tidied)
}
#' @export
#' @rdname tidy_xml
tidy_xml.XMLInternalDocument <- function(content, options=list(TidyXmlOut=TRUE)) {
  # XML-package document input: serialise the document to text, tidy it, and
  # parse the tidied markup back with XML::xmlParse().
  #
  # saveXML() is namespace-qualified because the package does not import it;
  # an unqualified call only works when the user has attached XML themselves.
  content <- XML::saveXML(content)
  out <- .Call("htmltidy_tidy_html_int", PACKAGE = "htmltidy",
               source = content, options = options)
  XML::xmlParse(out)
}

51
README.Rmd

@ -17,7 +17,7 @@ knitr::opts_chunk$set(
)
```
`htmltidy` — Clean up gnarly HTML/XML
`htmltidy` — Clean up gnarly HTML/XHTML
Inspired by [this SO question](http://stackoverflow.com/questions/37061873/identify-a-weblink-in-bold-in-r) and because there's a great deal of cruddy HTML out there that needs fixing to use properly when scraping data.
@ -25,7 +25,7 @@ It relies on a locally included version of [`libtidy`](http://www.html-tidy.org/
The following functions are implemented:
- `tidy_html` : Clean up gnarly HTML/XML
- `tidy_html` : Clean up gnarly HTML/XHTML
### Installation
@ -39,13 +39,56 @@ options(width=120)
### Usage
```{r}
```{r message=FALSE, warning=FALSE}
library(htmltidy)
# current version
packageVersion("htmltidy")
cat(tidy_html("<b><p><a href='http://google.com'>google &gt</a></p></b>"))
library(XML)
library(xml2)
library(httr)
res <- GET("http://rud.is")
head(tidy_html(res$content), 256)
head(tidy_html(content(res, as="raw")), 256)
(class(tidy_html(content(res, as="text", encoding="UTF-8")))) # output is too long to show
tidy_html(content(res, as="parsed", encoding="UTF-8")) # same as tidy_html(read_html("http://rud.is"))
(class(tidy_html(htmlParse("http://rud.is")))) # output is too long to show
```
### Testing Options
```{r message=FALSE, warning=FALSE}
opts <- list(TidyDocType="html5",
TidyMakeClean=TRUE,
TidyHideComments=TRUE,
TidyIndentContent=FALSE,
TidyWrapLen=200)
txt <- "<html>
<head>
<style>
p { color: red; }
</style>
<body>
<!-- ===== body ====== -->
<p>Test</p>
</body>
<!--Default Zone
-->
<!--Default Zone End-->
</html>"
cat(tidy_html(txt, option=opts))
```
### Code of Conduct

78
README.md

@ -2,7 +2,7 @@
[![Travis-CI Build Status](https://travis-ci.org/hrbrmstr/htmltidy.svg?branch=master)](https://travis-ci.org/hrbrmstr/htmltidy)
<!-- README.md is generated from README.Rmd. Please edit that file -->
`htmltidy` — Clean up gnarly HTML/XML
`htmltidy` — Clean up gnarly HTML/XHTML
Inspired by [this SO question](http://stackoverflow.com/questions/37061873/identify-a-weblink-in-bold-in-r) and because there's a great deal of cruddy HTML out there that needs fixing to use properly when scraping data.
@ -10,7 +10,7 @@ It relies on a locally included version of [`libtidy`](http://www.html-tidy.org/
The following functions are implemented:
- `tidy_html` : Clean up gnarly HTML/XML
- `tidy_html` : Clean up gnarly HTML/XHTML
### Installation
@ -27,16 +27,80 @@ library(htmltidy)
packageVersion("htmltidy")
#> [1] '0.2.0.9000'
cat(tidy_html("<b><p><a href='http://google.com'>google &gt</a></p></b>"))
#> <!DOCTYPE html>
library(XML)
library(xml2)
library(httr)
res <- GET("http://rud.is")
head(tidy_html(res$content), 256)
#> [1] 3c 21 44 4f 43 54 59 50 45 20 68 74 6d 6c 20 50 55 42 4c 49 43 20 22 2d 2f 2f 57 33 43 2f 2f 44 54 44 20 58 48 54
#> [39] 4d 4c 20 31 2e 30 20 54 72 61 6e 73 69 74 69 6f 6e 61 6c 2f 2f 45 4e 22 0a 20 20 20 20 22 68 74 74 70 3a 2f 2f 77
#> [77] 77 77 2e 77 33 2e 6f 72 67 2f 54 52 2f 78 68 74 6d 6c 31 2f 44 54 44 2f 78 68 74 6d 6c 31 2d 74 72 61 6e 73 69 74
#> [115] 69 6f 6e 61 6c 2e 64 74 64 22 3e 0a 3c 68 74 6d 6c 20 78 6d 6c 6e 73 3d 22 68 74 74 70 3a 2f 2f 77 77 77 2e 77 33
#> [153] 2e 6f 72 67 2f 31 39 39 39 2f 78 68 74 6d 6c 22 3e 0a 3c 68 65 61 64 3e 0a 3c 6d 65 74 61 20 6e 61 6d 65 3d 22 67
#> [191] 65 6e 65 72 61 74 6f 72 22 20 63 6f 6e 74 65 6e 74 3d 0a 22 48 54 4d 4c 20 54 69 64 79 20 66 6f 72 20 48 54 4d 4c
#> [229] 35 20 66 6f 72 20 52 20 76 65 72 73 69 6f 6e 20 35 2e 30 2e 30 22 20 2f 3e 0a 3c 6d
head(tidy_html(content(res, as="raw")), 256)
#> [1] 3c 21 44 4f 43 54 59 50 45 20 68 74 6d 6c 20 50 55 42 4c 49 43 20 22 2d 2f 2f 57 33 43 2f 2f 44 54 44 20 58 48 54
#> [39] 4d 4c 20 31 2e 30 20 54 72 61 6e 73 69 74 69 6f 6e 61 6c 2f 2f 45 4e 22 0a 20 20 20 20 22 68 74 74 70 3a 2f 2f 77
#> [77] 77 77 2e 77 33 2e 6f 72 67 2f 54 52 2f 78 68 74 6d 6c 31 2f 44 54 44 2f 78 68 74 6d 6c 31 2d 74 72 61 6e 73 69 74
#> [115] 69 6f 6e 61 6c 2e 64 74 64 22 3e 0a 3c 68 74 6d 6c 20 78 6d 6c 6e 73 3d 22 68 74 74 70 3a 2f 2f 77 77 77 2e 77 33
#> [153] 2e 6f 72 67 2f 31 39 39 39 2f 78 68 74 6d 6c 22 3e 0a 3c 68 65 61 64 3e 0a 3c 6d 65 74 61 20 6e 61 6d 65 3d 22 67
#> [191] 65 6e 65 72 61 74 6f 72 22 20 63 6f 6e 74 65 6e 74 3d 0a 22 48 54 4d 4c 20 54 69 64 79 20 66 6f 72 20 48 54 4d 4c
#> [229] 35 20 66 6f 72 20 52 20 76 65 72 73 69 6f 6e 20 35 2e 30 2e 30 22 20 2f 3e 0a 3c 6d
(class(tidy_html(content(res, as="text", encoding="UTF-8")))) # output is too long to show
#> [1] "character"
tidy_html(content(res, as="parsed", encoding="UTF-8")) # same as tidy_html(read_html("http://rud.is"))
#> {xml_document}
#> <html xmlns="http://www.w3.org/1999/xhtml">
#> [1] <head>\n <meta name="generator" content="HTML Tidy for HTML5 for R version 5.0.0" />\n <meta http-equiv="Conten ...
#> [2] <body>\n<div id="main"><b>Welcome to rud.is.</b><br /><br />\n&gt; <i>You are in a maze of twisty little passages ...
(class(tidy_html(htmlParse("http://rud.is")))) # output is too long to show
#> [1] "HTMLInternalDocument" "HTMLInternalDocument" "XMLInternalDocument" "XMLAbstractDocument"
```
### Testing Options
``` r
opts <- list(TidyDocType="html5",
TidyMakeClean=TRUE,
TidyHideComments=TRUE,
TidyIndentContent=FALSE,
TidyWrapLen=200)
txt <- "<html>
<head>
<style>
p { color: red; }
</style>
<body>
<!-- ===== body ====== -->
<p>Test</p>
</body>
<!--Default Zone
-->
<!--Default Zone End-->
</html>"
cat(tidy_html(txt, option=opts))
#> <!DOCTYPE html>
#> <html>
#> <head>
#> <meta name="generator" content=
#> "HTML Tidy for HTML5 for R version 5.0.0" />
#> <meta name="generator" content="HTML Tidy for HTML5 for R version 5.0.0">
#> <style>
#> p { color: red; }
#> </style>
#> <title></title>
#> </head>
#> <body>
#> <p><b><a href='http://google.com'>google &gt;</a></b></p>
#> <p>Test</p>
#> </body>
#> </html>
```

23
man/tidy_html.Rd

@ -2,9 +2,25 @@
% Please edit documentation in R/tidy.r
\name{tidy_html}
\alias{tidy_html}
\title{Tidy HTML/XML/XHTML Documents}
\alias{tidy_html.HTMLInternalDocument}
\alias{tidy_html.character}
\alias{tidy_html.default}
\alias{tidy_html.raw}
\alias{tidy_html.xml_document}
\title{Tidy HTML/XHTML Documents}
\usage{
tidy_html(content, options = list(TidyXhtmlOut = TRUE))
\method{tidy_html}{default}(content, options = list(TidyXhtmlOut = TRUE))
\method{tidy_html}{character}(content, options = list(TidyXhtmlOut = TRUE))
\method{tidy_html}{raw}(content, options = list(TidyXhtmlOut = TRUE))
\method{tidy_html}{xml_document}(content, options = list(TidyXhtmlOut = TRUE))
\method{tidy_html}{HTMLInternalDocument}(content, options = list(TidyXhtmlOut
= TRUE))
}
\arguments{
\item{content}{atomic character or raw vector of content to tidy}
@ -12,7 +28,7 @@ tidy_html(content, options = list(TidyXhtmlOut = TRUE))
\item{options}{named list of options}
}
\value{
Atomic character vector of tidy HTML/XML/XHTML content
tidied HTML/XHTML content
}
\description{
Currently supported options: \code{TidyAltText}, \code{TidyBodyOnly},
@ -24,8 +40,7 @@ Currently supported options: \code{TidyAltText}, \code{TidyBodyOnly},
\code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark},
\code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyTabSize},
\code{TidyUpperCaseAttrs}, \code{TidyUpperCaseTags}, \code{TidyWord2000},
\code{TidyWrapLen}, \code{TidyXhtmlOut}, \code{TidyXmlDecl}, \code{TidyXmlOut},
\code{TidyXmlTags}.
\code{TidyWrapLen}, \code{TidyXhtmlOut}
}
\references{
\url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h}

50
man/tidy_xml.Rd

@ -0,0 +1,50 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xml.r
\name{tidy_xml}
\alias{tidy_xml}
\alias{tidy_xml.XMLInternalDocument}
\alias{tidy_xml.character}
\alias{tidy_xml.default}
\alias{tidy_xml.raw}
\alias{tidy_xml.xml_document}
\title{Tidy XML Documents}
\usage{
tidy_xml(content, options = list(TidyXmlOut = TRUE))
\method{tidy_xml}{default}(content, options = list(TidyXmlOut = TRUE))
\method{tidy_xml}{character}(content, options = list(TidyXmlOut = TRUE))
\method{tidy_xml}{raw}(content, options = list(TidyXmlOut = TRUE))
\method{tidy_xml}{xml_document}(content, options = list(TidyXmlOut = TRUE))
\method{tidy_xml}{XMLInternalDocument}(content, options = list(TidyXmlOut =
TRUE))
}
\arguments{
\item{content}{atomic character or raw vector of content to tidy}
\item{options}{named list of options}
}
\value{
tidied XML content
}
\description{
Currently supported options: \code{TidyAltText}, \code{TidyBodyOnly},
\code{TidyBreakBeforeBR}, \code{TidyCoerceEndTags}, \code{TidyCoerceEndTags},
\code{TidyDoctype}, \code{TidyDropEmptyElems}, \code{TidyDropEmptyParas},
\code{TidyFixBackslash}, \code{TidyFixComments}, \code{TidyHideComments},
\code{TidyHtmlOut}, \code{TidyIndentContent}, \code{TidyIndentSpaces},
\code{TidyJoinClasses}, \code{TidyJoinStyles}, \code{TidyLogicalEmphasis},
\code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark},
\code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyTabSize},
\code{TidyUpperCaseAttrs}, \code{TidyUpperCaseTags}, \code{TidyWord2000},
\code{TidyWrapLen}, \code{TidyXhtmlOut}, \code{TidyXmlDecl}, \code{TidyXmlOut},
\code{TidyXmlTags}.
}
\references{
\url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h}
(for definitions of the options supported above).
}

19
src/htmltidy.cpp

@ -21,31 +21,16 @@ std::string tidy_html_int(std::string source, Rcpp::List options) {
if (ok == no) Rcpp::stop("Error setting TidyHTML options");
}
if (options.containsElementNamed("TidyXmlOut")) {
ok = tidyOptSetBool(tdoc, TidyXmlOut, options["TidyXmlOut"] ? yes : no);
if (ok == no) Rcpp::stop("Error setting TidyHTML options");
}
if (options.containsElementNamed("TidyHtmlOut")) {
ok = tidyOptSetBool(tdoc, TidyHtmlOut, options["TidyHtmlOut"] ? yes : no);
if (ok == no) Rcpp::stop("Error setting TidyHTML options");
}
if (options.containsElementNamed("TidyXmlTags")) {
ok = tidyOptSetBool(tdoc, TidyXmlTags, options["TidyXmlTags"] ? yes : no);
if (ok == no) Rcpp::stop("Error setting TidyHTML options");
}
if (options.containsElementNamed("TidyOmitOptionalTags")) {
ok = tidyOptSetBool(tdoc, TidyOmitOptionalTags, options["TidyOmitOptionalTags"] ? yes : no);
if (ok == no) Rcpp::stop("Error setting TidyHTML options");
}
if (options.containsElementNamed("TidyXmlDecl")) {
ok = tidyOptSetBool(tdoc, TidyXmlDecl, options["TidyXmlDecl"] ? yes : no);
if (ok == no) Rcpp::stop("Error setting TidyHTML options");
}
if (options.containsElementNamed("TidyBreakBeforeBR")) {
ok = tidyOptSetBool(tdoc, TidyBreakBeforeBR, options["TidyBreakBeforeBR"] ? yes : no);
if (ok == no) Rcpp::stop("Error setting TidyHTML options");
@ -87,7 +72,7 @@ std::string tidy_html_int(std::string source, Rcpp::List options) {
}
if (options.containsElementNamed("TidyHideComments")) {
ok = tidyOptSetBool(tdoc, TidyBodyOnly, options["TidyHideComments"] ? yes : no);
ok = tidyOptSetBool(tdoc, TidyHideComments, options["TidyHideComments"] ? yes : no);
if (ok == no) Rcpp::stop("Error setting TidyHTML options");
}
@ -137,7 +122,7 @@ std::string tidy_html_int(std::string source, Rcpp::List options) {
}
if (options.containsElementNamed("TidyMakeClean")) {
ok = tidyOptSetValue(tdoc, TidyMakeClean, Rcpp::as<std::string>(options["TidyMakeClean"]).c_str());
ok = tidyOptSetBool(tdoc, TidyMakeClean, options["TidyMakeClean"] ? yes : no);
if (ok == no) Rcpp::stop("Error setting TidyHTML options");
}

Loading…
Cancel
Save