Browse Source

pre-CRAN flight check

master
boB Rudis 8 years ago
parent
commit
f846c741d7
No known key found for this signature in database GPG Key ID: 1D7529BE14E2BBA9
  1. 2
      .Rbuildignore
  2. 17
      DESCRIPTION
  3. 2
      LICENSE
  4. 1
      NAMESPACE
  5. 1
      NEWS.md
  6. 10
      R/htmltidy-package.r
  7. 31
      R/tidy.r
  8. 58
      README.Rmd
  9. 193
      README.md
  10. 22
      inst/COPYRIGHTS
  11. 10
      man/htmltidy.Rd
  12. 20
      man/tidy_html.Rd
  13. 2
      src/alloc.cpp
  14. 3
      src/sprtf.cpp
  15. 6
      src/tidylib.c

2
.Rbuildignore

@ -4,4 +4,4 @@
^README-.*\.png$
^\.travis\.yml$
^CONDUCT\.md$
^README\.md$
^README\.html$

17
DESCRIPTION

@ -1,6 +1,6 @@
Package: htmltidy
Title: Clean Up Gnarly HTML and XHTML
Version: 0.2.0.9000
Title: Clean Up or Pretty Print Gnarly HTML and XHTML
Version: 0.2.0
Authors@R: c(
person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre")),
person("Dave", "Raggett", email = "dsr@w3.org", role = c("ctb", "aut"),
@ -9,14 +9,21 @@ Authors@R: c(
Maintainer: Bob Rudis <bob@rud.is>
Description: HTML documents can be beautiful and pristine. They can also be
wretched, evil, malformed demon-spawn. Now, you can tidy up that HTML and XHTML
before processing it with your favorite angle-bracket crunching tools.
before processing it with your favorite angle-bracket crunching tools, going beyond
the limited tidying that 'libxml2' affords in the 'XML' and 'xml2' packages and
taming even the ugliest HTML code generated by the likes of Google Docs and Microsoft
Word. It's also possible to use the functions provided to format or "pretty print"
HTML content as it is being tidied.
Copyright: file inst/COPYRIGHTS
Depends:
R (>= 3.2.0)
License: AGPL + file LICENSE
License: AGPL
LazyData: true
NeedsCompilation: yes
Suggests:
testthat
testthat,
httr,
rvest
LinkingTo: Rcpp
Imports:
Rcpp,

2
LICENSE

@ -1,2 +0,0 @@
YEAR: 2016
COPYRIGHT HOLDER: Bob Rudis

1
NAMESPACE

@ -2,6 +2,7 @@
S3method(tidy_html,HTMLInternalDocument)
S3method(tidy_html,character)
S3method(tidy_html,connection)
S3method(tidy_html,default)
S3method(tidy_html,raw)
S3method(tidy_html,xml_document)

1
NEWS.md

@ -3,6 +3,7 @@
* Bundled tidy-html5 library with the package
* Windows compatibility
* Options handling
* Enabled generics
* Modified tests

10
R/htmltidy-package.r

@ -1,8 +1,12 @@
#' Clean Up Gnarly HTML/XML
#' Clean Up or Pretty Print Gnarly HTML and XHTML
#'
#' HTML documents can be beautiful and pristine. They can also be
#' wretched, evil, malformed hellspawn. Now, you can tidy up that HTML and XML before
#' processing it with your favorite angle-bracket parsing tools.
#' wretched, evil, malformed demon-spawn. Now, you can tidy up that HTML and XHTML
#' before processing it with your favorite angle-bracket crunching tools, going beyond
#' the limited tidying that 'libxml2' affords in the 'XML' and 'xml2' packages and
#' taming even the ugliest HTML code generated by the likes of Google Docs and Microsoft
#' Word. It's also possible to use the functions provided to format or "pretty print"
#' HTML content as it is being tidied.
#'
#' @name htmltidy
#' @docType package

31
R/tidy.r

@ -1,4 +1,4 @@
#' Tidy HTML/XHTML Documents
#' Tidy or "Pretty Print" HTML/XHTML Documents
#'
#' Pass in HTML content as either plain or raw text or parsed objects (either with the
#' \code{XML} or \code{xml2} packages) along with an options list that specifies how
@ -42,7 +42,8 @@
#' @param content accepts a character vector, raw vector or parsed content from the \code{xml2}
#' or \code{XML} packages.
#' @param options named list of options
#' @return Tidied HTML/XHTML content. The object type will be the same as that of the input type.
#' @return Tidied HTML/XHTML content. The object type will be the same as that of the input type
#' except when it is a \code{connection}, then a character vector will be returned.
#' @references \url{http://api.html-tidy.org/tidy/quickref_5.1.25.html} &
#' \url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h}
#' for definitions of the options supported above and \url{https://www.w3.org/People/Raggett/tidy/}
@ -63,6 +64,18 @@
#' collapse="")
#'
#' cat(tidy_html(txt, option=opts))
#'
#' library(httr)
#' res <- GET("http://rud.is/test/untidy.html")
#'
#' # look at the original, un-tidy source
#' cat(content(res, as="text"))
#'
#' # see the tidied version
#' cat(tidy_html(content(res, as="text"), list(TidyDocType="html5", TidyWrapLen=200)))
#'
#' # but, you could also just do:
#' cat(tidy_html(url("http://rud.is/test/untidy.html")))
tidy_html <- function(content, options=list(TidyXhtmlOut=TRUE)) {
# S3 generic: hands off to the tidy_html.<class> method selected by the class of
# `content`; `options` defaults to a list requesting XHTML output (TidyXhtmlOut=TRUE).
UseMethod("tidy_html")
}
@ -70,7 +83,7 @@ tidy_html <- function(content, options=list(TidyXhtmlOut=TRUE)) {
#' @export
#' @rdname tidy_html
tidy_html.default <- function(content, options=list(TidyXhtmlOut=TRUE)) {
# Fallback method: reduces `content` to a single string and passes it, together
# with `options`, to the compiled routine 'htmltidy_tidy_html_int'.
# NOTE(review): this is a rendered diff hunk (header: 7 lines before, 7 after), so the
# next two statements are presumably the removed line (`content[1]`) followed by its
# replacement (the `paste0()` collapse) rather than consecutive code — confirm against
# the raw diff.
content <- content[1]
content <- paste0(content, collapse="")
.Call('htmltidy_tidy_html_int', PACKAGE='htmltidy',
source=content, options=options)
}
@ -78,7 +91,7 @@ tidy_html.default <- function(content, options=list(TidyXhtmlOut=TRUE)) {
#' @export
#' @rdname tidy_html
tidy_html.character <- function(content, options=list(TidyXhtmlOut=TRUE)) {
# Character method: reduces `content` to a single string and passes it, together
# with `options`, to the compiled routine 'htmltidy_tidy_html_int'.
# NOTE(review): this is a rendered diff hunk (header: 7 lines before, 7 after), so the
# next two statements are presumably the removed line (`content[1]`) followed by its
# replacement (the `paste0()` collapse) rather than consecutive code — confirm against
# the raw diff.
content <- content[1]
content <- paste0(content, collapse="")
.Call('htmltidy_tidy_html_int', PACKAGE='htmltidy',
source=content, options=options)
}
@ -110,3 +123,13 @@ tidy_html.HTMLInternalDocument <- function(content, options=list(TidyXhtmlOut=TR
source=content, options=options)
XML::htmlParse(out)
}
#' @export
#' @rdname tidy_html
tidy_html.connection <- function(content, options=list(TidyXhtmlOut=TRUE)) {
  # Read every line available on the connection. readLines() drops the line
  # terminators, so the lines are re-joined with "\n": gluing them together
  # with "" fuses the last token of one line onto the first token of the next
  # (e.g. "...formatted HTML" + "as is this" became "HTMLas is this").
  content <- paste0(readLines(content), collapse="\n")
  # Pass the single string and the options list to the compiled tidy routine;
  # its result is returned unchanged.
  .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy',
        source=content, options=options)
}

58
README.Rmd

@ -1,14 +1,14 @@
---
output: rmarkdown::github_document
---
[![Travis-CI Build Status](https://travis-ci.org/hrbrmstr/htmltidy.svg?branch=master)](https://travis-ci.org/hrbrmstr/htmltidy)
[![Travis-CI Build Status](https://travis-ci.org/hrbrmstr/htmltidy.svg?branch=master)](https://travis-ci.org/hrbrmstr/htmltidy) [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/htmltidy)](https://cran.r-project.org/package=htmltidy)
<!-- README.md is generated from README.Rmd. Please edit that file -->
```{r, echo = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
comment = "##",
message = FALSE,
warning = FALSE,
error = FALSE,
@ -25,7 +25,7 @@ It relies on a locally included version of [`libtidy`](http://www.html-tidy.org/
The following functions are implemented:
- `tidy_html` : Clean up gnarly HTML/XHTML
- `tidy_html` : Tidy or "Pretty Print" HTML/XHTML Documents
### Installation
@ -48,18 +48,46 @@ packageVersion("htmltidy")
library(XML)
library(xml2)
library(httr)
library(purrr)
```
This is really "un-tidy" content:
```{r message=FALSE, warning=FALSE}
res <- GET("http://rud.is/test/untidy.html")
cat(content(res, as="text"))
```
res <- GET("http://rud.is")
Let's see what `tidy_html()` does to it:
head(tidy_html(res$content), 256)
```{r message=FALSE, warning=FALSE}
cat(tidy_html(content(res, as="text"), list(TidyDocType="html5", TidyWrapLen=200)))
```
head(tidy_html(content(res, as="raw")), 256)
NOTE: you could also just have done:
(class(tidy_html(content(res, as="text", encoding="UTF-8")))) # output is too long to show
```{r message=FALSE, warning=FALSE}
cat(tidy_html(url("http://rud.is/test/untidy.html"),
list(TidyDocType="html5", TidyWrapLen=200)))
```
tidy_html(content(res, as="parsed", encoding="UTF-8")) # same as tidy_html(read_html("http://rud.is"))
You'll see that this differs substantially from the mangling `libxml2` does (via `read_html()`):
(class(tidy_html(htmlParse("http://rud.is")))) # output is too long to show
```{r message=FALSE, warning=FALSE}
pg <- read_html("http://rud.is/test/untidy.html")
cat(toString(pg))
```
It can also deal with "raw" and parsed objects:
```{r message=FALSE, warning=FALSE}
tidy_html(content(res, as="raw"))
tidy_html(content(res, as="text", encoding="UTF-8"))
tidy_html(content(res, as="parsed", encoding="UTF-8"))
tidy_html(htmlParse("http://rud.is/test/untidy.html"))
```
### Testing Options
@ -91,6 +119,18 @@ cat(tidy_html(txt, option=opts))
```
But, you're probably better off running it on plain HTML source.
Since it's C/C++-backed, it's pretty fast:
```{r message=FALSE, warning=FALSE}
book <- readLines("http://singlepageappbook.com/single-page.html")
sum(map_int(book, nchar))
system.time(tidy_book <- tidy_html(book))
```
(It's usually between 20 & 25 milliseconds to process those 202 kilobytes of HTML.) Not too shabby.
### Code of Conduct
Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md).

193
README.md

@ -1,5 +1,5 @@
[![Travis-CI Build Status](https://travis-ci.org/hrbrmstr/htmltidy.svg?branch=master)](https://travis-ci.org/hrbrmstr/htmltidy)
[![Travis-CI Build Status](https://travis-ci.org/hrbrmstr/htmltidy.svg?branch=master)](https://travis-ci.org/hrbrmstr/htmltidy) [![CRAN\_Status\_Badge](http://www.r-pkg.org/badges/version/htmltidy)](https://cran.r-project.org/package=htmltidy)
<!-- README.md is generated from README.Rmd. Please edit that file -->
`htmltidy` — Clean up gnarly HTML/XHTML
@ -10,7 +10,7 @@ It relies on a locally included version of [`libtidy`](http://www.html-tidy.org/
The following functions are implemented:
- `tidy_html` : Clean up gnarly HTML/XHTML
- `tidy_html` : Tidy or "Pretty Print" HTML/XHTML Documents
### Installation
@ -25,43 +25,125 @@ library(htmltidy)
# current version
packageVersion("htmltidy")
#> [1] '0.2.0.9000'
## [1] '0.2.0'
library(XML)
library(xml2)
library(httr)
library(purrr)
```
This is really "un-tidy" content:
res <- GET("http://rud.is")
head(tidy_html(res$content), 256)
#> [1] 3c 21 44 4f 43 54 59 50 45 20 68 74 6d 6c 20 50 55 42 4c 49 43 20 22 2d 2f 2f 57 33 43 2f 2f 44 54 44 20 58 48 54
#> [39] 4d 4c 20 31 2e 30 20 54 72 61 6e 73 69 74 69 6f 6e 61 6c 2f 2f 45 4e 22 0a 20 20 20 20 22 68 74 74 70 3a 2f 2f 77
#> [77] 77 77 2e 77 33 2e 6f 72 67 2f 54 52 2f 78 68 74 6d 6c 31 2f 44 54 44 2f 78 68 74 6d 6c 31 2d 74 72 61 6e 73 69 74
#> [115] 69 6f 6e 61 6c 2e 64 74 64 22 3e 0a 3c 68 74 6d 6c 20 78 6d 6c 6e 73 3d 22 68 74 74 70 3a 2f 2f 77 77 77 2e 77 33
#> [153] 2e 6f 72 67 2f 31 39 39 39 2f 78 68 74 6d 6c 22 3e 0a 3c 68 65 61 64 3e 0a 3c 6d 65 74 61 20 6e 61 6d 65 3d 22 67
#> [191] 65 6e 65 72 61 74 6f 72 22 20 63 6f 6e 74 65 6e 74 3d 0a 22 48 54 4d 4c 20 54 69 64 79 20 66 6f 72 20 48 54 4d 4c
#> [229] 35 20 66 6f 72 20 52 20 76 65 72 73 69 6f 6e 20 35 2e 30 2e 30 22 20 2f 3e 0a 3c 6d
head(tidy_html(content(res, as="raw")), 256)
#> [1] 3c 21 44 4f 43 54 59 50 45 20 68 74 6d 6c 20 50 55 42 4c 49 43 20 22 2d 2f 2f 57 33 43 2f 2f 44 54 44 20 58 48 54
#> [39] 4d 4c 20 31 2e 30 20 54 72 61 6e 73 69 74 69 6f 6e 61 6c 2f 2f 45 4e 22 0a 20 20 20 20 22 68 74 74 70 3a 2f 2f 77
#> [77] 77 77 2e 77 33 2e 6f 72 67 2f 54 52 2f 78 68 74 6d 6c 31 2f 44 54 44 2f 78 68 74 6d 6c 31 2d 74 72 61 6e 73 69 74
#> [115] 69 6f 6e 61 6c 2e 64 74 64 22 3e 0a 3c 68 74 6d 6c 20 78 6d 6c 6e 73 3d 22 68 74 74 70 3a 2f 2f 77 77 77 2e 77 33
#> [153] 2e 6f 72 67 2f 31 39 39 39 2f 78 68 74 6d 6c 22 3e 0a 3c 68 65 61 64 3e 0a 3c 6d 65 74 61 20 6e 61 6d 65 3d 22 67
#> [191] 65 6e 65 72 61 74 6f 72 22 20 63 6f 6e 74 65 6e 74 3d 0a 22 48 54 4d 4c 20 54 69 64 79 20 66 6f 72 20 48 54 4d 4c
#> [229] 35 20 66 6f 72 20 52 20 76 65 72 73 69 6f 6e 20 35 2e 30 2e 30 22 20 2f 3e 0a 3c 6d
(class(tidy_html(content(res, as="text", encoding="UTF-8")))) # output is too long to show
#> [1] "character"
tidy_html(content(res, as="parsed", encoding="UTF-8")) # same as tidy_html(read_html("http://rud.is"))
#> {xml_document}
#> <html xmlns="http://www.w3.org/1999/xhtml">
#> [1] <head>\n <meta name="generator" content="HTML Tidy for HTML5 for R version 5.0.0" />\n <meta http-equiv="Conten ...
#> [2] <body>\n<div id="main"><b>Welcome to rud.is.</b><br /><br />\n&gt; <i>You are in a maze of twisty little passages ...
(class(tidy_html(htmlParse("http://rud.is")))) # output is too long to show
#> [1] "HTMLInternalDocument" "HTMLInternalDocument" "XMLInternalDocument" "XMLAbstractDocument"
``` r
res <- GET("http://rud.is/test/untidy.html")
cat(content(res, as="text"))
## <head>
## <style>
## body { font-family: sans-serif; }
## </style>
## </head>
## <body>
## <b>This is <b>some <i>really </i> poorly formatted HTML</b>
##
## as is this <span id="sp">portion<div>
```
Let's see what `tidy_html()` does to it:
``` r
cat(tidy_html(content(res, as="text"), list(TidyDocType="html5", TidyWrapLen=200)))
## <!DOCTYPE html>
## <html>
## <head>
## <meta name="generator" content="HTML Tidy for HTML5 for R version 5.0.0">
## <style>
## body { font-family: sans-serif; }
## </style>
## <title></title>
## </head>
## <body>
## <b>This is some <i>really</i> poorly formatted HTML as is this <span id="sp">portion</span></b>
## <div><span id="sp"></span></div>
## </body>
## </html>
```
NOTE: you could also just have done:
``` r
cat(tidy_html(url("http://rud.is/test/untidy.html"),
list(TidyDocType="html5", TidyWrapLen=200)))
## <!DOCTYPE html>
## <html>
## <head>
## <meta name="generator" content="HTML Tidy for HTML5 for R version 5.0.0">
## <style>
## body { font-family: sans-serif; }
## </style>
## <title></title>
## </head>
## <body>
## <b>This is some <i>really</i> poorly formatted HTMLas is this <span id="sp">portion</span></b>
## <div><span id="sp"></span></div>
## </body>
## </html>
```
You'll see that this differs substantially from the mangling `libxml2` does (via `read_html()`):
``` r
pg <- read_html("http://rud.is/test/untidy.html")
cat(toString(pg))
## <?xml version="1.0" standalone="yes"?>
## <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
## <html><head><style><![CDATA[
## body { font-family: sans-serif; }
## ]]></style></head><body>
## <b>This is <b>some <i>really </i> poorly formatted HTML</b>
##
## as is this <span id="sp">portion<div/></span></b></body></html>
```
It can also deal with "raw" and parsed objects:
``` r
tidy_html(content(res, as="raw"))
## [1] 3c 21 44 4f 43 54 59 50 45 20 68 74 6d 6c 3e 0a 3c 68 74 6d 6c 20 78 6d 6c 6e 73 3d 22 68 74 74 70 3a 2f 2f 77 77
## [39] 77 2e 77 33 2e 6f 72 67 2f 31 39 39 39 2f 78 68 74 6d 6c 22 3e 0a 3c 68 65 61 64 3e 0a 3c 6d 65 74 61 20 6e 61 6d
## [77] 65 3d 22 67 65 6e 65 72 61 74 6f 72 22 20 63 6f 6e 74 65 6e 74 3d 0a 22 48 54 4d 4c 20 54 69 64 79 20 66 6f 72 20
## [115] 48 54 4d 4c 35 20 66 6f 72 20 52 20 76 65 72 73 69 6f 6e 20 35 2e 30 2e 30 22 20 2f 3e 0a 3c 74 69 74 6c 65 3e 3c
## [153] 2f 74 69 74 6c 65 3e 0a 3c 2f 68 65 61 64 3e 0a 3c 62 6f 64 79 3e 0a 3c 2f 62 6f 64 79 3e 0a 3c 2f 68 74 6d 6c 3e
## [191] 0a
tidy_html(content(res, as="text", encoding="UTF-8"))
## [1] "<!DOCTYPE html>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\n<head>\n<meta name=\"generator\" content=\n\"HTML Tidy for HTML5 for R version 5.0.0\" />\n<style>\n<![CDATA[\nbody { font-family: sans-serif; }\n]]>\n</style>\n<title></title>\n</head>\n<body>\n<b>This is some <i>really</i> poorly formatted HTML as is this\n<span id=\"sp\">portion</span></b>\n<div><span id=\"sp\"></span></div>\n</body>\n</html>\n"
tidy_html(content(res, as="parsed", encoding="UTF-8"))
## {xml_document}
## <html xmlns="http://www.w3.org/1999/xhtml">
## [1] <head>\n <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n <meta name="generator" content ...
## [2] <body>\n<b>This is some <i>really</i> poorly formatted HTML as is this\n<span id="sp">portion</span></b>\n</body>
tidy_html(htmlParse("http://rud.is/test/untidy.html"))
## <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
## <html xmlns="http://www.w3.org/1999/xhtml">
## <head>
## <meta name="generator" content="HTML Tidy for HTML5 for R version 5.0.0">
## <style>
## <![CDATA[
## body { font-family: sans-serif; }
## ]]>
## </style>
## <title></title>
## </head>
## <body>
## <b>This is some <i>really</i> poorly formatted HTML as is this
## <span id="sp">portion</span></b>
## <div><span id="sp"></span></div>
## </body>
## </html>
##
```
### Testing Options
@ -75,7 +157,7 @@ opts <- list(TidyDocType="html5",
TidyWrapLen=200)
txt <- "<html>
<head>
<head>
<style>
p { color: red; }
</style>
@ -90,21 +172,36 @@ txt <- "<html>
</html>"
cat(tidy_html(txt, option=opts))
#> <!DOCTYPE html>
#> <html>
#> <head>
#> <meta name="generator" content="HTML Tidy for HTML5 for R version 5.0.0">
#> <style>
#> p { color: red; }
#> </style>
#> <title></title>
#> </head>
#> <body>
#> <p>Test</p>
#> </body>
#> </html>
## <!DOCTYPE html>
## <html>
## <head>
## <meta name="generator" content="HTML Tidy for HTML5 for R version 5.0.0">
## <style>
## p { color: red; }
## </style>
## <title></title>
## </head>
## <body>
## <p>Test</p>
## </body>
## </html>
```
But, you're probably better off running it on plain HTML source.
Since it's C/C++-backed, it's pretty fast:
``` r
book <- readLines("http://singlepageappbook.com/single-page.html")
sum(map_int(book, nchar))
## [1] 207501
system.time(tidy_book <- tidy_html(book))
## user system elapsed
## 0.022 0.001 0.022
```
(It's usually between 20 & 25 milliseconds to process those 202 kilobytes of HTML.) Not too shabby.
### Code of Conduct
Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms.

22
inst/COPYRIGHTS

@ -0,0 +1,22 @@
All R source code and source file src/htmltidy.cpp are released under the GNU AGPL license.
As per https://github.com/htacg/tidy-html5/blob/master/README/LICENSE.md, libtidy source
code is:
"Copyright (c) 1998-2016 World Wide Web Consortium (Massachusetts Institute of
Technology, European Research Consortium for Informatics and Mathematics, Keio University).
All Rights Reserved.
Additional contributions (c) 2001-2016 University of Toronto, Terry Teague, @geoffmcl,
HTACG, and others."
The authors of the libtidy sources also used other source code that is licensed GPL-2.
All licenses in the included source files have been left intact. As required by the
libtidy copyright notice, the following files are noted as being modified for use in
this package:
- src/alloc.cpp
- src/sprtf.cpp
Both of those files have been marked as modified in the license copyright header.

10
man/htmltidy.Rd

@ -4,11 +4,15 @@
\name{htmltidy}
\alias{htmltidy}
\alias{htmltidy-package}
\title{Clean Up Gnarly HTML/XML}
\title{Clean Up or Pretty Print Gnarly HTML and XHTML}
\description{
HTML documents can be beautiful and pristine. They can also be
wretched, evil, malformed hellspawn. Now, you can tidy up that HTML and XML before
processing it with your favorite angle-bracket parsing tools.
wretched, evil, malformed demon-spawn. Now, you can tidy up that HTML and XHTML
before processing it with your favorite angle-bracket crunching tools, going beyond
the limited tidying that 'libxml2' affords in the 'XML' and 'xml2' packages and
taming even the ugliest HTML code generated by the likes of Google Docs and Microsoft
Word. It's also possible to use the functions provided to format or "pretty print"
HTML content as it is being tidied.
}
\author{
Bob Rudis (bob@rud.is)

20
man/tidy_html.Rd

@ -4,10 +4,11 @@
\alias{tidy_html}
\alias{tidy_html.HTMLInternalDocument}
\alias{tidy_html.character}
\alias{tidy_html.connection}
\alias{tidy_html.default}
\alias{tidy_html.raw}
\alias{tidy_html.xml_document}
\title{Tidy HTML/XHTML Documents}
\title{Tidy or "Pretty Print" HTML/XHTML Documents}
\usage{
tidy_html(content, options = list(TidyXhtmlOut = TRUE))
@ -21,6 +22,8 @@ tidy_html(content, options = list(TidyXhtmlOut = TRUE))
\method{tidy_html}{HTMLInternalDocument}(content, options = list(TidyXhtmlOut
= TRUE))
\method{tidy_html}{connection}(content, options = list(TidyXhtmlOut = TRUE))
}
\arguments{
\item{content}{accepts a character vector, raw vector or parsed content from the \code{xml2}
@ -29,7 +32,8 @@ or \code{XML} packages.}
\item{options}{named list of options}
}
\value{
Tidied HTML/XHTML content. The object type will be the same as that of the input type.
Tidied HTML/XHTML content. The object type will be the same as that of the input type
except when it is a \code{connection}, then a character vector will be returned.
}
\description{
Pass in HTML content as either plain or raw text or parsed objects (either with the
@ -87,6 +91,18 @@ txt <- paste0(
collapse="")
cat(tidy_html(txt, option=opts))
library(httr)
res <- GET("http://rud.is/test/untidy.html")
# look at the original, un-tidy source
cat(content(res, as="text"))
# see the tidied version
cat(tidy_html(content(res, as="text"), list(TidyDocType="html5", TidyWrapLen=200)))
# but, you could also just do:
cat(tidy_html(url("http://rud.is/test/untidy.html")))
}
\references{
\url{http://api.html-tidy.org/tidy/quickref_5.1.25.html} &

2
src/alloc.cpp

@ -5,6 +5,8 @@
(c) 1998-2006 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
Modified 2016-09-09 by Bob Rudis for the R package htmltidy
*/
/* #define DEBUG_MEMORY very NOISY extra DEBUG of memory allocation, reallocation and free */

3
src/sprtf.cpp

@ -23,6 +23,9 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, US
*
*
* Modified 2016-09-09 by Bob Rudis for the R package htmltidy
*
*/
#ifdef _MSC_VER

6
src/tidylib.c

@ -16,6 +16,8 @@
Created 2001-05-20 by Charles Reitzel
Modified 2016-09-09 by Bob Rudis for the R package htmltidy
*/
#include <errno.h>
@ -1073,10 +1075,6 @@ int tidyDocSaveStdout( TidyDocImpl* doc )
{
#if !defined(NO_SETMODE_SUPPORT)
#if defined(_WIN32) || defined(OS2_OS)
int oldstdoutmode = -1, oldstderrmode = -1;
#endif
#endif
int status = 0;
// uint outenc = cfg( doc, TidyOutCharEncoding );

Loading…
Cancel
Save