diff --git a/.travis.yml b/.travis.yml index f3a81e0..3b3e9d4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,8 +1,16 @@ language: R -sudo: false +sudo: required cache: packages r: - oldrel - release - devel + +notifications: + email: + - bob@rud.is + irc: + channels: + - "104.236.112.222#builds" + nick: travisci diff --git a/DESCRIPTION b/DESCRIPTION index 874df0c..f61e344 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,8 +3,11 @@ Title: Clean Up or Pretty Print Gnarly HTML and XHTML Version: 0.3.0 Authors@R: c( person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre")), - person("Dave", "Dave", email = "dsr@w3.org", role = c("ctb", "aut"), - comment="HTML Tidy library") + person("Dave", "Raggett", email = "dsr@w3.org", role = c("ctb", "aut"), + comment="Original HTML Tidy library"), + person("Charles", "Reitzel", role = c("ctb", "aut"), + comment="Modern HTML Tidy library"), + person("Björn", "Höhrmann", role = c("ctb", "aut"), comment="HTML5 Support") ) Maintainer: Bob Rudis Description: HTML documents can be beautiful and pristine. They can also be diff --git a/NAMESPACE b/NAMESPACE index 76b2f65..2905abc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,7 @@ S3method(tidy_html,character) S3method(tidy_html,connection) S3method(tidy_html,default) S3method(tidy_html,raw) +S3method(tidy_html,response) S3method(tidy_html,xml_document) export(tidy_html) import(XML) diff --git a/NEWS.md b/NEWS.md index 402e2ab..110754c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,8 @@ # htmltidy 0.3.0 -* Better error handling (fixed crashing bug) +* Better error handling (fixed crashing bug in #1) * New option to display document errors +* Support for directly tidying httr::response objects # htmltidy 0.2.0 diff --git a/R/response.r b/R/response.r new file mode 100644 index 0000000..7ee4901 --- /dev/null +++ b/R/response.r @@ -0,0 +1,17 @@ +#' @param encoding specify the encoding when tidying an \code{httr} \code{response} +#' object. Default to "\code{UTF-8}". 
+#' @export +#' @rdname tidy_html +tidy_html.response <- function(content, options=list(TidyXhtmlOut=TRUE), + verbose=FALSE, encoding="UTF-8") { + + if (!isTRUE(grepl("html", content$headers[["content-type"]]))) { # isTRUE(): a missing header yields logical(0), reject it with the message below + stop("htmltidy only parses HTML content from httr::response objects", + call.=FALSE) + } + + html_txt <- httr::content(content, as="text", encoding=encoding) + + tidy_html(html_txt, options=options, verbose=verbose) # forward the caller's options/verbose instead of silently dropping them + +} diff --git a/R/tidy.r b/R/tidy.r index de5350f..3739d5b 100644 --- a/R/tidy.r +++ b/R/tidy.r @@ -1,9 +1,9 @@ #' Tidy or "Pretty Print" HTML/XHTML Documents #' #' Pass in HTML content as either plain or raw text or parsed objects (either with the -#' \code{XML} or \code{xml2} packages) along with an options list that specifies how -#' the content will be tidied and get back tidied content of the same object type as passed -#' in to the function. +#' \code{XML} or \code{xml2} packages) or as an \code{httr} \code{response} object +#' along with an options list that specifies how the content will be tidied and get back +#' tidied content of the same object type as passed in to the function. #' #' The default option \code{TixyXhtmlOut} will convert the input content to XHTML. #' @@ -147,3 +147,4 @@ tidy_html.connection <- function(content, options=list(TidyXhtmlOut=TRUE), source=html, options=options, show_errors=verbose) } + diff --git a/README.Rmd b/README.Rmd index acdd410..a2b6483 100644 --- a/README.Rmd +++ b/README.Rmd @@ -59,7 +59,15 @@ res <- GET("http://rud.is/test/untidy.html") cat(content(res, as="text")) ``` -Let's see what `tidy_html()` does to it: +Let's see what `tidy_html()` does to it. 
+ +It can handle the `response` object directly: + +```{r message=FALSE, warning=FALSE} +cat(tidy_html(res, list(TidyDocType="html5", TidyWrapLen=200))) +``` + +But, you'll probably mostly use it on HTML you've identified as gnarly and already have that HTML text content handy: ```{r message=FALSE, warning=FALSE} cat(tidy_html(content(res, as="text"), list(TidyDocType="html5", TidyWrapLen=200))) diff --git a/README.md b/README.md index 74b771d..f7b873f 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,33 @@ cat(content(res, as="text")) ## as is this portion
``` -Let's see what `tidy_html()` does to it: +Let's see what `tidy_html()` does to it. + +It can handle the `response` object directly: + +``` r +cat(tidy_html(res, list(TidyDocType="html5", TidyWrapLen=200))) +## +## +## +## +## +## +## +## +## This is some really poorly formatted HTML as is this +## portion +##
+## +## +``` + +But, you'll probably mostly use it on HTML you've identified as gnarly and already have that HTML text content handy: ``` r cat(tidy_html(content(res, as="text"), list(TidyDocType="html5", TidyWrapLen=200))) @@ -214,7 +240,7 @@ sum(map_int(book, nchar)) ## [1] 207501 system.time(tidy_book <- tidy_html(book)) ## user system elapsed -## 0.022 0.002 0.024 +## 0.021 0.001 0.022 ``` (It's usually between 20 & 25 milliseconds to process those 202 kilobytes of HTML.) Not too shabby. diff --git a/cran-comments.md b/cran-comments.md index 95beec2..efe9d58 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,7 +1,7 @@ ## Test environments * local OS X install, R 3.3.1 (clang) -* ubuntu 12.04 (on travis-ci), R 3.3.1 +* ubuntu 12.04 (on travis-ci), oldrel, release & devel * win-builder (devel and release) * local 32-bit Windows 10 R 3.3.1 * local ubuntu 14.04 R 3.3.1 & R-devel @@ -17,7 +17,10 @@ This is a new release, so there are no reverse dependencies. --- -There is an inst/COPYRIGHTS file. The libtidy library is used in a plethora -of other packages/modules (see Perl, Python & npm for starters) and most -don't even bother acknowledging the efforts of the HTML Tidy working group -but I wanted to make sure their efforts were credited appropriately. +This fixes a fairly nasty bug that was +user-identified fairly early after release +but I didn't want to bug the CRAN team +so quickly after the CRAN acceptance. This +also adds new functionality and (optionally) +provides more information on the tidying +process. 
diff --git a/man/tidy_html.Rd b/man/tidy_html.Rd index f5e3b58..a262763 100644 --- a/man/tidy_html.Rd +++ b/man/tidy_html.Rd @@ -1,15 +1,19 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/tidy.r -\name{tidy_html} +% Please edit documentation in R/response.r, R/tidy.r +\name{tidy_html.response} \alias{tidy_html} \alias{tidy_html.HTMLInternalDocument} \alias{tidy_html.character} \alias{tidy_html.connection} \alias{tidy_html.default} \alias{tidy_html.raw} +\alias{tidy_html.response} \alias{tidy_html.xml_document} \title{Tidy or "Pretty Print" HTML/XHTML Documents} \usage{ +\method{tidy_html}{response}(content, options = list(TidyXhtmlOut = TRUE), + verbose = FALSE, encoding = "UTF-8") + tidy_html(content, options = list(TidyXhtmlOut = TRUE), verbose = FALSE) \method{tidy_html}{default}(content, options = list(TidyXhtmlOut = TRUE), @@ -37,6 +41,9 @@ or \code{XML} packages.} \item{options}{named list of options} \item{verbose}{output document errors? (default: \code{FALSE})} + +\item{encoding}{specify the encoding when tidying an \code{httr} \code{response} +object. Default to "\code{UTF-8}".} } \value{ Tidied HTML/XHTML content. The object type will be the same as that of the input type @@ -44,9 +51,9 @@ Tidied HTML/XHTML content. The object type will be the same as that of the input } \description{ Pass in HTML content as either plain or raw text or parsed objects (either with the -\code{XML} or \code{xml2} packages) along with an options list that specifies how -the content will be tidied and get back tidied content of the same object type as passed -in to the function. +\code{XML} or \code{xml2} packages) or as an \code{httr} \code{response} object +along with an options list that specifies how the content will be tidied and get back +tidied content of the same object type as passed in to the function. } \details{ The default option \code{TixyXhtmlOut} will convert the input content to XHTML. 
diff --git a/tests/testthat/test-htmltidy.R b/tests/testthat/test-htmltidy.R index 87d1521..01c0f46 100644 --- a/tests/testthat/test-htmltidy.R +++ b/tests/testthat/test-htmltidy.R @@ -1,7 +1,9 @@ context("basic functionality") test_that("tidying works", { - expect_gte(nchar(tidy_html("

google >

")), - 249) + th <- tidy_html("

google >

") + + expect_gte(nchar(th), 249) + expect_equivalent(grepl("HTML Tidy for HTML5 for R", th), TRUE) })