handle response objects

8 years ago · 2c8bb271bd
11 changed files with 99 additions and 22 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -1,8 +1,16 @@
 language: R
-sudo: false
+sudo: required
 cache: packages
 r:
 - oldrel
 - release
 - devel
 notifications:
  email:
    - bob@rud.is
  irc:
    channels:
      - "104.236.112.222#builds"
    nick: travisci
--- a/DESCRIPTION
+++ b/DESCRIPTION
@ -3,8 +3,11 @@ Title: Clean Up or Pretty Print Gnarly HTML and XHTML
 Version: 0.3.0
 Authors@R: c(
  person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre")),
-  person("Dave", "Dave", email = "dsr@w3.org", role = c("ctb", "aut"),
+  person("Dave", "Raggett", email = "dsr@w3.org", role = c("ctb", "aut"),
-         comment="HTML Tidy library")
+         comment="Original HTML Tidy library"),
  person("Charles", "Reitzel", role = c("ctb", "aut"),
         comment="Modern HTML Tidy library"),
  person("Björn", "Höhrmann", role = c("ctb", "aut"), comment="HTML5 Support")
  )
 Maintainer: Bob Rudis <bob@rud.is>
 Description: HTML documents can be beautiful and pristine. They can also be
--- a/NAMESPACE
+++ b/NAMESPACE
@ -5,6 +5,7 @@ S3method(tidy_html,character)
 S3method(tidy_html,connection)
 S3method(tidy_html,default)
 S3method(tidy_html,raw)
 S3method(tidy_html,response)
 S3method(tidy_html,xml_document)
 export(tidy_html)
 import(XML)
--- a/NEWS.md
+++ b/NEWS.md
@ -1,7 +1,8 @@
 # htmltidy 0.3.0
-* Better error handling (fixed crashing bug)
+* Better error handling (fixed crashing bug in #1)
 * New option to display document errors
 * Support for directly tidying httr::response objects
 # htmltidy 0.2.0
--- a/R/response.r
+++ b/R/response.r
@ -0,0 +1,17 @@
 #' @param encoding specify the encoding used to decode the body when tidying an
 #'   \code{httr} \code{response} object. Defaults to "\code{UTF-8}".
 #' @export
 #' @rdname tidy_html
 tidy_html.response <- function(content, options=list(TidyXhtmlOut=TRUE),
                                verbose=FALSE, encoding="UTF-8") {

   # A response without a Content-Type header yields NULL here; grepl() on NULL
   # returns logical(0), which would make `if` fail with an unhelpful
   # "argument is of length zero" error instead of the message below.
   ctype <- content$headers[["content-type"]]

   # Media type names are case-insensitive, so match "html" regardless of case.
   is_html <- !is.null(ctype) && any(grepl("html", ctype, ignore.case=TRUE))

   if (!is_html) {
     stop("htmltidy only parses HTML content from httr::response objects",
          call.=FALSE)
   }

   # Decode the response body to a single character string.
   html_txt <- httr::content(content, as="text", encoding=encoding)

   # Forward the caller's settings; without this the defaults of the method
   # tidy_html() dispatches to would silently replace `options` and `verbose`.
   tidy_html(html_txt, options=options, verbose=verbose)

 }
--- a/R/tidy.r
+++ b/R/tidy.r
@ -1,9 +1,9 @@
 #' Tidy or "Pretty Print" HTML/XHTML Documents
 #'
 #' Pass in HTML content as either plain or raw text or parsed objects (either with the
-#' \code{XML} or \code{xml2} packages) along with an options list that specifies how
+#' \code{XML} or \code{xml2} packages) or as an \code{httr} \code{response} object
-#' the content will be tidied and get back tidied content of the same object type as passed
+#' along with an options list that specifies how the content will be tidied and get back
-#' in to the function.
+#' tidied content of the same object type as passed in to the function.
 #'
 #' The default option \code{TidyXhtmlOut} will convert the input content to XHTML.
 #'
@ -147,3 +147,4 @@ tidy_html.connection <- function(content, options=list(TidyXhtmlOut=TRUE),
        source=html, options=options, show_errors=verbose)
 }
--- a/README.Rmd
+++ b/README.Rmd
@ -59,7 +59,15 @@ res <- GET("http://rud.is/test/untidy.html")
 cat(content(res, as="text"))
 ```
-Let's see what `tidy_html()` does to it:
+Let's see what `tidy_html()` does to it.
 It can handle the `response` object directly:
 ```{r message=FALSE, warning=FALSE}
 cat(tidy_html(res, list(TidyDocType="html5", TidyWrapLen=200)))
 ```
 But, you'll probably mostly use it on HTML you've identified as gnarly and already have that HTML text content handy:
 ```{r message=FALSE, warning=FALSE}
 cat(tidy_html(content(res, as="text"), list(TidyDocType="html5", TidyWrapLen=200)))
--- a/README.md
+++ b/README.md
@ -49,7 +49,33 @@ cat(content(res, as="text"))
 ## as is this <span id="sp">portion<div>
 ```
-Let's see what `tidy_html()` does to it:
+Let's see what `tidy_html()` does to it.
 It can handle the `response` object directly:
 ``` r
 cat(tidy_html(res, list(TidyDocType="html5", TidyWrapLen=200)))
 ## <!DOCTYPE html>
 ## <html xmlns="http://www.w3.org/1999/xhtml">
 ## <head>
 ## <meta name="generator" content=
 ## "HTML Tidy for HTML5 for R version 5.0.0" />
 ## <style>
 ## <![CDATA[
 ## body { font-family: sans-serif; }
 ## ]]>
 ## </style>
 ## <title></title>
 ## </head>
 ## <body>
 ## <b>This is some <i>really</i> poorly formatted HTML as is this
 ## <span id="sp">portion</span></b>
 ## <div><span id="sp"></span></div>
 ## </body>
 ## </html>
 ```
 But, you'll probably mostly use it on HTML you've identified as gnarly and already have that HTML text content handy:
 ``` r
 cat(tidy_html(content(res, as="text"), list(TidyDocType="html5", TidyWrapLen=200)))
@ -214,7 +240,7 @@ sum(map_int(book, nchar))
 ## [1] 207501
 system.time(tidy_book <- tidy_html(book))
 ##    user  system elapsed 
-##   0.022   0.002   0.024
+##   0.021   0.001   0.022
 ```
 (It's usually between 20 & 25 milliseconds to process those 202 kilobytes of HTML.) Not too shabby.
--- a/cran-comments.md
+++ b/cran-comments.md
@ -1,7 +1,7 @@
 ## Test environments
 * local OS X install, R 3.3.1 (clang)
-* ubuntu 12.04 (on travis-ci), R 3.3.1
+* ubuntu 12.04 (on travis-ci), oldrel, release & devel
 * win-builder (devel and release)
 * local 32-bit Windows 10 R 3.3.1
 * local ubuntu 14.04 R 3.3.1 & R-devel
@ -17,7 +17,10 @@ This is a new release, so there are no reverse dependencies.
 ---
-There is an inst/COPYRIGHTS file. The libtidy library is used in a plethora
+This fixes a fairly nasty bug that was 
-of other packages/modules (see Perl, Python & npm for starters) and most 
+user-identified fairly early after release 
-don't even bother acknowledging the efforts of the HTML Tidy working group
+but I didn't want to bug the CRAN team 
-but I wanted to make sure their efforts were credited appropriately.
+so quickly after the CRAN acceptance. This
 also adds new functionality and (optionally)
 provides more information on the tidying 
 process.
--- a/man/tidy_html.Rd
+++ b/man/tidy_html.Rd
@ -1,15 +1,19 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/tidy.r
+% Please edit documentation in R/response.r, R/tidy.r
-\name{tidy_html}
+\name{tidy_html.response}
 \alias{tidy_html}
 \alias{tidy_html.HTMLInternalDocument}
 \alias{tidy_html.character}
 \alias{tidy_html.connection}
 \alias{tidy_html.default}
 \alias{tidy_html.raw}
 \alias{tidy_html.response}
 \alias{tidy_html.xml_document}
 \title{Tidy or "Pretty Print" HTML/XHTML Documents}
 \usage{
 \method{tidy_html}{response}(content, options = list(TidyXhtmlOut = TRUE),
  verbose = FALSE, encoding = "UTF-8")
 tidy_html(content, options = list(TidyXhtmlOut = TRUE), verbose = FALSE)
 \method{tidy_html}{default}(content, options = list(TidyXhtmlOut = TRUE),
@ -37,6 +41,9 @@ or \code{XML} packages.}
 \item{options}{named list of options}
 \item{verbose}{output document errors? (default: \code{FALSE})}
 \item{encoding}{specify the encoding when tidying an \code{httr} \code{response}
 object. Defaults to "\code{UTF-8}".}
 }
 \value{
 Tidied HTML/XHTML content. The object type will be the same as that of the input type
@ -44,9 +51,9 @@ Tidied HTML/XHTML content. The object type will be the same as that of the input
 }
 \description{
 Pass in HTML content as either plain or raw text or parsed objects (either with the
-\code{XML} or \code{xml2} packages) along with an options list that specifies how
+\code{XML} or \code{xml2} packages) or as an \code{httr} \code{response} object
-the content will be tidied and get back tidied content of the same object type as passed
+along with an options list that specifies how the content will be tidied and get back
-in to the function.
+tidied content of the same object type as passed in to the function.
 }
 \details{
 The default option \code{TidyXhtmlOut} will convert the input content to XHTML.
--- a/tests/testthat/test-htmltidy.R
+++ b/tests/testthat/test-htmltidy.R
@ -1,7 +1,9 @@
 context("basic functionality")
 test_that("tidying works", {
-  expect_gte(nchar(tidy_html("<b><p><a href='http://google.com'>google &gt</a></p></b>")),
+  th <- tidy_html("<b><p><a href='http://google.com'>google &gt</a></p></b>")
-             249)
+
  expect_gte(nchar(th), 249)
  expect_equivalent(grepl("HTML Tidy for HTML5 for R", th), TRUE)
 })