diff --git a/DESCRIPTION b/DESCRIPTION index fbea740..4027920 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -36,8 +36,8 @@ Description: HTML documents can be beautiful and pristine. They can also be for more information about 'vkbeautify' and 'XMLDisplay', respectively. Copyright: file inst/COPYRIGHTS -URL: https://github.com/hrbrmstr/htmltidy -BugReports: https://github.com/hrbrmstr/htmltidy/issues +URL: https://gitlab.com/hrbrmstr/htmltidy +BugReports: https://gitlab.com/hrbrmstr/htmltidy/issues Depends: R (>= 3.2.0) License: MIT + file LICENSE @@ -54,4 +54,4 @@ Imports: XML, htmlwidgets, htmltools -RoxygenNote: 6.0.1.9000 +RoxygenNote: 6.1.1 diff --git a/README.Rmd b/README.Rmd index 4c883c3..4b4c57d 100644 --- a/README.Rmd +++ b/README.Rmd @@ -1,30 +1,19 @@ --- output: rmarkdown::github_document +editor_options: + chunk_output_type: console --- -[![Travis-CI Build Status](https://travis-ci.org/hrbrmstr/htmltidy.svg?branch=master)](https://travis-ci.org/hrbrmstr/htmltidy) -[![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/hrbrmstr/htmltidy?branch=master&svg=true)](https://ci.appveyor.com/project/hrbrmstr/htmltidy) -[![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/htmltidy)](https://cran.r-project.org/package=htmltidy) -![downloads](http://cranlogs.r-pkg.org/badges/grand-total/htmltidy) - - - -```{r, echo = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "##", - message = FALSE, - warning = FALSE, - error = FALSE, - fig.retina=2, - fig.path = "README-" -) +```{r pkg-knitr-opts, include=FALSE} +hrbrpkghelpr::global_opts() ``` -# htmltidy - -Tidy Up and Test XPath Queries on HTML and XML Content +```{r badges, results='asis', echo=FALSE, cache=FALSE} +hrbrpkghelpr::stinking_badges() +``` -## Description +```{r description, results='asis', echo=FALSE, cache=FALSE} +hrbrpkghelpr::yank_title_and_description() +``` Partly inspired by [this SO question](http://stackoverflow.com/questions/37061873/identify-a-weblink-in-bold-in-r) and because there's a great deal of cruddy HTML out there that needs fixing to use properly when scraping data. @@ -32,29 +21,21 @@ It relies on a locally included version of [`libtidy`](http://www.html-tidy.org/ It also incorporates an `htmlwidget` to view and test XPath queries on HTML/XML content and another widget to view an XML document in a collapseable tree view. -## What's inside the tin? - -The following functions are implemented: +## What's Inside The Tin -- `tidy_html`: Tidy or "Pretty Print" HTML/XHTML Documents -- `html_view`: HTML/XML pretty printer and viewer -- `xml_view`: HTML/XML pretty printer and viewer -- `html_tree_view`: HTML/XML tree viewer -- `xml_tree_view`: HTML/XML tree viewer +```{r ingredients, results='asis', echo=FALSE, cache=FALSE} +hrbrpkghelpr::describe_ingredients() +``` ## Installation -```{r eval=FALSE} -devtools::install_github("hrbrmstr/htmltidy") -``` - -```{r echo=FALSE} -options(width=120) +```{r install-ex, results='asis', echo=FALSE, cache=FALSE} +hrbrpkghelpr::install_block() ``` ## Usage -```{r message=FALSE, warning=FALSE} +```{r usage} library(htmltidy) # current verison @@ -68,7 +49,7 @@ library(purrr) This is really "un-tidy" content: -```{r message=FALSE, warning=FALSE} +```{r untidy-01} res <- GET("https://rud.is/test/untidy.html") cat(content(res, as="text")) ``` @@ -77,52 +58,51 @@ Let's see what `tidy_html()` does to it. It can handle the `response` object directly: -```{r message=FALSE, warning=FALSE} +```{r tidy-01} cat(tidy_html(res, list(TidyDocType="html5", TidyWrapLen=200))) ``` But, you'll probably mostly use it on HTML you've identified as gnarly and already have that HTML text content handy: -```{r message=FALSE, warning=FALSE} +```{r options-01} cat(tidy_html(content(res, as="text"), list(TidyDocType="html5", TidyWrapLen=200))) ``` NOTE: you could also just have done: -```{r message=FALSE, warning=FALSE} +```{r options-02} cat(tidy_html(url("https://rud.is/test/untidy.html"), list(TidyDocType="html5", TidyWrapLen=200))) ``` You'll see that this differs substantially from the mangling `libxml2` does (via `read_html()`): -```{r message=FALSE, warning=FALSE} +```{r options-03} pg <- read_html("https://rud.is/test/untidy.html") cat(toString(pg)) ``` It can also deal with "raw" and parsed objects: -```{r message=FALSE, warning=FALSE} +```{r raw-01} tidy_html(content(res, as="raw")) tidy_html(content(res, as="text", encoding="UTF-8")) tidy_html(content(res, as="parsed", encoding="UTF-8")) -tidy_html(htmlParse("https://rud.is/test/untidy.html")) +tidy_html(suppressWarnings(htmlParse("https://rud.is/test/untidy.html"))) ``` And, show the markup errors: -```{r message=FALSE, warning=FALSE} +```{r errors-01} invisible(tidy_html(url("https://rud.is/test/untidy.html"), verbose=TRUE)) ``` ## Testing Options -```{r message=FALSE, warning=FALSE} - +```{r more-options-01} opts <- list(TidyDocType="html5", TidyMakeClean=TRUE, TidyHideComments=TRUE, @@ -145,14 +125,13 @@ txt <- " " cat(tidy_html(txt, option=opts)) - ``` But, you're probably better off running it on plain HTML source. Since it's C/C++-backed, it's pretty fast: -```{r message=FALSE, warning=FALSE} +```{r speed-01} book <- readLines("http://singlepageappbook.com/single-page.html") sum(map_int(book, nchar)) system.time(tidy_book <- tidy_html(book)) @@ -162,11 +141,10 @@ system.time(tidy_book <- tidy_html(book)) ## htmltidy Metrics -```{r echo=FALSE} +```{r cloc, echo=FALSE} cloc::cloc_pkg_md() ``` ## Code of Conduct -Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). -By participating in this project you agree to abide by its terms. +Please note that this project is released with a Contributor Code of Conduct. By participating in this project you agree to abide by its terms. diff --git a/README.md b/README.md index a89d5a6..a4e9dd1 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,24 @@ -[![Travis-CI Build +[![Project Status: Active – The project has reached a stable, usable +state and is being actively +developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) +[![Signed +by](https://img.shields.io/badge/Keybase-Verified-brightgreen.svg)](https://keybase.io/hrbrmstr) +![Signed commit +%](https://img.shields.io/badge/Signed_Commits-9.26%25-lightgrey.svg) +[![Linux build Status](https://travis-ci.org/hrbrmstr/htmltidy.svg?branch=master)](https://travis-ci.org/hrbrmstr/htmltidy) -[![AppVeyor Build -Status](https://ci.appveyor.com/api/projects/status/github/hrbrmstr/htmltidy?branch=master&svg=true)](https://ci.appveyor.com/project/hrbrmstr/htmltidy) -[![CRAN\_Status\_Badge](http://www.r-pkg.org/badges/version/htmltidy)](https://cran.r-project.org/package=htmltidy) -![downloads](http://cranlogs.r-pkg.org/badges/grand-total/htmltidy) - - +[![Windows build +status](https://ci.appveyor.com/api/projects/status/github/hrbrmstr/htmltidy?svg=true)](https://ci.appveyor.com/project/hrbrmstr/htmltidy) +[![Coverage +Status](https://codecov.io/gh/hrbrmstr/htmltidy/branch/master/graph/badge.svg)](https://codecov.io/gh/hrbrmstr/htmltidy) +[![cran +checks](https://cranchecks.info/badges/worst/htmltidy)](https://cranchecks.info/pkgs/htmltidy) +[![CRAN +status](https://www.r-pkg.org/badges/version/htmltidy)](https://www.r-pkg.org/pkg/htmltidy) +![Minimal R +Version](https://img.shields.io/badge/R%3E%3D-3.2.0-blue.svg) +![License](https://img.shields.io/badge/License-MIT-blue.svg) # htmltidy @@ -14,6 +26,23 @@ Tidy Up and Test XPath Queries on HTML and XML Content ## Description +HTML documents can be beautiful and pristine. They can also be wretched, +evil, malformed demon-spawn. Now, you can tidy up that HTML and XHTML +before processing it with your favorite angle-bracket crunching tools, +going beyond the limited tidying that ‘libxml2’ affords in the ‘XML’ and +‘xml2’ packages and taming even the ugliest HTML code generated by the +likes of Google Docs and Microsoft Word. It’s also possible to use the +functions provided to format or “pretty print” HTML content as it is +being tidied. Utilities are also included that make it possible to view +formatted and “pretty printed” HTML/XML content from HTML/XML document +objects, nodes, node sets and plain character HTML/XML using +‘vkbeautify’ (by Vadim Kiryukhin) and ‘highlight.js’ (by Ivan +Sagalaev). Also (optionally) enables filtering of nodes via XPath or +viewing an HTML/XML document in “tree” view using ‘XMLDisplay’ (by Lev +Muchnik). See and + +for more information about ‘vkbeautify’ and ‘XMLDisplay’, respectively. + Partly inspired by [this SO question](http://stackoverflow.com/questions/37061873/identify-a-weblink-in-bold-in-r) and because there’s a great deal of cruddy HTML out there that needs @@ -27,22 +56,33 @@ It also incorporates an `htmlwidget` to view and test XPath queries on HTML/XML content and another widget to view an XML document in a collapseable tree view. -## What’s inside the tin? +## What’s Inside The Tin -The following functions are implemented: - - - `tidy_html`: Tidy or “Pretty Print” HTML/XHTML Documents - - `html_view`: HTML/XML pretty printer and viewer - - `xml_view`: HTML/XML pretty printer and viewer - - `html_tree_view`: HTML/XML tree viewer + - `highlight_styles`: List available HTML/XML highlight styles + - `renderXmlview`: Widget render function for use in Shiny + - `tidy_html.response`: Tidy or “Pretty Print” HTML/XHTML Documents - `xml_tree_view`: HTML/XML tree viewer + - `xml_view`: HTML/XML pretty printer and viewer + - `xmltreeview-shiny`: Shiny bindings for xmltreeview + - `xmlviewOutput`: Widget output function for use in Shiny ## Installation ``` r -devtools::install_github("hrbrmstr/htmltidy") +remotes::install_git("https://git.rud.is/hrbrmstr/htmltidy.git") +# or +remotes::install_git("https://git.sr.ht/~hrbrmstr/htmltidy") +# or +remotes::install_gitlab("hrbrmstr/htmltidy") +# or +remotes::install_bitbucket("hrbrmstr/htmltidy") +# or +remotes::install_github("hrbrmstr/htmltidy") ``` +NOTE: To use the ‘remotes’ install options you will need to have the +[{remotes} package](https://github.com/r-lib/remotes) installed. + ## Usage ``` r @@ -101,8 +141,7 @@ cat(tidy_html(res, list(TidyDocType="html5", TidyWrapLen=200))) ``` But, you’ll probably mostly use it on HTML you’ve identified as gnarly -and already have that HTML text content -handy: +and already have that HTML text content handy: ``` r cat(tidy_html(content(res, as="text"), list(TidyDocType="html5", TidyWrapLen=200))) @@ -180,12 +219,12 @@ tidy_html(content(res, as="text", encoding="UTF-8")) ## [1] "\n\n\n\n\n\n\n\nThis is some really poorly formatted HTML as is this\nportion\n
\n\n\n" tidy_html(content(res, as="parsed", encoding="UTF-8")) -## {xml_document} +## {html_document} ## ## [1] \n\nportion\n
## ## @@ -199,8 +238,7 @@ tidy_html(htmlParse("https://rud.is/test/untidy.html")) ## ``` -And, show the markup -errors: +And, show the markup errors: ``` r invisible(tidy_html(url("https://rud.is/test/untidy.html"), verbose=TRUE)) @@ -220,7 +258,6 @@ invisible(tidy_html(url("https://rud.is/test/untidy.html"), verbose=TRUE)) ## Testing Options ``` r - opts <- list(TidyDocType="html5", TidyMakeClean=TRUE, TidyHideComments=TRUE, @@ -268,25 +305,24 @@ sum(map_int(book, nchar)) ## [1] 207501 system.time(tidy_book <- tidy_html(book)) ## user system elapsed -## 0.026 0.001 0.027 +## 0.019 0.000 0.019 ``` (It’s usually between 20 & 25 milliseconds to process those 202 -kilobytes of HTML.) Not too -shabby. +kilobytes of HTML.) Not too shabby. ## htmltidy Metrics | Lang | \# Files | (%) | LoC | (%) | Blank lines | (%) | \# Lines | (%) | | :----------- | -------: | ---: | ----: | ---: | ----------: | ---: | -------: | ---: | -| C | 27 | 0.34 | 28646 | 0.81 | 4696 | 0.77 | 4304 | 0.59 | -| C/C++ Header | 37 | 0.47 | 5799 | 0.16 | 1227 | 0.20 | 2674 | 0.36 | +| C | 27 | 0.34 | 28639 | 0.81 | 4700 | 0.77 | 4325 | 0.59 | +| C/C++ Header | 37 | 0.47 | 5796 | 0.16 | 1227 | 0.20 | 2677 | 0.36 | | C++ | 4 | 0.05 | 647 | 0.02 | 117 | 0.02 | 64 | 0.01 | | R | 10 | 0.13 | 151 | 0.00 | 38 | 0.01 | 235 | 0.03 | -| Rmd | 1 | 0.01 | 53 | 0.00 | 51 | 0.01 | 68 | 0.01 | +| Rmd | 1 | 0.01 | 47 | 0.00 | 44 | 0.01 | 59 | 0.01 | ## Code of Conduct -Please note that this project is released with a [Contributor Code of -Conduct](CONDUCT.md). By participating in this project you agree to -abide by its terms. +Please note that this project is released with a Contributor Code of +Conduct. By participating in this project you agree to abide by its +terms. diff --git a/man/tidy_html.Rd b/man/tidy_html.Rd index e95f4b4..ca4b6ef 100644 --- a/man/tidy_html.Rd +++ b/man/tidy_html.Rd @@ -11,28 +11,29 @@ \alias{tidy_html.connection} \title{Tidy or "Pretty Print" HTML/XHTML Documents} \usage{ -\method{tidy_html}{response}(content, options = list(TidyXhtmlOut = TRUE), - verbose = FALSE) +\method{tidy_html}{response}(content, options = list(TidyXhtmlOut = + TRUE), verbose = FALSE) -tidy_html(content, options = list(TidyXhtmlOut = TRUE), verbose = FALSE) +tidy_html(content, options = list(TidyXhtmlOut = TRUE), + verbose = FALSE) \method{tidy_html}{default}(content, options = list(TidyXhtmlOut = TRUE), verbose = FALSE) -\method{tidy_html}{character}(content, options = list(TidyXhtmlOut = TRUE), - verbose = FALSE) +\method{tidy_html}{character}(content, options = list(TidyXhtmlOut = + TRUE), verbose = FALSE) \method{tidy_html}{raw}(content, options = list(TidyXhtmlOut = TRUE), verbose = FALSE) -\method{tidy_html}{xml_document}(content, options = list(TidyXhtmlOut = TRUE), - verbose = FALSE) +\method{tidy_html}{xml_document}(content, options = list(TidyXhtmlOut = + TRUE), verbose = FALSE) -\method{tidy_html}{HTMLInternalDocument}(content, options = list(TidyXhtmlOut - = TRUE), verbose = FALSE) +\method{tidy_html}{HTMLInternalDocument}(content, + options = list(TidyXhtmlOut = TRUE), verbose = FALSE) -\method{tidy_html}{connection}(content, options = list(TidyXhtmlOut = TRUE), - verbose = FALSE) +\method{tidy_html}{connection}(content, options = list(TidyXhtmlOut = + TRUE), verbose = FALSE) } \arguments{ \item{content}{accepts a character vector, raw vector or parsed content from the \code{xml2} diff --git a/src/.DS_Store b/src/.DS_Store deleted file mode 100644 index 5008ddf..0000000 Binary files a/src/.DS_Store and /dev/null differ diff --git a/src/Makevars b/src/Makevars index 08146a1..c55fead 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,3 +1,3 @@ -PKG_CPPFLAGS = -I. -PKG_CXXFLAGS = -I. +PKG_CPPFLAGS = -I. -Wgnu-flexible-array-initializer -Wpedantic +PKG_CXXFLAGS = -I. -Wgnu-flexible-array-initializer -Wpedantic PKG_LIBS = -L.