diff --git a/NAMESPACE b/NAMESPACE index d36408f..8dba643 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,12 +5,8 @@ S3method(tidy_html,character) S3method(tidy_html,default) S3method(tidy_html,raw) S3method(tidy_html,xml_document) -S3method(tidy_xml,XMLInternalDocument) -S3method(tidy_xml,character) -S3method(tidy_xml,default) -S3method(tidy_xml,raw) -S3method(tidy_xml,xml_document) export(tidy_html) -export(tidy_xml) +import(XML) +import(xml2) importFrom(Rcpp,sourceCpp) useDynLib(htmltidy) diff --git a/R/RcppExports.R b/R/RcppExports.R index d17e027..173cbbb 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,4 +1,4 @@ -# Generated by using Rcpp::compileAttributes() -> do not edit by hand +# This file was generated by Rcpp::compileAttributes # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 tidy_html_int <- function(source, options) { diff --git a/R/htmltidy-package.r b/R/htmltidy-package.r index 7185f72..ab68c8a 100644 --- a/R/htmltidy-package.r +++ b/R/htmltidy-package.r @@ -1,6 +1,6 @@ #' Clean Up Gnarly HTML/XML #' -#' HTML and XML documents can be beautiful and pristine. They can also be +#' HTML documents can be beautiful and pristine. They can also be #' wretched, evil, malformed hellspawn. Now, you can tidy up that HTML and XML before #' processing it with your favorite angle-bracket parsing tools. 
#' @@ -9,4 +9,5 @@ #' @author Bob Rudis (bob@@rud.is) #' @useDynLib htmltidy #' @importFrom Rcpp sourceCpp +#' @import xml2 XML NULL diff --git a/R/tidy.r b/R/tidy.r index 1b2d774..f857395 100644 --- a/R/tidy.r +++ b/R/tidy.r @@ -1,22 +1,68 @@ #' Tidy HTML/XHTML Documents #' -#' Currently supported options: \code{TidyAltText}, \code{TidyBodyOnly}, -#' \code{TidyBreakBeforeBR}, \code{TidyCoerceEndTags}, \code{TidyCoerceEndTags}, -#' \code{TidyDoctype}, \code{TidyDropEmptyElems}, \code{TidyDropEmptyParas}, -#' \code{TidyFixBackslash}, \code{TidyFixComments}, \code{TidyHideComments}, -#' \code{TidyHtmlOut}, \code{TidyIndentContent}, \code{TidyIndentSpaces}, -#' \code{TidyJoinClasses}, \code{TidyJoinStyles}, \code{TidyLogicalEmphasis}, -#' \code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark}, -#' \code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyTabSize}, -#' \code{TidyUpperCaseAttrs}, \code{TidyUpperCaseTags}, \code{TidyWord2000}, -#' \code{TidyWrapLen}, \code{TidyXhtmlOut} +#' Pass in HTML content as either plain or raw text or parsed objects (either with the +#' \code{XML} or \code{xml2} packages) along with an options list that specifies how +#' the content will be tidied and get back tidied content of the same object type as passed +#' in to the function. #' -#' @param content atomic character or raw vector of content to tidy +#' The default option \code{TidyXhtmlOut} will convert the input content to XHTML. 
+#' +#' Currently supported options: +#' +#' \itemize{ +#' \item{Ones taking a logical value: }{\code{TidyAltText}, \code{TidyBodyOnly}, \code{TidyBreakBeforeBR}, +#' \code{TidyCoerceEndTags}, \code{TidyDropEmptyElems}, \code{TidyDropEmptyParas}, +#' \code{TidyFixBackslash}, \code{TidyFixComments}, \code{TidyGDocClean}, \code{TidyHideComments}, +#' \code{TidyHtmlOut}, \code{TidyIndentContent}, \code{TidyJoinClasses}, \code{TidyJoinStyles}, +#' \code{TidyLogicalEmphasis}, \code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark}, +#' \code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyUpperCaseAttrs}, +#' \code{TidyUpperCaseTags}, \code{TidyWord2000}, \code{TidyXhtmlOut}} +#' \item{Ones taking a character value: }{\code{TidyDoctype}, \code{TidyInlineTags}, \code{TidyBlockTags}, +#' \code{TidyEmptyTags}, \code{TidyPreTags}} +#' \item{Ones taking an integer value: }{\code{TidyIndentSpaces}, \code{TidyTabSize}, \code{TidyWrapLen}} +#' } +#' +#' File \href{https://github.com/hrbrmstr/htmltidy/issues}{an issue} if there are other \code{libtidy} +#' options you'd like supported. +#' +#' It is likely that the most used options will be: +#' +#' \itemize{ +#' \item{\code{TidyXhtmlOut} (logical)}, +#' \item{\code{TidyHtmlOut} (logical)} and +#' \item{\code{TidyDocType} which should be one of "\code{omit}", +#' "\code{html5}", "\code{auto}", "\code{strict}" or "\code{loose}"}. +#' } +#' +#' You can clean up Microsoft Word (2000) and Google Docs HTML via logical settings for +#' \code{TidyWord2000} and \code{TidyGDocClean}, respectively. +#' +#' It may also be advantageous to remove all comments with \code{TidyHideComments}. +#' +#' @param content accepts a character vector, raw vector or parsed content from the \code{xml2} +#' or \code{XML} packages. 
#' @param options named list of options -#' @return tidied HTML/XHTML content -#' @references \url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h} -#' (for definitions of the options supported above). +#' @return Tidied HTML/XHTML content. The object type will be the same as that of the input type. +#' @references \url{http://api.html-tidy.org/tidy/quickref_5.1.25.html} & +#' \url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h} +#' for definitions of the options supported above and \url{https://www.w3.org/People/Raggett/tidy/} +#' for an explanation of what "tidy" HTML is and some canonical examples of what it can do. #' @export +#' @examples +#' opts <- list( +#' TidyDocType="html5", +#' TidyMakeClean=TRUE, +#' TidyHideComments=TRUE, +#' TidyIndentContent=TRUE, +#' TidyWrapLen=200 +#' ) +#' +#' txt <- paste0( +#' c("
", +#' "Test
"), +#' collapse="") +#' +#' cat(tidy_html(txt, option=opts)) tidy_html <- function(content, options=list(TidyXhtmlOut=TRUE)) { UseMethod("tidy_html") } @@ -59,7 +105,7 @@ tidy_html.xml_document <- function(content, options=list(TidyXhtmlOut=TRUE)) { #' @export #' @rdname tidy_html tidy_html.HTMLInternalDocument <- function(content, options=list(TidyXhtmlOut=TRUE)) { - content <- saveXML(content) + content <- XML::saveXML(content) out <- .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', source=content, options=options) XML::htmlParse(out) diff --git a/R/xml.r b/R/xml.r deleted file mode 100644 index e89c795..0000000 --- a/R/xml.r +++ /dev/null @@ -1,64 +0,0 @@ -#' Tidy XML Documents -#' -#' Currently supported options: \code{TidyAltText}, \code{TidyBodyOnly}, -#' \code{TidyBreakBeforeBR}, \code{TidyCoerceEndTags}, \code{TidyCoerceEndTags}, -#' \code{TidyDoctype}, \code{TidyDropEmptyElems}, \code{TidyDropEmptyParas}, -#' \code{TidyFixBackslash}, \code{TidyFixComments}, \code{TidyHideComments}, -#' \code{TidyHtmlOut}, \code{TidyIndentContent}, \code{TidyIndentSpaces}, -#' \code{TidyJoinClasses}, \code{TidyJoinStyles}, \code{TidyLogicalEmphasis}, -#' \code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark}, -#' \code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyTabSize}, -#' \code{TidyUpperCaseAttrs}, \code{TidyUpperCaseTags}, \code{TidyWord2000}, -#' \code{TidyWrapLen}, \code{TidyXhtmlOut}, \code{TidyXmlDecl}, \code{TidyXmlOut}, -#' \code{TidyXmlTags}. -#' -#' @param content atomic character or raw vector of content to tidy -#' @param options named list of options -#' @return tidied XML content -#' @references \url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h} -#' (for definitions of the options supported above). 
-#' @export -tidy_xml <- function(content, options=list(TidyXmlOut=TRUE)) { - UseMethod("tidy_xml") -} - -#' @export -#' @rdname tidy_xml -tidy_xml.default <- function(content, options=list(TidyXmlOut=TRUE)) { - .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', - source=content, options=options) -} - -#' @export -#' @rdname tidy_xml -tidy_xml.character <- function(content, options=list(TidyXmlOut=TRUE)) { - .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', - source=content, options=options) -} - -#' @export -#' @rdname tidy_xml -tidy_xml.raw <- function(content, options=list(TidyXmlOut=TRUE)) { - content <- iconv(readBin(content, character()), to="UTF-8") - out <- .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', - source=content, options=options) - charToRaw(out) -} - -#' @export -#' @rdname tidy_xml -tidy_xml.xml_document <- function(content, options=list(TidyXmlOut=TRUE)) { - content <- toString(content) - out <- .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', - source=content, options=options) - xml2::read_xml(out) -} - -#' @export -#' @rdname tidy_xml -tidy_xml.XMLInternalDocument <- function(content, options=list(TidyXmlOut=TRUE)) { - content <- saveXML(content) - out <- .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', - source=content, options=options) - XML::xmlParse(out) -} diff --git a/README.Rmd b/README.Rmd index 3696846..f16b51c 100644 --- a/README.Rmd +++ b/README.Rmd @@ -73,7 +73,7 @@ opts <- list(TidyDocType="html5", TidyWrapLen=200) txt <- " - + diff --git a/man/htmltidy.Rd b/man/htmltidy.Rd index 9562604..932302f 100644 --- a/man/htmltidy.Rd +++ b/man/htmltidy.Rd @@ -6,7 +6,7 @@ \alias{htmltidy-package} \title{Clean Up Gnarly HTML/XML} \description{ -HTML and XML documents can be beautiful and pristine. They can also be +HTML documents can be beautiful and pristine. They can also be wretched, evil, malformed hellspawn. Now, you can tidy up that HTML and XML before processing it with your favorite angle-bracket parsing tools. 
} diff --git a/man/tidy_html.Rd b/man/tidy_html.Rd index 81fa9fe..95b2ad3 100644 --- a/man/tidy_html.Rd +++ b/man/tidy_html.Rd @@ -23,27 +23,75 @@ tidy_html(content, options = list(TidyXhtmlOut = TRUE)) = TRUE)) } \arguments{ -\item{content}{atomic character or raw vector of content to tidy} +\item{content}{accepts a character vector, raw vector or parsed content from the \code{xml2} +or \code{XML} packages.} \item{options}{named list of options} } \value{ -tidied HTML/XHTML content +Tidied HTML/XHTML content. The object type will be the same as that of the input type. } \description{ -Currently supported options: \code{TidyAltText}, \code{TidyBodyOnly}, -\code{TidyBreakBeforeBR}, \code{TidyCoerceEndTags}, \code{TidyCoerceEndTags}, -\code{TidyDoctype}, \code{TidyDropEmptyElems}, \code{TidyDropEmptyParas}, -\code{TidyFixBackslash}, \code{TidyFixComments}, \code{TidyHideComments}, -\code{TidyHtmlOut}, \code{TidyIndentContent}, \code{TidyIndentSpaces}, -\code{TidyJoinClasses}, \code{TidyJoinStyles}, \code{TidyLogicalEmphasis}, -\code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark}, -\code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyTabSize}, -\code{TidyUpperCaseAttrs}, \code{TidyUpperCaseTags}, \code{TidyWord2000}, -\code{TidyWrapLen}, \code{TidyXhtmlOut} +Pass in HTML content as either plain or raw text or parsed objects (either with the +\code{XML} or \code{xml2} packages) along with an options list that specifies how +the content will be tidied and get back tidied content of the same object type as passed +in to the function. +} +\details{ +The default option \code{TidyXhtmlOut} will convert the input content to XHTML. 
+ +Currently supported options: + +\itemize{ + \item{Ones taking a logical value: }{\code{TidyAltText}, \code{TidyBodyOnly}, \code{TidyBreakBeforeBR}, + \code{TidyCoerceEndTags}, \code{TidyDropEmptyElems}, \code{TidyDropEmptyParas}, + \code{TidyFixBackslash}, \code{TidyFixComments}, \code{TidyGDocClean}, \code{TidyHideComments}, + \code{TidyHtmlOut}, \code{TidyIndentContent}, \code{TidyJoinClasses}, \code{TidyJoinStyles}, + \code{TidyLogicalEmphasis}, \code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark}, + \code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyUpperCaseAttrs}, + \code{TidyUpperCaseTags}, \code{TidyWord2000}, \code{TidyXhtmlOut}} + \item{Ones taking a character value: }{\code{TidyDoctype}, \code{TidyInlineTags}, \code{TidyBlockTags}, + \code{TidyEmptyTags}, \code{TidyPreTags}} + \item{Ones taking an integer value: }{\code{TidyIndentSpaces}, \code{TidyTabSize}, \code{TidyWrapLen}} +} + +File \href{https://github.com/hrbrmstr/htmltidy/issues}{an issue} if there are other \code{libtidy} +options you'd like supported. + +It is likely that the most used options will be: + +\itemize{ + \item{\code{TidyXhtmlOut} (logical)}, + \item{\code{TidyHtmlOut} (logical)} and + \item{\code{TidyDocType} which should be one of "\code{omit}", + "\code{html5}", "\code{auto}", "\code{strict}" or "\code{loose}"}. +} + +You can clean up Microsoft Word (2000) and Google Docs HTML via logical settings for +\code{TidyWord2000} and \code{TidyGDocClean}, respectively. + +It may also be advantageous to remove all comments with \code{TidyHideComments}. +} +\examples{ +opts <- list( + TidyDocType="html5", + TidyMakeClean=TRUE, + TidyHideComments=TRUE, + TidyIndentContent=TRUE, + TidyWrapLen=200 +) + +txt <- paste0( + c("", +"Test
"), + collapse="") + +cat(tidy_html(txt, option=opts)) } \references{ -\url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h} - (for definitions of the options supported above). +\url{http://api.html-tidy.org/tidy/quickref_5.1.25.html} & + \url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h} + for definitions of the options supported above and \url{https://www.w3.org/People/Raggett/tidy/} + for an explanation of what "tidy" HTML is and some canonical examples of what it can do. } diff --git a/man/tidy_xml.Rd b/man/tidy_xml.Rd deleted file mode 100644 index 584dbdc..0000000 --- a/man/tidy_xml.Rd +++ /dev/null @@ -1,50 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/xml.r -\name{tidy_xml} -\alias{tidy_xml} -\alias{tidy_xml.XMLInternalDocument} -\alias{tidy_xml.character} -\alias{tidy_xml.default} -\alias{tidy_xml.raw} -\alias{tidy_xml.xml_document} -\title{Tidy XML Documents} -\usage{ -tidy_xml(content, options = list(TidyXmlOut = TRUE)) - -\method{tidy_xml}{default}(content, options = list(TidyXmlOut = TRUE)) - -\method{tidy_xml}{character}(content, options = list(TidyXmlOut = TRUE)) - -\method{tidy_xml}{raw}(content, options = list(TidyXmlOut = TRUE)) - -\method{tidy_xml}{xml_document}(content, options = list(TidyXmlOut = TRUE)) - -\method{tidy_xml}{XMLInternalDocument}(content, options = list(TidyXmlOut = - TRUE)) -} -\arguments{ -\item{content}{atomic character or raw vector of content to tidy} - -\item{options}{named list of options} -} -\value{ -tidied XML content -} -\description{ -Currently supported options: \code{TidyAltText}, \code{TidyBodyOnly}, -\code{TidyBreakBeforeBR}, \code{TidyCoerceEndTags}, \code{TidyCoerceEndTags}, -\code{TidyDoctype}, \code{TidyDropEmptyElems}, \code{TidyDropEmptyParas}, -\code{TidyFixBackslash}, \code{TidyFixComments}, \code{TidyHideComments}, -\code{TidyHtmlOut}, \code{TidyIndentContent}, \code{TidyIndentSpaces}, -\code{TidyJoinClasses}, 
\code{TidyJoinStyles}, \code{TidyLogicalEmphasis}, -\code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark}, -\code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyTabSize}, -\code{TidyUpperCaseAttrs}, \code{TidyUpperCaseTags}, \code{TidyWord2000}, -\code{TidyWrapLen}, \code{TidyXhtmlOut}, \code{TidyXmlDecl}, \code{TidyXmlOut}, -\code{TidyXmlTags}. -} -\references{ -\url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h} - (for definitions of the options supported above). -} - diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 37745f7..023bc25 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -1,4 +1,4 @@ -// Generated by using Rcpp::compileAttributes() -> do not edit by hand +// This file was generated by Rcpp::compileAttributes // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 #include