diff --git a/NAMESPACE b/NAMESPACE index d36408f..8dba643 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,12 +5,8 @@ S3method(tidy_html,character) S3method(tidy_html,default) S3method(tidy_html,raw) S3method(tidy_html,xml_document) -S3method(tidy_xml,XMLInternalDocument) -S3method(tidy_xml,character) -S3method(tidy_xml,default) -S3method(tidy_xml,raw) -S3method(tidy_xml,xml_document) export(tidy_html) -export(tidy_xml) +import(XML) +import(xml2) importFrom(Rcpp,sourceCpp) useDynLib(htmltidy) diff --git a/R/RcppExports.R b/R/RcppExports.R index d17e027..173cbbb 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,4 +1,4 @@ -# Generated by using Rcpp::compileAttributes() -> do not edit by hand +# This file was generated by Rcpp::compileAttributes # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 tidy_html_int <- function(source, options) { diff --git a/R/htmltidy-package.r b/R/htmltidy-package.r index 7185f72..ab68c8a 100644 --- a/R/htmltidy-package.r +++ b/R/htmltidy-package.r @@ -1,6 +1,6 @@ #' Clean Up Gnarly HTML/XML #' -#' HTML and XML documents can be beautiful and pristine. They can also be +#' HTML documents can be beautiful and pristine. They can also be #' wretched, evil, malformed hellspawn. Now, you can tidy up that HTML and XML before #' processing it with your favorite angle-bracket parsing tools. 
#' @@ -9,4 +9,5 @@ #' @author Bob Rudis (bob@@rud.is) #' @useDynLib htmltidy #' @importFrom Rcpp sourceCpp +#' @import xml2 XML NULL diff --git a/R/tidy.r b/R/tidy.r index 1b2d774..f857395 100644 --- a/R/tidy.r +++ b/R/tidy.r @@ -1,22 +1,68 @@ #' Tidy HTML/XHTML Documents #' -#' Currently supported options: \code{TidyAltText}, \code{TidyBodyOnly}, -#' \code{TidyBreakBeforeBR}, \code{TidyCoerceEndTags}, \code{TidyCoerceEndTags}, -#' \code{TidyDoctype}, \code{TidyDropEmptyElems}, \code{TidyDropEmptyParas}, -#' \code{TidyFixBackslash}, \code{TidyFixComments}, \code{TidyHideComments}, -#' \code{TidyHtmlOut}, \code{TidyIndentContent}, \code{TidyIndentSpaces}, -#' \code{TidyJoinClasses}, \code{TidyJoinStyles}, \code{TidyLogicalEmphasis}, -#' \code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark}, -#' \code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyTabSize}, -#' \code{TidyUpperCaseAttrs}, \code{TidyUpperCaseTags}, \code{TidyWord2000}, -#' \code{TidyWrapLen}, \code{TidyXhtmlOut} +#' Pass in HTML content as either plain or raw text or parsed objects (either with the +#' \code{XML} or \code{xml2} packages) along with an options list that specifies how +#' the content will be tidied and get back tidied content of the same object type as passed +#' in to the function. #' -#' @param content atomic character or raw vector of content to tidy +#' The default option \code{TidyXhtmlOut} will convert the input content to XHTML. 
+#' +#' Currently supported options: +#' +#' \itemize{ +#' \item{Ones taking a logical value: }{\code{TidyAltText}, \code{TidyBodyOnly}, \code{TidyBreakBeforeBR}, +#' \code{TidyCoerceEndTags}, \code{TidyDropEmptyElems}, \code{TidyDropEmptyParas}, +#' \code{TidyFixBackslash}, \code{TidyFixComments}, \code{TidyGDocClean}, \code{TidyHideComments}, +#' \code{TidyHtmlOut}, \code{TidyIndentContent}, \code{TidyJoinClasses}, \code{TidyJoinStyles}, +#' \code{TidyLogicalEmphasis}, \code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark}, +#' \code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyUpperCaseAttrs}, +#' \code{TidyUpperCaseTags}, \code{TidyWord2000}, \code{TidyXhtmlOut}} +#' \item{Ones taking a character value: }{\code{TidyDoctype}, \code{TidyInlineTags}, \code{TidyBlockTags}, +#' \code{TidyEmptyTags}, \code{TidyPreTags}} +#' \item{Ones taking an integer value: }{\code{TidyIndentSpaces}, \code{TidyTabSize}, \code{TidyWrapLen}} +#' } +#' +#' File \href{https://github.com/hrbrmstr/htmltidy/issues}{an issue} if there are other \code{libtidy} +#' options you'd like supported. +#' +#' It is likely that the most used options will be: +#' +#' \itemize{ +#' \item{\code{TidyXhtmlOut} (logical)}, +#' \item{\code{TidyHtmlOut} (logical)} and +#' \item{\code{TidyDoctype} which should be one of "\code{omit}", +#' "\code{html5}", "\code{auto}", "\code{strict}" or "\code{loose}"}. +#' } +#' +#' You can clean up Microsoft Word (2000) and Google Docs HTML via logical settings for +#' \code{TidyWord2000} and \code{TidyGDocClean}, respectively. +#' +#' It may also be advantageous to remove all comments with \code{TidyHideComments}. +#' +#' @param content accepts a character vector, raw vector or parsed content from the \code{xml2} +#' or \code{XML} packages. 
#' @param options named list of options -#' @return tidied HTML/XHTML content -#' @references \url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h} -#' (for definitions of the options supported above). +#' @return Tidied HTML/XHTML content. The object type will be the same as that of the input type. +#' @references \url{http://api.html-tidy.org/tidy/quickref_5.1.25.html} & +#' \url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h} +#' for definitions of the options supported above and \url{https://www.w3.org/People/Raggett/tidy/} +#' for an explanation of what "tidy" HTML is and some canonical examples of what it can do. #' @export +#' @examples +#' opts <- list( +#' TidyDoctype="html5", +#' TidyMakeClean=TRUE, +#' TidyHideComments=TRUE, +#' TidyIndentContent=TRUE, +#' TidyWrapLen=200 +#' ) +#' +#' txt <- paste0( +#' c("", +#' "

Test

"), +#' collapse="") +#' +#' cat(tidy_html(txt, option=opts)) tidy_html <- function(content, options=list(TidyXhtmlOut=TRUE)) { UseMethod("tidy_html") } @@ -59,7 +105,7 @@ tidy_html.xml_document <- function(content, options=list(TidyXhtmlOut=TRUE)) { #' @export #' @rdname tidy_html tidy_html.HTMLInternalDocument <- function(content, options=list(TidyXhtmlOut=TRUE)) { - content <- saveXML(content) + content <- XML::saveXML(content) out <- .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', source=content, options=options) XML::htmlParse(out) diff --git a/R/xml.r b/R/xml.r deleted file mode 100644 index e89c795..0000000 --- a/R/xml.r +++ /dev/null @@ -1,64 +0,0 @@ -#' Tidy XML Documents -#' -#' Currently supported options: \code{TidyAltText}, \code{TidyBodyOnly}, -#' \code{TidyBreakBeforeBR}, \code{TidyCoerceEndTags}, \code{TidyCoerceEndTags}, -#' \code{TidyDoctype}, \code{TidyDropEmptyElems}, \code{TidyDropEmptyParas}, -#' \code{TidyFixBackslash}, \code{TidyFixComments}, \code{TidyHideComments}, -#' \code{TidyHtmlOut}, \code{TidyIndentContent}, \code{TidyIndentSpaces}, -#' \code{TidyJoinClasses}, \code{TidyJoinStyles}, \code{TidyLogicalEmphasis}, -#' \code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark}, -#' \code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyTabSize}, -#' \code{TidyUpperCaseAttrs}, \code{TidyUpperCaseTags}, \code{TidyWord2000}, -#' \code{TidyWrapLen}, \code{TidyXhtmlOut}, \code{TidyXmlDecl}, \code{TidyXmlOut}, -#' \code{TidyXmlTags}. -#' -#' @param content atomic character or raw vector of content to tidy -#' @param options named list of options -#' @return tidied XML content -#' @references \url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h} -#' (for definitions of the options supported above). 
-#' @export -tidy_xml <- function(content, options=list(TidyXmlOut=TRUE)) { - UseMethod("tidy_xml") -} - -#' @export -#' @rdname tidy_xml -tidy_xml.default <- function(content, options=list(TidyXmlOut=TRUE)) { - .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', - source=content, options=options) -} - -#' @export -#' @rdname tidy_xml -tidy_xml.character <- function(content, options=list(TidyXmlOut=TRUE)) { - .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', - source=content, options=options) -} - -#' @export -#' @rdname tidy_xml -tidy_xml.raw <- function(content, options=list(TidyXmlOut=TRUE)) { - content <- iconv(readBin(content, character()), to="UTF-8") - out <- .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', - source=content, options=options) - charToRaw(out) -} - -#' @export -#' @rdname tidy_xml -tidy_xml.xml_document <- function(content, options=list(TidyXmlOut=TRUE)) { - content <- toString(content) - out <- .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', - source=content, options=options) - xml2::read_xml(out) -} - -#' @export -#' @rdname tidy_xml -tidy_xml.XMLInternalDocument <- function(content, options=list(TidyXmlOut=TRUE)) { - content <- saveXML(content) - out <- .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', - source=content, options=options) - XML::xmlParse(out) -} diff --git a/README.Rmd b/README.Rmd index 3696846..f16b51c 100644 --- a/README.Rmd +++ b/README.Rmd @@ -73,7 +73,7 @@ opts <- list(TidyDocType="html5", TidyWrapLen=200) txt <- " - + diff --git a/man/htmltidy.Rd b/man/htmltidy.Rd index 9562604..932302f 100644 --- a/man/htmltidy.Rd +++ b/man/htmltidy.Rd @@ -6,7 +6,7 @@ \alias{htmltidy-package} \title{Clean Up Gnarly HTML/XML} \description{ -HTML and XML documents can be beautiful and pristine. They can also be +HTML documents can be beautiful and pristine. They can also be wretched, evil, malformed hellspawn. Now, you can tidy up that HTML and XML before processing it with your favorite angle-bracket parsing tools. 
} diff --git a/man/tidy_html.Rd b/man/tidy_html.Rd index 81fa9fe..95b2ad3 100644 --- a/man/tidy_html.Rd +++ b/man/tidy_html.Rd @@ -23,27 +23,75 @@ tidy_html(content, options = list(TidyXhtmlOut = TRUE)) = TRUE)) } \arguments{ -\item{content}{atomic character or raw vector of content to tidy} +\item{content}{accepts a character vector, raw vector or parsed content from the \code{xml2} +or \code{XML} packages.} \item{options}{named list of options} } \value{ -tidied HTML/XHTML content +Tidied HTML/XHTML content. The object type will be the same as that of the input type. } \description{ -Currently supported options: \code{TidyAltText}, \code{TidyBodyOnly}, -\code{TidyBreakBeforeBR}, \code{TidyCoerceEndTags}, \code{TidyCoerceEndTags}, -\code{TidyDoctype}, \code{TidyDropEmptyElems}, \code{TidyDropEmptyParas}, -\code{TidyFixBackslash}, \code{TidyFixComments}, \code{TidyHideComments}, -\code{TidyHtmlOut}, \code{TidyIndentContent}, \code{TidyIndentSpaces}, -\code{TidyJoinClasses}, \code{TidyJoinStyles}, \code{TidyLogicalEmphasis}, -\code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark}, -\code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyTabSize}, -\code{TidyUpperCaseAttrs}, \code{TidyUpperCaseTags}, \code{TidyWord2000}, -\code{TidyWrapLen}, \code{TidyXhtmlOut} +Pass in HTML content as either plain or raw text or parsed objects (either with the +\code{XML} or \code{xml2} packages) along with an options list that specifies how +the content will be tidied and get back tidied content of the same object type as passed +in to the function. +} +\details{ +The default option \code{TidyXhtmlOut} will convert the input content to XHTML. 
+ +Currently supported options: + +\itemize{ + \item{Ones taking a logical value: }{\code{TidyAltText}, \code{TidyBodyOnly}, \code{TidyBreakBeforeBR}, + \code{TidyCoerceEndTags}, \code{TidyDropEmptyElems}, \code{TidyDropEmptyParas}, + \code{TidyFixBackslash}, \code{TidyFixComments}, \code{TidyGDocClean}, \code{TidyHideComments}, + \code{TidyHtmlOut}, \code{TidyIndentContent}, \code{TidyJoinClasses}, \code{TidyJoinStyles}, + \code{TidyLogicalEmphasis}, \code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark}, + \code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyUpperCaseAttrs}, + \code{TidyUpperCaseTags}, \code{TidyWord2000}, \code{TidyXhtmlOut}} + \item{Ones taking a character value: }{\code{TidyDoctype}, \code{TidyInlineTags}, \code{TidyBlockTags}, + \code{TidyEmptyTags}, \code{TidyPreTags}} + \item{Ones taking an integer value: }{\code{TidyIndentSpaces}, \code{TidyTabSize}, \code{TidyWrapLen}} +} + +File \href{https://github.com/hrbrmstr/htmltidy/issues}{an issue} if there are other \code{libtidy} +options you'd like supported. + +It is likely that the most used options will be: + +\itemize{ + \item{\code{TidyXhtmlOut} (logical)}, + \item{\code{TidyHtmlOut} (logical)} and + \item{\code{TidyDoctype} which should be one of "\code{omit}", + "\code{html5}", "\code{auto}", "\code{strict}" or "\code{loose}"}. +} + +You can clean up Microsoft Word (2000) and Google Docs HTML via logical settings for +\code{TidyWord2000} and \code{TidyGDocClean}, respectively. + +It may also be advantageous to remove all comments with \code{TidyHideComments}. +} +\examples{ +opts <- list( + TidyDoctype="html5", + TidyMakeClean=TRUE, + TidyHideComments=TRUE, + TidyIndentContent=TRUE, + TidyWrapLen=200 +) + +txt <- paste0( + c("", +"

Test

"), + collapse="") + +cat(tidy_html(txt, option=opts)) } \references{ -\url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h} - (for definitions of the options supported above). +\url{http://api.html-tidy.org/tidy/quickref_5.1.25.html} & + \url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h} + for definitions of the options supported above and \url{https://www.w3.org/People/Raggett/tidy/} + for an explanation of what "tidy" HTML is and some canonical examples of what it can do. } diff --git a/man/tidy_xml.Rd b/man/tidy_xml.Rd deleted file mode 100644 index 584dbdc..0000000 --- a/man/tidy_xml.Rd +++ /dev/null @@ -1,50 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/xml.r -\name{tidy_xml} -\alias{tidy_xml} -\alias{tidy_xml.XMLInternalDocument} -\alias{tidy_xml.character} -\alias{tidy_xml.default} -\alias{tidy_xml.raw} -\alias{tidy_xml.xml_document} -\title{Tidy XML Documents} -\usage{ -tidy_xml(content, options = list(TidyXmlOut = TRUE)) - -\method{tidy_xml}{default}(content, options = list(TidyXmlOut = TRUE)) - -\method{tidy_xml}{character}(content, options = list(TidyXmlOut = TRUE)) - -\method{tidy_xml}{raw}(content, options = list(TidyXmlOut = TRUE)) - -\method{tidy_xml}{xml_document}(content, options = list(TidyXmlOut = TRUE)) - -\method{tidy_xml}{XMLInternalDocument}(content, options = list(TidyXmlOut = - TRUE)) -} -\arguments{ -\item{content}{atomic character or raw vector of content to tidy} - -\item{options}{named list of options} -} -\value{ -tidied XML content -} -\description{ -Currently supported options: \code{TidyAltText}, \code{TidyBodyOnly}, -\code{TidyBreakBeforeBR}, \code{TidyCoerceEndTags}, \code{TidyCoerceEndTags}, -\code{TidyDoctype}, \code{TidyDropEmptyElems}, \code{TidyDropEmptyParas}, -\code{TidyFixBackslash}, \code{TidyFixComments}, \code{TidyHideComments}, -\code{TidyHtmlOut}, \code{TidyIndentContent}, \code{TidyIndentSpaces}, -\code{TidyJoinClasses}, 
\code{TidyJoinStyles}, \code{TidyLogicalEmphasis}, -\code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark}, -\code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyTabSize}, -\code{TidyUpperCaseAttrs}, \code{TidyUpperCaseTags}, \code{TidyWord2000}, -\code{TidyWrapLen}, \code{TidyXhtmlOut}, \code{TidyXmlDecl}, \code{TidyXmlOut}, -\code{TidyXmlTags}. -} -\references{ -\url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h} - (for definitions of the options supported above). -} - diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 37745f7..023bc25 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -1,4 +1,4 @@ -// Generated by using Rcpp::compileAttributes() -> do not edit by hand +// This file was generated by Rcpp::compileAttributes // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 #include @@ -9,11 +9,11 @@ using namespace Rcpp; std::string tidy_html_int(std::string source, Rcpp::List options); RcppExport SEXP htmltidy_tidy_html_int(SEXP sourceSEXP, SEXP optionsSEXP) { BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::RObject __result; + Rcpp::RNGScope __rngScope; Rcpp::traits::input_parameter< std::string >::type source(sourceSEXP); Rcpp::traits::input_parameter< Rcpp::List >::type options(optionsSEXP); - rcpp_result_gen = Rcpp::wrap(tidy_html_int(source, options)); - return rcpp_result_gen; + __result = Rcpp::wrap(tidy_html_int(source, options)); + return __result; END_RCPP } diff --git a/src/htmltidy.cpp b/src/htmltidy.cpp index 6aceb9c..be57b98 100644 --- a/src/htmltidy.cpp +++ b/src/htmltidy.cpp @@ -3,8 +3,7 @@ #include #include -// NOTE: cannot do "using namespace Rcpp;" b/c of annoying warnings about the -// ambiguity of 'yes'. +// NOTE: cannot do "using namespace Rcpp;" b/c of annoying warnings about the ambiguity of 'yes'. 
//[[Rcpp::export]] std::string tidy_html_int(std::string source, Rcpp::List options) { @@ -111,11 +110,6 @@ std::string tidy_html_int(std::string source, Rcpp::List options) { if (ok == no) Rcpp::stop("Error setting TidyHTML options"); } - if (options.containsElementNamed("TidyCoerceEndTags")) { - ok = tidyOptSetBool(tdoc, TidyCoerceEndTags, options["TidyCoerceEndTags"] ? yes : no); - if (ok == no) Rcpp::stop("Error setting TidyHTML options"); - } - if (options.containsElementNamed("TidyMakeBare")) { ok = tidyOptSetBool(tdoc, TidyMakeBare, options["TidyMakeBare"] ? yes : no); if (ok == no) Rcpp::stop("Error setting TidyHTML options"); @@ -126,6 +120,16 @@ std::string tidy_html_int(std::string source, Rcpp::List options) { if (ok == no) Rcpp::stop("Error setting TidyHTML options"); } + if (options.containsElementNamed("TidyGDocClean")) { + ok = tidyOptSetBool(tdoc, TidyGDocClean, options["TidyGDocClean"] ? yes : no); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyWord2000")) { + ok = tidyOptSetBool(tdoc, TidyWord2000, options["TidyWord2000"] ? 
yes : no); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + if (options.containsElementNamed("TidyDoctype")) { ok = tidyOptSetValue(tdoc, TidyDoctype, Rcpp::as(options["TidyDoctype"]).c_str()); if (ok == no) Rcpp::stop("Error setting TidyHTML options"); @@ -136,8 +140,23 @@ std::string tidy_html_int(std::string source, Rcpp::List options) { if (ok == no) Rcpp::stop("Error setting TidyHTML options"); } - if (options.containsElementNamed("TidyWord2000")) { - ok = tidyOptSetValue(tdoc, TidyWord2000, Rcpp::as(options["TidyWord2000"]).c_str()); + if (options.containsElementNamed("TidyInlineTags")) { + ok = tidyOptSetValue(tdoc, TidyInlineTags, Rcpp::as(options["TidyInlineTags"]).c_str()); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyBlockTags")) { + ok = tidyOptSetValue(tdoc, TidyBlockTags, Rcpp::as(options["TidyBlockTags"]).c_str()); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyPreTags")) { + ok = tidyOptSetValue(tdoc, TidyPreTags, Rcpp::as(options["TidyPreTags"]).c_str()); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyEmptyTags")) { + ok = tidyOptSetValue(tdoc, TidyEmptyTags, Rcpp::as(options["TidyEmptyTags"]).c_str()); if (ok == no) Rcpp::stop("Error setting TidyHTML options"); } diff --git a/src/tidyenum.h b/src/tidyenum.h index 5e2f7aa..595ef4a 100644 --- a/src/tidyenum.h +++ b/src/tidyenum.h @@ -79,7 +79,7 @@ typedef enum These TidyOptionId are used throughout libtidy, and also have associated localized strings to describe them. - + Note this enum MUST start at zero due to historical design-time decisions that make assumptions about this starting value. */ @@ -217,7 +217,7 @@ typedef enum N_TIDY_OPTIONS /**< Must be last */ } TidyOptionId; - + /** Option data types */ typedef enum