Browse Source

documentation

tags/v0.2.0
boB Rudis 8 years ago
parent
commit
83bcc2e199
  1. 8
      NAMESPACE
  2. 2
      R/RcppExports.R
  3. 3
      R/htmltidy-package.r
  4. 76
      R/tidy.r
  5. 64
      R/xml.r
  6. 2
      man/htmltidy.Rd
  7. 74
      man/tidy_html.Rd
  8. 50
      man/tidy_xml.Rd
  9. 10
      src/RcppExports.cpp
  10. 37
      src/htmltidy.cpp

8
NAMESPACE

@ -5,12 +5,8 @@ S3method(tidy_html,character)
S3method(tidy_html,default)
S3method(tidy_html,raw)
S3method(tidy_html,xml_document)
S3method(tidy_xml,XMLInternalDocument)
S3method(tidy_xml,character)
S3method(tidy_xml,default)
S3method(tidy_xml,raw)
S3method(tidy_xml,xml_document)
export(tidy_html)
export(tidy_xml)
import(XML)
import(xml2)
importFrom(Rcpp,sourceCpp)
useDynLib(htmltidy)

2
R/RcppExports.R

@ -1,4 +1,4 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# This file was generated by Rcpp::compileAttributes
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
tidy_html_int <- function(source, options) {

3
R/htmltidy-package.r

@ -1,6 +1,6 @@
#' Clean Up Gnarly HTML/XML
#'
#' HTML and XML documents can be beautiful and pristine. They can also be
#' HTML documents can be beautiful and pristine. They can also be
#' wretched, evil, malformed hellspawn. Now, you can tidy up that HTML and XML before
#' processing it with your favorite angle-bracket parsing tools.
#'
@ -9,4 +9,5 @@
#' @author Bob Rudis (bob@@rud.is)
#' @useDynLib htmltidy
#' @importFrom Rcpp sourceCpp
#' @import xml2 XML
NULL

76
R/tidy.r

@ -1,22 +1,68 @@
#' Tidy HTML/XHTML Documents
#'
#' Currently supported options: \code{TidyAltText}, \code{TidyBodyOnly},
#' \code{TidyBreakBeforeBR}, \code{TidyCoerceEndTags}, \code{TidyCoerceEndTags},
#' \code{TidyDoctype}, \code{TidyDropEmptyElems}, \code{TidyDropEmptyParas},
#' \code{TidyFixBackslash}, \code{TidyFixComments}, \code{TidyHideComments},
#' \code{TidyHtmlOut}, \code{TidyIndentContent}, \code{TidyIndentSpaces},
#' \code{TidyJoinClasses}, \code{TidyJoinStyles}, \code{TidyLogicalEmphasis},
#' \code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark},
#' \code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyTabSize},
#' \code{TidyUpperCaseAttrs}, \code{TidyUpperCaseTags}, \code{TidyWord2000},
#' \code{TidyWrapLen}, \code{TidyXhtmlOut}
#' Pass in HTML content as either plain or raw text or parsed objects (either with the
#' \code{XML} or \code{xml2} packages) along with an options list that specifies how
#' the content will be tidied and get back tidied content of the same object type as passed
#' in to the function.
#'
#' @param content atomic character or raw vector of content to tidy
#' The default option \code{TidyXhtmlOut} will convert the input content to XHTML.
#'
#' Currently supported options:
#'
#' \itemize{
#' \item{Ones taking a logical value: }{\code{TidyAltText}, \code{TidyBodyOnly}, \code{TidyBreakBeforeBR},
#' \code{TidyCoerceEndTags}, \code{TidyDropEmptyElems}, \code{TidyDropEmptyParas},
#' \code{TidyFixBackslash}, \code{TidyFixComments}, \code{TidyGDocClean}, \code{TidyHideComments},
#' \code{TidyHtmlOut}, \code{TidyIndentContent}, \code{TidyJoinClasses}, \code{TidyJoinStyles},
#' \code{TidyLogicalEmphasis}, \code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark},
#' \code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyUpperCaseAttrs},
#' \code{TidyUpperCaseTags}, \code{TidyWord2000}, \code{TidyXhtmlOut}}
#' \item{Ones taking a character value: }{\code{TidyDoctype}, \code{TidyInlineTags}, \code{TidyBlockTags},
#' \code{TidyEmptyTags}, \code{TidyPreTags}}
#' \item{Ones taking an integer value: }{\code{TidyIndentSpaces}, \code{TidyTabSize}, \code{TidyWrapLen}}
#' }
#'
#' File \href{https://github.com/hrbrmstr/htmltidy/issues}{an issue} if there are other \code{libtidy}
#' options you'd like supported.
#'
#' It is likely that the most used options will be:
#'
#' \itemize{
#' \item{\code{TidyXhtmlOut} (logical)},
#' \item{\code{TidyHtmlOut} (logical)} and
#' \item{\code{TidyDoctype} which should be one of "\code{omit}",
#' "\code{html5}", "\code{auto}", "\code{strict}" or "\code{loose}"}.
#' }
#'
#' You can clean up Microsoft Word (2000) and Google Docs HTML via logical settings for
#' \code{TidyWord2000} and \code{TidyGDocClean}, respectively.
#'
#' It may also be advantageous to remove all comments with \code{TidyHideComments}.
#'
#' @param content accepts a character vector, raw vector or parsed content from the \code{xml2}
#' or \code{XML} packages.
#' @param options named list of options
#' @return tidied HTML/XHTML content
#' @references \url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h}
#' (for definitions of the options supported above).
#' @return Tidied HTML/XHTML content. The object type will be the same as that of the input type.
#' @references \url{http://api.html-tidy.org/tidy/quickref_5.1.25.html} &
#' \url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h}
#' for definitions of the options supported above and \url{https://www.w3.org/People/Raggett/tidy/}
#' for an explanation of what "tidy" HTML is and some canonical examples of what it can do.
#' @export
#' @examples
#' opts <- list(
#' TidyDoctype="html5",
#' TidyMakeClean=TRUE,
#' TidyHideComments=TRUE,
#' TidyIndentContent=TRUE,
#' TidyWrapLen=200
#' )
#'
#' txt <- paste0(
#' c("<html><head><style>p { color: red; }</style><body><!-- ===== body ====== -->",
#' "<p>Test</p></body><!--Default Zone --> <!--Default Zone End--></html>"),
#' collapse="")
#'
#' cat(tidy_html(txt, options=opts))
tidy_html <- function(content, options = list(TidyXhtmlOut = TRUE)) {
  # S3 generic: the method is chosen from the class of `content`.
  UseMethod("tidy_html")
}
@ -59,7 +105,7 @@ tidy_html.xml_document <- function(content, options=list(TidyXhtmlOut=TRUE)) {
#' @export
#' @rdname tidy_html
tidy_html.HTMLInternalDocument <- function(content, options=list(TidyXhtmlOut=TRUE)) {
content <- saveXML(content)
content <- XML::saveXML(content)
out <- .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy',
source=content, options=options)
XML::htmlParse(out)

64
R/xml.r

@ -1,64 +0,0 @@
#' Tidy XML Documents
#'
#' Currently supported options: \code{TidyAltText}, \code{TidyBodyOnly},
#' \code{TidyBreakBeforeBR}, \code{TidyCoerceEndTags}, \code{TidyCoerceEndTags},
#' \code{TidyDoctype}, \code{TidyDropEmptyElems}, \code{TidyDropEmptyParas},
#' \code{TidyFixBackslash}, \code{TidyFixComments}, \code{TidyHideComments},
#' \code{TidyHtmlOut}, \code{TidyIndentContent}, \code{TidyIndentSpaces},
#' \code{TidyJoinClasses}, \code{TidyJoinStyles}, \code{TidyLogicalEmphasis},
#' \code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark},
#' \code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyTabSize},
#' \code{TidyUpperCaseAttrs}, \code{TidyUpperCaseTags}, \code{TidyWord2000},
#' \code{TidyWrapLen}, \code{TidyXhtmlOut}, \code{TidyXmlDecl}, \code{TidyXmlOut},
#' \code{TidyXmlTags}.
#'
#' @param content atomic character or raw vector of content to tidy
#' @param options named list of options
#' @return tidied XML content
#' @references \url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h}
#' (for definitions of the options supported above).
#' @export
tidy_xml <- function(content, options = list(TidyXmlOut = TRUE)) {
  # S3 generic: the method is chosen from the class of `content`.
  UseMethod("tidy_xml")
}
#' @export
#' @rdname tidy_xml
tidy_xml.default <- function(content, options = list(TidyXmlOut = TRUE)) {
  # Fallback method: pass the content unchanged to the compiled tidy routine
  # and return whatever it produces.
  .Call("htmltidy_tidy_html_int", PACKAGE = "htmltidy",
        source = content, options = options)
}
#' @export
#' @rdname tidy_xml
tidy_xml.character <- function(content, options = list(TidyXmlOut = TRUE)) {
  # Character input needs no conversion: pass it to the compiled tidy routine
  # and return its result as-is.
  .Call("htmltidy_tidy_html_int", PACKAGE = "htmltidy",
        source = content, options = options)
}
#' @export
#' @rdname tidy_xml
tidy_xml.raw <- function(content, options = list(TidyXmlOut = TRUE)) {
  # Read the raw vector as a character string and convert it to UTF-8 before
  # handing it to the compiled tidy routine.
  as_text <- iconv(readBin(content, character()), to = "UTF-8")
  tidied <- .Call("htmltidy_tidy_html_int", PACKAGE = "htmltidy",
                  source = as_text, options = options)
  # Return the same type that was passed in: a raw vector.
  charToRaw(tidied)
}
#' @export
#' @rdname tidy_xml
tidy_xml.xml_document <- function(content, options = list(TidyXmlOut = TRUE)) {
  # Convert the document to a string, tidy that string, then parse the tidied
  # text again with xml2.
  as_text <- toString(content)
  tidied <- .Call("htmltidy_tidy_html_int", PACKAGE = "htmltidy",
                  source = as_text, options = options)
  xml2::read_xml(tidied)
}
#' @export
#' @rdname tidy_xml
tidy_xml.XMLInternalDocument <- function(content, options = list(TidyXmlOut = TRUE)) {
  # Serialise with an explicitly namespaced XML::saveXML() so the call does not
  # depend on the XML package being imported wholesale; this matches the
  # XML::xmlParse() call below.
  as_text <- XML::saveXML(content)
  tidied <- .Call("htmltidy_tidy_html_int", PACKAGE = "htmltidy",
                  source = as_text, options = options)
  # Parse the tidied text with the XML package again.
  XML::xmlParse(tidied)
}

2
man/htmltidy.Rd

@ -6,7 +6,7 @@
\alias{htmltidy-package}
\title{Clean Up Gnarly HTML/XML}
\description{
HTML and XML documents can be beautiful and pristine. They can also be
HTML documents can be beautiful and pristine. They can also be
wretched, evil, malformed hellspawn. Now, you can tidy up that HTML and XML before
processing it with your favorite angle-bracket parsing tools.
}

74
man/tidy_html.Rd

@ -23,27 +23,75 @@ tidy_html(content, options = list(TidyXhtmlOut = TRUE))
= TRUE))
}
\arguments{
\item{content}{atomic character or raw vector of content to tidy}
\item{content}{accepts a character vector, raw vector or parsed content from the \code{xml2}
or \code{XML} packages.}
\item{options}{named list of options}
}
\value{
tidied HTML/XHTML content
Tidied HTML/XHTML content. The object type will be the same as that of the input type.
}
\description{
Currently supported options: \code{TidyAltText}, \code{TidyBodyOnly},
\code{TidyBreakBeforeBR}, \code{TidyCoerceEndTags}, \code{TidyCoerceEndTags},
\code{TidyDoctype}, \code{TidyDropEmptyElems}, \code{TidyDropEmptyParas},
\code{TidyFixBackslash}, \code{TidyFixComments}, \code{TidyHideComments},
\code{TidyHtmlOut}, \code{TidyIndentContent}, \code{TidyIndentSpaces},
\code{TidyJoinClasses}, \code{TidyJoinStyles}, \code{TidyLogicalEmphasis},
\code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark},
\code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyTabSize},
\code{TidyUpperCaseAttrs}, \code{TidyUpperCaseTags}, \code{TidyWord2000},
\code{TidyWrapLen}, \code{TidyXhtmlOut}
Pass in HTML content as either plain or raw text or parsed objects (either with the
\code{XML} or \code{xml2} packages) along with an options list that specifies how
the content will be tidied and get back tidied content of the same object type as passed
in to the function.
}
\details{
The default option \code{TidyXhtmlOut} will convert the input content to XHTML.
Currently supported options:
\itemize{
\item{Ones taking a logical value: }{\code{TidyAltText}, \code{TidyBodyOnly}, \code{TidyBreakBeforeBR},
\code{TidyCoerceEndTags}, \code{TidyDropEmptyElems}, \code{TidyDropEmptyParas},
\code{TidyFixBackslash}, \code{TidyFixComments}, \code{TidyGDocClean}, \code{TidyHideComments},
\code{TidyHtmlOut}, \code{TidyIndentContent}, \code{TidyJoinClasses}, \code{TidyJoinStyles},
\code{TidyLogicalEmphasis}, \code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark},
\code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyUpperCaseAttrs},
\code{TidyUpperCaseTags}, \code{TidyWord2000}, \code{TidyXhtmlOut}}
\item{Ones taking a character value: }{\code{TidyDoctype}, \code{TidyInlineTags}, \code{TidyBlockTags},
\code{TidyEmptyTags}, \code{TidyPreTags}}
\item{Ones taking an integer value: }{\code{TidyIndentSpaces}, \code{TidyTabSize}, \code{TidyWrapLen}}
}
File \href{https://github.com/hrbrmstr/htmltidy/issues}{an issue} if there are other \code{libtidy}
options you'd like supported.
It is likely that the most used options will be:
\itemize{
\item{\code{TidyXhtmlOut} (logical)},
\item{\code{TidyHtmlOut} (logical)} and
\item{\code{TidyDoctype} which should be one of "\code{omit}",
"\code{html5}", "\code{auto}", "\code{strict}" or "\code{loose}"}.
}
You can clean up Microsoft Word (2000) and Google Docs HTML via logical settings for
\code{TidyWord2000} and \code{TidyGDocClean}, respectively.
It may also be advantageous to remove all comments with \code{TidyHideComments}.
}
\examples{
opts <- list(
TidyDoctype="html5",
TidyMakeClean=TRUE,
TidyHideComments=TRUE,
TidyIndentContent=TRUE,
TidyWrapLen=200
)
txt <- paste0(
c("<html><head><style>p { color: red; }</style><body><!-- ===== body ====== -->",
"<p>Test</p></body><!--Default Zone --> <!--Default Zone End--></html>"),
collapse="")
cat(tidy_html(txt, options=opts))
}
\references{
\url{http://api.html-tidy.org/tidy/quickref_5.1.25.html} &
\url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h}
(for definitions of the options supported above).
for definitions of the options supported above and \url{https://www.w3.org/People/Raggett/tidy/}
for an explanation of what "tidy" HTML is and some canonical examples of what it can do.
}

50
man/tidy_xml.Rd

@ -1,50 +0,0 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/xml.r
\name{tidy_xml}
\alias{tidy_xml}
\alias{tidy_xml.XMLInternalDocument}
\alias{tidy_xml.character}
\alias{tidy_xml.default}
\alias{tidy_xml.raw}
\alias{tidy_xml.xml_document}
\title{Tidy XML Documents}
\usage{
tidy_xml(content, options = list(TidyXmlOut = TRUE))
\method{tidy_xml}{default}(content, options = list(TidyXmlOut = TRUE))
\method{tidy_xml}{character}(content, options = list(TidyXmlOut = TRUE))
\method{tidy_xml}{raw}(content, options = list(TidyXmlOut = TRUE))
\method{tidy_xml}{xml_document}(content, options = list(TidyXmlOut = TRUE))
\method{tidy_xml}{XMLInternalDocument}(content, options = list(TidyXmlOut =
TRUE))
}
\arguments{
\item{content}{atomic character or raw vector of content to tidy}
\item{options}{named list of options}
}
\value{
tidied XML content
}
\description{
Currently supported options: \code{TidyAltText}, \code{TidyBodyOnly},
\code{TidyBreakBeforeBR}, \code{TidyCoerceEndTags}, \code{TidyCoerceEndTags},
\code{TidyDoctype}, \code{TidyDropEmptyElems}, \code{TidyDropEmptyParas},
\code{TidyFixBackslash}, \code{TidyFixComments}, \code{TidyHideComments},
\code{TidyHtmlOut}, \code{TidyIndentContent}, \code{TidyIndentSpaces},
\code{TidyJoinClasses}, \code{TidyJoinStyles}, \code{TidyLogicalEmphasis},
\code{TidyMakeBare}, \code{TidyMakeClean}, \code{TidyMark},
\code{TidyOmitOptionalTags}, \code{TidyReplaceColor}, \code{TidyTabSize},
\code{TidyUpperCaseAttrs}, \code{TidyUpperCaseTags}, \code{TidyWord2000},
\code{TidyWrapLen}, \code{TidyXhtmlOut}, \code{TidyXmlDecl}, \code{TidyXmlOut},
\code{TidyXmlTags}.
}
\references{
\url{https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h}
(for definitions of the options supported above).
}

10
src/RcppExports.cpp

@ -1,4 +1,4 @@
// Generated by using Rcpp::compileAttributes() -> do not edit by hand
// This file was generated by Rcpp::compileAttributes
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
#include <Rcpp.h>
@ -9,11 +9,11 @@ using namespace Rcpp;
std::string tidy_html_int(std::string source, Rcpp::List options);
RcppExport SEXP htmltidy_tidy_html_int(SEXP sourceSEXP, SEXP optionsSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::RObject __result;
Rcpp::RNGScope __rngScope;
Rcpp::traits::input_parameter< std::string >::type source(sourceSEXP);
Rcpp::traits::input_parameter< Rcpp::List >::type options(optionsSEXP);
rcpp_result_gen = Rcpp::wrap(tidy_html_int(source, options));
return rcpp_result_gen;
__result = Rcpp::wrap(tidy_html_int(source, options));
return __result;
END_RCPP
}

37
src/htmltidy.cpp

@ -3,8 +3,7 @@
#include <tidy.h>
#include <tidybuffio.h>
// NOTE: cannot do "using namespace Rcpp;" b/c of annoying warnings about the
// ambiguity of 'yes'.
// NOTE: cannot do "using namespace Rcpp;" b/c of annoying warnings about the ambiguity of 'yes'.
//[[Rcpp::export]]
std::string tidy_html_int(std::string source, Rcpp::List options) {
@ -111,11 +110,6 @@ std::string tidy_html_int(std::string source, Rcpp::List options) {
if (ok == no) Rcpp::stop("Error setting TidyHTML options");
}
if (options.containsElementNamed("TidyCoerceEndTags")) {
ok = tidyOptSetBool(tdoc, TidyCoerceEndTags, options["TidyCoerceEndTags"] ? yes : no);
if (ok == no) Rcpp::stop("Error setting TidyHTML options");
}
if (options.containsElementNamed("TidyMakeBare")) {
ok = tidyOptSetBool(tdoc, TidyMakeBare, options["TidyMakeBare"] ? yes : no);
if (ok == no) Rcpp::stop("Error setting TidyHTML options");
@ -126,6 +120,16 @@ std::string tidy_html_int(std::string source, Rcpp::List options) {
if (ok == no) Rcpp::stop("Error setting TidyHTML options");
}
if (options.containsElementNamed("TidyGDocClean")) {
ok = tidyOptSetBool(tdoc, TidyGDocClean, options["TidyGDocClean"] ? yes : no);
if (ok == no) Rcpp::stop("Error setting TidyHTML options");
}
if (options.containsElementNamed("TidyWord2000")) {
ok = tidyOptSetBool(tdoc, TidyWord2000, options["TidyWord2000"] ? yes : no);
if (ok == no) Rcpp::stop("Error setting TidyHTML options");
}
if (options.containsElementNamed("TidyDoctype")) {
ok = tidyOptSetValue(tdoc, TidyDoctype, Rcpp::as<std::string>(options["TidyDoctype"]).c_str());
if (ok == no) Rcpp::stop("Error setting TidyHTML options");
@ -136,8 +140,23 @@ std::string tidy_html_int(std::string source, Rcpp::List options) {
if (ok == no) Rcpp::stop("Error setting TidyHTML options");
}
if (options.containsElementNamed("TidyWord2000")) {
ok = tidyOptSetValue(tdoc, TidyWord2000, Rcpp::as<std::string>(options["TidyWord2000"]).c_str());
if (options.containsElementNamed("TidyInlineTags")) {
ok = tidyOptSetValue(tdoc, TidyInlineTags, Rcpp::as<std::string>(options["TidyInlineTags"]).c_str());
if (ok == no) Rcpp::stop("Error setting TidyHTML options");
}
if (options.containsElementNamed("TidyBlockTags")) {
ok = tidyOptSetValue(tdoc, TidyBlockTags, Rcpp::as<std::string>(options["TidyBlockTags"]).c_str());
if (ok == no) Rcpp::stop("Error setting TidyHTML options");
}
if (options.containsElementNamed("TidyPreTags")) {
ok = tidyOptSetValue(tdoc, TidyPreTags, Rcpp::as<std::string>(options["TidyPreTags"]).c_str());
if (ok == no) Rcpp::stop("Error setting TidyHTML options");
}
if (options.containsElementNamed("TidyEmptyTags")) {
ok = tidyOptSetValue(tdoc, TidyEmptyTags, Rcpp::as<std::string>(options["TidyEmptyTags"]).c_str());
if (ok == no) Rcpp::stop("Error setting TidyHTML options");
}

Loading…
Cancel
Save