From 4b1d517c398d7a898ecda427690c1867b77df4de Mon Sep 17 00:00:00 2001 From: hrbrmstr Date: Fri, 9 Sep 2016 18:00:52 -0400 Subject: [PATCH] begin options --- NAMESPACE | 1 + R/RcppExports.R | 8 +-- R/aaa.r | 53 +++++++++++++++++++ R/tidy.r | 68 +++++++++++++++++++++++++ README.Rmd | 4 +- man/tidy_html.Rd | 15 ++++-- man/tidy_options.Rd | 29 +++++++++++ src/RcppExports.cpp | 9 ++-- src/htmltidy.cpp | 143 +++++++++++++++++++++++++++++++++++++++++++++++++--- 9 files changed, 305 insertions(+), 25 deletions(-) create mode 100644 R/aaa.r create mode 100644 R/tidy.r create mode 100644 man/tidy_options.Rd diff --git a/NAMESPACE b/NAMESPACE index 6cc908c..557df98 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand export(tidy_html) +export(tidy_options) importFrom(Rcpp,sourceCpp) useDynLib(htmltidy) diff --git a/R/RcppExports.R b/R/RcppExports.R index 23e0ec4..d17e027 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,11 +1,7 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -#' Tidy HTML/XML -#' -#' @param source length 1 character vetor containing the HTML/XML source to process -#' @export -tidy_html <- function(source) { - .Call('htmltidy_tidy_html', PACKAGE = 'htmltidy', source) +tidy_html_int <- function(source, options) { + .Call('htmltidy_tidy_html_int', PACKAGE = 'htmltidy', source, options) } diff --git a/R/aaa.r b/R/aaa.r new file mode 100644 index 0000000..2daa840 --- /dev/null +++ b/R/aaa.r @@ -0,0 +1,53 @@ +#' @title HTML, XHTML & XML Options for tidy_html +#' @description This dataset contains the options (and their default settings) for +#' tidy_html. They are passed in a named-list to tidy_html +#' +#' \itemize{ +#' \item \code{Option}: Option name +#' \item \code{Type}: Option value type +#' \item \code{Default}: Is it the default for tidy_html? 
+#' } +#' +#' @docType data +#' @keywords datasets +#' @name tidy_options +#' +#' @references The \href{http://api.html-tidy.org/tidy/quickref_5.1.25.html}{ +#' HTML Tidy Options Quick Reference} +#' +#' @export +#' @usage tidy_options +#' @note Last updated 2016-09-09. +#' @format A data frame with 55 rows and 3 variables +NULL + +tidy_options <- structure(list(Option = c("add-xml-decl", "add-xml-space", "alt-text", +"anchor-as-name", "assume-xml-procins", "bare", "clean", "coerce-endtags", +"css-prefix", "decorate-inferred-ul", "doctype", "drop-empty-elements", +"drop-empty-paras", "drop-font-tags", "drop-proprietary-attributes", +"enclose-block-text", "enclose-text", "escape-cdata", "fix-backslash", +"fix-bad-comments", "fix-uri", "gdoc", "hide-comments", "hide-endtags", +"indent-cdata", "input-xml", "join-classes", "join-styles", "literal-attributes", +"logical-emphasis", "lower-literals", "merge-divs", "merge-emphasis", +"merge-spans", "ncr", "new-blocklevel-tags", "new-empty-tags", +"new-inline-tags", "new-pre-tags", "numeric-entities", "omit-optional-tags", +"output-html", "output-xhtml", "output-xml", "preserve-entities", +"quote-ampersand", "quote-marks", "quote-nbsp", "repeated-attributes", +"replace-color", "show-body-only", "skip-nested", "uppercase-attributes", +"uppercase-tags", "word-2000"), Type = c("Boolean", "Boolean", +"String", "Boolean", "Boolean", "Boolean", "Boolean", "Boolean", +"String", "Boolean", "DocType", "Boolean", "Boolean", "Boolean", +"Boolean", "Boolean", "Boolean", "Boolean", "Boolean", "Boolean", +"Boolean", "Boolean", "Boolean", "Boolean", "Boolean", "Boolean", +"Boolean", "Boolean", "Boolean", "Boolean", "Boolean", "AutoBool", +"Boolean", "AutoBool", "Boolean", "Tag names", "Tag names", "Tag names", +"Tag names", "Boolean", "Boolean", "Boolean", "Boolean", "Boolean", +"Boolean", "Boolean", "Boolean", "Boolean", "enum", "Boolean", +"AutoBool", "Boolean", "Boolean", "Boolean", "Boolean"), Default = c("no", +"no", "-", "yes", 
"no", "no", "no", "yes", "-", "no", "auto", +"yes", "yes", "no", "no", "no", "no", "no", "yes", "yes", "yes", +"no", "no", "no", "no", "no", "no", "yes", "no", "no", "yes", +"auto", "yes", "auto", "yes", "-", "-", "-", "-", "no", "no", +"no", "no", "no", "no", "yes", "no", "yes", "keep-last", "no", +"no", "yes", "no", "no", "no")), .Names = c("Option", "Type", +"Default"), row.names = 3:57, class = "data.frame") diff --git a/R/tidy.r b/R/tidy.r new file mode 100644 index 0000000..f6c4531 --- /dev/null +++ b/R/tidy.r @@ -0,0 +1,68 @@ +#' Tidy HTML/XML/XHTML Documents +#' +#' @param content atomic character or raw vector of content to tidy +#' @param options named list of options +#' @return atomic character vector of tidy content +#' @export +tidy_html <- function(content, options=list(TidyXhtmlOut=TRUE)) { + .Call('htmltidy_tidy_html_int', PACKAGE='htmltidy', source=content, options=options) +} + +# +# TidyXmlDecl, /**< Add for XML docs */ +# TidyUpperCaseTags, /**< Output tags in upper not lower case */ +# TidyUpperCaseAttrs, /**< Output attributes in upper not lower case */ +# TidyMakeBare, /**< Make bare HTML: remove Microsoft cruft */ +# TidyMakeClean, /**< Replace presentational clutter by style rules */ +# TidyGDocClean, /**< Clean up HTML exported from Google Docs */ +# TidyLogicalEmphasis, /**< Replace i by em and b by strong */ +# TidyDropPropAttrs, /**< Discard proprietary attributes */ +# TidyDropFontTags, /**< Discard presentation tags */ +# TidyDropEmptyElems, /**< Discard empty elements */ +# TidyDropEmptyParas, /**< Discard empty p elements */ +# TidyFixComments, /**< Fix comments with adjacent hyphens */ +# TidyBreakBeforeBR, /**< Output newline before
<br> or not? */ + # TidyNumEntities, /**< Use numeric entities */ + # TidyQuoteMarks, /**< Output " marks as &quot; */ + # TidyQuoteNbsp, /**< Output non-breaking space as entity */ + # TidyQuoteAmpersand, /**< Output naked ampersand as &amp; */ + # TidyWrapAttVals, /**< Wrap within attribute values */ + # TidyWrapScriptlets, /**< Wrap within JavaScript string literals */ + # TidyWrapSection, /**< Wrap within <![ ... ]> section tags */ + # TidyWrapAsp, /**< Wrap within ASP pseudo elements */ + # TidyWrapJste, /**< Wrap within JSTE pseudo elements */ + # TidyWrapPhp, /**< Wrap within PHP pseudo elements */ + # TidyFixBackslash, /**< Fix URLs by replacing \ with / */ + # TidyIndentAttributes,/**< Newline+indent before each attribute */ + # TidyXmlPIs, /**< If set to yes PIs must end with ?> */ + # TidyXmlSpace, /**< If set to yes adds xml:space attr as needed */ + # TidyEncloseBodyText, /**< If yes text at body is wrapped in P's */ + # TidyEncloseBlockText,/**< If yes text in blocks is wrapped in P's */ + # TidyKeepFileTimes, /**< If yes last modified time is preserved */ + # TidyWord2000, /**< Draconian cleaning for Word2000 */ + # TidyMark, /**< Add meta element indicating tidied doc */ + # TidyEmacs, /**< If true format error output for GNU Emacs */ + # TidyEmacsFile, /**< Name of current Emacs file */ + # TidyLiteralAttribs, /**< If true attributes may use newlines */ + # TidyBodyOnly, /**< Output BODY content only */ + # TidyFixUri, /**< Applies URI encoding if necessary */ + # TidyLowerLiterals, /**< Folds known attribute values to lower case */ + # TidyHideComments, /**< Hides all (real) comments in output */ + # TidyIndentCdata, /**< Indent <!CDATA[ ... ]]> section */ + # TidyForceOutput, /**< Output document even if errors were found */ + # TidyShowErrors, /**< Number of errors to put out */ + # TidyAsciiChars, /**< Convert quotes and dashes to nearest ASCII char */ + # TidyJoinClasses, /**< Join multiple class attributes */ + # TidyJoinStyles, /**< Join multiple style attributes */ + # 
TidyEscapeCdata, /**< Replace <![CDATA[]]> sections with escaped text */ + # TidyIndentSpaces, /**< Indentation n spaces/tabs */ + # TidyWrapLen, /**< Wrap margin */ + # TidyTabSize, /**< Expand tabs to n spaces */ + + + + + + + + diff --git a/README.Rmd b/README.Rmd index deb95d1..a161f23 100644 --- a/README.Rmd +++ b/README.Rmd @@ -25,8 +25,8 @@ It relies on a locally included version of [`libtidy`](http://www.html-tidy.org/ This works enough for me to use in a pinch. It should be straightforward (but tedious) to: -- enable passing options in a `list` -- Getting it to work on Windows. +- enable passing options in a `list` (IN PROGRESS) +- Getting it to work on Windows (UNTESTED) The following functions are implemented: diff --git a/man/tidy_html.Rd b/man/tidy_html.Rd index 7753e5c..6a7e8ca 100644 --- a/man/tidy_html.Rd +++ b/man/tidy_html.Rd @@ -1,15 +1,20 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/RcppExports.R +% Please edit documentation in R/tidy.r \name{tidy_html} \alias{tidy_html} -\title{Tidy HTML/XML} +\title{Tidy HTML/XML/XHTML Documents} \usage{ -tidy_html(source) +tidy_html(content, options = list(TidyXhtmlOut = TRUE)) } \arguments{ -\item{source}{length 1 character vetor containing the HTML/XML source to process} +\item{content}{atomic character or raw vector of content to tidy} + +\item{options}{named list of options} +} +\value{ +atomic character vector of tidy content } \description{ -Tidy HTML/XML +Tidy HTML/XML/XHTML Documents } diff --git a/man/tidy_options.Rd b/man/tidy_options.Rd new file mode 100644 index 0000000..d79e832 --- /dev/null +++ b/man/tidy_options.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/aaa.r +\docType{data} +\name{tidy_options} +\alias{tidy_options} +\title{HTML, XHTML & XML Options for tidy_html} +\format{A data frame with 55 rows and 3 variables} +\usage{ +tidy_options +} +\description{ +This dataset contains the options (and their default 
settings) for +tidy_html. They are passed in a named-list to tidy_html + +\itemize{ + \item \code{Option}: Option name + \item \code{Type}: Option value type + \item \code{Default}: Is it the default for tidy_html? +} +} +\note{ +Last updated 2016-09-09. +} +\references{ +The \href{http://api.html-tidy.org/tidy/quickref_5.1.25.html}{ + HTML Tidy Options Quick Reference} +} +\keyword{datasets} + diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index d02a8bf..37745f7 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -5,14 +5,15 @@ using namespace Rcpp; -// tidy_html -std::string tidy_html(std::string source); -RcppExport SEXP htmltidy_tidy_html(SEXP sourceSEXP) { +// tidy_html_int +std::string tidy_html_int(std::string source, Rcpp::List options); +RcppExport SEXP htmltidy_tidy_html_int(SEXP sourceSEXP, SEXP optionsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< std::string >::type source(sourceSEXP); - rcpp_result_gen = Rcpp::wrap(tidy_html(source)); + Rcpp::traits::input_parameter< Rcpp::List >::type options(optionsSEXP); + rcpp_result_gen = Rcpp::wrap(tidy_html_int(source, options)); return rcpp_result_gen; END_RCPP } diff --git a/src/htmltidy.cpp b/src/htmltidy.cpp index 9e9f14f..18a00cc 100644 --- a/src/htmltidy.cpp +++ b/src/htmltidy.cpp @@ -18,12 +18,8 @@ // NOTE: cannot do "using namespace Rcpp;" b/c of annoying warnings about the // ambiguity of 'yes'. 
-//' Tidy HTML/XML -//' -//' @param source length 1 character vetor containing the HTML/XML source to process -//' @export //[[Rcpp::export]] -std::string tidy_html(std::string source) { +std::string tidy_html_int(std::string source, Rcpp::List options) { TidyBuffer output = {0}; TidyBuffer errbuf = {0}; @@ -32,9 +28,140 @@ std::string tidy_html(std::string source) { TidyDoc tdoc = tidyCreate(); - ok = tidyOptSetBool(tdoc, TidyXhtmlOut, yes); - - if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + if (options.containsElementNamed("TidyXhtmlOut")) { + ok = tidyOptSetBool(tdoc, TidyXhtmlOut, options["TidyXhtmlOut"] ? yes : no); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyXmlOut")) { + ok = tidyOptSetBool(tdoc, TidyXmlOut, options["TidyXmlOut"] ? yes : no); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyHtmlOut")) { + ok = tidyOptSetBool(tdoc, TidyHtmlOut, options["TidyHtmlOut"] ? yes : no); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyXmlTags")) { + ok = tidyOptSetBool(tdoc, TidyXmlTags, options["TidyXmlTags"] ? yes : no); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyOmitOptionalTags")) { + ok = tidyOptSetBool(tdoc, TidyOmitOptionalTags, options["TidyOmitOptionalTags"] ? yes : no); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyXmlDecl")) { + ok = tidyOptSetBool(tdoc, TidyXmlDecl, options["TidyXmlDecl"] ? yes : no); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyBreakBeforeBR")) { + ok = tidyOptSetBool(tdoc, TidyBreakBeforeBR, options["TidyBreakBeforeBR"] ? 
yes : no); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyUpperCaseTags")) { + ok = tidyOptSetBool(tdoc, TidyUpperCaseTags, options["TidyUpperCaseTags"] ? yes : no); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyDropEmptyElems")) { + ok = tidyOptSetBool(tdoc, TidyDropEmptyElems, options["TidyDropEmptyElems"] ? yes : no); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyDropEmptyParas")) { + ok = tidyOptSetBool(tdoc, TidyDropEmptyParas, options["TidyDropEmptyParas"] ? yes : no); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyFixComments")) { + ok = tidyOptSetBool(tdoc, TidyFixComments, options["TidyFixComments"] ? yes : no); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyLogicalEmphasis")) { + ok = tidyOptSetBool(tdoc, TidyLogicalEmphasis, options["TidyLogicalEmphasis"] ? yes : no); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyBodyOnly")) { + ok = tidyOptSetBool(tdoc, TidyBodyOnly, options["TidyBodyOnly"] ? yes : no); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyHideComments")) { + ok = tidyOptSetBool(tdoc, TidyBodyOnly, options["TidyHideComments"] ? yes : no); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyJoinClasses")) { + ok = tidyOptSetBool(tdoc, TidyJoinClasses, options["TidyJoinClasses"] ? yes : no); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyJoinStyles")) { + ok = tidyOptSetBool(tdoc, TidyJoinStyles, options["TidyJoinStyles"] ? 
yes : no); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyFixBackslash")) { + ok = tidyOptSetBool(tdoc, TidyFixBackslash, options["TidyFixBackslash"] ? yes : no); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyMark")) { + ok = tidyOptSetBool(tdoc, TidyMark, options["TidyMark"] ? yes : no); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyReplaceColor")) { + ok = tidyOptSetBool(tdoc, TidyReplaceColor, options["TidyReplaceColor"] ? yes : no); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyIndentContent")) { + ok = tidyOptSetBool(tdoc, TidyIndentContent, options["TidyIndentContent"] ? yes : no); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyCSSPrefix")) { + ok = tidyOptSetValue(tdoc, TidyFixBackslash, Rcpp::as(options["TidyFixBackslash"]).c_str()); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyDoctype")) { + ok = tidyOptSetValue(tdoc, TidyDoctype, Rcpp::as(options["TidyDoctype"]).c_str()); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyAltText")) { + ok = tidyOptSetValue(tdoc, TidyAltText, Rcpp::as(options["TidyAltText"]).c_str()); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyWord2000")) { + ok = tidyOptSetValue(tdoc, TidyWord2000, Rcpp::as(options["TidyWord2000"]).c_str()); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyIndentSpaces")) { + ok = tidyOptSetInt(tdoc, TidyIndentSpaces, Rcpp::as(options["TidyIndentSpaces"])); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if 
(options.containsElementNamed("TidyWrapLen")) { + ok = tidyOptSetInt(tdoc, TidyWrapLen, Rcpp::as(options["TidyWrapLen"])); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } + + if (options.containsElementNamed("TidyTabSize")) { + ok = tidyOptSetInt(tdoc, TidyTabSize, Rcpp::as(options["TidyTabSize"])); + if (ok == no) Rcpp::stop("Error setting TidyHTML options"); + } rc = tidySetErrorBuffer(tdoc, &errbuf);