You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
284 lines
15 KiB
284 lines
15 KiB
<!-- Generated by pkgdown: do not edit by hand -->
|
|
<!DOCTYPE html>
|
|
<html lang="en">
|
|
<head>
|
|
<meta charset="utf-8">
|
|
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
|
|
<title>tidy_html.response. htmltidy</title>
|
|
|
|
<!-- jquery -->
|
|
<script src="https://code.jquery.com/jquery-3.1.0.min.js" integrity="sha384-nrOSfDHtoPMzJHjVTdCopGqIqeYETSXhZDFyniQ8ZHcVy08QesyHcnOUpMpqnmWq" crossorigin="anonymous"></script>
|
|
|
|
<!-- Bootstrap -->
|
|
|
|
<link href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous">
|
|
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha384-Tc5IQib027qvyjSMfHjOMaLkfuWVxZxUPnCJA7l2mCWNIpG9mGCD8wGNIcPD7Txa" crossorigin="anonymous"></script>
|
|
|
|
<!-- Font Awesome icons -->
|
|
<link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.3/css/font-awesome.min.css" rel="stylesheet" integrity="sha384-T8Gy5hrqNKT+hzMclPo118YTQO6cYprQmhrYwIiQ/3axmI1hQomh7Ud2hPOy8SP1" crossorigin="anonymous">
|
|
|
|
|
|
<!-- pkgdown -->
|
|
<link href="../pkgdown.css" rel="stylesheet">
|
|
<script src="../pkgdown.js"></script>
|
|
|
|
<!-- mathjax -->
|
|
<script src='https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-AMS-MML_HTMLorMML'></script>
|
|
|
|
<!--[if lt IE 9]>
|
|
<script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
|
|
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
|
|
<![endif]-->
|
|
</head>
|
|
|
|
<body>
|
|
<div class="container">
|
|
<header>
|
|
|
|
<div class="navbar navbar-default navbar-fixed-top" role="navigation">
|
|
<div class="container">
|
|
<div class="navbar-header">
|
|
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-label="Toggle navigation">
|
|
<span class="icon-bar"></span>
|
|
<span class="icon-bar"></span>
|
|
<span class="icon-bar"></span>
|
|
</button>
|
|
<a class="navbar-brand" href="../index.html">htmltidy</a>
|
|
</div>
|
|
<div id="navbar" class="navbar-collapse collapse">
|
|
<ul class="nav navbar-nav">
|
|
<li>
|
|
<a href="../index.html">Home</a>
|
|
</li>
|
|
<li>
|
|
<a href="../reference/index.html">Reference</a>
|
|
</li>
|
|
<li>
|
|
<a href="../news/index.html">News</a>
|
|
</li>
|
|
</ul>
|
|
<ul class="nav navbar-nav navbar-right">
|
|
<li>
|
|
<a href="https://github.com/hrbrmstr/htmltidy" aria-label="htmltidy on GitHub">
|
|
<span class="fa fa-github fa-lg"></span>
|
|
|
|
</a>
|
|
</li>
|
|
</ul>
|
|
</div><!--/.nav-collapse -->
|
|
</div><!--/.container -->
|
|
</div><!--/.navbar -->
|
|
|
|
</header>
|
|
|
|
<div class="page-header">
|
|
<h1>Tidy or "Pretty Print" HTML/XHTML Documents</h1>
|
|
</div>
|
|
|
|
<div class="row">
|
|
<div class="col-md-9">
|
|
|
|
<p>Pass in HTML content as either plain or raw text or parsed objects (either with the
|
|
<code>XML</code> or <code>xml2</code> packages) or as an <code>httr</code> <code>response</code> object
|
|
along with an options list that specifies how the content will be tidied and get back
|
|
tidied content of the same object type as passed in to the function.</p>
|
|
|
|
|
|
<pre><span class='co'># S3 method for response</span>
|
|
<span class='fu'>tidy_html</span>(<span class='no'>content</span>, <span class='kw'>options</span> <span class='kw'>=</span> <span class='fu'>list</span>(<span class='kw'>TidyXhtmlOut</span> <span class='kw'>=</span> <span class='fl'>TRUE</span>),
|
|
<span class='kw'>verbose</span> <span class='kw'>=</span> <span class='fl'>FALSE</span>)
|
|
|
|
<span class='fu'>tidy_html</span>(<span class='no'>content</span>, <span class='kw'>options</span> <span class='kw'>=</span> <span class='fu'>list</span>(<span class='kw'>TidyXhtmlOut</span> <span class='kw'>=</span> <span class='fl'>TRUE</span>), <span class='kw'>verbose</span> <span class='kw'>=</span> <span class='fl'>FALSE</span>)
|
|
|
|
<span class='co'># S3 method for default</span>
|
|
<span class='fu'>tidy_html</span>(<span class='no'>content</span>, <span class='kw'>options</span> <span class='kw'>=</span> <span class='fu'>list</span>(<span class='kw'>TidyXhtmlOut</span> <span class='kw'>=</span> <span class='fl'>TRUE</span>),
|
|
<span class='kw'>verbose</span> <span class='kw'>=</span> <span class='fl'>FALSE</span>)
|
|
|
|
<span class='co'># S3 method for character</span>
|
|
<span class='fu'>tidy_html</span>(<span class='no'>content</span>, <span class='kw'>options</span> <span class='kw'>=</span> <span class='fu'>list</span>(<span class='kw'>TidyXhtmlOut</span> <span class='kw'>=</span> <span class='fl'>TRUE</span>),
|
|
<span class='kw'>verbose</span> <span class='kw'>=</span> <span class='fl'>FALSE</span>)
|
|
|
|
<span class='co'># S3 method for raw</span>
|
|
<span class='fu'>tidy_html</span>(<span class='no'>content</span>, <span class='kw'>options</span> <span class='kw'>=</span> <span class='fu'>list</span>(<span class='kw'>TidyXhtmlOut</span> <span class='kw'>=</span> <span class='fl'>TRUE</span>),
|
|
<span class='kw'>verbose</span> <span class='kw'>=</span> <span class='fl'>FALSE</span>)
|
|
|
|
<span class='co'># S3 method for xml_document</span>
|
|
<span class='fu'>tidy_html</span>(<span class='no'>content</span>, <span class='kw'>options</span> <span class='kw'>=</span> <span class='fu'>list</span>(<span class='kw'>TidyXhtmlOut</span> <span class='kw'>=</span> <span class='fl'>TRUE</span>),
|
|
<span class='kw'>verbose</span> <span class='kw'>=</span> <span class='fl'>FALSE</span>)
|
|
|
|
<span class='co'># S3 method for HTMLInternalDocument</span>
|
|
<span class='fu'>tidy_html</span>(<span class='no'>content</span>, <span class='kw'>options</span> <span class='kw'>=</span> <span class='fu'>list</span>(<span class='kw'>TidyXhtmlOut</span>
|
|
<span class='kw'>=</span> <span class='fl'>TRUE</span>), <span class='kw'>verbose</span> <span class='kw'>=</span> <span class='fl'>FALSE</span>)
|
|
|
|
<span class='co'># S3 method for connection</span>
|
|
<span class='fu'>tidy_html</span>(<span class='no'>content</span>, <span class='kw'>options</span> <span class='kw'>=</span> <span class='fu'>list</span>(<span class='kw'>TidyXhtmlOut</span> <span class='kw'>=</span> <span class='fl'>TRUE</span>),
|
|
<span class='kw'>verbose</span> <span class='kw'>=</span> <span class='fl'>FALSE</span>)</pre>
|
|
|
|
<h2>Arguments</h2>
|
|
<dl class="dl-horizontal">
|
|
<dt>content</dt>
|
|
<dd>accepts a character vector, raw vector or parsed content from the <code>xml2</code>
|
|
or <code>XML</code> packages.</dd>
|
|
<dt>options</dt>
|
|
<dd>named list of options</dd>
|
|
<dt>verbose</dt>
|
|
<dd>output document errors? (default: <code>FALSE</code>)</dd>
|
|
</dl>
|
|
|
|
<div class="Value">
|
|
<h2>Value</h2>
|
|
|
|
<p>Tidied HTML/XHTML content. The object type will be the same as that of the input type
|
|
except when it is a <code>connection</code>, then a character vector will be returned.</p>
|
|
</div>
|
|
|
|
<div class="Details">
|
|
<h2>Details</h2>
|
|
|
|
<p>The default option <code>TidyXhtmlOut</code> will convert the input content to XHTML.</p>
|
|
<p>Currently supported options:</p>
|
|
<ul>
|
|
<li>Ones taking a logical value: <code>TidyAltText</code>, <code>TidyBodyOnly</code>, <code>TidyBreakBeforeBR</code>,
|
|
<code>TidyCoerceEndTags</code>, <code>TidyDropEmptyElems</code>, <code>TidyDropEmptyParas</code>,
|
|
<code>TidyFixBackslash</code>, <code>TidyFixComments</code>, <code>TidyGDocClean</code>, <code>TidyHideComments</code>,
|
|
<code>TidyHtmlOut</code>, <code>TidyIndentContent</code>, <code>TidyJoinClasses</code>, <code>TidyJoinStyles</code>,
|
|
<code>TidyLogicalEmphasis</code>, <code>TidyMakeBare</code>, <code>TidyMakeClean</code>, <code>TidyMark</code>,
|
|
<code>TidyOmitOptionalTags</code>, <code>TidyReplaceColor</code>, <code>TidyUpperCaseAttrs</code>,
|
|
<code>TidyUpperCaseTags</code>, <code>TidyWord2000</code>, <code>TidyXhtmlOut</code>
|
|
</li>
|
|
<li>Ones taking a character value: <code>TidyDoctype</code>, <code>TidyInlineTags</code>, <code>TidyBlockTags</code>,
|
|
<code>TidyEmptyTags</code>, <code>TidyPreTags</code>
|
|
</li>
|
|
<li>Ones taking an integer value: <code>TidyIndentSpaces</code>, <code>TidyTabSize</code>, <code>TidyWrapLen</code>
|
|
</li>
|
|
</ul>
|
|
<p>File <a href="https://github.com/hrbrmstr/htmltidy/issues">an issue</a> if there are other <code>libtidy</code>
|
|
options you'd like supported.</p>
|
|
<p>It is likely that the most used options will be:</p>
|
|
<ul>
|
|
<li><code>TidyXhtmlOut</code> (logical),
|
|
</li>
|
|
<li><code>TidyHtmlOut</code> (logical) and
|
|
</li>
|
|
<li><code>TidyDocType</code> which should be one of "<code>omit</code>",
|
|
"<code>html5</code>", "<code>auto</code>", "<code>strict</code>" or "<code>loose</code>".
|
|
</li>
|
|
</ul>
|
|
<p>You can clean up Microsoft Word (2000) and Google Docs HTML via logical settings for
|
|
<code>TidyWord2000</code> and <code>TidyGDocClean</code>, respectively.</p>
|
|
<p>It may also be advantageous to remove all comments with <code>TidyHideComments</code>.</p>
|
|
</div>
|
|
|
|
<div class="Note">
|
|
<h2>Note</h2>
|
|
|
|
<p>If document parsing errors are severe enough, <code>tidy_html()</code> will not be able
|
|
to clean the document and will display the errors (this output can be captured with
|
|
<code>sink()</code> or <code>capture.output()</code>) along with a warning and return a "best effort"
|
|
cleaned version of the document.</p>
|
|
</div>
|
|
|
|
<div class="References">
|
|
<h2>References</h2>
|
|
|
|
<p><a href = 'http://api.html-tidy.org/tidy/quickref_5.1.25.html'>http://api.html-tidy.org/tidy/quickref_5.1.25.html</a> &amp;
|
|
<a href = 'https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h'>https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h</a>
|
|
for definitions of the options supported above and <a href = 'https://www.w3.org/People/Raggett/tidy/'>https://www.w3.org/People/Raggett/tidy/</a>
|
|
for an explanation of what "tidy" HTML is and some canonical examples of what it can do.</p>
|
|
</div>
|
|
|
|
<h2 id="examples">Examples</h2>
|
|
<pre class="examples"><div class='input'><span class='no'>opts</span> <span class='kw'>&lt;-</span> <span class='fu'>list</span>(
|
|
<span class='kw'>TidyDocType</span><span class='kw'>=</span><span class='st'>"html5"</span>,
|
|
<span class='kw'>TidyMakeClean</span><span class='kw'>=</span><span class='fl'>TRUE</span>,
|
|
<span class='kw'>TidyHideComments</span><span class='kw'>=</span><span class='fl'>TRUE</span>,
|
|
<span class='kw'>TidyIndentContent</span><span class='kw'>=</span><span class='fl'>TRUE</span>,
|
|
<span class='kw'>TidyWrapLen</span><span class='kw'>=</span><span class='fl'>200</span>
|
|
)
|
|
|
|
<span class='no'>txt</span> <span class='kw'>&lt;-</span> <span class='fu'>paste0</span>(
|
|
<span class='fu'>c</span>(<span class='st'>"&lt;html&gt;&lt;head&gt;&lt;style&gt;p { color: red; }&lt;/style&gt;&lt;body&gt;&lt;!-- ===== body ====== --&gt;"</span>,
|
|
<span class='st'>"&lt;p&gt;Test&lt;/p&gt;&lt;/body&gt;&lt;!--Default Zone --&gt; &lt;!--Default Zone End--&gt;&lt;/html&gt;"</span>),
|
|
<span class='kw'>collapse</span><span class='kw'>=</span><span class='st'>""</span>)
|
|
|
|
<span class='fu'>cat</span>(<span class='fu'>tidy_html</span>(<span class='no'>txt</span>, <span class='kw'>option</span><span class='kw'>=</span><span class='no'>opts</span>))</div><div class='output co'>#&gt; &lt;!DOCTYPE html&gt;
|
|
#&gt; &lt;html&gt;
|
|
#&gt; &lt;head&gt;
|
|
#&gt; &lt;meta name="generator" content="HTML Tidy for HTML5 for R version 5.0.0"&gt;
|
|
#&gt; &lt;style&gt;
|
|
#&gt; p { color: red; }
|
|
#&gt; &lt;/style&gt;
|
|
#&gt; &lt;title&gt;&lt;/title&gt;
|
|
#&gt; &lt;/head&gt;
|
|
#&gt; &lt;body&gt;
|
|
#&gt; &lt;p&gt;
|
|
#&gt; Test
|
|
#&gt; &lt;/p&gt;
|
|
#&gt; &lt;/body&gt;
|
|
#&gt; &lt;/html&gt;
|
|
#&gt; </div><div class='input'>
|
|
<span class='fu'>library</span>(<span class='no'>httr</span>)
|
|
<span class='no'>res</span> <span class='kw'>&lt;-</span> <span class='fu'>GET</span>(<span class='st'>"http://rud.is/test/untidy.html"</span>)
|
|
|
|
<span class='co'># look at the original, un-tidy source</span>
|
|
<span class='fu'>cat</span>(<span class='fu'>content</span>(<span class='no'>res</span>, <span class='kw'>as</span><span class='kw'>=</span><span class='st'>"text"</span>, <span class='kw'>encoding</span><span class='kw'>=</span><span class='st'>"UTF-8"</span>))</div><div class='output co'>#&gt; &lt;head&gt;
|
|
#&gt; &lt;style&gt;
|
|
#&gt; body { font-family: sans-serif; }
|
|
#&gt; &lt;/style&gt;
|
|
#&gt; &lt;/head&gt;
|
|
#&gt; &lt;body&gt;
|
|
#&gt; &lt;b&gt;This is &lt;b&gt;some &lt;i&gt;really &lt;/i&gt; poorly formatted HTML&lt;/b&gt;
|
|
#&gt;
|
|
#&gt; as is this &lt;span id="sp"&gt;portion&lt;div&gt;
|
|
#&gt; </div><div class='input'>
|
|
<span class='co'># see the tidied version</span>
|
|
<span class='fu'>cat</span>(<span class='fu'>tidy_html</span>(<span class='fu'>content</span>(<span class='no'>res</span>, <span class='kw'>as</span><span class='kw'>=</span><span class='st'>"text"</span>, <span class='kw'>encoding</span><span class='kw'>=</span><span class='st'>"UTF-8"</span>),
|
|
<span class='fu'>list</span>(<span class='kw'>TidyDocType</span><span class='kw'>=</span><span class='st'>"html5"</span>, <span class='kw'>TidyWrapLen</span><span class='kw'>=</span><span class='fl'>200</span>)))</div><div class='output co'>#&gt; &lt;!DOCTYPE html&gt;
|
|
#&gt; &lt;html&gt;
|
|
#&gt; &lt;head&gt;
|
|
#&gt; &lt;meta name="generator" content="HTML Tidy for HTML5 for R version 5.0.0"&gt;
|
|
#&gt; &lt;style&gt;
|
|
#&gt; body { font-family: sans-serif; }
|
|
#&gt; &lt;/style&gt;
|
|
#&gt; &lt;title&gt;&lt;/title&gt;
|
|
#&gt; &lt;/head&gt;
|
|
#&gt; &lt;body&gt;
|
|
#&gt; &lt;b&gt;This is some &lt;i&gt;really&lt;/i&gt; poorly formatted HTML as is this &lt;span id="sp"&gt;portion&lt;/span&gt;&lt;/b&gt;
|
|
#&gt; &lt;div&gt;&lt;span id="sp"&gt;&lt;/span&gt;&lt;/div&gt;
|
|
#&gt; &lt;/body&gt;
|
|
#&gt; &lt;/html&gt;
|
|
#&gt; </div><div class='input'>
|
|
<span class='co'># but, you could also just do:</span>
|
|
<span class='fu'>cat</span>(<span class='fu'>tidy_html</span>(<span class='fu'>url</span>(<span class='st'>"http://rud.is/test/untidy.html"</span>)))</div><div class='output co'>#&gt; &lt;!DOCTYPE html&gt;
|
|
#&gt; &lt;html xmlns="http://www.w3.org/1999/xhtml"&gt;
|
|
#&gt; &lt;head&gt;
|
|
#&gt; &lt;meta name="generator" content=
|
|
#&gt; "HTML Tidy for HTML5 for R version 5.0.0" /&gt;
|
|
#&gt; &lt;style&gt;
|
|
#&gt; &lt;![CDATA[
|
|
#&gt; body { font-family: sans-serif; }
|
|
#&gt; ]]&gt;
|
|
#&gt; &lt;/style&gt;
|
|
#&gt; &lt;title&gt;&lt;/title&gt;
|
|
#&gt; &lt;/head&gt;
|
|
#&gt; &lt;body&gt;
|
|
#&gt; &lt;b&gt;This is some &lt;i&gt;really&lt;/i&gt; poorly formatted HTMLas is this
|
|
#&gt; &lt;span id="sp"&gt;portion&lt;/span&gt;&lt;/b&gt;
|
|
#&gt; &lt;div&gt;&lt;span id="sp"&gt;&lt;/span&gt;&lt;/div&gt;
|
|
#&gt; &lt;/body&gt;
|
|
#&gt; &lt;/html&gt;
|
|
#&gt; </div></pre>
|
|
</div>
|
|
<div class="col-md-3">
|
|
</div>
|
|
</div>
|
|
|
|
<footer>
|
|
<p>Built by <a href="http://hadley.github.io/pkgdown/">pkgdown</a>. Styled with <a href="http://getbootstrap.com">Bootstrap 3</a>.</p>
|
|
</footer>
|
|
</div>
|
|
|
|
</body>
|
|
</html>
|
|
|