You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

285 lines
15 KiB

6 years ago
<!-- Generated by pkgdown: do not edit by hand -->
<!DOCTYPE html>
<html lang="en">
<head>
  <!-- Page metadata plus the third-party and package assets this reference page loads. -->
  <meta charset="utf-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>tidy_html.response. htmltidy</title>
  <!-- jquery (listed before the Bootstrap script so it loads first) -->
  <script src="https://code.jquery.com/jquery-3.1.0.min.js" integrity="sha384-nrOSfDHtoPMzJHjVTdCopGqIqeYETSXhZDFyniQ8ZHcVy08QesyHcnOUpMpqnmWq" crossorigin="anonymous"></script>
  <!-- Bootstrap -->
  <link href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous">
  <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha384-Tc5IQib027qvyjSMfHjOMaLkfuWVxZxUPnCJA7l2mCWNIpG9mGCD8wGNIcPD7Txa" crossorigin="anonymous"></script>
  <!-- Font Awesome icons -->
  <link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.3/css/font-awesome.min.css" rel="stylesheet" integrity="sha384-T8Gy5hrqNKT+hzMclPo118YTQO6cYprQmhrYwIiQ/3axmI1hQomh7Ud2hPOy8SP1" crossorigin="anonymous">
  <!-- pkgdown: site-local stylesheet and script, resolved relative to this page -->
  <link href="../pkgdown.css" rel="stylesheet">
  <script src="../pkgdown.js"></script>
  <!-- mathjax: the cdn.mathjax.org host was retired in 2017, so the same 2.x
       entry point and TeX-AMS-MML_HTMLorMML config are loaded from cdnjs instead -->
  <script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
  <!-- Conditional comment: only Internet Explorer older than 9 reads the two scripts below -->
  <!--[if lt IE 9]>
  <script src="https://oss.maxcdn.com/html5shiv/3.7.3/html5shiv.min.js"></script>
  <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
  <![endif]-->
</head>
<body>
<div class="container">
<header>
  <!-- Fixed-top Bootstrap navbar: brand link, collapsible site links, and a link to the GitHub repository. -->
  <div class="navbar navbar-default navbar-fixed-top" role="navigation">
    <div class="container">
      <div class="navbar-header">
        <!-- Collapse toggle for #navbar. The sr-only text gives the icon-only button an
             accessible name; aria-expanded/aria-controls expose its state and target. -->
        <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false" aria-controls="navbar">
          <span class="sr-only">Toggle navigation</span>
          <span class="icon-bar"></span>
          <span class="icon-bar"></span>
          <span class="icon-bar"></span>
        </button>
        <a class="navbar-brand" href="../index.html">htmltidy</a>
      </div>
      <div id="navbar" class="navbar-collapse collapse">
        <ul class="nav navbar-nav">
          <li>
            <a href="../index.html">Home</a>
          </li>
          <li>
            <a href="../reference/index.html">Reference</a>
          </li>
          <li>
            <a href="../news/index.html">News</a>
          </li>
        </ul>
        <ul class="nav navbar-nav navbar-right">
          <li>
            <a href="https://github.com/hrbrmstr/htmltidy">
              <!-- The icon is decorative; the sr-only text supplies the link's name. -->
              <span class="fa fa-github fa-lg" aria-hidden="true"></span>
              <span class="sr-only">htmltidy on GitHub</span>
            </a>
          </li>
        </ul>
      </div><!--/.nav-collapse -->
    </div><!--/.container -->
  </div><!--/.navbar -->
</header>
<div class="page-header">
<h1>Tidy or &quot;Pretty Print&quot; HTML/XHTML Documents</h1>
</div>
<div class="row">
<div class="col-md-9">
<p>Pass in HTML content as either plain or raw text or parsed objects (either with the
<code>XML</code> or <code>xml2</code> packages) or as an <code>httr</code> <code>response</code> object
along with an options list that specifies how the content will be tidied and get back
tidied content of the same object type as passed in to the function.</p>
<pre><span class='co'># S3 method for response</span>
<span class='fu'>tidy_html</span>(<span class='no'>content</span>, <span class='kw'>options</span> <span class='kw'>=</span> <span class='fu'>list</span>(<span class='kw'>TidyXhtmlOut</span> <span class='kw'>=</span> <span class='fl'>TRUE</span>),
<span class='kw'>verbose</span> <span class='kw'>=</span> <span class='fl'>FALSE</span>)
<span class='fu'>tidy_html</span>(<span class='no'>content</span>, <span class='kw'>options</span> <span class='kw'>=</span> <span class='fu'>list</span>(<span class='kw'>TidyXhtmlOut</span> <span class='kw'>=</span> <span class='fl'>TRUE</span>), <span class='kw'>verbose</span> <span class='kw'>=</span> <span class='fl'>FALSE</span>)
<span class='co'># S3 method for default</span>
<span class='fu'>tidy_html</span>(<span class='no'>content</span>, <span class='kw'>options</span> <span class='kw'>=</span> <span class='fu'>list</span>(<span class='kw'>TidyXhtmlOut</span> <span class='kw'>=</span> <span class='fl'>TRUE</span>),
<span class='kw'>verbose</span> <span class='kw'>=</span> <span class='fl'>FALSE</span>)
<span class='co'># S3 method for character</span>
<span class='fu'>tidy_html</span>(<span class='no'>content</span>, <span class='kw'>options</span> <span class='kw'>=</span> <span class='fu'>list</span>(<span class='kw'>TidyXhtmlOut</span> <span class='kw'>=</span> <span class='fl'>TRUE</span>),
<span class='kw'>verbose</span> <span class='kw'>=</span> <span class='fl'>FALSE</span>)
<span class='co'># S3 method for raw</span>
<span class='fu'>tidy_html</span>(<span class='no'>content</span>, <span class='kw'>options</span> <span class='kw'>=</span> <span class='fu'>list</span>(<span class='kw'>TidyXhtmlOut</span> <span class='kw'>=</span> <span class='fl'>TRUE</span>),
<span class='kw'>verbose</span> <span class='kw'>=</span> <span class='fl'>FALSE</span>)
<span class='co'># S3 method for xml_document</span>
<span class='fu'>tidy_html</span>(<span class='no'>content</span>, <span class='kw'>options</span> <span class='kw'>=</span> <span class='fu'>list</span>(<span class='kw'>TidyXhtmlOut</span> <span class='kw'>=</span> <span class='fl'>TRUE</span>),
<span class='kw'>verbose</span> <span class='kw'>=</span> <span class='fl'>FALSE</span>)
<span class='co'># S3 method for HTMLInternalDocument</span>
<span class='fu'>tidy_html</span>(<span class='no'>content</span>, <span class='kw'>options</span> <span class='kw'>=</span> <span class='fu'>list</span>(<span class='kw'>TidyXhtmlOut</span>
<span class='kw'>=</span> <span class='fl'>TRUE</span>), <span class='kw'>verbose</span> <span class='kw'>=</span> <span class='fl'>FALSE</span>)
<span class='co'># S3 method for connection</span>
<span class='fu'>tidy_html</span>(<span class='no'>content</span>, <span class='kw'>options</span> <span class='kw'>=</span> <span class='fu'>list</span>(<span class='kw'>TidyXhtmlOut</span> <span class='kw'>=</span> <span class='fl'>TRUE</span>),
<span class='kw'>verbose</span> <span class='kw'>=</span> <span class='fl'>FALSE</span>)</pre>
<h2>Arguments</h2>
<dl class="dl-horizontal">
<dt>content</dt>
<dd>accepts a character vector, raw vector, parsed content from the <code>xml2</code>
or <code>XML</code> packages, an <code>httr</code> <code>response</code> object or a <code>connection</code>.</dd>
<dt>options</dt>
<dd>named list of options</dd>
<dt>verbose</dt>
<dd>output document errors? (default: <code>FALSE</code>)</dd>
</dl>
<div class="Value">
<h2>Value</h2>
<p>Tidied HTML/XHTML content. The object type will be the same as that of the input type
except when it is a <code>connection</code>, then a character vector will be returned.</p>
</div>
<div class="Details">
<h2>Details</h2>
<p>The default option <code>TidyXhtmlOut</code> will convert the input content to XHTML.</p>
<p>Currently supported options:</p>
<ul>
<li>Ones taking a logical value: <code>TidyAltText</code>, <code>TidyBodyOnly</code>, <code>TidyBreakBeforeBR</code>,
<code>TidyCoerceEndTags</code>, <code>TidyDropEmptyElems</code>, <code>TidyDropEmptyParas</code>,
<code>TidyFixBackslash</code>, <code>TidyFixComments</code>, <code>TidyGDocClean</code>, <code>TidyHideComments</code>,
<code>TidyHtmlOut</code>, <code>TidyIndentContent</code>, <code>TidyJoinClasses</code>, <code>TidyJoinStyles</code>,
<code>TidyLogicalEmphasis</code>, <code>TidyMakeBare</code>, <code>TidyMakeClean</code>, <code>TidyMark</code>,
<code>TidyOmitOptionalTags</code>, <code>TidyReplaceColor</code>, <code>TidyUpperCaseAttrs</code>,
<code>TidyUpperCaseTags</code>, <code>TidyWord2000</code>, <code>TidyXhtmlOut</code>
</li>
<li>Ones taking a character value: <code>TidyDoctype</code>, <code>TidyInlineTags</code>, <code>TidyBlockTags</code>,
<code>TidyEmptyTags</code>, <code>TidyPreTags</code>
</li>
<li>Ones taking an integer value: <code>TidyIndentSpaces</code>, <code>TidyTabSize</code>, <code>TidyWrapLen</code>
</li>
</ul>
<p>File <a href="https://github.com/hrbrmstr/htmltidy/issues">an issue</a> if there are other <code>libtidy</code>
options you&#39;d like supported.</p>
<p>It is likely that the most used options will be:</p>
<ul>
<li><code>TidyXhtmlOut</code> (logical),
</li>
<li><code>TidyHtmlOut</code> (logical) and
</li>
<li><code>TidyDocType</code> which should be one of &quot;<code>omit</code>&quot;,
&quot;<code>html5</code>&quot;, &quot;<code>auto</code>&quot;, &quot;<code>strict</code>&quot; or &quot;<code>loose</code>&quot;.
</li>
</ul>
<p>You can clean up Microsoft Word (2000) and Google Docs HTML via logical settings for
<code>TidyWord2000</code> and <code>TidyGDocClean</code>, respectively.</p>
<p>It may also be advantageous to remove all comments with <code>TidyHideComments</code>.</p>
</div>
<div class="Note">
<h2>Note</h2>
<p>If document parsing errors are severe enough, <code>tidy_html()</code> will not be able
to clean the document and will display the errors (this output can be captured with
<code>sink()</code> or <code>capture.output()</code>) along with a warning and return a &quot;best effort&quot;
cleaned version of the document.</p>
</div>
<div class="References">
<h2>References</h2>
<p><a href = 'http://api.html-tidy.org/tidy/quickref_5.1.25.html'>http://api.html-tidy.org/tidy/quickref_5.1.25.html</a> &amp;
<a href = 'https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h'>https://github.com/htacg/tidy-html5/blob/master/include/tidyenum.h</a>
for definitions of the options supported above and <a href = 'https://www.w3.org/People/Raggett/tidy/'>https://www.w3.org/People/Raggett/tidy/</a>
for an explanation of what &quot;tidy&quot; HTML is and some canonical examples of what it can do.</p>
</div>
<h2 id="examples">Examples</h2>
<pre class="examples"><div class='input'><span class='no'>opts</span> <span class='kw'>&lt;-</span> <span class='fu'>list</span>(
<span class='kw'>TidyDocType</span><span class='kw'>=</span><span class='st'>"html5"</span>,
<span class='kw'>TidyMakeClean</span><span class='kw'>=</span><span class='fl'>TRUE</span>,
<span class='kw'>TidyHideComments</span><span class='kw'>=</span><span class='fl'>TRUE</span>,
<span class='kw'>TidyIndentContent</span><span class='kw'>=</span><span class='fl'>TRUE</span>,
<span class='kw'>TidyWrapLen</span><span class='kw'>=</span><span class='fl'>200</span>
)
<span class='no'>txt</span> <span class='kw'>&lt;-</span> <span class='fu'>paste0</span>(
<span class='fu'>c</span>(<span class='st'>"&lt;html&gt;&lt;head&gt;&lt;style&gt;p { color: red; }&lt;/style&gt;&lt;body&gt;&lt;!-- ===== body ====== --&gt;"</span>,
<span class='st'>"&lt;p&gt;Test&lt;/p&gt;&lt;/body&gt;&lt;!--Default Zone --&gt; &lt;!--Default Zone End--&gt;&lt;/html&gt;"</span>),
<span class='kw'>collapse</span><span class='kw'>=</span><span class='st'>""</span>)
<span class='fu'>cat</span>(<span class='fu'>tidy_html</span>(<span class='no'>txt</span>, <span class='kw'>option</span><span class='kw'>=</span><span class='no'>opts</span>))</div><div class='output co'>#&gt; &lt;!DOCTYPE html&gt;
#&gt; &lt;html&gt;
#&gt; &lt;head&gt;
#&gt; &lt;meta name=&quot;generator&quot; content=&quot;HTML Tidy for HTML5 for R version 5.0.0&quot;&gt;
#&gt; &lt;style&gt;
#&gt; p { color: red; }
#&gt; &lt;/style&gt;
#&gt; &lt;title&gt;&lt;/title&gt;
#&gt; &lt;/head&gt;
#&gt; &lt;body&gt;
#&gt; &lt;p&gt;
#&gt; Test
#&gt; &lt;/p&gt;
#&gt; &lt;/body&gt;
#&gt; &lt;/html&gt;
#&gt; </div><div class='input'>
<span class='fu'>library</span>(<span class='no'>httr</span>)
<span class='no'>res</span> <span class='kw'>&lt;-</span> <span class='fu'>GET</span>(<span class='st'>"http://rud.is/test/untidy.html"</span>)
<span class='co'># look at the original, un-tidy source</span>
<span class='fu'>cat</span>(<span class='fu'>content</span>(<span class='no'>res</span>, <span class='kw'>as</span><span class='kw'>=</span><span class='st'>"text"</span>, <span class='kw'>encoding</span><span class='kw'>=</span><span class='st'>"UTF-8"</span>))</div><div class='output co'>#&gt; &lt;head&gt;
#&gt; &lt;style&gt;
#&gt; body { font-family: sans-serif; }
#&gt; &lt;/style&gt;
#&gt; &lt;/head&gt;
#&gt; &lt;body&gt;
#&gt; &lt;b&gt;This is &lt;b&gt;some &lt;i&gt;really &lt;/i&gt; poorly formatted HTML&lt;/b&gt;
#&gt;
#&gt; as is this &lt;span id=&quot;sp&quot;&gt;portion&lt;div&gt;
#&gt; </div><div class='input'>
<span class='co'># see the tidied version</span>
<span class='fu'>cat</span>(<span class='fu'>tidy_html</span>(<span class='fu'>content</span>(<span class='no'>res</span>, <span class='kw'>as</span><span class='kw'>=</span><span class='st'>"text"</span>, <span class='kw'>encoding</span><span class='kw'>=</span><span class='st'>"UTF-8"</span>),
<span class='fu'>list</span>(<span class='kw'>TidyDocType</span><span class='kw'>=</span><span class='st'>"html5"</span>, <span class='kw'>TidyWrapLen</span><span class='kw'>=</span><span class='fl'>200</span>)))</div><div class='output co'>#&gt; &lt;!DOCTYPE html&gt;
#&gt; &lt;html&gt;
#&gt; &lt;head&gt;
#&gt; &lt;meta name=&quot;generator&quot; content=&quot;HTML Tidy for HTML5 for R version 5.0.0&quot;&gt;
#&gt; &lt;style&gt;
#&gt; body { font-family: sans-serif; }
#&gt; &lt;/style&gt;
#&gt; &lt;title&gt;&lt;/title&gt;
#&gt; &lt;/head&gt;
#&gt; &lt;body&gt;
#&gt; &lt;b&gt;This is some &lt;i&gt;really&lt;/i&gt; poorly formatted HTML as is this &lt;span id=&quot;sp&quot;&gt;portion&lt;/span&gt;&lt;/b&gt;
#&gt; &lt;div&gt;&lt;span id=&quot;sp&quot;&gt;&lt;/span&gt;&lt;/div&gt;
#&gt; &lt;/body&gt;
#&gt; &lt;/html&gt;
#&gt; </div><div class='input'>
<span class='co'># but, you could also just do:</span>
<span class='fu'>cat</span>(<span class='fu'>tidy_html</span>(<span class='fu'>url</span>(<span class='st'>"http://rud.is/test/untidy.html"</span>)))</div><div class='output co'>#&gt; &lt;!DOCTYPE html&gt;
#&gt; &lt;html xmlns=&quot;http://www.w3.org/1999/xhtml&quot;&gt;
#&gt; &lt;head&gt;
#&gt; &lt;meta name=&quot;generator&quot; content=
#&gt; &quot;HTML Tidy for HTML5 for R version 5.0.0&quot; /&gt;
#&gt; &lt;style&gt;
#&gt; &lt;![CDATA[
#&gt; body { font-family: sans-serif; }
#&gt; ]]&gt;
#&gt; &lt;/style&gt;
#&gt; &lt;title&gt;&lt;/title&gt;
#&gt; &lt;/head&gt;
#&gt; &lt;body&gt;
#&gt; &lt;b&gt;This is some &lt;i&gt;really&lt;/i&gt; poorly formatted HTMLas is this
#&gt; &lt;span id=&quot;sp&quot;&gt;portion&lt;/span&gt;&lt;/b&gt;
#&gt; &lt;div&gt;&lt;span id=&quot;sp&quot;&gt;&lt;/span&gt;&lt;/div&gt;
#&gt; &lt;/body&gt;
#&gt; &lt;/html&gt;
#&gt; </div></pre>
</div>
<div class="col-md-3">
</div>
</div>
<footer>
<p>Built by <a href="http://hadley.github.io/pkgdown/">pkgdown</a>. Styled with <a href="http://getbootstrap.com">Bootstrap 3</a>.</p>
</footer>
</div>
</body>
</html>