mirror of https://git.sr.ht/~hrbrmstr/hgr
boB Rudis
7 years ago
10 changed files with 221 additions and 8 deletions
@ -1,6 +1,10 @@ |
|||
# Generated by roxygen2: do not edit by hand |
|||
|
|||
export(clean_text) |
|||
export(just_the_facts) |
|||
import(httr) |
|||
import(purrr) |
|||
import(xslt) |
|||
importFrom(readr,type_convert) |
|||
importFrom(rvest,html_text) |
|||
importFrom(xml2,read_html) |
|||
|
@ -1,2 +1,9 @@ |
|||
0.2.0 |
|||
* New `clean_text()` function which is designed to be run on the `$content` |
|||
component of the `data.frame` returned by `just_the_facts()`. It can be run |
|||
on any `html_document` or atomic character vectors (which it will parse |
|||
into an `html_document`) and it will return an atomic character vector of |
|||
only plain text (i.e. it will remove all tags). |
|||
|
|||
0.1.0 |
|||
* Initial release |
|||
|
@ -0,0 +1,24 @@ |
|||
#' Remove all tags from a document
#'
#' This is designed to be run on the `$content` component of the `data.frame`
#' returned by `just_the_facts()`. It can be run on any parsed `xml2` document
#' (what `xml2::read_html()` returns and prints as `{html_document}`) or on an
#' atomic character vector of HTML markup (each element is parsed with
#' `xml2::read_html()`). The bundled `justthetext.xslt` stylesheet is applied
#' and only the remaining plain text is returned (i.e. all tags are removed).
#'
#' @md
#' @param doc atomic character vector of HTML markup or a parsed `xml2`
#'   document (class `xml_document`)
#' @return atomic character vector of cleaned, whitespace-trimmed text; one
#'   element per element of a character `doc`, a single string for a parsed
#'   document, and `character(0)` for zero-length character input
#' @export
clean_text <- function(doc) {

  # Nothing to parse: hand back an empty result instead of failing in xml2.
  if (is.character(doc) && length(doc) == 0L) return(character(0))

  cleaner <- xml2::read_xml(
    system.file("xslt/justthetext.xslt", package = "hgr")
  )

  strip_tags <- function(x) {
    # xml2 documents carry the class "xml_document" (they only *print* as
    # {html_document}); "html_document" is still accepted so any caller that
    # relied on the old check keeps working.
    if (!inherits(x, c("xml_document", "html_document"))) {
      x <- xml2::read_html(x)
    }
    trimws(rvest::html_text(xslt::xml_xslt(x, cleaner)))
  }

  # read_html() parses a single string, so longer vectors are cleaned
  # element by element.
  if (is.character(doc) && length(doc) > 1L) {
    return(vapply(doc, strip_tags, character(1), USE.NAMES = FALSE))
  }

  strip_tags(doc)

}
@ -0,0 +1,124 @@ |
|||
<?xml version="1.0" encoding="utf-8"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">

  <!--
    Boilerplate stripper: copies the input document through unchanged except
    for the nodes matched by the empty templates below, which are dropped
    together with everything inside them.
  -->

  <xsl:output method="xml"/>

  <!-- Identity transform: anything not matched by a rule below is copied. -->
  <xsl:template match="node() | @*">
    <xsl:copy>
      <xsl:apply-templates select="node() | @*"/>
    </xsl:copy>
  </xsl:template>

  <!-- Elements that are dropped purely by name. -->
  <xsl:template match="head | script | style | img | header | footer | link | iframe | form | figure | object | input | textarea | option | select | code | cite | a"/>

  <!--
    Comments are dropped too. This rule has the same default priority as the
    identity template, so it has to stay after it in the stylesheet.
  -->
  <xsl:template match="comment()"/>

  <!--
    Drops a style attribute that hides its element (case-insensitive match on
    "display:none" / "display: none").
    NOTE(review): only the attribute is removed, the element and its text are
    still copied. Confirm whether hidden elements were meant to be removed.
  -->
  <xsl:template match="@style[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'display:none') or contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'display: none')]"/>

  <!-- Elements whose class or id is exactly "ad". -->
  <xsl:template match="*[@class='ad' or @id='ad']"/>

  <!--
    Elements whose class or id attribute contains one of the keywords below
    (compared after lower-casing A-Z). One template per keyword.
  -->
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'ad ad')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'topic')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'banner')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'intercept')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'eyebrow')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'about')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'meta')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'combx')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'comment')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'community')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'disqus')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'extra')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'foot')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'header')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'menu')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'remark')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'rss')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'shoutbox')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'sidebar')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'sponsor')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'ad-break')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'agegate')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'pagination')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'pager')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'popup')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'tweet')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'twitter')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'brand')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'related')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'img')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'image')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'thumbnail')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'email')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'friend')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'copyright')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'taboola')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'breadcrumb')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'label')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'fb-')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'panel')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'separator')]]"/>
  <xsl:template match="*[(@class | @id)[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'forum')]]"/>

</xsl:stylesheet>
@ -0,0 +1,20 @@ |
|||
% Generated by roxygen2: do not edit by hand |
|||
% Please edit documentation in R/clean.r |
|||
\name{clean_text} |
|||
\alias{clean_text} |
|||
\title{Remove all tags from a document} |
|||
\usage{ |
|||
clean_text(doc) |
|||
} |
|||
\arguments{ |
|||
\item{doc}{atomic character vector (i.e. plain text) or an \code{html_document}} |
|||
} |
|||
\value{ |
|||
atomic character vector of cleaned text |
|||
} |
|||
\description{ |
|||
This is designed to be run on the \code{$content} component of the \code{data.frame} returned |
|||
by \code{just_the_facts()}. It can be run on any \code{html_document} or atomic character vectors |
|||
(which it will parse into an \code{html_document}) and it will return an atomic character |
|||
vector of only plain text (i.e. it will remove all tags). |
|||
} |
Loading…
Reference in new issue