mirror of https://git.sr.ht/~hrbrmstr/hgr
boB Rudis
7 years ago
10 changed files with 221 additions and 8 deletions
@ -1,6 +1,10 @@ |
|||||
# Generated by roxygen2: do not edit by hand |
# Generated by roxygen2: do not edit by hand |
||||
|
|
||||
|
export(clean_text) |
||||
export(just_the_facts) |
export(just_the_facts) |
||||
import(httr) |
import(httr) |
||||
import(purrr) |
import(purrr) |
||||
|
import(xslt) |
||||
importFrom(readr,type_convert) |
importFrom(readr,type_convert) |
||||
|
importFrom(rvest,html_text) |
||||
|
importFrom(xml2,read_html) |
||||
|
@ -1,2 +1,9 @@ |
|||||
|
0.2.0 |
||||
|
* New `clean_text()` function which is designed to be run on the `$content` |
||||
|
component of the `data.frame` returned by `just_the_facts()`. It can be run |
||||
|
on any `html_document` or atomic character vectors (which it will parse |
||||
|
into an `html_document`) and it will return an atomic character vector of |
||||
|
only plain text (i.e. it will remove all tags). |
||||
|
|
||||
0.1.0 |
0.1.0 |
||||
* Initial release |
* Initial release |
||||
|
@ -0,0 +1,24 @@ |
|||||
|
#' Remove all tags from a document
#'
#' This is designed to be run on the `$content` component of the `data.frame`
#' returned by `just_the_facts()`. It can be run on any `html_document` (i.e. a
#' document already parsed by `xml2::read_html()`) or on an atomic character
#' vector, which it will parse into an `html_document` first. The document is
#' transformed with the `xslt/justthetext.xslt` stylesheet bundled with the
#' package and the remaining text is returned with leading and trailing
#' whitespace trimmed (i.e. all tags are removed).
#'
#' @md
#' @param doc atomic character vector (i.e. plain text) or an `html_document`
#' @return atomic character vector of cleaned text
#' @export
clean_text <- function(doc) {

  # Documents parsed by xml2 carry the S3 class "xml_document" (they merely
  # print as "{html_document}"), so test for that class to avoid re-parsing an
  # already-parsed document. "html_document" is still accepted so that any
  # caller relying on the previous check keeps working.
  if (!inherits(doc, c("xml_document", "html_document"))) {
    doc <- xml2::read_html(doc)
  }

  # system.file() returns "" when the resource cannot be found; fail with a
  # clear message instead of handing an empty path to the XML parser.
  xslt_path <- system.file("xslt/justthetext.xslt", package = "hgr")
  if (!nzchar(xslt_path)) {
    stop(
      "Could not locate 'xslt/justthetext.xslt' in the 'hgr' package.",
      call. = FALSE
    )
  }
  cleaner <- xml2::read_xml(xslt_path)

  # Apply the stylesheet, then keep only the text of what remains.
  doc <- xslt::xml_xslt(doc, cleaner)
  doc <- rvest::html_text(doc)

  trimws(doc)

}
@ -0,0 +1,124 @@ |
|||||
|
<?xml version="1.0" encoding="utf-8"?> |
||||
|
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"> |
||||
|
|
||||
|
<xsl:output method="xml"/> |
||||
|
|
||||
|
<!-- Identity transform: copies each attribute and node to the output and recurses into its attributes and children. The more specific empty templates elsewhere in this stylesheet take precedence over it and drop whatever they match. -->
<xsl:template match="@*|node()"> |
||||
|
<xsl:copy> |
||||
|
<xsl:apply-templates select="@*|node()"/> |
||||
|
</xsl:copy> |
||||
|
</xsl:template> |
||||
|
|
||||
|
<xsl:template match="head"/> |
||||
|
<xsl:template match="script"/> |
||||
|
<xsl:template match="style"/> |
||||
|
<xsl:template match="img"/> |
||||
|
<xsl:template match="header"/> |
||||
|
<xsl:template match="footer"/> |
||||
|
<xsl:template match="link"/> |
||||
|
<xsl:template match="iframe"/> |
||||
|
<xsl:template match="form"/> |
||||
|
<xsl:template match="figure"/> |
||||
|
<xsl:template match="object"/> |
||||
|
<xsl:template match="input"/> |
||||
|
<xsl:template match="textarea"/> |
||||
|
<xsl:template match="option"/> |
||||
|
<xsl:template match="select"/> |
||||
|
<xsl:template match="code"/> |
||||
|
<xsl:template match="cite"/> |
||||
|
<xsl:template match="a"/> |
||||
|
<xsl:template match="comment()"/> |
||||
|
|
||||
|
<!-- Drop elements hidden through an inline style. The pattern matches the element that carries the style attribute (not just the attribute), so the hidden element's text is removed together with it. Both "display:none" and "display: none" spellings are handled, case-insensitively. -->
<xsl:template match="*[contains(translate(@style, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'display:none')]"/>
<xsl:template match="*[contains(translate(@style, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'display: none')]"/>
||||
|
|
||||
|
<xsl:template match="*[@class='ad']"/> |
||||
|
<xsl:template match="*[@id='ad']"/> |
||||
|
|
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'ad ad')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'topic')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'banner')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'intercept')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'eyebrow')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'about')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'meta')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'combx')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'comment')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'community')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'disqus')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'extra')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'foot')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'header')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'menu')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'remark')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'rss')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'shoutbox')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'sidebar')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'sponsor')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'ad-break')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'agegate')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'pagination')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'pager')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'popup')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'tweet')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'twitter')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'brand')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'related')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'img')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'image')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'thumbnail')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'email')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'friend')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'copyright')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'taboola')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'breadcrumb')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'label')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'fb-')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'panel')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'separator')]"/> |
||||
|
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'forum')]"/> |
||||
|
|
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'ad ad')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'topic')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'banner')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'intercept')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'eyebrow')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'about')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'meta')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'combx')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'comment')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'community')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'disqus')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'extra')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'foot')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'header')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'menu')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'remark')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'rss')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'shoutbox')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'sidebar')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'ad-break')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'sponsor')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'agegate')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'pagination')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'pager')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'popup')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'tweet')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'twitter')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'brand')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'related')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'img')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'image')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'thumbnail')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'email')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'friend')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'copyright')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'taboola')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'breadcrumb')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'label')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'fb-')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'panel')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'separator')]"/> |
||||
|
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'forum')]"/> |
||||
|
|
||||
|
</xsl:stylesheet> |
@ -0,0 +1,20 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/clean.r |
||||
|
\name{clean_text} |
||||
|
\alias{clean_text} |
||||
|
\title{Remove all tags from a document} |
||||
|
\usage{ |
||||
|
clean_text(doc) |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{doc}{atomic character vector (i.e. plain text) or an \code{html_document}} |
||||
|
} |
||||
|
\value{ |
||||
|
atomic character vector of cleaned text |
||||
|
} |
||||
|
\description{ |
||||
|
This is designed to be run on the \code{$content} component of the \code{data.frame} returned |
||||
|
by \code{just_the_facts()}. It can be run on any \code{html_document} or atomic character vectors |
||||
|
(which it will parse into an \code{html_document}) and it will return an atomic character |
||||
|
vector of only plain text (i.e. it will remove all tags). |
||||
|
} |
Loading…
Reference in new issue