Browse Source

`clean_text()`

master
boB Rudis 7 years ago
parent
commit
7eb5b3efa8
No known key found for this signature in database GPG Key ID: 2A514A4997464560
  1. 9
      DESCRIPTION
  2. 4
      NAMESPACE
  3. 7
      NEWS.md
  4. 24
      R/clean.r
  5. 3
      R/hgr-package.R
  6. 1
      R/mercury.r
  7. 12
      README.Rmd
  8. 25
      README.md
  9. 124
      inst/xslt/justthetext.xslt
  10. 20
      man/clean_text.Rd

9
DESCRIPTION

@ -1,8 +1,8 @@
Package: hgr
Type: Package
Title: Tools to Work with the 'Postlight' 'Mercury' 'API'
Version: 0.1.0
Date: 2017-04-19
Version: 0.2.0
Date: 2017-06-22
Author: Bob Rudis (bob@rud.is)
Maintainer: Bob Rudis <bob@rud.is>
Description: PTools to Work with the 'Postlight' 'Mercury' 'API' <https://mercury.postlight.com>.
@ -17,5 +17,8 @@ Depends:
Imports:
purrr,
httr,
readr
readr,
xml2,
rvest,
xslt
RoxygenNote: 6.0.1

4
NAMESPACE

@ -1,6 +1,10 @@
# Generated by roxygen2: do not edit by hand
export(clean_text)
export(just_the_facts)
import(httr)
import(purrr)
import(xslt)
importFrom(readr,type_convert)
importFrom(rvest,html_text)
importFrom(xml2,read_html)

7
NEWS.md

@ -1,2 +1,9 @@
0.2.0
* New `clean_text()` function which is designed to be run on the `$content`
component of the `data.frame` returned by `just_the_facts()`. It can be run
on any `htmnl_document` or atomic character vectors (which it will parse
into an `html_document`) and it will return an atomic character vector of
only plain text (i.e. it will remove all tags).
0.1.0
* Initial release

24
R/clean.r

@ -0,0 +1,24 @@
#' Remove all tags from a document
#'
#' This is designed to be run on the `$content` component of the `data.frame` returned
#' by `just_the_facts()`. It can be run on any `htmnl_document` or atomic character vectors
#' (which it will parse into an `html_document`) and it will return an atomic character
#' vector of only plain text (i.e. it will remove all tags).
#'
#' @md
#' @param doc atomic character vector (i.e. plain text) or an `html_document`
#' @return atomic character vector of cleaned text
#' @export
clean_text <- function(doc) {
if (!inherits(doc, "html_document")) doc <- xml2::read_html(doc)
cleaner <- xml2::read_xml(system.file("xslt/justthetext.xslt", package="hgr"))
doc <- xslt::xml_xslt(doc, cleaner)
doc <- rvest::html_text(doc)
doc <- trimws(doc)
doc
}

3
R/hgr-package.R

@ -9,4 +9,7 @@
#' @author Bob Rudis (bob@@rud.is)
#' @import purrr httr
#' @importFrom readr type_convert
#' @import xslt
#' @importFrom xml2 read_html
#' @importFrom rvest html_text
NULL

1
R/mercury.r

@ -12,6 +12,7 @@
just_the_facts <- function(url, mercury_api_key=Sys.getenv("MERCURY_API_KEY")) {
res <- httr::GET("https://mercury.postlight.com/parser",
httr::content_type_json(),
httr::add_headers(`x-api-key`=mercury_api_key),
query = list(url = url))

12
README.Rmd

@ -1,5 +1,7 @@
---
output: rmarkdown::github_document
editor_options:
chunk_output_type: console
---
`hgr` : Tools to Work with the 'Postlight' 'Mercury' 'API'
@ -9,6 +11,7 @@ Mercury takes any web article and returns only the relevant content — headline
The following functions are implemented:
- `just_the_facts`: Retrieve parsed content of a URL processed by the Postlight Mercury API
- `clean_text`: Remove all HTML/XML tags from an HTML document/atomic character vector
### Installation
@ -30,8 +33,15 @@ packageVersion("hgr")
story <- "https://www.nytimes.com/2017/04/18/world/asia/aircraft-carrier-north-korea-carl-vinson.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news&_r=0"
dplyr::glimpse(just_the_facts(story))
doc <- just_the_facts(story)
dplyr::glimpse(doc)
substr(doc$content, 1, 100)
plain <- clean_text(doc$content)
substr(plain, 1, 100)
```
### Test Results

25
README.md

@ -6,6 +6,7 @@ Mercury takes any web article and returns only the relevant content — headline
The following functions are implemented:
- `just_the_facts`: Retrieve parsed content of a URL processed by the Postlight Mercury API
- `clean_text`: Remove all HTML/XML tags from an HTML document/atomic character vector
### Installation
@ -22,12 +23,14 @@ library(hgr)
packageVersion("hgr")
```
## [1] '0.1.0'
## [1] '0.2.0'
``` r
story <- "https://www.nytimes.com/2017/04/18/world/asia/aircraft-carrier-north-korea-carl-vinson.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news&_r=0"
dplyr::glimpse(just_the_facts(story))
doc <- just_the_facts(story)
dplyr::glimpse(doc)
```
## Observations: 1
@ -40,11 +43,25 @@ dplyr::glimpse(just_the_facts(story))
## $ url <chr> "https://www.nytimes.com/2017/04/18/world/asia/aircraft-carrier-north-korea-carl-vinson.html"
## $ domain <chr> "www.nytimes.com"
## $ excerpt <chr> "The saga might never have come to light had the Navy not posted a photograph of the Carl Vi...
## $ word_count <int> 1505
## $ word_count <int> 1499
## $ direction <chr> "ltr"
## $ total_pages <int> 1
## $ rendered_pages <int> 1
``` r
substr(doc$content, 1, 100)
```
## [1] "<div><article id=\"story\" class=\"story theme-main \">\n\n \n\n \n \n\n \n\n "
``` r
plain <- clean_text(doc$content)
substr(plain, 1, 100)
```
## [1] "WASHINGTON — Just over a week ago, the White House declared that ordering an American aircraft carri"
### Test Results
``` r
@ -54,7 +71,7 @@ library(testthat)
date()
```
## [1] "Wed Apr 19 10:26:14 2017"
## [1] "Thu Jun 22 22:49:32 2017"
``` r
test_dir("tests/")

124
inst/xslt/justthetext.xslt

@ -0,0 +1,124 @@
<?xml version="1.0" encoding="utf-8"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="xml"/>
<xsl:template match="@*|node()">
<xsl:copy>
<xsl:apply-templates select="@*|node()"/>
</xsl:copy>
</xsl:template>
<xsl:template match="head"/>
<xsl:template match="script"/>
<xsl:template match="style"/>
<xsl:template match="img"/>
<xsl:template match="header"/>
<xsl:template match="footer"/>
<xsl:template match="link"/>
<xsl:template match="iframe"/>
<xsl:template match="form"/>
<xsl:template match="figure"/>
<xsl:template match="object"/>
<xsl:template match="input"/>
<xsl:template match="textarea"/>
<xsl:template match="option"/>
<xsl:template match="select"/>
<xsl:template match="code"/>
<xsl:template match="cite"/>
<xsl:template match="a"/>
<xsl:template match="comment()"/>
<xsl:template match="@style[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'display:none')]"/>
<xsl:template match="@style[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'display: none')]"/>
<xsl:template match="*[@class='ad']"/>
<xsl:template match="*[@id='ad']"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'ad ad')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'topic')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'banner')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'intercept')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'eyebrow')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'about')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'meta')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'combx')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'comment')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'community')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'disqus')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'extra')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'foot')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'header')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'menu')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'remark')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'rss')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'shoutbox')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'sidebar')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'sponsor')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'ad-break')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'agegate')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'pagination')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'pager')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'popup')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'tweet')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'twitter')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'brand')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'related')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'img')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'image')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'thumbnail')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'email')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'friend')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'copyright')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'taboola')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'breadcrumb')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'label')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'fb-')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'panel')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'separator')]"/>
<xsl:template match="*[contains(translate(@class, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'forum')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'ad ad')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'topic')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'banner')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'intercept')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'eyebrow')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'about')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'meta')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'combx')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'comment')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'community')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'disqus')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'extra')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'foot')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'header')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'menu')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'remark')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'rss')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'shoutbox')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'sidebar')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'ad-break')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'sponsor')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'agegate')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'pagination')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'pager')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'popup')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'tweet')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'twitter')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'brand')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'related')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'img')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'image')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'thumbnail')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'email')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'friend')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'copyright')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'taboola')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'breadcrumb')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'label')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'fb-')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'panel')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'separator')]"/>
<xsl:template match="*[contains(translate(@id, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'forum')]"/>
</xsl:stylesheet>

20
man/clean_text.Rd

@ -0,0 +1,20 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/clean.r
\name{clean_text}
\alias{clean_text}
\title{Remove all tags from a document}
\usage{
clean_text(doc)
}
\arguments{
\item{doc}{atomic character vector (i.e. plain text) or an \code{html_document}}
}
\value{
atomic character vector of cleaned text
}
\description{
This is designed to be run on the \code{$content} component of the \code{data.frame} returned
by \code{just_the_facts()}. It can be run on any \code{htmnl_document} or atomic character vectors
(which it will parse into an \code{html_document}) and it will return an atomic character
vector of only plain text (i.e. it will remove all tags).
}
Loading…
Cancel
Save