diff --git a/DESCRIPTION b/DESCRIPTION index b8194de..5253471 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: hgr Type: Package Title: Tools to Work with the 'Postlight' 'Mercury' 'API' -Version: 0.1.0 -Date: 2017-04-19 +Version: 0.2.0 +Date: 2017-06-22 Author: Bob Rudis (bob@rud.is) Maintainer: Bob Rudis Description: PTools to Work with the 'Postlight' 'Mercury' 'API' . @@ -17,5 +17,8 @@ Depends: Imports: purrr, httr, - readr + readr, + xml2, + rvest, + xslt RoxygenNote: 6.0.1 diff --git a/NAMESPACE b/NAMESPACE index 0f67cc5..98ca483 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,10 @@ # Generated by roxygen2: do not edit by hand +export(clean_text) export(just_the_facts) import(httr) import(purrr) +import(xslt) importFrom(readr,type_convert) +importFrom(rvest,html_text) +importFrom(xml2,read_html) diff --git a/NEWS.md b/NEWS.md index 9b4679b..ce787e6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,2 +1,9 @@ +0.2.0 +* New `clean_text()` function which is designed to be run on the `$content` + component of the `data.frame` returned by `just_the_facts()`. It can be run + on any `html_document` or atomic character vectors (which it will parse + into an `html_document`) and it will return an atomic character vector of + only plain text (i.e. it will remove all tags). + 0.1.0 * Initial release diff --git a/R/clean.r b/R/clean.r new file mode 100644 index 0000000..bf14951 --- /dev/null +++ b/R/clean.r @@ -0,0 +1,24 @@ +#' Remove all tags from a document +#' +#' This is designed to be run on the `$content` component of the `data.frame` returned +#' by `just_the_facts()`. It can be run on any `html_document` or atomic character vectors +#' (which it will parse into an `html_document`) and it will return an atomic character +#' vector of only plain text (i.e. it will remove all tags). +#' +#' @md +#' @param doc atomic character vector (i.e. 
plain text) or an `html_document` +#' @return atomic character vector of cleaned text +#' @export +clean_text <- function(doc) { + + if (!inherits(doc, "xml_document")) doc <- xml2::read_html(doc) + + cleaner <- xml2::read_xml(system.file("xslt/justthetext.xslt", package="hgr")) + + doc <- xslt::xml_xslt(doc, cleaner) + doc <- rvest::html_text(doc) + doc <- trimws(doc) + + doc + +} \ No newline at end of file diff --git a/R/hgr-package.R b/R/hgr-package.R index 74374af..a8c2134 100644 --- a/R/hgr-package.R +++ b/R/hgr-package.R @@ -9,4 +9,7 @@ #' @author Bob Rudis (bob@@rud.is) #' @import purrr httr #' @importFrom readr type_convert +#' @import xslt +#' @importFrom xml2 read_html +#' @importFrom rvest html_text NULL diff --git a/R/mercury.r b/R/mercury.r index 3bdf5f5..6b43c1d 100644 --- a/R/mercury.r +++ b/R/mercury.r @@ -12,6 +12,7 @@ just_the_facts <- function(url, mercury_api_key=Sys.getenv("MERCURY_API_KEY")) { res <- httr::GET("https://mercury.postlight.com/parser", + httr::content_type_json(), httr::add_headers(`x-api-key`=mercury_api_key), query = list(url = url)) diff --git a/README.Rmd b/README.Rmd index 0081222..ffde921 100644 --- a/README.Rmd +++ b/README.Rmd @@ -1,5 +1,7 @@ --- output: rmarkdown::github_document +editor_options: + chunk_output_type: console --- `hgr` : Tools to Work with the 'Postlight' 'Mercury' 'API' @@ -9,6 +11,7 @@ Mercury takes any web article and returns only the relevant content — headline The following functions are implemented: - `just_the_facts`: Retrieve parsed content of a URL processed by the Postlight Mercury API +- `clean_text`: Remove all HTML/XML tags from an HTML document/atomic character vector ### Installation @@ -30,8 +33,15 @@ packageVersion("hgr") story <- "https://www.nytimes.com/2017/04/18/world/asia/aircraft-carrier-north-korea-carl-vinson.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region®ion=top-news&WT.nav=top-news&_r=0" -dplyr::glimpse(just_the_facts(story)) +doc <- 
just_the_facts(story) +dplyr::glimpse(doc) + +substr(doc$content, 1, 100) + +plain <- clean_text(doc$content) + +substr(plain, 1, 100) ``` ### Test Results diff --git a/README.md b/README.md index f59cb70..0e6bb6f 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ Mercury takes any web article and returns only the relevant content — headline The following functions are implemented: - `just_the_facts`: Retrieve parsed content of a URL processed by the Postlight Mercury API +- `clean_text`: Remove all HTML/XML tags from an HTML document/atomic character vector ### Installation @@ -22,12 +23,14 @@ library(hgr) packageVersion("hgr") ``` - ## [1] '0.1.0' + ## [1] '0.2.0' ``` r story <- "https://www.nytimes.com/2017/04/18/world/asia/aircraft-carrier-north-korea-carl-vinson.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region®ion=top-news&WT.nav=top-news&_r=0" -dplyr::glimpse(just_the_facts(story)) +doc <- just_the_facts(story) + +dplyr::glimpse(doc) ``` ## Observations: 1 @@ -40,11 +43,25 @@ dplyr::glimpse(just_the_facts(story)) ## $ url "https://www.nytimes.com/2017/04/18/world/asia/aircraft-carrier-north-korea-carl-vinson.html" ## $ domain "www.nytimes.com" ## $ excerpt "The saga might never have come to light had the Navy not posted a photograph of the Carl Vi... - ## $ word_count 1505 + ## $ word_count 1499 ## $ direction "ltr" ## $ total_pages 1 ## $ rendered_pages 1 +``` r +substr(doc$content, 1, 100) +``` + + ## [1] "
\n\n \n\n \n \n\n \n\n " + +``` r +plain <- clean_text(doc$content) + +substr(plain, 1, 100) +``` + + ## [1] "WASHINGTON — Just over a week ago, the White House declared that ordering an American aircraft carri" + ### Test Results ``` r @@ -54,7 +71,7 @@ library(testthat) date() ``` - ## [1] "Wed Apr 19 10:26:14 2017" + ## [1] "Thu Jun 22 22:49:32 2017" ``` r test_dir("tests/") diff --git a/inst/xslt/justthetext.xslt b/inst/xslt/justthetext.xslt new file mode 100644 index 0000000..b096ebf --- /dev/null +++ b/inst/xslt/justthetext.xslt @@ -0,0 +1,124 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/man/clean_text.Rd b/man/clean_text.Rd new file mode 100644 index 0000000..8f91a26 --- /dev/null +++ b/man/clean_text.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clean.r +\name{clean_text} +\alias{clean_text} +\title{Remove all tags from a document} +\usage{ +clean_text(doc) +} +\arguments{ +\item{doc}{atomic character vector (i.e. plain text) or an \code{html_document}} +} +\value{ +atomic character vector of cleaned text +} +\description{ +This is designed to be run on the \code{$content} component of the \code{data.frame} returned +by \code{just_the_facts()}. It can be run on any \code{htmnl_document} or atomic character vectors +(which it will parse into an \code{html_document}) and it will return an atomic character +vector of only plain text (i.e. it will remove all tags). +}