diff --git a/DESCRIPTION b/DESCRIPTION index 5253471..c14af32 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,11 +1,13 @@ Package: hgr Type: Package Title: Tools to Work with the 'Postlight' 'Mercury' 'API' -Version: 0.2.0 +Version: 0.3.0 Date: 2017-06-22 Author: Bob Rudis (bob@rud.is) Maintainer: Bob Rudis -Description: PTools to Work with the 'Postlight' 'Mercury' 'API' . +Description: The 'Postlight' 'Mercury' 'API' takes any web + article and returns only the relevant content - headline, author, body text, relevant + images and more - free from any clutter. URL: https://github.com/hrbrmstr/hgr BugReports: https://github.com/hrbrmstr/hgr/issues License: AGPL @@ -20,5 +22,8 @@ Imports: readr, xml2, rvest, - xslt + xslt, + clipr, + htmltools, + jsonlite RoxygenNote: 6.0.1 diff --git a/NAMESPACE b/NAMESPACE index 98ca483..672da65 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,10 +1,18 @@ # Generated by roxygen2: do not edit by hand +S3method(print,hgr) export(clean_text) export(just_the_facts) +import(clipr) +import(htmltools) import(httr) import(purrr) import(xslt) +importFrom(jsonlite,fromJSON) +importFrom(jsonlite,toJSON) +importFrom(readr,col_character) +importFrom(readr,col_datetime) +importFrom(readr,cols) importFrom(readr,type_convert) importFrom(rvest,html_text) importFrom(xml2,read_html) diff --git a/NEWS.md b/NEWS.md index ce787e6..2207225 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,6 @@ +0.3.0 +* Enhanced "printing" of `hgr` objects (uses `htmltools::html_print()`) + 0.2.0 * New `clean_text()` function which is designed to be run on the `$content` component of the `data.frame` returned by `just_the_facts()`. 
It can be run diff --git a/R/aaa.r b/R/aaa.r new file mode 100644 index 0000000..d5315ed --- /dev/null +++ b/R/aaa.r @@ -0,0 +1,13 @@ +.hgr_cols <- readr::cols( + title = readr::col_character(), + content = readr::col_character(), + author = readr::col_character(), + date_published = readr::col_datetime(format = ""), + lead_image_url = readr::col_character(), + url = readr::col_character(), + domain = readr::col_character(), + excerpt = readr::col_character(), + direction = readr::col_character() +) + +.hgr_ua <- "hgr R package / github.com/hrbrmstr/hgr" \ No newline at end of file diff --git a/R/hgr-package.R b/R/hgr-package.R index a8c2134..397a149 100644 --- a/R/hgr-package.R +++ b/R/hgr-package.R @@ -1,14 +1,16 @@ #' Tools to Work with the 'Postlight' 'Mercury' 'API' #' -#' [Mercury](https://mercury.postlight.com) takes any web article and returns only the relevant content — headline, author, -#' body text, relevant images and more — free from any clutter. +#' [Mercury](https://mercury.postlight.com) takes any web article and returns only the +#' relevant content --- headline, author, body text, relevant images and more --- free +#' from any clutter. #' #' @md #' @name hgr #' @docType package #' @author Bob Rudis (bob@@rud.is) -#' @import purrr httr -#' @importFrom readr type_convert +#' @import purrr httr clipr htmltools +#' @importFrom jsonlite fromJSON toJSON +#' @importFrom readr type_convert cols col_character col_datetime #' @import xslt #' @importFrom xml2 read_html #' @importFrom rvest html_text diff --git a/R/mercury.r b/R/mercury.r index 6b43c1d..d5beb91 100644 --- a/R/mercury.r +++ b/R/mercury.r @@ -1,7 +1,8 @@ #' Retrieve parsed content of a URL processed by the Postlight Mercury API #' -#' Mercury takes any web article and returns only the relevant content — headline, author, -#' body text, relevant images and more — free from any clutter. 
+#' [Mercury](https://mercury.postlight.com) takes any web article and returns only the +#' relevant content --- headline, author, body text, relevant images and more --- free +#' from any clutter. #' #' @md #' @param url URL to retrieve @@ -13,6 +14,7 @@ just_the_facts <- function(url, mercury_api_key=Sys.getenv("MERCURY_API_KEY")) { res <- httr::GET("https://mercury.postlight.com/parser", httr::content_type_json(), + httr::user_agent(.hgr_ua), httr::add_headers(`x-api-key`=mercury_api_key), query = list(url = url)) @@ -21,8 +23,20 @@ just_the_facts <- function(url, mercury_api_key=Sys.getenv("MERCURY_API_KEY")) { res <- httr::content(res, as="text", encoding="UTF-8") res <- jsonlite::fromJSON(res, flatten=TRUE) res <- purrr::flatten_df(res) - res <- readr::type_convert(res) + res <- readr::type_convert(res, col_types=.hgr_cols) + + class(res) <- c("hgr") res +} + +#' @md +#' @rdname just_the_facts +#' @param x `hgr` object +#' @param ... unused +#' @export +print.hgr <- function(x, ...) { + tmp <- htmltools::HTML(x$content) + htmltools::html_print(tmp) } \ No newline at end of file diff --git a/README.md b/README.md index 0e6bb6f..27e2bb1 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ library(hgr) packageVersion("hgr") ``` - ## [1] '0.2.0' + ## [1] '0.3.0' ``` r story <- "https://www.nytimes.com/2017/04/18/world/asia/aircraft-carrier-north-korea-carl-vinson.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news&_r=0" @@ -33,20 +33,21 @@ doc <- just_the_facts(story) dplyr::glimpse(doc) ``` - ## Observations: 1 - ## Variables: 12 - ## $ title "Aircraft Carrier Wasn’t Sailing to Deter North Korea, as U.S. Suggested" - ## $ content "
\n\n \n\n ... - ## $ author "Mark Landler and Eric Schmitt" - ## $ date_published 2017-04-18 17:57:41 - ## $ lead_image_url "https://static01.nyt.com/images/2017/04/19/world/19carrier-sub/19carrier-sub-facebookJumbo.... - ## $ url "https://www.nytimes.com/2017/04/18/world/asia/aircraft-carrier-north-korea-carl-vinson.html" - ## $ domain "www.nytimes.com" - ## $ excerpt "The saga might never have come to light had the Navy not posted a photograph of the Carl Vi... - ## $ word_count 1499 - ## $ direction "ltr" - ## $ total_pages 1 - ## $ rendered_pages 1 + ## List of 12 + ## $ title : chr "Aircraft Carrier Wasn’t Sailing to Deter North Korea, as U.S. Suggested" + ## $ content : chr "
\n\n \n\n \n \n\n \n\n"| __truncated__ + ## $ author : chr "Mark Landler and Eric Schmitt" + ## $ date_published: POSIXct[1:1], format: "2017-04-18 17:57:41" + ## $ lead_image_url: chr "https://static01.nyt.com/images/2017/04/19/world/19carrier-sub/19carrier-sub-facebookJumbo.jpg" + ## $ url : chr "https://www.nytimes.com/2017/04/18/world/asia/aircraft-carrier-north-korea-carl-vinson.html" + ## $ domain : chr "www.nytimes.com" + ## $ excerpt : chr "The saga might never have come to light had the Navy not posted a photograph of the Carl Vinson sailing through"| __truncated__ + ## $ word_count : int 1499 + ## $ direction : chr "ltr" + ## $ total_pages : int 1 + ## $ rendered_pages: int 1 + ## - attr(*, "row.names")= int 1 + ## - attr(*, "class")= chr "hgr" ``` r substr(doc$content, 1, 100) @@ -71,7 +72,7 @@ library(testthat) date() ``` - ## [1] "Thu Jun 22 22:49:32 2017" + ## [1] "Wed Sep 13 14:41:50 2017" ``` r test_dir("tests/") diff --git a/man/hgr.Rd b/man/hgr.Rd index 59b101a..989eb65 100644 --- a/man/hgr.Rd +++ b/man/hgr.Rd @@ -6,8 +6,9 @@ \alias{hgr-package} \title{Tools to Work with the 'Postlight' 'Mercury' 'API'} \description{ -\href{https://mercury.postlight.com}{Mercury} takes any web article and returns only the relevant content — headline, author, -body text, relevant images and more — free from any clutter. +\href{https://mercury.postlight.com}{Mercury} takes any web article and returns only the +relevant content --- headline, author, body text, relevant images and more --- free +from any clutter. 
} \author{ Bob Rudis (bob@rud.is) diff --git a/man/just_the_facts.Rd b/man/just_the_facts.Rd index 382199d..263cb97 100644 --- a/man/just_the_facts.Rd +++ b/man/just_the_facts.Rd @@ -2,20 +2,28 @@ % Please edit documentation in R/mercury.r \name{just_the_facts} \alias{just_the_facts} +\alias{print.hgr} \title{Retrieve parsed content of a URL processed by the Postlight Mercury API} \usage{ just_the_facts(url, mercury_api_key = Sys.getenv("MERCURY_API_KEY")) + +\method{print}{hgr}(x, ...) } \arguments{ \item{url}{URL to retrieve} \item{mercury_api_key}{your Mercury API key. The function looks for it in \code{MERCURY_API_KEY} but you can specify it manually as well. Get your key \href{https://mercury.postlight.com}{here}.} + +\item{x}{\code{hgr} object} + +\item{...}{unused} } \value{ \code{data.frame} } \description{ -Mercury takes any web article and returns only the relevant content — headline, author, -body text, relevant images and more — free from any clutter. +\href{https://mercury.postlight.com}{Mercury} takes any web article and returns only the +relevant content --- headline, author, body text, relevant images and more --- free +from any clutter. }