Browse Source

better printing

master
boB Rudis 7 years ago
parent
commit
cb563e5e48
No known key found for this signature in database GPG Key ID: 2A514A4997464560
  1. 11
      DESCRIPTION
  2. 8
      NAMESPACE
  3. 3
      NEWS.md
  4. 13
      R/aaa.r
  5. 10
      R/hgr-package.R
  6. 20
      R/mercury.r
  7. 33
      README.md
  8. 5
      man/hgr.Rd
  9. 12
      man/just_the_facts.Rd

11
DESCRIPTION

@ -1,11 +1,13 @@
Package: hgr
Type: Package
Title: Tools to Work with the 'Postlight' 'Mercury' 'API'
Version: 0.2.0
Version: 0.3.0
Date: 2017-06-22
Author: Bob Rudis (bob@rud.is)
Maintainer: Bob Rudis <bob@rud.is>
Description: PTools to Work with the 'Postlight' 'Mercury' 'API' <https://mercury.postlight.com>.
Description: The 'Postlight' 'Mercury' 'API' <https://mercury.postlight.com> takes any web
article and returns only the relevant content - headline, author, body text, relevant
images and more - free from any clutter.
URL: https://github.com/hrbrmstr/hgr
BugReports: https://github.com/hrbrmstr/hgr/issues
License: AGPL
@ -20,5 +22,8 @@ Imports:
readr,
xml2,
rvest,
xslt
xslt,
clipr,
htmltools,
jsonlite
RoxygenNote: 6.0.1

8
NAMESPACE

@ -1,10 +1,18 @@
# Generated by roxygen2: do not edit by hand
S3method(print,hgr)
export(clean_text)
export(just_the_facts)
import(clipr)
import(htmltools)
import(httr)
import(purrr)
import(xslt)
importFrom(jsonlite,fromJSON)
importFrom(jsonlite,toJSON)
importFrom(readr,col_character)
importFrom(readr,col_datetime)
importFrom(readr,cols)
importFrom(readr,type_convert)
importFrom(rvest,html_text)
importFrom(xml2,read_html)

3
NEWS.md

@ -1,3 +1,6 @@
0.3.0
* Enhanced "printing" of `hgr` objects (uses `htmltools::html_print()`)
0.2.0
* New `clean_text()` function which is designed to be run on the `$content`
component of the `data.frame` returned by `just_the_facts()`. It can be run

13
R/aaa.r

@ -0,0 +1,13 @@
# Column specification for the fields of a parsed article. It is meant to be
# supplied as the `col_types` argument of `readr::type_convert()`; fields that
# are not named here receive no explicit type from this specification.
.hgr_cols <- local({

  # every field except the publication timestamp is plain text
  chr <- readr::col_character

  readr::cols(
    title          = chr(),
    content        = chr(),
    author         = chr(),
    # date-time column; the empty format string is kept exactly as specified
    date_published = readr::col_datetime(format = ""),
    lead_image_url = chr(),
    url            = chr(),
    domain         = chr(),
    excerpt        = chr(),
    direction      = chr()
  )

})

# User-agent string the package identifies itself with on HTTP requests.
.hgr_ua <- "hgr R package / github.com/hrbrmstr/hgr"

10
R/hgr-package.R

@ -1,14 +1,16 @@
#' Tools to Work with the 'Postlight' 'Mercury' 'API'
#'
#' [Mercury](https://mercury.postlight.com) takes any web article and returns only the relevant content — headline, author,
#' body text, relevant images and more — free from any clutter.
#' [Mercury](https://mercury.postlight.com) takes any web article and returns only the
#' relevant content --- headline, author, body text, relevant images and more --- free
#' from any clutter.
#'
#' @md
#' @name hgr
#' @docType package
#' @author Bob Rudis (bob@@rud.is)
#' @import purrr httr
#' @importFrom readr type_convert
#' @import purrr httr clipr htmltools
#' @importFrom jsonlite fromJSON toJSON
#' @importFrom readr type_convert cols col_character col_datetime
#' @import xslt
#' @importFrom xml2 read_html
#' @importFrom rvest html_text

20
R/mercury.r

@ -1,7 +1,8 @@
#' Retrieve parsed content of a URL processed by the Postlight Mercury API
#'
#' Mercury takes any web article and returns only the relevant content — headline, author,
#' body text, relevant images and more — free from any clutter.
#' [Mercury](https://mercury.postlight.com) takes any web article and returns only the
#' relevant content --- headline, author, body text, relevant images and more --- free
#' from any clutter.
#'
#' @md
#' @param url URL to retrieve
@ -13,6 +14,7 @@ just_the_facts <- function(url, mercury_api_key=Sys.getenv("MERCURY_API_KEY")) {
res <- httr::GET("https://mercury.postlight.com/parser",
httr::content_type_json(),
httr::user_agent(.hgr_ua),
httr::add_headers(`x-api-key`=mercury_api_key),
query = list(url = url))
@ -21,8 +23,20 @@ just_the_facts <- function(url, mercury_api_key=Sys.getenv("MERCURY_API_KEY")) {
res <- httr::content(res, as="text", encoding="UTF-8")
res <- jsonlite::fromJSON(res, flatten=TRUE)
res <- purrr::flatten_df(res)
res <- readr::type_convert(res)
res <- readr::type_convert(res, col_types=.hgr_cols)
class(res) <- c("hgr")
res
}
#' @md
#' @rdname just_the_facts
#' @param x `hgr` object
#' @param ... unused
#' @export
print.hgr <- function(x, ...) {

  # Exact-name extraction: `$` partial-matches element names on lists, which
  # could silently pick up a differently named field.
  # Wrapping in HTML() marks the article markup as literal HTML so it is
  # rendered instead of being escaped.
  article <- htmltools::HTML(x[["content"]])

  # Side effect: renders the article markup via htmltools.
  htmltools::html_print(article)

  # Print methods are expected to return their argument invisibly so the
  # object can still be assigned or piped after printing.
  invisible(x)

}

33
README.md

@ -23,7 +23,7 @@ library(hgr)
packageVersion("hgr")
```
## [1] '0.2.0'
## [1] '0.3.0'
``` r
story <- "https://www.nytimes.com/2017/04/18/world/asia/aircraft-carrier-north-korea-carl-vinson.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news&_r=0"
@ -33,20 +33,21 @@ doc <- just_the_facts(story)
dplyr::glimpse(doc)
```
## Observations: 1
## Variables: 12
## $ title <chr> "Aircraft Carrier Wasn’t Sailing to Deter North Korea, as U.S. Suggested"
## $ content <chr> "<div><article id=\"story\" class=\"story theme-main \">\n\n \n\n ...
## $ author <chr> "Mark Landler and Eric Schmitt"
## $ date_published <dttm> 2017-04-18 17:57:41
## $ lead_image_url <chr> "https://static01.nyt.com/images/2017/04/19/world/19carrier-sub/19carrier-sub-facebookJumbo....
## $ url <chr> "https://www.nytimes.com/2017/04/18/world/asia/aircraft-carrier-north-korea-carl-vinson.html"
## $ domain <chr> "www.nytimes.com"
## $ excerpt <chr> "The saga might never have come to light had the Navy not posted a photograph of the Carl Vi...
## $ word_count <int> 1499
## $ direction <chr> "ltr"
## $ total_pages <int> 1
## $ rendered_pages <int> 1
## List of 12
## $ title : chr "Aircraft Carrier Wasn’t Sailing to Deter North Korea, as U.S. Suggested"
## $ content : chr "<div><article id=\"story\" class=\"story theme-main \">\n\n \n\n \n \n\n \n\n"| __truncated__
## $ author : chr "Mark Landler and Eric Schmitt"
## $ date_published: POSIXct[1:1], format: "2017-04-18 17:57:41"
## $ lead_image_url: chr "https://static01.nyt.com/images/2017/04/19/world/19carrier-sub/19carrier-sub-facebookJumbo.jpg"
## $ url : chr "https://www.nytimes.com/2017/04/18/world/asia/aircraft-carrier-north-korea-carl-vinson.html"
## $ domain : chr "www.nytimes.com"
## $ excerpt : chr "The saga might never have come to light had the Navy not posted a photograph of the Carl Vinson sailing through"| __truncated__
## $ word_count : int 1499
## $ direction : chr "ltr"
## $ total_pages : int 1
## $ rendered_pages: int 1
## - attr(*, "row.names")= int 1
## - attr(*, "class")= chr "hgr"
``` r
substr(doc$content, 1, 100)
@ -71,7 +72,7 @@ library(testthat)
date()
```
## [1] "Thu Jun 22 22:49:32 2017"
## [1] "Wed Sep 13 14:41:50 2017"
``` r
test_dir("tests/")

5
man/hgr.Rd

@ -6,8 +6,9 @@
\alias{hgr-package}
\title{Tools to Work with the 'Postlight' 'Mercury' 'API'}
\description{
\href{https://mercury.postlight.com}{Mercury} takes any web article and returns only the relevant content — headline, author,
body text, relevant images and more — free from any clutter.
\href{https://mercury.postlight.com}{Mercury} takes any web article and returns only the
relevant content --- headline, author, body text, relevant images and more --- free
from any clutter.
}
\author{
Bob Rudis (bob@rud.is)

12
man/just_the_facts.Rd

@ -2,20 +2,28 @@
% Please edit documentation in R/mercury.r
\name{just_the_facts}
\alias{just_the_facts}
\alias{print.hgr}
\title{Retrieve parsed content of a URL processed by the Postlight Mercury API}
\usage{
just_the_facts(url, mercury_api_key = Sys.getenv("MERCURY_API_KEY"))
\method{print}{hgr}(x, ...)
}
\arguments{
\item{url}{URL to retrieve}
\item{mercury_api_key}{your Mercury API key. The function looks for it in \code{MERCURY_API_KEY}
but you can specify it manually as well. Get your key \href{https://mercury.postlight.com}{here}.}
\item{x}{\code{hgr} object}
\item{...}{unused}
}
\value{
an object of class \code{hgr} (a list holding the parsed article fields)
}
\description{
Mercury takes any web article and returns only the relevant content — headline, author,
body text, relevant images and more — free from any clutter.
\href{https://mercury.postlight.com}{Mercury} takes any web article and returns only the
relevant content --- headline, author, body text, relevant images and more --- free
from any clutter.
}

Loading…
Cancel
Save