boB Rudis 5 years ago
parent
commit
46b714796b
No known key found for this signature in database GPG Key ID: 1D7529BE14E2BBA9
  1. 2
      DESCRIPTION
  2. 6
      NAMESPACE
  3. 7
      R/aaa.r
  4. 46
      R/api-key.r
  5. 91
      R/media-list.R
  6. 6
      R/mediacloud-package.R
  7. 83
      R/stats.R
  8. 13
      R/utils.R
  9. 69
      man/mc_media_list.Rd
  10. 18
      man/mc_stats.Rd
  11. 29
      man/mediacloud_api_key.Rd

2
DESCRIPTION

@ -13,6 +13,6 @@ Encoding: UTF-8
License: AGPL
Suggests: covr, tinytest
Depends: R (>= 3.2.0)
Imports: httr, jsonlite
Imports: httr, jsonlite, stringi, scales
Roxygen: list(markdown = TRUE)
RoxygenNote: 6.1.1

6
NAMESPACE

@ -1,4 +1,10 @@
# Generated by roxygen2: do not edit by hand
S3method(print,mediacloud_stats)
export(mc_media_list)
export(mc_stats)
export(mediacloud_api_key)
import(httr)
import(stringi)
importFrom(jsonlite,fromJSON)
importFrom(scales,comma)

7
R/aaa.r

@ -0,0 +1,7 @@
# User-agent configuration attached to every API request made by the package.
# The string embeds the package version and the URL field taken from the
# package DESCRIPTION.
.MEDIACLOUD_UA <- httr::user_agent(
  sprintf(
    "mediacloud package v%s: (<%s>)",
    utils::packageVersion("mediacloud"),
    utils::packageDescription("mediacloud")$URL
  )
)

46
R/api-key.r

@ -0,0 +1,46 @@
#' Get or set MEDIACLOUD_API_KEY value
#'
#' The API wrapper functions in this package all rely on a Media Cloud API
#' key residing in the environment variable \code{MEDIACLOUD_API_KEY}. The
#' easiest way to accomplish this is to set it in the `.Renviron` file in your
#' home directory.
#'
#' Every API call must include a key parameter which will authenticate you to
#' the API service. To get a key, register for a user via
#' <https://topics.mediacloud.org/#/user/signup>. Once you have an account go
#' here to see your key <https://topics.mediacloud.org/#/user/profile>.
#'
#' If the environment variable is unset (or `force` is `TRUE`) and the session
#' is interactive, the key is read from the console and stored in
#' `MEDIACLOUD_API_KEY` for the remainder of the session. In a non-interactive
#' session an error is raised instead.
#'
#' @param force force setting a new Media Cloud API key for the current environment?
#' @return atomic character vector containing the Media Cloud API key
#' @references <https://github.com/berkmancenter/mediacloud/blob/master/doc/api_2_0_spec/api_2_0_spec.md>
#' @export
mediacloud_api_key <- function(force = FALSE) {

  env <- Sys.getenv("MEDIACLOUD_API_KEY")

  # Fast path: a key is already configured and the caller did not ask to
  # replace it.
  if (!identical(env, "") && !force) return(env)

  # Prompting is impossible without a console, so fail loudly.
  if (!interactive()) {
    stop("Please set env var MEDIACLOUD_API_KEY to your Media Cloud API key",
         call. = FALSE)
  }

  # Only claim the variable is missing when it really is (not when `force`
  # brought us here).
  if (identical(env, "")) {
    message("Couldn't find env var MEDIACLOUD_API_KEY. See ?mediacloud_api_key for more details.")
  }
  message("Please enter your API key and press enter:")
  pat <- readline(": ")

  if (identical(pat, "")) {
    stop("Media Cloud API key entry failed", call. = FALSE)
  }

  message("Updating MEDIACLOUD_API_KEY env var")
  Sys.setenv(MEDIACLOUD_API_KEY = pat)

  pat

}

91
R/media-list.R

@ -0,0 +1,91 @@
#' Return multiple media sources
#'
#' @param name Name of media source for which to search; If this parameter is
#'        specified, the call returns only media sources that match a case
#'        insensitive search specified value. If the specified value is less
#'        than 3 characters long, the call returns an empty list. Default: none
#' @param query Return media with at least one sentence that matches this Solr
#'        query; For a description of the Solr query format, see
#'        <https://github.com/berkmancenter/mediacloud/blob/master/doc/api_2_0_spec/api_2_0_spec.md#apiv2stories_publiclist>.
#'        Default: null
#' @param tags_id Return media associated with any of the given tags; If this is not
#'        a length 1 character vector each vector element will be passed in as
#'        an additional "tag id" list. These lists are AND'd together on the
#'        server. Default: null
#' @param last_media_id Return media sources with a media_id greater than this value; Default: 0
#' @param rows Number of media sources to return. Cannot be larger than 100; Default: 20
#' @param tag_name Name of tag for which to return belonging media; Default: none
#' @param timespans_id Return media within the given timespan; If this parameter
#'        is specified, return media within the given time slice, sorted by
#'        descending inlink_count within the timespan. If topic_mode is set to
#'        'live', return media from the live topic stories rather than from
#'        the frozen snapshot. Default: null
#' @param topic_mode If set to 'live', return media from live topics; Default: null
#' @param include_dups Include duplicate media among the results; logical
#'        values are converted to 0/1 before being sent. Default: 0
#' @param unhealthy Only return media that are currently marked as unhealthy (see mediahealth/list); Default: none
#' @param similar_media_id Return media with the most tags in common; Default: none
#' @param sort sort order of media: id, or num_stories; By default, media are
#'        sorted by media_id. If this parameter is set to 'num_stories',
#'        the media will be sorted by decreasing number of stories in the past
#'        90 days
#' @param ... passed on to `httr` verb calls
#' @param api_key See [mediacloud_api_key()]
#' @return the parsed JSON response, classed as
#'         `c("tbl_df", "tbl", "data.frame")`
#' @note By default, calls that specify a name parameter will only return media
#'       that are not duplicates of some other media source. Media Cloud has
#'       many media sources that are either subsets of other media sources or
#'       are just holders for spidered media from a given media source, both of
#'       which are marked as duplicate media and are not included in the
#'       default results. If the 'include_dups' parameter is set to 1, those
#'       duplicate sources will be included in the results.
#' @export
mc_media_list <- function(name = NULL, query = NULL, tags_id = NULL,
                          last_media_id = 0, rows = 0, tag_name = NULL,
                          timespans_id = NULL, topic_mode = NULL,
                          include_dups = 0, unhealthy = NULL,
                          similar_media_id = NULL, sort = c("id", "num_stories"),
                          ..., api_key = mediacloud_api_key()) {

  # NOTE(review): the documented API default for `rows` is 20 but the
  # signature default here is 0 -- confirm which one is intended.

  sort <- match.arg(sort[1], choices = c("id", "num_stories"), several.ok = FALSE)

  # Logical flags must go over the wire as integers, not "TRUE"/"FALSE".
  include_dups <- numerify(include_dups)
  unhealthy <- numerify(unhealthy)
  similar_media_id <- numerify(similar_media_id)

  params <- list(
    last_media_id = last_media_id,
    rows = rows,
    name = name,
    tag_name = tag_name,
    timespans_id = timespans_id,
    topic_mode = topic_mode,
    q = query,
    include_dups = include_dups,
    unhealthy = unhealthy,
    similar_media_id = similar_media_id,
    sort = sort,
    key = api_key
  )

  if (length(tags_id) < 2) {
    # Zero or one tag: a single `tags_id` parameter (assigning NULL is a no-op).
    params[["tags_id"]] <- tags_id
  } else {
    # Several tags: the first goes in `tags_id`, the rest in `tags_id_1`,
    # `tags_id_2`, ... as separate query parameters.
    tags_ids <- c("tags_id", sprintf("tags_id_%s", seq_len(length(tags_id) - 1L)))
    params <- append(params, as.list(set_names(tags_id, tags_ids)))
  }

  res <- httr::GET(
    url = "https://api.mediacloud.org/api/v2/media/list",
    query = params,
    .MEDIACLOUD_UA,
    ...
  )

  httr::stop_for_status(res)

  out <- httr::content(res, as = "text", encoding = "UTF-8")
  out <- jsonlite::fromJSON(out)

  # An empty JSON array parses to `list()`; turn it into a proper zero-row
  # data frame so that attaching the data.frame class below stays valid.
  if (is.list(out) && !is.data.frame(out) && length(out) == 0) {
    out <- data.frame()
  }

  class(out) <- c("tbl_df", "tbl", "data.frame")

  out

}

6
R/mediacloud-package.R

@ -1,12 +1,14 @@
#' ...
#'
#'
#' - URL: <https://gitlab.com/hrbrmstr/mediacloud>
#' - BugReports: <https://gitlab.com/hrbrmstr/mediacloud/issues>
#'
#'
#' @md
#' @name mediacloud
#' @keywords internal
#' @author Bob Rudis (bob@@rud.is)
#' @import httr
#' @import stringi
#' @importFrom scales comma
#' @importFrom jsonlite fromJSON
"_PACKAGE"

83
R/stats.R

@ -0,0 +1,83 @@
#' Return basic summary stats about total sources, stories, feeds, etc
#' processed by Media Cloud
#'
#' Issues a GET request against the `stats/list` endpoint and parses the JSON
#' body of the response.
#'
#' @param ... passed on to `httr` verb calls
#' @param api_key See [mediacloud_api_key()]
#' @return the parsed response with class `mediacloud_stats`
#' @export
mc_stats <- function(..., api_key = mediacloud_api_key()) {

  resp <- httr::GET(
    url = "https://api.mediacloud.org/api/v2/stats/list",
    query = list(key = api_key),
    .MEDIACLOUD_UA,
    ...
  )

  # Turn HTTP error statuses into R errors before touching the body.
  httr::stop_for_status(resp)

  body_txt <- httr::content(resp, as = "text", encoding = "UTF-8")
  stats <- jsonlite::fromJSON(body_txt)

  structure(stats, class = "mediacloud_stats")

}
#' Print method for `mediacloud_stats` objects
#'
#' Writes one "Title Cased Name: value" line per statistic, preceded by a
#' header showing the `stats_date` field. The `stats_date` and
#' `mediacloud_stats_id` fields are not listed among the statistics.
#'
#' @param x,... see [print()]
#' @return `x`, invisibly
#' @noRd
#' @export
print.mediacloud_stats <- function(x, ...) {

  pretty_names <- names(x)
  # Metadata fields are shown in the header (date) or omitted (id).
  pretty_names <- pretty_names[!(pretty_names %in% c("stats_date", "mediacloud_stats_id"))]

  pretty_vals <- x[pretty_names]

  # "daily_stories" -> "Daily Stories", left-padded so the colons line up.
  pretty_names <- stri_replace_all_fixed(pretty_names, "_", " ")
  pretty_names <- stri_trans_totitle(pretty_names)
  pretty_names <- stri_pad_left(pretty_names, max(nchar(pretty_names)))

  pretty_vals <- scales::comma(unlist(pretty_vals))

  cat(
    "Media Cloud summary stats as of ", x[["stats_date"]], "\n\n",
    paste0(sprintf("%s: %s", pretty_names, pretty_vals), collapse = "\n"), "\n",
    sep = ""
  )

  # Print methods conventionally hand back their argument invisibly.
  invisible(x)

}
## $active_crawled_feeds
## [1] 176064
##
## $active_crawled_media
## [1] 59067
##
## $daily_downloads
## [1] 1475208
##
## $daily_stories
## [1] 696983
##
## $mediacloud_stats_id
## [1] 544
##
## $stats_date
## [1] "2019-07-21"
##
## $total_downloads
## [1] 0
##
## $total_sentences
## [1] 0
##
## $total_stories
## [1] 1272053504
##
# list(active_crawled_feeds = 176064L, active_crawled_media = 59067L,
# daily_downloads = 1475208L, daily_stories = 696983L, mediacloud_stats_id = 544L,
# stats_date = "2019-07-21", total_downloads = 0L, total_sentences = 0L,
# total_stories = 1272053504L) -> x

13
R/utils.R

@ -0,0 +1,13 @@
# Convert a non-empty logical vector to integer (TRUE -> 1L, FALSE -> 0L);
# anything else, including NULL and zero-length input, is returned untouched.
numerify <- function(x) {
  needs_conversion <- length(x) > 0 && inherits(x, "logical")
  if (!needs_conversion) {
    return(x)
  }
  as.integer(x)
}
# Return `object` with its names set to `nm`. When `object` is omitted it
# defaults to `nm` itself, giving a vector named after its own values.
set_names <- function (object = nm, nm) {
  renamed <- object
  names(renamed) <- nm
  renamed
}

69
man/mc_media_list.Rd

@ -0,0 +1,69 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/media-list.R
\name{mc_media_list}
\alias{mc_media_list}
\title{Return multiple media sources}
\usage{
mc_media_list(name = NULL, query = NULL, tags_id = NULL,
last_media_id = 0, rows = 0, tag_name = NULL,
timespans_id = NULL, topic_mode = NULL, include_dups = 0,
unhealthy = NULL, similar_media_id = NULL, sort = c("id",
"num_stories"), ..., api_key = mediacloud_api_key())
}
\arguments{
\item{name}{Name of media source for which to search; If this parameter is
specified, the call returns only media sources that match a case
insensitive search specified value. If the specified value is less
than 3 characters long, the call returns an empty list. Default: none}
\item{query}{Return media with at least one sentence that matches this Solr
query; For a description of the Solr query format, see
\url{https://github.com/berkmancenter/mediacloud/blob/master/doc/api_2_0_spec/api_2_0_spec.md#apiv2stories_publiclist}.
Default: null}
\item{tags_id}{Return media associated with any of the given tags; If this is not
a length 1 character vector each vector element will be passed in as
an additional "tag id" list. These lists are AND'd together on the
server. Default: null}
\item{last_media_id}{Return media sources with a media_id greater than this value; Default: 0}
\item{rows}{Number of media sources to return. Cannot be larger than 100; Default: 20}
\item{tag_name}{Name of tag for which to return belonging media; Default: none}
\item{timespans_id}{Return media within the given timespan; If this parameter
is specified, return media within the given time slice, sorted by
descending inlink_count within the timespan. If topic_mode is set to
'live', return media from the live topic stories rather than from
the frozen snapshot. Default: null}
\item{topic_mode}{If set to 'live', return media from live topics; Default: null}
\item{include_dups}{Include duplicate media among the results; Default: 0}
\item{unhealthy}{Only return media that are currently marked as unhealthy (see mediahealth/list); Default: none}
\item{similar_media_id}{Return media with the most tags in common; Default: none}
\item{sort}{sort order of media: id, or num_stories; By default, media are
sorted by media_id. If this parameter is set to 'num_stories',
the media will be sorted by decreasing number of stories in the past
90 days}
\item{...}{passed on to \code{httr} verb calls}
\item{api_key}{See \code{\link[=mediacloud_api_key]{mediacloud_api_key()}}}
}
\description{
Return multiple media sources
}
\note{
By default, calls that specify a name parameter will only return media
that are not duplicates of some other media source. Media Cloud has
many media sources that are either subsets of other media sources or
are just holders for spidered media from a given media source, both of
which are marked as duplicate media and are not included in the
default results. If the 'include_dups' parameter is set to 1, those
duplicate sources will be included in the results.
}

18
man/mc_stats.Rd

@ -0,0 +1,18 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/stats.R
\name{mc_stats}
\alias{mc_stats}
\title{Return basic summary stats about total sources, stories, feeds, etc
processed by Media Cloud}
\usage{
mc_stats(..., api_key = mediacloud_api_key())
}
\arguments{
\item{...}{passed on to \code{httr} verb calls}
\item{api_key}{See \code{\link[=mediacloud_api_key]{mediacloud_api_key()}}}
}
\description{
Return basic summary stats about total sources, stories, feeds, etc
processed by Media Cloud
}

29
man/mediacloud_api_key.Rd

@ -0,0 +1,29 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/api-key.r
\name{mediacloud_api_key}
\alias{mediacloud_api_key}
\title{Get or set MEDIACLOUD_API_KEY value}
\usage{
mediacloud_api_key(force = FALSE)
}
\arguments{
\item{force}{force setting a new Media Cloud API key for the current environment?}
}
\value{
atomic character vector containing the Media Cloud API key
}
\description{
The API wrapper functions in this package all rely on a Media Cloud API
key residing in the environment variable \code{MEDIACLOUD_API_KEY}. The
easiest way to accomplish this is to set it in the \code{.Renviron} file in your
home directory.
}
\details{
Every API call must include a key parameter which will authenticate you to
the API service. To get a key, register for a user via
\url{https://topics.mediacloud.org/#/user/signup}. Once you have an account go
here to see your key \url{https://topics.mediacloud.org/#/user/profile}.
}
\references{
\url{https://github.com/berkmancenter/mediacloud/blob/master/doc/api_2_0_spec/api_2_0_spec.md}
}
Loading…
Cancel
Save