You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
216 lines
5.2 KiB
216 lines
5.2 KiB
#!/usr/bin/env Rscript
|
|
|
|
suppressPackageStartupMessages({
|
|
library(argparser, quietly = TRUE)
|
|
library(magrittr, quietly = TRUE)
|
|
})
|
|
|
|
# Command-line interface definition.
#
# Options:
#   --domain / -d  domain whose CDX metadata is extracted
#   --out    / -o  output file; when omitted the result is written to stdout
#   --list   / -l  flag: only list the available indexes
#   --index  / -i  specific index to query; when omitted the first index in
#                  the fetched list is used
parser <- arg_parser(
  description = 'Extract metadata of a specific target based on the results of "commoncrawl.org"

Examples:

$ ./cc.R --list # list indices
$ ./cc.R --domain github.com # defaults to most recent index
$ ./cc.R --domain github.com --out /tmp/gh.json # specify an output file
$ ./cc.R --index CC-MAIN-2018-34 --domain github.com # specify which index'
) %>%
  add_argument(
    arg = "--domain",
    help = "domain which will be crawled",
    type = "character",
    short = "-d",
    default = NA_character_
  ) %>%
  add_argument(
    arg = "--out",
    help = "specify an output file (default: stdout)",
    type = "character",
    short = "-o",
    default = NA_character_
  ) %>%
  add_argument(
    arg = "--list",
    help = "list all available indexes",
    short = "-l",
    flag = TRUE
  ) %>%
  add_argument(
    arg = "--index",
    help = "use a specific index file",
    type = "character",
    short = "-i",
    default = NA_character_
  )
|
|
|
|
opts <- parse_args(parser)

# Without a target domain there is nothing to crawl, unless the user only
# asked for the index listing: show the usage text and exit with a failure
# status. `&&` is used because both operands are scalar conditions.
if (is.na(opts$domain) && !opts$list) {
  print(parser)
  quit(save = "no", status = 1)
}
|
|
|
|
suppressPackageStartupMessages({
|
|
library(xml2, quietly = TRUE)
|
|
library(httr, quietly = TRUE)
|
|
library(rvest, quietly = TRUE)
|
|
library(lubridate, quietly = TRUE)
|
|
library(jsonlite, quietly = TRUE)
|
|
library(tidyverse, quietly = TRUE)
|
|
})
|
|
|
|
# Directory holding the cached index listing (`indexes.rds`).
cache_dir <- path.expand("~/.cc.R")

#' Setup the cache directory
#'
#' Creates `cache_dir` (including any missing parent directories) when it
#' does not exist yet. Fails loudly if the directory cannot be created, so
#' later cache writes do not fail with a less obvious error.
#'
#' @md
#' @return nothing (invisible `NULL`)
setup_cache <- function() {
  if (dir.exists(cache_dir)) {
    return(invisible(NULL))
  }

  created <- dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE)

  # Re-check existence: another process may have created it in the meantime.
  if (!created && !dir.exists(cache_dir)) {
    stop("Unable to create cache directory: ", cache_dir, call. = FALSE)
  }

  invisible(NULL)
}
|
|
|
|
#' Rebuild the cached list of CC crawl indexes
#'
#' Downloads the index listing page, extracts the link target from the first
#' table column and the label from the second one, and stores the resulting
#' table as `indexes.rds` inside the cache directory.
#'
#' @md
#' @return data frame (`month`/`year`/`path`)
refresh_index_cache <- function() {

  listing <- xml2::read_html("http://index.commoncrawl.org/")

  # Link targets found in the first column of the listing table.
  link_nodes <- rvest::html_nodes(listing, xpath = ".//td[1]/*/a")
  idx_paths <- rvest::html_attr(link_nodes, "href")

  # Second column: the label with the " Index" suffix removed is split on a
  # space into two pieces that are named month and year.
  label_nodes <- rvest::html_nodes(listing, xpath = ".//td[2]")
  labels <- rvest::html_text(label_nodes, trim = TRUE)
  labels <- str_replace(labels, " Index", "")
  pieces <- str_split(labels, " ")
  pieces <- purrr::map(pieces, set_names, c("month", "year"))

  idx <- map_df(pieces, as.list)
  idx <- mutate(idx, path = idx_paths)

  # Persist the table; the value of the write call is the function result.
  readr::write_rds(idx, file.path(cache_dir, "indexes.rds"))

}
|
|
|
|
#' Fetch cached or current CC crawl index paths
#'
#' Returns the cached index table when one exists. The cache is rebuilt when
#' it is missing, or when it has no entry for the current month and the day
#' of the month is past the 25th.
#'
#' @md
#' @return data frame (`month`/`year`/`path`)
fetch_indexes <- function() {

  cache_file <- file.path(cache_dir, "indexes.rds")

  if (!file.exists(cache_file)) {
    return(refresh_index_cache())
  }

  idx <- readr::read_rds(cache_file)

  # Read the date once so month, year and day all refer to the same day.
  today <- Sys.Date()

  # `month.name` is always English, so the comparison with the cached month
  # names does not depend on the session locale.
  this_month <- month.name[lubridate::month(today)]
  this_year <- lubridate::year(today)

  has_this_month <- nrow(
    dplyr::filter(idx, month == this_month, year == this_year)
  ) > 0

  if (!has_this_month && lubridate::day(today) > 25) {
    refresh_index_cache()
  } else {
    idx
  }

}
|
|
|
|
#' Retrieve domain CDX metadata from CC index
#'
#' Requests one result page for `*.{domain}` from the given index and parses
#' the response body as line-delimited JSON. Stops with an error on a
#' non-success HTTP status.
#'
#' @md
#' @param domain domain name
#' @param index CC index file (a leading `/` is ignored)
#' @param page API page #
#' @return data frame (CDX)
get_data <- function(domain, index, page) {

  # Index paths taken from the cached listing may start with "/"; strip it so
  # the request URL does not contain a double slash.
  index <- sub("^/+", "", index)

  res <- httr::GET(
    url = file.path("http://index.commoncrawl.org", glue::glue("{index}-index")),
    query = list(
      url = glue::glue("*.{domain}"),
      output = "json",
      page = page
    )
  )

  httr::stop_for_status(res)

  # Stream the raw body through an in-memory connection, closed on exit.
  rcon <- rawConnection(httr::content(res, as = "raw", encoding = "UTF-8"))
  on.exit(close(rcon), add = TRUE)

  jsonlite::stream_in(rcon, verbose = FALSE)

}
|
|
|
|
#' Grab all the URL data from the CC for a given index and domain
#'
#' Asks the index for the number of result pages for `*.{domain}`, then
#' fetches every page with `get_data()` and row-binds the results.
#'
#' @md
#' @param domain domain name
#' @param index CC index file (a leading `/` is ignored)
#' @return data frame (CDX); empty when the index reports zero pages
crawl_index <- function(domain, index) {

  # Index paths taken from the cached listing may start with "/"; strip it so
  # the request URL does not contain a double slash.
  index <- sub("^/+", "", index)

  res <- httr::GET(
    url = file.path("http://index.commoncrawl.org", glue::glue("{index}-index")),
    query = list(
      url = glue::glue("*.{domain}"),
      output = "json",
      showNumPages = TRUE
    )
  )

  httr::stop_for_status(res)

  meta <- jsonlite::fromJSON(httr::content(res, as = "text", encoding = "UTF-8"))

  # Pages are requested zero-based. seq_len() yields an empty sequence for
  # zero pages, whereas 0:(n - 1) would produce c(0, -1).
  pages <- seq_len(meta$pages) - 1L

  out <- purrr::map_df(pages, get_data, domain = domain, index = index)

  dplyr::as_tibble(out)

}
|
|
|
|
#' List the available CC crawl indices
#'
#' Side-effect of output to stdout: the `year`/`month`/`path` columns are
#' printed in reverse row order. An empty index table prints as an empty
#' data frame.
#'
#' @md
#' @return indices (invisibly), in their original row order
list_indexes <- function() {
  tmp <- data.frame(fetch_indexes())[, c("year", "month", "path"), drop = FALSE]

  # rev(seq_len(n)) is empty for n == 0, whereas n:1 would give c(0, 1) and
  # print a spurious row of NAs.
  reversed <- tmp[rev(seq_len(nrow(tmp))), , drop = FALSE]
  print(reversed, row.names = FALSE, quote = FALSE)

  invisible(tmp)
}
|
|
|
|
# Script entry point: make sure the cache directory exists, then either list
# the indexes or crawl one index for the requested domain.
setup_cache()

if (opts$list) {
  # Just list the indexes (will prime, cache and auto-update).
  list_indexes()
} else {
  idx <- fetch_indexes()

  if (is.na(opts$index)) {
    # No index requested: use the first entry of the index table.
    # NOTE(review): this relies on the listing being ordered newest first.
    if (nrow(idx) == 0) {
      stop("No indexes available", call. = FALSE)
    }
    index <- idx$path[1]
  } else {
    # Compare without a leading "/" so both spellings of an index match.
    if (!(gsub("^/", "", opts$index) %in% gsub("^/", "", idx$path))) {
      stop("Index does not exist", call. = FALSE)
    }
    index <- opts$index
  }

  out <- crawl_index(opts$domain, index)

  # Write to the requested file, or to stdout when --out was not given.
  where <- if (is.na(opts$out)) stdout() else file(path.expand(opts$out))
  jsonlite::stream_out(out, where, verbose = FALSE)
}

quit(save = "no", status = 0)
|
|
|