You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
143 lines
4.4 KiB
143 lines
4.4 KiB
possibly <- function(.f, otherwise, quiet = TRUE) {
|
|
force(otherwise)
|
|
function(...) {
|
|
tryCatch(
|
|
.f(...),
|
|
error = function(e) {
|
|
if (!quiet)
|
|
message("Error: ", e$message)
|
|
otherwise
|
|
},
|
|
interrupt = function(e) {
|
|
stop("Terminated by user", call. = FALSE)
|
|
}
|
|
)
|
|
}
|
|
}
|
|
|
|
safe_GET <- possibly(GET, NULL, quiet = TRUE)
|
|
|
|
#' Download file from the Internet (cache-aware)
|
|
#'
|
|
#' This is an alternative to [utils::download.file()] and a convenience wrapper for
|
|
#' [GET()] + [httr::write_disk()] to perform file downloads.
|
|
#'
|
|
#' Since this function uses [GET()], callers can pass in `httr` configuration
|
|
#' options to customize the behaviour of the download process (e.g. specify a `User-Agent` via
|
|
#' [user_agent()], set proxy config via [use_proxy()], etc.).
|
|
#'
|
|
#' The function is also "cache-aware" in the sense that you deliberately have to specify
|
|
#' `overwrite = TRUE` to force a re-download. This has the potential to save bandwidth
|
|
#' of both the caller and the site hosting files for download.
|
|
#'
|
|
#' @note While this function supports specifying multiple URLs and download paths it
|
|
#' does not perform concurrent downloads.
|
|
#' @param url the url(s) of the file to retrieve. If multiple URLs are provided then the same
|
|
#' number of `path`s must also be provided.
|
|
#' @param path Path(s) to save content to. If more than one `path` is specified then the same
|
|
#' number of `url`s must also be provided. THis parameter will be [path.expand()]ed.
|
|
#' @param overwrite Will only overwrite existing path if `TRUE`.
|
|
#' @param ... passed on to [GET()]
|
|
#' @return a data frame containing the `url`(s), `path`(s), cache status, and HTTP status code(s).
|
|
#' If there was an error downloading a file the path, status code, and HTTP status
|
|
#' columns will be `NA`. If the file was now re-downloaded the status code will be 399
|
|
#' @seealso [GET()]; [write_disk()]
|
|
#' @export
|
|
#' @examples
|
|
#' tmp1 <- tempfile()
|
|
#' tmp2 <- tempfile()
|
|
#' tmp3 <- tempfile()
|
|
#'
|
|
#' download_file("https://google.com", tmp1) # downloads fine
|
|
#' download_file("https://google.com", tmp1) # doesn't re-download since it's cached
|
|
#' download_file("https://google.com", tmp1, overwrite = TRUE) # re-downloads (overwrites file)
|
|
#' download_file("https://google.com", tmp2) # re-downloads (new file)
|
|
#' download_file("badurl", tmp3) # handles major errors gracefully
|
|
#'
|
|
#' # multi-file example with no-caching
|
|
#' download_file(
|
|
#' c(rep("https://google.com", 2), "badurl"),
|
|
#' c(tmp1, tmp2, tmp3),
|
|
#' overwrite = TRUE
|
|
#' )
|
|
#'
|
|
#' # multi-file example with caching
|
|
#' download_file(
|
|
#' c(rep("https://google.com", 2), "badurl"),
|
|
#' c(tmp1, tmp2, tmp3),
|
|
#' overwrite = FALSE
|
|
#' )
|
|
download_file <- function(url, path, overwrite = FALSE, ...) {
|
|
|
|
url <- as.character(url)
|
|
path <- as.character(path)
|
|
|
|
if (length(url) != length(path)) {
|
|
stop("The lengths of the 'url' and 'path' parameters must be equal.", call.=FALSE)
|
|
}
|
|
|
|
path <- path.expand(path)
|
|
|
|
overwrite <- as.logical(overwrite)
|
|
stopifnot(length(overwrite) == 1)
|
|
|
|
out <- vector("list", length = length(url))
|
|
|
|
for (idx in seq_along(url)) {
|
|
|
|
u <- url[[idx]]
|
|
p <- path[[idx]]
|
|
|
|
if (file.exists(p)) {
|
|
|
|
if (overwrite) { # file exists but caller wants to re-download
|
|
res <- safe_GET(u, write_disk(p, overwrite = TRUE), ...)
|
|
if (is.null(res)) {
|
|
p <- NA_character_
|
|
cache_used = FALSE
|
|
status <- NA_integer_
|
|
} else {
|
|
cache_used <- FALSE
|
|
status <- status_code(res)
|
|
}
|
|
} else { # file exists but caller does not want to re-download
|
|
if (is.null(parse_url(u)[["hostname"]])) { # quick non-network test for invalid URL
|
|
p <- NA_character_
|
|
cache_used = FALSE
|
|
status <- NA_integer_
|
|
} else {
|
|
cache_used <- TRUE
|
|
status <- 399L
|
|
}
|
|
}
|
|
|
|
} else { # file does not exist, so do the thing
|
|
|
|
res <- safe_GET(u, write_disk(p, overwrite = overwrite), ...)
|
|
|
|
if (is.null(res)) {
|
|
p <- NA_character_
|
|
cache_used <- FALSE
|
|
status <- NA_integer_
|
|
} else {
|
|
status <- status_code(res)
|
|
cache_used <- FALSE
|
|
}
|
|
|
|
}
|
|
|
|
out[[idx]] <- data.frame(
|
|
url = u, path = p,
|
|
status_code = status,
|
|
cache_used = cache_used,
|
|
stringsAsFactors = FALSE
|
|
)
|
|
|
|
}
|
|
|
|
out <- do.call(rbind.data.frame, out)
|
|
class(out) <- c("tbl_df", "tbl", "data.frame")
|
|
|
|
invisible(out)
|
|
|
|
}
|
|
|