From bbeeb8f5162fbc882cb0c72519832311d62dc176 Mon Sep 17 00:00:00 2001 From: Craig McGowan Date: Thu, 12 Oct 2017 16:37:33 -0400 Subject: [PATCH 1/3] Add function to pull hospital data using new HTML5 interface and update documentation and tests. Resolves #8 --- DESCRIPTION | 2 +- NAMESPACE | 1 + R/get_hosp_data.R | 126 +++++++++++++++++++++++++++++++++ man/cdcfluview.Rd | 1 - man/census_regions.Rd | 1 - man/get_hosp_data.Rd | 42 +++++++++++ man/get_mortality_surveillance_data.Rd | 1 - man/get_state_data.Rd | 1 - man/get_weekly_flu_report.Rd | 1 - man/hhs_regions.Rd | 1 - 10 files changed, 170 insertions(+), 7 deletions(-) create mode 100644 R/get_hosp_data.R create mode 100644 man/get_hosp_data.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 67927b3..4b41a40 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -30,4 +30,4 @@ Imports: V8 Depends: R (>= 3.2.0) -RoxygenNote: 5.0.1 +RoxygenNote: 6.0.1 diff --git a/NAMESPACE b/NAMESPACE index 8ae1495..9bb4136 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand export(get_flu_data) +export(get_hosp_data) export(get_mortality_surveillance_data) export(get_state_data) export(get_weekly_flu_report) diff --git a/R/get_hosp_data.R b/R/get_hosp_data.R new file mode 100644 index 0000000..fce27f3 --- /dev/null +++ b/R/get_hosp_data.R @@ -0,0 +1,126 @@ +#' Retrieves influenza hospitalization statistics from the CDC +#' +#' Uses the data source from the +#' \href{https://gis.cdc.gov/GRASP/Fluview/FluHospRates.html}{CDC FluView} +#' and provides influenza hospitalization reporting data as a data frame. +#' +#' @param area one of "\code{flusurvnet}", "\code{eip}", "\code{ihsp}", or two +#' digit state abbreviation for an individual site. Exceptions are +#' New York - Albany ("\code{nya}") and New York - Rochester ("\code{nyr}") +#' @param age_group a vector of age groups to pull data for. Possible values are: +#' "\code{overall}", "\code{0-4y}", "\code{5-17y}, "\code{18-49y}, +#' "\code{50-64y}, "\code{65+y}". +#' @param years a vector of years to retrieve data for (i.e. \code{2014} for CDC +#' flu season 2014-2015). Default value is the current year and all +#' \code{years} values should be >= \code{2009} +#' @return A single \code{data.frame}. +#' @note There is often a noticeable delay when making the API request to the CDC. +#' This is not due to a large download size, but the time it takes for their +#' servers to crunch the data. Wrap the function call in \code{httr::with_verbose} +#' if you would like to see what's going on. +#' @export +#' @examples \dontrun{ +#' # All of FluSurv-NET, 50-64 years old, 2010/11-2014/15 flu seasons +#' flu <- get_hosp_data("flusurvnet", "50-64y", years=2010:2014) +#' } +get_hosp_data <- function(area="flusurvnet", age_group="overall", + years=as.numeric(format(Sys.Date(), "%Y")) - 1) { + + area <- tolower(area) + age_group <- tolower(age_group) + + if (!(area %in% c("flusurvnet", "eip", "ihsp", "ca", "co", "ct", "ga", "md", + "mn", "nm", "nya", "nyr", "or", "tn", "id", "ia", "mi", + "oh", "ok", "ri", "sd", "ut"))) + stop("Error: area must be one of flusurvnet, eip, ihsp, or a valid state abbreviation") + + if (length(area) != 1) + stop("Error: can only select one area") + + if (!all(age_group %in% c("overall", "0-4y", "5-17y", "18-49y", + "50-64y", "65+y"))) + stop("Error: invalid age group specified") + + if (any(years < 2009)) + stop("Error: years should be >= 2009") + + # Match names of age groups to numbers for API + age_match <- data.frame(age_group = c("overall", "0-4y", "5-17y", + "18-49y", "50-64y", "65+y"), + code = c(6, 1, 2, 3, 4, 5)) + + age_group_num <- age_match$code[age_match$age_group %in% age_group] + + + # format the input parameters to fit the CDC API + + years <- years - 1960 + + area_match <- data.frame(area = c("flusurvnet", "eip", "ca", "co", "ct", + "ga", "md", "mn", "nm", "nya", "nyr", "or", + "tn", "ihsp", "id", "ia", "mi", "oh", "ok", + "ri", "sd", "ut"), + catch = c(22, 22, 1, 2, 3, 4, 7, 9, 11, 13, 14, 17, + 20, 22, 6, 5, 8, 15, 16, 18, 19, 21), + network = c(1, rep(2, 12), rep(3, 9))) + + # Format years + year_list <- lapply(seq_along(years), + function(x) list(ID = years[x])) + + # Format age group + age_list <- lapply(seq_along(age_group_num), + function(x) list(ID = age_group_num[x])) + + params <- list(AppVersion = "Public", + agegroups = age_list, + catchmentid = area_match$catch[area_match$area == area], + networkid = area_match$network[area_match$area == area], + seasons = year_list) + + out_file <- tempfile(fileext=".json") + + # CDC API returns a ZIP file so we grab, save & expand it to then read in CSVs + + tmp <- httr::POST("https://gis.cdc.gov/GRASP/Flu3/PostPhase03DownloadData", + body = params, + encode = "json", + httr::write_disk(out_file, overwrite = T)) + + httr::stop_for_status(tmp) + + if (!(file.exists(out_file))) + stop("Error: cannot process downloaded data") + + file <- jsonlite::fromJSON(out_file)[[1]] + + # pb <- dplyr::progress_estimated(length(file)) + # purrr::map(file, function(x) { + # pb$tick()$print() + # ct <- ifelse(grepl("who", x, ignore.case=TRUE), 1, 1) + # suppressMessages(readr::read_csv(x, skip=ct)) + # }) -> file_list + + # names(file_list) <- substr(basename(files), 1, nchar(basename(files)) - 4) + + # Depending on the parameters, there could be more than one + # file returned. When there's only one, return a more usable + # structure. + + # when no rows, then it's likely the caller specified the + # current year and the flu season has technically not started yet. + # so help them out and move the year back and get current flu + # season data. + + if ((nrow(file) == 0) && + (length(years)==1) && + (years == (as.numeric(format(Sys.Date(), "%Y"))-1960))) { + + message("Adjusting [years] to get current season...") + return(get_hosp_data(area=area, age_group=age_group, + years=years+1960-1)) + } else { + return(file) + } + +} diff --git a/man/cdcfluview.Rd b/man/cdcfluview.Rd index 1164f2d..a05f5d0 100644 --- a/man/cdcfluview.Rd +++ b/man/cdcfluview.Rd @@ -16,4 +16,3 @@ provides functions to access the data provided by portal's underlying API. \author{ Bob Rudis (bob@rud.is) } - diff --git a/man/census_regions.Rd b/man/census_regions.Rd index 7b72052..b6cdbaa 100644 --- a/man/census_regions.Rd +++ b/man/census_regions.Rd @@ -25,4 +25,3 @@ Last updated 2015-08-09. \url{https://www.cdc.gov/std/stats12/images/CensusMap.png} } \keyword{datasets} - diff --git a/man/get_hosp_data.Rd b/man/get_hosp_data.Rd new file mode 100644 index 0000000..5f2d7fe --- /dev/null +++ b/man/get_hosp_data.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_hosp_data.R +\name{get_hosp_data} +\alias{get_hosp_data} +\title{Retrieves influenza hospitalization statistics from the CDC} +\usage{ +get_hosp_data(area = "flusurvnet", age_group = "overall", + years = as.numeric(format(Sys.Date(), "\%Y")) - 1) +} +\arguments{ +\item{area}{one of "\code{flusurvnet}", "\code{eip}", "\code{ihsp}", or two +digit state abbreviation for an individual site. Exceptions are +New York - Albany ("\code{nya}") and New York - Rochester ("\code{nyr}")} + +\item{age_group}{a vector of age groups to pull data for. Possible values are: +"\code{overall}", "\code{0-4y}", "\code{5-17y}, "\code{18-49y}, +"\code{50-64y}, "\code{65+y}".} + +\item{years}{a vector of years to retrieve data for (i.e. \code{2014} for CDC +flu season 2014-2015). Default value is the current year and all +\code{years} values should be >= \code{2009}} +} +\value{ +A single \code{data.frame}. +} +\description{ +Uses the data source from the +\href{https://gis.cdc.gov/GRASP/Fluview/FluHospRates.html}{CDC FluView} +and provides influenza hospitalization reporting data as a data frame. +} +\note{ +There is often a noticeable delay when making the API request to the CDC. + This is not due to a large download size, but the time it takes for their + servers to crunch the data. Wrap the function call in \code{httr::with_verbose} + if you would like to see what's going on. +} +\examples{ +\dontrun{ +# All of FluSurv-NET, 50-64 years old, 2010/11-2014/15 flu seasons +flu <- get_hosp_data("flusurvnet", "50-64y", years=2010:2014) +} +} diff --git a/man/get_mortality_surveillance_data.Rd b/man/get_mortality_surveillance_data.Rd index 3c09933..bc54b6c 100644 --- a/man/get_mortality_surveillance_data.Rd +++ b/man/get_mortality_surveillance_data.Rd @@ -39,4 +39,3 @@ get_mortality_surveillance_data() \references{ \url{https://www.cdc.gov/flu/weekly/nchs.htm} } - diff --git a/man/get_state_data.Rd b/man/get_state_data.Rd index fb8ee2a..8700a5e 100644 --- a/man/get_state_data.Rd +++ b/man/get_state_data.Rd @@ -38,4 +38,3 @@ get_state_data(2010:2014) httr::with_verbose(get_state_data(2009:2015)) } } - diff --git a/man/get_weekly_flu_report.Rd b/man/get_weekly_flu_report.Rd index ca0afab..efa44f7 100644 --- a/man/get_weekly_flu_report.Rd +++ b/man/get_weekly_flu_report.Rd @@ -29,4 +29,3 @@ get_weekly_flu_report() \references{ \url{https://www.cdc.gov/flu/weekly/flureport.xml} } - diff --git a/man/hhs_regions.Rd b/man/hhs_regions.Rd index 49d1c90..2857a62 100644 --- a/man/hhs_regions.Rd +++ b/man/hhs_regions.Rd @@ -28,4 +28,3 @@ Last updated 2015-08-09. \url{https://www.hhs.gov/about/agencies/iea/regional-offices/index.html} } \keyword{datasets} - From ecb952e1277ffeeed42dfee577a60099a8e477ce Mon Sep 17 00:00:00 2001 From: Craig McGowan Date: Thu, 12 Oct 2017 16:37:33 -0400 Subject: [PATCH 2/3] Add function to pull hospital data using new HTML5 interface and update documentation and tests. Resolves #8 --- DESCRIPTION | 2 +- NAMESPACE | 1 + R/get_hosp_data.R | 118 +++++++++++++++++++++++++++++++++ man/cdcfluview.Rd | 1 - man/census_regions.Rd | 1 - man/get_hosp_data.Rd | 43 ++++++++++++ man/get_mortality_surveillance_data.Rd | 1 - man/get_state_data.Rd | 1 - man/get_weekly_flu_report.Rd | 1 - man/hhs_regions.Rd | 1 - 10 files changed, 163 insertions(+), 7 deletions(-) create mode 100644 R/get_hosp_data.R create mode 100644 man/get_hosp_data.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 67927b3..4b41a40 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -30,4 +30,4 @@ Imports: V8 Depends: R (>= 3.2.0) -RoxygenNote: 5.0.1 +RoxygenNote: 6.0.1 diff --git a/NAMESPACE b/NAMESPACE index 8ae1495..9bb4136 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand export(get_flu_data) +export(get_hosp_data) export(get_mortality_surveillance_data) export(get_state_data) export(get_weekly_flu_report) diff --git a/R/get_hosp_data.R b/R/get_hosp_data.R new file mode 100644 index 0000000..6242426 --- /dev/null +++ b/R/get_hosp_data.R @@ -0,0 +1,118 @@ +#' Retrieves influenza hospitalization statistics from the CDC +#' +#' Uses the data source from the +#' \href{https://gis.cdc.gov/GRASP/Fluview/FluHospRates.html}{CDC FluView} +#' and provides influenza hospitalization reporting data as a data frame. +#' +#' @param area one of "\code{flusurvnet}", "\code{eip}", "\code{ihsp}", or two +#' digit state abbreviation for an individual site. Exceptions are +#' New York - Albany ("\code{nya}") and New York - Rochester +#' ("\code{nyr}") +#' @param age_group a vector of age groups to pull data for. Possible values are: +#' "\code{overall}", "\code{0-4y}", "\code{5-17y}, "\code{18-49y}, +#' "\code{50-64y}, "\code{65+y}". +#' @param years a vector of years to retrieve data for (i.e. \code{2014} for CDC +#' flu season 2014-2015). Default value is the current year and all +#' \code{years} values should be >= \code{2009} +#' @return A single \code{data.frame}. +#' @note There is often a noticeable delay when making the API request to the CDC. +#' This is not due to a large download size, but the time it takes for their +#' servers to crunch the data. Wrap the function call in \code{httr::with_verbose} +#' if you would like to see what's going on. +#' @export +#' @examples \dontrun{ +#' # All of FluSurv-NET, 50-64 years old, 2010/11-2014/15 flu seasons +#' flu <- get_hosp_data("flusurvnet", "50-64y", years=2010:2014) +#' } +get_hosp_data <- function(area="flusurvnet", age_group="overall", + years=as.numeric(format(Sys.Date(), "%Y")) - 1) { + + area <- tolower(area) + age_group <- tolower(age_group) + + if (!(area %in% c("flusurvnet", "eip", "ihsp", "ca", "co", "ct", "ga", "md", + "mn", "nm", "nya", "nyr", "or", "tn", "id", "ia", "mi", + "oh", "ok", "ri", "sd", "ut"))) + stop("Error: area must be one of flusurvnet, eip, ihsp, or a valid state abbreviation") + + if (length(area) != 1) + stop("Error: can only select one area") + + if (!all(age_group %in% c("overall", "0-4y", "5-17y", "18-49y", + "50-64y", "65+y"))) + stop("Error: invalid age group specified") + + if (any(years < 2009)) + stop("Error: years should be >= 2009") + + # Match names of age groups to numbers for API + age_match <- data.frame(age_group = c("overall", "0-4y", "5-17y", + "18-49y", "50-64y", "65+y"), + code = c(6, 1, 2, 3, 4, 5)) + + age_group_num <- age_match$code[age_match$age_group %in% age_group] + + + # format the input parameters to fit the CDC API + + years <- years - 1960 + + area_match <- data.frame(area = c("flusurvnet", "eip", "ca", "co", "ct", + "ga", "md", "mn", "nm", "nya", "nyr", "or", + "tn", "ihsp", "id", "ia", "mi", "oh", "ok", + "ri", "sd", "ut"), + catch = c(22, 22, 1, 2, 3, 4, 7, 9, 11, 13, 14, 17, + 20, 22, 6, 5, 8, 15, 16, 18, 19, 21), + network = c(1, rep(2, 12), rep(3, 9))) + + # Format years + year_list <- lapply(seq_along(years), + function(x) list(ID = years[x])) + + # Format age group + age_list <- lapply(seq_along(age_group_num), + function(x) list(ID = age_group_num[x])) + + params <- list(AppVersion = "Public", + agegroups = age_list, + catchmentid = area_match$catch[area_match$area == area], + networkid = area_match$network[area_match$area == area], + seasons = year_list) + + out_file <- tempfile(fileext=".json") + + # CDC API returns a ZIP file so we grab, save & expand it to then read in CSVs + + tmp <- httr::POST("https://gis.cdc.gov/GRASP/Flu3/PostPhase03DownloadData", + body = params, + encode = "json", + httr::write_disk(out_file, overwrite = T)) + + httr::stop_for_status(tmp) + + if (!(file.exists(out_file))) + stop("Error: cannot process downloaded data") + + file <- jsonlite::fromJSON(out_file)[[1]] + + # pb <- dplyr::progress_estimated(length(file)) + # purrr::map(file, function(x) { + # pb$tick()$print() + # ct <- ifelse(grepl("who", x, ignore.case=TRUE), 1, 1) + # suppressMessages(readr::read_csv(x, skip=ct)) + # }) -> file_list + + # names(file_list) <- substr(basename(files), 1, nchar(basename(files)) - 4) + + # Depending on the parameters, there could be more than one + # file returned. When there's only one, return a more usable + # structure. + + # when no rows, then it's likely the caller specified the + # current year and the flu season has technically not started yet. + # so help them out and move the year back and get current flu + # season data. + + return(file) + +} diff --git a/man/cdcfluview.Rd b/man/cdcfluview.Rd index 1164f2d..a05f5d0 100644 --- a/man/cdcfluview.Rd +++ b/man/cdcfluview.Rd @@ -16,4 +16,3 @@ provides functions to access the data provided by portal's underlying API. \author{ Bob Rudis (bob@rud.is) } - diff --git a/man/census_regions.Rd b/man/census_regions.Rd index 7b72052..b6cdbaa 100644 --- a/man/census_regions.Rd +++ b/man/census_regions.Rd @@ -25,4 +25,3 @@ Last updated 2015-08-09. \url{https://www.cdc.gov/std/stats12/images/CensusMap.png} } \keyword{datasets} - diff --git a/man/get_hosp_data.Rd b/man/get_hosp_data.Rd new file mode 100644 index 0000000..ce68ecb --- /dev/null +++ b/man/get_hosp_data.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_hosp_data.R +\name{get_hosp_data} +\alias{get_hosp_data} +\title{Retrieves influenza hospitalization statistics from the CDC} +\usage{ +get_hosp_data(area = "flusurvnet", age_group = "overall", + years = as.numeric(format(Sys.Date(), "\%Y")) - 1) +} +\arguments{ +\item{area}{one of "\code{flusurvnet}", "\code{eip}", "\code{ihsp}", or two +digit state abbreviation for an individual site. Exceptions are +New York - Albany ("\code{nya}") and New York - Rochester +("\code{nyr}")} + +\item{age_group}{a vector of age groups to pull data for. Possible values are: +"\code{overall}", "\code{0-4y}", "\code{5-17y}, "\code{18-49y}, +"\code{50-64y}, "\code{65+y}".} + +\item{years}{a vector of years to retrieve data for (i.e. \code{2014} for CDC +flu season 2014-2015). Default value is the current year and all +\code{years} values should be >= \code{2009}} +} +\value{ +A single \code{data.frame}. +} +\description{ +Uses the data source from the +\href{https://gis.cdc.gov/GRASP/Fluview/FluHospRates.html}{CDC FluView} +and provides influenza hospitalization reporting data as a data frame. +} +\note{ +There is often a noticeable delay when making the API request to the CDC. + This is not due to a large download size, but the time it takes for their + servers to crunch the data. Wrap the function call in \code{httr::with_verbose} + if you would like to see what's going on. +} +\examples{ +\dontrun{ +# All of FluSurv-NET, 50-64 years old, 2010/11-2014/15 flu seasons +flu <- get_hosp_data("flusurvnet", "50-64y", years=2010:2014) +} +} diff --git a/man/get_mortality_surveillance_data.Rd b/man/get_mortality_surveillance_data.Rd index 3c09933..bc54b6c 100644 --- a/man/get_mortality_surveillance_data.Rd +++ b/man/get_mortality_surveillance_data.Rd @@ -39,4 +39,3 @@ get_mortality_surveillance_data() \references{ \url{https://www.cdc.gov/flu/weekly/nchs.htm} } - diff --git a/man/get_state_data.Rd b/man/get_state_data.Rd index fb8ee2a..8700a5e 100644 --- a/man/get_state_data.Rd +++ b/man/get_state_data.Rd @@ -38,4 +38,3 @@ get_state_data(2010:2014) httr::with_verbose(get_state_data(2009:2015)) } } - diff --git a/man/get_weekly_flu_report.Rd b/man/get_weekly_flu_report.Rd index ca0afab..efa44f7 100644 --- a/man/get_weekly_flu_report.Rd +++ b/man/get_weekly_flu_report.Rd @@ -29,4 +29,3 @@ get_weekly_flu_report() \references{ \url{https://www.cdc.gov/flu/weekly/flureport.xml} } - diff --git a/man/hhs_regions.Rd b/man/hhs_regions.Rd index 49d1c90..2857a62 100644 --- a/man/hhs_regions.Rd +++ b/man/hhs_regions.Rd @@ -28,4 +28,3 @@ Last updated 2015-08-09. \url{https://www.hhs.gov/about/agencies/iea/regional-offices/index.html} } \keyword{datasets} - From da7430bd4eab43ac88606809cebf206fcf6926a5 Mon Sep 17 00:00:00 2001 From: Craig McGowan Date: Mon, 16 Oct 2017 16:00:06 -0400 Subject: [PATCH 3/3] Update get_flu_data to new CDC API and availability of state level data --- DESCRIPTION | 4 +-- R/get_flu_data.r | 96 ++++++++++++++++++++++++++++++++++++++++++----------- man/get_flu_data.Rd | 8 +++-- 3 files changed, 84 insertions(+), 24 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4b41a40..960ad58 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,8 +3,8 @@ Type: Package Title: Retrieve U.S. Flu Season Data from the CDC FluView Portal Version: 0.5.2 Date: 2017-03-14 -Author: Bob Rudis (bob@rud.is) -Maintainer: Bob Rudis +Authors@R: c(person("Bob","Rudis", email = "bob@rud.is", role = c("aut", "cre")), + person("Craig", "McGowan", email = "mcgowan.cj@gmail.com", role = "ctb")) Encoding: UTF-8 Description: The U.S. Centers for Disease Control (CDC) maintains a portal for diff --git a/R/get_flu_data.r b/R/get_flu_data.r index 22d4279..71dbb39 100644 --- a/R/get_flu_data.r +++ b/R/get_flu_data.r @@ -9,11 +9,14 @@ #' A lookup table between HHS regions and their member states/territories #' is provided in \code{\link{hhs_regions}}. #' -#' @param region one of "\code{hhs}", "\code{census}", "\code{national}" +#' @param region one of "\code{hhs}", "\code{census}", "\code{national}", +#' "\code{state}" #' @param sub_region depends on the \code{region_type}.\cr #' For "\code{national}", the \code{sub_region} should be \code{NA}.\cr #' For "\code{hhs}", should be a vector between \code{1:10}.\cr -#' For "\code{census}", should be a vector between \code{1:9} +#' For "\code{census}", should be a vector between \code{1:9}.\cr +#' For "\code{state}", should be a vector of state/territory names +#' or "\code{all}". #' @param data_source either of "\code{who}" (for WHO NREVSS) or "\code{ilinet}" #' or "\code{all}" (for both) #' @param years a vector of years to retrieve data for (i.e. \code{2014} for CDC @@ -37,13 +40,13 @@ get_flu_data <- function(region="hhs", sub_region=1:10, region <- tolower(region) data_source <- tolower(data_source) - if (!(region %in% c("hhs", "census", "national"))) + if (!(region %in% c("hhs", "census", "national", "state"))) stop("Error: region must be one of hhs, census or national") if (length(region) != 1) stop("Error: can only select one region") - if (region=="national") sub_region = "" + if (region=="national") sub_region = 0 if ((region=="hhs") && !all(sub_region %in% 1:10)) stop("Error: sub_region values must fall between 1:10 when region is 'hhs'") @@ -57,31 +60,75 @@ get_flu_data <- function(region="hhs", sub_region=1:10, if (any(years < 1997)) stop("Error: years should be > 1997") - # format the input parameters to fit the CDC API - - years <- years - 1960 + # Match names of states to numbers for API + if (region == "state") { + sub_region <- tolower(sub_region) - reg <- as.numeric(c("hhs"=1, "census"=2, "national"=3)[[region]]) + if (any(sub_region == "all")) { + sub_region_inpt <- 1:57 + } else { + state_match <- data.frame(state = tolower(c(sort(c(datasets::state.name, + "District of Columbia")), + "American Samoa", + "Commonwealth of the Northern Mariana Islands", + "Puerto Rico", + "Virgin Islands", + "New York City", + "Los Angeles")), + num = 1:57, + stringsAsFactors = F) + + sub_region_inpt <- state_match$num[state_match$state %in% sub_region] + + if (length(sub_region_inpt) == 0) + stop("Error: no eligible state/territory names provided") + } + } else sub_region_inpt <- sub_region - if ("all" %in% data_source) data_source <- c("who", "ilinet") + # format the input parameters to fit the CDC API - data_source <- gsub("who", "WHO_NREVSS", data_source) - data_source <- gsub("ilinet", "ILINet", data_source) + years <- years - 1960 - params <- list(SubRegionsList = paste0(sub_region, collapse=","), - DataSources = paste0(data_source, collapse=","), - RegionID = reg, - SeasonsList = paste0(years, collapse=",")) + reg <- as.numeric(c("hhs"=1, "census"=2, "national"=3, "state" = 5)[[region]]) + + # Format data source + if (data_source == "who") { + data_list <- list(list(ID = 0, + Name = "WHO_NREVSS")) + } else if (data_source == "ilinet") { + data_list <- list(list(ID = 1, + Name = "ILINet")) + } else data_list <- list(list(ID = 0, + Name = "WHO_NREVSS"), + list(ID = 1, + Name = "ILINet")) + + # Format years + year_list <- lapply(seq_along(years), + function(x) list(ID = years[x], + Name = paste(years[x]))) + + # Format sub regions + sub_reg_list <- lapply(seq_along(sub_region_inpt), + function(x) list(ID = sub_region_inpt[x], + Name = paste(sub_region_inpt[x]))) + + params <- list(AppVersion = "Public", + DatasourceDT = data_list, + RegionTypeId = reg, + SeasonsDT = year_list, + SubRegionsDT = sub_reg_list) out_file <- tempfile(fileext=".zip") # CDC API returns a ZIP file so we grab, save & expand it to then read in CSVs - tmp <- httr::POST("https://gis.cdc.gov/grasp/fluview/FluViewPhase2CustomDownload.ashx", + tmp <- httr::POST("https://gis.cdc.gov/grasp/flu2/PostPhase02DataDownload", body = params, - write_disk(out_file)) + encode = "json", + httr::write_disk(out_file)) - stop_for_status(tmp) + httr::stop_for_status(tmp) if (!(file.exists(out_file))) stop("Error: cannot process downloaded data") @@ -97,7 +144,17 @@ get_flu_data <- function(region="hhs", sub_region=1:10, suppressMessages(readr::read_csv(x, skip=ct)) }) -> file_list - names(file_list) <- substr(basename(files), 1, 3) + names(file_list) <- substr(basename(files), 1, nchar(basename(files)) - 4) + + # If data are missing, X causes numeric columns to be read as character + purrr::map(file_list, function(x) { + # Create list of columns that should be numeric - exclude character columns + cols <- which(!colnames(x) %in% c("REGION", "REGION TYPE", + "SEASON_DESCRIPTION")) + suppressWarnings(x[cols] <- purrr::map(x[cols], as.numeric)) + return(x) + }) -> file_list + # Depending on the parameters, there could be more than one # file returned. When there's only one, return a more usable @@ -128,3 +185,4 @@ get_flu_data <- function(region="hhs", sub_region=1:10, } } + diff --git a/man/get_flu_data.Rd b/man/get_flu_data.Rd index 7d222cc..a5c3b48 100644 --- a/man/get_flu_data.Rd +++ b/man/get_flu_data.Rd @@ -8,12 +8,15 @@ get_flu_data(region = "hhs", sub_region = 1:10, data_source = "ilinet", years = as.numeric(format(Sys.Date(), "\%Y"))) } \arguments{ -\item{region}{one of "\code{hhs}", "\code{census}", "\code{national}"} +\item{region}{one of "\code{hhs}", "\code{census}", "\code{national}", +"\code{state}"} \item{sub_region}{depends on the \code{region_type}.\cr For "\code{national}", the \code{sub_region} should be \code{NA}.\cr For "\code{hhs}", should be a vector between \code{1:10}.\cr -For "\code{census}", should be a vector between \code{1:9}} +For "\code{census}", should be a vector between \code{1:9}.\cr +For "\code{state}", should be a vector of state/territory names +or "\code{all}".} \item{data_source}{either of "\code{who}" (for WHO NREVSS) or "\code{ilinet}" or "\code{all}" (for both)} @@ -49,4 +52,3 @@ There is often a noticeable delay when making the API request to the CDC. flu <- get_flu_data("hhs", 1:10, c("who", "ilinet"), years=2000:2014) } } -