Authors@R: c(
person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"),
comment = c(ORCID = "0000-0001-5670-2640")),
person("Derek", "Abdine", comment = "Authentication driver update"),
person("Zachary", "Kurtz", email = "zdkurtz@gmail.com", role = "ctb")
Maintainer: Bob Rudis <bob@rud.is>
is included along with an interface to the 'AWS' command-line utility.
URL: https://github.com/hrbrmstr/metis
BugReports: https://github.com/hrbrmstr/metis/issues
SystemRequirements: JDK 1.8+
) -> .ll_trans
#' AthenaJDBC
#' @export
@ -18,8 +23,8 @@ setClass(
Athena <- function(identifier.quote = '`') {
driverClass = "com.amazonaws.athena.jdbc.AthenaDriver",
system.file("java", "AthenaJDBC41-1.1.0.jar", package = "metis"),
driverClass = "com.simba.athena.jdbc.Driver",
system.file("java", "AthenaJDBC42_2.0.2.jar", package = "metis"),
identifier.quote = identifier.quote
) -> drv
@ -33,10 +38,12 @@ Athena <- function(identifier.quote = '`') {
#' @param region AWS region the Athena tables are in
#' @param s3_staging_dir A write-able bucket on S3 that you have permissions for
#' @param schema_name LOL if only this actually worked with Amazon's hacked Presto driver
#' @param max_error_retries,connection_timeout,socket_timeout,retry_base_delay,retry_max_backoff_time
#' @param max_error_retries,connection_timeout,socket_timeout
#' technical connection info that you should only muck with if you know what you're doing.
#' @param log_path,log_level The Athena JDBC driver can (shockingly) provide a decent bit
#' of data in logs. Set this to a temporary directory or something log4j can use.
#' of data in logs. Set this to a temporary directory or something log4j can use. For
#' `log_level` use the names ("OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE") or
#' their corresponding integer values 0-6.
#' @param ... unused
#' @references <https://docs.aws.amazon.com/athena/latest/ug/connect-with-jdbc.html>
#' @export
@ -45,37 +52,42 @@ setMethod(
def = function(drv,
provider = "com.amazonaws.athena.jdbc.shaded.com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
region = "us-east-1",
s3_staging_dir = Sys.getenv("AWS_S3_STAGING_DIR"),
schema_name = "default",
max_error_retries = 10,
connection_timeout = 10000,
socket_timeout = 10000,
retry_base_delay = 100,
retry_max_backoff_time = 1000,
...) {
def = function(
provider = "com.simba.athena.amazonaws.auth.DefaultAWSCredentialsProviderChain",
region = "us-east-1",
s3_staging_dir = Sys.getenv("AWS_S3_STAGING_DIR"),
schema_name = "default",
max_error_retries = 10,
connection_timeout = 10000,
socket_timeout = 10000,
# retry_base_delay = 100,
# retry_max_backoff_time = 1000,
...) {
conn_string = sprintf(
'jdbc:awsathena://athena.%s.amazonaws.com:443/%s', region, schema_name
if (!(log_level %in% 0:6)) log_level <- .ll_trans[log_level]
s3_staging_dir = s3_staging_dir,
schema_name = schema_name,
max_error_retries = max_error_retries,
connection_timeout = connection_timeout,
socket_timeout = socket_timeout,
retry_base_delay = retry_base_delay,
retry_max_backoff_time = retry_max_backoff_time,
log_path = log_path,
log_level = log_level,
aws_credentials_provider_class = provider,
S3OutputLocation = s3_staging_dir,
Schema = schema_name,
MaxErrorRetry = max_error_retries,
ConnectTimeout = connection_timeout,
SocketTimeout = socket_timeout,
# retry_base_delay = retry_base_delay,
# retry_max_backoff_time = retry_max_backoff_time,
LogPath = log_path,
LogLevel = log_level,
AwsCredentialsProviderClass = provider,
) -> jc


#' @param max_error_retries the maximum number of retries that the JDBC client attempts to make a request to Athena.
#' @param connection_timeout the maximum amount of time, in milliseconds, to make a successful connection to Athena before an attempt is terminated.
#' @param socket_timeout the maximum amount of time, in milliseconds, to wait for a socket in order to send data to Athena.
#' @param retry_base_delay minimum delay, in milliseconds, between retry attempts to connect to Athena.
#' @param retry_max_backoff_time maximum delay, in milliseconds, between retry attempts to connect to Athena.
# @param retry_base_delay minimum delay amount, in milliseconds, between retrying attempts to connect Athena.
# @param retry_max_backoff_time maximum delay amount, in milliseconds, between retrying attempts to connect Athena.
#' @param log_path local path of the Athena JDBC driver logs. If no log path is provided, then no log files are created.
#' @param log_level log level of the Athena JDBC driver logs.
#' @param log_level log level of the Athena JDBC driver logs. Use names
#' @export
#' @examples \dontrun{
#' use_credentials("personal")
@ -27,21 +28,22 @@
#' dbGetQuery(ath, "SELECT * FROM sampledb.elb_logs LIMIT 1")
#' }
athena_connect <- function(default_schema = "default",
region = c("us-east-1", "us-east-2", "us-west-2"),
s3_staging_dir = Sys.getenv("AWS_S3_STAGING_DIR"),
max_error_retries = 10,
connection_timeout = 10000,
socket_timeout = 10000,
retry_base_delay = 100,
retry_max_backoff_time = 1000,
log_path = "",
log_level = c("INFO", "DEBUG", "WARN", "ERROR", "ALL", "OFF", "FATAL", "TRACE")) {
athena_connect <- function(
default_schema = "default",
region = c("us-east-1", "us-east-2", "us-west-2"),
s3_staging_dir = Sys.getenv("AWS_S3_STAGING_DIR"),
max_error_retries = 10,
connection_timeout = 10000,
socket_timeout = 10000,
# retry_base_delay = 100,
# retry_max_backoff_time = 1000,
log_path = "",
log_level = c("OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE")) {
athena_jdbc <- Athena()
region <- match.arg(region, c("us-east-1", "us-east-2", "us-west-2"))
log_level <- match.arg(log_level, c("INFO", "DEBUG", "WARN", "ERROR", "ALL", "OFF", "FATAL", "TRACE"))
log_level <- match.arg(log_level, c("OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE"))
# if (!simple) {
@ -52,8 +54,8 @@ athena_connect <- function(default_schema = "default",
max_error_retries = max_error_retries,
connection_timeout = connection_timeout,
socket_timeout = socket_timeout,
retry_base_delay = retry_base_delay,
retry_max_backoff_time = retry_max_backoff_time,
# retry_base_delay = retry_base_delay,
# retry_max_backoff_time = retry_max_backoff_time,
log_path = log_path,
log_level = log_level
) -> con


.onLoad <- function(libname, pkgname) {
rJava::.jpackage(pkgname, jars = "*", lib.loc = libname)
rJava::.jaddClassPath(dir(file.path(getwd(), "inst/java"), full.names = TRUE))


output: rmarkdown::github_document
chunk_output_type: console
# `metis`
@ -15,10 +16,11 @@ In Greek mythology, Metis was Athena's "helper".
Still fairly beta-quality, but getting there.
The goal will be to get around enough of the "gotchas" that are preventing raw RJDBC Athena
connections from "just working" with `dplyr` v0.6.0+ and also get around the [`fetchSize` problem](https://www.reddit.com/r/aws/comments/6aq22b/fetchsize_limit/) without having to not use `dbGetQuery()`.
The goal is to work around enough of the "gotchas" that prevent raw RJDBC Athena connections from "just working" with `dplyr` v0.6.0+, and also to work around the [`fetchSize` problem](https://www.reddit.com/r/aws/comments/6aq22b/fetchsize_limit/) without having to give up `dbGetQuery()`.
The `AthenaJDBC41-1.1.0.jar` JAR file is included out of convenience but that will likely move to a separate package as this gets closer to prime time if this goes on CRAN.
The `AthenaJDBC42_2.0.2.jar` JAR file is included out of convenience but that will likely move to a separate package as this gets closer to prime time if this goes on CRAN.
NOTE that the updated driver *REQUIRES JDK 1.8+*.
See the **Usage** section for an example.


# `metis`
Helpers for Accessing and Querying Amazon Athena
@ -19,10 +17,12 @@ v0.6.0+ and also get around the [`fetchSize`
without having to not use `dbGetQuery()`.
The `AthenaJDBC41-1.1.0.jar` JAR file is included out of convenience but
The `AthenaJDBC42_2.0.2.jar` JAR file is included out of convenience but
that will likely move to a separate package as this gets closer to prime
time if this goes on CRAN.
NOTE that the updated driver *REQUIRES JDK 1.8+*.
See the **Usage** section for an example.
## What’s Inside The Tin?
@ -111,21 +111,21 @@ dbGetQuery(ath, "SELECT * FROM sampledb.elb_logs LIMIT 10") %>%
## Observations: 10
## Variables: 16
## $ timestamp <dttm> 2014-09-30 01:03:00, 2014-09-30 01:03:01, 2014-09-30 01:03:01, 2014-09-30 01:03:01, ...
## $ timestamp <dttm> 2014-09-30 00:00:25, 2014-09-30 00:00:57, 2014-09-30 00:01:06, 2014-09-30 00:01:29, ...
## $ elbname <chr> "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo...
## $ requestip <chr> "", "", "", "", "", ...
## $ requestport <dbl> 4095, 14668, 29796, 38607, 32750, 10182, 64948, 51279, 13331, 2700
## $ backendip <chr> "", "", "", "", "", ...
## $ backendport <dbl> 8888, 443, 8899, 8888, 8888, 8888, 8888, 8888, 8888, 8000
## $ requestprocessingtime <dbl> 7.3e-05, 8.9e-05, 4.5e-05, 4.3e-05, 7.6e-05, 7.3e-05, 7.7e-05, 4.6e-05, 4.9e-05, 5.3e-05
## $ backendprocessingtime <dbl> 0.561864, 0.021517, 0.019530, 0.018937, 0.022727, 0.390384, 0.017017, 0.016437, 0.019...
## $ clientresponsetime <dbl> 9.0e-05, 7.0e-05, 3.0e-05, 3.3e-05, 3.2e-05, 8.4e-05, 5.2e-05, 7.1e-05, 6.9e-05, 5.4e-05
## $ elbresponsecode <int> 200, 304, 304, 304, 200, 200, 304, 304, 200, 304
## $ backendresponsecode <int> 200, 200, 403, 200, 200, 400, 200, 200, 200, 200
## $ requestip <chr> "", "", "", "", "245.27.105....
## $ requestport <dbl> 33998, 33998, 33998, 33998, 33998, 33998, 33998, 14346, 33998, 33998
## $ backendip <chr> "", "", "", "", "
## $ backendport <dbl> 8888, 8888, 8888, 8888, 8888, 8888, 8888, 8000, 8888, 8888
## $ requestprocessingtime <dbl> 0.000091, 0.000092, 0.000105, 0.000091, 0.000091, 0.000091, 0.000090, 0.000077, 0.000...
## $ backendprocessingtime <dbl> 0.048114, 0.055741, 0.008005, 0.037602, 0.039396, 0.053371, 0.040238, 0.192458, 0.027...
## $ clientresponsetime <dbl> 6.2e-05, 5.0e-05, 4.8e-05, 6.1e-05, 4.7e-05, 6.2e-05, 5.5e-05, 8.3e-05, 5.7e-05, 8.5e-05
## $ elbresponsecode <int> 200, 200, 302, 200, 200, 200, 200, 500, 200, 200
## $ backendresponsecode <int> 404, 200, 200, 200, 200, 200, 400, 500, 200, 200
## $ receivedbytes <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
## $ sentbytes <dbl> 58402, 0, 0, 0, 152213, 58402, 0, 0, 152213, 0
## $ sentbytes <dbl> 2, 2, 0, 2, 2, 2, 2, 28098, 2, 2
## $ requestverb <chr> "GET", "GET", "GET", "GET", "GET", "GET", "GET", "GET", "GET", "GET"
## $ url <chr> "http://www.abcxyz.com:80/", "http://www.abcxyz.com:80/static/css/hue3.css", "http://...
## $ url <chr> "http://www.abcxyz.com:80/jobbrowser/?format=json&state=running&user=l29ezwd", "http:...
## $ protocol <chr> "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "...
athena_connect(default_schema = "default", region = c("us-east-1",
"us-east-2", "us-west-2"),
s3_staging_dir = Sys.getenv("AWS_S3_STAGING_DIR"), max_error_retries = 10,
connection_timeout = 10000, socket_timeout = 10000,
retry_base_delay = 100, retry_max_backoff_time = 1000, log_path = "",
log_level = c("INFO", "DEBUG", "WARN", "ERROR", "ALL", "OFF", "FATAL",
connection_timeout = 10000, socket_timeout = 10000, log_path = "",
log_level = c("OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE"))
\item{default_schema}{default schema (you'll still need to fully qualify non-default schema table names)}
@ -25,13 +23,10 @@ athena_connect(default_schema = "default", region = c("us-east-1",
\item{socket_timeout}{the maximum amount of time, in milliseconds, to wait for a socket in order to send data to Athena.}
\item{retry_base_delay}{minimum delay amount, in milliseconds, between retrying attempts to connect Athena.}
\item{retry_max_backoff_time}{maximum delay amount, in milliseconds, between retrying attempts to connect Athena.}
\item{log_path}{local path of the Athena JDBC driver logs. If no log path is provided, then no log files are created.}
\item{log_level}{log level of the Athena JDBC driver logs.}
\item{log_level}{log level of the Athena JDBC driver logs. Use names
Handles the up-front JDBC config


provider = "com.amazonaws.athena.jdbc.shaded.com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
provider = "com.simba.athena.amazonaws.auth.DefaultAWSCredentialsProviderChain",
region = "us-east-1", s3_staging_dir = Sys.getenv("AWS_S3_STAGING_DIR"),
schema_name = "default", max_error_retries = 10,
connection_timeout = 10000, socket_timeout = 10000,
retry_base_delay = 100, retry_max_backoff_time = 1000, log_path,
log_level, ...)
connection_timeout = 10000, socket_timeout = 10000, log_path, log_level,
\item{provider}{JDBC auth provider (ideally leave default)}
@ -22,10 +21,12 @@
\item{schema_name}{LOL if only this actually worked with Amazon's hacked Presto driver}
\item{max_error_retries, connection_timeout, socket_timeout, retry_base_delay, retry_max_backoff_time}{technical connection info that you should only muck with if you know what you're doing.}
\item{max_error_retries, connection_timeout, socket_timeout}{technical connection info that you should only muck with if you know what you're doing.}
\item{log_path, log_level}{The Athena JDBC driver can (shockingly) provide a decent bit
of data in logs. Set this to a temporary directory or something log4j can use.}
of data in logs. Set this to a temporary directory or something log4j can use. For
`log_level` use the names ("OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE") or
their corresponding integer values 0-6.}
