diff --git a/DESCRIPTION b/DESCRIPTION index 9a4ef95..e48ca2d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -6,6 +6,7 @@ Date: 2018-03-19 Authors@R: c( person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-5670-2640")), + person("Derek", "Abdine", comment = "Authentication driver update"), person("Zachary", "Kurtz", email = "zdkurtz@gmail.com", role = "ctb") ) Maintainer: Bob Rudis @@ -15,6 +16,7 @@ Description: Methods are provides to connect to 'Amazon' 'Athena', lookup schema is included along with an interface to the 'AWS' command-line utility. URL: https://github.com/hrbrmstr/metis BugReports: https://github.com/hrbrmstr/metis/issues +SystemRequirements: JDK 1.8+ License: AGPL Suggests: testthat, diff --git a/R/jdbc.r b/R/jdbc.r index 591f869..d8b24ea 100644 --- a/R/jdbc.r +++ b/R/jdbc.r @@ -1,3 +1,8 @@ +stats::setNames( + 0:6, + c("OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE") +) -> .ll_trans + #' AthenaJDBC #' #' @export @@ -18,8 +23,8 @@ setClass( Athena <- function(identifier.quote = '`') { JDBC( - driverClass = "com.amazonaws.athena.jdbc.AthenaDriver", - system.file("java", "AthenaJDBC41-1.1.0.jar", package = "metis"), + driverClass = "com.simba.athena.jdbc.Driver", + system.file("java", "AthenaJDBC42_2.0.2.jar", package = "metis"), identifier.quote = identifier.quote ) -> drv @@ -33,10 +38,12 @@ Athena <- function(identifier.quote = '`') { #' @param region AWS region the Athena tables are in #' @param s3_staging_dir A write-able bucket on S3 that you have permissions for #' @param schema_name LOL if only this actually worked with Amazon's hacked Presto driver -#' @param max_error_retries,connection_timeout,socket_timeout,retry_base_delay,retry_max_backoff_time +#' @param max_error_retries,connection_timeout,socket_timeout #' technical connection info that you should only muck with if you know what you're doing. 
#' @param log_path,log_level The Athena JDBC driver can (shockingly) provide a decent bit -#' of data in logs. Set this to a temporary directory or somethign log4j can use. +#' of data in logs. Set this to a temporary directory or something log4j can use. For +#' `log_level` use the names ("OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE") or +#' their corresponding integer values 0-6. #' @param ... unused #' @references #' @export @@ -45,37 +52,42 @@ setMethod( "dbConnect", "AthenaDriver", - def = function(drv, - provider = "com.amazonaws.athena.jdbc.shaded.com.amazonaws.auth.DefaultAWSCredentialsProviderChain", - region = "us-east-1", - s3_staging_dir = Sys.getenv("AWS_S3_STAGING_DIR"), - schema_name = "default", - max_error_retries = 10, - connection_timeout = 10000, - socket_timeout = 10000, - retry_base_delay = 100, - retry_max_backoff_time = 1000, - log_path, - log_level, - ...) { + def = function( + drv, + provider = "com.simba.athena.amazonaws.auth.DefaultAWSCredentialsProviderChain", + region = "us-east-1", + s3_staging_dir = Sys.getenv("AWS_S3_STAGING_DIR"), + schema_name = "default", + max_error_retries = 10, + connection_timeout = 10000, + socket_timeout = 10000, + # retry_base_delay = 100, + # retry_max_backoff_time = 1000, + log_path, + log_level, + ...) 
{ conn_string = sprintf( 'jdbc:awsathena://athena.%s.amazonaws.com:443/%s', region, schema_name ) + if (!(log_level %in% 0:6)) log_level <- .ll_trans[log_level] + + + callNextMethod( drv, conn_string, - s3_staging_dir = s3_staging_dir, - schema_name = schema_name, - max_error_retries = max_error_retries, - connection_timeout = connection_timeout, - socket_timeout = socket_timeout, - retry_base_delay = retry_base_delay, - retry_max_backoff_time = retry_max_backoff_time, - log_path = log_path, - log_level = log_level, - aws_credentials_provider_class = provider, + S3OutputLocation = s3_staging_dir, + Schema = schema_name, + MaxErrorRetry = max_error_retries, + ConnectTimeout = connection_timeout, + SocketTimeout = socket_timeout, + # retry_base_delay = retry_base_delay, + # retry_max_backoff_time = retry_max_backoff_time, + LogPath = log_path, + LogLevel = log_level, + AwsCredentialsProviderClass = provider, ... ) -> jc diff --git a/R/metis.r b/R/metis.r index 64927ed..160dfdb 100644 --- a/R/metis.r +++ b/R/metis.r @@ -9,10 +9,11 @@ #' @param max_error_retries the maximum number of retries that the JDBC client attempts to make a request to Athena. #' @param connection_timeout the maximum amount of time, in milliseconds, to make a successful connection to Athena before an attempt is terminated. #' @param socket_timeout the maximum amount of time, in milliseconds, to wait for a socket in order to send data to Athena. -#' @param retry_base_delay minimum delay amount, in milliseconds, between retrying attempts to connect Athena. -#' @param retry_max_backoff_time maximum delay amount, in milliseconds, between retrying attempts to connect Athena. +# @param retry_base_delay minimum delay amount, in milliseconds, between retrying attempts to connect Athena. +# @param retry_max_backoff_time maximum delay amount, in milliseconds, between retrying attempts to connect Athena. #' @param log_path local path of the Athena JDBC driver logs. 
If no log path is provided, then no log files are created. -#' @param log_level log level of the Athena JDBC driver logs. +#' @param log_level log level of the Athena JDBC driver logs. Use names +#' "OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE". #' @export #' @examples \dontrun{ #' use_credentials("personal") @@ -27,21 +28,22 @@ #' dbGetQuery(ath, "SELECT * FROM sampledb.elb_logs LIMIT 1") #' #' } -athena_connect <- function(default_schema = "default", - region = c("us-east-1", "us-east-2", "us-west-2"), - s3_staging_dir = Sys.getenv("AWS_S3_STAGING_DIR"), - max_error_retries = 10, - connection_timeout = 10000, - socket_timeout = 10000, - retry_base_delay = 100, - retry_max_backoff_time = 1000, - log_path = "", - log_level = c("INFO", "DEBUG", "WARN", "ERROR", "ALL", "OFF", "FATAL", "TRACE")) { +athena_connect <- function( + default_schema = "default", + region = c("us-east-1", "us-east-2", "us-west-2"), + s3_staging_dir = Sys.getenv("AWS_S3_STAGING_DIR"), + max_error_retries = 10, + connection_timeout = 10000, + socket_timeout = 10000, + # retry_base_delay = 100, + # retry_max_backoff_time = 1000, + log_path = "", + log_level = c("OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE")) { athena_jdbc <- Athena() region <- match.arg(region, c("us-east-1", "us-east-2", "us-west-2")) - log_level <- match.arg(log_level, c("INFO", "DEBUG", "WARN", "ERROR", "ALL", "OFF", "FATAL", "TRACE")) + log_level <- match.arg(log_level, c("OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE")) # if (!simple) { dbConnect( @@ -52,8 +54,8 @@ athena_connect <- function(default_schema = "default", max_error_retries = max_error_retries, connection_timeout = connection_timeout, socket_timeout = socket_timeout, - retry_base_delay = retry_base_delay, - retry_max_backoff_time = retry_max_backoff_time, + # retry_base_delay = retry_base_delay, + # retry_max_backoff_time = retry_max_backoff_time, log_path = log_path, log_level = log_level ) -> con diff --git a/R/zzz.R 
b/R/zzz.R new file mode 100644 index 0000000..3021a97 --- /dev/null +++ b/R/zzz.R @@ -0,0 +1,4 @@ +.onLoad <- function(libname, pkgname) { + rJava::.jpackage(pkgname, jars = "*", lib.loc = libname) + rJava::.jaddClassPath(dir(file.path(getwd(), "inst/java"), full.names = TRUE)) +} diff --git a/README.Rmd b/README.Rmd index 6b4a8e9..4346955 100644 --- a/README.Rmd +++ b/README.Rmd @@ -1,7 +1,8 @@ --- output: rmarkdown::github_document +editor_options: + chunk_output_type: console --- -![](https://upload.wikimedia.org/wikipedia/commons/thumb/5/53/Winged_goddess_Louvre_F32.jpg/300px-Winged_goddess_Louvre_F32.jpg) # `metis` @@ -15,10 +16,11 @@ In Greek mythology, Metis was Athena's "helper". Still fairly beta-quality level but getting there. -The goal will be to get around enough of the "gotchas" that are preventing raw RJDBC Athena -connections from "just working" with `dplyr` v0.6.0+ and also get around the [`fetchSize` problem](https://www.reddit.com/r/aws/comments/6aq22b/fetchsize_limit/) without having to not use `dbGetQuery()`. +The goal will be to get around enough of the "gotchas" that are preventing raw RJDBC Athena connections from "just working" with `dplyr` v0.6.0+ and also get around the [`fetchSize` problem](https://www.reddit.com/r/aws/comments/6aq22b/fetchsize_limit/) without having to not use `dbGetQuery()`. -The `AthenaJDBC41-1.1.0.jar` JAR file is included out of convenience but that will likely move to a separate package as this gets closer to prime time if this goes on CRAN. +The `AthenaJDBC42_2.0.2.jar` JAR file is included out of convenience but that will likely move to a separate package as this gets closer to prime time if this goes on CRAN. + +NOTE that the updated driver *REQUIRES JDK 1.8+*. See the **Usage** section for an example. 
diff --git a/README.md b/README.md index 93e6863..09a0f74 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,4 @@ -![](https://upload.wikimedia.org/wikipedia/commons/thumb/5/53/Winged_goddess_Louvre_F32.jpg/300px-Winged_goddess_Louvre_F32.jpg) - # `metis` Helpers for Accessing and Querying Amazon Athena @@ -19,10 +17,12 @@ v0.6.0+ and also get around the [`fetchSize` problem](https://www.reddit.com/r/aws/comments/6aq22b/fetchsize_limit/) without having to not use `dbGetQuery()`. -The `AthenaJDBC41-1.1.0.jar` JAR file is included out of convenience but +The `AthenaJDBC42_2.0.2.jar` JAR file is included out of convenience but that will likely move to a separate package as this gets closer to prime time if this goes on CRAN. +NOTE that the updated driver *REQUIRES JDK 1.8+*. + See the **Usage** section for an example. ## What’s Inside The Tin? @@ -111,21 +111,21 @@ dbGetQuery(ath, "SELECT * FROM sampledb.elb_logs LIMIT 10") %>% ## Observations: 10 ## Variables: 16 - ## $ timestamp 2014-09-30 01:03:00, 2014-09-30 01:03:01, 2014-09-30 01:03:01, 2014-09-30 01:03:01, ... + ## $ timestamp 2014-09-30 00:00:25, 2014-09-30 00:00:57, 2014-09-30 00:01:06, 2014-09-30 00:01:29, ... ## $ elbname "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo... - ## $ requestip "253.90.22.60", "253.51.141.83", "245.59.222.144", "241.35.85.250", "246.245.70.48", ... - ## $ requestport 4095, 14668, 29796, 38607, 32750, 10182, 64948, 51279, 13331, 2700 - ## $ backendip "250.133.18.39", "248.214.120.18", "250.38.70.52", "249.45.101.192", "249.28.120.9", ... - ## $ backendport 8888, 443, 8899, 8888, 8888, 8888, 8888, 8888, 8888, 8000 - ## $ requestprocessingtime 7.3e-05, 8.9e-05, 4.5e-05, 4.3e-05, 7.6e-05, 7.3e-05, 7.7e-05, 4.6e-05, 4.9e-05, 5.3e-05 - ## $ backendprocessingtime 0.561864, 0.021517, 0.019530, 0.018937, 0.022727, 0.390384, 0.017017, 0.016437, 0.019... 
- ## $ clientresponsetime 9.0e-05, 7.0e-05, 3.0e-05, 3.3e-05, 3.2e-05, 8.4e-05, 5.2e-05, 7.1e-05, 6.9e-05, 5.4e-05 - ## $ elbresponsecode 200, 304, 304, 304, 200, 200, 304, 304, 200, 304 - ## $ backendresponsecode 200, 200, 403, 200, 200, 400, 200, 200, 200, 200 + ## $ requestip "246.247.182.239", "250.128.76.75", "243.157.244.21", "255.172.234.242", "245.27.105.... + ## $ requestport 33998, 33998, 33998, 33998, 33998, 33998, 33998, 14346, 33998, 33998 + ## $ backendip "251.173.42.143", "254.201.134.52", "240.175.197.76", "255.212.79.68", "250.102.227.5... + ## $ backendport 8888, 8888, 8888, 8888, 8888, 8888, 8888, 8000, 8888, 8888 + ## $ requestprocessingtime 0.000091, 0.000092, 0.000105, 0.000091, 0.000091, 0.000091, 0.000090, 0.000077, 0.000... + ## $ backendprocessingtime 0.048114, 0.055741, 0.008005, 0.037602, 0.039396, 0.053371, 0.040238, 0.192458, 0.027... + ## $ clientresponsetime 6.2e-05, 5.0e-05, 4.8e-05, 6.1e-05, 4.7e-05, 6.2e-05, 5.5e-05, 8.3e-05, 5.7e-05, 8.5e-05 + ## $ elbresponsecode 200, 200, 302, 200, 200, 200, 200, 500, 200, 200 + ## $ backendresponsecode 404, 200, 200, 200, 200, 200, 400, 500, 200, 200 ## $ receivedbytes 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - ## $ sentbytes 58402, 0, 0, 0, 152213, 58402, 0, 0, 152213, 0 + ## $ sentbytes 2, 2, 0, 2, 2, 2, 2, 28098, 2, 2 ## $ requestverb "GET", "GET", "GET", "GET", "GET", "GET", "GET", "GET", "GET", "GET" - ## $ url "http://www.abcxyz.com:80/", "http://www.abcxyz.com:80/static/css/hue3.css", "http://... + ## $ url "http://www.abcxyz.com:80/jobbrowser/?format=json&state=running&user=l29ezwd", "http:... ## $ protocol "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "... 
## Code of Conduct diff --git a/inst/java/AthenaJDBC41-1.1.0.jar b/inst/java/AthenaJDBC41-1.1.0.jar deleted file mode 100644 index 49d1a1b..0000000 Binary files a/inst/java/AthenaJDBC41-1.1.0.jar and /dev/null differ diff --git a/inst/java/AthenaJDBC42_2.0.2.jar b/inst/java/AthenaJDBC42_2.0.2.jar new file mode 100644 index 0000000..5d541e6 Binary files /dev/null and b/inst/java/AthenaJDBC42_2.0.2.jar differ diff --git a/inst/java/log4j.properties b/inst/java/log4j.properties new file mode 100644 index 0000000..3485ec5 --- /dev/null +++ b/inst/java/log4j.properties @@ -0,0 +1 @@ +log4j.rootLogger=WARN diff --git a/man/athena_connect.Rd b/man/athena_connect.Rd index 4553ddf..8a1040b 100644 --- a/man/athena_connect.Rd +++ b/man/athena_connect.Rd @@ -7,10 +7,8 @@ athena_connect(default_schema = "default", region = c("us-east-1", "us-east-2", "us-west-2"), s3_staging_dir = Sys.getenv("AWS_S3_STAGING_DIR"), max_error_retries = 10, - connection_timeout = 10000, socket_timeout = 10000, - retry_base_delay = 100, retry_max_backoff_time = 1000, log_path = "", - log_level = c("INFO", "DEBUG", "WARN", "ERROR", "ALL", "OFF", "FATAL", - "TRACE")) + connection_timeout = 10000, socket_timeout = 10000, log_path = "", + log_level = c("OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE")) } \arguments{ \item{default_schema}{default schema (you'll still need to fully qualify non-default schema table names)} @@ -25,13 +23,10 @@ athena_connect(default_schema = "default", region = c("us-east-1", \item{socket_timeout}{the maximum amount of time, in milliseconds, to wait for a socket in order to send data to Athena.} -\item{retry_base_delay}{minimum delay amount, in milliseconds, between retrying attempts to connect Athena.} - -\item{retry_max_backoff_time}{maximum delay amount, in milliseconds, between retrying attempts to connect Athena.} - \item{log_path}{local path of the Athena JDBC driver logs. 
If no log path is provided, then no log files are created.} -\item{log_level}{log level of the Athena JDBC driver logs.} +\item{log_level}{log level of the Athena JDBC driver logs. Use names +"OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE".} } \description{ Handles the up-front JDBC config diff --git a/man/dbConnect-AthenaDriver-method.Rd b/man/dbConnect-AthenaDriver-method.Rd index 76b8dd6..01c0b6d 100644 --- a/man/dbConnect-AthenaDriver-method.Rd +++ b/man/dbConnect-AthenaDriver-method.Rd @@ -6,12 +6,11 @@ \title{AthenaJDBC} \usage{ \S4method{dbConnect}{AthenaDriver}(drv, - provider = "com.amazonaws.athena.jdbc.shaded.com.amazonaws.auth.DefaultAWSCredentialsProviderChain", + provider = "com.simba.athena.amazonaws.auth.DefaultAWSCredentialsProviderChain", region = "us-east-1", s3_staging_dir = Sys.getenv("AWS_S3_STAGING_DIR"), schema_name = "default", max_error_retries = 10, - connection_timeout = 10000, socket_timeout = 10000, - retry_base_delay = 100, retry_max_backoff_time = 1000, log_path, - log_level, ...) + connection_timeout = 10000, socket_timeout = 10000, log_path, log_level, + ...) } \arguments{ \item{provider}{JDBC auth provider (ideally leave default)} @@ -22,10 +21,12 @@ \item{schema_name}{LOL if only this actually worked with Amazon's hacked Presto driver} -\item{max_error_retries, connection_timeout, socket_timeout, retry_base_delay, retry_max_backoff_time}{technical connection info that you should only muck with if you know what you're doing.} +\item{max_error_retries, connection_timeout, socket_timeout}{technical connection info that you should only muck with if you know what you're doing.} \item{log_path, log_level}{The Athena JDBC driver can (shockingly) provide a decent bit -of data in logs. Set this to a temporary directory or somethign log4j can use.} +of data in logs. Set this to a temporary directory or something log4j can use. 
For +`log_level` use the names ("OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE") or +their corresponding integer values 0-6.} \item{...}{unused} }