diff --git a/DESCRIPTION b/DESCRIPTION index 849a9c7..ec7b9db 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: metis.lite Type: Package -Title: Helpers for Accessing and Querying Amazon Athena +Title: Access and Query Amazon Athena via DBI/JDBC Version: 0.3.0 Date: 2019-02-14 Authors@R: c( @@ -11,7 +11,7 @@ Maintainer: Bob Rudis Encoding: UTF-8 Description: Methods are provides to connect to 'Amazon' 'Athena', lookup schemas/tables, perform queries and retrieve query results. A lightweight 'RJDBC' implementation - is included along with an interface to the 'AWS' command-line utility. + is included along with additional helpers for 'dplyr'/'dbplyr' support. SystemRequirements: JDK 1.8+ License: MIT + file LICENSE Suggests: diff --git a/R/a-utils.R b/R/a-utils.R index 17268fa..13bf73a 100644 --- a/R/a-utils.R +++ b/R/a-utils.R @@ -3,6 +3,10 @@ set_names <- function (object = nm, nm) { object } +as_logical <- function(x) { + as.logical(as.integer(x)) +} + as_date <- function(x) { as.Date(x, origin = "1970-01-01") } diff --git a/R/jdbc.r b/R/jdbc.r index 3859cbc..e1c24a6 100644 --- a/R/jdbc.r +++ b/R/jdbc.r @@ -36,6 +36,14 @@ Athena <- function(identifier.quote = '`') { #' AthenaJDBC #' +#' Connect to Athena +#' +#' @section Driver Configuration Options: +#' +#' - `BinaryColumnLength`: The maximum data length for `BINARY` columns. Default `32767L` +#' - `ComplexTypeColumnLength`: The maximum data length for `ARRAY`, `MAP`, and `STRUCT` columns. Default `65535L` +#' - `StringColumnLength`: The maximum data length for `STRING` columns. Default `255L` +#' #' @param provider JDBC auth provider (ideally leave default) #' @param region AWS region the Athena tables are in #' @param s3_staging_dir A write-able bucket on S3 that you have permissions for @@ -46,8 +54,9 @@ Athena <- function(identifier.quote = '`') { #' of data in logs. Set this to a temporary directory or something log4j can use. 
For #' `log_level` use the names ("INFO", "DEBUG", "WARN", "ERROR", "ALL", "OFF", "FATAL", "TRACE") or #' their corresponding integer values 0-6. -#' @param ... unused -#' @references +#' @param ... passed on to the driver. See Details. +#' @references [Connect with JDBC](https://docs.aws.amazon.com/athena/latest/ug/connect-with-jdbc.html); +#' [Simba Athena JDBC Driver with SQL Connector Installation and Configuration Guide](https://s3.amazonaws.com/athena-downloads/drivers/JDBC/SimbaAthenaJDBC_2.0.6/docs/Simba+Athena+JDBC+Driver+Install+and+Configuration+Guide.pdf) #' @export setMethod( @@ -60,6 +69,7 @@ setMethod( region = "us-east-1", s3_staging_dir = Sys.getenv("AWS_S3_STAGING_DIR"), schema_name = "default", + fetch_size = 1000L, max_error_retries = 10, connection_timeout = 10000, socket_timeout = 10000, @@ -87,7 +97,11 @@ setMethod( ... ) -> jc - return(as(jc, "AthenaConnection")) + + jc <- as(jc, "AthenaConnection") + jc@fetch_size <- as.integer(fetch_size) + + return(jc) } @@ -96,7 +110,9 @@ setMethod( #' AthenaJDBC #' #' @export -setClass("AthenaConnection", contains = "JDBCConnection") +setClass("AthenaConnection", representation("JDBCConnection", jc="jobjRef", identifier.quote="character", fetch_size="integer")) + +# setClass("AthenaConnection", contains = "JDBCConnection") #' AthenaJDBC #' diff --git a/R/metis-lite-package.R b/R/metis-lite-package.R index 11f6182..fd0cfdd 100644 --- a/R/metis-lite-package.R +++ b/R/metis-lite-package.R @@ -1,4 +1,4 @@ -#' Helpers for Accessing and Querying Amazon Athena +#' Access and Query Amazon Athena via DBI/JDBC #' #' Methods are provides to connect to 'Amazon' 'Athena', lookup schemas/tables, #' perform queries and retrieve query results. 
A lightweight 'RJDBC' implementation @@ -27,6 +27,7 @@ #' @docType package #' @author Bob Rudis (bob@@rud.is) #' @import RJDBC DBI bit64 dbplyr +#' @references [Simba Athena JDBC Driver with SQL Connector Installation and Configuration Guide](https://s3.amazonaws.com/athena-downloads/drivers/JDBC/SimbaAthenaJDBC_2.0.6/docs/Simba+Athena+JDBC+Driver+Install+and+Configuration+Guide.pdf) NULL diff --git a/R/metis.r b/R/metis.r index 160dfdb..0f9f15f 100644 --- a/R/metis.r +++ b/R/metis.r @@ -1,4 +1,4 @@ -#' Make a JDBC connection to Athena +#' Simplified Athena JDBC connection helper #' #' Handles the up-front JDBC config #' @@ -14,14 +14,19 @@ #' @param log_path local path of the Athena JDBC driver logs. If no log path is provided, then no log files are created. #' @param log_level log level of the Athena JDBC driver logs. Use names #' "OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE". +#' @param ... passed on to the driver #' @export +#' @references [Connect with JDBC](https://docs.aws.amazon.com/athena/latest/ug/connect-with-jdbc.html); +#' [Simba Athena JDBC Driver with SQL Connector Installation and Configuration Guide](https://s3.amazonaws.com/athena-downloads/drivers/JDBC/SimbaAthenaJDBC_2.0.6/docs/Simba+Athena+JDBC+Driver+Install+and+Configuration+Guide.pdf) #' @examples \dontrun{ #' use_credentials("personal") #' -#' ath <- athena_connect(default_schema = "sampledb", -#' s3_staging_dir = "s3://accessible-bucket", -#' log_path = "/tmp/athena.log", -#' log_level = "DEBUG") +#' athena_connect( +#' default_schema = "sampledb", +#' s3_staging_dir = "s3://accessible-bucket", +#' log_path = "/tmp/athena.log", +#' log_level = "DEBUG" +#' ) -> ath #' #' dbListTables(ath) #' @@ -35,17 +40,16 @@ athena_connect <- function( max_error_retries = 10, connection_timeout = 10000, socket_timeout = 10000, - # retry_base_delay = 100, - # retry_max_backoff_time = 1000, log_path = "", - log_level = c("OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE")) { + 
log_level = c("OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE"), + ... +) { athena_jdbc <- Athena() region <- match.arg(region, c("us-east-1", "us-east-2", "us-west-2")) log_level <- match.arg(log_level, c("OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE")) - # if (!simple) { dbConnect( athena_jdbc, schema_name = default_schema, @@ -54,15 +58,10 @@ athena_connect <- function( max_error_retries = max_error_retries, connection_timeout = connection_timeout, socket_timeout = socket_timeout, - # retry_base_delay = retry_base_delay, - # retry_max_backoff_time = retry_max_backoff_time, log_path = log_path, - log_level = log_level + log_level = log_level, + ... ) -> con - # } else { - # con <- dbConnect(athena_jdbc, provider = NULL, schema_name = default_schema, region = region, - # s3_staging_dir = s3_staging_dir, log_path = log_path, log_level = log_level) - # } con diff --git a/R/z-dbGetQuery.R b/R/z-dbGetQuery.R index 23fa372..9de4762 100644 --- a/R/z-dbGetQuery.R +++ b/R/z-dbGetQuery.R @@ -16,10 +16,11 @@ list( "7" = as.double, # REAL "8" = as.double, # DOUBLE "12" = as.character, # VARCHAR - "16" = as.logical, # BOOLEAN + "16" = as_logical, # BOOLEAN "91" = as_date, # DATE "92" = as.character, # TIME "93" = as_posixct, # TIMESTAMP + "2003" = as.character, # ARRAY "1111" = as.character # OTHER ) -> .jdbc_converters @@ -71,14 +72,19 @@ setMethod( ct <- as.character(.jcall(res@md, "I", "getColumnType", i)) athena_type_convert[[i]] <- .jdbc_converters[[ct]] nms <- c(nms, .jcall(res@md, "S", "getColumnLabel", i)) + # message(ct, "|", tail(nms, 1)) } athena_type_convert <- set_names(athena_type_convert, nms) out <- callNextMethod(res = res, n = n, block = block, ...) 
+ # print(str(out)) + for (nm in names(athena_type_convert)) { - out[[nm]] <- athena_type_convert[[nm]](out[[nm]]) + f <- athena_type_convert[[nm]] + if (length(f) == 0) f <- as.character # catchall in case AMZN is tricksy + out[[nm]] <- f(out[[nm]]) } out @@ -98,13 +104,13 @@ setMethod( "dbGetQuery", signature(conn="AthenaConnection", statement="character"), - definition = function(conn, statement, type_convert=FALSE, ...) { + definition = function(conn, statement, ...) { r <- dbSendQuery(conn, statement, ...) on.exit(.jcall(r@stat, "V", "close")) - res <- fetch(r, -1, block = 1000L) + res <- fetch(r, -1, block = conn@fetch_size) class(res) <- c("tbl_df", "tbl", "data.frame") diff --git a/R/zzz.R b/R/zzz.R index 588f6a7..70b17a3 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,4 +1,11 @@ .onLoad <- function(libname, pkgname) { rJava::.jpackage(pkgname, jars = "*", lib.loc = libname) - # rJava::.jaddClassPath(dir(file.path(getwd(), "inst/java"), full.names = TRUE)) + rJava::.jaddClassPath(dir(file.path(getwd(), "inst/java"), full.names = TRUE)) + o <- getOption("java.parameters", "") + if (!any(grepl("-Xrs", o))) { + packageStartupMessage( + "Did not find '-Xrs' in java.parameters option. Until rJava is updated, ", + "please set this up in your/an Rprofile or at the start of scripts." + ) + } } diff --git a/README.Rmd b/README.Rmd index 44cb779..bcedb72 100644 --- a/README.Rmd +++ b/README.Rmd @@ -4,25 +4,14 @@ editor_options: chunk_output_type: console --- -# `metis` +# metis -Helpers for Accessing and Querying Amazon Athena - -Including a lightweight RJDBC shim. - -In Greek mythology, Metis was Athena's "helper". +Access and Query Amazon Athena via DBI/JDBC ## Description -Still fairly beta-quality level but getting there. 
- -The goal will be to get around enough of the "gotchas" that are preventing raw RJDBC Athena connections from "just working" with `dplyr` v0.6.0+ and also get around the [`fetchSize` problem](https://www.reddit.com/r/aws/comments/6aq22b/fetchsize_limit/) without having to not use `dbGetQuery()`. - -The `AthenaJDBC42_2.0.2.jar` JAR file is included out of convenience but that will likely move to a separate package as this gets closer to prime time if this goes on CRAN. - -NOTE that the updated driver *REQUIRES JDK 1.8+*. - -See the **Usage** section for an example. +In Greek mythology, Metis was Athena's "helper" so methods are provided to help you access and query Amazon Athena via DBI/JDBC and/or `dplyr`. +#' Methods are provided to connect to 'Amazon' 'Athena', lookup schemas/tables, ## IMPORTANT @@ -41,7 +30,7 @@ The following functions are implemented: Easy-interface connection helper: -- `athena_connect` Make a JDBC connection to Athena +- `athena_connect` Simplified Athena JDBC connection helper Custom JDBC Classes: @@ -52,13 +41,13 @@ Custom JDBC Classes: Custom JDBC Class Methods: -- `dbConnect-method`: AthenaJDBC -- `dbExistsTable-method`: AthenaJDBC -- `dbGetQuery-method`: AthenaJDBC -- `dbListFields-method`: AthenaJDBC -- `dbListTables-method`: AthenaJDBC -- `dbReadTable-method`: AthenaJDBC -- `dbSendQuery-method`: AthenaJDBC +- `dbConnect-method` +- `dbExistsTable-method` +- `dbGetQuery-method` +- `dbListFields-method` +- `dbListTables-method` +- `dbReadTable-method` +- `dbSendQuery-method` Pulled in from other `cloudyr` pkgs: @@ -68,41 +57,97 @@ Pulled in from other `cloudyr` pkgs: ## Installation ```{r eval=FALSE} -devtools::install_github("hrbrmstr/metis") +devtools::install_git("https://git.sr.ht/~hrbrmstr/metis-lite") +# OR +devtools::install_gitlab("hrbrmstr/metis-lite") +# OR +devtools::install_github("hrbrmstr/metis-lite") ``` -```{r message=FALSE, warning=FALSE, error=FALSE, include=FALSE} +```{r message=FALSE, warning=FALSE, 
include=FALSE} options(width=120) ``` ## Usage -```{r message=FALSE, warning=FALSE, error=FALSE} -library(metis) -library(tidyverse) +```{r message=FALSE, warning=FALSE} +library(metis.lite) # current verison -packageVersion("metis") +packageVersion("metis.lite") ``` -```{r message=FALSE, warning=FALSE, error=FALSE} -use_credentials("default") +```{r message=FALSE, warning=FALSE} +library(rJava) +library(RJDBC) +library(metis.lite) +library(magrittr) +library(dbplyr) +library(dplyr) -athena_connect( - default_schema = "sampledb", - s3_staging_dir = "s3://accessible-bucket", - log_path = "/tmp/athena.log", - log_level = "DEBUG" -) -> ath +dbConnect( + drv = metis.lite::Athena(), + schema_name = "sampledb", + provider = "com.simba.athena.amazonaws.auth.PropertiesFileCredentialsProvider", + AwsCredentialsProviderArguments = path.expand("~/.aws/athenaCredentials.props"), + s3_staging_dir = "s3://aws-athena-query-results-569593279821-us-east-1", +) -> con -dbListTables(ath, schema="sampledb") +dbListTables(con, schema="sampledb") -dbExistsTable(ath, "elb_logs", schema="sampledb") +dbExistsTable(con, "elb_logs", schema="sampledb") -dbListFields(ath, "elb_logs", "sampledb") +dbListFields(con, "elb_logs", "sampledb") + +dbGetQuery(con, "SELECT * FROM sampledb.elb_logs LIMIT 10") %>% + glimpse() +``` + +### Check types + +```{r} +dbGetQuery(con, " +SELECT + CAST('chr' AS CHAR(4)) achar, + CAST('varchr' AS VARCHAR) avarchr, + CAST(SUBSTR(timestamp, 1, 10) AS DATE) AS tsday, + CAST(100.1 AS DOUBLE) AS justadbl, + CAST(127 AS TINYINT) AS asmallint, + CAST(100 AS INTEGER) AS justanint, + CAST(100000000000000000 AS BIGINT) AS abigint, + CAST(('GET' = 'GET') AS BOOLEAN) AS is_get, + ARRAY[1, 2, 3] AS arr1, + ARRAY['1', '2, 3', '4'] AS arr2, + MAP(ARRAY['foo', 'bar'], ARRAY[1, 2]) AS mp, + CAST(ROW(1, 2.0) AS ROW(x BIGINT, y DOUBLE)) AS rw, + CAST('{\"a\":1}' AS JSON) js +FROM elb_logs +LIMIT 1 +") %>% + glimpse() +``` -dbGetQuery(ath, "SELECT * FROM sampledb.elb_logs LIMIT 10") 
%>% - type_convert() %>% +#### dplyr + +```{r} +tbl(con, sql(" +SELECT + CAST('chr' AS CHAR(4)) achar, + CAST('varchr' AS VARCHAR) avarchr, + CAST(SUBSTR(timestamp, 1, 10) AS DATE) AS tsday, + CAST(100.1 AS DOUBLE) AS justadbl, + CAST(127 AS TINYINT) AS asmallint, + CAST(100 AS INTEGER) AS justanint, + CAST(100000000000000000 AS BIGINT) AS abigint, + CAST(('GET' = 'GET') AS BOOLEAN) AS is_get, + ARRAY[1, 2, 3] AS arr, + ARRAY['1', '2, 3', '4'] AS arr, + MAP(ARRAY['foo', 'bar'], ARRAY[1, 2]) AS mp, + CAST(ROW(1, 2.0) AS ROW(x BIGINT, y DOUBLE)) AS rw, + CAST('{\"a\":1}' AS JSON) js +FROM elb_logs +LIMIT 1 +")) %>% glimpse() ``` diff --git a/README.md b/README.md index 83af003..82f5a56 100644 --- a/README.md +++ b/README.md @@ -1,35 +1,25 @@ -# `metis` +# metis -Helpers for Accessing and Querying Amazon Athena - -Including a lightweight RJDBC shim. - -In Greek mythology, Metis was Athena’s “helper”. +Access and Query Amazon Athena via DBI/JDBC ## Description -Still fairly beta-quality level but getting there. - -The goal will be to get around enough of the “gotchas” that are -preventing raw RJDBC Athena connections from “just working” with `dplyr` -v0.6.0+ and also get around the [`fetchSize` -problem](https://www.reddit.com/r/aws/comments/6aq22b/fetchsize_limit/) -without having to not use `dbGetQuery()`. - -The `AthenaJDBC42_2.0.2.jar` JAR file is included out of convenience but -that will likely move to a separate package as this gets closer to prime -time if this goes on CRAN. - -NOTE that the updated driver *REQUIRES JDK 1.8+*. - -See the **Usage** section for an example. +In Greek mythology, Metis was Athena’s “helper” so methods are provided +to help you accessing and querying Amazon Athena via DBI/JDBC and/or +`dplyr`. \#’ Methods are provides to connect to ‘Amazon’ ‘Athena’, +lookup schemas/tables, ## IMPORTANT -Since R 3.5 (I don't remember this happening in R 3.4.x) signals sent from interrupting Athena JDBC calls crash the R interpreter. 
You need to set the `-Xrs` option to avoid signals being passed on to the JVM owner. That has to be done _before_ `rJava` is loaded so you either need to remember to put it at the top of all scripts _or_ stick this in your local `~/.Rprofile` and/or sitewide `Rprofile`: +Since R 3.5 (I don’t remember this happening in R 3.4.x) signals sent +from interrupting Athena JDBC calls crash the R interpreter. You need to +set the `-Xrs` option to avoid signals being passed on to the JVM owner. +That has to be done *before* `rJava` is loaded so you either need to +remember to put it at the top of all scripts *or* stick this in your +local `~/.Rprofile` and/or sitewide `Rprofile`: -```r +``` r if (!grepl("-Xrs", getOption("java.parameters", ""))) { options( "java.parameters" = c(getOption("java.parameters", default = NULL), "-Xrs") @@ -43,7 +33,7 @@ The following functions are implemented: Easy-interface connection helper: - - `athena_connect` Make a JDBC connection to Athena + - `athena_connect` Simplified Athena JDBC connection helper Custom JDBC Classes: @@ -54,13 +44,13 @@ Custom JDBC Classes: Custom JDBC Class Methods: - - `dbConnect-method`: AthenaJDBC - - `dbExistsTable-method`: AthenaJDBC - - `dbGetQuery-method`: AthenaJDBC - - `dbListFields-method`: AthenaJDBC - - `dbListTables-method`: AthenaJDBC - - `dbReadTable-method`: AthenaJDBC - - `dbSendQuery-method`: AthenaJDBC + - `dbConnect-method` + - `dbExistsTable-method` + - `dbGetQuery-method` + - `dbListFields-method` + - `dbListTables-method` + - `dbReadTable-method` + - `dbSendQuery-method` Pulled in from other `cloudyr` pkgs: @@ -70,44 +60,53 @@ Pulled in from other `cloudyr` pkgs: ## Installation ``` r -devtools::install_github("hrbrmstr/metis") +devtools::install_git("https://git.sr.ht/~hrbrmstr/metis-lite") +# OR +devtools::install_gitlab("hrbrmstr/metis-lite") +# OR +devtools::install_github("hrbrmstr/metis-lite") ``` ## Usage ``` r -library(metis) -library(tidyverse) +library(metis.lite) # current verison 
-packageVersion("metis") +packageVersion("metis.lite") ``` ## [1] '0.3.0' ``` r -use_credentials("default") - -athena_connect( - default_schema = "sampledb", - s3_staging_dir = "s3://accessible-bucket", - log_path = "/tmp/athena.log", - log_level = "DEBUG" -) -> ath - -dbListTables(ath, schema="sampledb") +library(rJava) +library(RJDBC) +library(metis.lite) +library(magrittr) +library(dbplyr) +library(dplyr) + +dbConnect( + drv = metis.lite::Athena(), + schema_name = "sampledb", + provider = "com.simba.athena.amazonaws.auth.PropertiesFileCredentialsProvider", + AwsCredentialsProviderArguments = path.expand("~/.aws/athenaCredentials.props"), + s3_staging_dir = "s3://aws-athena-query-results-569593279821-us-east-1", +) -> con + +dbListTables(con, schema="sampledb") ``` ## [1] "elb_logs" ``` r -dbExistsTable(ath, "elb_logs", schema="sampledb") +dbExistsTable(con, "elb_logs", schema="sampledb") ``` ## [1] TRUE ``` r -dbListFields(ath, "elb_logs", "sampledb") +dbListFields(con, "elb_logs", "sampledb") ``` ## [1] "timestamp" "elbname" "requestip" "requestport" @@ -116,29 +115,109 @@ dbListFields(ath, "elb_logs", "sampledb") ## [13] "sentbytes" "requestverb" "url" "protocol" ``` r -dbGetQuery(ath, "SELECT * FROM sampledb.elb_logs LIMIT 10") %>% - type_convert() %>% +dbGetQuery(con, "SELECT * FROM sampledb.elb_logs LIMIT 10") %>% glimpse() ``` ## Observations: 10 ## Variables: 16 - ## $ timestamp 2014-09-30 01:28:17, 2014-09-30 00:01:30, 2014-09-30 00:01:30, 2014-09-30 00:01:30, ... - ## $ elbname "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo... - ## $ requestip "246.140.190.136", "240.109.129.138", "242.251.232.153", "253.227.207.81", "253.227.2... - ## $ requestport 63777, 22705, 22705, 22705, 23282, 24178, 22916, 23807, 22916, 21443 - ## $ backendip "250.193.168.100", "251.103.130.45", "243.140.114.254", "243.82.95.243", "246.129.102... 
- ## $ backendport 8888, 8888, 8888, 8888, 8899, 8888, 8888, 8888, 8888, 8888 - ## $ requestprocessingtime 7.2e-05, 6.9e-05, 8.7e-05, 9.7e-05, 8.1e-05, 4.6e-05, 4.3e-05, 5.3e-05, 5.5e-05, 4.4e-05 - ## $ backendprocessingtime 0.379241, 0.007541, 0.187126, 0.413337, 0.037030, 0.050222, 0.043706, 0.045953, 0.015... - ## $ clientresponsetime 8.0e-05, 4.3e-05, 7.5e-05, 8.7e-05, 4.5e-05, 3.3e-05, 3.3e-05, 6.9e-05, 8.5e-05, 4.9e-05 - ## $ elbresponsecode 200, 302, 302, 200, 200, 200, 200, 200, 200, 200 - ## $ backendresponsecode 200, 200, 200, 400, 200, 200, 200, 404, 200, 200 - ## $ receivedbytes 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - ## $ sentbytes 58402, 0, 0, 58402, 32370, 20766, 3408, 152213, 84245, 3884 + ## $ timestamp "2014-09-29T18:18:51.826955Z", "2014-09-29T18:18:51.920462Z", "2014-09-29T18:18:52.2725… + ## $ elbname "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo",… + ## $ requestip "255.48.150.122", "249.213.227.93", "245.108.120.229", "241.112.203.216", "241.43.107.2… + ## $ requestport 62096, 62096, 62096, 62096, 56454, 33254, 18918, 64352, 1651, 56454 + ## $ backendip "244.238.214.120", "248.99.214.228", "243.3.190.175", "246.235.181.255", "241.112.203.2… + ## $ backendport 8888, 8888, 8888, 8888, 8888, 8888, 8888, 8888, 8888, 8888 + ## $ requestprocessingtime 9.0e-05, 9.7e-05, 8.7e-05, 9.4e-05, 7.6e-05, 8.3e-05, 6.3e-05, 5.4e-05, 8.2e-05, 8.7e-05 + ## $ backendprocessingtime 0.007410, 0.256533, 0.442659, 0.016772, 0.035036, 0.029892, 0.034148, 0.014858, 0.01518… + ## $ clientresponsetime 0.000055, 0.000075, 0.000131, 0.000078, 0.000057, 0.000043, 0.000033, 0.000043, 0.00007… + ## $ elbresponsecode "302", "302", "200", "200", "200", "200", "200", "200", "200", "200" + ## $ backendresponsecode "200", "200", "200", "200", "200", "200", "200", "200", "200", "200" + ## $ receivedbytes 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + ## $ sentbytes 0, 0, 58402, 152213, 20766, 32370, 3408, 3884, 84245, 3831 ## $ requestverb "GET", "GET", "GET", 
"GET", "GET", "GET", "GET", "GET", "GET", "GET" - ## $ url "http://www.abcxyz.com:80/", "http://www.abcxyz.com:80/", "http://www.abcxyz.com:80/a... - ## $ protocol "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "... + ## $ url "http://www.abcxyz.com:80/", "http://www.abcxyz.com:80/accounts/login/?next=/", "http:/… + ## $ protocol "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HT… + +### Check types + +``` r +dbGetQuery(con, " +SELECT + CAST('chr' AS CHAR(4)) achar, + CAST('varchr' AS VARCHAR) avarchr, + CAST(SUBSTR(timestamp, 1, 10) AS DATE) AS tsday, + CAST(100.1 AS DOUBLE) AS justadbl, + CAST(127 AS TINYINT) AS asmallint, + CAST(100 AS INTEGER) AS justanint, + CAST(100000000000000000 AS BIGINT) AS abigint, + CAST(('GET' = 'GET') AS BOOLEAN) AS is_get, + ARRAY[1, 2, 3] AS arr1, + ARRAY['1', '2, 3', '4'] AS arr2, + MAP(ARRAY['foo', 'bar'], ARRAY[1, 2]) AS mp, + CAST(ROW(1, 2.0) AS ROW(x BIGINT, y DOUBLE)) AS rw, + CAST('{\"a\":1}' AS JSON) js +FROM elb_logs +LIMIT 1 +") %>% + glimpse() +``` + + ## Observations: 1 + ## Variables: 13 + ## $ achar "chr " + ## $ avarchr "varchr" + ## $ tsday 2014-09-26 + ## $ justadbl 100.1 + ## $ asmallint 127 + ## $ justanint 100 + ## $ abigint 100000000000000000 + ## $ is_get TRUE + ## $ arr1 "1, 2, 3" + ## $ arr2 "1, 2, 3, 4" + ## $ mp "{bar=2, foo=1}" + ## $ rw "{x=1, y=2.0}" + ## $ js "\"{\\\"a\\\":1}\"" + +#### dplyr + +``` r +tbl(con, sql(" +SELECT + CAST('chr' AS CHAR(4)) achar, + CAST('varchr' AS VARCHAR) avarchr, + CAST(SUBSTR(timestamp, 1, 10) AS DATE) AS tsday, + CAST(100.1 AS DOUBLE) AS justadbl, + CAST(127 AS TINYINT) AS asmallint, + CAST(100 AS INTEGER) AS justanint, + CAST(100000000000000000 AS BIGINT) AS abigint, + CAST(('GET' = 'GET') AS BOOLEAN) AS is_get, + ARRAY[1, 2, 3] AS arr, + ARRAY['1', '2, 3', '4'] AS arr, + MAP(ARRAY['foo', 'bar'], ARRAY[1, 2]) AS mp, + CAST(ROW(1, 2.0) AS ROW(x BIGINT, y DOUBLE)) AS rw, + CAST('{\"a\":1}' AS JSON) 
js +FROM elb_logs +LIMIT 1 +")) %>% + glimpse() +``` + + ## Observations: ?? + ## Variables: 13 + ## Database: AthenaConnection + ## $ achar "chr " + ## $ avarchr "varchr" + ## $ tsday 2014-09-27 + ## $ justadbl 100.1 + ## $ asmallint 127 + ## $ justanint 100 + ## $ abigint 100000000000000000 + ## $ is_get TRUE + ## $ arr "1, 2, 3" + ## $ arr "1, 2, 3, 4" + ## $ mp "{bar=2, foo=1}" + ## $ rw "{x=1, y=2.0}" + ## $ js "\"{\\\"a\\\":1}\"" ## Code of Conduct diff --git a/man/athena_connect.Rd b/man/athena_connect.Rd index 8fe7594..fbed4e5 100644 --- a/man/athena_connect.Rd +++ b/man/athena_connect.Rd @@ -2,14 +2,14 @@ % Please edit documentation in R/metis.r \name{athena_connect} \alias{athena_connect} -\title{Make a JDBC connection to Athena} +\title{Simplified Athena JDBC connection helper} \usage{ athena_connect(default_schema = "default", region = c("us-east-1", "us-east-2", "us-west-2"), s3_staging_dir = Sys.getenv("AWS_S3_STAGING_DIR"), max_error_retries = 10, connection_timeout = 10000, socket_timeout = 10000, log_path = "", log_level = c("OFF", - "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE")) + "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE"), ...) } \arguments{ \item{default_schema}{default schema (you'll still need to fully qualify non-default schema table names)} @@ -28,6 +28,8 @@ athena_connect(default_schema = "default", region = c("us-east-1", \item{log_level}{log level of the Athena JDBC driver logs. 
Use names "OFF", "FATAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE".} + +\item{...}{passed on to the driver} } \description{ Handles the up-front JDBC config @@ -36,10 +38,12 @@ Handles the up-front JDBC config \dontrun{ use_credentials("personal") -ath <- athena_connect(default_schema = "sampledb", - s3_staging_dir = "s3://accessible-bucket", - log_path = "/tmp/athena.log", - log_level = "DEBUG") +athena_connect( + default_schema = "sampledb", + s3_staging_dir = "s3://accessible-bucket", + log_path = "/tmp/athena.log", + log_level = "DEBUG" +) -> ath dbListTables(ath) @@ -47,3 +51,7 @@ dbGetQuery(ath, "SELECT * FROM sampledb.elb_logs LIMIT 1") } } +\references{ +\href{https://docs.aws.amazon.com/athena/latest/ug/connect-with-jdbc.html}{Connect with JDBC}; +\href{https://s3.amazonaws.com/athena-downloads/drivers/JDBC/SimbaAthenaJDBC_2.0.6/docs/Simba+Athena+JDBC+Driver+Install+and+Configuration+Guide.pdf}{Simba Athena JDBC Driver with SQL Connector Installation and Configuration Guide} +} diff --git a/man/dbConnect-AthenaDriver-method.Rd b/man/dbConnect-AthenaDriver-method.Rd index 98d65e2..001daec 100644 --- a/man/dbConnect-AthenaDriver-method.Rd +++ b/man/dbConnect-AthenaDriver-method.Rd @@ -9,9 +9,9 @@ provider = "com.simba.athena.amazonaws.auth.DefaultAWSCredentialsProviderChain", region = "us-east-1", s3_staging_dir = Sys.getenv("AWS_S3_STAGING_DIR"), - schema_name = "default", max_error_retries = 10, - connection_timeout = 10000, socket_timeout = 10000, log_path = "", - log_level = 0, ...) + schema_name = "default", fetch_size = 1000L, + max_error_retries = 10, connection_timeout = 10000, + socket_timeout = 10000, log_path = "", log_level = 0, ...) } \arguments{ \item{provider}{JDBC auth provider (ideally leave default)} @@ -29,11 +29,20 @@ of data in logs. Set this to a temporary directory or something log4j can use. 
F `log_level` use the names ("INFO", "DEBUG", "WARN", "ERROR", "ALL", "OFF", "FATAL", "TRACE") or their corresponding integer values 0-6.} -\item{...}{unused} +\item{...}{passed on to the driver. See Details.} } \description{ -AthenaJDBC +Connect to Athena } +\section{Driver Configuration Options}{ + + +- `BinaryColumnLength`: The maximum data length for `BINARY` columns. Default `32767L` +- `ComplexTypeColumnLength`: The maximum data length for `ARRAY`, `MAP`, and `STRUCT` columns. Default `65535L` +- `StringColumnLength`: The maximum data length for `STRING` columns. Default `255L` +} + \references{ - +[Connect with JDBC](https://docs.aws.amazon.com/athena/latest/ug/connect-with-jdbc.html); + [Simba Athena JDBC Driver with SQL Connector Installation and Configuration Guide](https://s3.amazonaws.com/athena-downloads/drivers/JDBC/SimbaAthenaJDBC_2.0.6/docs/Simba+Athena+JDBC+Driver+Install+and+Configuration+Guide.pdf) } diff --git a/man/dbGetQuery-AthenaConnection-character-method.Rd b/man/dbGetQuery-AthenaConnection-character-method.Rd index 6261007..49c3e1b 100644 --- a/man/dbGetQuery-AthenaConnection-character-method.Rd +++ b/man/dbGetQuery-AthenaConnection-character-method.Rd @@ -5,8 +5,7 @@ \alias{dbGetQuery,AthenaConnection,character-method} \title{AthenaJDBC} \usage{ -\S4method{dbGetQuery}{AthenaConnection,character}(conn, statement, - type_convert = FALSE, ...) +\S4method{dbGetQuery}{AthenaConnection,character}(conn, statement, ...) } \arguments{ \item{conn}{Athena connection} diff --git a/man/metis.lite.Rd b/man/metis.lite.Rd index 511f31c..633a08f 100644 --- a/man/metis.lite.Rd +++ b/man/metis.lite.Rd @@ -5,7 +5,7 @@ \name{metis.lite} \alias{metis.lite} \alias{metis.lite-package} -\title{Helpers for Accessing and Querying Amazon Athena} +\title{Access and Query Amazon Athena via DBI/JDBC} \description{ Methods are provides to connect to 'Amazon' 'Athena', lookup schemas/tables, perform queries and retrieve query results. 
A lightweight 'RJDBC' implementation @@ -26,6 +26,9 @@ in your local #' \code{~/.Rprofile} and/or sitewide \code{Rprofile}:\preformatte } } +\references{ +\href{https://s3.amazonaws.com/athena-downloads/drivers/JDBC/SimbaAthenaJDBC_2.0.6/docs/Simba+Athena+JDBC+Driver+Install+and+Configuration+Guide.pdf}{Simba Athena JDBC Driver with SQL Connector Installation and Configuration Guide} +} \author{ Bob Rudis (bob@rud.is) }