mirror of https://git.sr.ht/~hrbrmstr/metis-tidy
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
4 years ago | |
---|---|---|
R | 4 years ago | |
man | 4 years ago | |
tests | 4 years ago | |
.Rbuildignore | 4 years ago | |
.bash_profile | 5 years ago | |
.codecov.yml | 4 years ago | |
.gitignore | 6 years ago | |
.travis.yml | 4 years ago | |
CONDUCT.md | 5 years ago | |
DESCRIPTION | 4 years ago | |
LICENSE | 4 years ago | |
LICENSE.md | 4 years ago | |
NAMESPACE | 4 years ago | |
NEWS.md | 4 years ago | |
README.Rmd | 4 years ago | |
README.md | 4 years ago | |
metis-tidy.Rproj | 4 years ago |
README.md
metis.tidy
Access and Query Amazon Athena via the Tidyverse
Description
Methods are provided to use the ‘metis’ JDBC/DBI interface via the Tidyverse (e.g. ‘dbplyr’/‘dplyr’ idioms).
What’s Inside The Tin?
Lightweight helpers to make it easier to filter
and mutate
plus type
support for Athena BIGINT
(64-bit integers).
Installation
devtools::install_git("https://git.sr.ht/~hrbrmstr/metis-tidy")
# OR
devtools::install_gitlab("hrbrmstr/metis-tidy")
# OR
devtools::install_github("hrbrmstr/metis-tidy")
Usage
library(metis.tidy)
# current verison
packageVersion("metis.tidy")
## [1] '0.3.0'
Basic Setup (using an alternate provider)
library(metis.tidy)
library(tidyverse)
metis::dbConnect(
metis::Athena(),
Schema = "sampledb",
AwsCredentialsProviderClass = "com.simba.athena.amazonaws.auth.PropertiesFileCredentialsProvider",
AwsCredentialsProviderArguments = path.expand("~/.aws/athenaCredentials.props")
) -> con
elb_logs <- tbl(con, "elb_logs")
glimpse(elb_logs)
## Observations: ??
## Variables: 16
## Database: AthenaConnection
## $ timestamp <chr> "2014-09-29T03:00:52.641389Z", "2014-09-29T03:01:23.603288Z", "2014-09-29T03:01:54.6438…
## $ elbname <chr> "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo", "lb-demo",…
## $ requestip <chr> "252.225.51.44", "252.43.130.244", "248.52.123.119", "246.118.11.120", "240.201.80.176"…
## $ requestport <int> 41423, 41423, 41423, 41423, 41423, 41423, 41423, 41423, 41423, 41423, 41423, 41423, 414…
## $ backendip <chr> "246.185.32.192", "246.5.160.209", "252.12.154.122", "246.191.203.13", "247.27.176.211"…
## $ backendport <int> 8888, 8899, 8000, 8888, 8888, 8888, 8888, 8888, 8000, 8000, 8888, 8888, 8888, 8899, 888…
## $ requestprocessingtime <dbl> 0.000095, 0.000090, 0.000087, 0.000089, 0.000090, 0.000093, 0.000092, 0.000094, 0.00010…
## $ backendprocessingtime <dbl> 0.035755, 0.048942, 0.050951, 0.046141, 0.039483, 0.052850, 0.032934, 0.046127, 0.04017…
## $ clientresponsetime <dbl> 5.8e-05, 5.5e-05, 5.0e-05, 4.9e-05, 4.8e-05, 5.4e-05, 5.1e-05, 4.8e-05, 4.8e-05, 4.9e-0…
## $ elbresponsecode <chr> "200", "200", "200", "200", "200", "200", "200", "200", "200", "200", "200", "200", "20…
## $ backendresponsecode <chr> "200", "400", "200", "200", "200", "200", "404", "200", "200", "403", "404", "200", "20…
## $ receivedbytes <S3: integer64> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ sentbytes <S3: integer64> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ requestverb <chr> "GET", "GET", "GET", "GET", "GET", "GET", "GET", "GET", "GET", "GET", "GET", "GET", "GE…
## $ url <chr> "http://www.abcxyz.com:80/jobbrowser/?format=json&state=running&user=248nnm5", "http://…
## $ protocol <chr> "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HT…
Using custom Athena functions
filter(elb_logs, elbresponsecode == "200") %>%
mutate(
tsday = as.Date(substring(timestamp, 1L, 10L)),
host = url_extract_host(url),
proto_version = regexp_extract(protocol, "([[:digit:]\\.]+)"),
) %>%
select(tsday, host, receivedbytes, requestprocessingtime, proto_version) %>%
glimpse()
## Observations: ??
## Variables: 5
## Database: AthenaConnection
## $ tsday <date> 2014-09-26, 2014-09-26, 2014-09-26, 2014-09-26, 2014-09-26, 2014-09-26, 2014-09-26, 20…
## $ host <chr> "www.abcxyz.com", "www.abcxyz.com", "www.abcxyz.com", "www.abcxyz.com", "www.abcxyz.com…
## $ receivedbytes <S3: integer64> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ requestprocessingtime <dbl> 0.000074, 0.000102, 0.000056, 0.000048, 0.000047, 0.000087, 0.000095, 0.000049, 0.00005…
## $ proto_version <chr> "1.1", "1.1", "1.1", "1.1", "1.1", "1.1", "1.1", "1.1", "1.1", "1.1", "1.1", "1.1", "1.…
All the types work. Some are useful.
tbl(con, sql("
SELECT
CAST('chr' AS CHAR(4)) achar,
CAST('varchr' AS VARCHAR) avarchr,
CAST(SUBSTR(timestamp, 1, 10) AS DATE) AS tsday,
CAST(100.1 AS DOUBLE) AS justadbl,
CAST(127 AS TINYINT) AS asmallint,
CAST(100 AS INTEGER) AS justanint,
CAST(100000000000000000 AS BIGINT) AS abigint,
CAST(('GET' = 'GET') AS BOOLEAN) AS is_get,
ARRAY[1, 2, 3] AS arr,
ARRAY['1', '2, 3', '4'] AS arr,
MAP(ARRAY['foo', 'bar'], ARRAY[1, 2]) AS mp,
CAST(ROW(1, 2.0) AS ROW(x BIGINT, y DOUBLE)) AS rw,
CAST('{\"a\":1}' AS JSON) js
FROM elb_logs
LIMIT 1
")) %>%
glimpse()
## Observations: ??
## Variables: 13
## Database: AthenaConnection
## $ achar <chr> "chr "
## $ avarchr <chr> "varchr"
## $ tsday <date> 2014-09-26
## $ justadbl <dbl> 100.1
## $ asmallint <int> 127
## $ justanint <int> 100
## $ abigint <S3: integer64> 100000000000000000
## $ is_get <lgl> TRUE
## $ arr <chr> "1, 2, 3"
## $ arr <chr> "1, 2, 3, 4"
## $ mp <chr> "{bar=2, foo=1}"
## $ rw <chr> "{x=1, y=2.0}"
## $ js <chr> "\"{\\\"a\\\":1}\""
cloc::cloc_pkg_md()
Lang | # Files | (%) | LoC | (%) | Blank lines | (%) | # Lines | (%) |
---|---|---|---|---|---|---|---|---|
R | 6 | 0.86 | 156 | 0.75 | 18 | 0.42 | 26 | 0.41 |
Rmd | 1 | 0.14 | 53 | 0.25 | 25 | 0.58 | 38 | 0.59 |
Code of Conduct
Please note that this project is released with a Contributor Code of Conduct. By participating in this project you agree to abide by its terms.