From 9aeb2c923f34f4a49edfa3d79659d81e08ed48de Mon Sep 17 00:00:00 2001 From: boB Rudis Date: Sun, 27 Aug 2017 07:51:53 -0400 Subject: [PATCH] Splash 3.0 and docker pacakge test on Travis --- .travis.yml | 37 ++++++++++++-------------------- DESCRIPTION | 3 +-- NAMESPACE | 4 +++- R/docker-splash.r | 55 ++++++++++++++++++++++++------------------------ R/splashr-package.R | 7 +++---- README.Rmd | 26 ++++++++++------------- README.md | 57 ++++++++++++++------------------------------------ img/cap.jpg | Bin 42444 -> 43449 bytes img/cap.png | Bin 335346 -> 533591 bytes man/install_splash.Rd | 7 ++----- man/start_splash.Rd | 18 ++++++---------- 11 files changed, 83 insertions(+), 131 deletions(-) diff --git a/.travis.yml b/.travis.yml index bf7a640..9644c45 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,38 +1,27 @@ language: r - -warnings_are_errors: true - cache: packages - sudo: required +dist: trusty services: - docker +env: + global: + - NOT_CRAN=true + matrix: + - PIP="pip" RETICULATE_PYTHON="/usr/bin/python" + - PYTHON=3 PIP="pip3" RETICULATE_PYTHON="/usr/bin/python3" + before_install: - - docker pull hrbrmstr/splashttpd - - docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 -d hrbrmstr/splashttpd + - docker pull scrapinghub/splash:3.0 + - docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 -d scrapinghub/splash:3.0 - docker ps -a + - sudo apt-get -y install python${PYTHON}-pip python-dev python${PYTHON}-numpy + - sudo $PIP install -U setuptools + - sudo $PIP install -U docker r: - oldrel - release - devel - -addons: - apt: - packages: - - xclip - - libv8-dev - -env: - - DISPLAY="" - - DISPLAY=:99.0 - -notifications: - email: - - bob@rud.is - irc: - channels: - - "104.236.112.222#builds" - nick: travisci diff --git a/DESCRIPTION b/DESCRIPTION index 89ba517..584ac9a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,7 +33,7 @@ Imports: purrr, stats, utils, - harbor, + docker, magick, scales, formatR, @@ -43,4 +43,3 @@ Imports: HARtools, lubridate RoxygenNote: 6.0.1 -Remotes: wch/harbor diff --git a/NAMESPACE b/NAMESPACE index ac48985..f3c0701 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -83,7 +83,7 @@ export(ua_win7_chrome) export(ua_win7_firefox) export(ua_win7_ie11) export(writeHAR) -import(harbor) +import(docker) import(httr) import(magick) import(purrr) @@ -95,6 +95,7 @@ importFrom(clipr,read_clip) importFrom(curl,curl_unescape) importFrom(formatR,tidy_source) importFrom(jsonlite,fromJSON) +importFrom(jsonlite,stream_in) importFrom(jsonlite,unbox) importFrom(lubridate,ymd_hms) importFrom(openssl,base64_decode) @@ -102,6 +103,7 @@ importFrom(scales,comma) importFrom(stats,setNames) importFrom(stringi,stri_detect_regex) importFrom(stringi,stri_split_fixed) +importFrom(stringi,stri_split_lines) importFrom(stringi,stri_split_regex) importFrom(utils,capture.output) importFrom(utils,str) diff --git a/R/docker-splash.r b/R/docker-splash.r index 264aa6a..9de0589 100644 --- a/R/docker-splash.r +++ b/R/docker-splash.r @@ -1,8 +1,7 @@ #' Retrieve the Docker image for Splash #' #' @md -#' @param host Docker host; defaults to `localhost` -#' @return `harbor` `host` object +#' @param tag Splash Docker image tag to install #' @export #' @family splash_docker_helpers #' @examples \dontrun{ @@ -10,8 +9,11 @@ #' splash_container <- start_splash() #' stop_splash(splash_container) #' } -install_splash <- function(host = harbor::localhost) { - harbor::docker_pull(host, "hrbrmstr/splashttpd") +install_splash <- function(tag="3.0") { + client <- docker::docker$from_env() + res <- client$api$pull("scrapinghub/splash", tag) + res <- jsonlite::stream_in(textConnection(res), verbose=FALSE) + invisible(lapply(res$status, function(x) { message(x) })) } #' Start a Splash server Docker container @@ -19,40 +21,30 @@ install_splash <- function(host = harbor::localhost) { #' If using this in an automation context, you should consider adding a #' `Sys.sleep(3)` (or higher) after starting the docker container. #' -#' @param host Docker host; defaults to `localhost` -#' @param add_tempdir This is `FALSE` initially since you could try to run -#' the splash image on a remote system. It has to be a local one for this to work. -#' If `TRUE` then a local temporary directory (made with [tempdir()]) -#' will be added to the mount configuration for use with [render_file()]. You will need to -#' ensure the necessary system temp dirs are accessible as a mounts. For -#' macOS this means adding `/private` to said Docker config. +#' @param tag Splash Docker image tag to start #' @note you need Docker running on your system and have pulled the container with -#' [install_splash] for this to work. You should save the resultant `host` -#' object for use in [stop_splash]. +#' [install_splash] for this to work. You should save the resultant +#' object for use in [stop_splash] otherwise you'll have to kill it from the +#' command line interface. #' @family splash_docker_helpers -#' @return `harbor` `container` object +#' @return `docker` `container` object #' @export #' @examples \dontrun{ #' install_splash() #' splash_container <- start_splash() #' stop_splash(splash_container) #' } -start_splash <- function(host = harbor::localhost, add_tempdir=FALSE) { +start_splash <- function(tag="3.0") { - doc_opts <- c("-p", "5023:5023", - "-p", "8050:8050", - "-p", "8051:8051") + client <- docker::docker$from_env() - if (add_tempdir) - doc_opts <- c(doc_opts, - sprintf("--volume=%s", sprintf("%s:/splashfiles", .pkgenv$temp_dir))) + splash_inst <- client$containers$run( + sprintf("scrapinghub/splash:%s", tag), name="splashr", + detach=TRUE, ports=list('8050/tcp'='8050', '5023/tcp'='5023', '8051/tcp'='8051') + ) - # purrr::walk(doc_opts, message) + invisible(splash_inst) - harbor::docker_run(host, - image = "hrbrmstr/splashttpd", - detach = TRUE, - docker_opts = doc_opts) } #' Stop a running a Splash server Docker container @@ -69,5 +61,14 @@ start_splash <- function(host = harbor::localhost, add_tempdir=FALSE) { #' stop_splash(splash_container) #' } stop_splash <- function(splash_container) { - harbor::container_rm(splash_container, force=TRUE) + splash_container$stop() + splash_container$remove() } + + +# @param add_tempdir This is `FALSE` initially since you could try to run +# the splash image on a remote system. It has to be a local one for this to work. +# If `TRUE` then a local temporary directory (made with [tempdir()]) +# will be added to the mount configuration for use with [render_file()]. You will need to +# ensure the necessary system temp dirs are accessible as a mounts. For +# macOS this means adding `/private` to said Docker config. diff --git a/R/splashr-package.R b/R/splashr-package.R index 6ca3a35..93d40ee 100644 --- a/R/splashr-package.R +++ b/R/splashr-package.R @@ -14,11 +14,11 @@ #' @name splashr #' @docType package #' @author Bob Rudis (bob@@rud.is) -#' @import purrr httr magick harbor -#' @importFrom stringi stri_split_regex stri_split_fixed stri_detect_regex +#' @import purrr httr magick docker +#' @importFrom stringi stri_split_regex stri_split_fixed stri_detect_regex stri_split_lines #' @importFrom HARtools writeHAR HARviewer renderHARviewer HARviewerOutput #' @importFrom xml2 read_html url_parse -#' @importFrom jsonlite fromJSON unbox +#' @importFrom jsonlite fromJSON unbox stream_in #' @importFrom openssl base64_decode #' @importFrom clipr read_clip #' @importFrom lubridate ymd_hms @@ -27,7 +27,6 @@ #' @importFrom formatR tidy_source #' @importFrom utils capture.output str #' @importFrom curl curl_unescape - NULL #' splashr exported operators diff --git a/README.Rmd b/README.Rmd index c31ae21..62d8295 100644 --- a/README.Rmd +++ b/README.Rmd @@ -13,8 +13,8 @@ It's also an alternative to `phantomjs` (which you can use in R within or withou You can also get it running with two commands: - sudo docker pull hrbrmstr/splashttpd - sudo docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 hrbrmstr/splashttpd + sudo docker pull scrapinghub/splash:3.0 + sudo docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 scrapinghub/splash:3.0 Do whatever you Windows ppl do with Docker on your systems to make ^^ work. @@ -25,7 +25,7 @@ Folks super-new to Docker on Unix-ish platforms should [make sure to do](https:/ (`$USER` is your username and shld be defined for you in the environment) -If using the [`harbor`](https://github.com/wch/harbor) package you can use the convience wrappers in this pacakge: +If using the [`docker`](https://github.com/bhaskarvk/docker) package you can use the convience wrappers in this pacakge: install_splash() splash_container <- start_splash() @@ -36,7 +36,7 @@ and then run: when done. All of that happens on your localhost and you will not need to specify `splash_obj` to many of the `splashr` functions if you're running Splash in this default configuration as long as you use named parameters. You can also use the pre-defined `splash_local` object if you want to use positional parameters. -You can run Selenium in Docker, so this is not unique to Splash. But, a Docker context makes it so that you don't have to run or maintain icky Python stuff directly on your system. Leave it in the abandoned warehouse district where it belongs. +Now, you can run Selenium in Docker, so this is not unique to Splash. But, a Docker context makes it so that you don't have to run or maintain icky Python stuff directly on your system. Leave it in the abandoned warehouse district where it belongs. All you need for this package to work is a running Splash instance. You provide the host/port for it and it's scrape-tastic fun from there! @@ -47,7 +47,6 @@ All you need for this package to work is a running Splash instance. You provide The following functions are implemented: - `render_html`: Return the HTML of the javascript-rendered page. -- `render_file`: Return the HTML or image (png) of the javascript-rendered page in a local file - `render_har`: Return information about Splash interaction with a website in [HAR](http://www.softwareishard.com/blog/har-12-spec/) format. - `render_jpeg`: Return a image (in JPEG format) of the javascript-rendered page. - `render_png`: Return a image (in PNG format) of the javascript-rendered page. @@ -111,6 +110,7 @@ Suggest more in a feature req! - Implement `execute` (you can script Splash!) - Add integration with [`HARtools`](https://github.com/johndharrison/HARtools) - _Possibly_ writing R function wrappers to install/start/stop Splash which would also support enabling javascript profiles, request filters and proxy profiles from with R directly, using [`harbor`](https://github.com/wch/harbor) +- Re-implement `render_file()` - Testing results with all combinations of parameters ### Installation @@ -132,7 +132,6 @@ library(splashr) library(magick) library(rvest) library(anytime) -library(hrbrmisc) # github library(htmlwidgets) library(DiagrammeR) library(tidyverse) @@ -221,13 +220,11 @@ splash_local %>% -### Rendering Widgets - -```{r eval=FALSE} -splash_vm <- start_splash(add_tempdir = TRUE) -``` - -```{r eval=FALSE} +```{r echo=FALSE, eval=FALSE} + ### Rendering Widgets + {r eval=FALSE} + splash_vm <- start_splash(add_tempdir = TRUE) + DiagrammeR(" graph LR A-->B @@ -241,9 +238,8 @@ DiagrammeR(" saveWidget("/tmp/diag.html") render_file(url = "/tmp/diag.html", output="html") -``` - ![](img/diag.png) +``` ```{r eval=FALSE} stop_splash(splash_vm) diff --git a/README.md b/README.md index e4d9633..6af7b58 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,8 @@ It's also an alternative to `phantomjs` (which you can use in R within or withou You can also get it running with two commands: - sudo docker pull hrbrmstr/splashttpd - sudo docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 hrbrmstr/splashttpd + sudo docker pull scrapinghub/splash:3.0 + sudo docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 scrapinghub/splash:3.0 Do whatever you Windows ppl do with Docker on your systems to make ^^ work. @@ -23,7 +23,7 @@ Folks super-new to Docker on Unix-ish platforms should [make sure to do](https:/ (`$USER` is your username and shld be defined for you in the environment) -If using the [`harbor`](https://github.com/wch/harbor) package you can use the convience wrappers in this pacakge: +If using the [`docker`](https://github.com/bhaskarvk/docker) package you can use the convience wrappers in this pacakge: install_splash() splash_container <- start_splash() @@ -34,7 +34,7 @@ and then run: when done. All of that happens on your localhost and you will not need to specify `splash_obj` to many of the `splashr` functions if you're running Splash in this default configuration as long as you use named parameters. You can also use the pre-defined `splash_local` object if you want to use positional parameters. -You can run Selenium in Docker, so this is not unique to Splash. But, a Docker context makes it so that you don't have to run or maintain icky Python stuff directly on your system. Leave it in the abandoned warehouse district where it belongs. +Now, you can run Selenium in Docker, so this is not unique to Splash. But, a Docker context makes it so that you don't have to run or maintain icky Python stuff directly on your system. Leave it in the abandoned warehouse district where it belongs. All you need for this package to work is a running Splash instance. You provide the host/port for it and it's scrape-tastic fun from there! @@ -45,7 +45,6 @@ All you need for this package to work is a running Splash instance. You provide The following functions are implemented: - `render_html`: Return the HTML of the javascript-rendered page. -- `render_file`: Return the HTML or image (png) of the javascript-rendered page in a local file - `render_har`: Return information about Splash interaction with a website in [HAR](http://www.softwareishard.com/blog/har-12-spec/) format. - `render_jpeg`: Return a image (in JPEG format) of the javascript-rendered page. - `render_png`: Return a image (in PNG format) of the javascript-rendered page. @@ -109,6 +108,7 @@ Suggest more in a feature req! - Implement `execute` (you can script Splash!) - Add integration with [`HARtools`](https://github.com/johndharrison/HARtools) - *Possibly* writing R function wrappers to install/start/stop Splash which would also support enabling javascript profiles, request filters and proxy profiles from with R directly, using [`harbor`](https://github.com/wch/harbor) +- Re-implement `render_file()` - Testing results with all combinations of parameters ### Installation @@ -130,7 +130,6 @@ library(splashr) library(magick) library(rvest) library(anytime) -library(hrbrmisc) # github library(htmlwidgets) library(DiagrammeR) library(tidyverse) @@ -154,13 +153,13 @@ splash_debug() ## List of 7 ## $ active : list() ## $ argcache: int 0 - ## $ fds : int 22 + ## $ fds : int 19 ## $ leaks :List of 4 ## ..$ Deferred : int 50 ## ..$ LuaRuntime: int 1 ## ..$ QTimer : int 1 ## ..$ Request : int 1 - ## $ maxrss : int 293520 + ## $ maxrss : int 197264 ## $ qsize : int 0 ## $ url : chr "http://localhost:8050" ## - attr(*, "class")= chr [1:2] "splash_debug" "list" @@ -174,7 +173,7 @@ render_html(url = "http://marvel.com/universe/Captain_America_(Steve_Rogers)") ## {xml_document} ## - ## [1] \n