Browse Source

Splash 3.0 and docker package test on Travis

master
boB Rudis 7 years ago
parent
commit
9aeb2c923f
No known key found for this signature in database GPG Key ID: 2A514A4997464560
  1. 37
      .travis.yml
  2. 3
      DESCRIPTION
  3. 4
      NAMESPACE
  4. 55
      R/docker-splash.r
  5. 7
      R/splashr-package.R
  6. 26
      README.Rmd
  7. 57
      README.md
  8. BIN
      img/cap.jpg
  9. BIN
      img/cap.png
  10. 7
      man/install_splash.Rd
  11. 18
      man/start_splash.Rd

37
.travis.yml

@ -1,38 +1,27 @@
language: r
warnings_are_errors: true
cache: packages
sudo: required
dist: trusty
services:
- docker
env:
global:
- NOT_CRAN=true
matrix:
- PIP="pip" RETICULATE_PYTHON="/usr/bin/python"
- PYTHON=3 PIP="pip3" RETICULATE_PYTHON="/usr/bin/python3"
before_install:
- docker pull hrbrmstr/splashttpd
- docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 -d hrbrmstr/splashttpd
- docker pull scrapinghub/splash:3.0
- docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 -d scrapinghub/splash:3.0
- docker ps -a
- sudo apt-get -y install python${PYTHON}-pip python-dev python${PYTHON}-numpy
- sudo $PIP install -U setuptools
- sudo $PIP install -U docker
r:
- oldrel
- release
- devel
addons:
apt:
packages:
- xclip
- libv8-dev
env:
- DISPLAY=""
- DISPLAY=:99.0
notifications:
email:
- bob@rud.is
irc:
channels:
- "104.236.112.222#builds"
nick: travisci

3
DESCRIPTION

@ -33,7 +33,7 @@ Imports:
purrr,
stats,
utils,
harbor,
docker,
magick,
scales,
formatR,
@ -43,4 +43,3 @@ Imports:
HARtools,
lubridate
RoxygenNote: 6.0.1
Remotes: wch/harbor

4
NAMESPACE

@ -83,7 +83,7 @@ export(ua_win7_chrome)
export(ua_win7_firefox)
export(ua_win7_ie11)
export(writeHAR)
import(harbor)
import(docker)
import(httr)
import(magick)
import(purrr)
@ -95,6 +95,7 @@ importFrom(clipr,read_clip)
importFrom(curl,curl_unescape)
importFrom(formatR,tidy_source)
importFrom(jsonlite,fromJSON)
importFrom(jsonlite,stream_in)
importFrom(jsonlite,unbox)
importFrom(lubridate,ymd_hms)
importFrom(openssl,base64_decode)
@ -102,6 +103,7 @@ importFrom(scales,comma)
importFrom(stats,setNames)
importFrom(stringi,stri_detect_regex)
importFrom(stringi,stri_split_fixed)
importFrom(stringi,stri_split_lines)
importFrom(stringi,stri_split_regex)
importFrom(utils,capture.output)
importFrom(utils,str)

55
R/docker-splash.r

@ -1,8 +1,7 @@
#' Retrieve the Docker image for Splash
#'
#' @md
#' @param host Docker host; defaults to `localhost`
#' @return `harbor` `host` object
#' @param tag Splash Docker image tag to install
#' @export
#' @family splash_docker_helpers
#' @examples \dontrun{
@ -10,8 +9,11 @@
#' splash_container <- start_splash()
#' stop_splash(splash_container)
#' }
install_splash <- function(host = harbor::localhost) {
harbor::docker_pull(host, "hrbrmstr/splashttpd")
install_splash <- function(tag="3.0") {
client <- docker::docker$from_env()
res <- client$api$pull("scrapinghub/splash", tag)
res <- jsonlite::stream_in(textConnection(res), verbose=FALSE)
invisible(lapply(res$status, function(x) { message(x) }))
}
#' Start a Splash server Docker container
@ -19,40 +21,30 @@ install_splash <- function(host = harbor::localhost) {
#' If using this in an automation context, you should consider adding a
#' `Sys.sleep(3)` (or higher) after starting the docker container.
#'
#' @param host Docker host; defaults to `localhost`
#' @param add_tempdir This is `FALSE` initially since you could try to run
#' the splash image on a remote system. It has to be a local one for this to work.
#' If `TRUE` then a local temporary directory (made with [tempdir()])
#' will be added to the mount configuration for use with [render_file()]. You will need to
#' ensure the necessary system temp dirs are accessible as mounts. For
#' macOS this means adding `/private` to said Docker config.
#' @param tag Splash Docker image tag to start
#' @note you need Docker running on your system and have pulled the container with
#' [install_splash] for this to work. You should save the resultant `host`
#' object for use in [stop_splash].
#' [install_splash] for this to work. You should save the resultant
#' object for use in [stop_splash] otherwise you'll have to kill it from the
#' command line interface.
#' @family splash_docker_helpers
#' @return `harbor` `container` object
#' @return `docker` `container` object
#' @export
#' @examples \dontrun{
#' install_splash()
#' splash_container <- start_splash()
#' stop_splash(splash_container)
#' }
start_splash <- function(host = harbor::localhost, add_tempdir=FALSE) {
start_splash <- function(tag="3.0") {
doc_opts <- c("-p", "5023:5023",
"-p", "8050:8050",
"-p", "8051:8051")
client <- docker::docker$from_env()
if (add_tempdir)
doc_opts <- c(doc_opts,
sprintf("--volume=%s", sprintf("%s:/splashfiles", .pkgenv$temp_dir)))
splash_inst <- client$containers$run(
sprintf("scrapinghub/splash:%s", tag), name="splashr",
detach=TRUE, ports=list('8050/tcp'='8050', '5023/tcp'='5023', '8051/tcp'='8051')
)
# purrr::walk(doc_opts, message)
invisible(splash_inst)
harbor::docker_run(host,
image = "hrbrmstr/splashttpd",
detach = TRUE,
docker_opts = doc_opts)
}
#' Stop a running Splash server Docker container
@ -69,5 +61,14 @@ start_splash <- function(host = harbor::localhost, add_tempdir=FALSE) {
#' stop_splash(splash_container)
#' }
stop_splash <- function(splash_container) {
harbor::container_rm(splash_container, force=TRUE)
splash_container$stop()
splash_container$remove()
}
# @param add_tempdir This is `FALSE` initially since you could try to run
# the splash image on a remote system. It has to be a local one for this to work.
# If `TRUE` then a local temporary directory (made with [tempdir()])
# will be added to the mount configuration for use with [render_file()]. You will need to
# ensure the necessary system temp dirs are accessible as mounts. For
# macOS this means adding `/private` to said Docker config.

7
R/splashr-package.R

@ -14,11 +14,11 @@
#' @name splashr
#' @docType package
#' @author Bob Rudis (bob@@rud.is)
#' @import purrr httr magick harbor
#' @importFrom stringi stri_split_regex stri_split_fixed stri_detect_regex
#' @import purrr httr magick docker
#' @importFrom stringi stri_split_regex stri_split_fixed stri_detect_regex stri_split_lines
#' @importFrom HARtools writeHAR HARviewer renderHARviewer HARviewerOutput
#' @importFrom xml2 read_html url_parse
#' @importFrom jsonlite fromJSON unbox
#' @importFrom jsonlite fromJSON unbox stream_in
#' @importFrom openssl base64_decode
#' @importFrom clipr read_clip
#' @importFrom lubridate ymd_hms
@ -27,7 +27,6 @@
#' @importFrom formatR tidy_source
#' @importFrom utils capture.output str
#' @importFrom curl curl_unescape
NULL
#' splashr exported operators

26
README.Rmd

@ -13,8 +13,8 @@ It's also an alternative to `phantomjs` (which you can use in R within or withou
You can also get it running with two commands:
sudo docker pull hrbrmstr/splashttpd
sudo docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 hrbrmstr/splashttpd
sudo docker pull scrapinghub/splash:3.0
sudo docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 scrapinghub/splash:3.0
Do whatever you Windows ppl do with Docker on your systems to make ^^ work.
@ -25,7 +25,7 @@ Folks super-new to Docker on Unix-ish platforms should [make sure to do](https:/
(`$USER` is your username and should be defined for you in the environment)
If using the [`harbor`](https://github.com/wch/harbor) package you can use the convenience wrappers in this package:
If using the [`docker`](https://github.com/bhaskarvk/docker) package you can use the convenience wrappers in this package:
install_splash()
splash_container <- start_splash()
@ -36,7 +36,7 @@ and then run:
when done. All of that happens on your localhost and you will not need to specify `splash_obj` to many of the `splashr` functions if you're running Splash in this default configuration as long as you use named parameters. You can also use the pre-defined `splash_local` object if you want to use positional parameters.
You can run Selenium in Docker, so this is not unique to Splash. But, a Docker context makes it so that you don't have to run or maintain icky Python stuff directly on your system. Leave it in the abandoned warehouse district where it belongs.
Now, you can run Selenium in Docker, so this is not unique to Splash. But, a Docker context makes it so that you don't have to run or maintain icky Python stuff directly on your system. Leave it in the abandoned warehouse district where it belongs.
All you need for this package to work is a running Splash instance. You provide the host/port for it and it's scrape-tastic fun from there!
@ -47,7 +47,6 @@ All you need for this package to work is a running Splash instance. You provide
The following functions are implemented:
- `render_html`: Return the HTML of the javascript-rendered page.
- `render_file`: Return the HTML or image (png) of the javascript-rendered page in a local file
- `render_har`: Return information about Splash interaction with a website in [HAR](http://www.softwareishard.com/blog/har-12-spec/) format.
- `render_jpeg`: Return an image (in JPEG format) of the javascript-rendered page.
- `render_png`: Return an image (in PNG format) of the javascript-rendered page.
@ -111,6 +110,7 @@ Suggest more in a feature req!
- <strike>Implement `execute` (you can script Splash!)</strike>
- <strike>Add integration with [`HARtools`](https://github.com/johndharrison/HARtools)</strike>
- <strike>_Possibly_ writing R function wrappers to install/start/stop Splash</strike> which would also support enabling javascript profiles, request filters and proxy profiles from within R directly, using [`harbor`](https://github.com/wch/harbor)
- Re-implement `render_file()`
- Testing results with all combinations of parameters
### Installation
@ -132,7 +132,6 @@ library(splashr)
library(magick)
library(rvest)
library(anytime)
library(hrbrmisc) # github
library(htmlwidgets)
library(DiagrammeR)
library(tidyverse)
@ -221,13 +220,11 @@ splash_local %>%
<img src="img/flash.png" width="50%"/>
### Rendering Widgets
```{r eval=FALSE}
splash_vm <- start_splash(add_tempdir = TRUE)
```
```{r eval=FALSE}
```{r echo=FALSE, eval=FALSE}
### Rendering Widgets
{r eval=FALSE}
splash_vm <- start_splash(add_tempdir = TRUE)
DiagrammeR("
graph LR
A-->B
@ -241,9 +238,8 @@ DiagrammeR("
saveWidget("/tmp/diag.html")
render_file(url = "/tmp/diag.html", output="html")
```
![](img/diag.png)
```
```{r eval=FALSE}
stop_splash(splash_vm)

57
README.md

@ -11,8 +11,8 @@ It's also an alternative to `phantomjs` (which you can use in R within or withou
You can also get it running with two commands:
sudo docker pull hrbrmstr/splashttpd
sudo docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 hrbrmstr/splashttpd
sudo docker pull scrapinghub/splash:3.0
sudo docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 scrapinghub/splash:3.0
Do whatever you Windows ppl do with Docker on your systems to make ^^ work.
@ -23,7 +23,7 @@ Folks super-new to Docker on Unix-ish platforms should [make sure to do](https:/
(`$USER` is your username and should be defined for you in the environment)
If using the [`harbor`](https://github.com/wch/harbor) package you can use the convenience wrappers in this package:
If using the [`docker`](https://github.com/bhaskarvk/docker) package you can use the convenience wrappers in this package:
install_splash()
splash_container <- start_splash()
@ -34,7 +34,7 @@ and then run:
when done. All of that happens on your localhost and you will not need to specify `splash_obj` to many of the `splashr` functions if you're running Splash in this default configuration as long as you use named parameters. You can also use the pre-defined `splash_local` object if you want to use positional parameters.
You can run Selenium in Docker, so this is not unique to Splash. But, a Docker context makes it so that you don't have to run or maintain icky Python stuff directly on your system. Leave it in the abandoned warehouse district where it belongs.
Now, you can run Selenium in Docker, so this is not unique to Splash. But, a Docker context makes it so that you don't have to run or maintain icky Python stuff directly on your system. Leave it in the abandoned warehouse district where it belongs.
All you need for this package to work is a running Splash instance. You provide the host/port for it and it's scrape-tastic fun from there!
@ -45,7 +45,6 @@ All you need for this package to work is a running Splash instance. You provide
The following functions are implemented:
- `render_html`: Return the HTML of the javascript-rendered page.
- `render_file`: Return the HTML or image (png) of the javascript-rendered page in a local file
- `render_har`: Return information about Splash interaction with a website in [HAR](http://www.softwareishard.com/blog/har-12-spec/) format.
- `render_jpeg`: Return an image (in JPEG format) of the javascript-rendered page.
- `render_png`: Return an image (in PNG format) of the javascript-rendered page.
@ -109,6 +108,7 @@ Suggest more in a feature req!
- <strike>Implement `execute` (you can script Splash!)</strike>
- <strike>Add integration with [`HARtools`](https://github.com/johndharrison/HARtools)</strike>
- <strike>*Possibly* writing R function wrappers to install/start/stop Splash</strike> which would also support enabling javascript profiles, request filters and proxy profiles from within R directly, using [`harbor`](https://github.com/wch/harbor)
- Re-implement `render_file()`
- Testing results with all combinations of parameters
### Installation
@ -130,7 +130,6 @@ library(splashr)
library(magick)
library(rvest)
library(anytime)
library(hrbrmisc) # github
library(htmlwidgets)
library(DiagrammeR)
library(tidyverse)
@ -154,13 +153,13 @@ splash_debug()
## List of 7
## $ active : list()
## $ argcache: int 0
## $ fds : int 22
## $ fds : int 19
## $ leaks :List of 4
## ..$ Deferred : int 50
## ..$ LuaRuntime: int 1
## ..$ QTimer : int 1
## ..$ Request : int 1
## $ maxrss : int 293520
## $ maxrss : int 197264
## $ qsize : int 0
## $ url : chr "http://localhost:8050"
## - attr(*, "class")= chr [1:2] "splash_debug" "list"
@ -174,7 +173,7 @@ render_html(url = "http://marvel.com/universe/Captain_America_(Steve_Rogers)")
## {xml_document}
## <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">
## [1] <head>\n<script type="text/javascript" async="async" src="http://dpm.demdex.net/id?d_rtbd=json&amp;d_ver=2&amp;d_ ...
## [1] <head>\n<script src="http://widget-cdn.rpxnow.com/manifest/login?version=release%2F1.116.0_widgets_767" type="tex ...
## [2] <body id="index-index" class="index-index" onload="findLinks('myLink');">\n\n\t<div id="page_frame" style="overfl ...
``` r
@ -198,27 +197,27 @@ print(har)
## HAR specification version: 1.2
## --------HAR CREATOR--------
## Created by: Splash
## version: 2.3.1
## version: 3.0
## --------HAR BROWSER--------
## Browser: QWebKit
## version: 538.1
## version: 602.1
## --------HAR PAGES--------
## Page id: 1 , Page title: Poynter – A global leader in journalism. Strengthening democracy.
## --------HAR ENTRIES--------
## Number of entries: 27
## Number of entries: 29
## REQUESTS:
## Page: 1
## Number of entries: 27
## Number of entries: 29
## - http://www.poynter.org/
## - http://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css?ver=2016_06_24.1
## - http://cloud.webtype.com/css/162ac332-3b31-4b73-ad44-da375b7f2fe3.css?ver=2016_06_24.1
## - http://maxcdn.bootstrapcdn.com/font-awesome/4.4.0/css/font-awesome.min.css?ver=2016_06_24.1
## - http://www.poynter.org/wp-content/themes/poynter_timber/assets/scrollbar/jquery.mCustomScrollbar.min.css?ver=2016...
## - http://www.poynter.org/wp-content/plugins/jetpack/css/jetpack.css?ver=4.0.4
## ........
## - http://cloud.webtype.com/webtype/ff2/3/c6369fc5-fc59-4a12-ac92-25afa6c567a0?ec_token=8f7c4c4997246fd7fa920171c994...
## - http://cloud.webtype.com/webtype/ff2/3/380e3672-840d-462a-83ee-2ea85a43504a?ec_token=8f7c4c4997246fd7fa920171c994...
## - http://cloud.webtype.com/webtype/ff2/3/4ac7f809-9bdf-4acc-8bd5-a922f05f271b?ec_token=8f7c4c4997246fd7fa920171c994...
## - http://cloud.webtype.com/webtype/ff2/3/c6608520-1978-45ac-9061-74ada664cae4?ec_token=8f7c4c4997246fd7fa920171c994...
## - http://cloud.webtype.com/webtype/ff2/3/380e3672-840d-462a-83ee-2ea85a43504a?ec_token=8f7c4c4997246fd7fa920171c994...
## - http://cloud.webtype.com/webtype/ff2/3/c6369fc5-fc59-4a12-ac92-25afa6c567a0?ec_token=8f7c4c4997246fd7fa920171c994...
## - http://static.chartbeat.com/js/chartbeat.js
You can use [`HARtools::HARviewer`](https://github.com/johndharrison/HARtools/blob/master/R/HARviewer.R) — which this package imports and re-exports — to view the HAR in an interactive HTML widget.
@ -274,30 +273,6 @@ splash_local %>%
<img src="img/flash.png" width="50%"/>
### Rendering Widgets
``` r
splash_vm <- start_splash(add_tempdir = TRUE)
```
``` r
DiagrammeR("
graph LR
A-->B
A-->C
C-->E
B-->D
C-->D
D-->F
E-->F
") %>%
saveWidget("/tmp/diag.html")
render_file(url = "/tmp/diag.html", output="html")
```
![](img/diag.png)
``` r
stop_splash(splash_vm)
```
@ -311,7 +286,7 @@ library(testthat)
date()
```
## [1] "Thu Feb 23 17:30:57 2017"
## [1] "Sun Aug 27 07:51:03 2017"
``` r
test_dir("tests/")

BIN
img/cap.jpg

Binary file not shown.

Before

Width:  |  Height:  |  Size: 41 KiB

After

Width:  |  Height:  |  Size: 42 KiB

BIN
img/cap.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 328 KiB

After

Width:  |  Height:  |  Size: 521 KiB

7
man/install_splash.Rd

@ -4,13 +4,10 @@
\alias{install_splash}
\title{Retrieve the Docker image for Splash}
\usage{
install_splash(host = harbor::localhost)
install_splash(tag = "3.0")
}
\arguments{
\item{host}{Docker host; defaults to \code{localhost}}
}
\value{
\code{harbor} \code{host} object
\item{tag}{Splash Docker image tag to install}
}
\description{
Retrieve the Docker image for Splash

18
man/start_splash.Rd

@ -4,20 +4,13 @@
\alias{start_splash}
\title{Start a Splash server Docker container}
\usage{
start_splash(host = harbor::localhost, add_tempdir = FALSE)
start_splash(tag = "3.0")
}
\arguments{
\item{host}{Docker host; defaults to `localhost`}
\item{add_tempdir}{This is `FALSE` initially since you could try to run
the splash image on a remote system. It has to be a local one for this to work.
If `TRUE` then a local temporary directory (made with [tempdir()])
will be added to the mount configuration for use with [render_file()]. You will need to
ensure the necessary system temp dirs are accessible as a mounts. For
macOS this means adding `/private` to said Docker config.}
\item{tag}{Splash Docker image tag to start}
}
\value{
`harbor` `container` object
`docker` `container` object
}
\description{
If using this in an automation context, you should consider adding a
@ -25,8 +18,9 @@ If using this in an automation context, you should consider adding a
}
\note{
you need Docker running on your system and have pulled the container with
[install_splash] for this to work. You should save the resultant `host`
object for use in [stop_splash].
[install_splash] for this to work. You should save the resultant
object for use in [stop_splash] otherwise you'll have to kill it from the
command line interface.
}
\examples{
\dontrun{

Loading…
Cancel
Save