From a3d7095ab17165c4a1fb3b9a4c918f3633a6c5de Mon Sep 17 00:00:00 2001 From: Bob Rudis Date: Fri, 10 Feb 2017 14:03:32 -0500 Subject: [PATCH] Added `execute_lua()` enabling scripting of a browser session with lua scripts http://splash.readthedocs.io/en/stable/scripting-tutorial.html#scripting-tutorial --- NAMESPACE | 2 +- NEWS.md | 6 ++++++ R/execute.r | 33 ++++++++++++++++++++++++++++++ R/splashr.r | 25 ++++++++++++----------- README.Rmd | 31 ++++++++++++++++++++-------- README.md | 56 ++++++++++++++++++++++++++++++++++----------------- img/cap.jpg | Bin 126959 -> 126298 bytes img/cap.png | Bin 477738 -> 447385 bytes man/execute_lua.Rd | 32 +++++++++++++++++++++++++++++ man/splash_active.Rd | 7 +++---- 10 files changed, 148 insertions(+), 44 deletions(-) create mode 100644 R/execute.r create mode 100644 man/execute_lua.Rd diff --git a/NAMESPACE b/NAMESPACE index 400d17a..c6f7ab1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,10 +2,10 @@ S3method(print,splash_debug) S3method(print,splash_json) -S3method(print,splash_status) export("%>%") export(HARviewer) export(HARviewerOutput) +export(execute_lua) export(install_splash) export(renderHARviewer) export(render_file) diff --git a/NEWS.md b/NEWS.md index 9b4679b..9ba16f6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,2 +1,8 @@ +0.2.0 + +* added `execute_lua()` +* modified `splash_active()` + 0.1.0 + * Initial release diff --git a/R/execute.r b/R/execute.r new file mode 100644 index 0000000..0ac7951 --- /dev/null +++ b/R/execute.r @@ -0,0 +1,33 @@ +#' Execute a custom rendering script and return a result. +#' +#' @md +#' @param splash_obj Object created by a call to [splash] +#' @param lua_source Browser automation script. See [Splash Script](http://splash.readthedocs.io/en/stable/scripting-tutorial.html#scripting-tutorial) Tutorial for more info. +#' @param timeout A timeout (in seconds) for the render (defaults to 30). +#' @param allowed_domains Comma-separated list of allowed domain names. 
If present, Splash won't load anything from domains not in this list, nor from subdomains of domains not in this list. +#' @param proxy Proxy profile name or proxy URL. +#' @param filters Comma-separated list of request filter names. +#' @param save_args A list of argument names to put in cache. +#' @param load_args Parameter values to load from cache. +#' @return `raw` content from the `httr` call. Given the vast diversity of possible return values, it's up to the caller to handle the return value. +#' @export +execute_lua <- function(splash_obj, lua_source, timeout=30, allowed_domains, + proxy, filters, save_args, load_args) { + + params <- list(lua_source=lua_source, timeout=timeout) + + if (!missing(allowed_domains)) params$allowed_domains <- allowed_domains + if (!missing(proxy)) params$proxy <- proxy + if (!missing(filters)) params$filters <- filters + if (!missing(save_args)) params$save_args <- save_args + if (!missing(load_args)) params$load_args <- load_args + + res <- httr::GET(splash_url(splash_obj), path="execute", encode="json", query=params) + + httr::stop_for_status(res) + + out <- httr::content(res, as="raw") + + out + +} \ No newline at end of file diff --git a/R/splashr.r b/R/splashr.r index 4cecb24..1a1a2c5 100644 --- a/R/splashr.r +++ b/R/splashr.r @@ -9,31 +9,32 @@ splash <- function(host, port=8050L) { list(host=host, port=port) } +s_GET <- purrr::safely(GET) + #' Test if a Splash server is up #' +#' @md #' @param splash_obj A splash connection object +#' @return `TRUE` if the Splash server is running, otherwise `FALSE` #' @export splash_active <- function(splash_obj) { - httr::GET(splash_url(splash_obj), path="_ping") %>% - httr::stop_for_status() %>% - httr::content(as="text", encoding="UTF-8") %>% + res <- s_GET(splash_url(splash_obj), path="_ping") + + if (is.null(res$result)) return(FALSE) + if (httr::status_code(res$result) >=300) return(FALSE) + + httr::content(res$result, as="text", encoding="UTF-8") %>% jsonlite::fromJSON() -> out 
out$url <- splash_url(splash_obj) - class(out) <- c("splash_status", class(out)) + message(sprintf("Status of splash instance on [%s]: %s. Max RSS: %s\n", out$url, out$status, out$maxrss)) - out + if ("status" %in% names(out)) return(out$status == "ok") -} + return(FALSE) -#' @rdname splash_active -#' @keywords internal -#' @export -print.splash_status <- function(x, ...) { - cat(sprintf("Status of splash instance on [%s]: %s. Max RSS: %s\n", x$url, x$status, x$maxrss)) - invisible(x) } #' Retrieve debug-level info for a Splash server diff --git a/README.Rmd b/README.Rmd index 5b046bf..ce7a757 100644 --- a/README.Rmd +++ b/README.Rmd @@ -12,8 +12,8 @@ It's also an alternative to `phantomjs` (which you can use in R within or withou You can also get it running with two commands: - sudo docker pull scrapinghub/splash - sudo docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 scrapinghub/splash + sudo docker pull hrbrmstr/splashttpd + sudo docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 hrbrmstr/splashttpd (Do whatever you Windows ppl do with Docker on your systems to make ^^ work.) @@ -43,6 +43,7 @@ The following functions are implemented: - `render_har`: Return information about Splash interaction with a website in [HAR](http://www.softwareishard.com/blog/har-12-spec/) format. - `render_jpeg`: Return a image (in JPEG format) of the javascript-rendered page. - `render_png`: Return a image (in PNG format) of the javascript-rendered page. +- `execute_lua`: Execute a custom rendering script and return a result. - `splash`: Configure parameters for connecting to a Splash server - `install_splash`: Retrieve the Docker image for Splash - `start_splash`: Start a Splash server Docker container @@ -56,7 +57,7 @@ Suggest more in a feature req! - Implement `render.json` - Implement "file rendering" -- Implement `execute` (you can script Splash!) +- Implement `execute` (you can script Splash!) 
- Add integration with [`HARtools`](https://github.com/johndharrison/HARtools) - _Possibly_ writing R function wrappers to install/start/stop Splash which would also support enabling javascript profiles, request filters and proxy profiles from with R directly, using [`harbor`](https://github.com/wch/harbor) - Testing results with all combinations of parameters @@ -141,14 +142,28 @@ splash("splash", 8050L) %>% ![](img/cap.jpg) -### Rendering Widgets +### Executing custom Lua scripts ```{r} -splash_vm <- start_splash(add_tempdir=TRUE) +lua_ex <- ' +function main(splash) + splash:go("http://rud.is/b") + splash:wait(0.5) + local title = splash:evaljs("document.title") + return {title=title} +end +' + +res <- splash("localhost") %>% execute_lua(lua_ex) + +rawToChar(res) %>% + jsonlite::fromJSON() ``` -```{r include=FALSE, echo=FALSE} -Sys.sleep(3) +### Rendering Widgets + +```{r eval=FALSE} +splash_vm <- start_splash(add_tempdir=TRUE) ``` ```{r} @@ -175,7 +190,7 @@ splash("localhost") %>% ![](img/diag.png) -```{r} +```{r eval=FALSE} stop_splash(splash_vm) ``` diff --git a/README.md b/README.md index 866122f..a517c26 100644 --- a/README.md +++ b/README.md @@ -9,8 +9,8 @@ It's also an alternative to `phantomjs` (which you can use in R within or withou You can also get it running with two commands: - sudo docker pull scrapinghub/splash - sudo docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 scrapinghub/splash + sudo docker pull hrbrmstr/splashttpd + sudo docker run -p 5023:5023 -p 8050:8050 -p 8051:8051 hrbrmstr/splashttpd (Do whatever you Windows ppl do with Docker on your systems to make ^^ work.) @@ -40,6 +40,7 @@ The following functions are implemented: - `render_har`: Return information about Splash interaction with a website in [HAR](http://www.softwareishard.com/blog/har-12-spec/) format. - `render_jpeg`: Return a image (in JPEG format) of the javascript-rendered page. - `render_png`: Return a image (in PNG format) of the javascript-rendered page. 
+- `execute_lua`: Execute a custom rendering script and return a result. - `splash`: Configure parameters for connecting to a Splash server - `install_splash`: Retrieve the Docker image for Splash - `start_splash`: Start a Splash server Docker container @@ -53,7 +54,7 @@ Suggest more in a feature req! - Implement `render.json` - Implement "file rendering" -- Implement `execute` (you can script Splash!) +- Implement `execute` (you can script Splash!) - Add integration with [`HARtools`](https://github.com/johndharrison/HARtools) - *Possibly* writing R function wrappers to install/start/stop Splash which would also support enabling javascript profiles, request filters and proxy profiles from with R directly, using [`harbor`](https://github.com/wch/harbor) - Testing results with all combinations of parameters @@ -91,7 +92,7 @@ splash("splash", 8050L) %>% splash_active() ``` - ## Status of splash instance on [http://splash:8050]: ok. Max RSS: 462295040 + ## [1] TRUE ``` r splash("splash", 8050L) %>% @@ -107,7 +108,7 @@ splash("splash", 8050L) %>% ## ..$ LuaRuntime: int 1 ## ..$ QTimer : int 1 ## ..$ Request : int 1 - ## $ maxrss : int 451460 + ## $ maxrss : int 491092 ## $ qsize : int 0 ## $ url : chr "http://splash:8050" ## - attr(*, "class")= chr [1:2] "splash_debug" "list" @@ -122,8 +123,8 @@ splash("splash", 8050L) %>% ## {xml_document} ## - ## [1] \n