diff --git a/NAMESPACE b/NAMESPACE index 39989a2..f54d914 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,3 +15,4 @@ import(magick) import(purrr) importFrom(jsonlite,fromJSON) importFrom(xml2,read_html) +importFrom(xml2,url_parse) diff --git a/R/render-har.r b/R/render-har.r index fc77f60..35cd86d 100644 --- a/R/render-har.r +++ b/R/render-har.r @@ -9,9 +9,9 @@ #' @references [Splash docs](http://splash.readthedocs.io/en/stable/index.html) #' @export render_har <- function(splash_obj, url, base_url, response_body=FALSE, timeout=30, resource_timeout, wait=0, - proxy, js, js_src, filters, allowed_domains, allowed_content_types, - forbidden_content_types, viewport="1024x768", images, headers, body, - http_method, save_args, load_args) { + proxy, js, js_src, filters, allowed_domains, allowed_content_types, + forbidden_content_types, viewport="1024x768", images, headers, body, + http_method, save_args, load_args) { params <- list(url=url, timeout=timeout, wait=wait, viewport=viewport, response_body=as.numeric(response_body)) @@ -36,6 +36,11 @@ render_har <- function(splash_obj, url, base_url, response_body=FALSE, timeout=3 httr::stop_for_status(res) - httr::content(res, as="text", encoding="UTF-8") + out <- httr::content(res, as="text", encoding="UTF-8") + out <- jsonlite::fromJSON(out) -} \ No newline at end of file + class(out) <- c("splash_har", class(out)) + + out + +} diff --git a/R/splashr-package.R b/R/splashr-package.R index 53d0b26..ca58f1f 100644 --- a/R/splashr-package.R +++ b/R/splashr-package.R @@ -14,7 +14,7 @@ #' @docType package #' @author Bob Rudis (bob@@rud.is) #' @import purrr httr magick -#' @importFrom xml2 read_html +#' @importFrom xml2 read_html url_parse #' @importFrom jsonlite fromJSON NULL diff --git a/README.Rmd b/README.Rmd index c7b9531..ba29534 100644 --- a/README.Rmd +++ b/README.Rmd @@ -4,8 +4,6 @@ output: rmarkdown::github_document `splashr` : Tools to Work with the 'Splash' JavaScript Rendering Service -**Ridicuously basic functionality 
working at the moment. More coming soon** - TL;DR: This package works with Splash rendering servers which are really just a REST API & `lua` scripting interface to a QT browser. It's an alternative to the Selenium ecosystem which was really engineered for application testing & validation. Sometimes, all you need is a page scrape after javascript has been allowed to roam wild and free over your meticulously crafted HTML tags. So, this package does not do _everything_ Selenium can, but if you're just trying to get a page back that needs javascript rendering, this is a nice alternative. @@ -30,6 +28,7 @@ All you need for this package to work is a running Splash instance. You provide The following functions are implemented: - `render_html`: Return the HTML of the javascript-rendered page. +- `render_har`: Return information about Splash interaction with a website in [HAR](http://www.softwareishard.com/blog/har-12-spec/) format. - `render_jpeg`: Return an image (in JPEG format) of the javascript-rendered page. - `render_png`: Return an image (in PNG format) of the javascript-rendered page. 
- `splash`: Configure parameters for connecting to a Splash server @@ -50,6 +49,9 @@ options(width=120) library(splashr) library(magick) library(rvest) +library(anytime) +library(hrbrmisc) # github +library(tidyverse) # current version packageVersion("splashr") @@ -70,7 +72,36 @@ splash("splash", 8050L) %>% read_html("http://marvel.com/universe/Captain_America_(Steve_Rogers)") ``` -Web page snapshots are easy-peasy too: +You can also profile pages: + +```{r fig.width=11, fig.height=6.5, fig.retina=2} +splash("splash", 8050L) %>% + render_har("http://www.poynter.org/") -> har + +data_frame( + start=anytime::anytime(har$log$entries$startedDateTime), + end=(start + lubridate::milliseconds(har$log$entries$time)), + rsrc=sprintf("%02d: %s...", 1:length(start), substr(har$log$entries$request$url, 1, 30))) %>% + mutate(rsrc=factor(rsrc, levels=rev(rsrc))) %>% + bind_cols(xml2::url_parse(har$log$entries$request$url) %>% select(server)) -> df + +total_time <- diff(range(c(df$start, df$end))) +total_time <- sprintf("Total time: %s %s", + format(unclass(total_time), digits = getOption("digits")), + attr(total_time, "units")) + +ggplot(df) + + geom_segment(data=df, aes(x=start, xend=end, y=rsrc, yend=rsrc, color=server), + size=0.25) + + scale_x_datetime(expand=c(0,0)) + + labs(x=total_time, y=NULL, + title=sprintf("HAR Waterfalll Profile for [%s]", "http://www.poynter.org/")) + + theme_hrbrmstr_msc(grid="") + + theme(legend.position="none") + + theme(panel.background=element_rect(color="#2b2b2b", fill="#2b2b2b")) +``` + +And, web page snapshots are easy-peasy too: ```{r eval=FALSE} splash("splash", 8050L) %>% diff --git a/README.md b/README.md index dff6eaa..5ce5efa 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,6 @@ `splashr` : Tools to Work with the 'Splash' JavaScript Rendering Service -**Ridicuously basic functionality working at the moment. 
More coming soon** - TL;DR: This package works with Splash rendering servers which are really just a REST API & `lua` scripting interface to a QT browser. It's an alternative to the Selenium ecosystem which was really engineered for application testing & validation. Sometimes, all you need is a page scrape after javascript has been allowed to roam wild and free over your meticulously crafted HTML tags. So, this package does not do *everything* Selenium can, but if you're just trying to get a page back that needs javascript rendering, this is a nice alternative. @@ -27,6 +25,7 @@ All you need for this package to work is a running Splash instance. You provide The following functions are implemented: - `render_html`: Return the HTML of the javascript-rendered page. +- `render_har`: Return information about Splash interaction with a website in [HAR](http://www.softwareishard.com/blog/har-12-spec/) format. - `render_jpeg`: Return an image (in JPEG format) of the javascript-rendered page. - `render_png`: Return an image (in PNG format) of the javascript-rendered page. - `splash`: Configure parameters for connecting to a Splash server @@ -47,6 +46,9 @@ options(width=120) library(splashr) library(magick) library(rvest) +library(anytime) +library(hrbrmisc) # github +library(tidyverse) # current version packageVersion("splashr") @@ -59,7 +61,7 @@ splash("splash", 8050L) %>% splash_active() ``` - ## Status of splash instance on [http://splash:8050]: ok. Max RSS: 349298688 + ## Status of splash instance on [http://splash:8050]: ok. Max RSS: 313761792 ``` r splash("splash", 8050L) %>% @@ -75,7 +77,7 @@ splash("splash", 8050L) %>% ## ..$ LuaRuntime: int 1 ## ..$ QTimer : int 1 ## ..$ Request : int 1 - ## $ maxrss : int 341112 + ## $ maxrss : int 306408 ## $ qsize : int 0 ## $ url : chr "http://splash:8050" ## - attr(*, "class")= chr [1:2] "splash_debug" "list" @@ -90,7 +92,7 @@ splash("splash", 8050L) %>% ## {xml_document} ## - ## [1] \n