Tools to Work with the 'Splash' JavaScript Rendering Service in R
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

66 lines
3.9KB

  #' Return the HTML of the javascript-rendered page.
  #'
  #' Similar to `rvest::read_html`. Issues a GET request to the `render.html`
  #' path of the Splash instance identified by `splash_obj` and returns the
  #' response body.
  #'
  #' `url`, `timeout`, `wait` and `viewport` are sent with every request. All
  #' other rendering arguments are optional and are only sent to Splash when
  #' the caller explicitly supplies them.
  #'
  #' @md
  #' @param splash_obj Object created by a call to [splash()]
  #' @param url The URL to render (required)
  #' @param base_url The base url to render the page with.
  #' @param timeout A timeout (in seconds) for the render (defaults to 30).
  #' @param resource_timeout A timeout (in seconds) for individual network
  #'   requests.
  #' @param wait Time (in seconds) to wait for updates after page is loaded
  #'   (defaults to 0).
  #' @param proxy Proxy profile name or proxy URL.
  #' @param js Javascript profile name.
  #' @param js_src JavaScript code to be executed in page context.
  #' @param filters Comma-separated list of request filter names.
  #' @param allowed_domains Comma-separated list of allowed domain names. If
  #'   present, Splash won't load anything from domains not in this list, nor
  #'   from subdomains of domains not in this list.
  #' @param allowed_content_types Comma-separated list of allowed content
  #'   types. If present, Splash will abort any request if the response's
  #'   content type doesn't match any of the content types in this list.
  #'   Wildcards are supported.
  #' @param forbidden_content_types Comma-separated list of forbidden content
  #'   types. If present, Splash will abort any request if the response's
  #'   content type matches any of the content types in this list. Wildcards
  #'   are supported.
  #' @param viewport View width and height (in pixels) of the browser viewport
  #'   to render the web page. Format is `"<width>x<height>"`, e.g. `800x600`.
  #'   Default value is `1024x768`.
  #' @param images Whether to download images.
  #' @param headers HTTP headers to set for the first outgoing request.
  #' @param body Body of HTTP POST request to be sent if method is POST.
  #' @param http_method HTTP method of outgoing Splash request.
  #' @param save_args A list of argument names to put in cache.
  #' @param load_args Parameter values to load from cache
  #' @param raw_html if `TRUE` then return a character vector vs an XML
  #'   document. Only valid for `render_html`
  #' @return An XML document. Note that this is processed by
  #'   [xml2::read_html()] so it will not be the pristine, raw, rendered HTML
  #'   from the site. Use `raw_html=TRUE` if you do not want it to be processed
  #'   first by [xml2]. If you choose `raw_html=TRUE` you'll get back a
  #'   character vector. An error is signalled (via `httr::stop_for_status()`)
  #'   when the Splash response has an HTTP error status.
  #' @references [Splash docs](http://splash.readthedocs.io/en/stable/index.html)
  #' @export
  render_html <- function(splash_obj = splash_local, url, base_url, timeout = 30,
                          resource_timeout, wait = 0, proxy, js, js_src, filters,
                          allowed_domains, allowed_content_types,
                          forbidden_content_types, viewport = "1024x768", images,
                          headers, body, http_method, save_args, load_args,
                          raw_html = FALSE) {

    # Arguments that always carry a value are part of every request.
    params <- list(url = url, timeout = timeout, wait = wait, viewport = viewport)

    # Optional arguments are forwarded only when the caller supplied them.
    if (!missing(base_url)) params$base_url <- base_url
    if (!missing(resource_timeout)) params$resource_timeout <- resource_timeout
    # Previously this assigned into `proxy` itself, so the proxy setting was
    # silently dropped from the request; it must be stored in `params`.
    if (!missing(proxy)) params$proxy <- proxy
    if (!missing(js)) params$js <- js
    if (!missing(js_src)) params$js_src <- js_src
    if (!missing(filters)) params$filters <- filters
    if (!missing(allowed_domains)) params$allowed_domains <- allowed_domains
    if (!missing(allowed_content_types)) {
      params$allowed_content_types <- allowed_content_types
    }
    if (!missing(forbidden_content_types)) {
      params$forbidden_content_types <- forbidden_content_types
    }
    if (!missing(images)) params$images <- images
    if (!missing(headers)) params$headers <- headers
    if (!missing(body)) params$body <- body
    if (!missing(http_method)) params$http_method <- http_method
    if (!missing(save_args)) params$save_args <- save_args
    if (!missing(load_args)) params$load_args <- load_args

    res <- httr::GET(
      splash_url(splash_obj),
      path = "render.html",
      encode = "json",
      query = params
    )
    httr::stop_for_status(res)

    out <- httr::content(res, as = "text", encoding = "UTF-8")

    # Parse into an XML document unless the caller asked for the raw text.
    if (!raw_html) out <- xml2::read_html(out)

    out
  }