Tools to Work with the 'Splash' JavaScript Rendering Service in R
Nie możesz wybrać więcej, niż 25 tematów Tematy muszą się zaczynać od litery lub cyfry, mogą zawierać myślniki ('-') i mogą mieć do 35 znaków.

101 wiersze
5.2KB

  #' Return the HTML of the javascript-rendered page.
  #'
  #' Similar (i.e. a dynamic equivalent) to `rvest::read_html`. The page is
  #' rendered by issuing a GET request to the `render.html` endpoint of the
  #' Splash server described by `splash_obj`.
  #'
  #' @md
  #' @param splash_obj Object created by a call to [splash()]
  #' @param url The URL to render (required)
  #' @param base_url The base URL to render the page with.
  #' @param timeout A timeout (in seconds) for the render (defaults to 30).
  #'   Without re-configuring the start-up parameters of the Splash server
  #'   (not this package) the maximum allowed value for the timeout is
  #'   60 seconds.
  #' @param resource_timeout A timeout (in seconds) for individual network
  #'   requests.
  #' @param wait Time (in seconds) to wait for updates after page is loaded
  #'   (defaults to 0).
  #' @param proxy Proxy profile name or proxy URL.
  #' @param js Javascript profile name.
  #' @param js_src JavaScript code to be executed in page context.
  #' @param filters Comma-separated list of request filter names.
  #' @param allowed_domains Comma-separated list of allowed domain names. If
  #'   present, Splash won't load anything from domains that are not in this
  #'   list, nor from subdomains of domains that are not in this list.
  #' @param allowed_content_types Comma-separated list of allowed content
  #'   types. If present, Splash will abort any request if the response's
  #'   content type doesn't match any of the content types in this list.
  #'   Wildcards are supported.
  #' @param forbidden_content_types Comma-separated list of forbidden content
  #'   types. If present, Splash will abort any request if the response's
  #'   content type matches any of the content types in this list. Wildcards
  #'   are supported.
  #' @param viewport View width and height (in pixels) of the browser viewport
  #'   to render the web page. Format is `"<width>x<height>"`, e.g. `800x600`.
  #'   Defaults to `"1024x768"`.
  #' @param images Whether to download images.
  #' @param headers HTTP headers to set for the first outgoing request.
  #' @param body Body of HTTP POST request to be sent if method is POST.
  #' @param http_method HTTP method of outgoing Splash request.
  #' @param save_args A list of argument names to put in cache.
  #' @param load_args Parameter values to load from cache
  #' @param http2 Enable or disable HTTP2 support. `TRUE` to enable; `FALSE` to
  #'   disable; defaults to `FALSE` when `engine` is `webkit` due to malformed
  #'   behaviour in 3.4.x of Splash. HTTP2 is always enabled when `engine` is
  #'   `chromium`. Only the first element is used, and any value that does not
  #'   coerce to `TRUE` is treated as `FALSE`.
  #' @param engine one of `webkit` or `chromium`; defaults to `webkit`
  #' @param raw_html if `TRUE` then return a character vector vs an XML
  #'   document. Only valid for `render_html`
  #' @family splash_renderers
  #' @return An XML document. Note that this is processed by
  #'   [xml2::read_html()] so it will not be the pristine, raw, rendered HTML
  #'   from the site. Use `raw_html=TRUE` if you do not want it to be processed
  #'   first by `xml2`. If you choose `raw_html=TRUE` you'll get back a
  #'   character vector.
  #' @references [Splash docs](http://splash.readthedocs.io/en/stable/index.html)
  #' @export
  render_html <- function(splash_obj = splash_local, url, base_url, timeout=30, resource_timeout, wait=0,
                          proxy, js, js_src, filters, allowed_domains, allowed_content_types,
                          forbidden_content_types, viewport="1024x768", images, headers, body,
                          http_method, save_args, load_args, http2 = FALSE,
                          engine = c("webkit", "chromium"), raw_html=FALSE) {

    wait <- check_wait(wait)

    # Only the first element is considered so the default choice vector
    # resolves to "webkit".
    engine <- match.arg(engine[1], c("webkit", "chromium"))

    # Scalar decision, so plain if/else rather than ifelse(). isTRUE() keeps an
    # un-coercible value (NA, NULL, "yes") from reaching the query string as
    # "NA"; it is sent as 0 instead.
    http2 <- if (engine == "chromium") {
      1L
    } else {
      as.integer(isTRUE(as.logical(http2[1])))
    }

    # Parameters that are always sent.
    params <- list(
      url = url,
      timeout = timeout,
      wait = wait,
      viewport = jsonlite::unbox(viewport),
      http2 = http2,
      engine = engine
    )

    # Optional parameters are only added when the caller supplied them.
    if (!missing(base_url)) params$base_url <- jsonlite::unbox(base_url)
    if (!missing(resource_timeout)) params$resource_timeout <- resource_timeout
    if (!missing(proxy)) params$proxy <- jsonlite::unbox(proxy)
    if (!missing(js)) params$js <- jsonlite::unbox(js)
    if (!missing(js_src)) params$js_src <- jsonlite::unbox(js_src)
    if (!missing(filters)) params$filters <- jsonlite::unbox(filters)
    if (!missing(allowed_domains)) {
      params$allowed_domains <- jsonlite::unbox(allowed_domains)
    }
    if (!missing(allowed_content_types)) {
      params$allowed_content_types <- jsonlite::unbox(allowed_content_types)
    }
    if (!missing(forbidden_content_types)) {
      params$forbidden_content_types <- jsonlite::unbox(forbidden_content_types)
    }
    if (!missing(images)) params$images <- as.numeric(images)
    if (!missing(headers)) params$headers <- headers
    if (!missing(body)) params$body <- jsonlite::unbox(body)
    if (!missing(http_method)) params$http_method <- jsonlite::unbox(http_method)
    if (!missing(save_args)) params$save_args <- jsonlite::unbox(save_args)
    if (!missing(load_args)) params$load_args <- jsonlite::unbox(load_args)

    # Credentials are attached only when the splash object carries a user;
    # an empty config is httr::GET()'s own default.
    auth <- if (is.null(splash_obj$user)) {
      list()
    } else {
      httr::authenticate(splash_obj$user, splash_obj$pass)
    }

    # NOTE(review): `encode` looks like a no-op for a GET request; kept to
    # leave the outgoing call unchanged -- confirm before removing.
    res <- httr::GET(
      splash_url(splash_obj),
      config = auth,
      path = "render.html",
      encode = "json",
      query = params
    )

    check_or_report_status(res)

    out <- httr::content(res, as = "text", encoding = "UTF-8")
    if (!raw_html) out <- xml2::read_html(out)

    out

  }