Tools to Work with the 'Splash' JavaScript Rendering Service in R
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

391 lines
12KB

  # Assemble the Lua script text from the accumulated DSL calls.
  #
  # Each entry of `splash_obj$calls` is prefixed with a single space, the
  # entries are joined with newlines, and the result is wrapped in a
  # `function main(splash) ... end` definition.
  #
  # Returns the script as a length-1 character vector.
  make_splash_call <- function(splash_obj) {
    body_lines <- sprintf(" %s", splash_obj$calls)
    script_body <- paste0(body_lines, collapse = "\n")
    sprintf("\nfunction main(splash)\n%s\nend\n", script_body)
  }
  #' Insert raw Lua code into a DSL call chain
  #'
  #' The `splashr` Lua DSL (domain specific language) wraps what the package
  #' author believes to be the most common/useful Lua functions. When a call
  #' chain needs something the DSL does not cover, this function lets you insert
  #' any Splash Lua code you like.
  #'
  #' The code is inserted at the position where `splash_add_lua()` is called in
  #' the chain, which places it inside the main "splash" function defined as:
  #'
  #' ```
  #' function main(splash)
  #'   ...
  #' end
  #' ```
  #'
  #' If you need more flexibility, use the [execute_lua()] function.
  #'
  #' @md
  #' @param splash_obj splashr object
  #' @param lua_code length 1 character vector of raw Lua code
  #' @return `splash_obj` with `lua_code` followed by a `"\n"` entry appended
  #'   to its `calls`
  #' @export
  splash_add_lua <- function(splash_obj, lua_code) {
    updated_calls <- c(splash_obj$calls, lua_code, "\n")
    splash_obj$calls <- updated_calls
    splash_obj
  }
  #' Turn response content tracking on or off
  #'
  #' By default Splash doesn't keep bodies of each response in memory, for
  #' efficiency reasons.
  #'
  #' @param splash_obj splashr object
  #' @param enable logical; `TRUE` emits `true`, `FALSE` emits `false`
  #' @return `splash_obj` with the setting appended to its `calls`
  #' @export
  #' @examples \dontrun{
  #' rud_har <- splash_local %>%
  #'   splash_response_body(TRUE) %>%
  #'   splash_user_agent(ua_macos_chrome) %>%
  #'   splash_go("https://rud.is/b") %>%
  #'   splash_wait(2) %>%
  #'   splash_har()
  #' }
  splash_response_body <- function(splash_obj, enable = FALSE) {
    # Lua booleans are lower-case literals.
    lua_flag <- if (enable) "true" else "false"
    setting <- sprintf('splash.response_body_enabled = %s', lua_flag)
    splash_obj$calls <- c(splash_obj$calls, setting)
    splash_obj
  }
  #' Turn browser plugins (e.g. Flash) on or off
  #'
  #' Plugins are disabled by default.
  #'
  #' @param splash_obj splashr object
  #' @param enable logical; `TRUE` emits `true`, `FALSE` emits `false`
  #' @return `splash_obj` with the setting appended to its `calls`
  #' @export
  #' @examples \dontrun{
  #' rud_har <- splash_local %>%
  #'   splash_plugins(TRUE) %>%
  #'   splash_user_agent(ua_macos_chrome) %>%
  #'   splash_go("https://rud.is/b") %>%
  #'   splash_wait(2) %>%
  #'   splash_har()
  #' }
  splash_plugins <- function(splash_obj, enable = FALSE) {
    # Lua booleans are lower-case literals.
    lua_flag <- if (enable) "true" else "false"
    setting <- sprintf('splash.plugins_enabled = %s', lua_flag)
    splash_obj$calls <- c(splash_obj$calls, setting)
    splash_obj
  }
  #' Turn image loading on or off
  #'
  #' By default, images are enabled. Disabling images can save a lot of network
  #' traffic (usually around ~50%) and make rendering faster. Note that this
  #' option can affect the JavaScript code inside the page: disabling images may
  #' change sizes and positions of DOM elements, and scripts may read and use
  #' them.
  #'
  #' @param splash_obj splashr object
  #' @param enable logical; `TRUE` emits `true`, `FALSE` emits `false`
  #' @return `splash_obj` with the setting appended to its `calls`
  #' @export
  #' @examples \dontrun{
  #' rud_har <- splash_local %>%
  #'   splash_images(TRUE) %>%
  #'   splash_user_agent(ua_macos_chrome) %>%
  #'   splash_go("https://rud.is/b") %>%
  #'   splash_wait(2) %>%
  #'   splash_har()
  #' }
  splash_images <- function(splash_obj, enable = TRUE) {
    # Lua booleans are lower-case literals.
    lua_flag <- if (enable) "true" else "false"
    setting <- sprintf('splash.images_enabled = %s', lua_flag)
    splash_obj$calls <- c(splash_obj$calls, setting)
    splash_obj
  }
  #' Navigate to a URL
  #'
  #' This is similar to entering a URL in a browser address bar, pressing Enter
  #' and waiting until the page loads.
  #'
  #' @param splash_obj splashr object
  #' @param url URL to load
  #' @return `splash_obj` with a `url = "..."` assignment and a `splash:go(url)`
  #'   call appended to its `calls`
  #' @export
  #' @examples \dontrun{
  #' rud_har <- splash_local %>%
  #'   splash_response_body(TRUE) %>%
  #'   splash_user_agent(ua_macos_chrome) %>%
  #'   splash_go("https://rud.is/b") %>%
  #'   splash_wait(2) %>%
  #'   splash_har()
  #' }
  splash_go <- function(splash_obj, url) {
    # NOTE(review): `url` is interpolated into a double-quoted Lua string
    # literal as-is, with no escaping.
    navigation <- c(sprintf('url = "%s"', url), "splash:go(url)")
    splash_obj$calls <- c(splash_obj$calls, navigation)
    splash_obj
  }
  #' Trigger a mouse click event in the web page
  #'
  #' @param splash_obj splashr object
  #' @param x,y coordinates (distances from the left or top, relative to the
  #'   current viewport)
  #' @return `splash_obj` with a `splash:mouse_click()` call appended to its
  #'   `calls`
  #' @export
  splash_click <- function(splash_obj, x, y) {
    click_call <- sprintf("splash:mouse_click(%s, %s)", x, y)
    splash_obj$calls <- c(splash_obj$calls, click_call)
    splash_obj
  }
  #' Focus on a document element provided by a CSS selector
  #'
  #' The selector is embedded in a double-quoted Lua string literal. Double
  #' quotes in `selector` that are not already preceded by a backslash are
  #' escaped, so attribute selectors such as `input[name="q"]` can be passed
  #' directly.
  #'
  #' @md
  #' @param splash_obj splashr object
  #' @param selector valid CSS selector
  #' @return `splash_obj` with a `splash:select(...).node:focus()` call appended
  #'   to its `calls`
  #' @references See [the docs](https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-select) for more info
  #' @export
  splash_focus <- function(splash_obj, selector) {
    # Escape only bare double quotes; quotes the caller already escaped
    # (preceded by a backslash) are left untouched so existing input keeps
    # producing the same Lua.
    lua_selector <- gsub('(?<!\\\\)"', '\\\\"', selector, perl = TRUE)
    splash_obj$calls <- c(splash_obj$calls,
                          sprintf('splash:select("%s").node:focus()', lua_selector))
    splash_obj
  }
  #' Send text as input to page context, literally, character by character.
  #'
  #' This is different from [splash_send_keys()]
  #'
  #' The text is embedded in a double-quoted Lua string literal. Double quotes
  #' in `text` that are not already preceded by a backslash are escaped, so
  #' text containing quotes can be passed directly.
  #'
  #' @md
  #' @note This adds a call to `splash:wait` so you do not have to
  #' @param splash_obj splashr object
  #' @param text string to send
  #' @return `splash_obj` with a `splash:send_text()` call and a
  #'   `splash:wait(0.1)` call appended to its `calls`
  #' @references See [the docs](https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-send-text) for more info
  #' @export
  splash_send_text <- function(splash_obj, text) {
    # Escape only bare double quotes; quotes the caller already escaped
    # (preceded by a backslash) are left untouched so existing input keeps
    # producing the same Lua.
    lua_text <- gsub('(?<!\\\\)"', '\\\\"', text, perl = TRUE)
    splash_obj$calls <- c(splash_obj$calls,
                          sprintf('splash:send_text("%s")', lua_text),
                          "splash:wait(0.1)")
    splash_obj
  }
  #' Send keyboard events to page context.
  #'
  #' - whitespace is ignored and only used to separate the different keys
  #' - characters are literally represented
  #'
  #' This is different from [splash_send_text()]
  #'
  #' The keys are embedded in a double-quoted Lua string literal. Double quotes
  #' in `keys` that are not already preceded by a backslash are escaped.
  #'
  #' @md
  #' @note This adds a call to `splash:wait` so you do not have to
  #' @param splash_obj splashr object
  #' @param keys string to send
  #' @return `splash_obj` with a `splash:send_keys()` call and a
  #'   `splash:wait(0.1)` call appended to its `calls`
  #' @references See [the docs](https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-send-keys) for more info
  #' @export
  splash_send_keys <- function(splash_obj, keys) {
    # Escape only bare double quotes; quotes the caller already escaped
    # (preceded by a backslash) are left untouched so existing input keeps
    # producing the same Lua.
    lua_keys <- gsub('(?<!\\\\)"', '\\\\"', keys, perl = TRUE)
    splash_obj$calls <- c(splash_obj$calls,
                          sprintf('splash:send_keys("%s")', lua_keys),
                          "splash:wait(0.1)")
    splash_obj
  }
  #' Trigger a mouse release event in the web page
  #'
  #' @param splash_obj splashr object
  #' @param x,y coordinates (distances from the left or top, relative to the
  #'   current viewport)
  #' @return `splash_obj` with a `splash:mouse_release()` call appended to its
  #'   `calls`
  #' @export
  splash_release <- function(splash_obj, x, y) {
    release_call <- sprintf("splash:mouse_release(%s, %s)", x, y)
    splash_obj$calls <- c(splash_obj$calls, release_call)
    splash_obj
  }
  #' Trigger a mouse press event in the web page
  #'
  #' @param splash_obj splashr object
  #' @param x,y coordinates (distances from the left or top, relative to the
  #'   current viewport)
  #' @return `splash_obj` with a `splash:mouse_press()` call appended to its
  #'   `calls`
  #' @export
  splash_press <- function(splash_obj, x, y) {
    press_call <- sprintf("splash:mouse_press(%s, %s)", x, y)
    splash_obj$calls <- c(splash_obj$calls, press_call)
    splash_obj
  }
  #' Wait for a period of time
  #'
  #' While the script is waiting, WebKit continues processing the webpage.
  #'
  #' @md
  #' @param splash_obj splashr object
  #' @param time number of seconds to wait
  #' @return `splash_obj` with a `splash:wait()` call appended to its `calls`
  #' @export
  #' @examples \dontrun{
  #' rud_har <- splash_local %>%
  #'   splash_response_body(TRUE) %>%
  #'   splash_user_agent(ua_macos_chrome) %>%
  #'   splash_go("https://rud.is/b") %>%
  #'   splash_wait(2) %>%
  #'   splash_har()
  #' }
  splash_wait <- function(splash_obj, time = 2) {
    wait_call <- sprintf('splash:wait(%s)', time)
    splash_obj$calls <- c(splash_obj$calls, wait_call)
    splash_obj
  }
  #' Return information about Splash interaction with a website in HAR format.
  #'
  #' Similar to [render_har()] but used in a script context. This should be the
  #' LAST element in a DSL script chain: it appends the `return` statement,
  #' builds the script, executes it and converts the result to HAR.
  #'
  #' @md
  #' @param splash_obj splashr object
  #' @return the result of the executed script passed through `as_har()`
  #' @export
  #' @examples \dontrun{
  #' rud_har <- splash_local %>%
  #'   splash_response_body(TRUE) %>%
  #'   splash_user_agent(ua_macos_chrome) %>%
  #'   splash_go("https://rud.is/b") %>%
  #'   splash_wait(2) %>%
  #'   splash_har()
  #' }
  splash_har <- function(splash_obj) {
    splash_obj$calls <- c(splash_obj$calls, 'return(splash:har())')
    lua_script <- make_splash_call(splash_obj)
    as_har(execute_lua(splash_obj, lua_script))
  }
  #' Return an HTML snapshot of the current page.
  #'
  #' Similar to [render_html()] but used in a script context. This should be the
  #' LAST element in a DSL script chain: it appends the `return` statement,
  #' builds the script and executes it.
  #'
  #' @md
  #' @param splash_obj splashr object
  #' @param raw_html if `TRUE` the script result is returned as-is; otherwise
  #'   it is parsed with [xml2::read_html()] and an XML document is returned.
  #' @export
  #' @examples \dontrun{
  #' rud_pg <- splash_local %>%
  #'   splash_response_body(TRUE) %>%
  #'   splash_user_agent(ua_macos_chrome) %>%
  #'   splash_go("https://rud.is/b") %>%
  #'   splash_wait(2) %>%
  #'   splash_html()
  #' }
  splash_html <- function(splash_obj, raw_html = FALSE) {
    splash_obj$calls <- c(splash_obj$calls, 'return(splash:html())')
    lua_script <- make_splash_call(splash_obj)
    page <- execute_lua(splash_obj, lua_script)
    if (raw_html) {
      return(page)
    }
    xml2::read_html(page)
  }
  #' Return a screenshot of the current page in PNG format.
  #'
  #' Similar to [render_png()] but used in a script context. This should be the
  #' LAST element in a DSL script chain: it appends the `return` statement
  #' (requesting `render_all=true`), builds the script and executes it.
  #'
  #' @md
  #' @param splash_obj splashr object
  #' @return a [magick] image object
  #' @export
  #' @examples \dontrun{
  #' splash_local %>%
  #'   splash_user_agent(ua_macos_chrome) %>%
  #'   splash_go("https://rud.is/b") %>%
  #'   splash_wait(2) %>%
  #'   splash_png()
  #' }
  splash_png <- function(splash_obj) {
    splash_obj$calls <- c(splash_obj$calls, 'return splash:png{render_all=true}')
    lua_script <- make_splash_call(splash_obj)
    magick::image_read(execute_lua(splash_obj, lua_script))
  }
  #' Overwrite the User-Agent header for all further requests.
  #'
  #' There are a few built-in user agents, all beginning with `ua_`.
  #'
  #' @md
  #' @param splash_obj splashr object
  #' @param user_agent 1 element character vector, defaults to `splashr/#.#.#`.
  #' @return `splash_obj` with a `splash:set_user_agent()` call appended to its
  #'   `calls`
  #' @export
  #' @examples \dontrun{
  #' library(rvest)
  #'
  #' URL <- "https://httpbin.org/user-agent"
  #'
  #' splash_local %>%
  #'   splash_response_body(TRUE) %>%
  #'   splash_user_agent(ua_macos_chrome) %>%
  #'   splash_go(URL) %>%
  #'   splash_html() %>%
  #'   html_text("body") %>%
  #'   jsonlite::fromJSON()
  #' }
  splash_user_agent <- function(splash_obj, user_agent = ua_splashr) {
    ua_call <- sprintf('splash:set_user_agent("%s")', user_agent)
    splash_obj$calls <- c(splash_obj$calls, ua_call)
    splash_obj
  }
  # Built-in user agent strings ----

  # Package default: "splashr/" followed by the installed package version.
  #' @rdname splash_user_agent
  #' @export
  ua_splashr <- sprintf("splashr/%s", packageVersion("splashr"))

  # Windows 10
  #' @rdname splash_user_agent
  #' @export
  ua_win10_chrome <- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"

  #' @rdname splash_user_agent
  #' @export
  ua_win10_firefox <- "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"

  #' @rdname splash_user_agent
  #' @export
  ua_win10_ie11 <- "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"

  # Windows 7
  #' @rdname splash_user_agent
  #' @export
  ua_win7_chrome <- "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"

  #' @rdname splash_user_agent
  #' @export
  ua_win7_firefox <- "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"

  #' @rdname splash_user_agent
  #' @export
  ua_win7_ie11 <- "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"

  # macOS
  #' @rdname splash_user_agent
  #' @export
  ua_macos_chrome <- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36"

  #' @rdname splash_user_agent
  #' @export
  ua_macos_safari <- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/602.3.12 (KHTML, like Gecko) Version/10.0.2 Safari/602.3.12"

  # Linux
  #' @rdname splash_user_agent
  #' @export
  ua_linux_chrome <- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"

  #' @rdname splash_user_agent
  #' @export
  ua_linux_firefox <- "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:51.0) Gecko/20100101 Firefox/51.0"

  # iOS
  #' @rdname splash_user_agent
  #' @export
  ua_ios_safari <- "Mozilla/5.0 (iPad; CPU OS 10_2 like Mac OS X) AppleWebKit/602.3.12 (KHTML, like Gecko) Version/10.0 Mobile/14C92 Safari/602.1"