Tools to Work with the 'Splash' JavaScript Rendering Service in R
Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

421 linhas
13KB

  # Build the Lua source for a Splash script from the accumulated DSL calls.
  #
  # Every entry of `splash_obj$calls` is prefixed with a single space, the
  # entries are joined with newlines, and the result is wrapped in a
  # `function main(splash) ... end` definition. Returns one string.
  make_splash_call <- function(splash_obj) {
    indented_calls <- sprintf(" %s", splash_obj$calls)
    script_body <- paste(indented_calls, collapse = "\n")
    sprintf("\nfunction main(splash)\n%s\nend\n", script_body)
  }
  #' Add raw lua code into DSL call chain
  #'
  #' The `splashr` `lua` DSL (domain specific language) wrapper covers what the
  #' package author believes to be the most common/useful `lua` functions. When
  #' a call chain needs something the wrapper does not provide, this function
  #' lets you insert any Splash `lua` code you like.
  #'
  #' The code is inserted at the position where `splash_add_lua()` is called in
  #' the chain, which will be within the main `splash` function defined as:
  #'
  #' ```
  #' function main(splash)
  #'   ...
  #' end
  #' ```
  #'
  #' If you need more flexibility, use the [execute_lua()] function.
  #'
  #' @md
  #' @family splash_dsl_functions
  #' @param splash_obj splashr object
  #' @param lua_code length 1 character vector of raw `lua` code
  #' @return the `splash_obj` with `lua_code` followed by a `"\n"` entry
  #'   appended to its `calls`
  #' @export
  splash_add_lua <- function(splash_obj, lua_code) {
    existing_calls <- splash_obj$calls
    # The raw code is followed by a standalone newline entry.
    splash_obj$calls <- c(existing_calls, lua_code, "\n")
    splash_obj
  }
  #' Enable or disable response content tracking.
  #'
  #' By default Splash does not keep bodies of each response in memory, for
  #' efficiency reasons.
  #'
  #' @param splash_obj splashr object
  #' @param enable logical; `TRUE` emits `true`, `FALSE` emits `false`
  #' @return the `splash_obj` with a `splash.response_body_enabled` assignment
  #'   appended to its `calls`
  #' @export
  #' @family splash_dsl_attributes
  #' @examples \dontrun{
  #' splash_local %>%
  #'   splash_response_body(TRUE) %>%
  #'   splash_user_agent(ua_macos_chrome) %>%
  #'   splash_go("https://rud.is/b") %>%
  #'   splash_wait(2) %>%
  #'   splash_har() -> rud_har
  #' }
  splash_response_body <- function(splash_obj, enable=FALSE) {
    lua_flag <- if (enable) "true" else "false"
    new_call <- paste0("splash.response_body_enabled = ", lua_flag)
    splash_obj$calls <- c(splash_obj$calls, new_call)
    splash_obj
  }
  #' Enable or disable private mode.
  #'
  #' Private mode is enabled by default unless you pass flag
  #' `--disable-private-mode` at Splash (server) startup. Note that if you
  #' disable private mode, browsing data such as cookies or items kept in local
  #' storage may persist between requests.
  #'
  #' @md
  #' @param splash_obj splashr object
  #' @param enable logical; `TRUE` emits `true`, `FALSE` emits `false`
  #' @return the `splash_obj` with a `splash.private_mode_enabled` assignment
  #'   appended to its `calls`
  #' @family splash_dsl_attributes
  #' @export
  #' @examples \dontrun{
  #' splash_local %>%
  #'   splash_response_body(TRUE) %>%
  #'   splash_private_mode(TRUE) %>%
  #'   splash_user_agent(ua_macos_chrome) %>%
  #'   splash_go("https://rud.is/b") %>%
  #'   splash_wait(2) %>%
  #'   splash_har() -> rud_har
  #' }
  splash_private_mode <- function(splash_obj, enable=FALSE) {
    if (enable) {
      lua_value <- "true"
    } else {
      lua_value <- "false"
    }
    assignment <- sprintf("splash.private_mode_enabled = %s", lua_value)
    splash_obj$calls <- c(splash_obj$calls, assignment)
    splash_obj
  }
  #' Enable or disable execution of JavaScript code embedded in the page.
  #'
  #' JavaScript execution is enabled by default.
  #'
  #' @md
  #' @param splash_obj splashr object
  #' @param enable logical; `TRUE` emits `true`, `FALSE` emits `false`
  #' @return the `splash_obj` with a `splash.js_enabled` assignment appended to
  #'   its `calls`
  #' @export
  #' @family splash_dsl_attributes
  #' @examples \dontrun{
  #' splash_local %>%
  #'   splash_response_body(TRUE) %>%
  #'   splash_private_mode(TRUE) %>%
  #'   splash_enable_javascript(FALSE) %>%
  #'   splash_user_agent(ua_macos_chrome) %>%
  #'   splash_go("https://rud.is/b") %>%
  #'   splash_wait(2) %>%
  #'   splash_har() -> rud_har
  #' }
  splash_enable_javascript <- function(splash_obj, enable=TRUE) {
    js_state <- if (enable) "true" else "false"
    splash_obj$calls <- c(
      splash_obj$calls,
      paste0("splash.js_enabled = ", js_state)
    )
    splash_obj
  }
  #' Enable or disable browser plugins (e.g. Flash).
  #'
  #' Plugins are disabled by default.
  #'
  #' @param splash_obj splashr object
  #' @param enable logical; `TRUE` emits `true`, `FALSE` emits `false`
  #' @return the `splash_obj` with a `splash.plugins_enabled` assignment
  #'   appended to its `calls`
  #' @family splash_dsl_attributes
  #' @export
  #' @examples \dontrun{
  #' splash_local %>%
  #'   splash_plugins(TRUE) %>%
  #'   splash_user_agent(ua_macos_chrome) %>%
  #'   splash_go("https://rud.is/b") %>%
  #'   splash_wait(2) %>%
  #'   splash_har() -> rud_har
  #' }
  splash_plugins <- function(splash_obj, enable=FALSE) {
    plugin_state <- if (enable) "true" else "false"
    plugin_call <- sprintf("splash.plugins_enabled = %s", plugin_state)
    splash_obj$calls <- c(splash_obj$calls, plugin_call)
    splash_obj
  }
  #' Enable/disable images
  #'
  #' By default, images are enabled. Disabling images can save a lot of network
  #' traffic (usually around ~50%) and make rendering faster. Note that this
  #' option can affect the JavaScript code inside the page: disabling images may
  #' change sizes and positions of DOM elements, and scripts may read and use
  #' them.
  #'
  #' @param splash_obj splashr object
  #' @param enable logical; `TRUE` emits `true`, `FALSE` emits `false`
  #' @return the `splash_obj` with a `splash.images_enabled` assignment appended
  #'   to its `calls`
  #' @family splash_dsl_attributes
  #' @export
  #' @examples \dontrun{
  #' splash_local %>%
  #'   splash_images(TRUE) %>%
  #'   splash_user_agent(ua_macos_chrome) %>%
  #'   splash_go("https://rud.is/b") %>%
  #'   splash_wait(2) %>%
  #'   splash_har() -> rud_har
  #' }
  splash_images <- function(splash_obj, enable=TRUE) {
    if (enable) {
      image_state <- "true"
    } else {
      image_state <- "false"
    }
    splash_obj$calls <- c(
      splash_obj$calls,
      paste0("splash.images_enabled = ", image_state)
    )
    splash_obj
  }
  #' Go to an URL.
  #'
  #' This is similar to entering an URL in a browser address bar, pressing Enter
  #' and waiting until page loads.
  #'
  #' The URL is embedded in a double-quoted Lua string literal. Backslashes,
  #' double quotes, and line breaks in `url` are escaped so that the generated
  #' script stays syntactically valid and the URL reaches Splash unchanged.
  #'
  #' @param splash_obj splashr object
  #' @param url URL to load
  #' @return the `splash_obj` with a `url = "..."` assignment and a
  #'   `splash:go(url)` call appended to its `calls`
  #' @family splash_dsl_functions
  #' @export
  #' @examples \dontrun{
  #' splash_local %>%
  #'   splash_response_body(TRUE) %>%
  #'   splash_user_agent(ua_macos_chrome) %>%
  #'   splash_go("https://rud.is/b") %>%
  #'   splash_wait(2) %>%
  #'   splash_har() -> rud_har
  #' }
  splash_go <- function(splash_obj, url) {
    # Escape backslashes and double quotes first, then line breaks, so the
    # backslashes added for the line breaks are not themselves doubled.
    lua_url <- gsub('(["\\\\])', "\\\\\\1", url)
    lua_url <- gsub("\n", "\\n", lua_url, fixed = TRUE)
    lua_url <- gsub("\r", "\\r", lua_url, fixed = TRUE)
    splash_obj$calls <- c(
      splash_obj$calls,
      sprintf('url = "%s"', lua_url),
      "splash:go(url)"
    )
    splash_obj
  }
  #' Trigger mouse click event in web page.
  #'
  #' @family splash_dsl_functions
  #' @param splash_obj splashr object
  #' @param x,y coordinates (distances from the left or top, relative to the
  #'   current viewport)
  #' @return the `splash_obj` with a `splash:mouse_click` call appended to its
  #'   `calls`
  #' @export
  splash_click <- function(splash_obj, x, y) {
    click_call <- sprintf("splash:mouse_click(%s, %s)", x, y)
    splash_obj$calls <- c(splash_obj$calls, click_call)
    splash_obj
  }
  #' Focus on a document element provided by a CSS selector
  #'
  #' The selector is embedded in a double-quoted Lua string literal.
  #' Backslashes, double quotes, and line breaks in `selector` are escaped, so
  #' attribute selectors such as `input[name="q"]` can be passed as-is.
  #'
  #' @md
  #' @family splash_dsl_functions
  #' @param splash_obj splashr object
  #' @param selector valid CSS selector
  #' @return the `splash_obj` with a `splash:select(...).node:focus()` call
  #'   appended to its `calls`
  #' @references See [the docs](https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-select) for more info
  #' @export
  splash_focus <- function(splash_obj, selector) {
    # Escape backslashes and double quotes first, then line breaks, so the
    # backslashes added for the line breaks are not themselves doubled.
    lua_selector <- gsub('(["\\\\])', "\\\\\\1", selector)
    lua_selector <- gsub("\n", "\\n", lua_selector, fixed = TRUE)
    lua_selector <- gsub("\r", "\\r", lua_selector, fixed = TRUE)
    splash_obj$calls <- c(
      splash_obj$calls,
      sprintf('splash:select("%s").node:focus()', lua_selector)
    )
    splash_obj
  }
  #' Send text as input to page context, literally, character by character.
  #'
  #' This is different from [splash_send_keys()]
  #'
  #' The text is embedded in a double-quoted Lua string literal. Backslashes,
  #' double quotes, and line breaks in `text` are escaped so that the generated
  #' script stays valid and the characters are sent exactly as given.
  #'
  #' @md
  #' @family splash_dsl_functions
  #' @note This adds a call to `splash:wait` so you do not have to
  #' @param splash_obj splashr object
  #' @param text string to send
  #' @return the `splash_obj` with a `splash:send_text` call and a
  #'   `splash:wait(0.1)` call appended to its `calls`
  #' @references See [the docs](https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-send-text) for more info
  #' @export
  splash_send_text <- function(splash_obj, text) {
    # Escape backslashes and double quotes first, then line breaks, so the
    # backslashes added for the line breaks are not themselves doubled.
    lua_text <- gsub('(["\\\\])', "\\\\\\1", text)
    lua_text <- gsub("\n", "\\n", lua_text, fixed = TRUE)
    lua_text <- gsub("\r", "\\r", lua_text, fixed = TRUE)
    splash_obj$calls <- c(
      splash_obj$calls,
      sprintf('splash:send_text("%s")', lua_text),
      "splash:wait(0.1)"
    )
    splash_obj
  }
  #' Send keyboard events to page context.
  #'
  #' - whitespace is ignored and only used to separate the different keys
  #' - characters are literally represented
  #'
  #' This is different from [splash_send_text()]
  #'
  #' The key sequence is embedded in a double-quoted Lua string literal.
  #' Backslashes, double quotes, and line breaks in `keys` are escaped so that
  #' the generated script stays syntactically valid.
  #'
  #' @md
  #' @family splash_dsl_functions
  #' @note This adds a call to `splash:wait` so you do not have to
  #' @param splash_obj splashr object
  #' @param keys string to send
  #' @return the `splash_obj` with a `splash:send_keys` call and a
  #'   `splash:wait(0.1)` call appended to its `calls`
  #' @references See [the docs](https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-send-keys) for more info
  #' @export
  splash_send_keys <- function(splash_obj, keys) {
    # Escape backslashes and double quotes first, then line breaks, so the
    # backslashes added for the line breaks are not themselves doubled.
    lua_keys <- gsub('(["\\\\])', "\\\\\\1", keys)
    lua_keys <- gsub("\n", "\\n", lua_keys, fixed = TRUE)
    lua_keys <- gsub("\r", "\\r", lua_keys, fixed = TRUE)
    splash_obj$calls <- c(
      splash_obj$calls,
      sprintf('splash:send_keys("%s")', lua_keys),
      "splash:wait(0.1)"
    )
    splash_obj
  }
  #' Trigger mouse release event in web page.
  #'
  #' @family splash_dsl_functions
  #' @param splash_obj splashr object
  #' @param x,y coordinates (distances from the left or top, relative to the
  #'   current viewport)
  #' @return the `splash_obj` with a `splash:mouse_release` call appended to
  #'   its `calls`
  #' @export
  splash_release <- function(splash_obj, x, y) {
    release_call <- sprintf("splash:mouse_release(%s, %s)", x, y)
    splash_obj$calls <- c(splash_obj$calls, release_call)
    splash_obj
  }
  #' Trigger mouse press event in web page.
  #'
  #' @family splash_dsl_functions
  #' @param splash_obj splashr object
  #' @param x,y coordinates (distances from the left or top, relative to the
  #'   current viewport)
  #' @return the `splash_obj` with a `splash:mouse_press` call appended to its
  #'   `calls`
  #' @export
  splash_press <- function(splash_obj, x, y) {
    press_call <- sprintf("splash:mouse_press(%s, %s)", x, y)
    splash_obj$calls <- c(splash_obj$calls, press_call)
    splash_obj
  }
  #' Wait for a period of time
  #'
  #' While the script is waiting, WebKit continues processing the webpage.
  #'
  #' @md
  #' @param splash_obj splashr object
  #' @param time number of seconds to wait
  #' @return the `splash_obj` with a `splash:wait` call appended to its `calls`
  #' @family splash_dsl_functions
  #' @export
  #' @examples \dontrun{
  #' splash_local %>%
  #'   splash_response_body(TRUE) %>%
  #'   splash_user_agent(ua_macos_chrome) %>%
  #'   splash_go("https://rud.is/b") %>%
  #'   splash_wait(2) %>%
  #'   splash_har() -> rud_har
  #' }
  splash_wait <- function(splash_obj, time=2) {
    wait_call <- sprintf("splash:wait(%s)", time)
    splash_obj$calls <- c(splash_obj$calls, wait_call)
    splash_obj
  }
  #' Drops all internally stored HAR records.
  #'
  #' @md
  #' @param splash_obj splashr object
  #' @return the `splash_obj` with a `splash:har_reset()` call appended to its
  #'   `calls`
  #' @family splash_dsl_functions
  #' @export
  splash_har_reset <- function(splash_obj) {
    reset_call <- "splash:har_reset()"
    splash_obj$calls <- c(splash_obj$calls, reset_call)
    splash_obj
  }
  #' Return information about Splash interaction with a website in HAR format.
  #'
  #' Similar to [render_har()] but used in a script context. Should be the LAST
  #' element in a DSL script chain, as this will execute the script and return
  #' the HAR content.
  #'
  #' @md
  #' @param splash_obj splashr object
  #' @return the script result passed through `as_har()`
  #' @family splash_dsl_functions
  #' @export
  #' @examples \dontrun{
  #' splash_local %>%
  #'   splash_response_body(TRUE) %>%
  #'   splash_user_agent(ua_macos_chrome) %>%
  #'   splash_go("https://rud.is/b") %>%
  #'   splash_wait(2) %>%
  #'   splash_har() -> rud_har
  #' }
  splash_har <- function(splash_obj) {
    # Terminate the chain with the Lua statement that returns the HAR data.
    splash_obj$calls <- c(splash_obj$calls, "return(splash:har())")
    lua_script <- make_splash_call(splash_obj)
    as_har(execute_lua(splash_obj, lua_script))
  }
  #' Return a HTML snapshot of a current page.
  #'
  #' Similar to [render_html()] but used in a script context. Should be the LAST
  #' element in a DSL script chain, as this will execute the script and return
  #' the HTML content.
  #'
  #' @md
  #' @param splash_obj splashr object
  #' @param raw_html if `TRUE` then return the script result as-is instead of
  #'   parsing it with [xml2::read_html()].
  #' @family splash_dsl_functions
  #' @export
  #' @examples \dontrun{
  #' splash_local %>%
  #'   splash_response_body(TRUE) %>%
  #'   splash_user_agent(ua_macos_chrome) %>%
  #'   splash_go("https://rud.is/b") %>%
  #'   splash_wait(2) %>%
  #'   splash_html() -> rud_pg
  #' }
  splash_html <- function(splash_obj, raw_html=FALSE) {
    # Terminate the chain with the Lua statement that returns the page HTML.
    splash_obj$calls <- c(splash_obj$calls, "return(splash:html())")
    lua_script <- make_splash_call(splash_obj)
    page <- execute_lua(splash_obj, lua_script)
    if (!raw_html) {
      return(xml2::read_html(page))
    }
    page
  }
  #' Return a screenshot of a current page in PNG format.
  #'
  #' Similar to [render_png()] but used in a script context. Should be the LAST
  #' element in a DSL script chain, as this will execute the script and return
  #' the PNG content.
  #'
  #' @md
  #' @param splash_obj splashr object
  #' @family splash_dsl_functions
  #' @return a [magick] image object
  #' @export
  #' @examples \dontrun{
  #' splash_local %>%
  #'   splash_user_agent(ua_macos_chrome) %>%
  #'   splash_go("https://rud.is/b") %>%
  #'   splash_wait(2) %>%
  #'   splash_png()
  #' }
  splash_png <- function(splash_obj) {
    # Terminate the chain with the Lua statement that returns the screenshot.
    splash_obj$calls <- c(splash_obj$calls, "return splash:png{render_all=true}")
    lua_script <- make_splash_call(splash_obj)
    magick::image_read(execute_lua(splash_obj, lua_script))
  }
  #' Overwrite the User-Agent header for all further requests.
  #'
  #' There are a few built-in user agents, all beginning with `ua_`.
  #'
  #' The user agent is embedded in a double-quoted Lua string literal.
  #' Backslashes, double quotes, and line breaks in `user_agent` are escaped so
  #' that the generated script stays syntactically valid.
  #'
  #' @md
  #' @param splash_obj splashr object
  #' @param user_agent 1 element character vector, defaults to `splashr/#.#.#`.
  #' @return the `splash_obj` with a `splash:set_user_agent` call appended to
  #'   its `calls`
  #' @family splash_dsl_functions
  #' @export
  #' @examples \dontrun{
  #' library(rvest)
  #'
  #' URL <- "https://httpbin.org/user-agent"
  #'
  #' splash_local %>%
  #'   splash_response_body(TRUE) %>%
  #'   splash_user_agent(ua_macos_chrome) %>%
  #'   splash_go(URL) %>%
  #'   splash_html() %>%
  #'   html_text("body") %>%
  #'   jsonlite::fromJSON()
  #' }
  splash_user_agent <- function(splash_obj, user_agent=ua_splashr) {
    # Escape backslashes and double quotes first, then line breaks, so the
    # backslashes added for the line breaks are not themselves doubled.
    lua_agent <- gsub('(["\\\\])', "\\\\\\1", user_agent)
    lua_agent <- gsub("\n", "\\n", lua_agent, fixed = TRUE)
    lua_agent <- gsub("\r", "\\r", lua_agent, fixed = TRUE)
    splash_obj$calls <- c(
      splash_obj$calls,
      sprintf('splash:set_user_agent("%s")', lua_agent)
    )
    splash_obj
  }