diff --git a/.Rbuildignore b/.Rbuildignore index ece4c73..15db89e 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,3 +1,4 @@ +^Makefile$ ^.vscode$ ^.*\.Rproj$ ^\.Rproj\.user$ diff --git a/DESCRIPTION b/DESCRIPTION index 658be2e..2bc803e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,25 +1,28 @@ Package: crux Type: Package -Title: crux title goes here otherwise CRAN checks fail +Title: Identify the Crux of an Article Version: 0.1.0 Date: 2019-03-01 Authors@R: c( person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"), - comment = c(ORCID = "0000-0001-5670-2640")) + comment = c(ORCID = "0000-0001-5670-2640")), + person("Chimbori", role = "aut", + comment = "crux Java library (<https://github.com/chimbori/crux>)") ) Maintainer: Bob Rudis <bob@rud.is> -Description: A good description goes here otherwise CRAN checks fail. +Description: Methods are provided to retrieve HTML content and return extracted + metadata and summarised plain text. Further methods are provided to classify + URLs with or without making network calls. Based on <https://github.com/chimbori/crux>. URL: https://gitlab.com/hrbrmstr/crux BugReports: https://gitlab.com/hrbrmstr/crux/issues Encoding: UTF-8 -License: AGPL +License: Apache License 2.0 | file LICENSE Suggests: testthat, covr Depends: - R (>= 3.2.0) -Imports: - httr, - jsonlite + R (>= 3.2.0), + cruxjars, + rJava Roxygen: list(markdown = TRUE) RoxygenNote: 6.1.1 diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..df0bb94 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..38ba8e2
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,32 @@
+# Developer shortcuts for common package tasks; most targets wrap a devtools call run via Rscript.
+PACKAGE := $(shell grep '^Package:' DESCRIPTION | sed -E 's/^Package:[[:space:]]+//')
+RSCRIPT = Rscript --no-init-file
+
+all: install
+
+test:
+	${RSCRIPT} -e 'library(methods); devtools::test()'
+
+doc:
+	${RSCRIPT} -e "library(methods); devtools::document()"
+
+install:
+	${RSCRIPT} -e "library(methods); devtools::install()"
+
+build:
+	${RSCRIPT} -e "library(methods); devtools::build()"
+
+# $(MAKE) rather than a bare `make`, so the recursive call inherits the parent's flags.
+check:
+	_R_CHECK_CRAN_INCOMING_=FALSE $(MAKE) check_all
+
+check_all:
+	${RSCRIPT} -e "library(methods); devtools::check(cran=TRUE)"
+
+README.md: README.Rmd
+	Rscript -e 'library(methods); devtools::load_all(); rmarkdown::render("README.Rmd", output_file="README.md")'
+	sed -i.bak 's/[[:space:]]*$$//' $@
+	rm -f $@.bak
+
+# README.md is a real file target, so it is intentionally left out of .PHONY.
+.PHONY: all test doc install build check check_all
diff --git a/NAMESPACE b/NAMESPACE
index 5b4b9ae..3ba8d59 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,4 +1,15 @@
 # Generated by roxygen2: do not edit by hand
 
-import(httr)
-importFrom(jsonlite,fromJSON)
+export(classify_url)
+export(is_ad_image)
+export(is_likely_archive)
+export(is_likely_article)
+export(is_likely_audio)
+export(is_likely_binary_doc)
+export(is_likely_executable)
+export(is_likely_image)
+export(is_likely_video)
+export(is_web_scheme)
+export(summarise_url)
+import(cruxjars)
+import(rJava)
diff --git a/R/classify.R b/R/classify.R
new file mode 100644
index 0000000..6bca0cb
--- /dev/null
+++ b/R/classify.R
@@ -0,0 +1,108 @@
+#' Classify a URL with or without making network calls
+#'
+#' @md
+#' @param x URL to classify
+#' @param resolve_redirects if `TRUE` resolve redirects such as when Facebook or
+#'   Google show an interstitial page instead of redirecting the user to
+#'   the actual URL.
+#' @return `classify_url()` returns a one-row data frame (classed as a tibble)
+#'   holding the input `url` plus one column per check; each `is_*()` function
+#'   returns the result of its single corresponding check.
+#' @export
+classify_url <- function(x, resolve_redirects = FALSE) {
+
+  crux_url <- J("com.chimbori.crux.urls.CruxURL")$parse(x)
+  # rJava invokes Java methods with `$`; a Java-style `.` call is an undefined R name.
+  if (resolve_redirects) crux_url <- crux_url$resolveRedirects()
+
+  data.frame(
+    url = x,
+    is_ad_image = crux_url$isAdImage(),
+    is_web_scheme = crux_url$isWebScheme(),
+    is_likely_article = crux_url$isLikelyArticle(),
+    is_likely_video = crux_url$isLikelyVideo(),
+    is_likely_audio = crux_url$isLikelyAudio(),
+    is_likely_binary_doc = crux_url$isLikelyBinaryDocument(),
+    is_likely_archive = crux_url$isLikelyArchive(),
+    is_likely_executable = crux_url$isLikelyExecutable(),
+    is_likely_image = crux_url$isLikelyImage(),
+    stringsAsFactors = FALSE
+  ) -> out
+
+  class(out) <- c("tbl_df", "tbl", "data.frame")
+
+  out
+
+}
+
+#' @rdname classify_url
+#' @export
+is_ad_image <- function(x, resolve_redirects=FALSE) {
+  crux_url <- J("com.chimbori.crux.urls.CruxURL")$parse(x)
+  if (resolve_redirects) crux_url <- crux_url$resolveRedirects()
+  crux_url$isAdImage()
+}
+
+#' @rdname classify_url
+#' @export
+is_web_scheme <- function(x, resolve_redirects=FALSE) {
+  crux_url <- J("com.chimbori.crux.urls.CruxURL")$parse(x)
+  if (resolve_redirects) crux_url <- crux_url$resolveRedirects()
+  crux_url$isWebScheme()
+}
+
+#' @rdname classify_url
+#' @export
+is_likely_article <- function(x, resolve_redirects=FALSE) {
+  crux_url <- J("com.chimbori.crux.urls.CruxURL")$parse(x)
+  if (resolve_redirects) crux_url <- crux_url$resolveRedirects()
+  crux_url$isLikelyArticle()
+}
+
+#' @rdname classify_url
+#' @export
+is_likely_video <- function(x, resolve_redirects=FALSE) {
+  crux_url <- J("com.chimbori.crux.urls.CruxURL")$parse(x)
+  if (resolve_redirects) crux_url <- crux_url$resolveRedirects()
+  crux_url$isLikelyVideo()
+}
+
+#' @rdname classify_url
+#' @export
+is_likely_audio <- function(x, resolve_redirects=FALSE) {
+  crux_url <- J("com.chimbori.crux.urls.CruxURL")$parse(x)
+  if (resolve_redirects) crux_url <- crux_url$resolveRedirects()
+  crux_url$isLikelyAudio()
+}
+
+#' @rdname classify_url
+#' @export
+is_likely_binary_doc <- function(x, resolve_redirects=FALSE) {
+  crux_url <- J("com.chimbori.crux.urls.CruxURL")$parse(x)
+  if (resolve_redirects) crux_url <- crux_url$resolveRedirects()
+  crux_url$isLikelyBinaryDocument()
+}
+
+#' @rdname classify_url
+#' @export
+is_likely_archive <- function(x, resolve_redirects=FALSE) {
+  crux_url <- J("com.chimbori.crux.urls.CruxURL")$parse(x)
+  if (resolve_redirects) crux_url <- crux_url$resolveRedirects()
+  crux_url$isLikelyArchive()
+}
+
+#' @rdname classify_url
+#' @export
+is_likely_executable <- function(x, resolve_redirects=FALSE) {
+  crux_url <- J("com.chimbori.crux.urls.CruxURL")$parse(x)
+  if (resolve_redirects) crux_url <- crux_url$resolveRedirects()
+  crux_url$isLikelyExecutable()
+}
+
+#' @rdname classify_url
+#' @export
+is_likely_image <- function(x, resolve_redirects=FALSE) {
+  crux_url <- J("com.chimbori.crux.urls.CruxURL")$parse(x)
+  if (resolve_redirects) crux_url <- crux_url$resolveRedirects()
+  crux_url$isLikelyImage()
+}
diff --git a/R/crux-package.R b/R/crux-package.R
index 6ead657..1ad769b 100644
--- a/R/crux-package.R
+++ b/R/crux-package.R
@@ -1,12 +1,19 @@
-#' ...
-#'
+#' Identify the Crux of an Article
+#'
+#' Methods are provided to retrieve HTML content and return extracted
+#' metadata and summarised plain text. Further methods are provided to classify
+#' URLs with or without making network calls. Based on <https://github.com/chimbori/crux>.
+#'
+#' Based on <https://github.com/chimbori/crux>.
+#'
 #' - URL: <https://gitlab.com/hrbrmstr/crux>
 #' - BugReports: <https://gitlab.com/hrbrmstr/crux/issues>
-#'
+#' - Javadoc: <https://www.javadoc.io/doc/com.chimbori.crux/crux/2.0.2>
+#'
 #' @md
 #' @name crux
+#' @keywords internal
 #' @docType package
 #' @author Bob Rudis (bob@@rud.is)
-#' @import httr
-#' @importFrom jsonlite fromJSON
+#' @import cruxjars rJava
 NULL
diff --git a/R/summarise.R b/R/summarise.R
new file mode 100644
index 0000000..7968495
--- /dev/null
+++ b/R/summarise.R
@@ -0,0 +1,77 @@
+# Fallback operator: yields `b` when rJava's `is.jnull()` reports `a` as null or
+# when `a` is an empty string once trimmed; otherwise yields `a` unchanged.
+`%es%` <- function(a, b) {
+  if (is.jnull(a)) return(b)
+  if (trimws(a) == "") return(b)
+  return(a)
+}
+
+#' Summarise the contents at a URL to essential bits
+#'
+#' Fetches the HTML from `x` and returns the essential components
+#' including:
+#' - `url`
+#' - `original_url`
+#' - `title`
+#' - `description`
+#' - `site_name`
+#' - `theme_color`
+#' - `amp_url`
+#' - `canonical_url`
+#' - `image_url`
+#' - `video_url`
+#' - `feed_url`
+#' - `favicon_url`
+#' - `reading_time`
+#' - `text` (the reduced, plain text)
+#'
+#' If any components cannot be derived from the contents of the URL they will be `NA`.
+#'
+#' @md
+#' @param x URL
+#' @return A named list with one element per component listed above.
+#' @export
+#' @examples
+#' ex_url <- "https://techcrunch.com/2019/02/28/thailand-passes-controversial-cybersecurity-law/"
+#' str(summarise_url(ex_url), 1)
+summarise_url <- function(x) {
+
+  soup <- J("org.jsoup.Jsoup")
+  con <- soup$connect(x)
+  doc <- con$get()
+
+  ae <- J("com.chimbori.crux.articles.ArticleExtractor")
+
+  article <- ae$with(x, doc)
+
+  # Run the extraction stages in order; each result is the receiver of the next call.
+  content <- article$extractContent()
+  content <- content$extractMetadata()
+  content <- content$estimateReadingTime()
+
+  rec <- content$article()
+
+  txt <- rec$document$text()
+
+  # Null or blank fields fall back to a typed NA via `%es%`.
+  list(
+    url = rec$url %es% NA_character_,
+    original_url = rec$originalUrl %es% NA_character_,
+    title = rec$title %es% NA_character_,
+    description = rec$description %es% NA_character_,
+    site_name = rec$siteName %es% NA_character_,
+    theme_color = rec$themeColor %es% NA_character_,
+    amp_url = rec$ampUrl %es% NA_character_,
+    canonical_url = rec$canonicalUrl %es% NA_character_,
+    image_url = rec$imageUrl %es% NA_character_,
+    video_url = rec$videoUrl %es% NA_character_,
+    feed_url = rec$feedUrl %es% NA_character_,
+    favicon_url = rec$faviconUrl %es% NA_character_,
+    reading_time = rec$estimatedReadingTimeMinutes %es% NA_integer_,
+    text = txt %es% NA_character_
+  ) -> content
+
+  return(content)
+
+}
+
diff --git a/README.Rmd b/README.Rmd
index c07fea3..ecdded4 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -14,20 +14,34 @@ options(width=120)
 
 # crux
 
+Identify the Crux of an Article
+
 ## Description
 
+Methods are provided to retrieve HTML content and return extracted
+metadata and summarised plain text. Further methods are provided to classify
+URLs with or without making network calls. Based on <https://github.com/chimbori/crux>.
+
 ## What's Inside The Tin
 
 The following functions are implemented:
 
+- `classify_url`: Classify a URL with or without making network calls
+- `is_ad_image`: Classify a URL with or without making network calls
+- `is_likely_archive`: Classify a URL with or without making network calls
+- `is_likely_article`: Classify a URL with or without making network calls
+- `is_likely_audio`: Classify a URL with or without making network calls
+- `is_likely_binary_doc`: Classify a URL with or without making network calls
+- `is_likely_executable`: Classify a URL with or without making network calls
+- `is_likely_image`: Classify a URL with or without making network calls
+- `is_likely_video`: Classify a URL with or without making network calls
+- `is_web_scheme`: Classify a URL with or without making network calls
+- `summarise_url`: Summarise the contents at a URL to essential bits
+
 ## Installation
 
 ```{r install-ex, eval=FALSE}
-devtools::install_git("https://sr.ht.com/~hrbrmstr/crux.git")
-# or
-devtools::install_git("https://gitlab.com/hrbrmstr/crux.git")
-# or (if you must)
-devtools::install_github("hrbrmstr/crux")
+install.packages(c("cruxjars", "crux"), repos = "https://cinc.rud.is/")
 ```
 
 ## Usage
@@ -40,6 +54,18 @@ packageVersion("crux")
 
 ```
 
+```{r}
+str(
+ 
summarise_url("http://time.com/5541738/joe-biden-backtracks-pence-praise-criticism/"), 1 +) +``` + +```{r} +str( + classify_url("https://www.washingtonpost.com/powerpost/house-democrats-explode-in-recriminations-as-liberals-lash-out-at-moderates/2019/02/28/c3d163fe-3b87-11e9-a06c-3ec8ed509d15_story.html") +) +``` + ## crux Metrics ```{r cloc, echo=FALSE} diff --git a/README.md b/README.md index 90e8c26..e2744f5 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,108 @@ + +[![Travis-CI Build +Status](https://travis-ci.org/hrbrmstr/crux.svg?branch=master)](https://travis-ci.org/hrbrmstr/crux) +[![Coverage +Status](https://codecov.io/gh/hrbrmstr/crux/branch/master/graph/badge.svg)](https://codecov.io/gh/hrbrmstr/crux) +[![CRAN\_Status\_Badge](http://www.r-pkg.org/badges/version/crux)](https://cran.r-project.org/package=crux) + # crux +Identify the Crux of an Article + +## Description + +Methods are provided to retrieve HTML content and return extracted +metadata and summarised plain text. Further methods are provided to +classify URLs with or without making network calls. Based on +. 
+ +## What’s Inside The Tin + +The following functions are implemented: + + - `classify_url`: Classify a URL with or without making network calls + - `is_ad_image`: Classify a URL with or without making network calls + - `is_likely_archive`: Classify a URL with or without making network + calls + - `is_likely_article`: Classify a URL with or without making network + calls + - `is_likely_audio`: Classify a URL with or without making network + calls + - `is_likely_binary_doc`: Classify a URL with or without making + network calls + - `is_likely_executable`: Classify a URL with or without making + network calls + - `is_likely_image`: Classify a URL with or without making network + calls + - `is_likely_video`: Classify a URL with or without making network + calls + - `is_web_scheme`: Classify a URL with or without making network calls + - `summarise_url`: Summarise the contents at a URL to essential bits + +## Installation + +``` r +install.packages(c("cruxjars", "crux"), repos = "https://cinc.rud.is/") +``` + +## Usage + +``` r +library(crux) + +# current version +packageVersion("crux") +## [1] '0.1.0' +``` + +``` r +str( + summarise_url("http://time.com/5541738/joe-biden-backtracks-pence-praise-criticism/"), 1 +) +## List of 14 +## $ url : chr "http://time.com/5541738/joe-biden-backtracks-pence-praise-criticism/" +## $ original_url : chr NA +## $ title : chr "Joe Biden Backtracks After Calling Pence 'a Decent Guy'" +## $ description : chr "The former Vice President took back his comment that Pence is \"a decent guy\" after fellow Democrats denounced his remarks" +## $ site_name : chr "Time" +## $ theme_color : chr NA +## $ amp_url : chr "http://amp.timeinc.net/time/5541738/joe-biden-backtracks-pence-praise-criticism" +## $ canonical_url: chr "http://time.com/5541738/joe-biden-backtracks-pence-praise-criticism/" +## $ image_url : chr "http://timedotcom.files.wordpress.com/2019/03/ap19059832629402.jpg?quality=85&crop=0px%2C111px%2C6000px%2C3000p"| __truncated__ +## $ 
video_url : chr NA +## $ feed_url : chr NA +## $ favicon_url : chr "http://time.com/img/favicons/favicon-192.png" +## $ reading_time : int 2 +## $ text : chr "(OMAHA, Neb.) — Former Vice President Joe Biden’s tendency to talk about his good relationships with Republican"| __truncated__ +``` + +``` r +str( + classify_url("https://www.washingtonpost.com/powerpost/house-democrats-explode-in-recriminations-as-liberals-lash-out-at-moderates/2019/02/28/c3d163fe-3b87-11e9-a06c-3ec8ed509d15_story.html") +) +## Classes 'tbl_df', 'tbl' and 'data.frame': 1 obs. of 10 variables: +## $ url : chr "https://www.washingtonpost.com/powerpost/house-democrats-explode-in-recriminations-as-liberals-lash-out-at-mode"| __truncated__ +## $ is_ad_image : logi FALSE +## $ is_web_scheme : logi TRUE +## $ is_likely_article : logi TRUE +## $ is_likely_video : logi FALSE +## $ is_likely_audio : logi FALSE +## $ is_likely_binary_doc: logi FALSE +## $ is_likely_archive : logi FALSE +## $ is_likely_executable: logi FALSE +## $ is_likely_image : logi FALSE +``` + +## crux Metrics + +| Lang | \# Files | (%) | LoC | (%) | Blank lines | (%) | \# Lines | (%) | +| :--- | -------: | ---: | --: | ---: | ----------: | ---: | -------: | ---: | +| R | 5 | 0.71 | 104 | 0.76 | 28 | 0.49 | 71 | 0.61 | +| make | 1 | 0.14 | 20 | 0.15 | 9 | 0.16 | 0 | 0.00 | +| Rmd | 1 | 0.14 | 12 | 0.09 | 20 | 0.35 | 46 | 0.39 | + +## Code of Conduct + +Please note that this project is released with a [Contributor Code of +Conduct](CONDUCT.md). By participating in this project you agree to +abide by its terms. 
diff --git a/man/classify_url.Rd b/man/classify_url.Rd new file mode 100644 index 0000000..10a20c5 --- /dev/null +++ b/man/classify_url.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/classify.R +\name{classify_url} +\alias{classify_url} +\alias{is_ad_image} +\alias{is_web_scheme} +\alias{is_likely_article} +\alias{is_likely_video} +\alias{is_likely_audio} +\alias{is_likely_binary_doc} +\alias{is_likely_archive} +\alias{is_likely_executable} +\alias{is_likely_image} +\title{Classify a URL with or without making network calls} +\usage{ +classify_url(x, resolve_redirects = FALSE) + +is_ad_image(x, resolve_redirects = FALSE) + +is_web_scheme(x, resolve_redirects = FALSE) + +is_likely_article(x, resolve_redirects = FALSE) + +is_likely_video(x, resolve_redirects = FALSE) + +is_likely_audio(x, resolve_redirects = FALSE) + +is_likely_binary_doc(x, resolve_redirects = FALSE) + +is_likely_archive(x, resolve_redirects = FALSE) + +is_likely_executable(x, resolve_redirects = FALSE) + +is_likely_image(x, resolve_redirects = FALSE) +} +\arguments{ +\item{x}{URL to classify} + +\item{resolve_redirects}{if \code{TRUE} resolve redirects such as when Facebook or +Google show an interstitial page instead of redirecting the user to +the actual URL.} +} +\description{ +Classify a URL with or without making network calls +} diff --git a/man/crux.Rd b/man/crux.Rd index 75b595a..eda567c 100644 --- a/man/crux.Rd +++ b/man/crux.Rd @@ -4,13 +4,21 @@ \name{crux} \alias{crux} \alias{crux-package} -\title{...} +\title{Identify the Crux of an Article} \description{ +Methods are provided to retrieve HTML content and return extracted +metadata and summarised plain text. Further methods are provided to classify +URLs with or without making network calls. Based on \url{https://github.com/chimbori/crux}. +} +\details{ +Based on \url{https://github.com/chimbori/crux}. 
\itemize{ \item URL: \url{https://gitlab.com/hrbrmstr/crux} \item BugReports: \url{https://gitlab.com/hrbrmstr/crux/issues} +\item Javadoc: \url{https://www.javadoc.io/doc/com.chimbori.crux/crux/2.0.2} } } \author{ Bob Rudis (bob@rud.is) } +\keyword{internal} diff --git a/man/summarise_url.Rd b/man/summarise_url.Rd new file mode 100644 index 0000000..c112d37 --- /dev/null +++ b/man/summarise_url.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/summarise.R +\name{summarise_url} +\alias{summarise_url} +\title{Summarise the contents at a URL to essential bits} +\usage{ +summarise_url(x) +} +\arguments{ +\item{x}{URL} +} +\description{ +Fetches the HTML from \code{x} and returns the essential components +including: +\itemize{ +\item \code{url} +\item \code{original_url} +\item \code{title} +\item \code{description} +\item \code{site_name} +\item \code{theme_color} +\item \code{amp_url} +\item \code{canonical_url} +\item \code{image_url} +\item \code{video_url} +\item \code{feed_url} +\item \code{favicon_url} +\item \code{reading_time} +\item \code{text} (the reduced, plain text) +If any components cannot be derived from the contents of the URL they will be \code{NA}. +} +} +\examples{ +ex_url <- "https://techcrunch.com/2019/02/28/thailand-passes-controversial-cybersecurity-law/" +str(summarise_url(ex_url), 1) +}