mirror of https://git.sr.ht/~hrbrmstr/crux
boB Rudis
5 years ago
13 changed files with 670 additions and 21 deletions
@ -1,25 +1,28 @@ |
|||||
Package: crux |
Package: crux |
||||
Type: Package |
Type: Package |
||||
Title: crux title goes here otherwise CRAN checks fail |
Title: Identify the Crux of an Article |
||||
Version: 0.1.0 |
Version: 0.1.0 |
||||
Date: 2019-03-01 |
Date: 2019-03-01 |
||||
Authors@R: c( |
Authors@R: c( |
||||
person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"), |
person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"), |
||||
comment = c(ORCID = "0000-0001-5670-2640")) |
comment = c(ORCID = "0000-0001-5670-2640")), |
||||
|
person("Chimbori", role = "aut", |
||||
|
comment = "crux Java library (<https://github.com/chimbori/crux>)") |
||||
) |
) |
||||
Maintainer: Bob Rudis <bob@rud.is> |
Maintainer: Bob Rudis <bob@rud.is> |
||||
Description: A good description goes here otherwise CRAN checks fail. |
Description: Methods are provided to retrieve HTML content and return extracted |
||||
|
metadata and summarised plain text. Further methods are provided to classify |
||||
|
URLs with or without making network calls. Based on <https://github.com/chimbori/crux>. |
||||
URL: https://gitlab.com/hrbrmstr/crux |
URL: https://gitlab.com/hrbrmstr/crux |
||||
BugReports: https://gitlab.com/hrbrmstr/crux/issues |
BugReports: https://gitlab.com/hrbrmstr/crux/issues |
||||
Encoding: UTF-8 |
Encoding: UTF-8 |
||||
License: AGPL |
License: Apache License 2.0 | file LICENSE |
||||
Suggests: |
Suggests: |
||||
testthat, |
testthat, |
||||
covr |
covr |
||||
Depends: |
Depends: |
||||
R (>= 3.2.0) |
R (>= 3.2.0), |
||||
Imports: |
cruxjars, |
||||
httr, |
rJava |
||||
jsonlite |
|
||||
Roxygen: list(markdown = TRUE) |
Roxygen: list(markdown = TRUE) |
||||
RoxygenNote: 6.1.1 |
RoxygenNote: 6.1.1 |
||||
|
@ -0,0 +1,201 @@ |
|||||
|
Apache License |
||||
|
Version 2.0, January 2004 |
||||
|
http://www.apache.org/licenses/ |
||||
|
|
||||
|
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION |
||||
|
|
||||
|
1. Definitions. |
||||
|
|
||||
|
"License" shall mean the terms and conditions for use, reproduction, |
||||
|
and distribution as defined by Sections 1 through 9 of this document. |
||||
|
|
||||
|
"Licensor" shall mean the copyright owner or entity authorized by |
||||
|
the copyright owner that is granting the License. |
||||
|
|
||||
|
"Legal Entity" shall mean the union of the acting entity and all |
||||
|
other entities that control, are controlled by, or are under common |
||||
|
control with that entity. For the purposes of this definition, |
||||
|
"control" means (i) the power, direct or indirect, to cause the |
||||
|
direction or management of such entity, whether by contract or |
||||
|
otherwise, or (ii) ownership of fifty percent (50%) or more of the |
||||
|
outstanding shares, or (iii) beneficial ownership of such entity. |
||||
|
|
||||
|
"You" (or "Your") shall mean an individual or Legal Entity |
||||
|
exercising permissions granted by this License. |
||||
|
|
||||
|
"Source" form shall mean the preferred form for making modifications, |
||||
|
including but not limited to software source code, documentation |
||||
|
source, and configuration files. |
||||
|
|
||||
|
"Object" form shall mean any form resulting from mechanical |
||||
|
transformation or translation of a Source form, including but |
||||
|
not limited to compiled object code, generated documentation, |
||||
|
and conversions to other media types. |
||||
|
|
||||
|
"Work" shall mean the work of authorship, whether in Source or |
||||
|
Object form, made available under the License, as indicated by a |
||||
|
copyright notice that is included in or attached to the work |
||||
|
(an example is provided in the Appendix below). |
||||
|
|
||||
|
"Derivative Works" shall mean any work, whether in Source or Object |
||||
|
form, that is based on (or derived from) the Work and for which the |
||||
|
editorial revisions, annotations, elaborations, or other modifications |
||||
|
represent, as a whole, an original work of authorship. For the purposes |
||||
|
of this License, Derivative Works shall not include works that remain |
||||
|
separable from, or merely link (or bind by name) to the interfaces of, |
||||
|
the Work and Derivative Works thereof. |
||||
|
|
||||
|
"Contribution" shall mean any work of authorship, including |
||||
|
the original version of the Work and any modifications or additions |
||||
|
to that Work or Derivative Works thereof, that is intentionally |
||||
|
submitted to Licensor for inclusion in the Work by the copyright owner |
||||
|
or by an individual or Legal Entity authorized to submit on behalf of |
||||
|
the copyright owner. For the purposes of this definition, "submitted" |
||||
|
means any form of electronic, verbal, or written communication sent |
||||
|
to the Licensor or its representatives, including but not limited to |
||||
|
communication on electronic mailing lists, source code control systems, |
||||
|
and issue tracking systems that are managed by, or on behalf of, the |
||||
|
Licensor for the purpose of discussing and improving the Work, but |
||||
|
excluding communication that is conspicuously marked or otherwise |
||||
|
designated in writing by the copyright owner as "Not a Contribution." |
||||
|
|
||||
|
"Contributor" shall mean Licensor and any individual or Legal Entity |
||||
|
on behalf of whom a Contribution has been received by Licensor and |
||||
|
subsequently incorporated within the Work. |
||||
|
|
||||
|
2. Grant of Copyright License. Subject to the terms and conditions of |
||||
|
this License, each Contributor hereby grants to You a perpetual, |
||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable |
||||
|
copyright license to reproduce, prepare Derivative Works of, |
||||
|
publicly display, publicly perform, sublicense, and distribute the |
||||
|
Work and such Derivative Works in Source or Object form. |
||||
|
|
||||
|
3. Grant of Patent License. Subject to the terms and conditions of |
||||
|
this License, each Contributor hereby grants to You a perpetual, |
||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable |
||||
|
(except as stated in this section) patent license to make, have made, |
||||
|
use, offer to sell, sell, import, and otherwise transfer the Work, |
||||
|
where such license applies only to those patent claims licensable |
||||
|
by such Contributor that are necessarily infringed by their |
||||
|
Contribution(s) alone or by combination of their Contribution(s) |
||||
|
with the Work to which such Contribution(s) was submitted. If You |
||||
|
institute patent litigation against any entity (including a |
||||
|
cross-claim or counterclaim in a lawsuit) alleging that the Work |
||||
|
or a Contribution incorporated within the Work constitutes direct |
||||
|
or contributory patent infringement, then any patent licenses |
||||
|
granted to You under this License for that Work shall terminate |
||||
|
as of the date such litigation is filed. |
||||
|
|
||||
|
4. Redistribution. You may reproduce and distribute copies of the |
||||
|
Work or Derivative Works thereof in any medium, with or without |
||||
|
modifications, and in Source or Object form, provided that You |
||||
|
meet the following conditions: |
||||
|
|
||||
|
(a) You must give any other recipients of the Work or |
||||
|
Derivative Works a copy of this License; and |
||||
|
|
||||
|
(b) You must cause any modified files to carry prominent notices |
||||
|
stating that You changed the files; and |
||||
|
|
||||
|
(c) You must retain, in the Source form of any Derivative Works |
||||
|
that You distribute, all copyright, patent, trademark, and |
||||
|
attribution notices from the Source form of the Work, |
||||
|
excluding those notices that do not pertain to any part of |
||||
|
the Derivative Works; and |
||||
|
|
||||
|
(d) If the Work includes a "NOTICE" text file as part of its |
||||
|
distribution, then any Derivative Works that You distribute must |
||||
|
include a readable copy of the attribution notices contained |
||||
|
within such NOTICE file, excluding those notices that do not |
||||
|
pertain to any part of the Derivative Works, in at least one |
||||
|
of the following places: within a NOTICE text file distributed |
||||
|
as part of the Derivative Works; within the Source form or |
||||
|
documentation, if provided along with the Derivative Works; or, |
||||
|
within a display generated by the Derivative Works, if and |
||||
|
wherever such third-party notices normally appear. The contents |
||||
|
of the NOTICE file are for informational purposes only and |
||||
|
do not modify the License. You may add Your own attribution |
||||
|
notices within Derivative Works that You distribute, alongside |
||||
|
or as an addendum to the NOTICE text from the Work, provided |
||||
|
that such additional attribution notices cannot be construed |
||||
|
as modifying the License. |
||||
|
|
||||
|
You may add Your own copyright statement to Your modifications and |
||||
|
may provide additional or different license terms and conditions |
||||
|
for use, reproduction, or distribution of Your modifications, or |
||||
|
for any such Derivative Works as a whole, provided Your use, |
||||
|
reproduction, and distribution of the Work otherwise complies with |
||||
|
the conditions stated in this License. |
||||
|
|
||||
|
5. Submission of Contributions. Unless You explicitly state otherwise, |
||||
|
any Contribution intentionally submitted for inclusion in the Work |
||||
|
by You to the Licensor shall be under the terms and conditions of |
||||
|
this License, without any additional terms or conditions. |
||||
|
Notwithstanding the above, nothing herein shall supersede or modify |
||||
|
the terms of any separate license agreement you may have executed |
||||
|
with Licensor regarding such Contributions. |
||||
|
|
||||
|
6. Trademarks. This License does not grant permission to use the trade |
||||
|
names, trademarks, service marks, or product names of the Licensor, |
||||
|
except as required for reasonable and customary use in describing the |
||||
|
origin of the Work and reproducing the content of the NOTICE file. |
||||
|
|
||||
|
7. Disclaimer of Warranty. Unless required by applicable law or |
||||
|
agreed to in writing, Licensor provides the Work (and each |
||||
|
Contributor provides its Contributions) on an "AS IS" BASIS, |
||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or |
||||
|
implied, including, without limitation, any warranties or conditions |
||||
|
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A |
||||
|
PARTICULAR PURPOSE. You are solely responsible for determining the |
||||
|
appropriateness of using or redistributing the Work and assume any |
||||
|
risks associated with Your exercise of permissions under this License. |
||||
|
|
||||
|
8. Limitation of Liability. In no event and under no legal theory, |
||||
|
whether in tort (including negligence), contract, or otherwise, |
||||
|
unless required by applicable law (such as deliberate and grossly |
||||
|
negligent acts) or agreed to in writing, shall any Contributor be |
||||
|
liable to You for damages, including any direct, indirect, special, |
||||
|
incidental, or consequential damages of any character arising as a |
||||
|
result of this License or out of the use or inability to use the |
||||
|
Work (including but not limited to damages for loss of goodwill, |
||||
|
work stoppage, computer failure or malfunction, or any and all |
||||
|
other commercial damages or losses), even if such Contributor |
||||
|
has been advised of the possibility of such damages. |
||||
|
|
||||
|
9. Accepting Warranty or Additional Liability. While redistributing |
||||
|
the Work or Derivative Works thereof, You may choose to offer, |
||||
|
and charge a fee for, acceptance of support, warranty, indemnity, |
||||
|
or other liability obligations and/or rights consistent with this |
||||
|
License. However, in accepting such obligations, You may act only |
||||
|
on Your own behalf and on Your sole responsibility, not on behalf |
||||
|
of any other Contributor, and only if You agree to indemnify, |
||||
|
defend, and hold each Contributor harmless for any liability |
||||
|
incurred by, or claims asserted against, such Contributor by reason |
||||
|
of your accepting any such warranty or additional liability. |
||||
|
|
||||
|
END OF TERMS AND CONDITIONS |
||||
|
|
||||
|
APPENDIX: How to apply the Apache License to your work. |
||||
|
|
||||
|
To apply the Apache License to your work, attach the following |
||||
|
boilerplate notice, with the fields enclosed by brackets "[]" |
||||
|
replaced with your own identifying information. (Don't include |
||||
|
the brackets!) The text should be enclosed in the appropriate |
||||
|
comment syntax for the file format. We also recommend that a |
||||
|
file or class name and description of purpose be included on the |
||||
|
same "printed page" as the copyright notice for easier |
||||
|
identification within third-party archives. |
||||
|
|
||||
|
Copyright [yyyy] [name of copyright owner] |
||||
|
|
||||
|
Licensed under the Apache License, Version 2.0 (the "License"); |
||||
|
you may not use this file except in compliance with the License. |
||||
|
You may obtain a copy of the License at |
||||
|
|
||||
|
http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
|
||||
|
Unless required by applicable law or agreed to in writing, software |
||||
|
distributed under the License is distributed on an "AS IS" BASIS, |
||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
|
See the License for the specific language governing permissions and |
||||
|
limitations under the License. |
@ -0,0 +1,29 @@ |
|||||
|
# Developer convenience targets for the package; every R-based target shells
# out to devtools through Rscript.

# Package name as declared on the `Package:` line of DESCRIPTION.
PACKAGE := $(shell grep '^Package:' DESCRIPTION | sed -E 's/^Package:[[:space:]]+//')
# --no-init-file keeps a user's .Rprofile from influencing the run.
RSCRIPT = Rscript --no-init-file

all: install

test:
	${RSCRIPT} -e 'library(methods); devtools::test()'

doc:
	${RSCRIPT} -e "library(methods); devtools::document()"

install:
	${RSCRIPT} -e "library(methods); devtools::install()"

build:
	${RSCRIPT} -e "library(methods); devtools::build()"

# Same as check_all but with _R_CHECK_CRAN_INCOMING_ set to FALSE.
# $(MAKE) (rather than a literal `make`) re-invokes the same make binary and
# propagates command-line flags such as -n / -j to the sub-make.
check:
	_R_CHECK_CRAN_INCOMING_=FALSE $(MAKE) check_all

check_all:
	${RSCRIPT} -e "library(methods); devtools::check(cran=TRUE)"

# Re-knit the README, then strip trailing whitespace from the rendered file.
README.md: README.Rmd
	Rscript -e 'library(methods); devtools::load_all(); rmarkdown::render("README.Rmd", output_file="README.md")'
	sed -i.bak 's/[[:space:]]*$$//' $@
	rm -f $@.bak

# `build` is listed too so a file or directory named "build" can never mask it.
.PHONY: all test doc install build check check_all
@ -1,4 +1,15 @@ |
|||||
# Generated by roxygen2: do not edit by hand |
# Generated by roxygen2: do not edit by hand |
||||
|
|
||||
import(httr) |
export(classify_url) |
||||
importFrom(jsonlite,fromJSON) |
export(is_ad_image) |
||||
|
export(is_likely_archive) |
||||
|
export(is_likely_article) |
||||
|
export(is_likely_audio) |
||||
|
export(is_likely_binary_doc) |
||||
|
export(is_likely_executable) |
||||
|
export(is_likely_image) |
||||
|
export(is_likely_video) |
||||
|
export(is_web_scheme) |
||||
|
export(summarise_url) |
||||
|
import(cruxjars) |
||||
|
import(rJava) |
||||
|
@ -0,0 +1,105 @@ |
|||||
|
#' Classify a URL with or without making network calls
#'
#' @md
#' @param x URL to classify
#' @param resolve_redirects if `TRUE` resolve redirects such as when Facebook or
#'        Google show an interstitial page instead of redirecting the user to
#'        the actual URL.
#' @return `classify_url()` returns a one-row data frame (classed as `tbl_df`)
#'         holding the input `url` plus one logical column per classifier;
#'         each `is_*()` function returns the value of its single classifier.
#' @export
classify_url <- function(x, resolve_redirects = FALSE) {

  crux_url <- J("com.chimbori.crux.urls.CruxURL")$parse(x)

  # rJava calls Java methods with `$`. The Java-style `crux_url.resolveRedirects()`
  # is parsed by R as a call to a function literally named
  # `crux_url.resolveRedirects`, which does not exist.
  if (resolve_redirects) crux_url <- crux_url$resolveRedirects()

  data.frame(
    url = x,
    is_ad_image = crux_url$isAdImage(),
    is_web_scheme = crux_url$isWebScheme(),
    is_likely_article = crux_url$isLikelyArticle(),
    is_likely_video = crux_url$isLikelyVideo(),
    is_likely_audio = crux_url$isLikelyAudio(),
    is_likely_binary_doc = crux_url$isLikelyBinaryDocument(),
    is_likely_archive = crux_url$isLikelyArchive(),
    is_likely_executable = crux_url$isLikelyExecutable(),
    is_likely_image = crux_url$isLikelyImage(),
    stringsAsFactors = FALSE
  ) -> out

  # Tag as a tibble so it prints nicely when tibble is loaded, without
  # depending on the package.
  class(out) <- c("tbl_df", "tbl", "data.frame")

  out

}
||||
|
|
||||
|
#' @rdname classify_url
#' @export
is_ad_image <- function(x, resolve_redirects=FALSE) {
  crux_url <- J("com.chimbori.crux.urls.CruxURL")$parse(x)
  # Java methods are invoked with `$` in rJava; the dotted form would be looked
  # up as an R function named `crux_url.resolveRedirects` and fail.
  if (resolve_redirects) crux_url <- crux_url$resolveRedirects()
  crux_url$isAdImage()
}
||||
|
|
||||
|
#' @rdname classify_url
#' @export
is_web_scheme <- function(x, resolve_redirects=FALSE) {
  crux_url <- J("com.chimbori.crux.urls.CruxURL")$parse(x)
  # Java methods are invoked with `$` in rJava; the dotted form would be looked
  # up as an R function named `crux_url.resolveRedirects` and fail.
  if (resolve_redirects) crux_url <- crux_url$resolveRedirects()
  crux_url$isWebScheme()
}
||||
|
|
||||
|
#' @rdname classify_url
#' @export
is_likely_article <- function(x, resolve_redirects=FALSE) {
  crux_url <- J("com.chimbori.crux.urls.CruxURL")$parse(x)
  # Java methods are invoked with `$` in rJava; the dotted form would be looked
  # up as an R function named `crux_url.resolveRedirects` and fail.
  if (resolve_redirects) crux_url <- crux_url$resolveRedirects()
  crux_url$isLikelyArticle()
}
||||
|
|
||||
|
#' @rdname classify_url
#' @export
is_likely_video <- function(x, resolve_redirects=FALSE) {
  crux_url <- J("com.chimbori.crux.urls.CruxURL")$parse(x)
  # Java methods are invoked with `$` in rJava; the dotted form would be looked
  # up as an R function named `crux_url.resolveRedirects` and fail.
  if (resolve_redirects) crux_url <- crux_url$resolveRedirects()
  crux_url$isLikelyVideo()
}
||||
|
|
||||
|
#' @rdname classify_url
#' @export
is_likely_audio <- function(x, resolve_redirects=FALSE) {
  crux_url <- J("com.chimbori.crux.urls.CruxURL")$parse(x)
  # Java methods are invoked with `$` in rJava; the dotted form would be looked
  # up as an R function named `crux_url.resolveRedirects` and fail.
  if (resolve_redirects) crux_url <- crux_url$resolveRedirects()
  crux_url$isLikelyAudio()
}
||||
|
|
||||
|
#' @rdname classify_url
#' @export
is_likely_binary_doc <- function(x, resolve_redirects=FALSE) {
  crux_url <- J("com.chimbori.crux.urls.CruxURL")$parse(x)
  # Java methods are invoked with `$` in rJava; the dotted form would be looked
  # up as an R function named `crux_url.resolveRedirects` and fail.
  if (resolve_redirects) crux_url <- crux_url$resolveRedirects()
  crux_url$isLikelyBinaryDocument()
}
||||
|
|
||||
|
#' @rdname classify_url
#' @export
is_likely_archive <- function(x, resolve_redirects=FALSE) {
  crux_url <- J("com.chimbori.crux.urls.CruxURL")$parse(x)
  # Java methods are invoked with `$` in rJava; the dotted form would be looked
  # up as an R function named `crux_url.resolveRedirects` and fail.
  if (resolve_redirects) crux_url <- crux_url$resolveRedirects()
  crux_url$isLikelyArchive()
}
||||
|
|
||||
|
#' @rdname classify_url
#' @export
is_likely_executable <- function(x, resolve_redirects=FALSE) {
  crux_url <- J("com.chimbori.crux.urls.CruxURL")$parse(x)
  # Java methods are invoked with `$` in rJava; the dotted form would be looked
  # up as an R function named `crux_url.resolveRedirects` and fail.
  if (resolve_redirects) crux_url <- crux_url$resolveRedirects()
  crux_url$isLikelyExecutable()
}
||||
|
|
||||
|
#' @rdname classify_url
#' @export
is_likely_image <- function(x, resolve_redirects=FALSE) {
  crux_url <- J("com.chimbori.crux.urls.CruxURL")$parse(x)
  # Java methods are invoked with `$` in rJava; the dotted form would be looked
  # up as an R function named `crux_url.resolveRedirects` and fail.
  if (resolve_redirects) crux_url <- crux_url$resolveRedirects()
  crux_url$isLikelyImage()
}
@ -1,12 +1,19 @@ |
|||||
#' ... |
#' Identify the Crux of an Article |
||||
#' |
#' |
||||
|
#' Methods are provided to retrieve HTML content and return extracted |
||||
|
#' metadata and summarised plain text. Further methods are provided to classify |
||||
|
#' URLs with or without making network calls. |
||||
|
#' |
||||
|
#' Based on <https://github.com/chimbori/crux>. |
||||
|
#' |
||||
#' - URL: <https://gitlab.com/hrbrmstr/crux> |
#' - URL: <https://gitlab.com/hrbrmstr/crux> |
||||
#' - BugReports: <https://gitlab.com/hrbrmstr/crux/issues> |
#' - BugReports: <https://gitlab.com/hrbrmstr/crux/issues> |
||||
#' |
#' - Javadoc: <https://www.javadoc.io/doc/com.chimbori.crux/crux/2.0.2> |
||||
|
#' |
||||
#' @md |
#' @md |
||||
#' @name crux |
#' @name crux |
||||
|
#' @keywords internal |
||||
#' @docType package |
#' @docType package |
||||
#' @author Bob Rudis (bob@@rud.is) |
#' @author Bob Rudis (bob@@rud.is) |
||||
#' @import httr |
#' @import cruxjars rJava |
||||
#' @importFrom jsonlite fromJSON |
|
||||
NULL |
NULL |
||||
|
@ -0,0 +1,71 @@ |
|||||
|
# "Empty string" fallback operator: yields `b` when `a` carries no usable
# value, otherwise yields `a` unchanged.
#
# `a` counts as empty when it is a Java null reference / R NULL (per
# `is.jnull()`), has length zero, is `NA`, or is blank after trimming
# whitespace. Only the first element of `a` is inspected, so a longer vector
# cannot make the `if` condition error.
`%es%` <- function(a, b) {
  if (is.jnull(a)) return(b)
  if (length(a) == 0L) return(b)
  first <- trimws(a)[1L]
  # The NA test must come first: `NA == ""` is NA, which `if` rejects.
  if (is.na(first) || first == "") return(b)
  return(a)
}
||||
|
|
||||
|
#' Summarise the contents at a URL to essential bits
#'
#' Fetches the HTML from `x` and returns a named list with the following
#' elements:
#'
#' - `url`
#' - `original_url`
#' - `title`
#' - `description`
#' - `site_name`
#' - `theme_color`
#' - `amp_url`
#' - `canonical_url`
#' - `image_url`
#' - `video_url`
#' - `feed_url`
#' - `favicon_url`
#' - `reading_time`
#' - `text` (the plain text of the extracted document)
#'
#' If any components cannot be derived from the contents of the URL they will be `NA`.
#'
#' @md
#' @param x URL
#' @export
#' @examples
#' ex_url <- "https://techcrunch.com/2019/02/28/thailand-passes-controversial-cybersecurity-law/"
#' str(summarise_url(ex_url), 1)
summarise_url <- function(x) {

  # Fetch and parse the page with jsoup.
  page <- J("org.jsoup.Jsoup")$connect(x)$get()

  # Run the crux pipeline step by step: content, metadata, reading time.
  extractor <- J("com.chimbori.crux.articles.ArticleExtractor")$with(x, page)
  extractor <- extractor$extractContent()
  extractor <- extractor$extractMetadata()
  extractor <- extractor$estimateReadingTime()

  art <- extractor$article()

  body_text <- art$document$text()

  # Each field falls back to a typed NA when it is a Java null or blank.
  list(
    url = art$url %es% NA_character_,
    original_url = art$originalUrl %es% NA_character_,
    title = art$title %es% NA_character_,
    description = art$description %es% NA_character_,
    site_name = art$siteName %es% NA_character_,
    theme_color = art$themeColor %es% NA_character_,
    amp_url = art$ampUrl %es% NA_character_,
    canonical_url = art$canonicalUrl %es% NA_character_,
    image_url = art$imageUrl %es% NA_character_,
    video_url = art$videoUrl %es% NA_character_,
    feed_url = art$feedUrl %es% NA_character_,
    favicon_url = art$faviconUrl %es% NA_character_,
    reading_time = art$estimatedReadingTimeMinutes %es% NA_integer_,
    text = body_text %es% NA_character_
  )

}
||||
|
|
@ -1,2 +1,108 @@ |
|||||
|
|
||||
|
[![Travis-CI Build |
||||
|
Status](https://travis-ci.org/hrbrmstr/crux.svg?branch=master)](https://travis-ci.org/hrbrmstr/crux) |
||||
|
[![Coverage |
||||
|
Status](https://codecov.io/gh/hrbrmstr/crux/branch/master/graph/badge.svg)](https://codecov.io/gh/hrbrmstr/crux) |
||||
|
[![CRAN\_Status\_Badge](http://www.r-pkg.org/badges/version/crux)](https://cran.r-project.org/package=crux) |
||||
|
|
||||
# crux |
# crux |
||||
|
|
||||
|
Identify the Crux of an Article |
||||
|
|
||||
|
## Description |
||||
|
|
||||
|
Methods are provided to retrieve HTML content and return extracted |
||||
|
metadata and summarised plain text. Further methods are provided to |
||||
|
classify URLs with or without making network calls. Based on |
||||
|
<https://github.com/chimbori/crux>. |
||||
|
|
||||
|
## What’s Inside The Tin |
||||
|
|
||||
|
The following functions are implemented: |
||||
|
|
||||
|
- `classify_url`: Classify a URL with or without making network calls |
||||
|
- `is_ad_image`: Classify a URL with or without making network calls |
||||
|
- `is_likely_archive`: Classify a URL with or without making network |
||||
|
calls |
||||
|
- `is_likely_article`: Classify a URL with or without making network |
||||
|
calls |
||||
|
- `is_likely_audio`: Classify a URL with or without making network |
||||
|
calls |
||||
|
- `is_likely_binary_doc`: Classify a URL with or without making |
||||
|
network calls |
||||
|
- `is_likely_executable`: Classify a URL with or without making |
||||
|
network calls |
||||
|
- `is_likely_image`: Classify a URL with or without making network |
||||
|
calls |
||||
|
- `is_likely_video`: Classify a URL with or without making network |
||||
|
calls |
||||
|
- `is_web_scheme`: Classify a URL with or without making network calls |
||||
|
- `summarise_url`: Summarise the contents at a URL to essential bits |
||||
|
|
||||
|
## Installation |
||||
|
|
||||
|
``` r |
||||
|
install.packages(c("cruxjars", "crux"), repos = "https://cinc.rud.is/") |
||||
|
``` |
||||
|
|
||||
|
## Usage |
||||
|
|
||||
|
``` r |
||||
|
library(crux) |
||||
|
|
||||
|
# current version |
||||
|
packageVersion("crux") |
||||
|
## [1] '0.1.0' |
||||
|
``` |
||||
|
|
||||
|
``` r |
||||
|
str( |
||||
|
summarise_url("http://time.com/5541738/joe-biden-backtracks-pence-praise-criticism/"), 1 |
||||
|
) |
||||
|
## List of 14 |
||||
|
## $ url : chr "http://time.com/5541738/joe-biden-backtracks-pence-praise-criticism/" |
||||
|
## $ original_url : chr NA |
||||
|
## $ title : chr "Joe Biden Backtracks After Calling Pence 'a Decent Guy'" |
||||
|
## $ description : chr "The former Vice President took back his comment that Pence is \"a decent guy\" after fellow Democrats denounced his remarks" |
||||
|
## $ site_name : chr "Time" |
||||
|
## $ theme_color : chr NA |
||||
|
## $ amp_url : chr "http://amp.timeinc.net/time/5541738/joe-biden-backtracks-pence-praise-criticism" |
||||
|
## $ canonical_url: chr "http://time.com/5541738/joe-biden-backtracks-pence-praise-criticism/" |
||||
|
## $ image_url : chr "http://timedotcom.files.wordpress.com/2019/03/ap19059832629402.jpg?quality=85&crop=0px%2C111px%2C6000px%2C3000p"| __truncated__ |
||||
|
## $ video_url : chr NA |
||||
|
## $ feed_url : chr NA |
||||
|
## $ favicon_url : chr "http://time.com/img/favicons/favicon-192.png" |
||||
|
## $ reading_time : int 2 |
||||
|
## $ text : chr "(OMAHA, Neb.) — Former Vice President Joe Biden’s tendency to talk about his good relationships with Republican"| __truncated__ |
||||
|
``` |
||||
|
|
||||
|
``` r |
||||
|
str( |
||||
|
classify_url("https://www.washingtonpost.com/powerpost/house-democrats-explode-in-recriminations-as-liberals-lash-out-at-moderates/2019/02/28/c3d163fe-3b87-11e9-a06c-3ec8ed509d15_story.html") |
||||
|
) |
||||
|
## Classes 'tbl_df', 'tbl' and 'data.frame': 1 obs. of 10 variables: |
||||
|
## $ url : chr "https://www.washingtonpost.com/powerpost/house-democrats-explode-in-recriminations-as-liberals-lash-out-at-mode"| __truncated__ |
||||
|
## $ is_ad_image : logi FALSE |
||||
|
## $ is_web_scheme : logi TRUE |
||||
|
## $ is_likely_article : logi TRUE |
||||
|
## $ is_likely_video : logi FALSE |
||||
|
## $ is_likely_audio : logi FALSE |
||||
|
## $ is_likely_binary_doc: logi FALSE |
||||
|
## $ is_likely_archive : logi FALSE |
||||
|
## $ is_likely_executable: logi FALSE |
||||
|
## $ is_likely_image : logi FALSE |
||||
|
``` |
||||
|
|
||||
|
## crux Metrics |
||||
|
|
||||
|
| Lang | \# Files | (%) | LoC | (%) | Blank lines | (%) | \# Lines | (%) | |
||||
|
| :--- | -------: | ---: | --: | ---: | ----------: | ---: | -------: | ---: | |
||||
|
| R | 5 | 0.71 | 104 | 0.76 | 28 | 0.49 | 71 | 0.61 | |
||||
|
| make | 1 | 0.14 | 20 | 0.15 | 9 | 0.16 | 0 | 0.00 | |
||||
|
| Rmd | 1 | 0.14 | 12 | 0.09 | 20 | 0.35 | 46 | 0.39 | |
||||
|
|
||||
|
## Code of Conduct |
||||
|
|
||||
|
Please note that this project is released with a [Contributor Code of |
||||
|
Conduct](CONDUCT.md). By participating in this project you agree to |
||||
|
abide by its terms. |
||||
|
@ -0,0 +1,45 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/classify.R |
||||
|
\name{classify_url} |
||||
|
\alias{classify_url} |
||||
|
\alias{is_ad_image} |
||||
|
\alias{is_web_scheme} |
||||
|
\alias{is_likely_article} |
||||
|
\alias{is_likely_video} |
||||
|
\alias{is_likely_audio} |
||||
|
\alias{is_likely_binary_doc} |
||||
|
\alias{is_likely_archive} |
||||
|
\alias{is_likely_executable} |
||||
|
\alias{is_likely_image} |
||||
|
\title{Classify a URL with or without making network calls} |
||||
|
\usage{ |
||||
|
classify_url(x, resolve_redirects = FALSE) |
||||
|
|
||||
|
is_ad_image(x, resolve_redirects = FALSE) |
||||
|
|
||||
|
is_web_scheme(x, resolve_redirects = FALSE) |
||||
|
|
||||
|
is_likely_article(x, resolve_redirects = FALSE) |
||||
|
|
||||
|
is_likely_video(x, resolve_redirects = FALSE) |
||||
|
|
||||
|
is_likely_audio(x, resolve_redirects = FALSE) |
||||
|
|
||||
|
is_likely_binary_doc(x, resolve_redirects = FALSE) |
||||
|
|
||||
|
is_likely_archive(x, resolve_redirects = FALSE) |
||||
|
|
||||
|
is_likely_executable(x, resolve_redirects = FALSE) |
||||
|
|
||||
|
is_likely_image(x, resolve_redirects = FALSE) |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{x}{URL to classify} |
||||
|
|
||||
|
\item{resolve_redirects}{if \code{TRUE} resolve redirects such as when Facebook or |
||||
|
Google show an interstitial page instead of redirecting the user to |
||||
|
the actual URL.} |
||||
|
} |
||||
|
\description{ |
||||
|
Classify a URL with or without making network calls |
||||
|
} |
@ -0,0 +1,36 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/summarise.R |
||||
|
\name{summarise_url} |
||||
|
\alias{summarise_url} |
||||
|
\title{Summarise the contents at a URL to essential bits} |
||||
|
\usage{ |
||||
|
summarise_url(x) |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{x}{URL} |
||||
|
} |
||||
|
\description{ |
||||
|
Fetches the HTML from \code{x} and returns the essential components |
||||
|
including: |
||||
|
\itemize{ |
||||
|
\item \code{url} |
||||
|
\item \code{original_url} |
||||
|
\item \code{title} |
||||
|
\item \code{description} |
||||
|
\item \code{site_name} |
||||
|
\item \code{theme_color} |
||||
|
\item \code{amp_url} |
||||
|
\item \code{canonical_url} |
||||
|
\item \code{image_url} |
||||
|
\item \code{video_url} |
||||
|
\item \code{feed_url} |
||||
|
\item \code{favicon_url} |
||||
|
\item \code{reading_time} |
||||
|
\item \code{text} (the reduced, plain text) |
||||
|
If any components cannot be derived from the contents of the URL they will be \code{NA}. |
||||
|
} |
||||
|
} |
||||
|
\examples{ |
||||
|
ex_url <- "https://techcrunch.com/2019/02/28/thailand-passes-controversial-cybersecurity-law/" |
||||
|
str(summarise_url(ex_url), 1) |
||||
|
} |
Loading…
Reference in new issue