From 693279a385b9a7bce101a5be29aeacb88f0dccc1 Mon Sep 17 00:00:00 2001 From: hrbrmstr Date: Fri, 1 Mar 2019 12:45:56 -0500 Subject: [PATCH] README --- DESCRIPTION | 12 ++-- README.Rmd | 18 ++---- README.md | 75 ++++++++++++---------- java/jericho/src/main/java/is/rud/jericho/App.java | 2 + 4 files changed, 57 insertions(+), 50 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ef8b189..a51974f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: jericho Type: Package Title: Break Down the Walls of 'HTML' Tags into Usable Text Version: 0.2.0 -Date: 2017-09-04 +Date: 2019-03-01 Authors@R: c( person("Bob", "Rudis", role = c("aut", "cre"), email = "bob@rud.is"), person("Martin", "Jericho", role = c("ctb"), comment = "Jericho HTML Parser") @@ -15,9 +15,10 @@ Description: Structured 'HTML' content can be useful when you need to parse data are provied that wrap methods in the 'Jericho HTML Parser' Java library by Martin Jericho . Martin's library is used in many at-scale projects, icluding the 'The Internet Archive'. -URL: https://github.com/hrbrmstr/jericho -BugReports: https://github.com/hrbrmstr/jericho/issues +URL: https://gitlab.com/hrbrmstr/jericho +BugReports: https://gitlab.com/hrbrmstr/jericho/issues License: Apache License 2.0 | file LICENSE +Encoding: UTF-8 Suggests: testthat, covr @@ -25,5 +26,6 @@ Depends: R (>= 3.2.0), rJava, jerichojars -RoxygenNote: 6.0.1 -Remotes: hrbrmstr/jerichojars +RoxygenNote: 6.1.1 +Remotes: + url::https://git.sr.ht/~hrbrmstr/jerichojars diff --git a/README.Rmd b/README.Rmd index 35381f5..cd6a644 100644 --- a/README.Rmd +++ b/README.Rmd @@ -22,12 +22,11 @@ The following functions are implemented: If you do use `devtools`, then it *should* pickup the `Remotes:` section in `DESCRIPTION`. 
Until the package is on CRAN, you might want to also invoke the installation of `jerichojars` as shown below: ```{r eval=FALSE} -devtools::install_github("hrbrmstr/jerichojars") -devtools::install_github("hrbrmstr/jericho") +install.packages(c("jerichojars", "jericho"), repos = "https://cinc.rud.is/") ``` ```{r message=FALSE, warning=FALSE, error=FALSE, include=FALSE} -options(width=120) +options(width = 120) ``` ### Usage @@ -42,7 +41,7 @@ packageVersion("jericho") URL <- "https://blogs.nasa.gov/spacestation/2017/09/02/touchdown-expedition-52-back-on-earth/" -doc <- paste0(readr::read_lines(URL), collapse="\n") +doc <- paste0(readr::read_lines(URL), collapse = "\n") ``` This is pure text extraction: @@ -59,14 +58,9 @@ render_html_to_text(doc) You should run each to see and compare the output (GitHub markdown documents aren't the best viewing medium). -### Test Results +### `jericho` Metrics -```{r message=FALSE, warning=FALSE, error=FALSE} -library(jericho) -library(testthat) - -date() - -test_dir("tests/") +```{r cloc, echo=FALSE} +cloc::cloc_pkg_md() ``` diff --git a/README.md b/README.md index d1d37c1..839e4c0 100644 --- a/README.md +++ b/README.md @@ -1,29 +1,43 @@ -[![Build Status](https://travis-ci.org/hrbrmstr/jericho.svg?branch=master)](https://travis-ci.org/hrbrmstr/jericho) [![Build status](https://ci.appveyor.com/api/projects/status/nosmgh0b2wthjjf3/branch/master?svg=true)](https://ci.appveyor.com/project/hrbrmstr/jericho/branch/master) [![codecov](https://codecov.io/gh/hrbrmstr/jericho/branch/master/graph/badge.svg)](https://codecov.io/gh/hrbrmstr/jericho) - -`jericho` : Break Down the Walls of 'HTML' Tags into Usable Text - -Structured 'HTML' content can be useful when you need to parse data tables or other tagged data from within a document. However, it is also useful to obtain "just the text" from a document free from the walls of tags that surround it. Tools are provied that wrap methods in the 'Jericho HTML Parser' Java library by Martin Jericho . 
Martin's library is used in many at-scale projects, icluding the 'The Internet Archive'. +[![Build +Status](https://travis-ci.org/hrbrmstr/jericho.svg?branch=master)](https://travis-ci.org/hrbrmstr/jericho) +[![Build +status](https://ci.appveyor.com/api/projects/status/nosmgh0b2wthjjf3/branch/master?svg=true)](https://ci.appveyor.com/project/hrbrmstr/jericho/branch/master) +[![codecov](https://codecov.io/gh/hrbrmstr/jericho/branch/master/graph/badge.svg)](https://codecov.io/gh/hrbrmstr/jericho) + +`jericho` : Break Down the Walls of ‘HTML’ Tags into Usable Text + +Structured ‘HTML’ content can be useful when you need to parse data +tables or other tagged data from within a document. However, it is also +useful to obtain “just the text” from a document free from the walls of +tags that surround it. Tools are provided that wrap methods in the +‘Jericho HTML Parser’ Java library by Martin Jericho +. Martin’s library is +used in many at-scale projects, including ‘The Internet Archive’. As a result of using a Java library, this package requires `rJava`. The following functions are implemented: -- `html_to_text`: Convert HTML to Text -- `render_html_to_text`: Render HTML to Text + - `html_to_text`: Convert HTML to Text + - `render_html_to_text`: Render HTML to Text ### Installation -If you do use `devtools`, then it *should* pickup the `Remotes:` section in `DESCRIPTION`. Until the package is on CRAN, you might want to also invoke the installation of `jerichojars` as shown below: +If you do use `devtools`, then it *should* pick up the `Remotes:` section +in `DESCRIPTION`. 
Until the package is on CRAN, you might want to also +invoke the installation of `jerichojars` as shown +below: ``` r -devtools::install_github("hrbrmstr/jerichojars") -devtools::install_github("hrbrmstr/jericho") +install.packages(c("jerichojars", "jericho"), repos = "https://cinc.rud.is/") ``` ### Usage -Let's use [this NASA blog post](https://blogs.nasa.gov/spacestation/2017/09/02/touchdown-expedition-52-back-on-earth/) as an example. +Let’s use [this NASA blog +post](https://blogs.nasa.gov/spacestation/2017/09/02/touchdown-expedition-52-back-on-earth/) +as an example. ``` r library(jericho) @@ -32,12 +46,12 @@ library(jericho) packageVersion("jericho") ``` - ## [1] '0.1.0' + ## [1] '0.2.0' ``` r URL <- "https://blogs.nasa.gov/spacestation/2017/09/02/touchdown-expedition-52-back-on-earth/" -doc <- paste0(readr::read_lines(URL), collapse="\n") +doc <- paste0(readr::read_lines(URL), collapse = "\n") ``` This is pure text extraction: @@ -46,30 +60,25 @@ This is pure text extraction: html_to_text(doc) ``` -This provides a human readable version of the segment content that is modelled on the way Mozilla Thunderbird and other email clients provide an automatic conversion of HTML content to text in their alternative MIME encoding of emails. +This provides a human readable version of the segment content that is +modelled on the way Mozilla Thunderbird and other email clients provide +an automatic conversion of HTML content to text in their alternative +MIME encoding of emails. ``` r render_html_to_text(doc) ``` -You should run each to see and compare the output (GitHub markdown documents aren't the best viewing medium). - -### Test Results - -``` r -library(jericho) -library(testthat) +You should run each to see and compare the output (GitHub markdown +documents aren’t the best viewing +medium). 
-date() -``` - - ## [1] "Mon Sep 4 21:33:17 2017" - -``` r -test_dir("tests/") -``` +### `jericho` Metrics - ## testthat results ======================================================================================================== - ## OK: 6 SKIPPED: 0 FAILED: 0 - ## - ## DONE =================================================================================================================== +| Lang | \# Files | (%) | LoC | (%) | Blank lines | (%) | \# Lines | (%) | +| :---- | -------: | ---: | --: | ---: | ----------: | ---: | -------: | ---: | +| Java | 2 | 0.18 | 49 | 0.38 | 9 | 0.19 | 14 | 0.13 | +| R | 6 | 0.55 | 40 | 0.31 | 10 | 0.21 | 62 | 0.56 | +| Maven | 1 | 0.09 | 23 | 0.18 | 1 | 0.02 | 1 | 0.01 | +| Rmd | 1 | 0.09 | 9 | 0.07 | 24 | 0.50 | 33 | 0.30 | +| make | 1 | 0.09 | 8 | 0.06 | 4 | 0.08 | 0 | 0.00 | diff --git a/java/jericho/src/main/java/is/rud/jericho/App.java b/java/jericho/src/main/java/is/rud/jericho/App.java index d1969a6..1b948e6 100644 --- a/java/jericho/src/main/java/is/rud/jericho/App.java +++ b/java/jericho/src/main/java/is/rud/jericho/App.java @@ -6,6 +6,7 @@ import java.io.*; import java.net.*; public class App { + public static String html_to_text(String x, boolean include_attributes) throws Exception { try { Source source = new Source(x); @@ -17,6 +18,7 @@ public class App { return(""); } } + public static String render_html_to_text(String x) throws Exception { try { Source source = new Source(x);