Compare commits

...

3 Commits

  1. .Rbuildignore (1 line changed)
  2. CRAN-RELEASE (2 lines changed)
  3. DESCRIPTION (16 lines changed)
  4. NEWS.md (6 lines changed)
  5. R/can-fetch.r (3 lines changed)
  6. R/crawl-delay.r (2 lines changed)
  7. R/robxp.r (8 lines changed)
  8. README.md (51 lines changed)
  9. cran-comments.md (12 lines changed)
  10. inst/tinytest/test_spiderbar.R (29 lines changed)
  11. man/can_fetch.Rd (5 lines changed)
  12. man/crawl_delays.Rd (8 lines changed)
  13. man/robxp.Rd (8 lines changed)
  14. man/sitemaps.Rd (2 lines changed)
  15. man/spiderbar.Rd (1 line changed)
  16. tests/test-all.R (3 lines changed)
  17. tests/testthat/test-spiderbar.R (31 lines changed)
  18. tests/tinytest.R (5 lines changed)

.Rbuildignore (1 line changed)

@@ -13,3 +13,4 @@
^appveyor\.yml$
^codecov\.yml$
^cran-comments\.md$
^CRAN-RELEASE$

CRAN-RELEASE (2 lines changed)

@@ -0,0 +1,2 @@
This package was submitted to CRAN on 2020-05-29.
Once it is accepted, delete this file and tag the release (commit fb27ce1dec).

DESCRIPTION (16 lines changed)

@@ -1,8 +1,8 @@
Package: spiderbar
Type: Package
Title: Parse and Test Robots Exclusion Protocol Files and Rules
Version: 0.2.2
Date: 2019-08-18
Version: 0.2.3
Date: 2020-05-29
Author: Bob Rudis (bob@rud.is) [aut, cre], SEOmoz, Inc [aut]
Maintainer: Bob Rudis <bob@rud.is>
Description: The 'Robots Exclusion Protocol' <https://www.robotstxt.org/orig.html> documents
@@ -14,14 +14,14 @@ NeedsCompilation: yes
URL: https://gitlab.com/hrbrmstr/spiderbar
BugReports: https://gitlab.com/hrbrmstr/spiderbar/issues
License: MIT + file LICENSE
Suggests:
testthat,
Suggests:
covr,
robotstxt
Depends:
robotstxt,
tinytest
Depends:
R (>= 3.2.0)
Encoding: UTF-8
Imports:
Imports:
Rcpp
RoxygenNote: 6.1.1
RoxygenNote: 7.1.0
LinkingTo: Rcpp

NEWS.md (6 lines changed)

@@ -1,3 +1,9 @@
0.2.3
* fix by Peter Meissner for fetching case
* custom print method now returns the object
* fixed spelling
* ensured there's a roxygen return for every function
0.2.0
* Added crawl delay extraction
* Made all examples local so CRAN doesn't have to hit actual websites

R/can-fetch.r (3 lines changed)

@@ -9,6 +9,7 @@
#' @param path path to test
#' @param user_agent user agent to test
#' @export
#' @return logical vector indicating whether you have permission to fetch the content
#' @examples
#' gh <- paste0(readLines(system.file("extdata", "github-robots.txt",
#' package="spiderbar")), collapse="\n")
@@ -22,7 +23,7 @@
can_fetch <- function(obj, path = "/", user_agent = "*") {
if (inherits(obj, "robxp")) {
vapply(path, rep_path_allowed, logical(1), x=obj, agent=user_agent, USE.NAMES=FALSE)
vapply(path, rep_path_allowed, logical(1), xp=obj, agent=user_agent, USE.NAMES=FALSE)
} else {
return(NULL)
}
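The one-character fix above renames the keyword argument in the `vapply()` call from `x=` to `xp=`, presumably to match the name of the corresponding formal of the internal `rep_path_allowed()` worker (this is the "fix by Peter Meissner for fetching case" item in NEWS.md). A usage sketch (not part of the diff) of the vectorised behaviour this call preserves, reusing the bundled `github-robots.txt` file referenced elsewhere in this diff:

``` r
# Usage sketch: can_fetch() vapply()s over `path`, so a character vector of
# paths yields one logical per path, and a non-robxp first argument returns NULL.
library(spiderbar)

gh <- paste0(readLines(system.file("extdata", "github-robots.txt",
                                   package = "spiderbar")), collapse = "\n")
gh_rt <- robxp(gh)

can_fetch(gh_rt, "/humans.txt")                    # length-1 logical
can_fetch(gh_rt, c("/humans.txt", "/login"), "*")  # one logical per path
can_fetch("not a robxp object", "/humans.txt")     # NULL (non-robxp input)
```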

R/crawl-delay.r (2 lines changed)

@@ -1,4 +1,4 @@
#' Retrive all agent crawl delay values in a `robxp` `robots.txt` object
#' Retrieve all agent crawl delay values in a `robxp` `robots.txt` object
#'
#' @md
#' @param obj `robxp` object

R/robxp.r (8 lines changed)

@@ -8,10 +8,11 @@
#' a `connection` object that will be passed to [readLines()], the result of which
#' will be concatenated into a single string and parsed and the connection will be closed.
#' @export
#' @return a classed object holding an external pointer to parsed robots.txt data
#' @examples
#' imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt",
#' package="spiderbar")), collapse="\n")
#' rt <- robxp(imdb)
# imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt",
# package="spiderbar")), collapse="\n")
# rt <- robxp(imdb)
robxp <- function(x) {
if (inherits(x, "connection")) {
@@ -38,4 +39,5 @@ robxp <- function(x) {
#' @export
print.robxp <- function(x, ...) {
cat("<Robots Exclusion Protocol Object>")
invisible(x)
}
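The added `invisible(x)` makes the custom print method follow the usual R convention of returning its input invisibly (the "custom print method now returns the object" item in NEWS.md), so printing a `robxp` no longer discards the object when the print call feeds an assignment or a pipeline. A minimal sketch of the convention, using a made-up class rather than the package's own code:

``` r
# Minimal sketch (hypothetical "demo" class, not spiderbar's code) of why a
# print method should end with invisible(x): the printed object keeps flowing
# through assignments and pipelines.
print.demo <- function(x, ...) {
  cat("<demo object>\n")
  invisible(x)            # hand the input back invisibly instead of returning NULL
}

obj <- structure(list(), class = "demo")
y <- print(obj)           # prints "<demo object>" once
identical(y, obj)         # TRUE: print() returned the object
```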

README.md (51 lines changed)

@@ -5,7 +5,7 @@ developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.re
[![Signed
by](https://img.shields.io/badge/Keybase-Verified-brightgreen.svg)](https://keybase.io/hrbrmstr)
![Signed commit
%](https://img.shields.io/badge/Signed_Commits-100%25-lightgrey.svg)
%](https://img.shields.io/badge/Signed_Commits-89%25-lightgrey.svg)
[![Linux build
Status](https://travis-ci.org/hrbrmstr/spiderbar.svg?branch=master)](https://travis-ci.org/hrbrmstr/spiderbar)
[![Windows build
@@ -40,7 +40,7 @@ processing these ‘robots.txt’ files.
The following functions are implemented:
- `can_fetch`: Test URL paths against a robxp robots.txt object
- `crawl_delays`: Retrive all agent crawl delay values in a robxp
- `crawl_delays`: Retrieve all agent crawl delay values in a robxp
robots.txt object
- `print.robxp`: Custom printer for ’robxp“ objects
- `robxp`: Parse a ‘robots.txt’ file & create a ‘robxp’ object
@@ -50,7 +50,7 @@ The following functions are implemented:
## Installation
``` r
install.packages("spiderbar", repos = "https://cinc.rud.is")
install.packages("spiderbar", repos = c("https://cinc.rud.is", "https://cloud.r-project.org/"))
# or
remotes::install_git("https://git.rud.is/hrbrmstr/spiderbar.git")
# or
@@ -74,7 +74,7 @@ library(robotstxt)
# current verison
packageVersion("spiderbar")
## [1] '0.2.2'
## [1] '0.2.3'
# use helpers from the robotstxt package
@@ -99,46 +99,23 @@ can_fetch(gh_rt, "/humans.txt", "*") # TRUE
## [1] TRUE
can_fetch(gh_rt, "/login", "*") # FALSE
## [1] FALSE
## [1] TRUE
can_fetch(gh_rt, "/oembed", "CCBot") # FALSE
## [1] FALSE
## [1] TRUE
can_fetch(gh_rt, c("/humans.txt", "/login", "/oembed"))
## [1] TRUE FALSE FALSE
## [1] TRUE TRUE TRUE
crawl_delays(gh_rt)
```
<div class="kable-table">
| agent | crawl\_delay |
| :---------------- | -----------: |
| yandex | \-1 |
| twitterbot | \-1 |
| ccbot | \-1 |
| mail.ru\_bot | \-1 |
| telefonica | \-1 |
| slurp | \-1 |
| seznambot | \-1 |
| sanddollar | \-1 |
| coccoc | \-1 |
| ia\_archiver | \-1 |
| swiftbot | \-1 |
| red-app-gsa-p-one | \-1 |
| naverbot | \-1 |
| msnbot | \-1 |
| teoma | \-1 |
| \* | \-1 |
| intuitgsacrawler | \-1 |
| bingbot | \-1 |
| daumoa | \-1 |
| googlebot | \-1 |
| httrack | \-1 |
| duckduckbot | \-1 |
| etaospider | \-1 |
| rogerbot | \-1 |
| dotbot | \-1 |
| agent | crawl\_delay |
| :---- | -----------: |
| baidu | 1 |
| \* | \-1 |
</div>
@@ -167,9 +144,9 @@ sitemaps(imdb_rt)
| Lang | \# Files | (%) | LoC | (%) | Blank lines | (%) | \# Lines | (%) |
| :----------- | -------: | ---: | ---: | ---: | ----------: | ---: | -------: | ---: |
| C++ | 9 | 0.38 | 1763 | 0.78 | 257 | 0.55 | 258 | 0.38 |
| C/C++ Header | 7 | 0.29 | 395 | 0.18 | 152 | 0.33 | 280 | 0.42 |
| R | 7 | 0.29 | 68 | 0.03 | 26 | 0.06 | 101 | 0.15 |
| C++ | 9 | 0.39 | 1763 | 0.79 | 257 | 0.56 | 258 | 0.38 |
| C/C++ Header | 7 | 0.30 | 395 | 0.18 | 152 | 0.33 | 280 | 0.42 |
| R | 6 | 0.26 | 47 | 0.02 | 18 | 0.04 | 101 | 0.15 |
| Rmd | 1 | 0.04 | 23 | 0.01 | 31 | 0.07 | 33 | 0.05 |
## Code of Conduct

cran-comments.md (12 lines changed)

@@ -0,0 +1,12 @@
## Test environments
* local R installation, R 4.0.1
* ubuntu 16.04 (on travis-ci), R 4.0.1
* win-builder (devel)
## R CMD check results
0 errors | 0 warnings | 1 note
* This is a update release to fix a parsing edge case
* Ensured all functions have a @return block
* Fixed spelling mistakes

inst/tinytest/test_spiderbar.R (29 lines changed)

@@ -0,0 +1,29 @@
library(spiderbar)
cdc <- paste0(readLines(system.file("extdata", "cdc-robots.txt", package="spiderbar")), collapse="\n")
rt1 <- robxp(cdc)
expect_true(inherits(rt1, "robxp"))
expect_true(can_fetch(rt1, "/asthma/asthma_stats/default.htm", "*"))
expect_false(can_fetch(rt1, "/_borders", "*"))
imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt", package="spiderbar")), collapse="\n")
rt2 <- robxp(imdb)
cd <- crawl_delays(rt2)
expect_true(inherits(cd, "data.frame"))
expect_equal(sort(cd$crawl_delay), sort(c(0.1, 3.0, -1.0)))
imdb <- readLines(system.file("extdata", "imdb-robots.txt", package="spiderbar"))
rt2 <- robxp(imdb)
gh <- paste0(readLines(system.file("extdata", "github-robots.txt", package="spiderbar")), collapse="\n")
rt3 <- robxp(gh)
rt3 <- robxp(file(system.file("extdata", "github-robots.txt", package="spiderbar")))
expect_equal(sitemaps(rt1), "http://www.cdc.gov/niosh/sitemaps/sitemapsNIOSH.xml")
expect_equal(sitemaps(rt2), "http://www.imdb.com/sitemap_US_index.xml.gz")
expect_equal(sitemaps(rt3), character(0))
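This new tinytest file mirrors the testthat suite that is deleted later in this diff (`tests/testthat/test-spiderbar.R`); the assertions translate roughly as in the sketch below (an illustrative mapping, not code from the commit, with a made-up robots.txt string):

``` r
# Rough testthat -> tinytest translation used by the migration above:
#   expect_that(x, is_a("robxp"))   ->  expect_true(inherits(x, "robxp"))
#   expect_that(y, equals(TRUE))    ->  expect_true(y)
#   expect_that(y, equals(FALSE))   ->  expect_false(y)
#   expect_equal(a, b)              ->  expect_equal(a, b)   # unchanged
library(tinytest)
library(spiderbar)

rt <- robxp("User-agent: *\nDisallow: /private/")
expect_true(inherits(rt, "robxp"))
expect_false(can_fetch(rt, "/private/index.html", "*"))
```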

man/can_fetch.Rd (5 lines changed)

@@ -13,6 +13,9 @@ can_fetch(obj, path = "/", user_agent = "*")
\item{user_agent}{user agent to test}
}
\value{
logical vector indicating whether you have permission to fetch the content
}
\description{
Provide a character vector of URL paths plus optional user agent and this function will
return a logical vector indicating whether you have permission to fetch the content
@@ -20,7 +23,7 @@ at the respective path.
}
\examples{
gh <- paste0(readLines(system.file("extdata", "github-robots.txt",
package="spiderbar")), collapse="\\n")
package="spiderbar")), collapse="\n")
gh_rt <- robxp(gh)
can_fetch(gh_rt, "/humans.txt", "*") # TRUE

man/crawl_delays.Rd (8 lines changed)

@@ -2,7 +2,7 @@
% Please edit documentation in R/crawl-delay.r
\name{crawl_delays}
\alias{crawl_delays}
\title{Retrive all agent crawl delay values in a \code{robxp} \code{robots.txt} object}
\title{Retrieve all agent crawl delay values in a \code{robxp} \code{robots.txt} object}
\usage{
crawl_delays(obj)
}
@@ -13,19 +13,19 @@ crawl_delays(obj)
data frame of agents and their crawl delays
}
\description{
Retrive all agent crawl delay values in a \code{robxp} \code{robots.txt} object
Retrieve all agent crawl delay values in a \code{robxp} \code{robots.txt} object
}
\note{
\code{-1} will be returned for any listed agent \emph{without} a crawl delay setting
}
\examples{
gh <- paste0(readLines(system.file("extdata", "github-robots.txt",
package="spiderbar")), collapse="\\n")
package="spiderbar")), collapse="\n")
gh_rt <- robxp(gh)
crawl_delays(gh_rt)
imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt",
package="spiderbar")), collapse="\\n")
package="spiderbar")), collapse="\n")
imdb_rt <- robxp(imdb)
crawl_delays(imdb_rt)
}

man/robxp.Rd (8 lines changed)

@@ -12,12 +12,10 @@ _or_ a length >1 character vector that will concatenated into a single string _o
a `connection` object that will be passed to [readLines()], the result of which
will be concatenated into a single string and parsed and the connection will be closed.}
}
\value{
a classed object holding an external pointer to parsed robots.txt data
}
\description{
This function takes in a single element character vector and parses it into
a `robxp` object.
}
\examples{
imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt",
package="spiderbar")), collapse="\\n")
rt <- robxp(imdb)
}

man/sitemaps.Rd (2 lines changed)

@@ -17,7 +17,7 @@ Retrieve a character vector of sitemaps from a parsed robots.txt object
}
\examples{
imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt",
package="rep")), collapse="\\n")
package="rep")), collapse="\n")
rt <- robxp(imdb)
sitemaps(rt)
}

man/spiderbar.Rd (1 line changed)

@@ -3,7 +3,6 @@
\docType{package}
\name{spiderbar}
\alias{spiderbar}
\alias{spiderbar-package}
\title{Parse and Test Robots Exclusion Protocol Files and Rules}
\description{
The 'Robots Exclusion Protocol' (\url{https://www.robotstxt.org/orig.html}) documents a set

tests/test-all.R (3 lines changed)

@@ -1,3 +0,0 @@
library(testthat)
library(robotstxt)
test_check("spiderbar")

tests/testthat/test-spiderbar.R (31 lines changed)

@@ -1,31 +0,0 @@
context("basic functionality")
test_that("parsing and fetch testing and sitemaps work", {
cdc <- paste0(readLines(system.file("extdata", "cdc-robots.txt", package="spiderbar")), collapse="\n")
rt1 <- robxp(cdc)
expect_that(rt1, is_a("robxp"))
expect_that(can_fetch(rt1, "/asthma/asthma_stats/default.htm", "*"), equals(TRUE))
expect_that(can_fetch(rt1, "/_borders", "*"), equals(FALSE))
imdb <- paste0(readLines(system.file("extdata", "imdb-robots.txt", package="spiderbar")), collapse="\n")
rt2 <- robxp(imdb)
cd <- crawl_delays(rt2)
expect_that(cd, is_a("data.frame"))
expect_equal(sort(cd$crawl_delay), sort(c(0.1, 3.0, -1.0)))
imdb <- readLines(system.file("extdata", "imdb-robots.txt", package="spiderbar"))
rt2 <- robxp(imdb)
gh <- paste0(readLines(system.file("extdata", "github-robots.txt", package="spiderbar")), collapse="\n")
rt3 <- robxp(gh)
rt3 <- robxp(file(system.file("extdata", "github-robots.txt", package="spiderbar")))
expect_equal(sitemaps(rt1), "http://www.cdc.gov/niosh/sitemaps/sitemapsNIOSH.xml")
expect_equal(sitemaps(rt2), "http://www.imdb.com/sitemap_US_index.xml.gz")
expect_equal(sitemaps(rt3), character(0))
})

tests/tinytest.R (5 lines changed)

@@ -0,0 +1,5 @@
if ( requireNamespace("tinytest", quietly=TRUE) ){
tinytest::test_package("spiderbar")
}
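This small runner is what `R CMD check` executes from `tests/`; it simply delegates to tinytest against the installed package. During development the same suite can also be run from a source checkout, as in this sketch (assuming the standard tinytest layout shown in this diff):

``` r
# Sketch: two ways to run the suite added in this change set.
if (requireNamespace("tinytest", quietly = TRUE)) {
  # what tests/tinytest.R does, against the installed package:
  tinytest::test_package("spiderbar")

  # or directly from a source checkout during development:
  # tinytest::run_test_dir("inst/tinytest")
}
```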