From 878bb7f0455c1afdd2a9ef213da5d630f2c3b797 Mon Sep 17 00:00:00 2001 From: Bob Rudis Date: Mon, 14 Aug 2017 15:00:27 -0400 Subject: [PATCH] initial commit --- .Rbuildignore | 11 + .codecov.yml | 1 + .gitignore | 8 + .travis.yml | 31 ++ CONDUCT.md | 25 ++ DESCRIPTION | 27 ++ LICENSE | 2 + NAMESPACE | 7 + NEWS.md | 2 + R/RcppExports.R | 19 + R/rep-package.R | 14 + R/rep.r | 47 +++ README.Rmd | 58 +++ README.md | 74 ++++ man/can_fetch.Rd | 23 ++ man/print.robxp.Rd | 16 + man/rep.Rd | 16 + man/robxp.Rd | 19 + rep.Rproj | 21 + src/.gitignore | 3 + src/Makevars | 3 + src/RcppExports.cpp | 42 ++ src/agent.cpp | 87 +++++ src/agent.h | 70 ++++ src/directive.cpp | 130 +++++++ src/directive.h | 67 ++++ src/psl.cpp | 183 +++++++++ src/psl.h | 102 +++++ src/punycode.cpp | 409 ++++++++++++++++++++ src/punycode.h | 105 +++++ src/repmain.cpp | 26 ++ src/robots.cpp | 188 +++++++++ src/robots.h | 69 ++++ src/url.cpp | 962 ++++++++++++++++++++++++++++++++++++++++++++++ src/url.h | 323 ++++++++++++++++ src/utf8.cpp | 150 ++++++++ src/utf8.h | 91 +++++ tests/test-all.R | 3 + tests/testthat/test-rep.R | 11 + 39 files changed, 3445 insertions(+) create mode 100644 .Rbuildignore create mode 100644 .codecov.yml create mode 100644 .gitignore create mode 100644 .travis.yml create mode 100644 CONDUCT.md create mode 100644 DESCRIPTION create mode 100644 LICENSE create mode 100644 NAMESPACE create mode 100644 NEWS.md create mode 100644 R/RcppExports.R create mode 100644 R/rep-package.R create mode 100644 R/rep.r create mode 100644 README.Rmd create mode 100644 README.md create mode 100644 man/can_fetch.Rd create mode 100644 man/print.robxp.Rd create mode 100644 man/rep.Rd create mode 100644 man/robxp.Rd create mode 100644 rep.Rproj create mode 100644 src/.gitignore create mode 100644 src/Makevars create mode 100644 src/RcppExports.cpp create mode 100644 src/agent.cpp create mode 100644 src/agent.h create mode 100644 src/directive.cpp create mode 100644 src/directive.h create mode 100644 
src/psl.cpp create mode 100644 src/psl.h create mode 100644 src/punycode.cpp create mode 100644 src/punycode.h create mode 100644 src/repmain.cpp create mode 100644 src/robots.cpp create mode 100644 src/robots.h create mode 100644 src/url.cpp create mode 100644 src/url.h create mode 100644 src/utf8.cpp create mode 100644 src/utf8.h create mode 100644 tests/test-all.R create mode 100644 tests/testthat/test-rep.R diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..edf18c6 --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,11 @@ +^.*\.Rproj$ +^\.Rproj\.user$ +^\.travis\.yml$ +^README\.*Rmd$ +^README\.*html$ +^NOTES\.*Rmd$ +^NOTES\.*html$ +^\.codecov\.yml$ +^README_files$ +^doc$ +^CONDUCT\.md$ diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 0000000..69cb760 --- /dev/null +++ b/.codecov.yml @@ -0,0 +1 @@ +comment: false diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cce1f17 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +.DS_Store +.Rproj.user +.Rhistory +.RData +.Rproj +src/*.o +src/*.so +src/*.dll diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..76d9586 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,31 @@ +language: r + +warnings_are_errors: true + +sudo: required + +cache: packages + +r: + - oldrel + - release + - devel + +apt_packages: + - libv8-dev + - xclip + +env: + global: + - CRAN: http://cran.rstudio.com + +after_success: + - Rscript -e 'covr::codecov()' + +notifications: + email: + - bob@rud.is + irc: + channels: + - "104.236.112.222#builds" + nick: travisci diff --git a/CONDUCT.md b/CONDUCT.md new file mode 100644 index 0000000..52a673e --- /dev/null +++ b/CONDUCT.md @@ -0,0 +1,25 @@ +# Contributor Code of Conduct + +As contributors and maintainers of this project, we pledge to respect all people who +contribute through reporting issues, posting feature requests, updating documentation, +submitting pull requests or patches, and other activities. 
+ +We are committed to making participation in this project a harassment-free experience for +everyone, regardless of level of experience, gender, gender identity and expression, +sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. + +Examples of unacceptable behavior by participants include the use of sexual language or +imagery, derogatory comments or personal attacks, trolling, public or private harassment, +insults, or other unprofessional conduct. + +Project maintainers have the right and responsibility to remove, edit, or reject comments, +commits, code, wiki edits, issues, and other contributions that are not aligned to this +Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed +from the project team. + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by +opening an issue or contacting one or more of the project maintainers. + +This Code of Conduct is adapted from the Contributor Covenant +(http://contributor-covenant.org), version 1.0.0, available at +http://contributor-covenant.org/version/1/0/0/ diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..d505f19 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,27 @@ +Package: rep +Type: Package +Title: Tools to Parse and Test Robots Exclusion Protocol Files and Rules +Version: 0.1.0 +Date: 2017-08-14 +Author: Bob Rudis (bob@rud.is) [aut, cre], SEOmoz, Inc [aut] +Maintainer: Bob Rudis <bob@rud.is> +Description: The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents + a set of standards for allowing or excluding robot/spider crawling of different areas of + site content. Tools are provided which wrap the 'rep-cpp' <https://github.com/seomoz/rep-cpp> + C++ library for processing these 'robots.txt' files. 
+SystemRequirements: C++11 +NeedsCompilation: yes +URL: https://github.com/hrbrmstr/rep +BugReports: https://github.com/hrbrmstr/rep/issues +License: MIT + file LICENSE +Suggests: + testthat, + covr, + robotstxt +Depends: + R (>= 3.2.0) +Imports: + purrr, + Rcpp +RoxygenNote: 6.0.1 +LinkingTo: Rcpp diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..48ed424 --- /dev/null +++ b/LICENSE @@ -0,0 +1,2 @@ +YEAR: 2017 +COPYRIGHT HOLDER: Bob Rudis diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..19131d4 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,7 @@ +# Generated by roxygen2: do not edit by hand + +S3method(print,robxp) +export(can_fetch) +export(robxp) +importFrom(Rcpp,sourceCpp) +useDynLib(rep, .registration=TRUE) diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..9b4679b --- /dev/null +++ b/NEWS.md @@ -0,0 +1,2 @@ +0.1.0 +* Initial release diff --git a/R/RcppExports.R b/R/RcppExports.R new file mode 100644 index 0000000..dc22683 --- /dev/null +++ b/R/RcppExports.R @@ -0,0 +1,19 @@ +# Generated by using Rcpp::compileAttributes() -> do not edit by hand +# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#' Parse robots.txt +#' +#' @noRd +#' +rep_parse <- function(content) { + .Call(`_rep_rep_parse`, content) +} + +#' Path allowed +#' +#' @noRd +#' +rep_path_allowed <- function(xp, path, agent = "*") { + .Call(`_rep_rep_path_allowed`, xp, path, agent) +} + diff --git a/R/rep-package.R b/R/rep-package.R new file mode 100644 index 0000000..d7c5b44 --- /dev/null +++ b/R/rep-package.R @@ -0,0 +1,14 @@ +#' Tools to Parse and Test Robots Exclusion Protocol Files and Rules +#' +#' The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents a set +#' of standards for allowing or excluding robot/spider crawling of different areas of +#' site content. Tools are provided which wrap the 'rep-cpp' <https://github.com/seomoz/rep-cpp> +#' C++ library for processing these 'robots.txt' files. 
+#' +#' @md +#' @name rep +#' @docType package +#' @author Bob Rudis (bob@@rud.is) +#' @useDynLib rep, .registration=TRUE +#' @importFrom Rcpp sourceCpp +NULL \ No newline at end of file diff --git a/R/rep.r b/R/rep.r new file mode 100644 index 0000000..490dbe7 --- /dev/null +++ b/R/rep.r @@ -0,0 +1,47 @@ +#' Create a robots.txt object +#' +#' @param x atomic character vector containing a complete robots.txt file +#' @export +#' @examples +#' library(robotstxt) +#' can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") # TRUE +#' can_fetch(rt, "/_borders", "*") # FALSE +robxp <- function(x) { + + robxp <- rep_parse(x) + class(robxp) <- c("robxp") + + robxp + +} + +#' Test URL path against robots.txt +#' +#' @md +#' @param obj `robxp` object +#' @param path path to test +#' @param user_agent user agent to test +#' @export +#' @examples +#' library(robotstxt) +#' can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") # TRUE +#' can_fetch(rt, "/_borders", "*") # FALSE +can_fetch <- function(obj, path="/", user_agent="*") { + + if (inherits(obj, "robxp")) { + rep_path_allowed(obj, path, user_agent) + } else { + return(NULL) + } + +} + +#' Custom printer for 'robxp' objects +#' +#' @md +#' @param x object to print +#' @param ... unused +#' @export +print.robxp <- function(x, ...) { + cat("") +} \ No newline at end of file diff --git a/README.Rmd b/README.Rmd new file mode 100644 index 0000000..233007e --- /dev/null +++ b/README.Rmd @@ -0,0 +1,58 @@ +--- +output: rmarkdown::github_document +--- + +`rep` : Tools to Parse and Test Robots Exclusion Protocol Files and Rules + +The 'Robots Exclusion Protocol' documents a set of standards for allowing or excluding robot/spider crawling of different areas of site content. Tools are provided which wrap the 'rep-cpp' C++ library for processing these 'robots.txt' files. 
+ +- [`rep-cpp`](https://github.com/seomoz/rep-cpp) +- [`url-cpp`](https://github.com/seomoz/url-cpp) + +The following functions are implemented: + +- `robxp`: Create a robots.txt object +- `can_fetch`: Test URL path against robots.txt + +### Installation + +```{r eval=FALSE} +devtools::install_github("hrbrmstr/rep") +``` + +```{r message=FALSE, warning=FALSE, error=FALSE, include=FALSE} +options(width=120) +``` + +### Usage + +```{r message=FALSE, warning=FALSE, error=FALSE} +library(rep) +library(robotstxt) + +# current version +packageVersion("rep") + +rt <- robxp(get_robotstxt("https://cdc.gov")) + +print(rt) + +can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") + +can_fetch(rt, "/_borders", "*") +``` + +### Test Results + +```{r message=FALSE, warning=FALSE, error=FALSE} +library(rep) +library(testthat) + +date() + +test_dir("tests/") +``` + +### Code of Conduct + +Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..b8fb215 --- /dev/null +++ b/README.md @@ -0,0 +1,74 @@ + +`rep` : Tools to Parse and Test Robots Exclusion Protocol Files and Rules + +The 'Robots Exclusion Protocol' documents a set of standards for allowing or excluding robot/spider crawling of different areas of site content. Tools are provided which wrap the 'rep-cpp' C++ library for processing these 'robots.txt' files. 
+ +- [`rep-cpp`](https://github.com/seomoz/rep-cpp) +- [`url-cpp`](https://github.com/seomoz/url-cpp) + +The following functions are implemented: + +- `robxp`: Create a robots.txt object +- `can_fetch`: Test URL path against robots.txt + +### Installation + +``` r +devtools::install_github("hrbrmstr/rep") +``` + +### Usage + +``` r +library(rep) +library(robotstxt) + +# current version +packageVersion("rep") +``` + + ## [1] '0.1.0' + +``` r +rt <- robxp(get_robotstxt("https://cdc.gov")) + +print(rt) +``` + + ## + +``` r +can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") +``` + + ## [1] TRUE + +``` r +can_fetch(rt, "/_borders", "*") +``` + + ## [1] FALSE + +### Test Results + +``` r +library(rep) +library(testthat) + +date() +``` + + ## [1] "Mon Aug 14 15:00:16 2017" + +``` r +test_dir("tests/") +``` + + ## testthat results ======================================================================================================== + ## OK: 3 SKIPPED: 0 FAILED: 0 + ## + ## DONE =================================================================================================================== + +### Code of Conduct + +Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms. 
diff --git a/man/can_fetch.Rd b/man/can_fetch.Rd new file mode 100644 index 0000000..e440838 --- /dev/null +++ b/man/can_fetch.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rep.r +\name{can_fetch} +\alias{can_fetch} +\title{Test URL path against robots.txt} +\usage{ +can_fetch(obj, path = "/", user_agent = "*") +} +\arguments{ +\item{obj}{\code{robxp} object} + +\item{path}{path to test} + +\item{user_agent}{user agent to test} +} +\description{ +Test URL path against robots.txt +} +\examples{ +library(robotstxt) +can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") # TRUE +can_fetch(rt, "/_borders", "*") # FALSE +} diff --git a/man/print.robxp.Rd b/man/print.robxp.Rd new file mode 100644 index 0000000..9138ecc --- /dev/null +++ b/man/print.robxp.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rep.r +\name{print.robxp} +\alias{print.robxp} +\title{Custom printer for 'robxp' objects} +\usage{ +\method{print}{robxp}(x, ...) +} +\arguments{ +\item{x}{object to print} + +\item{...}{unused} +} +\description{ +Custom printer for 'robxp' objects +} diff --git a/man/rep.Rd b/man/rep.Rd new file mode 100644 index 0000000..689f0d5 --- /dev/null +++ b/man/rep.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rep-package.R +\docType{package} +\name{rep} +\alias{rep} +\alias{rep-package} +\title{Tools to Parse and Test Robots Exclusion Protocol Files and Rules} +\description{ +The 'Robots Exclusion Protocol' \url{http://www.robotstxt.org/orig.html} documents a set +of standards for allowing or excluding robot/spider crawling of different areas of +site content. Tools are provided which wrap the 'rep-cpp' \url{https://github.com/seomoz/rep-cpp} +C++ library for processing these 'robots.txt' files. 
+} +\author{ +Bob Rudis (bob@rud.is) +} diff --git a/man/robxp.Rd b/man/robxp.Rd new file mode 100644 index 0000000..edc787f --- /dev/null +++ b/man/robxp.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rep.r +\name{robxp} +\alias{robxp} +\title{Create a robots.txt object} +\usage{ +robxp(x) +} +\arguments{ +\item{x}{atomic character vector containing a complete robots.txt file} +} +\description{ +Create a robots.txt object +} +\examples{ +library(robotstxt) +can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") # TRUE +can_fetch(rt, "/_borders", "*") # FALSE +} diff --git a/rep.Rproj b/rep.Rproj new file mode 100644 index 0000000..446d9e1 --- /dev/null +++ b/rep.Rproj @@ -0,0 +1,21 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +StripTrailingWhitespace: Yes + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source +PackageBuildArgs: --resave-data +PackageRoxygenize: rd,collate,namespace diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000..22034c4 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,3 @@ +*.o +*.so +*.dll diff --git a/src/Makevars b/src/Makevars new file mode 100644 index 0000000..a231a44 --- /dev/null +++ b/src/Makevars @@ -0,0 +1,3 @@ +CXX_STD = CXX11 +PKG_CXXFLAGS = +PKG_LIBS = -L. 
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp new file mode 100644 index 0000000..613134a --- /dev/null +++ b/src/RcppExports.cpp @@ -0,0 +1,42 @@ +// Generated by using Rcpp::compileAttributes() -> do not edit by hand +// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#include + +using namespace Rcpp; + +// rep_parse +SEXP rep_parse(std::string content); +RcppExport SEXP _rep_rep_parse(SEXP contentSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< std::string >::type content(contentSEXP); + rcpp_result_gen = Rcpp::wrap(rep_parse(content)); + return rcpp_result_gen; +END_RCPP +} +// rep_path_allowed +bool rep_path_allowed(SEXP xp, std::string path, std::string agent); +RcppExport SEXP _rep_rep_path_allowed(SEXP xpSEXP, SEXP pathSEXP, SEXP agentSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type xp(xpSEXP); + Rcpp::traits::input_parameter< std::string >::type path(pathSEXP); + Rcpp::traits::input_parameter< std::string >::type agent(agentSEXP); + rcpp_result_gen = Rcpp::wrap(rep_path_allowed(xp, path, agent)); + return rcpp_result_gen; +END_RCPP +} + +static const R_CallMethodDef CallEntries[] = { + {"_rep_rep_parse", (DL_FUNC) &_rep_rep_parse, 1}, + {"_rep_rep_path_allowed", (DL_FUNC) &_rep_rep_path_allowed, 3}, + {NULL, NULL, 0} +}; + +RcppExport void R_init_rep(DllInfo *dll) { + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +} diff --git a/src/agent.cpp b/src/agent.cpp new file mode 100644 index 0000000..b91cbf0 --- /dev/null +++ b/src/agent.cpp @@ -0,0 +1,87 @@ +#include +#include + +#include "url.h" + +#include "agent.h" +#include "directive.h" + +namespace Rep +{ + Agent& Agent::allow(const std::string& query) + { + directives_.push_back(Directive(escape(query), true)); + sorted_ = false; + return *this; + } + + Agent& Agent::disallow(const 
std::string& query) + { + if (query.empty()) + { + // Special case: "Disallow:" means "Allow: /" + directives_.push_back(Directive(query, true)); + } + else + { + directives_.push_back(Directive(escape(query), false)); + } + sorted_ = false; + return *this; + } + + const std::vector& Agent::directives() const + { + if (!sorted_) + { + std::sort(directives_.begin(), directives_.end(), [](const Directive& a, const Directive& b) { + return b.priority() < a.priority(); + }); + sorted_ = true; + } + return directives_; + } + + bool Agent::allowed(const std::string& query) const + { + std::string path(escape(query)); + + if (path.compare("/robots.txt") == 0) + { + return true; + } + + for (auto directive : directives()) + { + if (directive.match(path)) + { + return directive.allowed(); + } + } + return true; + } + + std::string Agent::str() const + { + std::stringstream out; + out << '['; + auto begin = directives().begin(); + auto end = directives().end(); + if (begin != end) + { + out << "Directive(" << begin->str() << ')'; + ++begin; + } + for (; begin != end; ++begin) + { + out << ", Directive(" << begin->str() << ')'; + } + out << ']'; + return out.str(); + } + + std::string Agent::escape(const std::string& query) + { + return Url::Url(query).defrag().escape().fullpath(); + } +} diff --git a/src/agent.h b/src/agent.h new file mode 100644 index 0000000..a30dd47 --- /dev/null +++ b/src/agent.h @@ -0,0 +1,70 @@ +#ifndef AGENT_CPP_H +#define AGENT_CPP_H + +#include + +#include "directive.h" + + +namespace Rep +{ + + class Agent + { + public: + /* The type for the delay. */ + typedef float delay_t; + + /** + * Construct an agent. + */ + Agent(): directives_(), delay_(-1.0), sorted_(true) {} + + /** + * Add an allowed directive. + */ + Agent& allow(const std::string& query); + + /** + * Add a disallowed directive. + */ + Agent& disallow(const std::string& query); + + /** + * Set the delay for this agent. 
+ */ + Agent& delay(delay_t value) { + delay_ = value; + return *this; + } + + /** + * Return the delay for this agent. + */ + delay_t delay() const { return delay_; } + + /** + * A vector of the directives, in priority-sorted order. + */ + const std::vector& directives() const; + + /** + * Return true if the URL (either a full URL or a path) is allowed. + */ + bool allowed(const std::string& path) const; + + std::string str() const; + + /** + * Canonically escape the provided query for matching purposes. + */ + static std::string escape(const std::string& query); + + private: + mutable std::vector directives_; + delay_t delay_; + mutable bool sorted_; + }; +} + +#endif diff --git a/src/directive.cpp b/src/directive.cpp new file mode 100644 index 0000000..21376b7 --- /dev/null +++ b/src/directive.cpp @@ -0,0 +1,130 @@ +#include +#include +#include +#include + +#include "url.h" + +#include "directive.h" + +namespace Rep +{ + Directive::Directive(const std::string& line, bool allowed) + : expression_() + , priority_(line.size()) + , allowed_(allowed) + { + if (line.find('*') == std::string::npos) + { + expression_.assign(line); + return; + } + + // Remove consecutive '*'s + expression_.reserve(line.size()); + bool star = false; + for (auto character : line) + { + if (character == '*') + { + if (!star) + { + expression_.append(1, character); + } + star = true; + } + else + { + expression_.append(1, character); + star = false; + } + } + + // Remove trailing '*'s + std::string::reverse_iterator last = + std::find_if(expression_.rbegin(), expression_.rend(), + [](const char c) { + return c != '*'; + }); + expression_.erase(last.base(), expression_.end()); + + // Priority is the length of the expression + priority_ = expression_.size(); + } + + bool Directive::match(const std::string::const_iterator& e_begin, + const std::string::const_iterator& e_end, + const std::string::const_iterator& p_begin, + const std::string::const_iterator& p_end) const + { + 
std::string::const_iterator expression_it = e_begin; + std::string::const_iterator path_it = p_begin; + while (expression_it != e_end && path_it != p_end) + { + if (*expression_it == '*') + { + // Advance and recurse + ++expression_it; + for (; path_it != p_end; ++path_it) + { + if (match(expression_it, e_end, path_it, p_end)) + { + return true; + } + } + return false; + } + else if (*expression_it == '$') + { + // This check expects path to be fully consumed. But since one of the + // criteria of being in this while loop is that we've not fully consumed + // path, return false. + return false; + } + else if (*expression_it != *path_it) + { + // These characters must match + return false; + } + else + { + // Advance both by one + ++path_it; + ++expression_it; + } + } + + // Return true only if we've consumed all of the expression + if (expression_it == e_end) + { + return true; + } + else if (*expression_it == '$') + { + return path_it == p_end; + } + else + { + return false; + } + } + + std::string Directive::str() const + { + std::stringstream out; + if (allowed_) + { + out << "Allow: " << expression_; + } + else { + out << "Disallow: " << expression_; + } + return out.str(); + } + + bool Directive::match(const std::string& path) const + { + return match(expression_.begin(), expression_.end(), path.begin(), path.end()); + } + +} diff --git a/src/directive.h b/src/directive.h new file mode 100644 index 0000000..0c2743f --- /dev/null +++ b/src/directive.h @@ -0,0 +1,67 @@ +#ifndef DIRECTIVE_CPP_H +#define DIRECTIVE_CPP_H + + +namespace Rep +{ + + class Directive + { + public: + /** + * The type of our priority value. + */ + typedef size_t priority_t; + + /** + * Default constructor disallowed. + */ + Directive() = delete; + + /** + * The input to this constructor must be stripped of comments and trailing + * whitespace. + */ + Directive(const std::string& line, bool allowed); + + /** + * The priority of the rule. 
+ */ + priority_t priority() const + { + return priority_; + } + + /** + * Whether or not the provided path matches. The path is expected to be properly + * escaped. + */ + bool match(const std::string& path) const; + + /** + * Whether this rule is for an allow or a disallow. + */ + bool allowed() const + { + return allowed_; + } + + std::string str() const; + + private: + std::string expression_; + priority_t priority_; + bool allowed_; + + /** + * Return true if p_begin -> p_end matches the expression e_begin -> e_end. + */ + bool match(const std::string::const_iterator& e_begin, + const std::string::const_iterator& e_end, + const std::string::const_iterator& p_begin, + const std::string::const_iterator& p_end) const; + }; + +} + +#endif diff --git a/src/psl.cpp b/src/psl.cpp new file mode 100644 index 0000000..c078d21 --- /dev/null +++ b/src/psl.cpp @@ -0,0 +1,183 @@ +#include +#include +#include +#include + +#include "psl.h" +#include "punycode.h" + +namespace Url +{ + const std::string PSL::not_found = ""; + + PSL::PSL(std::istream& stream) + { + std::string line; + while (std::getline(stream, line)) + { + // Only take up to the first whitespace. + auto it = std::find_if(line.begin(), line.end(), ::isspace); + line.resize(it - line.begin()); + + // Skip blank lines + if (line.empty()) + { + continue; + } + + // Skip comments + if (line.compare(0, 2, "//") == 0) + { + continue; + } + + // We know the line has at least a single character at this point + if (line[0] == '*') + { + // Line is a wildcard rule + if (line.size() <= 2 || line[1] != '.') + { + throw std::invalid_argument("Wildcard rule must be of form *."); + } + + add(line, 1, 2); + } + else if (line[0] == '!') + { + // Line is an exception, take all but the ! 
+ if (line.size() <= 1) + { + throw std::invalid_argument("Exception rule has no hostname."); + } + + add(line, -1, 1); + } + else + { + add(line, 0, 0); + } + } + } + + PSL PSL::fromPath(const std::string& path) + { + std::ifstream stream(path); + if (!stream.good()) + { + std::stringstream message; + message << "Path '" << path << "' inaccessible."; + throw std::invalid_argument(message.str()); + } + return PSL(stream); + } + + PSL PSL::fromString(const std::string& str) + { + std::stringstream stream(str); + return PSL(stream); + } + + std::string PSL::getTLD(const std::string& hostname) const + { + return getLastSegments(hostname, getTLDLength(hostname)); + } + + std::string PSL::getPLD(const std::string& hostname) const + { + return getLastSegments(hostname, getTLDLength(hostname) + 1); + } + + std::pair PSL::getBoth(const std::string& hostname) const + { + size_t length = getTLDLength(hostname); + return std::make_pair( + getLastSegments(hostname, length), + getLastSegments(hostname, length + 1)); + } + + size_t PSL::getTLDLength(const std::string& hostname) const + { + // Reversed copy of hostname + std::string tld(hostname.rbegin(), hostname.rend()); + std::transform(tld.begin(), tld.end(), tld.begin(), ::tolower); + + while (tld.size()) + { + auto it = levels.find(tld); + if (it != levels.end()) + { + return it->second; + } + + size_t position = tld.rfind('.'); + if (position == std::string::npos || position == 0) + { + tld.resize(0); + } + else + { + tld.resize(position); + } + } + + return 1; + } + + std::string PSL::getLastSegments(const std::string& hostname, size_t segments) const + { + size_t position = hostname.size(); + size_t remaining = segments; + while (remaining != 0 && position && position != std::string::npos) + { + position = hostname.rfind('.', position - 1); + remaining -= 1; + } + + if (remaining >= 1) + { + return not_found; + } + + // Return the whole string if position == std:string::npos + size_t start = (position == 
std::string::npos) ? 0 : position + 1; + + std::string result(hostname, start); + std::transform(result.begin(), result.end(), result.begin(), ::tolower); + + // Leading .'s indicate that the query had an empty segment + if (result.size() && result[0] == '.') + { + std::stringstream message; + message << "Empty segment in " << result; + throw std::invalid_argument(message.str()); + } + + return result; + } + + size_t PSL::countSegments(const std::string& hostname) const + { + size_t count = 1; + size_t position = hostname.find('.'); + while (position != std::string::npos) + { + count += 1; + position = hostname.find('.', position + 1); + } + return count; + } + + void PSL::add(std::string& rule, int level_adjust, size_t trim) + { + // First unpunycoded + std::string copy(rule.rbegin(), rule.rend() - trim); + size_t length = countSegments(copy) + level_adjust; + levels[copy] = length; + + // And now punycoded + rule = Punycode::encodeHostname(rule); + copy.assign(rule.rbegin(), rule.rend() - trim); + levels[copy] = length; + } + +}; diff --git a/src/psl.h b/src/psl.h new file mode 100644 index 0000000..e1714f0 --- /dev/null +++ b/src/psl.h @@ -0,0 +1,102 @@ +#ifndef PSL_CPP_H +#define PSL_CPP_H + +#include +#include +#include +#include +#include + +namespace Url +{ + + /** + * Find TLDs and PLDs of a hostname according to a PSL. + */ + struct PSL + { + /** + * Indicates the there is no TLD / PLD + */ + static const std::string not_found; + + /** + * Read a PSL from an istream. + */ + PSL(std::istream& stream); + + PSL(): levels() { }; + + PSL(const PSL& other): levels(other.levels) { } + + PSL& operator=(const PSL& other) + { + levels = other.levels; + return *this; + } + + /** + * Read the provided path holding a set of PSL rules. + */ + static PSL fromPath(const std::string& path); + + /** + * Create a PSL object from a string. + */ + static PSL fromString(const std::string& str); + + /** + * Get just the TLD of the hostname. 
+ * + * Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If + * some segments have been appropriately punycoded and others not, it may return + * a wrong answer. If a punycoded host is provided, a punycoded response is + * returned. If an unpunycoded host is provided, an unpunycoded response is + * returned. + */ + std::string getTLD(const std::string& hostname) const; + + /** + * Get just the PLD of the hostname. + * + * Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If + * some segments have been appropriately punycoded and others not, it may return + * a wrong answer. If a punycoded host is provided, a punycoded response is + * returned. If an unpunycoded host is provided, an unpunycoded response is + * returned. + */ + std::string getPLD(const std::string& hostname) const; + + /** + * Get the (TLD, PLD) of the hostname. + * + * Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If + * some segments have been appropriately punycoded and others not, it may return + * a wrong answer. If a punycoded host is provided, a punycoded response is + * returned. If an unpunycoded host is provided, an unpunycoded response is + * returned. + */ + std::pair getBoth(const std::string& hostname) const; + private: + // Mapping of a string rule to its level + std::unordered_map levels; + + // Return the number of segments in a hostname + size_t countSegments(const std::string& hostname) const; + + // Return the number of segments in the TLD of the provided hostname + size_t getTLDLength(const std::string& hostname) const; + + // Return the last `segments` segments of a hostname + std::string getLastSegments(const std::string& hostname, size_t segments) const; + + /** + * Add the provided host with the provided priority, trimming characters off + * the front, and adjusting the level by the provided number. 
+ */ + void add(std::string& host, int level_adjust, size_t trim); + }; + +} + +#endif diff --git a/src/punycode.cpp b/src/punycode.cpp new file mode 100644 index 0000000..eb85d92 --- /dev/null +++ b/src/punycode.cpp @@ -0,0 +1,409 @@ +#include +#include +#include + +#include "punycode.h" +#include "utf8.h" + +namespace Url +{ + + std::string& Punycode::encode(std::string& str) + { + // Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.3 + // + // let n = initial_n + // let delta = 0 + // let bias = initial_bias + punycode_uint n = INITIAL_N; + punycode_uint delta = 0; + punycode_uint bias = INITIAL_BIAS; + std::string output; + + // Accumulate the non-basic codepoints + std::vector codepoints; + for (auto it = str.cbegin(); it != str.cend(); ) + { + Utf8::codepoint_t value = Utf8::readCodepoint(it, str.cend()); + if (value < 0x80) + { + // copy them to the output in order + output.append(1, static_cast(value)); + } + codepoints.push_back(value); + } + + // let h = b = the number of basic code points in the input + size_t h = output.size(); + size_t b = h; + + // copy a delimiter if b > 0 + if (b > 0) + { + output.append(1, '-'); + } + + // while h < length(input) do begin + while (h < codepoints.size()) + { + // let m = the minimum {non-basic} code point >= n in the input + punycode_uint m = MAX_PUNYCODE_UINT; + for (auto it = codepoints.begin(); it != codepoints.end(); ++it) + { + if ((*it >= n) && (*it < m)) + { + m = *it; + } + } + + // let delta = delta + (m - n) * (h + 1), fail on overflow + if ((m - n) > ((MAX_PUNYCODE_UINT - delta) / (h + 1))) + { + throw std::invalid_argument("Overflow delta update."); + } + delta += (m - n) * (h + 1); + + // let n = m + n = m; + + // for each code point c in the input (in order) do begin + for (auto it = codepoints.begin(); it != codepoints.end(); ++it) + { + // if c < n {or c is basic} then increment delta, fail on overflow + if (*it < n) + { + if (delta == MAX_PUNYCODE_UINT) + { + throw 
std::invalid_argument("Overflow delta increment."); + } + ++delta; + } + + // if c == n then begin + if (*it == n) + { + // let q = delta + punycode_uint q = delta; + + // for k = base to infinity in steps of base do begin + for (punycode_uint k = BASE; ; k += BASE) + { + // let t = tmin if k <= bias {+ tmin}, or + // tmax if k >= bias + tmax, or k - bias otherwise + punycode_uint t = k <= bias ? TMIN : + k >= bias + TMAX ? TMAX : k - bias; + + // if q < t then break + if (q < t) + { + break; + } + + // output the code point for digit t + ((q - t) mod (base - t)) + output.append(1, DIGIT_TO_BASIC[t + ((q - t) % (BASE - t))]); + + // let q = (q - t) div (base - t) + q = (q - t) / (BASE - t); + } + + // output the code point for digit q + output.append(1, DIGIT_TO_BASIC[q]); + + // let bias = adapt(delta, h + 1, test h equals b?) + bias = adapt(delta, h + 1, h == b); + + // let delta = 0 + delta = 0; + + // increment h + ++h; + + } + } + + // increment delta and n + ++delta; + ++n; + } + + str.assign(output); + return str; + } + + std::string Punycode::encode(const std::string& str) + { + std::string result(str); + encode(result); + return result; + } + + std::string Punycode::encodeHostname(const std::string& hostname) + { + // Avoid any punycoding at all if none is needed + if (!needsPunycoding(hostname)) + { + return hostname; + } + + std::string encoded; + + size_t start = 0; + size_t end = hostname.find('.'); + while(true) + { + std::string segment = hostname.substr(start, end - start); + if (needsPunycoding(segment)) + { + encoded.append("xn--"); + encoded.append(Punycode::encode(segment)); + } + else + { + encoded.append(segment); + } + + if (end == std::string::npos) + { + break; + } + else + { + encoded.append(1, '.'); + start = end + 1; + end = hostname.find('.', start); + } + } + + return encoded; + } + + std::string& Punycode::decode(std::string& str) + { + // Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.2 + // + // let n = 
initial_n + // let i = 0 + // let bias = initial_bias + // let output = an empty string indexed from 0 + punycode_uint n = INITIAL_N; + punycode_uint i = 0; + punycode_uint bias = INITIAL_BIAS; + std::vector codepoints; + + size_t index = str.rfind('-'); + if (index == std::string::npos) + { + index = 0; + } + + // consume all code points before the last delimiter (if there is one) + // and copy them to output, fail on any non-basic code point + for (auto it = str.begin(); it != (str.begin() + index); ++it) + { + if (static_cast(*it) > 127U) + { + throw std::invalid_argument("Argument has non-basic code points."); + } + codepoints.push_back(*it); + } + + // if more than zero code points were consumed then consume one more + // (which will be the last delimiter) + if (index > 0) + { + index += 1; + } + + // while the input is not exhausted do begin + for (auto it = (str.begin() + index); it != str.end(); ++it) + { + // let oldi = i + // let w = 1 + punycode_uint oldi = i; + punycode_uint w = 1; + + // for k = base to infinity in steps of base do begin + for (punycode_uint k = BASE; ; k += BASE, ++it) + { + // consume a code point, or fail if there was none to consume + if (it == str.end()) + { + throw std::invalid_argument("Premature termination"); + } + + // let digit = the code point's digit-value, fail if it has none + int lookup = BASIC_TO_DIGIT[static_cast(*it)]; + if (lookup == -1) + { + throw std::invalid_argument("Invalid base 36 character."); + } + unsigned char digit = static_cast(lookup); + + // let i = i + digit * w, fail on overflow + if (digit > ((MAX_PUNYCODE_UINT - i) / w)) + { + throw std::invalid_argument("Overflow on i."); + } + i += digit * w; + + // let t = tmin if k <= bias {+ tmin}, or + // tmax if k >= bias + tmax, or k - bias otherwise + punycode_uint t = k <= bias ? TMIN : + k >= bias + TMAX ? 
TMAX : k - bias; + + // if digit < t then break + if (digit < t) + { + break; + } + + // let w = w * (base - t), fail on overflow + if (w > (MAX_PUNYCODE_UINT / (BASE - t))) + { + // I believe this line is unreachable without first overflowing i. + // Since 'i' is updated above as i += digit * w, and w is updated as + // w = w * (BASE - t), we should like to keep (BASE - t) > digit to + // give 'w' a chance to overflow first. To keep t minimized, we must + // have 'bias' maximized. `bias` is driven by the 'adapt' function + // below. + // + // The value returned by 'adapt' increases with the input delta, and + // decreases with the input size. The delta is a function of the input + // size as well, on the order of (delta_n * input size), and + // legitimate delta_n values are limited to 0x10FFFF (the maximum + // unicode codepoint). Even setting that aside, the maximum value that + // adapt() can return is adapt(2 ** 32 - 1, 1, false) = 204. + // + // Using this bias, we could use the input (HERE) to get iterations: + // + // digit = b = 1, i = 2, k = 36, t = 1, w = 35 + // digit = b = 1, i = 37, k = 72, t = 1, w = 1225 + // digit = b = 1, i = 1262, k = 108, t = 1, w = 42875 + // digit = b = 1, i = 44137, k = 144, t = 1, w = 1500625 + // digit = b = 1, i = 1544762, k = 180, t = 1, w = 52521875 + // + // At this point, t now becomes TMAX (26) because k exceeds the bias + // (since the maximum bias is 204). As such, the minimum continuation + // value is 26: + // + // digit = 0 = 26, i = 1367113512, k = 216, t = 26, w = 525218750 + // + // However, the next iteration now overflows i before we can get to + // the w update. + throw std::invalid_argument("Overflow on w."); // LCOV_EXCL_LINE + } + w *= (BASE - t); + } + + // let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?) 
+ bias = adapt(i - oldi, codepoints.size() + 1, oldi == 0); + + // let n = n + i div (length(output) + 1), fail on overflow + if ((i / (codepoints.size() + 1)) > (MAX_PUNYCODE_UINT - n)) + { + throw std::invalid_argument("Overflow on n."); + } + n += i / (codepoints.size() + 1); + + // let i = i mod (length(output) + 1) + i %= (codepoints.size() + 1); + + // insert n into output at position i + codepoints.insert(codepoints.begin() + i, n); + + // increment i + ++i; + } + + std::string output; + for (auto it = codepoints.begin(); it != codepoints.end(); ++it) + { + Utf8::writeCodepoint(output, *it); + } + str.assign(output); + + return str; + } + + std::string Punycode::decode(const std::string& str) + { + std::string result(str); + decode(result); + return result; + } + + std::string Punycode::decodeHostname(const std::string& hostname) + { + std::string unencoded; + + size_t start = 0; + size_t end = hostname.find('.'); + while(true) + { + std::string segment = hostname.substr(start, end - start); + if (segment.substr(0, 4).compare("xn--") == 0) + { + segment = segment.substr(4); + unencoded.append(Punycode::decode(segment)); + } + else + { + unencoded.append(segment); + } + + if (end == std::string::npos) + { + break; + } + else + { + unencoded.append(1, '.'); + start = end + 1; + end = hostname.find('.', start); + } + } + + return unencoded; + } + + bool Punycode::needsPunycoding(const std::string& str) + { + return std::any_of( + str.begin(), + str.end(), + [](char i){ return static_cast(i) & 0x80; }); + } + + Punycode::punycode_uint Punycode::adapt( + punycode_uint delta, punycode_uint numpoints, bool firsttime) + { + // Pseudocode from https://tools.ietf.org/html/rfc3492#section-6.1 + // + // It does not matter whether the modifications to delta and k inside + // adapt() affect variables of the same name inside the + // encoding/decoding procedures, because after calling adapt() the + // caller does not read those variables before overwriting them. 
+ // + // if firsttime then let delta = delta div damp + // else let delta = delta div 2 + delta = firsttime ? delta / DAMP : delta >> 1; + + // let delta = delta + (delta div numpoints) + delta += (delta / numpoints); + + // let k = 0 + punycode_uint k = 0; + + // while delta > ((base - tmin) * tmax) div 2 do begin + for (; delta > ((BASE - TMIN) * TMAX) / 2; k += BASE) + { + // let delta = delta div (base - tmin) + // let k = k + base + delta /= (BASE - TMIN); + } + + // return k + (((base - tmin + 1) * delta) div (delta + skew)) + return k + (((BASE - TMIN + 1) * delta) / (delta + SKEW)); + } + +}; diff --git a/src/punycode.h b/src/punycode.h new file mode 100644 index 0000000..25fce96 --- /dev/null +++ b/src/punycode.h @@ -0,0 +1,105 @@ +#ifndef PUNYCODE_CPP_H +#define PUNYCODE_CPP_H + +#include +#include +#include +#include +#include + +#include "utf8.h" + +namespace Url +{ + + namespace Punycode + { + typedef Utf8::codepoint_t punycode_uint; + + const unsigned int BASE = 36; + const unsigned int TMIN = 1; + const unsigned int TMAX = 26; + const unsigned int SKEW = 38; + const unsigned int DAMP = 700; + const unsigned int INITIAL_BIAS = 72; + const unsigned int INITIAL_N = 128; + + // Codepoints to their base-36 value + const std::vector BASIC_TO_DIGIT = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1, + + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, + + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + const std::string DIGIT_TO_BASIC = "abcdefghijklmnopqrstuvwxyz0123456789"; + + // Upper bound used by the overflow checks in encode() and decode(): the numeric_limits maximum, not the highest Unicode codepoint + const punycode_uint MAX_PUNYCODE_UINT = std::numeric_limits::max(); + //Utf8::MAX_CODEPOINT; + //std::numeric_limits::max(); + + /** + * Replace utf-8-encoded str, in place, with its punycode encoding and return a reference to it. + */ + std::string& encode(std::string& str); + + /** + * Create a new punycoded string from utf-8-encoded input. + */ + std::string encode(const std::string& str); + + /** + * Encode a hostname: dot-separated segments containing a byte with the high bit set are punycoded and prefixed with xn--; other segments are copied unchanged. + */ + std::string encodeHostname(const std::string& hostname); + + /** + * Replace punycoded str, in place, with its utf-8 decoding and return a reference to it. + */ + std::string& decode(std::string& str); + + /** + * Create a new utf-8-encoded string from punycoded input. + */ + std::string decode(const std::string& str); + + /** + * Decode a hostname: dot-separated segments starting with xn-- are decoded; other segments are copied unchanged. + */ + std::string decodeHostname(const std::string& hostname); + + /** + * Determine if a string needs punycoding (true if any byte has its high bit set). + */ + bool needsPunycoding(const std::string& str); + + /** + * Internal function for calculating bias. 
+ */ + punycode_uint adapt( + punycode_uint delta, punycode_uint numpoints, bool firsttime); + + }; + +} + +#endif diff --git a/src/repmain.cpp b/src/repmain.cpp new file mode 100644 index 0000000..d8b70d2 --- /dev/null +++ b/src/repmain.cpp @@ -0,0 +1,26 @@ +#include +using namespace Rcpp; + +#include "url.h" +#include "robots.h" + +//' Parse robots.txt +//' +//' @noRd +//' +// [[Rcpp::export]] +SEXP rep_parse(std::string content) { + Rcpp::XPtr ptr(new Rep::Robots(content)); + return(ptr); +} + + +//' Path allowed +//' +//' @noRd +//' +// [[Rcpp::export]] +bool rep_path_allowed(SEXP xp, std::string path, std::string agent = "*") { + Rcpp::XPtr ptr(xp); + return(ptr->allowed(path, agent)); +} diff --git a/src/robots.cpp b/src/robots.cpp new file mode 100644 index 0000000..fb54d6e --- /dev/null +++ b/src/robots.cpp @@ -0,0 +1,188 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "url.h" + +#include "robots.h" +#include + +namespace Rep +{ + + void Robots::strip(std::string& string) + { + string.erase(string.begin(), std::find_if(string.begin(), string.end(), + std::not1(std::ptr_fun(std::isspace)))); + string.erase(std::find_if(string.rbegin(), string.rend(), + std::not1(std::ptr_fun(std::isspace))).base(), string.end()); + } + + bool Robots::getpair(std::istringstream& stream, std::string& key, std::string& value) + { + while (getline(stream, key)) + { + size_t index = key.find('#'); + if (index != std::string::npos) + { + key.resize(index); + } + + // Find the colon and divide it into key and value, skipping malformed lines + index = key.find(':'); + if (index == std::string::npos) + { + continue; + } + + value.assign(key.begin() + index + 1, key.end()); + key.resize(index); + + // Strip whitespace off of each + strip(key); + strip(value); + + // Lowercase the key + std::transform(key.begin(), key.end(), key.begin(), ::tolower); + + return true; + } + return false; + } + + Robots::Robots(const std::string& content): agents_(), 
sitemaps_(), default_(agents_["*"]) + { + std::string agent_name("*"); + std::istringstream input(content); + if (content.compare(0, 3, "\xEF\xBB\xBF") == 0) + { + input.ignore(3); + } + std::string key, value; + std::vector group; + bool last_agent = false; + agent_map_t::iterator current = agents_.find("*"); + while (Robots::getpair(input, key, value)) + { + if (key.compare("user-agent") == 0) + { + // Store the user agent string as lowercased + std::transform(value.begin(), value.end(), value.begin(), ::tolower); + + if (last_agent) + { + group.push_back(value); + } + else + { + if (!agent_name.empty()) + { + for (auto other : group) + { + agents_[other] = current->second; + } + group.clear(); + } + agent_name = value; + current = agents_.emplace(agent_name, Agent()).first; + } + last_agent = true; + continue; + } + else + { + last_agent = false; + } + + if (key.compare("sitemap") == 0) + { + sitemaps_.push_back(value); + } + else if (key.compare("disallow") == 0) + { + current->second.disallow(value); + } + else if (key.compare("allow") == 0) + { + current->second.allow(value); + } + else if (key.compare("crawl-delay") == 0) + { + try + { + current->second.delay(std::stof(value)); + } + catch (const std::exception&) + { + Rcpp::Rcout << "Could not parse " << value << " as float." 
<< std::endl; + } + } + } + + if (!agent_name.empty()) + { + for (auto other : group) + { + agents_[other] = current->second; + } + } + } + + const Agent& Robots::agent(const std::string& name) const + { + // Lowercase the agent + std::string lowered(name); + std::transform(lowered.begin(), lowered.end(), lowered.begin(), ::tolower); + + auto it = agents_.find(lowered); + if (it == agents_.end()) + { + return default_; + } + else + { + return it->second; + } + } + + bool Robots::allowed(const std::string& path, const std::string& name) const + { + return agent(name).allowed(path); + } + + std::string Robots::str() const + { + std::stringstream out; + // TODO: include sitepath info + out << '{'; + auto begin = agents_.begin(); + auto end = agents_.end(); + if (begin != end) + { + out << '"' << begin->first << '"' << ": " << begin->second.str(); + ++begin; + } + for (; begin != end; ++begin) + { + out << ", \"" << begin->first << '"' << ": " << begin->second.str(); + } + out << '}'; + return out.str(); + } + + std::string Robots::robotsUrl(const std::string& url) + { + return Url::Url(url) + .setUserinfo("") + .setPath("robots.txt") + .setParams("") + .setQuery("") + .setFragment("") + .remove_default_port() + .str(); + } +} diff --git a/src/robots.h b/src/robots.h new file mode 100644 index 0000000..56a82c4 --- /dev/null +++ b/src/robots.h @@ -0,0 +1,69 @@ +#ifndef ROBOTS_CPP_H +#define ROBOTS_CPP_H + +#include +#include +#include + +#include "agent.h" + +namespace Rep +{ + + class Robots + { + public: + typedef std::unordered_map agent_map_t; + typedef std::vector sitemaps_t; + + /** + * Create a robots.txt from a utf-8-encoded string. + */ + Robots(const std::string& content); + + /** + * Instantiate a Robots object. 
+ */ + Robots( + const agent_map_t& agents, + const sitemaps_t& sitemaps) + : agents_(agents) + , sitemaps_(sitemaps) + , default_(agents_["*"]) {} + + /** + * Get the sitemaps in this robots.txt + */ + const sitemaps_t& sitemaps() const { return sitemaps_; } + + /** + * Get the agent with the corresponding name. + */ + const Agent& agent(const std::string& name) const; + + /** + * Return true if agent is allowed to fetch the URL (either a + * full URL or a path). + */ + bool allowed(const std::string& path, const std::string& name) const; + + std::string str() const; + + /** + * Return the robots.txt URL corresponding to the provided URL. + */ + static std::string robotsUrl(const std::string& url); + + private: + static void strip(std::string& string); + + static bool getpair( + std::istringstream& stream, std::string& key, std::string& value); + + agent_map_t agents_; + sitemaps_t sitemaps_; + Agent& default_; + }; +} + +#endif diff --git a/src/url.cpp b/src/url.cpp new file mode 100644 index 0000000..900a65e --- /dev/null +++ b/src/url.cpp @@ -0,0 +1,962 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "url.h" +#include "punycode.h" + +namespace Url +{ + + /* Character classes */ + const CharacterClass Url::GEN_DELIMS(":/?#[]@"); + const CharacterClass Url::SUB_DELIMS("!$&'()*+,;="); + const CharacterClass Url::DIGIT("0123456789"); + const CharacterClass Url::ALPHA( + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); + const CharacterClass Url::UNRESERVED( + Url::ALPHA.chars() + Url::DIGIT.chars() + "-._~"); + const CharacterClass Url::RESERVED( + Url::GEN_DELIMS.chars() + Url::SUB_DELIMS.chars()); + const CharacterClass Url::PCHAR( + Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":@"); + const CharacterClass Url::PATH( + Url::PCHAR.chars() + "/"); + const CharacterClass Url::QUERY( + Url::PCHAR.chars() + "/?"); + const CharacterClass Url::FRAGMENT( + Url::PCHAR.chars() + "/?"); + const CharacterClass 
Url::USERINFO( + Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":"); + const CharacterClass Url::HEX("0123456789ABCDEF"); + const CharacterClass Url::SCHEME( + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+-."); + const std::vector Url::HEX_TO_DEC = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, + + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + const std::unordered_map Url::PORTS = { + {"http", 80}, + {"https", 443} + }; + const std::unordered_set Url::USES_RELATIVE = { + "", + "file", + "ftp", + "gopher", + "http", + "https", + "imap", + "mms", + "nntp", + "prospero", + "rtsp", + "rtspu", + "sftp", + "shttp", + "svn", + "svn+ssh", + "wais" + }; + const std::unordered_set Url::USES_NETLOC = { + "", + "file", + "ftp", + "git", + "git+ssh", + "gopher", + "http", + "https", + "imap", + "mms", + "nfs", + "nntp", + "prospero", + "rsync", + "rtsp", + "rtspu", + "sftp", + "shttp", + "snews", + "svn", + "svn+ssh", + "telnet", + "wais" + }; + const std::unordered_set Url::USES_PARAMS = { + "", + 
"ftp", + "hdl", + "http", + "https", + "imap", + "mms", + "prospero", + "rtsp", + "rtspu", + "sftp", + "shttp", + "sip", + "sips", + "tel" + }; + const std::unordered_set Url::KNOWN_PROTOCOLS = { + "", + "file", + "ftp", + "git", + "git+ssh", + "gopher", + "hdl", + "http", + "https", + "imap", + "mms", + "nfs", + "nntp", + "prospero", + "rsync", + "rtsp", + "rtspu", + "sftp", + "shttp", + "sip", + "sips", + "sms", + "snews", + "svn", + "svn+ssh", + "tel", + "telnet", + "wais" + }; + + Url::Url(const std::string& url): port_(0), has_params_(false), has_query_(false) + { + size_t position = 0; + size_t index = url.find(':'); + if (index != std::string::npos) + { + // All the characters in our would-be scheme must be in SCHEME + if (std::all_of( + url.begin(), + url.begin() + index, + [](char c) { return SCHEME(c); } )) + { + // If there is nothing after the : or there are any non-digits, this is + // the scheme + if ((index + 1) >= url.length() + || std::any_of( + url.begin() + index + 1, + url.end(), + [](char c) { return !DIGIT(c); })) + { + scheme_.assign(url, 0, index); + std::transform( + scheme_.begin(), scheme_.end(), scheme_.begin(), ::tolower); + position = index + 1; + } + else + { + scheme_.assign(url, 0, index); + std::transform( + scheme_.begin(), scheme_.end(), scheme_.begin(), ::tolower); + if (KNOWN_PROTOCOLS.find(scheme_) != KNOWN_PROTOCOLS.end()) + { + position = index + 1; + } + else + { + scheme_.clear(); + } + } + } + } + + // Search for the netloc + if ((url.length() - position) >= 1 + && url[position] == '/' + && url[position + 1] == '/') + { + // Skip the '//' + position += 2; + index = url.find_first_of("/?#", position); + host_.assign(url, position, index - position); + position = index; + + // Extract any userinfo if there is any + index = host_.find('@'); + if (index != std::string::npos) + { + userinfo_.assign(host_, 0, index); + host_.assign(host_, index + 1, std::string::npos); + } + + // Lowercase the hostname + 
std::transform(host_.begin(), host_.end(), host_.begin(), ::tolower); + + // Try to find a port + index = host_.find(':'); + if (index != std::string::npos) + { + std::string portText(host_, index + 1, std::string::npos); + host_.resize(index); + + if (portText.empty()) + { + port_ = 0; + } + else + { + try + { + port_ = std::stoi(portText, &index); + + if (index != portText.length()) + { + // Malformed port + throw UrlParseException("Port not a number: " + portText); + } + + if (port_ > 65535) + { + throw UrlParseException("Port too high: " + portText); + } + else if (port_ < 0) + { + throw UrlParseException("Port negative: " + portText); + } + } + catch (const std::invalid_argument&) + { + // Malformed port + throw UrlParseException("Port not a number: " + portText); + } + catch (const std::out_of_range&) + { + throw UrlParseException("Port out of integer range: " + portText); + } + } + } + } + + if (position != std::string::npos) + { + path_.assign(url, position, std::string::npos); + + index = path_.find('#'); + if (index != std::string::npos) + { + fragment_.assign(path_, index + 1, std::string::npos); + path_.resize(index); + } + + index = path_.find('?'); + if (index != std::string::npos) + { + query_.assign(path_, index + 1, std::string::npos); + has_query_ = true; + path_.resize(index); + } + + if (USES_PARAMS.find(scheme_) != USES_PARAMS.end()) + { + index = path_.find(';'); + if (index != std::string::npos) + { + params_.assign(path_, index + 1, std::string::npos); + has_params_ = true; + path_.resize(index); + } + } + } + } + + Url& Url::assign(const Url& other) + { + return (*this) = other; + } + + bool Url::operator==(const Url& other) const + { + return ( + (scheme_ == other.scheme_ ) && + (userinfo_ == other.userinfo_ ) && + (host_ == other.host_ ) && + (port_ == other.port_ ) && + (path_ == other.path_ ) && + (params_ == other.params_ ) && + (query_ == other.query_ ) && + (fragment_ == other.fragment_ ) && + (has_params_ == other.has_params_) && + 
(has_query_ == other.has_query_ ) + ); + } + + bool Url::operator!=(const Url& other) const + { + return !operator==(other); + } + + bool Url::equiv(const Url& other) + { + Url self_(*this); + Url other_(other); + + self_.strip() + .sort_query() + .defrag() + .deuserinfo() + .abspath() + .escape() + .punycode() + .remove_default_port(); + other_.strip() + .sort_query() + .defrag() + .deuserinfo() + .abspath() + .escape() + .punycode() + .remove_default_port(); + return self_ == other_; + } + + std::string& Url::remove_repeats(std::string& str, const char chr) + { + size_t dest = 0; + // By initializing this to true, it also strips off leading instances of chr + bool seen = true; + for (size_t src = 0; src < str.length(); ++src) + { + if (!seen || (str[src] != chr)) + { + str[dest++] = str[src]; + } + seen = str[src] == chr; + } + // Remove the last character if it happens to be chr + size_t length = ((dest > 0) && (str[dest - 1] == chr)) ? dest - 1 : dest; + str.resize(length); + return str; + } + + std::string Url::fullpath() const + { + std::string result; + if (path_.empty() || path_[0] != '/') + { + result.append(1, '/'); + } + result.append(path_); + + if (has_params_) + { + result.append(";"); + result.append(params_); + } + + if (has_query_) + { + result.append("?"); + result.append(query_); + } + + if (!fragment_.empty()) + { + result.append("#"); + result.append(fragment_); + } + return result; + } + + std::string Url::str() const + { + std::string result; + + if (!scheme_.empty()) + { + result.append(scheme_); + if (USES_NETLOC.find(scheme_) == USES_NETLOC.end()) + { + result.append(":"); + } + else + { + result.append("://"); + } + } + else if (!host_.empty()) + { + result.append("//"); + } + + if (!userinfo_.empty()) + { + result.append(userinfo_); + result.append("@"); + } + + if (!host_.empty()) + { + result.append(host_); + } + + if (port_) + { + result.append(":"); + result.append(std::to_string(port_)); + } + + if (path_.empty()) + { + if 
(!result.empty()) + { + result.append("/"); + } + } + else + { + if (!host_.empty() && path_[0] != '/') + { + result.append(1, '/'); + } + result.append(path_); + } + + if (has_params_) + { + result.append(";"); + result.append(params_); + } + + if (has_query_) + { + result.append("?"); + result.append(query_); + } + + if (!fragment_.empty()) + { + result.append("#"); + result.append(fragment_); + } + + return result; + } + + Url& Url::strip() + { + size_t start = query_.find_first_not_of('?'); + if (start != std::string::npos) + { + query_.assign(query_, start, std::string::npos); + } + else + { + query_.assign(""); + } + setQuery(remove_repeats(query_, '&')); + setParams(remove_repeats(params_, ';')); + return *this; + } + + Url& Url::abspath() + { + std::string copy; + std::vector segment_starts; + + if (path_.size() >= 1 && path_[0] == '/') + { + copy.append(1, '/'); + segment_starts.push_back(0); + } + + bool directory = false; + size_t previous = 0; + size_t index = 0; + for (index = path_.find('/') + ; index != std::string::npos + ; previous = index + 1, index = path_.find('/', index + 1)) + { + // Skip empty segments + if (index - previous == 0) + { + continue; + } + + if ((index - previous == 2) + && path_[previous] == '.' + && path_[previous + 1] == '.') + { + if (!segment_starts.empty()) + { + copy.resize(segment_starts.back()); + segment_starts.pop_back(); + } + directory = true; + } + else if ((index - previous == 1) && path_[previous] == '.') + { + directory = true; + } + else + { + segment_starts.push_back(copy.length()); + copy.append(path_, previous, index - previous); + copy.append(1, '/'); + directory = false; + } + } + + // Handle the last segment + index = path_.length(); + if (previous == path_.length()) + { + directory = true; + } + else if ((index - previous == 1) && path_[previous] == '.') + { + directory = true; + } + else if ((index - previous == 2) + && path_[previous] == '.' 
+ && path_[previous + 1] == '.') + { + if (!segment_starts.empty()) + { + copy.resize(segment_starts.back()); + } + directory = true; + } + else + { + copy.append(path_, previous, index - previous); + copy.append(1, '/'); + directory = false; + } + + if (!directory && copy.size() >= 1) + { + copy.resize(copy.size() - 1); + } + else if (directory && copy.empty()) + { + copy.append(1, '/'); + } + path_.assign(copy); + + return *this; + } + + Url& Url::relative_to(const Url& other) + { + // If this scheme does not use relative, return it unchanged + if (USES_RELATIVE.find(scheme_) == USES_RELATIVE.end()) + { + return *this; + } + + // Support scheme-relative URLs + if (scheme_.empty()) + { + scheme_ = other.scheme_; + } + + // If this is an absolute URL (or scheme-relative), return early + if (!host_.empty()) { + return *this; + } + + // If it's not an absolute URL, we need to copy the other host, port and userinfo + host_ = other.host_; + port_ = other.port_; + userinfo_ = other.userinfo_; + + // If the path portion is absolute, then bail out early. + if (!path_.empty() && path_.front() == '/') + { + return *this; + } + + // Otherwise, this is a path that needs to be evaluated relative to the other. If + // there is no '/', then we just keep our current path if it's not empty. 
+ if (path_.empty()) + { + if (params_.empty()) + { + path_ = other.path_; + params_ = other.params_; + has_params_ = other.has_params_; + if (query_.empty()) + { + query_ = other.query_; + has_query_ = other.has_query_; + } + } + else + { + path_.assign(other.path_, 0, other.path_.rfind('/') + 1); + } + + if (fragment_.empty()) + { + fragment_ = other.fragment_; + } + } + else + { + size_t index = other.path_.rfind('/'); + if (index != std::string::npos) + { + path_ = other.path_.substr(0, index + 1) + path_; + } + else if (!host_.empty()) + { + path_ = "/" + path_; + } + } + + return *this; + } + + Url& Url::escape(bool strict) + { + escape(path_, PATH, strict); + escape(query_, QUERY, strict); + escape(params_, QUERY, strict); + escape(userinfo_, USERINFO, strict); + return *this; + } + + std::string& Url::escape(std::string& str, const CharacterClass& safe, bool strict) + { + std::string copy(str); + size_t dest = 0; + // Allocate space pessimistically -- if every entity is expanded, it will take 3x + // the space. + str.resize(str.length() * 3); + for (size_t src = 0; src < copy.length(); ++src) + { + if (copy[src] == '%' && (copy.length() - src) >= 2) + { + // Read ahead to see if there's a valid escape sequence. If not, treat + // this like a normal character. The table is indexed through unsigned char so a byte >= 0x80 cannot become a negative (out-of-range) index where char is signed. + if (HEX_TO_DEC[static_cast<unsigned char>(copy[src+1])] != -1 && HEX_TO_DEC[static_cast<unsigned char>(copy[src+2])] != -1) + { + int value = ( + HEX_TO_DEC[static_cast<unsigned char>(copy[src+1])] * 16 + HEX_TO_DEC[static_cast<unsigned char>(copy[src+2])]); + + // In strict mode, we can only unescape parameters if they are both + // safe and not reserved + if (!strict || (strict && safe(value) && !RESERVED(value))) + { + // Replace src + 2 with that byte, advance src to consume it and + // continue. 
+ src += 2; + copy[src] = value; + } + else + { + str[dest++] = copy[src++]; + str[dest++] = ::toupper(copy[src++]); + str[dest++] = ::toupper(copy[src]); + continue; + } + } + } + + if (!safe(copy[src])) + { + // Not safe -- replace with %XX + str[dest++] = '%'; + str[dest++] = HEX.chars()[(copy[src] >> 4) & 0xF]; + str[dest++] = HEX.chars()[copy[src] & 0xF]; + } + else + { + str[dest++] = copy[src]; + } + } + str.resize(dest); + return str; + } + + Url& Url::unescape() + { + unescape(path_); + unescape(query_); + unescape(params_); + unescape(userinfo_); + return *this; + } + + std::string& Url::unescape(std::string& str) + { + std::string copy(str); + size_t dest = 0; + for (size_t src = 0; src < copy.length(); ++src, ++dest) + { + if (copy[src] == '%' && (copy.length() - src) >= 2) + { + // Read ahead to see if there's a valid escape sequence. If not, treat + // this like a normal character. The table is indexed through unsigned char so a byte >= 0x80 cannot become a negative (out-of-range) index where char is signed. + if (HEX_TO_DEC[static_cast<unsigned char>(copy[src+1])] != -1 && HEX_TO_DEC[static_cast<unsigned char>(copy[src+2])] != -1) + { + int value = ( + HEX_TO_DEC[static_cast<unsigned char>(copy[src+1])] * 16 + HEX_TO_DEC[static_cast<unsigned char>(copy[src+2])]); + + // Replace src + 2 with that byte, advance src to consume it and + // continue. + src += 2; + str[dest] = value; + continue; + } + } + + // Either not a % or an incomplete entity + str[dest] = copy[src]; + } + str.resize(dest); + return str; + } + + Url& Url::deparam(const std::unordered_set& blacklist) + { + // Predicate is if it's present in the blacklist. 
+ auto predicate = [blacklist](std::string& name, const std::string& value) + { + std::transform(name.begin(), name.end(), name.begin(), ::tolower); + return blacklist.find(name) != blacklist.end(); + }; + + setQuery(remove_params(query_, predicate, '&')); + setParams(remove_params(params_, predicate, ';')); + return *this; + } + + Url& Url::deparam(const deparam_predicate& predicate) + { + setQuery(remove_params(query_, predicate, '&')); + setParams(remove_params(params_, predicate, ';')); + return *this; + } + + std::string& Url::remove_params(std::string& str, + const deparam_predicate& predicate, + char sep) + { + std::string copy; + std::string piece; + std::string name; + std::string value; + size_t previous = 0; + for (size_t index = str.find(sep) + ; index != std::string::npos + ; previous = index + 1, index = str.find(sep, previous)) + { + piece.assign(str, previous, index - previous); + size_t position = piece.find('='); + name.assign(piece, 0, position); + value.clear(); + if (position != std::string::npos) + { + value.assign(piece, position + 1, std::string::npos); + } + + if (!predicate(name, value)) + { + copy.append(copy.empty() ? 0 : 1, sep); + copy.append(piece); + } + } + + if (previous < str.length()) + { + piece.assign(str, previous, std::string::npos); + size_t position = piece.find('='); + name.assign(piece, 0, position); + value.clear(); + if (position != std::string::npos) + { + value.assign(piece, position + 1, std::string::npos); + } + + if (!predicate(name, value)) + { + copy.append(copy.empty() ? 
0 : 1, sep); + copy.append(piece); + } + } + + str.assign(copy); + return str; + } + + Url& Url::sort_query() + { + split_sort_join(query_, '&'); + split_sort_join(params_, ';'); + return *this; + } + + std::string& Url::split_sort_join(std::string& str, const char glue) + { + // Return early if empty + if (str.empty()) + { + return str; + } + + // Split + std::vector pieces; + std::stringstream stream(str); + std::string item; + while (getline(stream, item, glue)) + { + pieces.push_back(item); + } + + // Return early if it's just a single element + if (pieces.size() == 1) + { + return str; + } + + // Sort + std::sort(pieces.begin(), pieces.end()); + + // Join (at this point we know that there's at least one element) + std::stringstream output; + for (auto it = pieces.begin(); it != (pieces.end() - 1); ++it) + { + output << *it << glue; + } + output << pieces.back(); + str.assign(output.str()); + return str; + } + + Url& Url::remove_default_port() + { + if (port_ && !scheme_.empty()) + { + auto it = PORTS.find(scheme_); + if (it != PORTS.end() && port_ == it->second) + { + port_ = 0; + } + } + return *this; + } + + Url& Url::deuserinfo() + { + userinfo_.clear(); + return *this; + } + + Url& Url::defrag() + { + fragment_.clear(); + return *this; + } + + Url& Url::punycode() + { + check_hostname(host_); + std::string encoded(Punycode::encodeHostname(host_)); + check_hostname(encoded); + host_ = encoded; + return *this; + } + + Url& Url::unpunycode() + { + host_ = Punycode::decodeHostname(host_); + return *this; + } + + Url& Url::host_reversed() + { + std::reverse(host_.begin(), host_.end()); + for (size_t index = 0, position = 0; index < host_.size(); index = position + 1) + { + position = host_.find('.', index); + if (position == std::string::npos) + { + std::reverse(host_.begin() + index, host_.end()); + break; + } + else + { + std::reverse(host_.begin() + index, host_.begin() + position); + } + } + return *this; + } + + void Url::check_hostname(std::string& host) 
/**
 * A fixed set of characters with constant-time membership tests.
 *
 * The set is built once from a string; operator() reports whether a character
 * belongs to it and chars() returns the string it was built from.
 */
struct CharacterClass
{
    /**
     * Build the class from every character that appears in `chars`.
     */
    CharacterClass(const std::string& chars) : chars_(chars), map_(256, false)
    {
        for (const char c : chars_)
        {
            // Index by the unsigned byte value: a plain char may be signed, and
            // a byte above 0x7F would otherwise land outside the 256 entries.
            map_[static_cast<unsigned char>(c)] = true;
        }
    }

    // Neither default-constructible nor copyable.
    CharacterClass() = delete;
    CharacterClass(const CharacterClass& other) = delete;

    /**
     * Whether `c` is a member of this class.
     */
    bool operator()(char c) const
    {
        return map_[static_cast<unsigned char>(c)];
    }

    /**
     * The characters this class was constructed from, in their original order.
     */
    const std::string& chars() const
    {
        return chars_;
    }

private:
    std::string chars_;
    std::vector<bool> map_;
};
const static CharacterClass USERINFO; + const static CharacterClass HEX; + const static CharacterClass SCHEME; + const static std::vector HEX_TO_DEC; + const static std::unordered_map PORTS; + const static std::unordered_set USES_RELATIVE; + const static std::unordered_set USES_NETLOC; + const static std::unordered_set USES_PARAMS; + const static std::unordered_set KNOWN_PROTOCOLS; + + // The type of the predicate used for removing parameters + typedef std::function deparam_predicate; + + explicit Url(const std::string& url); + + Url(const Url& other) + : scheme_(other.scheme_) + , host_(other.host_) + , port_(other.port_) + , path_(other.path_) + , params_(other.params_) + , query_(other.query_) + , fragment_(other.fragment_) + , userinfo_(other.userinfo_) + , has_params_(other.has_params_) + , has_query_(other.has_query_) { } + + /** + * Take on the value of the other URL. + */ + Url& assign(const Url& other); + + /** + * To be considered equal, all fields must be equal. + */ + bool operator==(const Url& other) const; + bool operator!=(const Url& other) const; + + /** + * Two URLs are considered equivalent if they have the same meaning. + */ + bool equiv(const Url& other); + + /************************************** + * Component-wise access and setting. 
* + **************************************/ + const std::string& scheme() const { return scheme_; } + Url& setScheme(const std::string& s) + { + scheme_ = s; + return *this; + } + + const std::string& host() const { return host_; } + Url& setHost(const std::string& s) + { + host_ = s; + return *this; + } + + const int port() const { return port_; } + Url& setPort(int i) + { + port_ = i; + return *this; + } + + const std::string& path() const { return path_; } + Url& setPath(const std::string& s) + { + path_ = s; + return *this; + } + + const std::string& params() const { return params_; } + Url& setParams(const std::string& s) + { + params_ = s; + has_params_ = !s.empty(); + return *this; + } + + const std::string& query() const { return query_; } + Url& setQuery(const std::string& s) + { + query_ = s; + has_query_ = !s.empty(); + return *this; + } + + const std::string& fragment() const { return fragment_; } + Url& setFragment(const std::string& s) + { + fragment_ = s; + return *this; + } + + const std::string& userinfo() const { return userinfo_; } + Url& setUserinfo(const std::string& s) + { + userinfo_ = s; + return *this; + } + + /** + * Get a representation of all components of the path, params, query, fragment. + * + * Always includes a leading /. + */ + std::string fullpath() const; + + /** + * Get a new string representation of the URL. + **/ + std::string str() const; + + /********************* + * Chainable methods * + *********************/ + + /** + * Strip semantically meaningless excess '?', '&', and ';' characters from query + * and params. + */ + Url& strip(); + + /** + * Make the path absolute. + * + * Evaluate '.', '..', and excessive slashes. + */ + Url& abspath(); + + /** + * Evaluate this URL relative fo `other`, placing the result in this object. + */ + Url& relative_to(const std::string& other) + { + return relative_to(Url(other)); + } + + /** + * Evaluate this URL relative fo `other`, placing the result in this object. 
+ */ + Url& relative_to(const Url& other); + + /** + * Ensure that the path, params, query, and userinfo are properly escaped. + * + * In 'strict' mode, only entities that are both safe and not reserved characters + * are unescaped. In non-strict mode, entities that are safe are unescaped. + */ + Url& escape(bool strict=false); + + /** + * Unescape all entities in the path, params, query, and userinfo. + */ + Url& unescape(); + + /** + * Remove any params or queries that appear in the blacklist. + * + * The blacklist should contain only lowercased strings, and the comparison is + * done in a case-insensitive way. + */ + Url& deparam(const std::unordered_set& blacklist); + + /** + * Filter params subject to a predicate for whether it should be filtered. + * + * The predicate must accept two string refs -- the key and value (which may be + * empty). Return `true` if the parameter should be removed, and `false` + * otherwise. + */ + Url& deparam(const deparam_predicate& predicate); + + /** + * Put queries and params in sorted order. + * + * To ensure consistent comparisons, escape should be called beforehand. + */ + Url& sort_query(); + + /** + * Remove the port if it's the default for the scheme. + */ + Url& remove_default_port(); + + /** + * Remove the userinfo portion. + */ + Url& deuserinfo(); + + /** + * Remove the fragment. + */ + Url& defrag(); + + /** + * Punycode the hostname. + */ + Url& punycode(); + + /** + * Unpunycode the hostname. + */ + Url& unpunycode(); + + /** + * Reverse the hostname (a.b.c.d => d.c.b.a) + */ + Url& host_reversed(); + + private: + // Private, unimplemented to prevent use. + Url(); + + /** + * Remove repeated, leading, and trailing instances of chr from the string. 
+ */ + std::string& remove_repeats(std::string& str, const char chr); + + /** + * Ensure all the provided characters are escaped if necessary + */ + std::string& escape(std::string& str, const CharacterClass& safe, bool strict); + + /** + * Unescape entities in the provided string + */ + std::string& unescape(std::string& str); + + /** + * Remove any params that match entries in the blacklist. + */ + std::string& remove_params( + std::string& str, const deparam_predicate& pred, char sep); + + /** + * Split the provided string by char, sort, join by char. + */ + std::string& split_sort_join(std::string& str, const char glue); + + /** + * Check that the hostname is valid, removing an optional trailing '.'. + */ + void check_hostname(std::string& host); + + std::string scheme_; + std::string host_; + int port_; + std::string path_; + std::string params_; + std::string query_; + std::string fragment_; + std::string userinfo_; + bool has_params_; + bool has_query_; + }; + +} + +#endif diff --git a/src/utf8.cpp b/src/utf8.cpp new file mode 100644 index 0000000..3502377 --- /dev/null +++ b/src/utf8.cpp @@ -0,0 +1,150 @@ +#include +#include +#include + +#include "utf8.h" + +namespace Url +{ + + Utf8::codepoint_t Utf8::readCodepoint( + std::string::const_iterator& it, const std::string::const_iterator& end) + { + Utf8::char_t current = static_cast(*it++); + if (current & 0x80) + { + // Number of additional bytes needed + unsigned int bytes = 0; + // The accumulated value + Utf8::codepoint_t result = 0; + if (current < 0xC0) + { + // Invalid sequence + throw std::invalid_argument("Low UTF-8 start byte"); + } + else if (current < 0xE0) + { + // One additional byte, two bytes total, use 5 bits + bytes = 1; + result = current & 0x1F; + } + else if (current < 0xF0) + { + // Two additional bytes, three bytes total, use 4 bits + bytes = 2; + result = current & 0x0F; + } + else if (current < 0xF8) + { + // Three additional bytes, four bytes total, use 3 bits + bytes = 3; + result = 
current & 0x07; + } + else + { + throw std::invalid_argument("High UTF-8 start byte"); + } + + for (; bytes > 0; --bytes) { + if (it == end) + { + throw std::invalid_argument("UTF-8 sequence terminated early."); + } + + current = static_cast(*it++); + // Ensure the first two bits are 10 + if ((current & 0xC0) != 0x80) + { + throw std::invalid_argument("Invalid continuation byte"); + } + result = (result << 6) | (current & 0x3F); + } + + return result; + } + else + { + return current; + } + } + + std::string& Utf8::writeCodepoint(std::string& str, Utf8::codepoint_t value) + { + if (value > MAX_CODEPOINT) + { + throw std::invalid_argument("Code point too high."); + } + else if (value <= 0x007F) + { + // Just append the character itself + str.append(1, static_cast(value)); + return str; + } + + unsigned int bytes = 0; + if (value > 0xFFFF) + { + /** + * 11110xxx + 3 bytes for 21 bits total + * + * We need to take bits 20-18, which 0x1C0000 masks out. These form the least + * significant bits of this byte (so we shift them back down by 18). The 5 + * most significant bits of this byte are 11110, so we OR this result with + * 0xF0 to get this first byte. + * + * The remaining bits will be consumed from the most-significant end and so + * they must be shifted up by (32 - 18) = 14. + */ + str.append(1, static_cast(((value & 0x1C0000) >> 18) | 0xF0)); + bytes = 3; + value <<= 14; + } + else if (value > 0x07FF) + { + /** + * 1110xxxx + 2 bytes for 16 bits total + * + * We need to take bits 15-12, which 0xF000 masks out. These form the least + * significant bits of this byte (so we shift them back down by 12). The 4 + * most significant bits of this byte are 1110, so we OR this result with + * 0xE0 to get this first byte. + * + * The remaining bits will be consumed from the most-significant end and so + * they must be shifted up by (32 - 12) = 20. 
+ */ + str.append(1, static_cast(((value & 0xF000) >> 12) | 0xE0)); + bytes = 2; + value <<= 20; + } + else + { + /** + * 110xxxxx + 1 byte for 11 bits total + * + * We need to take bits 10-6, which 0x7C0 masks out. These form the least + * significant bits of this byte (so we shift them back down by 6). The 3 + * most significant bits of this byte are 110, so we OR this result with + * 0xC0 to get this first byte. + * + * The remaining bits will be consumed from the most-significant end and so + * they must be shifted up by (32 - 6) = 26. + */ + str.append(1, static_cast(((value & 0x7C0) >> 6) | 0xC0)); + bytes = 1; + value <<= 26; + } + + /** + * The remaining bits are to be consumed 6 at a time from the most-significant + * end. The mask 0xFC000000 grabs these six bits, which then must be shifted down + * by 26, and OR'd with 0x80 to produce the continuation byte. + */ + for (; bytes > 0; --bytes, value <<= 6) + { + str.append(1, static_cast(((value & 0xFC000000) >> 26) | 0x80)); + } + + return str; + } + +}; diff --git a/src/utf8.h b/src/utf8.h new file mode 100644 index 0000000..b677ce8 --- /dev/null +++ b/src/utf8.h @@ -0,0 +1,91 @@ +#ifndef UTF8_CPP_H +#define UTF8_CPP_H + +#include +#include +#include + +namespace Url +{ + + /** + * Work between unicode code points and their UTF-8-encoded representation. + */ + struct Utf8 + { + /** + * The type we use to represent Unicode codepoints. + */ + typedef uint32_t codepoint_t; + + /** + * The type we use when talking about the integral value of bytes. + */ + typedef unsigned char char_t; + + /** + * The highest allowed codepoint. + */ + static const codepoint_t MAX_CODEPOINT = 0x10FFFF; + + /** + * Consume up to the last byte of the sequence, returning the codepoint. + */ + static codepoint_t readCodepoint( + std::string::const_iterator& it, const std::string::const_iterator& end); + + /** + * Write a codepoint to the provided string. 
+ */ + static std::string& writeCodepoint(std::string& str, codepoint_t value); + + /** + * Return the first codepoint stored in the provided string. + */ + static codepoint_t toCodepoint(const std::string& str) + { + auto it = str.begin(); + return readCodepoint(it, str.end()); + } + + /** + * Get a string with the provided codepoint. + */ + static std::string fromCodepoint(codepoint_t value) + { + std::string str; + writeCodepoint(str, value); + return str; + } + + /** + * Return all the codepoints in the string. + */ + static std::vector toCodepoints(const std::string& str) + { + std::vector result; + for (auto it = str.begin(); it != str.end(); ) + { + result.push_back(readCodepoint(it, str.end())); + } + return result; + } + + /** + * Create a string from a vector of codepoints. + */ + static std::string fromCodepoints(const std::vector& points) + { + std::string result; + for (auto it = points.begin(); it != points.end(); ++it) + { + writeCodepoint(result, *it); + } + return result; + } + + }; + +} + +#endif diff --git a/tests/test-all.R b/tests/test-all.R new file mode 100644 index 0000000..0f20a7f --- /dev/null +++ b/tests/test-all.R @@ -0,0 +1,3 @@ +library(testthat) +library(robotstxt) +test_check("rep") diff --git a/tests/testthat/test-rep.R b/tests/testthat/test-rep.R new file mode 100644 index 0000000..cb2771f --- /dev/null +++ b/tests/testthat/test-rep.R @@ -0,0 +1,11 @@ +context("basic functionality") +test_that("we can do something", { + + rt <- robxp(robotstxt::get_robotstxt("https://cdc.gov")) + + expect_that(rt, is_a("robxp")) + + expect_that(can_fetch(rt, "/asthma/asthma_stats/default.htm", "*"), equals(TRUE)) + expect_that(can_fetch(rt, "/_borders", "*"), equals(FALSE)) + +})