commit 878bb7f0455c1afdd2a9ef213da5d630f2c3b797 Author: Bob Rudis Date: Mon Aug 14 15:00:27 2017 -0400 initial commit diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..edf18c6 --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,11 @@ +^.*\.Rproj$ +^\.Rproj\.user$ +^\.travis\.yml$ +^README\.*Rmd$ +^README\.*html$ +^NOTES\.*Rmd$ +^NOTES\.*html$ +^\.codecov\.yml$ +^README_files$ +^doc$ +^CONDUCT\.md$ diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 0000000..69cb760 --- /dev/null +++ b/.codecov.yml @@ -0,0 +1 @@ +comment: false diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cce1f17 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +.DS_Store +.Rproj.user +.Rhistory +.RData +.Rproj +src/*.o +src/*.so +src/*.dll diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..76d9586 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,31 @@ +language: r + +warnings_are_errors: true + +sudo: required + +cache: packages + +r: + - oldrel + - release + - devel + +apt_packages: + - libv8-dev + - xclip + +env: + global: + - CRAN: http://cran.rstudio.com + +after_success: + - Rscript -e 'covr::codecov()' + +notifications: + email: + - bob@rud.is + irc: + channels: + - "104.236.112.222#builds" + nick: travisci diff --git a/CONDUCT.md b/CONDUCT.md new file mode 100644 index 0000000..52a673e --- /dev/null +++ b/CONDUCT.md @@ -0,0 +1,25 @@ +# Contributor Code of Conduct + +As contributors and maintainers of this project, we pledge to respect all people who +contribute through reporting issues, posting feature requests, updating documentation, +submitting pull requests or patches, and other activities. + +We are committed to making participation in this project a harassment-free experience for +everyone, regardless of level of experience, gender, gender identity and expression, +sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. 
+ +Examples of unacceptable behavior by participants include the use of sexual language or +imagery, derogatory comments or personal attacks, trolling, public or private harassment, +insults, or other unprofessional conduct. + +Project maintainers have the right and responsibility to remove, edit, or reject comments, +commits, code, wiki edits, issues, and other contributions that are not aligned to this +Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed +from the project team. + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by +opening an issue or contacting one or more of the project maintainers. + +This Code of Conduct is adapted from the Contributor Covenant +(http://contributor-covenant.org), version 1.0.0, available at +http://contributor-covenant.org/version/1/0/0/ diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..d505f19 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,27 @@ +Package: rep +Type: Package +Title: Tools to Parse and Test Robots Exclusion Protocol Files and Rules +Version: 0.1.0 +Date: 2017-08-14 +Author: Bob Rudis (bob@rud.is) [aut, cre], SEOmoz, Inc [aut] +Maintainer: Bob Rudis +Description: The 'Robots Exclusion Protocol' documents + a set of standards for allowing or excluding robot/spider crawling of different areas of + site content. Tools are provided which wrap the 'rep-cpp' + C++ library for processing these 'robots.txt' files. 
+SystemRequirements: C++11 +NeedsCompilation: yes +URL: https://github.com/hrbrmstr/rep +BugReports: https://github.com/hrbrmstr/rep/issues +License: MIT + file LICENSE +Suggests: + testthat, + covr, + robotstxt +Depends: + R (>= 3.2.0) +Imports: + purrr, + Rcpp +RoxygenNote: 6.0.1 +LinkingTo: Rcpp diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..48ed424 --- /dev/null +++ b/LICENSE @@ -0,0 +1,2 @@ +YEAR: 2017 +COPYRIGHT HOLDER: Bob Rudis diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..19131d4 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,7 @@ +# Generated by roxygen2: do not edit by hand + +S3method(print,robxp) +export(can_fetch) +export(robxp) +importFrom(Rcpp,sourceCpp) +useDynLib(rep, .registration=TRUE) diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..9b4679b --- /dev/null +++ b/NEWS.md @@ -0,0 +1,2 @@ +0.1.0 +* Initial release diff --git a/R/RcppExports.R b/R/RcppExports.R new file mode 100644 index 0000000..dc22683 --- /dev/null +++ b/R/RcppExports.R @@ -0,0 +1,19 @@ +# Generated by using Rcpp::compileAttributes() -> do not edit by hand +# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#' Parse robots.txt +#' +#' @noRd +#' +rep_parse <- function(content) { + .Call(`_rep_rep_parse`, content) +} + +#' Path allowed +#' +#' @noRd +#' +rep_path_allowed <- function(xp, path, agent = "*") { + .Call(`_rep_rep_path_allowed`, xp, path, agent) +} + diff --git a/R/rep-package.R b/R/rep-package.R new file mode 100644 index 0000000..d7c5b44 --- /dev/null +++ b/R/rep-package.R @@ -0,0 +1,14 @@ +#' Tools to Parse and Test Robots Exclusion Protocol Files and Rules +#' +#' The 'Robots Exclusion Protocol' documents a set +#' of standards for allowing or excluding robot/spider crawling of different areas of +#' site content. Tools are provided which wrap the 'rep-cpp' +#' C++ library for processing these 'robots.txt' files. 
+#' +#' @md +#' @name rep +#' @docType package +#' @author Bob Rudis (bob@@rud.is) +#' @useDynLib rep, .registration=TRUE +#' @importFrom Rcpp sourceCpp +NULL \ No newline at end of file diff --git a/R/rep.r b/R/rep.r new file mode 100644 index 0000000..490dbe7 --- /dev/null +++ b/R/rep.r @@ -0,0 +1,47 @@ +#' Create a robots.txt object +#' +#' @param x atomic character vector containing a complete robots.txt file +#' @export +#' @examples +#' library(robotstxt) +#' can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") # TRUE +#' can_fetch(rt, "/_borders", "*") # FALSE +robxp <- function(x) { + + robxp <- rep_parse(x) + class(robxp) <- c("robxp") + + robxp + +} + +#' Test URL path against robots.txt +#' +#' @md +#' @param obj `robxp` object +#' @param path path to test +#' @param user_agent user agent to test +#' @export +#' @examples +#' library(robotstxt) +#' can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") # TRUE +#' can_fetch(rt, "/_borders", "*") # FALSE +can_fetch <- function(obj, path="/", user_agent="*") { + + if (inherits(obj, "robxp")) { + rep_path_allowed(obj, path, user_agent) + } else { + return(NULL) + } + +} + +#' Custom printer for 'robxp' objects +#' +#' @md +#' @param x object to print +#' @param ... unused +#' @export +print.robxp <- function(x, ...) { + cat("") +} \ No newline at end of file diff --git a/README.Rmd b/README.Rmd new file mode 100644 index 0000000..233007e --- /dev/null +++ b/README.Rmd @@ -0,0 +1,58 @@ +--- +output: rmarkdown::github_document +--- + +`rep` : Tools to Parse and Test Robots Exclusion Protocol Files and Rules + +The 'Robots Exclusion Protocol' documents a set of standards for allowing or excluding robot/spider crawling of different areas of site content. Tools are provided which wrap the 'rep-cpp' C++ library for processing these 'robots.txt' files. 
+ +- [`rep-cpp`](https://github.com/seomoz/rep-cpp) +- [`url-cpp`](https://github.com/seomoz/url-cpp) + +The following functions are implemented: + +- `robxp`: Create a robots.txt object +- `can_fetch`: Test URL path against robots.txt + +### Installation + +```{r eval=FALSE} +devtools::install_github("hrbrmstr/rep") +``` + +```{r message=FALSE, warning=FALSE, error=FALSE, include=FALSE} +options(width=120) +``` + +### Usage + +```{r message=FALSE, warning=FALSE, error=FALSE} +library(rep) +library(robotstxt) + +# current version +packageVersion("rep") + +rt <- robxp(get_robotstxt("https://cdc.gov")) + +print(rt) + +can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") + +can_fetch(rt, "/_borders", "*") +``` + +### Test Results + +```{r message=FALSE, warning=FALSE, error=FALSE} +library(rep) +library(testthat) + +date() + +test_dir("tests/") +``` + +### Code of Conduct + +Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..b8fb215 --- /dev/null +++ b/README.md @@ -0,0 +1,74 @@ + +`rep` : Tools to Parse and Test Robots Exclusion Protocol Files and Rules + +The 'Robots Exclusion Protocol' documents a set of standards for allowing or excluding robot/spider crawling of different areas of site content. Tools are provided which wrap the 'rep-cpp' C++ library for processing these 'robots.txt' files. 
+ +- [`rep-cpp`](https://github.com/seomoz/rep-cpp) +- [`url-cpp`](https://github.com/seomoz/url-cpp) + +The following functions are implemented: + +- `robxp`: Create a robots.txt object +- `can_fetch`: Test URL path against robots.txt + +### Installation + +``` r +devtools::install_github("hrbrmstr/rep") +``` + +### Usage + +``` r +library(rep) +library(robotstxt) + +# current verison +packageVersion("rep") +``` + + ## [1] '0.1.0' + +``` r +rt <- robxp(get_robotstxt("https://cdc.gov")) + +print(rt) +``` + + ## + +``` r +can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") +``` + + ## [1] TRUE + +``` r +can_fetch(rt, "/_borders", "*") +``` + + ## [1] FALSE + +### Test Results + +``` r +library(rep) +library(testthat) + +date() +``` + + ## [1] "Mon Aug 14 15:00:16 2017" + +``` r +test_dir("tests/") +``` + + ## testthat results ======================================================================================================== + ## OK: 3 SKIPPED: 0 FAILED: 0 + ## + ## DONE =================================================================================================================== + +### Code of Conduct + +Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms. 
diff --git a/man/can_fetch.Rd b/man/can_fetch.Rd new file mode 100644 index 0000000..e440838 --- /dev/null +++ b/man/can_fetch.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rep.r +\name{can_fetch} +\alias{can_fetch} +\title{Test URL path against robots.txt} +\usage{ +can_fetch(obj, path = "/", user_agent = "*") +} +\arguments{ +\item{obj}{\code{robxp} object} + +\item{path}{path to test} + +\item{user_agent}{user agent to test} +} +\description{ +Test URL path against robots.txt +} +\examples{ +library(robotstxt) +can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") # TRUE +can_fetch(rt, "/_borders", "*") # FALSE +} diff --git a/man/print.robxp.Rd b/man/print.robxp.Rd new file mode 100644 index 0000000..9138ecc --- /dev/null +++ b/man/print.robxp.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rep.r +\name{print.robxp} +\alias{print.robxp} +\title{Custom printer for 'robexp' objects} +\usage{ +\method{print}{robxp}(x, ...) +} +\arguments{ +\item{x}{object to print} + +\item{...}{unused} +} +\description{ +Custom printer for 'robexp' objects +} diff --git a/man/rep.Rd b/man/rep.Rd new file mode 100644 index 0000000..689f0d5 --- /dev/null +++ b/man/rep.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rep-package.R +\docType{package} +\name{rep} +\alias{rep} +\alias{rep-package} +\title{Tools to Parse and Test Robots Exclusion Protocol Files and Rules} +\description{ +The 'Robots Exclusion Protocol' \url{http://www.robotstxt.org/orig.html} documents a set +of standards for allowing or excluding robot/spider crawling of different areas of +site content. Tools are provided which wrap The 'rep-cpp` \url{https://github.com/seomoz/rep-cpp} +C++ library for processing these 'robots.txt' files. 
+} +\author{ +Bob Rudis (bob@rud.is) +} diff --git a/man/robxp.Rd b/man/robxp.Rd new file mode 100644 index 0000000..edc787f --- /dev/null +++ b/man/robxp.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rep.r +\name{robxp} +\alias{robxp} +\title{Create a robots.txt object} +\usage{ +robxp(x) +} +\arguments{ +\item{x}{atomic character vector containing a complete robots.txt file} +} +\description{ +Create a robots.txt object +} +\examples{ +library(robotstxt) +can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") # TRUE +can_fetch(rt, "/_borders", "*") # FALSE +} diff --git a/rep.Rproj b/rep.Rproj new file mode 100644 index 0000000..446d9e1 --- /dev/null +++ b/rep.Rproj @@ -0,0 +1,21 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +StripTrailingWhitespace: Yes + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source +PackageBuildArgs: --resave-data +PackageRoxygenize: rd,collate,namespace diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000..22034c4 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,3 @@ +*.o +*.so +*.dll diff --git a/src/Makevars b/src/Makevars new file mode 100644 index 0000000..a231a44 --- /dev/null +++ b/src/Makevars @@ -0,0 +1,3 @@ +CXX_STD = CXX11 +PKG_CXXFLAGS = +PKG_LIBS = -L. 
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp new file mode 100644 index 0000000..613134a --- /dev/null +++ b/src/RcppExports.cpp @@ -0,0 +1,42 @@ +// Generated by using Rcpp::compileAttributes() -> do not edit by hand +// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#include + +using namespace Rcpp; + +// rep_parse +SEXP rep_parse(std::string content); +RcppExport SEXP _rep_rep_parse(SEXP contentSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< std::string >::type content(contentSEXP); + rcpp_result_gen = Rcpp::wrap(rep_parse(content)); + return rcpp_result_gen; +END_RCPP +} +// rep_path_allowed +bool rep_path_allowed(SEXP xp, std::string path, std::string agent); +RcppExport SEXP _rep_rep_path_allowed(SEXP xpSEXP, SEXP pathSEXP, SEXP agentSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type xp(xpSEXP); + Rcpp::traits::input_parameter< std::string >::type path(pathSEXP); + Rcpp::traits::input_parameter< std::string >::type agent(agentSEXP); + rcpp_result_gen = Rcpp::wrap(rep_path_allowed(xp, path, agent)); + return rcpp_result_gen; +END_RCPP +} + +static const R_CallMethodDef CallEntries[] = { + {"_rep_rep_parse", (DL_FUNC) &_rep_rep_parse, 1}, + {"_rep_rep_path_allowed", (DL_FUNC) &_rep_rep_path_allowed, 3}, + {NULL, NULL, 0} +}; + +RcppExport void R_init_rep(DllInfo *dll) { + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +} diff --git a/src/agent.cpp b/src/agent.cpp new file mode 100644 index 0000000..b91cbf0 --- /dev/null +++ b/src/agent.cpp @@ -0,0 +1,87 @@ +#include +#include + +#include "url.h" + +#include "agent.h" +#include "directive.h" + +namespace Rep +{ + Agent& Agent::allow(const std::string& query) + { + directives_.push_back(Directive(escape(query), true)); + sorted_ = false; + return *this; + } + + Agent& Agent::disallow(const 
std::string& query) + { + if (query.empty()) + { + // Special case: "Disallow:" means "Allow: /" + directives_.push_back(Directive(query, true)); + } + else + { + directives_.push_back(Directive(escape(query), false)); + } + sorted_ = false; + return *this; + } + + const std::vector& Agent::directives() const + { + if (!sorted_) + { + std::sort(directives_.begin(), directives_.end(), [](const Directive& a, const Directive& b) { + return b.priority() < a.priority(); + }); + sorted_ = true; + } + return directives_; + } + + bool Agent::allowed(const std::string& query) const + { + std::string path(escape(query)); + + if (path.compare("/robots.txt") == 0) + { + return true; + } + + for (auto directive : directives()) + { + if (directive.match(path)) + { + return directive.allowed(); + } + } + return true; + } + + std::string Agent::str() const + { + std::stringstream out; + out << '['; + auto begin = directives().begin(); + auto end = directives().end(); + if (begin != end) + { + out << "Directive(" << begin->str() << ')'; + ++begin; + } + for (; begin != end; ++begin) + { + out << ", Directive(" << begin->str() << ')'; + } + out << ']'; + return out.str(); + } + + std::string Agent::escape(const std::string& query) + { + return Url::Url(query).defrag().escape().fullpath(); + } +} diff --git a/src/agent.h b/src/agent.h new file mode 100644 index 0000000..a30dd47 --- /dev/null +++ b/src/agent.h @@ -0,0 +1,70 @@ +#ifndef AGENT_CPP_H +#define AGENT_CPP_H + +#include + +#include "directive.h" + + +namespace Rep +{ + + class Agent + { + public: + /* The type for the delay. */ + typedef float delay_t; + + /** + * Construct an agent. + */ + Agent(): directives_(), delay_(-1.0), sorted_(true) {} + + /** + * Add an allowed directive. + */ + Agent& allow(const std::string& query); + + /** + * Add a disallowed directive. + */ + Agent& disallow(const std::string& query); + + /** + * Set the delay for this agent. 
+ */ + Agent& delay(delay_t value) { + delay_ = value; + return *this; + } + + /** + * Return the delay for this agent. + */ + delay_t delay() const { return delay_; } + + /** + * A vector of the directives, in priority-sorted order. + */ + const std::vector& directives() const; + + /** + * Return true if the URL (either a full URL or a path) is allowed. + */ + bool allowed(const std::string& path) const; + + std::string str() const; + + /** + * Canonically escape the provided query for matching purposes. + */ + static std::string escape(const std::string& query); + + private: + mutable std::vector directives_; + delay_t delay_; + mutable bool sorted_; + }; +} + +#endif diff --git a/src/directive.cpp b/src/directive.cpp new file mode 100644 index 0000000..21376b7 --- /dev/null +++ b/src/directive.cpp @@ -0,0 +1,130 @@ +#include +#include +#include +#include + +#include "url.h" + +#include "directive.h" + +namespace Rep +{ + Directive::Directive(const std::string& line, bool allowed) + : expression_() + , priority_(line.size()) + , allowed_(allowed) + { + if (line.find('*') == std::string::npos) + { + expression_.assign(line); + return; + } + + // Remove consecutive '*'s + expression_.reserve(line.size()); + bool star = false; + for (auto character : line) + { + if (character == '*') + { + if (!star) + { + expression_.append(1, character); + } + star = true; + } + else + { + expression_.append(1, character); + star = false; + } + } + + // Remove trailing '*'s + std::string::reverse_iterator last = + std::find_if(expression_.rbegin(), expression_.rend(), + [](const char c) { + return c != '*'; + }); + expression_.erase(last.base(), expression_.end()); + + // Priority is the length of the expression + priority_ = expression_.size(); + } + + bool Directive::match(const std::string::const_iterator& e_begin, + const std::string::const_iterator& e_end, + const std::string::const_iterator& p_begin, + const std::string::const_iterator& p_end) const + { + 
std::string::const_iterator expression_it = e_begin; + std::string::const_iterator path_it = p_begin; + while (expression_it != e_end && path_it != p_end) + { + if (*expression_it == '*') + { + // Advance and recurse + ++expression_it; + for (; path_it != p_end; ++path_it) + { + if (match(expression_it, e_end, path_it, p_end)) + { + return true; + } + } + return false; + } + else if (*expression_it == '$') + { + // This check expects path to be fully consumed. But since one of the + // criteria of being in this while loop is that we've not fully consumed + // path, return false. + return false; + } + else if (*expression_it != *path_it) + { + // These characters must match + return false; + } + else + { + // Advance both by one + ++path_it; + ++expression_it; + } + } + + // Return true only if we've consumed all of the expression + if (expression_it == e_end) + { + return true; + } + else if (*expression_it == '$') + { + return path_it == p_end; + } + else + { + return false; + } + } + + std::string Directive::str() const + { + std::stringstream out; + if (allowed_) + { + out << "Allow: " << expression_; + } + else { + out << "Disallow: " << expression_; + } + return out.str(); + } + + bool Directive::match(const std::string& path) const + { + return match(expression_.begin(), expression_.end(), path.begin(), path.end()); + } + +} diff --git a/src/directive.h b/src/directive.h new file mode 100644 index 0000000..0c2743f --- /dev/null +++ b/src/directive.h @@ -0,0 +1,67 @@ +#ifndef DIRECTIVE_CPP_H +#define DIRECTIVE_CPP_H + + +namespace Rep +{ + + class Directive + { + public: + /** + * The type of our priority value. + */ + typedef size_t priority_t; + + /** + * Default constructor disallowed. + */ + Directive() = delete; + + /** + * The input to this constructor must be stripped of comments and trailing + * whitespace. + */ + Directive(const std::string& line, bool allowed); + + /** + * The priority of the rule. 
+ */ + priority_t priority() const + { + return priority_; + } + + /** + * Whether or not the provided path matches. The path is expected to be properly + * escaped. + */ + bool match(const std::string& path) const; + + /** + * Whether this rule is for an allow or a disallow. + */ + bool allowed() const + { + return allowed_; + } + + std::string str() const; + + private: + std::string expression_; + priority_t priority_; + bool allowed_; + + /** + * Return true if p_begin -> p_end matches the expression e_begin -> e_end. + */ + bool match(const std::string::const_iterator& e_begin, + const std::string::const_iterator& e_end, + const std::string::const_iterator& p_begin, + const std::string::const_iterator& p_end) const; + }; + +} + +#endif diff --git a/src/psl.cpp b/src/psl.cpp new file mode 100644 index 0000000..c078d21 --- /dev/null +++ b/src/psl.cpp @@ -0,0 +1,183 @@ +#include +#include +#include +#include + +#include "psl.h" +#include "punycode.h" + +namespace Url +{ + const std::string PSL::not_found = ""; + + PSL::PSL(std::istream& stream) + { + std::string line; + while (std::getline(stream, line)) + { + // Only take up to the first whitespace. + auto it = std::find_if(line.begin(), line.end(), ::isspace); + line.resize(it - line.begin()); + + // Skip blank lines + if (line.empty()) + { + continue; + } + + // Skip comments + if (line.compare(0, 2, "//") == 0) + { + continue; + } + + // We know the line has at least a single character at this point + if (line[0] == '*') + { + // Line is a wildcard rule + if (line.size() <= 2 || line[1] != '.') + { + throw std::invalid_argument("Wildcard rule must be of form *."); + } + + add(line, 1, 2); + } + else if (line[0] == '!') + { + // Line is an exception, take all but the ! 
+ if (line.size() <= 1) + { + throw std::invalid_argument("Exception rule has no hostname."); + } + + add(line, -1, 1); + } + else + { + add(line, 0, 0); + } + } + } + + PSL PSL::fromPath(const std::string& path) + { + std::ifstream stream(path); + if (!stream.good()) + { + std::stringstream message; + message << "Path '" << path << "' inaccessible."; + throw std::invalid_argument(message.str()); + } + return PSL(stream); + } + + PSL PSL::fromString(const std::string& str) + { + std::stringstream stream(str); + return PSL(stream); + } + + std::string PSL::getTLD(const std::string& hostname) const + { + return getLastSegments(hostname, getTLDLength(hostname)); + } + + std::string PSL::getPLD(const std::string& hostname) const + { + return getLastSegments(hostname, getTLDLength(hostname) + 1); + } + + std::pair PSL::getBoth(const std::string& hostname) const + { + size_t length = getTLDLength(hostname); + return std::make_pair( + getLastSegments(hostname, length), + getLastSegments(hostname, length + 1)); + } + + size_t PSL::getTLDLength(const std::string& hostname) const + { + // Reversed copy of hostname + std::string tld(hostname.rbegin(), hostname.rend()); + std::transform(tld.begin(), tld.end(), tld.begin(), ::tolower); + + while (tld.size()) + { + auto it = levels.find(tld); + if (it != levels.end()) + { + return it->second; + } + + size_t position = tld.rfind('.'); + if (position == std::string::npos || position == 0) + { + tld.resize(0); + } + else + { + tld.resize(position); + } + } + + return 1; + } + + std::string PSL::getLastSegments(const std::string& hostname, size_t segments) const + { + size_t position = hostname.size(); + size_t remaining = segments; + while (remaining != 0 && position && position != std::string::npos) + { + position = hostname.rfind('.', position - 1); + remaining -= 1; + } + + if (remaining >= 1) + { + return not_found; + } + + // Return the whole string if position == std:string::npos + size_t start = (position == 
std::string::npos) ? 0 : position + 1; + + std::string result(hostname, start); + std::transform(result.begin(), result.end(), result.begin(), ::tolower); + + // Leading .'s indicate that the query had an empty segment + if (result.size() && result[0] == '.') + { + std::stringstream message; + message << "Empty segment in " << result; + throw std::invalid_argument(message.str()); + } + + return result; + } + + size_t PSL::countSegments(const std::string& hostname) const + { + size_t count = 1; + size_t position = hostname.find('.'); + while (position != std::string::npos) + { + count += 1; + position = hostname.find('.', position + 1); + } + return count; + } + + void PSL::add(std::string& rule, int level_adjust, size_t trim) + { + // First unpunycoded + std::string copy(rule.rbegin(), rule.rend() - trim); + size_t length = countSegments(copy) + level_adjust; + levels[copy] = length; + + // And now punycoded + rule = Punycode::encodeHostname(rule); + copy.assign(rule.rbegin(), rule.rend() - trim); + levels[copy] = length; + } + +}; diff --git a/src/psl.h b/src/psl.h new file mode 100644 index 0000000..e1714f0 --- /dev/null +++ b/src/psl.h @@ -0,0 +1,102 @@ +#ifndef PSL_CPP_H +#define PSL_CPP_H + +#include +#include +#include +#include +#include + +namespace Url +{ + + /** + * Find TLDs and PLDs of a hostname according to a PSL. + */ + struct PSL + { + /** + * Indicates the there is no TLD / PLD + */ + static const std::string not_found; + + /** + * Read a PSL from an istream. + */ + PSL(std::istream& stream); + + PSL(): levels() { }; + + PSL(const PSL& other): levels(other.levels) { } + + PSL& operator=(const PSL& other) + { + levels = other.levels; + return *this; + } + + /** + * Read the provided path holding a set of PSL rules. + */ + static PSL fromPath(const std::string& path); + + /** + * Create a PSL object from a string. + */ + static PSL fromString(const std::string& str); + + /** + * Get just the TLD of the hostname. 
+ * + * Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If + * some segments have been appropriately punycoded and others not, it may return + * a wrong answer. If a punycoded host is provided, a punycoded response is + * returned. If an unpunycoded host is provided, an unpunycoded response is + * returned. + */ + std::string getTLD(const std::string& hostname) const; + + /** + * Get just the PLD of the hostname. + * + * Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If + * some segments have been appropriately punycoded and others not, it may return + * a wrong answer. If a punycoded host is provided, a punycoded response is + * returned. If an unpunycoded host is provided, an unpunycoded response is + * returned. + */ + std::string getPLD(const std::string& hostname) const; + + /** + * Get the (TLD, PLD) of the hostname. + * + * Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If + * some segments have been appropriately punycoded and others not, it may return + * a wrong answer. If a punycoded host is provided, a punycoded response is + * returned. If an unpunycoded host is provided, an unpunycoded response is + * returned. + */ + std::pair getBoth(const std::string& hostname) const; + private: + // Mapping of a string rule to its level + std::unordered_map levels; + + // Return the number of segments in a hostname + size_t countSegments(const std::string& hostname) const; + + // Return the number of segments in the TLD of the provided hostname + size_t getTLDLength(const std::string& hostname) const; + + // Return the last `segments` segments of a hostname + std::string getLastSegments(const std::string& hostname, size_t segments) const; + + /** + * Add the provided host with the provided priority, trimming characters off + * the front, and adjusting the level by the provided number. 
+ */ + void add(std::string& host, int level_adjust, size_t trim); + }; + +} + +#endif diff --git a/src/punycode.cpp b/src/punycode.cpp new file mode 100644 index 0000000..eb85d92 --- /dev/null +++ b/src/punycode.cpp @@ -0,0 +1,409 @@ +#include +#include +#include + +#include "punycode.h" +#include "utf8.h" + +namespace Url +{ + + std::string& Punycode::encode(std::string& str) + { + // Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.3 + // + // let n = initial_n + // let delta = 0 + // let bias = initial_bias + punycode_uint n = INITIAL_N; + punycode_uint delta = 0; + punycode_uint bias = INITIAL_BIAS; + std::string output; + + // Accumulate the non-basic codepoints + std::vector codepoints; + for (auto it = str.cbegin(); it != str.cend(); ) + { + Utf8::codepoint_t value = Utf8::readCodepoint(it, str.cend()); + if (value < 0x80) + { + // copy them to the output in order + output.append(1, static_cast(value)); + } + codepoints.push_back(value); + } + + // let h = b = the number of basic code points in the input + size_t h = output.size(); + size_t b = h; + + // copy a delimiter if b > 0 + if (b > 0) + { + output.append(1, '-'); + } + + // while h < length(input) do begin + while (h < codepoints.size()) + { + // let m = the minimum {non-basic} code point >= n in the input + punycode_uint m = MAX_PUNYCODE_UINT; + for (auto it = codepoints.begin(); it != codepoints.end(); ++it) + { + if ((*it >= n) && (*it < m)) + { + m = *it; + } + } + + // let delta = delta + (m - n) * (h + 1), fail on overflow + if ((m - n) > ((MAX_PUNYCODE_UINT - delta) / (h + 1))) + { + throw std::invalid_argument("Overflow delta update."); + } + delta += (m - n) * (h + 1); + + // let n = m + n = m; + + // for each code point c in the input (in order) do begin + for (auto it = codepoints.begin(); it != codepoints.end(); ++it) + { + // if c < n {or c is basic} then increment delta, fail on overflow + if (*it < n) + { + if (delta == MAX_PUNYCODE_UINT) + { + throw 
std::invalid_argument("Overflow delta increment."); + } + ++delta; + } + + // if c == n then begin + if (*it == n) + { + // let q = delta + punycode_uint q = delta; + + // for k = base to infinity in steps of base do begin + for (punycode_uint k = BASE; ; k += BASE) + { + // let t = tmin if k <= bias {+ tmin}, or + // tmax if k >= bias + tmax, or k - bias otherwise + punycode_uint t = k <= bias ? TMIN : + k >= bias + TMAX ? TMAX : k - bias; + + // if q < t then break + if (q < t) + { + break; + } + + // output the code point for digit t + ((q - t) mod (base - t)) + output.append(1, DIGIT_TO_BASIC[t + ((q - t) % (BASE - t))]); + + // let q = (q - t) div (base - t) + q = (q - t) / (BASE - t); + } + + // output the code point for digit q + output.append(1, DIGIT_TO_BASIC[q]); + + // let bias = adapt(delta, h + 1, test h equals b?) + bias = adapt(delta, h + 1, h == b); + + // let delta = 0 + delta = 0; + + // increment h + ++h; + + } + } + + // increment delta and n + ++delta; + ++n; + } + + str.assign(output); + return str; + } + + std::string Punycode::encode(const std::string& str) + { + std::string result(str); + encode(result); + return result; + } + + std::string Punycode::encodeHostname(const std::string& hostname) + { + // Avoid any punycoding at all if none is needed + if (!needsPunycoding(hostname)) + { + return hostname; + } + + std::string encoded; + + size_t start = 0; + size_t end = hostname.find('.'); + while(true) + { + std::string segment = hostname.substr(start, end - start); + if (needsPunycoding(segment)) + { + encoded.append("xn--"); + encoded.append(Punycode::encode(segment)); + } + else + { + encoded.append(segment); + } + + if (end == std::string::npos) + { + break; + } + else + { + encoded.append(1, '.'); + start = end + 1; + end = hostname.find('.', start); + } + } + + return encoded; + } + + std::string& Punycode::decode(std::string& str) + { + // Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.2 + // + // let n = 
initial_n + // let i = 0 + // let bias = initial_bias + // let output = an empty string indexed from 0 + punycode_uint n = INITIAL_N; + punycode_uint i = 0; + punycode_uint bias = INITIAL_BIAS; + std::vector codepoints; + + size_t index = str.rfind('-'); + if (index == std::string::npos) + { + index = 0; + } + + // consume all code points before the last delimiter (if there is one) + // and copy them to output, fail on any non-basic code point + for (auto it = str.begin(); it != (str.begin() + index); ++it) + { + if (static_cast(*it) > 127U) + { + throw std::invalid_argument("Argument has non-basic code points."); + } + codepoints.push_back(*it); + } + + // if more than zero code points were consumed then consume one more + // (which will be the last delimiter) + if (index > 0) + { + index += 1; + } + + // while the input is not exhausted do begin + for (auto it = (str.begin() + index); it != str.end(); ++it) + { + // let oldi = i + // let w = 1 + punycode_uint oldi = i; + punycode_uint w = 1; + + // for k = base to infinity in steps of base do begin + for (punycode_uint k = BASE; ; k += BASE, ++it) + { + // consume a code point, or fail if there was none to consume + if (it == str.end()) + { + throw std::invalid_argument("Premature termination"); + } + + // let digit = the code point's digit-value, fail if it has none + int lookup = BASIC_TO_DIGIT[static_cast(*it)]; + if (lookup == -1) + { + throw std::invalid_argument("Invalid base 36 character."); + } + unsigned char digit = static_cast(lookup); + + // let i = i + digit * w, fail on overflow + if (digit > ((MAX_PUNYCODE_UINT - i) / w)) + { + throw std::invalid_argument("Overflow on i."); + } + i += digit * w; + + // let t = tmin if k <= bias {+ tmin}, or + // tmax if k >= bias + tmax, or k - bias otherwise + punycode_uint t = k <= bias ? TMIN : + k >= bias + TMAX ? 
TMAX : k - bias; + + // if digit < t then break + if (digit < t) + { + break; + } + + // let w = w * (base - t), fail on overflow + if (w > (MAX_PUNYCODE_UINT / (BASE - t))) + { + // I believe this line is unreachable without first overflowing i. + // Since 'i' is updated above as i += digit * w, and w is updated as + // w = w * (BASE - t), we should like to keep (BASE - t) > digit to + // give 'w' a chance to overflow first. To keep t minimized, we must + // have 'bias' maximized. `bias` is driven by the 'adapt' function + // below. + // + // The value returned by 'adapt' increases with the input delta, and + // decreases with the input size. The delta is a function of the input + // size as well, on the order of (delta_n * input size), and + // legitimate delta_n values are limited to 0x10FFFF (the maximum + // unicode codepoint). Even setting that aside, the maximum value that + // adapt() can return is adapt(2 ** 32 - 1, 1, false) = 204. + // + // Using this bias, we could use the input (HERE) to get iterations: + // + // digit = b = 1, i = 2, k = 36, t = 1, w = 35 + // digit = b = 1, i = 37, k = 72, t = 1, w = 1225 + // digit = b = 1, i = 1262, k = 108, t = 1, w = 42875 + // digit = b = 1, i = 44137, k = 144, t = 1, w = 1500625 + // digit = b = 1, i = 1544762, k = 180, t = 1, w = 52521875 + // + // At this point, t now becomes TMAX (26) because k exceeds the bias + // (since the maximum bias is 204). As such, the minimum continuation + // value is 26: + // + // digit = 0 = 26, i = 1367113512, k = 216, t = 26, w = 525218750 + // + // However, the next iteration now overflows i before we can get to + // the w update. + throw std::invalid_argument("Overflow on w."); // LCOV_EXCL_LINE + } + w *= (BASE - t); + } + + // let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?) 
+ bias = adapt(i - oldi, codepoints.size() + 1, oldi == 0); + + // let n = n + i div (length(output) + 1), fail on overflow + if ((i / (codepoints.size() + 1)) > (MAX_PUNYCODE_UINT - n)) + { + throw std::invalid_argument("Overflow on n."); + } + n += i / (codepoints.size() + 1); + + // let i = i mod (length(output) + 1) + i %= (codepoints.size() + 1); + + // insert n into output at position i + codepoints.insert(codepoints.begin() + i, n); + + // increment i + ++i; + } + + std::string output; + for (auto it = codepoints.begin(); it != codepoints.end(); ++it) + { + Utf8::writeCodepoint(output, *it); + } + str.assign(output); + + return str; + } + + std::string Punycode::decode(const std::string& str) + { + std::string result(str); + decode(result); + return result; + } + + std::string Punycode::decodeHostname(const std::string& hostname) + { + std::string unencoded; + + size_t start = 0; + size_t end = hostname.find('.'); + while(true) + { + std::string segment = hostname.substr(start, end - start); + if (segment.substr(0, 4).compare("xn--") == 0) + { + segment = segment.substr(4); + unencoded.append(Punycode::decode(segment)); + } + else + { + unencoded.append(segment); + } + + if (end == std::string::npos) + { + break; + } + else + { + unencoded.append(1, '.'); + start = end + 1; + end = hostname.find('.', start); + } + } + + return unencoded; + } + + bool Punycode::needsPunycoding(const std::string& str) + { + return std::any_of( + str.begin(), + str.end(), + [](char i){ return static_cast(i) & 0x80; }); + } + + Punycode::punycode_uint Punycode::adapt( + punycode_uint delta, punycode_uint numpoints, bool firsttime) + { + // Pseudocode from https://tools.ietf.org/html/rfc3492#section-6.1 + // + // It does not matter whether the modifications to delta and k inside + // adapt() affect variables of the same name inside the + // encoding/decoding procedures, because after calling adapt() the + // caller does not read those variables before overwriting them. 
+ // + // if firsttime then let delta = delta div damp + // else let delta = delta div 2 + delta = firsttime ? delta / DAMP : delta >> 1; + + // let delta = delta + (delta div numpoints) + delta += (delta / numpoints); + + // let k = 0 + punycode_uint k = 0; + + // while delta > ((base - tmin) * tmax) div 2 do begin + for (; delta > ((BASE - TMIN) * TMAX) / 2; k += BASE) + { + // let delta = delta div (base - tmin) + // let k = k + base + delta /= (BASE - TMIN); + } + + // return k + (((base - tmin + 1) * delta) div (delta + skew)) + return k + (((BASE - TMIN + 1) * delta) / (delta + SKEW)); + } + +}; diff --git a/src/punycode.h b/src/punycode.h new file mode 100644 index 0000000..25fce96 --- /dev/null +++ b/src/punycode.h @@ -0,0 +1,105 @@ +#ifndef PUNYCODE_CPP_H +#define PUNYCODE_CPP_H + +#include +#include +#include +#include +#include + +#include "utf8.h" + +namespace Url +{ + + namespace Punycode + { + typedef Utf8::codepoint_t punycode_uint; + + const unsigned int BASE = 36; + const unsigned int TMIN = 1; + const unsigned int TMAX = 26; + const unsigned int SKEW = 38; + const unsigned int DAMP = 700; + const unsigned int INITIAL_BIAS = 72; + const unsigned int INITIAL_N = 128; + + // Codepoints to their base-36 value + const std::vector BASIC_TO_DIGIT = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1, + + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, + + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + const std::string DIGIT_TO_BASIC = "abcdefghijklmnopqrstuvwxyz0123456789"; + + // The highest codepoint in unicode + const punycode_uint MAX_PUNYCODE_UINT = std::numeric_limits::max(); + //Utf8::MAX_CODEPOINT; + //std::numeric_limits::max(); + + /** + * Replace utf-8-encoded str into punycode. + */ + std::string& encode(std::string& str); + + /** + * Create a new punycoded string from utf-8-encoded input. + */ + std::string encode(const std::string& str); + + /** + * Encode a hostname. + */ + std::string encodeHostname(const std::string& hostname); + + /** + * Replace punycoded str into utf-8-encoded. + */ + std::string& decode(std::string& str); + + /** + * Create a new utf-8-encoded string from punycoded input. + */ + std::string decode(const std::string& str); + + /** + * Decode a hostname. + */ + std::string decodeHostname(const std::string& hostname); + + /** + * Determine if a string needs punycoding. + */ + bool needsPunycoding(const std::string& str); + + /** + * Internal function for calculating bias. 
+ */ + punycode_uint adapt( + punycode_uint delta, punycode_uint numpoints, bool firsttime); + + }; + +} + +#endif diff --git a/src/repmain.cpp b/src/repmain.cpp new file mode 100644 index 0000000..d8b70d2 --- /dev/null +++ b/src/repmain.cpp @@ -0,0 +1,26 @@ +#include +using namespace Rcpp; + +#include "url.h" +#include "robots.h" + +//' Parse robots.txt +//' +//' @noRd +//' +// [[Rcpp::export]] +SEXP rep_parse(std::string content) { + Rcpp::XPtr ptr(new Rep::Robots(content)); + return(ptr); +} + + +//' Path allowed +//' +//' @noRd +//' +// [[Rcpp::export]] +bool rep_path_allowed(SEXP xp, std::string path, std::string agent = "*") { + Rcpp::XPtr ptr(xp); + return(ptr->allowed(path, agent)); +} diff --git a/src/robots.cpp b/src/robots.cpp new file mode 100644 index 0000000..fb54d6e --- /dev/null +++ b/src/robots.cpp @@ -0,0 +1,188 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "url.h" + +#include "robots.h" +#include + +namespace Rep +{ + + void Robots::strip(std::string& string) + { + string.erase(string.begin(), std::find_if(string.begin(), string.end(), + std::not1(std::ptr_fun(std::isspace)))); + string.erase(std::find_if(string.rbegin(), string.rend(), + std::not1(std::ptr_fun(std::isspace))).base(), string.end()); + } + + bool Robots::getpair(std::istringstream& stream, std::string& key, std::string& value) + { + while (getline(stream, key)) + { + size_t index = key.find('#'); + if (index != std::string::npos) + { + key.resize(index); + } + + // Find the colon and divide it into key and value, skipping malformed lines + index = key.find(':'); + if (index == std::string::npos) + { + continue; + } + + value.assign(key.begin() + index + 1, key.end()); + key.resize(index); + + // Strip whitespace off of each + strip(key); + strip(value); + + // Lowercase the key + std::transform(key.begin(), key.end(), key.begin(), ::tolower); + + return true; + } + return false; + } + + Robots::Robots(const std::string& content): agents_(), 
sitemaps_(), default_(agents_["*"]) + { + std::string agent_name("*"); + std::istringstream input(content); + if (content.compare(0, 3, "\xEF\xBB\xBF") == 0) + { + input.ignore(3); + } + std::string key, value; + std::vector group; + bool last_agent = false; + agent_map_t::iterator current = agents_.find("*"); + while (Robots::getpair(input, key, value)) + { + if (key.compare("user-agent") == 0) + { + // Store the user agent string as lowercased + std::transform(value.begin(), value.end(), value.begin(), ::tolower); + + if (last_agent) + { + group.push_back(value); + } + else + { + if (!agent_name.empty()) + { + for (auto other : group) + { + agents_[other] = current->second; + } + group.clear(); + } + agent_name = value; + current = agents_.emplace(agent_name, Agent()).first; + } + last_agent = true; + continue; + } + else + { + last_agent = false; + } + + if (key.compare("sitemap") == 0) + { + sitemaps_.push_back(value); + } + else if (key.compare("disallow") == 0) + { + current->second.disallow(value); + } + else if (key.compare("allow") == 0) + { + current->second.allow(value); + } + else if (key.compare("crawl-delay") == 0) + { + try + { + current->second.delay(std::stof(value)); + } + catch (const std::exception&) + { + Rcpp::Rcout << "Could not parse " << value << " as float." 
<< std::endl; + } + } + } + + if (!agent_name.empty()) + { + for (auto other : group) + { + agents_[other] = current->second; + } + } + } + + const Agent& Robots::agent(const std::string& name) const + { + // Lowercase the agent + std::string lowered(name); + std::transform(lowered.begin(), lowered.end(), lowered.begin(), ::tolower); + + auto it = agents_.find(lowered); + if (it == agents_.end()) + { + return default_; + } + else + { + return it->second; + } + } + + bool Robots::allowed(const std::string& path, const std::string& name) const + { + return agent(name).allowed(path); + } + + std::string Robots::str() const + { + std::stringstream out; + // TODO: include sitepath info + out << '{'; + auto begin = agents_.begin(); + auto end = agents_.end(); + if (begin != end) + { + out << '"' << begin->first << '"' << ": " << begin->second.str(); + ++begin; + } + for (; begin != end; ++begin) + { + out << ", \"" << begin->first << '"' << ": " << begin->second.str(); + } + out << '}'; + return out.str(); + } + + std::string Robots::robotsUrl(const std::string& url) + { + return Url::Url(url) + .setUserinfo("") + .setPath("robots.txt") + .setParams("") + .setQuery("") + .setFragment("") + .remove_default_port() + .str(); + } +} diff --git a/src/robots.h b/src/robots.h new file mode 100644 index 0000000..56a82c4 --- /dev/null +++ b/src/robots.h @@ -0,0 +1,69 @@ +#ifndef ROBOTS_CPP_H +#define ROBOTS_CPP_H + +#include +#include +#include + +#include "agent.h" + +namespace Rep +{ + + class Robots + { + public: + typedef std::unordered_map agent_map_t; + typedef std::vector sitemaps_t; + + /** + * Create a robots.txt from a utf-8-encoded string. + */ + Robots(const std::string& content); + + /** + * Instantiate a Robots object. 
+ */ + Robots( + const agent_map_t& agents, + const sitemaps_t& sitemaps) + : agents_(agents) + , sitemaps_(sitemaps) + , default_(agents_["*"]) {} + + /** + * Get the sitemaps in this robots.txt + */ + const sitemaps_t& sitemaps() const { return sitemaps_; } + + /** + * Get the agent with the corresponding name. + */ + const Agent& agent(const std::string& name) const; + + /** + * Return true if agent is allowed to fetch the URL (either a + * full URL or a path). + */ + bool allowed(const std::string& path, const std::string& name) const; + + std::string str() const; + + /** + * Return the robots.txt URL corresponding to the provided URL. + */ + static std::string robotsUrl(const std::string& url); + + private: + static void strip(std::string& string); + + static bool getpair( + std::istringstream& stream, std::string& key, std::string& value); + + agent_map_t agents_; + sitemaps_t sitemaps_; + Agent& default_; + }; +} + +#endif diff --git a/src/url.cpp b/src/url.cpp new file mode 100644 index 0000000..900a65e --- /dev/null +++ b/src/url.cpp @@ -0,0 +1,962 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "url.h" +#include "punycode.h" + +namespace Url +{ + + /* Character classes */ + const CharacterClass Url::GEN_DELIMS(":/?#[]@"); + const CharacterClass Url::SUB_DELIMS("!$&'()*+,;="); + const CharacterClass Url::DIGIT("0123456789"); + const CharacterClass Url::ALPHA( + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); + const CharacterClass Url::UNRESERVED( + Url::ALPHA.chars() + Url::DIGIT.chars() + "-._~"); + const CharacterClass Url::RESERVED( + Url::GEN_DELIMS.chars() + Url::SUB_DELIMS.chars()); + const CharacterClass Url::PCHAR( + Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":@"); + const CharacterClass Url::PATH( + Url::PCHAR.chars() + "/"); + const CharacterClass Url::QUERY( + Url::PCHAR.chars() + "/?"); + const CharacterClass Url::FRAGMENT( + Url::PCHAR.chars() + "/?"); + const CharacterClass 
Url::USERINFO( + Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":"); + const CharacterClass Url::HEX("0123456789ABCDEF"); + const CharacterClass Url::SCHEME( + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+-."); + const std::vector Url::HEX_TO_DEC = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, + + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + const std::unordered_map Url::PORTS = { + {"http", 80}, + {"https", 443} + }; + const std::unordered_set Url::USES_RELATIVE = { + "", + "file", + "ftp", + "gopher", + "http", + "https", + "imap", + "mms", + "nntp", + "prospero", + "rtsp", + "rtspu", + "sftp", + "shttp", + "svn", + "svn+ssh", + "wais" + }; + const std::unordered_set Url::USES_NETLOC = { + "", + "file", + "ftp", + "git", + "git+ssh", + "gopher", + "http", + "https", + "imap", + "mms", + "nfs", + "nntp", + "prospero", + "rsync", + "rtsp", + "rtspu", + "sftp", + "shttp", + "snews", + "svn", + "svn+ssh", + "telnet", + "wais" + }; + const std::unordered_set Url::USES_PARAMS = { + "", + 
"ftp", + "hdl", + "http", + "https", + "imap", + "mms", + "prospero", + "rtsp", + "rtspu", + "sftp", + "shttp", + "sip", + "sips", + "tel" + }; + const std::unordered_set Url::KNOWN_PROTOCOLS = { + "", + "file", + "ftp", + "git", + "git+ssh", + "gopher", + "hdl", + "http", + "https", + "imap", + "mms", + "nfs", + "nntp", + "prospero", + "rsync", + "rtsp", + "rtspu", + "sftp", + "shttp", + "sip", + "sips", + "sms", + "snews", + "svn", + "svn+ssh", + "tel", + "telnet", + "wais" + }; + + Url::Url(const std::string& url): port_(0), has_params_(false), has_query_(false) + { + size_t position = 0; + size_t index = url.find(':'); + if (index != std::string::npos) + { + // All the characters in our would-be scheme must be in SCHEME + if (std::all_of( + url.begin(), + url.begin() + index, + [](char c) { return SCHEME(c); } )) + { + // If there is nothing after the : or there are any non-digits, this is + // the scheme + if ((index + 1) >= url.length() + || std::any_of( + url.begin() + index + 1, + url.end(), + [](char c) { return !DIGIT(c); })) + { + scheme_.assign(url, 0, index); + std::transform( + scheme_.begin(), scheme_.end(), scheme_.begin(), ::tolower); + position = index + 1; + } + else + { + scheme_.assign(url, 0, index); + std::transform( + scheme_.begin(), scheme_.end(), scheme_.begin(), ::tolower); + if (KNOWN_PROTOCOLS.find(scheme_) != KNOWN_PROTOCOLS.end()) + { + position = index + 1; + } + else + { + scheme_.clear(); + } + } + } + } + + // Search for the netloc + if ((url.length() - position) >= 1 + && url[position] == '/' + && url[position + 1] == '/') + { + // Skip the '//' + position += 2; + index = url.find_first_of("/?#", position); + host_.assign(url, position, index - position); + position = index; + + // Extract any userinfo if there is any + index = host_.find('@'); + if (index != std::string::npos) + { + userinfo_.assign(host_, 0, index); + host_.assign(host_, index + 1, std::string::npos); + } + + // Lowercase the hostname + 
std::transform(host_.begin(), host_.end(), host_.begin(), ::tolower); + + // Try to find a port + index = host_.find(':'); + if (index != std::string::npos) + { + std::string portText(host_, index + 1, std::string::npos); + host_.resize(index); + + if (portText.empty()) + { + port_ = 0; + } + else + { + try + { + port_ = std::stoi(portText, &index); + + if (index != portText.length()) + { + // Malformed port + throw UrlParseException("Port not a number: " + portText); + } + + if (port_ > 65535) + { + throw UrlParseException("Port too high: " + portText); + } + else if (port_ < 0) + { + throw UrlParseException("Port negative: " + portText); + } + } + catch (const std::invalid_argument&) + { + // Malformed port + throw UrlParseException("Port not a number: " + portText); + } + catch (const std::out_of_range&) + { + throw UrlParseException("Port out of integer range: " + portText); + } + } + } + } + + if (position != std::string::npos) + { + path_.assign(url, position, std::string::npos); + + index = path_.find('#'); + if (index != std::string::npos) + { + fragment_.assign(path_, index + 1, std::string::npos); + path_.resize(index); + } + + index = path_.find('?'); + if (index != std::string::npos) + { + query_.assign(path_, index + 1, std::string::npos); + has_query_ = true; + path_.resize(index); + } + + if (USES_PARAMS.find(scheme_) != USES_PARAMS.end()) + { + index = path_.find(';'); + if (index != std::string::npos) + { + params_.assign(path_, index + 1, std::string::npos); + has_params_ = true; + path_.resize(index); + } + } + } + } + + Url& Url::assign(const Url& other) + { + return (*this) = other; + } + + bool Url::operator==(const Url& other) const + { + return ( + (scheme_ == other.scheme_ ) && + (userinfo_ == other.userinfo_ ) && + (host_ == other.host_ ) && + (port_ == other.port_ ) && + (path_ == other.path_ ) && + (params_ == other.params_ ) && + (query_ == other.query_ ) && + (fragment_ == other.fragment_ ) && + (has_params_ == other.has_params_) && + 
(has_query_ == other.has_query_ ) + ); + } + + bool Url::operator!=(const Url& other) const + { + return !operator==(other); + } + + bool Url::equiv(const Url& other) + { + Url self_(*this); + Url other_(other); + + self_.strip() + .sort_query() + .defrag() + .deuserinfo() + .abspath() + .escape() + .punycode() + .remove_default_port(); + other_.strip() + .sort_query() + .defrag() + .deuserinfo() + .abspath() + .escape() + .punycode() + .remove_default_port(); + return self_ == other_; + } + + std::string& Url::remove_repeats(std::string& str, const char chr) + { + size_t dest = 0; + // By initializing this to true, it also strips off leading instances of chr + bool seen = true; + for (size_t src = 0; src < str.length(); ++src) + { + if (!seen || (str[src] != chr)) + { + str[dest++] = str[src]; + } + seen = str[src] == chr; + } + // Remove the last character if it happens to be chr + size_t length = ((dest > 0) && (str[dest - 1] == chr)) ? dest - 1 : dest; + str.resize(length); + return str; + } + + std::string Url::fullpath() const + { + std::string result; + if (path_.empty() || path_[0] != '/') + { + result.append(1, '/'); + } + result.append(path_); + + if (has_params_) + { + result.append(";"); + result.append(params_); + } + + if (has_query_) + { + result.append("?"); + result.append(query_); + } + + if (!fragment_.empty()) + { + result.append("#"); + result.append(fragment_); + } + return result; + } + + std::string Url::str() const + { + std::string result; + + if (!scheme_.empty()) + { + result.append(scheme_); + if (USES_NETLOC.find(scheme_) == USES_NETLOC.end()) + { + result.append(":"); + } + else + { + result.append("://"); + } + } + else if (!host_.empty()) + { + result.append("//"); + } + + if (!userinfo_.empty()) + { + result.append(userinfo_); + result.append("@"); + } + + if (!host_.empty()) + { + result.append(host_); + } + + if (port_) + { + result.append(":"); + result.append(std::to_string(port_)); + } + + if (path_.empty()) + { + if 
(!result.empty()) + { + result.append("/"); + } + } + else + { + if (!host_.empty() && path_[0] != '/') + { + result.append(1, '/'); + } + result.append(path_); + } + + if (has_params_) + { + result.append(";"); + result.append(params_); + } + + if (has_query_) + { + result.append("?"); + result.append(query_); + } + + if (!fragment_.empty()) + { + result.append("#"); + result.append(fragment_); + } + + return result; + } + + Url& Url::strip() + { + size_t start = query_.find_first_not_of('?'); + if (start != std::string::npos) + { + query_.assign(query_, start, std::string::npos); + } + else + { + query_.assign(""); + } + setQuery(remove_repeats(query_, '&')); + setParams(remove_repeats(params_, ';')); + return *this; + } + + Url& Url::abspath() + { + std::string copy; + std::vector segment_starts; + + if (path_.size() >= 1 && path_[0] == '/') + { + copy.append(1, '/'); + segment_starts.push_back(0); + } + + bool directory = false; + size_t previous = 0; + size_t index = 0; + for (index = path_.find('/') + ; index != std::string::npos + ; previous = index + 1, index = path_.find('/', index + 1)) + { + // Skip empty segments + if (index - previous == 0) + { + continue; + } + + if ((index - previous == 2) + && path_[previous] == '.' + && path_[previous + 1] == '.') + { + if (!segment_starts.empty()) + { + copy.resize(segment_starts.back()); + segment_starts.pop_back(); + } + directory = true; + } + else if ((index - previous == 1) && path_[previous] == '.') + { + directory = true; + } + else + { + segment_starts.push_back(copy.length()); + copy.append(path_, previous, index - previous); + copy.append(1, '/'); + directory = false; + } + } + + // Handle the last segment + index = path_.length(); + if (previous == path_.length()) + { + directory = true; + } + else if ((index - previous == 1) && path_[previous] == '.') + { + directory = true; + } + else if ((index - previous == 2) + && path_[previous] == '.' 
+ && path_[previous + 1] == '.') + { + if (!segment_starts.empty()) + { + copy.resize(segment_starts.back()); + } + directory = true; + } + else + { + copy.append(path_, previous, index - previous); + copy.append(1, '/'); + directory = false; + } + + if (!directory && copy.size() >= 1) + { + copy.resize(copy.size() - 1); + } + else if (directory && copy.empty()) + { + copy.append(1, '/'); + } + path_.assign(copy); + + return *this; + } + + Url& Url::relative_to(const Url& other) + { + // If this scheme does not use relative, return it unchanged + if (USES_RELATIVE.find(scheme_) == USES_RELATIVE.end()) + { + return *this; + } + + // Support scheme-relative URLs + if (scheme_.empty()) + { + scheme_ = other.scheme_; + } + + // If this is an absolute URL (or scheme-relative), return early + if (!host_.empty()) { + return *this; + } + + // If it's not an absolute URL, we need to copy the other host and port + host_ = other.host_; + port_ = other.port_; + userinfo_ = other.userinfo_; + + // If the path portion is absolute, then bail out early. + if (!path_.empty() && path_.front() == '/') + { + return *this; + } + + // Otherwise, this is a path that needs to be evaluated relative to the other. If + // there is no '/', then we just keep our current path if it's not empty. 
+ if (path_.empty()) + { + if (params_.empty()) + { + path_ = other.path_; + params_ = other.params_; + has_params_ = other.has_params_; + if (query_.empty()) + { + query_ = other.query_; + has_query_ = other.has_query_; + } + } + else + { + path_.assign(other.path_, 0, other.path_.rfind('/') + 1); + } + + if (fragment_.empty()) + { + fragment_ = other.fragment_; + } + } + else + { + size_t index = other.path_.rfind('/'); + if (index != std::string::npos) + { + path_ = other.path_.substr(0, index + 1) + path_; + } + else if (!host_.empty()) + { + path_ = "/" + path_; + } + } + + return *this; + } + + Url& Url::escape(bool strict) + { + escape(path_, PATH, strict); + escape(query_, QUERY, strict); + escape(params_, QUERY, strict); + escape(userinfo_, USERINFO, strict); + return *this; + } + + std::string& Url::escape(std::string& str, const CharacterClass& safe, bool strict) + { + std::string copy(str); + size_t dest = 0; + // Allocate space pessimistically -- if every entity is expanded, it will take 3x + // the space. + str.resize(str.length() * 3); + for (size_t src = 0; src < copy.length(); ++src) + { + if (copy[src] == '%' && (copy.length() - src) >= 2) + { + // Read ahead to see if there's a valid escape sequence. If not, treat + // this like a normal character. + if (HEX_TO_DEC[copy[src+1]] != -1 && HEX_TO_DEC[copy[src+2]] != -1) + { + int value = ( + HEX_TO_DEC[copy[src+1]] * 16 + HEX_TO_DEC[copy[src+2]]); + + // In strict mode, we can only unescape parameters if they are both + // safe and not reserved + if (!strict || (strict && safe(value) && !RESERVED(value))) + { + // Replace src + 2 with that byte, advance src to consume it and + // continue. 
+ src += 2; + copy[src] = value; + } + else + { + str[dest++] = copy[src++]; + str[dest++] = ::toupper(copy[src++]); + str[dest++] = ::toupper(copy[src]); + continue; + } + } + } + + if (!safe(copy[src])) + { + // Not safe -- replace with %XX + str[dest++] = '%'; + str[dest++] = HEX.chars()[(copy[src] >> 4) & 0xF]; + str[dest++] = HEX.chars()[copy[src] & 0xF]; + } + else + { + str[dest++] = copy[src]; + } + } + str.resize(dest); + return str; + } + + Url& Url::unescape() + { + unescape(path_); + unescape(query_); + unescape(params_); + unescape(userinfo_); + return *this; + } + + std::string& Url::unescape(std::string& str) + { + std::string copy(str); + size_t dest = 0; + for (size_t src = 0; src < copy.length(); ++src, ++dest) + { + if (copy[src] == '%' && (copy.length() - src) >= 2) + { + // Read ahead to see if there's a valid escape sequence. If not, treat + // this like a normal character. + if (HEX_TO_DEC[copy[src+1]] != -1 && HEX_TO_DEC[copy[src+2]] != -1) + { + int value = ( + HEX_TO_DEC[copy[src+1]] * 16 + HEX_TO_DEC[copy[src+2]]); + + // Replace src + 2 with that byte, advance src to consume it and + // continue. + src += 2; + str[dest] = value; + continue; + } + } + + // Either not a % or an incomplete entity + str[dest] = copy[src]; + } + str.resize(dest); + return str; + } + + Url& Url::deparam(const std::unordered_set& blacklist) + { + // Predicate is if it's present in the blacklist. 
+ auto predicate = [blacklist](std::string& name, const std::string& value) + { + std::transform(name.begin(), name.end(), name.begin(), ::tolower); + return blacklist.find(name) != blacklist.end(); + }; + + setQuery(remove_params(query_, predicate, '&')); + setParams(remove_params(params_, predicate, ';')); + return *this; + } + + Url& Url::deparam(const deparam_predicate& predicate) + { + setQuery(remove_params(query_, predicate, '&')); + setParams(remove_params(params_, predicate, ';')); + return *this; + } + + std::string& Url::remove_params(std::string& str, + const deparam_predicate& predicate, + char sep) + { + std::string copy; + std::string piece; + std::string name; + std::string value; + size_t previous = 0; + for (size_t index = str.find(sep) + ; index != std::string::npos + ; previous = index + 1, index = str.find(sep, previous)) + { + piece.assign(str, previous, index - previous); + size_t position = piece.find('='); + name.assign(piece, 0, position); + value.clear(); + if (position != std::string::npos) + { + value.assign(piece, position + 1, std::string::npos); + } + + if (!predicate(name, value)) + { + copy.append(copy.empty() ? 0 : 1, sep); + copy.append(piece); + } + } + + if (previous < str.length()) + { + piece.assign(str, previous, std::string::npos); + size_t position = piece.find('='); + name.assign(piece, 0, position); + value.clear(); + if (position != std::string::npos) + { + value.assign(piece, position + 1, std::string::npos); + } + + if (!predicate(name, value)) + { + copy.append(copy.empty() ? 
0 : 1, sep); + copy.append(piece); + } + } + + str.assign(copy); + return str; + } + + Url& Url::sort_query() + { + split_sort_join(query_, '&'); + split_sort_join(params_, ';'); + return *this; + } + + std::string& Url::split_sort_join(std::string& str, const char glue) + { + // Return early if empty + if (str.empty()) + { + return str; + } + + // Split + std::vector pieces; + std::stringstream stream(str); + std::string item; + while (getline(stream, item, glue)) + { + pieces.push_back(item); + } + + // Return early if it's just a single element + if (pieces.size() == 1) + { + return str; + } + + // Sort + std::sort(pieces.begin(), pieces.end()); + + // Join (at this point we know that there's at least one element) + std::stringstream output; + for (auto it = pieces.begin(); it != (pieces.end() - 1); ++it) + { + output << *it << glue; + } + output << pieces.back(); + str.assign(output.str()); + return str; + } + + Url& Url::remove_default_port() + { + if (port_ && !scheme_.empty()) + { + auto it = PORTS.find(scheme_); + if (it != PORTS.end() && port_ == it->second) + { + port_ = 0; + } + } + return *this; + } + + Url& Url::deuserinfo() + { + userinfo_.clear(); + return *this; + } + + Url& Url::defrag() + { + fragment_.clear(); + return *this; + } + + Url& Url::punycode() + { + check_hostname(host_); + std::string encoded(Punycode::encodeHostname(host_)); + check_hostname(encoded); + host_ = encoded; + return *this; + } + + Url& Url::unpunycode() + { + host_ = Punycode::decodeHostname(host_); + return *this; + } + + Url& Url::host_reversed() + { + std::reverse(host_.begin(), host_.end()); + for (size_t index = 0, position = 0; index < host_.size(); index = position + 1) + { + position = host_.find('.', index); + if (position == std::string::npos) + { + std::reverse(host_.begin() + index, host_.end()); + break; + } + else + { + std::reverse(host_.begin() + index, host_.begin() + position); + } + } + return *this; + } + + void Url::check_hostname(std::string& host) 
+ { + // Skip empty hostnames -- they are valid + if (host.empty()) + { + return; + } + + size_t start = 0; + size_t end = host.find('.'); + while (end != std::string::npos) + { + if ((end - start) > 63) + { + throw std::invalid_argument("Label too long."); + } + else if (end == start) + { + throw std::invalid_argument("Empty label."); + } + + start = end + 1; + end = host.find('.', start); + } + + // For the final segment + if ((host.size() - start) > 63) + { + throw std::invalid_argument("Label too long."); + } + else if (host.size() == start && start > 1) + { + // Remove a trailing empty segment + host.resize(start - 1); + } + } + +}; diff --git a/src/url.h b/src/url.h new file mode 100644 index 0000000..6245124 --- /dev/null +++ b/src/url.h @@ -0,0 +1,323 @@ +#ifndef URL_CPP_H +#define URL_CPP_H + +#include +#include +#include +#include +#include +#include + +namespace Url +{ + + struct UrlParseException : public std::logic_error + { + UrlParseException(const std::string& message) : std::logic_error(message) {} + }; + + struct CharacterClass + { + CharacterClass(const std::string& chars) : chars_(chars), map_(256, false) + { + for (auto it = chars_.begin(); it != chars_.end(); ++it) + { + map_[static_cast(*it)] = true; + } + } + + bool operator()(char c) const + { + return map_[static_cast(c)]; + } + + const std::string& chars() const + { + return chars_; + } + + private: + // Private, unimplemented to prevent use + CharacterClass(); + CharacterClass(const CharacterClass& other); + + std::string chars_; + std::vector map_; + }; + + struct Url + { + /* Character classes */ + const static CharacterClass GEN_DELIMS; + const static CharacterClass SUB_DELIMS; + const static CharacterClass ALPHA; + const static CharacterClass DIGIT; + const static CharacterClass UNRESERVED; + const static CharacterClass RESERVED; + const static CharacterClass PCHAR; + const static CharacterClass PATH; + const static CharacterClass QUERY; + const static CharacterClass FRAGMENT; + 
const static CharacterClass USERINFO; + const static CharacterClass HEX; + const static CharacterClass SCHEME; + const static std::vector HEX_TO_DEC; + const static std::unordered_map PORTS; + const static std::unordered_set USES_RELATIVE; + const static std::unordered_set USES_NETLOC; + const static std::unordered_set USES_PARAMS; + const static std::unordered_set KNOWN_PROTOCOLS; + + // The type of the predicate used for removing parameters + typedef std::function deparam_predicate; + + explicit Url(const std::string& url); + + Url(const Url& other) + : scheme_(other.scheme_) + , host_(other.host_) + , port_(other.port_) + , path_(other.path_) + , params_(other.params_) + , query_(other.query_) + , fragment_(other.fragment_) + , userinfo_(other.userinfo_) + , has_params_(other.has_params_) + , has_query_(other.has_query_) { } + + /** + * Take on the value of the other URL. + */ + Url& assign(const Url& other); + + /** + * To be considered equal, all fields must be equal. + */ + bool operator==(const Url& other) const; + bool operator!=(const Url& other) const; + + /** + * Two URLs are considered equivalent if they have the same meaning. + */ + bool equiv(const Url& other); + + /************************************** + * Component-wise access and setting. 
* + **************************************/ + const std::string& scheme() const { return scheme_; } + Url& setScheme(const std::string& s) + { + scheme_ = s; + return *this; + } + + const std::string& host() const { return host_; } + Url& setHost(const std::string& s) + { + host_ = s; + return *this; + } + + const int port() const { return port_; } + Url& setPort(int i) + { + port_ = i; + return *this; + } + + const std::string& path() const { return path_; } + Url& setPath(const std::string& s) + { + path_ = s; + return *this; + } + + const std::string& params() const { return params_; } + Url& setParams(const std::string& s) + { + params_ = s; + has_params_ = !s.empty(); + return *this; + } + + const std::string& query() const { return query_; } + Url& setQuery(const std::string& s) + { + query_ = s; + has_query_ = !s.empty(); + return *this; + } + + const std::string& fragment() const { return fragment_; } + Url& setFragment(const std::string& s) + { + fragment_ = s; + return *this; + } + + const std::string& userinfo() const { return userinfo_; } + Url& setUserinfo(const std::string& s) + { + userinfo_ = s; + return *this; + } + + /** + * Get a representation of all components of the path, params, query, fragment. + * + * Always includes a leading /. + */ + std::string fullpath() const; + + /** + * Get a new string representation of the URL. + **/ + std::string str() const; + + /********************* + * Chainable methods * + *********************/ + + /** + * Strip semantically meaningless excess '?', '&', and ';' characters from query + * and params. + */ + Url& strip(); + + /** + * Make the path absolute. + * + * Evaluate '.', '..', and excessive slashes. + */ + Url& abspath(); + + /** + * Evaluate this URL relative fo `other`, placing the result in this object. + */ + Url& relative_to(const std::string& other) + { + return relative_to(Url(other)); + } + + /** + * Evaluate this URL relative fo `other`, placing the result in this object. 
+ */ + Url& relative_to(const Url& other); + + /** + * Ensure that the path, params, query, and userinfo are properly escaped. + * + * In 'strict' mode, only entities that are both safe and not reserved characters + * are unescaped. In non-strict mode, entities that are safe are unescaped. + */ + Url& escape(bool strict=false); + + /** + * Unescape all entities in the path, params, query, and userinfo. + */ + Url& unescape(); + + /** + * Remove any params or queries that appear in the blacklist. + * + * The blacklist should contain only lowercased strings, and the comparison is + * done in a case-insensitive way. + */ + Url& deparam(const std::unordered_set& blacklist); + + /** + * Filter params subject to a predicate for whether it should be filtered. + * + * The predicate must accept two string refs -- the key and value (which may be + * empty). Return `true` if the parameter should be removed, and `false` + * otherwise. + */ + Url& deparam(const deparam_predicate& predicate); + + /** + * Put queries and params in sorted order. + * + * To ensure consistent comparisons, escape should be called beforehand. + */ + Url& sort_query(); + + /** + * Remove the port if it's the default for the scheme. + */ + Url& remove_default_port(); + + /** + * Remove the userinfo portion. + */ + Url& deuserinfo(); + + /** + * Remove the fragment. + */ + Url& defrag(); + + /** + * Punycode the hostname. + */ + Url& punycode(); + + /** + * Unpunycode the hostname. + */ + Url& unpunycode(); + + /** + * Reverse the hostname (a.b.c.d => d.c.b.a) + */ + Url& host_reversed(); + + private: + // Private, unimplemented to prevent use. + Url(); + + /** + * Remove repeated, leading, and trailing instances of chr from the string. 
+ */ + std::string& remove_repeats(std::string& str, const char chr); + + /** + * Ensure all the provided characters are escaped if necessary + */ + std::string& escape(std::string& str, const CharacterClass& safe, bool strict); + + /** + * Unescape entities in the provided string + */ + std::string& unescape(std::string& str); + + /** + * Remove any params that match entries in the blacklist. + */ + std::string& remove_params( + std::string& str, const deparam_predicate& pred, char sep); + + /** + * Split the provided string by char, sort, join by char. + */ + std::string& split_sort_join(std::string& str, const char glue); + + /** + * Check that the hostname is valid, removing an optional trailing '.'. + */ + void check_hostname(std::string& host); + + std::string scheme_; + std::string host_; + int port_; + std::string path_; + std::string params_; + std::string query_; + std::string fragment_; + std::string userinfo_; + bool has_params_; + bool has_query_; + }; + +} + +#endif diff --git a/src/utf8.cpp b/src/utf8.cpp new file mode 100644 index 0000000..3502377 --- /dev/null +++ b/src/utf8.cpp @@ -0,0 +1,150 @@ +#include +#include +#include + +#include "utf8.h" + +namespace Url +{ + + Utf8::codepoint_t Utf8::readCodepoint( + std::string::const_iterator& it, const std::string::const_iterator& end) + { + Utf8::char_t current = static_cast(*it++); + if (current & 0x80) + { + // Number of additional bytes needed + unsigned int bytes = 0; + // The accumulated value + Utf8::codepoint_t result = 0; + if (current < 0xC0) + { + // Invalid sequence + throw std::invalid_argument("Low UTF-8 start byte"); + } + else if (current < 0xE0) + { + // One additional byte, two bytes total, use 5 bits + bytes = 1; + result = current & 0x1F; + } + else if (current < 0xF0) + { + // Two additional bytes, three bytes total, use 4 bits + bytes = 2; + result = current & 0x0F; + } + else if (current < 0xF8) + { + // Three additional bytes, four bytes total, use 3 bits + bytes = 3; + result = 
current & 0x07; + } + else + { + throw std::invalid_argument("High UTF-8 start byte"); + } + + for (; bytes > 0; --bytes) { + if (it == end) + { + throw std::invalid_argument("UTF-8 sequence terminated early."); + } + + current = static_cast(*it++); + // Ensure the first two bits are 10 + if ((current & 0xC0) != 0x80) + { + throw std::invalid_argument("Invalid continuation byte"); + } + result = (result << 6) | (current & 0x3F); + } + + return result; + } + else + { + return current; + } + } + + std::string& Utf8::writeCodepoint(std::string& str, Utf8::codepoint_t value) + { + if (value > MAX_CODEPOINT) + { + throw std::invalid_argument("Code point too high."); + } + else if (value <= 0x007F) + { + // Just append the character itself + str.append(1, static_cast(value)); + return str; + } + + unsigned int bytes = 0; + if (value > 0xFFFF) + { + /** + * 11110xxx + 3 bytes for 21 bits total + * + * We need to take bits 20-18, which 0x1C0000 masks out. These form the least + * significant bits of this byte (so we shift them back down by 18). The 5 + * most significant bits of this byte are 11110, so we OR this result with + * 0xF0 to get this first byte. + * + * The remaining bits will be consumed from the most-significant end and so + * they must be shifted up by (32 - 18) = 14. + */ + str.append(1, static_cast(((value & 0x1C0000) >> 18) | 0xF0)); + bytes = 3; + value <<= 14; + } + else if (value > 0x07FF) + { + /** + * 1110xxxx + 2 bytes for 16 bits total + * + * We need to take bits 15-12, which 0xF000 masks out. These form the least + * significant bits of this byte (so we shift them back down by 12). The 4 + * most significant bits of this byte are 1110, so we OR this result with + * 0xE0 to get this first byte. + * + * The remaining bits will be consumed from the most-significant end and so + * they must be shifted up by (32 - 12) = 20. 
+ */ + str.append(1, static_cast(((value & 0xF000) >> 12) | 0xE0)); + bytes = 2; + value <<= 20; + } + else + { + /** + * 110xxxxx + 1 byte for 11 bits total + * + * We need to take bits 10-6, which 0x7C0 masks out. These form the least + * significant bits of this byte (so we shift them back down by 6). The 3 + * most significant bits of this byte are 110, so we OR this result with + * 0xC0 to get this first byte. + * + * The remaining bits will be consumed from the most-significant end and so + * they must be shifted up by (32 - 6) = 26. + */ + str.append(1, static_cast(((value & 0x7C0) >> 6) | 0xC0)); + bytes = 1; + value <<= 26; + } + + /** + * The remaining bits are to be consumed 6 at a time from the most-significant + * end. The mask 0xFC000000 grabs these six bits, which then must be shifted down + * by 26, and OR'd with 0x80 to produce the continuation byte. + */ + for (; bytes > 0; --bytes, value <<= 6) + { + str.append(1, static_cast(((value & 0xFC000000) >> 26) | 0x80)); + } + + return str; + } + +}; diff --git a/src/utf8.h b/src/utf8.h new file mode 100644 index 0000000..b677ce8 --- /dev/null +++ b/src/utf8.h @@ -0,0 +1,91 @@ +#ifndef UTF8_CPP_H +#define UTF8_CPP_H + +#include +#include +#include + +namespace Url +{ + + /** + * Work between unicode code points and their UTF-8-encoded representation. + */ + struct Utf8 + { + /** + * The type we use to represent Unicode codepoints. + */ + typedef uint32_t codepoint_t; + + /** + * The type we use when talking about the integral value of bytes. + */ + typedef unsigned char char_t; + + /** + * The highest allowed codepoint. + */ + static const codepoint_t MAX_CODEPOINT = 0x10FFFF; + + /** + * Consume up to the last byte of the sequence, returning the codepoint. + */ + static codepoint_t readCodepoint( + std::string::const_iterator& it, const std::string::const_iterator& end); + + /** + * Write a codepoint to the provided string. 
+ */ + static std::string& writeCodepoint(std::string& str, codepoint_t value); + + /** + * Return the first codepoint stored in the provided string. + */ + static codepoint_t toCodepoint(const std::string& str) + { + auto it = str.begin(); + return readCodepoint(it, str.end()); + } + + /** + * Get a string with the provided codepoint. + */ + static std::string fromCodepoint(codepoint_t value) + { + std::string str; + writeCodepoint(str, value); + return str; + } + + /** + * Return all the codepoints in the string. + */ + static std::vector toCodepoints(const std::string& str) + { + std::vector result; + for (auto it = str.begin(); it != str.end(); ) + { + result.push_back(readCodepoint(it, str.end())); + } + return result; + } + + /** + * Create a string from a vector of codepoints. + */ + static std::string fromCodepoints(const std::vector& points) + { + std::string result; + for (auto it = points.begin(); it != points.end(); ++it) + { + writeCodepoint(result, *it); + } + return result; + } + + }; + +} + +#endif diff --git a/tests/test-all.R b/tests/test-all.R new file mode 100644 index 0000000..0f20a7f --- /dev/null +++ b/tests/test-all.R @@ -0,0 +1,3 @@ +library(testthat) +library(robotstxt) +test_check("rep") diff --git a/tests/testthat/test-rep.R b/tests/testthat/test-rep.R new file mode 100644 index 0000000..cb2771f --- /dev/null +++ b/tests/testthat/test-rep.R @@ -0,0 +1,11 @@ +context("basic functionality") +test_that("we can do something", { + + rt <- robxp(robotstxt::get_robotstxt("https://cdc.gov")) + + expect_that(rt, is_a("robxp")) + + expect_that(can_fetch(rt, "/asthma/asthma_stats/default.htm", "*"), equals(TRUE)) + expect_that(can_fetch(rt, "/_borders", "*"), equals(FALSE)) + +})