Browse Source

initial commit

master
boB Rudis 7 years ago
commit
878bb7f045
No known key found for this signature in database GPG Key ID: 1D7529BE14E2BBA9
  1. 11
      .Rbuildignore
  2. 1
      .codecov.yml
  3. 8
      .gitignore
  4. 31
      .travis.yml
  5. 25
      CONDUCT.md
  6. 27
      DESCRIPTION
  7. 2
      LICENSE
  8. 7
      NAMESPACE
  9. 2
      NEWS.md
  10. 19
      R/RcppExports.R
  11. 14
      R/rep-package.R
  12. 47
      R/rep.r
  13. 58
      README.Rmd
  14. 74
      README.md
  15. 23
      man/can_fetch.Rd
  16. 16
      man/print.robxp.Rd
  17. 16
      man/rep.Rd
  18. 19
      man/robxp.Rd
  19. 21
      rep.Rproj
  20. 3
      src/.gitignore
  21. 3
      src/Makevars
  22. 42
      src/RcppExports.cpp
  23. 87
      src/agent.cpp
  24. 70
      src/agent.h
  25. 130
      src/directive.cpp
  26. 67
      src/directive.h
  27. 183
      src/psl.cpp
  28. 102
      src/psl.h
  29. 409
      src/punycode.cpp
  30. 105
      src/punycode.h
  31. 26
      src/repmain.cpp
  32. 188
      src/robots.cpp
  33. 69
      src/robots.h
  34. 962
      src/url.cpp
  35. 323
      src/url.h
  36. 150
      src/utf8.cpp
  37. 91
      src/utf8.h
  38. 3
      tests/test-all.R
  39. 11
      tests/testthat/test-rep.R

11
.Rbuildignore

@ -0,0 +1,11 @@
^.*\.Rproj$
^\.Rproj\.user$
^\.travis\.yml$
^README\.*Rmd$
^README\.*html$
^NOTES\.*Rmd$
^NOTES\.*html$
^\.codecov\.yml$
^README_files$
^doc$
^CONDUCT\.md$

1
.codecov.yml

@ -0,0 +1 @@
comment: false

8
.gitignore

@ -0,0 +1,8 @@
.DS_Store
.Rproj.user
.Rhistory
.RData
.Rproj
src/*.o
src/*.so
src/*.dll

31
.travis.yml

@ -0,0 +1,31 @@
language: r
warnings_are_errors: true
sudo: required
cache: packages
r:
- oldrel
- release
- devel
apt_packages:
- libv8-dev
- xclip
env:
global:
- CRAN: http://cran.rstudio.com
after_success:
- Rscript -e 'covr::codecov()'
notifications:
email:
- bob@rud.is
irc:
channels:
- "104.236.112.222#builds"
nick: travisci

25
CONDUCT.md

@ -0,0 +1,25 @@
# Contributor Code of Conduct
As contributors and maintainers of this project, we pledge to respect all people who
contribute through reporting issues, posting feature requests, updating documentation,
submitting pull requests or patches, and other activities.
We are committed to making participation in this project a harassment-free experience for
everyone, regardless of level of experience, gender, gender identity and expression,
sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion.
Examples of unacceptable behavior by participants include the use of sexual language or
imagery, derogatory comments or personal attacks, trolling, public or private harassment,
insults, or other unprofessional conduct.
Project maintainers have the right and responsibility to remove, edit, or reject comments,
commits, code, wiki edits, issues, and other contributions that are not aligned to this
Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed
from the project team.
Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by
opening an issue or contacting one or more of the project maintainers.
This Code of Conduct is adapted from the Contributor Covenant
(http://contributor-covenant.org), version 1.0.0, available at
http://contributor-covenant.org/version/1/0/0/

27
DESCRIPTION

@ -0,0 +1,27 @@
Package: rep
Type: Package
Title: Tools to Parse and Test Robots Exclusion Protocol Files and Rules
Version: 0.1.0
Date: 2017-08-14
Author: Bob Rudis (bob@rud.is) [aut, cre], SEOmoz, Inc [aut]
Maintainer: Bob Rudis <bob@rud.is>
Description: The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents
a set of standards for allowing or excluding robot/spider crawling of different areas of
site content. Tools are provided which wrap the 'rep-cpp' <https://github.com/seomoz/rep-cpp>
C++ library for processing these 'robots.txt' files.
SystemRequirements: C++11
NeedsCompilation: yes
URL: https://github.com/hrbrmstr/rep
BugReports: https://github.com/hrbrmstr/rep/issues
License: MIT + file LICENSE
Suggests:
testthat,
covr,
robotstxt
Depends:
R (>= 3.2.0)
Imports:
purrr,
Rcpp
RoxygenNote: 6.0.1
LinkingTo: Rcpp

2
LICENSE

@ -0,0 +1,2 @@
YEAR: 2017
COPYRIGHT HOLDER: Bob Rudis

7
NAMESPACE

@ -0,0 +1,7 @@
# Generated by roxygen2: do not edit by hand
S3method(print,robxp)
export(can_fetch)
export(robxp)
importFrom(Rcpp,sourceCpp)
useDynLib(rep, .registration=TRUE)

2
NEWS.md

@ -0,0 +1,2 @@
0.1.0
* Initial release

19
R/RcppExports.R

@ -0,0 +1,19 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
#' Parse robots.txt
#'
#' Thin R-side shim: forwards `content` unchanged to the registered native
#' routine `_rep_rep_parse` and returns whatever that routine returns.
#' This file is generated by Rcpp::compileAttributes(), so lasting
#' documentation changes belong next to the C++ definition.
#'
#' @param content robots.txt text (the native wrapper reads it as one string)
#' @noRd
#'
rep_parse <- function(content) {
.Call(`_rep_rep_parse`, content)
}
#' Path allowed
#'
#' Thin R-side shim: forwards its three arguments unchanged to the registered
#' native routine `_rep_rep_path_allowed` and returns that routine's result.
#' This file is generated by Rcpp::compileAttributes(), so lasting
#' documentation changes belong next to the C++ definition.
#'
#' @param xp object handed to the native routine as-is (an opaque SEXP there)
#' @param path path to test (read as one string by the native wrapper)
#' @param agent user agent to test; defaults to "*"
#' @noRd
#'
rep_path_allowed <- function(xp, path, agent = "*") {
.Call(`_rep_rep_path_allowed`, xp, path, agent)
}

14
R/rep-package.R

@ -0,0 +1,14 @@
#' Tools to Parse and Test Robots Exclusion Protocol Files and Rules
#'
#' The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents a set
#' of standards for allowing or excluding robot/spider crawling of different areas of
#' site content. Tools are provided which wrap the 'rep-cpp' <https://github.com/seomoz/rep-cpp>
#' C++ library for processing these 'robots.txt' files.
#'
#' @md
#' @name rep
#' @docType package
#' @author Bob Rudis (bob@@rud.is)
#' @useDynLib rep, .registration=TRUE
#' @importFrom Rcpp sourceCpp
NULL

47
R/rep.r

@ -0,0 +1,47 @@
#' Create a robots.txt object
#'
#' Parses the text of a robots.txt file and tags the result with class
#' `robxp` so it can be queried with `can_fetch()`.
#'
#' @md
#' @param x character vector containing a complete robots.txt file. When it
#'        has more than one element, each element is treated as one line and
#'        the elements are joined with newlines before parsing.
#' @return the parsed robots.txt, as an object of class `robxp`
#' @export
#' @examples
#' \dontrun{
#' library(robotstxt)
#' rt <- robxp(get_robotstxt("https://cdc.gov"))
#' can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") # TRUE
#' can_fetch(rt, "/_borders", "*") # FALSE
#' }
robxp <- function(x) {
  # The native parser takes exactly one string, so line-per-element input
  # (e.g. the result of readLines()) is collapsed first.
  if (is.character(x) && length(x) > 1L) {
    x <- paste0(x, collapse = "\n")
  }
  parsed <- rep_parse(x)
  class(parsed) <- c("robxp")
  parsed
}
#' Test URL path against robots.txt
#'
#' @md
#' @param obj `robxp` object
#' @param path path(s) to test; a character vector is tested element by element
#' @param user_agent user agent to test
#' @return a logical vector with one element per `path` (`TRUE` when the path
#'         may be fetched by `user_agent`), or `NULL` when `obj` is not a
#'         `robxp` object
#' @export
#' @examples
#' \dontrun{
#' library(robotstxt)
#' rt <- robxp(get_robotstxt("https://cdc.gov"))
#' can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") # TRUE
#' can_fetch(rt, "/_borders", "*") # FALSE
#' }
can_fetch <- function(obj, path = "/", user_agent = "*") {
  # Anything that is not a parsed robots.txt object yields NULL, as before
  if (!inherits(obj, "robxp")) {
    return(NULL)
  }
  # The native routine handles one path per call, so map over `path`;
  # vapply() pins the result to an unnamed logical vector.
  vapply(
    path,
    function(p) rep_path_allowed(obj, p, user_agent),
    logical(1),
    USE.NAMES = FALSE
  )
}
#' Custom printer for 'robxp' objects
#'
#' Prints a fixed one-line banner identifying the object.
#'
#' @md
#' @param x object to print
#' @param ... unused
#' @return `x`, invisibly
#' @export
print.robxp <- function(x, ...) {
  # Terminate the line so the console prompt / following output starts cleanly
  cat("<Robots Exclusion Protocol Object>\n")
  invisible(x)
}

58
README.Rmd

@ -0,0 +1,58 @@
---
output: rmarkdown::github_document
---
`rep` : Tools to Parse and Test Robots Exclusion Protocol Files and Rules
The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents a set of standards for allowing or excluding robot/spider crawling of different areas of site content. Tools are provided which wrap the 'rep-cpp' <https://github.com/seomoz/rep-cpp> C++ library for processing these 'robots.txt' files.
- [`rep-cpp`](https://github.com/seomoz/rep-cpp)
- [`url-cpp`](https://github.com/seomoz/url-cpp)
The following functions are implemented:
- `robxp`: Create a robots.txt object
- `can_fetch`: Test URL path against robots.txt
### Installation
```{r eval=FALSE}
devtools::install_github("hrbrmstr/rep")
```
```{r message=FALSE, warning=FALSE, error=FALSE, include=FALSE}
options(width=120)
```
### Usage
```{r message=FALSE, warning=FALSE, error=FALSE}
library(rep)
library(robotstxt)
# current version
packageVersion("rep")
rt <- robxp(get_robotstxt("https://cdc.gov"))
print(rt)
can_fetch(rt, "/asthma/asthma_stats/default.htm", "*")
can_fetch(rt, "/_borders", "*")
```
### Test Results
```{r message=FALSE, warning=FALSE, error=FALSE}
library(rep)
library(testthat)
date()
test_dir("tests/")
```
### Code of Conduct
Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms.

74
README.md

@ -0,0 +1,74 @@
`rep` : Tools to Parse and Test Robots Exclusion Protocol Files and Rules
The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents a set of standards for allowing or excluding robot/spider crawling of different areas of site content. Tools are provided which wrap the 'rep-cpp' <https://github.com/seomoz/rep-cpp> C++ library for processing these 'robots.txt' files.
- [`rep-cpp`](https://github.com/seomoz/rep-cpp)
- [`url-cpp`](https://github.com/seomoz/url-cpp)
The following functions are implemented:
- `robxp`: Create a robots.txt object
- `can_fetch`: Test URL path against robots.txt
### Installation
``` r
devtools::install_github("hrbrmstr/rep")
```
### Usage
``` r
library(rep)
library(robotstxt)
# current version
packageVersion("rep")
```
## [1] '0.1.0'
``` r
rt <- robxp(get_robotstxt("https://cdc.gov"))
print(rt)
```
## <Robots Exclusion Protocol Object>
``` r
can_fetch(rt, "/asthma/asthma_stats/default.htm", "*")
```
## [1] TRUE
``` r
can_fetch(rt, "/_borders", "*")
```
## [1] FALSE
### Test Results
``` r
library(rep)
library(testthat)
date()
```
## [1] "Mon Aug 14 15:00:16 2017"
``` r
test_dir("tests/")
```
## testthat results ========================================================================================================
## OK: 3 SKIPPED: 0 FAILED: 0
##
## DONE ===================================================================================================================
### Code of Conduct
Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms.

23
man/can_fetch.Rd

@ -0,0 +1,23 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/rep.r
\name{can_fetch}
\alias{can_fetch}
\title{Test URL path against robots.txt}
\usage{
can_fetch(obj, path = "/", user_agent = "*")
}
\arguments{
\item{obj}{\code{robxp} object}
\item{path}{path to test}
\item{user_agent}{user agent to test}
}
\description{
Test URL path against robots.txt
}
\examples{
library(robotstxt)
can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") # TRUE
can_fetch(rt, "/_borders", "*") # FALSE
}

16
man/print.robxp.Rd

@ -0,0 +1,16 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/rep.r
\name{print.robxp}
\alias{print.robxp}
\title{Custom printer for 'robxp' objects}
\usage{
\method{print}{robxp}(x, ...)
}
\arguments{
\item{x}{object to print}
\item{...}{unused}
}
\description{
Custom printer for 'robxp' objects
}

16
man/rep.Rd

@ -0,0 +1,16 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/rep-package.R
\docType{package}
\name{rep}
\alias{rep}
\alias{rep-package}
\title{Tools to Parse and Test Robots Exclusion Protocol Files and Rules}
\description{
The 'Robots Exclusion Protocol' \url{http://www.robotstxt.org/orig.html} documents a set
of standards for allowing or excluding robot/spider crawling of different areas of
site content. Tools are provided which wrap the 'rep-cpp' \url{https://github.com/seomoz/rep-cpp}
C++ library for processing these 'robots.txt' files.
}
\author{
Bob Rudis (bob@rud.is)
}

19
man/robxp.Rd

@ -0,0 +1,19 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/rep.r
\name{robxp}
\alias{robxp}
\title{Create a robots.txt object}
\usage{
robxp(x)
}
\arguments{
\item{x}{atomic character vector containing a complete robots.txt file}
}
\description{
Create a robots.txt object
}
\examples{
library(robotstxt)
can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") # TRUE
can_fetch(rt, "/_borders", "*") # FALSE
}

21
rep.Rproj

@ -0,0 +1,21 @@
Version: 1.0
RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX
StripTrailingWhitespace: Yes
BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageBuildArgs: --resave-data
PackageRoxygenize: rd,collate,namespace

3
src/.gitignore

@ -0,0 +1,3 @@
*.o
*.so
*.dll

3
src/Makevars

@ -0,0 +1,3 @@
CXX_STD = CXX11
PKG_CXXFLAGS =
PKG_LIBS = -L.

42
src/RcppExports.cpp

@ -0,0 +1,42 @@
// Generated by using Rcpp::compileAttributes() -> do not edit by hand
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
#include <Rcpp.h>
using namespace Rcpp;
// rep_parse
// Generated .Call entry point for rep_parse(): converts the incoming SEXP to
// a std::string, calls the C++ function and wraps its result back into a
// SEXP. The body is bracketed by Rcpp's BEGIN_RCPP / END_RCPP macros.
// Regenerate with Rcpp::compileAttributes() rather than editing by hand.
SEXP rep_parse(std::string content);
RcppExport SEXP _rep_rep_parse(SEXP contentSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< std::string >::type content(contentSEXP);
rcpp_result_gen = Rcpp::wrap(rep_parse(content));
return rcpp_result_gen;
END_RCPP
}
// rep_path_allowed
// Generated .Call entry point for rep_path_allowed(): `xp` is passed through
// as a raw SEXP, `path` and `agent` are converted to std::string, and the
// bool result is wrapped back into a SEXP. The body is bracketed by Rcpp's
// BEGIN_RCPP / END_RCPP macros.
// Regenerate with Rcpp::compileAttributes() rather than editing by hand.
bool rep_path_allowed(SEXP xp, std::string path, std::string agent);
RcppExport SEXP _rep_rep_path_allowed(SEXP xpSEXP, SEXP pathSEXP, SEXP agentSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< SEXP >::type xp(xpSEXP);
Rcpp::traits::input_parameter< std::string >::type path(pathSEXP);
Rcpp::traits::input_parameter< std::string >::type agent(agentSEXP);
rcpp_result_gen = Rcpp::wrap(rep_path_allowed(xp, path, agent));
return rcpp_result_gen;
END_RCPP
}
// Registration table for the .Call routines of this package: each entry is
// { name used from R, function pointer, number of arguments }, and the
// all-NULL entry terminates the table.
static const R_CallMethodDef CallEntries[] = {
{"_rep_rep_parse", (DL_FUNC) &_rep_rep_parse, 1},
{"_rep_rep_path_allowed", (DL_FUNC) &_rep_rep_path_allowed, 3},
{NULL, NULL, 0}
};
// Library initialisation hook: registers the table above as the package's
// .Call routines (no .C/.Fortran/.External tables are supplied) and then
// calls R_useDynamicSymbols(dll, FALSE).
RcppExport void R_init_rep(DllInfo *dll) {
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
R_useDynamicSymbols(dll, FALSE);
}

87
src/agent.cpp

@ -0,0 +1,87 @@
#include <algorithm>
#include <sstream>
#include "url.h"
#include "agent.h"
#include "directive.h"
namespace Rep
{
/**
 * Record an Allow rule for the canonically escaped form of `query` and mark
 * the directive list as needing a re-sort. Returns *this for chaining.
 */
Agent& Agent::allow(const std::string& query)
{
    const Directive rule(escape(query), true);
    directives_.push_back(rule);
    sorted_ = false;
    return *this;
}
/**
 * Record a Disallow rule and mark the directive list as needing a re-sort.
 * Returns *this for chaining.
 */
Agent& Agent::disallow(const std::string& query)
{
    // Special case: "Disallow:" means "Allow: /". The empty query is stored
    // as-is (not escaped) in an allowing directive.
    const bool allow_everything = query.empty();
    directives_.push_back(allow_everything
        ? Directive(query, true)
        : Directive(escape(query), false));
    sorted_ = false;
    return *this;
}
/**
 * Return the directives ordered by descending priority. The sort is done
 * lazily: it only runs when a directive was added since the last call.
 */
const std::vector<Directive>& Agent::directives() const
{
    if (sorted_)
    {
        return directives_;
    }

    const auto higher_priority_first = [](const Directive& lhs, const Directive& rhs) {
        return lhs.priority() > rhs.priority();
    };
    std::sort(directives_.begin(), directives_.end(), higher_priority_first);
    sorted_ = true;
    return directives_;
}
bool Agent::allowed(const std::string& query) const
{
std::string path(escape(query));
if (path.compare("/robots.txt") == 0)
{
return true;
}
for (auto directive : directives())
{
if (directive.match(path))
{
return directive.allowed();
}
}
return true;
}
std::string Agent::str() const
{
std::stringstream out;
out << '[';
auto begin = directives().begin();
auto end = directives().end();
if (begin != end)
{
out << "Directive(" << begin->str() << ')';
++begin;
}
for (; begin != end; ++begin)
{
out << ", Directive(" << begin->str() << ')';
}
out << ']';
return out.str();
}
/**
 * Canonicalise a query (a full URL or a path) into the string form used for
 * directive matching: the input is parsed as a Url::Url, then defrag(),
 * escape() and fullpath() are applied in that order and the result returned.
 *
 * NOTE(review): the exact effect of each Url call is defined in url.h, which
 * is not visible here - presumably fragment removal, percent-escaping and
 * extraction of the path portion; confirm against that header.
 */
std::string Agent::escape(const std::string& query)
{
return Url::Url(query).defrag().escape().fullpath();
}
}

70
src/agent.h

@ -0,0 +1,70 @@
#ifndef AGENT_CPP_H
#define AGENT_CPP_H
#include <vector>
#include "directive.h"
namespace Rep
{
/**
 * The rules that apply to one user agent: a list of allow/disallow
 * directives plus a delay value.
 */
class Agent
{
public:
/* The type for the delay. */
typedef float delay_t;
/**
* Construct an agent with no directives. The delay starts at -1.0
* (presumably meaning "no delay specified" - confirm with callers) and the
* empty directive list counts as already sorted.
*/
Agent(): directives_(), delay_(-1.0), sorted_(true) {}
/**
* Add an allowed directive. Returns *this so calls can be chained.
*/
Agent& allow(const std::string& query);
/**
* Add a disallowed directive. Returns *this so calls can be chained.
*/
Agent& disallow(const std::string& query);
/**
* Set the delay for this agent. Returns *this so calls can be chained.
*/
Agent& delay(delay_t value) {
delay_ = value;
return *this;
}
/**
* Return the delay for this agent.
*/
delay_t delay() const { return delay_; }
/**
* A vector of the directives, in priority-sorted order.
*/
const std::vector<Directive>& directives() const;
/**
* Return true if the URL (either a full URL or a path) is allowed.
*/
bool allowed(const std::string& path) const;
/**
* A printable representation of this agent's directives.
*/
std::string str() const;
/**
* Canonically escape the provided query for matching purposes.
*/
static std::string escape(const std::string& query);
private:
// mutable: directives() is const but sorts this vector lazily
mutable std::vector<Directive> directives_;
delay_t delay_;
// false whenever a directive was added since the last sort
mutable bool sorted_;
};
}
#endif

130
src/directive.cpp

@ -0,0 +1,130 @@
#include <algorithm>
#include <locale>
#include <sstream>
#include <string>
#include "url.h"
#include "directive.h"
namespace Rep
{
/**
 * Build a rule from an (already comment- and whitespace-stripped) pattern.
 *
 * Runs of consecutive '*' are collapsed to a single '*' and trailing '*'s
 * are dropped. The priority is the length of the resulting expression.
 */
Directive::Directive(const std::string& line, bool allowed)
    : expression_(line)
    , priority_(line.size())
    , allowed_(allowed)
{
    // Without a wildcard the pattern is used verbatim
    if (expression_.find('*') == std::string::npos)
    {
        return;
    }

    // Collapse consecutive '*'s into one
    const auto collapsed_end = std::unique(
        expression_.begin(), expression_.end(),
        [](char lhs, char rhs) { return lhs == '*' && rhs == '*'; });
    expression_.erase(collapsed_end, expression_.end());

    // Drop trailing '*'s (everything, if the pattern was only '*'s)
    const std::string::size_type last_kept = expression_.find_last_not_of('*');
    expression_.erase(last_kept == std::string::npos ? 0 : last_kept + 1);

    // Priority is the length of the normalised expression
    priority_ = expression_.size();
}
/**
 * Return true if the path [p_begin, p_end) matches the expression
 * [e_begin, e_end).
 *
 * Matching is anchored at the start of the path and is a prefix match: once
 * the expression is consumed, any remaining path is accepted. '*' matches
 * any run of characters, including none; '$' requires the path to be fully
 * consumed at that point. Every other character must match literally.
 */
bool Directive::match(const std::string::const_iterator& e_begin,
                      const std::string::const_iterator& e_end,
                      const std::string::const_iterator& p_begin,
                      const std::string::const_iterator& p_end) const
{
    std::string::const_iterator expression_it = e_begin;
    std::string::const_iterator path_it = p_begin;
    while (expression_it != e_end && path_it != p_end)
    {
        if (*expression_it == '*')
        {
            // Advance past the wildcard and try the rest of the expression at
            // every remaining position of the path, INCLUDING the end of the
            // path, so that '*' may swallow the whole remainder (needed for
            // patterns such as "/foo*$").
            ++expression_it;
            for (;; ++path_it)
            {
                if (match(expression_it, e_end, path_it, p_end))
                {
                    return true;
                }
                if (path_it == p_end)
                {
                    return false;
                }
            }
        }
        else if (*expression_it == '$')
        {
            // This check expects path to be fully consumed. But since one of the
            // criteria of being in this while loop is that we've not fully consumed
            // path, return false.
            return false;
        }
        else if (*expression_it != *path_it)
        {
            // These characters must match
            return false;
        }
        else
        {
            // Advance both by one
            ++path_it;
            ++expression_it;
        }
    }

    // If the path ran out right in front of a wildcard, the wildcard matches
    // the empty string; step over it. (Consecutive '*'s were collapsed by the
    // constructor, so at most one needs skipping.)
    if (expression_it != e_end && *expression_it == '*')
    {
        ++expression_it;
    }

    // Return true only if we've consumed all of the expression
    if (expression_it == e_end)
    {
        return true;
    }
    if (*expression_it == '$')
    {
        return path_it == p_end;
    }
    return false;
}
/**
 * Render the rule as "Allow: <expression>" or "Disallow: <expression>".
 */
std::string Directive::str() const
{
    const char* label = allowed_ ? "Allow: " : "Disallow: ";
    std::stringstream out;
    out << label << expression_;
    return out.str();
}
bool Directive::match(const std::string& path) const
{
return match(expression_.begin(), expression_.end(), path.begin(), path.end());
}
}

67
src/directive.h

@ -0,0 +1,67 @@
#ifndef DIRECTIVE_CPP_H
#define DIRECTIVE_CPP_H

// This header names std::string and size_t, so it includes what it uses
// instead of relying on the including file's include order.
#include <cstddef>
#include <string>

namespace Rep
{
    /**
     * A single Allow/Disallow rule: a path expression, whether it allows or
     * disallows, and a priority used to order rules.
     */
    class Directive
    {
    public:
        /**
         * The type of our priority value.
         */
        typedef std::size_t priority_t;

        /**
         * Default constructor disallowed.
         */
        Directive() = delete;

        /**
         * The input to this constructor must be stripped of comments and trailing
         * whitespace.
         */
        Directive(const std::string& line, bool allowed);

        /**
         * The priority of the rule.
         */
        priority_t priority() const
        {
            return priority_;
        }

        /**
         * Whether or not the provided path matches. The path is expected to be properly
         * escaped.
         */
        bool match(const std::string& path) const;

        /**
         * Whether this rule is for an allow or a disallow.
         */
        bool allowed() const
        {
            return allowed_;
        }

        /**
         * A printable representation of the rule.
         */
        std::string str() const;

    private:
        std::string expression_;
        priority_t priority_;
        bool allowed_;

        /**
         * Return true if p_begin -> p_end matches the expression e_begin -> e_end.
         */
        bool match(const std::string::const_iterator& e_begin,
                   const std::string::const_iterator& e_end,
                   const std::string::const_iterator& p_begin,
                   const std::string::const_iterator& p_end) const;
    };
}
#endif

183
src/psl.cpp

@ -0,0 +1,183 @@
#include <algorithm>
#include <fstream>
#include <iostream>
#include <string>
#include "psl.h"
#include "punycode.h"
namespace Url
{
const std::string PSL::not_found = "";

/**
 * Read public-suffix rules from a stream, one rule per line.
 *
 * Only the text before the first whitespace on a line is used. Blank lines
 * and lines starting with "//" are skipped. "*.<host>" lines are wildcard
 * rules, "!<host>" lines are exception rules, anything else is a plain rule.
 *
 * Throws std::invalid_argument for a malformed wildcard or exception rule.
 */
PSL::PSL(std::istream& stream)
{
    std::string line;
    while (std::getline(stream, line))
    {
        // Only take up to the first whitespace. Rules may contain UTF-8
        // bytes, which are negative as plain char; passing those straight to
        // isspace() is undefined behaviour, hence the unsigned char cast.
        auto it = std::find_if(line.begin(), line.end(), [](char c) {
            return ::isspace(static_cast<unsigned char>(c)) != 0;
        });
        line.erase(it, line.end());

        // Skip blank lines
        if (line.empty())
        {
            continue;
        }

        // Skip comments
        if (line.compare(0, 2, "//") == 0)
        {
            continue;
        }

        // We know the line has at least a single character at this point
        if (line[0] == '*')
        {
            // Line is a wildcard rule
            if (line.size() <= 2 || line[1] != '.')
            {
                throw std::invalid_argument("Wildcard rule must be of form *.<host>");
            }
            // Drop the leading "*." and count one extra level
            add(line, 1, 2);
        }
        else if (line[0] == '!')
        {
            // Line is an exception, take all but the !
            if (line.size() <= 1)
            {
                throw std::invalid_argument("Exception rule has no hostname.");
            }
            // Drop the leading "!" and count one level less
            add(line, -1, 1);
        }
        else
        {
            add(line, 0, 0);
        }
    }
}
/**
 * Read the PSL rules stored in the file at `path`.
 * Throws std::invalid_argument when the file cannot be opened for reading.
 */
PSL PSL::fromPath(const std::string& path)
{
    std::ifstream stream(path);
    if (stream.good())
    {
        return PSL(stream);
    }
    throw std::invalid_argument("Path '" + path + "' inaccessible.");
}
/**
 * Build a PSL from rules held in memory, one rule per line of `str`.
 */
PSL PSL::fromString(const std::string& str)
{
    std::istringstream rules(str);
    return PSL(rules);
}
/**
 * The TLD of the hostname: its last N segments, where N is the TLD length
 * determined from the rules.
 */
std::string PSL::getTLD(const std::string& hostname) const
{
    const size_t tld_segments = getTLDLength(hostname);
    return getLastSegments(hostname, tld_segments);
}
/**
 * The PLD of the hostname: the TLD plus one more segment to its left.
 */
std::string PSL::getPLD(const std::string& hostname) const
{
    const size_t pld_segments = getTLDLength(hostname) + 1;
    return getLastSegments(hostname, pld_segments);
}
/**
 * The (TLD, PLD) pair of the hostname, computing the TLD length only once.
 */
std::pair<std::string, std::string> PSL::getBoth(const std::string& hostname) const
{
    const size_t tld_segments = getTLDLength(hostname);
    // Both lookups stay inside one call expression, as in the original
    return std::pair<std::string, std::string>(
        getLastSegments(hostname, tld_segments),
        getLastSegments(hostname, tld_segments + 1));
}
/**
 * Return the number of segments in the TLD of the provided hostname.
 *
 * The lower-cased hostname is reversed (rules are stored reversed) and its
 * leftmost labels are dropped one at a time until a stored rule matches.
 * When no rule matches, the TLD is taken to be a single segment.
 */
size_t PSL::getTLDLength(const std::string& hostname) const
{
    // Reversed copy of hostname
    std::string tld(hostname.rbegin(), hostname.rend());

    // Hostnames may hold UTF-8 bytes (negative as plain char); tolower() on
    // a negative value is undefined behaviour, so go through unsigned char.
    std::transform(tld.begin(), tld.end(), tld.begin(), [](char c) {
        return static_cast<char>(::tolower(static_cast<unsigned char>(c)));
    });

    while (!tld.empty())
    {
        auto it = levels.find(tld);
        if (it != levels.end())
        {
            return it->second;
        }

        // Drop the last label of the reversed string (the leftmost label of
        // the hostname); with no '.' left this empties the string.
        const size_t position = tld.rfind('.');
        tld.resize(position == std::string::npos ? 0 : position);
    }
    return 1;
}
/**
 * Return the last `segments` dot-separated segments of a hostname,
 * lower-cased.
 *
 * Returns not_found when the hostname has fewer segments than requested.
 * Throws std::invalid_argument when the result starts with '.', i.e. the
 * hostname contains an empty segment at that position.
 */
std::string PSL::getLastSegments(const std::string& hostname, size_t segments) const
{
    size_t position = hostname.size();
    size_t remaining = segments;
    // Walk backwards over one '.' per requested segment
    while (remaining != 0 && position && position != std::string::npos)
    {
        position = hostname.rfind('.', position - 1);
        remaining -= 1;
    }

    if (remaining >= 1)
    {
        return not_found;
    }

    // Return the whole string if position == std::string::npos
    size_t start = (position == std::string::npos) ? 0 : position + 1;
    std::string result(hostname, start);

    // Hostnames may hold UTF-8 bytes (negative as plain char); tolower() on
    // a negative value is undefined behaviour, so go through unsigned char.
    std::transform(result.begin(), result.end(), result.begin(), [](char c) {
        return static_cast<char>(::tolower(static_cast<unsigned char>(c)));
    });

    // Leading .'s indicate that the query had an empty segment
    if (!result.empty() && result[0] == '.')
    {
        std::stringstream message;
        message << "Empty segment in " << result;
        throw std::invalid_argument(message.str());
    }
    return result;
}
/**
 * Return the number of dot-separated segments in a hostname: one more than
 * the number of '.' characters it contains.
 */
size_t PSL::countSegments(const std::string& hostname) const
{
    const auto dots = std::count(hostname.begin(), hostname.end(), '.');
    return static_cast<size_t>(dots) + 1;
}
void PSL::add(std::string& rule, int level_adjust, size_t trim)
{
// First unpunycoded
std::string copy(rule.rbegin(), rule.rend() - trim);
size_t length = countSegments(copy) + level_adjust;
levels[copy] = length;
// And now punycoded
rule = Punycode::encodeHostname(rule);
copy.assign(rule.rbegin(), rule.rend() - trim);
levels[copy] = length;
}
};

102
src/psl.h

@ -0,0 +1,102 @@
#ifndef PSL_CPP_H
#define PSL_CPP_H
#include <istream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
namespace Url
{
/**
* Find TLDs and PLDs of a hostname according to a PSL.
*/
struct PSL
{
/**
* Indicates that there is no TLD / PLD
*/
static const std::string not_found;
/**
* Read a PSL from an istream.
*/
PSL(std::istream& stream);
/**
* Construct a PSL with no rules.
*/
PSL(): levels() { };
/**
* Copy the rules of another PSL.
*/
PSL(const PSL& other): levels(other.levels) { }
/**
* Replace this PSL's rules with a copy of another's.
*/
PSL& operator=(const PSL& other)
{
levels = other.levels;
return *this;
}
/**
* Read the provided path holding a set of PSL rules.
*/
static PSL fromPath(const std::string& path);
/**
* Create a PSL object from a string.
*/
static PSL fromString(const std::string& str);
/**
* Get just the TLD of the hostname.
*
* Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If
* some segments have been appropriately punycoded and others not, it may return
* a wrong answer. If a punycoded host is provided, a punycoded response is
* returned. If an unpunycoded host is provided, an unpunycoded response is
* returned.
*/
std::string getTLD(const std::string& hostname) const;
/**
* Get just the PLD of the hostname.
*
* Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If
* some segments have been appropriately punycoded and others not, it may return
* a wrong answer. If a punycoded host is provided, a punycoded response is
* returned. If an unpunycoded host is provided, an unpunycoded response is
* returned.
*/
std::string getPLD(const std::string& hostname) const;
/**
* Get the (TLD, PLD) of the hostname.
*
* Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If
* some segments have been appropriately punycoded and others not, it may return
* a wrong answer. If a punycoded host is provided, a punycoded response is
* returned. If an unpunycoded host is provided, an unpunycoded response is
* returned.
*/
std::pair<std::string, std::string> getBoth(const std::string& hostname) const;
private:
// Mapping of a string rule to its level
std::unordered_map<std::string, size_t> levels;
// Return the number of segments in a hostname
size_t countSegments(const std::string& hostname) const;
// Return the number of segments in the TLD of the provided hostname
size_t getTLDLength(const std::string& hostname) const;
// Return the last `segments` segments of a hostname
std::string getLastSegments(const std::string& hostname, size_t segments) const;
/**
* Add the provided host, trimming `trim` characters off the front and
* adjusting its level by `level_adjust`.
*/
void add(std::string& host, int level_adjust, size_t trim);
};
}
#endif

409
src/punycode.cpp

@ -0,0 +1,409 @@
#include <algorithm>
#include <string>
#include <iostream>
#include "punycode.h"
#include "utf8.h"
namespace Url
{
/**
* Punycode-encode `str` in place, following the encoding procedure of
* RFC 3492 section 6.3 (the pseudocode is quoted in the comments below).
*
* The input is read as a sequence of code points via Utf8::readCodepoint.
* On success `str` is replaced by the encoded text and a reference to it is
* returned. No "xn--" prefix is added here.
*
* Throws std::invalid_argument when the delta computation would overflow.
*/
std::string& Punycode::encode(std::string& str)
{
// Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.3
//
// let n = initial_n
// let delta = 0
// let bias = initial_bias
punycode_uint n = INITIAL_N;
punycode_uint delta = 0;
punycode_uint bias = INITIAL_BIAS;
std::string output;
// Accumulate every codepoint of the input (basic ones included); the basic
// ones are additionally copied straight to the output below
std::vector<punycode_uint> codepoints;
for (auto it = str.cbegin(); it != str.cend(); )
{
Utf8::codepoint_t value = Utf8::readCodepoint(it, str.cend());
if (value < 0x80)
{
// copy them to the output in order
output.append(1, static_cast<char>(value));
}
codepoints.push_back(value);
}
// let h = b = the number of basic code points in the input
size_t h = output.size();
size_t b = h;
// copy a delimiter if b > 0
if (b > 0)
{
output.append(1, '-');
}
// while h < length(input) do begin
while (h < codepoints.size())
{
// let m = the minimum {non-basic} code point >= n in the input
punycode_uint m = MAX_PUNYCODE_UINT;
for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
{
if ((*it >= n) && (*it < m))
{
m = *it;
}
}
// let delta = delta + (m - n) * (h + 1), fail on overflow
if ((m - n) > ((MAX_PUNYCODE_UINT - delta) / (h + 1)))
{
throw std::invalid_argument("Overflow delta update.");
}
delta += (m - n) * (h + 1);
// let n = m
n = m;
// for each code point c in the input (in order) do begin
for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
{
// if c < n {or c is basic} then increment delta, fail on overflow
if (*it < n)
{
if (delta == MAX_PUNYCODE_UINT)
{
throw std::invalid_argument("Overflow delta increment.");
}
++delta;
}
// if c == n then begin
if (*it == n)
{
// let q = delta
punycode_uint q = delta;
// for k = base to infinity in steps of base do begin
for (punycode_uint k = BASE; ; k += BASE)
{
// let t = tmin if k <= bias {+ tmin}, or
// tmax if k >= bias + tmax, or k - bias otherwise
punycode_uint t = k <= bias ? TMIN :
k >= bias + TMAX ? TMAX : k - bias;
// if q < t then break
if (q < t)
{
break;
}
// output the code point for digit t + ((q - t) mod (base - t))
output.append(1, DIGIT_TO_BASIC[t + ((q - t) % (BASE - t))]);
// let q = (q - t) div (base - t)
q = (q - t) / (BASE - t);
}
// output the code point for digit q
output.append(1, DIGIT_TO_BASIC[q]);
// let bias = adapt(delta, h + 1, test h equals b?)
bias = adapt(delta, h + 1, h == b);
// let delta = 0
delta = 0;
// increment h
++h;
}
}
// increment delta and n
++delta;
++n;
}
str.assign(output);
return str;
}
/**
 * Punycode-encode a copy of `str`, leaving the argument untouched.
 */
std::string Punycode::encode(const std::string& str)
{
    std::string working = str;
    // `working` is a non-const lvalue, so this picks the in-place overload
    return encode(working);
}
/**
 * Punycode a hostname label by label: every dot-separated label that needs
 * punycoding is replaced by "xn--" plus its encoded form, all other labels
 * are copied unchanged. A hostname that needs no punycoding at all is
 * returned as-is.
 */
std::string Punycode::encodeHostname(const std::string& hostname)
{
    // Avoid any punycoding at all if none is needed
    if (!needsPunycoding(hostname))
    {
        return hostname;
    }

    std::string encoded;
    size_t label_start = 0;
    for (;;)
    {
        // npos here means "this is the last label"; substr() then runs to
        // the end of the hostname
        const size_t dot = hostname.find('.', label_start);
        std::string label = hostname.substr(label_start, dot - label_start);

        if (needsPunycoding(label))
        {
            encoded += "xn--";
            encoded += Punycode::encode(label);
        }
        else
        {
            encoded += label;
        }

        if (dot == std::string::npos)
        {
            return encoded;
        }

        encoded += '.';
        label_start = dot + 1;
    }
}
/**
 * Decode the punycode in `str` in place, following the decoding procedure of
 * RFC 3492 section 6.2 (the pseudocode is quoted in the comments below).
 *
 * `str` must not carry an "xn--" prefix. On success it is replaced by the
 * decoded text (written via Utf8::writeCodepoint) and a reference to it is
 * returned.
 *
 * Throws std::invalid_argument for non-basic code points, characters that
 * are not punycode digits, truncated input, or arithmetic overflow.
 */
std::string& Punycode::decode(std::string& str)
{
    // Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.2
    //
    // let n = initial_n
    // let i = 0
    // let bias = initial_bias
    // let output = an empty string indexed from 0
    punycode_uint n = INITIAL_N;
    punycode_uint i = 0;
    punycode_uint bias = INITIAL_BIAS;
    std::vector<punycode_uint> codepoints;

    size_t index = str.rfind('-');
    if (index == std::string::npos)
    {
        index = 0;
    }

    // consume all code points before the last delimiter (if there is one)
    // and copy them to output, fail on any non-basic code point
    for (auto it = str.begin(); it != (str.begin() + index); ++it)
    {
        if (static_cast<unsigned char>(*it) > 127U)
        {
            throw std::invalid_argument("Argument has non-basic code points.");
        }
        codepoints.push_back(*it);
    }

    // if more than zero code points were consumed then consume one more
    // (which will be the last delimiter)
    if (index > 0)
    {
        index += 1;
    }

    // while the input is not exhausted do begin
    for (auto it = (str.begin() + index); it != str.end(); ++it)
    {
        // let oldi = i
        // let w = 1
        punycode_uint oldi = i;
        punycode_uint w = 1;

        // for k = base to infinity in steps of base do begin
        for (punycode_uint k = BASE; ; k += BASE, ++it)
        {
            // consume a code point, or fail if there was none to consume
            if (it == str.end())
            {
                throw std::invalid_argument("Premature termination");
            }

            // let digit = the code point's digit-value, fail if it has none
            //
            // Index through unsigned char: a byte >= 0x80 is negative as
            // plain char and would otherwise convert to an enormous size_t,
            // reading far outside the lookup table. Such a byte is not a
            // basic code point and therefore has no digit value.
            // NOTE(review): BASIC_TO_DIGIT is declared outside this block;
            // this assumes it covers at least indexes 0..127 - confirm.
            const unsigned char byte = static_cast<unsigned char>(*it);
            if (byte > 127U)
            {
                throw std::invalid_argument("Invalid base 36 character.");
            }
            int lookup = BASIC_TO_DIGIT[byte];
            if (lookup == -1)
            {
                throw std::invalid_argument("Invalid base 36 character.");
            }
            unsigned char digit = static_cast<unsigned char>(lookup);

            // let i = i + digit * w, fail on overflow
            if (digit > ((MAX_PUNYCODE_UINT - i) / w))
            {
                throw std::invalid_argument("Overflow on i.");
            }
            i += digit * w;

            // let t = tmin if k <= bias {+ tmin}, or
            //     tmax if k >= bias + tmax, or k - bias otherwise
            punycode_uint t = k <= bias ? TMIN :
                k >= bias + TMAX ? TMAX : k - bias;

            // if digit < t then break
            if (digit < t)
            {
                break;
            }

            // let w = w * (base - t), fail on overflow
            if (w > (MAX_PUNYCODE_UINT / (BASE - t)))
            {
                // I believe this line is unreachable without first overflowing i.
                // Since 'i' is updated above as i += digit * w, and w is updated as
                // w = w * (BASE - t), we should like to keep (BASE - t) > digit to
                // give 'w' a chance to overflow first. To keep t minimized, we must
                // have 'bias' maximized. `bias` is driven by the 'adapt' function
                // below.
                //
                // The value returned by 'adapt' increases with the input delta, and
                // decreases with the input size. The delta is a function of the input
                // size as well, on the order of (delta_n * input size), and
                // legitimate delta_n values are limited to 0x10FFFF (the maximum
                // unicode codepoint). Even setting that aside, the maximum value that
                // adapt() can return is adapt(2 ** 32 - 1, 1, false) = 204.
                //
                // Using this bias, we could use the input (HERE) to get iterations:
                //
                //     digit = b =  1, i =       2, k =  36, t = 1, w = 35
                //     digit = b =  1, i =      37, k =  72, t = 1, w = 1225
                //     digit = b =  1, i =    1262, k = 108, t = 1, w = 42875
                //     digit = b =  1, i =   44137, k = 144, t = 1, w = 1500625
                //     digit = b =  1, i = 1544762, k = 180, t = 1, w = 52521875
                //
                // At this point, t now becomes TMAX (26) because k exceeds the bias
                // (since the maximum bias is 204). As such, the minimum continuation
                // value is 26:
                //
                //     digit = 0 = 26, i = 1367113512, k = 216, t = 26, w = 525218750
                //
                // However, the next iteration now overflows i before we can get to
                // the w update.
                throw std::invalid_argument("Overflow on w."); // LCOV_EXCL_LINE
            }
            w *= (BASE - t);
        }

        // let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?)
        bias = adapt(i - oldi, codepoints.size() + 1, oldi == 0);

        // let n = n + i div (length(output) + 1), fail on overflow
        if ((i / (codepoints.size() + 1)) > (MAX_PUNYCODE_UINT - n))
        {
            throw std::invalid_argument("Overflow on n.");
        }
        n += i / (codepoints.size() + 1);

        // let i = i mod (length(output) + 1)
        i %= (codepoints.size() + 1);

        // insert n into output at position i
        codepoints.insert(codepoints.begin() + i, n);

        // increment i
        ++i;
    }

    std::string output;
    for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
    {
        Utf8::writeCodepoint(output, *it);
    }
    str.assign(output);
    return str;
}
// Non-mutating overload: decodes a private copy of the punycoded input and
// returns the utf-8 result, leaving the argument untouched.
std::string Punycode::decode(const std::string& str)
{
    std::string working(str);
    // `working` is a non-const lvalue, so this selects the in-place overload.
    return decode(working);
}
// Decode a dotted hostname label by label. Labels starting with "xn--" have
// the prefix removed and the remainder punycode-decoded; every other label is
// copied through unchanged. The dots between labels are preserved.
std::string Punycode::decodeHostname(const std::string& hostname)
{
    std::string decoded;
    size_t label_begin = 0;
    for (;;)
    {
        const size_t dot = hostname.find('.', label_begin);
        // When no dot remains, `dot - label_begin` is huge and substr clamps
        // it to the end of the string.
        const std::string label = hostname.substr(label_begin, dot - label_begin);
        if (label.compare(0, 4, "xn--") == 0)
        {
            decoded.append(Punycode::decode(label.substr(4)));
        }
        else
        {
            decoded.append(label);
        }
        if (dot == std::string::npos)
        {
            return decoded;
        }
        decoded.push_back('.');
        label_begin = dot + 1;
    }
}
// Returns true as soon as any byte of `str` has its high bit set, i.e. the
// string contains something other than 7-bit ASCII.
bool Punycode::needsPunycoding(const std::string& str)
{
    for (const char byte : str)
    {
        if ((static_cast<unsigned char>(byte) & 0x80) != 0)
        {
            return true;
        }
    }
    return false;
}
// Bias adaptation, following the pseudocode in
// https://tools.ietf.org/html/rfc3492#section-6.1
//
// `delta` and `k` here are local to this function: the caller overwrites its
// own variables of the same name before reading them again, so modifying the
// by-value parameter is harmless.
Punycode::punycode_uint Punycode::adapt(
    punycode_uint delta, punycode_uint numpoints, bool firsttime)
{
    // Damp the very first delta heavily; afterwards just halve it.
    if (firsttime)
    {
        delta = delta / DAMP;
    }
    else
    {
        delta = delta >> 1;
    }
    // Compensate for the longer string the next delta will be inserted into.
    delta += (delta / numpoints);

    // Divide delta down until it falls within the threshold, accumulating
    // `BASE` into k for every division performed.
    const unsigned int threshold = ((BASE - TMIN) * TMAX) / 2;
    punycode_uint k = 0;
    while (delta > threshold)
    {
        delta /= (BASE - TMIN);
        k += BASE;
    }
    return k + (((BASE - TMIN + 1) * delta) / (delta + SKEW));
}
};

105
src/punycode.h

@ -0,0 +1,105 @@
#ifndef PUNYCODE_CPP_H
#define PUNYCODE_CPP_H
#include <cstdint>
#include <limits>
#include <stdexcept>
#include <string>
#include <vector>
#include <unordered_map>
#include <unordered_set>
#include "utf8.h"
namespace Url
{
    /**
     * Punycode (RFC 3492) encoding and decoding of utf-8 strings and
     * hostnames.
     */
    namespace Punycode
    {
        // Integer type used for code points and all punycode arithmetic.
        typedef Utf8::codepoint_t punycode_uint;

        // Bootstring parameter values used by the encoder/decoder.
        const unsigned int BASE = 36;
        const unsigned int TMIN = 1;
        const unsigned int TMAX = 26;
        const unsigned int SKEW = 38;
        const unsigned int DAMP = 700;
        const unsigned int INITIAL_BIAS = 72;
        const unsigned int INITIAL_N = 128;

        // Byte value (0-255) to its base-36 digit value, or -1 when the byte
        // is not a base-36 digit. Letters map to 0-25 (case-insensitively)
        // and '0'-'9' map to 26-35. Index with an unsigned byte value.
        const std::vector<std::int8_t> BASIC_TO_DIGIT = {
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
            -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
            15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
            -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
            15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
        };

        // Base-36 digit value (0-35) to the character that represents it.
        const std::string DIGIT_TO_BASIC = "abcdefghijklmnopqrstuvwxyz0123456789";

        // The largest value representable in punycode_uint. This is the bound
        // used by the overflow checks; it is NOT the highest Unicode code
        // point.
        const punycode_uint MAX_PUNYCODE_UINT =
            std::numeric_limits<punycode_uint>::max();

        /**
         * Replace utf-8-encoded str into punycode.
         */
        std::string& encode(std::string& str);

        /**
         * Create a new punycoded string from utf-8-encoded input.
         */
        std::string encode(const std::string& str);

        /**
         * Encode a hostname.
         */
        std::string encodeHostname(const std::string& hostname);

        /**
         * Replace punycoded str into utf-8-encoded.
         */
        std::string& decode(std::string& str);

        /**
         * Create a new utf-8-encoded string from punycoded input.
         */
        std::string decode(const std::string& str);

        /**
         * Decode a hostname.
         */
        std::string decodeHostname(const std::string& hostname);

        /**
         * Determine if a string needs punycoding.
         */
        bool needsPunycoding(const std::string& str);

        /**
         * Internal function for calculating bias.
         */
        punycode_uint adapt(
            punycode_uint delta, punycode_uint numpoints, bool firsttime);
    }
}
#endif

26
src/repmain.cpp

@ -0,0 +1,26 @@
#include <Rcpp.h>
using namespace Rcpp;
#include "url.h"
#include "robots.h"
//' Parse robots.txt
//'
//' @noRd
//'
// [[Rcpp::export]]
SEXP rep_parse(std::string content) {
    // Parse the robots.txt text and hand the resulting object to R wrapped
    // in an external pointer.
    Rep::Robots* parsed = new Rep::Robots(content);
    Rcpp::XPtr<Rep::Robots> handle(parsed);
    return handle;
}
//' Path allowed
//'
//' @noRd
//'
// [[Rcpp::export]]
bool rep_path_allowed(SEXP xp, std::string path, std::string agent = "*") {
    // `xp` is the external pointer produced by rep_parse().
    Rcpp::XPtr<Rep::Robots> robots(xp);
    const bool permitted = robots->allowed(path, agent);
    return permitted;
}

188
src/robots.cpp

@ -0,0 +1,188 @@
#include <algorithm>
#include <functional>
#include <cctype>
#include <locale>
#include <sstream>
#include <iostream>
#include <unordered_map>
#include "url.h"
#include "robots.h"
#include <Rcpp.h>
namespace Rep
{
// Trim leading and trailing whitespace from `string`, in place.
//
// The character is converted to unsigned char before calling std::isspace:
// passing a negative char (any byte >= 0x80 where char is signed) is
// undefined behaviour. A lambda replaces std::ptr_fun / std::not1, which
// were deprecated in C++11 and removed in C++17.
void Robots::strip(std::string& string)
{
    const auto not_space = [](char c)
    {
        return !std::isspace(static_cast<unsigned char>(c));
    };
    // Drop everything before the first non-space character.
    string.erase(
        string.begin(),
        std::find_if(string.begin(), string.end(), not_space));
    // Drop everything after the last non-space character.
    string.erase(
        std::find_if(string.rbegin(), string.rend(), not_space).base(),
        string.end());
}
// Read lines from `stream` until one of the form "key: value" is found.
//
// Anything from '#' onwards is discarded as a comment and lines without a
// ':' are skipped. On success `key` (trimmed, lowercased) and `value`
// (trimmed) are filled in and true is returned; false means the stream ran
// out of lines.
bool Robots::getpair(std::istringstream& stream, std::string& key, std::string& value)
{
    while (std::getline(stream, key))
    {
        // Drop a trailing comment, if any
        const size_t hash = key.find('#');
        if (hash != std::string::npos)
        {
            key.erase(hash);
        }

        const size_t colon = key.find(':');
        if (colon != std::string::npos)
        {
            // Everything after the colon is the value, everything before it
            // is the key
            value = key.substr(colon + 1);
            key.erase(colon);

            strip(key);
            strip(value);

            std::transform(key.begin(), key.end(), key.begin(), ::tolower);
            return true;
        }
        // No colon: malformed line, move on to the next one
    }
    return false;
}
// Parse the text of a robots.txt file.
//
// Rules are accumulated per user agent. Several consecutive "User-agent"
// lines form a group: the first one receives the rules that follow and the
// others get a copy of that agent once the group is complete. Rules that
// appear before any "User-agent" line are applied to the "*" agent.
//
// The agent currently receiving rules is tracked with a pointer rather than
// a map iterator: inserting into an unordered_map may rehash, which
// invalidates iterators but leaves pointers and references to elements valid.
Robots::Robots(const std::string& content): agents_(), sitemaps_(), default_(agents_["*"])
{
    std::string agent_name("*");
    std::istringstream input(content);
    // Skip a leading utf-8 byte order mark
    if (content.compare(0, 3, "\xEF\xBB\xBF") == 0)
    {
        input.ignore(3);
    }

    std::string key, value;
    std::vector<std::string> group;
    bool last_agent = false;
    Agent* current = &default_;

    // Copy the current agent's rules to the other agents named in its group.
    // Nothing is flushed while the current agent name is empty, in which
    // case the pending group is carried over.
    const auto flush_group = [&]()
    {
        if (agent_name.empty())
        {
            return;
        }
        for (const auto& other : group)
        {
            agents_[other] = *current;
        }
        group.clear();
    };

    while (Robots::getpair(input, key, value))
    {
        if (key.compare("user-agent") == 0)
        {
            // Store the user agent string as lowercased
            std::transform(value.begin(), value.end(), value.begin(), ::tolower);
            if (last_agent)
            {
                // Another name for the group that was just started
                group.push_back(value);
            }
            else
            {
                // A new group begins: finish the previous one first
                flush_group();
                agent_name = value;
                current = &agents_.emplace(agent_name, Agent()).first->second;
            }
            last_agent = true;
            continue;
        }
        last_agent = false;

        if (key.compare("sitemap") == 0)
        {
            sitemaps_.push_back(value);
        }
        else if (key.compare("disallow") == 0)
        {
            current->disallow(value);
        }
        else if (key.compare("allow") == 0)
        {
            current->allow(value);
        }
        else if (key.compare("crawl-delay") == 0)
        {
            try
            {
                current->delay(std::stof(value));
            }
            catch (const std::exception&)
            {
                // An unparseable delay is reported and otherwise ignored
                Rcpp::Rcout << "Could not parse " << value << " as float." << std::endl;
            }
        }
    }

    // Finish the last group in the file
    flush_group();
}
// Look up the rules for a user agent, case-insensitively. Agents that are
// not listed get the default ("*") agent.
const Agent& Robots::agent(const std::string& name) const
{
    std::string needle(name);
    for (char& c : needle)
    {
        c = ::tolower(c);
    }

    const auto found = agents_.find(needle);
    if (found != agents_.end())
    {
        return found->second;
    }
    return default_;
}
bool Robots::allowed(const std::string& path, const std::string& name) const
{
return agent(name).allowed(path);
}
// Render the agents as `{"name": <agent>, "name": <agent>, ...}` in the
// map's iteration order.
std::string Robots::str() const
{
    std::stringstream out;
    // TODO: include sitepath info
    out << '{';
    const char* separator = "";
    for (const auto& entry : agents_)
    {
        out << separator << '"' << entry.first << '"' << ": " << entry.second.str();
        // Every entry after the first is preceded by a comma
        separator = ", ";
    }
    out << '}';
    return out.str();
}
std::string Robots::robotsUrl(const std::string& url)
{
return Url::Url(url)
.setUserinfo("")
.setPath("robots.txt")
.setParams("")
.setQuery("")
.setFragment("")
.remove_default_port()
.str();
}
}

69
src/robots.h

@ -0,0 +1,69 @@
#ifndef ROBOTS_CPP_H
#define ROBOTS_CPP_H
#include <sstream>
#include <unordered_map>
#include <vector>
#include "agent.h"
namespace Rep
{
/**
 * A parsed robots.txt: a set of per-agent rules plus the listed sitemaps.
 */
class Robots
{
public:
    typedef std::unordered_map<std::string, Agent> agent_map_t;
    typedef std::vector<std::string> sitemaps_t;

    /**
     * Create a robots.txt from a utf-8-encoded string.
     */
    Robots(const std::string& content);

    /**
     * Instantiate a Robots object.
     */
    Robots(
        const agent_map_t& agents,
        const sitemaps_t& sitemaps)
        : agents_(agents)
        , sitemaps_(sitemaps)
        , default_(agents_["*"]) {}

    /**
     * Copy a Robots object.
     *
     * default_ is a reference into this object's own agents_ map, so it must
     * be rebound to the copy's map. The implicitly-generated copy constructor
     * would leave it referring to the "*" agent owned by `other`, which
     * dangles once `other` is destroyed.
     */
    Robots(const Robots& other)
        : agents_(other.agents_)
        , sitemaps_(other.sitemaps_)
        , default_(agents_["*"]) {}

    /**
     * Get the sitemaps in this robots.txt
     */
    const sitemaps_t& sitemaps() const { return sitemaps_; }

    /**
     * Get the agent with the corresponding name.
     */
    const Agent& agent(const std::string& name) const;

    /**
     * Return true if agent is allowed to fetch the URL (either a
     * full URL or a path).
     */
    bool allowed(const std::string& path, const std::string& name) const;

    /**
     * Get a string representation of the agents in this robots.txt.
     */
    std::string str() const;

    /**
     * Return the robots.txt URL corresponding to the provided URL.
     */
    static std::string robotsUrl(const std::string& url);

private:
    // Trim leading and trailing whitespace in place.
    static void strip(std::string& string);

    // Read the next "key: value" line from the stream; false when exhausted.
    static bool getpair(
        std::istringstream& stream, std::string& key, std::string& value);

    // Declaration order matters: default_ is initialised from agents_.
    agent_map_t agents_;
    sitemaps_t sitemaps_;
    // The "*" entry of agents_, used for agents that are not listed.
    Agent& default_;
};
}
#endif

962
src/url.cpp

@ -0,0 +1,962 @@
#include <algorithm>
#include <string>
#include <iterator>
#include <unordered_map>
#include <unordered_set>
#include <iostream>
#include <iterator>
#include <sstream>
#include "url.h"
#include "punycode.h"
namespace Url
{
/* Character classes */
// Each class is defined by listing its member characters. The composite
// classes below are built by concatenating the member strings of classes
// defined earlier in this file, so the order of these definitions matters.
const CharacterClass Url::GEN_DELIMS(":/?#[]@");
const CharacterClass Url::SUB_DELIMS("!$&'()*+,;=");
const CharacterClass Url::DIGIT("0123456789");
const CharacterClass Url::ALPHA(
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
const CharacterClass Url::UNRESERVED(
Url::ALPHA.chars() + Url::DIGIT.chars() + "-._~");
const CharacterClass Url::RESERVED(
Url::GEN_DELIMS.chars() + Url::SUB_DELIMS.chars());
const CharacterClass Url::PCHAR(
Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":@");
// Characters that escape() leaves unescaped in the path.
const CharacterClass Url::PATH(
Url::PCHAR.chars() + "/");
// Characters that escape() leaves unescaped in the query and the params.
const CharacterClass Url::QUERY(
Url::PCHAR.chars() + "/?");
const CharacterClass Url::FRAGMENT(
Url::PCHAR.chars() + "/?");
// Characters that escape() leaves unescaped in the userinfo.
const CharacterClass Url::USERINFO(
Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":");
// Uppercase hex digits; chars() is also indexed by nibble value to produce
// the digits of a %XX escape.
const CharacterClass Url::HEX("0123456789ABCDEF");
// Characters permitted in a scheme by the URL parser.
const CharacterClass Url::SCHEME(
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+-.");
// Byte value (0-255) to the value of that byte as a hexadecimal digit, or -1
// when the byte is not a hex digit. Both 'A'-'F' and 'a'-'f' map to 10-15.
const std::vector<signed char> Url::HEX_TO_DEC = {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1,
-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
// Default port per scheme; remove_default_port() resets the port to 0 when
// it equals the entry for the URL's scheme.
const std::unordered_map<std::string, int> Url::PORTS = {
{"http", 80},
{"https", 443}
};
// Schemes for which relative_to() resolves a URL against a base. The empty
// string stands for a URL with no scheme.
const std::unordered_set<std::string> Url::USES_RELATIVE = {
"",
"file",
"ftp",
"gopher",
"http",
"https",
"imap",
"mms",
"nntp",
"prospero",
"rtsp",
"rtspu",
"sftp",
"shttp",
"svn",
"svn+ssh",
"wais"
};
// Schemes that str() writes with "://" after the scheme; any other scheme is
// followed by just ":".
const std::unordered_set<std::string> Url::USES_NETLOC = {
"",
"file",
"ftp",
"git",
"git+ssh",
"gopher",
"http",
"https",
"imap",
"mms",
"nfs",
"nntp",
"prospero",
"rsync",
"rtsp",
"rtspu",
"sftp",
"shttp",
"snews",
"svn",
"svn+ssh",
"telnet",
"wais"
};
// Schemes for which the parser splits ';'-introduced params off the path.
const std::unordered_set<std::string> Url::USES_PARAMS = {
"",
"ftp",
"hdl",
"http",
"https",
"imap",
"mms",
"prospero",
"rtsp",
"rtspu",
"sftp",
"shttp",
"sip",
"sips",
"tel"
};
// Schemes the parser accepts even when everything after the first ':' is
// digits (which would otherwise look like a host:port pair).
const std::unordered_set<std::string> Url::KNOWN_PROTOCOLS = {
"",
"file",
"ftp",
"git",
"git+ssh",
"gopher",
"hdl",
"http",
"https",
"imap",
"mms",
"nfs",
"nntp",
"prospero",
"rsync",
"rtsp",
"rtspu",
"sftp",
"shttp",
"sip",
"sips",
"sms",
"snews",
"svn",
"svn+ssh",
"tel",
"telnet",
"wais"
};
// Parse `url` into scheme, userinfo, host, port, path, params, query and
// fragment. Throws UrlParseException when the port is malformed or out of
// range.
Url::Url(const std::string& url): port_(0), has_params_(false), has_query_(false)
{
    size_t cursor = 0;
    size_t found = url.find(':');

    // --- Scheme -----------------------------------------------------------
    // Everything before the first ':' is a candidate, provided all of its
    // characters are in SCHEME.
    const bool scheme_chars_ok = (found != std::string::npos)
        && std::all_of(
            url.begin(),
            url.begin() + found,
            [](char c) { return SCHEME(c); });
    if (scheme_chars_ok)
    {
        std::string candidate(url, 0, found);
        std::transform(
            candidate.begin(), candidate.end(), candidate.begin(), ::tolower);

        // If the ':' is followed by one or more characters that are all
        // digits, this may be a host:port pair instead; then the candidate
        // only counts as a scheme when it is a known protocol.
        const bool rest_all_digits = ((found + 1) < url.length())
            && std::all_of(
                url.begin() + found + 1,
                url.end(),
                [](char c) { return DIGIT(c); });
        if (!rest_all_digits
            || KNOWN_PROTOCOLS.find(candidate) != KNOWN_PROTOCOLS.end())
        {
            scheme_ = candidate;
            cursor = found + 1;
        }
    }

    // --- Netloc -----------------------------------------------------------
    if ((url.length() - cursor) >= 1
        && url[cursor] == '/'
        && url[cursor + 1] == '/')
    {
        // Skip the '//'
        cursor += 2;
        found = url.find_first_of("/?#", cursor);
        host_.assign(url, cursor, found - cursor);
        cursor = found;

        // Split off any userinfo
        const size_t at = host_.find('@');
        if (at != std::string::npos)
        {
            userinfo_ = host_.substr(0, at);
            host_.erase(0, at + 1);
        }

        // Lowercase the hostname
        std::transform(host_.begin(), host_.end(), host_.begin(), ::tolower);

        // Split off a port, if present. An empty port leaves port_ at 0.
        const size_t colon = host_.find(':');
        if (colon != std::string::npos)
        {
            const std::string portText = host_.substr(colon + 1);
            host_.erase(colon);
            if (!portText.empty())
            {
                size_t consumed = 0;
                try
                {
                    port_ = std::stoi(portText, &consumed);
                }
                catch (const std::invalid_argument&)
                {
                    // Malformed port
                    throw UrlParseException("Port not a number: " + portText);
                }
                catch (const std::out_of_range&)
                {
                    throw UrlParseException("Port out of integer range: " + portText);
                }

                if (consumed != portText.length())
                {
                    // Trailing non-numeric characters
                    throw UrlParseException("Port not a number: " + portText);
                }
                if (port_ > 65535)
                {
                    throw UrlParseException("Port too high: " + portText);
                }
                if (port_ < 0)
                {
                    throw UrlParseException("Port negative: " + portText);
                }
            }
        }
    }

    // --- Path, fragment, query, params ------------------------------------
    // cursor is npos when the netloc ran to the end of the string.
    if (cursor == std::string::npos)
    {
        return;
    }
    path_.assign(url, cursor, std::string::npos);

    size_t cut = path_.find('#');
    if (cut != std::string::npos)
    {
        fragment_ = path_.substr(cut + 1);
        path_.erase(cut);
    }

    cut = path_.find('?');
    if (cut != std::string::npos)
    {
        query_ = path_.substr(cut + 1);
        has_query_ = true;
        path_.erase(cut);
    }

    if (USES_PARAMS.find(scheme_) != USES_PARAMS.end())
    {
        cut = path_.find(';');
        if (cut != std::string::npos)
        {
            params_ = path_.substr(cut + 1);
            has_params_ = true;
            path_.erase(cut);
        }
    }
}
// Take on the value of `other`; equivalent to copy assignment.
Url& Url::assign(const Url& other)
{
    *this = other;
    return *this;
}
// Exact, component-wise equality: every field, including the has_params /
// has_query flags, must match.
bool Url::operator==(const Url& other) const
{
    // Cheap scalar comparisons first
    if (port_ != other.port_)
    {
        return false;
    }
    if (has_params_ != other.has_params_ || has_query_ != other.has_query_)
    {
        return false;
    }
    return scheme_ == other.scheme_
        && userinfo_ == other.userinfo_
        && host_ == other.host_
        && path_ == other.path_
        && params_ == other.params_
        && query_ == other.query_
        && fragment_ == other.fragment_;
}
// Negation of operator==.
bool Url::operator!=(const Url& other) const
{
    return !(*this == other);
}
bool Url::equiv(const Url& other)
{
Url self_(*this);
Url other_(other);
self_.strip()
.sort_query()
.defrag()
.deuserinfo()
.abspath()
.escape()
.punycode()
.remove_default_port();
other_.strip()
.sort_query()
.defrag()
.deuserinfo()
.abspath()
.escape()
.punycode()
.remove_default_port();
return self_ == other_;
}
// Collapse runs of `chr` in `str` to a single occurrence and drop any
// leading or trailing occurrence, in place. Returns `str`.
std::string& Url::remove_repeats(std::string& str, const char chr)
{
    std::string compact;
    compact.reserve(str.size());

    // Starting as if a `chr` had just been seen drops leading occurrences
    bool previous_was_chr = true;
    for (const char current : str)
    {
        const bool is_chr = (current == chr);
        if (!(previous_was_chr && is_chr))
        {
            compact.push_back(current);
        }
        previous_was_chr = is_chr;
    }

    // At most one trailing `chr` can be left at this point
    if (!compact.empty() && compact.back() == chr)
    {
        compact.pop_back();
    }

    str.swap(compact);
    return str;
}
// Path, params, query and fragment joined into one string that always starts
// with '/'.
std::string Url::fullpath() const
{
    const bool rooted = !path_.empty() && path_[0] == '/';
    std::string result(rooted ? "" : "/");
    result += path_;

    if (has_params_)
    {
        result += ';';
        result += params_;
    }
    if (has_query_)
    {
        result += '?';
        result += query_;
    }
    if (!fragment_.empty())
    {
        result += '#';
        result += fragment_;
    }
    return result;
}
// Assemble the components back into a URL string.
std::string Url::str() const
{
    std::string result;

    // Scheme, followed by "://" for netloc schemes and ":" otherwise. A
    // scheme-less URL with a host starts with "//".
    if (!scheme_.empty())
    {
        result += scheme_;
        const bool netloc = USES_NETLOC.find(scheme_) != USES_NETLOC.end();
        result += netloc ? "://" : ":";
    }
    else if (!host_.empty())
    {
        result += "//";
    }

    if (!userinfo_.empty())
    {
        result += userinfo_;
        result += '@';
    }

    // Appending an empty host is a no-op
    result += host_;

    // A port of 0 means "no port"
    if (port_)
    {
        result += ':';
        result += std::to_string(port_);
    }

    if (!path_.empty())
    {
        // Make sure the host and the path are separated by a slash
        if (!host_.empty() && path_[0] != '/')
        {
            result += '/';
        }
        result += path_;
    }
    else if (!result.empty())
    {
        // An empty path still gets a '/' once anything precedes it
        result += '/';
    }

    if (has_params_)
    {
        result += ';';
        result += params_;
    }
    if (has_query_)
    {
        result += '?';
        result += query_;
    }
    if (!fragment_.empty())
    {
        result += '#';
        result += fragment_;
    }
    return result;
}
// Remove leading '?' characters from the query, then remove repeated,
// leading and trailing separators from the query ('&') and params (';').
Url& Url::strip()
{
    const size_t first_kept = query_.find_first_not_of('?');
    if (first_kept == std::string::npos)
    {
        // Empty, or nothing but '?'
        query_.clear();
    }
    else
    {
        query_.erase(0, first_kept);
    }

    // Going through the setters keeps has_query_ / has_params_ in sync
    setQuery(remove_repeats(query_, '&'));
    setParams(remove_repeats(params_, ';'));
    return *this;
}
// Normalise the path in place: empty segments (repeated slashes) are skipped,
// "." segments are dropped and ".." segments remove the segment before them.
// Whether the original path was rooted, and whether the result names a
// directory (trailing slash), is reflected in the rewritten path.
Url& Url::abspath()
{
// The normalised path is built up in `copy`.
std::string copy;
// Offsets into `copy` at which each retained segment starts, so that ".."
// can truncate `copy` back to the start of the previous segment.
std::vector<size_t> segment_starts;
if (path_.size() >= 1 && path_[0] == '/')
{
// Rooted path: keep the leading slash, recorded as a segment at offset 0
copy.append(1, '/');
segment_starts.push_back(0);
}
// Whether the most recently processed segment denotes a directory
bool directory = false;
// [previous, index) delimits the segment being examined
size_t previous = 0;
size_t index = 0;
for (index = path_.find('/')
; index != std::string::npos
; previous = index + 1, index = path_.find('/', index + 1))
{
// Skip empty segments
if (index - previous == 0)
{
continue;
}
if ((index - previous == 2)
&& path_[previous] == '.'
&& path_[previous + 1] == '.')
{
// "..": drop the previously retained segment, if there is one
if (!segment_starts.empty())
{
copy.resize(segment_starts.back());
segment_starts.pop_back();
}
directory = true;
}
else if ((index - previous == 1) && path_[previous] == '.')
{
// ".": contributes nothing
directory = true;
}
else
{
// Ordinary segment: copy it followed by a slash
segment_starts.push_back(copy.length());
copy.append(path_, previous, index - previous);
copy.append(1, '/');
directory = false;
}
}
// Handle the last segment
index = path_.length();
if (previous == path_.length())
{
// The path ended with a slash (or was empty)
directory = true;
}
else if ((index - previous == 1) && path_[previous] == '.')
{
directory = true;
}
else if ((index - previous == 2)
&& path_[previous] == '.'
&& path_[previous + 1] == '.')
{
if (!segment_starts.empty())
{
copy.resize(segment_starts.back());
}
directory = true;
}
else
{
copy.append(path_, previous, index - previous);
copy.append(1, '/');
directory = false;
}
if (!directory && copy.size() >= 1)
{
// Not a directory: remove the slash appended after the last segment
copy.resize(copy.size() - 1);
}
else if (directory && copy.empty())
{
// Everything was consumed: the result is the root
copy.append(1, '/');
}
path_.assign(copy);
return *this;
}
// Resolve this URL against the base URL `other`, storing the result in this
// object.
Url& Url::relative_to(const Url& other)
{
    // Schemes that do not support relative resolution are left untouched
    if (USES_RELATIVE.find(scheme_) == USES_RELATIVE.end())
    {
        return *this;
    }

    // Scheme-relative URL: inherit the base scheme
    if (scheme_.empty())
    {
        scheme_ = other.scheme_;
    }

    // With a host of its own the URL is already absolute
    if (!host_.empty())
    {
        return *this;
    }

    // Otherwise the location comes from the base
    host_ = other.host_;
    port_ = other.port_;
    userinfo_ = other.userinfo_;

    // An absolute path needs no further work
    if (!path_.empty() && path_.front() == '/')
    {
        return *this;
    }

    // Position of the last '/' of the base path; everything up to and
    // including it is the base "directory"
    const size_t last_slash = other.path_.rfind('/');

    if (!path_.empty())
    {
        // Relative path: evaluate it inside the base directory
        if (last_slash != std::string::npos)
        {
            path_ = other.path_.substr(0, last_slash + 1) + path_;
        }
        else if (!host_.empty())
        {
            path_ = "/" + path_;
        }
        return *this;
    }

    // Empty path
    if (!params_.empty())
    {
        // Own params: keep them and take just the base directory. When the
        // base path has no '/', npos + 1 wraps to 0 and the path stays empty.
        path_.assign(other.path_, 0, last_slash + 1);
    }
    else
    {
        // No params either: inherit path and params, and the query as well
        // when there is none of our own
        path_ = other.path_;
        params_ = other.params_;
        has_params_ = other.has_params_;
        if (query_.empty())
        {
            query_ = other.query_;
            has_query_ = other.has_query_;
        }
    }
    if (fragment_.empty())
    {
        fragment_ = other.fragment_;
    }
    return *this;
}
// Escape each escapable component against its own set of safe characters.
// Note that the params are escaped using the QUERY class.
Url& Url::escape(bool strict)
{
    escape(path_, PATH, strict);
    escape(query_, QUERY, strict);
    escape(params_, QUERY, strict);
    escape(userinfo_, USERINFO, strict);

    return *this;
}
// Percent-escape `str` in place so that only characters in `safe` appear
// literally, and return it.
//
// Existing valid %XX entities are normalised: in non-strict mode they are
// decoded and then re-escaped only if the decoded byte is not safe; in
// strict mode they are decoded only when the byte is both safe and not
// reserved, and are otherwise kept with their hex digits uppercased. A '%'
// that does not start a valid entity is treated like any other character.
std::string& Url::escape(std::string& str, const CharacterClass& safe, bool strict)
{
    std::string copy(str);
    size_t dest = 0;
    // Allocate space pessimistically -- if every entity is expanded, it will
    // take 3x the space.
    str.resize(str.length() * 3);
    for (size_t src = 0; src < copy.length(); ++src)
    {
        // An entity needs two more characters after the '%'
        if (copy[src] == '%' && (copy.length() - src) >= 3)
        {
            // Read ahead to see if there's a valid escape sequence. If not,
            // treat this like a normal character. The lookup table has 256
            // entries and must be indexed with the unsigned byte value: a
            // plain (possibly negative) char would index out of bounds for
            // bytes >= 0x80.
            const signed char high =
                HEX_TO_DEC[static_cast<unsigned char>(copy[src + 1])];
            const signed char low =
                HEX_TO_DEC[static_cast<unsigned char>(copy[src + 2])];
            if (high != -1 && low != -1)
            {
                const char value = static_cast<char>(high * 16 + low);
                // In strict mode, we can only unescape parameters if they
                // are both safe and not reserved
                if (!strict || (safe(value) && !RESERVED(value)))
                {
                    // Replace src + 2 with that byte, advance src to consume
                    // the entity and let the code below decide whether the
                    // byte has to be escaped again.
                    src += 2;
                    copy[src] = value;
                }
                else
                {
                    // Keep the entity, with uppercase hex digits
                    str[dest++] = copy[src++];
                    str[dest++] = ::toupper(copy[src++]);
                    str[dest++] = ::toupper(copy[src]);
                    continue;
                }
            }
        }
        if (!safe(copy[src]))
        {
            // Not safe -- replace with %XX
            const unsigned char byte = static_cast<unsigned char>(copy[src]);
            str[dest++] = '%';
            str[dest++] = HEX.chars()[(byte >> 4) & 0xF];
            str[dest++] = HEX.chars()[byte & 0xF];
        }
        else
        {
            str[dest++] = copy[src];
        }
    }
    str.resize(dest);
    return str;
}
// Decode the %XX entities in the path, query, params and userinfo.
Url& Url::unescape()
{
    unescape(path_);
    unescape(query_);
    unescape(params_);
    unescape(userinfo_);

    return *this;
}
// Decode every valid %XX entity in `str`, in place, and return it. A '%'
// that is not followed by two hex digits is copied through unchanged.
std::string& Url::unescape(std::string& str)
{
    std::string copy(str);
    size_t dest = 0;
    for (size_t src = 0; src < copy.length(); ++src, ++dest)
    {
        // An entity needs two more characters after the '%'
        if (copy[src] == '%' && (copy.length() - src) >= 3)
        {
            // Read ahead to see if there's a valid escape sequence. If not,
            // treat this like a normal character. The lookup table has 256
            // entries and must be indexed with the unsigned byte value: a
            // plain (possibly negative) char would index out of bounds for
            // bytes >= 0x80.
            const signed char high =
                HEX_TO_DEC[static_cast<unsigned char>(copy[src + 1])];
            const signed char low =
                HEX_TO_DEC[static_cast<unsigned char>(copy[src + 2])];
            if (high != -1 && low != -1)
            {
                // Emit the decoded byte and skip the two hex digits
                str[dest] = static_cast<char>(high * 16 + low);
                src += 2;
                continue;
            }
        }
        // Either not a % or an incomplete entity
        str[dest] = copy[src];
    }
    str.resize(dest);
    return str;
}
// Remove every query and params entry whose lowercased name appears in
// `blacklist`.
Url& Url::deparam(const std::unordered_set<std::string>& blacklist)
{
    // The predicate lowercases the name it is handed and reports whether the
    // blacklist contains it; the value plays no part.
    const auto is_blacklisted =
        [&blacklist](std::string& name, const std::string& value)
    {
        std::transform(name.begin(), name.end(), name.begin(), ::tolower);
        return blacklist.count(name) != 0;
    };

    setQuery(remove_params(query_, is_blacklisted, '&'));
    setParams(remove_params(params_, is_blacklisted, ';'));
    return *this;
}
// Remove every query and params entry for which `predicate` returns true.
Url& Url::deparam(const deparam_predicate& predicate)
{
    // remove_params filters its argument in place and returns it; the
    // setters then refresh has_query_ / has_params_.
    std::string& filtered_query = remove_params(query_, predicate, '&');
    setQuery(filtered_query);
    std::string& filtered_params = remove_params(params_, predicate, ';');
    setParams(filtered_params);
    return *this;
}
// Split `str` on `sep` into "name=value" pieces, drop the pieces for which
// `predicate(name, value)` is true and join the rest back together with
// `sep`. `str` is modified in place and returned.
std::string& Url::remove_params(std::string& str,
                                const deparam_predicate& predicate,
                                char sep)
{
    std::string kept;

    // Evaluate one piece and append it to `kept` unless it is filtered out
    const auto consider = [&](const std::string& piece)
    {
        const size_t equals = piece.find('=');
        // Without an '=', the whole piece is the name and the value is empty
        std::string name(piece, 0, equals);
        std::string value;
        if (equals != std::string::npos)
        {
            value.assign(piece, equals + 1, std::string::npos);
        }
        if (!predicate(name, value))
        {
            if (!kept.empty())
            {
                kept.push_back(sep);
            }
            kept.append(piece);
        }
    };

    // All the pieces that are terminated by a separator
    size_t begin = 0;
    for (size_t end = str.find(sep)
        ; end != std::string::npos
        ; end = str.find(sep, begin))
    {
        consider(str.substr(begin, end - begin));
        begin = end + 1;
    }

    // The final piece, unless the string ended with a separator
    if (begin < str.length())
    {
        consider(str.substr(begin));
    }

    str.assign(kept);
    return str;
}
// Sort the '&'-separated query entries and the ';'-separated params entries.
Url& Url::sort_query()
{
    split_sort_join(query_, '&');
    split_sort_join(params_, ';');

    return *this;
}
// Split `str` on `glue`, sort the pieces and join them again with `glue`,
// in place. An empty string, or one that yields a single piece, is returned
// unchanged.
std::string& Url::split_sort_join(std::string& str, const char glue)
{
    if (str.empty())
    {
        return str;
    }

    // Split
    std::vector<std::string> pieces;
    {
        std::stringstream stream(str);
        std::string item;
        while (std::getline(stream, item, glue))
        {
            pieces.push_back(item);
        }
    }

    // A single piece is left exactly as it was
    if (pieces.size() == 1)
    {
        return str;
    }

    // Sort
    std::sort(pieces.begin(), pieces.end());

    // Join, with glue between (not after) the pieces
    std::string joined;
    for (size_t i = 0; i < pieces.size(); ++i)
    {
        if (i != 0)
        {
            joined.push_back(glue);
        }
        joined.append(pieces[i]);
    }

    str.swap(joined);
    return str;
}
// Reset the port to 0 (none) when it equals the PORTS entry for the scheme.
Url& Url::remove_default_port()
{
    // Nothing to do without both a port and a scheme
    if (port_ == 0 || scheme_.empty())
    {
        return *this;
    }

    const auto known = PORTS.find(scheme_);
    if (known != PORTS.end() && known->second == port_)
    {
        port_ = 0;
    }
    return *this;
}
// Drop the userinfo component.
Url& Url::deuserinfo()
{
    userinfo_.erase();
    return *this;
}
// Drop the fragment component.
Url& Url::defrag()
{
    fragment_.erase();
    return *this;
}
// Punycode-encode the host. The hostname is validated both before and after
// encoding; the host is only replaced once the encoded form has passed.
Url& Url::punycode()
{
    check_hostname(host_);

    std::string ace = Punycode::encodeHostname(host_);
    check_hostname(ace);

    host_.swap(ace);
    return *this;
}
// Replace the host with its punycode-decoded form.
Url& Url::unpunycode()
{
    std::string decoded = Punycode::decodeHostname(host_);
    host_.swap(decoded);
    return *this;
}
// Reverse the order of the dot-separated labels of the host, e.g.
// "a.b.c" becomes "c.b.a".
Url& Url::host_reversed()
{
    // Reverse the whole string, which also reverses each label...
    std::reverse(host_.begin(), host_.end());

    // ...then reverse every label again to restore its spelling
    size_t label_begin = 0;
    while (label_begin < host_.size())
    {
        const size_t dot = host_.find('.', label_begin);
        const size_t label_end =
            (dot == std::string::npos) ? host_.size() : dot;
        std::reverse(host_.begin() + label_begin, host_.begin() + label_end);
        if (dot == std::string::npos)
        {
            break;
        }
        label_begin = dot + 1;
    }
    return *this;
}
// Validate the labels of `host`. Throws std::invalid_argument when a label
// is longer than 63 characters or when a label followed by a dot is empty.
// A single trailing dot is removed from `host`. Empty hostnames are valid.
void Url::check_hostname(std::string& host)
{
    if (host.empty())
    {
        return;
    }

    // Every label that is followed by a dot
    size_t label_begin = 0;
    for (size_t dot = host.find('.')
        ; dot != std::string::npos
        ; dot = host.find('.', label_begin))
    {
        const size_t label_length = dot - label_begin;
        if (label_length > 63)
        {
            throw std::invalid_argument("Label too long.");
        }
        if (label_length == 0)
        {
            throw std::invalid_argument("Empty label.");
        }
        label_begin = dot + 1;
    }

    // The final label
    const size_t tail_length = host.size() - label_begin;
    if (tail_length > 63)
    {
        throw std::invalid_argument("Label too long.");
    }
    if (tail_length == 0 && label_begin > 1)
    {
        // Remove a trailing empty segment
        host.resize(label_begin - 1);
    }
}
};

323
src/url.h

@ -0,0 +1,323 @@
#ifndef URL_CPP_H
#define URL_CPP_H
#include <stdexcept>
#include <functional>
#include <string>
#include <vector>
#include <unordered_map>
#include <unordered_set>
namespace Url
{
// Thrown by the URL parser when a URL cannot be parsed (for example a
// malformed port). Derives from std::logic_error.
struct UrlParseException : public std::logic_error
{
    UrlParseException(const std::string& reason)
        : std::logic_error(reason)
    {
    }
};
// A set of byte values with constant-time membership tests.
struct CharacterClass
{
    // Build the class from the characters in `chars`.
    //
    // Characters are converted to unsigned char before being used as an
    // index, matching operator(): converting a negative char straight to
    // size_t yields an index far outside the 256-entry table.
    CharacterClass(const std::string& chars) : chars_(chars), map_(256, false)
    {
        for (auto it = chars_.begin(); it != chars_.end(); ++it)
        {
            map_[static_cast<unsigned char>(*it)] = true;
        }
    }

    // True when `c` is a member of the class.
    bool operator()(char c) const
    {
        return map_[static_cast<unsigned char>(c)];
    }

    // The member characters, exactly as passed to the constructor.
    const std::string& chars() const
    {
        return chars_;
    }

private:
    // Private, unimplemented to prevent use
    CharacterClass();
    CharacterClass(const CharacterClass& other);

    std::string chars_;
    // Membership flag per unsigned byte value
    std::vector<bool> map_;
};
struct Url
{
/* Character classes */
const static CharacterClass GEN_DELIMS;
const static CharacterClass SUB_DELIMS;
const static CharacterClass ALPHA;
const static CharacterClass DIGIT;
const static CharacterClass UNRESERVED;
const static CharacterClass RESERVED;
const static CharacterClass PCHAR;
const static CharacterClass PATH;
const static CharacterClass QUERY;
const static CharacterClass FRAGMENT;
const static CharacterClass USERINFO;
const static CharacterClass HEX;
const static CharacterClass SCHEME;
const static std::vector<signed char> HEX_TO_DEC;
const static std::unordered_map<std::string, int> PORTS;
const static std::unordered_set<std::string> USES_RELATIVE;
const static std::unordered_set<std::string> USES_NETLOC;
const static std::unordered_set<std::string> USES_PARAMS;
const static std::unordered_set<std::string> KNOWN_PROTOCOLS;
// The type of the predicate used for removing parameters
typedef std::function<bool(std::string&, std::string&)> deparam_predicate;
explicit Url(const std::string& url);
Url(const Url& other)
: scheme_(other.scheme_)
, host_(other.host_)
, port_(other.port_)
, path_(other.path_)
, params_(other.params_)
, query_(other.query_)
, fragment_(other.fragment_)
, userinfo_(other.userinfo_)
, has_params_(other.has_params_)
, has_query_(other.has_query_) { }
/**
* Take on the value of the other URL.
*/
Url& assign(const Url& other);
/**
* To be considered equal, all fields must be equal.
*/
bool operator==(const Url& other) const;
bool operator!=(const Url& other) const;
/**
* Two URLs are considered equivalent if they have the same meaning.
*/
bool equiv(const Url& other);
/**************************************
* Component-wise access and setting. *
**************************************/
const std::string& scheme() const { return scheme_; }
Url& setScheme(const std::string& s)
{
scheme_ = s;
return *this;
}
const std::string& host() const { return host_; }
Url& setHost(const std::string& s)
{
host_ = s;
return *this;
}
const int port() const { return port_; }
Url& setPort(int i)
{
port_ = i;
return *this;
}
const std::string& path() const { return path_; }
Url& setPath(const std::string& s)
{
path_ = s;
return *this;
}
/** Read-only access to the stored params. */
const std::string& params() const
{
    return params_;
}

/**
 * Store a copy of `s` as the params and update the "has params" flag, which
 * ends up true exactly when the stored string is non-empty.
 * Returns this object for chaining.
 */
Url& setParams(const std::string& s)
{
    params_.assign(s);
    has_params_ = !params_.empty();
    return *this;
}
/** Read-only access to the stored query. */
const std::string& query() const
{
    return query_;
}

/**
 * Store a copy of `s` as the query and update the "has query" flag, which
 * ends up true exactly when the stored string is non-empty.
 * Returns this object for chaining.
 */
Url& setQuery(const std::string& s)
{
    query_.assign(s);
    has_query_ = !query_.empty();
    return *this;
}
/** Read-only access to the stored fragment. */
const std::string& fragment() const
{
    return fragment_;
}

/** Store a copy of `s` as the fragment; returns this object for chaining. */
Url& setFragment(const std::string& s)
{
    fragment_.assign(s);
    return *this;
}
/** Read-only access to the stored userinfo. */
const std::string& userinfo() const
{
    return userinfo_;
}

/** Store a copy of `s` as the userinfo; returns this object for chaining. */
Url& setUserinfo(const std::string& s)
{
    userinfo_.assign(s);
    return *this;
}
/**
* Get a representation of all components of the path, params, query, fragment.
*
* Always includes a leading /.
*/
std::string fullpath() const;
/**
* Get a new string representation of the URL.
**/
std::string str() const;
/*********************
* Chainable methods *
*********************/
/**
* Strip semantically meaningless excess '?', '&', and ';' characters from query
* and params.
*/
Url& strip();
/**
* Make the path absolute.
*
* Evaluate '.', '..', and excessive slashes.
*/
Url& abspath();
/**
 * Evaluate this URL relative to `other`, placing the result in this object.
 *
 * Convenience overload: builds a Url from the string `other` and forwards to
 * the Url-taking overload. Returns this object for chaining.
 */
Url& relative_to(const std::string& other)
{
    const Url base(other);
    return relative_to(base);
}
/**
* Evaluate this URL relative to `other`, placing the result in this object.
*/
Url& relative_to(const Url& other);
/**
* Ensure that the path, params, query, and userinfo are properly escaped.
*
* In 'strict' mode, only entities that are both safe and not reserved characters
* are unescaped. In non-strict mode, entities that are safe are unescaped.
*/
Url& escape(bool strict=false);
/**
* Unescape all entities in the path, params, query, and userinfo.
*/
Url& unescape();
/**
* Remove any params or queries that appear in the blacklist.
*
* The blacklist should contain only lowercased strings, and the comparison is
* done in a case-insensitive way.
*/
Url& deparam(const std::unordered_set<std::string>& blacklist);
/**
* Filter params subject to a predicate for whether it should be filtered.
*
* The predicate must accept two string refs -- the key and value (which may be
* empty). Return `true` if the parameter should be removed, and `false`
* otherwise.
*/
Url& deparam(const deparam_predicate& predicate);
/**
* Put queries and params in sorted order.
*
* To ensure consistent comparisons, escape should be called beforehand.
*/
Url& sort_query();
/**
* Remove the port if it's the default for the scheme.
*/
Url& remove_default_port();
/**
* Remove the userinfo portion.
*/
Url& deuserinfo();
/**
* Remove the fragment.
*/
Url& defrag();
/**
* Punycode the hostname.
*/
Url& punycode();
/**
* Unpunycode the hostname.
*/
Url& unpunycode();
/**
* Reverse the hostname (a.b.c.d => d.c.b.a)
*/
Url& host_reversed();
private:
// Private, unimplemented to prevent use.
Url();
/**
* Remove repeated, leading, and trailing instances of chr from the string.
*/
std::string& remove_repeats(std::string& str, const char chr);
/**
* Ensure all the provided characters are escaped if necessary
*/
std::string& escape(std::string& str, const CharacterClass& safe, bool strict);
/**
* Unescape entities in the provided string
*/
std::string& unescape(std::string& str);
/**
* Remove any params that match entries in the blacklist.
*/
std::string& remove_params(
std::string& str, const deparam_predicate& pred, char sep);
/**
* Split the provided string by char, sort, join by char.
*/
std::string& split_sort_join(std::string& str, const char glue);
/**
* Check that the hostname is valid, removing an optional trailing '.'.
*/
void check_hostname(std::string& host);
std::string scheme_;
std::string host_;
int port_;
std::string path_;
std::string params_;
std::string query_;
std::string fragment_;
std::string userinfo_;
bool has_params_;
bool has_query_;
};
}
#endif

150
src/utf8.cpp

@ -0,0 +1,150 @@
#include <algorithm>
#include <string>
#include <iostream>
#include "utf8.h"
namespace Url
{
/**
 * Decode the UTF-8 sequence starting at `it` and return its codepoint.
 *
 * `it` is advanced past every byte that is read, whether the call returns or
 * throws. Only the structure of the sequence is validated (start byte range,
 * number and form of continuation bytes); the decoded value itself is not
 * range-checked.
 *
 * @param it  position of the first byte of the sequence; updated in place
 * @param end one past the last readable byte
 * @return the decoded codepoint
 * @throws std::invalid_argument if [it, end) is empty, the start byte is a
 *         continuation byte (0x80-0xBF) or is 0xF8 or above, the input ends
 *         before the sequence is complete, or a continuation byte is not of
 *         the form 10xxxxxx
 */
Utf8::codepoint_t Utf8::readCodepoint(
    std::string::const_iterator& it, const std::string::const_iterator& end)
{
    // Dereferencing `it` when nothing is left to read is undefined behaviour,
    // so an empty range is rejected up front.
    if (it == end)
    {
        throw std::invalid_argument("No bytes available to read a UTF-8 codepoint.");
    }

    Utf8::char_t current = static_cast<Utf8::char_t>(*it++);

    // High bit clear: a single-byte (ASCII) sequence, the byte is the codepoint.
    if (!(current & 0x80))
    {
        return current;
    }

    // Number of additional bytes needed
    unsigned int bytes = 0;

    // The accumulated value
    Utf8::codepoint_t result = 0;

    if (current < 0xC0)
    {
        // 10xxxxxx is a continuation byte and cannot start a sequence
        throw std::invalid_argument("Low UTF-8 start byte");
    }
    else if (current < 0xE0)
    {
        // One additional byte, two bytes total, use 5 bits
        bytes = 1;
        result = current & 0x1F;
    }
    else if (current < 0xF0)
    {
        // Two additional bytes, three bytes total, use 4 bits
        bytes = 2;
        result = current & 0x0F;
    }
    else if (current < 0xF8)
    {
        // Three additional bytes, four bytes total, use 3 bits
        bytes = 3;
        result = current & 0x07;
    }
    else
    {
        throw std::invalid_argument("High UTF-8 start byte");
    }

    for (; bytes > 0; --bytes)
    {
        if (it == end)
        {
            throw std::invalid_argument("UTF-8 sequence terminated early.");
        }

        current = static_cast<Utf8::char_t>(*it++);

        // Ensure the first two bits are 10
        if ((current & 0xC0) != 0x80)
        {
            throw std::invalid_argument("Invalid continuation byte");
        }

        // Each continuation byte contributes its low six bits
        result = (result << 6) | (current & 0x3F);
    }

    return result;
}
std::string& Utf8::writeCodepoint(std::string& str, Utf8::codepoint_t value)
{
if (value > MAX_CODEPOINT)
{
throw std::invalid_argument("Code point too high.");
}
else if (value <= 0x007F)
{
// Just append the character itself
str.append(1, static_cast<char>(value));
return str;
}
unsigned int bytes = 0;
if (value > 0xFFFF)
{
/**
* 11110xxx + 3 bytes for 21 bits total
*
* We need to take bits 20-18, which 0x1C0000 masks out. These form the least
* significant bits of this byte (so we shift them back down by 18). The 5
* most significant bits of this byte are 11110, so we OR this result with
* 0xF0 to get this first byte.
*
* The remaining bits will be consumed from the most-significant end and so
* they must be shifted up by (32 - 18) = 14.
*/
str.append(1, static_cast<char>(((value & 0x1C0000) >> 18) | 0xF0));
bytes = 3;
value <<= 14;
}
else if (value > 0x07FF)
{
/**
* 1110xxxx + 2 bytes for 16 bits total
*
* We need to take bits 15-12, which 0xF000 masks out. These form the least
* significant bits of this byte (so we shift them back down by 12). The 4
* most significant bits of this byte are 1110, so we OR this result with
* 0xE0 to get this first byte.
*
* The remaining bits will be consumed from the most-significant end and so
* they must be shifted up by (32 - 12) = 20.
*/
str.append(1, static_cast<char>(((value & 0xF000) >> 12) | 0xE0));
bytes = 2;
value <<= 20;
}
else
{
/**
* 110xxxxx + 1 byte for 11 bits total
*
* We need to take bits 10-6, which 0x7C0 masks out. These form the least
* significant bits of this byte (so we shift them back down by 6). The 3
* most significant bits of this byte are 110, so we OR this result with
* 0xC0 to get this first byte.
*
* The remaining bits will be consumed from the most-significant end and so
* they must be shifted up by (32 - 6) = 26.
*/
str.append(1, static_cast<char>(((value & 0x7C0) >> 6) | 0xC0));
bytes = 1;
value <<= 26;
}
/**
* The remaining bits are to be consumed 6 at a time from the most-significant
* end. The mask 0xFC000000 grabs these six bits, which then must be shifted down
* by 26, and OR'd with 0x80 to produce the continuation byte.
*/
for (; bytes > 0; --bytes, value <<= 6)
{
str.append(1, static_cast<char>(((value & 0xFC000000) >> 26) | 0x80));
}
return str;
}
};

91
src/utf8.h

@ -0,0 +1,91 @@
#ifndef UTF8_CPP_H
#define UTF8_CPP_H
#include <cstdint>
#include <stdexcept>
#include <string>
#include <vector>
namespace Url
{

    /**
     * Work between unicode code points and their UTF-8-encoded representation.
     *
     * All members are static; the struct only groups the related types,
     * constants and functions.
     */
    struct Utf8
    {
        /**
         * The type we use to represent Unicode codepoints.
         */
        typedef std::uint32_t codepoint_t;

        /**
         * The type we use when talking about the integral value of bytes.
         */
        typedef unsigned char char_t;

        /**
         * The highest allowed codepoint.
         */
        static const codepoint_t MAX_CODEPOINT = 0x10FFFF;

        /**
         * Consume up to the last byte of the sequence, returning the codepoint.
         *
         * `it` is advanced as bytes are consumed; `end` bounds the input.
         */
        static codepoint_t readCodepoint(
            std::string::const_iterator& it, const std::string::const_iterator& end);

        /**
         * Write a codepoint to the provided string.
         *
         * Returns a reference to `str`.
         */
        static std::string& writeCodepoint(std::string& str, codepoint_t value);

        /**
         * Return the first codepoint stored in the provided string.
         *
         * Throws std::invalid_argument if the string is empty: there is no
         * first codepoint, and reading from an empty range would dereference
         * the end iterator.
         */
        static codepoint_t toCodepoint(const std::string& str)
        {
            if (str.empty())
            {
                throw std::invalid_argument(
                    "Cannot read a codepoint from an empty string.");
            }
            std::string::const_iterator it = str.begin();
            return readCodepoint(it, str.end());
        }

        /**
         * Get a string with the provided codepoint.
         */
        static std::string fromCodepoint(codepoint_t value)
        {
            std::string str;
            writeCodepoint(str, value);
            return str;
        }

        /**
         * Return all the codepoints in the string.
         *
         * An empty string yields an empty vector.
         */
        static std::vector<codepoint_t> toCodepoints(const std::string& str)
        {
            std::vector<codepoint_t> result;
            const std::string::const_iterator end = str.end();
            // readCodepoint advances `it`, so the loop has no increment step.
            for (std::string::const_iterator it = str.begin(); it != end; )
            {
                result.push_back(readCodepoint(it, end));
            }
            return result;
        }

        /**
         * Create a string from a vector of codepoints.
         */
        static std::string fromCodepoints(const std::vector<codepoint_t>& points)
        {
            std::string result;
            // Every codepoint encodes to at least one byte.
            result.reserve(points.size());
            for (const codepoint_t point : points)
            {
                writeCodepoint(result, point);
            }
            return result;
        }
    };

}
#endif

3
tests/test-all.R

@ -0,0 +1,3 @@
# Test runner: executes the testthat suite for the `rep` package.
library(testthat)
# Attach the package under test so the tests see its exported API.
library(rep)
# Used by the tests to fetch a live robots.txt file.
library(robotstxt)

test_check("rep")

11
tests/testthat/test-rep.R

@ -0,0 +1,11 @@
context("basic functionality")

test_that("we can do something", {
  # This test downloads a live robots.txt file, so it is skipped on CRAN and
  # when the package used to fetch it is not installed.
  skip_on_cran()
  skip_if_not_installed("robotstxt")

  rt <- robxp(robotstxt::get_robotstxt("https://cdc.gov"))

  expect_s3_class(rt, "robxp")
  expect_true(can_fetch(rt, "/asthma/asthma_stats/default.htm", "*"))
  expect_false(can_fetch(rt, "/_borders", "*"))
})
Loading…
Cancel
Save