boB Rudis
7 years ago
commit
878bb7f045
39 changed files with 3445 additions and 0 deletions
@ -0,0 +1,11 @@ |
|||
^.*\.Rproj$ |
|||
^\.Rproj\.user$ |
|||
^\.travis\.yml$ |
|||
^README\.*Rmd$ |
|||
^README\.*html$ |
|||
^NOTES\.*Rmd$ |
|||
^NOTES\.*html$ |
|||
^\.codecov\.yml$ |
|||
^README_files$ |
|||
^doc$ |
|||
^CONDUCT\.md$ |
@ -0,0 +1 @@ |
|||
comment: false |
@ -0,0 +1,8 @@ |
|||
.DS_Store |
|||
.Rproj.user |
|||
.Rhistory |
|||
.RData |
|||
.Rproj |
|||
src/*.o |
|||
src/*.so |
|||
src/*.dll |
@ -0,0 +1,31 @@ |
|||
language: r |
|||
|
|||
warnings_are_errors: true |
|||
|
|||
sudo: required |
|||
|
|||
cache: packages |
|||
|
|||
r: |
|||
- oldrel |
|||
- release |
|||
- devel |
|||
|
|||
apt_packages: |
|||
- libv8-dev |
|||
- xclip |
|||
|
|||
env: |
|||
global: |
|||
- CRAN: http://cran.rstudio.com |
|||
|
|||
after_success: |
|||
- Rscript -e 'covr::codecov()' |
|||
|
|||
notifications: |
|||
email: |
|||
- bob@rud.is |
|||
irc: |
|||
channels: |
|||
- "104.236.112.222#builds" |
|||
nick: travisci |
@ -0,0 +1,25 @@ |
|||
# Contributor Code of Conduct |
|||
|
|||
As contributors and maintainers of this project, we pledge to respect all people who |
|||
contribute through reporting issues, posting feature requests, updating documentation, |
|||
submitting pull requests or patches, and other activities. |
|||
|
|||
We are committed to making participation in this project a harassment-free experience for |
|||
everyone, regardless of level of experience, gender, gender identity and expression, |
|||
sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. |
|||
|
|||
Examples of unacceptable behavior by participants include the use of sexual language or |
|||
imagery, derogatory comments or personal attacks, trolling, public or private harassment, |
|||
insults, or other unprofessional conduct. |
|||
|
|||
Project maintainers have the right and responsibility to remove, edit, or reject comments, |
|||
commits, code, wiki edits, issues, and other contributions that are not aligned to this |
|||
Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed |
|||
from the project team. |
|||
|
|||
Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by |
|||
opening an issue or contacting one or more of the project maintainers. |
|||
|
|||
This Code of Conduct is adapted from the Contributor Covenant |
|||
(http://contributor-covenant.org), version 1.0.0, available at |
|||
http://contributor-covenant.org/version/1/0/0/ |
@ -0,0 +1,27 @@ |
|||
Package: rep |
|||
Type: Package |
|||
Title: Tools to Parse and Test Robots Exclusion Protocol Files and Rules |
|||
Version: 0.1.0 |
|||
Date: 2017-08-14 |
|||
Author: Bob Rudis (bob@rud.is) [aut, cre], SEOmoz, Inc [aut] |
|||
Maintainer: Bob Rudis <bob@rud.is> |
|||
Description: The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents |
|||
a set of standards for allowing or excluding robot/spider crawling of different areas of |
|||
site content. Tools are provided which wrap the 'rep-cpp' <https://github.com/seomoz/rep-cpp> |
|||
C++ library for processing these 'robots.txt' files. |
|||
SystemRequirements: C++11 |
|||
NeedsCompilation: yes |
|||
URL: https://github.com/hrbrmstr/rep |
|||
BugReports: https://github.com/hrbrmstr/rep/issues |
|||
License: MIT + file LICENSE |
|||
Suggests: |
|||
testthat, |
|||
covr, |
|||
robotstxt |
|||
Depends: |
|||
R (>= 3.2.0) |
|||
Imports: |
|||
purrr, |
|||
Rcpp |
|||
RoxygenNote: 6.0.1 |
|||
LinkingTo: Rcpp |
@ -0,0 +1,2 @@ |
|||
YEAR: 2017 |
|||
COPYRIGHT HOLDER: Bob Rudis |
@ -0,0 +1,7 @@ |
|||
# Generated by roxygen2: do not edit by hand |
|||
|
|||
S3method(print,robxp) |
|||
export(can_fetch) |
|||
export(robxp) |
|||
importFrom(Rcpp,sourceCpp) |
|||
useDynLib(rep, .registration=TRUE) |
@ -0,0 +1,2 @@ |
|||
0.1.0 |
|||
* Initial release |
@ -0,0 +1,19 @@ |
|||
# Generated by using Rcpp::compileAttributes() -> do not edit by hand |
|||
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 |
|||
|
|||
#' Parse robots.txt
#'
#' Internal wrapper that forwards `content` to the compiled
#' `_rep_rep_parse` routine via `.Call()` and returns its result unchanged.
#'
#' @noRd
#'
rep_parse <- function(content) .Call(`_rep_rep_parse`, content)
|||
|
|||
#' Path allowed
#'
#' Internal wrapper that forwards its arguments to the compiled
#' `_rep_rep_path_allowed` routine via `.Call()` and returns its result
#' unchanged. `agent` defaults to `"*"`.
#'
#' @noRd
#'
rep_path_allowed <- function(xp, path, agent = "*")
  .Call(`_rep_rep_path_allowed`, xp, path, agent)
|||
|
@ -0,0 +1,14 @@ |
|||
#' Tools to Parse and Test Robots Exclusion Protocol Files and Rules |
|||
#' |
|||
#' The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents a set |
|||
#' of standards for allowing or excluding robot/spider crawling of different areas of |
|||
#' site content. Tools are provided which wrap the 'rep-cpp' <https://github.com/seomoz/rep-cpp> |
|||
#' C++ library for processing these 'robots.txt' files. |
|||
#' |
|||
#' @md |
|||
#' @name rep |
|||
#' @docType package |
|||
#' @author Bob Rudis (bob@@rud.is) |
|||
#' @useDynLib rep, .registration=TRUE |
|||
#' @importFrom Rcpp sourceCpp |
|||
NULL |
@ -0,0 +1,47 @@ |
|||
#' Create a robots.txt object
#'
#' Parses the text of a robots.txt file and tags the result with class
#' `robxp` so it can be passed to `can_fetch()`.
#'
#' @md
#' @param x character vector containing a complete robots.txt file. When `x`
#'        has more than one element (e.g. the output of `readLines()`), the
#'        elements are joined with newlines before parsing.
#' @return the parsed robots.txt, with class `robxp`
#' @export
#' @examples
#' rt <- robxp("User-agent: *\nDisallow: /_borders\n")
#' can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") # TRUE
#' can_fetch(rt, "/_borders", "*") # FALSE
robxp <- function(x) {

  if (!is.character(x)) stop("'x' must be a character vector", call. = FALSE)

  # The parser takes a single string; collapsing is a no-op for length-1 input
  parsed <- rep_parse(paste(x, collapse = "\n"))
  class(parsed) <- c("robxp")

  parsed

}
|||
|
|||
#' Test URL path against robots.txt
#'
#' @md
#' @param obj `robxp` object
#' @param path path to test
#' @param user_agent user agent to test
#' @return the result of the robots.txt check for `path` and `user_agent`,
#'         or `NULL` when `obj` is not a `robxp` object
#' @export
#' @examples
#' rt <- robxp("User-agent: *\nDisallow: /_borders\n")
#' can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") # TRUE
#' can_fetch(rt, "/_borders", "*") # FALSE
can_fetch <- function(obj, path="/", user_agent="*") {

  # Anything that is not a parsed robots.txt object yields NULL, not an error
  if (!inherits(obj, "robxp")) return(NULL)

  rep_path_allowed(obj, path, user_agent)

}
|||
|
|||
#' Custom printer for 'robxp' objects
#'
#' Writes a fixed one-line description of the object, followed by a newline.
#'
#' @md
#' @param x object to print
#' @param ... unused
#' @return `x`, invisibly
#' @export
print.robxp <- function(x, ...) {
  # Terminate the line so the console prompt does not end up on the same line
  cat("<Robots Exclusion Protocol Object>\n")
  invisible(x)
}
@ -0,0 +1,58 @@ |
|||
--- |
|||
output: rmarkdown::github_document |
|||
--- |
|||
|
|||
`rep` : Tools to Parse and Test Robots Exclusion Protocol Files and Rules |
|||
|
|||
The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents a set of standards for allowing or excluding robot/spider crawling of different areas of site content. Tools are provided which wrap the 'rep-cpp' <https://github.com/seomoz/rep-cpp> C++ library for processing these 'robots.txt' files. |
|||
|
|||
- [`rep-cpp`](https://github.com/seomoz/rep-cpp) |
|||
- [`url-cpp`](https://github.com/seomoz/url-cpp) |
|||
|
|||
The following functions are implemented: |
|||
|
|||
- `robxp`: Create a robots.txt object |
|||
- `can_fetch`: Test URL path against robots.txt |
|||
|
|||
### Installation |
|||
|
|||
```{r eval=FALSE} |
|||
devtools::install_github("hrbrmstr/rep") |
|||
``` |
|||
|
|||
```{r message=FALSE, warning=FALSE, error=FALSE, include=FALSE} |
|||
options(width=120) |
|||
``` |
|||
|
|||
### Usage |
|||
|
|||
```{r message=FALSE, warning=FALSE, error=FALSE} |
|||
library(rep) |
|||
library(robotstxt) |
|||
|
|||
# current version |
|||
packageVersion("rep") |
|||
|
|||
rt <- robxp(get_robotstxt("https://cdc.gov")) |
|||
|
|||
print(rt) |
|||
|
|||
can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") |
|||
|
|||
can_fetch(rt, "/_borders", "*") |
|||
``` |
|||
|
|||
### Test Results |
|||
|
|||
```{r message=FALSE, warning=FALSE, error=FALSE} |
|||
library(rep) |
|||
library(testthat) |
|||
|
|||
date() |
|||
|
|||
test_dir("tests/") |
|||
``` |
|||
|
|||
### Code of Conduct |
|||
|
|||
Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms. |
@ -0,0 +1,74 @@ |
|||
|
|||
`rep` : Tools to Parse and Test Robots Exclusion Protocol Files and Rules |
|||
|
|||
The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents a set of standards for allowing or excluding robot/spider crawling of different areas of site content. Tools are provided which wrap the 'rep-cpp' <https://github.com/seomoz/rep-cpp> C++ library for processing these 'robots.txt' files. |
|||
|
|||
- [`rep-cpp`](https://github.com/seomoz/rep-cpp) |
|||
- [`url-cpp`](https://github.com/seomoz/url-cpp) |
|||
|
|||
The following functions are implemented: |
|||
|
|||
- `robxp`: Create a robots.txt object |
|||
- `can_fetch`: Test URL path against robots.txt |
|||
|
|||
### Installation |
|||
|
|||
``` r |
|||
devtools::install_github("hrbrmstr/rep") |
|||
``` |
|||
|
|||
### Usage |
|||
|
|||
``` r |
|||
library(rep) |
|||
library(robotstxt) |
|||
|
|||
# current version |
|||
packageVersion("rep") |
|||
``` |
|||
|
|||
## [1] '0.1.0' |
|||
|
|||
``` r |
|||
rt <- robxp(get_robotstxt("https://cdc.gov")) |
|||
|
|||
print(rt) |
|||
``` |
|||
|
|||
## <Robots Exclusion Protocol Object> |
|||
|
|||
``` r |
|||
can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") |
|||
``` |
|||
|
|||
## [1] TRUE |
|||
|
|||
``` r |
|||
can_fetch(rt, "/_borders", "*") |
|||
``` |
|||
|
|||
## [1] FALSE |
|||
|
|||
### Test Results |
|||
|
|||
``` r |
|||
library(rep) |
|||
library(testthat) |
|||
|
|||
date() |
|||
``` |
|||
|
|||
## [1] "Mon Aug 14 15:00:16 2017" |
|||
|
|||
``` r |
|||
test_dir("tests/") |
|||
``` |
|||
|
|||
## testthat results ======================================================================================================== |
|||
## OK: 3 SKIPPED: 0 FAILED: 0 |
|||
## |
|||
## DONE =================================================================================================================== |
|||
|
|||
### Code of Conduct |
|||
|
|||
Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms. |
@ -0,0 +1,23 @@ |
|||
% Generated by roxygen2: do not edit by hand |
|||
% Please edit documentation in R/rep.r |
|||
\name{can_fetch} |
|||
\alias{can_fetch} |
|||
\title{Test URL path against robots.txt} |
|||
\usage{ |
|||
can_fetch(obj, path = "/", user_agent = "*") |
|||
} |
|||
\arguments{ |
|||
\item{obj}{\code{robxp} object} |
|||
|
|||
\item{path}{path to test} |
|||
|
|||
\item{user_agent}{user agent to test} |
|||
} |
|||
\description{ |
|||
Test URL path against robots.txt |
|||
} |
|||
\examples{ |
|||
library(robotstxt) |
|||
can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") # TRUE |
|||
can_fetch(rt, "/_borders", "*") # FALSE |
|||
} |
@ -0,0 +1,16 @@ |
|||
% Generated by roxygen2: do not edit by hand |
|||
% Please edit documentation in R/rep.r |
|||
\name{print.robxp} |
|||
\alias{print.robxp} |
|||
\title{Custom printer for 'robxp' objects} |
|||
\usage{ |
|||
\method{print}{robxp}(x, ...) |
|||
} |
|||
\arguments{ |
|||
\item{x}{object to print} |
|||
|
|||
\item{...}{unused} |
|||
} |
|||
\description{ |
|||
Custom printer for 'robxp' objects |
|||
} |
@ -0,0 +1,16 @@ |
|||
% Generated by roxygen2: do not edit by hand |
|||
% Please edit documentation in R/rep-package.R |
|||
\docType{package} |
|||
\name{rep} |
|||
\alias{rep} |
|||
\alias{rep-package} |
|||
\title{Tools to Parse and Test Robots Exclusion Protocol Files and Rules} |
|||
\description{ |
|||
The 'Robots Exclusion Protocol' \url{http://www.robotstxt.org/orig.html} documents a set |
|||
of standards for allowing or excluding robot/spider crawling of different areas of |
|||
site content. Tools are provided which wrap the 'rep-cpp' \url{https://github.com/seomoz/rep-cpp} |
|||
C++ library for processing these 'robots.txt' files. |
|||
} |
|||
\author{ |
|||
Bob Rudis (bob@rud.is) |
|||
} |
@ -0,0 +1,19 @@ |
|||
% Generated by roxygen2: do not edit by hand |
|||
% Please edit documentation in R/rep.r |
|||
\name{robxp} |
|||
\alias{robxp} |
|||
\title{Create a robots.txt object} |
|||
\usage{ |
|||
robxp(x) |
|||
} |
|||
\arguments{ |
|||
\item{x}{atomic character vector containing a complete robots.txt file} |
|||
} |
|||
\description{ |
|||
Create a robots.txt object |
|||
} |
|||
\examples{ |
|||
library(robotstxt) |
|||
can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") # TRUE |
|||
can_fetch(rt, "/_borders", "*") # FALSE |
|||
} |
@ -0,0 +1,21 @@ |
|||
Version: 1.0 |
|||
|
|||
RestoreWorkspace: Default |
|||
SaveWorkspace: Default |
|||
AlwaysSaveHistory: Default |
|||
|
|||
EnableCodeIndexing: Yes |
|||
UseSpacesForTab: Yes |
|||
NumSpacesForTab: 2 |
|||
Encoding: UTF-8 |
|||
|
|||
RnwWeave: Sweave |
|||
LaTeX: pdfLaTeX |
|||
|
|||
StripTrailingWhitespace: Yes |
|||
|
|||
BuildType: Package |
|||
PackageUseDevtools: Yes |
|||
PackageInstallArgs: --no-multiarch --with-keep.source |
|||
PackageBuildArgs: --resave-data |
|||
PackageRoxygenize: rd,collate,namespace |
@ -0,0 +1,3 @@ |
|||
*.o |
|||
*.so |
|||
*.dll |
@ -0,0 +1,3 @@ |
|||
CXX_STD = CXX11 |
|||
PKG_CXXFLAGS = |
|||
PKG_LIBS = -L. |
@ -0,0 +1,42 @@ |
|||
// Generated by using Rcpp::compileAttributes() -> do not edit by hand
|
|||
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
|
|||
|
|||
#include <Rcpp.h> |
|||
|
|||
using namespace Rcpp; |
|||
|
|||
// rep_parse
|
|||
// Declaration only; rep_parse is defined outside this generated file.
SEXP rep_parse(std::string content); |
|||
// .Call entry point for rep_parse (listed in CallEntries with 1 argument).
// Converts the incoming SEXP to std::string, calls rep_parse and wraps the
// result. NOTE(review): this file is generated by Rcpp::compileAttributes();
// hand-made changes are presumably lost on regeneration.
RcppExport SEXP _rep_rep_parse(SEXP contentSEXP) { |
|||
BEGIN_RCPP |
|||
Rcpp::RObject rcpp_result_gen; |
|||
Rcpp::RNGScope rcpp_rngScope_gen; |
|||
Rcpp::traits::input_parameter< std::string >::type content(contentSEXP); |
|||
rcpp_result_gen = Rcpp::wrap(rep_parse(content)); |
|||
return rcpp_result_gen; |
|||
END_RCPP |
|||
} |
|||
// rep_path_allowed
|
|||
// Declaration only; rep_path_allowed is defined outside this generated file.
bool rep_path_allowed(SEXP xp, std::string path, std::string agent); |
|||
// .Call entry point for rep_path_allowed (listed in CallEntries with 3
// arguments). xp is passed through as a SEXP; path and agent are converted
// to std::string before the call, and the bool result is wrapped for R.
// NOTE(review): this file is generated by Rcpp::compileAttributes();
// hand-made changes are presumably lost on regeneration.
RcppExport SEXP _rep_rep_path_allowed(SEXP xpSEXP, SEXP pathSEXP, SEXP agentSEXP) { |
|||
BEGIN_RCPP |
|||
Rcpp::RObject rcpp_result_gen; |
|||
Rcpp::RNGScope rcpp_rngScope_gen; |
|||
Rcpp::traits::input_parameter< SEXP >::type xp(xpSEXP); |
|||
Rcpp::traits::input_parameter< std::string >::type path(pathSEXP); |
|||
Rcpp::traits::input_parameter< std::string >::type agent(agentSEXP); |
|||
rcpp_result_gen = Rcpp::wrap(rep_path_allowed(xp, path, agent)); |
|||
return rcpp_result_gen; |
|||
END_RCPP |
|||
} |
|||
|
|||
// Table of the .Call routines this library exposes: each row holds the
// symbol name, the function pointer and the number of arguments. The
// all-NULL row ends the table.
static const R_CallMethodDef CallEntries[] = { |
|||
{"_rep_rep_parse", (DL_FUNC) &_rep_rep_parse, 1}, |
|||
{"_rep_rep_path_allowed", (DL_FUNC) &_rep_rep_path_allowed, 3}, |
|||
{NULL, NULL, 0} |
|||
}; |
|||
|
|||
// Library initialisation hook: registers the routines listed in CallEntries
// as .Call methods (no .C/.Fortran/.External tables are supplied) and then
// turns off dynamic symbol lookup for this DLL.
RcppExport void R_init_rep(DllInfo *dll) { |
|||
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); |
|||
R_useDynamicSymbols(dll, FALSE); |
|||
} |
@ -0,0 +1,87 @@ |
|||
#include <algorithm> |
|||
#include <sstream> |
|||
|
|||
#include "url.h" |
|||
|
|||
#include "agent.h" |
|||
#include "directive.h" |
|||
|
|||
namespace Rep |
|||
{ |
|||
/**
 * Record an "allow" rule for this agent.
 *
 * The query is canonically escaped before it is stored, and the cached sort
 * order is invalidated so that directives() re-sorts on its next call.
 * Returns *this so calls can be chained.
 */
Agent& Agent::allow(const std::string& query)
{
    directives_.emplace_back(escape(query), true);
    sorted_ = false;
    return *this;
}
|||
|
|||
/**
 * Record a "disallow" rule for this agent.
 *
 * Special case: an empty "Disallow:" means "Allow: /", so an empty query is
 * stored as-is (unescaped) as an allowing rule. Any other query is escaped
 * and stored as a disallowing rule. The cached sort order is invalidated.
 * Returns *this so calls can be chained.
 */
Agent& Agent::disallow(const std::string& query)
{
    const bool permits_everything = query.empty();
    directives_.push_back(
        Directive(permits_everything ? query : escape(query), permits_everything));
    sorted_ = false;
    return *this;
}
|||
|
|||
const std::vector<Directive>& Agent::directives() const |
|||
{ |
|||
if (!sorted_) |
|||
{ |
|||
std::sort(directives_.begin(), directives_.end(), [](const Directive& a, const Directive& b) { |
|||
return b.priority() < a.priority(); |
|||
}); |
|||
sorted_ = true; |
|||
} |
|||
return directives_; |
|||
} |
|||
|
|||
bool Agent::allowed(const std::string& query) const |
|||
{ |
|||
std::string path(escape(query)); |
|||
|
|||
if (path.compare("/robots.txt") == 0) |
|||
{ |
|||
return true; |
|||
} |
|||
|
|||
for (auto directive : directives()) |
|||
{ |
|||
if (directive.match(path)) |
|||
{ |
|||
return directive.allowed(); |
|||
} |
|||
} |
|||
return true; |
|||
} |
|||
|
|||
std::string Agent::str() const |
|||
{ |
|||
std::stringstream out; |
|||
out << '['; |
|||
auto begin = directives().begin(); |
|||
auto end = directives().end(); |
|||
if (begin != end) |
|||
{ |
|||
out << "Directive(" << begin->str() << ')'; |
|||
++begin; |
|||
} |
|||
for (; begin != end; ++begin) |
|||
{ |
|||
out << ", Directive(" << begin->str() << ')'; |
|||
} |
|||
out << ']'; |
|||
return out.str(); |
|||
} |
|||
|
|||
std::string Agent::escape(const std::string& query) |
|||
{ |
|||
return Url::Url(query).defrag().escape().fullpath(); |
|||
} |
|||
} |
@ -0,0 +1,70 @@ |
|||
#ifndef AGENT_CPP_H |
|||
#define AGENT_CPP_H |
|||
|
|||
#include <vector> |
|||
|
|||
#include "directive.h" |
|||
|
|||
|
|||
namespace Rep |
|||
{ |
|||
|
|||
class Agent |
|||
{ |
|||
public: |
|||
/* The type for the delay. */ |
|||
typedef float delay_t; |
|||
|
|||
/**
|
|||
* Construct an agent. |
|||
*/ |
|||
Agent(): directives_(), delay_(-1.0), sorted_(true) {} |
|||
|
|||
/**
|
|||
* Add an allowed directive. |
|||
*/ |
|||
Agent& allow(const std::string& query); |
|||
|
|||
/**
|
|||
* Add a disallowed directive. |
|||
*/ |
|||
Agent& disallow(const std::string& query); |
|||
|
|||
/**
|
|||
* Set the delay for this agent. |
|||
*/ |
|||
Agent& delay(delay_t value) { |
|||
delay_ = value; |
|||
return *this; |
|||
} |
|||
|
|||
/**
|
|||
* Return the delay for this agent. |
|||
*/ |
|||
delay_t delay() const { return delay_; } |
|||
|
|||
/**
|
|||
* A vector of the directives, in priority-sorted order. |
|||
*/ |
|||
const std::vector<Directive>& directives() const; |
|||
|
|||
/**
|
|||
* Return true if the URL (either a full URL or a path) is allowed. |
|||
*/ |
|||
bool allowed(const std::string& path) const; |
|||
|
|||
std::string str() const; |
|||
|
|||
/**
|
|||
* Canonically escape the provided query for matching purposes. |
|||
*/ |
|||
static std::string escape(const std::string& query); |
|||
|
|||
private: |
|||
mutable std::vector<Directive> directives_; |
|||
delay_t delay_; |
|||
mutable bool sorted_; |
|||
}; |
|||
} |
|||
|
|||
#endif |
@ -0,0 +1,130 @@ |
|||
#include <algorithm> |
|||
#include <locale> |
|||
#include <sstream> |
|||
#include <string> |
|||
|
|||
#include "url.h" |
|||
|
|||
#include "directive.h" |
|||
|
|||
namespace Rep |
|||
{ |
|||
/**
 * Build a directive from a rule expression.
 *
 * An expression without '*' is stored verbatim. Otherwise runs of
 * consecutive '*' are collapsed into one and any trailing '*' is removed.
 * The priority is the length of the stored expression.
 */
Directive::Directive(const std::string& line, bool allowed)
    : expression_(line)
    , priority_(line.size())
    , allowed_(allowed)
{
    if (expression_.find('*') == std::string::npos)
    {
        return;
    }

    // Collapse each run of '*' into a single '*'.
    expression_.erase(
        std::unique(expression_.begin(), expression_.end(),
            [](char lhs, char rhs) { return lhs == '*' && rhs == '*'; }),
        expression_.end());

    // Drop trailing '*'. If only '*' remains, find_last_not_of yields npos
    // and npos + 1 wraps to 0, which clears the whole expression.
    expression_.erase(expression_.find_last_not_of('*') + 1);

    priority_ = expression_.size();
}
|||
|
|||
bool Directive::match(const std::string::const_iterator& e_begin, |
|||
const std::string::const_iterator& e_end, |
|||
const std::string::const_iterator& p_begin, |
|||
const std::string::const_iterator& p_end) const |
|||
{ |
|||
std::string::const_iterator expression_it = e_begin; |
|||
std::string::const_iterator path_it = p_begin; |
|||
while (expression_it != e_end && path_it != p_end) |
|||
{ |
|||
if (*expression_it == '*') |
|||
{ |
|||
// Advance and recurse
|
|||
++expression_it; |
|||
for (; path_it != p_end; ++path_it) |
|||
{ |
|||
if (match(expression_it, e_end, path_it, p_end)) |
|||
{ |
|||
return true; |
|||
} |
|||
} |
|||
return false; |
|||
} |
|||
else if (*expression_it == '$') |
|||
{ |
|||
// This check expects path to be fully consumed. But since one of the
|
|||
// criteria of being in this while loop is that we've not fully consumed
|
|||
// path, return false.
|
|||
return false; |
|||
} |
|||
else if (*expression_it != *path_it) |
|||
{ |
|||
// These characters must match
|
|||
return false; |
|||
} |
|||
else |
|||
{ |
|||
// Advance both by one
|
|||
++path_it; |
|||
++expression_it; |
|||
} |
|||
} |
|||
|
|||
// Return true only if we've consumed all of the expression
|
|||
if (expression_it == e_end) |
|||
{ |
|||
return true; |
|||
} |
|||
else if (*expression_it == '$') |
|||
{ |
|||
return path_it == p_end; |
|||
} |
|||
else |
|||
{ |
|||
return false; |
|||
} |
|||
} |
|||
|
|||
std::string Directive::str() const |
|||
{ |
|||
std::stringstream out; |
|||
if (allowed_) |
|||
{ |
|||
out << "Allow: " << expression_; |
|||
} |
|||
else { |
|||
out << "Disallow: " << expression_; |
|||
} |
|||
return out.str(); |
|||
} |
|||
|
|||
bool Directive::match(const std::string& path) const |
|||
{ |
|||
return match(expression_.begin(), expression_.end(), path.begin(), path.end()); |
|||
} |
|||
|
|||
} |
@ -0,0 +1,67 @@ |
|||
#ifndef DIRECTIVE_CPP_H
#define DIRECTIVE_CPP_H

// This header uses std::string and size_t, so it includes what it needs
// instead of relying on whatever the including file pulled in first.
#include <cstddef>
#include <string>

namespace Rep
{

    /**
     * A single allow/disallow rule: an expression to match paths against,
     * a priority, and whether a match means "allowed".
     */
    class Directive
    {
    public:
        /**
         * The type of our priority value.
         */
        typedef size_t priority_t;

        /**
         * Default constructor disallowed.
         */
        Directive() = delete;

        /**
         * The input to this constructor must be stripped of comments and
         * trailing whitespace.
         */
        Directive(const std::string& line, bool allowed);

        /**
         * The priority of the rule.
         */
        priority_t priority() const
        {
            return priority_;
        }

        /**
         * Whether or not the provided path matches. The path is expected to
         * be properly escaped.
         */
        bool match(const std::string& path) const;

        /**
         * Whether this rule is for an allow or a disallow.
         */
        bool allowed() const
        {
            return allowed_;
        }

        /**
         * Text representation of the rule.
         */
        std::string str() const;

    private:
        std::string expression_;
        priority_t priority_;
        bool allowed_;

        /**
         * Return true if p_begin -> p_end matches the expression
         * e_begin -> e_end.
         */
        bool match(const std::string::const_iterator& e_begin,
                   const std::string::const_iterator& e_end,
                   const std::string::const_iterator& p_begin,
                   const std::string::const_iterator& p_end) const;
    };

}

#endif
@ -0,0 +1,183 @@ |
|||
#include <algorithm>
#include <cctype>
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>

#include "psl.h"
#include "punycode.h"
|||
|
|||
namespace Url |
|||
{ |
|||
// Value handed back when a hostname has no TLD / PLD: the empty string.
const std::string PSL::not_found;
|||
|
|||
/**
 * Read a PSL from a stream, one rule per line.
 *
 * Only the text before the first whitespace character of each line is
 * used. Blank lines and lines starting with "//" are skipped. The
 * remaining lines are registered through add() as:
 *   "*.<host>"  wildcard rule  (level +1, leading "*." trimmed)
 *   "!<host>"   exception rule (level -1, leading "!" trimmed)
 *   "<host>"    plain rule
 *
 * Throws std::invalid_argument for a malformed wildcard or exception rule.
 */
PSL::PSL(std::istream& stream)
{
    std::string line;
    while (std::getline(stream, line))
    {
        // Only take up to the first whitespace. The cast matters: rules may
        // contain non-ASCII bytes, and passing a negative char to
        // std::isspace is undefined behaviour.
        const auto it = std::find_if(line.begin(), line.end(), [](char c) {
            return std::isspace(static_cast<unsigned char>(c)) != 0;
        });
        line.erase(it, line.end());

        // Skip blank lines
        if (line.empty())
        {
            continue;
        }

        // Skip comments
        if (line.compare(0, 2, "//") == 0)
        {
            continue;
        }

        // We know the line has at least a single character at this point
        if (line[0] == '*')
        {
            // Line is a wildcard rule
            if (line.size() <= 2 || line[1] != '.')
            {
                throw std::invalid_argument("Wildcard rule must be of form *.<host>");
            }

            add(line, 1, 2);
        }
        else if (line[0] == '!')
        {
            // Line is an exception, take all but the !
            if (line.size() <= 1)
            {
                throw std::invalid_argument("Exception rule has no hostname.");
            }

            add(line, -1, 1);
        }
        else
        {
            add(line, 0, 0);
        }
    }
}
|||
|
|||
/**
 * Build a PSL from the rules stored in the file at `path`.
 *
 * Throws std::invalid_argument when the file stream is not in a good state
 * after opening.
 */
PSL PSL::fromPath(const std::string& path)
{
    std::ifstream stream(path);
    if (stream.good())
    {
        return PSL(stream);
    }

    throw std::invalid_argument("Path '" + path + "' inaccessible.");
}
|||
|
|||
/**
 * Build a PSL from rules held in a string, by wrapping the string in a
 * stream and handing it to the stream constructor.
 */
PSL PSL::fromString(const std::string& str)
{
    std::stringstream rules(str);
    PSL parsed(rules);
    return parsed;
}
|||
|
|||
std::string PSL::getTLD(const std::string& hostname) const |
|||
{ |
|||
return getLastSegments(hostname, getTLDLength(hostname)); |
|||
} |
|||
|
|||
std::string PSL::getPLD(const std::string& hostname) const |
|||
{ |
|||
return getLastSegments(hostname, getTLDLength(hostname) + 1); |
|||
} |
|||
|
|||
std::pair<std::string, std::string> PSL::getBoth(const std::string& hostname) const |
|||
{ |
|||
size_t length = getTLDLength(hostname); |
|||
return std::make_pair( |
|||
getLastSegments(hostname, length), |
|||
getLastSegments(hostname, length + 1)); |
|||
} |
|||
|
|||
size_t PSL::getTLDLength(const std::string& hostname) const |
|||
{ |
|||
// Reversed copy of hostname
|
|||
std::string tld(hostname.rbegin(), hostname.rend()); |
|||
std::transform(tld.begin(), tld.end(), tld.begin(), ::tolower); |
|||
|
|||
while (tld.size()) |
|||
{ |
|||
auto it = levels.find(tld); |
|||
if (it != levels.end()) |
|||
{ |
|||
return it->second; |
|||
} |
|||
|
|||
size_t position = tld.rfind('.'); |
|||
if (position == std::string::npos || position == 0) |
|||
{ |
|||
tld.resize(0); |
|||
} |
|||
else |
|||
{ |
|||
tld.resize(position); |
|||
} |
|||
} |
|||
|
|||
return 1; |
|||
} |
|||
|
|||
std::string PSL::getLastSegments(const std::string& hostname, size_t segments) const |
|||
{ |
|||
size_t position = hostname.size(); |
|||
size_t remaining = segments; |
|||
while (remaining != 0 && position && position != std::string::npos) |
|||
{ |
|||
position = hostname.rfind('.', position - 1); |
|||
remaining -= 1; |
|||
} |
|||
|
|||
if (remaining >= 1) |
|||
{ |
|||
return not_found; |
|||
} |
|||
|
|||
// Return the whole string if position == std:string::npos
|
|||
size_t start = (position == std::string::npos) ? 0 : position + 1; |
|||
|
|||
std::string result(hostname, start); |
|||
std::transform(result.begin(), result.end(), result.begin(), ::tolower); |
|||
|
|||
// Leading .'s indicate that the query had an empty segment
|
|||
if (result.size() && result[0] == '.') |
|||
{ |
|||
std::stringstream message; |
|||
message << "Empty segment in " << result; |
|||
throw std::invalid_argument(message.str()); |
|||
} |
|||
|
|||
return result; |
|||
} |
|||
|
|||
size_t PSL::countSegments(const std::string& hostname) const |
|||
{ |
|||
size_t count = 1; |
|||
size_t position = hostname.find('.'); |
|||
while (position != std::string::npos) |
|||
{ |
|||
count += 1; |
|||
position = hostname.find('.', position + 1); |
|||
} |
|||
return count; |
|||
} |
|||
|
|||
void PSL::add(std::string& rule, int level_adjust, size_t trim) |
|||
{ |
|||
// First unpunycoded
|
|||
std::string copy(rule.rbegin(), rule.rend() - trim); |
|||
size_t length = countSegments(copy) + level_adjust; |
|||
levels[copy] = length; |
|||
|
|||
// And now punycoded
|
|||
rule = Punycode::encodeHostname(rule); |
|||
copy.assign(rule.rbegin(), rule.rend() - trim); |
|||
levels[copy] = length; |
|||
} |
|||
|
|||
}; |
@ -0,0 +1,102 @@ |
|||
#ifndef PSL_CPP_H |
|||
#define PSL_CPP_H |
|||
|
|||
#include <istream> |
|||
#include <sstream> |
|||
#include <string> |
|||
#include <unordered_map> |
|||
#include <utility> |
|||
|
|||
namespace Url |
|||
{ |
|||
|
|||
/**
 * Find TLDs and PLDs of a hostname according to a PSL.
 *
 * Copy and move operations are the compiler-generated ones: the only data
 * member is a standard container. The previous hand-written copy
 * constructor and copy assignment did the same member-wise copy but
 * suppressed the implicit move operations.
 */
struct PSL
{
    /**
     * Indicates that there is no TLD / PLD
     */
    static const std::string not_found;

    /**
     * Read a PSL from an istream.
     */
    PSL(std::istream& stream);

    /**
     * Construct a PSL with no rules.
     */
    PSL() = default;

    /**
     * Read the provided path holding a set of PSL rules.
     */
    static PSL fromPath(const std::string& path);

    /**
     * Create a PSL object from a string.
     */
    static PSL fromString(const std::string& str);

    /**
     * Get just the TLD of the hostname.
     *
     * Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If
     * some segments have been appropriately punycoded and others not, it may return
     * a wrong answer. If a punycoded host is provided, a punycoded response is
     * returned. If an unpunycoded host is provided, an unpunycoded response is
     * returned.
     */
    std::string getTLD(const std::string& hostname) const;

    /**
     * Get just the PLD of the hostname.
     *
     * Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If
     * some segments have been appropriately punycoded and others not, it may return
     * a wrong answer. If a punycoded host is provided, a punycoded response is
     * returned. If an unpunycoded host is provided, an unpunycoded response is
     * returned.
     */
    std::string getPLD(const std::string& hostname) const;

    /**
     * Get the (TLD, PLD) of the hostname.
     *
     * Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If
     * some segments have been appropriately punycoded and others not, it may return
     * a wrong answer. If a punycoded host is provided, a punycoded response is
     * returned. If an unpunycoded host is provided, an unpunycoded response is
     * returned.
     */
    std::pair<std::string, std::string> getBoth(const std::string& hostname) const;

private:
    // Mapping of a string rule to its level
    std::unordered_map<std::string, size_t> levels;

    // Return the number of segments in a hostname
    size_t countSegments(const std::string& hostname) const;

    // Return the number of segments in the TLD of the provided hostname
    size_t getTLDLength(const std::string& hostname) const;

    // Return the last `segments` segments of a hostname
    std::string getLastSegments(const std::string& hostname, size_t segments) const;

    /**
     * Add the provided host with the provided priority, trimming characters off
     * the front, and adjusting the level by the provided number.
     */
    void add(std::string& host, int level_adjust, size_t trim);
};
|||
|
|||
} |
|||
|
|||
#endif |
@ -0,0 +1,409 @@ |
|||
#include <algorithm> |
|||
#include <string> |
|||
#include <iostream> |
|||
|
|||
#include "punycode.h" |
|||
#include "utf8.h" |
|||
|
|||
namespace Url |
|||
{ |
|||
|
|||
std::string& Punycode::encode(std::string& str) |
|||
{ |
|||
// Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.3
|
|||
//
|
|||
// let n = initial_n
|
|||
// let delta = 0
|
|||
// let bias = initial_bias
|
|||
punycode_uint n = INITIAL_N; |
|||
punycode_uint delta = 0; |
|||
punycode_uint bias = INITIAL_BIAS; |
|||
std::string output; |
|||
|
|||
// Accumulate the non-basic codepoints
|
|||
std::vector<punycode_uint> codepoints; |
|||
for (auto it = str.cbegin(); it != str.cend(); ) |
|||
{ |
|||
Utf8::codepoint_t value = Utf8::readCodepoint(it, str.cend()); |
|||
if (value < 0x80) |
|||
{ |
|||
// copy them to the output in order
|
|||
output.append(1, static_cast<char>(value)); |
|||
} |
|||
codepoints.push_back(value); |
|||
} |
|||
|
|||
// let h = b = the number of basic code points in the input
|
|||
size_t h = output.size(); |
|||
size_t b = h; |
|||
|
|||
// copy a delimiter if b > 0
|
|||
if (b > 0) |
|||
{ |
|||
output.append(1, '-'); |
|||
} |
|||
|
|||
// while h < length(input) do begin
|
|||
while (h < codepoints.size()) |
|||
{ |
|||
// let m = the minimum {non-basic} code point >= n in the input
|
|||
punycode_uint m = MAX_PUNYCODE_UINT; |
|||
for (auto it = codepoints.begin(); it != codepoints.end(); ++it) |
|||
{ |
|||
if ((*it >= n) && (*it < m)) |
|||
{ |
|||
m = *it; |
|||
} |
|||
} |
|||
|
|||
// let delta = delta + (m - n) * (h + 1), fail on overflow
|
|||
if ((m - n) > ((MAX_PUNYCODE_UINT - delta) / (h + 1))) |
|||
{ |
|||
throw std::invalid_argument("Overflow delta update."); |
|||
} |
|||
delta += (m - n) * (h + 1); |
|||
|
|||
// let n = m
|
|||
n = m; |
|||
|
|||
// for each code point c in the input (in order) do begin
|
|||
for (auto it = codepoints.begin(); it != codepoints.end(); ++it) |
|||
{ |
|||
// if c < n {or c is basic} then increment delta, fail on overflow
|
|||
if (*it < n) |
|||
{ |
|||
if (delta == MAX_PUNYCODE_UINT) |
|||
{ |
|||
throw std::invalid_argument("Overflow delta increment."); |
|||
} |
|||
++delta; |
|||
} |
|||
|
|||
// if c == n then begin
|
|||
if (*it == n) |
|||
{ |
|||
// let q = delta
|
|||
punycode_uint q = delta; |
|||
|
|||
// for k = base to infinity in steps of base do begin
|
|||
for (punycode_uint k = BASE; ; k += BASE) |
|||
{ |
|||
// let t = tmin if k <= bias {+ tmin}, or
|
|||
// tmax if k >= bias + tmax, or k - bias otherwise
|
|||
punycode_uint t = k <= bias ? TMIN : |
|||
k >= bias + TMAX ? TMAX : k - bias; |
|||
|
|||
// if q < t then break
|
|||
if (q < t) |
|||
{ |
|||
break; |
|||
} |
|||
|
|||
// output the code point for digit t + ((q - t) mod (base - t))
|
|||
output.append(1, DIGIT_TO_BASIC[t + ((q - t) % (BASE - t))]); |
|||
|
|||
// let q = (q - t) div (base - t)
|
|||
q = (q - t) / (BASE - t); |
|||
} |
|||
|
|||
// output the code point for digit q
|
|||
output.append(1, DIGIT_TO_BASIC[q]); |
|||
|
|||
// let bias = adapt(delta, h + 1, test h equals b?)
|
|||
bias = adapt(delta, h + 1, h == b); |
|||
|
|||
// let delta = 0
|
|||
delta = 0; |
|||
|
|||
// increment h
|
|||
++h; |
|||
|
|||
} |
|||
} |
|||
|
|||
// increment delta and n
|
|||
++delta; |
|||
++n; |
|||
} |
|||
|
|||
str.assign(output); |
|||
return str; |
|||
} |
|||
|
|||
std::string Punycode::encode(const std::string& str)
{
    // Non-mutating overload: encode a private copy and hand it back.
    std::string copy = str;
    return encode(copy);
}
|||
|
|||
std::string Punycode::encodeHostname(const std::string& hostname) |
|||
{ |
|||
// Avoid any punycoding at all if none is needed
|
|||
if (!needsPunycoding(hostname)) |
|||
{ |
|||
return hostname; |
|||
} |
|||
|
|||
std::string encoded; |
|||
|
|||
size_t start = 0; |
|||
size_t end = hostname.find('.'); |
|||
while(true) |
|||
{ |
|||
std::string segment = hostname.substr(start, end - start); |
|||
if (needsPunycoding(segment)) |
|||
{ |
|||
encoded.append("xn--"); |
|||
encoded.append(Punycode::encode(segment)); |
|||
} |
|||
else |
|||
{ |
|||
encoded.append(segment); |
|||
} |
|||
|
|||
if (end == std::string::npos) |
|||
{ |
|||
break; |
|||
} |
|||
else |
|||
{ |
|||
encoded.append(1, '.'); |
|||
start = end + 1; |
|||
end = hostname.find('.', start); |
|||
} |
|||
} |
|||
|
|||
return encoded; |
|||
} |
|||
|
|||
std::string& Punycode::decode(std::string& str) |
|||
{ |
|||
// Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.2
|
|||
//
|
|||
// let n = initial_n
|
|||
// let i = 0
|
|||
// let bias = initial_bias
|
|||
// let output = an empty string indexed from 0
|
|||
punycode_uint n = INITIAL_N; |
|||
punycode_uint i = 0; |
|||
punycode_uint bias = INITIAL_BIAS; |
|||
std::vector<punycode_uint> codepoints; |
|||
|
|||
size_t index = str.rfind('-'); |
|||
if (index == std::string::npos) |
|||
{ |
|||
index = 0; |
|||
} |
|||
|
|||
// consume all code points before the last delimiter (if there is one)
|
|||
// and copy them to output, fail on any non-basic code point
|
|||
for (auto it = str.begin(); it != (str.begin() + index); ++it) |
|||
{ |
|||
if (static_cast<unsigned char>(*it) > 127U) |
|||
{ |
|||
throw std::invalid_argument("Argument has non-basic code points."); |
|||
} |
|||
codepoints.push_back(*it); |
|||
} |
|||
|
|||
// if more than zero code points were consumed then consume one more
|
|||
// (which will be the last delimiter)
|
|||
if (index > 0) |
|||
{ |
|||
index += 1; |
|||
} |
|||
|
|||
// while the input is not exhausted do begin
|
|||
for (auto it = (str.begin() + index); it != str.end(); ++it) |
|||
{ |
|||
// let oldi = i
|
|||
// let w = 1
|
|||
punycode_uint oldi = i; |
|||
punycode_uint w = 1; |
|||
|
|||
// for k = base to infinity in steps of base do begin
|
|||
for (punycode_uint k = BASE; ; k += BASE, ++it) |
|||
{ |
|||
// consume a code point, or fail if there was none to consume
|
|||
if (it == str.end()) |
|||
{ |
|||
throw std::invalid_argument("Premature termination"); |
|||
} |
|||
|
|||
// let digit = the code point's digit-value, fail if it has none
|
|||
int lookup = BASIC_TO_DIGIT[static_cast<size_t>(*it)]; |
|||
if (lookup == -1) |
|||
{ |
|||
throw std::invalid_argument("Invalid base 36 character."); |
|||
} |
|||
unsigned char digit = static_cast<unsigned char>(lookup); |
|||
|
|||
// let i = i + digit * w, fail on overflow
|
|||
if (digit > ((MAX_PUNYCODE_UINT - i) / w)) |
|||
{ |
|||
throw std::invalid_argument("Overflow on i."); |
|||
} |
|||
i += digit * w; |
|||
|
|||
// let t = tmin if k <= bias {+ tmin}, or
|
|||
// tmax if k >= bias + tmax, or k - bias otherwise
|
|||
punycode_uint t = k <= bias ? TMIN : |
|||
k >= bias + TMAX ? TMAX : k - bias; |
|||
|
|||
// if digit < t then break
|
|||
if (digit < t) |
|||
{ |
|||
break; |
|||
} |
|||
|
|||
// let w = w * (base - t), fail on overflow
|
|||
if (w > (MAX_PUNYCODE_UINT / (BASE - t))) |
|||
{ |
|||
// I believe this line is unreachable without first overflowing i.
|
|||
// Since 'i' is updated above as i += digit * w, and w is updated as
|
|||
// w = w * (BASE - t), we should like to keep (BASE - t) > digit to
|
|||
// give 'w' a chance to overflow first. To keep t minimized, we must
|
|||
// have 'bias' maximized. `bias` is driven by the 'adapt' function
|
|||
// below.
|
|||
//
|
|||
// The value returned by 'adapt' increases with the input delta, and
|
|||
// decreases with the input size. The delta is a function of the input
|
|||
// size as well, on the order of (delta_n * input size), and
|
|||
// legitimate delta_n values are limited to 0x10FFFF (the maximum
|
|||
// unicode codepoint). Even setting that aside, the maximum value that
|
|||
// adapt() can return is adapt(2 ** 32 - 1, 1, false) = 204.
|
|||
//
|
|||
// Using this bias, we could use the input (HERE) to get iterations:
|
|||
//
|
|||
// digit = b = 1, i = 2, k = 36, t = 1, w = 35
|
|||
// digit = b = 1, i = 37, k = 72, t = 1, w = 1225
|
|||
// digit = b = 1, i = 1262, k = 108, t = 1, w = 42875
|
|||
// digit = b = 1, i = 44137, k = 144, t = 1, w = 1500625
|
|||
// digit = b = 1, i = 1544762, k = 180, t = 1, w = 52521875
|
|||
//
|
|||
// At this point, t now becomes TMAX (26) because k exceeds the bias
|
|||
// (since the maximum bias is 204). As such, the minimum continuation
|
|||
// value is 26:
|
|||
//
|
|||
// digit = 0 = 26, i = 1367113512, k = 216, t = 26, w = 525218750
|
|||
//
|
|||
// However, the next iteration now overflows i before we can get to
|
|||
// the w update.
|
|||
throw std::invalid_argument("Overflow on w."); // LCOV_EXCL_LINE
|
|||
} |
|||
w *= (BASE - t); |
|||
} |
|||
|
|||
// let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?)
|
|||
bias = adapt(i - oldi, codepoints.size() + 1, oldi == 0); |
|||
|
|||
// let n = n + i div (length(output) + 1), fail on overflow
|
|||
if ((i / (codepoints.size() + 1)) > (MAX_PUNYCODE_UINT - n)) |
|||
{ |
|||
throw std::invalid_argument("Overflow on n."); |
|||
} |
|||
n += i / (codepoints.size() + 1); |
|||
|
|||
// let i = i mod (length(output) + 1)
|
|||
i %= (codepoints.size() + 1); |
|||
|
|||
// insert n into output at position i
|
|||
codepoints.insert(codepoints.begin() + i, n); |
|||
|
|||
// increment i
|
|||
++i; |
|||
} |
|||
|
|||
std::string output; |
|||
for (auto it = codepoints.begin(); it != codepoints.end(); ++it) |
|||
{ |
|||
Utf8::writeCodepoint(output, *it); |
|||
} |
|||
str.assign(output); |
|||
|
|||
return str; |
|||
} |
|||
|
|||
std::string Punycode::decode(const std::string& str)
{
    // Non-mutating overload: decode a private copy and hand it back.
    std::string copy = str;
    return decode(copy);
}
|||
|
|||
std::string Punycode::decodeHostname(const std::string& hostname)
{
    // Walk the '.'-separated labels; labels starting with "xn--" have the
    // prefix removed and the remainder punycode-decoded, the others are
    // copied through unchanged.
    std::string result;
    size_t begin = 0;
    for (;;)
    {
        const size_t dot = hostname.find('.', begin);
        const std::string label = hostname.substr(begin, dot - begin);
        if (label.compare(0, 4, "xn--") == 0)
        {
            std::string payload = label.substr(4);
            result.append(Punycode::decode(payload));
        }
        else
        {
            result.append(label);
        }

        // The label without a following '.' is the last one
        if (dot == std::string::npos)
        {
            return result;
        }
        result.push_back('.');
        begin = dot + 1;
    }
}
|||
|
|||
bool Punycode::needsPunycoding(const std::string& str)
{
    // True as soon as any byte has its high bit (0x80) set
    for (const char byte : str)
    {
        if (static_cast<unsigned char>(byte) & 0x80)
        {
            return true;
        }
    }
    return false;
}
|||
|
|||
Punycode::punycode_uint Punycode::adapt(
    punycode_uint delta, punycode_uint numpoints, bool firsttime)
{
    // Bias adaptation, following the pseudocode of RFC 3492, section 6.1.
    // `delta` is taken by value, so the scaling below never affects the
    // variable of the same name in the encode/decode procedures.

    // Damp the very first delta harder than subsequent ones
    if (firsttime)
    {
        delta /= DAMP;
    }
    else
    {
        delta /= 2;
    }

    // delta = delta + (delta div numpoints)
    delta += (delta / numpoints);

    // Divide delta down until it is at most ((base - tmin) * tmax) div 2,
    // adding `base` to k for every division performed
    const auto threshold = ((BASE - TMIN) * TMAX) / 2;
    punycode_uint k = 0;
    while (delta > threshold)
    {
        delta /= (BASE - TMIN);
        k += BASE;
    }

    // k + (((base - tmin + 1) * delta) div (delta + skew))
    return k + (((BASE - TMIN + 1) * delta) / (delta + SKEW));
}
|||
|
|||
}; |
@ -0,0 +1,105 @@ |
|||
#ifndef PUNYCODE_CPP_H |
|||
#define PUNYCODE_CPP_H |
|||
|
|||
#include <cstdint>
#include <limits>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "utf8.h"
|||
|
|||
namespace Url |
|||
{ |
|||
|
|||
namespace Punycode |
|||
{ |
|||
typedef Utf8::codepoint_t punycode_uint; |
|||
|
|||
const unsigned int BASE = 36; |
|||
const unsigned int TMIN = 1; |
|||
const unsigned int TMAX = 26; |
|||
const unsigned int SKEW = 38; |
|||
const unsigned int DAMP = 700; |
|||
const unsigned int INITIAL_BIAS = 72; |
|||
const unsigned int INITIAL_N = 128; |
|||
|
|||
// Codepoints to their base-36 value
|
|||
const std::vector<int8_t> BASIC_TO_DIGIT = { |
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
|
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1, |
|||
|
|||
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, |
|||
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, |
|||
|
|||
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, |
|||
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, |
|||
|
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
|
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
|
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
|
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 |
|||
}; |
|||
const std::string DIGIT_TO_BASIC = "abcdefghijklmnopqrstuvwxyz0123456789"; |
|||
|
|||
// The highest codepoint in unicode
|
|||
const punycode_uint MAX_PUNYCODE_UINT = std::numeric_limits<punycode_uint>::max(); |
|||
//Utf8::MAX_CODEPOINT;
|
|||
//std::numeric_limits<punycode_uint>::max();
|
|||
|
|||
/**
|
|||
* Replace utf-8-encoded str into punycode. |
|||
*/ |
|||
std::string& encode(std::string& str); |
|||
|
|||
/**
|
|||
* Create a new punycoded string from utf-8-encoded input. |
|||
*/ |
|||
std::string encode(const std::string& str); |
|||
|
|||
/**
|
|||
* Encode a hostname. |
|||
*/ |
|||
std::string encodeHostname(const std::string& hostname); |
|||
|
|||
/**
|
|||
* Replace punycoded str into utf-8-encoded. |
|||
*/ |
|||
std::string& decode(std::string& str); |
|||
|
|||
/**
|
|||
* Create a new utf-8-encoded string from punycoded input. |
|||
*/ |
|||
std::string decode(const std::string& str); |
|||
|
|||
/**
|
|||
* Decode a hostname. |
|||
*/ |
|||
std::string decodeHostname(const std::string& hostname); |
|||
|
|||
/**
|
|||
* Determine if a string needs punycoding. |
|||
*/ |
|||
bool needsPunycoding(const std::string& str); |
|||
|
|||
/**
|
|||
* Internal function for calculating bias. |
|||
*/ |
|||
punycode_uint adapt( |
|||
punycode_uint delta, punycode_uint numpoints, bool firsttime); |
|||
|
|||
}; |
|||
|
|||
} |
|||
|
|||
#endif |
@ -0,0 +1,26 @@ |
|||
#include <Rcpp.h> |
|||
using namespace Rcpp; |
|||
|
|||
#include "url.h" |
|||
#include "robots.h" |
|||
|
|||
//' Parse robots.txt
|
|||
//'
|
|||
//' @noRd
|
|||
//'
|
|||
// [[Rcpp::export]]
|
|||
SEXP rep_parse(std::string content) {
  // Parse the robots.txt text into a heap-allocated Rep::Robots and wrap it
  // in an external pointer that is handed back to R.
  Rcpp::XPtr<Rep::Robots> robots(new Rep::Robots(content));
  return robots;
}
|||
|
|||
|
|||
//' Path allowed
|
|||
//'
|
|||
//' @noRd
|
|||
//'
|
|||
// [[Rcpp::export]]
|
|||
bool rep_path_allowed(SEXP xp, std::string path, std::string agent = "*") {
  // `xp` is the external pointer produced by rep_parse()
  Rcpp::XPtr<Rep::Robots> robots(xp);
  const bool permitted = robots->allowed(path, agent);
  return permitted;
}
@ -0,0 +1,188 @@ |
|||
#include <algorithm> |
|||
#include <functional> |
|||
#include <cctype> |
|||
#include <locale> |
|||
#include <sstream> |
|||
#include <iostream> |
|||
#include <unordered_map> |
|||
|
|||
#include "url.h" |
|||
|
|||
#include "robots.h" |
|||
#include <Rcpp.h> |
|||
|
|||
namespace Rep |
|||
{ |
|||
|
|||
void Robots::strip(std::string& string) |
|||
{ |
|||
string.erase(string.begin(), std::find_if(string.begin(), string.end(), |
|||
std::not1(std::ptr_fun<int, int>(std::isspace)))); |
|||
string.erase(std::find_if(string.rbegin(), string.rend(), |
|||
std::not1(std::ptr_fun<int, int>(std::isspace))).base(), string.end()); |
|||
} |
|||
|
|||
bool Robots::getpair(std::istringstream& stream, std::string& key, std::string& value) |
|||
{ |
|||
while (getline(stream, key)) |
|||
{ |
|||
size_t index = key.find('#'); |
|||
if (index != std::string::npos) |
|||
{ |
|||
key.resize(index); |
|||
} |
|||
|
|||
// Find the colon and divide it into key and value, skipping malformed lines
|
|||
index = key.find(':'); |
|||
if (index == std::string::npos) |
|||
{ |
|||
continue; |
|||
} |
|||
|
|||
value.assign(key.begin() + index + 1, key.end()); |
|||
key.resize(index); |
|||
|
|||
// Strip whitespace off of each
|
|||
strip(key); |
|||
strip(value); |
|||
|
|||
// Lowercase the key
|
|||
std::transform(key.begin(), key.end(), key.begin(), ::tolower); |
|||
|
|||
return true; |
|||
} |
|||
return false; |
|||
} |
|||
|
|||
Robots::Robots(const std::string& content): agents_(), sitemaps_(), default_(agents_["*"]) |
|||
{ |
|||
std::string agent_name("*"); |
|||
std::istringstream input(content); |
|||
if (content.compare(0, 3, "\xEF\xBB\xBF") == 0) |
|||
{ |
|||
input.ignore(3); |
|||
} |
|||
std::string key, value; |
|||
std::vector<std::string> group; |
|||
bool last_agent = false; |
|||
agent_map_t::iterator current = agents_.find("*"); |
|||
while (Robots::getpair(input, key, value)) |
|||
{ |
|||
if (key.compare("user-agent") == 0) |
|||
{ |
|||
// Store the user agent string as lowercased
|
|||
std::transform(value.begin(), value.end(), value.begin(), ::tolower); |
|||
|
|||
if (last_agent) |
|||
{ |
|||
group.push_back(value); |
|||
} |
|||
else |
|||
{ |
|||
if (!agent_name.empty()) |
|||
{ |
|||
for (auto other : group) |
|||
{ |
|||
agents_[other] = current->second; |
|||
} |
|||
group.clear(); |
|||
} |
|||
agent_name = value; |
|||
current = agents_.emplace(agent_name, Agent()).first; |
|||
} |
|||
last_agent = true; |
|||
continue; |
|||
} |
|||
else |
|||
{ |
|||
last_agent = false; |
|||
} |
|||
|
|||
if (key.compare("sitemap") == 0) |
|||
{ |
|||
sitemaps_.push_back(value); |
|||
} |
|||
else if (key.compare("disallow") == 0) |
|||
{ |
|||
current->second.disallow(value); |
|||
} |
|||
else if (key.compare("allow") == 0) |
|||
{ |
|||
current->second.allow(value); |
|||
} |
|||
else if (key.compare("crawl-delay") == 0) |
|||
{ |
|||
try |
|||
{ |
|||
current->second.delay(std::stof(value)); |
|||
} |
|||
catch (const std::exception&) |
|||
{ |
|||
Rcpp::Rcout << "Could not parse " << value << " as float." << std::endl; |
|||
} |
|||
} |
|||
} |
|||
|
|||
if (!agent_name.empty()) |
|||
{ |
|||
for (auto other : group) |
|||
{ |
|||
agents_[other] = current->second; |
|||
} |
|||
} |
|||
} |
|||
|
|||
const Agent& Robots::agent(const std::string& name) const |
|||
{ |
|||
// Lowercase the agent
|
|||
std::string lowered(name); |
|||
std::transform(lowered.begin(), lowered.end(), lowered.begin(), ::tolower); |
|||
|
|||
auto it = agents_.find(lowered); |
|||
if (it == agents_.end()) |
|||
{ |
|||
return default_; |
|||
} |
|||
else |
|||
{ |
|||
return it->second; |
|||
} |
|||
} |
|||
|
|||
bool Robots::allowed(const std::string& path, const std::string& name) const |
|||
{ |
|||
return agent(name).allowed(path); |
|||
} |
|||
|
|||
std::string Robots::str() const |
|||
{ |
|||
std::stringstream out; |
|||
// TODO: include sitepath info
|
|||
out << '{'; |
|||
auto begin = agents_.begin(); |
|||
auto end = agents_.end(); |
|||
if (begin != end) |
|||
{ |
|||
out << '"' << begin->first << '"' << ": " << begin->second.str(); |
|||
++begin; |
|||
} |
|||
for (; begin != end; ++begin) |
|||
{ |
|||
out << ", \"" << begin->first << '"' << ": " << begin->second.str(); |
|||
} |
|||
out << '}'; |
|||
return out.str(); |
|||
} |
|||
|
|||
std::string Robots::robotsUrl(const std::string& url) |
|||
{ |
|||
return Url::Url(url) |
|||
.setUserinfo("") |
|||
.setPath("robots.txt") |
|||
.setParams("") |
|||
.setQuery("") |
|||
.setFragment("") |
|||
.remove_default_port() |
|||
.str(); |
|||
} |
|||
} |
@ -0,0 +1,69 @@ |
|||
#ifndef ROBOTS_CPP_H |
|||
#define ROBOTS_CPP_H |
|||
|
|||
#include <sstream> |
|||
#include <unordered_map> |
|||
#include <vector> |
|||
|
|||
#include "agent.h" |
|||
|
|||
namespace Rep |
|||
{ |
|||
|
|||
class Robots |
|||
{ |
|||
public: |
|||
typedef std::unordered_map<std::string, Agent> agent_map_t; |
|||
typedef std::vector<std::string> sitemaps_t; |
|||
|
|||
/**
|
|||
* Create a robots.txt from a utf-8-encoded string. |
|||
*/ |
|||
Robots(const std::string& content); |
|||
|
|||
/**
|
|||
* Instantiate a Robots object. |
|||
*/ |
|||
Robots( |
|||
const agent_map_t& agents, |
|||
const sitemaps_t& sitemaps) |
|||
: agents_(agents) |
|||
, sitemaps_(sitemaps) |
|||
, default_(agents_["*"]) {} |
|||
|
|||
/**
|
|||
* Get the sitemaps in this robots.txt |
|||
*/ |
|||
const sitemaps_t& sitemaps() const { return sitemaps_; } |
|||
|
|||
/**
|
|||
* Get the agent with the corresponding name. |
|||
*/ |
|||
const Agent& agent(const std::string& name) const; |
|||
|
|||
/**
|
|||
* Return true if agent is allowed to fetch the URL (either a |
|||
* full URL or a path). |
|||
*/ |
|||
bool allowed(const std::string& path, const std::string& name) const; |
|||
|
|||
std::string str() const; |
|||
|
|||
/**
|
|||
* Return the robots.txt URL corresponding to the provided URL. |
|||
*/ |
|||
static std::string robotsUrl(const std::string& url); |
|||
|
|||
private: |
|||
static void strip(std::string& string); |
|||
|
|||
static bool getpair( |
|||
std::istringstream& stream, std::string& key, std::string& value); |
|||
|
|||
agent_map_t agents_; |
|||
sitemaps_t sitemaps_; |
|||
Agent& default_; |
|||
}; |
|||
} |
|||
|
|||
#endif |
@ -0,0 +1,962 @@ |
|||
#include <algorithm> |
|||
#include <string> |
|||
#include <iterator> |
|||
#include <unordered_map> |
|||
#include <unordered_set> |
|||
#include <iostream> |
|||
#include <iterator> |
|||
#include <sstream> |
|||
|
|||
#include "url.h" |
|||
#include "punycode.h" |
|||
|
|||
namespace Url |
|||
{ |
|||
|
|||
/* Character classes */ |
|||
const CharacterClass Url::GEN_DELIMS(":/?#[]@"); |
|||
const CharacterClass Url::SUB_DELIMS("!$&'()*+,;="); |
|||
const CharacterClass Url::DIGIT("0123456789"); |
|||
const CharacterClass Url::ALPHA( |
|||
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); |
|||
const CharacterClass Url::UNRESERVED( |
|||
Url::ALPHA.chars() + Url::DIGIT.chars() + "-._~"); |
|||
const CharacterClass Url::RESERVED( |
|||
Url::GEN_DELIMS.chars() + Url::SUB_DELIMS.chars()); |
|||
const CharacterClass Url::PCHAR( |
|||
Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":@"); |
|||
const CharacterClass Url::PATH( |
|||
Url::PCHAR.chars() + "/"); |
|||
const CharacterClass Url::QUERY( |
|||
Url::PCHAR.chars() + "/?"); |
|||
const CharacterClass Url::FRAGMENT( |
|||
Url::PCHAR.chars() + "/?"); |
|||
const CharacterClass Url::USERINFO( |
|||
Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":"); |
|||
const CharacterClass Url::HEX("0123456789ABCDEF"); |
|||
const CharacterClass Url::SCHEME( |
|||
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+-."); |
|||
const std::vector<signed char> Url::HEX_TO_DEC = { |
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
|
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, |
|||
|
|||
-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
|
|||
-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
|
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
|
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
|
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
|
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 |
|||
}; |
|||
const std::unordered_map<std::string, int> Url::PORTS = { |
|||
{"http", 80}, |
|||
{"https", 443} |
|||
}; |
|||
const std::unordered_set<std::string> Url::USES_RELATIVE = { |
|||
"", |
|||
"file", |
|||
"ftp", |
|||
"gopher", |
|||
"http", |
|||
"https", |
|||
"imap", |
|||
"mms", |
|||
"nntp", |
|||
"prospero", |
|||
"rtsp", |
|||
"rtspu", |
|||
"sftp", |
|||
"shttp", |
|||
"svn", |
|||
"svn+ssh", |
|||
"wais" |
|||
}; |
|||
const std::unordered_set<std::string> Url::USES_NETLOC = { |
|||
"", |
|||
"file", |
|||
"ftp", |
|||
"git", |
|||
"git+ssh", |
|||
"gopher", |
|||
"http", |
|||
"https", |
|||
"imap", |
|||
"mms", |
|||
"nfs", |
|||
"nntp", |
|||
"prospero", |
|||
"rsync", |
|||
"rtsp", |
|||
"rtspu", |
|||
"sftp", |
|||
"shttp", |
|||
"snews", |
|||
"svn", |
|||
"svn+ssh", |
|||
"telnet", |
|||
"wais" |
|||
}; |
|||
const std::unordered_set<std::string> Url::USES_PARAMS = { |
|||
"", |
|||
"ftp", |
|||
"hdl", |
|||
"http", |
|||
"https", |
|||
"imap", |
|||
"mms", |
|||
"prospero", |
|||
"rtsp", |
|||
"rtspu", |
|||
"sftp", |
|||
"shttp", |
|||
"sip", |
|||
"sips", |
|||
"tel" |
|||
}; |
|||
const std::unordered_set<std::string> Url::KNOWN_PROTOCOLS = { |
|||
"", |
|||
"file", |
|||
"ftp", |
|||
"git", |
|||
"git+ssh", |
|||
"gopher", |
|||
"hdl", |
|||
"http", |
|||
"https", |
|||
"imap", |
|||
"mms", |
|||
"nfs", |
|||
"nntp", |
|||
"prospero", |
|||
"rsync", |
|||
"rtsp", |
|||
"rtspu", |
|||
"sftp", |
|||
"shttp", |
|||
"sip", |
|||
"sips", |
|||
"sms", |
|||
"snews", |
|||
"svn", |
|||
"svn+ssh", |
|||
"tel", |
|||
"telnet", |
|||
"wais" |
|||
}; |
|||
|
|||
Url::Url(const std::string& url): port_(0), has_params_(false), has_query_(false) |
|||
{ |
|||
size_t position = 0; |
|||
size_t index = url.find(':'); |
|||
if (index != std::string::npos) |
|||
{ |
|||
// All the characters in our would-be scheme must be in SCHEME
|
|||
if (std::all_of( |
|||
url.begin(), |
|||
url.begin() + index, |
|||
[](char c) { return SCHEME(c); } )) |
|||
{ |
|||
// If there is nothing after the : or there are any non-digits, this is
|
|||
// the scheme
|
|||
if ((index + 1) >= url.length() |
|||
|| std::any_of( |
|||
url.begin() + index + 1, |
|||
url.end(), |
|||
[](char c) { return !DIGIT(c); })) |
|||
{ |
|||
scheme_.assign(url, 0, index); |
|||
std::transform( |
|||
scheme_.begin(), scheme_.end(), scheme_.begin(), ::tolower); |
|||
position = index + 1; |
|||
} |
|||
else |
|||
{ |
|||
scheme_.assign(url, 0, index); |
|||
std::transform( |
|||
scheme_.begin(), scheme_.end(), scheme_.begin(), ::tolower); |
|||
if (KNOWN_PROTOCOLS.find(scheme_) != KNOWN_PROTOCOLS.end()) |
|||
{ |
|||
position = index + 1; |
|||
} |
|||
else |
|||
{ |
|||
scheme_.clear(); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
// Search for the netloc
|
|||
if ((url.length() - position) >= 1 |
|||
&& url[position] == '/' |
|||
&& url[position + 1] == '/') |
|||
{ |
|||
// Skip the '//'
|
|||
position += 2; |
|||
index = url.find_first_of("/?#", position); |
|||
host_.assign(url, position, index - position); |
|||
position = index; |
|||
|
|||
// Extract any userinfo if there is any
|
|||
index = host_.find('@'); |
|||
if (index != std::string::npos) |
|||
{ |
|||
userinfo_.assign(host_, 0, index); |
|||
host_.assign(host_, index + 1, std::string::npos); |
|||
} |
|||
|
|||
// Lowercase the hostname
|
|||
std::transform(host_.begin(), host_.end(), host_.begin(), ::tolower); |
|||
|
|||
// Try to find a port
|
|||
index = host_.find(':'); |
|||
if (index != std::string::npos) |
|||
{ |
|||
std::string portText(host_, index + 1, std::string::npos); |
|||
host_.resize(index); |
|||
|
|||
if (portText.empty()) |
|||
{ |
|||
port_ = 0; |
|||
} |
|||
else |
|||
{ |
|||
try |
|||
{ |
|||
port_ = std::stoi(portText, &index); |
|||
|
|||
if (index != portText.length()) |
|||
{ |
|||
// Malformed port
|
|||
throw UrlParseException("Port not a number: " + portText); |
|||
} |
|||
|
|||
if (port_ > 65535) |
|||
{ |
|||
throw UrlParseException("Port too high: " + portText); |
|||
} |
|||
else if (port_ < 0) |
|||
{ |
|||
throw UrlParseException("Port negative: " + portText); |
|||
} |
|||
} |
|||
catch (const std::invalid_argument&) |
|||
{ |
|||
// Malformed port
|
|||
throw UrlParseException("Port not a number: " + portText); |
|||
} |
|||
catch (const std::out_of_range&) |
|||
{ |
|||
throw UrlParseException("Port out of integer range: " + portText); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
if (position != std::string::npos) |
|||
{ |
|||
path_.assign(url, position, std::string::npos); |
|||
|
|||
index = path_.find('#'); |
|||
if (index != std::string::npos) |
|||
{ |
|||
fragment_.assign(path_, index + 1, std::string::npos); |
|||
path_.resize(index); |
|||
} |
|||
|
|||
index = path_.find('?'); |
|||
if (index != std::string::npos) |
|||
{ |
|||
query_.assign(path_, index + 1, std::string::npos); |
|||
has_query_ = true; |
|||
path_.resize(index); |
|||
} |
|||
|
|||
if (USES_PARAMS.find(scheme_) != USES_PARAMS.end()) |
|||
{ |
|||
index = path_.find(';'); |
|||
if (index != std::string::npos) |
|||
{ |
|||
params_.assign(path_, index + 1, std::string::npos); |
|||
has_params_ = true; |
|||
path_.resize(index); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
|
|||
Url& Url::assign(const Url& other) |
|||
{ |
|||
return (*this) = other; |
|||
} |
|||
|
|||
bool Url::operator==(const Url& other) const |
|||
{ |
|||
return ( |
|||
(scheme_ == other.scheme_ ) && |
|||
(userinfo_ == other.userinfo_ ) && |
|||
(host_ == other.host_ ) && |
|||
(port_ == other.port_ ) && |
|||
(path_ == other.path_ ) && |
|||
(params_ == other.params_ ) && |
|||
(query_ == other.query_ ) && |
|||
(fragment_ == other.fragment_ ) && |
|||
(has_params_ == other.has_params_) && |
|||
(has_query_ == other.has_query_ ) |
|||
); |
|||
} |
|||
|
|||
bool Url::operator!=(const Url& other) const |
|||
{ |
|||
return !operator==(other); |
|||
} |
|||
|
|||
bool Url::equiv(const Url& other) |
|||
{ |
|||
Url self_(*this); |
|||
Url other_(other); |
|||
|
|||
self_.strip() |
|||
.sort_query() |
|||
.defrag() |
|||
.deuserinfo() |
|||
.abspath() |
|||
.escape() |
|||
.punycode() |
|||
.remove_default_port(); |
|||
other_.strip() |
|||
.sort_query() |
|||
.defrag() |
|||
.deuserinfo() |
|||
.abspath() |
|||
.escape() |
|||
.punycode() |
|||
.remove_default_port(); |
|||
return self_ == other_; |
|||
} |
|||
|
|||
std::string& Url::remove_repeats(std::string& str, const char chr) |
|||
{ |
|||
size_t dest = 0; |
|||
// By initializing this to true, it also strips of leading instances of chr
|
|||
bool seen = true; |
|||
for (size_t src = 0; src < str.length(); ++src) |
|||
{ |
|||
if (!seen || (str[src] != chr)) |
|||
{ |
|||
str[dest++] = str[src]; |
|||
} |
|||
seen = str[src] == chr; |
|||
} |
|||
// Remove the last character if it happens to be chr
|
|||
size_t length = ((dest > 0) && (str[dest - 1] == chr)) ? dest - 1 : dest; |
|||
str.resize(length); |
|||
return str; |
|||
} |
|||
|
|||
std::string Url::fullpath() const |
|||
{ |
|||
std::string result; |
|||
if (path_.empty() || path_[0] != '/') |
|||
{ |
|||
result.append(1, '/'); |
|||
} |
|||
result.append(path_); |
|||
|
|||
if (has_params_) |
|||
{ |
|||
result.append(";"); |
|||
result.append(params_); |
|||
} |
|||
|
|||
if (has_query_) |
|||
{ |
|||
result.append("?"); |
|||
result.append(query_); |
|||
} |
|||
|
|||
if (!fragment_.empty()) |
|||
{ |
|||
result.append("#"); |
|||
result.append(fragment_); |
|||
} |
|||
return result; |
|||
} |
|||
|
|||
std::string Url::str() const |
|||
{ |
|||
std::string result; |
|||
|
|||
if (!scheme_.empty()) |
|||
{ |
|||
result.append(scheme_); |
|||
if (USES_NETLOC.find(scheme_) == USES_NETLOC.end()) |
|||
{ |
|||
result.append(":"); |
|||
} |
|||
else |
|||
{ |
|||
result.append("://"); |
|||
} |
|||
} |
|||
else if (!host_.empty()) |
|||
{ |
|||
result.append("//"); |
|||
} |
|||
|
|||
if (!userinfo_.empty()) |
|||
{ |
|||
result.append(userinfo_); |
|||
result.append("@"); |
|||
} |
|||
|
|||
if (!host_.empty()) |
|||
{ |
|||
result.append(host_); |
|||
} |
|||
|
|||
if (port_) |
|||
{ |
|||
result.append(":"); |
|||
result.append(std::to_string(port_)); |
|||
} |
|||
|
|||
if (path_.empty()) |
|||
{ |
|||
if (!result.empty()) |
|||
{ |
|||
result.append("/"); |
|||
} |
|||
} |
|||
else |
|||
{ |
|||
if (!host_.empty() && path_[0] != '/') |
|||
{ |
|||
result.append(1, '/'); |
|||
} |
|||
result.append(path_); |
|||
} |
|||
|
|||
if (has_params_) |
|||
{ |
|||
result.append(";"); |
|||
result.append(params_); |
|||
} |
|||
|
|||
if (has_query_) |
|||
{ |
|||
result.append("?"); |
|||
result.append(query_); |
|||
} |
|||
|
|||
if (!fragment_.empty()) |
|||
{ |
|||
result.append("#"); |
|||
result.append(fragment_); |
|||
} |
|||
|
|||
return result; |
|||
} |
|||
|
|||
Url& Url::strip() |
|||
{ |
|||
size_t start = query_.find_first_not_of('?'); |
|||
if (start != std::string::npos) |
|||
{ |
|||
query_.assign(query_, start, std::string::npos); |
|||
} |
|||
else |
|||
{ |
|||
query_.assign(""); |
|||
} |
|||
setQuery(remove_repeats(query_, '&')); |
|||
setParams(remove_repeats(params_, ';')); |
|||
return *this; |
|||
} |
|||
|
|||
/**
 * Normalize the path in place: collapse empty segments, drop "." segments
 * and resolve ".." segments against the segments kept so far.
 *
 * A ".." that has nothing left to remove is ignored. The leading '/' of an
 * absolute path is never removed, so "/../a" becomes "/a" (it previously
 * degraded to the relative path "a"). An empty result is rendered as "/".
 */
Url& Url::abspath()
{
    std::string copy;
    // Offsets into `copy` at which each retained segment starts, so that a
    // ".." can truncate `copy` back to the start of the previous segment.
    std::vector<size_t> segment_starts;

    // Keep the root slash, but do not record it as a segment: ".." must not
    // be able to pop it and turn an absolute path into a relative one.
    if (!path_.empty() && path_[0] == '/')
    {
        copy.append(1, '/');
    }

    // Whether the most recently handled segment denotes a directory
    bool directory = false;
    size_t previous = 0;
    size_t index = 0;
    for (index = path_.find('/')
        ; index != std::string::npos
        ; previous = index + 1, index = path_.find('/', index + 1))
    {
        const size_t length = index - previous;

        // Skip empty segments
        if (length == 0)
        {
            continue;
        }

        if (length == 2 && path_[previous] == '.' && path_[previous + 1] == '.')
        {
            if (!segment_starts.empty())
            {
                copy.resize(segment_starts.back());
                segment_starts.pop_back();
            }
            directory = true;
        }
        else if (length == 1 && path_[previous] == '.')
        {
            directory = true;
        }
        else
        {
            segment_starts.push_back(copy.length());
            copy.append(path_, previous, length);
            copy.append(1, '/');
            directory = false;
        }
    }

    // Handle the last segment (everything after the final '/')
    const size_t last_length = path_.length() - previous;
    if (last_length == 0)
    {
        directory = true;
    }
    else if (last_length == 1 && path_[previous] == '.')
    {
        directory = true;
    }
    else if (last_length == 2
        && path_[previous] == '.'
        && path_[previous + 1] == '.')
    {
        if (!segment_starts.empty())
        {
            copy.resize(segment_starts.back());
        }
        directory = true;
    }
    else
    {
        copy.append(path_, previous, last_length);
        copy.append(1, '/');
        directory = false;
    }

    if (!directory && !copy.empty())
    {
        // The final segment is a file: remove the '/' appended after it
        copy.resize(copy.size() - 1);
    }
    else if (directory && copy.empty())
    {
        copy.append(1, '/');
    }
    path_.assign(copy);

    return *this;
}
|||
|
|||
/**
 * Resolve this URL against `other`, storing the result in this object.
 *
 * Missing scheme, host, port and userinfo are taken from `other`; a relative
 * path is joined onto the directory portion of `other`'s path.
 */
Url& Url::relative_to(const Url& other)
{
    // Schemes that do not support relative resolution are left untouched
    if (USES_RELATIVE.count(scheme_) == 0)
    {
        return *this;
    }

    // Scheme-relative URLs inherit the base scheme
    if (scheme_.empty())
    {
        scheme_ = other.scheme_;
    }

    // A URL that carries its own host is already absolute
    if (!host_.empty())
    {
        return *this;
    }

    // Otherwise the network location comes from the base URL
    host_ = other.host_;
    port_ = other.port_;
    userinfo_ = other.userinfo_;

    // An absolute path needs no further resolution
    if (!path_.empty() && path_[0] == '/')
    {
        return *this;
    }

    if (!path_.empty())
    {
        // Relative path: prefix it with the base path up to its last '/'
        const size_t slash = other.path_.rfind('/');
        if (slash != std::string::npos)
        {
            path_ = other.path_.substr(0, slash + 1) + path_;
        }
        else if (!host_.empty())
        {
            path_ = "/" + path_;
        }
        return *this;
    }

    // Empty path from here on
    if (!params_.empty())
    {
        // Own params are kept; only the base directory is borrowed. When the
        // base path has no '/', rfind gives npos and npos + 1 wraps to 0,
        // leaving the path empty.
        path_.assign(other.path_, 0, other.path_.rfind('/') + 1);
    }
    else
    {
        path_ = other.path_;
        params_ = other.params_;
        has_params_ = other.has_params_;
        if (query_.empty())
        {
            query_ = other.query_;
            has_query_ = other.has_query_;
        }
    }

    if (fragment_.empty())
    {
        fragment_ = other.fragment_;
    }

    return *this;
}
|||
|
|||
/**
 * Escape each escapable component in place using the character class that
 * applies to it. Params are escaped with the QUERY class.
 */
Url& Url::escape(bool strict)
{
    escape(path_,     PATH,     strict);
    escape(query_,    QUERY,    strict);
    escape(params_,   QUERY,    strict);
    escape(userinfo_, USERINFO, strict);

    return *this;
}
|||
|
|||
std::string& Url::escape(std::string& str, const CharacterClass& safe, bool strict) |
|||
{ |
|||
std::string copy(str); |
|||
size_t dest = 0; |
|||
// Allocate space pessimistically -- if every entity is expanded, it will take 3x
|
|||
// the space.
|
|||
str.resize(str.length() * 3); |
|||
for (size_t src = 0; src < copy.length(); ++src) |
|||
{ |
|||
if (copy[src] == '%' && (copy.length() - src) >= 2) |
|||
{ |
|||
// Read ahead to see if there's a valid escape sequence. If not, treat
|
|||
// this like a normal character.
|
|||
if (HEX_TO_DEC[copy[src+1]] != -1 && HEX_TO_DEC[copy[src+2]] != -1) |
|||
{ |
|||
int value = ( |
|||
HEX_TO_DEC[copy[src+1]] * 16 + HEX_TO_DEC[copy[src+2]]); |
|||
|
|||
// In strict mode, we can only unescape parameters if they are both
|
|||
// safe and node reserved
|
|||
if (!strict || (strict && safe(value) && !RESERVED(value))) |
|||
{ |
|||
// Replace src + 2 with that byte, advance src to consume it and
|
|||
// continue.
|
|||
src += 2; |
|||
copy[src] = value; |
|||
} |
|||
else |
|||
{ |
|||
str[dest++] = copy[src++]; |
|||
str[dest++] = ::toupper(copy[src++]); |
|||
str[dest++] = ::toupper(copy[src]); |
|||
continue; |
|||
} |
|||
} |
|||
} |
|||
|
|||
if (!safe(copy[src])) |
|||
{ |
|||
// Not safe -- replace with %XX
|
|||
str[dest++] = '%'; |
|||
str[dest++] = HEX.chars()[(copy[src] >> 4) & 0xF]; |
|||
str[dest++] = HEX.chars()[copy[src] & 0xF]; |
|||
} |
|||
else |
|||
{ |
|||
str[dest++] = copy[src]; |
|||
} |
|||
} |
|||
str.resize(dest); |
|||
return str; |
|||
} |
|||
|
|||
/**
 * Decode percent-escapes in the path, query, params and userinfo in place.
 */
Url& Url::unescape()
{
    unescape(path_);
    unescape(query_);
    unescape(params_);
    unescape(userinfo_);

    return *this;
}
|||
|
|||
std::string& Url::unescape(std::string& str) |
|||
{ |
|||
std::string copy(str); |
|||
size_t dest = 0; |
|||
for (size_t src = 0; src < copy.length(); ++src, ++dest) |
|||
{ |
|||
if (copy[src] == '%' && (copy.length() - src) >= 2) |
|||
{ |
|||
// Read ahead to see if there's a valid escape sequence. If not, treat
|
|||
// this like a normal character.
|
|||
if (HEX_TO_DEC[copy[src+1]] != -1 && HEX_TO_DEC[copy[src+2]] != -1) |
|||
{ |
|||
int value = ( |
|||
HEX_TO_DEC[copy[src+1]] * 16 + HEX_TO_DEC[copy[src+2]]); |
|||
|
|||
// Replace src + 2 with that byte, advance src to consume it and
|
|||
// continue.
|
|||
src += 2; |
|||
str[dest] = value; |
|||
continue; |
|||
} |
|||
} |
|||
|
|||
// Either not a % or an incomplete entity
|
|||
str[dest] = copy[src]; |
|||
} |
|||
str.resize(dest); |
|||
return str; |
|||
} |
|||
|
|||
/**
 * Remove every query and params entry whose lowercased name appears in
 * `blacklist`.
 */
Url& Url::deparam(const std::unordered_set<std::string>& blacklist)
{
    // Capture the blacklist by reference and build the std::function once:
    // the predicate never outlives this call, so copying the set (on capture
    // and on every conversion to deparam_predicate) is wasted work.
    const deparam_predicate predicate =
        [&blacklist](std::string& name, std::string&)
        {
            // Go through unsigned char: tolower on a negative char is undefined
            std::transform(name.begin(), name.end(), name.begin(),
                [](unsigned char c) { return static_cast<char>(::tolower(c)); });
            return blacklist.find(name) != blacklist.end();
        };

    setQuery(remove_params(query_, predicate, '&'));
    setParams(remove_params(params_, predicate, ';'));
    return *this;
}
|||
|
|||
/**
 * Remove every query ('&'-separated) and params (';'-separated) entry for
 * which `predicate` returns true.
 */
Url& Url::deparam(const deparam_predicate& predicate)
{
    std::string& filtered_query = remove_params(query_, predicate, '&');
    setQuery(filtered_query);

    std::string& filtered_params = remove_params(params_, predicate, ';');
    setParams(filtered_params);

    return *this;
}
|||
|
|||
std::string& Url::remove_params(std::string& str, |
|||
const deparam_predicate& predicate, |
|||
char sep) |
|||
{ |
|||
std::string copy; |
|||
std::string piece; |
|||
std::string name; |
|||
std::string value; |
|||
size_t previous = 0; |
|||
for (size_t index = str.find(sep) |
|||
; index != std::string::npos |
|||
; previous = index + 1, index = str.find(sep, previous)) |
|||
{ |
|||
piece.assign(str, previous, index - previous); |
|||
size_t position = piece.find('='); |
|||
name.assign(piece, 0, position); |
|||
value.clear(); |
|||
if (position != std::string::npos) |
|||
{ |
|||
value.assign(piece, position + 1, std::string::npos); |
|||
} |
|||
|
|||
if (!predicate(name, value)) |
|||
{ |
|||
copy.append(copy.empty() ? 0 : 1, sep); |
|||
copy.append(piece); |
|||
} |
|||
} |
|||
|
|||
if (previous < str.length()) |
|||
{ |
|||
piece.assign(str, previous, std::string::npos); |
|||
size_t position = piece.find('='); |
|||
name.assign(piece, 0, position); |
|||
value.clear(); |
|||
if (position != std::string::npos) |
|||
{ |
|||
value.assign(piece, position + 1, std::string::npos); |
|||
} |
|||
|
|||
if (!predicate(name, value)) |
|||
{ |
|||
copy.append(copy.empty() ? 0 : 1, sep); |
|||
copy.append(piece); |
|||
} |
|||
} |
|||
|
|||
str.assign(copy); |
|||
return str; |
|||
} |
|||
|
|||
/**
 * Sort the '&'-separated query entries and the ';'-separated params entries
 * in place.
 */
Url& Url::sort_query()
{
    split_sort_join(query_,  '&');
    split_sort_join(params_, ';');

    return *this;
}
|||
|
|||
/**
 * Split `str` on `glue`, sort the pieces, and join them back with `glue`,
 * writing the result into `str`. Empty strings and strings that split into
 * a single piece are returned unchanged.
 */
std::string& Url::split_sort_join(std::string& str, const char glue)
{
    // Nothing to do for an empty string
    if (str.empty())
    {
        return str;
    }

    // Split
    std::vector<std::string> pieces;
    std::stringstream input(str);
    for (std::string item; std::getline(input, item, glue); )
    {
        pieces.push_back(item);
    }

    // A single piece is already sorted; leave the string as it is
    if (pieces.size() == 1)
    {
        return str;
    }

    // Sort
    std::sort(pieces.begin(), pieces.end());

    // Join, placing the glue between consecutive pieces only
    std::string joined;
    for (size_t i = 0; i < pieces.size(); ++i)
    {
        if (i != 0)
        {
            joined += glue;
        }
        joined += pieces[i];
    }

    str.assign(joined);
    return str;
}
|||
|
|||
/**
 * Reset the port to 0 when it equals the PORTS entry for the current scheme.
 */
Url& Url::remove_default_port()
{
    // Without a port or a scheme there is nothing to compare
    if (port_ == 0 || scheme_.empty())
    {
        return *this;
    }

    const auto known = PORTS.find(scheme_);
    if (known != PORTS.end() && known->second == port_)
    {
        port_ = 0;
    }
    return *this;
}
|||
|
|||
/**
 * Discard the userinfo component.
 */
Url& Url::deuserinfo()
{
    return setUserinfo(std::string());
}
|||
|
|||
/**
 * Discard the fragment component.
 */
Url& Url::defrag()
{
    return setFragment(std::string());
}
|||
|
|||
/**
 * Replace the host with its punycode-encoded form. The host is checked with
 * check_hostname both before and after encoding.
 */
Url& Url::punycode()
{
    check_hostname(host_);

    std::string encoded(Punycode::encodeHostname(host_));
    check_hostname(encoded);

    // Only commit once the encoded form has passed the check
    host_.swap(encoded);
    return *this;
}
|||
|
|||
/**
 * Replace the host with the result of Punycode::decodeHostname.
 */
Url& Url::unpunycode()
{
    return setHost(Punycode::decodeHostname(host_));
}
|||
|
|||
/**
 * Reverse the order of the dot-separated labels of the host in place
 * (a.b.c.d => d.c.b.a).
 */
Url& Url::host_reversed()
{
    // Reversing the whole string puts the labels in the right order but
    // spells each one backwards; reversing each label again restores it.
    std::reverse(host_.begin(), host_.end());

    std::string::iterator label = host_.begin();
    while (label != host_.end())
    {
        const std::string::iterator dot = std::find(label, host_.end(), '.');
        std::reverse(label, dot);
        if (dot == host_.end())
        {
            break;
        }
        label = dot + 1;
    }
    return *this;
}
|||
|
|||
/**
 * Validate the dot-separated labels of `host`.
 *
 * Throws std::invalid_argument if a label is longer than 63 characters or
 * if a label followed by a '.' is empty. A single trailing '.' is removed
 * from `host`. Empty hostnames are accepted as they are.
 */
void Url::check_hostname(std::string& host)
{
    // Skip empty hostnames -- they are valid
    if (host.empty())
    {
        return;
    }

    size_t start = 0;
    for (size_t dot = host.find('.')
        ; dot != std::string::npos
        ; dot = host.find('.', start))
    {
        const size_t length = dot - start;
        if (length > 63)
        {
            throw std::invalid_argument("Label too long.");
        }
        if (length == 0)
        {
            throw std::invalid_argument("Empty label.");
        }
        start = dot + 1;
    }

    // For the final segment
    const size_t remaining = host.size() - start;
    if (remaining > 63)
    {
        throw std::invalid_argument("Label too long.");
    }
    if (remaining == 0 && start > 1)
    {
        // Remove a trailing empty segment
        host.resize(start - 1);
    }
}
|||
|
|||
}; |
@ -0,0 +1,323 @@ |
|||
#ifndef URL_CPP_H |
|||
#define URL_CPP_H |
|||
|
|||
#include <stdexcept> |
|||
#include <functional> |
|||
#include <string> |
|||
#include <vector> |
|||
#include <unordered_map> |
|||
#include <unordered_set> |
|||
|
|||
namespace Url |
|||
{ |
|||
|
|||
/**
 * Exception type for URL parse failures; carries the message it is
 * constructed with.
 */
struct UrlParseException : public std::logic_error
{
    UrlParseException(const std::string& message)
        : std::logic_error(message)
    {
    }
};
|||
|
|||
/**
 * A set of byte values with constant-time membership tests.
 *
 * Built from the characters of a string; `operator()` reports whether a
 * character belongs to the set and `chars()` returns the original string.
 */
struct CharacterClass
{
    CharacterClass(const std::string& chars) : chars_(chars), map_(256, false)
    {
        for (auto it = chars_.begin(); it != chars_.end(); ++it)
        {
            // Index through unsigned char, exactly as operator() does: a
            // negative plain char would otherwise convert to a huge index
            // and write outside the 256-entry table.
            map_[static_cast<unsigned char>(*it)] = true;
        }
    }

    /**
     * Whether `c` is a member of this class.
     */
    bool operator()(char c) const
    {
        return map_[static_cast<unsigned char>(c)];
    }

    /**
     * The characters this class was built from, in their original order.
     */
    const std::string& chars() const
    {
        return chars_;
    }

private:
    // Private, unimplemented to prevent use
    CharacterClass();
    CharacterClass(const CharacterClass& other);

    std::string chars_;
    // One flag per possible byte value
    std::vector<bool> map_;
};
|||
|
|||
struct Url |
|||
{ |
|||
/* Character classes */ |
|||
const static CharacterClass GEN_DELIMS; |
|||
const static CharacterClass SUB_DELIMS; |
|||
const static CharacterClass ALPHA; |
|||
const static CharacterClass DIGIT; |
|||
const static CharacterClass UNRESERVED; |
|||
const static CharacterClass RESERVED; |
|||
const static CharacterClass PCHAR; |
|||
const static CharacterClass PATH; |
|||
const static CharacterClass QUERY; |
|||
const static CharacterClass FRAGMENT; |
|||
const static CharacterClass USERINFO; |
|||
const static CharacterClass HEX; |
|||
const static CharacterClass SCHEME; |
|||
const static std::vector<signed char> HEX_TO_DEC; |
|||
const static std::unordered_map<std::string, int> PORTS; |
|||
const static std::unordered_set<std::string> USES_RELATIVE; |
|||
const static std::unordered_set<std::string> USES_NETLOC; |
|||
const static std::unordered_set<std::string> USES_PARAMS; |
|||
const static std::unordered_set<std::string> KNOWN_PROTOCOLS; |
|||
|
|||
// The type of the predicate used for removing parameters
|
|||
typedef std::function<bool(std::string&, std::string&)> deparam_predicate; |
|||
|
|||
explicit Url(const std::string& url); |
|||
|
|||
Url(const Url& other) |
|||
: scheme_(other.scheme_) |
|||
, host_(other.host_) |
|||
, port_(other.port_) |
|||
, path_(other.path_) |
|||
, params_(other.params_) |
|||
, query_(other.query_) |
|||
, fragment_(other.fragment_) |
|||
, userinfo_(other.userinfo_) |
|||
, has_params_(other.has_params_) |
|||
, has_query_(other.has_query_) { } |
|||
|
|||
/**
|
|||
* Take on the value of the other URL. |
|||
*/ |
|||
Url& assign(const Url& other); |
|||
|
|||
/**
|
|||
* To be considered equal, all fields must be equal. |
|||
*/ |
|||
bool operator==(const Url& other) const; |
|||
bool operator!=(const Url& other) const; |
|||
|
|||
/**
|
|||
* Two URLs are considered equivalent if they have the same meaning. |
|||
*/ |
|||
bool equiv(const Url& other); |
|||
|
|||
/**************************************
|
|||
* Component-wise access and setting. * |
|||
**************************************/ |
|||
const std::string& scheme() const { return scheme_; } |
|||
Url& setScheme(const std::string& s) |
|||
{ |
|||
scheme_ = s; |
|||
return *this; |
|||
} |
|||
|
|||
const std::string& host() const { return host_; } |
|||
Url& setHost(const std::string& s) |
|||
{ |
|||
host_ = s; |
|||
return *this; |
|||
} |
|||
|
|||
const int port() const { return port_; } |
|||
Url& setPort(int i) |
|||
{ |
|||
port_ = i; |
|||
return *this; |
|||
} |
|||
|
|||
const std::string& path() const { return path_; } |
|||
Url& setPath(const std::string& s) |
|||
{ |
|||
path_ = s; |
|||
return *this; |
|||
} |
|||
|
|||
const std::string& params() const { return params_; } |
|||
Url& setParams(const std::string& s) |
|||
{ |
|||
params_ = s; |
|||
has_params_ = !s.empty(); |
|||
return *this; |
|||
} |
|||
|
|||
const std::string& query() const { return query_; } |
|||
Url& setQuery(const std::string& s) |
|||
{ |
|||
query_ = s; |
|||
has_query_ = !s.empty(); |
|||
return *this; |
|||
} |
|||
|
|||
const std::string& fragment() const { return fragment_; } |
|||
Url& setFragment(const std::string& s) |
|||
{ |
|||
fragment_ = s; |
|||
return *this; |
|||
} |
|||
|
|||
const std::string& userinfo() const { return userinfo_; } |
|||
Url& setUserinfo(const std::string& s) |
|||
{ |
|||
userinfo_ = s; |
|||
return *this; |
|||
} |
|||
|
|||
/**
|
|||
* Get a representation of all components of the path, params, query, fragment. |
|||
* |
|||
* Always includes a leading /. |
|||
*/ |
|||
std::string fullpath() const; |
|||
|
|||
/**
|
|||
* Get a new string representation of the URL. |
|||
**/ |
|||
std::string str() const; |
|||
|
|||
/*********************
|
|||
* Chainable methods * |
|||
*********************/ |
|||
|
|||
/**
|
|||
* Strip semantically meaningless excess '?', '&', and ';' characters from query |
|||
* and params. |
|||
*/ |
|||
Url& strip(); |
|||
|
|||
/**
|
|||
* Make the path absolute. |
|||
* |
|||
* Evaluate '.', '..', and excessive slashes. |
|||
*/ |
|||
Url& abspath(); |
|||
|
|||
/**
|
|||
* Evaluate this URL relative fo `other`, placing the result in this object. |
|||
*/ |
|||
Url& relative_to(const std::string& other) |
|||
{ |
|||
return relative_to(Url(other)); |
|||
} |
|||
|
|||
/**
|
|||
* Evaluate this URL relative fo `other`, placing the result in this object. |
|||
*/ |
|||
Url& relative_to(const Url& other); |
|||
|
|||
/**
|
|||
* Ensure that the path, params, query, and userinfo are properly escaped. |
|||
* |
|||
* In 'strict' mode, only entities that are both safe and not reserved characters |
|||
* are unescaped. In non-strict mode, entities that are safe are unescaped. |
|||
*/ |
|||
Url& escape(bool strict=false); |
|||
|
|||
/**
|
|||
* Unescape all entities in the path, params, query, and userinfo. |
|||
*/ |
|||
Url& unescape(); |
|||
|
|||
/**
|
|||
* Remove any params or queries that appear in the blacklist. |
|||
* |
|||
* The blacklist should contain only lowercased strings, and the comparison is |
|||
* done in a case-insensitive way. |
|||
*/ |
|||
Url& deparam(const std::unordered_set<std::string>& blacklist); |
|||
|
|||
/**
|
|||
* Filter params subject to a predicate for whether it should be filtered. |
|||
* |
|||
* The predicate must accept two string refs -- the key and value (which may be |
|||
* empty). Return `true` if the parameter should be removed, and `false` |
|||
* otherwise. |
|||
*/ |
|||
Url& deparam(const deparam_predicate& predicate); |
|||
|
|||
/**
|
|||
* Put queries and params in sorted order. |
|||
* |
|||
* To ensure consistent comparisons, escape should be called beforehand. |
|||
*/ |
|||
Url& sort_query(); |
|||
|
|||
/**
|
|||
* Remove the port if it's the default for the scheme. |
|||
*/ |
|||
Url& remove_default_port(); |
|||
|
|||
/**
|
|||
* Remove the userinfo portion. |
|||
*/ |
|||
Url& deuserinfo(); |
|||
|
|||
/**
|
|||
* Remove the fragment. |
|||
*/ |
|||
Url& defrag(); |
|||
|
|||
/**
|
|||
* Punycode the hostname. |
|||
*/ |
|||
Url& punycode(); |
|||
|
|||
/**
|
|||
* Unpunycode the hostname. |
|||
*/ |
|||
Url& unpunycode(); |
|||
|
|||
/**
|
|||
* Reverse the hostname (a.b.c.d => d.c.b.a) |
|||
*/ |
|||
Url& host_reversed(); |
|||
|
|||
private: |
|||
// Private, unimplemented to prevent use.
|
|||
Url(); |
|||
|
|||
/**
|
|||
* Remove repeated, leading, and trailing instances of chr from the string. |
|||
*/ |
|||
std::string& remove_repeats(std::string& str, const char chr); |
|||
|
|||
/**
|
|||
* Ensure all the provided characters are escaped if necessary |
|||
*/ |
|||
std::string& escape(std::string& str, const CharacterClass& safe, bool strict); |
|||
|
|||
/**
|
|||
* Unescape entities in the provided string |
|||
*/ |
|||
std::string& unescape(std::string& str); |
|||
|
|||
/**
|
|||
* Remove any params that match entries in the blacklist. |
|||
*/ |
|||
std::string& remove_params( |
|||
std::string& str, const deparam_predicate& pred, char sep); |
|||
|
|||
/**
|
|||
* Split the provided string by char, sort, join by char. |
|||
*/ |
|||
std::string& split_sort_join(std::string& str, const char glue); |
|||
|
|||
/**
|
|||
* Check that the hostname is valid, removing an optional trailing '.'. |
|||
*/ |
|||
void check_hostname(std::string& host); |
|||
|
|||
std::string scheme_; |
|||
std::string host_; |
|||
int port_; |
|||
std::string path_; |
|||
std::string params_; |
|||
std::string query_; |
|||
std::string fragment_; |
|||
std::string userinfo_; |
|||
bool has_params_; |
|||
bool has_query_; |
|||
}; |
|||
|
|||
} |
|||
|
|||
#endif |
@ -0,0 +1,150 @@ |
|||
#include <algorithm> |
|||
#include <string> |
|||
#include <iostream> |
|||
|
|||
#include "utf8.h" |
|||
|
|||
namespace Url |
|||
{ |
|||
|
|||
/**
 * Decode one UTF-8 sequence starting at `it` and return its codepoint,
 * leaving `it` just past the bytes that were consumed.
 *
 * Throws std::invalid_argument for a start byte outside the accepted
 * ranges, a sequence cut short by `end`, or a malformed continuation byte.
 */
Utf8::codepoint_t Utf8::readCodepoint(
    std::string::const_iterator& it, const std::string::const_iterator& end)
{
    const Utf8::char_t lead = static_cast<Utf8::char_t>(*it++);

    // Single-byte (ASCII) sequence
    if ((lead & 0x80) == 0)
    {
        return lead;
    }

    if (lead < 0xC0)
    {
        // A continuation byte cannot start a sequence
        throw std::invalid_argument("Low UTF-8 start byte");
    }
    if (lead >= 0xF8)
    {
        throw std::invalid_argument("High UTF-8 start byte");
    }

    // Number of continuation bytes still to read, and the payload bits
    // contributed by the start byte
    unsigned int remaining = 0;
    Utf8::codepoint_t result = 0;
    if (lead < 0xE0)
    {
        // Two bytes total, 5 payload bits in the start byte
        remaining = 1;
        result = lead & 0x1F;
    }
    else if (lead < 0xF0)
    {
        // Three bytes total, 4 payload bits in the start byte
        remaining = 2;
        result = lead & 0x0F;
    }
    else
    {
        // Four bytes total, 3 payload bits in the start byte
        remaining = 3;
        result = lead & 0x07;
    }

    while (remaining > 0)
    {
        if (it == end)
        {
            throw std::invalid_argument("UTF-8 sequence terminated early.");
        }

        const Utf8::char_t next = static_cast<unsigned char>(*it++);
        // Continuation bytes must have the form 10xxxxxx
        if ((next & 0xC0) != 0x80)
        {
            throw std::invalid_argument("Invalid continuation byte");
        }
        result = (result << 6) | (next & 0x3F);
        --remaining;
    }

    return result;
}
|||
|
|||
std::string& Utf8::writeCodepoint(std::string& str, Utf8::codepoint_t value) |
|||
{ |
|||
if (value > MAX_CODEPOINT) |
|||
{ |
|||
throw std::invalid_argument("Code point too high."); |
|||
} |
|||
else if (value <= 0x007F) |
|||
{ |
|||
// Just append the character itself
|
|||
str.append(1, static_cast<char>(value)); |
|||
return str; |
|||
} |
|||
|
|||
unsigned int bytes = 0; |
|||
if (value > 0xFFFF) |
|||
{ |
|||
/**
|
|||
* 11110xxx + 3 bytes for 21 bits total |
|||
* |
|||
* We need to take bits 20-18, which 0x1C0000 masks out. These form the least |
|||
* significant bits of this byte (so we shift them back down by 18). The 5 |
|||
* most significant bits of this byte are 11110, so we OR this result with |
|||
* 0xF0 to get this first byte. |
|||
* |
|||
* The remaining bits will be consumed from the most-significant end and so |
|||
* they must be shifted up by (32 - 18) = 14. |
|||
*/ |
|||
str.append(1, static_cast<char>(((value & 0x1C0000) >> 18) | 0xF0)); |
|||
bytes = 3; |
|||
value <<= 14; |
|||
} |
|||
else if (value > 0x07FF) |
|||
{ |
|||
/**
|
|||
* 1110xxxx + 2 bytes for 16 bits total |
|||
* |
|||
* We need to take bits 15-12, which 0xF000 masks out. These form the least |
|||
* significant bits of this byte (so we shift them back down by 12). The 4 |
|||
* most significant bits of this byte are 1110, so we OR this result with |
|||
* 0xE0 to get this first byte. |
|||
* |
|||
* The remaining bits will be consumed from the most-significant end and so |
|||
* they must be shifted up by (32 - 12) = 20. |
|||
*/ |
|||
str.append(1, static_cast<char>(((value & 0xF000) >> 12) | 0xE0)); |
|||
bytes = 2; |
|||
value <<= 20; |
|||
} |
|||
else |
|||
{ |
|||
/**
|
|||
* 110xxxxx + 1 byte for 11 bits total |
|||
* |
|||
* We need to take bits 10-6, which 0x7C0 masks out. These form the least |
|||
* significant bits of this byte (so we shift them back down by 6). The 3 |
|||
* most significant bits of this byte are 110, so we OR this result with |
|||
* 0xC0 to get this first byte. |
|||
* |
|||
* The remaining bits will be consumed from the most-significant end and so |
|||
* they must be shifted up by (32 - 6) = 26. |
|||
*/ |
|||
str.append(1, static_cast<char>(((value & 0x7C0) >> 6) | 0xC0)); |
|||
bytes = 1; |
|||
value <<= 26; |
|||
} |
|||
|
|||
/**
|
|||
* The remaining bits are to be consumed 6 at a time from the most-significant |
|||
* end. The mask 0xFC000000 grabs these six bits, which then must be shifted down |
|||
* by 26, and OR'd with 0x80 to produce the continuation byte. |
|||
*/ |
|||
for (; bytes > 0; --bytes, value <<= 6) |
|||
{ |
|||
str.append(1, static_cast<char>(((value & 0xFC000000) >> 26) | 0x80)); |
|||
} |
|||
|
|||
return str; |
|||
} |
|||
|
|||
}; |
@ -0,0 +1,91 @@ |
|||
#ifndef UTF8_CPP_H |
|||
#define UTF8_CPP_H |
|||
|
|||
#include <stdexcept> |
|||
#include <string> |
|||
#include <vector> |
|||
|
|||
namespace Url |
|||
{ |
|||
|
|||
/**
 * Conversions between Unicode codepoints and their UTF-8 byte sequences.
 */
struct Utf8
{
    /**
     * Integral type holding a single Unicode codepoint.
     */
    using codepoint_t = uint32_t;

    /**
     * Integral type used for the numeric value of one byte.
     */
    using char_t = unsigned char;

    /**
     * The largest codepoint that may be written.
     */
    static const codepoint_t MAX_CODEPOINT = 0x10FFFF;

    /**
     * Decode the sequence starting at `it`, advancing `it` past it, and
     * return the codepoint.
     */
    static codepoint_t readCodepoint(
        std::string::const_iterator& it, const std::string::const_iterator& end);

    /**
     * Append the encoding of a codepoint to the provided string.
     */
    static std::string& writeCodepoint(std::string& str, codepoint_t value);

    /**
     * Decode only the first codepoint of the provided string.
     */
    static codepoint_t toCodepoint(const std::string& str)
    {
        std::string::const_iterator cursor = str.begin();
        return readCodepoint(cursor, str.end());
    }

    /**
     * Build a string holding the encoding of one codepoint.
     */
    static std::string fromCodepoint(codepoint_t value)
    {
        std::string encoded;
        writeCodepoint(encoded, value);
        return encoded;
    }

    /**
     * Decode every codepoint in the string, in order.
     */
    static std::vector<codepoint_t> toCodepoints(const std::string& str)
    {
        std::vector<codepoint_t> points;
        std::string::const_iterator cursor = str.begin();
        // readCodepoint advances the cursor itself
        while (cursor != str.end())
        {
            points.push_back(readCodepoint(cursor, str.end()));
        }
        return points;
    }

    /**
     * Encode a sequence of codepoints into one string.
     */
    static std::string fromCodepoints(const std::vector<codepoint_t>& points)
    {
        std::string encoded;
        for (const codepoint_t point : points)
        {
            writeCodepoint(encoded, point);
        }
        return encoded;
    }

};
|||
|
|||
} |
|||
|
|||
#endif |
@ -0,0 +1,3 @@ |
|||
# Test entry script: attach testthat and robotstxt, then run the test suite
# registered under the name "rep".
# NOTE(review): presumably "rep" is this package's own name, while robotstxt is
# the package attached here -- confirm that attaching robotstxt is intended.
library(testthat) |
|||
library(robotstxt) |
|||
test_check("rep") |
@ -0,0 +1,11 @@ |
|||
context("basic functionality")

# Parses a robots.txt retrieved from a live site and checks one path that is
# expected to be fetchable and one that is expected not to be, for user agent "*".
# Uses direct expectations instead of the superseded expect_that()/is_a()/equals()
# style.
test_that("we can do something", {

  rt <- robxp(robotstxt::get_robotstxt("https://cdc.gov"))

  expect_true(inherits(rt, "robxp"))

  expect_true(can_fetch(rt, "/asthma/asthma_stats/default.htm", "*"))
  expect_false(can_fetch(rt, "/_borders", "*"))

})
Loading…
Reference in new issue