boB Rudis
7 years ago
commit
878bb7f045
39 changed files with 3445 additions and 0 deletions
@ -0,0 +1,11 @@ |
|||||
|
^.*\.Rproj$ |
||||
|
^\.Rproj\.user$ |
||||
|
^\.travis\.yml$ |
||||
|
^README\.*Rmd$ |
||||
|
^README\.*html$ |
||||
|
^NOTES\.*Rmd$ |
||||
|
^NOTES\.*html$ |
||||
|
^\.codecov\.yml$ |
||||
|
^README_files$ |
||||
|
^doc$ |
||||
|
^CONDUCT\.md$ |
@ -0,0 +1 @@ |
|||||
|
comment: false |
@ -0,0 +1,8 @@ |
|||||
|
.DS_Store |
||||
|
.Rproj.user |
||||
|
.Rhistory |
||||
|
.RData |
||||
|
.Rproj |
||||
|
src/*.o |
||||
|
src/*.so |
||||
|
src/*.dll |
@ -0,0 +1,31 @@ |
|||||
|
language: r |
||||
|
|
||||
|
warnings_are_errors: true |
||||
|
|
||||
|
sudo: required |
||||
|
|
||||
|
cache: packages |
||||
|
|
||||
|
r: |
||||
|
- oldrel |
||||
|
- release |
||||
|
- devel |
||||
|
|
||||
|
apt_packages: |
||||
|
- libv8-dev |
||||
|
- xclip |
||||
|
|
||||
|
env: |
||||
|
global: |
||||
|
- CRAN: http://cran.rstudio.com |
||||
|
|
||||
|
after_success: |
||||
|
- Rscript -e 'covr::codecov()' |
||||
|
|
||||
|
notifications: |
||||
|
email: |
||||
|
- bob@rud.is |
||||
|
irc: |
||||
|
channels: |
||||
|
- "104.236.112.222#builds" |
||||
|
nick: travisci |
@ -0,0 +1,25 @@ |
|||||
|
# Contributor Code of Conduct |
||||
|
|
||||
|
As contributors and maintainers of this project, we pledge to respect all people who |
||||
|
contribute through reporting issues, posting feature requests, updating documentation, |
||||
|
submitting pull requests or patches, and other activities. |
||||
|
|
||||
|
We are committed to making participation in this project a harassment-free experience for |
||||
|
everyone, regardless of level of experience, gender, gender identity and expression, |
||||
|
sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion. |
||||
|
|
||||
|
Examples of unacceptable behavior by participants include the use of sexual language or |
||||
|
imagery, derogatory comments or personal attacks, trolling, public or private harassment, |
||||
|
insults, or other unprofessional conduct. |
||||
|
|
||||
|
Project maintainers have the right and responsibility to remove, edit, or reject comments, |
||||
|
commits, code, wiki edits, issues, and other contributions that are not aligned to this |
||||
|
Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed |
||||
|
from the project team. |
||||
|
|
||||
|
Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by |
||||
|
opening an issue or contacting one or more of the project maintainers. |
||||
|
|
||||
|
This Code of Conduct is adapted from the Contributor Covenant |
||||
|
(http://contributor-covenant.org), version 1.0.0, available at |
||||
|
http://contributor-covenant.org/version/1/0/0/ |
@ -0,0 +1,27 @@ |
|||||
|
Package: rep |
||||
|
Type: Package |
||||
|
Title: Tools to Parse and Test Robots Exclusion Protocol Files and Rules |
||||
|
Version: 0.1.0 |
||||
|
Date: 2017-08-14 |
||||
|
Author: Bob Rudis (bob@rud.is) [aut, cre], SEOmoz, Inc [aut] |
||||
|
Maintainer: Bob Rudis <bob@rud.is> |
||||
|
Description: The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents |
||||
|
a set of standards for allowing or excluding robot/spider crawling of different areas of |
||||
|
site content. Tools are provided which wrap the 'rep-cpp' <https://github.com/seomoz/rep-cpp> |
||||
|
C++ library for processing these 'robots.txt' files. |
||||
|
SystemRequirements: C++11 |
||||
|
NeedsCompilation: yes |
||||
|
URL: https://github.com/hrbrmstr/rep |
||||
|
BugReports: https://github.com/hrbrmstr/rep/issues |
||||
|
License: MIT + file LICENSE |
||||
|
Suggests: |
||||
|
testthat, |
||||
|
covr, |
||||
|
robotstxt |
||||
|
Depends: |
||||
|
R (>= 3.2.0) |
||||
|
Imports: |
||||
|
purrr, |
||||
|
Rcpp |
||||
|
RoxygenNote: 6.0.1 |
||||
|
LinkingTo: Rcpp |
@ -0,0 +1,2 @@ |
|||||
|
YEAR: 2017 |
||||
|
COPYRIGHT HOLDER: Bob Rudis |
@ -0,0 +1,7 @@ |
|||||
|
# Generated by roxygen2: do not edit by hand |
||||
|
|
||||
|
S3method(print,robxp) |
||||
|
export(can_fetch) |
||||
|
export(robxp) |
||||
|
importFrom(Rcpp,sourceCpp) |
||||
|
useDynLib(rep, .registration=TRUE) |
@ -0,0 +1,2 @@ |
|||||
|
0.1.0 |
||||
|
* Initial release |
@ -0,0 +1,19 @@ |
|||||
|
# Generated by using Rcpp::compileAttributes() -> do not edit by hand |
||||
|
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 |
||||
|
|
||||
|
#' Parse robots.txt |
||||
|
#' |
||||
|
#' @noRd |
||||
|
#' |
||||
|
rep_parse <- function(content) { |
||||
|
.Call(`_rep_rep_parse`, content) |
||||
|
} |
||||
|
|
||||
|
#' Path allowed |
||||
|
#' |
||||
|
#' @noRd |
||||
|
#' |
||||
|
rep_path_allowed <- function(xp, path, agent = "*") { |
||||
|
.Call(`_rep_rep_path_allowed`, xp, path, agent) |
||||
|
} |
||||
|
|
@ -0,0 +1,14 @@ |
|||||
|
#' Tools to Parse and Test Robots Exclusion Protocol Files and Rules |
||||
|
#' |
||||
|
#' The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents a set |
||||
|
#' of standards for allowing or excluding robot/spider crawling of different areas of |
||||
|
#' site content. Tools are provided which wrap the 'rep-cpp' <https://github.com/seomoz/rep-cpp> |
||||
|
#' C++ library for processing these 'robots.txt' files. |
||||
|
#' |
||||
|
#' @md |
||||
|
#' @name rep |
||||
|
#' @docType package |
||||
|
#' @author Bob Rudis (bob@@rud.is) |
||||
|
#' @useDynLib rep, .registration=TRUE |
||||
|
#' @importFrom Rcpp sourceCpp |
||||
|
NULL |
@ -0,0 +1,47 @@ |
|||||
|
#' Create a robots.txt object
#'
#' Parses the text of a `robots.txt` file and returns the parsed result
#' tagged with class `robxp`, ready to be queried with `can_fetch()`.
#'
#' @md
#' @param x atomic character vector containing a complete robots.txt file.
#'        If `x` has more than one element the elements are treated as the
#'        lines of the file and are joined with newlines before parsing.
#' @return an object of class `robxp`
#' @export
#' @examples
#' library(robotstxt)
#' rt <- robxp(get_robotstxt("https://cdc.gov"))
#' can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") # TRUE
#' can_fetch(rt, "/_borders", "*") # FALSE
robxp <- function(x) {

  # The C++ parser accepts exactly one string, so line-wise input
  # (e.g. the result of readLines()) is collapsed into a single string first.
  if (is.character(x) && length(x) > 1) x <- paste0(x, collapse = "\n")

  parsed <- rep_parse(x)
  class(parsed) <- c("robxp")

  parsed

}
||||
|
|
||||
|
#' Test URL path against robots.txt
#'
#' @md
#' @param obj `robxp` object
#' @param path path to test
#' @param user_agent user agent to test
#' @return the result of the rule lookup for `path` and `user_agent`
#'         (`TRUE` if fetching is allowed, `FALSE` otherwise), or `NULL`
#'         when `obj` is not a `robxp` object
#' @export
#' @examples
#' library(robotstxt)
#' rt <- robxp(get_robotstxt("https://cdc.gov"))
#' can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") # TRUE
#' can_fetch(rt, "/_borders", "*") # FALSE
can_fetch <- function(obj, path="/", user_agent="*") {

  # Anything that is not a parsed robots.txt object yields NULL rather than an error.
  if (!inherits(obj, "robxp")) return(NULL)

  rep_path_allowed(obj, path, user_agent)

}
||||
|
|
||||
|
#' Custom printer for 'robxp' objects
#'
#' Writes a one-line placeholder instead of dumping the underlying parsed object.
#'
#' @md
#' @param x object to print
#' @param ... unused
#' @return `x`, invisibly
#' @export
print.robxp <- function(x, ...) {
  # End the line so the console prompt does not run into the output.
  cat("<Robots Exclusion Protocol Object>\n")
  # Print methods conventionally hand their argument back invisibly.
  invisible(x)
}
@ -0,0 +1,58 @@ |
|||||
|
--- |
||||
|
output: rmarkdown::github_document |
||||
|
--- |
||||
|
|
||||
|
`rep` : Tools to Parse and Test Robots Exclusion Protocol Files and Rules |
||||
|
|
||||
|
The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents a set of standards for allowing or excluding robot/spider crawling of different areas of site content. Tools are provided which wrap the 'rep-cpp' <https://github.com/seomoz/rep-cpp> C++ library for processing these 'robots.txt' files. |
||||
|
|
||||
|
- [`rep-cpp`](https://github.com/seomoz/rep-cpp) |
||||
|
- [`url-cpp`](https://github.com/seomoz/url-cpp) |
||||
|
|
||||
|
The following functions are implemented: |
||||
|
|
||||
|
- `robxp`: Create a robots.txt object |
||||
|
- `can_fetch`: Test URL path against robots.txt |
||||
|
|
||||
|
### Installation |
||||
|
|
||||
|
```{r eval=FALSE} |
||||
|
devtools::install_github("hrbrmstr/rep") |
||||
|
``` |
||||
|
|
||||
|
```{r message=FALSE, warning=FALSE, error=FALSE, include=FALSE} |
||||
|
options(width=120) |
||||
|
``` |
||||
|
|
||||
|
### Usage |
||||
|
|
||||
|
```{r message=FALSE, warning=FALSE, error=FALSE} |
||||
|
library(rep) |
||||
|
library(robotstxt) |
||||
|
|
||||
|
# current version |
||||
|
packageVersion("rep") |
||||
|
|
||||
|
rt <- robxp(get_robotstxt("https://cdc.gov")) |
||||
|
|
||||
|
print(rt) |
||||
|
|
||||
|
can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") |
||||
|
|
||||
|
can_fetch(rt, "/_borders", "*") |
||||
|
``` |
||||
|
|
||||
|
### Test Results |
||||
|
|
||||
|
```{r message=FALSE, warning=FALSE, error=FALSE} |
||||
|
library(rep) |
||||
|
library(testthat) |
||||
|
|
||||
|
date() |
||||
|
|
||||
|
test_dir("tests/") |
||||
|
``` |
||||
|
|
||||
|
### Code of Conduct |
||||
|
|
||||
|
Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms. |
@ -0,0 +1,74 @@ |
|||||
|
|
||||
|
`rep` : Tools to Parse and Test Robots Exclusion Protocol Files and Rules |
||||
|
|
||||
|
The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents a set of standards for allowing or excluding robot/spider crawling of different areas of site content. Tools are provided which wrap the 'rep-cpp' <https://github.com/seomoz/rep-cpp> C++ library for processing these 'robots.txt' files. |
||||
|
|
||||
|
- [`rep-cpp`](https://github.com/seomoz/rep-cpp) |
||||
|
- [`url-cpp`](https://github.com/seomoz/url-cpp) |
||||
|
|
||||
|
The following functions are implemented: |
||||
|
|
||||
|
- `robxp`: Create a robots.txt object |
||||
|
- `can_fetch`: Test URL path against robots.txt |
||||
|
|
||||
|
### Installation |
||||
|
|
||||
|
``` r |
||||
|
devtools::install_github("hrbrmstr/rep") |
||||
|
``` |
||||
|
|
||||
|
### Usage |
||||
|
|
||||
|
``` r |
||||
|
library(rep) |
||||
|
library(robotstxt) |
||||
|
|
||||
|
# current version |
||||
|
packageVersion("rep") |
||||
|
``` |
||||
|
|
||||
|
## [1] '0.1.0' |
||||
|
|
||||
|
``` r |
||||
|
rt <- robxp(get_robotstxt("https://cdc.gov")) |
||||
|
|
||||
|
print(rt) |
||||
|
``` |
||||
|
|
||||
|
## <Robots Exclusion Protocol Object> |
||||
|
|
||||
|
``` r |
||||
|
can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") |
||||
|
``` |
||||
|
|
||||
|
## [1] TRUE |
||||
|
|
||||
|
``` r |
||||
|
can_fetch(rt, "/_borders", "*") |
||||
|
``` |
||||
|
|
||||
|
## [1] FALSE |
||||
|
|
||||
|
### Test Results |
||||
|
|
||||
|
``` r |
||||
|
library(rep) |
||||
|
library(testthat) |
||||
|
|
||||
|
date() |
||||
|
``` |
||||
|
|
||||
|
## [1] "Mon Aug 14 15:00:16 2017" |
||||
|
|
||||
|
``` r |
||||
|
test_dir("tests/") |
||||
|
``` |
||||
|
|
||||
|
## testthat results ======================================================================================================== |
||||
|
## OK: 3 SKIPPED: 0 FAILED: 0 |
||||
|
## |
||||
|
## DONE =================================================================================================================== |
||||
|
|
||||
|
### Code of Conduct |
||||
|
|
||||
|
Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms. |
@ -0,0 +1,23 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/rep.r |
||||
|
\name{can_fetch} |
||||
|
\alias{can_fetch} |
||||
|
\title{Test URL path against robots.txt} |
||||
|
\usage{ |
||||
|
can_fetch(obj, path = "/", user_agent = "*") |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{obj}{\code{robxp} object} |
||||
|
|
||||
|
\item{path}{path to test} |
||||
|
|
||||
|
\item{user_agent}{user agent to test} |
||||
|
} |
||||
|
\description{ |
||||
|
Test URL path against robots.txt |
||||
|
} |
||||
|
\examples{ |
||||
|
library(robotstxt) |
||||
|
can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") # TRUE |
||||
|
can_fetch(rt, "/_borders", "*") # FALSE |
||||
|
} |
@ -0,0 +1,16 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/rep.r |
||||
|
\name{print.robxp} |
||||
|
\alias{print.robxp} |
||||
|
\title{Custom printer for 'robexp' objects} |
||||
|
\usage{ |
||||
|
\method{print}{robxp}(x, ...) |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{x}{object to print} |
||||
|
|
||||
|
\item{...}{unused} |
||||
|
} |
||||
|
\description{ |
||||
|
Custom printer for 'robexp' objects |
||||
|
} |
@ -0,0 +1,16 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/rep-package.R |
||||
|
\docType{package} |
||||
|
\name{rep} |
||||
|
\alias{rep} |
||||
|
\alias{rep-package} |
||||
|
\title{Tools to Parse and Test Robots Exclusion Protocol Files and Rules} |
||||
|
\description{ |
||||
|
The 'Robots Exclusion Protocol' \url{http://www.robotstxt.org/orig.html} documents a set |
||||
|
of standards for allowing or excluding robot/spider crawling of different areas of |
||||
|
site content. Tools are provided which wrap The 'rep-cpp` \url{https://github.com/seomoz/rep-cpp} |
||||
|
C++ library for processing these 'robots.txt' files. |
||||
|
} |
||||
|
\author{ |
||||
|
Bob Rudis (bob@rud.is) |
||||
|
} |
@ -0,0 +1,19 @@ |
|||||
|
% Generated by roxygen2: do not edit by hand |
||||
|
% Please edit documentation in R/rep.r |
||||
|
\name{robxp} |
||||
|
\alias{robxp} |
||||
|
\title{Create a robots.txt object} |
||||
|
\usage{ |
||||
|
robxp(x) |
||||
|
} |
||||
|
\arguments{ |
||||
|
\item{x}{atomic character vector containing a complete robots.txt file} |
||||
|
} |
||||
|
\description{ |
||||
|
Create a robots.txt object |
||||
|
} |
||||
|
\examples{ |
||||
|
library(robotstxt) |
||||
|
can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") # TRUE |
||||
|
can_fetch(rt, "/_borders", "*") # FALSE |
||||
|
} |
@ -0,0 +1,21 @@ |
|||||
|
Version: 1.0 |
||||
|
|
||||
|
RestoreWorkspace: Default |
||||
|
SaveWorkspace: Default |
||||
|
AlwaysSaveHistory: Default |
||||
|
|
||||
|
EnableCodeIndexing: Yes |
||||
|
UseSpacesForTab: Yes |
||||
|
NumSpacesForTab: 2 |
||||
|
Encoding: UTF-8 |
||||
|
|
||||
|
RnwWeave: Sweave |
||||
|
LaTeX: pdfLaTeX |
||||
|
|
||||
|
StripTrailingWhitespace: Yes |
||||
|
|
||||
|
BuildType: Package |
||||
|
PackageUseDevtools: Yes |
||||
|
PackageInstallArgs: --no-multiarch --with-keep.source |
||||
|
PackageBuildArgs: --resave-data |
||||
|
PackageRoxygenize: rd,collate,namespace |
@ -0,0 +1,3 @@ |
|||||
|
*.o |
||||
|
*.so |
||||
|
*.dll |
@ -0,0 +1,3 @@ |
|||||
|
CXX_STD = CXX11 |
||||
|
PKG_CXXFLAGS = |
||||
|
PKG_LIBS = -L. |
@ -0,0 +1,42 @@ |
|||||
|
// Generated by using Rcpp::compileAttributes() -> do not edit by hand
|
||||
|
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
|
||||
|
|
||||
|
#include <Rcpp.h> |
||||
|
|
||||
|
using namespace Rcpp; |
||||
|
|
||||
|
// rep_parse
|
||||
|
SEXP rep_parse(std::string content); |
||||
|
RcppExport SEXP _rep_rep_parse(SEXP contentSEXP) { |
||||
|
BEGIN_RCPP |
||||
|
Rcpp::RObject rcpp_result_gen; |
||||
|
Rcpp::RNGScope rcpp_rngScope_gen; |
||||
|
Rcpp::traits::input_parameter< std::string >::type content(contentSEXP); |
||||
|
rcpp_result_gen = Rcpp::wrap(rep_parse(content)); |
||||
|
return rcpp_result_gen; |
||||
|
END_RCPP |
||||
|
} |
||||
|
// rep_path_allowed
|
||||
|
bool rep_path_allowed(SEXP xp, std::string path, std::string agent); |
||||
|
RcppExport SEXP _rep_rep_path_allowed(SEXP xpSEXP, SEXP pathSEXP, SEXP agentSEXP) { |
||||
|
BEGIN_RCPP |
||||
|
Rcpp::RObject rcpp_result_gen; |
||||
|
Rcpp::RNGScope rcpp_rngScope_gen; |
||||
|
Rcpp::traits::input_parameter< SEXP >::type xp(xpSEXP); |
||||
|
Rcpp::traits::input_parameter< std::string >::type path(pathSEXP); |
||||
|
Rcpp::traits::input_parameter< std::string >::type agent(agentSEXP); |
||||
|
rcpp_result_gen = Rcpp::wrap(rep_path_allowed(xp, path, agent)); |
||||
|
return rcpp_result_gen; |
||||
|
END_RCPP |
||||
|
} |
||||
|
|
||||
|
static const R_CallMethodDef CallEntries[] = { |
||||
|
{"_rep_rep_parse", (DL_FUNC) &_rep_rep_parse, 1}, |
||||
|
{"_rep_rep_path_allowed", (DL_FUNC) &_rep_rep_path_allowed, 3}, |
||||
|
{NULL, NULL, 0} |
||||
|
}; |
||||
|
|
||||
|
RcppExport void R_init_rep(DllInfo *dll) { |
||||
|
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); |
||||
|
R_useDynamicSymbols(dll, FALSE); |
||||
|
} |
@ -0,0 +1,87 @@ |
|||||
|
#include <algorithm> |
||||
|
#include <sstream> |
||||
|
|
||||
|
#include "url.h" |
||||
|
|
||||
|
#include "agent.h" |
||||
|
#include "directive.h" |
||||
|
|
||||
|
namespace Rep |
||||
|
{ |
||||
|
/**
 * Register an Allow rule for this agent.
 *
 * The query is canonically escaped before it is stored. Returns *this so
 * calls can be chained.
 */
Agent& Agent::allow(const std::string& query)
{
    // Build the rule in place from the escaped query.
    directives_.emplace_back(escape(query), true);

    // A new rule invalidates the cached priority ordering.
    sorted_ = false;
    return *this;
}
||||
|
|
||||
|
/**
 * Register a Disallow rule for this agent.
 *
 * An empty query is the special case "Disallow:", which means "Allow: /";
 * it is stored unescaped as an allowing rule. Any other query is escaped and
 * stored as a disallowing rule. Returns *this so calls can be chained.
 */
Agent& Agent::disallow(const std::string& query)
{
    const bool permits_everything = query.empty();

    directives_.emplace_back(
        permits_everything ? query : escape(query),
        permits_everything);

    // A new rule invalidates the cached priority ordering.
    sorted_ = false;
    return *this;
}
||||
|
|
||||
|
const std::vector<Directive>& Agent::directives() const |
||||
|
{ |
||||
|
if (!sorted_) |
||||
|
{ |
||||
|
std::sort(directives_.begin(), directives_.end(), [](const Directive& a, const Directive& b) { |
||||
|
return b.priority() < a.priority(); |
||||
|
}); |
||||
|
sorted_ = true; |
||||
|
} |
||||
|
return directives_; |
||||
|
} |
||||
|
|
||||
|
bool Agent::allowed(const std::string& query) const |
||||
|
{ |
||||
|
std::string path(escape(query)); |
||||
|
|
||||
|
if (path.compare("/robots.txt") == 0) |
||||
|
{ |
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
for (auto directive : directives()) |
||||
|
{ |
||||
|
if (directive.match(path)) |
||||
|
{ |
||||
|
return directive.allowed(); |
||||
|
} |
||||
|
} |
||||
|
return true; |
||||
|
} |
||||
|
|
||||
|
std::string Agent::str() const |
||||
|
{ |
||||
|
std::stringstream out; |
||||
|
out << '['; |
||||
|
auto begin = directives().begin(); |
||||
|
auto end = directives().end(); |
||||
|
if (begin != end) |
||||
|
{ |
||||
|
out << "Directive(" << begin->str() << ')'; |
||||
|
++begin; |
||||
|
} |
||||
|
for (; begin != end; ++begin) |
||||
|
{ |
||||
|
out << ", Directive(" << begin->str() << ')'; |
||||
|
} |
||||
|
out << ']'; |
||||
|
return out.str(); |
||||
|
} |
||||
|
|
||||
|
std::string Agent::escape(const std::string& query) |
||||
|
{ |
||||
|
return Url::Url(query).defrag().escape().fullpath(); |
||||
|
} |
||||
|
} |
@ -0,0 +1,70 @@ |
|||||
|
#ifndef AGENT_CPP_H |
||||
|
#define AGENT_CPP_H |
||||
|
|
||||
|
#include <vector> |
||||
|
|
||||
|
#include "directive.h" |
||||
|
|
||||
|
|
||||
|
namespace Rep |
||||
|
{ |
||||
|
|
||||
|
class Agent |
||||
|
{ |
||||
|
public: |
||||
|
/* The type for the delay. */ |
||||
|
typedef float delay_t; |
||||
|
|
||||
|
/**
|
||||
|
* Construct an agent. |
||||
|
*/ |
||||
|
Agent(): directives_(), delay_(-1.0), sorted_(true) {} |
||||
|
|
||||
|
/**
|
||||
|
* Add an allowed directive. |
||||
|
*/ |
||||
|
Agent& allow(const std::string& query); |
||||
|
|
||||
|
/**
|
||||
|
* Add a disallowed directive. |
||||
|
*/ |
||||
|
Agent& disallow(const std::string& query); |
||||
|
|
||||
|
/**
|
||||
|
* Set the delay for this agent. |
||||
|
*/ |
||||
|
Agent& delay(delay_t value) { |
||||
|
delay_ = value; |
||||
|
return *this; |
||||
|
} |
||||
|
|
||||
|
/**
|
||||
|
* Return the delay for this agent. |
||||
|
*/ |
||||
|
delay_t delay() const { return delay_; } |
||||
|
|
||||
|
/**
|
||||
|
* A vector of the directives, in priority-sorted order. |
||||
|
*/ |
||||
|
const std::vector<Directive>& directives() const; |
||||
|
|
||||
|
/**
|
||||
|
* Return true if the URL (either a full URL or a path) is allowed. |
||||
|
*/ |
||||
|
bool allowed(const std::string& path) const; |
||||
|
|
||||
|
std::string str() const; |
||||
|
|
||||
|
/**
|
||||
|
* Canonically escape the provided query for matching purposes. |
||||
|
*/ |
||||
|
static std::string escape(const std::string& query); |
||||
|
|
||||
|
private: |
||||
|
mutable std::vector<Directive> directives_; |
||||
|
delay_t delay_; |
||||
|
mutable bool sorted_; |
||||
|
}; |
||||
|
} |
||||
|
|
||||
|
#endif |
@ -0,0 +1,130 @@ |
|||||
|
#include <algorithm> |
||||
|
#include <locale> |
||||
|
#include <sstream> |
||||
|
#include <string> |
||||
|
|
||||
|
#include "url.h" |
||||
|
|
||||
|
#include "directive.h" |
||||
|
|
||||
|
namespace Rep |
||||
|
{ |
||||
|
/**
 * Build a rule from a robots.txt path expression.
 *
 * An expression without '*' is stored verbatim. Otherwise runs of '*' are
 * collapsed to a single '*' and trailing '*'s are dropped. The priority is
 * the length of the stored expression.
 */
Directive::Directive(const std::string& line, bool allowed)
    : expression_()
    , priority_(line.size())
    , allowed_(allowed)
{
    // No wildcard: nothing to normalise, priority is already line.size().
    if (line.find('*') == std::string::npos)
    {
        expression_ = line;
        return;
    }

    // Copy the line, skipping any '*' that directly follows another '*'.
    expression_.reserve(line.size());
    for (const char c : line)
    {
        const bool repeated_star =
            c == '*' && !expression_.empty() && expression_.back() == '*';
        if (!repeated_star)
        {
            expression_.push_back(c);
        }
    }

    // Cut everything after the last non-'*' character. When the expression
    // is all '*', find_last_not_of yields npos and npos + 1 wraps to 0,
    // which clears the string.
    expression_.erase(expression_.find_last_not_of('*') + 1);

    // Priority is the length of the normalised expression.
    priority_ = expression_.size();
}
||||
|
|
||||
|
bool Directive::match(const std::string::const_iterator& e_begin, |
||||
|
const std::string::const_iterator& e_end, |
||||
|
const std::string::const_iterator& p_begin, |
||||
|
const std::string::const_iterator& p_end) const |
||||
|
{ |
||||
|
std::string::const_iterator expression_it = e_begin; |
||||
|
std::string::const_iterator path_it = p_begin; |
||||
|
while (expression_it != e_end && path_it != p_end) |
||||
|
{ |
||||
|
if (*expression_it == '*') |
||||
|
{ |
||||
|
// Advance and recurse
|
||||
|
++expression_it; |
||||
|
for (; path_it != p_end; ++path_it) |
||||
|
{ |
||||
|
if (match(expression_it, e_end, path_it, p_end)) |
||||
|
{ |
||||
|
return true; |
||||
|
} |
||||
|
} |
||||
|
return false; |
||||
|
} |
||||
|
else if (*expression_it == '$') |
||||
|
{ |
||||
|
// This check expects path to be fully consumed. But since one of the
|
||||
|
// criteria of being in this while loop is that we've not fully consumed
|
||||
|
// path, return false.
|
||||
|
return false; |
||||
|
} |
||||
|
else if (*expression_it != *path_it) |
||||
|
{ |
||||
|
// These characters must match
|
||||
|
return false; |
||||
|
} |
||||
|
else |
||||
|
{ |
||||
|
// Advance both by one
|
||||
|
++path_it; |
||||
|
++expression_it; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// Return true only if we've consumed all of the expression
|
||||
|
if (expression_it == e_end) |
||||
|
{ |
||||
|
return true; |
||||
|
} |
||||
|
else if (*expression_it == '$') |
||||
|
{ |
||||
|
return path_it == p_end; |
||||
|
} |
||||
|
else |
||||
|
{ |
||||
|
return false; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
std::string Directive::str() const |
||||
|
{ |
||||
|
std::stringstream out; |
||||
|
if (allowed_) |
||||
|
{ |
||||
|
out << "Allow: " << expression_; |
||||
|
} |
||||
|
else { |
||||
|
out << "Disallow: " << expression_; |
||||
|
} |
||||
|
return out.str(); |
||||
|
} |
||||
|
|
||||
|
bool Directive::match(const std::string& path) const |
||||
|
{ |
||||
|
return match(expression_.begin(), expression_.end(), path.begin(), path.end()); |
||||
|
} |
||||
|
|
||||
|
} |
@ -0,0 +1,67 @@ |
|||||
|
#ifndef DIRECTIVE_CPP_H |
||||
|
#define DIRECTIVE_CPP_H |
||||
|
|
||||
|
|
||||
|
namespace Rep |
||||
|
{ |
||||
|
|
||||
|
/**
 * A single Allow / Disallow rule: a path expression, its priority and
 * whether it allows or disallows.
 */
class Directive
{
public:
    /**
     * The type of our priority value.
     */
    using priority_t = size_t;

    /**
     * Default constructor disallowed.
     */
    Directive() = delete;

    /**
     * The input to this constructor must be stripped of comments and
     * trailing whitespace.
     */
    Directive(const std::string& line, bool allowed);

    /**
     * The priority of the rule.
     */
    priority_t priority() const { return priority_; }

    /**
     * Whether or not the provided path matches. The path is expected to be
     * properly escaped.
     */
    bool match(const std::string& path) const;

    /**
     * Whether this rule is for an allow or a disallow.
     */
    bool allowed() const { return allowed_; }

    /**
     * Human-readable rendering of the rule.
     */
    std::string str() const;

private:
    std::string expression_;
    priority_t priority_;
    bool allowed_;

    /**
     * Return true if p_begin -> p_end matches the expression e_begin -> e_end.
     */
    bool match(const std::string::const_iterator& e_begin,
               const std::string::const_iterator& e_end,
               const std::string::const_iterator& p_begin,
               const std::string::const_iterator& p_end) const;
};
||||
|
|
||||
|
} |
||||
|
|
||||
|
#endif |
@ -0,0 +1,183 @@ |
|||||
|
#include <algorithm> |
||||
|
#include <fstream> |
||||
|
#include <iostream> |
||||
|
#include <string> |
||||
|
|
||||
|
#include "psl.h" |
||||
|
#include "punycode.h" |
||||
|
|
||||
|
namespace Url |
||||
|
{ |
||||
|
const std::string PSL::not_found = ""; |
||||
|
|
||||
|
/**
 * Read a PSL from an istream, one rule per line.
 *
 * Each line is cut at its first whitespace character. Blank lines and lines
 * starting with "//" are skipped. The remaining lines are registered as
 * wildcard rules ("*.<host>"), exception rules ("!<host>") or plain rules.
 *
 * Throws std::invalid_argument for a malformed wildcard or exception rule.
 */
PSL::PSL(std::istream& stream)
{
    std::string line;
    while (std::getline(stream, line))
    {
        // Only take up to the first whitespace. The character is converted
        // to unsigned char first: PSL entries contain UTF-8 bytes, and
        // calling isspace with a negative value is undefined behaviour.
        auto it = std::find_if(line.begin(), line.end(), [](char c) {
            return ::isspace(static_cast<unsigned char>(c)) != 0;
        });
        line.resize(it - line.begin());

        // Skip blank lines
        if (line.empty())
        {
            continue;
        }

        // Skip comments
        if (line.compare(0, 2, "//") == 0)
        {
            continue;
        }

        // We know the line has at least a single character at this point
        if (line[0] == '*')
        {
            // Line is a wildcard rule
            if (line.size() <= 2 || line[1] != '.')
            {
                throw std::invalid_argument("Wildcard rule must be of form *.<host>");
            }

            // Strip the leading "*." and count one extra level.
            add(line, 1, 2);
        }
        else if (line[0] == '!')
        {
            // Line is an exception, take all but the !
            if (line.size() <= 1)
            {
                throw std::invalid_argument("Exception rule has no hostname.");
            }

            // Strip the leading "!" and count one level fewer.
            add(line, -1, 1);
        }
        else
        {
            add(line, 0, 0);
        }
    }
}
||||
|
|
||||
|
/**
 * Read the file at the provided path as a set of PSL rules.
 *
 * Throws std::invalid_argument if the file cannot be opened.
 */
PSL PSL::fromPath(const std::string& path)
{
    std::ifstream input(path);
    if (input.good())
    {
        return PSL(input);
    }

    throw std::invalid_argument("Path '" + path + "' inaccessible.");
}
||||
|
|
||||
|
/**
 * Create a PSL object from a string holding the rules, one per line.
 */
PSL PSL::fromString(const std::string& str)
{
    std::stringstream rules;
    rules << str;
    return PSL(rules);
}
||||
|
|
||||
|
std::string PSL::getTLD(const std::string& hostname) const |
||||
|
{ |
||||
|
return getLastSegments(hostname, getTLDLength(hostname)); |
||||
|
} |
||||
|
|
||||
|
std::string PSL::getPLD(const std::string& hostname) const |
||||
|
{ |
||||
|
return getLastSegments(hostname, getTLDLength(hostname) + 1); |
||||
|
} |
||||
|
|
||||
|
std::pair<std::string, std::string> PSL::getBoth(const std::string& hostname) const |
||||
|
{ |
||||
|
size_t length = getTLDLength(hostname); |
||||
|
return std::make_pair( |
||||
|
getLastSegments(hostname, length), |
||||
|
getLastSegments(hostname, length + 1)); |
||||
|
} |
||||
|
|
||||
|
size_t PSL::getTLDLength(const std::string& hostname) const |
||||
|
{ |
||||
|
// Reversed copy of hostname
|
||||
|
std::string tld(hostname.rbegin(), hostname.rend()); |
||||
|
std::transform(tld.begin(), tld.end(), tld.begin(), ::tolower); |
||||
|
|
||||
|
while (tld.size()) |
||||
|
{ |
||||
|
auto it = levels.find(tld); |
||||
|
if (it != levels.end()) |
||||
|
{ |
||||
|
return it->second; |
||||
|
} |
||||
|
|
||||
|
size_t position = tld.rfind('.'); |
||||
|
if (position == std::string::npos || position == 0) |
||||
|
{ |
||||
|
tld.resize(0); |
||||
|
} |
||||
|
else |
||||
|
{ |
||||
|
tld.resize(position); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return 1; |
||||
|
} |
||||
|
|
||||
|
std::string PSL::getLastSegments(const std::string& hostname, size_t segments) const |
||||
|
{ |
||||
|
size_t position = hostname.size(); |
||||
|
size_t remaining = segments; |
||||
|
while (remaining != 0 && position && position != std::string::npos) |
||||
|
{ |
||||
|
position = hostname.rfind('.', position - 1); |
||||
|
remaining -= 1; |
||||
|
} |
||||
|
|
||||
|
if (remaining >= 1) |
||||
|
{ |
||||
|
return not_found; |
||||
|
} |
||||
|
|
||||
|
// Return the whole string if position == std:string::npos
|
||||
|
size_t start = (position == std::string::npos) ? 0 : position + 1; |
||||
|
|
||||
|
std::string result(hostname, start); |
||||
|
std::transform(result.begin(), result.end(), result.begin(), ::tolower); |
||||
|
|
||||
|
// Leading .'s indicate that the query had an empty segment
|
||||
|
if (result.size() && result[0] == '.') |
||||
|
{ |
||||
|
std::stringstream message; |
||||
|
message << "Empty segment in " << result; |
||||
|
throw std::invalid_argument(message.str()); |
||||
|
} |
||||
|
|
||||
|
return result; |
||||
|
} |
||||
|
|
||||
|
size_t PSL::countSegments(const std::string& hostname) const |
||||
|
{ |
||||
|
size_t count = 1; |
||||
|
size_t position = hostname.find('.'); |
||||
|
while (position != std::string::npos) |
||||
|
{ |
||||
|
count += 1; |
||||
|
position = hostname.find('.', position + 1); |
||||
|
} |
||||
|
return count; |
||||
|
} |
||||
|
|
||||
|
void PSL::add(std::string& rule, int level_adjust, size_t trim) |
||||
|
{ |
||||
|
// First unpunycoded
|
||||
|
std::string copy(rule.rbegin(), rule.rend() - trim); |
||||
|
size_t length = countSegments(copy) + level_adjust; |
||||
|
levels[copy] = length; |
||||
|
|
||||
|
// And now punycoded
|
||||
|
rule = Punycode::encodeHostname(rule); |
||||
|
copy.assign(rule.rbegin(), rule.rend() - trim); |
||||
|
levels[copy] = length; |
||||
|
} |
||||
|
|
||||
|
}; |
@ -0,0 +1,102 @@ |
|||||
|
#ifndef PSL_CPP_H |
||||
|
#define PSL_CPP_H |
||||
|
|
||||
|
#include <istream> |
||||
|
#include <sstream> |
||||
|
#include <string> |
||||
|
#include <unordered_map> |
||||
|
#include <utility> |
||||
|
|
||||
|
namespace Url |
||||
|
{ |
||||
|
|
||||
|
/**
|
||||
|
* Find TLDs and PLDs of a hostname according to a PSL. |
||||
|
*/ |
||||
|
struct PSL |
||||
|
{ |
||||
|
/**
|
||||
|
* Indicates that there is no TLD / PLD |
||||
|
*/ |
||||
|
static const std::string not_found; |
||||
|
|
||||
|
/**
|
||||
|
* Read a PSL from an istream. |
||||
|
*/ |
||||
|
PSL(std::istream& stream); |
||||
|
|
||||
|
PSL(): levels() { }; |
||||
|
|
||||
|
PSL(const PSL& other): levels(other.levels) { } |
||||
|
|
||||
|
PSL& operator=(const PSL& other) |
||||
|
{ |
||||
|
levels = other.levels; |
||||
|
return *this; |
||||
|
} |
||||
|
|
||||
|
/**
|
||||
|
* Read the provided path holding a set of PSL rules. |
||||
|
*/ |
||||
|
static PSL fromPath(const std::string& path); |
||||
|
|
||||
|
/**
|
||||
|
* Create a PSL object from a string. |
||||
|
*/ |
||||
|
static PSL fromString(const std::string& str); |
||||
|
|
||||
|
/**
|
||||
|
* Get just the TLD of the hostname. |
||||
|
* |
||||
|
* Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If |
||||
|
* some segments have been appropriately punycoded and others not, it may return |
||||
|
* a wrong answer. If a punycoded host is provided, a punycoded response is |
||||
|
* returned. If an unpunycoded host is provided, an unpunycoded response is |
||||
|
* returned. |
||||
|
*/ |
||||
|
std::string getTLD(const std::string& hostname) const; |
||||
|
|
||||
|
/**
|
||||
|
* Get just the PLD of the hostname. |
||||
|
* |
||||
|
* Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If |
||||
|
* some segments have been appropriately punycoded and others not, it may return |
||||
|
* a wrong answer. If a punycoded host is provided, a punycoded response is |
||||
|
* returned. If an unpunycoded host is provided, an unpunycoded response is |
||||
|
* returned. |
||||
|
*/ |
||||
|
std::string getPLD(const std::string& hostname) const; |
||||
|
|
||||
|
/**
|
||||
|
* Get the (TLD, PLD) of the hostname. |
||||
|
* |
||||
|
* Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If |
||||
|
* some segments have been appropriately punycoded and others not, it may return |
||||
|
* a wrong answer. If a punycoded host is provided, a punycoded response is |
||||
|
* returned. If an unpunycoded host is provided, an unpunycoded response is |
||||
|
* returned. |
||||
|
*/ |
||||
|
std::pair<std::string, std::string> getBoth(const std::string& hostname) const; |
||||
|
private: |
||||
|
// Mapping of a string rule to its level
|
||||
|
std::unordered_map<std::string, size_t> levels; |
||||
|
|
||||
|
// Return the number of segments in a hostname
|
||||
|
size_t countSegments(const std::string& hostname) const; |
||||
|
|
||||
|
// Return the number of segments in the TLD of the provided hostname
|
||||
|
size_t getTLDLength(const std::string& hostname) const; |
||||
|
|
||||
|
// Return the last `segments` segments of a hostname
|
||||
|
std::string getLastSegments(const std::string& hostname, size_t segments) const; |
||||
|
|
||||
|
/**
|
||||
|
* Add the provided host with the provided priority, trimming characters off |
||||
|
* the front, and adjusting the level by the provided number. |
||||
|
*/ |
||||
|
void add(std::string& host, int level_adjust, size_t trim); |
||||
|
}; |
||||
|
|
||||
|
} |
||||
|
|
||||
|
#endif |
@ -0,0 +1,409 @@ |
|||||
|
#include <algorithm> |
||||
|
#include <string> |
||||
|
#include <iostream> |
||||
|
|
||||
|
#include "punycode.h" |
||||
|
#include "utf8.h" |
||||
|
|
||||
|
namespace Url |
||||
|
{ |
||||
|
|
||||
|
std::string& Punycode::encode(std::string& str) |
||||
|
{ |
||||
|
// Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.3
|
||||
|
//
|
||||
|
// let n = initial_n
|
||||
|
// let delta = 0
|
||||
|
// let bias = initial_bias
|
||||
|
punycode_uint n = INITIAL_N; |
||||
|
punycode_uint delta = 0; |
||||
|
punycode_uint bias = INITIAL_BIAS; |
||||
|
std::string output; |
||||
|
|
||||
|
// Accumulate the non-basic codepoints
|
||||
|
std::vector<punycode_uint> codepoints; |
||||
|
for (auto it = str.cbegin(); it != str.cend(); ) |
||||
|
{ |
||||
|
Utf8::codepoint_t value = Utf8::readCodepoint(it, str.cend()); |
||||
|
if (value < 0x80) |
||||
|
{ |
||||
|
// copy them to the output in order
|
||||
|
output.append(1, static_cast<char>(value)); |
||||
|
} |
||||
|
codepoints.push_back(value); |
||||
|
} |
||||
|
|
||||
|
// let h = b = the number of basic code points in the input
|
||||
|
size_t h = output.size(); |
||||
|
size_t b = h; |
||||
|
|
||||
|
// copy a delimiter if b > 0
|
||||
|
if (b > 0) |
||||
|
{ |
||||
|
output.append(1, '-'); |
||||
|
} |
||||
|
|
||||
|
// while h < length(input) do begin
|
||||
|
while (h < codepoints.size()) |
||||
|
{ |
||||
|
// let m = the minimum {non-basic} code point >= n in the input
|
||||
|
punycode_uint m = MAX_PUNYCODE_UINT; |
||||
|
for (auto it = codepoints.begin(); it != codepoints.end(); ++it) |
||||
|
{ |
||||
|
if ((*it >= n) && (*it < m)) |
||||
|
{ |
||||
|
m = *it; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// let delta = delta + (m - n) * (h + 1), fail on overflow
|
||||
|
if ((m - n) > ((MAX_PUNYCODE_UINT - delta) / (h + 1))) |
||||
|
{ |
||||
|
throw std::invalid_argument("Overflow delta update."); |
||||
|
} |
||||
|
delta += (m - n) * (h + 1); |
||||
|
|
||||
|
// let n = m
|
||||
|
n = m; |
||||
|
|
||||
|
// for each code point c in the input (in order) do begin
|
||||
|
for (auto it = codepoints.begin(); it != codepoints.end(); ++it) |
||||
|
{ |
||||
|
// if c < n {or c is basic} then increment delta, fail on overflow
|
||||
|
if (*it < n) |
||||
|
{ |
||||
|
if (delta == MAX_PUNYCODE_UINT) |
||||
|
{ |
||||
|
throw std::invalid_argument("Overflow delta increment."); |
||||
|
} |
||||
|
++delta; |
||||
|
} |
||||
|
|
||||
|
// if c == n then begin
|
||||
|
if (*it == n) |
||||
|
{ |
||||
|
// let q = delta
|
||||
|
punycode_uint q = delta; |
||||
|
|
||||
|
// for k = base to infinity in steps of base do begin
|
||||
|
for (punycode_uint k = BASE; ; k += BASE) |
||||
|
{ |
||||
|
// let t = tmin if k <= bias {+ tmin}, or
|
||||
|
// tmax if k >= bias + tmax, or k - bias otherwise
|
||||
|
punycode_uint t = k <= bias ? TMIN : |
||||
|
k >= bias + TMAX ? TMAX : k - bias; |
||||
|
|
||||
|
// if q < t then break
|
||||
|
if (q < t) |
||||
|
{ |
||||
|
break; |
||||
|
} |
||||
|
|
||||
|
// output the code point for digit t + ((q - t) mod (base - t))
|
||||
|
output.append(1, DIGIT_TO_BASIC[t + ((q - t) % (BASE - t))]); |
||||
|
|
||||
|
// let q = (q - t) div (base - t)
|
||||
|
q = (q - t) / (BASE - t); |
||||
|
} |
||||
|
|
||||
|
// output the code point for digit q
|
||||
|
output.append(1, DIGIT_TO_BASIC[q]); |
||||
|
|
||||
|
// let bias = adapt(delta, h + 1, test h equals b?)
|
||||
|
bias = adapt(delta, h + 1, h == b); |
||||
|
|
||||
|
// let delta = 0
|
||||
|
delta = 0; |
||||
|
|
||||
|
// increment h
|
||||
|
++h; |
||||
|
|
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// increment delta and n
|
||||
|
++delta; |
||||
|
++n; |
||||
|
} |
||||
|
|
||||
|
str.assign(output); |
||||
|
return str; |
||||
|
} |
||||
|
|
||||
|
/**
 * Punycode-encode a UTF-8 string, leaving the argument untouched.
 *
 * Works on a private copy that is encoded in place and then returned.
 */
std::string Punycode::encode(const std::string& str)
{
    std::string encoded = str;
    Punycode::encode(encoded);
    return encoded;
}
||||
|
|
||||
|
std::string Punycode::encodeHostname(const std::string& hostname) |
||||
|
{ |
||||
|
// Avoid any punycoding at all if none is needed
|
||||
|
if (!needsPunycoding(hostname)) |
||||
|
{ |
||||
|
return hostname; |
||||
|
} |
||||
|
|
||||
|
std::string encoded; |
||||
|
|
||||
|
size_t start = 0; |
||||
|
size_t end = hostname.find('.'); |
||||
|
while(true) |
||||
|
{ |
||||
|
std::string segment = hostname.substr(start, end - start); |
||||
|
if (needsPunycoding(segment)) |
||||
|
{ |
||||
|
encoded.append("xn--"); |
||||
|
encoded.append(Punycode::encode(segment)); |
||||
|
} |
||||
|
else |
||||
|
{ |
||||
|
encoded.append(segment); |
||||
|
} |
||||
|
|
||||
|
if (end == std::string::npos) |
||||
|
{ |
||||
|
break; |
||||
|
} |
||||
|
else |
||||
|
{ |
||||
|
encoded.append(1, '.'); |
||||
|
start = end + 1; |
||||
|
end = hostname.find('.', start); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
return encoded; |
||||
|
} |
||||
|
|
||||
|
std::string& Punycode::decode(std::string& str) |
||||
|
{ |
||||
|
// Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.2
|
||||
|
//
|
||||
|
// let n = initial_n
|
||||
|
// let i = 0
|
||||
|
// let bias = initial_bias
|
||||
|
// let output = an empty string indexed from 0
|
||||
|
punycode_uint n = INITIAL_N; |
||||
|
punycode_uint i = 0; |
||||
|
punycode_uint bias = INITIAL_BIAS; |
||||
|
std::vector<punycode_uint> codepoints; |
||||
|
|
||||
|
size_t index = str.rfind('-'); |
||||
|
if (index == std::string::npos) |
||||
|
{ |
||||
|
index = 0; |
||||
|
} |
||||
|
|
||||
|
// consume all code points before the last delimiter (if there is one)
|
||||
|
// and copy them to output, fail on any non-basic code point
|
||||
|
for (auto it = str.begin(); it != (str.begin() + index); ++it) |
||||
|
{ |
||||
|
if (static_cast<unsigned char>(*it) > 127U) |
||||
|
{ |
||||
|
throw std::invalid_argument("Argument has non-basic code points."); |
||||
|
} |
||||
|
codepoints.push_back(*it); |
||||
|
} |
||||
|
|
||||
|
// if more than zero code points were consumed then consume one more
|
||||
|
// (which will be the last delimiter)
|
||||
|
if (index > 0) |
||||
|
{ |
||||
|
index += 1; |
||||
|
} |
||||
|
|
||||
|
// while the input is not exhausted do begin
|
||||
|
for (auto it = (str.begin() + index); it != str.end(); ++it) |
||||
|
{ |
||||
|
// let oldi = i
|
||||
|
// let w = 1
|
||||
|
punycode_uint oldi = i; |
||||
|
punycode_uint w = 1; |
||||
|
|
||||
|
// for k = base to infinity in steps of base do begin
|
||||
|
for (punycode_uint k = BASE; ; k += BASE, ++it) |
||||
|
{ |
||||
|
// consume a code point, or fail if there was none to consume
|
||||
|
if (it == str.end()) |
||||
|
{ |
||||
|
throw std::invalid_argument("Premature termination"); |
||||
|
} |
||||
|
|
||||
|
// let digit = the code point's digit-value, fail if it has none
|
||||
|
int lookup = BASIC_TO_DIGIT[static_cast<size_t>(*it)]; |
||||
|
if (lookup == -1) |
||||
|
{ |
||||
|
throw std::invalid_argument("Invalid base 36 character."); |
||||
|
} |
||||
|
unsigned char digit = static_cast<unsigned char>(lookup); |
||||
|
|
||||
|
// let i = i + digit * w, fail on overflow
|
||||
|
if (digit > ((MAX_PUNYCODE_UINT - i) / w)) |
||||
|
{ |
||||
|
throw std::invalid_argument("Overflow on i."); |
||||
|
} |
||||
|
i += digit * w; |
||||
|
|
||||
|
// let t = tmin if k <= bias {+ tmin}, or
|
||||
|
// tmax if k >= bias + tmax, or k - bias otherwise
|
||||
|
punycode_uint t = k <= bias ? TMIN : |
||||
|
k >= bias + TMAX ? TMAX : k - bias; |
||||
|
|
||||
|
// if digit < t then break
|
||||
|
if (digit < t) |
||||
|
{ |
||||
|
break; |
||||
|
} |
||||
|
|
||||
|
// let w = w * (base - t), fail on overflow
|
||||
|
if (w > (MAX_PUNYCODE_UINT / (BASE - t))) |
||||
|
{ |
||||
|
// I believe this line is unreachable without first overflowing i.
|
||||
|
// Since 'i' is updated above as i += digit * w, and w is updated as
|
||||
|
// w = w * (BASE - t), we should like to keep (BASE - t) > digit to
|
||||
|
// give 'w' a chance to overflow first. To keep t minimized, we must
|
||||
|
// have 'bias' maximized. `bias` is driven by the 'adapt' function
|
||||
|
// below.
|
||||
|
//
|
||||
|
// The value returned by 'adapt' increases with the input delta, and
|
||||
|
// decreases with the input size. The delta is a function of the input
|
||||
|
// size as well, on the order of (delta_n * input size), and
|
||||
|
// legitimate delta_n values are limited to 0x10FFFF (the maximum
|
||||
|
// unicode codepoint). Even setting that aside, the maximum value that
|
||||
|
// adapt() can return is adapt(2 ** 32 - 1, 1, false) = 204.
|
||||
|
//
|
||||
|
// Using this bias, we could use the input (HERE) to get iterations:
|
||||
|
//
|
||||
|
// digit = b = 1, i = 2, k = 36, t = 1, w = 35
|
||||
|
// digit = b = 1, i = 37, k = 72, t = 1, w = 1225
|
||||
|
// digit = b = 1, i = 1262, k = 108, t = 1, w = 42875
|
||||
|
// digit = b = 1, i = 44137, k = 144, t = 1, w = 1500625
|
||||
|
// digit = b = 1, i = 1544762, k = 180, t = 1, w = 52521875
|
||||
|
//
|
||||
|
// At this point, t now becomes TMAX (26) because k exceeds the bias
|
||||
|
// (since the maximum bias is 204). As such, the minimum continuation
|
||||
|
// value is 26:
|
||||
|
//
|
||||
|
// digit = 0 = 26, i = 1367113512, k = 216, t = 26, w = 525218750
|
||||
|
//
|
||||
|
// However, the next iteration now overflows i before we can get to
|
||||
|
// the w update.
|
||||
|
throw std::invalid_argument("Overflow on w."); // LCOV_EXCL_LINE
|
||||
|
} |
||||
|
w *= (BASE - t); |
||||
|
} |
||||
|
|
||||
|
// let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?)
|
||||
|
bias = adapt(i - oldi, codepoints.size() + 1, oldi == 0); |
||||
|
|
||||
|
// let n = n + i div (length(output) + 1), fail on overflow
|
||||
|
if ((i / (codepoints.size() + 1)) > (MAX_PUNYCODE_UINT - n)) |
||||
|
{ |
||||
|
throw std::invalid_argument("Overflow on n."); |
||||
|
} |
||||
|
n += i / (codepoints.size() + 1); |
||||
|
|
||||
|
// let i = i mod (length(output) + 1)
|
||||
|
i %= (codepoints.size() + 1); |
||||
|
|
||||
|
// insert n into output at position i
|
||||
|
codepoints.insert(codepoints.begin() + i, n); |
||||
|
|
||||
|
// increment i
|
||||
|
++i; |
||||
|
} |
||||
|
|
||||
|
std::string output; |
||||
|
for (auto it = codepoints.begin(); it != codepoints.end(); ++it) |
||||
|
{ |
||||
|
Utf8::writeCodepoint(output, *it); |
||||
|
} |
||||
|
str.assign(output); |
||||
|
|
||||
|
return str; |
||||
|
} |
||||
|
|
||||
|
/**
 * Decode punycoded input into a new UTF-8 string, leaving the argument
 * untouched.
 *
 * Works on a private copy that is decoded in place and then returned.
 */
std::string Punycode::decode(const std::string& str)
{
    std::string decoded = str;
    Punycode::decode(decoded);
    return decoded;
}
||||
|
|
||||
|
/**
 * Decode a hostname one dot-separated segment at a time.
 *
 * Segments that begin with "xn--" have that prefix removed and the
 * remainder punycode-decoded. All other segments are copied as they are.
 */
std::string Punycode::decodeHostname(const std::string& hostname)
{
    std::string result;
    size_t begin = 0;
    for (;;)
    {
        // `dot` is npos on the last segment; substr() then takes the rest
        const size_t dot = hostname.find('.', begin);
        std::string label = hostname.substr(begin, dot - begin);

        if (label.compare(0, 4, "xn--") == 0)
        {
            // Drop the ACE prefix before decoding
            label.erase(0, 4);
            result += Punycode::decode(label);
        }
        else
        {
            result += label;
        }

        if (dot == std::string::npos)
        {
            break;
        }

        result += '.';
        begin = dot + 1;
    }

    return result;
}
||||
|
|
||||
|
/**
 * Report whether any byte of `str` has its high bit (0x80) set.
 */
bool Punycode::needsPunycoding(const std::string& str)
{
    for (const char ch : str)
    {
        if ((static_cast<unsigned char>(ch) & 0x80) != 0)
        {
            return true;
        }
    }
    return false;
}
||||
|
|
||||
|
/**
 * Bias adaptation function from RFC 3492, section 6.1
 * (https://tools.ietf.org/html/rfc3492#section-6.1).
 *
 * `delta` is taken by value, so the scaling done here never reaches the
 * caller's variable of the same name.
 */
Punycode::punycode_uint Punycode::adapt(
    punycode_uint delta, punycode_uint numpoints, bool firsttime)
{
    // Scale delta: divide by damp on the first call, by 2 afterwards
    if (firsttime)
    {
        delta /= DAMP;
    }
    else
    {
        delta /= 2;
    }

    // delta = delta + (delta div numpoints)
    delta += (delta / numpoints);

    // Divide delta by (base - tmin) until it is no larger than
    // ((base - tmin) * tmax) div 2, adding base to k for every division
    punycode_uint k = 0;
    while (delta > ((BASE - TMIN) * TMAX) / 2)
    {
        delta /= (BASE - TMIN);
        k += BASE;
    }

    // k + (((base - tmin + 1) * delta) div (delta + skew))
    return k + (((BASE - TMIN + 1) * delta) / (delta + SKEW));
}
||||
|
|
||||
|
}; |
@ -0,0 +1,105 @@ |
|||||
|
#ifndef PUNYCODE_CPP_H |
||||
|
#define PUNYCODE_CPP_H |
||||
|
|
||||
|
#include <cstdint>
#include <limits>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "utf8.h"
||||
|
|
||||
|
namespace Url |
||||
|
{ |
||||
|
|
||||
|
namespace Punycode |
||||
|
{ |
||||
|
typedef Utf8::codepoint_t punycode_uint; |
||||
|
|
||||
|
const unsigned int BASE = 36; |
||||
|
const unsigned int TMIN = 1; |
||||
|
const unsigned int TMAX = 26; |
||||
|
const unsigned int SKEW = 38; |
||||
|
const unsigned int DAMP = 700; |
||||
|
const unsigned int INITIAL_BIAS = 72; |
||||
|
const unsigned int INITIAL_N = 128; |
||||
|
|
||||
|
// Codepoints to their base-36 value
|
||||
|
const std::vector<int8_t> BASIC_TO_DIGIT = { |
||||
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
||||
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
||||
|
|
||||
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
||||
|
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1, |
||||
|
|
||||
|
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, |
||||
|
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, |
||||
|
|
||||
|
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, |
||||
|
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, |
||||
|
|
||||
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
||||
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
||||
|
|
||||
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
||||
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
||||
|
|
||||
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
||||
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
||||
|
|
||||
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
||||
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 |
||||
|
}; |
||||
|
const std::string DIGIT_TO_BASIC = "abcdefghijklmnopqrstuvwxyz0123456789"; |
||||
|
|
||||
|
// The highest codepoint in unicode
|
||||
|
const punycode_uint MAX_PUNYCODE_UINT = std::numeric_limits<punycode_uint>::max(); |
||||
|
//Utf8::MAX_CODEPOINT;
|
||||
|
//std::numeric_limits<punycode_uint>::max();
|
||||
|
|
||||
|
/**
|
||||
|
* Replace utf-8-encoded str into punycode. |
||||
|
*/ |
||||
|
std::string& encode(std::string& str); |
||||
|
|
||||
|
/**
|
||||
|
* Create a new punycoded string from utf-8-encoded input. |
||||
|
*/ |
||||
|
std::string encode(const std::string& str); |
||||
|
|
||||
|
/**
|
||||
|
* Encode a hostname. |
||||
|
*/ |
||||
|
std::string encodeHostname(const std::string& hostname); |
||||
|
|
||||
|
/**
|
||||
|
* Replace punycoded str into utf-8-encoded. |
||||
|
*/ |
||||
|
std::string& decode(std::string& str); |
||||
|
|
||||
|
/**
|
||||
|
* Create a new utf-8-encoded string from punycoded input. |
||||
|
*/ |
||||
|
std::string decode(const std::string& str); |
||||
|
|
||||
|
/**
|
||||
|
* Decode a hostname. |
||||
|
*/ |
||||
|
std::string decodeHostname(const std::string& hostname); |
||||
|
|
||||
|
/**
|
||||
|
* Determine if a string needs punycoding. |
||||
|
*/ |
||||
|
bool needsPunycoding(const std::string& str); |
||||
|
|
||||
|
/**
|
||||
|
* Internal function for calculating bias. |
||||
|
*/ |
||||
|
punycode_uint adapt( |
||||
|
punycode_uint delta, punycode_uint numpoints, bool firsttime); |
||||
|
|
||||
|
}; |
||||
|
|
||||
|
} |
||||
|
|
||||
|
#endif |
@ -0,0 +1,26 @@ |
|||||
|
#include <Rcpp.h> |
||||
|
using namespace Rcpp; |
||||
|
|
||||
|
#include "url.h" |
||||
|
#include "robots.h" |
||||
|
|
||||
|
//' Parse robots.txt
|
||||
|
//'
|
||||
|
//' @noRd
|
||||
|
//'
|
||||
|
// [[Rcpp::export]]
|
||||
|
SEXP rep_parse(std::string content) {
  // Parse the robots.txt text into a heap-allocated Robots object and hand
  // it back wrapped in an external pointer.
  Rep::Robots* robots = new Rep::Robots(content);
  Rcpp::XPtr<Rep::Robots> handle(robots);
  return handle;
}
||||
|
|
||||
|
|
||||
|
//' Path allowed
|
||||
|
//'
|
||||
|
//' @noRd
|
||||
|
//'
|
||||
|
// [[Rcpp::export]]
|
||||
|
bool rep_path_allowed(SEXP xp, std::string path, std::string agent = "*") {
  // Recover the Robots object behind the external pointer and ask it whether
  // `agent` may fetch `path`.
  Rcpp::XPtr<Rep::Robots> robots(xp);
  const bool permitted = robots->allowed(path, agent);
  return permitted;
}
@ -0,0 +1,188 @@ |
|||||
|
#include <algorithm> |
||||
|
#include <functional> |
||||
|
#include <cctype> |
||||
|
#include <locale> |
||||
|
#include <sstream> |
||||
|
#include <iostream> |
||||
|
#include <unordered_map> |
||||
|
|
||||
|
#include "url.h" |
||||
|
|
||||
|
#include "robots.h" |
||||
|
#include <Rcpp.h> |
||||
|
|
||||
|
namespace Rep |
||||
|
{ |
||||
|
|
||||
|
/**
 * Remove leading and trailing whitespace from `string`, in place.
 *
 * Whitespace is whatever std::isspace reports for the current C locale.
 */
void Robots::strip(std::string& string)
{
    // std::isspace requires a value representable as unsigned char (or EOF),
    // so the predicate takes unsigned char to keep bytes >= 0x80 from being
    // passed as negative values. A lambda also replaces std::ptr_fun and
    // std::not1, which are removed in C++17.
    const auto not_space = [](unsigned char c) { return std::isspace(c) == 0; };

    // Leading whitespace: erase up to the first non-space character
    string.erase(
        string.begin(),
        std::find_if(string.begin(), string.end(), not_space));

    // Trailing whitespace: erase from just after the last non-space character
    string.erase(
        std::find_if(string.rbegin(), string.rend(), not_space).base(),
        string.end());
}
||||
|
|
||||
|
bool Robots::getpair(std::istringstream& stream, std::string& key, std::string& value) |
||||
|
{ |
||||
|
while (getline(stream, key)) |
||||
|
{ |
||||
|
size_t index = key.find('#'); |
||||
|
if (index != std::string::npos) |
||||
|
{ |
||||
|
key.resize(index); |
||||
|
} |
||||
|
|
||||
|
// Find the colon and divide it into key and value, skipping malformed lines
|
||||
|
index = key.find(':'); |
||||
|
if (index == std::string::npos) |
||||
|
{ |
||||
|
continue; |
||||
|
} |
||||
|
|
||||
|
value.assign(key.begin() + index + 1, key.end()); |
||||
|
key.resize(index); |
||||
|
|
||||
|
// Strip whitespace off of each
|
||||
|
strip(key); |
||||
|
strip(value); |
||||
|
|
||||
|
// Lowercase the key
|
||||
|
std::transform(key.begin(), key.end(), key.begin(), ::tolower); |
||||
|
|
||||
|
return true; |
||||
|
} |
||||
|
return false; |
||||
|
} |
||||
|
|
||||
|
/**
 * Parse robots.txt content.
 *
 * A leading UTF-8 byte-order mark is skipped. Rules seen before any
 * user-agent line go to the "*" agent. In a run of consecutive user-agent
 * lines, the first names the agent that collects the rules that follow. The
 * remaining names are remembered in `group` and receive a copy of those
 * rules when the next run of user-agent lines starts, or at the end of the
 * input. Agent names are stored lowercased.
 *
 * A crawl-delay value that cannot be parsed as a float is reported on
 * Rcpp::Rcout and otherwise ignored.
 */
Robots::Robots(const std::string& content): agents_(), sitemaps_(), default_(agents_["*"])
{
    std::string agent_name("*");
    std::istringstream input(content);
    if (content.compare(0, 3, "\xEF\xBB\xBF") == 0)
    {
        input.ignore(3);
    }
    std::string key, value;
    std::vector<std::string> group;
    bool last_agent = false;

    // Track the agent being filled through a pointer, not an iterator:
    // inserting into an unordered_map may rehash, which invalidates iterators
    // but leaves pointers and references to elements valid. default_ refers
    // to the "*" entry.
    Agent* current = &default_;

    // Copy the current agent's rules to every other name in its group
    const auto flush_group = [&]()
    {
        for (const std::string& other : group)
        {
            agents_[other] = *current;
        }
    };

    while (Robots::getpair(input, key, value))
    {
        if (key.compare("user-agent") == 0)
        {
            // Store the user agent string as lowercased (std::tolower needs
            // an unsigned char value)
            std::transform(value.begin(), value.end(), value.begin(),
                [](unsigned char c) { return static_cast<char>(std::tolower(c)); });

            if (last_agent)
            {
                // Another name for the group that was just opened
                group.push_back(value);
            }
            else
            {
                // A new group starts: finish the previous one first
                if (!agent_name.empty())
                {
                    flush_group();
                    group.clear();
                }
                agent_name = value;
                current = &agents_.emplace(agent_name, Agent()).first->second;
            }
            last_agent = true;
            continue;
        }
        else
        {
            last_agent = false;
        }

        if (key.compare("sitemap") == 0)
        {
            sitemaps_.push_back(value);
        }
        else if (key.compare("disallow") == 0)
        {
            current->disallow(value);
        }
        else if (key.compare("allow") == 0)
        {
            current->allow(value);
        }
        else if (key.compare("crawl-delay") == 0)
        {
            try
            {
                current->delay(std::stof(value));
            }
            catch (const std::exception&)
            {
                Rcpp::Rcout << "Could not parse " << value << " as float." << std::endl;
            }
        }
    }

    // Finish the last group
    if (!agent_name.empty())
    {
        flush_group();
    }
}
||||
|
|
||||
|
/**
 * Look up the rules for a user agent.
 *
 * The name is lowercased before the lookup. When there is no entry for it,
 * the default ("*") agent is returned.
 */
const Agent& Robots::agent(const std::string& name) const
{
    // Lowercase the agent. std::tolower needs an unsigned char value, so
    // convert first instead of handing it a possibly negative char.
    std::string lowered(name);
    std::transform(lowered.begin(), lowered.end(), lowered.begin(),
        [](unsigned char c) { return static_cast<char>(std::tolower(c)); });

    const auto found = agents_.find(lowered);
    if (found != agents_.end())
    {
        return found->second;
    }
    return default_;
}
||||
|
|
||||
|
bool Robots::allowed(const std::string& path, const std::string& name) const |
||||
|
{ |
||||
|
return agent(name).allowed(path); |
||||
|
} |
||||
|
|
||||
|
std::string Robots::str() const |
||||
|
{ |
||||
|
std::stringstream out; |
||||
|
// TODO: include sitepath info
|
||||
|
out << '{'; |
||||
|
auto begin = agents_.begin(); |
||||
|
auto end = agents_.end(); |
||||
|
if (begin != end) |
||||
|
{ |
||||
|
out << '"' << begin->first << '"' << ": " << begin->second.str(); |
||||
|
++begin; |
||||
|
} |
||||
|
for (; begin != end; ++begin) |
||||
|
{ |
||||
|
out << ", \"" << begin->first << '"' << ": " << begin->second.str(); |
||||
|
} |
||||
|
out << '}'; |
||||
|
return out.str(); |
||||
|
} |
||||
|
|
||||
|
std::string Robots::robotsUrl(const std::string& url) |
||||
|
{ |
||||
|
return Url::Url(url) |
||||
|
.setUserinfo("") |
||||
|
.setPath("robots.txt") |
||||
|
.setParams("") |
||||
|
.setQuery("") |
||||
|
.setFragment("") |
||||
|
.remove_default_port() |
||||
|
.str(); |
||||
|
} |
||||
|
} |
@ -0,0 +1,69 @@ |
|||||
|
#ifndef ROBOTS_CPP_H |
||||
|
#define ROBOTS_CPP_H |
||||
|
|
||||
|
#include <sstream> |
||||
|
#include <unordered_map> |
||||
|
#include <vector> |
||||
|
|
||||
|
#include "agent.h" |
||||
|
|
||||
|
namespace Rep |
||||
|
{ |
||||
|
|
||||
|
class Robots |
||||
|
{ |
||||
|
public: |
||||
|
typedef std::unordered_map<std::string, Agent> agent_map_t; |
||||
|
typedef std::vector<std::string> sitemaps_t; |
||||
|
|
||||
|
/**
|
||||
|
* Create a robots.txt from a utf-8-encoded string. |
||||
|
*/ |
||||
|
Robots(const std::string& content); |
||||
|
|
||||
|
/**
|
||||
|
* Instantiate a Robots object. |
||||
|
*/ |
||||
|
Robots( |
||||
|
const agent_map_t& agents, |
||||
|
const sitemaps_t& sitemaps) |
||||
|
: agents_(agents) |
||||
|
, sitemaps_(sitemaps) |
||||
|
, default_(agents_["*"]) {} |
||||
|
|
||||
|
/**
|
||||
|
* Get the sitemaps in this robots.txt |
||||
|
*/ |
||||
|
const sitemaps_t& sitemaps() const { return sitemaps_; } |
||||
|
|
||||
|
/**
|
||||
|
* Get the agent with the corresponding name. |
||||
|
*/ |
||||
|
const Agent& agent(const std::string& name) const; |
||||
|
|
||||
|
/**
|
||||
|
* Return true if agent is allowed to fetch the URL (either a |
||||
|
* full URL or a path). |
||||
|
*/ |
||||
|
bool allowed(const std::string& path, const std::string& name) const; |
||||
|
|
||||
|
std::string str() const; |
||||
|
|
||||
|
/**
|
||||
|
* Return the robots.txt URL corresponding to the provided URL. |
||||
|
*/ |
||||
|
static std::string robotsUrl(const std::string& url); |
||||
|
|
||||
|
private: |
||||
|
static void strip(std::string& string); |
||||
|
|
||||
|
static bool getpair( |
||||
|
std::istringstream& stream, std::string& key, std::string& value); |
||||
|
|
||||
|
agent_map_t agents_; |
||||
|
sitemaps_t sitemaps_; |
||||
|
Agent& default_; |
||||
|
}; |
||||
|
} |
||||
|
|
||||
|
#endif |
@ -0,0 +1,962 @@ |
|||||
|
#include <algorithm> |
||||
|
#include <string> |
||||
|
#include <iterator> |
||||
|
#include <unordered_map> |
||||
|
#include <unordered_set> |
||||
|
#include <iostream> |
||||
|
#include <iterator> |
||||
|
#include <sstream> |
||||
|
|
||||
|
#include "url.h" |
||||
|
#include "punycode.h" |
||||
|
|
||||
|
namespace Url |
||||
|
{ |
||||
|
|
||||
|
/* Character classes */
// Each class is built from the literal set of characters it contains. The
// names appear to follow the grammar rules of RFC 3986 (gen-delims,
// sub-delims, unreserved, pchar, ...) -- TODO confirm.
// Later classes are composed from earlier ones via chars(), so the order of
// these definitions matters.
const CharacterClass Url::GEN_DELIMS(":/?#[]@");
const CharacterClass Url::SUB_DELIMS("!$&'()*+,;=");
const CharacterClass Url::DIGIT("0123456789");
const CharacterClass Url::ALPHA(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
// ALPHA + DIGIT + "-._~"
const CharacterClass Url::UNRESERVED(
    Url::ALPHA.chars() + Url::DIGIT.chars() + "-._~");
// GEN_DELIMS + SUB_DELIMS
const CharacterClass Url::RESERVED(
    Url::GEN_DELIMS.chars() + Url::SUB_DELIMS.chars());
// UNRESERVED + SUB_DELIMS + ":@"
const CharacterClass Url::PCHAR(
    Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":@");
// PCHAR + "/"
const CharacterClass Url::PATH(
    Url::PCHAR.chars() + "/");
// PCHAR + "/?" (QUERY and FRAGMENT hold the same characters)
const CharacterClass Url::QUERY(
    Url::PCHAR.chars() + "/?");
const CharacterClass Url::FRAGMENT(
    Url::PCHAR.chars() + "/?");
// UNRESERVED + SUB_DELIMS + ":"
const CharacterClass Url::USERINFO(
    Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":");
// NOTE(review): uppercase hex digits only, whereas HEX_TO_DEC also accepts
// lowercase a-f -- confirm this is intended.
const CharacterClass Url::HEX("0123456789ABCDEF");
// Letters, digits and "+-."
const CharacterClass Url::SCHEME(
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+-.");
||||
|
// Value of each of the 256 byte values when read as a hexadecimal digit:
// '0'-'9' give 0-9, 'A'-'F' and 'a'-'f' give 10-15, and every other byte
// gives -1. Each line below covers 16 consecutive byte values.
const std::vector<signed char> Url::HEX_TO_DEC = {
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1,

    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
||||
|
// Port number associated with a scheme (presumably the scheme's default
// port -- confirm against the code that reads this table).
const std::unordered_map<std::string, int> Url::PORTS = {
    {"http", 80},
    {"https", 443}
};
// Scheme names in this set, including the empty scheme. The USES_* names
// and contents look like the uses_relative / uses_netloc / uses_params lists
// of Python's urllib.parse -- TODO confirm.
const std::unordered_set<std::string> Url::USES_RELATIVE = {
    "",
    "file",
    "ftp",
    "gopher",
    "http",
    "https",
    "imap",
    "mms",
    "nntp",
    "prospero",
    "rtsp",
    "rtspu",
    "sftp",
    "shttp",
    "svn",
    "svn+ssh",
    "wais"
};
// Schemes listed as using a network location (presumably the
// "//host" part -- verify against the users of this set)
const std::unordered_set<std::string> Url::USES_NETLOC = {
    "",
    "file",
    "ftp",
    "git",
    "git+ssh",
    "gopher",
    "http",
    "https",
    "imap",
    "mms",
    "nfs",
    "nntp",
    "prospero",
    "rsync",
    "rtsp",
    "rtspu",
    "sftp",
    "shttp",
    "snews",
    "svn",
    "svn+ssh",
    "telnet",
    "wais"
};
// Schemes listed as using path parameters (presumably the ";params"
// part -- verify against the users of this set)
const std::unordered_set<std::string> Url::USES_PARAMS = {
    "",
    "ftp",
    "hdl",
    "http",
    "https",
    "imap",
    "mms",
    "prospero",
    "rtsp",
    "rtspu",
    "sftp",
    "shttp",
    "sip",
    "sips",
    "tel"
};
// Schemes the URL constructor accepts as a scheme even when everything after
// the ':' is digits, a form that could otherwise be read as "host:port".
const std::unordered_set<std::string> Url::KNOWN_PROTOCOLS = {
    "",
    "file",
    "ftp",
    "git",
    "git+ssh",
    "gopher",
    "hdl",
    "http",
    "https",
    "imap",
    "mms",
    "nfs",
    "nntp",
    "prospero",
    "rsync",
    "rtsp",
    "rtspu",
    "sftp",
    "shttp",
    "sip",
    "sips",
    "sms",
    "snews",
    "svn",
    "svn+ssh",
    "tel",
    "telnet",
    "wais"
};
||||
|
|
||||
|
/**
 * Parse `url` into scheme, userinfo, host, port, path, params, query and
 * fragment.
 *
 * The scheme and host are stored lowercased. A missing or empty port is stored
 * as 0. Params are only split off the path for schemes in USES_PARAMS.
 *
 * @param url The text to parse.
 * @throws UrlParseException if the port is not a number, is negative, or is
 *         greater than 65535.
 */
Url::Url(const std::string& url): port_(0), has_params_(false), has_query_(false)
{
    // The <cctype> functions require a value representable as unsigned char;
    // handing them a negative plain char (any byte >= 0x80 where char is
    // signed) is undefined behavior, so convert before calling ::tolower.
    auto to_lower = [](unsigned char c) { return static_cast<char>(::tolower(c)); };

    size_t position = 0;
    size_t index = url.find(':');

    // Everything before the first ':' is a scheme candidate, provided all of
    // its characters are in SCHEME.
    if (index != std::string::npos
        && std::all_of(url.begin(), url.begin() + index,
                       [](char c) { return SCHEME(c); }))
    {
        scheme_.assign(url, 0, index);
        std::transform(scheme_.begin(), scheme_.end(), scheme_.begin(), to_lower);

        // "name:1234" is ambiguous (it could be host:port). In that case the
        // candidate is only accepted when it is a known protocol.
        const bool only_digits_follow =
            (index + 1) < url.length()
            && std::all_of(url.begin() + index + 1, url.end(),
                           [](char c) { return DIGIT(c); });

        if (!only_digits_follow
            || KNOWN_PROTOCOLS.find(scheme_) != KNOWN_PROTOCOLS.end())
        {
            position = index + 1;
        }
        else
        {
            scheme_.clear();
        }
    }

    // Search for the netloc. Both characters of the "//" marker must exist
    // before they are inspected.
    if ((url.length() - position) >= 2
        && url[position] == '/'
        && url[position + 1] == '/')
    {
        // Skip the '//'
        position += 2;
        index = url.find_first_of("/?#", position);
        // When no delimiter follows, index is npos and the count below is
        // clamped to the end of the string.
        host_.assign(url, position, index - position);
        position = index;

        // Extract any userinfo if there is any
        index = host_.find('@');
        if (index != std::string::npos)
        {
            userinfo_.assign(host_, 0, index);
            host_.erase(0, index + 1);
        }

        // Lowercase the hostname
        std::transform(host_.begin(), host_.end(), host_.begin(), to_lower);

        // Try to find a port
        index = host_.find(':');
        if (index != std::string::npos)
        {
            std::string portText(host_, index + 1, std::string::npos);
            host_.resize(index);

            if (portText.empty())
            {
                port_ = 0;
            }
            else
            {
                try
                {
                    // `index` receives the number of characters consumed.
                    port_ = std::stoi(portText, &index);

                    if (index != portText.length())
                    {
                        // Trailing characters after the number
                        throw UrlParseException("Port not a number: " + portText);
                    }

                    if (port_ > 65535)
                    {
                        throw UrlParseException("Port too high: " + portText);
                    }
                    else if (port_ < 0)
                    {
                        throw UrlParseException("Port negative: " + portText);
                    }
                }
                catch (const std::invalid_argument&)
                {
                    // No leading number at all
                    throw UrlParseException("Port not a number: " + portText);
                }
                catch (const std::out_of_range&)
                {
                    throw UrlParseException("Port out of integer range: " + portText);
                }
            }
        }
    }

    // position is npos when the netloc ran to the end of the input.
    if (position != std::string::npos)
    {
        path_.assign(url, position, std::string::npos);

        // Peel components off the end of the path: fragment, then query, then
        // params.
        index = path_.find('#');
        if (index != std::string::npos)
        {
            fragment_.assign(path_, index + 1, std::string::npos);
            path_.resize(index);
        }

        index = path_.find('?');
        if (index != std::string::npos)
        {
            query_.assign(path_, index + 1, std::string::npos);
            has_query_ = true;
            path_.resize(index);
        }

        if (USES_PARAMS.find(scheme_) != USES_PARAMS.end())
        {
            index = path_.find(';');
            if (index != std::string::npos)
            {
                params_.assign(path_, index + 1, std::string::npos);
                has_params_ = true;
                path_.resize(index);
            }
        }
    }
}
||||
|
|
||||
|
/**
 * Copy every component of `other` into this URL.
 *
 * @return A reference to this object.
 */
Url& Url::assign(const Url& other)
{
    *this = other;
    return *this;
}
||||
|
|
||||
|
bool Url::operator==(const Url& other) const |
||||
|
{ |
||||
|
return ( |
||||
|
(scheme_ == other.scheme_ ) && |
||||
|
(userinfo_ == other.userinfo_ ) && |
||||
|
(host_ == other.host_ ) && |
||||
|
(port_ == other.port_ ) && |
||||
|
(path_ == other.path_ ) && |
||||
|
(params_ == other.params_ ) && |
||||
|
(query_ == other.query_ ) && |
||||
|
(fragment_ == other.fragment_ ) && |
||||
|
(has_params_ == other.has_params_) && |
||||
|
(has_query_ == other.has_query_ ) |
||||
|
); |
||||
|
} |
||||
|
|
||||
|
bool Url::operator!=(const Url& other) const |
||||
|
{ |
||||
|
return !operator==(other); |
||||
|
} |
||||
|
|
||||
|
bool Url::equiv(const Url& other) |
||||
|
{ |
||||
|
Url self_(*this); |
||||
|
Url other_(other); |
||||
|
|
||||
|
self_.strip() |
||||
|
.sort_query() |
||||
|
.defrag() |
||||
|
.deuserinfo() |
||||
|
.abspath() |
||||
|
.escape() |
||||
|
.punycode() |
||||
|
.remove_default_port(); |
||||
|
other_.strip() |
||||
|
.sort_query() |
||||
|
.defrag() |
||||
|
.deuserinfo() |
||||
|
.abspath() |
||||
|
.escape() |
||||
|
.punycode() |
||||
|
.remove_default_port(); |
||||
|
return self_ == other_; |
||||
|
} |
||||
|
|
||||
|
/**
 * Collapse runs of `chr` in `str` to a single instance and drop leading and
 * trailing instances. Works in place.
 *
 * @return A reference to `str`.
 */
std::string& Url::remove_repeats(std::string& str, const char chr)
{
    size_t write = 0;
    // Starting as "previous was chr" is what discards leading instances.
    bool previous_was_chr = true;
    for (size_t read = 0; read < str.length(); ++read)
    {
        const char current = str[read];
        const bool is_chr = (current == chr);
        if (!(previous_was_chr && is_chr))
        {
            str[write++] = current;
        }
        previous_was_chr = is_chr;
    }

    // At most one trailing chr can have survived the compaction above.
    if (write > 0 && str[write - 1] == chr)
    {
        --write;
    }
    str.resize(write);
    return str;
}
||||
|
|
||||
|
std::string Url::fullpath() const |
||||
|
{ |
||||
|
std::string result; |
||||
|
if (path_.empty() || path_[0] != '/') |
||||
|
{ |
||||
|
result.append(1, '/'); |
||||
|
} |
||||
|
result.append(path_); |
||||
|
|
||||
|
if (has_params_) |
||||
|
{ |
||||
|
result.append(";"); |
||||
|
result.append(params_); |
||||
|
} |
||||
|
|
||||
|
if (has_query_) |
||||
|
{ |
||||
|
result.append("?"); |
||||
|
result.append(query_); |
||||
|
} |
||||
|
|
||||
|
if (!fragment_.empty()) |
||||
|
{ |
||||
|
result.append("#"); |
||||
|
result.append(fragment_); |
||||
|
} |
||||
|
return result; |
||||
|
} |
||||
|
|
||||
|
std::string Url::str() const |
||||
|
{ |
||||
|
std::string result; |
||||
|
|
||||
|
if (!scheme_.empty()) |
||||
|
{ |
||||
|
result.append(scheme_); |
||||
|
if (USES_NETLOC.find(scheme_) == USES_NETLOC.end()) |
||||
|
{ |
||||
|
result.append(":"); |
||||
|
} |
||||
|
else |
||||
|
{ |
||||
|
result.append("://"); |
||||
|
} |
||||
|
} |
||||
|
else if (!host_.empty()) |
||||
|
{ |
||||
|
result.append("//"); |
||||
|
} |
||||
|
|
||||
|
if (!userinfo_.empty()) |
||||
|
{ |
||||
|
result.append(userinfo_); |
||||
|
result.append("@"); |
||||
|
} |
||||
|
|
||||
|
if (!host_.empty()) |
||||
|
{ |
||||
|
result.append(host_); |
||||
|
} |
||||
|
|
||||
|
if (port_) |
||||
|
{ |
||||
|
result.append(":"); |
||||
|
result.append(std::to_string(port_)); |
||||
|
} |
||||
|
|
||||
|
if (path_.empty()) |
||||
|
{ |
||||
|
if (!result.empty()) |
||||
|
{ |
||||
|
result.append("/"); |
||||
|
} |
||||
|
} |
||||
|
else |
||||
|
{ |
||||
|
if (!host_.empty() && path_[0] != '/') |
||||
|
{ |
||||
|
result.append(1, '/'); |
||||
|
} |
||||
|
result.append(path_); |
||||
|
} |
||||
|
|
||||
|
if (has_params_) |
||||
|
{ |
||||
|
result.append(";"); |
||||
|
result.append(params_); |
||||
|
} |
||||
|
|
||||
|
if (has_query_) |
||||
|
{ |
||||
|
result.append("?"); |
||||
|
result.append(query_); |
||||
|
} |
||||
|
|
||||
|
if (!fragment_.empty()) |
||||
|
{ |
||||
|
result.append("#"); |
||||
|
result.append(fragment_); |
||||
|
} |
||||
|
|
||||
|
return result; |
||||
|
} |
||||
|
|
||||
|
/**
 * Remove leading '?' characters from the query, then collapse repeated,
 * leading and trailing '&' in the query and ';' in the params.
 *
 * @return A reference to this object.
 */
Url& Url::strip()
{
    // find_first_not_of yields npos for an all-'?' query, in which case the
    // erase removes everything.
    query_.erase(0, query_.find_first_not_of('?'));

    remove_repeats(query_, '&');
    setQuery(query_);

    remove_repeats(params_, ';');
    setParams(params_);

    return *this;
}
||||
|
|
||||
|
/**
 * Resolve '.', '..' and empty segments in the path, in place.
 *
 * @return A reference to this object.
 */
Url& Url::abspath()
{
    std::string resolved;
    // Offsets into `resolved` at which each retained segment starts, so that
    // '..' can truncate back to the start of the previous one.
    std::vector<size_t> starts;

    if (!path_.empty() && path_[0] == '/')
    {
        resolved.append(1, '/');
        starts.push_back(0);
    }

    // Walk every '/'-terminated segment.
    size_t segment = 0;
    size_t slash = path_.find('/');
    while (slash != std::string::npos)
    {
        const size_t length = slash - segment;
        const bool is_dot = (length == 1) && path_[segment] == '.';
        const bool is_dotdot = (length == 2)
            && path_[segment] == '.'
            && path_[segment + 1] == '.';

        if (is_dotdot)
        {
            if (!starts.empty())
            {
                resolved.resize(starts.back());
                starts.pop_back();
            }
        }
        else if (length != 0 && !is_dot)
        {
            // An ordinary segment: keep it. Empty and '.' segments are dropped.
            starts.push_back(resolved.length());
            resolved.append(path_, segment, length);
            resolved.append(1, '/');
        }

        segment = slash + 1;
        slash = path_.find('/', slash + 1);
    }

    // Handle whatever follows the last '/'. This alone decides whether the
    // result names a directory (keeps its trailing '/').
    const size_t tail = path_.length() - segment;
    bool directory = true;
    if (tail == 0)
    {
        // The path ended with '/'.
    }
    else if (tail == 1 && path_[segment] == '.')
    {
        // A trailing '.' refers to the directory itself.
    }
    else if (tail == 2 && path_[segment] == '.' && path_[segment + 1] == '.')
    {
        if (!starts.empty())
        {
            resolved.resize(starts.back());
        }
    }
    else
    {
        resolved.append(path_, segment, tail);
        resolved.append(1, '/');
        directory = false;
    }

    if (!directory && !resolved.empty())
    {
        // Drop the '/' appended after the final file-like segment.
        resolved.resize(resolved.size() - 1);
    }
    else if (directory && resolved.empty())
    {
        resolved.append(1, '/');
    }
    path_.assign(resolved);

    return *this;
}
||||
|
|
||||
|
/**
 * Resolve this URL against the base URL `other`, storing the result in this
 * object.
 *
 * @return A reference to this object.
 */
Url& Url::relative_to(const Url& other)
{
    // Schemes without relative-reference support are left untouched.
    if (USES_RELATIVE.count(scheme_) == 0)
    {
        return *this;
    }

    // Scheme-relative URLs inherit the base scheme.
    if (scheme_.empty())
    {
        scheme_ = other.scheme_;
    }

    // With a host of its own the URL is already absolute (or scheme-relative).
    if (!host_.empty())
    {
        return *this;
    }

    // Otherwise the authority comes from the base.
    host_ = other.host_;
    port_ = other.port_;
    userinfo_ = other.userinfo_;

    if (path_.empty())
    {
        if (!params_.empty())
        {
            // Only params were given: keep the base directory. rfind yields
            // npos when the base has no '/', and npos + 1 wraps to 0.
            path_.assign(other.path_, 0, other.path_.rfind('/') + 1);
        }
        else
        {
            path_ = other.path_;
            params_ = other.params_;
            has_params_ = other.has_params_;
            if (query_.empty())
            {
                query_ = other.query_;
                has_query_ = other.has_query_;
            }
        }

        if (fragment_.empty())
        {
            fragment_ = other.fragment_;
        }
        return *this;
    }

    // An absolute path needs no further resolution.
    if (path_[0] == '/')
    {
        return *this;
    }

    // A relative path is appended to the base path's directory.
    const size_t last_slash = other.path_.rfind('/');
    if (last_slash != std::string::npos)
    {
        std::string joined(other.path_, 0, last_slash + 1);
        joined += path_;
        path_ = joined;
    }
    else if (!host_.empty())
    {
        path_ = "/" + path_;
    }

    return *this;
}
||||
|
|
||||
|
/**
 * Escape the path, query, params and userinfo in place. The params are
 * escaped with the same character class as the query.
 *
 * @param strict Forwarded to the per-string escape.
 * @return A reference to this object.
 */
Url& Url::escape(bool strict)
{
    escape(path_,     PATH,     strict);
    escape(query_,    QUERY,    strict);
    escape(params_,   QUERY,    strict);
    escape(userinfo_, USERINFO, strict);
    return *this;
}
||||
|
|
||||
|
std::string& Url::escape(std::string& str, const CharacterClass& safe, bool strict) |
||||
|
{ |
||||
|
std::string copy(str); |
||||
|
size_t dest = 0; |
||||
|
// Allocate space pessimistically -- if every entity is expanded, it will take 3x
|
||||
|
// the space.
|
||||
|
str.resize(str.length() * 3); |
||||
|
for (size_t src = 0; src < copy.length(); ++src) |
||||
|
{ |
||||
|
if (copy[src] == '%' && (copy.length() - src) >= 2) |
||||
|
{ |
||||
|
// Read ahead to see if there's a valid escape sequence. If not, treat
|
||||
|
// this like a normal character.
|
||||
|
if (HEX_TO_DEC[copy[src+1]] != -1 && HEX_TO_DEC[copy[src+2]] != -1) |
||||
|
{ |
||||
|
int value = ( |
||||
|
HEX_TO_DEC[copy[src+1]] * 16 + HEX_TO_DEC[copy[src+2]]); |
||||
|
|
||||
|
// In strict mode, we can only unescape parameters if they are both
|
||||
|
// safe and node reserved
|
||||
|
if (!strict || (strict && safe(value) && !RESERVED(value))) |
||||
|
{ |
||||
|
// Replace src + 2 with that byte, advance src to consume it and
|
||||
|
// continue.
|
||||
|
src += 2; |
||||
|
copy[src] = value; |
||||
|
} |
||||
|
else |
||||
|
{ |
||||
|
str[dest++] = copy[src++]; |
||||
|
str[dest++] = ::toupper(copy[src++]); |
||||
|
str[dest++] = ::toupper(copy[src]); |
||||
|
continue; |
||||
|
} |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (!safe(copy[src])) |
||||
|
{ |
||||
|
// Not safe -- replace with %XX
|
||||
|
str[dest++] = '%'; |
||||
|
str[dest++] = HEX.chars()[(copy[src] >> 4) & 0xF]; |
||||
|
str[dest++] = HEX.chars()[copy[src] & 0xF]; |
||||
|
} |
||||
|
else |
||||
|
{ |
||||
|
str[dest++] = copy[src]; |
||||
|
} |
||||
|
} |
||||
|
str.resize(dest); |
||||
|
return str; |
||||
|
} |
||||
|
|
||||
|
/**
 * Decode percent-escapes in the path, query, params and userinfo, in place.
 *
 * @return A reference to this object.
 */
Url& Url::unescape()
{
    unescape(path_);
    unescape(query_);
    unescape(params_);
    unescape(userinfo_);
    return *this;
}
||||
|
|
||||
|
std::string& Url::unescape(std::string& str) |
||||
|
{ |
||||
|
std::string copy(str); |
||||
|
size_t dest = 0; |
||||
|
for (size_t src = 0; src < copy.length(); ++src, ++dest) |
||||
|
{ |
||||
|
if (copy[src] == '%' && (copy.length() - src) >= 2) |
||||
|
{ |
||||
|
// Read ahead to see if there's a valid escape sequence. If not, treat
|
||||
|
// this like a normal character.
|
||||
|
if (HEX_TO_DEC[copy[src+1]] != -1 && HEX_TO_DEC[copy[src+2]] != -1) |
||||
|
{ |
||||
|
int value = ( |
||||
|
HEX_TO_DEC[copy[src+1]] * 16 + HEX_TO_DEC[copy[src+2]]); |
||||
|
|
||||
|
// Replace src + 2 with that byte, advance src to consume it and
|
||||
|
// continue.
|
||||
|
src += 2; |
||||
|
str[dest] = value; |
||||
|
continue; |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
// Either not a % or an incomplete entity
|
||||
|
str[dest] = copy[src]; |
||||
|
} |
||||
|
str.resize(dest); |
||||
|
return str; |
||||
|
} |
||||
|
|
||||
|
/**
 * Remove every query and params entry whose lowercased name is in
 * `blacklist`.
 *
 * @param blacklist Names to remove; expected to be lowercase already.
 * @return A reference to this object.
 */
Url& Url::deparam(const std::unordered_set<std::string>& blacklist)
{
    // The predicate is only used during this call, so the blacklist is
    // captured by reference. It is wrapped in a std::function once instead of
    // once per remove_params call, so the set is never copied.
    const deparam_predicate predicate =
        [&blacklist](std::string& name, const std::string&)
        {
            // ::tolower is only defined for values representable as unsigned
            // char, so convert each character before calling it.
            std::transform(
                name.begin(), name.end(), name.begin(),
                [](unsigned char c) { return static_cast<char>(::tolower(c)); });
            return blacklist.find(name) != blacklist.end();
        };

    setQuery(remove_params(query_, predicate, '&'));
    setParams(remove_params(params_, predicate, ';'));
    return *this;
}
||||
|
|
||||
|
/**
 * Remove every query ('&'-separated) and params (';'-separated) entry for
 * which `predicate(name, value)` returns true.
 *
 * @return A reference to this object.
 */
Url& Url::deparam(const deparam_predicate& predicate)
{
    remove_params(query_, predicate, '&');
    setQuery(query_);

    remove_params(params_, predicate, ';');
    setParams(params_);

    return *this;
}
||||
|
|
||||
|
std::string& Url::remove_params(std::string& str, |
||||
|
const deparam_predicate& predicate, |
||||
|
char sep) |
||||
|
{ |
||||
|
std::string copy; |
||||
|
std::string piece; |
||||
|
std::string name; |
||||
|
std::string value; |
||||
|
size_t previous = 0; |
||||
|
for (size_t index = str.find(sep) |
||||
|
; index != std::string::npos |
||||
|
; previous = index + 1, index = str.find(sep, previous)) |
||||
|
{ |
||||
|
piece.assign(str, previous, index - previous); |
||||
|
size_t position = piece.find('='); |
||||
|
name.assign(piece, 0, position); |
||||
|
value.clear(); |
||||
|
if (position != std::string::npos) |
||||
|
{ |
||||
|
value.assign(piece, position + 1, std::string::npos); |
||||
|
} |
||||
|
|
||||
|
if (!predicate(name, value)) |
||||
|
{ |
||||
|
copy.append(copy.empty() ? 0 : 1, sep); |
||||
|
copy.append(piece); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
if (previous < str.length()) |
||||
|
{ |
||||
|
piece.assign(str, previous, std::string::npos); |
||||
|
size_t position = piece.find('='); |
||||
|
name.assign(piece, 0, position); |
||||
|
value.clear(); |
||||
|
if (position != std::string::npos) |
||||
|
{ |
||||
|
value.assign(piece, position + 1, std::string::npos); |
||||
|
} |
||||
|
|
||||
|
if (!predicate(name, value)) |
||||
|
{ |
||||
|
copy.append(copy.empty() ? 0 : 1, sep); |
||||
|
copy.append(piece); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
str.assign(copy); |
||||
|
return str; |
||||
|
} |
||||
|
|
||||
|
/**
 * Sort the '&'-separated query pieces and the ';'-separated params pieces.
 *
 * @return A reference to this object.
 */
Url& Url::sort_query()
{
    split_sort_join(query_,  '&');
    split_sort_join(params_, ';');
    return *this;
}
||||
|
|
||||
|
/**
 * Split `str` on `glue`, sort the pieces, and join them back with `glue`.
 *
 * An empty piece after a trailing `glue` is not counted. A string with a
 * single piece is returned untouched.
 *
 * @return A reference to `str`.
 */
std::string& Url::split_sort_join(std::string& str, const char glue)
{
    if (str.empty())
    {
        return str;
    }

    // Split. Stopping once the cursor reaches the end is what drops the empty
    // piece that would follow a trailing separator.
    std::vector<std::string> pieces;
    size_t begin = 0;
    while (begin < str.size())
    {
        const size_t end = str.find(glue, begin);
        if (end == std::string::npos)
        {
            pieces.push_back(str.substr(begin));
            break;
        }
        pieces.push_back(str.substr(begin, end - begin));
        begin = end + 1;
    }

    // Nothing to reorder.
    if (pieces.size() == 1)
    {
        return str;
    }

    std::sort(pieces.begin(), pieces.end());

    // Join.
    std::string joined;
    for (size_t i = 0; i < pieces.size(); ++i)
    {
        if (i != 0)
        {
            joined += glue;
        }
        joined += pieces[i];
    }
    str.assign(joined);
    return str;
}
||||
|
|
||||
|
/**
 * Reset the port to 0 (no port) when it equals the PORTS entry for the
 * scheme.
 *
 * @return A reference to this object.
 */
Url& Url::remove_default_port()
{
    // Nothing to do without both a port and a scheme.
    if (port_ == 0 || scheme_.empty())
    {
        return *this;
    }

    const auto known = PORTS.find(scheme_);
    if (known != PORTS.end() && known->second == port_)
    {
        port_ = 0;
    }
    return *this;
}
||||
|
|
||||
|
/**
 * Discard the userinfo component.
 *
 * @return A reference to this object.
 */
Url& Url::deuserinfo()
{
    userinfo_.clear();
    return *this;
}
||||
|
|
||||
|
/**
 * Discard the fragment component.
 *
 * @return A reference to this object.
 */
Url& Url::defrag()
{
    fragment_.clear();
    return *this;
}
||||
|
|
||||
|
/**
 * Replace the host with the result of Punycode::encodeHostname.
 *
 * The host is validated with check_hostname both before and after encoding,
 * so an exception from that check propagates to the caller.
 *
 * @return A reference to this object.
 */
Url& Url::punycode()
{
    check_hostname(host_);

    std::string ascii_host(Punycode::encodeHostname(host_));
    check_hostname(ascii_host);

    host_ = ascii_host;
    return *this;
}
||||
|
|
||||
|
/**
 * Replace the host with the result of Punycode::decodeHostname.
 *
 * @return A reference to this object.
 */
Url& Url::unpunycode()
{
    host_ = Punycode::decodeHostname(host_);
    return *this;
}
||||
|
|
||||
|
/**
 * Reverse the order of the host's '.'-separated labels (a.b.c.d => d.c.b.a).
 *
 * @return A reference to this object.
 */
Url& Url::host_reversed()
{
    // Reversing the whole string puts the labels in the right order but
    // spells each one backwards; reversing each label again restores it.
    std::reverse(host_.begin(), host_.end());

    std::string::iterator label = host_.begin();
    while (label != host_.end())
    {
        const std::string::iterator dot = std::find(label, host_.end(), '.');
        std::reverse(label, dot);
        if (dot == host_.end())
        {
            break;
        }
        label = dot + 1;
    }
    return *this;
}
||||
|
|
||||
|
/**
 * Validate the '.'-separated labels of `host`.
 *
 * An empty host is accepted. A single trailing '.' is removed from `host`.
 *
 * @throws std::invalid_argument if a label is longer than 63 characters, or
 *         if a label followed by a '.' is empty.
 */
void Url::check_hostname(std::string& host)
{
    if (host.empty())
    {
        return;
    }

    // Check every label that is terminated by a '.'.
    size_t label = 0;
    for (size_t dot = host.find('.')
        ; dot != std::string::npos
        ; dot = host.find('.', label))
    {
        const size_t length = dot - label;
        if (length > 63)
        {
            throw std::invalid_argument("Label too long.");
        }
        if (length == 0)
        {
            throw std::invalid_argument("Empty label.");
        }
        label = dot + 1;
    }

    // The final label has no terminating '.'.
    const size_t tail = host.size() - label;
    if (tail > 63)
    {
        throw std::invalid_argument("Label too long.");
    }
    if (tail == 0 && label > 1)
    {
        // The host ended with '.': drop that trailing dot.
        host.resize(label - 1);
    }
}
||||
|
|
||||
|
}; |
@ -0,0 +1,323 @@ |
|||||
|
#ifndef URL_CPP_H |
||||
|
#define URL_CPP_H |
||||
|
|
||||
|
#include <stdexcept> |
||||
|
#include <functional> |
||||
|
#include <string> |
||||
|
#include <vector> |
||||
|
#include <unordered_map> |
||||
|
#include <unordered_set> |
||||
|
|
||||
|
namespace Url |
||||
|
{ |
||||
|
|
||||
|
/**
 * Thrown by the Url parsing constructor when the input cannot be parsed
 * (for example, a malformed port).
 */
struct UrlParseException : public std::logic_error
{
    UrlParseException(const std::string& message)
        : std::logic_error(message)
    {
    }
};
||||
|
|
||||
|
/**
 * A set of byte values, built from a string, with constant-time membership
 * tests.
 */
struct CharacterClass
{
    /**
     * Build the class containing exactly the characters of `chars`.
     */
    CharacterClass(const std::string& chars) : chars_(chars), map_(256, false)
    {
        for (auto it = chars_.begin(); it != chars_.end(); ++it)
        {
            // Index by the byte value. Converting a negative plain char
            // straight to size_t would give an index far outside the table.
            map_[static_cast<unsigned char>(*it)] = true;
        }
    }

    /**
     * @return true if `c` is a member of this class.
     */
    bool operator()(char c) const
    {
        return map_[static_cast<unsigned char>(c)];
    }

    /**
     * @return The characters this class was built from, in their original
     *         order.
     */
    const std::string& chars() const
    {
        return chars_;
    }

    // Not default-constructible and not copyable.
    CharacterClass() = delete;
    CharacterClass(const CharacterClass& other) = delete;

private:
    std::string chars_;
    // One membership flag per possible byte value (256 entries).
    std::vector<bool> map_;
};
||||
|
|
||||
|
struct Url |
||||
|
{ |
||||
|
/* Character classes */ |
||||
|
const static CharacterClass GEN_DELIMS; |
||||
|
const static CharacterClass SUB_DELIMS; |
||||
|
const static CharacterClass ALPHA; |
||||
|
const static CharacterClass DIGIT; |
||||
|
const static CharacterClass UNRESERVED; |
||||
|
const static CharacterClass RESERVED; |
||||
|
const static CharacterClass PCHAR; |
||||
|
const static CharacterClass PATH; |
||||
|
const static CharacterClass QUERY; |
||||
|
const static CharacterClass FRAGMENT; |
||||
|
const static CharacterClass USERINFO; |
||||
|
const static CharacterClass HEX; |
||||
|
const static CharacterClass SCHEME; |
||||
|
const static std::vector<signed char> HEX_TO_DEC; |
||||
|
const static std::unordered_map<std::string, int> PORTS; |
||||
|
const static std::unordered_set<std::string> USES_RELATIVE; |
||||
|
const static std::unordered_set<std::string> USES_NETLOC; |
||||
|
const static std::unordered_set<std::string> USES_PARAMS; |
||||
|
const static std::unordered_set<std::string> KNOWN_PROTOCOLS; |
||||
|
|
||||
|
// The type of the predicate used for removing parameters
|
||||
|
typedef std::function<bool(std::string&, std::string&)> deparam_predicate; |
||||
|
|
||||
|
explicit Url(const std::string& url); |
||||
|
|
||||
|
Url(const Url& other) |
||||
|
: scheme_(other.scheme_) |
||||
|
, host_(other.host_) |
||||
|
, port_(other.port_) |
||||
|
, path_(other.path_) |
||||
|
, params_(other.params_) |
||||
|
, query_(other.query_) |
||||
|
, fragment_(other.fragment_) |
||||
|
, userinfo_(other.userinfo_) |
||||
|
, has_params_(other.has_params_) |
||||
|
, has_query_(other.has_query_) { } |
||||
|
|
||||
|
/**
|
||||
|
* Take on the value of the other URL. |
||||
|
*/ |
||||
|
Url& assign(const Url& other); |
||||
|
|
||||
|
/**
|
||||
|
* To be considered equal, all fields must be equal. |
||||
|
*/ |
||||
|
bool operator==(const Url& other) const; |
||||
|
bool operator!=(const Url& other) const; |
||||
|
|
||||
|
/**
|
||||
|
* Two URLs are considered equivalent if they have the same meaning. |
||||
|
*/ |
||||
|
bool equiv(const Url& other); |
||||
|
|
||||
|
/**************************************
|
||||
|
* Component-wise access and setting. * |
||||
|
**************************************/ |
||||
|
const std::string& scheme() const { return scheme_; } |
||||
|
Url& setScheme(const std::string& s) |
||||
|
{ |
||||
|
scheme_ = s; |
||||
|
return *this; |
||||
|
} |
||||
|
|
||||
|
const std::string& host() const { return host_; } |
||||
|
Url& setHost(const std::string& s) |
||||
|
{ |
||||
|
host_ = s; |
||||
|
return *this; |
||||
|
} |
||||
|
|
||||
|
const int port() const { return port_; } |
||||
|
Url& setPort(int i) |
||||
|
{ |
||||
|
port_ = i; |
||||
|
return *this; |
||||
|
} |
||||
|
|
||||
|
const std::string& path() const { return path_; } |
||||
|
Url& setPath(const std::string& s) |
||||
|
{ |
||||
|
path_ = s; |
||||
|
return *this; |
||||
|
} |
||||
|
|
||||
|
const std::string& params() const { return params_; } |
||||
|
Url& setParams(const std::string& s) |
||||
|
{ |
||||
|
params_ = s; |
||||
|
has_params_ = !s.empty(); |
||||
|
return *this; |
||||
|
} |
||||
|
|
||||
|
const std::string& query() const { return query_; } |
||||
|
Url& setQuery(const std::string& s) |
||||
|
{ |
||||
|
query_ = s; |
||||
|
has_query_ = !s.empty(); |
||||
|
return *this; |
||||
|
} |
||||
|
|
||||
|
const std::string& fragment() const { return fragment_; } |
||||
|
Url& setFragment(const std::string& s) |
||||
|
{ |
||||
|
fragment_ = s; |
||||
|
return *this; |
||||
|
} |
||||
|
|
||||
|
const std::string& userinfo() const { return userinfo_; } |
||||
|
Url& setUserinfo(const std::string& s) |
||||
|
{ |
||||
|
userinfo_ = s; |
||||
|
return *this; |
||||
|
} |
||||
|
|
||||
|
/**
|
||||
|
* Get a representation of all components of the path, params, query, fragment. |
||||
|
* |
||||
|
* Always includes a leading /. |
||||
|
*/ |
||||
|
std::string fullpath() const; |
||||
|
|
||||
|
/**
|
||||
|
* Get a new string representation of the URL. |
||||
|
**/ |
||||
|
std::string str() const; |
||||
|
|
||||
|
/*********************
|
||||
|
* Chainable methods * |
||||
|
*********************/ |
||||
|
|
||||
|
/**
|
||||
|
* Strip semantically meaningless excess '?', '&', and ';' characters from query |
||||
|
* and params. |
||||
|
*/ |
||||
|
Url& strip(); |
||||
|
|
||||
|
/**
|
||||
|
* Make the path absolute. |
||||
|
* |
||||
|
* Evaluate '.', '..', and excessive slashes. |
||||
|
*/ |
||||
|
Url& abspath(); |
||||
|
|
||||
|
/**
|
||||
|
* Evaluate this URL relative fo `other`, placing the result in this object. |
||||
|
*/ |
||||
|
Url& relative_to(const std::string& other) |
||||
|
{ |
||||
|
return relative_to(Url(other)); |
||||
|
} |
||||
|
|
||||
|
/**
|
||||
|
* Evaluate this URL relative fo `other`, placing the result in this object. |
||||
|
*/ |
||||
|
Url& relative_to(const Url& other); |
||||
|
|
||||
|
/**
|
||||
|
* Ensure that the path, params, query, and userinfo are properly escaped. |
||||
|
* |
||||
|
* In 'strict' mode, only entities that are both safe and not reserved characters |
||||
|
* are unescaped. In non-strict mode, entities that are safe are unescaped. |
||||
|
*/ |
||||
|
Url& escape(bool strict=false); |
||||
|
|
||||
|
/**
|
||||
|
* Unescape all entities in the path, params, query, and userinfo. |
||||
|
*/ |
||||
|
Url& unescape(); |
||||
|
|
||||
|
/**
|
||||
|
* Remove any params or queries that appear in the blacklist. |
||||
|
* |
||||
|
* The blacklist should contain only lowercased strings, and the comparison is |
||||
|
* done in a case-insensitive way. |
||||
|
*/ |
||||
|
Url& deparam(const std::unordered_set<std::string>& blacklist); |
||||
|
|
||||
|
/**
|
||||
|
* Filter params subject to a predicate for whether it should be filtered. |
||||
|
* |
||||
|
* The predicate must accept two string refs -- the key and value (which may be |
||||
|
* empty). Return `true` if the parameter should be removed, and `false` |
||||
|
* otherwise. |
||||
|
*/ |
||||
|
Url& deparam(const deparam_predicate& predicate); |
||||
|
|
||||
|
/**
|
||||
|
* Put queries and params in sorted order. |
||||
|
* |
||||
|
* To ensure consistent comparisons, escape should be called beforehand. |
||||
|
*/ |
||||
|
Url& sort_query(); |
||||
|
|
||||
|
/**
|
||||
|
* Remove the port if it's the default for the scheme. |
||||
|
*/ |
||||
|
Url& remove_default_port(); |
||||
|
|
||||
|
/**
|
||||
|
* Remove the userinfo portion. |
||||
|
*/ |
||||
|
Url& deuserinfo(); |
||||
|
|
||||
|
/**
|
||||
|
* Remove the fragment. |
||||
|
*/ |
||||
|
Url& defrag(); |
||||
|
|
||||
|
/**
|
||||
|
* Punycode the hostname. |
||||
|
*/ |
||||
|
Url& punycode(); |
||||
|
|
||||
|
/**
|
||||
|
* Unpunycode the hostname. |
||||
|
*/ |
||||
|
Url& unpunycode(); |
||||
|
|
||||
|
/**
|
||||
|
* Reverse the hostname (a.b.c.d => d.c.b.a) |
||||
|
*/ |
||||
|
Url& host_reversed(); |
||||
|
|
||||
|
private: |
||||
|
// Private, unimplemented to prevent use.
|
||||
|
Url(); |
||||
|
|
||||
|
/**
|
||||
|
* Remove repeated, leading, and trailing instances of chr from the string. |
||||
|
*/ |
||||
|
std::string& remove_repeats(std::string& str, const char chr); |
||||
|
|
||||
|
/**
|
||||
|
* Ensure all the provided characters are escaped if necessary |
||||
|
*/ |
||||
|
std::string& escape(std::string& str, const CharacterClass& safe, bool strict); |
||||
|
|
||||
|
/**
|
||||
|
* Unescape entities in the provided string |
||||
|
*/ |
||||
|
std::string& unescape(std::string& str); |
||||
|
|
||||
|
/**
|
||||
|
* Remove any params that match entries in the blacklist. |
||||
|
*/ |
||||
|
std::string& remove_params( |
||||
|
std::string& str, const deparam_predicate& pred, char sep); |
||||
|
|
||||
|
/**
|
||||
|
* Split the provided string by char, sort, join by char. |
||||
|
*/ |
||||
|
std::string& split_sort_join(std::string& str, const char glue); |
||||
|
|
||||
|
/**
|
||||
|
* Check that the hostname is valid, removing an optional trailing '.'. |
||||
|
*/ |
||||
|
void check_hostname(std::string& host); |
||||
|
|
||||
|
std::string scheme_; |
||||
|
std::string host_; |
||||
|
int port_; |
||||
|
std::string path_; |
||||
|
std::string params_; |
||||
|
std::string query_; |
||||
|
std::string fragment_; |
||||
|
std::string userinfo_; |
||||
|
bool has_params_; |
||||
|
bool has_query_; |
||||
|
}; |
||||
|
|
||||
|
} |
||||
|
|
||||
|
#endif |
@ -0,0 +1,150 @@ |
|||||
|
#include <algorithm> |
||||
|
#include <string> |
||||
|
#include <iostream> |
||||
|
|
||||
|
#include "utf8.h" |
||||
|
|
||||
|
namespace Url |
||||
|
{ |
||||
|
|
||||
|
/**
 * Decode one UTF-8 sequence starting at `it` and advance `it` past it.
 *
 * Only the lead byte and the form of the continuation bytes are checked.
 *
 * @param it  Position to read from; advanced past the consumed bytes.
 * @param end End of the input.
 * @return The decoded code point.
 * @throws std::invalid_argument if the input is empty or ends mid-sequence,
 *         the lead byte is not a valid start byte, or a continuation byte is
 *         malformed.
 */
Utf8::codepoint_t Utf8::readCodepoint(
    std::string::const_iterator& it, const std::string::const_iterator& end)
{
    // Dereferencing an end iterator is undefined; reject empty input up front.
    if (it == end)
    {
        throw std::invalid_argument("UTF-8 sequence terminated early.");
    }

    Utf8::char_t current = static_cast<Utf8::char_t>(*it++);
    if (!(current & 0x80))
    {
        // High bit clear: a single-byte sequence.
        return current;
    }

    // Number of additional bytes needed
    unsigned int bytes = 0;
    // The accumulated value
    Utf8::codepoint_t result = 0;
    if (current < 0xC0)
    {
        // 10xxxxxx is a continuation byte, not a start byte
        throw std::invalid_argument("Low UTF-8 start byte");
    }
    else if (current < 0xE0)
    {
        // One additional byte, two bytes total, use 5 bits
        bytes = 1;
        result = current & 0x1F;
    }
    else if (current < 0xF0)
    {
        // Two additional bytes, three bytes total, use 4 bits
        bytes = 2;
        result = current & 0x0F;
    }
    else if (current < 0xF8)
    {
        // Three additional bytes, four bytes total, use 3 bits
        bytes = 3;
        result = current & 0x07;
    }
    else
    {
        throw std::invalid_argument("High UTF-8 start byte");
    }

    for (; bytes > 0; --bytes)
    {
        if (it == end)
        {
            throw std::invalid_argument("UTF-8 sequence terminated early.");
        }

        current = static_cast<unsigned char>(*it++);
        // Ensure the first two bits are 10
        if ((current & 0xC0) != 0x80)
        {
            throw std::invalid_argument("Invalid continuation byte");
        }
        // Each continuation byte contributes its low 6 bits.
        result = (result << 6) | (current & 0x3F);
    }

    return result;
}
||||
|
|
||||
|
std::string& Utf8::writeCodepoint(std::string& str, Utf8::codepoint_t value) |
||||
|
{ |
||||
|
if (value > MAX_CODEPOINT) |
||||
|
{ |
||||
|
throw std::invalid_argument("Code point too high."); |
||||
|
} |
||||
|
else if (value <= 0x007F) |
||||
|
{ |
||||
|
// Just append the character itself
|
||||
|
str.append(1, static_cast<char>(value)); |
||||
|
return str; |
||||
|
} |
||||
|
|
||||
|
unsigned int bytes = 0; |
||||
|
if (value > 0xFFFF) |
||||
|
{ |
||||
|
/**
|
||||
|
* 11110xxx + 3 bytes for 21 bits total |
||||
|
* |
||||
|
* We need to take bits 20-18, which 0x1C0000 masks out. These form the least |
||||
|
* significant bits of this byte (so we shift them back down by 18). The 5 |
||||
|
* most significant bits of this byte are 11110, so we OR this result with |
||||
|
* 0xF0 to get this first byte. |
||||
|
* |
||||
|
* The remaining bits will be consumed from the most-significant end and so |
||||
|
* they must be shifted up by (32 - 18) = 14. |
||||
|
*/ |
||||
|
str.append(1, static_cast<char>(((value & 0x1C0000) >> 18) | 0xF0)); |
||||
|
bytes = 3; |
||||
|
value <<= 14; |
||||
|
} |
||||
|
else if (value > 0x07FF) |
||||
|
{ |
||||
|
/**
|
||||
|
* 1110xxxx + 2 bytes for 16 bits total |
||||
|
* |
||||
|
* We need to take bits 15-12, which 0xF000 masks out. These form the least |
||||
|
* significant bits of this byte (so we shift them back down by 12). The 4 |
||||
|
* most significant bits of this byte are 1110, so we OR this result with |
||||
|
* 0xE0 to get this first byte. |
||||
|
* |
||||
|
* The remaining bits will be consumed from the most-significant end and so |
||||
|
* they must be shifted up by (32 - 12) = 20. |
||||
|
*/ |
||||
|
str.append(1, static_cast<char>(((value & 0xF000) >> 12) | 0xE0)); |
||||
|
bytes = 2; |
||||
|
value <<= 20; |
||||
|
} |
||||
|
else |
||||
|
{ |
||||
|
/**
|
||||
|
* 110xxxxx + 1 byte for 11 bits total |
||||
|
* |
||||
|
* We need to take bits 10-6, which 0x7C0 masks out. These form the least |
||||
|
* significant bits of this byte (so we shift them back down by 6). The 3 |
||||
|
* most significant bits of this byte are 110, so we OR this result with |
||||
|
* 0xC0 to get this first byte. |
||||
|
* |
||||
|
* The remaining bits will be consumed from the most-significant end and so |
||||
|
* they must be shifted up by (32 - 6) = 26. |
||||
|
*/ |
||||
|
str.append(1, static_cast<char>(((value & 0x7C0) >> 6) | 0xC0)); |
||||
|
bytes = 1; |
||||
|
value <<= 26; |
||||
|
} |
||||
|
|
||||
|
/**
|
||||
|
* The remaining bits are to be consumed 6 at a time from the most-significant |
||||
|
* end. The mask 0xFC000000 grabs these six bits, which then must be shifted down |
||||
|
* by 26, and OR'd with 0x80 to produce the continuation byte. |
||||
|
*/ |
||||
|
for (; bytes > 0; --bytes, value <<= 6) |
||||
|
{ |
||||
|
str.append(1, static_cast<char>(((value & 0xFC000000) >> 26) | 0x80)); |
||||
|
} |
||||
|
|
||||
|
return str; |
||||
|
} |
||||
|
|
||||
|
}; |
@ -0,0 +1,91 @@ |
|||||
|
#ifndef UTF8_CPP_H |
||||
|
#define UTF8_CPP_H |
||||
|
|
||||
|
#include <cstdint>
#include <stdexcept>
#include <string>
#include <vector>
||||
|
|
||||
|
namespace Url |
||||
|
{ |
||||
|
|
||||
|
/**
 * Conversions between Unicode codepoints and UTF-8 encoded strings.
 */
struct Utf8
{
    /**
     * Integral type that holds one Unicode codepoint.
     */
    using codepoint_t = uint32_t;

    /**
     * Integral type used for the numeric value of a single byte.
     */
    using char_t = unsigned char;

    /**
     * Largest codepoint that writeCodepoint will encode.
     */
    static const codepoint_t MAX_CODEPOINT = 0x10FFFF;

    /**
     * Decode one sequence starting at `it`, advancing `it` past the bytes
     * consumed, and return the resulting codepoint.
     */
    static codepoint_t readCodepoint(
        std::string::const_iterator& it, const std::string::const_iterator& end);

    /**
     * Append the encoding of `value` to `str` and return `str`.
     */
    static std::string& writeCodepoint(std::string& str, codepoint_t value);

    /**
     * Decode only the leading codepoint of `str`.
     */
    static codepoint_t toCodepoint(const std::string& str)
    {
        std::string::const_iterator cursor = str.begin();
        return readCodepoint(cursor, str.end());
    }

    /**
     * Build a string holding the encoding of a single codepoint.
     */
    static std::string fromCodepoint(codepoint_t value)
    {
        std::string encoded;
        writeCodepoint(encoded, value);
        return encoded;
    }

    /**
     * Decode every codepoint in `str`, in order.
     */
    static std::vector<codepoint_t> toCodepoints(const std::string& str)
    {
        std::vector<codepoint_t> decoded;
        const std::string::const_iterator stop = str.end();
        std::string::const_iterator cursor = str.begin();
        // readCodepoint advances the cursor, so no increment is needed here
        while (cursor != stop)
        {
            decoded.push_back(readCodepoint(cursor, stop));
        }
        return decoded;
    }

    /**
     * Encode a sequence of codepoints into a single string.
     */
    static std::string fromCodepoints(const std::vector<codepoint_t>& points)
    {
        std::string encoded;
        for (const codepoint_t point : points)
        {
            writeCodepoint(encoded, point);
        }
        return encoded;
    }

};
||||
|
|
||||
|
} |
||||
|
|
||||
|
#endif |
@ -0,0 +1,3 @@ |
|||||
|
# Test runner script: attaches testthat and robotstxt, then asks testthat to
# run the test suite of the package named "rep".
library(testthat) |
||||
|
library(robotstxt) |
||||
|
test_check("rep") |
@ -0,0 +1,11 @@ |
|||||
|
context("basic functionality")

# Parses a live robots.txt file and checks that one path is fetchable and one
# is not for the wildcard user agent.
test_that("we can do something", {

  # The robots.txt file is downloaded from https://cdc.gov, so this test needs
  # network access; do not run it on CRAN's check machines.
  skip_on_cran()

  rt <- robxp(robotstxt::get_robotstxt("https://cdc.gov"))

  expect_s3_class(rt, "robxp")

  expect_true(can_fetch(rt, "/asthma/asthma_stats/default.htm", "*"))
  expect_false(can_fetch(rt, "/_borders", "*"))

})
Loading…
Reference in new issue