Browse Source

initial commit

master
boB Rudis 7 years ago
commit
7404834ac2
No known key found for this signature in database GPG Key ID: 1D7529BE14E2BBA9
  1. 10
      .Rbuildignore
  2. 1
      .codecov.yml
  3. 8
      .gitignore
  4. 31
      .travis.yml
  5. 32
      DESCRIPTION
  6. 9
      NAMESPACE
  7. 2
      NEWS.md
  8. 45
      R/RcppExports.R
  9. 51
      R/sectxt.r
  10. 20
      R/securitytxt-package.R
  11. 19
      R/validate.r
  12. 70
      README.Rmd
  13. 114
      README.md
  14. 4
      inst/extdata/security.txt
  15. 28
      man/sectxt.Rd
  16. 17
      man/sectxt_info.Rd
  17. 22
      man/sectxt_url.Rd
  18. 17
      man/sectxt_validate.Rd
  19. 24
      man/securitytxt.Rd
  20. 21
      securitytxt.Rproj
  21. 3
      src/.gitignore
  22. 3
      src/Makevars
  23. 76
      src/RcppExports.cpp
  24. 183
      src/psl.cpp
  25. 102
      src/psl.h
  26. 409
      src/punycode.cpp
  27. 105
      src/punycode.h
  28. 89
      src/security.cpp
  29. 37
      src/security.h
  30. 66
      src/securitymain.cpp
  31. 962
      src/url.cpp
  32. 323
      src/url.h
  33. 150
      src/utf8.cpp
  34. 91
      src/utf8.h
  35. 2
      tests/test-all.R
  36. 6
      tests/testthat/test-securitytxt.R

10
.Rbuildignore

@ -0,0 +1,10 @@
^.*\.Rproj$
^\.Rproj\.user$
^\.travis\.yml$
^README\.*Rmd$
^README\.*html$
^NOTES\.*Rmd$
^NOTES\.*html$
^\.codecov\.yml$
^README_files$
^doc$

1
.codecov.yml

@ -0,0 +1 @@
comment: false

8
.gitignore

@ -0,0 +1,8 @@
.DS_Store
.Rproj.user
.Rhistory
.RData
.Rproj
src/*.o
src/*.so
src/*.dll

31
.travis.yml

@ -0,0 +1,31 @@
language: r
warnings_are_errors: true
sudo: required
cache: packages
r:
- oldrel
- release
- devel
apt_packages:
- libv8-dev
- xclip
env:
global:
- CRAN: http://cran.rstudio.com
after_success:
- Rscript -e 'covr::codecov()'
notifications:
email:
- bob@rud.is
irc:
channels:
- "104.236.112.222#builds"
nick: travisci

32
DESCRIPTION

@ -0,0 +1,32 @@
Package: securitytxt
Type: Package
Title: Identify and Parse Web Security Policies Files
Version: 0.1.0
Date: 2017-10-09
Authors@R: c(
person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"),
comment = c(ORCID = "0000-0001-5670-2640"))
)
Author: Bob Rudis (bob@rud.is)
Maintainer: Bob Rudis <bob@rud.is>
Description: When security risks in web services are discovered by independent
security researchers who understand the severity of the risk, they
often lack the channels to properly disclose them. As a result,
security issues may be left unreported. The 'security.txt' 'Web Security Policies'
specification defines an 'IETF' standard to help organizations define the process
for security researchers to securely disclose security vulnerabilities. Tools are
provided to help identify and parse 'security.txt' files to enable analysis of
the usage of these policies.
URL: https://github.com/hrbrmstr/securitytxt
BugReports: https://github.com/hrbrmstr/securitytxt/issues
License: AGPL
Suggests:
testthat,
covr
Depends:
R (>= 3.2.0)
Imports:
purrr,
Rcpp
RoxygenNote: 6.0.1
LinkingTo: Rcpp

9
NAMESPACE

@ -0,0 +1,9 @@
# Generated by roxygen2: do not edit by hand
S3method(print,sectxt)
export(sectxt)
export(sectxt_info)
export(sectxt_url)
export(sectxt_validate)
importFrom(Rcpp,sourceCpp)
useDynLib(securitytxt)

2
NEWS.md

@ -0,0 +1,2 @@
0.1.0
* Initial release

45
R/RcppExports.R

@ -0,0 +1,45 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
#' Parse security.txt
#'
#' Internal wrapper that hands `content` to the compiled
#' `_securitytxt_sectxt_parse` routine and returns its result unchanged.
#'
#' @noRd
#'
sectxt_parse <- function(content) {
  .Call("_securitytxt_sectxt_parse", content, PACKAGE = "securitytxt")
}
# Internal wrapper: passes `x` to the compiled `_securitytxt_sectxt_raw`
# routine and returns its result unchanged.
sectxt_raw <- function(x) {
  .Call("_securitytxt_sectxt_raw", x, PACKAGE = "securitytxt")
}
# Internal wrapper: passes `x` to the compiled `_securitytxt_sectxt_keys`
# routine and returns its result unchanged.
sectxt_keys <- function(x) {
  .Call("_securitytxt_sectxt_keys", x, PACKAGE = "securitytxt")
}
#' Retrieve a data frame of security.txt keys/values
#'
#' @md
#' @param x a parsed `security.txt` object created with [sectxt()]
#' @return data frame
#' @export
sectxt_info <- function(x) {
  .Call("_securitytxt_sectxt_info", x, PACKAGE = "securitytxt")
}
#' Determine security.txt URL for a given site/URL
#'
#' Given the URL of any resource, return the URL at which that site's
#' `security.txt` file is expected. Extraneous URL components are
#' stripped and `.well-known/security.txt` is appended.
#'
#' @md
#' @param url URL
#' @return character vector
#' @export
#' @examples
#' sectxt_url("https://securitytxt.org/this/that/the/other.html")
sectxt_url <- function(url) {
  .Call("_securitytxt_sectxt_url", url, PACKAGE = "securitytxt")
}

51
R/sectxt.r

@ -0,0 +1,51 @@
#' Parse a `security.txt` Web Security Policies file & create a `sectxt` object
#'
#' Parses the text of a `security.txt` file into a `sectxt` object.
#'
#' @md
#' @param x either a length-one character vector containing a complete `security.txt` file
#'        _or_ a character vector of any other length (e.g. the result of [readLines()])
#'        whose elements will be joined with newlines into a single string _or_
#'        a `connection` object that will be passed to [readLines()], the result of which
#'        will be joined into a single string and parsed. The connection is always
#'        closed, whether or not it could be read.
#' @return a `sectxt` object, or `NULL` (with a warning) if `x` is a connection
#'         that could not be read.
#' @references [IETF Draft](https://tools.ietf.org/html/draft-foudil-securitytxt-00); [Information hub](https://securitytxt.org/)
#'             [GitHub Organization](https://github.com/securitytxt)
#' @export
#' @examples
#' sectxt(readLines(system.file("extdata", "security.txt", package="securitytxt")))
#' \dontrun{
#' sectxt(url(sectxt_url("https://securitytxt.org")))
#' }
sectxt <- function(x) {

  if (inherits(x, "connection")) {
    # Keep the connection under its own name: `x` is re-bound to the text
    # below, and the exit handler must still close the connection itself.
    con <- x
    on.exit(close(con), add = TRUE)

    lines <- suppressWarnings(try(readLines(con, warn = FALSE), silent = TRUE))
    if (inherits(lines, "try-error")) {
      warning("security.txt not found")
      return(NULL)
    }
    x <- lines
  }

  # The parser takes exactly one string, so collapse anything that is not
  # already length one (including an empty read, which becomes "").
  if (is.character(x) && length(x) != 1L) {
    x <- paste0(x, collapse = "\n")
  }

  sec_txt <- sectxt_parse(x)
  class(sec_txt) <- c("sectxt")
  sec_txt

}
#' Custom printer for `sectxt` objects
#'
#' Writes a fixed header line followed by the raw `security.txt` text
#' obtained from `sectxt_raw()`, then returns the object invisibly.
#'
#' @md
#' @noRd
#' @keywords internal
#' @param x object to print
#' @param ... unused
#' @export
print.sectxt <- function(x, ...) {
  header <- "<Web Security Policies Object>\n"
  cat(header)
  raw_text <- sectxt_raw(x)
  cat(raw_text)
  invisible(x)
}

20
R/securitytxt-package.R

@ -0,0 +1,20 @@
#' Identify and Parse Web Security Policies Files
#'
#' When security risks in web services are discovered by independent
#' security researchers who understand the severity of the risk, they
#' often lack the channels to properly disclose them. As a result,
#' security issues may be left unreported. The 'security.txt' 'Web Security Policies'
#' specification defines an 'IETF' standard to help organizations define the process
#' for security researchers to securely disclose security vulnerabilities. Tools are
#' provided to help identify and parse 'security.txt' files to enable analysis of
#' the usage of these policies.
#'
#' @md
#' @name securitytxt
#' @references [IETF Draft](https://tools.ietf.org/html/draft-foudil-securitytxt-00); [Information hub](https://securitytxt.org/)
#' [GitHub Organization](https://github.com/securitytxt)
#' @docType package
#' @author Bob Rudis (bob@@rud.is)
#' @useDynLib securitytxt
#' @importFrom Rcpp sourceCpp
NULL

19
R/validate.r

@ -0,0 +1,19 @@
# Field names accepted by `sectxt_validate()`. They are compared verbatim
# against the keys returned by `sectxt_keys()`.
securitytxt_ietf_fields <- c("contact", "encryption", "acknowledgement", "disclosure")

#' Validate a `security.txt` Web Security Policies file
#'
#' Checks every key of a parsed `security.txt` against the field names of the
#' current IETF draft. Unknown keys are reported in a single message.
#'
#' @md
#' @param x an object created with [sectxt()]
#' @return logical; `TRUE` if all fields match current IETF standard, `FALSE` if not
#' @export
sectxt_validate <- function(x) {

  found_keys <- sectxt_keys(x)
  unknown_keys <- found_keys[!(found_keys %in% securitytxt_ietf_fields)]

  if (length(unknown_keys) == 0L) {
    return(TRUE)
  }

  # `collapse` (not `sep`) joins the offending keys into one string so that
  # exactly one message is emitted regardless of how many keys are unknown.
  message(sprintf(
    "The following keys were found that are not in the current IETF standard: %s",
    paste(unknown_keys, collapse = ", ")
  ))

  FALSE

}

70
README.Rmd

@ -0,0 +1,70 @@
---
output: rmarkdown::github_document
---
# securitytxt
Identify and Parse Web Security Policies Files
## Description
When security risks in web services are discovered by independent
security researchers who understand the severity of the risk, they
often lack the channels to properly disclose them. As a result,
security issues may be left unreported. The 'security.txt' 'Web Security Policies'
specification defines an 'IETF' standard to help organizations define the process
for security researchers to securely disclose security vulnerabilities. Tools are
provided to help identify and parse 'security.txt' files to enable analysis of
the usage of these policies.
- [IETF Draft](https://tools.ietf.org/html/draft-foudil-securitytxt-00)
- [Information hub](https://securitytxt.org/)
- [GitHub Organization](https://github.com/securitytxt)
## What's Inside The Tin
The following functions are implemented:
- `sectxt`: Parse a 'security.txt' Web Security Policies file & create a 'sectxt' object
- `sectxt_info`: Retrieve a data frame of `security.txt` keys/values
- `sectxt_validate`: Validate a 'security.txt' Web Security Policies file
- `sectxt_url`: Determine `security.txt` URL for a given site/URL
## Installation
```{r eval=FALSE}
devtools::install_github("hrbrmstr/securitytxt")
```
```{r message=FALSE, warning=FALSE, error=FALSE, include=FALSE}
options(width=120)
```
## Usage
```{r message=FALSE, warning=FALSE, error=FALSE}
library(securitytxt)
# current version
packageVersion("securitytxt")
# built-in example
x <- sectxt(readLines(system.file("extdata", "security.txt", package="securitytxt")))
sectxt_info(x)
# "live" example
(xurl <- sectxt_url("https://securitytxt.org"))
x <- sectxt(url(xurl))
sectxt_info(x)
sectxt_validate(x)
x
# another "live" example
(xurl <- sectxt_url("https://rud.is/b"))
x <- sectxt(url(xurl))
sectxt_info(x)
sectxt_validate(x)
x
```

114
README.md

@ -0,0 +1,114 @@
securitytxt
===========
Identify and Parse Web Security Policies Files
Description
-----------
When security risks in web services are discovered by independent security researchers who understand the severity of the risk, they often lack the channels to properly disclose them. As a result, security issues may be left unreported. The 'security.txt' 'Web Security Policies' specification defines an 'IETF' standard to help organizations define the process for security researchers to securely disclose security vulnerabilities. Tools are provided to help identify and parse 'security.txt' files to enable analysis of the usage of these policies.
- [IETF Draft](https://tools.ietf.org/html/draft-foudil-securitytxt-00)
- [Information hub](https://securitytxt.org/)
- [GitHub Organization](https://github.com/securitytxt)
What's Inside The Tin
---------------------
The following functions are implemented:
- `sectxt`: Parse a 'security.txt' Web Security Policies file & create a 'sectxt' object
- `sectxt_info`: Retrieve a data frame of `security.txt` keys/values
- `sectxt_validate`: Validate a 'security.txt' Web Security Policies file
- `sectxt_url`: Determine `security.txt` URL for a given site/URL
Installation
------------
``` r
devtools::install_github("hrbrmstr/securitytxt")
```
Usage
-----
``` r
library(securitytxt)
# current version
packageVersion("securitytxt")
```
## [1] '0.1.0'
``` r
# built-in example
x <- sectxt(readLines(system.file("extdata", "security.txt", package="securitytxt")))
sectxt_info(x)
```
## key value
## 1 contact security@example.com
## 2 encryption https://example.com/pgp-key.txt
``` r
# "live" example
(xurl <- sectxt_url("https://securitytxt.org"))
```
## [1] "https://securitytxt.org/.well-known/security.txt"
``` r
x <- sectxt(url(xurl))
sectxt_info(x)
```
## key value
## 1 contact https://twitter.com/EdOverflow
``` r
sectxt_validate(x)
```
## [1] TRUE
``` r
x
```
## <Web Security Policies Object>
## # Our security address
## Contact: https://twitter.com/EdOverflow
``` r
# another "live" example
(xurl <- sectxt_url("https://rud.is/b"))
```
## [1] "https://rud.is/.well-known/security.txt"
``` r
x <- sectxt(url(xurl))
sectxt_info(x)
```
## key value
## 1 contact bob@rud.is
## 2 encryption https://keybase.io/hrbrmstr/pgp_keys.asc?fingerprint=e5388172b81c210906f5e5605879179645de9399
## 3 disclosure Full
``` r
sectxt_validate(x)
```
## [1] TRUE
``` r
x
```
## <Web Security Policies Object>
## Contact: bob@rud.is
## Encryption: https://keybase.io/hrbrmstr/pgp_keys.asc?fingerprint=e5388172b81c210906f5e5605879179645de9399
## Disclosure: Full

4
inst/extdata/security.txt

@ -0,0 +1,4 @@
# Our security address
Contact: security@example.com
Encryption: https://example.com/pgp-key.txt

28
man/sectxt.Rd

@ -0,0 +1,28 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/sectxt.r
\name{sectxt}
\alias{sectxt}
\title{Parse a \code{security.txt} Web Security Policies file & create a \code{sectxt} object}
\usage{
sectxt(x)
}
\arguments{
\item{x}{either an atomic character vector containing a complete \code{security.txt} file
\emph{or} a length >1 character vector that will be concatenated into a single string \emph{or}
a \code{connection} object that will be passed to \code{\link[=readLines]{readLines()}}, the result of which
will be concatenated into a single string and parsed and the connection will be closed.}
}
\description{
This function takes in a single element character vector and parses it into
a \code{sectxt} object.
}
\examples{
sectxt(readLines(system.file("extdata", "security.txt", package="securitytxt")))
\dontrun{
sectxt(url(sectxt_url("https://securitytxt.org")))
}
}
\references{
\href{https://tools.ietf.org/html/draft-foudil-securitytxt-00}{IETF Draft}; \href{https://securitytxt.org/}{Information hub}
\href{https://github.com/securitytxt}{GitHub Organization}
}

17
man/sectxt_info.Rd

@ -0,0 +1,17 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/RcppExports.R
\name{sectxt_info}
\alias{sectxt_info}
\title{Retrieve a data frame of security.txt keys/values}
\usage{
sectxt_info(x)
}
\arguments{
\item{x}{a parsed \code{security.txt} created with \code{\link[=sec_parse]{sec_parse()}}}
}
\value{
data frame
}
\description{
Retrieve a data frame of security.txt keys/values
}

22
man/sectxt_url.Rd

@ -0,0 +1,22 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/RcppExports.R
\name{sectxt_url}
\alias{sectxt_url}
\title{Determine security.txt URL for a given site/URL}
\usage{
sectxt_url(url)
}
\arguments{
\item{url}{URL}
}
\value{
character vector
}
\description{
Provide any URL for a resource and retrieve the URL for
the \code{security.txt} file. Strips off extraneous URL
components and appends \code{.well-known/security.txt}.
}
\examples{
sectxt_url("https://securitytxt.org/this/that/the/other.html")
}

17
man/sectxt_validate.Rd

@ -0,0 +1,17 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/validate.r
\name{sectxt_validate}
\alias{sectxt_validate}
\title{Validate a \code{security.txt} Web Security Policies file}
\usage{
sectxt_validate(x)
}
\arguments{
\item{x}{an object created with \code{\link[=sectxt]{sectxt()}}}
}
\value{
logical; \code{TRUE} if all fields match current IETF standard, \code{FALSE} if not
}
\description{
Validate a \code{security.txt} Web Security Policies file
}

24
man/securitytxt.Rd

@ -0,0 +1,24 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/securitytxt-package.R
\docType{package}
\name{securitytxt}
\alias{securitytxt}
\alias{securitytxt-package}
\title{Identify and Parse Web Security Policies Files}
\description{
When security risks in web services are discovered by independent
security researchers who understand the severity of the risk, they
often lack the channels to properly disclose them. As a result,
security issues may be left unreported. The 'security.txt' 'Web Security Policies'
specification defines a 'IETF' standard to help organizations define the process
for security researchers to securely disclose security vulnerabilities. Tools are
provided to help identify and parse 'security.txt' files to enable analysis of
the usage of these policies.
}
\references{
\href{https://tools.ietf.org/html/draft-foudil-securitytxt-00}{IETF Draft}; \href{https://securitytxt.org/}{Information hub}
\href{https://github.com/securitytxt}{GitHub Organization}
}
\author{
Bob Rudis (bob@rud.is)
}

21
securitytxt.Rproj

@ -0,0 +1,21 @@
Version: 1.0
RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX
StripTrailingWhitespace: Yes
BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageBuildArgs: --resave-data
PackageRoxygenize: rd,collate,namespace

3
src/.gitignore

@ -0,0 +1,3 @@
*.o
*.so
*.dll

3
src/Makevars

@ -0,0 +1,3 @@
CXX_STD = CXX11
PKG_CXXFLAGS =
PKG_LIBS = -L.

76
src/RcppExports.cpp

@ -0,0 +1,76 @@
// Generated by using Rcpp::compileAttributes() -> do not edit by hand
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
#include <Rcpp.h>
using namespace Rcpp;
// sectxt_parse
// Forward declaration; the function itself is defined outside this file.
SEXP sectxt_parse(std::string content);
// .Call entry point for sectxt_parse(): converts contentSEXP to a std::string,
// invokes sectxt_parse() and returns the wrapped result.
RcppExport SEXP _securitytxt_sectxt_parse(SEXP contentSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< std::string >::type content(contentSEXP);
rcpp_result_gen = Rcpp::wrap(sectxt_parse(content));
return rcpp_result_gen;
END_RCPP
}
// sectxt_raw
// Forward declaration; the function itself is defined outside this file.
std::string sectxt_raw(SEXP x);
// .Call entry point for sectxt_raw(): passes xSEXP through as a SEXP,
// invokes sectxt_raw() and returns the wrapped std::string result.
RcppExport SEXP _securitytxt_sectxt_raw(SEXP xSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< SEXP >::type x(xSEXP);
rcpp_result_gen = Rcpp::wrap(sectxt_raw(x));
return rcpp_result_gen;
END_RCPP
}
// sectxt_keys
// Forward declaration; the function itself is defined outside this file.
std::vector< std::string > sectxt_keys(SEXP x);
// .Call entry point for sectxt_keys(): passes xSEXP through as a SEXP,
// invokes sectxt_keys() and returns the wrapped vector of strings.
RcppExport SEXP _securitytxt_sectxt_keys(SEXP xSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< SEXP >::type x(xSEXP);
rcpp_result_gen = Rcpp::wrap(sectxt_keys(x));
return rcpp_result_gen;
END_RCPP
}
// sectxt_info
// Forward declaration; the function itself is defined outside this file.
DataFrame sectxt_info(SEXP x);
// .Call entry point for sectxt_info(): passes xSEXP through as a SEXP,
// invokes sectxt_info() and returns the wrapped DataFrame.
RcppExport SEXP _securitytxt_sectxt_info(SEXP xSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< SEXP >::type x(xSEXP);
rcpp_result_gen = Rcpp::wrap(sectxt_info(x));
return rcpp_result_gen;
END_RCPP
}
// sectxt_url
// Forward declaration; the function itself is defined outside this file.
std::string sectxt_url(std::string url);
// .Call entry point for sectxt_url(): converts urlSEXP to a std::string,
// invokes sectxt_url() and returns the wrapped std::string result.
RcppExport SEXP _securitytxt_sectxt_url(SEXP urlSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< std::string >::type url(urlSEXP);
rcpp_result_gen = Rcpp::wrap(sectxt_url(url));
return rcpp_result_gen;
END_RCPP
}
// Table of the .Call routines exported by this library: symbol name, function
// pointer and argument count (each wrapper above takes one argument). The
// all-NULL row terminates the table.
static const R_CallMethodDef CallEntries[] = {
{"_securitytxt_sectxt_parse", (DL_FUNC) &_securitytxt_sectxt_parse, 1},
{"_securitytxt_sectxt_raw", (DL_FUNC) &_securitytxt_sectxt_raw, 1},
{"_securitytxt_sectxt_keys", (DL_FUNC) &_securitytxt_sectxt_keys, 1},
{"_securitytxt_sectxt_info", (DL_FUNC) &_securitytxt_sectxt_info, 1},
{"_securitytxt_sectxt_url", (DL_FUNC) &_securitytxt_sectxt_url, 1},
{NULL, NULL, 0}
};
// Library initialisation hook: registers CallEntries as the .Call routines of
// this DLL and then turns dynamic symbol lookup off for it.
RcppExport void R_init_securitytxt(DllInfo *dll) {
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
R_useDynamicSymbols(dll, FALSE);
}

183
src/psl.cpp

@ -0,0 +1,183 @@
#include <algorithm>
#include <cctype>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>

#include "psl.h"
#include "punycode.h"
namespace Url
{
// Sentinel returned by getLastSegments() when the requested number of
// segments is not available in the hostname.
const std::string PSL::not_found = "";

/**
 * Build the rule table from a public-suffix-list stream.
 *
 * Every line is cut at its first whitespace character. Blank lines and lines
 * starting with "//" are ignored. A line starting with '*' is a wildcard rule
 * and must have the form "*.<host>"; a line starting with '!' is an exception
 * rule; anything else is a plain rule.
 *
 * Throws std::invalid_argument for a malformed wildcard rule or an exception
 * rule with no hostname.
 */
PSL::PSL(std::istream& stream)
{
    std::string line;
    while (std::getline(stream, line))
    {
        // Only take up to the first whitespace. Each byte is widened through
        // unsigned char first: rules may contain raw UTF-8, and handing a
        // negative char value to std::isspace is undefined behaviour.
        auto it = std::find_if(line.begin(), line.end(), [](char c) {
            return std::isspace(static_cast<unsigned char>(c)) != 0;
        });
        line.resize(it - line.begin());

        // Skip blank lines and comments
        if (line.empty() || line.compare(0, 2, "//") == 0)
        {
            continue;
        }

        // We know the line has at least a single character at this point
        if (line[0] == '*')
        {
            // Line is a wildcard rule
            if (line.size() <= 2 || line[1] != '.')
            {
                throw std::invalid_argument("Wildcard rule must be of form *.<host>");
            }

            // Drop the leading "*." and count one extra level for the wildcard
            add(line, 1, 2);
        }
        else if (line[0] == '!')
        {
            // Line is an exception, take all but the !
            if (line.size() <= 1)
            {
                throw std::invalid_argument("Exception rule has no hostname.");
            }

            // Drop the leading "!" and count one level fewer
            add(line, -1, 1);
        }
        else
        {
            add(line, 0, 0);
        }
    }
}
/**
 * Open `path` and build a PSL from its contents.
 *
 * Throws std::invalid_argument when the file stream is not in a good state
 * after being opened.
 */
PSL PSL::fromPath(const std::string& path)
{
    std::ifstream input(path);
    if (input.good())
    {
        return PSL(input);
    }

    throw std::invalid_argument("Path '" + path + "' inaccessible.");
}
/**
 * Build a PSL from rules held in memory: `str` is wrapped in a string stream
 * and handed to the stream constructor.
 */
PSL PSL::fromString(const std::string& str)
{
    std::stringstream rules(str);
    PSL parsed(rules);
    return parsed;
}
std::string PSL::getTLD(const std::string& hostname) const
{
return getLastSegments(hostname, getTLDLength(hostname));
}
/**
 * Return the PLD of `hostname`: the TLD plus the one segment in front of it.
 */
std::string PSL::getPLD(const std::string& hostname) const
{
    const size_t pldSegments = getTLDLength(hostname) + 1;
    return getLastSegments(hostname, pldSegments);
}
std::pair<std::string, std::string> PSL::getBoth(const std::string& hostname) const
{
size_t length = getTLDLength(hostname);
return std::make_pair(
getLastSegments(hostname, length),
getLastSegments(hostname, length + 1));
}
/**
 * Return the number of segments that make up the TLD of `hostname`.
 *
 * The hostname is reversed and lower-cased, then looked up in `levels`.
 * On a miss, everything from the last '.' of the reversed string onwards is
 * dropped (i.e. the left-most label of the original hostname) and the lookup
 * is retried. If nothing matches, the TLD is taken to be one segment long.
 */
size_t PSL::getTLDLength(const std::string& hostname) const
{
    // Reversed copy of hostname
    std::string tld(hostname.rbegin(), hostname.rend());

    // Lower-case byte by byte. The cast through unsigned char matters:
    // std::tolower has undefined behaviour for negative char values, which
    // is what UTF-8 bytes are when char is signed.
    std::transform(tld.begin(), tld.end(), tld.begin(), [](char c) {
        return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
    });

    while (tld.size())
    {
        auto it = levels.find(tld);
        if (it != levels.end())
        {
            return it->second;
        }

        size_t position = tld.rfind('.');
        if (position == std::string::npos || position == 0)
        {
            // No further label to strip: give up on the lookup
            tld.resize(0);
        }
        else
        {
            tld.resize(position);
        }
    }

    return 1;
}
/**
 * Return the last `segments` dot-separated segments of `hostname`,
 * lower-cased.
 *
 * Returns `not_found` when the hostname has fewer segments than requested.
 * Throws std::invalid_argument when the result starts with '.', which means
 * the hostname contained an empty segment.
 */
std::string PSL::getLastSegments(const std::string& hostname, size_t segments) const
{
    size_t position = hostname.size();
    size_t remaining = segments;

    // Walk backwards over one '.' per requested segment
    while (remaining != 0 && position && position != std::string::npos)
    {
        position = hostname.rfind('.', position - 1);
        remaining -= 1;
    }

    if (remaining >= 1)
    {
        return not_found;
    }

    // Return the whole string if position == std:string::npos
    size_t start = (position == std::string::npos) ? 0 : position + 1;
    std::string result(hostname, start);

    // Lower-case byte by byte. The cast through unsigned char matters:
    // std::tolower has undefined behaviour for negative char values, which
    // is what UTF-8 bytes are when char is signed.
    std::transform(result.begin(), result.end(), result.begin(), [](char c) {
        return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
    });

    // Leading .'s indicate that the query had an empty segment
    if (result.size() && result[0] == '.')
    {
        std::stringstream message;
        message << "Empty segment in " << result;
        throw std::invalid_argument(message.str());
    }

    return result;
}
/**
 * Return the number of dot-separated segments in `hostname`: one more than
 * the number of '.' characters it contains.
 */
size_t PSL::countSegments(const std::string& hostname) const
{
    const auto dots = std::count(hostname.begin(), hostname.end(), '.');
    return static_cast<size_t>(dots) + 1;
}
void PSL::add(std::string& rule, int level_adjust, size_t trim)
{
// First unpunycoded
std::string copy(rule.rbegin(), rule.rend() - trim);
size_t length = countSegments(copy) + level_adjust;
levels[copy] = length;
// And now punycoded
rule = Punycode::encodeHostname(rule);
copy.assign(rule.rbegin(), rule.rend() - trim);
levels[copy] = length;
}
};

102
src/psl.h

@ -0,0 +1,102 @@
#ifndef PSL_CPP_H
#define PSL_CPP_H
#include <istream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
namespace Url
{

    /**
     * Looks up the TLD and PLD of a hostname against a set of PSL rules.
     */
    struct PSL
    {
        /**
         * Value that indicates there is no TLD / PLD.
         */
        static const std::string not_found;

        /**
         * Build the rule set by reading PSL rules from an istream.
         */
        PSL(std::istream& stream);

        /**
         * Build an empty rule set.
         */
        PSL() : levels() { }

        /**
         * Copying duplicates the rule table.
         */
        PSL(const PSL& other) = default;
        PSL& operator=(const PSL& other) = default;

        /**
         * Build the rule set from the PSL rules stored in the file at `path`.
         */
        static PSL fromPath(const std::string& path);

        /**
         * Build the rule set from PSL rules held in a string.
         */
        static PSL fromString(const std::string& str);

        /**
         * Get just the TLD of the hostname.
         *
         * The hostname must be _either_ fully punycoded or fully unpunycoded.
         * With a mix of punycoded and unpunycoded segments the answer may be
         * wrong. The response uses the same form as the input: punycoded in,
         * punycoded out; unpunycoded in, unpunycoded out.
         */
        std::string getTLD(const std::string& hostname) const;

        /**
         * Get just the PLD of the hostname.
         *
         * The hostname must be _either_ fully punycoded or fully unpunycoded.
         * With a mix of punycoded and unpunycoded segments the answer may be
         * wrong. The response uses the same form as the input: punycoded in,
         * punycoded out; unpunycoded in, unpunycoded out.
         */
        std::string getPLD(const std::string& hostname) const;

        /**
         * Get the (TLD, PLD) pair of the hostname.
         *
         * The hostname must be _either_ fully punycoded or fully unpunycoded.
         * With a mix of punycoded and unpunycoded segments the answer may be
         * wrong. The response uses the same form as the input: punycoded in,
         * punycoded out; unpunycoded in, unpunycoded out.
         */
        std::pair<std::string, std::string> getBoth(const std::string& hostname) const;

    private:
        // Rule string -> level of that rule
        std::unordered_map<std::string, size_t> levels;

        // Number of segments in a hostname
        size_t countSegments(const std::string& hostname) const;

        // Number of segments in the TLD of the provided hostname
        size_t getTLDLength(const std::string& hostname) const;

        // The last `segments` segments of a hostname
        std::string getLastSegments(const std::string& hostname, size_t segments) const;

        /**
         * Add the provided host, trimming `trim` characters off the front and
         * adjusting its level by `level_adjust`.
         */
        void add(std::string& host, int level_adjust, size_t trim);
    };

}
#endif

409
src/punycode.cpp

@ -0,0 +1,409 @@
#include <algorithm>
#include <string>
#include <iostream>
#include "punycode.h"
#include "utf8.h"
namespace Url
{
/**
 * Punycode-encode `str` in place and return a reference to it.
 *
 * Follows the encoding pseudocode of RFC 3492 section 6.3, which is quoted
 * step by step in the comments below. The input is read as a sequence of
 * code points via Utf8::readCodepoint. Code points below 0x80 are copied to
 * the output first, followed by a '-' delimiter if there were any; the
 * remaining code points are then emitted as variable-length digit runs.
 *
 * Throws std::invalid_argument if the delta computation would overflow.
 */
std::string& Punycode::encode(std::string& str)
{
// Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.3
//
// let n = initial_n
// let delta = 0
// let bias = initial_bias
punycode_uint n = INITIAL_N;
punycode_uint delta = 0;
punycode_uint bias = INITIAL_BIAS;
std::string output;
// Accumulate the non-basic codepoints
// (every code point is stored here; basic ones are additionally copied to output)
std::vector<punycode_uint> codepoints;
for (auto it = str.cbegin(); it != str.cend(); )
{
Utf8::codepoint_t value = Utf8::readCodepoint(it, str.cend());
if (value < 0x80)
{
// copy them to the output in order
output.append(1, static_cast<char>(value));
}
codepoints.push_back(value);
}
// let h = b = the number of basic code points in the input
size_t h = output.size();
size_t b = h;
// copy a delimiter if b > 0
if (b > 0)
{
output.append(1, '-');
}
// while h < length(input) do begin
while (h < codepoints.size())
{
// let m = the minimum {non-basic} code point >= n in the input
punycode_uint m = MAX_PUNYCODE_UINT;
for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
{
if ((*it >= n) && (*it < m))
{
m = *it;
}
}
// let delta = delta + (m - n) * (h + 1), fail on overflow
if ((m - n) > ((MAX_PUNYCODE_UINT - delta) / (h + 1)))
{
throw std::invalid_argument("Overflow delta update.");
}
delta += (m - n) * (h + 1);
// let n = m
n = m;
// for each code point c in the input (in order) do begin
for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
{
// if c < n {or c is basic} then increment delta, fail on overflow
if (*it < n)
{
if (delta == MAX_PUNYCODE_UINT)
{
throw std::invalid_argument("Overflow delta increment.");
}
++delta;
}
// if c == n then begin
if (*it == n)
{
// let q = delta
punycode_uint q = delta;
// for k = base to infinity in steps of base do begin
for (punycode_uint k = BASE; ; k += BASE)
{
// let t = tmin if k <= bias {+ tmin}, or
// tmax if k >= bias + tmax, or k - bias otherwise
punycode_uint t = k <= bias ? TMIN :
k >= bias + TMAX ? TMAX : k - bias;
// if q < t then break
if (q < t)
{
break;
}
// output the code point for digit t + ((q - t) mod (base - t))
output.append(1, DIGIT_TO_BASIC[t + ((q - t) % (BASE - t))]);
// let q = (q - t) div (base - t)
q = (q - t) / (BASE - t);
}
// output the code point for digit q
output.append(1, DIGIT_TO_BASIC[q]);
// let bias = adapt(delta, h + 1, test h equals b?)
bias = adapt(delta, h + 1, h == b);
// let delta = 0
delta = 0;
// increment h
++h;
}
}
// increment delta and n
++delta;
++n;
}
// Overwrite the caller's string with the encoded form
str.assign(output);
return str;
}
/**
 * Return the punycode encoding of `str`, leaving `str` untouched: a copy is
 * made and encoded in place by the mutable-reference overload.
 */
std::string Punycode::encode(const std::string& str)
{
    std::string working = str;
    encode(working);
    return working;
}
/**
 * Punycode a hostname label by label.
 *
 * A hostname with no byte >= 0x80 is returned unchanged. Otherwise it is
 * split on '.', and each label that needs punycoding is replaced by "xn--"
 * followed by its encoding; other labels are copied as they are.
 */
std::string Punycode::encodeHostname(const std::string& hostname)
{
    // Avoid any punycoding at all if none is needed
    if (!needsPunycoding(hostname))
    {
        return hostname;
    }

    std::string result;
    size_t begin = 0;
    for (;;)
    {
        // `dot` is npos for the final label, which makes substr run to the end
        const size_t dot = hostname.find('.', begin);
        const std::string label = hostname.substr(begin, dot - begin);

        if (needsPunycoding(label))
        {
            result += "xn--";
            result += Punycode::encode(label);
        }
        else
        {
            result += label;
        }

        if (dot == std::string::npos)
        {
            return result;
        }

        result += '.';
        begin = dot + 1;
    }
}
/**
 * Punycode-decode `str` in place and return a reference to it.
 *
 * Follows the decoding pseudocode of RFC 3492 section 6.2, which is quoted
 * step by step in the comments below. The decoded code points are written
 * back through Utf8::writeCodepoint.
 *
 * Throws std::invalid_argument when the input has a non-basic code point,
 * ends in the middle of a digit run, contains a character that is not a
 * base-36 digit, or would overflow the arithmetic.
 */
std::string& Punycode::decode(std::string& str)
{
    // Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.2
    //
    // let n = initial_n
    // let i = 0
    // let bias = initial_bias
    // let output = an empty string indexed from 0
    punycode_uint n = INITIAL_N;
    punycode_uint i = 0;
    punycode_uint bias = INITIAL_BIAS;
    std::vector<punycode_uint> codepoints;

    size_t index = str.rfind('-');
    if (index == std::string::npos)
    {
        index = 0;
    }

    // consume all code points before the last delimiter (if there is one)
    // and copy them to output, fail on any non-basic code point
    for (auto it = str.begin(); it != (str.begin() + index); ++it)
    {
        if (static_cast<unsigned char>(*it) > 127U)
        {
            throw std::invalid_argument("Argument has non-basic code points.");
        }
        codepoints.push_back(*it);
    }

    // if more than zero code points were consumed then consume one more
    // (which will be the last delimiter)
    if (index > 0)
    {
        index += 1;
    }

    // while the input is not exhausted do begin
    for (auto it = (str.begin() + index); it != str.end(); ++it)
    {
        // let oldi = i
        // let w = 1
        punycode_uint oldi = i;
        punycode_uint w = 1;

        // for k = base to infinity in steps of base do begin
        for (punycode_uint k = BASE; ; k += BASE, ++it)
        {
            // consume a code point, or fail if there was none to consume
            if (it == str.end())
            {
                throw std::invalid_argument("Premature termination");
            }

            // let digit = the code point's digit-value, fail if it has none
            //
            // The byte is read as unsigned char: converting a negative char
            // straight to size_t would sign-extend into a huge index and read
            // far outside the table. A byte >= 0x80 can never be a base-36
            // digit, so it is rejected without consulting the table.
            // NOTE(review): BASIC_TO_DIGIT is declared outside this file; this
            // assumes it covers at least indices 0..127 - confirm.
            const unsigned char byte = static_cast<unsigned char>(*it);
            int lookup = (byte < 0x80) ? BASIC_TO_DIGIT[byte] : -1;
            if (lookup == -1)
            {
                throw std::invalid_argument("Invalid base 36 character.");
            }
            unsigned char digit = static_cast<unsigned char>(lookup);

            // let i = i + digit * w, fail on overflow
            if (digit > ((MAX_PUNYCODE_UINT - i) / w))
            {
                throw std::invalid_argument("Overflow on i.");
            }
            i += digit * w;

            // let t = tmin if k <= bias {+ tmin}, or
            //     tmax if k >= bias + tmax, or k - bias otherwise
            punycode_uint t = k <= bias ? TMIN :
                k >= bias + TMAX ? TMAX : k - bias;

            // if digit < t then break
            if (digit < t)
            {
                break;
            }

            // let w = w * (base - t), fail on overflow
            if (w > (MAX_PUNYCODE_UINT / (BASE - t)))
            {
                // I believe this line is unreachable without first overflowing i.
                // Since 'i' is updated above as i += digit * w, and w is updated as
                // w = w * (BASE - t), we should like to keep (BASE - t) > digit to
                // give 'w' a chance to overflow first. To keep t minimized, we must
                // have 'bias' maximized. `bias` is driven by the 'adapt' function
                // below.
                //
                // The value returned by 'adapt' increases with the input delta, and
                // decreases with the input size. The delta is a function of the input
                // size as well, on the order of (delta_n * input size), and
                // legitimate delta_n values are limited to 0x10FFFF (the maximum
                // unicode codepoint). Even setting that aside, the maximum value that
                // adapt() can return is adapt(2 ** 32 - 1, 1, false) = 204.
                //
                // Using this bias, we could use the input (HERE) to get iterations:
                //
                // digit = b = 1, i = 2, k = 36, t = 1, w = 35
                // digit = b = 1, i = 37, k = 72, t = 1, w = 1225
                // digit = b = 1, i = 1262, k = 108, t = 1, w = 42875
                // digit = b = 1, i = 44137, k = 144, t = 1, w = 1500625
                // digit = b = 1, i = 1544762, k = 180, t = 1, w = 52521875
                //
                // At this point, t now becomes TMAX (26) because k exceeds the bias
                // (since the maximum bias is 204). As such, the minimum continuation
                // value is 26:
                //
                // digit = 0 = 26, i = 1367113512, k = 216, t = 26, w = 525218750
                //
                // However, the next iteration now overflows i before we can get to
                // the w update.
                throw std::invalid_argument("Overflow on w."); // LCOV_EXCL_LINE
            }
            w *= (BASE - t);
        }

        // let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?)
        bias = adapt(i - oldi, codepoints.size() + 1, oldi == 0);

        // let n = n + i div (length(output) + 1), fail on overflow
        if ((i / (codepoints.size() + 1)) > (MAX_PUNYCODE_UINT - n))
        {
            throw std::invalid_argument("Overflow on n.");
        }
        n += i / (codepoints.size() + 1);

        // let i = i mod (length(output) + 1)
        i %= (codepoints.size() + 1);

        // insert n into output at position i
        codepoints.insert(codepoints.begin() + i, n);

        // increment i
        ++i;
    }

    // Serialise the decoded code points and overwrite the caller's string
    std::string output;
    for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
    {
        Utf8::writeCodepoint(output, *it);
    }

    str.assign(output);
    return str;
}
/**
 * Non-mutating decode: copy the punycoded input, run the in-place decoder on
 * the copy and hand the copy back. Any exception raised by the in-place
 * decoder propagates unchanged.
 */
std::string Punycode::decode(const std::string& str)
{
    std::string decoded = str;
    decode(decoded);
    return decoded;
}
/**
 * Decode a hostname label by label.
 *
 * The hostname is split on '.'; every label that starts with "xn--" has the
 * prefix removed and the remainder punycode-decoded, every other label is
 * copied through untouched. Labels are re-joined with '.'.
 */
std::string Punycode::decodeHostname(const std::string& hostname)
{
    std::string decoded;
    size_t label_begin = 0;

    for (;;)
    {
        const size_t dot = hostname.find('.', label_begin);

        // When there is no further dot, (dot - label_begin) is huge and
        // substr() clamps it to "the rest of the string".
        std::string label = hostname.substr(label_begin, dot - label_begin);

        if (label.compare(0, 4, "xn--") == 0)
        {
            std::string payload = label.substr(4);
            decoded.append(Punycode::decode(payload));
        }
        else
        {
            decoded.append(label);
        }

        if (dot == std::string::npos)
        {
            return decoded;
        }

        decoded.append(1, '.');
        label_begin = dot + 1;
    }
}
/**
 * Report whether the string contains any byte with the high bit set, i.e. any
 * byte outside the 7-bit range.
 */
bool Punycode::needsPunycoding(const std::string& str)
{
    for (const char ch : str)
    {
        if (static_cast<unsigned char>(ch) >= 0x80)
        {
            return true;
        }
    }
    return false;
}
/**
 * Bias adaptation function.
 *
 * Follows the pseudocode from https://tools.ietf.org/html/rfc3492#section-6.1
 * step by step. `delta` and `k` are local copies here, so modifying them has
 * no effect on the caller's variables of the same name.
 */
Punycode::punycode_uint Punycode::adapt(
    punycode_uint delta, punycode_uint numpoints, bool firsttime)
{
    // if firsttime then let delta = delta div damp
    // else let delta = delta div 2
    if (firsttime)
    {
        delta = delta / DAMP;
    }
    else
    {
        delta = delta >> 1;
    }

    // let delta = delta + (delta div numpoints)
    delta += (delta / numpoints);

    // while delta > ((base - tmin) * tmax) div 2 do begin
    //     let delta = delta div (base - tmin)
    //     let k = k + base
    // end
    const unsigned int threshold = ((BASE - TMIN) * TMAX) / 2;
    punycode_uint k = 0;
    while (delta > threshold)
    {
        delta /= (BASE - TMIN);
        k += BASE;
    }

    // return k + (((base - tmin + 1) * delta) div (delta + skew))
    return k + (((BASE - TMIN + 1) * delta) / (delta + SKEW));
}
};

105
src/punycode.h

@ -0,0 +1,105 @@
#ifndef PUNYCODE_CPP_H
#define PUNYCODE_CPP_H
#include <cstdint>
#include <limits>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "utf8.h"
namespace Url
{
    /**
     * Punycode encoding and decoding of UTF-8 strings and hostnames.
     *
     * The constants below carry the values RFC 3492 section 5 assigns to the
     * Punycode parameters.
     */
    namespace Punycode
    {
        // Integer type used for code points and for the algorithm's counters.
        typedef Utf8::codepoint_t punycode_uint;

        const unsigned int BASE         = 36;
        const unsigned int TMIN         = 1;
        const unsigned int TMAX         = 26;
        const unsigned int SKEW         = 38;
        const unsigned int DAMP         = 700;
        const unsigned int INITIAL_BIAS = 72;
        const unsigned int INITIAL_N    = 128;

        // Byte value -> base-36 digit value, or -1 when the byte is not a
        // base-36 digit. Letters (either case) map to 0-25, '0'-'9' to 26-35.
        // The table has exactly 256 entries, so it must be indexed with an
        // unsigned byte value.
        const std::vector<std::int8_t> BASIC_TO_DIGIT = {
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
            -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
            15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
            -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
            15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
        };

        // Base-36 digit value -> the character that represents it.
        const std::string DIGIT_TO_BASIC = "abcdefghijklmnopqrstuvwxyz0123456789";

        // The largest value punycode_uint can hold (NOT the highest Unicode
        // code point); the decoder's overflow checks are bounded by it.
        const punycode_uint MAX_PUNYCODE_UINT =
            std::numeric_limits<punycode_uint>::max();

        /**
         * Replace utf-8-encoded str into punycode.
         */
        std::string& encode(std::string& str);

        /**
         * Create a new punycoded string from utf-8-encoded input.
         */
        std::string encode(const std::string& str);

        /**
         * Encode a hostname.
         */
        std::string encodeHostname(const std::string& hostname);

        /**
         * Replace punycoded str into utf-8-encoded.
         */
        std::string& decode(std::string& str);

        /**
         * Create a new utf-8-encoded string from punycoded input.
         */
        std::string decode(const std::string& str);

        /**
         * Decode a hostname: labels starting with "xn--" are decoded, all
         * other labels are passed through.
         */
        std::string decodeHostname(const std::string& hostname);

        /**
         * Determine if a string needs punycoding (contains a byte >= 0x80).
         */
        bool needsPunycoding(const std::string& str);

        /**
         * Internal function for calculating bias.
         */
        punycode_uint adapt(
            punycode_uint delta, punycode_uint numpoints, bool firsttime);
    }
}
#endif

89
src/security.cpp

@ -0,0 +1,89 @@
#include <algorithm>
#include <functional>
#include <cctype>
#include <locale>
#include <sstream>
#include <iostream>
#include <unordered_map>
#include "url.h"
#include "security.h"
#include <Rcpp.h>
namespace SecTxt {
/**
 * Remove leading and trailing whitespace from `string`, in place.
 *
 * Whitespace is whatever std::isspace reports for the current C locale. Each
 * byte is converted to unsigned char first: handing a negative char value to
 * std::isspace is undefined behaviour, and the input may contain arbitrary
 * bytes. A lambda replaces std::ptr_fun / std::not1, which no longer exist
 * in C++17.
 */
void SecurityText::strip(std::string& string) {
    const auto not_space = [](char c) {
        return !std::isspace(static_cast<unsigned char>(c));
    };

    // Leading whitespace: erase everything before the first non-space byte.
    string.erase(string.begin(),
                 std::find_if(string.begin(), string.end(), not_space));

    // Trailing whitespace: search from the back; base() turns the reverse
    // iterator into the position just after the last non-space byte.
    string.erase(std::find_if(string.rbegin(), string.rend(), not_space).base(),
                 string.end());
}
/**
 * Read lines from `stream` until one yields a "key: value" pair.
 *
 * For each line: everything from the first '#' onwards is discarded (this
 * includes a '#' that appears inside a value, e.g. a URL fragment), the
 * remainder is split at the first ':', both halves are stripped of
 * surrounding whitespace and the key is lowercased. Lines without a ':' are
 * skipped.
 *
 * @param stream  input positioned at the next unread line
 * @param key     receives the lowercased, stripped key
 * @param value   receives the stripped value
 * @return true when a pair was produced, false when the stream is exhausted
 */
bool SecurityText::getpair(std::istringstream& stream, std::string& key, std::string& value) {
    while (std::getline(stream, key)) {
        // Drop the comment part of the line, if any
        size_t index = key.find('#');
        if (index != std::string::npos) key.resize(index);

        // Find the colon and divide it into key and value, skipping malformed lines
        index = key.find(':');
        if (index == std::string::npos) continue;
        value.assign(key.begin() + index + 1, key.end());
        key.resize(index);

        // Strip whitespace off of each
        strip(key);
        strip(value);

        // Lowercase the key. The file content is untrusted and may hold bytes
        // >= 0x80; ::tolower must not be given a negative value, so go through
        // unsigned char.
        std::transform(key.begin(), key.end(), key.begin(), [](char c) {
            return static_cast<char>(::tolower(static_cast<unsigned char>(c)));
        });
        return true;
    }
    return false;
}
/**
 * Parse the text of a security.txt file.
 *
 * The raw text is kept verbatim, a leading UTF-8 byte-order mark is skipped,
 * and every key/value pair produced by getpair() is appended, in file order,
 * to the parallel key and value lists.
 */
SecurityText::SecurityText(const std::string& content)
    : orig_file(content)
{
    std::istringstream lines(content);

    // Skip the three BOM bytes when the content starts with them
    const bool has_bom = content.compare(0, 3, "\xEF\xBB\xBF") == 0;
    if (has_bom)
    {
        lines.ignore(3);
    }

    for (std::string k, v; SecurityText::getpair(lines, k, v); )
    {
        st_keys.push_back(k);
        st_vals.push_back(v);
    }
}
// Return a copy of the original, unparsed file content.
std::string SecurityText::rawFile()
{
    return orig_file;
}
// Return a copy of the parsed keys, in file order.
std::vector<std::string> SecurityText::sectxtKeys()
{
    return st_keys;
}
// Return a copy of the parsed values; entry i belongs to key i.
std::vector<std::string> SecurityText::sectxtVals()
{
    return st_vals;
}
std::string SecurityText::securityUrl(const std::string& url) {
return Url::Url(url)
.setUserinfo("")
.setPath(".well-known/security.txt")
.setParams("")
.setQuery("")
.setFragment("")
.remove_default_port()
.str();
}
}

37
src/security.h

@ -0,0 +1,37 @@
#ifndef SECURITY_CPP_H
#define SECURITY_CPP_H
#include <sstream>
#include <unordered_map>
#include <vector>
namespace SecTxt {
// Parsed representation of a security.txt file: the raw text plus the
// key/value pairs extracted from it, stored as two parallel vectors.
class SecurityText {
public:
// Create a security.txt data structure from a utf-8-encoded string.
SecurityText(const std::string& content);
// Return the original, unparsed file content.
std::string rawFile();
// Return the parsed keys, in file order.
std::vector<std::string> sectxtKeys();
// Return the parsed values; entry i belongs to key i.
std::vector<std::string> sectxtVals();
// Return the security.txt URL corresponding to the provided URL.
static std::string securityUrl(const std::string& url);
private:
// Trim leading and trailing whitespace from `string` in place.
static void strip(std::string& string);
// Read the next "key: value" line from `stream`; returns false at end of input.
static bool getpair(std::istringstream& stream, std::string& key, std::string& value);
// Keys and values are appended together, so the two vectors always have the same length.
std::vector<std::string> st_keys;
std::vector<std::string> st_vals;
// Verbatim copy of the content passed to the constructor.
std::string orig_file;
};
}
#endif

66
src/securitymain.cpp

@ -0,0 +1,66 @@
#include <Rcpp.h>
using namespace Rcpp;
#include "url.h"
#include "security.h"
//' Parse security.txt
//'
//' @noRd
//'
// [[Rcpp::export]]
SEXP sectxt_parse(std::string content) {
    // Parse the text and wrap the heap-allocated result in an external
    // pointer so the R side can hold on to it.
    SecTxt::SecurityText* parsed = new SecTxt::SecurityText(content);
    Rcpp::XPtr<SecTxt::SecurityText> handle(parsed);
    return handle;
}
// Return the raw file text held by a parsed security.txt external pointer.
// [[Rcpp::export]]
std::string sectxt_raw(SEXP x) {
    Rcpp::XPtr<SecTxt::SecurityText> handle(x);
    return handle->rawFile();
}
// Return the keys held by a parsed security.txt external pointer.
// [[Rcpp::export]]
std::vector< std::string > sectxt_keys(SEXP x) {
    Rcpp::XPtr<SecTxt::SecurityText> handle(x);
    return handle->sectxtKeys();
}
//' Retrieve a data frame of security.txt keys/values
//'
//' @md
//' @param x a parsed `security.txt` created with [sec_parse()]
//' @return data frame
//' @export
// [[Rcpp::export]]
DataFrame sectxt_info(SEXP x) {
    Rcpp::XPtr<SecTxt::SecurityText> handle(x);

    // Two parallel columns: row i holds key i and value i.
    const std::vector<std::string> keys = handle->sectxtKeys();
    const std::vector<std::string> values = handle->sectxtVals();

    return DataFrame::create(
        Rcpp::Named("key") = keys,
        Rcpp::Named("value") = values);
}
//' Determine security.txt URL for a given site/URL
//'
//' Provide any URL for a resource and retrieve the URL for
//' the `security.txt` file. Strips off extraneous URL
//' components and appends `.well-known/security.txt`.
//'
//' @md
//' @param url URL
//' @return character vector
//' @export
//' @examples
//' sectxt_url("https://securitytxt.org/this/that/the/other.html")
// [[Rcpp::export]]
std::string sectxt_url(std::string url) {
    // All of the work happens in the static helper on SecurityText.
    const std::string location = SecTxt::SecurityText::securityUrl(url);
    return location;
}

962
src/url.cpp

@ -0,0 +1,962 @@
#include <algorithm>
#include <string>
#include <iterator>
#include <unordered_map>
#include <unordered_set>
#include <iostream>
#include <iterator>
#include <sstream>
#include "url.h"
#include "punycode.h"
namespace Url
{
/* Character classes */
// Definitions of the static character classes declared on Url. Several are
// composed from earlier ones via chars(), so the order of definition within
// this translation unit matters: a class must be defined before any class
// that is built from it.
const CharacterClass Url::GEN_DELIMS(":/?#[]@");
const CharacterClass Url::SUB_DELIMS("!$&'()*+,;=");
const CharacterClass Url::DIGIT("0123456789");
const CharacterClass Url::ALPHA(
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
// UNRESERVED = ALPHA + DIGIT + "-._~"
const CharacterClass Url::UNRESERVED(
Url::ALPHA.chars() + Url::DIGIT.chars() + "-._~");
// RESERVED = GEN_DELIMS + SUB_DELIMS
const CharacterClass Url::RESERVED(
Url::GEN_DELIMS.chars() + Url::SUB_DELIMS.chars());
// PCHAR = UNRESERVED + SUB_DELIMS + ":@"
const CharacterClass Url::PCHAR(
Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":@");
// Characters left unescaped in a path: PCHAR plus '/'
const CharacterClass Url::PATH(
Url::PCHAR.chars() + "/");
// Characters left unescaped in a query (also used for params by escape()): PCHAR plus "/?"
const CharacterClass Url::QUERY(
Url::PCHAR.chars() + "/?");
// Same character set as QUERY
const CharacterClass Url::FRAGMENT(
Url::PCHAR.chars() + "/?");
// Characters left unescaped in userinfo: UNRESERVED + SUB_DELIMS + ':'
const CharacterClass Url::USERINFO(
Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":");
// Upper-case hex digits; chars() is indexed by nibble value when emitting %XX
const CharacterClass Url::HEX("0123456789ABCDEF");
// Characters accepted in a scheme by the parsing constructor
const CharacterClass Url::SCHEME(
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+-.");
// 256-entry lookup table: byte value -> value of that byte as a hexadecimal
// digit (0-15), or -1 when the byte is not a hex digit. Both 'A'-'F' and
// 'a'-'f' are accepted. The table only covers indices 0..255.
const std::vector<signed char> Url::HEX_TO_DEC = {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1,
-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
// Default port per scheme. remove_default_port() clears port_ when it equals
// the entry for the URL's scheme; schemes not listed here never have their
// port removed.
const std::unordered_map<std::string, int> Url::PORTS = {
{"http", 80},
{"https", 443}
};
// Schemes for which relative_to() resolves the URL against a base; a URL
// whose scheme is not in this set is returned unchanged by relative_to().
// The empty string stands for "no scheme".
const std::unordered_set<std::string> Url::USES_RELATIVE = {
"",
"file",
"ftp",
"gopher",
"http",
"https",
"imap",
"mms",
"nntp",
"prospero",
"rtsp",
"rtspu",
"sftp",
"shttp",
"svn",
"svn+ssh",
"wais"
};
// Schemes that str() serialises as "scheme://"; any other non-empty scheme
// is serialised as "scheme:". The empty string stands for "no scheme".
const std::unordered_set<std::string> Url::USES_NETLOC = {
"",
"file",
"ftp",
"git",
"git+ssh",
"gopher",
"http",
"https",
"imap",
"mms",
"nfs",
"nntp",
"prospero",
"rsync",
"rtsp",
"rtspu",
"sftp",
"shttp",
"snews",
"svn",
"svn+ssh",
"telnet",
"wais"
};
// Schemes for which the parsing constructor splits ";params" off the path.
// For any other scheme a ';' stays part of the path. The empty string stands
// for "no scheme".
const std::unordered_set<std::string> Url::USES_PARAMS = {
"",
"ftp",
"hdl",
"http",
"https",
"imap",
"mms",
"prospero",
"rtsp",
"rtspu",
"sftp",
"shttp",
"sip",
"sips",
"tel"
};
// Schemes the parsing constructor recognises when the text after the first
// ':' consists only of digits (e.g. "name:8080"): the leading part is kept as
// a scheme only if it appears in this set, otherwise it is not treated as a
// scheme. The empty string stands for "no scheme".
const std::unordered_set<std::string> Url::KNOWN_PROTOCOLS = {
"",
"file",
"ftp",
"git",
"git+ssh",
"gopher",
"hdl",
"http",
"https",
"imap",
"mms",
"nfs",
"nntp",
"prospero",
"rsync",
"rtsp",
"rtspu",
"sftp",
"shttp",
"sip",
"sips",
"sms",
"snews",
"svn",
"svn+ssh",
"tel",
"telnet",
"wais"
};
/**
 * Parse `url` into scheme, userinfo, host, port, path, params, query and
 * fragment.
 *
 * - The scheme is the text before the first ':' when all of it is made of
 *   SCHEME characters. If only digits follow the ':' (as in "host:8080") the
 *   prefix is kept as a scheme only when it is one of KNOWN_PROTOCOLS.
 * - A following "//" introduces the network location, which runs up to the
 *   next '/', '?' or '#'. Userinfo is split off at the first '@', the host is
 *   lowercased and a trailing ":port" is parsed. For a bracketed host such as
 *   "[::1]" the port separator is only looked for after the closing ']', so
 *   the colons inside an IPv6 literal are not mistaken for a port.
 * - The remainder is the path, from which the fragment ('#'), the query ('?')
 *   and - for schemes in USES_PARAMS - the params (';') are split off.
 *
 * @throws UrlParseException when the port is not a number, is negative, is
 *         above 65535 or does not fit in an int.
 */
Url::Url(const std::string& url): port_(0), has_params_(false), has_query_(false)
{
    // ::tolower must not be given a negative value, and hostnames may contain
    // UTF-8 bytes >= 0x80, so convert through unsigned char.
    const auto lower = [](char c) {
        return static_cast<char>(::tolower(static_cast<unsigned char>(c)));
    };

    size_t position = 0;
    size_t index = url.find(':');

    // All the characters in our would-be scheme must be in SCHEME
    if (index != std::string::npos
        && std::all_of(
            url.begin(),
            url.begin() + index,
            [](char c) { return SCHEME(c); }))
    {
        scheme_.assign(url, 0, index);
        std::transform(scheme_.begin(), scheme_.end(), scheme_.begin(), lower);

        // "something:1234" is ambiguous between scheme:path and host:port.
        const bool only_digits_follow =
            (index + 1) < url.length()
            && std::all_of(
                url.begin() + index + 1,
                url.end(),
                [](char c) { return DIGIT(c); });

        if (!only_digits_follow
            || KNOWN_PROTOCOLS.find(scheme_) != KNOWN_PROTOCOLS.end())
        {
            position = index + 1;
        }
        else
        {
            scheme_.clear();
        }
    }

    // Search for the netloc. compare() clamps at the end of the string, so
    // this is safe when fewer than two characters remain.
    if (url.compare(position, 2, "//") == 0)
    {
        // Skip the '//'
        position += 2;
        index = url.find_first_of("/?#", position);
        // When nothing follows the netloc, index is npos and assign() clamps.
        host_.assign(url, position, index - position);
        position = index;

        // Extract any userinfo if there is any
        index = host_.find('@');
        if (index != std::string::npos)
        {
            userinfo_.assign(host_, 0, index);
            host_.erase(0, index + 1);
        }

        // Lowercase the hostname
        std::transform(host_.begin(), host_.end(), host_.begin(), lower);

        // Try to find a port. For a bracketed literal ("[...]") only a ':'
        // after the closing bracket can start the port.
        size_t port_search_start = 0;
        if (!host_.empty() && host_[0] == '[')
        {
            const size_t closing = host_.find(']');
            if (closing != std::string::npos)
            {
                port_search_start = closing + 1;
            }
        }

        index = host_.find(':', port_search_start);
        if (index != std::string::npos)
        {
            const std::string portText(host_, index + 1, std::string::npos);
            host_.resize(index);

            if (portText.empty())
            {
                // "host:" -- no port given
                port_ = 0;
            }
            else
            {
                try
                {
                    port_ = std::stoi(portText, &index);

                    if (index != portText.length())
                    {
                        // Malformed port
                        throw UrlParseException("Port not a number: " + portText);
                    }

                    if (port_ > 65535)
                    {
                        throw UrlParseException("Port too high: " + portText);
                    }
                    else if (port_ < 0)
                    {
                        throw UrlParseException("Port negative: " + portText);
                    }
                }
                catch (const std::invalid_argument&)
                {
                    // Malformed port
                    throw UrlParseException("Port not a number: " + portText);
                }
                catch (const std::out_of_range&)
                {
                    throw UrlParseException("Port out of integer range: " + portText);
                }
            }
        }
    }

    // position is npos when the netloc ran to the end of the input
    if (position != std::string::npos)
    {
        path_.assign(url, position, std::string::npos);

        index = path_.find('#');
        if (index != std::string::npos)
        {
            fragment_.assign(path_, index + 1, std::string::npos);
            path_.resize(index);
        }

        index = path_.find('?');
        if (index != std::string::npos)
        {
            query_.assign(path_, index + 1, std::string::npos);
            has_query_ = true;
            path_.resize(index);
        }

        if (USES_PARAMS.find(scheme_) != USES_PARAMS.end())
        {
            index = path_.find(';');
            if (index != std::string::npos)
            {
                params_.assign(path_, index + 1, std::string::npos);
                has_params_ = true;
                path_.resize(index);
            }
        }
    }
}
// Take on the value of `other` via copy assignment; returns *this.
Url& Url::assign(const Url& other)
{
    *this = other;
    return *this;
}
// Strict equality: every component and both "has" flags must match exactly.
bool Url::operator==(const Url& other) const
{
    if (scheme_ != other.scheme_)         return false;
    if (userinfo_ != other.userinfo_)     return false;
    if (host_ != other.host_)             return false;
    if (port_ != other.port_)             return false;
    if (path_ != other.path_)             return false;
    if (params_ != other.params_)         return false;
    if (query_ != other.query_)           return false;
    if (fragment_ != other.fragment_)     return false;
    if (has_params_ != other.has_params_) return false;
    return has_query_ == other.has_query_;
}
// Negation of operator==.
bool Url::operator!=(const Url& other) const
{
    return !(*this == other);
}
bool Url::equiv(const Url& other)
{
Url self_(*this);
Url other_(other);
self_.strip()
.sort_query()
.defrag()
.deuserinfo()
.abspath()
.escape()
.punycode()
.remove_default_port();
other_.strip()
.sort_query()
.defrag()
.deuserinfo()
.abspath()
.escape()
.punycode()
.remove_default_port();
return self_ == other_;
}
/**
 * Collapse every run of `chr` in `str` to a single occurrence and drop any
 * leading or trailing occurrence. Modifies `str` in place and returns it.
 */
std::string& Url::remove_repeats(std::string& str, const char chr)
{
    std::string squeezed;
    squeezed.reserve(str.length());

    // Pretending the character "before the start" was chr makes leading
    // occurrences look like repeats, so they are dropped too.
    bool previous_was_chr = true;
    for (const char current : str)
    {
        const bool is_chr = (current == chr);
        if (!(previous_was_chr && is_chr))
        {
            squeezed.push_back(current);
        }
        previous_was_chr = is_chr;
    }

    // Remove the last character if it happens to be chr
    if (!squeezed.empty() && squeezed.back() == chr)
    {
        squeezed.pop_back();
    }

    str.assign(squeezed);
    return str;
}
/**
 * Serialise path, params, query and fragment. The result always begins with
 * '/': one is prepended when the path is empty or does not start with it.
 */
std::string Url::fullpath() const
{
    std::string out;

    const bool rooted = !path_.empty() && path_[0] == '/';
    if (!rooted)
    {
        out += '/';
    }
    out += path_;

    if (has_params_)
    {
        out += ';';
        out += params_;
    }

    if (has_query_)
    {
        out += '?';
        out += query_;
    }

    if (!fragment_.empty())
    {
        out += '#';
        out += fragment_;
    }

    return out;
}
/**
 * Serialise the URL to a new string, component by component:
 * scheme, userinfo, host, port, path, params, query, fragment.
 */
std::string Url::str() const
{
    std::string out;

    // Scheme prefix: "scheme://" for netloc schemes, "scheme:" otherwise; a
    // scheme-less URL with a host is written scheme-relative ("//host").
    if (scheme_.empty())
    {
        if (!host_.empty())
        {
            out += "//";
        }
    }
    else
    {
        out += scheme_;
        out += (USES_NETLOC.count(scheme_) != 0) ? "://" : ":";
    }

    if (!userinfo_.empty())
    {
        out += userinfo_;
        out += '@';
    }

    // Appending an empty host is a no-op
    out += host_;

    // A port of 0 means "no port"
    if (port_ != 0)
    {
        out += ':';
        out += std::to_string(port_);
    }

    if (!path_.empty())
    {
        // Separate host and path when the path is not already rooted
        if (!host_.empty() && path_[0] != '/')
        {
            out += '/';
        }
        out += path_;
    }
    else if (!out.empty())
    {
        // Empty path after a scheme/host: emit the root
        out += '/';
    }

    if (has_params_)
    {
        out += ';';
        out += params_;
    }

    if (has_query_)
    {
        out += '?';
        out += query_;
    }

    if (!fragment_.empty())
    {
        out += '#';
        out += fragment_;
    }

    return out;
}
/**
 * Remove meaningless separators: leading '?' characters from the query, then
 * repeated/leading/trailing '&' from the query and ';' from the params. The
 * has_query_/has_params_ flags are refreshed through the setters.
 */
Url& Url::strip()
{
    // erase(0, npos) clears the whole string, which covers the case where the
    // query consists only of '?' characters (or is empty).
    query_.erase(0, query_.find_first_not_of('?'));

    remove_repeats(query_, '&');
    setQuery(query_);

    remove_repeats(params_, ';');
    setParams(params_);

    return *this;
}
// Normalise path_ in place: empty segments (repeated '/') are dropped, "."
// segments are removed and ".." segments remove the previously emitted
// segment. The result is rebuilt in `copy` and assigned back to path_.
// A trailing '/' is kept only when the last thing seen was a directory
// reference (a trailing '/', "." or "..").
Url& Url::abspath()
{
std::string copy;
// Offsets into `copy` where each emitted segment begins; ".." truncates
// `copy` back to the most recent offset.
std::vector<size_t> segment_starts;
if (path_.size() >= 1 && path_[0] == '/')
{
copy.append(1, '/');
segment_starts.push_back(0);
// NOTE(review): because offset 0 is on the stack for absolute paths, a ".."
// at the root truncates `copy` to empty, so "/../a" appears to produce "a"
// rather than "/a" -- confirm whether that is intended.
}
// True when the most recently processed segment denotes a directory
bool directory = false;
// `previous` is the start of the current segment, `index` the '/' ending it
size_t previous = 0;
size_t index = 0;
for (index = path_.find('/')
; index != std::string::npos
; previous = index + 1, index = path_.find('/', index + 1))
{
// Skip empty segments
if (index - previous == 0)
{
continue;
}
// ".." -- drop the last emitted segment, if there is one
if ((index - previous == 2)
&& path_[previous] == '.'
&& path_[previous + 1] == '.')
{
if (!segment_starts.empty())
{
copy.resize(segment_starts.back());
segment_starts.pop_back();
}
directory = true;
}
// "." -- nothing to emit
else if ((index - previous == 1) && path_[previous] == '.')
{
directory = true;
}
// Ordinary segment: remember where it starts and emit it with a '/'
else
{
segment_starts.push_back(copy.length());
copy.append(path_, previous, index - previous);
copy.append(1, '/');
directory = false;
}
}
// Handle the last segment
index = path_.length();
// Nothing after the final '/' (or the path is empty)
if (previous == path_.length())
{
directory = true;
}
else if ((index - previous == 1) && path_[previous] == '.')
{
directory = true;
}
else if ((index - previous == 2)
&& path_[previous] == '.'
&& path_[previous + 1] == '.')
{
if (!segment_starts.empty())
{
copy.resize(segment_starts.back());
}
directory = true;
}
else
{
copy.append(path_, previous, index - previous);
copy.append(1, '/');
directory = false;
}
// A final file segment was emitted with a '/' appended; take it off again
if (!directory && copy.size() >= 1)
{
copy.resize(copy.size() - 1);
}
// A directory reference that left nothing behind becomes the root
else if (directory && copy.empty())
{
copy.append(1, '/');
}
path_.assign(copy);
return *this;
}
// Resolve this URL against the base URL `other`, storing the result in this
// object. Components missing from this URL (scheme, then host/port/userinfo,
// then path and possibly params/query/fragment) are taken from `other`.
Url& Url::relative_to(const Url& other)
{
// If this scheme does not use relative, return it unchanged
if (USES_RELATIVE.find(scheme_) == USES_RELATIVE.end())
{
return *this;
}
// Support scheme-relative URLs
if (scheme_.empty())
{
scheme_ = other.scheme_;
}
// If this is an absolute URL (or scheme-relative), return early
if (!host_.empty()) {
return *this;
}
// If it's not an absolute URL, we need to copy the other host and port
host_ = other.host_;
port_ = other.port_;
userinfo_ = other.userinfo_;
// If the path portion is absolute, then bail out early.
if (!path_.empty() && path_.front() == '/')
{
return *this;
}
// Otherwise, this is a path that need to be evaluated relative to the other. If
// there is no '/', then we just keep our current path if it's not empty.
if (path_.empty())
{
// No path and no params: inherit the base's path and params, and its
// query too unless this URL has its own.
if (params_.empty())
{
path_ = other.path_;
params_ = other.params_;
has_params_ = other.has_params_;
if (query_.empty())
{
query_ = other.query_;
has_query_ = other.has_query_;
}
}
// No path but own params: use the base's directory (everything up to and
// including its last '/'). When the base path has no '/', rfind returns
// npos and npos + 1 wraps to 0, giving an empty path.
else
{
path_.assign(other.path_, 0, other.path_.rfind('/') + 1);
}
// Inherit the base's fragment only when this URL has a path-less reference
// and no fragment of its own.
if (fragment_.empty())
{
fragment_ = other.fragment_;
}
}
else
{
// Relative path: append it to the base's directory
size_t index = other.path_.rfind('/');
if (index != std::string::npos)
{
path_ = other.path_.substr(0, index + 1) + path_;
}
// Base path has no '/': root the path when a host (copied from the base
// above) is present
else if (!host_.empty())
{
path_ = "/" + path_;
}
}
return *this;
}
/**
 * Escape path, query, params and userinfo, each against its own set of safe
 * characters (params share the QUERY set). The fragment is left untouched.
 */
Url& Url::escape(bool strict)
{
    escape(path_,     PATH,     strict);
    escape(query_,    QUERY,    strict);
    escape(params_,   QUERY,    strict);
    escape(userinfo_, USERINFO, strict);

    return *this;
}
/**
 * Percent-escape `str` in place so that it contains only characters from
 * `safe` (plus '%XX' sequences), and return it.
 *
 * Existing valid '%XX' sequences are first decoded. In non-strict mode every
 * such sequence is decoded; in strict mode only those whose byte is both in
 * `safe` and not in RESERVED are decoded, the others are kept with their hex
 * digits uppercased. After that, any byte not in `safe` is written as '%XX'
 * with uppercase hex digits. A '%' that is not followed by two hex digits is
 * treated as an ordinary character.
 *
 * @param str     string to escape (modified)
 * @param safe    characters that may appear unescaped
 * @param strict  see above
 */
std::string& Url::escape(std::string& str, const CharacterClass& safe, bool strict)
{
    std::string copy(str);
    size_t dest = 0;

    // Allocate space pessimistically -- if every entity is expanded, it will take 3x
    // the space.
    str.resize(str.length() * 3);

    for (size_t src = 0; src < copy.length(); ++src)
    {
        // An escape sequence needs the '%' plus two more characters
        if (copy[src] == '%' && (copy.length() - src) > 2)
        {
            // Read ahead to see if there's a valid escape sequence. If not, treat
            // this like a normal character. HEX_TO_DEC has 256 entries, so the
            // bytes must be used as unsigned values: a plain (signed) char
            // >= 0x80 would turn into an out-of-range index.
            const signed char high =
                HEX_TO_DEC[static_cast<unsigned char>(copy[src + 1])];
            const signed char low =
                HEX_TO_DEC[static_cast<unsigned char>(copy[src + 2])];

            if (high != -1 && low != -1)
            {
                const int value = high * 16 + low;

                // In strict mode, we can only unescape parameters if they are both
                // safe and not reserved
                if (!strict || (safe(value) && !RESERVED(value)))
                {
                    // Replace src + 2 with that byte, advance src to consume it and
                    // continue.
                    src += 2;
                    copy[src] = static_cast<char>(value);
                }
                else
                {
                    // Keep the sequence, normalising the hex digits to uppercase
                    // (both are known to be ASCII hex digits here).
                    str[dest++] = copy[src++];
                    str[dest++] = ::toupper(copy[src++]);
                    str[dest++] = ::toupper(copy[src]);
                    continue;
                }
            }
        }

        if (!safe(copy[src]))
        {
            // Not safe -- replace with %XX
            str[dest++] = '%';
            str[dest++] = HEX.chars()[(copy[src] >> 4) & 0xF];
            str[dest++] = HEX.chars()[copy[src] & 0xF];
        }
        else
        {
            str[dest++] = copy[src];
        }
    }

    str.resize(dest);
    return str;
}
/**
 * Decode '%XX' sequences in path, query, params and userinfo. The fragment
 * is left untouched.
 */
Url& Url::unescape()
{
    unescape(path_);
    unescape(query_);
    unescape(params_);
    unescape(userinfo_);

    return *this;
}
/**
 * Decode every valid '%XX' sequence in `str` to the byte it denotes, in
 * place, and return `str`. A '%' that is not followed by two hex digits is
 * copied through unchanged.
 */
std::string& Url::unescape(std::string& str)
{
    const std::string copy(str);
    size_t dest = 0;

    // The output is never longer than the input, so `str` can be overwritten
    // front to back while reading from the copy.
    for (size_t src = 0; src < copy.length(); ++src, ++dest)
    {
        // An escape sequence needs the '%' plus two more characters
        if (copy[src] == '%' && (copy.length() - src) > 2)
        {
            // Read ahead to see if there's a valid escape sequence. If not, treat
            // this like a normal character. HEX_TO_DEC has 256 entries, so the
            // bytes must be used as unsigned values: a plain (signed) char
            // >= 0x80 would turn into an out-of-range index.
            const signed char high =
                HEX_TO_DEC[static_cast<unsigned char>(copy[src + 1])];
            const signed char low =
                HEX_TO_DEC[static_cast<unsigned char>(copy[src + 2])];

            if (high != -1 && low != -1)
            {
                // Emit the decoded byte and skip the two hex digits
                src += 2;
                str[dest] = static_cast<char>(high * 16 + low);
                continue;
            }
        }

        // Either not a % or an incomplete entity
        str[dest] = copy[src];
    }

    str.resize(dest);
    return str;
}
/**
 * Remove every query and params entry whose name, lowercased, appears in
 * `blacklist`. The has_query_/has_params_ flags are refreshed through the
 * setters.
 *
 * @param blacklist  names to remove; entries are compared against the
 *                   lowercased parameter name
 */
Url& Url::deparam(const std::unordered_set<std::string>& blacklist)
{
    // Predicate is if it's present in the blacklist. The set is captured by
    // reference (it outlives the predicate) rather than copied. The name is
    // lowercased in place; ::tolower must not be given a negative value, so
    // each byte goes through unsigned char.
    const deparam_predicate predicate =
        [&blacklist](std::string& name, const std::string&)
        {
            std::transform(name.begin(), name.end(), name.begin(), [](char c) {
                return static_cast<char>(
                    ::tolower(static_cast<unsigned char>(c)));
            });
            return blacklist.find(name) != blacklist.end();
        };

    setQuery(remove_params(query_, predicate, '&'));
    setParams(remove_params(params_, predicate, ';'));
    return *this;
}
/**
 * Remove every query ('&'-separated) and params (';'-separated) entry for
 * which `predicate(name, value)` returns true, then refresh the
 * has_query_/has_params_ flags through the setters.
 */
Url& Url::deparam(const deparam_predicate& predicate)
{
    remove_params(query_, predicate, '&');
    setQuery(query_);

    remove_params(params_, predicate, ';');
    setParams(params_);

    return *this;
}
/**
 * Filter the `sep`-separated pieces of `str` in place and return `str`.
 *
 * Each piece is split at its first '=' into a name and a value (the value is
 * empty when there is no '='). Pieces for which `predicate(name, value)`
 * returns true are dropped; the rest are re-joined with `sep` in their
 * original order. An empty piece after the last separator is not examined.
 */
std::string& Url::remove_params(std::string& str,
                                const deparam_predicate& predicate,
                                char sep)
{
    std::string kept;
    std::string name;
    std::string value;

    // Examine one piece and append it to `kept` unless the predicate rejects it.
    const auto consider = [&](const std::string& piece)
    {
        const size_t equals = piece.find('=');
        name.assign(piece, 0, equals);
        value.clear();
        if (equals != std::string::npos)
        {
            value.assign(piece, equals + 1, std::string::npos);
        }

        if (predicate(name, value))
        {
            return;
        }

        // No separator before the first thing written
        if (!kept.empty())
        {
            kept.append(1, sep);
        }
        kept.append(piece);
    };

    // Every piece that is terminated by a separator
    size_t begin = 0;
    for (size_t end = str.find(sep)
        ; end != std::string::npos
        ; end = str.find(sep, begin))
    {
        consider(str.substr(begin, end - begin));
        begin = end + 1;
    }

    // The final piece, when something follows the last separator
    if (begin < str.length())
    {
        consider(str.substr(begin));
    }

    str.assign(kept);
    return str;
}
/**
 * Sort the '&'-separated entries of the query and the ';'-separated entries
 * of the params.
 */
Url& Url::sort_query()
{
    split_sort_join(query_,  '&');
    split_sort_join(params_, ';');

    return *this;
}
/**
 * Split `str` on `glue`, sort the pieces and join them with `glue` again,
 * in place. An empty string, or one that splits into a single piece, is
 * returned unchanged. Pieces are obtained with getline, so an empty piece
 * after a trailing `glue` is not produced.
 */
std::string& Url::split_sort_join(std::string& str, const char glue)
{
    // Return early if empty
    if (str.empty())
    {
        return str;
    }

    // Split
    std::vector<std::string> parts;
    {
        std::stringstream reader(str);
        for (std::string part; getline(reader, part, glue); )
        {
            parts.push_back(part);
        }
    }

    // Return early if it's just a single element
    if (parts.size() == 1)
    {
        return str;
    }

    // Sort
    std::sort(parts.begin(), parts.end());

    // Join
    std::string joined;
    for (size_t i = 0; i < parts.size(); ++i)
    {
        if (i != 0)
        {
            joined += glue;
        }
        joined += parts[i];
    }

    str.assign(joined);
    return str;
}
/**
 * Reset the port to 0 ("no port") when it equals the default listed in PORTS
 * for this URL's scheme. Nothing happens when there is no port, no scheme, or
 * the scheme has no entry in PORTS.
 */
Url& Url::remove_default_port()
{
    if (port_ == 0 || scheme_.empty())
    {
        return *this;
    }

    const auto known = PORTS.find(scheme_);
    if (known != PORTS.end() && known->second == port_)
    {
        port_ = 0;
    }

    return *this;
}
// Drop the userinfo component.
Url& Url::deuserinfo()
{
    userinfo_.erase();
    return *this;
}
// Drop the fragment component.
Url& Url::defrag()
{
    fragment_.erase();
    return *this;
}
/**
 * Replace the host with its punycoded form. The host is validated with
 * check_hostname() both before and after encoding; check_hostname() throws
 * std::invalid_argument for an over-long or empty label, and the host is only
 * replaced once both checks have passed.
 */
Url& Url::punycode()
{
    check_hostname(host_);

    std::string ascii_host = Punycode::encodeHostname(host_);
    check_hostname(ascii_host);

    host_.swap(ascii_host);
    return *this;
}
// Replace the host with its punycode-decoded form.
Url& Url::unpunycode()
{
    std::string decoded_host = Punycode::decodeHostname(host_);
    host_.swap(decoded_host);
    return *this;
}
/**
 * Reverse the order of the host's labels (a.b.c.d => d.c.b.a).
 *
 * Done in two passes: reverse the whole string, which puts the labels in the
 * right order but spells each one backwards, then reverse every label back.
 */
Url& Url::host_reversed()
{
    std::reverse(host_.begin(), host_.end());

    size_t label_start = 0;
    while (label_start < host_.size())
    {
        const size_t dot = host_.find('.', label_start);
        const bool last_label = (dot == std::string::npos);
        const size_t label_end = last_label ? host_.size() : dot;

        std::reverse(host_.begin() + label_start, host_.begin() + label_end);

        if (last_label)
        {
            break;
        }
        label_start = dot + 1;
    }

    return *this;
}
/**
 * Validate the labels of `host` and strip one trailing '.'.
 *
 * An empty host is accepted as is. Otherwise every '.'-terminated label must
 * be 1 to 63 bytes long and the final label at most 63 bytes; an empty final
 * label (i.e. a trailing '.') is removed from `host`.
 *
 * @throws std::invalid_argument "Label too long." or "Empty label."
 */
void Url::check_hostname(std::string& host)
{
    // Skip empty hostnames -- they are valid
    if (host.empty())
    {
        return;
    }

    size_t label_start = 0;
    for (size_t dot = host.find('.')
        ; dot != std::string::npos
        ; dot = host.find('.', label_start))
    {
        const size_t label_length = dot - label_start;
        if (label_length > 63)
        {
            throw std::invalid_argument("Label too long.");
        }
        if (label_length == 0)
        {
            throw std::invalid_argument("Empty label.");
        }
        label_start = dot + 1;
    }

    // For the final segment
    const size_t tail_length = host.size() - label_start;
    if (tail_length > 63)
    {
        throw std::invalid_argument("Label too long.");
    }
    if (tail_length == 0 && label_start > 1)
    {
        // Remove a trailing empty segment
        host.resize(label_start - 1);
    }
}
};

323
src/url.h

@ -0,0 +1,323 @@
#ifndef URL_CPP_H
#define URL_CPP_H
#include <stdexcept>
#include <functional>
#include <string>
#include <vector>
#include <unordered_map>
#include <unordered_set>
namespace Url
{
/**
 * Thrown when a URL cannot be parsed; what() returns the message given at
 * construction.
 */
struct UrlParseException : public std::logic_error
{
    UrlParseException(const std::string& message)
        : std::logic_error(message)
    {
    }
};
/**
 * A set of byte values with constant-time membership testing.
 *
 * The set is built from the characters of a string and stored as a 256-entry
 * table indexed by the unsigned value of each byte.
 */
struct CharacterClass
{
    /**
     * Build the class from every character in `chars`.
     */
    CharacterClass(const std::string& chars) : chars_(chars), map_(256, false)
    {
        for (auto it = chars_.begin(); it != chars_.end(); ++it)
        {
            // Index by the unsigned byte value, exactly as operator() does.
            // Casting a negative char straight to size_t would yield an index
            // far outside the 256-entry table.
            map_[static_cast<unsigned char>(*it)] = true;
        }
    }

    /**
     * True when `c` is a member of the class.
     */
    bool operator()(char c) const
    {
        return map_[static_cast<unsigned char>(c)];
    }

    /**
     * The characters the class was constructed from, in their original order.
     */
    const std::string& chars() const
    {
        return chars_;
    }

private:
    // Private, unimplemented to prevent use
    CharacterClass();
    CharacterClass(const CharacterClass& other);

    std::string chars_;
    std::vector<bool> map_;
};
/**
 * A URL split into its components (scheme, userinfo, host, port, path,
 * params, query, fragment), with chainable methods that normalise or rewrite
 * those components in place and str() to serialise the result.
 */
struct Url
{
/* Character classes */
const static CharacterClass GEN_DELIMS;
const static CharacterClass SUB_DELIMS;
const static CharacterClass ALPHA;
const static CharacterClass DIGIT;
const static CharacterClass UNRESERVED;
const static CharacterClass RESERVED;
const static CharacterClass PCHAR;
const static CharacterClass PATH;
const static CharacterClass QUERY;
const static CharacterClass FRAGMENT;
const static CharacterClass USERINFO;
const static CharacterClass HEX;
const static CharacterClass SCHEME;
/* Lookup tables: hex digit values, default ports and per-scheme behaviour */
const static std::vector<signed char> HEX_TO_DEC;
const static std::unordered_map<std::string, int> PORTS;
const static std::unordered_set<std::string> USES_RELATIVE;
const static std::unordered_set<std::string> USES_NETLOC;
const static std::unordered_set<std::string> USES_PARAMS;
const static std::unordered_set<std::string> KNOWN_PROTOCOLS;
// The type of the predicate used for removing parameters
typedef std::function<bool(std::string&, std::string&)> deparam_predicate;
/**
* Parse the provided string into its components.
*/
explicit Url(const std::string& url);
/**
* Copy every component and flag from the other URL.
*/
Url(const Url& other)
: scheme_(other.scheme_)
, host_(other.host_)
, port_(other.port_)
, path_(other.path_)
, params_(other.params_)
, query_(other.query_)
, fragment_(other.fragment_)
, userinfo_(other.userinfo_)
, has_params_(other.has_params_)
, has_query_(other.has_query_) { }
/**
* Take on the value of the other URL.
*/
Url& assign(const Url& other);
/**
* To be considered equal, all fields must be equal.
*/
bool operator==(const Url& other) const;
bool operator!=(const Url& other) const;
/**
* Two URLs are considered equivalent if they have the same meaning.
*/
bool equiv(const Url& other);
/**************************************
* Component-wise access and setting. *
**************************************/
const std::string& scheme() const { return scheme_; }
Url& setScheme(const std::string& s)
{
scheme_ = s;
return *this;
}
const std::string& host() const { return host_; }
Url& setHost(const std::string& s)
{
host_ = s;
return *this;
}
// A port of 0 means that no port is set.
const int port() const { return port_; }
Url& setPort(int i)
{
port_ = i;
return *this;
}
const std::string& path() const { return path_; }
Url& setPath(const std::string& s)
{
path_ = s;
return *this;
}
const std::string& params() const { return params_; }
// Setting empty params also clears has_params_, so no ';' is emitted.
Url& setParams(const std::string& s)
{
params_ = s;
has_params_ = !s.empty();
return *this;
}
const std::string& query() const { return query_; }
// Setting an empty query also clears has_query_, so no '?' is emitted.
Url& setQuery(const std::string& s)
{
query_ = s;
has_query_ = !s.empty();
return *this;
}
const std::string& fragment() const { return fragment_; }
Url& setFragment(const std::string& s)
{
fragment_ = s;
return *this;
}
const std::string& userinfo() const { return userinfo_; }
Url& setUserinfo(const std::string& s)
{
userinfo_ = s;
return *this;
}
/**
* Get a representation of all components of the path, params, query, fragment.
*
* Always includes a leading /.
*/
std::string fullpath() const;
/**
* Get a new string representation of the URL.
**/
std::string str() const;
/*********************
* Chainable methods *
*********************/
/**
* Strip semantically meaningless excess '?', '&', and ';' characters from query
* and params.
*/
Url& strip();
/**
* Make the path absolute.
*
* Evaluate '.', '..', and excessive slashes.
*/
Url& abspath();
/**
* Evaluate this URL relative to `other`, placing the result in this object.
*
* `other` is parsed as a URL first.
*/
Url& relative_to(const std::string& other)
{
return relative_to(Url(other));
}
/**
* Evaluate this URL relative to `other`, placing the result in this object.
*/
Url& relative_to(const Url& other);
/**
* Ensure that the path, params, query, and userinfo are properly escaped.
*
* In 'strict' mode, only entities that are both safe and not reserved characters
* are unescaped. In non-strict mode, entities that are safe are unescaped.
*/
Url& escape(bool strict=false);
/**
* Unescape all entities in the path, params, query, and userinfo.
*/
Url& unescape();
/**
* Remove any params or queries that appear in the blacklist.
*
* The blacklist should contain only lowercased strings, and the comparison is
* done in a case-insensitive way.
*/
Url& deparam(const std::unordered_set<std::string>& blacklist);
/**
* Filter params subject to a predicate for whether it should be filtered.
*
* The predicate must accept two string refs -- the key and value (which may be
* empty). Return `true` if the parameter should be removed, and `false`
* otherwise.
*/
Url& deparam(const deparam_predicate& predicate);
/**
* Put queries and params in sorted order.
*
* To ensure consistent comparisons, escape should be called beforehand.
*/
Url& sort_query();
/**
* Remove the port if it's the default for the scheme.
*/
Url& remove_default_port();
/**
* Remove the userinfo portion.
*/
Url& deuserinfo();
/**
* Remove the fragment.
*/
Url& defrag();
/**
* Punycode the hostname.
*/
Url& punycode();
/**
* Unpunycode the hostname.
*/
Url& unpunycode();
/**
* Reverse the hostname (a.b.c.d => d.c.b.a)
*/
Url& host_reversed();
private:
// Private, unimplemented to prevent use.
Url();
/**
* Remove repeated, leading, and trailing instances of chr from the string.
*/
std::string& remove_repeats(std::string& str, const char chr);
/**
* Ensure all the provided characters are escaped if necessary
*/
std::string& escape(std::string& str, const CharacterClass& safe, bool strict);
/**
* Unescape entities in the provided string
*/
std::string& unescape(std::string& str);
/**
* Remove any params for which the predicate returns true.
*/
std::string& remove_params(
std::string& str, const deparam_predicate& pred, char sep);
/**
* Split the provided string by char, sort, join by char.
*/
std::string& split_sort_join(std::string& str, const char glue);
/**
* Check that the hostname is valid, removing an optional trailing '.'.
*/
void check_hostname(std::string& host);
std::string scheme_;
std::string host_;
// 0 when no port is set
int port_;
std::string path_;
std::string params_;
std::string query_;
std::string fragment_;
std::string userinfo_;
// Whether str()/fullpath() emit ";params" and "?query" respectively
bool has_params_;
bool has_query_;
};
}
#endif

150
src/utf8.cpp

@ -0,0 +1,150 @@
#include <algorithm>
#include <string>
#include <iostream>
#include "utf8.h"
namespace Url
{
/**
* Decode a single UTF-8 encoded code point starting at `it`.
*
* `it` is advanced past every byte that is consumed; on success it points just
* past the last byte of the decoded sequence.
*
* Throws std::invalid_argument when:
*   - there is nothing to read (`it == end`),
*   - the lead byte is not a valid start byte (0x80-0xBF or 0xF8-0xFF),
*   - the input ends before the sequence is complete,
*   - a continuation byte does not have the form 10xxxxxx,
*   - the decoded value is greater than MAX_CODEPOINT.
*
* NOTE: overlong encodings and surrogate code points are not rejected here.
*/
Utf8::codepoint_t Utf8::readCodepoint(
    std::string::const_iterator& it, const std::string::const_iterator& end)
{
    // Guard against an exhausted range: dereferencing `end` is undefined.
    if (it == end)
    {
        throw std::invalid_argument("No UTF-8 input to read.");
    }

    Utf8::char_t current = static_cast<Utf8::char_t>(*it++);

    // High bit clear: a single-byte (ASCII) sequence.
    if (!(current & 0x80))
    {
        return current;
    }

    // Number of additional bytes needed
    unsigned int bytes = 0;
    // The accumulated value
    Utf8::codepoint_t result = 0;

    if (current < 0xC0)
    {
        // 10xxxxxx is a continuation byte, never a start byte
        throw std::invalid_argument("Low UTF-8 start byte");
    }
    else if (current < 0xE0)
    {
        // One additional byte, two bytes total, use 5 bits
        bytes = 1;
        result = current & 0x1F;
    }
    else if (current < 0xF0)
    {
        // Two additional bytes, three bytes total, use 4 bits
        bytes = 2;
        result = current & 0x0F;
    }
    else if (current < 0xF8)
    {
        // Three additional bytes, four bytes total, use 3 bits
        bytes = 3;
        result = current & 0x07;
    }
    else
    {
        throw std::invalid_argument("High UTF-8 start byte");
    }

    for (; bytes > 0; --bytes)
    {
        if (it == end)
        {
            throw std::invalid_argument("UTF-8 sequence terminated early.");
        }

        current = static_cast<Utf8::char_t>(*it++);

        // Ensure the first two bits are 10
        if ((current & 0xC0) != 0x80)
        {
            throw std::invalid_argument("Invalid continuation byte");
        }

        // Each continuation byte contributes its low six bits
        result = (result << 6) | (current & 0x3F);
    }

    // A four-byte sequence can encode up to 0x1FFFFF; anything above
    // MAX_CODEPOINT is not a code point and writeCodepoint refuses it too.
    if (result > MAX_CODEPOINT)
    {
        throw std::invalid_argument("Code point too high.");
    }

    return result;
}
/**
* Append the UTF-8 encoding of `value` to `str` and return `str`.
*
* Code points up to 0x7F are written as a single byte. Larger values are
* written as a lead byte followed by one, two or three continuation bytes.
*
* Throws std::invalid_argument if `value` is greater than MAX_CODEPOINT.
*/
std::string& Utf8::writeCodepoint(std::string& str, Utf8::codepoint_t value)
{
    if (value > MAX_CODEPOINT)
    {
        throw std::invalid_argument("Code point too high.");
    }

    if (value <= 0x007F)
    {
        // ASCII range: the byte is the code point itself
        str.push_back(static_cast<char>(value));
        return str;
    }

    // Pick the lead-byte marker and the number of continuation bytes:
    //   110xxxxx + 1 byte  for up to 11 bits
    //   1110xxxx + 2 bytes for up to 16 bits
    //   11110xxx + 3 bytes for up to 21 bits
    unsigned int marker = 0;
    unsigned int trailing = 0;
    if (value <= 0x07FF)
    {
        marker = 0xC0;
        trailing = 1;
    }
    else if (value <= 0xFFFF)
    {
        marker = 0xE0;
        trailing = 2;
    }
    else
    {
        marker = 0xF0;
        trailing = 3;
    }

    // The lead byte carries whatever bits sit above the continuation payload
    // (6 bits per continuation byte). The range checks above guarantee those
    // bits fit in the space the marker leaves free.
    str.push_back(static_cast<char>(marker | (value >> (6 * trailing))));

    // Emit the continuation bytes, most-significant six-bit group first, each
    // tagged with the 10xxxxxx prefix.
    while (trailing > 0)
    {
        --trailing;
        str.push_back(
            static_cast<char>(0x80 | ((value >> (6 * trailing)) & 0x3F)));
    }

    return str;
}
};

91
src/utf8.h

@ -0,0 +1,91 @@
#ifndef UTF8_CPP_H
#define UTF8_CPP_H
#include <cstdint>
#include <stdexcept>
#include <string>
#include <vector>
namespace Url
{
/**
* Work between unicode code points and their UTF-8-encoded representation.
*
* All members are static; the struct only groups the related types, constants
* and helpers.
*/
struct Utf8
{
    /**
    * The type we use to represent Unicode codepoints (fixed-width, 32 bits).
    */
    typedef std::uint32_t codepoint_t;

    /**
    * The type we use when talking about the integral value of bytes.
    */
    typedef unsigned char char_t;

    /**
    * The highest allowed codepoint.
    */
    static const codepoint_t MAX_CODEPOINT = 0x10FFFF;

    /**
    * Consume up to the last byte of the sequence, returning the codepoint.
    *
    * `it` is advanced as bytes are read; `end` marks one past the last
    * readable byte.
    */
    static codepoint_t readCodepoint(
        std::string::const_iterator& it, const std::string::const_iterator& end);

    /**
    * Write a codepoint to the provided string, returning that string.
    */
    static std::string& writeCodepoint(std::string& str, codepoint_t value);

    /**
    * Return the first codepoint stored in the provided string.
    *
    * Throws std::invalid_argument if the string is empty, since there is no
    * codepoint to return.
    */
    static codepoint_t toCodepoint(const std::string& str)
    {
        // Never hand the decoder an empty range (begin() == end()).
        if (str.empty())
        {
            throw std::invalid_argument("No UTF-8 input to read.");
        }
        auto it = str.begin();
        return readCodepoint(it, str.end());
    }

    /**
    * Get a string with the provided codepoint.
    */
    static std::string fromCodepoint(codepoint_t value)
    {
        std::string str;
        writeCodepoint(str, value);
        return str;
    }

    /**
    * Return all the codepoints in the string.
    *
    * An empty string yields an empty vector.
    */
    static std::vector<codepoint_t> toCodepoints(const std::string& str)
    {
        std::vector<codepoint_t> result;
        const auto last = str.end();
        // readCodepoint advances `it`, so the loop has no increment step.
        for (auto it = str.begin(); it != last; )
        {
            result.push_back(readCodepoint(it, last));
        }
        return result;
    }

    /**
    * Create a string from a vector of codepoints.
    */
    static std::string fromCodepoints(const std::vector<codepoint_t>& points)
    {
        std::string result;
        for (codepoint_t point : points)
        {
            writeCodepoint(result, point);
        }
        return result;
    }
};
}
#endif

2
tests/test-all.R

@ -0,0 +1,2 @@
# Test runner script: attach testthat, then run the test suite of the
# "securitytxt" package via test_check().
library(testthat)
test_check("securitytxt")

6
tests/testthat/test-securitytxt.R

@ -0,0 +1,6 @@
# Placeholder test file: the only expectation below is commented out, so this
# test currently asserts nothing.
context("basic functionality")
test_that("we can do something", {
# TODO: replace this template with real expectations for the package.
#expect_that(some_function(), is_a("data.frame"))
})
Loading…
Cancel
Save