commit 7404834ac25d48007de8c50bad89b77852050297 Author: boB Rudis Date: Mon Oct 9 13:45:44 2017 -0400 initial commit diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..1c60b19 --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,10 @@ +^.*\.Rproj$ +^\.Rproj\.user$ +^\.travis\.yml$ +^README\.*Rmd$ +^README\.*html$ +^NOTES\.*Rmd$ +^NOTES\.*html$ +^\.codecov\.yml$ +^README_files$ +^doc$ diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 0000000..69cb760 --- /dev/null +++ b/.codecov.yml @@ -0,0 +1 @@ +comment: false diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cce1f17 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +.DS_Store +.Rproj.user +.Rhistory +.RData +.Rproj +src/*.o +src/*.so +src/*.dll diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..76d9586 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,31 @@ +language: r + +warnings_are_errors: true + +sudo: required + +cache: packages + +r: + - oldrel + - release + - devel + +apt_packages: + - libv8-dev + - xclip + +env: + global: + - CRAN: http://cran.rstudio.com + +after_success: + - Rscript -e 'covr::codecov()' + +notifications: + email: + - bob@rud.is + irc: + channels: + - "104.236.112.222#builds" + nick: travisci diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..98f6c9f --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,32 @@ +Package: securitytxt +Type: Package +Title: Identify and Parse Web Security Policies Files +Version: 0.1.0 +Date: 2017-10-09 +Authors@R: c( + person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"), + comment = c(ORCID = "0000-0001-5670-2640")) + ) +Author: Bob Rudis (bob@rud.is) +Maintainer: Bob Rudis +Description: When security risks in web services are discovered by independent + security researchers who understand the severity of the risk, they + often lack the channels to properly disclose them. As a result, + security issues may be left unreported. 
The 'security.txt' 'Web Security Policies' + specification defines a 'IETF' standard to help organizations define the process + for security researchers to securely disclose security vulnerabilities. Tools are + provided to help identify and parse 'security.txt' files to enable analysis of + the usage of these policies. +URL: https://github.com/hrbrmstr/securitytxt +BugReports: https://github.com/hrbrmstr/securitytxt/issues +License: AGPL +Suggests: + testthat, + covr +Depends: + R (>= 3.2.0) +Imports: + purrr, + Rcpp +RoxygenNote: 6.0.1 +LinkingTo: Rcpp diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..e977e71 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,9 @@ +# Generated by roxygen2: do not edit by hand + +S3method(print,sectxt) +export(sectxt) +export(sectxt_info) +export(sectxt_url) +export(sectxt_validate) +importFrom(Rcpp,sourceCpp) +useDynLib(securitytxt) diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..9b4679b --- /dev/null +++ b/NEWS.md @@ -0,0 +1,2 @@ +0.1.0 +* Initial release diff --git a/R/RcppExports.R b/R/RcppExports.R new file mode 100644 index 0000000..9c416c3 --- /dev/null +++ b/R/RcppExports.R @@ -0,0 +1,45 @@ +# Generated by using Rcpp::compileAttributes() -> do not edit by hand +# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#' Parse security.txt +#' +#' @noRd +#' +sectxt_parse <- function(content) { + .Call('_securitytxt_sectxt_parse', PACKAGE = 'securitytxt', content) +} + +sectxt_raw <- function(x) { + .Call('_securitytxt_sectxt_raw', PACKAGE = 'securitytxt', x) +} + +sectxt_keys <- function(x) { + .Call('_securitytxt_sectxt_keys', PACKAGE = 'securitytxt', x) +} + +#' Retrieve a data frame of security.txt keys/values +#' +#' @md +#' @param x a parsed `security.txt` created with [sec_parse()] +#' @return data frame +#' @export +sectxt_info <- function(x) { + .Call('_securitytxt_sectxt_info', PACKAGE = 'securitytxt', x) +} + +#' Determine security.txt URL for a given site/URL +#' +#' Provide any 
URL for a resource and retrieve the URL for +#' the `security.txt` file. Strips off extraneous URL +#' components and appends `.well-known/security.txt`. +#' +#' @md +#' @param url URL +#' @return character vector +#' @export +#' @examples +#' sectxt_url("https://securitytxt.org/this/that/the/other.html") +sectxt_url <- function(url) { + .Call('_securitytxt_sectxt_url', PACKAGE = 'securitytxt', url) +} + diff --git a/R/sectxt.r b/R/sectxt.r new file mode 100644 index 0000000..8b085a1 --- /dev/null +++ b/R/sectxt.r @@ -0,0 +1,51 @@ +#' Parse a `security.txt` Web Security Policies file & create a `sectxt` object +#' +#' This function takes in a single element character vector and parses it into +#' a `sectxt` object. +#' +#' @md +#' @param x either an atomic character vector containing a complete `security.txt` file +#' _or_ a length >1 character vector that will be concatenated into a single string _or_ +#' a `connection` object that will be passed to [readLines()], the result of which +#' will be concatenated into a single string and parsed and the connection will be closed. +#' @references [IETF Draft](https://tools.ietf.org/html/draft-foudil-securitytxt-00); [Information hub](https://securitytxt.org/) +#' [GitHub Organization](https://github.com/securitytxt) +#' @export +#' @examples +#' sectxt(readLines(system.file("extdata", "security.txt", package="securitytxt"))) +#' \dontrun{ +#' sectxt(url(sectxt_url("https://securitytxt.org"))) +#' } +sectxt <- function(x) { + + if (inherits(x, "connection")) { + y <- suppressWarnings(try(readLines(x, warn = FALSE), silent=TRUE)) + if (inherits(y, "try-error")) { + warning("security.txt not found") + return(NULL) + } + close(x) + x <- y + } + if (is.character(x)) if (length(x) > 1) x <- paste0(x, collapse="\n") + + sec_txt <- sectxt_parse(x) + class(sec_txt) <- c("sectxt") + + sec_txt + +} + +#' Custom printer for `sectxt`` objects +#' +#' @md +#' @noRd +#' @keywords internal +#' @param x object to print +#' @param ... 
unused +#' @export +print.sectxt <- function(x, ...) { + cat("\n", sep="") + cat(sectxt_raw(x)) + invisible(x) +} \ No newline at end of file diff --git a/R/securitytxt-package.R b/R/securitytxt-package.R new file mode 100644 index 0000000..84a35c1 --- /dev/null +++ b/R/securitytxt-package.R @@ -0,0 +1,20 @@ +#' Identify and Parse Web Security Policies Files +#' +#' When security risks in web services are discovered by independent +#' security researchers who understand the severity of the risk, they +#' often lack the channels to properly disclose them. As a result, +#' security issues may be left unreported. The 'security.txt' 'Web Security Policies' +#' specification defines a 'IETF' standard to help organizations define the process +#' for security researchers to securely disclose security vulnerabilities. Tools are +#' provided to help identify and parse 'security.txt' files to enable analysis of +#' the usage of these policies. +#' +#' @md +#' @name securitytxt +#' @references [IETF Draft](https://tools.ietf.org/html/draft-foudil-securitytxt-00); [Information hub](https://securitytxt.org/) +#' [GitHub Organization](https://github.com/securitytxt) +#' @docType package +#' @author Bob Rudis (bob@@rud.is) +#' @useDynLib securitytxt +#' @importFrom Rcpp sourceCpp +NULL \ No newline at end of file diff --git a/R/validate.r b/R/validate.r new file mode 100644 index 0000000..39ca37f --- /dev/null +++ b/R/validate.r @@ -0,0 +1,19 @@ +securitytxt_ietf_fields <- c("contact", "encryption", "acknowledgement", "disclosure") + +#' Validate a `security.txt` Web Security Policies file +#' +#' @md +#' @param x an object created with [sectxt()] +#' @return logical; `TRUE` if all fields match current IETF standard, `FALSE` if not +#' @export +sectxt_validate <- function(x) { + tmp_keys <- sectxt_keys(x) + key_test <- tmp_keys %in% securitytxt_ietf_fields + if (any(key_test == FALSE)) { + message(sprintf("The following keys were found that are not in the current IETF standard: 
%s", + paste0(tmp_keys[which(key_test == FALSE)], sep=", "))) + FALSE + } else { + TRUE + } +} \ No newline at end of file diff --git a/README.Rmd b/README.Rmd new file mode 100644 index 0000000..1c87a0c --- /dev/null +++ b/README.Rmd @@ -0,0 +1,70 @@ +--- +output: rmarkdown::github_document +--- + +# securitytxt + +Identify and Parse Web Security Policies Files + +## Description + +When security risks in web services are discovered by independent +security researchers who understand the severity of the risk, they +often lack the channels to properly disclose them. As a result, +security issues may be left unreported. The 'security.txt' 'Web Security Policies' +specification defines a 'IETF' standard to help organizations define the process +for security researchers to securely disclose security vulnerabilities. Tools are +provided to help identify and parse 'security.txt' files to enable analysis of +the usage of these policies. + +- [IETF Draft](https://tools.ietf.org/html/draft-foudil-securitytxt-00) +- [Information hub](https://securitytxt.org/) +- [GitHub Organization](https://github.com/securitytxt) + +## What's Inside The Tin + +The following functions are implemented: + +- `sectxt`: Parse a 'security.txt' Web Security Policies file & create a 'sectxt' object +- `sectxt_info`: Retrieve a data frame of `security.txt` keys/values +- `sectxt_validate`: Validate a 'security.txt' Web Security Policies file +- `sectxt_url`: Determine `security.txt` URL for a given site/URL + +## Installation + +```{r eval=FALSE} +devtools::install_github("hrbrmstr/securitytxt") +``` + +```{r message=FALSE, warning=FALSE, error=FALSE, include=FALSE} +options(width=120) +``` + +## Usage + +```{r message=FALSE, warning=FALSE, error=FALSE} +library(securitytxt) + +# current verison +packageVersion("securitytxt") + +# built-in example +x <- sectxt(readLines(system.file("extdata", "security.txt", package="securitytxt"))) +sectxt_info(x) + +# "live" example +(xurl <- 
sectxt_url("https://securitytxt.org")) +x <- sectxt(url(xurl)) +sectxt_info(x) +sectxt_validate(x) +x + +# another "live" example +(xurl <- sectxt_url("https://rud.is/b")) +x <- sectxt(url(xurl)) +sectxt_info(x) +sectxt_validate(x) +x + +``` + diff --git a/README.md b/README.md new file mode 100644 index 0000000..4ef1a26 --- /dev/null +++ b/README.md @@ -0,0 +1,114 @@ + +securitytxt +=========== + +Identify and Parse Web Security Policies Files + +Description +----------- + +When security risks in web services are discovered by independent security researchers who understand the severity of the risk, they often lack the channels to properly disclose them. As a result, security issues may be left unreported. The 'security.txt' 'Web Security Policies' specification defines a 'IETF' standard to help organizations define the process for security researchers to securely disclose security vulnerabilities. Tools are provided to help identify and parse 'security.txt' files to enable analysis of the usage of these policies. 
+ +- [IETF Draft](https://tools.ietf.org/html/draft-foudil-securitytxt-00) +- [Information hub](https://securitytxt.org/) +- [GitHub Organization](https://github.com/securitytxt) + +What's Inside The Tin +--------------------- + +The following functions are implemented: + +- `sectxt`: Parse a 'security.txt' Web Security Policies file & create a 'sectxt' object +- `sectxt_info`: Retrieve a data frame of `security.txt` keys/values +- `sectxt_validate`: Validate a 'security.txt' Web Security Policies file +- `sectxt_url`: Determine `security.txt` URL for a given site/URL + +Installation +------------ + +``` r +devtools::install_github("hrbrmstr/securitytxt") +``` + +Usage +----- + +``` r +library(securitytxt) + +# current verison +packageVersion("securitytxt") +``` + + ## [1] '0.1.0' + +``` r +# built-in example +x <- sectxt(readLines(system.file("extdata", "security.txt", package="securitytxt"))) +sectxt_info(x) +``` + + ## key value + ## 1 contact security@example.com + ## 2 encryption https://example.com/pgp-key.txt + +``` r +# "live" example +(xurl <- sectxt_url("https://securitytxt.org")) +``` + + ## [1] "https://securitytxt.org/.well-known/security.txt" + +``` r +x <- sectxt(url(xurl)) +sectxt_info(x) +``` + + ## key value + ## 1 contact https://twitter.com/EdOverflow + +``` r +sectxt_validate(x) +``` + + ## [1] TRUE + +``` r +x +``` + + ## + ## # Our security address + ## Contact: https://twitter.com/EdOverflow + +``` r +# another "live" example +(xurl <- sectxt_url("https://rud.is/b")) +``` + + ## [1] "https://rud.is/.well-known/security.txt" + +``` r +x <- sectxt(url(xurl)) +sectxt_info(x) +``` + + ## key value + ## 1 contact bob@rud.is + ## 2 encryption https://keybase.io/hrbrmstr/pgp_keys.asc?fingerprint=e5388172b81c210906f5e5605879179645de9399 + ## 3 disclosure Full + +``` r +sectxt_validate(x) +``` + + ## [1] TRUE + +``` r +x +``` + + ## + ## Contact: bob@rud.is + ## Encryption: 
https://keybase.io/hrbrmstr/pgp_keys.asc?fingerprint=e5388172b81c210906f5e5605879179645de9399 + ## Disclosure: Full diff --git a/inst/extdata/security.txt b/inst/extdata/security.txt new file mode 100644 index 0000000..2a2366a --- /dev/null +++ b/inst/extdata/security.txt @@ -0,0 +1,4 @@ +# Our security address + +Contact: security@example.com +Encryption: https://example.com/pgp-key.txt diff --git a/man/sectxt.Rd b/man/sectxt.Rd new file mode 100644 index 0000000..d760955 --- /dev/null +++ b/man/sectxt.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sectxt.r +\name{sectxt} +\alias{sectxt} +\title{Parse a \code{security.txt} Web Security Policies file & create a \code{sectxt} object} +\usage{ +sectxt(x) +} +\arguments{ +\item{x}{either an atomic character vector containing a complete \code{security.txt} file +\emph{or} a length >1 character vector that will be concatenated into a single string \emph{or} +a \code{connection} object that will be passed to \code{\link[=readLines]{readLines()}}, the result of which +will be concatenated into a single string and parsed and the connection will be closed.} +} +\description{ +This function takes in a single element character vector and parses it into +a \code{sectxt} object. 
+} +\examples{ +sectxt(readLines(system.file("extdata", "security.txt", package="securitytxt"))) +\dontrun{ +sectxt(url(sectxt_url("https://securitytxt.org"))) +} +} +\references{ +\href{https://tools.ietf.org/html/draft-foudil-securitytxt-00}{IETF Draft}; \href{https://securitytxt.org/}{Information hub} +\href{https://github.com/securitytxt}{GitHub Organization} +} diff --git a/man/sectxt_info.Rd b/man/sectxt_info.Rd new file mode 100644 index 0000000..b749e51 --- /dev/null +++ b/man/sectxt_info.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{sectxt_info} +\alias{sectxt_info} +\title{Retrieve a data frame of security.txt keys/values} +\usage{ +sectxt_info(x) +} +\arguments{ +\item{x}{a parsed \code{security.txt} created with \code{\link[=sec_parse]{sec_parse()}}} +} +\value{ +data frame +} +\description{ +Retrieve a data frame of security.txt keys/values +} diff --git a/man/sectxt_url.Rd b/man/sectxt_url.Rd new file mode 100644 index 0000000..cf4dfc5 --- /dev/null +++ b/man/sectxt_url.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{sectxt_url} +\alias{sectxt_url} +\title{Determine security.txt URL for a given site/URL} +\usage{ +sectxt_url(url) +} +\arguments{ +\item{url}{URL} +} +\value{ +character vector +} +\description{ +Provide any URL for a resource and retrieve the URL for +the \code{security.txt} file. Strips off extraneous URL +components and appends \code{.well-known/security.txt}. 
+} +\examples{ +sectxt_url("https://securitytxt.org/this/that/the/other.html") +} diff --git a/man/sectxt_validate.Rd b/man/sectxt_validate.Rd new file mode 100644 index 0000000..463b1e6 --- /dev/null +++ b/man/sectxt_validate.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/validate.r +\name{sectxt_validate} +\alias{sectxt_validate} +\title{Validate a \code{security.txt} Web Security Policies file} +\usage{ +sectxt_validate(x) +} +\arguments{ +\item{x}{an object created with \code{\link[=sectxt]{sectxt()}}} +} +\value{ +logical; \code{TRUE} if all fields match current IETF standard, \code{FALSE} if not +} +\description{ +Validate a \code{security.txt} Web Security Policies file +} diff --git a/man/securitytxt.Rd b/man/securitytxt.Rd new file mode 100644 index 0000000..95f5ece --- /dev/null +++ b/man/securitytxt.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/securitytxt-package.R +\docType{package} +\name{securitytxt} +\alias{securitytxt} +\alias{securitytxt-package} +\title{Identify and Parse Web Security Policies Files} +\description{ +When security risks in web services are discovered by independent +security researchers who understand the severity of the risk, they +often lack the channels to properly disclose them. As a result, +security issues may be left unreported. The 'security.txt' 'Web Security Policies' +specification defines a 'IETF' standard to help organizations define the process +for security researchers to securely disclose security vulnerabilities. Tools are +provided to help identify and parse 'security.txt' files to enable analysis of +the usage of these policies. 
+} +\references{ +\href{https://tools.ietf.org/html/draft-foudil-securitytxt-00}{IETF Draft}; \href{https://securitytxt.org/}{Information hub} +\href{https://github.com/securitytxt}{GitHub Organization} +} +\author{ +Bob Rudis (bob@rud.is) +} diff --git a/securitytxt.Rproj b/securitytxt.Rproj new file mode 100644 index 0000000..446d9e1 --- /dev/null +++ b/securitytxt.Rproj @@ -0,0 +1,21 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +StripTrailingWhitespace: Yes + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source +PackageBuildArgs: --resave-data +PackageRoxygenize: rd,collate,namespace diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000..22034c4 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,3 @@ +*.o +*.so +*.dll diff --git a/src/Makevars b/src/Makevars new file mode 100644 index 0000000..a231a44 --- /dev/null +++ b/src/Makevars @@ -0,0 +1,3 @@ +CXX_STD = CXX11 +PKG_CXXFLAGS = +PKG_LIBS = -L. 
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp new file mode 100644 index 0000000..84f7dd3 --- /dev/null +++ b/src/RcppExports.cpp @@ -0,0 +1,76 @@ +// Generated by using Rcpp::compileAttributes() -> do not edit by hand +// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#include + +using namespace Rcpp; + +// sectxt_parse +SEXP sectxt_parse(std::string content); +RcppExport SEXP _securitytxt_sectxt_parse(SEXP contentSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< std::string >::type content(contentSEXP); + rcpp_result_gen = Rcpp::wrap(sectxt_parse(content)); + return rcpp_result_gen; +END_RCPP +} +// sectxt_raw +std::string sectxt_raw(SEXP x); +RcppExport SEXP _securitytxt_sectxt_raw(SEXP xSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type x(xSEXP); + rcpp_result_gen = Rcpp::wrap(sectxt_raw(x)); + return rcpp_result_gen; +END_RCPP +} +// sectxt_keys +std::vector< std::string > sectxt_keys(SEXP x); +RcppExport SEXP _securitytxt_sectxt_keys(SEXP xSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type x(xSEXP); + rcpp_result_gen = Rcpp::wrap(sectxt_keys(x)); + return rcpp_result_gen; +END_RCPP +} +// sectxt_info +DataFrame sectxt_info(SEXP x); +RcppExport SEXP _securitytxt_sectxt_info(SEXP xSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< SEXP >::type x(xSEXP); + rcpp_result_gen = Rcpp::wrap(sectxt_info(x)); + return rcpp_result_gen; +END_RCPP +} +// sectxt_url +std::string sectxt_url(std::string url); +RcppExport SEXP _securitytxt_sectxt_url(SEXP urlSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< std::string >::type url(urlSEXP); + rcpp_result_gen = 
Rcpp::wrap(sectxt_url(url)); + return rcpp_result_gen; +END_RCPP +} + +static const R_CallMethodDef CallEntries[] = { + {"_securitytxt_sectxt_parse", (DL_FUNC) &_securitytxt_sectxt_parse, 1}, + {"_securitytxt_sectxt_raw", (DL_FUNC) &_securitytxt_sectxt_raw, 1}, + {"_securitytxt_sectxt_keys", (DL_FUNC) &_securitytxt_sectxt_keys, 1}, + {"_securitytxt_sectxt_info", (DL_FUNC) &_securitytxt_sectxt_info, 1}, + {"_securitytxt_sectxt_url", (DL_FUNC) &_securitytxt_sectxt_url, 1}, + {NULL, NULL, 0} +}; + +RcppExport void R_init_securitytxt(DllInfo *dll) { + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +} diff --git a/src/psl.cpp b/src/psl.cpp new file mode 100644 index 0000000..c078d21 --- /dev/null +++ b/src/psl.cpp @@ -0,0 +1,183 @@ +#include +#include +#include +#include + +#include "psl.h" +#include "punycode.h" + +namespace Url +{ + const std::string PSL::not_found = ""; + + PSL::PSL(std::istream& stream) + { + std::string line; + while (std::getline(stream, line)) + { + // Only take up to the first whitespace. + auto it = std::find_if(line.begin(), line.end(), ::isspace); + line.resize(it - line.begin()); + + // Skip blank lines + if (line.empty()) + { + continue; + } + + // Skip comments + if (line.compare(0, 2, "//") == 0) + { + continue; + } + + // We know the line has at least a single character at this point + if (line[0] == '*') + { + // Line is a wildcard rule + if (line.size() <= 2 || line[1] != '.') + { + throw std::invalid_argument("Wildcard rule must be of form *."); + } + + add(line, 1, 2); + } + else if (line[0] == '!') + { + // Line is an exception, take all but the ! 
+ if (line.size() <= 1) + { + throw std::invalid_argument("Exception rule has no hostname."); + } + + add(line, -1, 1); + } + else + { + add(line, 0, 0); + } + } + } + + PSL PSL::fromPath(const std::string& path) + { + std::ifstream stream(path); + if (!stream.good()) + { + std::stringstream message; + message << "Path '" << path << "' inaccessible."; + throw std::invalid_argument(message.str()); + } + return PSL(stream); + } + + PSL PSL::fromString(const std::string& str) + { + std::stringstream stream(str); + return PSL(stream); + } + + std::string PSL::getTLD(const std::string& hostname) const + { + return getLastSegments(hostname, getTLDLength(hostname)); + } + + std::string PSL::getPLD(const std::string& hostname) const + { + return getLastSegments(hostname, getTLDLength(hostname) + 1); + } + + std::pair PSL::getBoth(const std::string& hostname) const + { + size_t length = getTLDLength(hostname); + return std::make_pair( + getLastSegments(hostname, length), + getLastSegments(hostname, length + 1)); + } + + size_t PSL::getTLDLength(const std::string& hostname) const + { + // Reversed copy of hostname + std::string tld(hostname.rbegin(), hostname.rend()); + std::transform(tld.begin(), tld.end(), tld.begin(), ::tolower); + + while (tld.size()) + { + auto it = levels.find(tld); + if (it != levels.end()) + { + return it->second; + } + + size_t position = tld.rfind('.'); + if (position == std::string::npos || position == 0) + { + tld.resize(0); + } + else + { + tld.resize(position); + } + } + + return 1; + } + + std::string PSL::getLastSegments(const std::string& hostname, size_t segments) const + { + size_t position = hostname.size(); + size_t remaining = segments; + while (remaining != 0 && position && position != std::string::npos) + { + position = hostname.rfind('.', position - 1); + remaining -= 1; + } + + if (remaining >= 1) + { + return not_found; + } + + // Return the whole string if position == std:string::npos + size_t start = (position == 
std::string::npos) ? 0 : position + 1; + + std::string result(hostname, start); + std::transform(result.begin(), result.end(), result.begin(), ::tolower); + + // Leading .'s indicate that the query had an empty segment + if (result.size() && result[0] == '.') + { + std::stringstream message; + message << "Empty segment in " << result; + throw std::invalid_argument(message.str()); + } + + return result; + } + + size_t PSL::countSegments(const std::string& hostname) const + { + size_t count = 1; + size_t position = hostname.find('.'); + while (position != std::string::npos) + { + count += 1; + position = hostname.find('.', position + 1); + } + return count; + } + + void PSL::add(std::string& rule, int level_adjust, size_t trim) + { + // First unpunycoded + std::string copy(rule.rbegin(), rule.rend() - trim); + size_t length = countSegments(copy) + level_adjust; + levels[copy] = length; + + // And now punycoded + rule = Punycode::encodeHostname(rule); + copy.assign(rule.rbegin(), rule.rend() - trim); + levels[copy] = length; + } + +}; diff --git a/src/psl.h b/src/psl.h new file mode 100644 index 0000000..e1714f0 --- /dev/null +++ b/src/psl.h @@ -0,0 +1,102 @@ +#ifndef PSL_CPP_H +#define PSL_CPP_H + +#include +#include +#include +#include +#include + +namespace Url +{ + + /** + * Find TLDs and PLDs of a hostname according to a PSL. + */ + struct PSL + { + /** + * Indicates the there is no TLD / PLD + */ + static const std::string not_found; + + /** + * Read a PSL from an istream. + */ + PSL(std::istream& stream); + + PSL(): levels() { }; + + PSL(const PSL& other): levels(other.levels) { } + + PSL& operator=(const PSL& other) + { + levels = other.levels; + return *this; + } + + /** + * Read the provided path holding a set of PSL rules. + */ + static PSL fromPath(const std::string& path); + + /** + * Create a PSL object from a string. + */ + static PSL fromString(const std::string& str); + + /** + * Get just the TLD of the hostname. 
+ * + * Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If + * some segments have been appropriately punycoded and others not, it may return + * a wrong answer. If a punycoded host is provided, a punycoded response is + * returned. If an unpunycoded host is provided, an unpunycoded response is + * returned. + */ + std::string getTLD(const std::string& hostname) const; + + /** + * Get just the PLD of the hostname. + * + * Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If + * some segments have been appropriately punycoded and others not, it may return + * a wrong answer. If a punycoded host is provided, a punycoded response is + * returned. If an unpunycoded host is provided, an unpunycoded response is + * returned. + */ + std::string getPLD(const std::string& hostname) const; + + /** + * Get the (TLD, PLD) of the hostname. + * + * Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If + * some segments have been appropriately punycoded and others not, it may return + * a wrong answer. If a punycoded host is provided, a punycoded response is + * returned. If an unpunycoded host is provided, an unpunycoded response is + * returned. + */ + std::pair getBoth(const std::string& hostname) const; + private: + // Mapping of a string rule to its level + std::unordered_map levels; + + // Return the number of segments in a hostname + size_t countSegments(const std::string& hostname) const; + + // Return the number of segments in the TLD of the provided hostname + size_t getTLDLength(const std::string& hostname) const; + + // Return the last `segments` segments of a hostname + std::string getLastSegments(const std::string& hostname, size_t segments) const; + + /** + * Add the provided host with the provided priority, trimming characters off + * the front, and adjusting the level by the provided number. 
+ */ + void add(std::string& host, int level_adjust, size_t trim); + }; + +} + +#endif diff --git a/src/punycode.cpp b/src/punycode.cpp new file mode 100644 index 0000000..eb85d92 --- /dev/null +++ b/src/punycode.cpp @@ -0,0 +1,409 @@ +#include +#include +#include + +#include "punycode.h" +#include "utf8.h" + +namespace Url +{ + + std::string& Punycode::encode(std::string& str) + { + // Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.3 + // + // let n = initial_n + // let delta = 0 + // let bias = initial_bias + punycode_uint n = INITIAL_N; + punycode_uint delta = 0; + punycode_uint bias = INITIAL_BIAS; + std::string output; + + // Accumulate the non-basic codepoints + std::vector codepoints; + for (auto it = str.cbegin(); it != str.cend(); ) + { + Utf8::codepoint_t value = Utf8::readCodepoint(it, str.cend()); + if (value < 0x80) + { + // copy them to the output in order + output.append(1, static_cast(value)); + } + codepoints.push_back(value); + } + + // let h = b = the number of basic code points in the input + size_t h = output.size(); + size_t b = h; + + // copy a delimiter if b > 0 + if (b > 0) + { + output.append(1, '-'); + } + + // while h < length(input) do begin + while (h < codepoints.size()) + { + // let m = the minimum {non-basic} code point >= n in the input + punycode_uint m = MAX_PUNYCODE_UINT; + for (auto it = codepoints.begin(); it != codepoints.end(); ++it) + { + if ((*it >= n) && (*it < m)) + { + m = *it; + } + } + + // let delta = delta + (m - n) * (h + 1), fail on overflow + if ((m - n) > ((MAX_PUNYCODE_UINT - delta) / (h + 1))) + { + throw std::invalid_argument("Overflow delta update."); + } + delta += (m - n) * (h + 1); + + // let n = m + n = m; + + // for each code point c in the input (in order) do begin + for (auto it = codepoints.begin(); it != codepoints.end(); ++it) + { + // if c < n {or c is basic} then increment delta, fail on overflow + if (*it < n) + { + if (delta == MAX_PUNYCODE_UINT) + { + throw 
std::invalid_argument("Overflow delta increment."); + } + ++delta; + } + + // if c == n then begin + if (*it == n) + { + // let q = delta + punycode_uint q = delta; + + // for k = base to infinity in steps of base do begin + for (punycode_uint k = BASE; ; k += BASE) + { + // let t = tmin if k <= bias {+ tmin}, or + // tmax if k >= bias + tmax, or k - bias otherwise + punycode_uint t = k <= bias ? TMIN : + k >= bias + TMAX ? TMAX : k - bias; + + // if q < t then break + if (q < t) + { + break; + } + + // output the code point for digit t + ((q - t) mod (base - t)) + output.append(1, DIGIT_TO_BASIC[t + ((q - t) % (BASE - t))]); + + // let q = (q - t) div (base - t) + q = (q - t) / (BASE - t); + } + + // output the code point for digit q + output.append(1, DIGIT_TO_BASIC[q]); + + // let bias = adapt(delta, h + 1, test h equals b?) + bias = adapt(delta, h + 1, h == b); + + // let delta = 0 + delta = 0; + + // increment h + ++h; + + } + } + + // increment delta and n + ++delta; + ++n; + } + + str.assign(output); + return str; + } + + std::string Punycode::encode(const std::string& str) + { + std::string result(str); + encode(result); + return result; + } + + std::string Punycode::encodeHostname(const std::string& hostname) + { + // Avoid any punycoding at all if none is needed + if (!needsPunycoding(hostname)) + { + return hostname; + } + + std::string encoded; + + size_t start = 0; + size_t end = hostname.find('.'); + while(true) + { + std::string segment = hostname.substr(start, end - start); + if (needsPunycoding(segment)) + { + encoded.append("xn--"); + encoded.append(Punycode::encode(segment)); + } + else + { + encoded.append(segment); + } + + if (end == std::string::npos) + { + break; + } + else + { + encoded.append(1, '.'); + start = end + 1; + end = hostname.find('.', start); + } + } + + return encoded; + } + + std::string& Punycode::decode(std::string& str) + { + // Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.2 + // + // let n = 
initial_n + // let i = 0 + // let bias = initial_bias + // let output = an empty string indexed from 0 + punycode_uint n = INITIAL_N; + punycode_uint i = 0; + punycode_uint bias = INITIAL_BIAS; + std::vector codepoints; + + size_t index = str.rfind('-'); + if (index == std::string::npos) + { + index = 0; + } + + // consume all code points before the last delimiter (if there is one) + // and copy them to output, fail on any non-basic code point + for (auto it = str.begin(); it != (str.begin() + index); ++it) + { + if (static_cast(*it) > 127U) + { + throw std::invalid_argument("Argument has non-basic code points."); + } + codepoints.push_back(*it); + } + + // if more than zero code points were consumed then consume one more + // (which will be the last delimiter) + if (index > 0) + { + index += 1; + } + + // while the input is not exhausted do begin + for (auto it = (str.begin() + index); it != str.end(); ++it) + { + // let oldi = i + // let w = 1 + punycode_uint oldi = i; + punycode_uint w = 1; + + // for k = base to infinity in steps of base do begin + for (punycode_uint k = BASE; ; k += BASE, ++it) + { + // consume a code point, or fail if there was none to consume + if (it == str.end()) + { + throw std::invalid_argument("Premature termination"); + } + + // let digit = the code point's digit-value, fail if it has none + int lookup = BASIC_TO_DIGIT[static_cast(*it)]; + if (lookup == -1) + { + throw std::invalid_argument("Invalid base 36 character."); + } + unsigned char digit = static_cast(lookup); + + // let i = i + digit * w, fail on overflow + if (digit > ((MAX_PUNYCODE_UINT - i) / w)) + { + throw std::invalid_argument("Overflow on i."); + } + i += digit * w; + + // let t = tmin if k <= bias {+ tmin}, or + // tmax if k >= bias + tmax, or k - bias otherwise + punycode_uint t = k <= bias ? TMIN : + k >= bias + TMAX ? 
TMAX : k - bias; + + // if digit < t then break + if (digit < t) + { + break; + } + + // let w = w * (base - t), fail on overflow + if (w > (MAX_PUNYCODE_UINT / (BASE - t))) + { + // I believe this line is unreachable without first overflowing i. + // Since 'i' is updated above as i += digit * w, and w is updated as + // w = w * (BASE - t), we should like to keep (BASE - t) > digit to + // give 'w' a chance to overflow first. To keep t minimized, we must + // have 'bias' maximized. `bias` is driven by the 'adapt' function + // below. + // + // The value returned by 'adapt' increases with the input delta, and + // decreases with the input size. The delta is a function of the input + // size as well, on the order of (delta_n * input size), and + // legitimate delta_n values are limited to 0x10FFFF (the maximum + // unicode codepoint). Even setting that aside, the maximum value that + // adapt() can return is adapt(2 ** 32 - 1, 1, false) = 204. + // + // Using this bias, we could use the input (HERE) to get iterations: + // + // digit = b = 1, i = 2, k = 36, t = 1, w = 35 + // digit = b = 1, i = 37, k = 72, t = 1, w = 1225 + // digit = b = 1, i = 1262, k = 108, t = 1, w = 42875 + // digit = b = 1, i = 44137, k = 144, t = 1, w = 1500625 + // digit = b = 1, i = 1544762, k = 180, t = 1, w = 52521875 + // + // At this point, t now becomes TMAX (26) because k exceeds the bias + // (since the maximum bias is 204). As such, the minimum continuation + // value is 26: + // + // digit = 0 = 26, i = 1367113512, k = 216, t = 26, w = 525218750 + // + // However, the next iteration now overflows i before we can get to + // the w update. + throw std::invalid_argument("Overflow on w."); // LCOV_EXCL_LINE + } + w *= (BASE - t); + } + + // let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?) 
+ bias = adapt(i - oldi, codepoints.size() + 1, oldi == 0); + + // let n = n + i div (length(output) + 1), fail on overflow + if ((i / (codepoints.size() + 1)) > (MAX_PUNYCODE_UINT - n)) + { + throw std::invalid_argument("Overflow on n."); + } + n += i / (codepoints.size() + 1); + + // let i = i mod (length(output) + 1) + i %= (codepoints.size() + 1); + + // insert n into output at position i + codepoints.insert(codepoints.begin() + i, n); + + // increment i + ++i; + } + + std::string output; + for (auto it = codepoints.begin(); it != codepoints.end(); ++it) + { + Utf8::writeCodepoint(output, *it); + } + str.assign(output); + + return str; + } + + std::string Punycode::decode(const std::string& str) + { + std::string result(str); + decode(result); + return result; + } + + std::string Punycode::decodeHostname(const std::string& hostname) + { + std::string unencoded; + + size_t start = 0; + size_t end = hostname.find('.'); + while(true) + { + std::string segment = hostname.substr(start, end - start); + if (segment.substr(0, 4).compare("xn--") == 0) + { + segment = segment.substr(4); + unencoded.append(Punycode::decode(segment)); + } + else + { + unencoded.append(segment); + } + + if (end == std::string::npos) + { + break; + } + else + { + unencoded.append(1, '.'); + start = end + 1; + end = hostname.find('.', start); + } + } + + return unencoded; + } + + bool Punycode::needsPunycoding(const std::string& str) + { + return std::any_of( + str.begin(), + str.end(), + [](char i){ return static_cast(i) & 0x80; }); + } + + Punycode::punycode_uint Punycode::adapt( + punycode_uint delta, punycode_uint numpoints, bool firsttime) + { + // Psuedocode from https://tools.ietf.org/html/rfc3492#section-6.1 + // + // It does not matter whether the modifications to delta and k inside + // adapt() affect variables of the same name inside the + // encoding/decoding procedures, because after calling adapt() the + // caller does not read those variables before overwriting them. 
+ // + // if firsttime then let delta = delta div damp + // else let delta = delta div 2 + delta = firsttime ? delta / DAMP : delta >> 1; + + // let delta = delta + (delta div numpoints) + delta += (delta / numpoints); + + // let k = 0 + punycode_uint k = 0; + + // while delta > ((base - tmin) * tmax) div 2 do begin + for (; delta > ((BASE - TMIN) * TMAX) / 2; k += BASE) + { + // let delta = delta div (base - tmin) + // let k = k + base + delta /= (BASE - TMIN); + } + + // return k + (((base - tmin + 1) * delta) div (delta + skew)) + return k + (((BASE - TMIN + 1) * delta) / (delta + SKEW)); + } + +}; diff --git a/src/punycode.h b/src/punycode.h new file mode 100644 index 0000000..25fce96 --- /dev/null +++ b/src/punycode.h @@ -0,0 +1,105 @@ +#ifndef PUNYCODE_CPP_H +#define PUNYCODE_CPP_H + +#include +#include +#include +#include +#include + +#include "utf8.h" + +namespace Url +{ + + namespace Punycode + { + typedef Utf8::codepoint_t punycode_uint; + + const unsigned int BASE = 36; + const unsigned int TMIN = 1; + const unsigned int TMAX = 26; + const unsigned int SKEW = 38; + const unsigned int DAMP = 700; + const unsigned int INITIAL_BIAS = 72; + const unsigned int INITIAL_N = 128; + + // Codepoints to their base-36 value + const std::vector BASIC_TO_DIGIT = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1, + + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, + + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + const std::string DIGIT_TO_BASIC = "abcdefghijklmnopqrstuvwxyz0123456789"; + + // The highest codepoint in unicode + const punycode_uint MAX_PUNYCODE_UINT = std::numeric_limits::max(); + //Utf8::MAX_CODEPOINT; + //std::numeric_limits::max(); + + /** + * Replace utf-8-encoded str into punycode. + */ + std::string& encode(std::string& str); + + /** + * Create a new punycoded string from utf-8-encoded input. + */ + std::string encode(const std::string& str); + + /** + * Encode a hostname. + */ + std::string encodeHostname(const std::string& hostname); + + /** + * Replace punycoded str into utf-8-encoded. + */ + std::string& decode(std::string& str); + + /** + * Create a new utf-8-encoded string from punycoded input. + */ + std::string decode(const std::string& str); + + /** + * Decode a hostname. + */ + std::string decodeHostname(const std::string& hostname); + + /** + * Determine if a string needs punycoding. + */ + bool needsPunycoding(const std::string& str); + + /** + * Internal function for calculating bias. 
+ */ + punycode_uint adapt( + punycode_uint delta, punycode_uint numpoints, bool firsttime); + + }; + +} + +#endif diff --git a/src/security.cpp b/src/security.cpp new file mode 100644 index 0000000..f6fe5ad --- /dev/null +++ b/src/security.cpp @@ -0,0 +1,89 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "url.h" + +#include "security.h" +#include + +namespace SecTxt { + // Trim leading and trailing whitespace from `string` in place. + void SecurityText::strip(std::string& string) { + // std::isspace() is undefined for negative values, so take each byte as unsigned char + const auto not_space = [](unsigned char c) { return !std::isspace(c); }; + string.erase(string.begin(), std::find_if(string.begin(), string.end(), not_space)); + string.erase(std::find_if(string.rbegin(), string.rend(), not_space).base(), string.end()); + } + // Read lines until one holds a `key: value` pair; text after '#' is dropped, both sides are trimmed and the key is lowercased. Returns false once the stream is exhausted. + bool SecurityText::getpair(std::istringstream& stream, std::string& key, std::string& value) { + + while (getline(stream, key)) { + + size_t index = key.find('#'); + + if (index != std::string::npos) key.resize(index); + + // Find the colon and divide it into key and value, skipping malformed lines + index = key.find(':'); + if (index == std::string::npos) continue; + + value.assign(key.begin() + index + 1, key.end()); + key.resize(index); + + // Strip whitespace off of each + strip(key); + strip(value); + + // Lowercase the key (via unsigned char: ::tolower is undefined for negative values) + std::transform(key.begin(), key.end(), key.begin(), [](unsigned char c) { return static_cast<char>(::tolower(c)); }); + + return true; + } + return false; + } + // Keep the raw text and collect every key/value pair, skipping a leading UTF-8 byte-order mark. + SecurityText::SecurityText(const std::string& content) { + + orig_file = content; + + std::istringstream input(content); + + if (content.compare(0, 3, "\xEF\xBB\xBF") == 0) input.ignore(3); + + std::string key, value; + + while (SecurityText::getpair(input, key, value)) { + st_keys.push_back(key); + st_vals.push_back(value); + } + + } + + std::string SecurityText::rawFile() { + return(orig_file); + } + + std::vector< std::string > SecurityText::sectxtKeys() { + return(st_keys); + } + + std::vector< std::string > SecurityText::sectxtVals() { + return(st_vals); + } + + std::string SecurityText::securityUrl(const std::string& url) { +
return Url::Url(url) + .setUserinfo("") + .setPath(".well-known/security.txt") + .setParams("") + .setQuery("") + .setFragment("") + .remove_default_port() + .str(); + } +} diff --git a/src/security.h b/src/security.h new file mode 100644 index 0000000..e610731 --- /dev/null +++ b/src/security.h @@ -0,0 +1,37 @@ +#ifndef SECURITY_CPP_H +#define SECURITY_CPP_H + +#include +#include +#include + +namespace SecTxt { + + class SecurityText { + + public: + + // Create a security.txt data structure from a utf-8-encoded string. + SecurityText(const std::string& content); + + std::string rawFile(); + std::vector sectxtKeys(); + std::vector sectxtVals(); + + // Return the security.txt URL corresponding to the provided URL. + static std::string securityUrl(const std::string& url); + + private: + + static void strip(std::string& string); + + static bool getpair(std::istringstream& stream, std::string& key, std::string& value); + + std::vector st_keys; + std::vector st_vals; + std::string orig_file; + + }; +} + +#endif diff --git a/src/securitymain.cpp b/src/securitymain.cpp new file mode 100644 index 0000000..41b8ca1 --- /dev/null +++ b/src/securitymain.cpp @@ -0,0 +1,66 @@ +#include +using namespace Rcpp; + +#include "url.h" +#include "security.h" + +//' Parse security.txt +//' +//' @noRd +//' +// [[Rcpp::export]] +SEXP sectxt_parse(std::string content) { + Rcpp::XPtr ptr(new SecTxt::SecurityText(content)); + return(ptr); +} + +// [[Rcpp::export]] +std::string sectxt_raw(SEXP x) { + + Rcpp::XPtr ptr(x); + return(ptr->rawFile()); + +} + +// [[Rcpp::export]] +std::vector< std::string > sectxt_keys(SEXP x) { + + Rcpp::XPtr ptr(x); + return(ptr->sectxtKeys()); + +} + + +//' Retrieve a data frame of security.txt keys/values +//' +//' @md +//' @param x a parsed `security.txt` created with [sec_parse()] +//' @return data frame +//' @export +// [[Rcpp::export]] +DataFrame sectxt_info(SEXP x) { + + Rcpp::XPtr ptr(x); + + return(DataFrame::create( + _["key"] = ptr->sectxtKeys(), + 
_["value"] = ptr->sectxtVals())); + +} + +//' Determine security.txt URL for a given site/URL +//' +//' Provide any URL for a resource and retrieve the URL for +//' the `security.txt` file. Strips off extraneous URL +//' components and appends `.well-known/security.txt`. +//' +//' @md +//' @param url URL +//' @return character vector +//' @export +//' @examples +//' sectxt_url("https://securitytxt.org/this/that/the/other.html") +// [[Rcpp::export]] +std::string sectxt_url(std::string url) { + return(SecTxt::SecurityText::securityUrl(url)); +} diff --git a/src/url.cpp b/src/url.cpp new file mode 100644 index 0000000..900a65e --- /dev/null +++ b/src/url.cpp @@ -0,0 +1,962 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "url.h" +#include "punycode.h" + +namespace Url +{ + + /* Character classes */ + const CharacterClass Url::GEN_DELIMS(":/?#[]@"); + const CharacterClass Url::SUB_DELIMS("!$&'()*+,;="); + const CharacterClass Url::DIGIT("0123456789"); + const CharacterClass Url::ALPHA( + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); + const CharacterClass Url::UNRESERVED( + Url::ALPHA.chars() + Url::DIGIT.chars() + "-._~"); + const CharacterClass Url::RESERVED( + Url::GEN_DELIMS.chars() + Url::SUB_DELIMS.chars()); + const CharacterClass Url::PCHAR( + Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":@"); + const CharacterClass Url::PATH( + Url::PCHAR.chars() + "/"); + const CharacterClass Url::QUERY( + Url::PCHAR.chars() + "/?"); + const CharacterClass Url::FRAGMENT( + Url::PCHAR.chars() + "/?"); + const CharacterClass Url::USERINFO( + Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":"); + const CharacterClass Url::HEX("0123456789ABCDEF"); + const CharacterClass Url::SCHEME( + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+-."); + const std::vector Url::HEX_TO_DEC = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
-1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, + + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + const std::unordered_map Url::PORTS = { + {"http", 80}, + {"https", 443} + }; + const std::unordered_set Url::USES_RELATIVE = { + "", + "file", + "ftp", + "gopher", + "http", + "https", + "imap", + "mms", + "nntp", + "prospero", + "rtsp", + "rtspu", + "sftp", + "shttp", + "svn", + "svn+ssh", + "wais" + }; + const std::unordered_set Url::USES_NETLOC = { + "", + "file", + "ftp", + "git", + "git+ssh", + "gopher", + "http", + "https", + "imap", + "mms", + "nfs", + "nntp", + "prospero", + "rsync", + "rtsp", + "rtspu", + "sftp", + "shttp", + "snews", + "svn", + "svn+ssh", + "telnet", + "wais" + }; + const std::unordered_set Url::USES_PARAMS = { + "", + "ftp", + "hdl", + "http", + "https", + "imap", + "mms", + "prospero", + "rtsp", + "rtspu", + "sftp", + "shttp", + "sip", + "sips", + "tel" + }; + const std::unordered_set Url::KNOWN_PROTOCOLS = { + "", + "file", + "ftp", + "git", + "git+ssh", + "gopher", + "hdl", + "http", + "https", + "imap", + "mms", + "nfs", + "nntp", + "prospero", + "rsync", + "rtsp", + "rtspu", + "sftp", + "shttp", + "sip", 
+ "sips", + "sms", + "snews", + "svn", + "svn+ssh", + "tel", + "telnet", + "wais" + }; + + Url::Url(const std::string& url): port_(0), has_params_(false), has_query_(false) + { + size_t position = 0; + size_t index = url.find(':'); + if (index != std::string::npos) + { + // All the characters in our would-be scheme must be in SCHEME + if (std::all_of( + url.begin(), + url.begin() + index, + [](char c) { return SCHEME(c); } )) + { + // If there is nothing after the : or there are any non-digits, this is + // the scheme + if ((index + 1) >= url.length() + || std::any_of( + url.begin() + index + 1, + url.end(), + [](char c) { return !DIGIT(c); })) + { + scheme_.assign(url, 0, index); + std::transform( + scheme_.begin(), scheme_.end(), scheme_.begin(), ::tolower); + position = index + 1; + } + else + { + scheme_.assign(url, 0, index); + std::transform( + scheme_.begin(), scheme_.end(), scheme_.begin(), ::tolower); + if (KNOWN_PROTOCOLS.find(scheme_) != KNOWN_PROTOCOLS.end()) + { + position = index + 1; + } + else + { + scheme_.clear(); + } + } + } + } + + // Search for the netloc + if ((url.length() - position) >= 1 + && url[position] == '/' + && url[position + 1] == '/') + { + // Skip the '//' + position += 2; + index = url.find_first_of("/?#", position); + host_.assign(url, position, index - position); + position = index; + + // Extract any userinfo if there is any + index = host_.find('@'); + if (index != std::string::npos) + { + userinfo_.assign(host_, 0, index); + host_.assign(host_, index + 1, std::string::npos); + } + + // Lowercase the hostname + std::transform(host_.begin(), host_.end(), host_.begin(), ::tolower); + + // Try to find a port + index = host_.find(':'); + if (index != std::string::npos) + { + std::string portText(host_, index + 1, std::string::npos); + host_.resize(index); + + if (portText.empty()) + { + port_ = 0; + } + else + { + try + { + port_ = std::stoi(portText, &index); + + if (index != portText.length()) + { + // Malformed port + throw 
UrlParseException("Port not a number: " + portText); + } + + if (port_ > 65535) + { + throw UrlParseException("Port too high: " + portText); + } + else if (port_ < 0) + { + throw UrlParseException("Port negative: " + portText); + } + } + catch (const std::invalid_argument&) + { + // Malformed port + throw UrlParseException("Port not a number: " + portText); + } + catch (const std::out_of_range&) + { + throw UrlParseException("Port out of integer range: " + portText); + } + } + } + } + + if (position != std::string::npos) + { + path_.assign(url, position, std::string::npos); + + index = path_.find('#'); + if (index != std::string::npos) + { + fragment_.assign(path_, index + 1, std::string::npos); + path_.resize(index); + } + + index = path_.find('?'); + if (index != std::string::npos) + { + query_.assign(path_, index + 1, std::string::npos); + has_query_ = true; + path_.resize(index); + } + + if (USES_PARAMS.find(scheme_) != USES_PARAMS.end()) + { + index = path_.find(';'); + if (index != std::string::npos) + { + params_.assign(path_, index + 1, std::string::npos); + has_params_ = true; + path_.resize(index); + } + } + } + } + + Url& Url::assign(const Url& other) + { + return (*this) = other; + } + + bool Url::operator==(const Url& other) const + { + return ( + (scheme_ == other.scheme_ ) && + (userinfo_ == other.userinfo_ ) && + (host_ == other.host_ ) && + (port_ == other.port_ ) && + (path_ == other.path_ ) && + (params_ == other.params_ ) && + (query_ == other.query_ ) && + (fragment_ == other.fragment_ ) && + (has_params_ == other.has_params_) && + (has_query_ == other.has_query_ ) + ); + } + + bool Url::operator!=(const Url& other) const + { + return !operator==(other); + } + + bool Url::equiv(const Url& other) + { + Url self_(*this); + Url other_(other); + + self_.strip() + .sort_query() + .defrag() + .deuserinfo() + .abspath() + .escape() + .punycode() + .remove_default_port(); + other_.strip() + .sort_query() + .defrag() + .deuserinfo() + .abspath() + 
.escape() + .punycode() + .remove_default_port(); + return self_ == other_; + } + + std::string& Url::remove_repeats(std::string& str, const char chr) + { + size_t dest = 0; + // By initializing this to true, it also strips of leading instances of chr + bool seen = true; + for (size_t src = 0; src < str.length(); ++src) + { + if (!seen || (str[src] != chr)) + { + str[dest++] = str[src]; + } + seen = str[src] == chr; + } + // Remove the last character if it happens to be chr + size_t length = ((dest > 0) && (str[dest - 1] == chr)) ? dest - 1 : dest; + str.resize(length); + return str; + } + + std::string Url::fullpath() const + { + std::string result; + if (path_.empty() || path_[0] != '/') + { + result.append(1, '/'); + } + result.append(path_); + + if (has_params_) + { + result.append(";"); + result.append(params_); + } + + if (has_query_) + { + result.append("?"); + result.append(query_); + } + + if (!fragment_.empty()) + { + result.append("#"); + result.append(fragment_); + } + return result; + } + + std::string Url::str() const + { + std::string result; + + if (!scheme_.empty()) + { + result.append(scheme_); + if (USES_NETLOC.find(scheme_) == USES_NETLOC.end()) + { + result.append(":"); + } + else + { + result.append("://"); + } + } + else if (!host_.empty()) + { + result.append("//"); + } + + if (!userinfo_.empty()) + { + result.append(userinfo_); + result.append("@"); + } + + if (!host_.empty()) + { + result.append(host_); + } + + if (port_) + { + result.append(":"); + result.append(std::to_string(port_)); + } + + if (path_.empty()) + { + if (!result.empty()) + { + result.append("/"); + } + } + else + { + if (!host_.empty() && path_[0] != '/') + { + result.append(1, '/'); + } + result.append(path_); + } + + if (has_params_) + { + result.append(";"); + result.append(params_); + } + + if (has_query_) + { + result.append("?"); + result.append(query_); + } + + if (!fragment_.empty()) + { + result.append("#"); + result.append(fragment_); + } + + return result; + } 
+ + Url& Url::strip() + { + size_t start = query_.find_first_not_of('?'); + if (start != std::string::npos) + { + query_.assign(query_, start, std::string::npos); + } + else + { + query_.assign(""); + } + setQuery(remove_repeats(query_, '&')); + setParams(remove_repeats(params_, ';')); + return *this; + } + + Url& Url::abspath() + { + std::string copy; + std::vector segment_starts; + + if (path_.size() >= 1 && path_[0] == '/') + { + copy.append(1, '/'); + segment_starts.push_back(0); + } + + bool directory = false; + size_t previous = 0; + size_t index = 0; + for (index = path_.find('/') + ; index != std::string::npos + ; previous = index + 1, index = path_.find('/', index + 1)) + { + // Skip empty segments + if (index - previous == 0) + { + continue; + } + + if ((index - previous == 2) + && path_[previous] == '.' + && path_[previous + 1] == '.') + { + if (!segment_starts.empty()) + { + copy.resize(segment_starts.back()); + segment_starts.pop_back(); + } + directory = true; + } + else if ((index - previous == 1) && path_[previous] == '.') + { + directory = true; + } + else + { + segment_starts.push_back(copy.length()); + copy.append(path_, previous, index - previous); + copy.append(1, '/'); + directory = false; + } + } + + // Handle the last segment + index = path_.length(); + if (previous == path_.length()) + { + directory = true; + } + else if ((index - previous == 1) && path_[previous] == '.') + { + directory = true; + } + else if ((index - previous == 2) + && path_[previous] == '.' 
+ && path_[previous + 1] == '.') + { + if (!segment_starts.empty()) + { + copy.resize(segment_starts.back()); + } + directory = true; + } + else + { + copy.append(path_, previous, index - previous); + copy.append(1, '/'); + directory = false; + } + + if (!directory && copy.size() >= 1) + { + copy.resize(copy.size() - 1); + } + else if (directory && copy.empty()) + { + copy.append(1, '/'); + } + path_.assign(copy); + + return *this; + } + + Url& Url::relative_to(const Url& other) + { + // If this scheme does not use relative, return it unchanged + if (USES_RELATIVE.find(scheme_) == USES_RELATIVE.end()) + { + return *this; + } + + // Support scheme-relative URLs + if (scheme_.empty()) + { + scheme_ = other.scheme_; + } + + // If this is an absolute URL (or scheme-relative), return early + if (!host_.empty()) { + return *this; + } + + // If it's not an absolute URL, we need to copy the other host and port + host_ = other.host_; + port_ = other.port_; + userinfo_ = other.userinfo_; + + // If the path portion is absolute, then bail out early. + if (!path_.empty() && path_.front() == '/') + { + return *this; + } + + // Otherwise, this is a path that need to be evaluated relative to the other. If + // there is no '/', then we just keep our current path if it's not empty. 
+ if (path_.empty()) + { + if (params_.empty()) + { + path_ = other.path_; + params_ = other.params_; + has_params_ = other.has_params_; + if (query_.empty()) + { + query_ = other.query_; + has_query_ = other.has_query_; + } + } + else + { + path_.assign(other.path_, 0, other.path_.rfind('/') + 1); + } + + if (fragment_.empty()) + { + fragment_ = other.fragment_; + } + } + else + { + size_t index = other.path_.rfind('/'); + if (index != std::string::npos) + { + path_ = other.path_.substr(0, index + 1) + path_; + } + else if (!host_.empty()) + { + path_ = "/" + path_; + } + } + + return *this; + } + // Escape path, query, params and userinfo in place (params share the QUERY class). + Url& Url::escape(bool strict) + { + escape(path_, PATH, strict); + escape(query_, QUERY, strict); + escape(params_, QUERY, strict); + escape(userinfo_, USERINFO, strict); + return *this; + } + // Normalize str in place: valid %XX sequences are decoded when `strict` permits (otherwise kept with upper-cased hex digits), then every byte outside `safe` is written as %XX. + std::string& Url::escape(std::string& str, const CharacterClass& safe, bool strict) + { + std::string copy(str); + size_t dest = 0; + // Allocate space pessimistically -- if every entity is expanded, it will take 3x + // the space. + str.resize(str.length() * 3); + for (size_t src = 0; src < copy.length(); ++src) + { + if (copy[src] == '%' && (copy.length() - src) >= 2) + { + // Read ahead to see if there's a valid escape sequence. If not, treat + // this like a normal character. (Index as unsigned char: a byte >= 0x80 is a negative char.) + if (HEX_TO_DEC[static_cast<unsigned char>(copy[src+1])] != -1 && HEX_TO_DEC[static_cast<unsigned char>(copy[src+2])] != -1) + { + int value = ( + HEX_TO_DEC[static_cast<unsigned char>(copy[src+1])] * 16 + HEX_TO_DEC[static_cast<unsigned char>(copy[src+2])]); + + // In strict mode, we can only unescape parameters if they are both + // safe and not reserved + if (!strict || (strict && safe(value) && !RESERVED(value))) + { + // Replace src + 2 with that byte, advance src to consume it and + // continue.
+ src += 2; + copy[src] = value; + } + else + { + str[dest++] = copy[src++]; + str[dest++] = ::toupper(copy[src++]); + str[dest++] = ::toupper(copy[src]); + continue; + } + } + } + + if (!safe(copy[src])) + { + // Not safe -- replace with %XX + str[dest++] = '%'; + str[dest++] = HEX.chars()[(copy[src] >> 4) & 0xF]; + str[dest++] = HEX.chars()[copy[src] & 0xF]; + } + else + { + str[dest++] = copy[src]; + } + } + str.resize(dest); + return str; + } + // Undo percent-encoding in path, query, params and userinfo, in place. + Url& Url::unescape() + { + unescape(path_); + unescape(query_); + unescape(params_); + unescape(userinfo_); + return *this; + } + // Replace every valid %XX sequence in str with the byte it encodes, in place; all other text is copied unchanged. + std::string& Url::unescape(std::string& str) + { + std::string copy(str); + size_t dest = 0; + for (size_t src = 0; src < copy.length(); ++src, ++dest) + { + if (copy[src] == '%' && (copy.length() - src) >= 2) + { + // Read ahead to see if there's a valid escape sequence. If not, treat + // this like a normal character. (Index as unsigned char: a byte >= 0x80 is a negative char.) + if (HEX_TO_DEC[static_cast<unsigned char>(copy[src+1])] != -1 && HEX_TO_DEC[static_cast<unsigned char>(copy[src+2])] != -1) + { + int value = ( + HEX_TO_DEC[static_cast<unsigned char>(copy[src+1])] * 16 + HEX_TO_DEC[static_cast<unsigned char>(copy[src+2])]); + + // Replace src + 2 with that byte, advance src to consume it and + // continue. + src += 2; + str[dest] = value; + continue; + } + } + + // Either not a % or an incomplete entity + str[dest] = copy[src]; + } + str.resize(dest); + return str; + } + + Url& Url::deparam(const std::unordered_set& blacklist) + { + // Predicate is if it's present in the blacklist.
+ auto predicate = [blacklist](std::string& name, const std::string& value) + { + std::transform(name.begin(), name.end(), name.begin(), ::tolower); + return blacklist.find(name) != blacklist.end(); + }; + + setQuery(remove_params(query_, predicate, '&')); + setParams(remove_params(params_, predicate, ';')); + return *this; + } + + Url& Url::deparam(const deparam_predicate& predicate) + { + setQuery(remove_params(query_, predicate, '&')); + setParams(remove_params(params_, predicate, ';')); + return *this; + } + + std::string& Url::remove_params(std::string& str, + const deparam_predicate& predicate, + char sep) + { + std::string copy; + std::string piece; + std::string name; + std::string value; + size_t previous = 0; + for (size_t index = str.find(sep) + ; index != std::string::npos + ; previous = index + 1, index = str.find(sep, previous)) + { + piece.assign(str, previous, index - previous); + size_t position = piece.find('='); + name.assign(piece, 0, position); + value.clear(); + if (position != std::string::npos) + { + value.assign(piece, position + 1, std::string::npos); + } + + if (!predicate(name, value)) + { + copy.append(copy.empty() ? 0 : 1, sep); + copy.append(piece); + } + } + + if (previous < str.length()) + { + piece.assign(str, previous, std::string::npos); + size_t position = piece.find('='); + name.assign(piece, 0, position); + value.clear(); + if (position != std::string::npos) + { + value.assign(piece, position + 1, std::string::npos); + } + + if (!predicate(name, value)) + { + copy.append(copy.empty() ? 
0 : 1, sep); + copy.append(piece); + } + } + + str.assign(copy); + return str; + } + + Url& Url::sort_query() + { + split_sort_join(query_, '&'); + split_sort_join(params_, ';'); + return *this; + } + + std::string& Url::split_sort_join(std::string& str, const char glue) + { + // Return early if empty + if (str.empty()) + { + return str; + } + + // Split + std::vector pieces; + std::stringstream stream(str); + std::string item; + while (getline(stream, item, glue)) + { + pieces.push_back(item); + } + + // Return early if it's just a single element + if (pieces.size() == 1) + { + return str; + } + + // Sort + std::sort(pieces.begin(), pieces.end()); + + // Join (at this point we know that there's at least one element) + std::stringstream output; + for (auto it = pieces.begin(); it != (pieces.end() - 1); ++it) + { + output << *it << glue; + } + output << pieces.back(); + str.assign(output.str()); + return str; + } + + Url& Url::remove_default_port() + { + if (port_ && !scheme_.empty()) + { + auto it = PORTS.find(scheme_); + if (it != PORTS.end() && port_ == it->second) + { + port_ = 0; + } + } + return *this; + } + + Url& Url::deuserinfo() + { + userinfo_.clear(); + return *this; + } + + Url& Url::defrag() + { + fragment_.clear(); + return *this; + } + + Url& Url::punycode() + { + check_hostname(host_); + std::string encoded(Punycode::encodeHostname(host_)); + check_hostname(encoded); + host_ = encoded; + return *this; + } + + Url& Url::unpunycode() + { + host_ = Punycode::decodeHostname(host_); + return *this; + } + + Url& Url::host_reversed() + { + std::reverse(host_.begin(), host_.end()); + for (size_t index = 0, position = 0; index < host_.size(); index = position + 1) + { + position = host_.find('.', index); + if (position == std::string::npos) + { + std::reverse(host_.begin() + index, host_.end()); + break; + } + else + { + std::reverse(host_.begin() + index, host_.begin() + position); + } + } + return *this; + } + + void Url::check_hostname(std::string& host) 
+ { + // Skip empty hostnames -- they are valid + if (host.empty()) + { + return; + } + + size_t start = 0; + size_t end = host.find('.'); + while (end != std::string::npos) + { + if ((end - start) > 63) + { + throw std::invalid_argument("Label too long."); + } + else if (end == start) + { + throw std::invalid_argument("Empty label."); + } + + start = end + 1; + end = host.find('.', start); + } + + // For the final segment + if ((host.size() - start) > 63) + { + throw std::invalid_argument("Label too long."); + } + else if (host.size() == start && start > 1) + { + // Remove a trailing empty segment + host.resize(start - 1); + } + } + +}; diff --git a/src/url.h b/src/url.h new file mode 100644 index 0000000..6245124 --- /dev/null +++ b/src/url.h @@ -0,0 +1,323 @@ +#ifndef URL_CPP_H +#define URL_CPP_H + +#include +#include +#include +#include +#include +#include + +namespace Url +{ + + struct UrlParseException : public std::logic_error + { + UrlParseException(const std::string& message) : std::logic_error(message) {} + }; + + struct CharacterClass + { + CharacterClass(const std::string& chars) : chars_(chars), map_(256, false) + { + for (auto it = chars_.begin(); it != chars_.end(); ++it) + { + map_[static_cast(*it)] = true; + } + } + + bool operator()(char c) const + { + return map_[static_cast(c)]; + } + + const std::string& chars() const + { + return chars_; + } + + private: + // Private, unimplemented to prevent use + CharacterClass(); + CharacterClass(const CharacterClass& other); + + std::string chars_; + std::vector map_; + }; + + struct Url + { + /* Character classes */ + const static CharacterClass GEN_DELIMS; + const static CharacterClass SUB_DELIMS; + const static CharacterClass ALPHA; + const static CharacterClass DIGIT; + const static CharacterClass UNRESERVED; + const static CharacterClass RESERVED; + const static CharacterClass PCHAR; + const static CharacterClass PATH; + const static CharacterClass QUERY; + const static CharacterClass FRAGMENT; + 
const static CharacterClass USERINFO; + const static CharacterClass HEX; + const static CharacterClass SCHEME; + const static std::vector HEX_TO_DEC; + const static std::unordered_map PORTS; + const static std::unordered_set USES_RELATIVE; + const static std::unordered_set USES_NETLOC; + const static std::unordered_set USES_PARAMS; + const static std::unordered_set KNOWN_PROTOCOLS; + + // The type of the predicate used for removing parameters + typedef std::function deparam_predicate; + + explicit Url(const std::string& url); + + Url(const Url& other) + : scheme_(other.scheme_) + , host_(other.host_) + , port_(other.port_) + , path_(other.path_) + , params_(other.params_) + , query_(other.query_) + , fragment_(other.fragment_) + , userinfo_(other.userinfo_) + , has_params_(other.has_params_) + , has_query_(other.has_query_) { } + + /** + * Take on the value of the other URL. + */ + Url& assign(const Url& other); + + /** + * To be considered equal, all fields must be equal. + */ + bool operator==(const Url& other) const; + bool operator!=(const Url& other) const; + + /** + * Two URLs are considered equivalent if they have the same meaning. + */ + bool equiv(const Url& other); + + /************************************** + * Component-wise access and setting. 
* + **************************************/ + const std::string& scheme() const { return scheme_; } + Url& setScheme(const std::string& s) + { + scheme_ = s; + return *this; + } + + const std::string& host() const { return host_; } + Url& setHost(const std::string& s) + { + host_ = s; + return *this; + } + + const int port() const { return port_; } + Url& setPort(int i) + { + port_ = i; + return *this; + } + + const std::string& path() const { return path_; } + Url& setPath(const std::string& s) + { + path_ = s; + return *this; + } + + const std::string& params() const { return params_; } + Url& setParams(const std::string& s) + { + params_ = s; + has_params_ = !s.empty(); + return *this; + } + + const std::string& query() const { return query_; } + Url& setQuery(const std::string& s) + { + query_ = s; + has_query_ = !s.empty(); + return *this; + } + + const std::string& fragment() const { return fragment_; } + Url& setFragment(const std::string& s) + { + fragment_ = s; + return *this; + } + + const std::string& userinfo() const { return userinfo_; } + Url& setUserinfo(const std::string& s) + { + userinfo_ = s; + return *this; + } + + /** + * Get a representation of all components of the path, params, query, fragment. + * + * Always includes a leading /. + */ + std::string fullpath() const; + + /** + * Get a new string representation of the URL. + **/ + std::string str() const; + + /********************* + * Chainable methods * + *********************/ + + /** + * Strip semantically meaningless excess '?', '&', and ';' characters from query + * and params. + */ + Url& strip(); + + /** + * Make the path absolute. + * + * Evaluate '.', '..', and excessive slashes. + */ + Url& abspath(); + + /** + * Evaluate this URL relative fo `other`, placing the result in this object. + */ + Url& relative_to(const std::string& other) + { + return relative_to(Url(other)); + } + + /** + * Evaluate this URL relative fo `other`, placing the result in this object. 
+ */ + Url& relative_to(const Url& other); + + /** + * Ensure that the path, params, query, and userinfo are properly escaped. + * + * In 'strict' mode, only entities that are both safe and not reserved characters + * are unescaped. In non-strict mode, entities that are safe are unescaped. + */ + Url& escape(bool strict=false); + + /** + * Unescape all entities in the path, params, query, and userinfo. + */ + Url& unescape(); + + /** + * Remove any params or queries that appear in the blacklist. + * + * The blacklist should contain only lowercased strings, and the comparison is + * done in a case-insensitive way. + */ + Url& deparam(const std::unordered_set& blacklist); + + /** + * Filter params subject to a predicate for whether it should be filtered. + * + * The predicate must accept two string refs -- the key and value (which may be + * empty). Return `true` if the parameter should be removed, and `false` + * otherwise. + */ + Url& deparam(const deparam_predicate& predicate); + + /** + * Put queries and params in sorted order. + * + * To ensure consistent comparisons, escape should be called beforehand. + */ + Url& sort_query(); + + /** + * Remove the port if it's the default for the scheme. + */ + Url& remove_default_port(); + + /** + * Remove the userinfo portion. + */ + Url& deuserinfo(); + + /** + * Remove the fragment. + */ + Url& defrag(); + + /** + * Punycode the hostname. + */ + Url& punycode(); + + /** + * Unpunycode the hostname. + */ + Url& unpunycode(); + + /** + * Reverse the hostname (a.b.c.d => d.c.b.a) + */ + Url& host_reversed(); + + private: + // Private, unimplemented to prevent use. + Url(); + + /** + * Remove repeated, leading, and trailing instances of chr from the string. 
+ */ + std::string& remove_repeats(std::string& str, const char chr); + + /** + * Ensure all the provided characters are escaped if necessary + */ + std::string& escape(std::string& str, const CharacterClass& safe, bool strict); + + /** + * Unescape entities in the provided string + */ + std::string& unescape(std::string& str); + + /** + * Remove any params that match entries in the blacklist. + */ + std::string& remove_params( + std::string& str, const deparam_predicate& pred, char sep); + + /** + * Split the provided string by char, sort, join by char. + */ + std::string& split_sort_join(std::string& str, const char glue); + + /** + * Check that the hostname is valid, removing an optional trailing '.'. + */ + void check_hostname(std::string& host); + + std::string scheme_; + std::string host_; + int port_; + std::string path_; + std::string params_; + std::string query_; + std::string fragment_; + std::string userinfo_; + bool has_params_; + bool has_query_; + }; + +} + +#endif diff --git a/src/utf8.cpp b/src/utf8.cpp new file mode 100644 index 0000000..3502377 --- /dev/null +++ b/src/utf8.cpp @@ -0,0 +1,150 @@ +#include +#include +#include + +#include "utf8.h" + +namespace Url +{ + + Utf8::codepoint_t Utf8::readCodepoint( + std::string::const_iterator& it, const std::string::const_iterator& end) + { + Utf8::char_t current = static_cast(*it++); + if (current & 0x80) + { + // Number of additional bytes needed + unsigned int bytes = 0; + // The accumulated value + Utf8::codepoint_t result = 0; + if (current < 0xC0) + { + // Invalid sequence + throw std::invalid_argument("Low UTF-8 start byte"); + } + else if (current < 0xE0) + { + // One additional byte, two bytes total, use 5 bits + bytes = 1; + result = current & 0x1F; + } + else if (current < 0xF0) + { + // Two additional bytes, three bytes total, use 4 bits + bytes = 2; + result = current & 0x0F; + } + else if (current < 0xF8) + { + // Three additional bytes, four bytes total, use 3 bits + bytes = 3; + result = 
current & 0x07; + } + else + { + throw std::invalid_argument("High UTF-8 start byte"); + } + + for (; bytes > 0; --bytes) { + if (it == end) + { + throw std::invalid_argument("UTF-8 sequence terminated early."); + } + + current = static_cast(*it++); + // Ensure the first two bits are 10 + if ((current & 0xC0) != 0x80) + { + throw std::invalid_argument("Invalid continuation byte"); + } + result = (result << 6) | (current & 0x3F); + } + + return result; + } + else + { + return current; + } + } + + std::string& Utf8::writeCodepoint(std::string& str, Utf8::codepoint_t value) + { + if (value > MAX_CODEPOINT) + { + throw std::invalid_argument("Code point too high."); + } + else if (value <= 0x007F) + { + // Just append the character itself + str.append(1, static_cast(value)); + return str; + } + + unsigned int bytes = 0; + if (value > 0xFFFF) + { + /** + * 11110xxx + 3 bytes for 21 bits total + * + * We need to take bits 20-18, which 0x1C0000 masks out. These form the least + * significant bits of this byte (so we shift them back down by 18). The 5 + * most significant bits of this byte are 11110, so we OR this result with + * 0xF0 to get this first byte. + * + * The remaining bits will be consumed from the most-significant end and so + * they must be shifted up by (32 - 18) = 14. + */ + str.append(1, static_cast(((value & 0x1C0000) >> 18) | 0xF0)); + bytes = 3; + value <<= 14; + } + else if (value > 0x07FF) + { + /** + * 1110xxxx + 2 bytes for 16 bits total + * + * We need to take bits 15-12, which 0xF000 masks out. These form the least + * significant bits of this byte (so we shift them back down by 12). The 4 + * most significant bits of this byte are 1110, so we OR this result with + * 0xE0 to get this first byte. + * + * The remaining bits will be consumed from the most-significant end and so + * they must be shifted up by (32 - 12) = 20. 
+ */ + str.append(1, static_cast(((value & 0xF000) >> 12) | 0xE0)); + bytes = 2; + value <<= 20; + } + else + { + /** + * 110xxxxx + 1 byte for 11 bits total + * + * We need to take bits 10-6, which 0x7C0 masks out. These form the least + * significant bits of this byte (so we shift them back down by 6). The 3 + * most significant bits of this byte are 110, so we OR this result with + * 0xC0 to get this first byte. + * + * The remaining bits will be consumed from the most-significant end and so + * they must be shifted up by (32 - 6) = 26. + */ + str.append(1, static_cast(((value & 0x7C0) >> 6) | 0xC0)); + bytes = 1; + value <<= 26; + } + + /** + * The remaining bits are to be consumed 6 at a time from the most-significant + * end. The mask 0xFC000000 grabs these six bits, which then must be shifted down + * by 26, and OR'd with 0x80 to produce the continuation byte. + */ + for (; bytes > 0; --bytes, value <<= 6) + { + str.append(1, static_cast(((value & 0xFC000000) >> 26) | 0x80)); + } + + return str; + } + +}; diff --git a/src/utf8.h b/src/utf8.h new file mode 100644 index 0000000..b677ce8 --- /dev/null +++ b/src/utf8.h @@ -0,0 +1,91 @@ +#ifndef UTF8_CPP_H +#define UTF8_CPP_H + +#include +#include +#include + +namespace Url +{ + + /** + * Work between unicode code points and their UTF-8-encoded representation. + */ + struct Utf8 + { + /** + * The type we use to represent Unicode codepoints. + */ + typedef uint32_t codepoint_t; + + /** + * The type we use when talking about the integral value of bytes. + */ + typedef unsigned char char_t; + + /** + * The highest allowed codepoint. + */ + static const codepoint_t MAX_CODEPOINT = 0x10FFFF; + + /** + * Consume up to the last byte of the sequence, returning the codepoint. + */ + static codepoint_t readCodepoint( + std::string::const_iterator& it, const std::string::const_iterator& end); + + /** + * Write a codepoint to the provided string. 
+ */ + static std::string& writeCodepoint(std::string& str, codepoint_t value); + + /** + * Return the first codepoint stored in the provided string. + */ + static codepoint_t toCodepoint(const std::string& str) + { + auto it = str.begin(); + return readCodepoint(it, str.end()); + } + + /** + * Get a string with the provided codepoint. + */ + static std::string fromCodepoint(codepoint_t value) + { + std::string str; + writeCodepoint(str, value); + return str; + } + + /** + * Return all the codepoints in the string. + */ + static std::vector toCodepoints(const std::string& str) + { + std::vector result; + for (auto it = str.begin(); it != str.end(); ) + { + result.push_back(readCodepoint(it, str.end())); + } + return result; + } + + /** + * Create a string from a vector of codepoints. + */ + static std::string fromCodepoints(const std::vector& points) + { + std::string result; + for (auto it = points.begin(); it != points.end(); ++it) + { + writeCodepoint(result, *it); + } + return result; + } + + }; + +} + +#endif diff --git a/tests/test-all.R b/tests/test-all.R new file mode 100644 index 0000000..19e6960 --- /dev/null +++ b/tests/test-all.R @@ -0,0 +1,2 @@ +library(testthat) +test_check("securitytxt") diff --git a/tests/testthat/test-securitytxt.R b/tests/testthat/test-securitytxt.R new file mode 100644 index 0000000..ab6f62f --- /dev/null +++ b/tests/testthat/test-securitytxt.R @@ -0,0 +1,6 @@ +context("basic functionality") +test_that("we can do something", { + + #expect_that(some_function(), is_a("data.frame")) + +})