initial commit

7 years ago · 878bb7f045
39 changed files with 3445 additions and 0 deletions
--- a/.Rbuildignore
+++ b/.Rbuildignore
@ -0,0 +1,11 @@
+^.*\.Rproj$
+^\.Rproj\.user$
+^\.travis\.yml$
+^README\.*Rmd$
+^README\.*html$
+^NOTES\.*Rmd$
+^NOTES\.*html$
+^\.codecov\.yml$
+^README_files$
+^doc$
+^CONDUCT\.md$
--- a/.codecov.yml
+++ b/.codecov.yml
@ -0,0 +1 @@
+comment: false
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,8 @@
+.DS_Store
+.Rproj.user
+.Rhistory
+.RData
+.Rproj
+src/*.o
+src/*.so
+src/*.dll
--- a/.travis.yml
+++ b/.travis.yml
@ -0,0 +1,31 @@
+language: r
+
+warnings_are_errors: true
+
+sudo: required
+
+cache: packages
+
+r:
+ - oldrel
+ - release
+ - devel
+
+apt_packages:
+  - libv8-dev
+  - xclip
+
+env:
+ global:
+   - CRAN: http://cran.rstudio.com
+
+after_success:
+  - Rscript -e 'covr::codecov()'
+
+notifications:
+  email:
+    - bob@rud.is
+  irc:
+    channels:
+      - "104.236.112.222#builds"
+    nick: travisci
--- a/CONDUCT.md
+++ b/CONDUCT.md
@ -0,0 +1,25 @@
+# Contributor Code of Conduct
+
+As contributors and maintainers of this project, we pledge to respect all people who 
+contribute through reporting issues, posting feature requests, updating documentation,
+submitting pull requests or patches, and other activities.
+
+We are committed to making participation in this project a harassment-free experience for
+everyone, regardless of level of experience, gender, gender identity and expression,
+sexual orientation, disability, personal appearance, body size, race, ethnicity, age, or religion.
+
+Examples of unacceptable behavior by participants include the use of sexual language or
+imagery, derogatory comments or personal attacks, trolling, public or private harassment,
+insults, or other unprofessional conduct.
+
+Project maintainers have the right and responsibility to remove, edit, or reject comments,
+commits, code, wiki edits, issues, and other contributions that are not aligned to this 
+Code of Conduct. Project maintainers who do not follow the Code of Conduct may be removed 
+from the project team.
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by 
+opening an issue or contacting one or more of the project maintainers.
+
+This Code of Conduct is adapted from the Contributor Covenant 
+(http://contributor-covenant.org), version 1.0.0, available at
+http://contributor-covenant.org/version/1/0/0/
--- a/27
+++ b/27
@ -0,0 +1,27 @@
+Package: rep
+Type: Package
+Title: Tools to Parse and Test Robots Exclusion Protocol Files and Rules
+Version: 0.1.0
+Date: 2017-08-14
+Author: Bob Rudis (bob@rud.is) [aut, cre], SEOmoz, Inc [aut]
+Maintainer: Bob Rudis <bob@rud.is>
+Description: The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents
+    a set of standards for allowing or excluding robot/spider crawling of different areas of
+    site content. Tools are provided which wrap the 'rep-cpp' <https://github.com/seomoz/rep-cpp>
+    C++ library for processing these 'robots.txt' files.
+SystemRequirements: C++11
+NeedsCompilation: yes
+URL: https://github.com/hrbrmstr/rep
+BugReports: https://github.com/hrbrmstr/rep/issues
+License: MIT + file LICENSE
+Suggests:
+    testthat,
+    covr,
+    robotstxt
+Depends:
+    R (>= 3.2.0)
+Imports:
+    purrr,
+    Rcpp
+RoxygenNote: 6.0.1
+LinkingTo: Rcpp
--- a/2
+++ b/2
@ -0,0 +1,2 @@
+YEAR: 2017
+COPYRIGHT HOLDER: Bob Rudis
--- a/7
+++ b/7
@ -0,0 +1,7 @@
+# Generated by roxygen2: do not edit by hand
+
+S3method(print,robxp)
+export(can_fetch)
+export(robxp)
+importFrom(Rcpp,sourceCpp)
+useDynLib(rep, .registration=TRUE)
--- a/NEWS.md
+++ b/NEWS.md
@ -0,0 +1,2 @@
+0.1.0 
+* Initial release
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@ -0,0 +1,19 @@
+# Generated by using Rcpp::compileAttributes() -> do not edit by hand
+# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+#' Parse robots.txt
+#'
+#' Internal shim: forwards `content` unchanged to the registered native
+#' routine `_rep_rep_parse` through `.Call()` and returns whatever that
+#' routine returns. This file is generated by `Rcpp::compileAttributes()`;
+#' manual edits here are overwritten on regeneration.
+#'
+#' @noRd
+#'
+rep_parse <- function(content) {
+    .Call(`_rep_rep_parse`, content)
+}
+
+#' Path allowed
+#'
+#' Internal shim: forwards `xp`, `path` and `agent` (default `"*"`) to the
+#' registered native routine `_rep_rep_path_allowed` through `.Call()` and
+#' returns its result. This file is generated by
+#' `Rcpp::compileAttributes()`; manual edits here are overwritten on
+#' regeneration.
+#'
+#' @noRd
+#'
+rep_path_allowed <- function(xp, path, agent = "*") {
+    .Call(`_rep_rep_path_allowed`, xp, path, agent)
+}
+
--- a/R/rep-package.R
+++ b/R/rep-package.R
@ -0,0 +1,14 @@
+#' Tools to Parse and Test Robots Exclusion Protocol Files and Rules
+#'
+#' The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents a set
+#' of standards for allowing or excluding robot/spider crawling of different areas of
+#' site content. Tools are provided which wrap the 'rep-cpp' <https://github.com/seomoz/rep-cpp>
+#' C++ library for processing these 'robots.txt' files.
+#'
+#' @md
+#' @name rep
+#' @docType package
+#' @author Bob Rudis (bob@@rud.is)
+#' @useDynLib rep, .registration=TRUE
+#' @importFrom Rcpp sourceCpp
+NULL
--- a/R/rep.r
+++ b/R/rep.r
@ -0,0 +1,47 @@
+#' Create a robots.txt object
+#'
+#' Parses the text of a `robots.txt` file and tags the parsed result with the
+#' `robxp` class so that it can be queried with `can_fetch()`.
+#'
+#' @md
+#' @param x character vector containing a complete robots.txt file. When `x`
+#'   has more than one element (for example the result of `readLines()`), the
+#'   elements are treated as lines and joined with newlines before parsing.
+#' @return an object of class `robxp`
+#' @export
+#' @examples
+#' rt <- robxp(c("User-agent: *", "Disallow: /private"))
+#' can_fetch(rt, "/index.html", "*") # TRUE
+#' can_fetch(rt, "/private/data.csv", "*") # FALSE
+robxp <- function(x) {
+
+  if (!is.character(x)) {
+    stop("'x' must be a character vector holding robots.txt content", call. = FALSE)
+  }
+
+  # The native parser accepts exactly one string, so collapse line vectors.
+  if (length(x) != 1L) {
+    x <- paste0(x, collapse = "\n")
+  }
+
+  parsed <- rep_parse(x)
+  class(parsed) <- c("robxp")
+
+  parsed
+
+}
+
+#' Test URL path against robots.txt
+#'
+#' @md
+#' @param obj `robxp` object
+#' @param path path to test
+#' @param user_agent user agent to test
+#' @return the result of the native path test when `obj` is a `robxp` object;
+#'   `NULL` when `obj` is anything else.
+#' @export
+#' @examples
+#' \dontrun{
+#' # requires network access and the 'robotstxt' package
+#' rt <- robxp(robotstxt::get_robotstxt("https://cdc.gov"))
+#' can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") # TRUE
+#' can_fetch(rt, "/_borders", "*") # FALSE
+#' }
+can_fetch <- function(obj, path = "/", user_agent = "*") {
+
+  # Objects that were not created by robxp() are not an error: they yield NULL.
+  if (!inherits(obj, "robxp")) {
+    return(NULL)
+  }
+
+  rep_path_allowed(obj, path, user_agent)
+
+}
+
+#' Custom printer for 'robxp' objects
+#'
+#' Writes a one-line placeholder instead of the raw parsed object.
+#'
+#' @md
+#' @param x object to print
+#' @param ... unused
+#' @return `x`, invisibly
+#' @export
+print.robxp <- function(x, ...) {
+  # Terminate the line so the next console prompt does not share it.
+  cat("<Robots Exclusion Protocol Object>\n")
+  invisible(x)
+}
--- a/README.Rmd
+++ b/README.Rmd
@ -0,0 +1,58 @@
+---
+output: rmarkdown::github_document
+---
+
+`rep` : Tools to Parse and Test Robots Exclusion Protocol Files and Rules
+
+The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents a set of standards for allowing or excluding robot/spider crawling of different areas of site content. Tools are provided which wrap the 'rep-cpp' <https://github.com/seomoz/rep-cpp> C++ library for processing these 'robots.txt' files.
+
+- [`rep-cpp`](https://github.com/seomoz/rep-cpp)
+- [`url-cpp`](https://github.com/seomoz/url-cpp)
+
+The following functions are implemented:
+
+- `robxp`:	Create a robots.txt object
+- `can_fetch`:	Test URL path against robots.txt
+
+### Installation
+
+```{r eval=FALSE}
+devtools::install_github("hrbrmstr/rep")
+```
+
+```{r message=FALSE, warning=FALSE, error=FALSE, include=FALSE}
+options(width=120)
+```
+
+### Usage
+
+```{r message=FALSE, warning=FALSE, error=FALSE}
+library(rep)
+library(robotstxt)
+
+# current version
+packageVersion("rep")
+
+rt <- robxp(get_robotstxt("https://cdc.gov"))
+
+print(rt)
+
+can_fetch(rt, "/asthma/asthma_stats/default.htm", "*")
+
+can_fetch(rt, "/_borders", "*")
+```
+
+### Test Results
+
+```{r message=FALSE, warning=FALSE, error=FALSE}
+library(rep)
+library(testthat)
+
+date()
+
+test_dir("tests/")
+```
+
+### Code of Conduct
+
+Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms.
--- a/README.md
+++ b/README.md
@ -0,0 +1,74 @@
+
+`rep` : Tools to Parse and Test Robots Exclusion Protocol Files and Rules
+
+The 'Robots Exclusion Protocol' <http://www.robotstxt.org/orig.html> documents a set of standards for allowing or excluding robot/spider crawling of different areas of site content. Tools are provided which wrap the 'rep-cpp' <https://github.com/seomoz/rep-cpp> C++ library for processing these 'robots.txt' files.
+
+-   [`rep-cpp`](https://github.com/seomoz/rep-cpp)
+-   [`url-cpp`](https://github.com/seomoz/url-cpp)
+
+The following functions are implemented:
+
+-   `robxp`: Create a robots.txt object
+-   `can_fetch`: Test URL path against robots.txt
+
+### Installation
+
+``` r
+devtools::install_github("hrbrmstr/rep")
+```
+
+### Usage
+
+``` r
+library(rep)
+library(robotstxt)
+
+# current version
+packageVersion("rep")
+```
+
+    ## [1] '0.1.0'
+
+``` r
+rt <- robxp(get_robotstxt("https://cdc.gov"))
+
+print(rt)
+```
+
+    ## <Robots Exclusion Protocol Object>
+
+``` r
+can_fetch(rt, "/asthma/asthma_stats/default.htm", "*")
+```
+
+    ## [1] TRUE
+
+``` r
+can_fetch(rt, "/_borders", "*")
+```
+
+    ## [1] FALSE
+
+### Test Results
+
+``` r
+library(rep)
+library(testthat)
+
+date()
+```
+
+    ## [1] "Mon Aug 14 15:00:16 2017"
+
+``` r
+test_dir("tests/")
+```
+
+    ## testthat results ========================================================================================================
+    ## OK: 3 SKIPPED: 0 FAILED: 0
+    ## 
+    ## DONE ===================================================================================================================
+
+### Code of Conduct
+
+Please note that this project is released with a [Contributor Code of Conduct](CONDUCT.md). By participating in this project you agree to abide by its terms.
--- a/man/can_fetch.Rd
+++ b/man/can_fetch.Rd
@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/rep.r
+\name{can_fetch}
+\alias{can_fetch}
+\title{Test URL path against robots.txt}
+\usage{
+can_fetch(obj, path = "/", user_agent = "*")
+}
+\arguments{
+\item{obj}{\code{robxp} object}
+
+\item{path}{path to test}
+
+\item{user_agent}{user agent to test}
+}
+\description{
+Test URL path against robots.txt
+}
+\examples{
+library(robotstxt)
+can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") # TRUE
+can_fetch(rt, "/_borders", "*") # FALSE
+}
--- a/man/print.robxp.Rd
+++ b/man/print.robxp.Rd
@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/rep.r
+\name{print.robxp}
+\alias{print.robxp}
+\title{Custom printer for 'robexp' objects}
+\usage{
+\method{print}{robxp}(x, ...)
+}
+\arguments{
+\item{x}{object to print}
+
+\item{...}{unused}
+}
+\description{
+Custom printer for 'robexp' objects
+}
--- a/man/rep.Rd
+++ b/man/rep.Rd
@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/rep-package.R
+\docType{package}
+\name{rep}
+\alias{rep}
+\alias{rep-package}
+\title{Tools to Parse and Test Robots Exclusion Protocol Files and Rules}
+\description{
+The 'Robots Exclusion Protocol' \url{http://www.robotstxt.org/orig.html} documents a set
+of standards for allowing or excluding robot/spider crawling of different areas of
+site content. Tools are provided which wrap The 'rep-cpp` \url{https://github.com/seomoz/rep-cpp}
+C++ library for processing these 'robots.txt' files.
+}
+\author{
+Bob Rudis (bob@rud.is)
+}
--- a/man/robxp.Rd
+++ b/man/robxp.Rd
@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/rep.r
+\name{robxp}
+\alias{robxp}
+\title{Create a robots.txt object}
+\usage{
+robxp(x)
+}
+\arguments{
+\item{x}{atomic character vector containing a complete robots.txt file}
+}
+\description{
+Create a robots.txt object
+}
+\examples{
+library(robotstxt)
+can_fetch(rt, "/asthma/asthma_stats/default.htm", "*") # TRUE
+can_fetch(rt, "/_borders", "*") # FALSE
+}
--- a/rep.Rproj
+++ b/rep.Rproj
@ -0,0 +1,21 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+StripTrailingWhitespace: Yes
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source
+PackageBuildArgs: --resave-data
+PackageRoxygenize: rd,collate,namespace
--- a/src/.gitignore
+++ b/src/.gitignore
@ -0,0 +1,3 @@
+*.o
+*.so
+*.dll
--- a/src/Makevars
+++ b/src/Makevars
@ -0,0 +1,3 @@
+CXX_STD = CXX11
+PKG_CXXFLAGS = 
+PKG_LIBS = -L.
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@ -0,0 +1,42 @@
+// Generated by using Rcpp::compileAttributes() -> do not edit by hand
+// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+#include <Rcpp.h>
+
+using namespace Rcpp;
+
+// rep_parse
+// Forward declaration of the C++ implementation; its definition is not in this
+// generated file.
+SEXP rep_parse(std::string content);
+// .Call() entry point registered below as "_rep_rep_parse": converts the incoming
+// SEXP to std::string, calls rep_parse() and wraps the result for return to R.
+RcppExport SEXP _rep_rep_parse(SEXP contentSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< std::string >::type content(contentSEXP);
+    rcpp_result_gen = Rcpp::wrap(rep_parse(content));
+    return rcpp_result_gen;
+END_RCPP
+}
+// rep_path_allowed
+// Forward declaration of the C++ implementation; its definition is not in this
+// generated file.
+bool rep_path_allowed(SEXP xp, std::string path, std::string agent);
+// .Call() entry point registered below as "_rep_rep_path_allowed": passes `xp`
+// through as a raw SEXP, converts `path` and `agent` to std::string, calls
+// rep_path_allowed() and wraps the bool result for return to R.
+RcppExport SEXP _rep_rep_path_allowed(SEXP xpSEXP, SEXP pathSEXP, SEXP agentSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< SEXP >::type xp(xpSEXP);
+    Rcpp::traits::input_parameter< std::string >::type path(pathSEXP);
+    Rcpp::traits::input_parameter< std::string >::type agent(agentSEXP);
+    rcpp_result_gen = Rcpp::wrap(rep_path_allowed(xp, path, agent));
+    return rcpp_result_gen;
+END_RCPP
+}
+
+// Table of routines callable through .Call(): {name, function pointer, number of
+// arguments}. The all-NULL entry terminates the table.
+static const R_CallMethodDef CallEntries[] = {
+    {"_rep_rep_parse", (DL_FUNC) &_rep_rep_parse, 1},
+    {"_rep_rep_path_allowed", (DL_FUNC) &_rep_rep_path_allowed, 3},
+    {NULL, NULL, 0}
+};
+
+// Initialisation hook for the shared library: registers the .Call table above
+// (no .C, .Fortran or .External routines are registered) and then passes FALSE to
+// R_useDynamicSymbols for this DLL.
+RcppExport void R_init_rep(DllInfo *dll) {
+    R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
+    R_useDynamicSymbols(dll, FALSE);
+}
--- a/src/agent.cpp
+++ b/src/agent.cpp
@ -0,0 +1,87 @@
+#include <algorithm>
+#include <sstream>
+
+#include "url.h"
+
+#include "agent.h"
+#include "directive.h"
+
+namespace Rep
+{
+    /* Append an "allow" rule for `query` (canonicalised through escape()) and
+     * return this agent so calls can be chained. */
+    Agent& Agent::allow(const std::string& query)
+    {
+        directives_.emplace_back(escape(query), true);
+        // The cached priority ordering no longer holds once a rule is added.
+        sorted_ = false;
+        return *this;
+    }
+
+    /* Append a "disallow" rule for `query` and return this agent for chaining.
+     * An empty query is stored, unescaped, as an allow rule. */
+    Agent& Agent::disallow(const std::string& query)
+    {
+        // Special case: "Disallow:" means "Allow: /"
+        const bool blank = query.empty();
+        directives_.push_back(blank ? Directive(query, true)
+                                    : Directive(escape(query), false));
+        sorted_ = false;
+        return *this;
+    }
+
+    /* Return the rules ordered from highest to lowest priority. Sorting is done
+     * lazily, only when a rule was added since the last call. */
+    const std::vector<Directive>& Agent::directives() const
+    {
+        if (sorted_)
+        {
+            return directives_;
+        }
+
+        std::sort(directives_.begin(), directives_.end(),
+            [](const Directive& lhs, const Directive& rhs) {
+                return lhs.priority() > rhs.priority();
+            });
+        sorted_ = true;
+        return directives_;
+    }
+
+    /* Decide whether `query` may be fetched by this agent.
+     *
+     * The query is canonicalised with escape() first. "/robots.txt" is always
+     * allowed. Otherwise the first matching rule in priority order decides, and a
+     * path that no rule matches is allowed. */
+    bool Agent::allowed(const std::string& query) const
+    {
+        const std::string path(escape(query));
+
+        if (path == "/robots.txt")
+        {
+            return true;
+        }
+
+        // Iterate by const reference: copying a Directive copies its pattern string.
+        for (const auto& directive : directives())
+        {
+            if (directive.match(path))
+            {
+                return directive.allowed();
+            }
+        }
+        return true;
+    }
+
+    /* Render the rules, in priority order, as
+     * "[Directive(...), Directive(...)]"; an agent without rules yields "[]". */
+    std::string Agent::str() const
+    {
+        std::ostringstream out;
+        out << '[';
+        const char* separator = "";
+        for (const Directive& rule : directives())
+        {
+            out << separator << "Directive(" << rule.str() << ')';
+            separator = ", ";
+        }
+        out << ']';
+        return out.str();
+    }
+
+    /* Canonical form of `query` used for rule matching: parse it as a URL, then
+     * apply defrag(), escape() and fullpath() in that order. */
+    std::string Agent::escape(const std::string& query)
+    {
+        Url::Url parsed(query);
+        return parsed.defrag().escape().fullpath();
+    }
+}
--- a/src/agent.h
+++ b/src/agent.h
@ -0,0 +1,70 @@
+#ifndef AGENT_CPP_H
+#define AGENT_CPP_H
+
+// <string> is included explicitly so this header does not depend on the include
+// order of the translation unit that pulls it in.
+#include <string>
+#include <vector>
+
+#include "directive.h"
+
+
+namespace Rep
+{
+
+    /**
+     * The set of allow/disallow rules, plus the delay value, recorded for one
+     * user agent.
+     */
+    class Agent
+    {
+    public:
+        /* The type for the delay. */
+        typedef float delay_t;
+
+        /**
+         * Construct an agent with no directives and a delay of -1.0.
+         */
+        Agent(): directives_(), delay_(-1.0), sorted_(true) {}
+
+        /**
+         * Add an allowed directive.
+         */
+        Agent& allow(const std::string& query);
+
+        /**
+         * Add a disallowed directive.
+         */
+        Agent& disallow(const std::string& query);
+
+        /**
+         * Set the delay for this agent.
+         */
+        Agent& delay(delay_t value) {
+            delay_ = value;
+            return *this;
+        }
+
+        /**
+         * Return the delay for this agent.
+         */
+        delay_t delay() const { return delay_; }
+
+        /**
+         * A vector of the directives, in priority-sorted order.
+         */
+        const std::vector<Directive>& directives() const;
+
+        /**
+         * Return true if the URL (either a full URL or a path) is allowed.
+         */
+        bool allowed(const std::string& path) const;
+
+        /**
+         * A printable listing of this agent's directives.
+         */
+        std::string str() const;
+
+        /**
+         * Canonically escape the provided query for matching purposes.
+         */
+        static std::string escape(const std::string& query);
+
+    private:
+        // Mutable because directives() sorts lazily from a const method.
+        mutable std::vector<Directive> directives_;
+        delay_t delay_;
+        // True while directives_ is known to be in priority order.
+        mutable bool sorted_;
+    };
+}
+
+#endif
--- a/src/directive.cpp
+++ b/src/directive.cpp
@ -0,0 +1,130 @@
+#include <algorithm>
+#include <locale>
+#include <sstream>
+#include <string>
+
+#include "url.h"
+
+#include "directive.h"
+
+namespace Rep
+{
+    /* Build a rule from a pattern.
+     *
+     * A pattern without '*' is stored verbatim. Otherwise every run of
+     * consecutive '*' is collapsed to a single '*' and trailing '*' are removed.
+     * The priority is the length of the stored expression. */
+    Directive::Directive(const std::string& line, bool allowed)
+        : expression_()
+        , priority_(line.size())
+        , allowed_(allowed)
+    {
+        if (line.find('*') == std::string::npos)
+        {
+            // No wildcard: keep the text as-is; priority_ is already line.size().
+            expression_ = line;
+            return;
+        }
+
+        // Copy the pattern, keeping only the first '*' of each run.
+        expression_.reserve(line.size());
+        bool previous_was_star = false;
+        for (const char current : line)
+        {
+            const bool is_star = (current == '*');
+            if (!(is_star && previous_was_star))
+            {
+                expression_.push_back(current);
+            }
+            previous_was_star = is_star;
+        }
+
+        // Cut everything after the last non-'*' character. When the pattern is
+        // all '*', find_last_not_of yields npos and npos + 1 wraps to 0, which
+        // clears the whole expression.
+        expression_.erase(expression_.find_last_not_of('*') + 1);
+
+        // Priority is the length of the expression
+        priority_ = expression_.size();
+    }
+
+    /* Return true if the path [p_begin, p_end) matches the expression
+     * [e_begin, e_end).
+     *
+     * The expression is matched as a prefix of the path. '*' stands for any run of
+     * characters, including none and including everything up to the end of the
+     * path. '$' succeeds only at the end of the path. Every other character must
+     * match literally. */
+    bool Directive::match(const std::string::const_iterator& e_begin,
+                          const std::string::const_iterator& e_end,
+                          const std::string::const_iterator& p_begin,
+                          const std::string::const_iterator& p_end) const
+    {
+        std::string::const_iterator expression_it = e_begin;
+        std::string::const_iterator path_it = p_begin;
+        while (expression_it != e_end)
+        {
+            if (*expression_it == '*')
+            {
+                // Try the rest of the expression against every suffix of the
+                // remaining path. The empty suffix (path_it == p_end) is tried
+                // too, so that "*$" can consume the remainder of the path and so
+                // that '*' can match nothing when the path is already exhausted.
+                ++expression_it;
+                for (;; ++path_it)
+                {
+                    if (match(expression_it, e_end, path_it, p_end))
+                    {
+                        return true;
+                    }
+                    if (path_it == p_end)
+                    {
+                        return false;
+                    }
+                }
+            }
+
+            if (*expression_it == '$')
+            {
+                // End anchor: succeeds only when the path is fully consumed.
+                return path_it == p_end;
+            }
+
+            if (path_it == p_end || *expression_it != *path_it)
+            {
+                // Path ran out before the expression, or the characters differ.
+                return false;
+            }
+
+            // Literal match: advance both by one
+            ++path_it;
+            ++expression_it;
+        }
+
+        // The whole expression was consumed, so it matched a prefix of the path.
+        return true;
+    }
+
+    /* Render the rule as "Allow: <expression>" or "Disallow: <expression>". */
+    std::string Directive::str() const
+    {
+        const char* label = allowed_ ? "Allow: " : "Disallow: ";
+        std::stringstream out;
+        out << label << expression_;
+        return out.str();
+    }
+
+    /* Match `path` against this rule's whole stored expression. */
+    bool Directive::match(const std::string& path) const
+    {
+        const std::string& pattern = expression_;
+        return match(pattern.begin(), pattern.end(), path.begin(), path.end());
+    }
+
+}
--- a/src/directive.h
+++ b/src/directive.h
@ -0,0 +1,67 @@
+#ifndef DIRECTIVE_CPP_H
+#define DIRECTIVE_CPP_H
+
+// Included explicitly so the header is self-contained: the class below uses
+// size_t and std::string.
+#include <cstddef>
+#include <string>
+
+namespace Rep
+{
+
+    /**
+     * A single allow or disallow rule: a match expression, its priority and
+     * whether a match means "allowed".
+     */
+    class Directive
+    {
+    public:
+        /**
+         * The type of our priority value.
+         */
+        typedef size_t priority_t;
+
+        /**
+         * Default constructor disallowed.
+         */
+        Directive() = delete;
+
+        /**
+         * The input to this constructor must be stripped of comments and trailing
+         * whitespace.
+         */
+        Directive(const std::string& line, bool allowed);
+
+        /**
+         * The priority of the rule.
+         */
+        priority_t priority() const
+        {
+            return priority_;
+        }
+
+        /**
+         * Whether or not the provided path matches. The path is expected to be properly
+         * escaped.
+         */
+        bool match(const std::string& path) const;
+
+        /**
+         * Whether this rule is for an allow or a disallow.
+         */
+        bool allowed() const
+        {
+            return allowed_;
+        }
+
+        /**
+         * A printable form of the rule.
+         */
+        std::string str() const;
+
+    private:
+        std::string expression_;
+        priority_t priority_;
+        bool allowed_;
+
+        /**
+         * Return true if p_begin -> p_end matches the expression e_begin -> e_end.
+         */
+        bool match(const std::string::const_iterator& e_begin,
+                   const std::string::const_iterator& e_end,
+                   const std::string::const_iterator& p_begin,
+                   const std::string::const_iterator& p_end) const;
+    };
+
+}
+
+#endif
--- a/src/psl.cpp
+++ b/src/psl.cpp
@ -0,0 +1,183 @@
+#include <algorithm>
+#include <cctype>
+#include <fstream>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+#include "psl.h"
+#include "punycode.h"
+
+namespace Url
+{
+    // Sentinel (the empty string) returned by getLastSegments() when the hostname
+    // has fewer segments than were requested.
+    const std::string PSL::not_found = "";
+
+    /* Read public-suffix rules from `stream`, one per line.
+     *
+     * Each line is cut at its first whitespace character. Blank lines and lines
+     * starting with "//" are skipped. "*.<host>" is a wildcard rule and
+     * "!<host>" an exception rule; anything else is a plain rule.
+     *
+     * Throws std::invalid_argument for a wildcard rule that is not of the form
+     * "*.<host>" and for an exception rule with nothing after the '!'. */
+    PSL::PSL(std::istream& stream)
+    {
+        std::string line;
+        while (std::getline(stream, line))
+        {
+            // Only take up to the first whitespace. The character is converted to
+            // unsigned char first: rule lines may hold non-ASCII (UTF-8) bytes and
+            // passing a negative char to isspace is undefined behaviour.
+            auto it = std::find_if(line.begin(), line.end(), [](char c) {
+                return std::isspace(static_cast<unsigned char>(c)) != 0;
+            });
+            line.resize(it - line.begin());
+
+            // Skip blank lines
+            if (line.empty())
+            {
+                continue;
+            }
+
+            // Skip comments
+            if (line.compare(0, 2, "//") == 0)
+            {
+                continue;
+            }
+
+            // We know the line has at least a single character at this point
+            if (line[0] == '*')
+            {
+                // Line is a wildcard rule
+                if (line.size() <= 2 || line[1] != '.')
+                {
+                    throw std::invalid_argument("Wildcard rule must be of form *.<host>");
+                }
+
+                // Store without the leading "*." and one level deeper.
+                add(line, 1, 2);
+            }
+            else if (line[0] == '!')
+            {
+                // Line is an exception, take all but the !
+                if (line.size() <= 1)
+                {
+                    throw std::invalid_argument("Exception rule has no hostname.");
+                }
+
+                // Store without the leading '!' and one level shallower.
+                add(line, -1, 1);
+            }
+            else
+            {
+                add(line, 0, 0);
+            }
+        }
+    }
+
+    /* Build a PSL from the rules file at `path`.
+     * Throws std::invalid_argument when the file cannot be opened for reading. */
+    PSL PSL::fromPath(const std::string& path)
+    {
+        std::ifstream input(path);
+        if (input.good())
+        {
+            return PSL(input);
+        }
+
+        std::stringstream message;
+        message << "Path '" << path << "' inaccessible.";
+        throw std::invalid_argument(message.str());
+    }
+
+    /* Build a PSL from rules held in memory, one rule per line of `str`. */
+    PSL PSL::fromString(const std::string& str)
+    {
+        std::stringstream rules(str);
+        PSL parsed(rules);
+        return parsed;
+    }
+
+    /* The last N segments of `hostname`, where N is its TLD length. */
+    std::string PSL::getTLD(const std::string& hostname) const
+    {
+        const size_t tld_segments = getTLDLength(hostname);
+        return getLastSegments(hostname, tld_segments);
+    }
+
+    /* The TLD of `hostname` plus the one segment in front of it. */
+    std::string PSL::getPLD(const std::string& hostname) const
+    {
+        const size_t pld_segments = getTLDLength(hostname) + 1;
+        return getLastSegments(hostname, pld_segments);
+    }
+
+    /* Both results at once, as (TLD, PLD), computing the TLD length only once. */
+    std::pair<std::string, std::string> PSL::getBoth(const std::string& hostname) const
+    {
+        const size_t tld_segments = getTLDLength(hostname);
+        std::string tld = getLastSegments(hostname, tld_segments);
+        std::string pld = getLastSegments(hostname, tld_segments + 1);
+        return std::make_pair(tld, pld);
+    }
+
+    /* Number of segments in the TLD of `hostname`.
+     *
+     * The lower-cased, reversed hostname is looked up in `levels`. On a miss the
+     * part after the last '.' of the reversed string (the leftmost label of the
+     * hostname) is dropped and the lookup is repeated. Returns 1 when no rule
+     * matches. */
+    size_t PSL::getTLDLength(const std::string& hostname) const
+    {
+        // Reversed copy of hostname
+        std::string tld(hostname.rbegin(), hostname.rend());
+        // Convert through unsigned char: hostnames may hold non-ASCII (UTF-8)
+        // bytes and passing a negative char to tolower is undefined behaviour.
+        std::transform(tld.begin(), tld.end(), tld.begin(), [](char c) {
+            return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
+        });
+
+        while (tld.size())
+        {
+            auto it = levels.find(tld);
+            if (it != levels.end())
+            {
+                return it->second;
+            }
+
+            size_t position = tld.rfind('.');
+            if (position == std::string::npos || position == 0)
+            {
+                // No further label to strip: end the search.
+                tld.resize(0);
+            }
+            else
+            {
+                tld.resize(position);
+            }
+        }
+
+        return 1;
+    }
+
+    /* Return the last `segments` dot-separated segments of `hostname`, lower-cased.
+     *
+     * Returns not_found when `segments` is 0 or when the hostname has fewer
+     * segments than requested. Throws std::invalid_argument when the result
+     * starts with '.', i.e. the hostname contained an empty segment. */
+    std::string PSL::getLastSegments(const std::string& hostname, size_t segments) const
+    {
+        // Zero segments is the empty result. Without this guard the substring
+        // below would start at hostname.size() + 1 and throw std::out_of_range.
+        if (segments == 0)
+        {
+            return not_found;
+        }
+
+        size_t position = hostname.size();
+        size_t remaining = segments;
+        while (remaining != 0 && position && position != std::string::npos)
+        {
+            position = hostname.rfind('.', position - 1);
+            remaining -= 1;
+        }
+
+        if (remaining >= 1)
+        {
+            return not_found;
+        }
+
+        // Return the whole string if position == std::string::npos
+        size_t start = (position == std::string::npos) ? 0 : position + 1;
+
+        std::string result(hostname, start);
+        // Convert through unsigned char: passing a negative char (any non-ASCII
+        // byte) to tolower is undefined behaviour.
+        std::transform(result.begin(), result.end(), result.begin(), [](char c) {
+            return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
+        });
+
+        // Leading .'s indicate that the query had an empty segment
+        if (result.size() && result[0] == '.')
+        {
+            std::stringstream message;
+            message << "Empty segment in " << result;
+            throw std::invalid_argument(message.str());
+        }
+
+        return result;
+    }
+
+    /* Number of dot-separated segments in `hostname`: one more than the number of
+     * '.' characters, so an empty string counts as one segment. */
+    size_t PSL::countSegments(const std::string& hostname) const
+    {
+        const auto dots = std::count(hostname.begin(), hostname.end(), '.');
+        return 1 + static_cast<size_t>(dots);
+    }
+
+    /* Record `rule` in `levels` under two keys, both stored reversed and with the
+     * first `trim` characters of the rule removed: the rule as given and its
+     * punycode-encoded form. Both keys map to the rule's segment count plus
+     * `level_adjust`. Note that `rule` is overwritten with its punycoded form. */
+    void PSL::add(std::string& rule, int level_adjust, size_t trim)
+    {
+        // Key for the rule as written (unpunycoded).
+        const std::string plain_key(rule.rbegin(), rule.rend() - trim);
+        const size_t level = countSegments(plain_key) + level_adjust;
+        levels[plain_key] = level;
+
+        // Key for the punycoded spelling of the same rule.
+        rule = Punycode::encodeHostname(rule);
+        const std::string encoded_key(rule.rbegin(), rule.rend() - trim);
+        levels[encoded_key] = level;
+    }
+
+};
--- a/src/psl.h
+++ b/src/psl.h
@ -0,0 +1,102 @@
+#ifndef PSL_CPP_H
+#define PSL_CPP_H
+
+#include <istream>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+namespace Url
+{
+
+    /**
+     * Find TLDs and PLDs of a hostname according to a PSL.
+     */
+    struct PSL
+    {
+        /**
+         * Indicates that there is no TLD / PLD
+         */
+        static const std::string not_found;
+
+        /**
+         * Read a PSL from an istream.
+         */
+        PSL(std::istream& stream);
+
+        /**
+         * Construct a PSL with no rules.
+         */
+        PSL(): levels() { };
+
+        /**
+         * Copy the rules of another PSL.
+         */
+        PSL(const PSL& other): levels(other.levels) { }
+
+        /**
+         * Replace this PSL's rules with a copy of another PSL's rules.
+         */
+        PSL& operator=(const PSL& other)
+        {
+            levels = other.levels;
+            return *this;
+        }
+
+        /**
+         * Read the provided path holding a set of PSL rules.
+         */
+        static PSL fromPath(const std::string& path);
+
+        /**
+         * Create a PSL object from a string.
+         */
+        static PSL fromString(const std::string& str);
+
+        /**
+         * Get just the TLD of the hostname.
+         *
+         * Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If
+         * some segments have been appropriately punycoded and others not, it may return
+         * a wrong answer. If a punycoded host is provided, a punycoded response is
+         * returned. If an unpunycoded host is provided, an unpunycoded response is
+         * returned.
+         */
+        std::string getTLD(const std::string& hostname) const;
+
+        /**
+         * Get just the PLD of the hostname.
+         *
+         * Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If
+         * some segments have been appropriately punycoded and others not, it may return
+         * a wrong answer. If a punycoded host is provided, a punycoded response is
+         * returned. If an unpunycoded host is provided, an unpunycoded response is
+         * returned.
+         */
+        std::string getPLD(const std::string& hostname) const;
+
+        /**
+         * Get the (TLD, PLD) of the hostname.
+         *
+         * Works if the hostname is _either_ punycoded or unpunycoded, but not mixed. If
+         * some segments have been appropriately punycoded and others not, it may return
+         * a wrong answer. If a punycoded host is provided, a punycoded response is
+         * returned. If an unpunycoded host is provided, an unpunycoded response is
+         * returned.
+         */
+        std::pair<std::string, std::string> getBoth(const std::string& hostname) const;
+    private:
+        // Mapping of a string rule to its level
+        std::unordered_map<std::string, size_t> levels;
+
+        // Return the number of segments in a hostname
+        size_t countSegments(const std::string& hostname) const;
+
+        // Return the number of segments in the TLD of the provided hostname
+        size_t getTLDLength(const std::string& hostname) const;
+
+        // Return the last `segments` segments of a hostname
+        std::string getLastSegments(const std::string& hostname, size_t segments) const;
+
+        /**
+         * Add the provided host as a rule, trimming `trim` characters off the
+         * front, and adjusting the level by `level_adjust`.
+         */
+        void add(std::string& host, int level_adjust, size_t trim);
+    };
+
+}
+
+#endif
--- a/src/punycode.cpp
+++ b/src/punycode.cpp
@ -0,0 +1,409 @@
+#include <algorithm>
+#include <string>
+#include <iostream>
+
+#include "punycode.h"
+#include "utf8.h"
+
+namespace Url
+{
+
+    /**
+     * Punycode-encode `str` in place, following the encoding procedure of
+     * RFC 3492 section 6.3.
+     *
+     * The input is read as a sequence of codepoints via Utf8::readCodepoint.
+     * Codepoints below 0x80 are copied to the output verbatim, followed by a
+     * '-' delimiter when at least one was copied; the remaining codepoints are
+     * then emitted as variable-length base-36 integers. No "xn--" prefix is
+     * added here.
+     *
+     * Returns a reference to `str`, which now holds the encoded text. Throws
+     * std::invalid_argument if updating delta would overflow.
+     */
+    std::string& Punycode::encode(std::string& str)
+    {
+        // Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.3
+        //
+        // let n = initial_n
+        // let delta = 0
+        // let bias = initial_bias
+        punycode_uint n = INITIAL_N;
+        punycode_uint delta = 0;
+        punycode_uint bias = INITIAL_BIAS;
+        std::string output;
+
+        // Collect every codepoint of the input (basic and non-basic alike); the
+        // basic ones are additionally copied straight to the output. The loop has
+        // no increment of its own, so readCodepoint is expected to advance `it`.
+        std::vector<punycode_uint> codepoints;
+        for (auto it = str.cbegin(); it != str.cend(); )
+        {
+            Utf8::codepoint_t value = Utf8::readCodepoint(it, str.cend());
+            if (value < 0x80)
+            {
+                // copy them to the output in order
+                output.append(1, static_cast<char>(value));
+            }
+            codepoints.push_back(value);
+        }
+
+        // let h = b = the number of basic code points in the input
+        // (h counts the codepoints handled so far, b stays fixed)
+        size_t h = output.size();
+        size_t b = h;
+
+        // copy a delimiter if b > 0
+        if (b > 0)
+        {
+            output.append(1, '-');
+        }
+    
+        // while h < length(input) do begin
+        while (h < codepoints.size())
+        {
+            // let m = the minimum {non-basic} code point >= n in the input
+            punycode_uint m = MAX_PUNYCODE_UINT;
+            for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
+            {
+                if ((*it >= n) && (*it < m))
+                {
+                    m = *it;
+                }
+            }
+
+            // let delta = delta + (m - n) * (h + 1), fail on overflow
+            // (the check is rearranged as a division so it cannot itself overflow)
+            if ((m - n) > ((MAX_PUNYCODE_UINT - delta) / (h + 1)))
+            {
+                throw std::invalid_argument("Overflow delta update.");
+            }
+            delta += (m - n) * (h + 1);
+
+            // let n = m    
+            n = m;
+
+            // for each code point c in the input (in order) do begin
+            for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
+            {
+                // if c < n {or c is basic} then increment delta, fail on overflow
+                if (*it < n)
+                {
+                    if (delta == MAX_PUNYCODE_UINT)
+                    {
+                        throw std::invalid_argument("Overflow delta increment.");
+                    }
+                    ++delta;
+                }
+
+                // if c == n then begin
+                if (*it == n)
+                {
+                    // let q = delta
+                    punycode_uint q = delta;
+
+                    // for k = base to infinity in steps of base do begin
+                    for (punycode_uint k = BASE; ; k += BASE)
+                    {
+                        // let t = tmin if k <= bias {+ tmin}, or
+                        //         tmax if k >= bias + tmax, or k - bias otherwise
+                        punycode_uint t = k <= bias ? TMIN :
+                                          k >= bias + TMAX ? TMAX : k - bias;
+                        
+                        // if q < t then break
+                        if (q < t)
+                        {
+                            break;
+                        }
+
+                        // output the code point for digit t + ((q - t) mod (base - t))
+                        output.append(1, DIGIT_TO_BASIC[t + ((q - t) % (BASE - t))]);
+
+                        // let q = (q - t) div (base - t)
+                        q = (q - t) / (BASE - t);
+                    }
+            
+                    // output the code point for digit q
+                    output.append(1, DIGIT_TO_BASIC[q]);
+
+                    // let bias = adapt(delta, h + 1, test h equals b?)
+                    bias = adapt(delta, h + 1, h == b);
+
+                    // let delta = 0
+                    delta = 0;
+
+                    // increment h
+                    ++h;
+            
+                }
+            }
+            
+            // increment delta and n
+            ++delta;
+            ++n;
+        }
+ 
+        // Overwrite the caller's string with the encoded form
+        str.assign(output);
+        return str;
+    }
+
+    /**
+     * Non-mutating encode: punycode a private copy of `str` and return it,
+     * leaving the caller's string untouched.
+     */
+    std::string Punycode::encode(const std::string& str)
+    {
+        // `working` is a non-const lvalue, so this resolves to the in-place
+        // overload, whose returned reference is copied out as the result.
+        std::string working(str);
+        return encode(working);
+    }
+
+    /**
+     * Punycode a hostname label by label.
+     *
+     * The hostname is split on '.'; every label containing a byte with the
+     * high bit set is replaced by "xn--" followed by its punycode form, and all
+     * other labels are copied through unchanged. A hostname with no such bytes
+     * is returned as-is.
+     */
+    std::string Punycode::encodeHostname(const std::string& hostname)
+    {
+        // Fast path: nothing to encode anywhere in the hostname
+        if (!needsPunycoding(hostname))
+        {
+            return hostname;
+        }
+
+        std::string result;
+        size_t label_start = 0;
+        for (;;)
+        {
+            // When no further dot exists, `dot` is npos and substr() clamps the
+            // length to the remainder of the hostname.
+            const size_t dot = hostname.find('.', label_start);
+            std::string label = hostname.substr(label_start, dot - label_start);
+
+            if (needsPunycoding(label))
+            {
+                result.append("xn--");
+                result.append(Punycode::encode(label));
+            }
+            else
+            {
+                result.append(label);
+            }
+
+            // The label just handled was the last one
+            if (dot == std::string::npos)
+            {
+                return result;
+            }
+
+            result.append(1, '.');
+            label_start = dot + 1;
+        }
+    }
+
+    /**
+     * Decode punycoded `str` in place, following the decoding procedure of
+     * RFC 3492 section 6.2. The input must not carry the "xn--" prefix.
+     *
+     * Everything before the last '-' is taken as literal basic code points; the
+     * rest is read as a series of variable-length base-36 integers, each of
+     * which yields one codepoint and its insertion position. The resulting
+     * codepoints are serialised with Utf8::writeCodepoint.
+     *
+     * Returns a reference to `str`, which now holds the decoded text. Throws
+     * std::invalid_argument when a non-basic code point precedes the delimiter,
+     * when the input ends in the middle of an integer, when a character is not
+     * a base-36 digit, or when an intermediate value would overflow.
+     */
+    std::string& Punycode::decode(std::string& str)
+    {
+        // Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.2
+        //
+        // let n = initial_n
+        // let i = 0
+        // let bias = initial_bias
+        // let output = an empty string indexed from 0
+        punycode_uint n = INITIAL_N;
+        punycode_uint i = 0;
+        punycode_uint bias = INITIAL_BIAS;
+        std::vector<punycode_uint> codepoints;
+
+        // Position of the last delimiter; 0 means "no basic code points"
+        size_t index = str.rfind('-');
+        if (index == std::string::npos)
+        {
+            index = 0;
+        }
+
+        // consume all code points before the last delimiter (if there is one)
+        // and copy them to output, fail on any non-basic code point
+        for (auto it = str.begin(); it != (str.begin() + index); ++it)
+        {
+            if (static_cast<unsigned char>(*it) > 127U)
+            {
+                throw std::invalid_argument("Argument has non-basic code points.");
+            }
+            codepoints.push_back(*it);
+        }
+
+        // if more than zero code points were consumed then consume one more
+        //   (which will be the last delimiter)
+        if (index > 0)
+        {
+            index += 1;
+        }
+
+        // while the input is not exhausted do begin
+        for (auto it = (str.begin() + index); it != str.end(); ++it)
+        {
+            // let oldi = i
+            // let w = 1
+            punycode_uint oldi = i;
+            punycode_uint w = 1;
+
+            // for k = base to infinity in steps of base do begin
+            // (on `break`, `it` is left on the final digit of this integer; the
+            // outer loop's ++it then steps past it)
+            for (punycode_uint k = BASE; ; k += BASE, ++it)
+            {
+                // consume a code point, or fail if there was none to consume
+                if (it == str.end())
+                {
+                    throw std::invalid_argument("Premature termination");
+                }
+                
+                // let digit = the code point's digit-value, fail if it has none
+                //
+                // Index by the unsigned byte value: BASIC_TO_DIGIT has 256
+                // entries, and converting a negative `char` straight to size_t
+                // would produce a huge out-of-range index.
+                int lookup = BASIC_TO_DIGIT[static_cast<unsigned char>(*it)];
+                if (lookup == -1)
+                {
+                    throw std::invalid_argument("Invalid base 36 character.");
+                }
+                unsigned char digit = static_cast<unsigned char>(lookup);
+
+                // let i = i + digit * w, fail on overflow
+                if (digit > ((MAX_PUNYCODE_UINT - i) / w))
+                {
+                    throw std::invalid_argument("Overflow on i.");
+                }
+                i += digit * w;
+
+                // let t = tmin if k <= bias {+ tmin}, or
+                //         tmax if k >= bias + tmax, or k - bias otherwise
+                punycode_uint t = k <= bias ? TMIN :
+                                  k >= bias + TMAX ? TMAX : k - bias;
+
+                // if digit < t then break
+                if (digit < t)
+                {
+                    break;
+                }
+                
+                // let w = w * (base - t), fail on overflow
+                if (w > (MAX_PUNYCODE_UINT / (BASE - t)))
+                {
+                    // I believe this line is unreachable without first overflowing i.
+                    // Since 'i' is updated above as i += digit * w, and w is updated as
+                    // w = w * (BASE - t), we should like to keep (BASE - t) > digit to
+                    // give 'w' a chance to overflow first. To keep t minimized, we must
+                    // have 'bias' maximized. `bias` is driven by the 'adapt' function
+                    // below.
+                    //
+                    // The value returned by 'adapt' increases with the input delta, and
+                    // decreases with the input size. The delta is a function of the input
+                    // size as well, on the order of (delta_n * input size), and
+                    // legitimate delta_n values are limited to 0x10FFFF (the maximum
+                    // unicode codepoint). Even setting that aside, the maximum value that
+                    // adapt() can return is adapt(2 ** 32 - 1, 1, false) = 204.
+                    //
+                    // Using this bias, we could use the input (HERE) to get iterations:
+                    //
+                    //     digit = b = 1, i = 2, k = 36, t = 1, w = 35
+                    //     digit = b = 1, i = 37, k = 72, t = 1, w = 1225
+                    //     digit = b = 1, i = 1262, k = 108, t = 1, w = 42875
+                    //     digit = b = 1, i = 44137, k = 144, t = 1, w = 1500625
+                    //     digit = b = 1, i = 1544762, k = 180, t = 1, w = 52521875
+                    //
+                    // At this point, t now becomes TMAX (26) because k exceeds the bias
+                    // (since the maximum bias is 204). As such, the minimum continuation
+                    // value is 26:
+                    //
+                    //     digit = 0 = 26, i = 1367113512, k = 216, t = 26, w = 525218750
+                    //
+                    // However, the next iteration now overflows i before we can get to
+                    // the w update.
+                    throw std::invalid_argument("Overflow on w."); // LCOV_EXCL_LINE
+                }
+                w *= (BASE - t);
+            }
+            
+            // let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?)
+            bias = adapt(i - oldi, codepoints.size() + 1, oldi == 0);
+            
+            // let n = n + i div (length(output) + 1), fail on overflow
+            if ((i / (codepoints.size() + 1)) > (MAX_PUNYCODE_UINT - n))
+            {
+                throw std::invalid_argument("Overflow on n.");
+            }
+            n += i / (codepoints.size() + 1);
+
+            // let i = i mod (length(output) + 1)
+            i %= (codepoints.size() + 1);
+
+            // insert n into output at position i
+            codepoints.insert(codepoints.begin() + i, n);
+
+            // increment i
+            ++i;
+        }
+
+        // Serialise the decoded codepoints and overwrite the caller's string
+        std::string output;
+        for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
+        {
+            Utf8::writeCodepoint(output, *it);
+        }
+        str.assign(output);
+
+        return str;
+    }
+
+    /**
+     * Non-mutating decode: decode a private copy of `str` and return it,
+     * leaving the caller's string untouched.
+     */
+    std::string Punycode::decode(const std::string& str)
+    {
+        // `working` is a non-const lvalue, so this resolves to the in-place
+        // overload, whose returned reference is copied out as the result.
+        std::string working(str);
+        return decode(working);
+    }
+
+    /**
+     * Decode a hostname label by label.
+     *
+     * The hostname is split on '.'; every label starting with exactly "xn--"
+     * has that prefix removed and the remainder punycode-decoded, while all
+     * other labels are copied through unchanged.
+     */
+    std::string Punycode::decodeHostname(const std::string& hostname)
+    {
+        std::string result;
+        size_t label_start = 0;
+        for (;;)
+        {
+            // When no further dot exists, `dot` is npos and substr() clamps the
+            // length to the remainder of the hostname.
+            const size_t dot = hostname.find('.', label_start);
+            std::string label = hostname.substr(label_start, dot - label_start);
+
+            if (label.compare(0, 4, "xn--") == 0)
+            {
+                // Drop the prefix, then decode what is left in place
+                label.erase(0, 4);
+                Punycode::decode(label);
+            }
+            result.append(label);
+
+            // The label just handled was the last one
+            if (dot == std::string::npos)
+            {
+                return result;
+            }
+
+            result.append(1, '.');
+            label_start = dot + 1;
+        }
+    }
+
+    /**
+     * Return true when any byte of `str` has its high bit set, i.e. the string
+     * is not made up solely of 7-bit characters.
+     */
+    bool Punycode::needsPunycoding(const std::string& str)
+    {
+        for (const char byte : str)
+        {
+            if ((static_cast<unsigned char>(byte) & 0x80) != 0)
+            {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Bias adaptation function from RFC 3492 section 6.1
+     * (https://tools.ietf.org/html/rfc3492#section-6.1).
+     *
+     * `delta` and the local `k` are modified freely: delta is passed by value,
+     * and the RFC notes that callers never read their own variables of the same
+     * name again before overwriting them.
+     */
+    Punycode::punycode_uint Punycode::adapt(
+        punycode_uint delta, punycode_uint numpoints, bool firsttime)
+    {
+        // Scaling stops once delta is no larger than ((base - tmin) * tmax) div 2
+        const unsigned int threshold = ((BASE - TMIN) * TMAX) / 2;
+
+        // if firsttime then let delta = delta div damp
+        // else let delta = delta div 2
+        if (firsttime)
+        {
+            delta = delta / DAMP;
+        }
+        else
+        {
+            delta = delta >> 1;
+        }
+
+        // let delta = delta + (delta div numpoints)
+        delta += (delta / numpoints);
+
+        // let k = 0
+        // while delta > threshold: delta = delta div (base - tmin), k = k + base
+        punycode_uint k = 0;
+        while (delta > threshold)
+        {
+            delta /= (BASE - TMIN);
+            k += BASE;
+        }
+
+        // return k + (((base - tmin + 1) * delta) div (delta + skew))
+        return k + (((BASE - TMIN + 1) * delta) / (delta + SKEW));
+    }
+
+};
--- a/src/punycode.h
+++ b/src/punycode.h
@ -0,0 +1,105 @@
+#ifndef PUNYCODE_CPP_H
+#define PUNYCODE_CPP_H
+
+#include <cstdint>
+#include <limits>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "utf8.h"
+
+namespace Url
+{
+
+    /**
+     * Punycode (RFC 3492) encoding and decoding of utf-8 strings and hostnames.
+     */
+    namespace Punycode
+    {
+        // Integer type used for codepoints and for the algorithm's arithmetic
+        typedef Utf8::codepoint_t punycode_uint;
+
+        // Bootstring parameter values for Punycode; these match the values
+        // listed in RFC 3492 section 5.
+        const unsigned int BASE          = 36;
+        const unsigned int TMIN          = 1;
+        const unsigned int TMAX          = 26;
+        const unsigned int SKEW          = 38;
+        const unsigned int DAMP          = 700;
+        const unsigned int INITIAL_BIAS  = 72;
+        const unsigned int INITIAL_N     = 128;
+
+        // Codepoints to their base-36 value
+        //
+        // 256 entries, one per possible byte value: 'A'-'Z' and 'a'-'z' both map
+        // to 0-25, '0'-'9' map to 26-35, and every other byte maps to -1.
+        const std::vector<int8_t> BASIC_TO_DIGIT = {
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
+
+            -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
+            15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
+
+            -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
+            15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
+
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+        };
+        // Base-36 digit value (0-35) to the lowercase character that encodes it
+        const std::string DIGIT_TO_BASIC = "abcdefghijklmnopqrstuvwxyz0123456789";
+
+        // The largest value a punycode_uint can hold. This is the limit of the
+        // integer type rather than the highest Unicode codepoint; an alternative
+        // bound, Utf8::MAX_CODEPOINT, is left commented out below.
+        const punycode_uint MAX_PUNYCODE_UINT = std::numeric_limits<punycode_uint>::max();
+        //Utf8::MAX_CODEPOINT;
+        //std::numeric_limits<punycode_uint>::max();
+
+        /**
+         * Replace the utf-8-encoded contents of str with their punycode
+         * encoding, in place, and return a reference to str.
+         */
+        std::string& encode(std::string& str);
+
+        /**
+         * Create a new punycoded string from utf-8-encoded input.
+         */
+        std::string encode(const std::string& str);
+
+        /**
+         * Encode a hostname.
+         */
+        std::string encodeHostname(const std::string& hostname);
+
+        /**
+         * Replace the punycoded contents of str with their utf-8 decoding, in
+         * place, and return a reference to str.
+         */
+        std::string& decode(std::string& str);
+
+        /**
+         * Create a new utf-8-encoded string from punycoded input.
+         */
+        std::string decode(const std::string& str);
+
+        /**
+         * Decode a hostname.
+         */
+        std::string decodeHostname(const std::string& hostname);
+
+        /**
+         * Determine if a string needs punycoding.
+         */
+        bool needsPunycoding(const std::string& str);
+
+        /**
+         * Internal function for calculating bias.
+         */
+        punycode_uint adapt(
+            punycode_uint delta, punycode_uint numpoints, bool firsttime);
+
+    };
+
+}
+
+#endif
--- a/src/repmain.cpp
+++ b/src/repmain.cpp
@ -0,0 +1,26 @@
+#include <Rcpp.h>
+using namespace Rcpp;
+
+#include "url.h"
+#include "robots.h"
+
+//' Parse robots.txt
+//'
+//' @noRd
+//'
+// [[Rcpp::export]]
+SEXP rep_parse(std::string content) {
+  // Parse the robots.txt text on the heap, then wrap the object in an
+  // external pointer so it can be handed back to R.
+  Rep::Robots* parsed = new Rep::Robots(content);
+  return Rcpp::XPtr<Rep::Robots>(parsed);
+}
+
+
+//' Path allowed
+//'
+//' @noRd
+//'
+// [[Rcpp::export]]
+bool rep_path_allowed(SEXP xp, std::string path, std::string agent = "*") {
+  // Recover the parsed robots.txt object from the external pointer and ask it
+  // whether `agent` may fetch `path`.
+  Rcpp::XPtr<Rep::Robots> robots(xp);
+  const bool permitted = robots->allowed(path, agent);
+  return permitted;
+}
--- a/src/robots.cpp
+++ b/src/robots.cpp
@ -0,0 +1,188 @@
+#include <algorithm>
+#include <functional>
+#include <cctype>
+#include <locale>
+#include <sstream>
+#include <iostream>
+#include <unordered_map>
+
+#include "url.h"
+
+#include "robots.h"
+#include <Rcpp.h>
+
+namespace Rep
+{
+
+    /**
+     * Remove leading and trailing whitespace from `string`, in place.
+     */
+    void Robots::strip(std::string& string)
+    {
+        // std::isspace has undefined behaviour for negative arguments other
+        // than EOF, so each char is widened through unsigned char first. A
+        // lambda also avoids std::not1 / std::ptr_fun, which were removed in
+        // C++17.
+        const auto not_space = [](char c) {
+            return !std::isspace(static_cast<unsigned char>(c));
+        };
+
+        // Erase everything before the first non-space character...
+        string.erase(string.begin(),
+            std::find_if(string.begin(), string.end(), not_space));
+        // ...and everything after the last one.
+        string.erase(
+            std::find_if(string.rbegin(), string.rend(), not_space).base(),
+            string.end());
+    }
+
+    /**
+     * Read the next well-formed "key: value" line from `stream`.
+     *
+     * Anything from a '#' onwards is discarded as a comment, and lines without
+     * a ':' are skipped. On success `key` holds the lowercased, stripped text
+     * before the first ':' and `value` the stripped text after it, and true is
+     * returned. Returns false once the stream is exhausted; `key` may have been
+     * overwritten by skipped lines in that case.
+     */
+    bool Robots::getpair(std::istringstream& stream, std::string& key, std::string& value)
+    {
+        // tolower has undefined behaviour for negative arguments other than
+        // EOF, so each char is widened through unsigned char first.
+        const auto lower = [](char c) {
+            return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
+        };
+
+        while (getline(stream, key))
+        {
+            // Drop a trailing comment, if any
+            size_t index = key.find('#');
+            if (index != std::string::npos)
+            {
+                key.resize(index);
+            }
+
+            // Find the colon and divide it into key and value, skipping malformed lines
+            index = key.find(':');
+            if (index == std::string::npos)
+            {
+                continue;
+            }
+
+            value.assign(key.begin() + index + 1, key.end());
+            key.resize(index);
+
+            // Strip whitespace off of each
+            strip(key);
+            strip(value);
+
+            // Lowercase the key
+            std::transform(key.begin(), key.end(), key.begin(), lower);
+
+            return true;
+        }
+        return false;
+    }
+
+    /**
+     * Parse the text of a robots.txt file.
+     *
+     * A leading UTF-8 byte-order mark is skipped. Directives are read with
+     * getpair(); "allow", "disallow" and "crawl-delay" lines are applied to the
+     * agent named by the most recent group of "user-agent" lines (the "*" agent
+     * until one is seen), and "sitemap" lines are collected separately. When
+     * several "user-agent" lines appear back to back, the rules are recorded
+     * under the first name and copied to the others once the group ends.
+     * Agent names are stored lowercased. An unparseable crawl-delay is reported
+     * on Rcpp::Rcout and otherwise ignored.
+     */
+    Robots::Robots(const std::string& content): agents_(), sitemaps_(), default_(agents_["*"])
+    {
+        // tolower has undefined behaviour for negative arguments other than
+        // EOF, so each char is widened through unsigned char first.
+        const auto lower = [](char c) {
+            return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
+        };
+
+        std::string agent_name("*");
+        std::istringstream input(content);
+        // Skip a leading UTF-8 byte-order mark
+        if (content.compare(0, 3, "\xEF\xBB\xBF") == 0)
+        {
+            input.ignore(3);
+        }
+        std::string key, value;
+        std::vector<std::string> group;
+        bool last_agent = false;
+
+        // Track the active agent by pointer rather than by iterator: inserting
+        // into agents_ below may rehash the map, which invalidates iterators
+        // but leaves pointers and references to its elements valid.
+        Agent* current = &default_;
+
+        while (Robots::getpair(input, key, value))
+        {
+            if (key.compare("user-agent") == 0)
+            {
+                // Store the user agent string as lowercased
+                std::transform(value.begin(), value.end(), value.begin(), lower);
+
+                if (last_agent)
+                {
+                    // Another name for the group that is currently open
+                    group.push_back(value);
+                }
+                else
+                {
+                    // A new group starts: hand the finished rules to the other
+                    // names of the previous group first.
+                    if (!agent_name.empty())
+                    {
+                        for (const std::string& other : group)
+                        {
+                            agents_[other] = *current;
+                        }
+                        group.clear();
+                    }
+                    agent_name = value;
+                    // emplace() keeps an existing entry, so repeated groups for
+                    // the same agent accumulate into one Agent
+                    current = &agents_.emplace(agent_name, Agent()).first->second;
+                }
+                last_agent = true;
+                continue;
+            }
+            else
+            {
+                last_agent = false;
+            }
+
+            if (key.compare("sitemap") == 0)
+            {
+                sitemaps_.push_back(value);
+            }
+            else if (key.compare("disallow") == 0)
+            {
+                current->disallow(value);
+            }
+            else if (key.compare("allow") == 0)
+            {
+                current->allow(value);
+            }
+            else if (key.compare("crawl-delay") == 0)
+            {
+                try
+                {
+                    current->delay(std::stof(value));
+                }
+                catch (const std::exception&)
+                {
+                    Rcpp::Rcout << "Could not parse " << value << " as float." << std::endl;
+                }
+            }
+        }
+
+        // Hand the rules of the final group to its remaining names
+        if (!agent_name.empty())
+        {
+            for (const std::string& other : group)
+            {
+                agents_[other] = *current;
+            }
+        }
+    }
+
+    /**
+     * Look up the rules for the agent called `name`, ignoring case. Falls back
+     * to the "*" agent when no entry exists under that name.
+     */
+    const Agent& Robots::agent(const std::string& name) const
+    {
+        // Lowercase the agent. tolower has undefined behaviour for negative
+        // arguments other than EOF, so each char is widened through unsigned
+        // char first.
+        std::string lowered(name);
+        std::transform(lowered.begin(), lowered.end(), lowered.begin(),
+            [](char c) {
+                return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
+            });
+
+        auto it = agents_.find(lowered);
+        if (it == agents_.end())
+        {
+            return default_;
+        }
+        else
+        {
+            return it->second;
+        }
+    }
+
+    /**
+     * Ask the rules that apply to agent `name` whether `path` may be fetched.
+     */
+    bool Robots::allowed(const std::string& path, const std::string& name) const
+    {
+        const Agent& rules = agent(name);
+        return rules.allowed(path);
+    }
+
+    /**
+     * Render the agent map as text: a brace-delimited, comma-separated list of
+     * "name": <agent.str()> entries, in the map's iteration order.
+     */
+    std::string Robots::str() const
+    {
+        std::ostringstream out;
+        // TODO: include sitepath info
+        out << '{';
+
+        // Empty before the first entry, ", " before every later one
+        const char* separator = "";
+        for (auto it = agents_.begin(); it != agents_.end(); ++it)
+        {
+            out << separator << '"' << it->first << '"' << ": " << it->second.str();
+            separator = ", ";
+        }
+
+        out << '}';
+        return out.str();
+    }
+
+    /**
+     * Build the robots.txt URL that corresponds to `url`.
+     *
+     * The input is parsed with Url::Url; its userinfo, params, query and
+     * fragment are set to the empty string, its path is set to "robots.txt",
+     * and the result is serialised with str().
+     */
+    std::string Robots::robotsUrl(const std::string& url)
+    {
+        // NOTE(review): the chain relies on each setter returning the Url it was
+        // called on; remove_default_port() presumably drops a port that equals
+        // the scheme's default. Confirm both against url.h.
+        return Url::Url(url)
+            .setUserinfo("")
+            .setPath("robots.txt")
+            .setParams("")
+            .setQuery("")
+            .setFragment("")
+            .remove_default_port()
+            .str();
+    }
+}
--- a/src/robots.h
+++ b/src/robots.h
@ -0,0 +1,69 @@
+#ifndef ROBOTS_CPP_H
+#define ROBOTS_CPP_H
+
+#include <sstream>
+#include <unordered_map>
+#include <vector>
+
+#include "agent.h"
+
+namespace Rep
+{
+
+    /**
+     * The parsed contents of a robots.txt file: a set of per-agent rules keyed
+     * by agent name, plus the list of sitemap entries.
+     */
+    class Robots
+    {
+    public:
+        typedef std::unordered_map<std::string, Agent> agent_map_t;
+        typedef std::vector<std::string> sitemaps_t;
+
+        /**
+         * Create a robots.txt from a utf-8-encoded string.
+         */
+        Robots(const std::string& content);
+
+        /**
+         * Instantiate a Robots object from an existing agent map and sitemap
+         * list. A "*" entry is created in the copied map if it has none.
+         */
+        Robots(
+            const agent_map_t& agents,
+            const sitemaps_t& sitemaps)
+            : agents_(agents)
+            , sitemaps_(sitemaps)
+            , default_(agents_["*"]) {}
+
+        /**
+         * Copy a Robots object.
+         *
+         * Defined explicitly because default_ is a reference into this
+         * object's own agents_ map: the implicitly generated copy would bind
+         * it to the source object's map instead, leaving it dangling once the
+         * source is destroyed.
+         */
+        Robots(const Robots& other)
+            : agents_(other.agents_)
+            , sitemaps_(other.sitemaps_)
+            , default_(agents_["*"]) {}
+
+        /**
+         * Get the sitemaps in this robots.txt
+         */
+        const sitemaps_t& sitemaps() const { return sitemaps_; }
+
+        /**
+         * Get the agent with the corresponding name.
+         */
+        const Agent& agent(const std::string& name) const;
+
+        /**
+         * Return true if agent is allowed to fetch the URL (either a
+         * full URL or a path).
+         */
+        bool allowed(const std::string& path, const std::string& name) const;
+
+        /**
+         * Return a textual rendering of the per-agent rules.
+         */
+        std::string str() const;
+
+        /**
+         * Return the robots.txt URL corresponding to the provided URL.
+         */
+        static std::string robotsUrl(const std::string& url);
+
+    private:
+        // Trim leading and trailing whitespace in place
+        static void strip(std::string& string);
+
+        // Read the next "key: value" line from the stream
+        static bool getpair(
+            std::istringstream& stream, std::string& key, std::string& value);
+
+        // Declaration order matters: default_ is initialised from agents_, so
+        // agents_ must be declared (and therefore constructed) first.
+        agent_map_t agents_;
+        sitemaps_t sitemaps_;
+        // The "*" entry of agents_, used when no named agent matches
+        Agent& default_;
+    };
+}
+
+#endif
--- a/src/url.cpp
+++ b/src/url.cpp
@ -0,0 +1,962 @@
+#include <algorithm>
+#include <string>
+#include <iterator>
+#include <unordered_map>
+#include <unordered_set>
+#include <iostream>
+#include <iterator>
+#include <sstream>
+
+#include "url.h"
+#include "punycode.h"
+
+namespace Url
+{
+
+    /* Character classes */
+    // Definitions of the static CharacterClass members of Url. The names and
+    // contents appear to follow the character sets of RFC 3986 (TODO confirm).
+    // Later definitions are built from earlier ones, so the order of the
+    // definitions in this translation unit matters.
+    const CharacterClass Url::GEN_DELIMS(":/?#[]@");
+    const CharacterClass Url::SUB_DELIMS("!$&'()*+,;=");
+    const CharacterClass Url::DIGIT("0123456789");
+    const CharacterClass Url::ALPHA(
+        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
+    const CharacterClass Url::UNRESERVED(
+        Url::ALPHA.chars() + Url::DIGIT.chars() + "-._~");
+    const CharacterClass Url::RESERVED(
+        Url::GEN_DELIMS.chars() + Url::SUB_DELIMS.chars());
+    const CharacterClass Url::PCHAR(
+        Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":@");
+    // Characters escape() leaves unescaped in the path component.
+    const CharacterClass Url::PATH(
+        Url::PCHAR.chars() + "/");
+    // Characters escape() leaves unescaped in the query and params components.
+    const CharacterClass Url::QUERY(
+        Url::PCHAR.chars() + "/?");
+    const CharacterClass Url::FRAGMENT(
+        Url::PCHAR.chars() + "/?");
+    // Characters escape() leaves unescaped in the userinfo component.
+    const CharacterClass Url::USERINFO(
+        Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":");
+    // Upper-case hex digits; chars() is indexed by nibble value when escaping.
+    const CharacterClass Url::HEX("0123456789ABCDEF");
+    // Characters accepted in a would-be scheme by the Url constructor.
+    const CharacterClass Url::SCHEME(
+        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+-.");
+    // Lookup table with 256 entries, one per byte value: the numeric value of
+    // the byte when read as a hexadecimal digit ('0'-'9', 'A'-'F', 'a'-'f'),
+    // or -1 if the byte is not a hex digit. Each row below covers 16 byte
+    // values; the table must be indexed with a value in [0, 255].
+    const std::vector<signed char> Url::HEX_TO_DEC = {
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+         0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1,
+
+        -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+
+        -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+    };
+    // Default port for each scheme; remove_default_port() resets port_ to 0
+    // when it equals the entry for the URL's scheme.
+    const std::unordered_map<std::string, int> Url::PORTS = {
+        {"http", 80},
+        {"https", 443}
+    };
+    // Schemes for which relative_to() resolves a URL against a base; a URL
+    // whose scheme is not listed is returned unchanged. The empty string
+    // covers URLs that have no scheme.
+    const std::unordered_set<std::string> Url::USES_RELATIVE = {
+        "",
+        "file",
+        "ftp",
+        "gopher",
+        "http",
+        "https",
+        "imap",
+        "mms",
+        "nntp",
+        "prospero",
+        "rtsp",
+        "rtspu",
+        "sftp",
+        "shttp",
+        "svn",
+        "svn+ssh",
+        "wais"
+    };
+    // Schemes that str() serializes as "scheme://"; any other non-empty
+    // scheme is serialized as "scheme:".
+    const std::unordered_set<std::string> Url::USES_NETLOC = {
+        "",
+        "file",
+        "ftp",
+        "git",
+        "git+ssh",
+        "gopher",
+        "http",
+        "https",
+        "imap",
+        "mms",
+        "nfs",
+        "nntp",
+        "prospero",
+        "rsync",
+        "rtsp",
+        "rtspu",
+        "sftp",
+        "shttp",
+        "snews",
+        "svn",
+        "svn+ssh",
+        "telnet",
+        "wais"
+    };
+    // Schemes for which the constructor splits a ";params" suffix off the
+    // path; for any other scheme a ';' stays part of the path.
+    const std::unordered_set<std::string> Url::USES_PARAMS = {
+        "",
+        "ftp",
+        "hdl",
+        "http",
+        "https",
+        "imap",
+        "mms",
+        "prospero",
+        "rtsp",
+        "rtspu",
+        "sftp",
+        "shttp",
+        "sip",
+        "sips",
+        "tel"
+    };
+    // Schemes the constructor recognises when the text after the first ':'
+    // consists only of digits: "name:123" is parsed as scheme + path only if
+    // "name" is listed here, otherwise no scheme is extracted.
+    const std::unordered_set<std::string> Url::KNOWN_PROTOCOLS = {
+        "",
+        "file",
+        "ftp",
+        "git",
+        "git+ssh",
+        "gopher",
+        "hdl",
+        "http",
+        "https",
+        "imap",
+        "mms",
+        "nfs",
+        "nntp",
+        "prospero",
+        "rsync",
+        "rtsp",
+        "rtspu",
+        "sftp",
+        "shttp",
+        "sip",
+        "sips",
+        "sms",
+        "snews",
+        "svn",
+        "svn+ssh",
+        "tel",
+        "telnet",
+        "wais"
+    };
+
+    /**
+     * Parse a URL string into scheme, userinfo, host, port, path, params,
+     * query and fragment.
+     *
+     * The scheme and host are lowercased. A "name:digits" prefix is only
+     * treated as a scheme when the name is in KNOWN_PROTOCOLS (otherwise it is
+     * left to be read as part of the path). ";params" is only split off the
+     * path for schemes in USES_PARAMS. A host written as a bracketed literal
+     * ("[::1]") keeps its brackets, and only a ':' after the closing bracket
+     * introduces the port.
+     *
+     * @param url the text to parse
+     * @throws UrlParseException if a port is present but is not a number in
+     *         the range [0, 65535]
+     */
+    Url::Url(const std::string& url): port_(0), has_params_(false), has_query_(false)
+    {
+        // ::tolower has undefined behaviour for negative arguments, which is
+        // what a plain (signed) char holds for any non-ASCII byte, so always
+        // go through unsigned char.
+        auto lower = [](char c)
+        {
+            return static_cast<char>(::tolower(static_cast<unsigned char>(c)));
+        };
+
+        size_t position = 0;
+        size_t index = url.find(':');
+
+        // All the characters in our would-be scheme must be in SCHEME
+        if (index != std::string::npos
+            && std::all_of(
+                url.begin(),
+                url.begin() + index,
+                [](char c) { return SCHEME(c); }))
+        {
+            scheme_.assign(url, 0, index);
+            std::transform(scheme_.begin(), scheme_.end(), scheme_.begin(), lower);
+
+            // "name:1234" is ambiguous (host:port or scheme:path). It only
+            // counts as a scheme when the name is a protocol we know about.
+            const bool digits_only_remainder =
+                (index + 1) < url.length()
+                && std::all_of(
+                    url.begin() + index + 1,
+                    url.end(),
+                    [](char c) { return DIGIT(c); });
+
+            if (!digits_only_remainder
+                || KNOWN_PROTOCOLS.find(scheme_) != KNOWN_PROTOCOLS.end())
+            {
+                position = index + 1;
+            }
+            else
+            {
+                scheme_.clear();
+            }
+        }
+
+        // Search for the netloc; both characters of "//" must be present
+        if ((url.length() - position) >= 2
+            && url[position] == '/'
+            && url[position + 1] == '/')
+        {
+            // Skip the '//'
+            position += 2;
+            index = url.find_first_of("/?#", position);
+            host_.assign(url, position, index - position);
+            // May be npos, meaning there is nothing after the netloc
+            position = index;
+
+            // Extract any userinfo if there is any
+            index = host_.find('@');
+            if (index != std::string::npos)
+            {
+                userinfo_.assign(host_, 0, index);
+                host_.assign(host_, index + 1, std::string::npos);
+            }
+
+            // Lowercase the hostname
+            std::transform(host_.begin(), host_.end(), host_.begin(), lower);
+
+            // Try to find a port. For a bracketed literal such as "[::1]:80"
+            // the colons inside the brackets belong to the address, so start
+            // looking at the closing bracket.
+            size_t port_search_start = 0;
+            if (!host_.empty() && host_[0] == '[')
+            {
+                size_t closing = host_.find(']');
+                if (closing != std::string::npos)
+                {
+                    port_search_start = closing;
+                }
+            }
+
+            index = host_.find(':', port_search_start);
+            if (index != std::string::npos)
+            {
+                std::string portText(host_, index + 1, std::string::npos);
+                host_.resize(index);
+
+                if (portText.empty())
+                {
+                    port_ = 0;
+                }
+                else
+                {
+                    try
+                    {
+                        port_ = std::stoi(portText, &index);
+
+                        if (index != portText.length())
+                        {
+                            // Malformed port
+                            throw UrlParseException("Port not a number: " + portText);
+                        }
+
+                        if (port_ > 65535)
+                        {
+                            throw UrlParseException("Port too high: " + portText);
+                        }
+                        else if (port_ < 0)
+                        {
+                            throw UrlParseException("Port negative: " + portText);
+                        }
+                    }
+                    catch (const std::invalid_argument&)
+                    {
+                        // Malformed port
+                        throw UrlParseException("Port not a number: " + portText);
+                    }
+                    catch (const std::out_of_range&)
+                    {
+                        throw UrlParseException("Port out of integer range: " + portText);
+                    }
+                }
+            }
+        }
+
+        if (position != std::string::npos)
+        {
+            path_.assign(url, position, std::string::npos);
+
+            // Peel components off the end: fragment, then query, then params
+            index = path_.find('#');
+            if (index != std::string::npos)
+            {
+                fragment_.assign(path_, index + 1, std::string::npos);
+                path_.resize(index);
+            }
+
+            index = path_.find('?');
+            if (index != std::string::npos)
+            {
+                query_.assign(path_, index + 1, std::string::npos);
+                has_query_ = true;
+                path_.resize(index);
+            }
+
+            if (USES_PARAMS.find(scheme_) != USES_PARAMS.end())
+            {
+                index = path_.find(';');
+                if (index != std::string::npos)
+                {
+                    params_.assign(path_, index + 1, std::string::npos);
+                    has_params_ = true;
+                    path_.resize(index);
+                }
+            }
+        }
+    }
+
+    /**
+     * Replace every component of this URL with those of other.
+     *
+     * @return a reference to this object
+     */
+    Url& Url::assign(const Url& other)
+    {
+        *this = other;
+        return *this;
+    }
+
+    /**
+     * Exact, component-by-component comparison. No normalization is applied;
+     * use equiv() for a comparison of normalized forms.
+     */
+    bool Url::operator==(const Url& other) const
+    {
+        // Cheap scalar members first
+        if (port_ != other.port_)
+        {
+            return false;
+        }
+        if (has_params_ != other.has_params_ || has_query_ != other.has_query_)
+        {
+            return false;
+        }
+
+        return scheme_ == other.scheme_
+            && userinfo_ == other.userinfo_
+            && host_ == other.host_
+            && path_ == other.path_
+            && params_ == other.params_
+            && query_ == other.query_
+            && fragment_ == other.fragment_;
+    }
+
+    /**
+     * Negation of operator==.
+     */
+    bool Url::operator!=(const Url& other) const
+    {
+        return !(*this == other);
+    }
+
+    /**
+     * Compare two URLs after normalizing copies of both; neither this object
+     * nor other is modified. The normalization strips redundant separators,
+     * sorts query/params, drops the fragment and userinfo, resolves dot
+     * segments, escapes, punycodes the host and removes a default port.
+     */
+    bool Url::equiv(const Url& other)
+    {
+        const auto normalize = [](Url& url)
+        {
+            url.strip()
+               .sort_query()
+               .defrag()
+               .deuserinfo()
+               .abspath()
+               .escape()
+               .punycode()
+               .remove_default_port();
+        };
+
+        Url lhs(*this);
+        Url rhs(other);
+        normalize(lhs);
+        normalize(rhs);
+        return lhs == rhs;
+    }
+
+    /**
+     * Collapse every run of chr in str to a single occurrence and drop any
+     * leading or trailing chr. str is modified in place and returned.
+     */
+    std::string& Url::remove_repeats(std::string& str, const char chr)
+    {
+        std::string compacted;
+        compacted.reserve(str.length());
+
+        // Starting as if a chr was just seen makes leading chr's disappear too
+        bool previous_was_chr = true;
+        for (const char current : str)
+        {
+            const bool is_chr = (current == chr);
+            if (!(is_chr && previous_was_chr))
+            {
+                compacted.push_back(current);
+            }
+            previous_was_chr = is_chr;
+        }
+
+        // At most one chr can be left at the very end
+        if (!compacted.empty() && compacted.back() == chr)
+        {
+            compacted.pop_back();
+        }
+
+        str.swap(compacted);
+        return str;
+    }
+
+    /**
+     * Build "/path;params?query#fragment". The result always starts with '/',
+     * params and query are included when their has_ flag is set (even if
+     * empty), and the fragment only when it is non-empty.
+     */
+    std::string Url::fullpath() const
+    {
+        const bool rooted = !path_.empty() && path_[0] == '/';
+
+        std::string full(rooted ? "" : "/");
+        full += path_;
+
+        if (has_params_)
+        {
+            full += ';';
+            full += params_;
+        }
+
+        if (has_query_)
+        {
+            full += '?';
+            full += query_;
+        }
+
+        if (!fragment_.empty())
+        {
+            full += '#';
+            full += fragment_;
+        }
+
+        return full;
+    }
+
+    /**
+     * Serialize the URL back to a string from its current components.
+     */
+    std::string Url::str() const
+    {
+        std::string out;
+
+        // Scheme prefix: "scheme://" for netloc schemes, "scheme:" otherwise,
+        // and a bare "//" for a scheme-less URL that has a host.
+        if (scheme_.empty())
+        {
+            if (!host_.empty())
+            {
+                out += "//";
+            }
+        }
+        else
+        {
+            const bool uses_netloc = USES_NETLOC.find(scheme_) != USES_NETLOC.end();
+            out += scheme_;
+            out += uses_netloc ? "://" : ":";
+        }
+
+        if (!userinfo_.empty())
+        {
+            out += userinfo_;
+            out += '@';
+        }
+
+        // Appending an empty host is a no-op
+        out += host_;
+
+        if (port_ != 0)
+        {
+            out += ':';
+            out += std::to_string(port_);
+        }
+
+        if (!path_.empty())
+        {
+            // Separate the host from a path that is not rooted
+            if (!host_.empty() && path_[0] != '/')
+            {
+                out += '/';
+            }
+            out += path_;
+        }
+        else if (!out.empty())
+        {
+            // Anything emitted so far gets a root path
+            out += '/';
+        }
+
+        if (has_params_)
+        {
+            out += ';';
+            out += params_;
+        }
+
+        if (has_query_)
+        {
+            out += '?';
+            out += query_;
+        }
+
+        if (!fragment_.empty())
+        {
+            out += '#';
+            out += fragment_;
+        }
+
+        return out;
+    }
+
+    /**
+     * Remove leading '?' characters from the query, then collapse repeated,
+     * leading and trailing separators in the query ('&') and params (';').
+     * The cleaned values are stored through setQuery()/setParams().
+     */
+    Url& Url::strip()
+    {
+        // find_first_not_of yields npos when the query is empty or all '?',
+        // and erase(0, npos) then empties the string.
+        query_.erase(0, query_.find_first_not_of('?'));
+
+        remove_repeats(query_, '&');
+        setQuery(query_);
+
+        remove_repeats(params_, ';');
+        setParams(params_);
+
+        return *this;
+    }
+
+    /**
+     * Normalize the path in place: drop empty segments ("a//b"), resolve "."
+     * and ".." segments, and keep a trailing '/' only when the path ends in a
+     * directory-like segment ("", "." or "..").
+     *
+     * A ".." that would climb above the start of the path is ignored. In
+     * particular the leading '/' of an absolute path is never removed, so
+     * "/../a" becomes "/a" (as in RFC 3986 section 5.2.4) rather than the
+     * relative path "a".
+     *
+     * @return a reference to this object
+     */
+    Url& Url::abspath()
+    {
+        std::string copy;
+        // Offsets in copy where each emitted segment starts, so that ".." can
+        // truncate back to the beginning of the previous segment. The root
+        // '/' is deliberately not recorded here: it is not a segment and must
+        // survive any number of ".."s.
+        std::vector<size_t> segment_starts;
+
+        if (path_.size() >= 1 && path_[0] == '/')
+        {
+            copy.append(1, '/');
+        }
+
+        // Whether the most recently processed segment denotes a directory
+        bool directory = false;
+        size_t previous = 0;
+        size_t index = 0;
+        for (index = path_.find('/')
+            ; index != std::string::npos
+            ; previous = index + 1, index = path_.find('/', index + 1))
+        {
+            // Skip empty segments
+            if (index - previous == 0)
+            {
+                continue;
+            }
+
+            if ((index - previous == 2)
+                && path_[previous] == '.'
+                && path_[previous + 1] == '.')
+            {
+                // ".." removes the previously emitted segment, if any
+                if (!segment_starts.empty())
+                {
+                    copy.resize(segment_starts.back());
+                    segment_starts.pop_back();
+                }
+                directory = true;
+            }
+            else if ((index - previous == 1) && path_[previous] == '.')
+            {
+                // "." contributes nothing
+                directory = true;
+            }
+            else
+            {
+                segment_starts.push_back(copy.length());
+                copy.append(path_, previous, index - previous);
+                copy.append(1, '/');
+                directory = false;
+            }
+        }
+
+        // Handle the last segment (everything after the final '/')
+        index = path_.length();
+        if (previous == path_.length())
+        {
+            // The path ended with '/'
+            directory = true;
+        }
+        else if ((index - previous == 1) && path_[previous] == '.')
+        {
+            directory = true;
+        }
+        else if ((index - previous == 2)
+                && path_[previous] == '.'
+                && path_[previous + 1] == '.')
+        {
+            if (!segment_starts.empty())
+            {
+                copy.resize(segment_starts.back());
+            }
+            directory = true;
+        }
+        else
+        {
+            copy.append(path_, previous, index - previous);
+            copy.append(1, '/');
+            directory = false;
+        }
+
+        if (!directory && copy.size() >= 1)
+        {
+            // Every emitted segment was followed by '/'; a final non-directory
+            // segment must not keep it.
+            copy.resize(copy.size() - 1);
+        }
+        else if (directory && copy.empty())
+        {
+            copy.append(1, '/');
+        }
+        path_.assign(copy);
+
+        return *this;
+    }
+
+    /**
+     * Resolve this URL against the base URL other, in place.
+     *
+     * URLs whose scheme is not in USES_RELATIVE are left untouched. A missing
+     * scheme is inherited from other; if this URL has a host nothing else is
+     * changed. Otherwise host, port and userinfo are copied from other and a
+     * non-absolute path is interpreted relative to other's path.
+     *
+     * @return a reference to this object
+     */
+    Url& Url::relative_to(const Url& other)
+    {
+        // Schemes that do not support relative references are returned as is
+        if (USES_RELATIVE.count(scheme_) == 0)
+        {
+            return *this;
+        }
+
+        // Scheme-relative URLs take the scheme of the base
+        if (scheme_.empty())
+        {
+            scheme_ = other.scheme_;
+        }
+
+        // A host means the URL is absolute (or scheme-relative): done
+        if (!host_.empty())
+        {
+            return *this;
+        }
+
+        // No host of our own, so the network location comes from the base
+        host_ = other.host_;
+        port_ = other.port_;
+        userinfo_ = other.userinfo_;
+
+        const bool has_path = !path_.empty();
+
+        // An absolute path needs no further resolution
+        if (has_path && path_[0] == '/')
+        {
+            return *this;
+        }
+
+        if (has_path)
+        {
+            // Relative path: prefix it with the base path's directory
+            const size_t slash = other.path_.rfind('/');
+            if (slash != std::string::npos)
+            {
+                path_ = other.path_.substr(0, slash + 1) + path_;
+            }
+            else if (!host_.empty())
+            {
+                path_ = "/" + path_;
+            }
+            return *this;
+        }
+
+        // Empty path: inherit from the base as far as we have nothing ourselves
+        if (!params_.empty())
+        {
+            // Own params: only the base path's directory is taken over. When
+            // the base path has no '/', rfind gives npos and npos + 1 wraps to
+            // 0, which assigns an empty path.
+            path_.assign(other.path_, 0, other.path_.rfind('/') + 1);
+        }
+        else
+        {
+            path_ = other.path_;
+            params_ = other.params_;
+            has_params_ = other.has_params_;
+            if (query_.empty())
+            {
+                query_ = other.query_;
+                has_query_ = other.has_query_;
+            }
+        }
+
+        if (fragment_.empty())
+        {
+            fragment_ = other.fragment_;
+        }
+
+        return *this;
+    }
+
+    /**
+     * Escape the path, query, params and userinfo in place, each with the
+     * character class that is safe for it (params share the QUERY class).
+     *
+     * @param strict forwarded to the per-string escape()
+     * @return a reference to this object
+     */
+    Url& Url::escape(bool strict)
+    {
+        Url& self = *this;
+        escape(path_,     PATH,     strict);
+        escape(query_,    QUERY,    strict);
+        escape(params_,   QUERY,    strict);
+        escape(userinfo_, USERINFO, strict);
+        return self;
+    }
+
+    /**
+     * Percent-encode str in place so that only characters in safe remain
+     * literal.
+     *
+     * Existing, valid %XX sequences are decoded first and the resulting byte
+     * is then re-encoded if it is not safe, which normalizes the escaping. In
+     * strict mode a sequence is only decoded when the byte is safe and not
+     * reserved; otherwise it is kept, with its hex digits uppercased. A '%'
+     * that does not start a valid sequence is treated as an ordinary
+     * character.
+     *
+     * @param str    the string to escape (modified)
+     * @param safe   characters that may appear unescaped
+     * @param strict whether to preserve escapes of reserved/unsafe bytes
+     * @return a reference to str
+     */
+    std::string& Url::escape(std::string& str, const CharacterClass& safe, bool strict)
+    {
+        // HEX_TO_DEC has one entry per byte value, so it must be indexed by
+        // the unsigned value of the byte. A plain char is negative for bytes
+        // >= 0x80 on most platforms and would index far out of bounds.
+        auto hex_value = [](char c) -> int
+        {
+            return HEX_TO_DEC[static_cast<unsigned char>(c)];
+        };
+
+        std::string copy(str);
+        size_t dest = 0;
+        // Allocate space pessimistically -- if every entity is expanded, it will take 3x
+        // the space.
+        str.resize(str.length() * 3);
+        for (size_t src = 0; src < copy.length(); ++src)
+        {
+            // A complete escape sequence needs two more characters after '%'
+            if (copy[src] == '%' && (copy.length() - src) >= 3)
+            {
+                // Read ahead to see if there's a valid escape sequence. If not, treat
+                // this like a normal character.
+                const int high = hex_value(copy[src + 1]);
+                const int low = hex_value(copy[src + 2]);
+                if (high != -1 && low != -1)
+                {
+                    const char value = static_cast<char>(high * 16 + low);
+
+                    // In strict mode, we can only unescape parameters if they are both
+                    // safe and not reserved
+                    if (!strict || (safe(value) && !RESERVED(value)))
+                    {
+                        // Replace src + 2 with that byte, advance src to consume it and
+                        // fall through so the byte is re-escaped if it is not safe.
+                        src += 2;
+                        copy[src] = value;
+                    }
+                    else
+                    {
+                        // Keep the sequence as is, with uppercase hex digits
+                        str[dest++] = copy[src++];
+                        str[dest++] = ::toupper(copy[src++]);
+                        str[dest++] = ::toupper(copy[src]);
+                        continue;
+                    }
+                }
+            }
+
+            if (!safe(copy[src]))
+            {
+                // Not safe -- replace with %XX
+                str[dest++] = '%';
+                str[dest++] = HEX.chars()[(copy[src] >> 4) & 0xF];
+                str[dest++] = HEX.chars()[copy[src] & 0xF];
+            }
+            else
+            {
+                str[dest++] = copy[src];
+            }
+        }
+        str.resize(dest);
+        return str;
+    }
+
+    /**
+     * Decode %XX sequences in the path, query, params and userinfo in place.
+     *
+     * @return a reference to this object
+     */
+    Url& Url::unescape()
+    {
+        Url& self = *this;
+        unescape(path_);
+        unescape(query_);
+        unescape(params_);
+        unescape(userinfo_);
+        return self;
+    }
+
+    /**
+     * Decode every valid %XX sequence of str in place. A '%' that is not
+     * followed by two hexadecimal digits is copied through unchanged.
+     *
+     * @param str the string to decode (modified)
+     * @return a reference to str
+     */
+    std::string& Url::unescape(std::string& str)
+    {
+        // HEX_TO_DEC has one entry per byte value, so it must be indexed by
+        // the unsigned value of the byte. A plain char is negative for bytes
+        // >= 0x80 on most platforms and would index far out of bounds.
+        auto hex_value = [](char c) -> int
+        {
+            return HEX_TO_DEC[static_cast<unsigned char>(c)];
+        };
+
+        std::string copy(str);
+        size_t dest = 0;
+        for (size_t src = 0; src < copy.length(); ++src, ++dest)
+        {
+            // A complete escape sequence needs two more characters after '%'
+            if (copy[src] == '%' && (copy.length() - src) >= 3)
+            {
+                // Read ahead to see if there's a valid escape sequence. If not, treat
+                // this like a normal character.
+                const int high = hex_value(copy[src + 1]);
+                const int low = hex_value(copy[src + 2]);
+                if (high != -1 && low != -1)
+                {
+                    // Emit the decoded byte and skip the two hex digits.
+                    src += 2;
+                    str[dest] = static_cast<char>(high * 16 + low);
+                    continue;
+                }
+            }
+
+            // Either not a % or an incomplete entity
+            str[dest] = copy[src];
+        }
+        // The output is never longer than the input, so this only shrinks
+        str.resize(dest);
+        return str;
+    }
+
+    /**
+     * Remove every query and params entry whose name, lowercased, is present
+     * in blacklist.
+     *
+     * @param blacklist lowercase parameter names to drop
+     * @return a reference to this object
+     */
+    Url& Url::deparam(const std::unordered_set<std::string>& blacklist)
+    {
+        // Predicate is if it's present in the blacklist. The set is captured
+        // by reference: the predicate never outlives this call, and a by-value
+        // capture would copy the whole set each time the lambda is converted
+        // to a deparam_predicate.
+        const deparam_predicate predicate =
+            [&blacklist](std::string& name, const std::string&)
+        {
+            // Go through unsigned char: ::tolower is undefined for the
+            // negative values a plain char holds for non-ASCII bytes.
+            std::transform(
+                name.begin(), name.end(), name.begin(),
+                [](char c)
+                {
+                    return static_cast<char>(::tolower(static_cast<unsigned char>(c)));
+                });
+            return blacklist.find(name) != blacklist.end();
+        };
+
+        setQuery(remove_params(query_, predicate, '&'));
+        setParams(remove_params(params_, predicate, ';'));
+        return *this;
+    }
+
+    /**
+     * Remove every query ('&'-separated) and params (';'-separated) entry for
+     * which predicate(name, value) returns true.
+     *
+     * @return a reference to this object
+     */
+    Url& Url::deparam(const deparam_predicate& predicate)
+    {
+        remove_params(query_, predicate, '&');
+        setQuery(query_);
+
+        remove_params(params_, predicate, ';');
+        setParams(params_);
+
+        return *this;
+    }
+
+    /**
+     * Split str on sep into "name=value" pieces and rebuild it from only the
+     * pieces for which predicate(name, value) is false. A piece without '='
+     * has the whole piece as its name and an empty value. str is modified in
+     * place and returned.
+     */
+    std::string& Url::remove_params(std::string& str,
+                            const deparam_predicate& predicate,
+                            char sep)
+    {
+        std::string kept;
+        std::string name;
+        std::string value;
+
+        // Decide the fate of one piece and append it to kept if it survives
+        auto consider = [&](const std::string& piece)
+        {
+            const size_t equals = piece.find('=');
+            name.assign(piece, 0, equals);
+            value.clear();
+            if (equals != std::string::npos)
+            {
+                value.assign(piece, equals + 1, std::string::npos);
+            }
+
+            if (predicate(name, value))
+            {
+                return;
+            }
+
+            if (!kept.empty())
+            {
+                kept.append(1, sep);
+            }
+            kept.append(piece);
+        };
+
+        // Every piece that is terminated by a separator
+        size_t begin = 0;
+        size_t end = str.find(sep);
+        while (end != std::string::npos)
+        {
+            consider(str.substr(begin, end - begin));
+            begin = end + 1;
+            end = str.find(sep, begin);
+        }
+
+        // The remainder after the last separator, unless it is empty
+        if (begin < str.length())
+        {
+            consider(str.substr(begin));
+        }
+
+        str.assign(kept);
+        return str;
+    }
+
+    /**
+     * Sort the entries of the query (split on '&') and of the params (split
+     * on ';') in place.
+     *
+     * @return a reference to this object
+     */
+    Url& Url::sort_query()
+    {
+        Url& self = *this;
+        split_sort_join(query_, '&');
+        split_sort_join(params_, ';');
+        return self;
+    }
+
+    /**
+     * Split str on glue, sort the pieces and join them with glue again, in
+     * place. An empty trailing piece (after a final glue) is not counted as a
+     * piece; a string that yields exactly one piece is returned unchanged.
+     */
+    std::string& Url::split_sort_join(std::string& str, const char glue)
+    {
+        // Nothing to do for an empty string
+        if (str.empty())
+        {
+            return str;
+        }
+
+        // Split: one piece per glue-terminated run, plus a non-empty remainder
+        std::vector<std::string> pieces;
+        size_t begin = 0;
+        for (size_t end = str.find(glue)
+            ; end != std::string::npos
+            ; end = str.find(glue, begin))
+        {
+            pieces.push_back(str.substr(begin, end - begin));
+            begin = end + 1;
+        }
+        if (begin < str.length())
+        {
+            pieces.push_back(str.substr(begin));
+        }
+
+        // A single piece is already sorted; leave the string untouched
+        if (pieces.size() == 1)
+        {
+            return str;
+        }
+
+        // Sort
+        std::sort(pieces.begin(), pieces.end());
+
+        // Join (a non-empty input always produces at least one piece)
+        std::string joined;
+        joined.reserve(str.length());
+        for (size_t i = 0; i < pieces.size(); ++i)
+        {
+            if (i != 0)
+            {
+                joined.push_back(glue);
+            }
+            joined.append(pieces[i]);
+        }
+
+        str.assign(joined);
+        return str;
+    }
+
+    /**
+     * Reset the port to 0 (meaning "no port") when it is the default port
+     * listed in PORTS for this URL's scheme.
+     *
+     * @return a reference to this object
+     */
+    Url& Url::remove_default_port()
+    {
+        // Nothing to remove without a port, nothing to look up without a scheme
+        if (port_ == 0 || scheme_.empty())
+        {
+            return *this;
+        }
+
+        const auto known = PORTS.find(scheme_);
+        if (known != PORTS.end() && known->second == port_)
+        {
+            port_ = 0;
+        }
+        return *this;
+    }
+
+    /**
+     * Drop the userinfo component.
+     *
+     * @return a reference to this object
+     */
+    Url& Url::deuserinfo()
+    {
+        userinfo_.erase();
+        return *this;
+    }
+
+    /**
+     * Drop the fragment component.
+     *
+     * @return a reference to this object
+     */
+    Url& Url::defrag()
+    {
+        fragment_.erase();
+        return *this;
+    }
+
+    /**
+     * Replace the host with the result of Punycode::encodeHostname. The host
+     * is validated with check_hostname() both before and after encoding, so
+     * any exception thrown there propagates and the host is only replaced
+     * when the encoded form passes the check.
+     *
+     * @return a reference to this object
+     */
+    Url& Url::punycode()
+    {
+        check_hostname(host_);
+
+        std::string candidate = Punycode::encodeHostname(host_);
+        check_hostname(candidate);
+
+        host_.swap(candidate);
+        return *this;
+    }
+
+    /**
+     * Replace the host with the result of Punycode::decodeHostname.
+     *
+     * @return a reference to this object
+     */
+    Url& Url::unpunycode()
+    {
+        const std::string decoded(Punycode::decodeHostname(host_));
+        host_.assign(decoded);
+        return *this;
+    }
+
+    /**
+     * Reverse the order of the dot-separated labels of the host in place,
+     * e.g. "a.b.c" becomes "c.b.a".
+     *
+     * @return a reference to this object
+     */
+    Url& Url::host_reversed()
+    {
+        // Reversing the whole string puts the labels in the right order but
+        // spells each one backwards; reversing every label again fixes that.
+        std::reverse(host_.begin(), host_.end());
+
+        std::string::iterator label = host_.begin();
+        while (label != host_.end())
+        {
+            const std::string::iterator dot = std::find(label, host_.end(), '.');
+            std::reverse(label, dot);
+            if (dot == host_.end())
+            {
+                break;
+            }
+            label = dot + 1;
+        }
+        return *this;
+    }
+
+    /**
+     * Validate the dot-separated labels of host.
+     *
+     * An empty host is accepted. A single trailing '.' is removed from host.
+     *
+     * @param host the hostname to check (may be shortened by one character)
+     * @throws std::invalid_argument if a label is longer than 63 characters
+     *         or a label followed by a '.' is empty
+     */
+    void Url::check_hostname(std::string& host)
+    {
+        // Skip empty hostnames -- they are valid
+        if (host.empty())
+        {
+            return;
+        }
+
+        // Every label that is terminated by a '.'
+        size_t label_start = 0;
+        for (size_t dot = host.find('.')
+            ; dot != std::string::npos
+            ; dot = host.find('.', label_start))
+        {
+            const size_t label_length = dot - label_start;
+            if (label_length > 63)
+            {
+                throw std::invalid_argument("Label too long.");
+            }
+            if (label_length == 0)
+            {
+                throw std::invalid_argument("Empty label.");
+            }
+            label_start = dot + 1;
+        }
+
+        // For the final segment
+        const size_t final_length = host.size() - label_start;
+        if (final_length > 63)
+        {
+            throw std::invalid_argument("Label too long.");
+        }
+        if (final_length == 0 && label_start > 1)
+        {
+            // Remove a trailing empty segment
+            host.resize(label_start - 1);
+        }
+    }
+
+};
--- a/src/url.h
+++ b/src/url.h
@ -0,0 +1,323 @@
+#ifndef URL_CPP_H
+#define URL_CPP_H
+
+#include <stdexcept>
+#include <functional>
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <unordered_set>
+
+namespace Url
+{
+
+    /**
+     * A std::logic_error subclass that carries a human-readable message.
+     *
+     * The name indicates it is meant for URL parsing failures; the throw
+     * sites are not part of this header.
+     */
+    struct UrlParseException : public std::logic_error
+    {
+        UrlParseException(const std::string& message)
+            : std::logic_error(message)
+        {
+        }
+    };
+
+    /**
+     * A fixed set of byte values with constant-time membership tests.
+     *
+     * Built once from a string listing the member characters; afterwards
+     * `cls(c)` reports whether `c` is one of them. Instances cannot be
+     * default-constructed or copied.
+     */
+    struct CharacterClass
+    {
+        /**
+         * Build the class from every character that appears in `chars`.
+         */
+        CharacterClass(const std::string& chars) : chars_(chars), map_(256, false)
+        {
+            for (auto it = chars_.begin(); it != chars_.end(); ++it)
+            {
+                // Index through unsigned char: where plain char is signed, a byte
+                // >= 0x80 is negative and converting it straight to size_t would
+                // yield an index far outside the 256-entry table. This also
+                // matches the lookup performed by operator().
+                map_[static_cast<unsigned char>(*it)] = true;
+            }
+        }
+
+        /**
+         * Whether `c` is a member of this class.
+         */
+        bool operator()(char c) const
+        {
+            return map_[static_cast<unsigned char>(c)];
+        }
+
+        /**
+         * The string this class was constructed from.
+         */
+        const std::string& chars() const
+        {
+            return chars_;
+        }
+
+    private:
+        // Private, unimplemented to prevent use
+        CharacterClass();
+        CharacterClass(const CharacterClass& other);
+
+        // The member characters exactly as supplied to the constructor
+        std::string chars_;
+        // One flag per possible byte value; true when that byte is a member
+        std::vector<bool> map_;
+    };
+
+    /**
+     * A URL split into its components (scheme, userinfo, host, port, path,
+     * params, query, fragment) together with chainable methods that transform
+     * those components in place.
+     *
+     * Every chainable method returns a reference to the object it was called
+     * on, so calls can be strung together.
+     */
+    struct Url
+    {
+        /* Character classes */
+        const static CharacterClass GEN_DELIMS;
+        const static CharacterClass SUB_DELIMS;
+        const static CharacterClass ALPHA;
+        const static CharacterClass DIGIT;
+        const static CharacterClass UNRESERVED;
+        const static CharacterClass RESERVED;
+        const static CharacterClass PCHAR;
+        const static CharacterClass PATH;
+        const static CharacterClass QUERY;
+        const static CharacterClass FRAGMENT;
+        const static CharacterClass USERINFO;
+        const static CharacterClass HEX;
+        const static CharacterClass SCHEME;
+
+        /* Lookup tables; their contents are defined outside this header */
+        const static std::vector<signed char> HEX_TO_DEC;
+        const static std::unordered_map<std::string, int> PORTS;
+        const static std::unordered_set<std::string> USES_RELATIVE;
+        const static std::unordered_set<std::string> USES_NETLOC;
+        const static std::unordered_set<std::string> USES_PARAMS;
+        const static std::unordered_set<std::string> KNOWN_PROTOCOLS;
+
+        // The type of the predicate used for removing parameters
+        typedef std::function<bool(std::string&, std::string&)> deparam_predicate;
+
+        /**
+         * Construct from the string form of a URL. Explicit, so a std::string is
+         * never silently converted into a Url.
+         */
+        explicit Url(const std::string& url);
+
+        /**
+         * Copy every component, including the has_params_ / has_query_ flags.
+         */
+        Url(const Url& other)
+            : scheme_(other.scheme_)
+            , host_(other.host_)
+            , port_(other.port_)
+            , path_(other.path_)
+            , params_(other.params_)
+            , query_(other.query_)
+            , fragment_(other.fragment_)
+            , userinfo_(other.userinfo_)
+            , has_params_(other.has_params_)
+            , has_query_(other.has_query_) { }
+
+        /**
+         * Take on the value of the other URL.
+         */
+        Url& assign(const Url& other);
+
+        /**
+         * To be considered equal, all fields must be equal.
+         */
+        bool operator==(const Url& other) const;
+        bool operator!=(const Url& other) const;
+
+        /**
+         * Two URLs are considered equivalent if they have the same meaning.
+         *
+         * NOTE(review): this member is not declared const, so it cannot be called
+         * on a const Url; its definition is outside this header.
+         */
+        bool equiv(const Url& other);
+
+        /**************************************
+         * Component-wise access and setting. *
+         **************************************/
+        const std::string& scheme() const { return scheme_; }
+        Url& setScheme(const std::string& s)
+        {
+            scheme_ = s;
+            return *this;
+        }
+
+        const std::string& host() const { return host_; }
+        Url& setHost(const std::string& s)
+        {
+            host_ = s;
+            return *this;
+        }
+
+        // Returned by value; a top-level const on the return type would have no
+        // effect and only provokes compiler warnings, so it is omitted.
+        int port() const { return port_; }
+        Url& setPort(int i)
+        {
+            port_ = i;
+            return *this;
+        }
+
+        const std::string& path() const { return path_; }
+        Url& setPath(const std::string& s)
+        {
+            path_ = s;
+            return *this;
+        }
+
+        const std::string& params() const { return params_; }
+        // Also updates has_params_: it is true exactly when `s` is non-empty.
+        Url& setParams(const std::string& s)
+        {
+            params_ = s;
+            has_params_ = !s.empty();
+            return *this;
+        }
+
+        const std::string& query() const { return query_; }
+        // Also updates has_query_: it is true exactly when `s` is non-empty.
+        Url& setQuery(const std::string& s)
+        {
+            query_ = s;
+            has_query_ = !s.empty();
+            return *this;
+        }
+
+        const std::string& fragment() const { return fragment_; }
+        Url& setFragment(const std::string& s)
+        {
+            fragment_ = s;
+            return *this;
+        }
+
+        const std::string& userinfo() const { return userinfo_; }
+        Url& setUserinfo(const std::string& s)
+        {
+            userinfo_ = s;
+            return *this;
+        }
+
+        /**
+         * Get a representation of all components of the path, params, query, fragment.
+         *
+         * Always includes a leading /.
+         */
+        std::string fullpath() const;
+
+        /**
+         * Get a new string representation of the URL.
+         **/
+        std::string str() const;
+
+        /*********************
+         * Chainable methods *
+         *********************/
+
+        /**
+         * Strip semantically meaningless excess '?', '&', and ';' characters from query
+         * and params.
+         */
+        Url& strip();
+
+        /**
+         * Make the path absolute.
+         *
+         * Evaluate '.', '..', and excessive slashes.
+         */
+        Url& abspath();
+
+        /**
+         * Evaluate this URL relative to `other`, placing the result in this object.
+         *
+         * Convenience overload: `other` is first turned into a Url.
+         */
+        Url& relative_to(const std::string& other)
+        {
+            return relative_to(Url(other));
+        }
+
+        /**
+         * Evaluate this URL relative to `other`, placing the result in this object.
+         */
+        Url& relative_to(const Url& other);
+
+        /**
+         * Ensure that the path, params, query, and userinfo are properly escaped.
+         *
+         * In 'strict' mode, only entities that are both safe and not reserved characters
+         * are unescaped. In non-strict mode, entities that are safe are unescaped.
+         */
+        Url& escape(bool strict=false);
+
+        /**
+         * Unescape all entities in the path, params, query, and userinfo.
+         */
+        Url& unescape();
+
+        /**
+         * Remove any params or queries that appear in the blacklist.
+         *
+         * The blacklist should contain only lowercased strings, and the comparison is
+         * done in a case-insensitive way.
+         */
+        Url& deparam(const std::unordered_set<std::string>& blacklist);
+
+        /**
+         * Filter params subject to a predicate for whether it should be filtered.
+         *
+         * The predicate must accept two string refs -- the key and value (which may be
+         * empty). Return `true` if the parameter should be removed, and `false`
+         * otherwise.
+         */
+        Url& deparam(const deparam_predicate& predicate);
+
+        /**
+         * Put queries and params in sorted order.
+         *
+         * To ensure consistent comparisons, escape should be called beforehand.
+         */
+        Url& sort_query();
+
+        /**
+         * Remove the port if it's the default for the scheme.
+         */
+        Url& remove_default_port();
+
+        /**
+         * Remove the userinfo portion.
+         */
+        Url& deuserinfo();
+
+        /**
+         * Remove the fragment.
+         */
+        Url& defrag();
+
+        /**
+         * Punycode the hostname.
+         */
+        Url& punycode();
+
+        /**
+         * Unpunycode the hostname.
+         */
+        Url& unpunycode();
+
+        /**
+         * Reverse the hostname (a.b.c.d => d.c.b.a)
+         */
+        Url& host_reversed();
+
+    private:
+        // Private, unimplemented to prevent use.
+        Url();
+
+        /**
+         * Remove repeated, leading, and trailing instances of chr from the string.
+         */
+        std::string& remove_repeats(std::string& str, const char chr);
+
+        /**
+         * Ensure all the provided characters are escaped if necessary
+         */
+        std::string& escape(std::string& str, const CharacterClass& safe, bool strict);
+
+        /**
+         * Unescape entities in the provided string
+         */
+        std::string& unescape(std::string& str);
+
+        /**
+         * Remove any params for which the predicate returns true; `sep` is the
+         * character that separates one param from the next.
+         */
+        std::string& remove_params(
+            std::string& str, const deparam_predicate& pred, char sep);
+
+        /**
+         * Split the provided string by char, sort, join by char.
+         */
+        std::string& split_sort_join(std::string& str, const char glue);
+
+        /**
+         * Check that the hostname is valid, removing an optional trailing '.'.
+         */
+        void check_hostname(std::string& host);
+
+        std::string scheme_;
+        std::string host_;
+        int port_;
+        std::string path_;
+        std::string params_;
+        std::string query_;
+        std::string fragment_;
+        std::string userinfo_;
+        // Maintained by setParams / setQuery (true when the value set was
+        // non-empty) and carried over by the copy constructor.
+        bool has_params_;
+        bool has_query_;
+    };
+
+}
+
+#endif
--- a/src/utf8.cpp
+++ b/src/utf8.cpp
@ -0,0 +1,150 @@
+#include <algorithm>
+#include <string>
+#include <iostream>
+
+#include "utf8.h"
+
+namespace Url
+{
+
+    /**
+     * Decode one UTF-8 sequence starting at `it` and return its code point.
+     *
+     * `it` is advanced past every byte that was consumed, including when an
+     * exception is thrown part-way through a sequence.
+     *
+     * Throws std::invalid_argument when:
+     *  - the range is empty (`it == end`),
+     *  - the first byte is a continuation byte (0x80-0xBF) or is 0xF8 or above,
+     *  - the range ends before the sequence is complete,
+     *  - a continuation byte does not have the bit pattern 10xxxxxx.
+     *
+     * Only the structure of the sequence is checked: overlong encodings,
+     * surrogate values and results above Utf8::MAX_CODEPOINT are returned
+     * as decoded.
+     */
+    Utf8::codepoint_t Utf8::readCodepoint(
+        std::string::const_iterator& it, const std::string::const_iterator& end)
+    {
+        // There must be at least one byte to read; dereferencing `end` would be
+        // undefined behaviour.
+        if (it == end)
+        {
+            throw std::invalid_argument("UTF-8 sequence terminated early.");
+        }
+
+        Utf8::char_t current = static_cast<Utf8::char_t>(*it++);
+        if (!(current & 0x80))
+        {
+            // Single-byte (ASCII) sequence
+            return current;
+        }
+
+        // Number of additional bytes needed
+        unsigned int bytes = 0;
+        // The accumulated value
+        Utf8::codepoint_t result = 0;
+        if (current < 0xC0)
+        {
+            // Invalid sequence
+            throw std::invalid_argument("Low UTF-8 start byte");
+        }
+        else if (current < 0xE0)
+        {
+            // One additional byte, two bytes total, use 5 bits
+            bytes = 1;
+            result = current & 0x1F;
+        }
+        else if (current < 0xF0)
+        {
+            // Two additional bytes, three bytes total, use 4 bits
+            bytes = 2;
+            result = current & 0x0F;
+        }
+        else if (current < 0xF8)
+        {
+            // Three additional bytes, four bytes total, use 3 bits
+            bytes = 3;
+            result = current & 0x07;
+        }
+        else
+        {
+            throw std::invalid_argument("High UTF-8 start byte");
+        }
+
+        for (; bytes > 0; --bytes) {
+            if (it == end)
+            {
+                throw std::invalid_argument("UTF-8 sequence terminated early.");
+            }
+
+            current = static_cast<Utf8::char_t>(*it++);
+            // Ensure the first two bits are 10
+            if ((current & 0xC0) != 0x80)
+            {
+                throw std::invalid_argument("Invalid continuation byte");
+            }
+            // Each continuation byte contributes its low six bits
+            result = (result << 6) | (current & 0x3F);
+        }
+
+        return result;
+    }
+
+    /**
+     * Append the UTF-8 encoding of `value` to `str` and return `str`.
+     *
+     * Throws std::invalid_argument("Code point too high.") when `value` is
+     * above MAX_CODEPOINT; nothing is appended in that case.
+     *
+     * Layout produced, by range of `value`:
+     *   <= 0x7F     0xxxxxxx
+     *   <= 0x7FF    110xxxxx 10xxxxxx
+     *   <= 0xFFFF   1110xxxx 10xxxxxx 10xxxxxx
+     *   otherwise   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+     */
+    std::string& Utf8::writeCodepoint(std::string& str, Utf8::codepoint_t value)
+    {
+        if (value > MAX_CODEPOINT)
+        {
+            throw std::invalid_argument("Code point too high.");
+        }
+
+        if (value <= 0x007F)
+        {
+            // Just append the character itself
+            str.append(1, static_cast<char>(value));
+            return str;
+        }
+
+        // Pick the lead-byte marker and how many continuation bytes follow it
+        Utf8::codepoint_t marker = 0xC0;
+        unsigned int trailing = 1;
+        if (value > 0xFFFF)
+        {
+            marker = 0xF0;
+            trailing = 3;
+        }
+        else if (value > 0x07FF)
+        {
+            marker = 0xE0;
+            trailing = 2;
+        }
+
+        // The lead byte holds whatever bits sit above the 6 * trailing bits
+        // that the continuation bytes will carry. The range checks above
+        // guarantee those bits fit beneath the marker.
+        str.append(1, static_cast<char>((value >> (6 * trailing)) | marker));
+
+        // Continuation bytes: six bits each, most significant group first
+        while (trailing > 0)
+        {
+            --trailing;
+            const Utf8::codepoint_t six = (value >> (6 * trailing)) & 0x3F;
+            str.append(1, static_cast<char>(six | 0x80));
+        }
+
+        return str;
+    }
+
+};
--- a/src/utf8.h
+++ b/src/utf8.h
@ -0,0 +1,91 @@
+#ifndef UTF8_CPP_H
+#define UTF8_CPP_H
+
+#include <cstdint>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace Url
+{
+
+    /**
+     * Work between unicode code points and their UTF-8-encoded representation.
+     *
+     * Every member is static; the struct only groups the helpers and their
+     * types under one name.
+     */
+    struct Utf8
+    {
+        /**
+         * The type we use to represent Unicode codepoints.
+         */
+        typedef uint32_t codepoint_t;
+
+        /**
+         * The type we use when talking about the integral value of bytes.
+         */
+        typedef unsigned char char_t;
+
+        /**
+         * The highest allowed codepoint.
+         */
+        static const codepoint_t MAX_CODEPOINT = 0x10FFFF;
+
+        /**
+         * Consume up to the last byte of the sequence, returning the codepoint.
+         *
+         * `it` is advanced past the bytes that were read.
+         */
+        static codepoint_t readCodepoint(
+            std::string::const_iterator& it, const std::string::const_iterator& end);
+
+        /**
+         * Write a codepoint to the provided string.
+         *
+         * Returns the same string that was passed in.
+         */
+        static std::string& writeCodepoint(std::string& str, codepoint_t value);
+
+        /**
+         * Return the first codepoint stored in the provided string.
+         *
+         * Throws std::invalid_argument if the string is empty.
+         */
+        static codepoint_t toCodepoint(const std::string& str)
+        {
+            // An empty string has no first byte to decode; fail loudly instead of
+            // handing readCodepoint a range in which begin() == end().
+            if (str.empty())
+            {
+                throw std::invalid_argument(
+                    "Cannot read a codepoint from an empty string.");
+            }
+
+            auto it = str.begin();
+            return readCodepoint(it, str.end());
+        }
+
+        /**
+         * Get a string with the provided codepoint.
+         */
+        static std::string fromCodepoint(codepoint_t value)
+        {
+            std::string str;
+            writeCodepoint(str, value);
+            return str;
+        }
+
+        /**
+         * Return all the codepoints in the string.
+         *
+         * An empty string yields an empty vector.
+         */
+        static std::vector<codepoint_t> toCodepoints(const std::string& str)
+        {
+            std::vector<codepoint_t> result;
+            // readCodepoint advances `it`, so the loop has no increment of its own
+            for (auto it = str.begin(); it != str.end(); )
+            {
+                result.push_back(readCodepoint(it, str.end()));
+            }
+            return result;
+        }
+
+        /**
+         * Create a string from a vector of codepoints.
+         */
+        static std::string fromCodepoints(const std::vector<codepoint_t>& points)
+        {
+            std::string result;
+            for (auto it = points.begin(); it != points.end(); ++it)
+            {
+                writeCodepoint(result, *it);
+            }
+            return result;
+        }
+
+    };
+
+}
+
+#endif
--- a/tests/test-all.R
+++ b/tests/test-all.R
@ -0,0 +1,3 @@
+# Test entry point: attaches testthat, then hands control to test_check().
+library(testthat)
+# robotstxt is attached here as well; the test file calls it through
+# `robotstxt::`.
+library(robotstxt)
+# Run the testthat suite of the "rep" package.
+test_check("rep")
--- a/tests/testthat/test-rep.R
+++ b/tests/testthat/test-rep.R
@ -0,0 +1,11 @@
+context("basic functionality")
+test_that("robxp parses a fetched robots.txt and can_fetch applies its rules", {
+
+  # The robots.txt is downloaded from cdc.gov at test time, so this test needs
+  # network access; do not run it on CRAN.
+  skip_on_cran()
+
+  rt <- robxp(robotstxt::get_robotstxt("https://cdc.gov"))
+
+  expect_s3_class(rt, "robxp")
+
+  # One path expected to be allowed and one expected to be disallowed for any
+  # user agent ("*")
+  expect_true(can_fetch(rt, "/asthma/asthma_stats/default.htm", "*"))
+  expect_false(can_fetch(rt, "/_borders", "*"))
+
+})