initial commit

6 years ago · ce4475df5e
24 changed files with 19681 additions and 15 deletions
--- a/19
+++ b/19
@ -1,24 +1,31 @@
 Package: psl
 Type: Package
-Title: psl title goes here otherwise CRAN checks fail
+Title: Extract Internet Domain Components Using the Public Suffix List
 Version: 0.1.0
 Date: 2018-09-06
 Authors@R: c(
    person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"), 
-           comment = c(ORCID = "0000-0001-5670-2640"))
+           comment = c(ORCID = "0000-0001-5670-2640")),
+    person("Tim", "Rühsen", email = "tim.ruehsen@gmx.de", role = c("aut"), 
+           comment = "libpsl : <https://github.com/rockdaboot/libpsl>")
  )
 Maintainer: Bob Rudis <bob@rud.is>
-Description: A good description goes here otherwise CRAN checks fail.
+Description: The 'Public Suffix List' (<https://publicsuffix.org/>) is a collection
+   of top-level domains ('TLDs') which include generic top-level domains ('gTLDs')
+   such as '.com' and '.net'; country top-level domains ('ccTLDs') such as '.de' and
+   '.cn'; and, brand top-level domains such as '.apple' and '.google'. Tools are provided
+   to extract internet domain components using the public suffix list base data.
 URL: https://gitlab.com/hrbrmstr/psl
 BugReports: https://gitlab.com/hrbrmstr/psl/issues
+SystemRequirements: C++11
 Encoding: UTF-8
-License: AGPL
+License: MIT + file LICENSE
 Suggests:
    testthat,
    covr
 Depends:
    R (>= 3.2.0)
 Imports:
-    httr,
-    jsonlite
+    Rcpp
 RoxygenNote: 6.0.1.9000
+LinkingTo: Rcpp
--- a/2
+++ b/2
@ -0,0 +1,2 @@
+YEAR: 2018
+COPYRIGHT HOLDER: Bob Rudis
--- a/9
+++ b/9
@ -1,4 +1,9 @@
 # Generated by roxygen2: do not edit by hand

-import(httr)
-importFrom(jsonlite,fromJSON)
+export(apex_domain)
+export(is_public_suffix)
+export(public_suffix)
+export(suffix_extract)
+export(suffix_extract2)
+importFrom(Rcpp,sourceCpp)
+useDynLib(psl)
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@ -0,0 +1,55 @@
+# Generated by using Rcpp::compileAttributes() -> do not edit by hand
+# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+#' Return the apex/top-private domain from a vector of domains
+#'
+#' @md
+#' @param domains character vector of domains (e.g. `"www.example.com"`)
+#' @return character vector with one element per input; `NA` where no apex domain can be derived (e.g. for a bare public suffix such as `"com"`)
+#' @export
+apex_domain <- function(domains) {
+    .Call('_psl_apex_domain', PACKAGE = 'psl', domains)
+}
+
+#' Return the public suffix from a vector of domains
+#'
+#' @md
+#' @param domains character vector of domains (e.g. `"example.uk.com"`)
+#' @return character vector with one element per input holding its public suffix (e.g. `"uk.com"` for `"example.uk.com"`)
+#' @export
+public_suffix <- function(domains) {
+    .Call('_psl_public_suffix', PACKAGE = 'psl', domains)
+}
+
+#' Test whether a domain is a public suffix
+#'
+#' @md
+#' @param domains character vector of domains
+#' @return logical vector with one element per input; `TRUE` where the input is itself a public suffix
+#' @export
+is_public_suffix <- function(domains) {
+    .Call('_psl_is_public_suffix', PACKAGE = 'psl', domains)
+}
+
+#' Separate a domain into component parts
+#'
+#' @md
+#' @param domains character vector of domains
+#' @return data frame with one row per input and columns `orig`, `normalized`, `subdomain`, `apex`, `domain` and `suffix`
+#' @export
+suffix_extract <- function(domains) {
+    .Call('_psl_suffix_extract', PACKAGE = 'psl', domains)
+}
+
+#' Separate a domain into component parts (urltools-compatible output)
+#'
+#' Compatibility function for those using `urltools::suffix_extract()`
+#'
+#' @md
+#' @param domains character vector of domains
+#' @return data frame with one row per input and columns `host`, `subdomain`, `domain` and `suffix`
+#' @export
+suffix_extract2 <- function(domains) {
+    .Call('_psl_suffix_extract2', PACKAGE = 'psl', domains)
+}
+
--- a/R/psl-package.R
+++ b/R/psl-package.R
@ -1,12 +1,21 @@
-#' ...
-#' 
+#' Extract Internet Domain Components Using the Public Suffix List
+#'
+#' The 'Public Suffix List' (<https://publicsuffix.org/>) is a collection
+#' of top-level domains ('TLDs') which include generic top-level domains ('gTLDs')
+#' such as '.com' and '.net'; country top-level domains ('ccTLDs') such as '.de' and
+#' '.cn'; and, brand top-level domains such as '.apple' and '.google'. Tools are provided
+#' to extract internet domain components using the public suffix list base data.
+#'
+#' - `libpsl`: <https://github.com/rockdaboot/libpsl>
+#' - Public Suffix List: <https://publicsuffix.org/>
+#'
 #' - URL: <https://gitlab.com/hrbrmstr/psl>
 #' - BugReports: <https://gitlab.com/hrbrmstr/psl/issues>
-#' 
+#'
 #' @md
 #' @name psl
 #' @docType package
 #' @author Bob Rudis (bob@@rud.is)
-#' @import httr
-#' @importFrom jsonlite fromJSON
-NULL
+#' @useDynLib psl
+#' @importFrom Rcpp sourceCpp
+NULL
--- a/README.Rmd
+++ b/README.Rmd
@ -2,14 +2,33 @@
 output: rmarkdown::github_document
 ---

+```{r include=FALSE}
+knitr::opts_chunk$set(
+  fig.width=10, fig.retina=2, message=FALSE, warning=FALSE, collapse=TRUE
+)
+```
+
 # psl

+  Extract Internet Domain Components Using the Public Suffix List
+
 ## Description

+The 'Public Suffix List' (<https://publicsuffix.org/>) is a collection of top-level domains ('TLDs') which include generic top-level domains ('gTLDs') such as '.com' and '.net'; country top-level domains ('ccTLDs') such as '.de' and '.cn'; and, brand top-level domains such as '.apple' and '.google'. Tools are provided to extract internet domain components using the public suffix list base data.
+
+- `libpsl`: <https://github.com/rockdaboot/libpsl>
+- Public Suffix List: <https://publicsuffix.org/>
+ 
 ## What's Inside The Tin

 The following functions are implemented:

+- `apex_domain`:	Return the apex/top-private domain from a vector of domains
+- `is_public_suffix`:	Test whether a domain is a public suffix
+- `public_suffix`:	Return the public suffix from a vector of domains
+- `suffix_extract`:	Separate a domain into component parts
+- `suffix_extract2`:	Separate a domain into component parts (urltools compatible output)
+
 ## Installation

 ```{r eval=FALSE}
@ -24,9 +43,49 @@ options(width=120)

 ```{r message=FALSE, warning=FALSE, error=FALSE}
 library(psl)
+library(tidyverse)

 # current version
 packageVersion("psl")

 ```

+```{r message=FALSE, warning=FALSE, error=FALSE}
+doms <- c(
+  "", "com", "example.com", "www.example.com",
+  ".com", ".example", ".example.com", ".example.example", "example",
+  "example.example", "b.example.example", "a.b.example.example",
+  "biz", "domain.biz", "b.domain.biz", "a.b.domain.biz", "com",
+  "example.com", "b.example.com", "a.b.example.com", "uk.com",
+  "example.uk.com", "b.example.uk.com", "a.b.example.uk.com", "test.ac",
+  "cy", "c.cy", "b.c.cy", "a.b.c.cy", "jp", "test.jp", "www.test.jp",
+  "ac.jp", "test.ac.jp", "www.test.ac.jp", "kyoto.jp", "test.kyoto.jp",
+  "ide.kyoto.jp", "b.ide.kyoto.jp", "a.b.ide.kyoto.jp", "c.kobe.jp",
+  "b.c.kobe.jp", "a.b.c.kobe.jp", "city.kobe.jp", "www.city.kobe.jp",
+  "ck", "test.ck", "b.test.ck", "a.b.test.ck", "www.ck", "www.www.ck",
+  "us", "test.us", "www.test.us", "ak.us", "test.ak.us", "www.test.ak.us",
+  "k12.ak.us", "test.k12.ak.us", "www.test.k12.ak.us"
+)
+
+apex_domain(doms)
+
+public_suffix(doms)
+
+is_public_suffix(doms)
+
+suffix_extract(doms)
+
+suffix_extract2(doms) # urltools compatible output
+```
+
+```{r bench, message=FALSE, warning=FALSE, error=FALSE, fig.width=10, fig.retina=2}
+library(microbenchmark)
+
+microbenchmark(
+  urltools = urltools::suffix_extract(doms),
+  psl = psl::suffix_extract(doms), # returns more data
+  psl2 = psl::suffix_extract2(doms) # returns what urltools does
+) -> mb
+
+autoplot(mb)
+```
--- a/README.md
+++ b/README.md
@ -1,2 +1,138 @@
+
 # psl

+Extract Internet Domain Components Using the Public Suffix List
+
+## Description
+
+The ‘Public Suffix List’ (<https://publicsuffix.org/>) is a collection
+of top-level domains (‘TLDs’) which include generic top-level domains
+(‘gTLDs’) such as ‘.com’ and ‘.net’; country top-level domains
+(‘ccTLDs’) such as ‘.de’ and ‘.cn’; and, brand top-level domains such
+as ‘.apple’ and ‘.google’. Tools are provided to extract internet domain
+components using the public suffix list base data.
+
+  - `libpsl`: <https://github.com/rockdaboot/libpsl>
+  - Public Suffix List: <https://publicsuffix.org/>
+
+## What’s Inside The Tin
+
+The following functions are implemented:
+
+  - `apex_domain`: Return the apex/top-private domain from a vector of
+    domains
+  - `is_public_suffix`: Test whether a domain is a public suffix
+  - `public_suffix`: Return the public suffix from a vector of domains
+  - `suffix_extract`: Separate a domain into component parts
+  - `suffix_extract2`: Separate a domain into component parts (urltools
+    compatible output)
+
+## Installation
+
+``` r
+devtools::install_github("hrbrmstr/psl")
+```
+
+## Usage
+
+``` r
+library(psl)
+library(tidyverse)
+
+# current version
+packageVersion("psl")
+## [1] '0.1.0'
+```
+
+``` r
+doms <- c(
+  "", "com", "example.com", "www.example.com",
+  ".com", ".example", ".example.com", ".example.example", "example",
+  "example.example", "b.example.example", "a.b.example.example",
+  "biz", "domain.biz", "b.domain.biz", "a.b.domain.biz", "com",
+  "example.com", "b.example.com", "a.b.example.com", "uk.com",
+  "example.uk.com", "b.example.uk.com", "a.b.example.uk.com", "test.ac",
+  "cy", "c.cy", "b.c.cy", "a.b.c.cy", "jp", "test.jp", "www.test.jp",
+  "ac.jp", "test.ac.jp", "www.test.ac.jp", "kyoto.jp", "test.kyoto.jp",
+  "ide.kyoto.jp", "b.ide.kyoto.jp", "a.b.ide.kyoto.jp", "c.kobe.jp",
+  "b.c.kobe.jp", "a.b.c.kobe.jp", "city.kobe.jp", "www.city.kobe.jp",
+  "ck", "test.ck", "b.test.ck", "a.b.test.ck", "www.ck", "www.www.ck",
+  "us", "test.us", "www.test.us", "ak.us", "test.ak.us", "www.test.ak.us",
+  "k12.ak.us", "test.k12.ak.us", "www.test.k12.ak.us"
+)
+
+apex_domain(doms)
+##  [1] NA                NA                "example.com"     "example.com"     NA                NA               
+##  [7] NA                NA                NA                "example.example" "example.example" "example.example"
+## [13] NA                "domain.biz"      "domain.biz"      "domain.biz"      NA                "example.com"    
+## [19] "example.com"     "example.com"     NA                "example.uk.com"  "example.uk.com"  "example.uk.com" 
+## [25] "test.ac"         NA                "c.cy"            "c.cy"            "c.cy"            NA               
+## [31] "test.jp"         "test.jp"         NA                "test.ac.jp"      "test.ac.jp"      NA               
+## [37] "test.kyoto.jp"   NA                "b.ide.kyoto.jp"  "b.ide.kyoto.jp"  NA                "b.c.kobe.jp"    
+## [43] "b.c.kobe.jp"     "city.kobe.jp"    "city.kobe.jp"    NA                NA                "b.test.ck"      
+## [49] "b.test.ck"       "www.ck"          "www.ck"          NA                "test.us"         "test.us"        
+## [55] NA                "test.ak.us"      "test.ak.us"      NA                "test.k12.ak.us"  "test.k12.ak.us"
+
+public_suffix(doms)
+##  [1] ""             "com"          "com"          "com"          ".com"         ".example"     "com"         
+##  [8] "example"      "example"      "example"      "example"      "example"      "biz"          "biz"         
+## [15] "biz"          "biz"          "com"          "com"          "com"          "com"          "uk.com"      
+## [22] "uk.com"       "uk.com"       "uk.com"       "ac"           "cy"           "cy"           "cy"          
+## [29] "cy"           "jp"           "jp"           "jp"           "ac.jp"        "ac.jp"        "ac.jp"       
+## [36] "kyoto.jp"     "kyoto.jp"     "ide.kyoto.jp" "ide.kyoto.jp" "ide.kyoto.jp" "c.kobe.jp"    "c.kobe.jp"   
+## [43] "c.kobe.jp"    "kobe.jp"      "kobe.jp"      "ck"           "test.ck"      "test.ck"      "test.ck"     
+## [50] "ck"           "ck"           "us"           "us"           "us"           "ak.us"        "ak.us"       
+## [57] "ak.us"        "k12.ak.us"    "k12.ak.us"    "k12.ak.us"
+
+is_public_suffix(doms)
+##  [1]  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE
+## [20] FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE
+## [39] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE
+## [58]  TRUE FALSE FALSE
+
+suffix_extract(doms)
+## # A tibble: 60 x 6
+##    orig             normalized       subdomain apex            domain  suffix  
+##    <chr>            <chr>            <chr>     <chr>           <chr>   <chr>   
+##  1 ""               ""               <NA>      <NA>            <NA>    ""      
+##  2 com              com              <NA>      <NA>            <NA>    com     
+##  3 example.com      example.com      ""        example.com     example com     
+##  4 www.example.com  www.example.com  www       example.com     example com     
+##  5 .com             .com             <NA>      <NA>            <NA>    .com    
+##  6 .example         .example         <NA>      <NA>            <NA>    .example
+##  7 .example.com     .example.com     <NA>      <NA>            <NA>    com     
+##  8 .example.example .example.example <NA>      <NA>            <NA>    example 
+##  9 example          example          <NA>      <NA>            <NA>    example 
+## 10 example.example  example.example  ""        example.example example example 
+## # ... with 50 more rows
+
+suffix_extract2(doms) # urltools compatible output
+## # A tibble: 60 x 4
+##    host             subdomain domain  suffix  
+##    <chr>            <chr>     <chr>   <chr>   
+##  1 ""               <NA>      <NA>    ""      
+##  2 com              <NA>      <NA>    com     
+##  3 example.com      ""        example com     
+##  4 www.example.com  www       example com     
+##  5 .com             <NA>      <NA>    .com    
+##  6 .example         <NA>      <NA>    .example
+##  7 .example.com     <NA>      <NA>    com     
+##  8 .example.example <NA>      <NA>    example 
+##  9 example          <NA>      <NA>    example 
+## 10 example.example  ""        example example 
+## # ... with 50 more rows
+```
+
+``` r
+library(microbenchmark)
+
+microbenchmark(
+  urltools = urltools::suffix_extract(doms),
+  psl = psl::suffix_extract(doms), # returns more data
+  psl2 = psl::suffix_extract2(doms) # returns what urltools does
+) -> mb
+
+autoplot(mb)
+```
+
+<img src="README_files/figure-gfm/bench-1.png" width="960" />
--- a/README_files/figure-gfm/bench-1.png
+++ b/README_files/figure-gfm/bench-1.png
--- a/inst/dat/public_suffix_list.dat
+++ b/inst/dat/public_suffix_list.dat
--- a/man/apex_domain.Rd
+++ b/man/apex_domain.Rd
@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/RcppExports.R
+\name{apex_domain}
+\alias{apex_domain}
+\title{Return the apex/top-private domain from a vector of domains}
+\usage{
+apex_domain(domains)
+}
+\arguments{
+\item{domains}{character vector of domains}
+}
+\value{
+character vector
+}
+\description{
+Return the apex/top-private domain from a vector of domains
+}
--- a/man/is_public_suffix.Rd
+++ b/man/is_public_suffix.Rd
@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/RcppExports.R
+\name{is_public_suffix}
+\alias{is_public_suffix}
+\title{Test whether a domain is a public suffix}
+\usage{
+is_public_suffix(domains)
+}
+\arguments{
+\item{domains}{character vector of domains}
+}
+\value{
+logical vector
+}
+\description{
+Test whether a domain is a public suffix
+}
--- a/man/psl.Rd
+++ b/man/psl.Rd
@ -4,9 +4,18 @@
 \name{psl}
 \alias{psl}
 \alias{psl-package}
-\title{...}
+\title{Extract Internet Domain Components Using the Public Suffix List}
 \description{
+The 'Public Suffix List' (\url{https://publicsuffix.org/}) is a collection
+of top-level domains ('TLDs') which include generic top-level domains ('gTLDs')
+such as '.com' and '.net'; country top-level domains ('ccTLDs') such as '.de' and
+'.cn'; and, brand top-level domains such as '.apple' and '.google'. Tools are provided
+to extract internet domain components using the public suffix list base data.
+}
+\details{
 \itemize{
+\item \code{libpsl}: \url{https://github.com/rockdaboot/libpsl}
+\item Public Suffix List: \url{https://publicsuffix.org/}
 \item URL: \url{https://gitlab.com/hrbrmstr/psl}
 \item BugReports: \url{https://gitlab.com/hrbrmstr/psl/issues}
 }
--- a/man/public_suffix.Rd
+++ b/man/public_suffix.Rd
@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/RcppExports.R
+\name{public_suffix}
+\alias{public_suffix}
+\title{Return the public suffix from a vector of domains}
+\usage{
+public_suffix(domains)
+}
+\arguments{
+\item{domains}{character vector of domains}
+}
+\value{
+character vector
+}
+\description{
+Return the public suffix from a vector of domains
+}
--- a/man/suffix_extract.Rd
+++ b/man/suffix_extract.Rd
@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/RcppExports.R
+\name{suffix_extract}
+\alias{suffix_extract}
+\title{Separate a domain into component parts}
+\usage{
+suffix_extract(domains)
+}
+\arguments{
+\item{domains}{character vector of domains}
+}
+\value{
+data frame
+}
+\description{
+Separate a domain into component parts
+}
--- a/man/suffix_extract2.Rd
+++ b/man/suffix_extract2.Rd
@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/RcppExports.R
+\name{suffix_extract2}
+\alias{suffix_extract2}
+\title{Separate a domain into component parts}
+\usage{
+suffix_extract2(domains)
+}
+\arguments{
+\item{domains}{character vector of domains}
+}
+\value{
+data frame
+}
+\description{
+Compatibility function for those using \code{urltools::suffix_extract()}
+}
--- a/src/.gitignore
+++ b/src/.gitignore
@ -0,0 +1,3 @@
+*.o
+*.so
+*.dll
--- a/src/Makevars
+++ b/src/Makevars
@ -0,0 +1,3 @@
+CXX_STD = CXX11
+PKG_CXXFLAGS =
+PKG_LIBS = -L. -liconv -lidn2
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@ -0,0 +1,76 @@
+// Generated by using Rcpp::compileAttributes() -> do not edit by hand
+// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
+
+#include <Rcpp.h>
+
+using namespace Rcpp;
+
+// apex_domain: .Call entry point; converts domainsSEXP to a CharacterVector, calls apex_domain() and returns the wrapped result
+CharacterVector apex_domain(CharacterVector domains);
+RcppExport SEXP _psl_apex_domain(SEXP domainsSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< CharacterVector >::type domains(domainsSEXP);
+    rcpp_result_gen = Rcpp::wrap(apex_domain(domains));
+    return rcpp_result_gen;
+END_RCPP
+}
+// public_suffix: .Call entry point; converts domainsSEXP to a CharacterVector, calls public_suffix() and returns the wrapped result
+CharacterVector public_suffix(CharacterVector domains);
+RcppExport SEXP _psl_public_suffix(SEXP domainsSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< CharacterVector >::type domains(domainsSEXP);
+    rcpp_result_gen = Rcpp::wrap(public_suffix(domains));
+    return rcpp_result_gen;
+END_RCPP
+}
+// is_public_suffix: .Call entry point; converts domainsSEXP to a CharacterVector, calls is_public_suffix() and wraps its std::vector<bool> result
+std::vector< bool > is_public_suffix(CharacterVector domains);
+RcppExport SEXP _psl_is_public_suffix(SEXP domainsSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< CharacterVector >::type domains(domainsSEXP);
+    rcpp_result_gen = Rcpp::wrap(is_public_suffix(domains));
+    return rcpp_result_gen;
+END_RCPP
+}
+// suffix_extract: .Call entry point; converts domainsSEXP to a CharacterVector, calls suffix_extract() and wraps its DataFrame result
+DataFrame suffix_extract(CharacterVector domains);
+RcppExport SEXP _psl_suffix_extract(SEXP domainsSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< CharacterVector >::type domains(domainsSEXP);
+    rcpp_result_gen = Rcpp::wrap(suffix_extract(domains));
+    return rcpp_result_gen;
+END_RCPP
+}
+// suffix_extract2: .Call entry point; converts domainsSEXP to a CharacterVector, calls suffix_extract2() and wraps its DataFrame result
+DataFrame suffix_extract2(CharacterVector domains);
+RcppExport SEXP _psl_suffix_extract2(SEXP domainsSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< CharacterVector >::type domains(domainsSEXP);
+    rcpp_result_gen = Rcpp::wrap(suffix_extract2(domains));
+    return rcpp_result_gen;
+END_RCPP
+}
+
+static const R_CallMethodDef CallEntries[] = { // .Call routines passed to R_registerRoutines(): {name, function pointer, argument count}
+    {"_psl_apex_domain", (DL_FUNC) &_psl_apex_domain, 1},
+    {"_psl_public_suffix", (DL_FUNC) &_psl_public_suffix, 1},
+    {"_psl_is_public_suffix", (DL_FUNC) &_psl_is_public_suffix, 1},
+    {"_psl_suffix_extract", (DL_FUNC) &_psl_suffix_extract, 1},
+    {"_psl_suffix_extract2", (DL_FUNC) &_psl_suffix_extract2, 1},
+    {NULL, NULL, 0} // all-NULL entry terminating the table
+};
+
+RcppExport void R_init_psl(DllInfo *dll) { // package init hook: registers the CallEntries table for this DLL
+    R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); // only .Call routines are supplied; the other tables are NULL
+    R_useDynamicSymbols(dll, FALSE); // turn off dynamic symbol lookup for this DLL
+}
--- a/src/config.h
+++ b/src/config.h
@ -0,0 +1,147 @@
+/* config.h.  Generated from config.h.in by configure.  */
+/* config.h.in.  Generated from configure.ac by autoheader.  */
+
+/* generate PSL data using libicu */
+/* #undef BUILTIN_GENERATOR_LIBICU */
+
+/* generate PSL data using libidn */
+/* #undef BUILTIN_GENERATOR_LIBIDN */
+
+/* generate PSL data using libidn2 */
+#define BUILTIN_GENERATOR_LIBIDN2 1
+
+/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP
+   systems. This function is required for `alloca.c' support on those systems.
+   */
+/* #undef CRAY_STACKSEG_END */
+
+/* Define to 1 if using `alloca.c'. */
+/* #undef C_ALLOCA */
+
+/* Define to 1 if translation of program messages to the user's native
+   language is requested. */
+/* #undef ENABLE_NLS */
+
+/* Define to 1 if you have `alloca', as a function or macro. */
+#define HAVE_ALLOCA 1
+
+/* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix).
+   */
+#define HAVE_ALLOCA_H 1
+
+/* Define to 1 if you have the MacOS X function CFLocaleCopyCurrent in the
+   CoreFoundation framework. */
+#define HAVE_CFLOCALECOPYCURRENT 1
+
+/* Define to 1 if you have the MacOS X function CFPreferencesCopyAppValue in
+   the CoreFoundation framework. */
+#define HAVE_CFPREFERENCESCOPYAPPVALUE 1
+
+/* Define to 1 if you have the `clock_gettime' function. */
+#define HAVE_CLOCK_GETTIME 1
+
+/* Define if the GNU dcgettext() function is already present or preinstalled.
+   */
+/* #undef HAVE_DCGETTEXT */
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#define HAVE_DLFCN_H 1
+
+/* Define to 1 if you have the `fmemopen' function. */
+#define HAVE_FMEMOPEN 1
+
+/* Define if the GNU gettext() function is already present or preinstalled. */
+/* #undef HAVE_GETTEXT */
+
+/* Define if you have the iconv() function and it works. */
+#define HAVE_ICONV 1
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define to 1 if you have the <memory.h> header file. */
+#define HAVE_MEMORY_H 1
+
+/* Define to 1 if you have the `nl_langinfo' function. */
+#define HAVE_NL_LANGINFO 1
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the `strndup' function. */
+#define HAVE_STRNDUP 1
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#define HAVE_UNISTD_H 1
+
+/* Define to 1 or 0, depending whether the compiler supports simple visibility
+   declarations. */
+#define HAVE_VISIBILITY 1
+
+/* Define as const if the declaration of iconv() needs const. */
+#define ICONV_CONST 
+
+/* Define to the sub-directory where libtool stores uninstalled libraries. */
+#define LT_OBJDIR ".libs/"
+
+/* Define to the address where bug reports for this package should be sent. */
+#define PACKAGE_BUGREPORT "tim.ruehsen@gmx.de"
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "libpsl"
+
+/* Define to the full name and version of this package. */
+#define PACKAGE_STRING "libpsl 0.20.2"
+
+/* Define to the one symbol short name of this package. */
+#define PACKAGE_TARNAME "libpsl"
+
+/* Define to the home page for this package. */
+#define PACKAGE_URL "https://github.com/rockdaboot/libpsl"
+
+/* Define to the version of this package. */
+#define PACKAGE_VERSION "0.20.2"
+
+/* If using the C implementation of alloca, define if you know the
+   direction of stack growth for your system; otherwise it will be
+   automatically deduced at runtime.
+	STACK_DIRECTION > 0 => grows toward higher addresses
+	STACK_DIRECTION < 0 => grows toward lower addresses
+	STACK_DIRECTION = 0 => direction of growth unknown */
+/* #undef STACK_DIRECTION */
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* generate PSL data using libicu */
+/* #undef WITH_LIBICU */
+
+/* generate PSL data using libidn */
+/* #undef WITH_LIBIDN */
+
+/* generate PSL data using libidn2 */
+#define WITH_LIBIDN2 1
+
+/* Define to `__inline__' or `__inline' if that's what the C compiler
+   calls it, or to nothing if 'inline' is not supported under any name.  */
+#ifndef __cplusplus
+/* #undef inline */
+#endif
+
+/* Define to `unsigned int' if <sys/types.h> does not define. */
+/* #undef size_t */
--- a/src/libpsl.h
+++ b/src/libpsl.h
@ -0,0 +1,212 @@
+/*
+ * Copyright(c) 2014-2018 Tim Ruehsen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * This file is part of libpsl.
+ *
+ * Header file for libpsl library routines
+ *
+ * Changelog
+ * 20.03.2014  Tim Ruehsen  created
+ *
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef LIBPSL_LIBPSL_H
+#define LIBPSL_LIBPSL_H
+
+#include <stdio.h>
+#include <time.h>
+
+#define PSL_VERSION "0.20.2"
+#define PSL_VERSION_MAJOR 0
+#define PSL_VERSION_MINOR 20
+#define PSL_VERSION_PATCH 2
+#define PSL_VERSION_NUMBER 0x001402
+
+#ifndef PSL_API
+#if defined BUILDING_PSL && HAVE_VISIBILITY
+#  define PSL_API __attribute__ ((__visibility__("default")))
+#elif defined BUILDING_PSL && defined _MSC_VER && !defined PSL_STATIC
+#  define PSL_API __declspec(dllexport)
+#elif defined _MSC_VER && !defined PSL_STATIC
+#  define PSL_API __declspec(dllimport)
+#else
+#  define PSL_API
+#endif
+#endif
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+/* types for psl_is_public_suffix2() */
+#define PSL_TYPE_ICANN        (1<<0)
+#define PSL_TYPE_PRIVATE      (1<<1)
+#define PSL_TYPE_NO_STAR_RULE (1<<2)
+#define PSL_TYPE_ANY          (PSL_TYPE_ICANN | PSL_TYPE_PRIVATE)
+
+/**
+ * psl_error_t:
+ * @PSL_SUCCESS: Successful return.
+ * @PSL_ERR_INVALID_ARG: Invalid argument.
+ * @PSL_ERR_CONVERTER: Failed to open libicu utf-16 converter.
+ * @PSL_ERR_TO_UTF16: Failed to convert to utf-16.
+ * @PSL_ERR_TO_LOWER: Failed to convert utf-16 to lowercase.
+ * @PSL_ERR_TO_UTF8: Failed to convert utf-16 to utf-8.
+ * @PSL_ERR_NO_MEM: Failed to allocate memory.
+ *
+ * Return codes for PSL functions.
+ * Negative return codes mean failure.
+ * Positive values are reserved for non-error return codes.
+ */
+typedef enum {
+	PSL_SUCCESS = 0,
+	PSL_ERR_INVALID_ARG = -1,
+	PSL_ERR_CONVERTER = -2, /* failed to open libicu utf-16 converter */
+	PSL_ERR_TO_UTF16 = -3,  /* failed to convert to utf-16 */
+	PSL_ERR_TO_LOWER = -4,  /* failed to convert utf-16 to lowercase */
+	PSL_ERR_TO_UTF8 = -5,   /* failed to convert utf-16 to utf-8 */
+	PSL_ERR_NO_MEM = -6    /* failed to allocate memory */
+} psl_error_t;
+
+typedef struct _psl_ctx_st psl_ctx_t;
+
+/* frees PSL context */
+PSL_API
+void
+	psl_free(psl_ctx_t *psl);
+
+/* frees memory allocated by libpsl routines */
+PSL_API
+void
+	psl_free_string(char *str);
+
+/* loads PSL data from file */
+PSL_API
+psl_ctx_t *
+	psl_load_file(const char *fname);
+
+/* loads PSL data from FILE pointer */
+PSL_API
+psl_ctx_t *
+	psl_load_fp(FILE *fp);
+
+/* retrieves builtin PSL data */
+PSL_API
+const psl_ctx_t *
+	psl_builtin(void);
+
+/* retrieves most recent PSL data */
+PSL_API
+psl_ctx_t *
+	psl_latest(const char *fname);
+
+/* checks whether domain is a public suffix or not */
+PSL_API
+int
+	psl_is_public_suffix(const psl_ctx_t *psl, const char *domain);
+
+/* checks whether domain is a public suffix regarding the type or not */
+PSL_API
+int
+	psl_is_public_suffix2(const psl_ctx_t *psl, const char *domain, int type);
+
+/* checks whether cookie_domain is acceptable for domain or not */
+PSL_API
+int
+	psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname, const char *cookie_domain);
+
+/* returns the longest not registrable domain within 'domain' or NULL if none found */
+PSL_API
+const char *
+	psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain);
+
+/* returns the shortest possible registrable domain part or NULL if domain is not registrable at all */
+PSL_API
+const char *
+	psl_registrable_domain(const psl_ctx_t *psl, const char *domain);
+
+/* convert a string into lowercase UTF-8 */
+PSL_API
+psl_error_t
+	psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale, char **lower);
+
+/* does not include exceptions */
+PSL_API
+int
+	psl_suffix_count(const psl_ctx_t *psl);
+
+/* just counts exceptions */
+PSL_API
+int
+	psl_suffix_exception_count(const psl_ctx_t *psl);
+
+/* just counts wildcards */
+PSL_API
+int
+	psl_suffix_wildcard_count(const psl_ctx_t *psl);
+
+/* returns mtime of PSL source file */
+PSL_API
+time_t
+	psl_builtin_file_time(void);
+
+/* returns SHA1 checksum (hex-encoded, lowercase) of PSL source file */
+PSL_API
+const char *
+	psl_builtin_sha1sum(void);
+
+/* returns file name of PSL source file */
+PSL_API
+const char *
+	psl_builtin_filename(void);
+
+/* returns name of distribution PSL data file */
+PSL_API
+const char *
+	psl_dist_filename(void);
+
+/* returns library version string */
+PSL_API
+const char *
+	psl_get_version(void);
+
+/* checks library version number */
+PSL_API
+int
+	psl_check_version_number(int version);
+
+/* returns whether the built-in data is outdated or not */
+PSL_API
+int
+	psl_builtin_outdated(void);
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif /* LIBPSL_LIBPSL_H */
+
+#ifdef __cplusplus
+}
+#endif
--- a/src/lookup_string_in_fixed_set.c
+++ b/src/lookup_string_in_fixed_set.c
@ -0,0 +1,279 @@
+/* Copyright 2015-2016 The Chromium Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE.chromium file.
+ *
+ * Converted to C89 2015 by Tim Rühsen
+ */
+
+#include <stddef.h>
+
+/*
+ * _GCC_VERSION_AT_LEAST(major, minor) is non-zero when __GNUC__ and
+ * __GNUC_MINOR__ are defined and describe version major.minor or newer;
+ * it is the constant 0 when either macro is missing.
+ */
+#if !defined(__GNUC__) || !defined(__GNUC_MINOR__)
+#       define _GCC_VERSION_AT_LEAST(major, minor) 0
+#else
+#       define _GCC_VERSION_AT_LEAST(major, minor) \
+	((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
+#endif
+
+/*
+ * Bounds guard: makes the enclosing function return 0 unless a < b.
+ * Both arguments are parenthesized so that expression arguments (for
+ * example a conditional expression) compare as a whole.
+ */
+#define CHECK_LT(a, b) if ((a) >= (b)) return 0
+
+/* Multibyte sequence length for a byte, indexed by the byte's high nibble. */
+static const char multibyte_length_table[16] = {
+	0, 0, 0, 0, 0, 0, 0, 0,	/* 0x00-0x7F */
+	0, 0, 0, 0,		/* 0x80-0xBF */
+	2, 2,			/* 0xC0-0xDF */
+	3,			/* 0xE0-0xEF */
+	4			/* 0xF0-0xFF */
+};
+
+
+/*
+ * Length of the multibyte character sequence introduced by byte c.
+ * Yields zero when c is not a valid UTF-8 leading byte.
+ */
+static int GetMultibyteLength(char c) {
+	const unsigned int high_nibble = ((unsigned char)c) >> 4;
+
+	return multibyte_length_table[high_nibble];
+}
+
+/*
+ * Advances the graph cursor by one byte and moves the key cursor to match.
+ *
+ * pos             - graph cursor; always incremented by one
+ * key             - key cursor
+ * multibyte_start - non-null while the key is inside a multibyte sequence;
+ *                   points at that sequence's leading byte
+ */
+static void NextPos(const unsigned char** pos,
+	const char** key,
+	const char** multibyte_start)
+{
+	const char* sequence = *multibyte_start;
+
+	*pos += 1;
+
+	if (!sequence) {
+		if (GetMultibyteLength(**key)) {
+			/* The dafsa matched a multibyte prefix: leave key where it
+			 * is and match the sequence bytes from the next round on. */
+			*multibyte_start = *key;
+		} else {
+			/* A single byte character was matched. */
+			*key += 1;
+		}
+		return;
+	}
+
+	/* Inside a multibyte sequence: step to its next byte. */
+	*key += 1;
+
+	/* Leave multibyte mode once the whole sequence has been consumed. */
+	if (*key - sequence == GetMultibyteLength(*sequence))
+		*multibyte_start = 0;
+}
+
+/*
+ * Read next offset from pos.
+ *
+ * pos    - cursor into the offset list; advanced past the bytes read, or set
+ *          to end when the byte just read has its top bit (0x80) set
+ * end    - one past the last byte of the graph
+ * offset - running position; the decoded value is ADDED to it
+ *
+ * The encoding is selected by bits 0x60 of the first byte: 0x60 is a three
+ * byte offset, 0x40 a two byte offset, anything else a one byte offset.
+ *
+ * Returns true if an offset could be read, false otherwise.
+ */
+
+static int GetNextOffset(const unsigned char** pos,
+	const unsigned char* end,
+	const unsigned char** offset)
+{
+	size_t bytes_consumed;
+
+	if (*pos == end)
+		return 0;
+
+	/* When reading an offset the byte array must always contain at least
+	 * three more bytes to consume. First the offset to read, then a node
+	 * to skip over and finally a destination node. No object can be smaller
+	 * than one byte.
+	 * The remaining length is compared instead of computing *pos + 2, since
+	 * that pointer could lie beyond one past the end of the array. */
+	if (end - *pos <= 2)
+		return 0;
+	switch (**pos & 0x60) {
+	case 0x60: /* Read three byte offset */
+		*offset += (((*pos)[0] & 0x1F) << 16) | ((*pos)[1] << 8) | (*pos)[2];
+		bytes_consumed = 3;
+		break;
+	case 0x40: /* Read two byte offset */
+		*offset += (((*pos)[0] & 0x1F) << 8) | (*pos)[1];
+		bytes_consumed = 2;
+		break;
+	default:
+		*offset += (*pos)[0] & 0x3F;
+		bytes_consumed = 1;
+	}
+	if ((**pos & 0x80) != 0) {
+		/* Top bit set: this was the last offset in the list. */
+		*pos = end;
+	} else {
+		*pos += bytes_consumed;
+	}
+	return 1;
+}
+
+/*
+ * Tells whether the byte at offset is the last one in a label, i.e. has its
+ * top bit set. Yields 0 when offset is not below end.
+ */
+
+static int IsEOL(const unsigned char* offset, const unsigned char* end)
+{
+	if (offset >= end)
+		return 0;
+
+	return *offset > 0x7F ? 1 : 0;
+}
+
+/*
+ * Compares one graph byte (matcher) against the current key byte.
+ * No range check is done here; callers must have performed it already.
+ *
+ * multibyte_start is non-null while a multibyte sequence of the key is being
+ * matched and then points at the sequence's leading byte.
+ */
+
+static int IsMatchUnchecked(const unsigned char matcher,
+	const char* key,
+	const char* multibyte_start)
+{
+	const unsigned char key_byte = (const unsigned char)*key;
+
+	if (multibyte_start) {
+		/* Multibyte matching mode: the leading byte is stored XORed with
+		 * 0x80 (matching it also matches the sequence length), the
+		 * following bytes are stored XORed with 0xC0. */
+		const int flip = (multibyte_start == key) ? 0x80 : 0xC0;
+
+		return (matcher ^ flip) == key_byte;
+	}
+
+	/* A multibyte leading byte in the key while not yet in multibyte mode
+	 * must be announced in the dafsa by the special mode switch byte. */
+	if (GetMultibyteLength(*key))
+		return matcher == 0x1F;
+
+	/* Plain single byte character. */
+	return matcher == key_byte;
+}
+
+/*
+ * Range-checked match of the byte at offset against the current key byte,
+ * for characters that are not last in a label.
+ * Yields 0 when offset is not below end.
+ */
+
+static int IsMatch(const unsigned char* offset,
+	const unsigned char* end,
+	const char* key,
+	const char* multibyte_start)
+{
+	if (offset >= end)
+		return 0;
+
+	return IsMatchUnchecked(offset[0], key, multibyte_start);
+}
+
+/*
+ * Range-checked match of the byte at offset against the current key byte,
+ * for characters that are last in a label: the top bit of the graph byte is
+ * flipped before comparing.
+ * Yields 0 when offset is not below end.
+ */
+
+static int IsEndCharMatch(const unsigned char* offset,
+	const unsigned char* end,
+	const char* key,
+	const char* multibyte_start)
+{
+	unsigned char label_char;
+
+	if (offset >= end)
+		return 0;
+
+	label_char = (unsigned char)(*offset ^ 0x80);
+	return IsMatchUnchecked(label_char, key, multibyte_start);
+}
+
+/*
+ * Reads the return value stored at offset into *return_value.
+ *
+ * A return value is only accepted outside multibyte mode and when the top
+ * three bits of the byte are 100; the value is the byte's low four bits.
+ * Returns true if a return value could be read, false otherwise (in which
+ * case *return_value is left untouched).
+ */
+
+static int GetReturnValue(const unsigned char* offset,
+	const unsigned char* end,
+	const char* multibyte_start,
+	int* return_value)
+{
+	if (offset >= end || multibyte_start)
+		return 0;
+
+	if ((*offset & 0xE0) != 0x80)
+		return 0;
+
+	*return_value = *offset & 0x0F;
+	return 1;
+}
+
+/*
+ * Looks up the string |key| with length |key_length| in a fixed set of
+ * strings. The set of strings must be known at compile time. It is converted to
+ * a graph structure named a DAFSA (Deterministic Acyclic Finite State
+ * Automaton) by the script psl-make-dafsa during compilation. This permits
+ * efficient (in time and space) lookup. The graph generated by psl-make-dafsa
+ * takes the form of a constant byte array which should be supplied via the
+ * |graph| and |length| parameters.  The return value is kDafsaNotFound,
+ * kDafsaFound, or a bitmap consisting of one or more of kDafsaExceptionRule,
+ * kDafsaWildcardRule and kDafsaPrivateRule ORed together.
+ *
+ * NOTE(review): the kDafsa* names are not defined in this file. As written,
+ * the function returns -1 when |key| is not found and otherwise the low four
+ * bits of the matching return-value byte (see GetReturnValue).
+ *
+ * Parameters:
+ *   graph      - first byte of the DAFSA byte array
+ *   length     - number of bytes in |graph|
+ *   key        - string to look up
+ *   key_length - number of bytes of |key| to match
+ *
+ * Lookup a domain key in a byte array generated by psl-make-dafsa.
+ */
+
+/* prototype to skip warning with -Wmissing-prototypes */
+int LookupStringInFixedSet(const unsigned char*, size_t,const char*, size_t);
+
+int LookupStringInFixedSet(const unsigned char* graph,
+	size_t length,
+	const char* key,
+	size_t key_length)
+{
+	/* Cursor over the offset list of the node currently being examined. */
+	const unsigned char* pos = graph;
+	const unsigned char* end = graph + length;
+	/* Running child position; GetNextOffset adds each decoded value to it. */
+	const unsigned char* offset = pos;
+	const char* key_end = key + key_length;
+	/* Non-null while inside a multibyte sequence of |key| (see NextPos). */
+	const char* multibyte_start = 0;
+
+	/* Each iteration tries one child of the current node. */
+	while (GetNextOffset(&pos, end, &offset)) {
+		/* Possible shapes of the child at |offset|:
+		 * char <char>+ end_char offsets
+		 * char <char>+ return value
+		 * char end_char offsets
+		 * char return value
+		 * end_char offsets
+		 * return_value
+		 */
+		/* Set once at least one <char> of this child matched the key. */
+		int did_consume = 0;
+
+		if (key != key_end && !IsEOL(offset, end)) {
+			/* Leading <char> is not a match. Don't dive into this child */
+			if (!IsMatch(offset, end, key, multibyte_start))
+				continue;
+			did_consume = 1;
+			NextPos(&offset, &key, &multibyte_start);
+			/* Possible matches at this point:
+			 * <char>+ end_char offsets
+			 * <char>+ return value
+			 * end_char offsets
+			 * return value
+			 */
+
+			/* Remove all remaining <char> nodes possible */
+			while (!IsEOL(offset, end) && key != key_end) {
+				/* A mismatch after the first <char> matched is terminal. */
+				if (!IsMatch(offset, end, key, multibyte_start))
+					return -1;
+				NextPos(&offset, &key, &multibyte_start);
+			}
+		}
+		/* Possible matches at this point:
+		 * end_char offsets
+		 * return_value
+		 * If one or more <char> elements were consumed, a failure
+		 * to match is terminal. Otherwise, try the next node.
+		 */
+		if (key == key_end) {
+			/* The whole key is consumed: only a return value node can
+			 * complete the match here. */
+			int return_value;
+
+			if (GetReturnValue(offset, end, multibyte_start, &return_value))
+				return return_value;
+			/* The DAFSA guarantees that if the first char is a match, all
+			 * remaining char elements MUST match if the key is truly present.
+			 */
+			if (did_consume)
+				return -1;
+			continue;
+		}
+		if (!IsEndCharMatch(offset, end, key, multibyte_start)) {
+			if (did_consume)
+				return -1; /* Unexpected */
+			continue;
+		}
+		NextPos(&offset, &key, &multibyte_start);
+		/* The matched child's own offset list is read from |offset| next. */
+		pos = offset; /* Dive into child */
+	}
+
+	return -1; /* No match */
+}
+
+/* prototype to skip warning with -Wmissing-prototypes */
+int GetUtfMode(const unsigned char *graph, size_t length);
+
+/*
+ * Yields 1 when the graph is non-empty and its final byte is below 0x80,
+ * 0 otherwise.
+ */
+int GetUtfMode(const unsigned char *graph, size_t length)
+{
+	if (length == 0)
+		return 0;
+
+	return graph[length - 1] < 0x80 ? 1 : 0;
+}
--- a/src/psl-main.cpp
+++ b/src/psl-main.cpp
@ -0,0 +1,350 @@
+#include <Rcpp.h>
+
+#include <regex>
+
+#include "libpsl.h"
+
+using namespace Rcpp;
+
+//' Return the apex/top-private domain from a vector of domains
+//'
+//' Each element has one trailing period removed (if present) and is
+//' lowercased before it is looked up in the built-in public suffix list.
+//'
+//' @md
+//' @param domains character vector of domains
+//' @return character vector the same length as `domains`; `NA` where the
+//'   input is `NA`, cannot be lowercased, or has no registrable domain
+//' @export
+// [[Rcpp::export]]
+CharacterVector apex_domain(CharacterVector domains) {
+
+  const R_xlen_t input_size = domains.size();
+  CharacterVector output(input_size);
+  const psl_ctx_t *psl = psl_builtin();
+
+  for (R_xlen_t i = 0; i < input_size; i++) {
+
+    // NA until a registrable domain is found
+    output[i] = NA_STRING;
+
+    // as<std::string>() would turn a missing value into the literal "NA"
+    if (STRING_ELT(domains, i) == NA_STRING) continue;
+
+    // remove trailing period if any
+    std::string cleaned = Rcpp::as<std::string>(domains[i]);
+    if (!cleaned.empty() && cleaned.back() == '.') cleaned.pop_back();
+
+    // Declared per iteration: if the conversion fails and leaves the
+    // pointer untouched, a pointer freed in an earlier iteration must not
+    // be freed a second time.
+    char *lower = NULL;
+
+    // lowercase it
+    const int rc = psl_str_to_utf8lower(
+      cleaned.c_str(),
+      "utf-8", "en",
+      &lower
+    );
+
+    if (rc == PSL_SUCCESS) {
+      const char *result = psl_registrable_domain(psl, lower);
+      // copy into the output before `lower` is released below
+      if (result) output[i] = std::string(result);
+    }
+
+    if (lower) psl_free_string(lower);
+
+  }
+
+  return(output);
+
+}
+
+//' Return the public suffix from a vector of domains
+//'
+//' Each element has one trailing period removed (if present) and is
+//' lowercased before it is looked up in the built-in public suffix list.
+//'
+//' @md
+//' @param domains character vector of domains
+//' @return character vector the same length as `domains`; `NA` where the
+//'   input is `NA`, cannot be lowercased, or no suffix is found
+//' @export
+// [[Rcpp::export]]
+CharacterVector public_suffix(CharacterVector domains) {
+
+  const R_xlen_t input_size = domains.size();
+  CharacterVector output(input_size);
+  const psl_ctx_t *psl = psl_builtin();
+
+  for (R_xlen_t i = 0; i < input_size; i++) {
+
+    // NA until a suffix is found
+    output[i] = NA_STRING;
+
+    // as<std::string>() would turn a missing value into the literal "NA",
+    // which would then be looked up as the domain "na"
+    if (STRING_ELT(domains, i) == NA_STRING) continue;
+
+    // remove trailing period if any
+    std::string cleaned = Rcpp::as<std::string>(domains[i]);
+    if (!cleaned.empty() && cleaned.back() == '.') cleaned.pop_back();
+
+    // Declared per iteration: if the conversion fails and leaves the
+    // pointer untouched, a pointer freed in an earlier iteration must not
+    // be freed a second time.
+    char *lower = NULL;
+
+    // lowercase it
+    const int rc = psl_str_to_utf8lower(
+      cleaned.c_str(),
+      "utf-8", "en",
+      &lower
+    );
+
+    if (rc == PSL_SUCCESS) {
+      const char *result = psl_unregistrable_domain(psl, lower);
+      // copy into the output before `lower` is released below
+      if (result) output[i] = std::string(result);
+    }
+
+    if (lower) psl_free_string(lower);
+
+  }
+
+  return(output);
+
+}
+
+//' Test whether a domain is a public suffix
+//'
+//' Each element has one trailing period removed (if present) and is
+//' lowercased before it is tested against the built-in public suffix list.
+//'
+//' @md
+//' @param domains character vector of domains
+//' @return logical vector the same length as `domains`. `NA` input and input
+//'   that cannot be lowercased yield `FALSE`, because the underlying
+//'   `std::vector<bool>` result cannot hold `NA`.
+//' @export
+// [[Rcpp::export]]
+std::vector< bool > is_public_suffix(CharacterVector domains) {
+
+  const R_xlen_t input_size = domains.size();
+  // false unless the lookup positively identifies a public suffix
+  std::vector < bool > output(input_size, false);
+  const psl_ctx_t *psl = psl_builtin();
+
+  for (R_xlen_t i = 0; i < input_size; i++) {
+
+    // as<std::string>() would turn a missing value into the literal "NA",
+    // which would then be tested as the domain "na"
+    if (STRING_ELT(domains, i) == NA_STRING) continue;
+
+    // remove trailing period if any
+    std::string cleaned = Rcpp::as<std::string>(domains[i]);
+    if (!cleaned.empty() && cleaned.back() == '.') cleaned.pop_back();
+
+    // Declared per iteration: if the conversion fails and leaves the
+    // pointer untouched, a pointer freed in an earlier iteration must not
+    // be freed a second time.
+    char *lower = NULL;
+
+    // lowercase it
+    const int rc = psl_str_to_utf8lower(
+      cleaned.c_str(),
+      "utf-8", "en",
+      &lower
+    );
+
+    // On failure the element stays false: storing NA_LOGICAL in a bool
+    // would convert to true.
+    if (rc == PSL_SUCCESS) {
+      output[i] = (psl_is_public_suffix(psl, lower) == 1);
+    }
+
+    if (lower) psl_free_string(lower);
+
+  }
+
+  return(output);
+
+}
+
+//' Separate a domain into component parts
+//'
+//' Each element has one trailing period removed (if present) and is
+//' lowercased before it is looked up in the built-in public suffix list.
+//'
+//' @md
+//' @param domains character vector of domains
+//' @return data frame with columns `orig`, `normalized`, `subdomain`, `apex`,
+//'   `domain` and `suffix`; components that cannot be determined (including
+//'   all components of an `NA` input) are `NA`
+//' @export
+// [[Rcpp::export]]
+DataFrame suffix_extract(CharacterVector domains) {
+
+  const R_xlen_t input_size = domains.size();
+
+  CharacterVector normalized(input_size);
+  CharacterVector subdomain(input_size);
+  CharacterVector apex(input_size);
+  CharacterVector domain(input_size);
+  CharacterVector suffix(input_size);
+
+  const psl_ctx_t *psl = psl_builtin();
+
+  // Removes `tail` from the end of `s` (when `s` ends with it) together with
+  // any periods directly in front of it. Plain string comparison is used so
+  // that characters taken from the input are never interpreted as a pattern.
+  auto strip_tail = [](std::string s, const std::string &tail) -> std::string {
+    if (s.size() >= tail.size() &&
+        s.compare(s.size() - tail.size(), tail.size(), tail) == 0) {
+      s.erase(s.size() - tail.size());
+      while (!s.empty() && s.back() == '.') s.pop_back();
+    }
+    return s;
+  };
+
+  for (R_xlen_t i = 0; i < input_size; i++) {
+
+    // every component is NA until it has been determined
+    normalized[i] = NA_STRING;
+    subdomain[i] = NA_STRING;
+    apex[i] = NA_STRING;
+    domain[i] = NA_STRING;
+    suffix[i] = NA_STRING;
+
+    // as<std::string>() would turn a missing value into the literal "NA"
+    if (STRING_ELT(domains, i) == NA_STRING) continue;
+
+    // remove trailing period if any
+    std::string cleaned = Rcpp::as<std::string>(domains[i]);
+    if (!cleaned.empty() && cleaned.back() == '.') cleaned.pop_back();
+
+    // Declared per iteration: if the conversion fails and leaves the
+    // pointer untouched, a pointer freed in an earlier iteration must not
+    // be freed a second time.
+    char *lower = NULL;
+
+    // lowercase it
+    const int rc = psl_str_to_utf8lower(
+      cleaned.c_str(),
+      "utf-8", "en",
+      &lower
+    );
+
+    if (rc == PSL_SUCCESS) {
+
+      // no dots at end and lowercased
+      const std::string norm(lower);
+      normalized[i] = norm;
+
+      // try to get the suffix and the apex
+      const char *suffix_ptr = psl_unregistrable_domain(psl, lower);
+      const char *apex_ptr = psl_registrable_domain(psl, lower);
+
+      if (suffix_ptr) suffix[i] = std::string(suffix_ptr);
+      if (apex_ptr) apex[i] = std::string(apex_ptr);
+
+      if (suffix_ptr && apex_ptr) {
+        const std::string suffix_str(suffix_ptr);
+        const std::string apex_str(apex_ptr);
+
+        // domain: the apex without its suffix
+        domain[i] = strip_tail(apex_str, suffix_str);
+        // subdomain: whatever precedes the apex
+        subdomain[i] = strip_tail(norm, apex_str);
+      }
+
+    }
+
+    if (lower) psl_free_string(lower);
+
+  }
+
+  DataFrame out = DataFrame::create(
+    _["orig"] = domains,
+    _["normalized"] = normalized,
+    _["subdomain"] = subdomain,
+    _["apex"] = apex,
+    _["domain"] = domain,
+    _["suffix"] = suffix,
+    _["stringsAsFactors"] = false
+  );
+
+  out.attr("class") = CharacterVector::create("tbl_df", "tbl", "data.frame");
+
+  return(out);
+
+}
+
+//' Separate a domain into component parts
+//'
+//' Compatibility function for those using `urltools::suffix_extract()`
+//'
+//' Each element has one trailing period removed (if present) and is
+//' lowercased before it is looked up in the built-in public suffix list.
+//'
+//' @md
+//' @param domains character vector of domains
+//' @return data frame with columns `host`, `subdomain`, `domain` and
+//'   `suffix`; components that cannot be determined (including all
+//'   components of an `NA` input) are `NA`
+//' @export
+// [[Rcpp::export]]
+DataFrame suffix_extract2(CharacterVector domains) {
+
+  const R_xlen_t input_size = domains.size();
+
+  CharacterVector subdomain(input_size);
+  CharacterVector domain(input_size);
+  CharacterVector suffix(input_size);
+
+  const psl_ctx_t *psl = psl_builtin();
+
+  // Removes `tail` from the end of `s` (when `s` ends with it) together with
+  // any periods directly in front of it. Plain string comparison is used so
+  // that characters taken from the input are never interpreted as a pattern.
+  auto strip_tail = [](std::string s, const std::string &tail) -> std::string {
+    if (s.size() >= tail.size() &&
+        s.compare(s.size() - tail.size(), tail.size(), tail) == 0) {
+      s.erase(s.size() - tail.size());
+      while (!s.empty() && s.back() == '.') s.pop_back();
+    }
+    return s;
+  };
+
+  for (R_xlen_t i = 0; i < input_size; i++) {
+
+    // every component is NA until it has been determined
+    subdomain[i] = NA_STRING;
+    domain[i] = NA_STRING;
+    suffix[i] = NA_STRING;
+
+    // as<std::string>() would turn a missing value into the literal "NA"
+    if (STRING_ELT(domains, i) == NA_STRING) continue;
+
+    // remove trailing period if any
+    std::string cleaned = Rcpp::as<std::string>(domains[i]);
+    if (!cleaned.empty() && cleaned.back() == '.') cleaned.pop_back();
+
+    // Declared per iteration: if the conversion fails and leaves the
+    // pointer untouched, a pointer freed in an earlier iteration must not
+    // be freed a second time.
+    char *lower = NULL;
+
+    // lowercase it
+    const int rc = psl_str_to_utf8lower(
+      cleaned.c_str(),
+      "utf-8", "en",
+      &lower
+    );
+
+    if (rc == PSL_SUCCESS) {
+
+      const std::string normalized(lower);
+
+      // try to get the suffix; without one nothing else is reported
+      const char *suffix_ptr = psl_unregistrable_domain(psl, lower);
+
+      if (suffix_ptr) {
+
+        const std::string suffix_str(suffix_ptr);
+        suffix[i] = suffix_str;
+
+        const char *apex_ptr = psl_registrable_domain(psl, lower);
+
+        if (apex_ptr) {
+          const std::string apex_str(apex_ptr);
+
+          // domain: the apex without its suffix
+          domain[i] = strip_tail(apex_str, suffix_str);
+          // subdomain: whatever precedes the apex
+          subdomain[i] = strip_tail(normalized, apex_str);
+        }
+
+      }
+
+    }
+
+    if (lower) psl_free_string(lower);
+
+  }
+
+  DataFrame out = DataFrame::create(
+    _["host"] = domains,
+    _["subdomain"] = subdomain,
+    _["domain"] = domain,
+    _["suffix"] = suffix,
+    _["stringsAsFactors"] = false
+  );
+
+  out.attr("class") = CharacterVector::create("tbl_df", "tbl", "data.frame");
+
+  return(out);
+
+}
--- a/src/psl.c
+++ b/src/psl.c
--- a/src/suffixes_dafsa.c
+++ b/src/suffixes_dafsa.c