Browse Source

initial commit

latest
boB Rudis 2 years ago
parent
commit
ce4475df5e
No known key found for this signature in database GPG Key ID: 1D7529BE14E2BBA9
24 changed files with 19681 additions and 15 deletions
  1. +13
    -6
      DESCRIPTION
  2. +2
    -0
      LICENSE
  3. +7
    -2
      NAMESPACE
  4. +55
    -0
      R/RcppExports.R
  5. +15
    -6
      R/psl-package.R
  6. +59
    -0
      README.Rmd
  7. +136
    -0
      README.md
  8. BIN
      README_files/figure-gfm/bench-1.png
  9. +12388
    -0
      inst/dat/public_suffix_list.dat
  10. +17
    -0
      man/apex_domain.Rd
  11. +17
    -0
      man/is_public_suffix.Rd
  12. +10
    -1
      man/psl.Rd
  13. +17
    -0
      man/public_suffix.Rd
  14. +17
    -0
      man/suffix_extract.Rd
  15. +17
    -0
      man/suffix_extract2.Rd
  16. +3
    -0
      src/.gitignore
  17. +3
    -0
      src/Makevars
  18. +76
    -0
      src/RcppExports.cpp
  19. +147
    -0
      src/config.h
  20. +212
    -0
      src/libpsl.h
  21. +279
    -0
      src/lookup_string_in_fixed_set.c
  22. +350
    -0
      src/psl-main.cpp
  23. +1943
    -0
      src/psl.c
  24. +3898
    -0
      src/suffixes_dafsa.c

+ 13
- 6
DESCRIPTION View File

@@ -1,24 +1,31 @@
Package: psl
Type: Package
Title: psl title goes here otherwise CRAN checks fail
Title: Extract Internet Domain Components Using the Public Suffix List
Version: 0.1.0
Date: 2018-09-06
Authors@R: c(
person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"),
comment = c(ORCID = "0000-0001-5670-2640"))
comment = c(ORCID = "0000-0001-5670-2640")),
person("Tim", "Rühsen", email = "tim.ruehsen@gmx.de", role = c("aut"),
comment = "libpsl : <https://github.com/rockdaboot/libpsl>")
)
Maintainer: Bob Rudis <bob@rud.is>
Description: A good description goes here otherwise CRAN checks fail.
Description: The 'Public Suffix List' (<https://publicsuffix.org/>) is a collection
of top-level domains ('TLDs') which include global top-level domains ('gTLDs')
such as '.com' and '.net'; country top-level domains ('ccTLDs') such as '.de' and
'.cn'; and, brand top-level domains such as '.apple' and '.google'. Tools are provided
to extract internet domain components using the public suffix list base data.
URL: https://gitlab.com/hrbrmstr/psl
BugReports: https://gitlab.com/hrbrmstr/psl/issues
SystemRequirements: C++11
Encoding: UTF-8
License: AGPL
License: MIT + file LICENSE
Suggests:
testthat,
covr
Depends:
R (>= 3.2.0)
Imports:
httr,
jsonlite
Rcpp
RoxygenNote: 6.0.1.9000
LinkingTo: Rcpp

+ 2
- 0
LICENSE View File

@@ -0,0 +1,2 @@
YEAR: 2018
COPYRIGHT HOLDER: Bob Rudis

+ 7
- 2
NAMESPACE View File

@@ -1,4 +1,9 @@
# Generated by roxygen2: do not edit by hand

import(httr)
importFrom(jsonlite,fromJSON)
export(apex_domain)
export(is_public_suffix)
export(public_suffix)
export(suffix_extract)
export(suffix_extract2)
importFrom(Rcpp,sourceCpp)
useDynLib(psl)

+ 55
- 0
R/RcppExports.R View File

@@ -0,0 +1,55 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#' Return the apex/top-private domain from a vector of domains
#'
#' @md
#' @param domains character vector of domains
#' @return character vector
#' @export
apex_domain <- function(domains) {
  # Hand the input straight to the registered native routine.
  .Call("_psl_apex_domain", domains, PACKAGE = "psl")
}

#' Return the public suffix from a vector of domains
#'
#' @md
#' @param domains character vector of domains
#' @return character vector
#' @export
public_suffix <- function(domains) {
  # Hand the input straight to the registered native routine.
  .Call("_psl_public_suffix", domains, PACKAGE = "psl")
}

#' Test whether a domain is a public suffix
#'
#' @md
#' @param domains character vector of domains
#' @return logical vector
#' @export
is_public_suffix <- function(domains) {
  # Hand the input straight to the registered native routine; the C++
  # implementation is declared as returning std::vector<bool>, which
  # arrives in R as a logical vector.
  .Call("_psl_is_public_suffix", domains, PACKAGE = "psl")
}

#' Separate a domain into component parts
#'
#' @md
#' @param domains character vector of domains
#' @return data frame
#' @export
suffix_extract <- function(domains) {
  # Hand the input straight to the registered native routine.
  .Call("_psl_suffix_extract", domains, PACKAGE = "psl")
}

#' Separate a domain into component parts
#'
#' Compatibility function for those using `urltools::suffix_extract()`
#'
#' @md
#' @param domains character vector of domains
#' @return data frame
#' @export
suffix_extract2 <- function(domains) {
  # Hand the input straight to the registered native routine.
  .Call("_psl_suffix_extract2", domains, PACKAGE = "psl")
}


+ 15
- 6
R/psl-package.R View File

@@ -1,12 +1,21 @@
#' ...
#'
#' Extract Internet Domain Components Using the Public Suffix List
#'
#' The 'Public Suffix List' (<https://publicsuffix.org/>) is a collection
#' of top-level domains ('TLDs') which include global top-level domains ('gTLDs')
#' such as '.com' and '.net'; country top-level domains ('ccTLDs') such as '.de' and
#' '.cn'; and, brand top-level domains such as '.apple' and '.google'. Tools are provided
#' to extract internet domain components using the public suffix list base data.
#'
#' - `libpsl`: <https://github.com/rockdaboot/libpsl>
#' - Public Suffix List: <https://publicsuffix.org/>
#'
#' - URL: <https://gitlab.com/hrbrmstr/psl>
#' - BugReports: <https://gitlab.com/hrbrmstr/psl/issues>
#'
#'
#' @md
#' @name psl
#' @docType package
#' @author Bob Rudis (bob@@rud.is)
#' @import httr
#' @importFrom jsonlite fromJSON
NULL
#' @useDynLib psl
#' @importFrom Rcpp sourceCpp
NULL

+ 59
- 0
README.Rmd View File

@@ -2,14 +2,33 @@
output: rmarkdown::github_document
---

```{r include=FALSE}
knitr::opts_chunk$set(
fig.width=10, fig.retina=2, message=FALSE, warning=FALSE, collapse=TRUE
)
```

# psl

Extract Internet Domain Components Using the Public Suffix List

## Description

The 'Public Suffix List' (<https://publicsuffix.org/>) is a collection of top-level domains ('TLDs') which include global top-level domains ('gTLDs') such as '.com' and '.net'; country top-level domains ('ccTLDs') such as '.de' and '.cn'; and, brand top-level domains such as '.apple' and '.google'. Tools are provided to extract internet domain components using the public suffix list base data.

- `libpsl`: <https://github.com/rockdaboot/libpsl>
- Public Suffix List: <https://publicsuffix.org/>
## What's Inside The Tin

The following functions are implemented:

- `apex_domain`: Return the apex/top-private domain from a vector of domains
- `is_public_suffix`: Test whether a domain is a public suffix
- `public_suffix`: Return the public suffix from a vector of domains
- `suffix_extract`: Separate a domain into component parts
- `suffix_extract2`: Separate a domain into component parts (urltools compatible output)

## Installation

```{r eval=FALSE}
@@ -24,9 +43,49 @@ options(width=120)

```{r message=FALSE, warning=FALSE, error=FALSE}
library(psl)
library(tidyverse)

# current version
packageVersion("psl")

```

```{r message=FALSE, warning=FALSE, error=FALSE}
doms <- c(
"", "com", "example.com", "www.example.com",
".com", ".example", ".example.com", ".example.example", "example",
"example.example", "b.example.example", "a.b.example.example",
"biz", "domain.biz", "b.domain.biz", "a.b.domain.biz", "com",
"example.com", "b.example.com", "a.b.example.com", "uk.com",
"example.uk.com", "b.example.uk.com", "a.b.example.uk.com", "test.ac",
"cy", "c.cy", "b.c.cy", "a.b.c.cy", "jp", "test.jp", "www.test.jp",
"ac.jp", "test.ac.jp", "www.test.ac.jp", "kyoto.jp", "test.kyoto.jp",
"ide.kyoto.jp", "b.ide.kyoto.jp", "a.b.ide.kyoto.jp", "c.kobe.jp",
"b.c.kobe.jp", "a.b.c.kobe.jp", "city.kobe.jp", "www.city.kobe.jp",
"ck", "test.ck", "b.test.ck", "a.b.test.ck", "www.ck", "www.www.ck",
"us", "test.us", "www.test.us", "ak.us", "test.ak.us", "www.test.ak.us",
"k12.ak.us", "test.k12.ak.us", "www.test.k12.ak.us"
)

apex_domain(doms)

public_suffix(doms)

is_public_suffix(doms)

suffix_extract(doms)

suffix_extract2(doms) # urltools compatible output
```

```{r bench, message=FALSE, warning=FALSE, error=FALSE, fig.width=10, fig.retina=2}
library(microbenchmark)

microbenchmark(
urltools = urltools::suffix_extract(doms),
psl = psl::suffix_extract(doms), # returns more data
psl2 = psl::suffix_extract2(doms) # returns what urltools does
) -> mb

autoplot(mb)
```

+ 136
- 0
README.md View File

@@ -1,2 +1,138 @@

# psl

Extract Internet Domain Components Using the Public Suffix List

## Description

The ‘Public Suffix List’ (<https://publicsuffix.org/>) is a collection
of top-level domains (‘TLDs’) which include global top-level domains
(‘gTLDs’) such as ‘.com’ and ‘.net’; country top-level domains
(‘ccTLDs’) such as ‘.de’ and ‘.cn’; and, brand top-level domains such
as ‘.apple’ and ‘.google’. Tools are provided to extract internet domain
components using the public suffix list base data.

- `libpsl`: <https://github.com/rockdaboot/libpsl>
- Public Suffix List: <https://publicsuffix.org/>

## What’s Inside The Tin

The following functions are implemented:

- `apex_domain`: Return the apex/top-private domain from a vector of
domains
- `is_public_suffix`: Test whether a domain is a public suffix
- `public_suffix`: Return the public suffix from a vector of domains
- `suffix_extract`: Separate a domain into component parts
- `suffix_extract2`: Separate a domain into component parts (urltools
compatible output)

## Installation

``` r
devtools::install_github("hrbrmstr/psl")
```

## Usage

``` r
library(psl)
library(tidyverse)

# current version
packageVersion("psl")
## [1] '0.1.0'
```

``` r
doms <- c(
"", "com", "example.com", "www.example.com",
".com", ".example", ".example.com", ".example.example", "example",
"example.example", "b.example.example", "a.b.example.example",
"biz", "domain.biz", "b.domain.biz", "a.b.domain.biz", "com",
"example.com", "b.example.com", "a.b.example.com", "uk.com",
"example.uk.com", "b.example.uk.com", "a.b.example.uk.com", "test.ac",
"cy", "c.cy", "b.c.cy", "a.b.c.cy", "jp", "test.jp", "www.test.jp",
"ac.jp", "test.ac.jp", "www.test.ac.jp", "kyoto.jp", "test.kyoto.jp",
"ide.kyoto.jp", "b.ide.kyoto.jp", "a.b.ide.kyoto.jp", "c.kobe.jp",
"b.c.kobe.jp", "a.b.c.kobe.jp", "city.kobe.jp", "www.city.kobe.jp",
"ck", "test.ck", "b.test.ck", "a.b.test.ck", "www.ck", "www.www.ck",
"us", "test.us", "www.test.us", "ak.us", "test.ak.us", "www.test.ak.us",
"k12.ak.us", "test.k12.ak.us", "www.test.k12.ak.us"
)

apex_domain(doms)
## [1] NA NA "example.com" "example.com" NA NA
## [7] NA NA NA "example.example" "example.example" "example.example"
## [13] NA "domain.biz" "domain.biz" "domain.biz" NA "example.com"
## [19] "example.com" "example.com" NA "example.uk.com" "example.uk.com" "example.uk.com"
## [25] "test.ac" NA "c.cy" "c.cy" "c.cy" NA
## [31] "test.jp" "test.jp" NA "test.ac.jp" "test.ac.jp" NA
## [37] "test.kyoto.jp" NA "b.ide.kyoto.jp" "b.ide.kyoto.jp" NA "b.c.kobe.jp"
## [43] "b.c.kobe.jp" "city.kobe.jp" "city.kobe.jp" NA NA "b.test.ck"
## [49] "b.test.ck" "www.ck" "www.ck" NA "test.us" "test.us"
## [55] NA "test.ak.us" "test.ak.us" NA "test.k12.ak.us" "test.k12.ak.us"

public_suffix(doms)
## [1] "" "com" "com" "com" ".com" ".example" "com"
## [8] "example" "example" "example" "example" "example" "biz" "biz"
## [15] "biz" "biz" "com" "com" "com" "com" "uk.com"
## [22] "uk.com" "uk.com" "uk.com" "ac" "cy" "cy" "cy"
## [29] "cy" "jp" "jp" "jp" "ac.jp" "ac.jp" "ac.jp"
## [36] "kyoto.jp" "kyoto.jp" "ide.kyoto.jp" "ide.kyoto.jp" "ide.kyoto.jp" "c.kobe.jp" "c.kobe.jp"
## [43] "c.kobe.jp" "kobe.jp" "kobe.jp" "ck" "test.ck" "test.ck" "test.ck"
## [50] "ck" "ck" "us" "us" "us" "ak.us" "ak.us"
## [57] "ak.us" "k12.ak.us" "k12.ak.us" "k12.ak.us"

is_public_suffix(doms)
## [1] TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE
## [20] FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE TRUE
## [39] FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE
## [58] TRUE FALSE FALSE

suffix_extract(doms)
## # A tibble: 60 x 6
## orig normalized subdomain apex domain suffix
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 "" "" <NA> <NA> <NA> ""
## 2 com com <NA> <NA> <NA> com
## 3 example.com example.com "" example.com example com
## 4 www.example.com www.example.com www example.com example com
## 5 .com .com <NA> <NA> <NA> .com
## 6 .example .example <NA> <NA> <NA> .example
## 7 .example.com .example.com <NA> <NA> <NA> com
## 8 .example.example .example.example <NA> <NA> <NA> example
## 9 example example <NA> <NA> <NA> example
## 10 example.example example.example "" example.example example example
## # ... with 50 more rows

suffix_extract2(doms) # urltools compatible output
## # A tibble: 60 x 4
## host subdomain domain suffix
## <chr> <chr> <chr> <chr>
## 1 "" <NA> <NA> ""
## 2 com <NA> <NA> com
## 3 example.com "" example com
## 4 www.example.com www example com
## 5 .com <NA> <NA> .com
## 6 .example <NA> <NA> .example
## 7 .example.com <NA> <NA> com
## 8 .example.example <NA> <NA> example
## 9 example <NA> <NA> example
## 10 example.example "" example example
## # ... with 50 more rows
```

``` r
library(microbenchmark)

microbenchmark(
urltools = urltools::suffix_extract(doms),
psl = psl::suffix_extract(doms), # returns more data
psl2 = psl::suffix_extract2(doms) # returns what urltools does
) -> mb

autoplot(mb)
```

<img src="README_files/figure-gfm/bench-1.png" width="960" />

BIN
README_files/figure-gfm/bench-1.png View File

Before After
Width: 1920  |  Height: 960  |  Size: 73KB

+ 12388
- 0
inst/dat/public_suffix_list.dat
File diff suppressed because it is too large
View File


+ 17
- 0
man/apex_domain.Rd View File

@@ -0,0 +1,17 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/RcppExports.R
\name{apex_domain}
\alias{apex_domain}
\title{Return the apex/top-private domain from a vector of domains}
\usage{
apex_domain(domains)
}
\arguments{
\item{domains}{character vector of domains}
}
\value{
character vector
}
\description{
Return the apex/top-private domain from a vector of domains
}

+ 17
- 0
man/is_public_suffix.Rd View File

@@ -0,0 +1,17 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/RcppExports.R
\name{is_public_suffix}
\alias{is_public_suffix}
\title{Test whether a domain is a public suffix}
\usage{
is_public_suffix(domains)
}
\arguments{
\item{domains}{character vector of domains}
}
\value{
logical vector
}
\description{
Test whether a domain is a public suffix
}

+ 10
- 1
man/psl.Rd View File

@@ -4,9 +4,18 @@
\name{psl}
\alias{psl}
\alias{psl-package}
\title{...}
\title{Extract Internet Domain Components Using the Public Suffix List}
\description{
The 'Public Suffix List' (\url{https://publicsuffix.org/}) is a collection
of top-level domains ('TLDs') which include global top-level domains ('gTLDs')
such as '.com' and '.net'; country top-level domains ('ccTLDs') such as '.de' and
'.cn'; and, brand top-level domains such as '.apple' and '.google'. Tools are provided
to extract internet domain components using the public suffix list base data.
}
\details{
\itemize{
\item \code{libpsl}: \url{https://github.com/rockdaboot/libpsl}
\item Public Suffix List: \url{https://publicsuffix.org/}
\item URL: \url{https://gitlab.com/hrbrmstr/psl}
\item BugReports: \url{https://gitlab.com/hrbrmstr/psl/issues}
}


+ 17
- 0
man/public_suffix.Rd View File

@@ -0,0 +1,17 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/RcppExports.R
\name{public_suffix}
\alias{public_suffix}
\title{Return the public suffix from a vector of domains}
\usage{
public_suffix(domains)
}
\arguments{
\item{domains}{character vector of domains}
}
\value{
character vector
}
\description{
Return the public suffix from a vector of domains
}

+ 17
- 0
man/suffix_extract.Rd View File

@@ -0,0 +1,17 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/RcppExports.R
\name{suffix_extract}
\alias{suffix_extract}
\title{Separate a domain into component parts}
\usage{
suffix_extract(domains)
}
\arguments{
\item{domains}{character vector of domains}
}
\value{
data frame
}
\description{
Separate a domain into component parts
}

+ 17
- 0
man/suffix_extract2.Rd View File

@@ -0,0 +1,17 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/RcppExports.R
\name{suffix_extract2}
\alias{suffix_extract2}
\title{Separate a domain into component parts}
\usage{
suffix_extract2(domains)
}
\arguments{
\item{domains}{character vector of domains}
}
\value{
data frame
}
\description{
Compatibility function for those using \code{urltools::suffix_extract()}
}

+ 3
- 0
src/.gitignore View File

@@ -0,0 +1,3 @@
*.o
*.so
*.dll

+ 3
- 0
src/Makevars View File

@@ -0,0 +1,3 @@
CXX_STD = CXX11
PKG_CXXFLAGS =
PKG_LIBS = -L. -liconv -lidn2

+ 76
- 0
src/RcppExports.cpp View File

@@ -0,0 +1,76 @@
// Generated by using Rcpp::compileAttributes() -> do not edit by hand
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#include <Rcpp.h>

using namespace Rcpp;

// apex_domain
// Forward declaration of the C++ implementation; it is not defined in this
// file (presumably in src/psl-main.cpp -- verify).
CharacterVector apex_domain(CharacterVector domains);
// Entry point registered as "_psl_apex_domain" in CallEntries below.
// Converts the incoming SEXP to a CharacterVector, calls apex_domain(), and
// returns the result wrapped back into an R object.
RcppExport SEXP _psl_apex_domain(SEXP domainsSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< CharacterVector >::type domains(domainsSEXP);
rcpp_result_gen = Rcpp::wrap(apex_domain(domains));
return rcpp_result_gen;
END_RCPP
}
// public_suffix
// Forward declaration of the C++ implementation; it is not defined in this
// file (presumably in src/psl-main.cpp -- verify).
CharacterVector public_suffix(CharacterVector domains);
// Entry point registered as "_psl_public_suffix" in CallEntries below.
// Converts the incoming SEXP to a CharacterVector, calls public_suffix(), and
// returns the result wrapped back into an R object.
RcppExport SEXP _psl_public_suffix(SEXP domainsSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< CharacterVector >::type domains(domainsSEXP);
rcpp_result_gen = Rcpp::wrap(public_suffix(domains));
return rcpp_result_gen;
END_RCPP
}
// is_public_suffix
// Forward declaration of the C++ implementation; it is not defined in this
// file (presumably in src/psl-main.cpp -- verify). Note the return type is
// std::vector< bool >, not a CharacterVector.
std::vector< bool > is_public_suffix(CharacterVector domains);
// Entry point registered as "_psl_is_public_suffix" in CallEntries below.
// Converts the incoming SEXP to a CharacterVector, calls is_public_suffix(),
// and returns the result wrapped back into an R object.
RcppExport SEXP _psl_is_public_suffix(SEXP domainsSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< CharacterVector >::type domains(domainsSEXP);
rcpp_result_gen = Rcpp::wrap(is_public_suffix(domains));
return rcpp_result_gen;
END_RCPP
}
// suffix_extract
// Forward declaration of the C++ implementation; it is not defined in this
// file (presumably in src/psl-main.cpp -- verify).
DataFrame suffix_extract(CharacterVector domains);
// Entry point registered as "_psl_suffix_extract" in CallEntries below.
// Converts the incoming SEXP to a CharacterVector, calls suffix_extract(),
// and returns the resulting DataFrame wrapped back into an R object.
RcppExport SEXP _psl_suffix_extract(SEXP domainsSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< CharacterVector >::type domains(domainsSEXP);
rcpp_result_gen = Rcpp::wrap(suffix_extract(domains));
return rcpp_result_gen;
END_RCPP
}
// suffix_extract2
// Forward declaration of the C++ implementation; it is not defined in this
// file (presumably in src/psl-main.cpp -- verify).
DataFrame suffix_extract2(CharacterVector domains);
// Entry point registered as "_psl_suffix_extract2" in CallEntries below.
// Converts the incoming SEXP to a CharacterVector, calls suffix_extract2(),
// and returns the resulting DataFrame wrapped back into an R object.
RcppExport SEXP _psl_suffix_extract2(SEXP domainsSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< CharacterVector >::type domains(domainsSEXP);
rcpp_result_gen = Rcpp::wrap(suffix_extract2(domains));
return rcpp_result_gen;
END_RCPP
}

// Registration table for the routines defined above: each row gives the
// symbol name, the function pointer, and the argument count (1 for every
// routine here). The {NULL, NULL, 0} row terminates the table.
static const R_CallMethodDef CallEntries[] = {
{"_psl_apex_domain", (DL_FUNC) &_psl_apex_domain, 1},
{"_psl_public_suffix", (DL_FUNC) &_psl_public_suffix, 1},
{"_psl_is_public_suffix", (DL_FUNC) &_psl_is_public_suffix, 1},
{"_psl_suffix_extract", (DL_FUNC) &_psl_suffix_extract, 1},
{"_psl_suffix_extract2", (DL_FUNC) &_psl_suffix_extract2, 1},
{NULL, NULL, 0}
};

// Library initialisation hook. Passes CallEntries to R_registerRoutines (the
// other routine tables are passed as NULL) and then calls
// R_useDynamicSymbols(dll, FALSE).
// NOTE(review): R is expected to invoke R_init_<name> when the shared object
// is loaded, and FALSE is expected to disable dynamic symbol lookup -- see
// "Writing R Extensions" to confirm.
RcppExport void R_init_psl(DllInfo *dll) {
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
R_useDynamicSymbols(dll, FALSE);
}

+ 147
- 0
src/config.h View File

@@ -0,0 +1,147 @@
/* config.h. Generated from config.h.in by configure. */
/* config.h.in. Generated from configure.ac by autoheader. */

/* generate PSL data using libicu */
/* #undef BUILTIN_GENERATOR_LIBICU */

/* generate PSL data using libidn */
/* #undef BUILTIN_GENERATOR_LIBIDN */

/* generate PSL data using libidn2 */
#define BUILTIN_GENERATOR_LIBIDN2 1

/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP
systems. This function is required for `alloca.c' support on those systems.
*/
/* #undef CRAY_STACKSEG_END */

/* Define to 1 if using `alloca.c'. */
/* #undef C_ALLOCA */

/* Define to 1 if translation of program messages to the user's native
language is requested. */
/* #undef ENABLE_NLS */

/* Define to 1 if you have `alloca', as a function or macro. */
#define HAVE_ALLOCA 1

/* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix).
*/
#define HAVE_ALLOCA_H 1

/* Define to 1 if you have the MacOS X function CFLocaleCopyCurrent in the
CoreFoundation framework. */
#define HAVE_CFLOCALECOPYCURRENT 1

/* Define to 1 if you have the MacOS X function CFPreferencesCopyAppValue in
the CoreFoundation framework. */
#define HAVE_CFPREFERENCESCOPYAPPVALUE 1

/* Define to 1 if you have the `clock_gettime' function. */
#define HAVE_CLOCK_GETTIME 1

/* Define if the GNU dcgettext() function is already present or preinstalled.
*/
/* #undef HAVE_DCGETTEXT */

/* Define to 1 if you have the <dlfcn.h> header file. */
#define HAVE_DLFCN_H 1

/* Define to 1 if you have the `fmemopen' function. */
#define HAVE_FMEMOPEN 1

/* Define if the GNU gettext() function is already present or preinstalled. */
/* #undef HAVE_GETTEXT */

/* Define if you have the iconv() function and it works. */
#define HAVE_ICONV 1

/* Define to 1 if you have the <inttypes.h> header file. */
#define HAVE_INTTYPES_H 1

/* Define to 1 if you have the <memory.h> header file. */
#define HAVE_MEMORY_H 1

/* Define to 1 if you have the `nl_langinfo' function. */
#define HAVE_NL_LANGINFO 1

/* Define to 1 if you have the <stdint.h> header file. */
#define HAVE_STDINT_H 1

/* Define to 1 if you have the <stdlib.h> header file. */
#define HAVE_STDLIB_H 1

/* Define to 1 if you have the <strings.h> header file. */
#define HAVE_STRINGS_H 1

/* Define to 1 if you have the <string.h> header file. */
#define HAVE_STRING_H 1

/* Define to 1 if you have the `strndup' function. */
#define HAVE_STRNDUP 1

/* Define to 1 if you have the <sys/stat.h> header file. */
#define HAVE_SYS_STAT_H 1

/* Define to 1 if you have the <sys/types.h> header file. */
#define HAVE_SYS_TYPES_H 1

/* Define to 1 if you have the <unistd.h> header file. */
#define HAVE_UNISTD_H 1

/* Define to 1 or 0, depending whether the compiler supports simple visibility
declarations. */
#define HAVE_VISIBILITY 1

/* Define as const if the declaration of iconv() needs const. */
#define ICONV_CONST

/* Define to the sub-directory where libtool stores uninstalled libraries. */
#define LT_OBJDIR ".libs/"

/* Define to the address where bug reports for this package should be sent. */
#define PACKAGE_BUGREPORT "tim.ruehsen@gmx.de"

/* Define to the full name of this package. */
#define PACKAGE_NAME "libpsl"

/* Define to the full name and version of this package. */
#define PACKAGE_STRING "libpsl 0.20.2"

/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "libpsl"

/* Define to the home page for this package. */
#define PACKAGE_URL "https://github.com/rockdaboot/libpsl"

/* Define to the version of this package. */
#define PACKAGE_VERSION "0.20.2"

/* If using the C implementation of alloca, define if you know the
direction of stack growth for your system; otherwise it will be
automatically deduced at runtime.
STACK_DIRECTION > 0 => grows toward higher addresses
STACK_DIRECTION < 0 => grows toward lower addresses
STACK_DIRECTION = 0 => direction of growth unknown */
/* #undef STACK_DIRECTION */

/* Define to 1 if you have the ANSI C header files. */
#define STDC_HEADERS 1

/* generate PSL data using libicu */
/* #undef WITH_LIBICU */

/* generate PSL data using libidn */
/* #undef WITH_LIBIDN */

/* generate PSL data using libidn2 */
#define WITH_LIBIDN2 1

/* Define to `__inline__' or `__inline' if that's what the C compiler
calls it, or to nothing if 'inline' is not supported under any name. */
#ifndef __cplusplus
/* #undef inline */
#endif

/* Define to `unsigned int' if <sys/types.h> does not define. */
/* #undef size_t */

+ 212
- 0
src/libpsl.h View File

@@ -0,0 +1,212 @@
/*
* Copyright(c) 2014-2018 Tim Ruehsen
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
* This file is part of libpsl.
*
* Header file for libpsl library routines
*
* Changelog
* 20.03.2014 Tim Ruehsen created
*
*/
#ifdef __cplusplus
extern "C" {
#endif

#ifndef LIBPSL_LIBPSL_H
#define LIBPSL_LIBPSL_H

#include <stdio.h>
#include <time.h>

#define PSL_VERSION "0.20.2"
#define PSL_VERSION_MAJOR 0
#define PSL_VERSION_MINOR 20
#define PSL_VERSION_PATCH 2
#define PSL_VERSION_NUMBER 0x001402

#ifndef PSL_API
#if defined BUILDING_PSL && HAVE_VISIBILITY
# define PSL_API __attribute__ ((__visibility__("default")))
#elif defined BUILDING_PSL && defined _MSC_VER && !defined PSL_STATIC
# define PSL_API __declspec(dllexport)
#elif defined _MSC_VER && !defined PSL_STATIC
# define PSL_API __declspec(dllimport)
#else
# define PSL_API
#endif
#endif

#ifdef __cplusplus
extern "C" {
#endif

/* types for psl_is_public_suffix2() */
#define PSL_TYPE_ICANN (1<<0)
#define PSL_TYPE_PRIVATE (1<<1)
#define PSL_TYPE_NO_STAR_RULE (1<<2)
#define PSL_TYPE_ANY (PSL_TYPE_ICANN | PSL_TYPE_PRIVATE)

/**
* psl_error_t:
* @PSL_SUCCESS: Successful return.
* @PSL_ERR_INVALID_ARG: Invalid argument.
* @PSL_ERR_CONVERTER: Failed to open libicu utf-16 converter.
* @PSL_ERR_TO_UTF16: Failed to convert to utf-16.
* @PSL_ERR_TO_LOWER: Failed to convert utf-16 to lowercase.
* @PSL_ERR_TO_UTF8: Failed to convert utf-16 to utf-8.
* @PSL_ERR_NO_MEM: Failed to allocate memory.
*
* Return codes for PSL functions.
* Negative return codes mean failure.
* Positive values are reserved for non-error return codes.
*/
typedef enum {
PSL_SUCCESS = 0,
PSL_ERR_INVALID_ARG = -1,
PSL_ERR_CONVERTER = -2, /* failed to open libicu utf-16 converter */
PSL_ERR_TO_UTF16 = -3, /* failed to convert to utf-16 */
PSL_ERR_TO_LOWER = -4, /* failed to convert utf-16 to lowercase */
PSL_ERR_TO_UTF8 = -5, /* failed to convert utf-16 to utf-8 */
PSL_ERR_NO_MEM = -6 /* failed to allocate memory */
} psl_error_t;

typedef struct _psl_ctx_st psl_ctx_t;

/* frees PSL context */
PSL_API
void
psl_free(psl_ctx_t *psl);

/* frees memory allocated by libpsl routines */
PSL_API
void
psl_free_string(char *str);

/* loads PSL data from file */
PSL_API
psl_ctx_t *
psl_load_file(const char *fname);

/* loads PSL data from FILE pointer */
PSL_API
psl_ctx_t *
psl_load_fp(FILE *fp);

/* retrieves builtin PSL data */
PSL_API
const psl_ctx_t *
psl_builtin(void);

/* retrieves most recent PSL data */
PSL_API
psl_ctx_t *
psl_latest(const char *fname);

/* checks whether domain is a public suffix or not */
PSL_API
int
psl_is_public_suffix(const psl_ctx_t *psl, const char *domain);

/* checks whether domain is a public suffix regarding the type or not */
PSL_API
int
psl_is_public_suffix2(const psl_ctx_t *psl, const char *domain, int type);

/* checks whether cookie_domain is acceptable for domain or not */
PSL_API
int
psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname, const char *cookie_domain);

/* returns the longest not registrable domain within 'domain' or NULL if none found */
PSL_API
const char *
psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain);

/* returns the shortest possible registrable domain part or NULL if domain is not registrable at all */
PSL_API
const char *
psl_registrable_domain(const psl_ctx_t *psl, const char *domain);

/* convert a string into lowercase UTF-8 */
PSL_API
psl_error_t
psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale, char **lower);

/* does not include exceptions */
PSL_API
int
psl_suffix_count(const psl_ctx_t *psl);

/* just counts exceptions */
PSL_API
int
psl_suffix_exception_count(const psl_ctx_t *psl);

/* just counts wildcards */
PSL_API
int
psl_suffix_wildcard_count(const psl_ctx_t *psl);

/* returns mtime of PSL source file */
PSL_API
time_t
psl_builtin_file_time(void);

/* returns SHA1 checksum (hex-encoded, lowercase) of PSL source file */
PSL_API
const char *
psl_builtin_sha1sum(void);

/* returns file name of PSL source file */
PSL_API
const char *
psl_builtin_filename(void);

/* returns name of distribution PSL data file */
PSL_API
const char *
psl_dist_filename(void);

/* returns library version string */
PSL_API
const char *
psl_get_version(void);

/* checks library version number */
PSL_API
int
psl_check_version_number(int version);

/* returns whether the built-in data is outdated or not */
PSL_API
int
psl_builtin_outdated(void);

#ifdef __cplusplus
}
#endif

#endif /* LIBPSL_LIBPSL_H */

#ifdef __cplusplus
}
#endif

+ 279
- 0
src/lookup_string_in_fixed_set.c View File

@@ -0,0 +1,279 @@
/* Copyright 2015-2016 The Chromium Authors. All rights reserved.
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE.chromium file.
*
* Converted to C89 2015 by Tim Rühsen
*/

#include <stddef.h>

#if defined(__GNUC__) && defined(__GNUC_MINOR__)
# define _GCC_VERSION_AT_LEAST(major, minor) ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
#else
# define _GCC_VERSION_AT_LEAST(major, minor) 0
#endif

#define CHECK_LT(a, b) if ((a) >= b) return 0

/*
 * Byte-sequence length keyed by the high nibble of a byte.
 * Nibbles 0x0-0xB yield 0; 0xC and 0xD yield 2; 0xE yields 3; 0xF yields 4.
 */
static const char multibyte_length_table[16] = {
  0, 0, 0, 0, 0, 0, 0, 0, /* 0x00-0x7F */
  0, 0, 0, 0,             /* 0x80-0xBF */
  2, 2, 3, 4              /* 0xC0-0xFF */
};


/*
 * Look up the length of the multibyte sequence introduced by byte c.
 * The result is zero when c is not a valid UTF-8 leading byte.
 */
static int GetMultibyteLength(char c) {
  const unsigned int high_nibble = ((unsigned char)c) >> 4;

  return multibyte_length_table[high_nibble];
}

/*
 * Step the dafsa cursor (*pos) one byte forward and update the key cursor.
 *
 * While inside a multibyte sequence (*multibyte_start is set) the key also
 * moves one byte, and multibyte mode ends once the whole sequence has been
 * consumed. Outside multibyte mode, a key byte that is a multibyte leading
 * byte only switches multibyte mode on (the key does not move); any other
 * key byte is consumed.
 */
static void NextPos(const unsigned char** pos,
  const char** key,
  const char** multibyte_start)
{
  ++*pos;

  if (!*multibyte_start) {
    if (GetMultibyteLength(**key)) {
      /* Multibyte prefix was matched in the dafsa; the sequence content is
       * matched starting with the next round. */
      *multibyte_start = *key;
    } else {
      /* A single byte character was matched: consume it. */
      ++*key;
    }
    return;
  }

  /* Advance key to the next byte of the multibyte sequence. */
  ++*key;
  /* Leave multibyte mode once the last byte of the sequence was consumed. */
  if (*key - *multibyte_start == GetMultibyteLength(**multibyte_start))
    *multibyte_start = 0;
}

/*
 * Decode the next offset stored at *pos and add it to *offset.
 * Returns 1 when an offset was read, 0 otherwise (list exhausted or not
 * enough bytes left before end).
 */

static int GetNextOffset(const unsigned char** pos,
  const unsigned char* end,
  const unsigned char** offset)
{
  size_t step;
  int width_bits;

  if (*pos == end)
    return 0;

  /* When reading an offset the byte array must always contain at least
   * three more bytes to consume. First the offset to read, then a node
   * to skip over and finally a destination node. No object can be smaller
   * than one byte. */
  CHECK_LT(*pos + 2, end);

  width_bits = **pos & 0x60;
  if (width_bits == 0x60) {
    /* Three byte offset */
    *offset += (((*pos)[0] & 0x1F) << 16) | ((*pos)[1] << 8) | (*pos)[2];
    step = 3;
  } else if (width_bits == 0x40) {
    /* Two byte offset */
    *offset += (((*pos)[0] & 0x1F) << 8) | (*pos)[1];
    step = 2;
  } else {
    /* One byte offset */
    *offset += (*pos)[0] & 0x3F;
    step = 1;
  }

  /* A set high bit on the first byte moves *pos to end; otherwise *pos
   * advances past the bytes just decoded. */
  if (**pos & 0x80)
    *pos = end;
  else
    *pos += step;

  return 1;
}

/*
 * Tell whether the byte at offset is the last one in its label, i.e. has
 * its high bit (0x80) set. Yields 0 when offset is not below end.
 */

static int IsEOL(const unsigned char* offset, const unsigned char* end)
{
  CHECK_LT(offset, end);
  return (*offset & 0x80) ? 1 : 0;
}

/*
 * Compares one graph byte, |matcher|, with the key byte at |key|.
 * No range check is done here; the caller must already have done it.
 *
 * Outside a multibyte sequence (|multibyte_start| is NULL):
 *  - if |*key| is a multibyte leading byte, the graph has to hold the mode
 *    switch byte 0x1F at this position;
 *  - otherwise the two bytes have to be equal.
 * Inside a multibyte sequence the graph byte is compared after flipping
 * 0x80 (for the leading byte, i.e. |key| == |multibyte_start|) or 0xC0 (for
 * every following byte).
 *
 * Returns non-zero on a match.
 */

static int IsMatchUnchecked(const unsigned char matcher,
                            const char* key,
                            const char* multibyte_start)
{
  const unsigned char wanted = (const unsigned char)*key;

  if (!multibyte_start) {
    /* A leading byte in the key while still in single byte mode must be
     * announced by the special mode switch byte in the dafsa. */
    if (GetMultibyteLength(*key))
      return matcher == 0x1F;

    /* Normal matching of a single byte character. */
    return matcher == wanted;
  }

  /* Multibyte matching mode: the leading byte (which also fixes the
   * sequence length) and the following bytes use different masks. */
  return (matcher ^ (multibyte_start == key ? 0x80 : 0xC0)) == wanted;
}

/*
 * Range checked comparison of the node byte at |offset| with the key byte
 * at |key|, for node bytes that are not the last one of a label: the byte
 * is handed to IsMatchUnchecked() as is.
 *
 * Returns non-zero on a match.
 */

static int IsMatch(const unsigned char* offset,
                   const unsigned char* end,
                   const char* key,
                   const char* multibyte_start)
{
  unsigned char node;

  CHECK_LT(offset, end);
  node = *offset;
  return IsMatchUnchecked(node, key, multibyte_start);
}

/*
 * Range checked comparison of the node byte at |offset| with the key byte
 * at |key|, for the byte that ends a label: the 0x80 bit of the node byte
 * is flipped before it is handed to IsMatchUnchecked().
 *
 * Returns non-zero on a match.
 */

static int IsEndCharMatch(const unsigned char* offset,
                          const unsigned char* end,
                          const char* key,
                          const char* multibyte_start)
{
  unsigned char node;

  CHECK_LT(offset, end);
  node = *offset ^ 0x80;
  return IsMatchUnchecked(node, key, multibyte_start);
}

/*
 * Tries to read a return value from the node byte at |offset|.
 *
 * A byte carries a return value when its three top bits are 100
 * ((byte & 0xE0) == 0x80); the value is the low four bits of the byte and is
 * stored in |*return_value|. Nothing is read while a multibyte sequence is
 * still open (|multibyte_start| is not NULL).
 *
 * Returns 1 if |*return_value| was written, 0 otherwise.
 */

static int GetReturnValue(const unsigned char* offset,
                          const unsigned char* end,
                          const char* multibyte_start,
                          int* return_value)
{
  CHECK_LT(offset, end);

  if (multibyte_start)
    return 0;
  if ((*offset & 0xE0) != 0x80)
    return 0;

  *return_value = *offset & 0x0F;
  return 1;
}

/*
* Looks up the string |key| with length |key_length| in a fixed set of
* strings. The set of strings must be known at compile time. It is converted to
* a graph structure named a DAFSA (Deterministic Acyclic Finite State
* Automaton) by the script psl-make-dafsa during compilation. This permits
* efficient (in time and space) lookup. The graph generated by psl-make-dafsa
* takes the form of a constant byte array which should be supplied via the
* |graph| and |length| parameters. The return value is kDafsaNotFound,
* kDafsaFound, or a bitmap consisting of one or more of kDafsaExceptionRule,
* kDafsaWildcardRule and kDafsaPrivateRule ORed together.
*
* Lookup a domain key in a byte array generated by psl-make-dafsa.
*
* Parameters:
*   graph      - first byte of the DAFSA byte array.
*   length     - number of bytes in |graph|.
*   key        - first byte of the string to look up.
*   key_length - number of bytes of |key| to match.
*
* Returns -1 when |key| is not in the set. Otherwise returns the value read
* by GetReturnValue() from the node reached once the whole key was consumed,
* i.e. the low four bits of that node byte. (The kDafsa* names above are not
* defined in this part of the file; presumably kDafsaNotFound is -1 and the
* others are values/bits within those four bits - confirm against the header.)
*/

/* prototype to skip warning with -Wmissing-prototypes */
int LookupStringInFixedSet(const unsigned char*, size_t,const char*, size_t);

int LookupStringInFixedSet(const unsigned char* graph,
size_t length,
const char* key,
size_t key_length)
{
/* Cursor over the list of child offsets of the node being expanded. */
const unsigned char* pos = graph;
/* One past the last byte of the graph; bound for every range check. */
const unsigned char* end = graph + length;
/* Address of the child under inspection. GetNextOffset() adds each decoded
 * offset to it, so successive children are reached cumulatively. */
const unsigned char* offset = pos;
const char* key_end = key + key_length;
/* Non-NULL while the key cursor is inside a multibyte character. */
const char* multibyte_start = 0;

/* Each iteration inspects one child. `continue` moves on to the next sibling
 * in the current offset list; the loop ends once that list is exhausted. */
while (GetNextOffset(&pos, end, &offset)) {
/* A child has one of these shapes:
* char <char>+ end_char offsets
* char <char>+ return value
* char end_char offsets
* char return value
* end_char offsets
* return_value
*/
/* Set once at least one key byte was matched against this child. */
int did_consume = 0;

if (key != key_end && !IsEOL(offset, end)) {
/* Leading <char> is not a match. Don't dive into this child */
if (!IsMatch(offset, end, key, multibyte_start))
continue;
did_consume = 1;
NextPos(&offset, &key, &multibyte_start);
/* Possible matches at this point:
* <char>+ end_char offsets
* <char>+ return value
* end_char offsets
* return value
*/

/* Remove all remaining <char> nodes possible */
while (!IsEOL(offset, end) && key != key_end) {
if (!IsMatch(offset, end, key, multibyte_start))
return -1;
NextPos(&offset, &key, &multibyte_start);
}
}
/* Possible matches at this point:
* end_char offsets
* return_value
* If one or more <char> elements were consumed, a failure
* to match is terminal. Otherwise, try the next node.
*/
if (key == key_end) {
/* The whole key was consumed: only a return value node is a hit here. */
int return_value;

if (GetReturnValue(offset, end, multibyte_start, &return_value))
return return_value;
/* The DAFSA guarantees that if the first char is a match, all
* remaining char elements MUST match if the key is truly present.
*/
if (did_consume)
return -1;
continue;
}
/* Key bytes remain, so the node at |offset| has to be the end_char of this
 * label and match the current key byte. */
if (!IsEndCharMatch(offset, end, key, multibyte_start)) {
if (did_consume)
return -1; /* Unexpected */
continue;
}
NextPos(&offset, &key, &multibyte_start);
/* The bytes after the end_char are this child's own offset list. */
pos = offset; /* Dive into child */
}

return -1; /* No match */
}

/* prototype to skip warning with -Wmissing-prototypes */
int GetUtfMode(const unsigned char *graph, size_t length);

/*
 * Inspects the final byte of the DAFSA byte array |graph| of |length| bytes.
 *
 * Returns 1 when the array is non-empty and the top bit (0x80) of its last
 * byte is clear, 0 otherwise. Judging by the name this byte flags whether the
 * graph was built in UTF-8 mode - confirm against psl-make-dafsa.
 */
int GetUtfMode(const unsigned char *graph, size_t length)
{
  if (length == 0)
    return 0;

  return (graph[length - 1] & 0x80) == 0;
}

+ 350
- 0
src/psl-main.cpp View File

@@ -0,0 +1,350 @@
#include <Rcpp.h>

#include <regex>

#include "libpsl.h"

using namespace Rcpp;

//' Return the apex/top-private domain from a vector of domains
//'
//' Each element has one trailing period removed (if present), is lowercased
//' and is then resolved against the built-in public suffix list.
//'
//' @md
//' @param domains character vector of domains
//' @return character vector the same length as `domains`; `NA` where the
//'   input is `NA`, could not be normalized or has no registrable domain
//' @export
// [[Rcpp::export]]
CharacterVector apex_domain(CharacterVector domains) {

  R_xlen_t input_size = domains.size();
  CharacterVector output(input_size);
  const psl_ctx_t *psl = psl_builtin();

  for (R_xlen_t i = 0; i < input_size; i++) {

    // as<std::string>() turns a missing value into the literal text "NA",
    // which would then be resolved like a real host name; keep it missing
    if (CharacterVector::is_na(domains[i])) {
      output[i] = NA_STRING;
      continue;
    }

    // remove trailing period if any
    std::string cleaned = Rcpp::as<std::string>(domains[i]);
    if (!cleaned.empty() && cleaned.back() == '.') cleaned.pop_back();

    // Start from NULL for every element. NOTE(review): libpsl appears to
    // leave the output pointer untouched when normalization fails (confirm in
    // psl.c); a pointer carried over from the previous element would then be
    // freed a second time below.
    char *lower = NULL;

    // lowercase it
    int rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);

    const char *result = (rc == PSL_SUCCESS && lower != NULL)
      ? psl_registrable_domain(psl, lower)
      : NULL;

    if (result) {
      output[i] = std::string(result);
    } else {
      output[i] = NA_STRING;
    }

    if (lower != NULL) psl_free_string(lower);

  }

  return(output);

}

//' Return the public suffix from a vector of domains
//'
//' Each element has one trailing period removed (if present), is lowercased
//' and is then resolved against the built-in public suffix list.
//'
//' @md
//' @param domains character vector of domains
//' @return character vector the same length as `domains`; `NA` where the
//'   input is `NA`, could not be normalized or no suffix was found
//' @export
// [[Rcpp::export]]
CharacterVector public_suffix(CharacterVector domains) {

  R_xlen_t input_size = domains.size();
  CharacterVector output(input_size);
  const psl_ctx_t *psl = psl_builtin();

  for (R_xlen_t i = 0; i < input_size; i++) {

    // as<std::string>() turns a missing value into the literal text "NA",
    // which would then be resolved like a real host name; keep it missing
    if (CharacterVector::is_na(domains[i])) {
      output[i] = NA_STRING;
      continue;
    }

    // remove trailing period if any
    std::string cleaned = Rcpp::as<std::string>(domains[i]);
    if (!cleaned.empty() && cleaned.back() == '.') cleaned.pop_back();

    // Start from NULL for every element. NOTE(review): libpsl appears to
    // leave the output pointer untouched when normalization fails (confirm in
    // psl.c); a pointer carried over from the previous element would then be
    // freed a second time below.
    char *lower = NULL;

    // lowercase it
    int rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);

    const char *result = (rc == PSL_SUCCESS && lower != NULL)
      ? psl_unregistrable_domain(psl, lower)
      : NULL;

    if (result) {
      output[i] = std::string(result);
    } else {
      output[i] = NA_STRING;
    }

    if (lower != NULL) psl_free_string(lower);

  }

  return(output);

}

//' Test whether a domain is a public suffix
//'
//' Each element has one trailing period removed (if present), is lowercased
//' and is then tested against the built-in public suffix list.
//'
//' @md
//' @param domains character vector of domains
//' @return logical vector the same length as `domains`; `FALSE` is returned
//'   for `NA` input and for input that could not be normalized
//' @export
// [[Rcpp::export]]
std::vector< bool > is_public_suffix(CharacterVector domains) {

  R_xlen_t input_size = domains.size();

  // Every element starts out false. std::vector<bool> has no way to hold a
  // missing value: assigning NA_LOGICAL (a non-zero int) stores `true`, so
  // failures are reported as false instead.
  std::vector < bool > output(static_cast<std::size_t>(input_size));
  const psl_ctx_t *psl = psl_builtin();

  for (R_xlen_t i = 0; i < input_size; i++) {

    // as<std::string>() turns a missing value into the literal text "NA",
    // which would then be tested like a real host name; leave it false
    if (CharacterVector::is_na(domains[i])) continue;

    // remove trailing period if any
    std::string cleaned = Rcpp::as<std::string>(domains[i]);
    if (!cleaned.empty() && cleaned.back() == '.') cleaned.pop_back();

    // Start from NULL for every element. NOTE(review): libpsl appears to
    // leave the output pointer untouched when normalization fails (confirm in
    // psl.c); a pointer carried over from the previous element would then be
    // freed a second time below.
    char *lower = NULL;

    // lowercase it
    int rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);

    output[i] = (rc == PSL_SUCCESS) && (lower != NULL) &&
      (psl_is_public_suffix(psl, lower) == 1);

    if (lower != NULL) psl_free_string(lower);

  }

  return(output);

}

//' Separate a domain into component parts
//'
//' Each element has one trailing period removed (if present), is lowercased
//' and is then split using the built-in public suffix list.
//'
//' @md
//' @param domains character vector of domains
//' @return data frame (with tibble classes) with one row per element of
//'   `domains` and columns `orig`, `normalized`, `subdomain`, `apex`,
//'   `domain` and `suffix`. Every derived column is `NA` where the input is
//'   `NA` or could not be normalized; `subdomain` and `domain` are also `NA`
//'   when either the suffix or the apex domain could not be determined.
//' @export
// [[Rcpp::export]]
DataFrame suffix_extract(CharacterVector domains) {

  R_xlen_t input_size = domains.size();

  CharacterVector normalized(input_size);
  CharacterVector subdomain(input_size);
  CharacterVector apex(input_size);
  CharacterVector domain(input_size);
  CharacterVector suffix(input_size);

  const psl_ctx_t *psl = psl_builtin();

  // Removes `tail`, plus any run of dots directly in front of it, from the
  // end of `s`. `s` comes back unchanged when it does not end in `tail`.
  // Plain string comparison is used because host text spliced into a regular
  // expression is not escaped and can make std::regex throw.
  auto strip_tail = [](const std::string& s, const std::string& tail) -> std::string {
    if (tail.size() > s.size() ||
        s.compare(s.size() - tail.size(), tail.size(), tail) != 0) {
      return s;
    }
    std::string::size_type keep = s.size() - tail.size();
    while (keep > 0 && s[keep - 1] == '.') --keep;
    return s.substr(0, keep);
  };

  for (R_xlen_t i = 0; i < input_size; i++) {

    // Start from NULL for every element. NOTE(review): libpsl appears to
    // leave the output pointer untouched when normalization fails (confirm in
    // psl.c); a pointer carried over from the previous element would then be
    // freed a second time below.
    char *lower = NULL;
    int rc = PSL_SUCCESS;

    // as<std::string>() turns a missing value into the literal text "NA",
    // which would then be split like a real host name; keep it missing
    bool usable = !CharacterVector::is_na(domains[i]);

    if (usable) {

      // remove trailing period if any
      std::string cleaned = Rcpp::as<std::string>(domains[i]);
      if (!cleaned.empty() && cleaned.back() == '.') cleaned.pop_back();

      // lowercase it
      rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);
      usable = (rc == PSL_SUCCESS) && (lower != NULL);

    }

    if (usable) {

      // no dots at end and lowercased
      const std::string norm(lower);
      normalized[i] = norm;

      // try to get the suffix and the apex
      const char *suffix_ptr = psl_unregistrable_domain(psl, lower);
      const char *apex_ptr = psl_registrable_domain(psl, lower);

      if (suffix_ptr) {
        suffix[i] = std::string(suffix_ptr);
      } else {
        suffix[i] = NA_STRING;
      }

      if (apex_ptr) {
        apex[i] = std::string(apex_ptr);
      } else {
        apex[i] = NA_STRING;
      }

      if (suffix_ptr && apex_ptr) {
        const std::string apex_str(apex_ptr);
        // domain = apex without its suffix; subdomain = host without its apex
        domain[i] = strip_tail(apex_str, std::string(suffix_ptr));
        subdomain[i] = strip_tail(norm, apex_str);
      } else {
        domain[i] = NA_STRING;
        subdomain[i] = NA_STRING;
      }

    } else {
      normalized[i] = NA_STRING;
      subdomain[i] = NA_STRING;
      apex[i] = NA_STRING;
      domain[i] = NA_STRING;
      suffix[i] = NA_STRING;
    }

    if (lower != NULL) psl_free_string(lower);

  }

  DataFrame out = DataFrame::create(
    _["orig"] = domains,
    _["normalized"] = normalized,
    _["subdomain"] = subdomain,
    _["apex"] = apex,
    _["domain"] = domain,
    _["suffix"] = suffix,
    _["stringsAsFactors"] = false
  );

  out.attr("class") = CharacterVector::create("tbl_df", "tbl", "data.frame");

  return(out);

}

//' Separate a domain into component parts
//'
//' Compatibility function for those using `urltools::suffix_extract()`
//'
//' Each element has one trailing period removed (if present), is lowercased
//' and is then split using the built-in public suffix list.
//'
//' @md
//' @param domains character vector of domains
//' @return data frame (with tibble classes) with one row per element of
//'   `domains` and columns `host`, `subdomain`, `domain` and `suffix`. The
//'   derived columns are `NA` where the input is `NA`, could not be
//'   normalized or no suffix was found; `subdomain` and `domain` are also
//'   `NA` when no apex domain could be determined.
//' @export
// [[Rcpp::export]]
DataFrame suffix_extract2(CharacterVector domains) {

  R_xlen_t input_size = domains.size();

  CharacterVector subdomain(input_size);
  CharacterVector domain(input_size);
  CharacterVector suffix(input_size);

  const psl_ctx_t *psl = psl_builtin();

  // Removes `tail`, plus any run of dots directly in front of it, from the
  // end of `s`. `s` comes back unchanged when it does not end in `tail`.
  // Plain string comparison is used because host text spliced into a regular
  // expression is not escaped and can make std::regex throw.
  auto strip_tail = [](const std::string& s, const std::string& tail) -> std::string {
    if (tail.size() > s.size() ||
        s.compare(s.size() - tail.size(), tail.size(), tail) != 0) {
      return s;
    }
    std::string::size_type keep = s.size() - tail.size();
    while (keep > 0 && s[keep - 1] == '.') --keep;
    return s.substr(0, keep);
  };

  for (R_xlen_t i = 0; i < input_size; i++) {

    // Start from NULL for every element. NOTE(review): libpsl appears to
    // leave the output pointer untouched when normalization fails (confirm in
    // psl.c); a pointer carried over from the previous element would then be
    // freed a second time below.
    char *lower = NULL;
    int rc = PSL_SUCCESS;

    // as<std::string>() turns a missing value into the literal text "NA",
    // which would then be split like a real host name; keep it missing
    bool usable = !CharacterVector::is_na(domains[i]);

    if (usable) {

      // remove trailing period if any
      std::string cleaned = Rcpp::as<std::string>(domains[i]);
      if (!cleaned.empty() && cleaned.back() == '.') cleaned.pop_back();

      // lowercase it
      rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);
      usable = (rc == PSL_SUCCESS) && (lower != NULL);

    }

    // try to get the suffix
    const char *suffix_ptr = usable ? psl_unregistrable_domain(psl, lower) : NULL;

    if (suffix_ptr) {

      const std::string normalized(lower);
      const std::string suf(suffix_ptr);
      suffix[i] = suf;

      const char *apex_ptr = psl_registrable_domain(psl, lower);

      if (apex_ptr) {
        const std::string apex(apex_ptr);
        // domain = apex without its suffix; subdomain = host without its apex
        domain[i] = strip_tail(apex, suf);
        subdomain[i] = strip_tail(normalized, apex);
      } else {
        subdomain[i] = NA_STRING;
        domain[i] = NA_STRING;
      }

    } else {
      subdomain[i] = NA_STRING;
      domain[i] = NA_STRING;
      suffix[i] = NA_STRING;
    }

    if (lower != NULL) psl_free_string(lower);

  }

  DataFrame out = DataFrame::create(
    _["host"] = domains,
    _["subdomain"] = subdomain,
    _["domain"] = domain,
    _["suffix"] = suffix,
    _["stringsAsFactors"] = false
  );

  out.attr("class") = CharacterVector::create("tbl_df", "tbl", "data.frame");

  return(out);

}

+ 1943
- 0
src/psl.c
File diff suppressed because it is too large
View File


+ 3898
- 0
src/suffixes_dafsa.c
File diff suppressed because it is too large
View File


Loading…
Cancel
Save