@ -1,24 +1,31 @@ | |||
Package: psl | |||
Type: Package | |||
Title: psl title goes here otherwise CRAN checks fail | |||
Title: Extract Internet Domain Components Using the Public Suffix List | |||
Version: 0.1.0 | |||
Date: 2018-09-06 | |||
Authors@R: c( | |||
person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"), | |||
comment = c(ORCID = "0000-0001-5670-2640")) | |||
comment = c(ORCID = "0000-0001-5670-2640")), | |||
person("Tim", "Rühsen", role = c("aut"), | |||
comment = "libpsl : <https://github.com/rockdaboot/libpsl>") | |||
) | |||
Maintainer: Bob Rudis <bob@rud.is> | |||
Description: A good description goes here otherwise CRAN checks fail. | |||
Description: The 'Public Suffix List' (<https://publicsuffix.org/>) is a collection | |||
of top-level domains ('TLDs') which include global top-level domains ('gTLDs') | |||
such as '.com' and '.net'; country top-level domains ('ccTLDs') such as '.de' and | |||
'.cn'; and, brand top-level domains such as '.apple' and '.google'. Tools are provided | |||
to extract internet domain components using the public suffix list base data. | |||
URL: https://gitlab.com/hrbrmstr/psl | |||
BugReports: https://gitlab.com/hrbrmstr/psl/issues | |||
SystemRequirements: C++11 | |||
Encoding: UTF-8 | |||
License: AGPL | |||
License: MIT + file LICENSE | |||
Suggests: | |||
testthat, | |||
covr | |||
Depends: | |||
R (>= 3.2.0) | |||
Imports: | |||
httr, | |||
jsonlite | |||
Rcpp | |||
RoxygenNote: 6.0.1.9000 | |||
LinkingTo: Rcpp |
@ -0,0 +1,2 @@ | |||
YEAR: 2018 | |||
COPYRIGHT HOLDER: Bob Rudis |
@ -1,4 +1,9 @@ | |||
# Generated by roxygen2: do not edit by hand | |||
import(httr) | |||
importFrom(jsonlite,fromJSON) | |||
export(apex_domain) | |||
export(is_public_suffix) | |||
export(public_suffix) | |||
export(suffix_extract) | |||
export(suffix_extract2) | |||
importFrom(Rcpp,sourceCpp) | |||
useDynLib(psl) |
@ -0,0 +1,55 @@ | |||
# Generated by using Rcpp::compileAttributes() -> do not edit by hand | |||
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 | |||
#' Return the apex/top-private domain from a vector of domains | |||
#' | |||
#' @md | |||
#' @param domains character vector of domains | |||
#' @return character vector, one element per input; `NA` where no apex domain is found (e.g. `""` or `"com"`) | |||
#' @export | |||
apex_domain <- function(domains) { | |||
# Pass `domains` unchanged to the native routine registered as
# `_psl_apex_domain` in the 'psl' shared library.
.Call('_psl_apex_domain', PACKAGE = 'psl', domains) | |||
} | |||
#' Return the public suffix from a vector of domains | |||
#' | |||
#' @md | |||
#' @param domains character vector of domains | |||
#' @return character vector, one element per input | |||
#' @export | |||
public_suffix <- function(domains) { | |||
# Pass `domains` unchanged to the native routine registered as
# `_psl_public_suffix` in the 'psl' shared library.
.Call('_psl_public_suffix', PACKAGE = 'psl', domains) | |||
} | |||
#' Test whether a domain is a public suffix | |||
#' | |||
#' @md | |||
#' @param domains character vector of domains | |||
#' @return logical vector, one element per input | |||
#' @export | |||
is_public_suffix <- function(domains) { | |||
# Pass `domains` unchanged to the native routine registered as
# `_psl_is_public_suffix` in the 'psl' shared library.
.Call('_psl_is_public_suffix', PACKAGE = 'psl', domains) | |||
} | |||
#' Separate a domain into component parts | |||
#' | |||
#' @md | |||
#' @param domains character vector of domains | |||
#' @return data frame with one row per input and columns `orig`, `normalized`, `subdomain`, `apex`, `domain` and `suffix` | |||
#' @export | |||
suffix_extract <- function(domains) { | |||
# Pass `domains` unchanged to the native routine registered as
# `_psl_suffix_extract` in the 'psl' shared library.
.Call('_psl_suffix_extract', PACKAGE = 'psl', domains) | |||
} | |||
#' Separate a domain into component parts | |||
#' | |||
#' Compatibility function for those using `urltools::suffix_extract()` | |||
#' | |||
#' @md | |||
#' @param domains character vector of domains | |||
#' @return data frame with one row per input and columns `host`, `subdomain`, `domain` and `suffix` | |||
#' @export | |||
suffix_extract2 <- function(domains) { | |||
# Pass `domains` unchanged to the native routine registered as
# `_psl_suffix_extract2` in the 'psl' shared library.
.Call('_psl_suffix_extract2', PACKAGE = 'psl', domains) | |||
} | |||
@ -1,12 +1,21 @@ | |||
#' ... | |||
#' | |||
#' Extract Internet Domain Components Using the Public Suffix List | |||
#' | |||
#' The 'Public Suffix List' (<https://publicsuffix.org/>) is a collection | |||
#' of top-level domains ('TLDs') which include global top-level domains ('gTLDs') | |||
#' such as '.com' and '.net'; country top-level domains ('ccTLDs') such as '.de' and | |||
#' '.cn'; and, brand top-level domains such as '.apple' and '.google'. Tools are provided | |||
#' to extract internet domain components using the public suffix list base data. | |||
#' | |||
#' - `libpsl`: <https://github.com/rockdaboot/libpsl> | |||
#' - Public Suffix List: <https://publicsuffix.org/> | |||
#' | |||
#' - URL: <https://gitlab.com/hrbrmstr/psl> | |||
#' - BugReports: <https://gitlab.com/hrbrmstr/psl/issues> | |||
#' | |||
#' | |||
#' @md | |||
#' @name psl | |||
#' @docType package | |||
#' @author Bob Rudis (bob@@rud.is) | |||
#' @import httr | |||
#' @importFrom jsonlite fromJSON | |||
NULL | |||
#' @useDynLib psl | |||
#' @importFrom Rcpp sourceCpp | |||
NULL |
@ -1,2 +1,138 @@ | |||
# psl | |||
Extract Internet Domain Components Using the Public Suffix List | |||
## Description | |||
The ‘Public Suffix List’ (<https://publicsuffix.org/>) is a collection | |||
of top-level domains (‘TLDs’) which include global top-level domains | |||
(‘gTLDs’) such as ‘.com’ and ‘.net’; country top-level domains | |||
(‘ccTLDs’) such as ‘.de’ and ‘.cn’; and, brand top-level domains such | |||
as ‘.apple’ and ‘.google’. Tools are provided to extract internet domain | |||
components using the public suffix list base data. | |||
- `libpsl`: <https://github.com/rockdaboot/libpsl> | |||
- Public Suffix List: <https://publicsuffix.org/> | |||
## What’s Inside The Tin | |||
The following functions are implemented: | |||
- `apex_domain`: Return the apex/top-private domain from a vector of | |||
domains | |||
- `is_public_suffix`: Test whether a domain is a public suffix | |||
- `public_suffix`: Return the public suffix from a vector of domains | |||
- `suffix_extract`: Separate a domain into component parts | |||
- `suffix_extract2`: Separate a domain into component parts (urltools | |||
compatible output) | |||
## Installation | |||
``` r | |||
devtools::install_github("hrbrmstr/psl") | |||
``` | |||
## Usage | |||
``` r | |||
library(psl) | |||
library(tidyverse) | |||
# current version | |||
packageVersion("psl") | |||
## [1] '0.1.0' | |||
``` | |||
``` r | |||
doms <- c( | |||
"", "com", "example.com", "www.example.com", | |||
".com", ".example", ".example.com", ".example.example", "example", | |||
"example.example", "b.example.example", "a.b.example.example", | |||
"biz", "domain.biz", "b.domain.biz", "a.b.domain.biz", "com", | |||
"example.com", "b.example.com", "a.b.example.com", "uk.com", | |||
"example.uk.com", "b.example.uk.com", "a.b.example.uk.com", "test.ac", | |||
"cy", "c.cy", "b.c.cy", "a.b.c.cy", "jp", "test.jp", "www.test.jp", | |||
"ac.jp", "test.ac.jp", "www.test.ac.jp", "kyoto.jp", "test.kyoto.jp", | |||
"ide.kyoto.jp", "b.ide.kyoto.jp", "a.b.ide.kyoto.jp", "c.kobe.jp", | |||
"b.c.kobe.jp", "a.b.c.kobe.jp", "city.kobe.jp", "www.city.kobe.jp", | |||
"ck", "test.ck", "b.test.ck", "a.b.test.ck", "www.ck", "www.www.ck", | |||
"us", "test.us", "www.test.us", "ak.us", "test.ak.us", "www.test.ak.us", | |||
"k12.ak.us", "test.k12.ak.us", "www.test.k12.ak.us" | |||
) | |||
apex_domain(doms) | |||
## [1] NA NA "example.com" "example.com" NA NA | |||
## [7] NA NA NA "example.example" "example.example" "example.example" | |||
## [13] NA "domain.biz" "domain.biz" "domain.biz" NA "example.com" | |||
## [19] "example.com" "example.com" NA "example.uk.com" "example.uk.com" "example.uk.com" | |||
## [25] "test.ac" NA "c.cy" "c.cy" "c.cy" NA | |||
## [31] "test.jp" "test.jp" NA "test.ac.jp" "test.ac.jp" NA | |||
## [37] "test.kyoto.jp" NA "b.ide.kyoto.jp" "b.ide.kyoto.jp" NA "b.c.kobe.jp" | |||
## [43] "b.c.kobe.jp" "city.kobe.jp" "city.kobe.jp" NA NA "b.test.ck" | |||
## [49] "b.test.ck" "www.ck" "www.ck" NA "test.us" "test.us" | |||
## [55] NA "test.ak.us" "test.ak.us" NA "test.k12.ak.us" "test.k12.ak.us" | |||
public_suffix(doms) | |||
## [1] "" "com" "com" "com" ".com" ".example" "com" | |||
## [8] "example" "example" "example" "example" "example" "biz" "biz" | |||
## [15] "biz" "biz" "com" "com" "com" "com" "uk.com" | |||
## [22] "uk.com" "uk.com" "uk.com" "ac" "cy" "cy" "cy" | |||
## [29] "cy" "jp" "jp" "jp" "ac.jp" "ac.jp" "ac.jp" | |||
## [36] "kyoto.jp" "kyoto.jp" "ide.kyoto.jp" "ide.kyoto.jp" "ide.kyoto.jp" "c.kobe.jp" "c.kobe.jp" | |||
## [43] "c.kobe.jp" "kobe.jp" "kobe.jp" "ck" "test.ck" "test.ck" "test.ck" | |||
## [50] "ck" "ck" "us" "us" "us" "ak.us" "ak.us" | |||
## [57] "ak.us" "k12.ak.us" "k12.ak.us" "k12.ak.us" | |||
is_public_suffix(doms) | |||
## [1] TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE | |||
## [20] FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE TRUE | |||
## [39] FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE | |||
## [58] TRUE FALSE FALSE | |||
suffix_extract(doms) | |||
## # A tibble: 60 x 6 | |||
## orig normalized subdomain apex domain suffix | |||
## <chr> <chr> <chr> <chr> <chr> <chr> | |||
## 1 "" "" <NA> <NA> <NA> "" | |||
## 2 com com <NA> <NA> <NA> com | |||
## 3 example.com example.com "" example.com example com | |||
## 4 www.example.com www.example.com www example.com example com | |||
## 5 .com .com <NA> <NA> <NA> .com | |||
## 6 .example .example <NA> <NA> <NA> .example | |||
## 7 .example.com .example.com <NA> <NA> <NA> com | |||
## 8 .example.example .example.example <NA> <NA> <NA> example | |||
## 9 example example <NA> <NA> <NA> example | |||
## 10 example.example example.example "" example.example example example | |||
## # ... with 50 more rows | |||
suffix_extract2(doms) # urltools compatible output | |||
## # A tibble: 60 x 4 | |||
## host subdomain domain suffix | |||
## <chr> <chr> <chr> <chr> | |||
## 1 "" <NA> <NA> "" | |||
## 2 com <NA> <NA> com | |||
## 3 example.com "" example com | |||
## 4 www.example.com www example com | |||
## 5 .com <NA> <NA> .com | |||
## 6 .example <NA> <NA> .example | |||
## 7 .example.com <NA> <NA> com | |||
## 8 .example.example <NA> <NA> example | |||
## 9 example <NA> <NA> example | |||
## 10 example.example "" example example | |||
## # ... with 50 more rows | |||
``` | |||
``` r | |||
library(microbenchmark) | |||
microbenchmark( | |||
urltools = urltools::suffix_extract(doms), | |||
psl = psl::suffix_extract(doms), # returns more data | |||
psl2 = psl::suffix_extract2(doms) # returns what urltools does | |||
) -> mb | |||
autoplot(mb) | |||
``` | |||
<img src="README_files/figure-gfm/bench-1.png" width="960" /> |
@ -0,0 +1,17 @@ | |||
% Generated by roxygen2: do not edit by hand | |||
% Please edit documentation in R/RcppExports.R | |||
\name{apex_domain} | |||
\alias{apex_domain} | |||
\title{Return the apex/top-private domain from a vector of domains} | |||
\usage{ | |||
apex_domain(domains) | |||
} | |||
\arguments{ | |||
\item{domains}{character vector of domains} | |||
} | |||
\value{ | |||
character vector | |||
} | |||
\description{ | |||
Return the apex/top-private domain from a vector of domains | |||
} |
@ -0,0 +1,17 @@ | |||
% Generated by roxygen2: do not edit by hand | |||
% Please edit documentation in R/RcppExports.R | |||
\name{is_public_suffix} | |||
\alias{is_public_suffix} | |||
\title{Test whether a domain is a public suffix} | |||
\usage{ | |||
is_public_suffix(domains) | |||
} | |||
\arguments{ | |||
\item{domains}{character vector of domains} | |||
} | |||
\value{ | |||
logical vector | |||
} | |||
\description{ | |||
Test whether a domain is a public suffix | |||
} |
@ -0,0 +1,17 @@ | |||
% Generated by roxygen2: do not edit by hand | |||
% Please edit documentation in R/RcppExports.R | |||
\name{public_suffix} | |||
\alias{public_suffix} | |||
\title{Return the public suffix from a vector of domains} | |||
\usage{ | |||
public_suffix(domains) | |||
} | |||
\arguments{ | |||
\item{domains}{character vector of domains} | |||
} | |||
\value{ | |||
character vector | |||
} | |||
\description{ | |||
Return the public suffix from a vector of domains | |||
} |
@ -0,0 +1,17 @@ | |||
% Generated by roxygen2: do not edit by hand | |||
% Please edit documentation in R/RcppExports.R | |||
\name{suffix_extract} | |||
\alias{suffix_extract} | |||
\title{Separate a domain into component parts} | |||
\usage{ | |||
suffix_extract(domains) | |||
} | |||
\arguments{ | |||
\item{domains}{character vector of domains} | |||
} | |||
\value{ | |||
data frame | |||
} | |||
\description{ | |||
Separate a domain into component parts | |||
} |
@ -0,0 +1,17 @@ | |||
% Generated by roxygen2: do not edit by hand | |||
% Please edit documentation in R/RcppExports.R | |||
\name{suffix_extract2} | |||
\alias{suffix_extract2} | |||
\title{Separate a domain into component parts} | |||
\usage{ | |||
suffix_extract2(domains) | |||
} | |||
\arguments{ | |||
\item{domains}{character vector of domains} | |||
} | |||
\value{ | |||
data frame | |||
} | |||
\description{ | |||
Compatibility function for those using \code{urltools::suffix_extract()} | |||
} |
@ -0,0 +1,3 @@ | |||
*.o | |||
*.so | |||
*.dll |
@ -0,0 +1,3 @@ | |||
CXX_STD = CXX11 | |||
PKG_CXXFLAGS = | |||
PKG_LIBS = -L. -liconv -lidn2 |
@ -0,0 +1,76 @@ | |||
// Generated by using Rcpp::compileAttributes() -> do not edit by hand | |||
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 | |||
#include <Rcpp.h> | |||
using namespace Rcpp; | |||
// apex_domain | |||
// .Call entry point for apex_domain(): converts the incoming SEXP into a
// CharacterVector, calls the C++ implementation declared on the next line,
// and wraps the returned CharacterVector back into a SEXP for R.
// NOTE(review): BEGIN_RCPP/END_RCPP and RNGScope come from Rcpp and are not
// defined in this file; presumably they trap C++ exceptions and manage R's RNG
// state around the call - confirm in the Rcpp documentation.
CharacterVector apex_domain(CharacterVector domains); | |||
RcppExport SEXP _psl_apex_domain(SEXP domainsSEXP) { | |||
BEGIN_RCPP | |||
Rcpp::RObject rcpp_result_gen; | |||
Rcpp::RNGScope rcpp_rngScope_gen; | |||
Rcpp::traits::input_parameter< CharacterVector >::type domains(domainsSEXP); | |||
rcpp_result_gen = Rcpp::wrap(apex_domain(domains)); | |||
return rcpp_result_gen; | |||
END_RCPP | |||
} | |||
// public_suffix | |||
// .Call entry point for public_suffix(): converts the incoming SEXP into a
// CharacterVector, calls the C++ implementation declared on the next line,
// and wraps the returned CharacterVector back into a SEXP for R.
// NOTE(review): BEGIN_RCPP/END_RCPP and RNGScope come from Rcpp and are not
// defined in this file; presumably they trap C++ exceptions and manage R's RNG
// state around the call - confirm in the Rcpp documentation.
CharacterVector public_suffix(CharacterVector domains); | |||
RcppExport SEXP _psl_public_suffix(SEXP domainsSEXP) { | |||
BEGIN_RCPP | |||
Rcpp::RObject rcpp_result_gen; | |||
Rcpp::RNGScope rcpp_rngScope_gen; | |||
Rcpp::traits::input_parameter< CharacterVector >::type domains(domainsSEXP); | |||
rcpp_result_gen = Rcpp::wrap(public_suffix(domains)); | |||
return rcpp_result_gen; | |||
END_RCPP | |||
} | |||
// is_public_suffix | |||
// .Call entry point for is_public_suffix(): converts the incoming SEXP into a
// CharacterVector, calls the C++ implementation declared on the next line,
// and wraps the returned std::vector<bool> into a SEXP for R.
// NOTE(review): BEGIN_RCPP/END_RCPP and RNGScope come from Rcpp and are not
// defined in this file; presumably they trap C++ exceptions and manage R's RNG
// state around the call - confirm in the Rcpp documentation.
std::vector< bool > is_public_suffix(CharacterVector domains); | |||
RcppExport SEXP _psl_is_public_suffix(SEXP domainsSEXP) { | |||
BEGIN_RCPP | |||
Rcpp::RObject rcpp_result_gen; | |||
Rcpp::RNGScope rcpp_rngScope_gen; | |||
Rcpp::traits::input_parameter< CharacterVector >::type domains(domainsSEXP); | |||
rcpp_result_gen = Rcpp::wrap(is_public_suffix(domains)); | |||
return rcpp_result_gen; | |||
END_RCPP | |||
} | |||
// suffix_extract | |||
// .Call entry point for suffix_extract(): converts the incoming SEXP into a
// CharacterVector, calls the C++ implementation declared on the next line,
// and wraps the returned DataFrame into a SEXP for R.
// NOTE(review): BEGIN_RCPP/END_RCPP and RNGScope come from Rcpp and are not
// defined in this file; presumably they trap C++ exceptions and manage R's RNG
// state around the call - confirm in the Rcpp documentation.
DataFrame suffix_extract(CharacterVector domains); | |||
RcppExport SEXP _psl_suffix_extract(SEXP domainsSEXP) { | |||
BEGIN_RCPP | |||
Rcpp::RObject rcpp_result_gen; | |||
Rcpp::RNGScope rcpp_rngScope_gen; | |||
Rcpp::traits::input_parameter< CharacterVector >::type domains(domainsSEXP); | |||
rcpp_result_gen = Rcpp::wrap(suffix_extract(domains)); | |||
return rcpp_result_gen; | |||
END_RCPP | |||
} | |||
// suffix_extract2 | |||
// .Call entry point for suffix_extract2(): converts the incoming SEXP into a
// CharacterVector, calls the C++ implementation declared on the next line,
// and wraps the returned DataFrame into a SEXP for R.
// NOTE(review): BEGIN_RCPP/END_RCPP and RNGScope come from Rcpp and are not
// defined in this file; presumably they trap C++ exceptions and manage R's RNG
// state around the call - confirm in the Rcpp documentation.
DataFrame suffix_extract2(CharacterVector domains); | |||
RcppExport SEXP _psl_suffix_extract2(SEXP domainsSEXP) { | |||
BEGIN_RCPP | |||
Rcpp::RObject rcpp_result_gen; | |||
Rcpp::RNGScope rcpp_rngScope_gen; | |||
Rcpp::traits::input_parameter< CharacterVector >::type domains(domainsSEXP); | |||
rcpp_result_gen = Rcpp::wrap(suffix_extract2(domains)); | |||
return rcpp_result_gen; | |||
END_RCPP | |||
} | |||
// Registration table for the .Call interface. Each row holds the routine name
// used from R, the address of the wrapper defined above, and its argument
// count (1 for every routine here). The {NULL, NULL, 0} row ends the table.
static const R_CallMethodDef CallEntries[] = { | |||
{"_psl_apex_domain", (DL_FUNC) &_psl_apex_domain, 1}, | |||
{"_psl_public_suffix", (DL_FUNC) &_psl_public_suffix, 1}, | |||
{"_psl_is_public_suffix", (DL_FUNC) &_psl_is_public_suffix, 1}, | |||
{"_psl_suffix_extract", (DL_FUNC) &_psl_suffix_extract, 1}, | |||
{"_psl_suffix_extract2", (DL_FUNC) &_psl_suffix_extract2, 1}, | |||
{NULL, NULL, 0} | |||
}; | |||
// Registers the routines listed in CallEntries as .Call routines for this
// library; the NULL arguments mean no routines are registered for the other
// interfaces. Then calls R_useDynamicSymbols(dll, FALSE).
// NOTE(review): by R convention R_init_<pkg> runs when the shared library is
// loaded, and R_useDynamicSymbols(FALSE) should limit lookups to registered
// routines - confirm against "Writing R Extensions".
RcppExport void R_init_psl(DllInfo *dll) { | |||
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); | |||
R_useDynamicSymbols(dll, FALSE); | |||
} | |||
@ -0,0 +1,147 @@ | |||
/* config.h. Generated from config.h.in by configure. */ | |||
/* config.h.in. Generated from configure.ac by autoheader. */ | |||
/* generate PSL data using libicu */ | |||
/* #undef BUILTIN_GENERATOR_LIBICU */ | |||
/* generate PSL data using libidn */ | |||
/* #undef BUILTIN_GENERATOR_LIBIDN */ | |||
/* generate PSL data using libidn2 */ | |||
#define BUILTIN_GENERATOR_LIBIDN2 1 | |||
/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP | |||
systems. This function is required for `alloca.c' support on those systems. | |||
*/ | |||
/* #undef CRAY_STACKSEG_END */ | |||
/* Define to 1 if using `alloca.c'. */ | |||
/* #undef C_ALLOCA */ | |||
/* Define to 1 if translation of program messages to the user's native | |||
language is requested. */ | |||
/* #undef ENABLE_NLS */ | |||
/* Define to 1 if you have `alloca', as a function or macro. */ | |||
#define HAVE_ALLOCA 1 | |||
/* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix). | |||
*/ | |||
#define HAVE_ALLOCA_H 1 | |||
/* Define to 1 if you have the MacOS X function CFLocaleCopyCurrent in the | |||
CoreFoundation framework. */ | |||
#define HAVE_CFLOCALECOPYCURRENT 1 | |||
/* Define to 1 if you have the MacOS X function CFPreferencesCopyAppValue in | |||
the CoreFoundation framework. */ | |||
#define HAVE_CFPREFERENCESCOPYAPPVALUE 1 | |||
/* Define to 1 if you have the `clock_gettime' function. */ | |||
#define HAVE_CLOCK_GETTIME 1 | |||
/* Define if the GNU dcgettext() function is already present or preinstalled. | |||
*/ | |||
/* #undef HAVE_DCGETTEXT */ | |||
/* Define to 1 if you have the <dlfcn.h> header file. */ | |||
#define HAVE_DLFCN_H 1 | |||
/* Define to 1 if you have the `fmemopen' function. */ | |||
#define HAVE_FMEMOPEN 1 | |||
/* Define if the GNU gettext() function is already present or preinstalled. */ | |||
/* #undef HAVE_GETTEXT */ | |||
/* Define if you have the iconv() function and it works. */ | |||
#define HAVE_ICONV 1 | |||
/* Define to 1 if you have the <inttypes.h> header file. */ | |||
#define HAVE_INTTYPES_H 1 | |||
/* Define to 1 if you have the <memory.h> header file. */ | |||
#define HAVE_MEMORY_H 1 | |||
/* Define to 1 if you have the `nl_langinfo' function. */ | |||
#define HAVE_NL_LANGINFO 1 | |||
/* Define to 1 if you have the <stdint.h> header file. */ | |||
#define HAVE_STDINT_H 1 | |||
/* Define to 1 if you have the <stdlib.h> header file. */ | |||
#define HAVE_STDLIB_H 1 | |||
/* Define to 1 if you have the <strings.h> header file. */ | |||
#define HAVE_STRINGS_H 1 | |||
/* Define to 1 if you have the <string.h> header file. */ | |||
#define HAVE_STRING_H 1 | |||
/* Define to 1 if you have the `strndup' function. */ | |||
#define HAVE_STRNDUP 1 | |||
/* Define to 1 if you have the <sys/stat.h> header file. */ | |||
#define HAVE_SYS_STAT_H 1 | |||
/* Define to 1 if you have the <sys/types.h> header file. */ | |||
#define HAVE_SYS_TYPES_H 1 | |||
/* Define to 1 if you have the <unistd.h> header file. */ | |||
#define HAVE_UNISTD_H 1 | |||
/* Define to 1 or 0, depending whether the compiler supports simple visibility | |||
declarations. */ | |||
#define HAVE_VISIBILITY 1 | |||
/* Define as const if the declaration of iconv() needs const. */ | |||
#define ICONV_CONST | |||
/* Define to the sub-directory where libtool stores uninstalled libraries. */ | |||
#define LT_OBJDIR ".libs/" | |||
/* Define to the address where bug reports for this package should be sent. */ | |||
#define PACKAGE_BUGREPORT "tim.ruehsen@gmx.de" | |||
/* Define to the full name of this package. */ | |||
#define PACKAGE_NAME "libpsl" | |||
/* Define to the full name and version of this package. */ | |||
#define PACKAGE_STRING "libpsl 0.20.2" | |||
/* Define to the one symbol short name of this package. */ | |||
#define PACKAGE_TARNAME "libpsl" | |||
/* Define to the home page for this package. */ | |||
#define PACKAGE_URL "https://github.com/rockdaboot/libpsl" | |||
/* Define to the version of this package. */ | |||
#define PACKAGE_VERSION "0.20.2" | |||
/* If using the C implementation of alloca, define if you know the | |||
direction of stack growth for your system; otherwise it will be | |||
automatically deduced at runtime. | |||
STACK_DIRECTION > 0 => grows toward higher addresses | |||
STACK_DIRECTION < 0 => grows toward lower addresses | |||
STACK_DIRECTION = 0 => direction of growth unknown */ | |||
/* #undef STACK_DIRECTION */ | |||
/* Define to 1 if you have the ANSI C header files. */ | |||
#define STDC_HEADERS 1 | |||
/* generate PSL data using libicu */ | |||
/* #undef WITH_LIBICU */ | |||
/* generate PSL data using libidn */ | |||
/* #undef WITH_LIBIDN */ | |||
/* generate PSL data using libidn2 */ | |||
#define WITH_LIBIDN2 1 | |||
/* Define to `__inline__' or `__inline' if that's what the C compiler | |||
calls it, or to nothing if 'inline' is not supported under any name. */ | |||
#ifndef __cplusplus | |||
/* #undef inline */ | |||
#endif | |||
/* Define to `unsigned int' if <sys/types.h> does not define. */ | |||
/* #undef size_t */ |
@ -0,0 +1,212 @@ | |||
/* | |||
* Copyright(c) 2014-2018 Tim Ruehsen | |||
* | |||
* Permission is hereby granted, free of charge, to any person obtaining a | |||
* copy of this software and associated documentation files (the "Software"), | |||
* to deal in the Software without restriction, including without limitation | |||
* the rights to use, copy, modify, merge, publish, distribute, sublicense, | |||
* and/or sell copies of the Software, and to permit persons to whom the | |||
* Software is furnished to do so, subject to the following conditions: | |||
* | |||
* The above copyright notice and this permission notice shall be included in | |||
* all copies or substantial portions of the Software. | |||
* | |||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |||
* DEALINGS IN THE SOFTWARE. | |||
* | |||
* This file is part of libpsl. | |||
* | |||
* Header file for libpsl library routines | |||
* | |||
* Changelog | |||
* 20.03.2014 Tim Ruehsen created | |||
* | |||
*/ | |||
#ifdef __cplusplus | |||
extern "C" { | |||
#endif | |||
#ifndef LIBPSL_LIBPSL_H | |||
#define LIBPSL_LIBPSL_H | |||
#include <stdio.h> | |||
#include <time.h> | |||
#define PSL_VERSION "0.20.2" | |||
#define PSL_VERSION_MAJOR 0 | |||
#define PSL_VERSION_MINOR 20 | |||
#define PSL_VERSION_PATCH 2 | |||
#define PSL_VERSION_NUMBER 0x001402 | |||
#ifndef PSL_API | |||
#if defined BUILDING_PSL && HAVE_VISIBILITY | |||
# define PSL_API __attribute__ ((__visibility__("default"))) | |||
#elif defined BUILDING_PSL && defined _MSC_VER && !defined PSL_STATIC | |||
# define PSL_API __declspec(dllexport) | |||
#elif defined _MSC_VER && !defined PSL_STATIC | |||
# define PSL_API __declspec(dllimport) | |||
#else | |||
# define PSL_API | |||
#endif | |||
#endif | |||
#ifdef __cplusplus | |||
extern "C" { | |||
#endif | |||
/* types for psl_is_public_suffix2() */ | |||
#define PSL_TYPE_ICANN (1<<0) | |||
#define PSL_TYPE_PRIVATE (1<<1) | |||
#define PSL_TYPE_NO_STAR_RULE (1<<2) | |||
#define PSL_TYPE_ANY (PSL_TYPE_ICANN | PSL_TYPE_PRIVATE) | |||
/** | |||
* psl_error_t: | |||
* @PSL_SUCCESS: Successful return. | |||
* @PSL_ERR_INVALID_ARG: Invalid argument. | |||
* @PSL_ERR_CONVERTER: Failed to open libicu utf-16 converter. | |||
* @PSL_ERR_TO_UTF16: Failed to convert to utf-16. | |||
* @PSL_ERR_TO_LOWER: Failed to convert utf-16 to lowercase. | |||
* @PSL_ERR_TO_UTF8: Failed to convert utf-16 to utf-8. | |||
* @PSL_ERR_NO_MEM: Failed to allocate memory. | |||
* | |||
* Return codes for PSL functions. | |||
* Negative return codes mean failure. | |||
* Positive values are reserved for non-error return codes. | |||
*/ | |||
typedef enum { | |||
PSL_SUCCESS = 0, | |||
PSL_ERR_INVALID_ARG = -1, | |||
PSL_ERR_CONVERTER = -2, /* failed to open libicu utf-16 converter */ | |||
PSL_ERR_TO_UTF16 = -3, /* failed to convert to utf-16 */ | |||
PSL_ERR_TO_LOWER = -4, /* failed to convert utf-16 to lowercase */ | |||
PSL_ERR_TO_UTF8 = -5, /* failed to convert utf-16 to utf-8 */ | |||
PSL_ERR_NO_MEM = -6 /* failed to allocate memory */ | |||
} psl_error_t; | |||
typedef struct _psl_ctx_st psl_ctx_t; | |||
/* frees PSL context */ | |||
PSL_API | |||
void | |||
psl_free(psl_ctx_t *psl); | |||
/* frees memory allocated by libpsl routines */ | |||
PSL_API | |||
void | |||
psl_free_string(char *str); | |||
/* loads PSL data from file */ | |||
PSL_API | |||
psl_ctx_t * | |||
psl_load_file(const char *fname); | |||
/* loads PSL data from FILE pointer */ | |||
PSL_API | |||
psl_ctx_t * | |||
psl_load_fp(FILE *fp); | |||
/* retrieves builtin PSL data */ | |||
PSL_API | |||
const psl_ctx_t * | |||
psl_builtin(void); | |||
/* retrieves most recent PSL data */ | |||
PSL_API | |||
psl_ctx_t * | |||
psl_latest(const char *fname); | |||
/* checks whether domain is a public suffix or not */ | |||
PSL_API | |||
int | |||
psl_is_public_suffix(const psl_ctx_t *psl, const char *domain); | |||
/* checks whether domain is a public suffix regarding the type or not */ | |||
PSL_API | |||
int | |||
psl_is_public_suffix2(const psl_ctx_t *psl, const char *domain, int type); | |||
/* checks whether cookie_domain is acceptable for domain or not */ | |||
PSL_API | |||
int | |||
psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname, const char *cookie_domain); | |||
/* returns the longest not registrable domain within 'domain' or NULL if none found */ | |||
PSL_API | |||
const char * | |||
psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain); | |||
/* returns the shortest possible registrable domain part or NULL if domain is not registrable at all */ | |||
PSL_API | |||
const char * | |||
psl_registrable_domain(const psl_ctx_t *psl, const char *domain); | |||
/* convert a string into lowercase UTF-8 */ | |||
PSL_API | |||
psl_error_t | |||
psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale, char **lower); | |||
/* does not include exceptions */ | |||
PSL_API | |||
int | |||
psl_suffix_count(const psl_ctx_t *psl); | |||
/* just counts exceptions */ | |||
PSL_API | |||
int | |||
psl_suffix_exception_count(const psl_ctx_t *psl); | |||
/* just counts wildcards */ | |||
PSL_API | |||
int | |||
psl_suffix_wildcard_count(const psl_ctx_t *psl); | |||
/* returns mtime of PSL source file */ | |||
PSL_API | |||
time_t | |||
psl_builtin_file_time(void); | |||
/* returns SHA1 checksum (hex-encoded, lowercase) of PSL source file */ | |||
PSL_API | |||
const char * | |||
psl_builtin_sha1sum(void); | |||
/* returns file name of PSL source file */ | |||
PSL_API | |||
const char * | |||
psl_builtin_filename(void); | |||
/* returns name of distribution PSL data file */ | |||
PSL_API | |||
const char * | |||
psl_dist_filename(void); | |||
/* returns library version string */ | |||
PSL_API | |||
const char * | |||
psl_get_version(void); | |||
/* checks library version number */ | |||
PSL_API | |||
int | |||
psl_check_version_number(int version); | |||
/* returns whether the built-in data is outdated or not */ | |||
PSL_API | |||
int | |||
psl_builtin_outdated(void); | |||
#ifdef __cplusplus | |||
} | |||
#endif | |||
#endif /* LIBPSL_LIBPSL_H */ | |||
#ifdef __cplusplus | |||
} | |||
#endif |
@ -0,0 +1,279 @@ | |||
/* Copyright 2015-2016 The Chromium Authors. All rights reserved. | |||
* Use of this source code is governed by a BSD-style license that can be | |||
* found in the LICENSE.chromium file. | |||
* | |||
* Converted to C89 2015 by Tim Rühsen | |||
*/ | |||
#include <stddef.h> | |||
#if defined(__GNUC__) && defined(__GNUC_MINOR__) | |||
# define _GCC_VERSION_AT_LEAST(major, minor) ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor))) | |||
#else | |||
# define _GCC_VERSION_AT_LEAST(major, minor) 0 | |||
#endif | |||
/* Bounds guard: makes the enclosing function return 0 unless a < b.
 * Expands to a bare `if`, so use it only as a complete statement; note that
 * b is not parenthesised in the expansion. */
#define CHECK_LT(a, b) if ((a) >= b) return 0 | |||
/* Sequence length indexed by the high nibble of a byte. A 0 entry means the
 * byte does not start a multibyte sequence. */
static const char multibyte_length_table[16] = { | |||
0, 0, 0, 0, /* 0x00-0x3F */ | |||
0, 0, 0, 0, /* 0x40-0x7F */ | |||
0, 0, 0, 0, /* 0x80-0xBF */ | |||
2, 2, 3, 4, /* 0xC0-0xFF */ | |||
}; | |||
/* | |||
* Get length of multibyte character sequence starting at a given byte. | |||
* Returns zero if the byte is not a valid leading byte in UTF-8. | |||
*
* Only the top four bits of c are examined, so single-byte characters
* (0x00-0x7F) and continuation bytes (0x80-0xBF) both yield 0, and every
* byte in 0xF0-0xFF yields 4.
*/ | |||
static int GetMultibyteLength(char c) { | |||
return multibyte_length_table[((unsigned char)c) >> 4]; | |||
} | |||
/* | |||
* Moves pointers one byte forward. | |||
*
* pos:             cursor into the graph byte array; always advanced by one.
* key:             cursor into the string being looked up. It is advanced
*                  except on the step that enters multibyte mode, where the
*                  graph byte is consumed but *key stays on the leading byte.
* multibyte_start: non-NULL while inside a multibyte character, pointing at
*                  its leading byte. Set on entry to multibyte mode and
*                  cleared once *key has moved past the whole sequence.
*/ | |||
static void NextPos(const unsigned char** pos, | |||
const char** key, | |||
const char** multibyte_start) | |||
{ | |||
++*pos; | |||
if (*multibyte_start) { | |||
/* Advance key to next byte in multibyte sequence. */ | |||
++*key; | |||
/* Reset multibyte_start if last byte in multibyte sequence was consumed. */ | |||
if (*key - *multibyte_start == GetMultibyteLength(**multibyte_start)) | |||
*multibyte_start = 0; | |||
} else { | |||
if (GetMultibyteLength(**key)) { | |||
/* Multibyte prefix was matched in the dafsa, start matching multibyte | |||
* content in next round. */ | |||
*multibyte_start = *key; | |||
} else { | |||
/* Advance key as a single byte character was matched. */ | |||
++*key; | |||
} | |||
} | |||
} | |||
/* | |||
* Read next offset from pos. | |||
* Returns true if an offset could be read, false otherwise. | |||
*
* Bits 0x60 of the first byte select the encoding: 0x60 is a three-byte
* offset (5 bits from the first byte plus two full bytes), 0x40 is a two-byte
* offset (5 bits plus one full byte), and anything else is a single byte
* carrying 6 bits. The decoded value is ADDED to *offset, so each offset is
* relative to the previous one.
*
* If bit 0x80 of the first byte is set, *pos is moved to end so that the next
* call returns 0; otherwise *pos advances past the bytes just read.
*/ | |||
static int GetNextOffset(const unsigned char** pos, | |||
const unsigned char* end, | |||
const unsigned char** offset) | |||
{ | |||
size_t bytes_consumed; | |||
if (*pos == end) | |||
return 0; | |||
/* When reading an offset the byte array must always contain at least | |||
* three more bytes to consume. First the offset to read, then a node | |||
* to skip over and finally a destination node. No object can be smaller | |||
* than one byte. */ | |||
CHECK_LT(*pos + 2, end); | |||
switch (**pos & 0x60) { | |||
case 0x60: /* Read three byte offset */ | |||
*offset += (((*pos)[0] & 0x1F) << 16) | ((*pos)[1] << 8) | (*pos)[2]; | |||
bytes_consumed = 3; | |||
break; | |||
case 0x40: /* Read two byte offset */ | |||
*offset += (((*pos)[0] & 0x1F) << 8) | (*pos)[1]; | |||
bytes_consumed = 2; | |||
break; | |||
default: | |||
*offset += (*pos)[0] & 0x3F; | |||
bytes_consumed = 1; | |||
} | |||
if ((**pos & 0x80) != 0) { | |||
*pos = end; | |||
} else { | |||
*pos += bytes_consumed; | |||
} | |||
return 1; | |||
} | |||
/* | |||
* Check if byte at offset is last in label. | |||
*
* Returns nonzero when bit 0x80 of *offset is set. Returns 0 when that bit
* is clear or when offset is not below end.
*/ | |||
static int IsEOL(const unsigned char* offset, const unsigned char* end) | |||
{ | |||
CHECK_LT(offset, end); | |||
return(*offset & 0x80) != 0; | |||
} | |||
/* | |||
* Check if byte at offset matches first character in key. | |||
* This version assumes a range check was already performed by the caller. | |||
*
* matcher:         the graph byte to compare against.
* key:             current position in the string being looked up.
* multibyte_start: non-NULL while inside a multibyte character, pointing at
*                  its leading byte.
*
* In multibyte mode the comparison is made after flipping bit 0x80 of matcher
* for the leading byte, or bits 0xC0 for the following bytes. Outside
* multibyte mode a key byte that starts a multibyte sequence matches only the
* marker value 0x1F; any other key byte must equal matcher exactly.
*/ | |||
static int IsMatchUnchecked(const unsigned char matcher, | |||
const char* key, | |||
const char* multibyte_start) | |||
{ | |||
if (multibyte_start) { | |||
/* Multibyte matching mode. */ | |||
if (multibyte_start == key) { | |||
/* Match leading byte, which will also match the sequence length. */ | |||
return (matcher ^ 0x80) == (const unsigned char)*key; | |||
} else { | |||
/* Match following bytes. */ | |||
return (matcher ^ 0xC0) == (const unsigned char)*key; | |||
} | |||
} | |||
/* If key points at a leading byte in a multibyte sequence, but we are not yet | |||
* in multibyte mode, then the dafsa should contain a special byte to indicate | |||
* a mode switch. */ | |||
if (GetMultibyteLength(*key)) { | |||
return matcher == 0x1F; | |||
} | |||
/* Normal matching of a single byte character. */ | |||
return matcher == (const unsigned char)*key; | |||
} | |||
/* | |||
* Check if byte at offset matches first character in key. | |||
* This version matches characters not last in label. | |||
*
* Returns 0 without reading *offset when offset is not below end; otherwise
* passes the byte unchanged to IsMatchUnchecked().
*/ | |||
static int IsMatch(const unsigned char* offset, | |||
const unsigned char* end, | |||
const char* key, | |||
const char* multibyte_start) | |||
{ | |||
CHECK_LT(offset, end); | |||
return IsMatchUnchecked(*offset, key, multibyte_start); | |||
} | |||
/*
 * Compare the graph byte at |offset| with the first character of |key|.
 * This variant is for characters that are last in a label: the 0x80 bit of
 * the graph byte is toggled before it is handed to IsMatchUnchecked().
 *
 * CHECK_LT (defined elsewhere in this file) guards the read of |offset|
 * against |end|.
 */
static int IsEndCharMatch(const unsigned char* offset,
                          const unsigned char* end,
                          const char* key,
                          const char* multibyte_start)
{
  unsigned char toggled;

  CHECK_LT(offset, end);

  toggled = (unsigned char)(*offset ^ 0x80);
  return IsMatchUnchecked(toggled, key, multibyte_start);
}
/*
 * Try to read a return value from the byte at |offset|.
 *
 * A return value is only accepted outside multibyte mode (|multibyte_start|
 * is NULL) and when the top three bits of the byte are 100 (0x80 under the
 * 0xE0 mask). The low four bits are then stored in |*return_value|.
 *
 * Returns 1 if a value was stored, 0 otherwise (|*return_value| is left
 * untouched in that case). CHECK_LT (defined elsewhere in this file) guards
 * the read of |offset| against |end|.
 */
static int GetReturnValue(const unsigned char* offset,
                          const unsigned char* end,
                          const char* multibyte_start,
                          int* return_value)
{
  CHECK_LT(offset, end);

  if (multibyte_start || (*offset & 0xE0) != 0x80)
    return 0;

  *return_value = *offset & 0x0F;
  return 1;
}
/*
 * Lookup a domain key in a byte array generated by psl-make-dafsa.
 *
 * Searches for the string |key| of length |key_length| in a fixed set of
 * strings. The set must be known at compile time; the script psl-make-dafsa
 * converts it into a graph structure called a DAFSA (Deterministic Acyclic
 * Finite State Automaton), which allows lookups that are efficient in both
 * time and space. That graph is a constant byte array and is passed in via
 * |graph| and |length|.
 *
 * Returns -1 when the key is not present. Otherwise returns the value decoded
 * by GetReturnValue(): kDafsaFound, or a bitmap made of one or more of
 * kDafsaExceptionRule, kDafsaWildcardRule and kDafsaPrivateRule ORed
 * together.
 */

/* prototype to skip warning with -Wmissing-prototypes */
int LookupStringInFixedSet(const unsigned char*, size_t, const char*, size_t);

int LookupStringInFixedSet(const unsigned char* graph,
                           size_t length,
                           const char* key,
                           size_t key_length)
{
  const unsigned char* const end = graph + length;
  const char* const key_end = key + key_length;
  const unsigned char* pos = graph;
  const unsigned char* offset = graph;
  const char* multibyte_start = 0;

  for (;;) {
    /* Set once at least one <char> of this child has been matched; from then
     * on any mismatch is final instead of "try the next sibling". */
    int consumed_any = 0;
    int matched;

    if (!GetNextOffset(&pos, end, &offset))
      break;

    /* A child node has one of these shapes:
     *   char <char>+ end_char offsets
     *   char <char>+ return value
     *   char end_char offsets
     *   char return value
     *   end_char offsets
     *   return_value
     */
    if (key != key_end && !IsEOL(offset, end)) {
      /* Leading <char> does not match: do not dive into this child. */
      if (!IsMatch(offset, end, key, multibyte_start))
        continue;

      consumed_any = 1;
      NextPos(&offset, &key, &multibyte_start);

      /* What can follow now:
       *   <char>+ end_char offsets
       *   <char>+ return value
       *   end_char offsets
       *   return value
       * Eat every remaining <char> node. */
      for (;;) {
        if (IsEOL(offset, end) || key == key_end)
          break;
        if (!IsMatch(offset, end, key, multibyte_start))
          return -1;
        NextPos(&offset, &key, &multibyte_start);
      }
    }

    /* What can follow now:
     *   end_char offsets
     *   return_value
     */
    if (key == key_end) {
      int value;

      matched = GetReturnValue(offset, end, multibyte_start, &value);
      if (matched)
        return value;
    } else {
      matched = IsEndCharMatch(offset, end, key, multibyte_start);
      if (matched) {
        NextPos(&offset, &key, &multibyte_start);
        pos = offset; /* Dive into child */
        continue;
      }
    }

    /* Nothing matched here. The DAFSA guarantees that once the first char
     * matched, all remaining chars MUST match if the key is truly present,
     * so a miss after consuming input is terminal. Otherwise move on to the
     * next sibling. */
    if (consumed_any)
      return -1;
  }

  return -1; /* No match */
}
/* prototype to skip warning with -Wmissing-prototypes */
int GetUtfMode(const unsigned char *graph, size_t length);

/*
 * Inspect the final byte of |graph| (|length| bytes long).
 *
 * Returns 1 when the graph is non-empty and its last byte has the high bit
 * clear (value below 0x80), 0 otherwise.
 * NOTE(review): judging by the name, that last byte presumably flags whether
 * the DAFSA was generated in UTF-8 mode - confirm against psl-make-dafsa.
 */
int GetUtfMode(const unsigned char *graph, size_t length)
{
  if (length == 0)
    return 0;

  return (graph[length - 1] & 0x80) == 0;
}
@ -0,0 +1,350 @@ | |||
#include <Rcpp.h> | |||
#include <regex> | |||
#include "libpsl.h" | |||
using namespace Rcpp; | |||
//' Return the apex/top-private domain from a vector of domains
//'
//' Each element has one trailing period (if any) removed and is lowercased
//' before the lookup. `NA` inputs, inputs that cannot be lowercased and
//' inputs without a registrable domain all yield `NA`.
//'
//' @md
//' @param domains character vector of domains
//' @return character vector the same length as `domains`
//' @export
// [[Rcpp::export]]
CharacterVector apex_domain(CharacterVector domains) {

  // R_xlen_t matches what size() returns (no truncation through unsigned int)
  R_xlen_t input_size = domains.size();
  CharacterVector output(input_size);

  const psl_ctx_t *psl = psl_builtin();

  for (R_xlen_t i = 0; i < input_size; i++) {

    // A missing value must stay missing; converting it to std::string would
    // turn it into the literal text "NA" and look that up instead.
    if (CharacterVector::is_na(domains[i])) {
      output[i] = NA_STRING;
      continue;
    }

    // remove trailing period if any
    std::string cleaned = Rcpp::as<std::string>(domains[i]);
    if (!cleaned.empty() && cleaned.back() == '.') cleaned.pop_back();

    // Fresh NULL on every pass: if the lowercasing call fails without writing
    // to `lower`, the psl_free_string() below must not see the (already
    // freed) pointer left over from a previous element.
    char *lower = NULL;

    // lowercase it
    int rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);

    const char *result = NULL;
    if (rc == PSL_SUCCESS) result = psl_registrable_domain(psl, lower);

    if (result) {
      output[i] = std::string(result);
    } else {
      output[i] = NA_STRING;
    }

    psl_free_string(lower);

  }

  return(output);

}
//' Return the public suffix from a vector of domains
//'
//' Each element has one trailing period (if any) removed and is lowercased
//' before the lookup. `NA` inputs, inputs that cannot be lowercased and
//' inputs for which no suffix is returned all yield `NA`.
//'
//' @md
//' @param domains character vector of domains
//' @return character vector the same length as `domains`
//' @export
// [[Rcpp::export]]
CharacterVector public_suffix(CharacterVector domains) {

  // R_xlen_t matches what size() returns (no truncation through unsigned int)
  R_xlen_t input_size = domains.size();
  CharacterVector output(input_size);

  const psl_ctx_t *psl = psl_builtin();

  for (R_xlen_t i = 0; i < input_size; i++) {

    // A missing value must stay missing; converting it to std::string would
    // turn it into the literal text "NA" and look that up instead.
    if (CharacterVector::is_na(domains[i])) {
      output[i] = NA_STRING;
      continue;
    }

    // remove trailing period if any
    std::string cleaned = Rcpp::as<std::string>(domains[i]);
    if (!cleaned.empty() && cleaned.back() == '.') cleaned.pop_back();

    // Fresh NULL on every pass: if the lowercasing call fails without writing
    // to `lower`, the psl_free_string() below must not see the (already
    // freed) pointer left over from a previous element.
    char *lower = NULL;

    // lowercase it
    int rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);

    const char *result = NULL;
    if (rc == PSL_SUCCESS) result = psl_unregistrable_domain(psl, lower);

    if (result) {
      output[i] = std::string(result);
    } else {
      output[i] = NA_STRING;
    }

    psl_free_string(lower);

  }

  return(output);

}
//' Test whether a domain is a public suffix | |||
//' | |||
//' @md | |||
//' @param domains character vector of domains | |||
//' @return character vector | |||
//' @export | |||
// [[Rcpp::export]] | |||
std::vector< bool > is_public_suffix(CharacterVector domains) { | |||
unsigned int input_size = domains.size(); | |||
std::vector < bool > output(input_size); | |||
char *lower = NULL; | |||
int rc; | |||
const psl_ctx_t *psl = psl_builtin(); | |||
for (unsigned int i = 0; i < input_size; i++) { | |||
// remove trailing period if any | |||
std::string cleaned = Rcpp::as<std::string>(domains[i]); | |||
if (cleaned.length() > 0) { | |||
if (cleaned.at(cleaned.length()-1) == '.') cleaned.pop_back(); | |||
} | |||
// lowercase it | |||
rc = psl_str_to_utf8lower( | |||
cleaned.c_str(), | |||
"utf-8", "en", | |||
&lower | |||
); | |||
if (rc == PSL_SUCCESS) { | |||
output[i] = (psl_is_public_suffix(psl, lower) == 1); | |||
} else { | |||
output[i] = NA_LOGICAL; | |||
} | |||
psl_free_string(lower); | |||
} | |||
return(output); | |||
} | |||
//' Separate a domain into component parts
//'
//' Each element has one trailing period (if any) removed and is lowercased,
//' then split into subdomain, apex, domain and suffix. `NA` inputs and inputs
//' that cannot be lowercased yield `NA` in every derived column; `domain` and
//' `subdomain` are also `NA` when either the suffix or the apex is missing.
//'
//' @md
//' @param domains character vector of domains
//' @return data frame (tibble-classed) with columns `orig`, `normalized`,
//'         `subdomain`, `apex`, `domain` and `suffix`
//' @export
// [[Rcpp::export]]
DataFrame suffix_extract(CharacterVector domains) {

  // R_xlen_t matches what size() returns (no truncation through unsigned int)
  R_xlen_t input_size = domains.size();

  CharacterVector normalized(input_size);
  CharacterVector subdomain(input_size);
  CharacterVector apex(input_size);
  CharacterVector domain(input_size);
  CharacterVector suffix(input_size);

  const psl_ctx_t *psl = psl_builtin();

  // Removes `tail` from the end of `full`, plus any periods left directly in
  // front of it. Plain string comparison instead of a regex built from the
  // domain text: no per-element regex compilation, and characters that are
  // regex metacharacters can neither throw nor match the wrong thing.
  auto strip_tail = [](const std::string &full,
                       const std::string &tail) -> std::string {
    if (full.size() < tail.size() ||
        full.compare(full.size() - tail.size(), tail.size(), tail) != 0) {
      return full;
    }
    std::string head(full, 0, full.size() - tail.size());
    while (!head.empty() && head.back() == '.') head.pop_back();
    return head;
  };

  for (R_xlen_t i = 0; i < input_size; i++) {

    // A missing value must stay missing; converting it to std::string would
    // turn it into the literal text "NA" and split that instead.
    if (CharacterVector::is_na(domains[i])) {
      normalized[i] = NA_STRING;
      subdomain[i] = NA_STRING;
      apex[i] = NA_STRING;
      domain[i] = NA_STRING;
      suffix[i] = NA_STRING;
      continue;
    }

    // remove trailing period if any
    std::string cleaned = Rcpp::as<std::string>(domains[i]);
    if (!cleaned.empty() && cleaned.back() == '.') cleaned.pop_back();

    // Fresh NULL on every pass: if the lowercasing call fails without writing
    // to `lower`, the psl_free_string() below must not see the (already
    // freed) pointer left over from a previous element.
    char *lower = NULL;

    // lowercase it
    int rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);

    if (rc == PSL_SUCCESS) {

      // no dots at end and lowercased
      std::string norm(lower);
      normalized[i] = norm;

      // try to get the suffix
      const char *suf = psl_unregistrable_domain(psl, lower);
      if (suf) {
        suffix[i] = std::string(suf);
      } else {
        suffix[i] = NA_STRING;
      }

      // try to get the apex
      const char *apx = psl_registrable_domain(psl, lower);
      if (apx) {
        apex[i] = std::string(apx);
      } else {
        apex[i] = NA_STRING;
      }

      if (suf && apx) {
        // domain = apex without its suffix; subdomain = input without apex
        domain[i] = strip_tail(std::string(apx), std::string(suf));
        subdomain[i] = strip_tail(norm, std::string(apx));
      } else {
        domain[i] = NA_STRING;
        subdomain[i] = NA_STRING;
      }

    } else {
      normalized[i] = NA_STRING;
      subdomain[i] = NA_STRING;
      apex[i] = NA_STRING;
      domain[i] = NA_STRING;
      suffix[i] = NA_STRING;
    }

    psl_free_string(lower);

  }

  DataFrame out = DataFrame::create(
    _["orig"] = domains,
    _["normalized"] = normalized,
    _["subdomain"] = subdomain,
    _["apex"] = apex,
    _["domain"] = domain,
    _["suffix"] = suffix,
    _["stringsAsFactors"] = false
  );

  out.attr("class") = CharacterVector::create("tbl_df", "tbl", "data.frame");

  return(out);

}
//' Separate a domain into component parts
//'
//' Compatibility function for those using `urltools::suffix_extract()`
//'
//' Each element has one trailing period (if any) removed and is lowercased
//' before being split. `NA` inputs and inputs that cannot be lowercased yield
//' `NA` in every derived column; `domain` and `subdomain` are also `NA` when
//' no apex domain is found.
//'
//' @md
//' @param domains character vector of domains
//' @return data frame (tibble-classed) with columns `host`, `subdomain`,
//'         `domain` and `suffix`
//' @export
// [[Rcpp::export]]
DataFrame suffix_extract2(CharacterVector domains) {

  // R_xlen_t matches what size() returns (no truncation through unsigned int)
  R_xlen_t input_size = domains.size();

  CharacterVector subdomain(input_size);
  CharacterVector domain(input_size);
  CharacterVector suffix(input_size);

  const psl_ctx_t *psl = psl_builtin();

  // Removes `tail` from the end of `full`, plus any periods left directly in
  // front of it. Plain string comparison instead of a regex built from the
  // domain text: no per-element regex compilation, and characters that are
  // regex metacharacters can neither throw nor match the wrong thing.
  auto strip_tail = [](const std::string &full,
                       const std::string &tail) -> std::string {
    if (full.size() < tail.size() ||
        full.compare(full.size() - tail.size(), tail.size(), tail) != 0) {
      return full;
    }
    std::string head(full, 0, full.size() - tail.size());
    while (!head.empty() && head.back() == '.') head.pop_back();
    return head;
  };

  for (R_xlen_t i = 0; i < input_size; i++) {

    // A missing value must stay missing; converting it to std::string would
    // turn it into the literal text "NA" and split that instead.
    if (CharacterVector::is_na(domains[i])) {
      subdomain[i] = NA_STRING;
      domain[i] = NA_STRING;
      suffix[i] = NA_STRING;
      continue;
    }

    // remove trailing period if any
    std::string cleaned = Rcpp::as<std::string>(domains[i]);
    if (!cleaned.empty() && cleaned.back() == '.') cleaned.pop_back();

    // Fresh NULL on every pass: if the lowercasing call fails without writing
    // to `lower`, the psl_free_string() below must not see the (already
    // freed) pointer left over from a previous element.
    char *lower = NULL;

    // lowercase it
    int rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);

    if (rc == PSL_SUCCESS) {

      std::string normalized(lower);

      // try to get the suffix
      const char *suf_ptr = psl_unregistrable_domain(psl, lower);

      if (suf_ptr) {

        std::string suf(suf_ptr);
        suffix[i] = suf;

        // try to get the apex
        const char *apex_ptr = psl_registrable_domain(psl, lower);

        if (apex_ptr) {
          // domain = apex without its suffix; subdomain = input without apex
          std::string apex(apex_ptr);
          domain[i] = strip_tail(apex, suf);
          subdomain[i] = strip_tail(normalized, apex);
        } else {
          subdomain[i] = NA_STRING;
          domain[i] = NA_STRING;
        }

      } else {
        subdomain[i] = NA_STRING;
        suffix[i] = NA_STRING;
        domain[i] = NA_STRING;
      }

    } else {
      subdomain[i] = NA_STRING;
      domain[i] = NA_STRING;
      suffix[i] = NA_STRING;
    }

    psl_free_string(lower);

  }

  DataFrame out = DataFrame::create(
    _["host"] = domains,
    _["subdomain"] = subdomain,
    _["domain"] = domain,
    _["suffix"] = suffix,
    _["stringsAsFactors"] = false
  );

  out.attr("class") = CharacterVector::create("tbl_df", "tbl", "data.frame");

  return(out);

}