24 changed files with 19681 additions and 15 deletions
@ -1,24 +1,31 @@ |
|||
Package: psl |
|||
Type: Package |
|||
Title: psl title goes here otherwise CRAN checks fail |
|||
Title: Extract Internet Domain Components Using the Public Suffix List |
|||
Version: 0.1.0 |
|||
Date: 2018-09-06 |
|||
Authors@R: c( |
|||
person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"), |
|||
comment = c(ORCID = "0000-0001-5670-2640")) |
|||
comment = c(ORCID = "0000-0001-5670-2640")), |
|||
person("Tim", "Rühsen", email = "tim.ruehsen@gmx.de", role = c("aut"), |
|||
comment = "libpsl : <https://github.com/rockdaboot/libpsl>") |
|||
) |
|||
Maintainer: Bob Rudis <bob@rud.is> |
|||
Description: A good description goes here otherwise CRAN checks fail. |
|||
Description: The 'Public Suffix List' (<https://publicsuffix.org/>) is a collection |
|||
of top-level domains ('TLDs') which include generic top-level domains ('gTLDs') |
|||
such as '.com' and '.net'; country top-level domains ('ccTLDs') such as '.de' and |
|||
'.cn'; and, brand top-level domains such as '.apple' and '.google'. Tools are provided |
|||
to extract internet domain components using the public suffix list base data. |
|||
URL: https://gitlab.com/hrbrmstr/psl |
|||
BugReports: https://gitlab.com/hrbrmstr/psl/issues |
|||
SystemRequirements: C++11 |
|||
Encoding: UTF-8 |
|||
License: AGPL |
|||
License: MIT + file LICENSE |
|||
Suggests: |
|||
testthat, |
|||
covr |
|||
Depends: |
|||
R (>= 3.2.0) |
|||
Imports: |
|||
httr, |
|||
jsonlite |
|||
Rcpp |
|||
RoxygenNote: 6.0.1.9000 |
|||
LinkingTo: Rcpp |
|||
|
@ -0,0 +1,2 @@ |
|||
YEAR: 2018 |
|||
COPYRIGHT HOLDER: Bob Rudis |
@ -1,4 +1,9 @@ |
|||
# Generated by roxygen2: do not edit by hand |
|||
|
|||
import(httr) |
|||
importFrom(jsonlite,fromJSON) |
|||
export(apex_domain) |
|||
export(is_public_suffix) |
|||
export(public_suffix) |
|||
export(suffix_extract) |
|||
export(suffix_extract2) |
|||
importFrom(Rcpp,sourceCpp) |
|||
useDynLib(psl) |
|||
|
@ -0,0 +1,55 @@ |
|||
# Generated by using Rcpp::compileAttributes() -> do not edit by hand |
|||
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 |
|||
|
|||
#' Find the apex (top-private) domain for each domain in a vector
#'
#' Hands `domains` to the compiled `_psl_apex_domain` routine registered by
#' this package and returns whatever that routine produces.
#'
#' @md
#' @param domains character vector of domains
#' @return character vector; the README example shows `NA` for inputs that
#'   have no apex domain (such as `""` or `"com"`)
#' @export
apex_domain <- function(domains) {
  .Call("_psl_apex_domain", domains, PACKAGE = "psl")
}
|||
|
|||
#' Find the public suffix for each domain in a vector
#'
#' Hands `domains` to the compiled `_psl_public_suffix` routine registered by
#' this package and returns whatever that routine produces.
#'
#' @md
#' @param domains character vector of domains
#' @return character vector of public suffixes
#' @export
public_suffix <- function(domains) {
  .Call("_psl_public_suffix", domains, PACKAGE = "psl")
}
|||
|
|||
#' Test whether each domain in a vector is a public suffix
#'
#' Hands `domains` to the compiled `_psl_is_public_suffix` routine registered
#' by this package. The underlying C++ function returns a vector of booleans,
#' so the result arrives in R as a logical vector (`TRUE`/`FALSE`), not a
#' character vector.
#'
#' @md
#' @param domains character vector of domains
#' @return logical vector; `TRUE` where the domain is itself a public suffix
#' @export
is_public_suffix <- function(domains) {
  .Call("_psl_is_public_suffix", domains, PACKAGE = "psl")
}
|||
|
|||
#' Split each domain in a vector into its component parts
#'
#' Hands `domains` to the compiled `_psl_suffix_extract` routine registered by
#' this package and returns whatever that routine produces.
#'
#' @md
#' @param domains character vector of domains
#' @return data frame; the README example shows the columns `orig`,
#'   `normalized`, `subdomain`, `apex`, `domain` and `suffix`
#' @export
suffix_extract <- function(domains) {
  .Call("_psl_suffix_extract", domains, PACKAGE = "psl")
}
|||
|
|||
#' Split each domain in a vector into its component parts (urltools layout)
#'
#' Compatibility function for those using `urltools::suffix_extract()`.
#' Hands `domains` to the compiled `_psl_suffix_extract2` routine registered
#' by this package and returns whatever that routine produces.
#'
#' @md
#' @param domains character vector of domains
#' @return data frame; the README example shows the columns `host`,
#'   `subdomain`, `domain` and `suffix`
#' @export
suffix_extract2 <- function(domains) {
  .Call("_psl_suffix_extract2", domains, PACKAGE = "psl")
}
|||
|
@ -1,12 +1,21 @@ |
|||
#' ... |
|||
#' |
|||
#' Extract Internet Domain Components Using the Public Suffix List |
|||
#' |
|||
#' The 'Public Suffix List' (<https://publicsuffix.org/>) is a collection |
|||
#' of top-level domains ('TLDs') which include generic top-level domains ('gTLDs') |
|||
#' such as '.com' and '.net'; country top-level domains ('ccTLDs') such as '.de' and |
|||
#' '.cn'; and, brand top-level domains such as '.apple' and '.google'. Tools are provided |
|||
#' to extract internet domain components using the public suffix list base data. |
|||
#' |
|||
#' - `libpsl`: <https://github.com/rockdaboot/libpsl> |
|||
#' - Public Suffix List: <https://publicsuffix.org/> |
|||
#' |
|||
#' - URL: <https://gitlab.com/hrbrmstr/psl> |
|||
#' - BugReports: <https://gitlab.com/hrbrmstr/psl/issues> |
|||
#' |
|||
#' |
|||
#' @md |
|||
#' @name psl |
|||
#' @docType package |
|||
#' @author Bob Rudis (bob@@rud.is) |
|||
#' @import httr |
|||
#' @importFrom jsonlite fromJSON |
|||
NULL |
|||
#' @useDynLib psl |
|||
#' @importFrom Rcpp sourceCpp |
|||
NULL |
@ -1,2 +1,138 @@ |
|||
|
|||
# psl |
|||
|
|||
Extract Internet Domain Components Using the Public Suffix List |
|||
|
|||
## Description |
|||
|
|||
The ‘Public Suffix List’ (<https://publicsuffix.org/>) is a collection |
|||
of top-level domains (‘TLDs’) which include generic top-level domains |
|||
(‘gTLDs’) such as ‘.com’ and ‘.net’; country top-level domains |
|||
(‘ccTLDs’) such as ‘.de’ and ‘.cn’; and, brand top-level domains such |
|||
as ‘.apple’ and ‘.google’. Tools are provided to extract internet domain |
|||
components using the public suffix list base data. |
|||
|
|||
- `libpsl`: <https://github.com/rockdaboot/libpsl> |
|||
- Public Suffix List: <https://publicsuffix.org/> |
|||
|
|||
## What’s Inside The Tin |
|||
|
|||
The following functions are implemented: |
|||
|
|||
- `apex_domain`: Return the apex/top-private domain from a vector of |
|||
domains |
|||
- `is_public_suffix`: Test whether a domain is a public suffix |
|||
- `public_suffix`: Return the public suffix from a vector of domains |
|||
- `suffix_extract`: Separate a domain into component parts |
|||
- `suffix_extract2`: Separate a domain into component parts (urltools |
|||
compatible output) |
|||
|
|||
## Installation |
|||
|
|||
``` r |
|||
devtools::install_github("hrbrmstr/psl") |
|||
``` |
|||
|
|||
## Usage |
|||
|
|||
``` r |
|||
library(psl) |
|||
library(tidyverse) |
|||
|
|||
# current version |
|||
packageVersion("psl") |
|||
## [1] '0.1.0' |
|||
``` |
|||
|
|||
``` r |
|||
doms <- c( |
|||
"", "com", "example.com", "www.example.com", |
|||
".com", ".example", ".example.com", ".example.example", "example", |
|||
"example.example", "b.example.example", "a.b.example.example", |
|||
"biz", "domain.biz", "b.domain.biz", "a.b.domain.biz", "com", |
|||
"example.com", "b.example.com", "a.b.example.com", "uk.com", |
|||
"example.uk.com", "b.example.uk.com", "a.b.example.uk.com", "test.ac", |
|||
"cy", "c.cy", "b.c.cy", "a.b.c.cy", "jp", "test.jp", "www.test.jp", |
|||
"ac.jp", "test.ac.jp", "www.test.ac.jp", "kyoto.jp", "test.kyoto.jp", |
|||
"ide.kyoto.jp", "b.ide.kyoto.jp", "a.b.ide.kyoto.jp", "c.kobe.jp", |
|||
"b.c.kobe.jp", "a.b.c.kobe.jp", "city.kobe.jp", "www.city.kobe.jp", |
|||
"ck", "test.ck", "b.test.ck", "a.b.test.ck", "www.ck", "www.www.ck", |
|||
"us", "test.us", "www.test.us", "ak.us", "test.ak.us", "www.test.ak.us", |
|||
"k12.ak.us", "test.k12.ak.us", "www.test.k12.ak.us" |
|||
) |
|||
|
|||
apex_domain(doms) |
|||
## [1] NA NA "example.com" "example.com" NA NA |
|||
## [7] NA NA NA "example.example" "example.example" "example.example" |
|||
## [13] NA "domain.biz" "domain.biz" "domain.biz" NA "example.com" |
|||
## [19] "example.com" "example.com" NA "example.uk.com" "example.uk.com" "example.uk.com" |
|||
## [25] "test.ac" NA "c.cy" "c.cy" "c.cy" NA |
|||
## [31] "test.jp" "test.jp" NA "test.ac.jp" "test.ac.jp" NA |
|||
## [37] "test.kyoto.jp" NA "b.ide.kyoto.jp" "b.ide.kyoto.jp" NA "b.c.kobe.jp" |
|||
## [43] "b.c.kobe.jp" "city.kobe.jp" "city.kobe.jp" NA NA "b.test.ck" |
|||
## [49] "b.test.ck" "www.ck" "www.ck" NA "test.us" "test.us" |
|||
## [55] NA "test.ak.us" "test.ak.us" NA "test.k12.ak.us" "test.k12.ak.us" |
|||
|
|||
public_suffix(doms) |
|||
## [1] "" "com" "com" "com" ".com" ".example" "com" |
|||
## [8] "example" "example" "example" "example" "example" "biz" "biz" |
|||
## [15] "biz" "biz" "com" "com" "com" "com" "uk.com" |
|||
## [22] "uk.com" "uk.com" "uk.com" "ac" "cy" "cy" "cy" |
|||
## [29] "cy" "jp" "jp" "jp" "ac.jp" "ac.jp" "ac.jp" |
|||
## [36] "kyoto.jp" "kyoto.jp" "ide.kyoto.jp" "ide.kyoto.jp" "ide.kyoto.jp" "c.kobe.jp" "c.kobe.jp" |
|||
## [43] "c.kobe.jp" "kobe.jp" "kobe.jp" "ck" "test.ck" "test.ck" "test.ck" |
|||
## [50] "ck" "ck" "us" "us" "us" "ak.us" "ak.us" |
|||
## [57] "ak.us" "k12.ak.us" "k12.ak.us" "k12.ak.us" |
|||
|
|||
is_public_suffix(doms) |
|||
## [1] TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE |
|||
## [20] FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE TRUE |
|||
## [39] FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE |
|||
## [58] TRUE FALSE FALSE |
|||
|
|||
suffix_extract(doms) |
|||
## # A tibble: 60 x 6 |
|||
## orig normalized subdomain apex domain suffix |
|||
## <chr> <chr> <chr> <chr> <chr> <chr> |
|||
## 1 "" "" <NA> <NA> <NA> "" |
|||
## 2 com com <NA> <NA> <NA> com |
|||
## 3 example.com example.com "" example.com example com |
|||
## 4 www.example.com www.example.com www example.com example com |
|||
## 5 .com .com <NA> <NA> <NA> .com |
|||
## 6 .example .example <NA> <NA> <NA> .example |
|||
## 7 .example.com .example.com <NA> <NA> <NA> com |
|||
## 8 .example.example .example.example <NA> <NA> <NA> example |
|||
## 9 example example <NA> <NA> <NA> example |
|||
## 10 example.example example.example "" example.example example example |
|||
## # ... with 50 more rows |
|||
|
|||
suffix_extract2(doms) # urltools compatible output |
|||
## # A tibble: 60 x 4 |
|||
## host subdomain domain suffix |
|||
## <chr> <chr> <chr> <chr> |
|||
## 1 "" <NA> <NA> "" |
|||
## 2 com <NA> <NA> com |
|||
## 3 example.com "" example com |
|||
## 4 www.example.com www example com |
|||
## 5 .com <NA> <NA> .com |
|||
## 6 .example <NA> <NA> .example |
|||
## 7 .example.com <NA> <NA> com |
|||
## 8 .example.example <NA> <NA> example |
|||
## 9 example <NA> <NA> example |
|||
## 10 example.example "" example example |
|||
## # ... with 50 more rows |
|||
``` |
|||
|
|||
``` r |
|||
library(microbenchmark) |
|||
|
|||
microbenchmark( |
|||
urltools = urltools::suffix_extract(doms), |
|||
psl = psl::suffix_extract(doms), # returns more data |
|||
psl2 = psl::suffix_extract2(doms) # returns what urltools does |
|||
) -> mb |
|||
|
|||
autoplot(mb) |
|||
``` |
|||
|
|||
<img src="README_files/figure-gfm/bench-1.png" width="960" /> |
|||
|
After Width: | Height: | Size: 73 KiB |
File diff suppressed because it is too large
@ -0,0 +1,17 @@ |
|||
% Generated by roxygen2: do not edit by hand |
|||
% Please edit documentation in R/RcppExports.R |
|||
\name{apex_domain} |
|||
\alias{apex_domain} |
|||
\title{Return the apex/top-private domain from a vector of domains} |
|||
\usage{ |
|||
apex_domain(domains) |
|||
} |
|||
\arguments{ |
|||
\item{domains}{character vector of domains} |
|||
} |
|||
\value{ |
|||
character vector |
|||
} |
|||
\description{ |
|||
Return the apex/top-private domain from a vector of domains |
|||
} |
@ -0,0 +1,17 @@ |
|||
% Generated by roxygen2: do not edit by hand |
|||
% Please edit documentation in R/RcppExports.R |
|||
\name{is_public_suffix} |
|||
\alias{is_public_suffix} |
|||
\title{Test whether a domain is a public suffix} |
|||
\usage{ |
|||
is_public_suffix(domains) |
|||
} |
|||
\arguments{ |
|||
\item{domains}{character vector of domains} |
|||
} |
|||
\value{ |
|||
logical vector |
|||
} |
|||
\description{ |
|||
Test whether a domain is a public suffix |
|||
} |
@ -0,0 +1,17 @@ |
|||
% Generated by roxygen2: do not edit by hand |
|||
% Please edit documentation in R/RcppExports.R |
|||
\name{public_suffix} |
|||
\alias{public_suffix} |
|||
\title{Return the public suffix from a vector of domains} |
|||
\usage{ |
|||
public_suffix(domains) |
|||
} |
|||
\arguments{ |
|||
\item{domains}{character vector of domains} |
|||
} |
|||
\value{ |
|||
character vector |
|||
} |
|||
\description{ |
|||
Return the public suffix from a vector of domains |
|||
} |
@ -0,0 +1,17 @@ |
|||
% Generated by roxygen2: do not edit by hand |
|||
% Please edit documentation in R/RcppExports.R |
|||
\name{suffix_extract} |
|||
\alias{suffix_extract} |
|||
\title{Separate a domain into component parts} |
|||
\usage{ |
|||
suffix_extract(domains) |
|||
} |
|||
\arguments{ |
|||
\item{domains}{character vector of domains} |
|||
} |
|||
\value{ |
|||
data frame |
|||
} |
|||
\description{ |
|||
Separate a domain into component parts |
|||
} |
@ -0,0 +1,17 @@ |
|||
% Generated by roxygen2: do not edit by hand |
|||
% Please edit documentation in R/RcppExports.R |
|||
\name{suffix_extract2} |
|||
\alias{suffix_extract2} |
|||
\title{Separate a domain into component parts} |
|||
\usage{ |
|||
suffix_extract2(domains) |
|||
} |
|||
\arguments{ |
|||
\item{domains}{character vector of domains} |
|||
} |
|||
\value{ |
|||
data frame |
|||
} |
|||
\description{ |
|||
Compatibility function for those using \code{urltools::suffix_extract()} |
|||
} |
@ -0,0 +1,3 @@ |
|||
*.o |
|||
*.so |
|||
*.dll |
@ -0,0 +1,3 @@ |
|||
CXX_STD = CXX11 |
|||
PKG_CXXFLAGS = |
|||
PKG_LIBS = -L. -liconv -lidn2 |
@ -0,0 +1,76 @@ |
|||
// Generated by using Rcpp::compileAttributes() -> do not edit by hand
|
|||
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
|
|||
|
|||
#include <Rcpp.h> |
|||
|
|||
using namespace Rcpp; |
|||
|
|||
// apex_domain
|
|||
// .Call glue for apex_domain(): the implementation is only declared here
// (its definition is not in this file). The wrapper converts domainsSEXP
// into a CharacterVector, calls apex_domain(), wraps the result with
// Rcpp::wrap() and returns it as a SEXP.
// NOTE(review): generated by Rcpp::compileAttributes(); BEGIN_RCPP/END_RCPP
// and the declaration order are emitted by the generator -- leave as is.
CharacterVector apex_domain(CharacterVector domains); |
|||
RcppExport SEXP _psl_apex_domain(SEXP domainsSEXP) { |
|||
BEGIN_RCPP |
|||
Rcpp::RObject rcpp_result_gen; |
|||
Rcpp::RNGScope rcpp_rngScope_gen; |
|||
Rcpp::traits::input_parameter< CharacterVector >::type domains(domainsSEXP); |
|||
rcpp_result_gen = Rcpp::wrap(apex_domain(domains)); |
|||
return rcpp_result_gen; |
|||
END_RCPP |
|||
} |
|||
// public_suffix
|
|||
// .Call glue for public_suffix(): the implementation is only declared here
// (its definition is not in this file). The wrapper converts domainsSEXP
// into a CharacterVector, calls public_suffix(), wraps the result with
// Rcpp::wrap() and returns it as a SEXP.
// NOTE(review): generated by Rcpp::compileAttributes(); BEGIN_RCPP/END_RCPP
// and the declaration order are emitted by the generator -- leave as is.
CharacterVector public_suffix(CharacterVector domains); |
|||
RcppExport SEXP _psl_public_suffix(SEXP domainsSEXP) { |
|||
BEGIN_RCPP |
|||
Rcpp::RObject rcpp_result_gen; |
|||
Rcpp::RNGScope rcpp_rngScope_gen; |
|||
Rcpp::traits::input_parameter< CharacterVector >::type domains(domainsSEXP); |
|||
rcpp_result_gen = Rcpp::wrap(public_suffix(domains)); |
|||
return rcpp_result_gen; |
|||
END_RCPP |
|||
} |
|||
// is_public_suffix
|
|||
// .Call glue for is_public_suffix(): the implementation is only declared
// here (its definition is not in this file) and returns std::vector< bool >,
// i.e. one boolean per input rather than strings. The wrapper converts
// domainsSEXP into a CharacterVector, calls is_public_suffix(), wraps the
// result with Rcpp::wrap() and returns it as a SEXP.
// NOTE(review): generated by Rcpp::compileAttributes(); BEGIN_RCPP/END_RCPP
// and the declaration order are emitted by the generator -- leave as is.
std::vector< bool > is_public_suffix(CharacterVector domains); |
|||
RcppExport SEXP _psl_is_public_suffix(SEXP domainsSEXP) { |
|||
BEGIN_RCPP |
|||
Rcpp::RObject rcpp_result_gen; |
|||
Rcpp::RNGScope rcpp_rngScope_gen; |
|||
Rcpp::traits::input_parameter< CharacterVector >::type domains(domainsSEXP); |
|||
rcpp_result_gen = Rcpp::wrap(is_public_suffix(domains)); |
|||
return rcpp_result_gen; |
|||
END_RCPP |
|||
} |
|||
// suffix_extract
|
|||
// .Call glue for suffix_extract(): the implementation is only declared here
// (its definition is not in this file) and returns an Rcpp DataFrame. The
// wrapper converts domainsSEXP into a CharacterVector, calls
// suffix_extract(), wraps the result with Rcpp::wrap() and returns it as a
// SEXP.
// NOTE(review): generated by Rcpp::compileAttributes(); BEGIN_RCPP/END_RCPP
// and the declaration order are emitted by the generator -- leave as is.
DataFrame suffix_extract(CharacterVector domains); |
|||
RcppExport SEXP _psl_suffix_extract(SEXP domainsSEXP) { |
|||
BEGIN_RCPP |
|||
Rcpp::RObject rcpp_result_gen; |
|||
Rcpp::RNGScope rcpp_rngScope_gen; |
|||
Rcpp::traits::input_parameter< CharacterVector >::type domains(domainsSEXP); |
|||
rcpp_result_gen = Rcpp::wrap(suffix_extract(domains)); |
|||
return rcpp_result_gen; |
|||
END_RCPP |
|||
} |
|||
// suffix_extract2
|
|||
// .Call glue for suffix_extract2(): the implementation is only declared here
// (its definition is not in this file) and returns an Rcpp DataFrame. The
// wrapper converts domainsSEXP into a CharacterVector, calls
// suffix_extract2(), wraps the result with Rcpp::wrap() and returns it as a
// SEXP.
// NOTE(review): generated by Rcpp::compileAttributes(); BEGIN_RCPP/END_RCPP
// and the declaration order are emitted by the generator -- leave as is.
DataFrame suffix_extract2(CharacterVector domains); |
|||
RcppExport SEXP _psl_suffix_extract2(SEXP domainsSEXP) { |
|||
BEGIN_RCPP |
|||
Rcpp::RObject rcpp_result_gen; |
|||
Rcpp::RNGScope rcpp_rngScope_gen; |
|||
Rcpp::traits::input_parameter< CharacterVector >::type domains(domainsSEXP); |
|||
rcpp_result_gen = Rcpp::wrap(suffix_extract2(domains)); |
|||
return rcpp_result_gen; |
|||
END_RCPP |
|||
} |
|||
|
|||
// Registration table handed to R_registerRoutines() in R_init_psl(). Each row
// is {name used from R's .Call(), function pointer, number of arguments};
// every wrapper in this file takes exactly one argument. The all-NULL row
// terminates the table.
static const R_CallMethodDef CallEntries[] = { |
|||
{"_psl_apex_domain", (DL_FUNC) &_psl_apex_domain, 1}, |
|||
{"_psl_public_suffix", (DL_FUNC) &_psl_public_suffix, 1}, |
|||
{"_psl_is_public_suffix", (DL_FUNC) &_psl_is_public_suffix, 1}, |
|||
{"_psl_suffix_extract", (DL_FUNC) &_psl_suffix_extract, 1}, |
|||
{"_psl_suffix_extract2", (DL_FUNC) &_psl_suffix_extract2, 1}, |
|||
{NULL, NULL, 0} |
|||
}; |
|||
|
|||
// Library initialisation hook. The name follows R's R_init_<library>
// convention, so presumably R invokes it when the psl shared object is
// loaded -- confirm against "Writing R Extensions".
// Registers CallEntries (passed in the third argument slot; the other method
// tables are NULL) and then turns dynamic symbol lookup off for this DLL.
RcppExport void R_init_psl(DllInfo *dll) { |
|||
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); |
|||
R_useDynamicSymbols(dll, FALSE); |
|||
} |
@ -0,0 +1,147 @@ |
|||
/* config.h. Generated from config.h.in by configure. */ |
|||
/* config.h.in. Generated from configure.ac by autoheader. */ |
|||
|
|||
/* generate PSL data using libicu */ |
|||
/* #undef BUILTIN_GENERATOR_LIBICU */ |
|||
|
|||
/* generate PSL data using libidn */ |
|||
/* #undef BUILTIN_GENERATOR_LIBIDN */ |
|||
|
|||
/* generate PSL data using libidn2 */ |
|||
#define BUILTIN_GENERATOR_LIBIDN2 1 |
|||
|
|||
/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP
|
|||
systems. This function is required for `alloca.c' support on those systems. |
|||
*/ |
|||
/* #undef CRAY_STACKSEG_END */ |
|||
|
|||
/* Define to 1 if using `alloca.c'. */ |
|||
/* #undef C_ALLOCA */ |
|||
|
|||
/* Define to 1 if translation of program messages to the user's native
|
|||
language is requested. */ |
|||
/* #undef ENABLE_NLS */ |
|||
|
|||
/* Define to 1 if you have `alloca', as a function or macro. */ |
|||
#define HAVE_ALLOCA 1 |
|||
|
|||
/* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix).
|
|||
*/ |
|||
#define HAVE_ALLOCA_H 1 |
|||
|
|||
/* Define to 1 if you have the MacOS X function CFLocaleCopyCurrent in the
|
|||
CoreFoundation framework. */ |
|||
#define HAVE_CFLOCALECOPYCURRENT 1 |
|||
|
|||
/* Define to 1 if you have the MacOS X function CFPreferencesCopyAppValue in
|
|||
the CoreFoundation framework. */ |
|||
#define HAVE_CFPREFERENCESCOPYAPPVALUE 1 |
|||
|
|||
/* Define to 1 if you have the `clock_gettime' function. */ |
|||
#define HAVE_CLOCK_GETTIME 1 |
|||
|
|||
/* Define if the GNU dcgettext() function is already present or preinstalled.
|
|||
*/ |
|||
/* #undef HAVE_DCGETTEXT */ |
|||
|
|||
/* Define to 1 if you have the <dlfcn.h> header file. */ |
|||
#define HAVE_DLFCN_H 1 |
|||
|
|||
/* Define to 1 if you have the `fmemopen' function. */ |
|||
#define HAVE_FMEMOPEN 1 |
|||
|
|||
/* Define if the GNU gettext() function is already present or preinstalled. */ |
|||
/* #undef HAVE_GETTEXT */ |
|||
|
|||
/* Define if you have the iconv() function and it works. */ |
|||
#define HAVE_ICONV 1 |
|||
|
|||
/* Define to 1 if you have the <inttypes.h> header file. */ |
|||
#define HAVE_INTTYPES_H 1 |
|||
|
|||
/* Define to 1 if you have the <memory.h> header file. */ |
|||
#define HAVE_MEMORY_H 1 |
|||
|
|||
/* Define to 1 if you have the `nl_langinfo' function. */ |
|||
#define HAVE_NL_LANGINFO 1 |
|||
|
|||
/* Define to 1 if you have the <stdint.h> header file. */ |
|||
#define HAVE_STDINT_H 1 |
|||
|
|||
/* Define to 1 if you have the <stdlib.h> header file. */ |
|||
#define HAVE_STDLIB_H 1 |
|||
|
|||
/* Define to 1 if you have the <strings.h> header file. */ |
|||
#define HAVE_STRINGS_H 1 |
|||
|
|||
/* Define to 1 if you have the <string.h> header file. */ |
|||
#define HAVE_STRING_H 1 |
|||
|
|||
/* Define to 1 if you have the `strndup' function. */ |
|||
#define HAVE_STRNDUP 1 |
|||
|
|||
/* Define to 1 if you have the <sys/stat.h> header file. */ |
|||
#define HAVE_SYS_STAT_H 1 |
|||
|
|||
/* Define to 1 if you have the <sys/types.h> header file. */ |
|||
#define HAVE_SYS_TYPES_H 1 |
|||
|
|||
/* Define to 1 if you have the <unistd.h> header file. */ |
|||
#define HAVE_UNISTD_H 1 |
|||
|
|||
/* Define to 1 or 0, depending whether the compiler supports simple visibility
|
|||
declarations. */ |
|||
#define HAVE_VISIBILITY 1 |
|||
|
|||
/* Define as const if the declaration of iconv() needs const. */ |
|||
#define ICONV_CONST |
|||
|
|||
/* Define to the sub-directory where libtool stores uninstalled libraries. */ |
|||
#define LT_OBJDIR ".libs/" |
|||
|
|||
/* Define to the address where bug reports for this package should be sent. */ |
|||
#define PACKAGE_BUGREPORT "tim.ruehsen@gmx.de" |
|||
|
|||
/* Define to the full name of this package. */ |
|||
#define PACKAGE_NAME "libpsl" |
|||
|
|||
/* Define to the full name and version of this package. */ |
|||
#define PACKAGE_STRING "libpsl 0.20.2" |
|||
|
|||
/* Define to the one symbol short name of this package. */ |
|||
#define PACKAGE_TARNAME "libpsl" |
|||
|
|||
/* Define to the home page for this package. */ |
|||
#define PACKAGE_URL "https://github.com/rockdaboot/libpsl"
|
|||
|
|||
/* Define to the version of this package. */ |
|||
#define PACKAGE_VERSION "0.20.2" |
|||
|
|||
/* If using the C implementation of alloca, define if you know the
|
|||
direction of stack growth for your system; otherwise it will be |
|||
automatically deduced at runtime. |
|||
STACK_DIRECTION > 0 => grows toward higher addresses |
|||
STACK_DIRECTION < 0 => grows toward lower addresses |
|||
STACK_DIRECTION = 0 => direction of growth unknown */ |
|||
/* #undef STACK_DIRECTION */ |
|||
|
|||
/* Define to 1 if you have the ANSI C header files. */ |
|||
#define STDC_HEADERS 1 |
|||
|
|||
/* generate PSL data using libicu */ |
|||
/* #undef WITH_LIBICU */ |
|||
|
|||
/* generate PSL data using libidn */ |
|||
/* #undef WITH_LIBIDN */ |
|||
|
|||
/* generate PSL data using libidn2 */ |
|||
#define WITH_LIBIDN2 1 |
|||
|
|||
/* Define to `__inline__' or `__inline' if that's what the C compiler
|
|||
calls it, or to nothing if 'inline' is not supported under any name. */ |
|||
#ifndef __cplusplus |
|||
/* #undef inline */ |
|||
#endif |
|||
|
|||
/* Define to `unsigned int' if <sys/types.h> does not define. */ |
|||
/* #undef size_t */ |
@ -0,0 +1,212 @@ |
|||
/*
|
|||
* Copyright(c) 2014-2018 Tim Ruehsen |
|||
* |
|||
* Permission is hereby granted, free of charge, to any person obtaining a |
|||
* copy of this software and associated documentation files (the "Software"), |
|||
* to deal in the Software without restriction, including without limitation |
|||
* the rights to use, copy, modify, merge, publish, distribute, sublicense, |
|||
* and/or sell copies of the Software, and to permit persons to whom the |
|||
* Software is furnished to do so, subject to the following conditions: |
|||
* |
|||
* The above copyright notice and this permission notice shall be included in |
|||
* all copies or substantial portions of the Software. |
|||
* |
|||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
|||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
|||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
|||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
|||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
|||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
|||
* DEALINGS IN THE SOFTWARE. |
|||
* |
|||
* This file is part of libpsl. |
|||
* |
|||
* Header file for libpsl library routines |
|||
* |
|||
* Changelog |
|||
* 20.03.2014 Tim Ruehsen created |
|||
* |
|||
*/ |
|||
#ifdef __cplusplus |
|||
extern "C" { |
|||
#endif |
|||
|
|||
#ifndef LIBPSL_LIBPSL_H |
|||
#define LIBPSL_LIBPSL_H |
|||
|
|||
#include <stdio.h> |
|||
#include <time.h> |
|||
|
|||
#define PSL_VERSION "0.20.2" |
|||
#define PSL_VERSION_MAJOR 0 |
|||
#define PSL_VERSION_MINOR 20 |
|||
#define PSL_VERSION_PATCH 2 |
|||
#define PSL_VERSION_NUMBER 0x001402 |
|||
|
|||
#ifndef PSL_API |
|||
#if defined BUILDING_PSL && HAVE_VISIBILITY |
|||
# define PSL_API __attribute__ ((__visibility__("default"))) |
|||
#elif defined BUILDING_PSL && defined _MSC_VER && !defined PSL_STATIC |
|||
# define PSL_API __declspec(dllexport) |
|||
#elif defined _MSC_VER && !defined PSL_STATIC |
|||
# define PSL_API __declspec(dllimport) |
|||
#else |
|||
# define PSL_API |
|||
#endif |
|||
#endif |
|||
|
|||
#ifdef __cplusplus |
|||
extern "C" { |
|||
#endif |
|||
|
|||
/* types for psl_is_public_suffix2() */ |
|||
#define PSL_TYPE_ICANN (1<<0) |
|||
#define PSL_TYPE_PRIVATE (1<<1) |
|||
#define PSL_TYPE_NO_STAR_RULE (1<<2) |
|||
#define PSL_TYPE_ANY (PSL_TYPE_ICANN | PSL_TYPE_PRIVATE) |
|||
|
|||
/**
|
|||
* psl_error_t: |
|||
* @PSL_SUCCESS: Successful return. |
|||
* @PSL_ERR_INVALID_ARG: Invalid argument. |
|||
* @PSL_ERR_CONVERTER: Failed to open libicu utf-16 converter. |
|||
* @PSL_ERR_TO_UTF16: Failed to convert to utf-16. |
|||
* @PSL_ERR_TO_LOWER: Failed to convert utf-16 to lowercase. |
|||
* @PSL_ERR_TO_UTF8: Failed to convert utf-16 to utf-8. |
|||
* @PSL_ERR_NO_MEM: Failed to allocate memory. |
|||
* |
|||
* Return codes for PSL functions. |
|||
* Negative return codes mean failure. |
|||
* Positive values are reserved for non-error return codes. |
|||
* In this header the type is returned by psl_str_to_utf8lower(). |
|||
*/ |
|||
typedef enum { |
|||
PSL_SUCCESS = 0, /* successful return */ |
|||
PSL_ERR_INVALID_ARG = -1, /* invalid argument */ |
|||
PSL_ERR_CONVERTER = -2, /* failed to open libicu utf-16 converter */ |
|||
PSL_ERR_TO_UTF16 = -3, /* failed to convert to utf-16 */ |
|||
PSL_ERR_TO_LOWER = -4, /* failed to convert utf-16 to lowercase */ |
|||
PSL_ERR_TO_UTF8 = -5, /* failed to convert utf-16 to utf-8 */ |
|||
PSL_ERR_NO_MEM = -6 /* failed to allocate memory */ |
|||
} psl_error_t; |
|||
|
|||
typedef struct _psl_ctx_st psl_ctx_t; |
|||
|
|||
/* frees PSL context */ |
|||
PSL_API |
|||
void |
|||
psl_free(psl_ctx_t *psl); |
|||
|
|||
/* frees memory allocated by libpsl routines */ |
|||
PSL_API |
|||
void |
|||
psl_free_string(char *str); |
|||
|
|||
/* loads PSL data from file */ |
|||
PSL_API |
|||
psl_ctx_t * |
|||
psl_load_file(const char *fname); |
|||
|
|||
/* loads PSL data from FILE pointer */ |
|||
PSL_API |
|||
psl_ctx_t * |
|||
psl_load_fp(FILE *fp); |
|||
|
|||
/* retrieves builtin PSL data */ |
|||
PSL_API |
|||
const psl_ctx_t * |
|||
psl_builtin(void); |
|||
|
|||
/* retrieves most recent PSL data */ |
|||
PSL_API |
|||
psl_ctx_t * |
|||
psl_latest(const char *fname); |
|||
|
|||
/* checks whether domain is a public suffix or not */ |
|||
PSL_API |
|||
int |
|||
psl_is_public_suffix(const psl_ctx_t *psl, const char *domain); |
|||
|
|||
/* checks whether domain is a public suffix regarding the type or not */ |
|||
PSL_API |
|||
int |
|||
psl_is_public_suffix2(const psl_ctx_t *psl, const char *domain, int type); |
|||
|
|||
/* checks whether cookie_domain is acceptable for domain or not */ |
|||
PSL_API |
|||
int |
|||
psl_is_cookie_domain_acceptable(const psl_ctx_t *psl, const char *hostname, const char *cookie_domain); |
|||
|
|||
/* returns the longest not registrable domain within 'domain' or NULL if none found */ |
|||
PSL_API |
|||
const char * |
|||
psl_unregistrable_domain(const psl_ctx_t *psl, const char *domain); |
|||
|
|||
/* returns the shortest possible registrable domain part or NULL if domain is not registrable at all */ |
|||
PSL_API |
|||
const char * |
|||
psl_registrable_domain(const psl_ctx_t *psl, const char *domain); |
|||
|
|||
/* convert a string into lowercase UTF-8 */ |
|||
PSL_API |
|||
psl_error_t |
|||
psl_str_to_utf8lower(const char *str, const char *encoding, const char *locale, char **lower); |
|||
|
|||
/* does not include exceptions */ |
|||
PSL_API |
|||
int |
|||
psl_suffix_count(const psl_ctx_t *psl); |
|||
|
|||
/* just counts exceptions */ |
|||
PSL_API |
|||
int |
|||
psl_suffix_exception_count(const psl_ctx_t *psl); |
|||
|
|||
/* just counts wildcards */ |
|||
PSL_API |
|||
int |
|||
psl_suffix_wildcard_count(const psl_ctx_t *psl); |
|||
|
|||
/* returns mtime of PSL source file */ |
|||
PSL_API |
|||
time_t |
|||
psl_builtin_file_time(void); |
|||
|
|||
/* returns SHA1 checksum (hex-encoded, lowercase) of PSL source file */ |
|||
PSL_API |
|||
const char * |
|||
psl_builtin_sha1sum(void); |
|||
|
|||
/* returns file name of PSL source file */ |
|||
PSL_API |
|||
const char * |
|||
psl_builtin_filename(void); |
|||
|
|||
/* returns name of distribution PSL data file */ |
|||
PSL_API |
|||
const char * |
|||
psl_dist_filename(void); |
|||
|
|||
/* returns library version string */ |
|||
PSL_API |
|||
const char * |
|||
psl_get_version(void); |
|||
|
|||
/* checks library version number */ |
|||
PSL_API |
|||
int |
|||
psl_check_version_number(int version); |
|||
|
|||
/* returns whether the built-in data is outdated or not */ |
|||
PSL_API |
|||
int |
|||
psl_builtin_outdated(void); |
|||
|
|||
#ifdef __cplusplus |
|||
} |
|||
#endif |
|||
|
|||
#endif /* LIBPSL_LIBPSL_H */ |
|||
|
|||
#ifdef __cplusplus |
|||
} |
|||
#endif |
@ -0,0 +1,279 @@ |
|||
/* Copyright 2015-2016 The Chromium Authors. All rights reserved.
|
|||
* Use of this source code is governed by a BSD-style license that can be |
|||
* found in the LICENSE.chromium file. |
|||
* |
|||
* Converted to C89 2015 by Tim Rühsen |
|||
*/ |
|||
|
|||
#include <stddef.h> |
|||
|
|||
#if defined(__GNUC__) && defined(__GNUC_MINOR__) |
|||
# define _GCC_VERSION_AT_LEAST(major, minor) ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor))) |
|||
#else |
|||
# define _GCC_VERSION_AT_LEAST(major, minor) 0 |
|||
#endif |
|||
|
|||
#define CHECK_LT(a, b) if ((a) >= b) return 0 |
|||
|
|||
/*
 * Sequence length keyed by the high nibble of a byte. Bytes below 0xC0
 * (ASCII and UTF-8 continuation bytes) map to 0; bytes from 0xC0 upwards
 * map to the total number of bytes in the sequence they start.
 */
static const char multibyte_length_table[16] = {
	0, 0, 0, 0, 0, 0, 0, 0, /* 0x00-0x7F */
	0, 0, 0, 0,             /* 0x80-0xBF */
	2, 2,                   /* 0xC0-0xDF */
	3,                      /* 0xE0-0xEF */
	4                       /* 0xF0-0xFF */
};

/*
 * Length of the multibyte sequence that starts with byte c.
 * Yields 0 when c does not start a multibyte sequence (any byte < 0xC0).
 */
static int GetMultibyteLength(char c)
{
	const unsigned int high_nibble = ((unsigned char)c) >> 4;

	return multibyte_length_table[high_nibble];
}
|||
|
|||
/*
 * Step the dafsa cursor (*pos) one byte forward and keep the key cursor
 * (*key) and the multibyte marker (*multibyte_start) consistent with it.
 */
static void NextPos(const unsigned char** pos,
	const char** key,
	const char** multibyte_start)
{
	*pos += 1;

	if (!*multibyte_start) {
		if (GetMultibyteLength(**key) == 0) {
			/* A plain single byte character was matched: consume it. */
			*key += 1;
		} else {
			/* The multibyte prefix was matched in the dafsa; the key is
			 * not advanced, its bytes are matched from the next round on. */
			*multibyte_start = *key;
		}
		return;
	}

	/* Inside a multibyte sequence: consume one more byte of the key. */
	*key += 1;

	/* Leave multibyte mode once the whole sequence has been consumed. */
	if (*key - *multibyte_start == GetMultibyteLength(**multibyte_start))
		*multibyte_start = 0;
}
|||
|
|||
/*
 * Decode the next offset stored at *pos and add it to *offset.
 * Returns 1 when an offset was read, 0 otherwise (cursor already at end,
 * or fewer than three bytes left).
 */
static int GetNextOffset(const unsigned char** pos,
	const unsigned char* end,
	const unsigned char** offset)
{
	const unsigned char* cur = *pos;
	size_t width;

	if (cur == end)
		return 0;

	/* An offset is always followed by a node to skip and a destination
	 * node, and no object is smaller than one byte, so at least three
	 * bytes must remain. */
	if (cur + 2 >= end)
		return 0;

	if ((cur[0] & 0x60) == 0x60) {
		/* three byte offset */
		*offset += ((cur[0] & 0x1F) << 16) | (cur[1] << 8) | cur[2];
		width = 3;
	} else if ((cur[0] & 0x60) == 0x40) {
		/* two byte offset */
		*offset += ((cur[0] & 0x1F) << 8) | cur[1];
		width = 2;
	} else {
		/* one byte offset */
		*offset += cur[0] & 0x3F;
		width = 1;
	}

	/* A set top bit flags the last offset of the list: jump to end. */
	*pos = (cur[0] & 0x80) ? end : cur + width;

	return 1;
}
|||
|
|||
/*
 * Tell whether the byte at offset is flagged (top bit set) as the last
 * byte of a label. Returns 0 when offset is not below end.
 */
static int IsEOL(const unsigned char* offset, const unsigned char* end)
{
	if (offset >= end)
		return 0;

	return (offset[0] & 0x80) ? 1 : 0;
}
|||
|
|||
/*
 * Compare one dafsa byte (matcher) with the byte key points at.
 * No range check is done here; the caller must already have done it.
 */
static int IsMatchUnchecked(const unsigned char matcher,
	const char* key,
	const char* multibyte_start)
{
	const unsigned char key_byte = (const unsigned char)*key;

	if (!multibyte_start) {
		/* Not in multibyte mode. If key sits on the leading byte of a
		 * multibyte sequence, the dafsa is expected to hold the special
		 * mode-switch byte instead of the character itself. */
		if (GetMultibyteLength(*key) != 0)
			return matcher == 0x1F;

		/* Ordinary single byte character. */
		return matcher == key_byte;
	}

	/* Multibyte mode: the leading byte (which also carries the sequence
	 * length) is compared after flipping 0x80, following bytes after
	 * flipping 0xC0. */
	if (multibyte_start == key)
		return (matcher ^ 0x80) == key_byte;

	return (matcher ^ 0xC0) == key_byte;
}
|||
|
|||
/*
 * Check if the byte at offset matches the first character in key.
 * This version matches characters that are not last in a label: the graph
 * byte is handed to IsMatchUnchecked() unmodified.
 * NOTE(review): CHECK_LT is defined outside this block; presumably it makes
 * the function return early when offset is not below end - confirm.
 */
static int IsMatch(const unsigned char* offset,
                   const unsigned char* end,
                   const char* key,
                   const char* multibyte_start)
{
	CHECK_LT(offset, end);

	return IsMatchUnchecked(offset[0], key, multibyte_start);
}
|||
|
|||
/*
 * Check if the byte at offset matches the first character in key.
 * This version matches characters that are last in a label: bit 0x80 of the
 * graph byte is flipped before the comparison.
 * NOTE(review): CHECK_LT is defined outside this block; presumably it makes
 * the function return early when offset is not below end - confirm.
 */
static int IsEndCharMatch(const unsigned char* offset,
                          const unsigned char* end,
                          const char* key,
                          const char* multibyte_start)
{
	CHECK_LT(offset, end);

	return IsMatchUnchecked((unsigned char)(offset[0] ^ 0x80),
	                        key, multibyte_start);
}
|||
|
|||
/*
 * Read the return value stored at offset.
 *
 * A byte is a return value only when no multibyte sequence is in progress
 * and its top three bits are 100 (i.e. (byte & 0xE0) == 0x80); the value
 * itself is the low four bits.
 *
 * Returns 1 and fills *return_value if a return value could be read,
 * 0 otherwise (in which case *return_value is left untouched).
 * NOTE(review): CHECK_LT is defined outside this block; presumably it makes
 * the function return early when offset is not below end - confirm.
 */
static int GetReturnValue(const unsigned char* offset,
                          const unsigned char* end,
                          const char* multibyte_start,
                          int* return_value)
{
	CHECK_LT(offset, end);

	/* Still inside a multibyte character: cannot be a return value. */
	if (multibyte_start)
		return 0;

	/* Not tagged as a return-value byte. */
	if ((*offset & 0xE0) != 0x80)
		return 0;

	*return_value = *offset & 0x0F;
	return 1;
}
|||
|
|||
/*
 * Looks up the string |key| with length |key_length| in a fixed set of
 * strings. The set of strings must be known at compile time. It is converted
 * to a graph structure named a DAFSA (Deterministic Acyclic Finite State
 * Automaton) by the script psl-make-dafsa during compilation. This permits
 * efficient (in time and space) lookup. The graph generated by psl-make-dafsa
 * takes the form of a constant byte array which should be supplied via the
 * |graph| and |length| parameters.
 *
 * Returns -1 when the key is not found. Otherwise returns the low four bits
 * of the matching return-value node, documented upstream as kDafsaFound or a
 * bitmap of kDafsaExceptionRule, kDafsaWildcardRule and kDafsaPrivateRule
 * ORed together.
 */

/* prototype to skip warning with -Wmissing-prototypes */
int LookupStringInFixedSet(const unsigned char*, size_t, const char*, size_t);

int LookupStringInFixedSet(const unsigned char* graph,
                           size_t length,
                           const char* key,
                           size_t key_length)
{
	const unsigned char* const graph_end = graph + length;
	const char* const key_end = key + key_length;
	const unsigned char* pos = graph;
	const unsigned char* offset = graph;
	const char* multibyte_start = 0;

	while (GetNextOffset(&pos, graph_end, &offset)) {
		/* Node shapes that may start at |offset|:
		 *   char <char>+ end_char offsets
		 *   char <char>+ return value
		 *   char end_char offsets
		 *   char return value
		 *   end_char offsets
		 *   return_value
		 */
		int consumed = 0; /* set once a plain <char> was matched */

		if (key != key_end && !IsEOL(offset, graph_end)) {
			/* Leading <char> is not a match: skip this child. */
			if (!IsMatch(offset, graph_end, key, multibyte_start))
				continue;

			consumed = 1;
			NextPos(&offset, &key, &multibyte_start);

			/* Eat every remaining <char> node. After the first one
			 * matched, a mismatch means the key is not in the set. */
			while (!IsEOL(offset, graph_end) && key != key_end) {
				if (!IsMatch(offset, graph_end, key, multibyte_start))
					return -1;
				NextPos(&offset, &key, &multibyte_start);
			}
		}

		/* What is left at |offset| is either "end_char offsets" or a
		 * return value. If one or more <char> elements were consumed, a
		 * failure to match is terminal; otherwise try the next sibling. */
		if (key == key_end) {
			int value;

			if (GetReturnValue(offset, graph_end, multibyte_start, &value))
				return value;

			/* The DAFSA guarantees that if the first char is a match,
			 * all remaining char elements MUST match if the key is
			 * truly present. */
			if (consumed)
				return -1;
		} else if (IsEndCharMatch(offset, graph_end, key, multibyte_start)) {
			NextPos(&offset, &key, &multibyte_start);
			pos = offset; /* Dive into child */
		} else if (consumed) {
			return -1; /* Unexpected */
		}
	}

	return -1; /* No match */
}
|||
|
|||
/* prototype to skip warning with -Wmissing-prototypes */
int GetUtfMode(const unsigned char *graph, size_t length);

/*
 * Returns 1 when |graph| is non-empty and its final byte is below 0x80,
 * 0 otherwise.
 * NOTE(review): presumably the last byte is a marker written by the DAFSA
 * generator to flag UTF-8 mode - confirm against psl-make-dafsa.
 */
int GetUtfMode(const unsigned char *graph, size_t length)
{
	if (length == 0)
		return 0;

	return (graph[length - 1] & 0x80) == 0;
}
@ -0,0 +1,350 @@ |
|||
#include <Rcpp.h> |
|||
|
|||
#include <regex> |
|||
|
|||
#include "libpsl.h" |
|||
|
|||
using namespace Rcpp; |
|||
|
|||
//' Return the apex/top-private domain from a vector of domains
|
|||
//'
|
|||
//' @md
|
|||
//' @param domains character vector of domains
|
|||
//' @return character vector
|
|||
//' @export
|
|||
// [[Rcpp::export]]
|
|||
CharacterVector apex_domain(CharacterVector domains) { |
|||
|
|||
unsigned int input_size = domains.size(); |
|||
CharacterVector output(input_size); |
|||
char *lower = NULL; |
|||
int rc; |
|||
const char * result; |
|||
const psl_ctx_t *psl = psl_builtin(); |
|||
|
|||
for (unsigned int i = 0; i < input_size; i++) { |
|||
|
|||
// remove trailing period if any
|
|||
std::string cleaned = Rcpp::as<std::string>(domains[i]); |
|||
if (cleaned.length() > 0) { |
|||
if (cleaned.at(cleaned.length()-1) == '.') cleaned.pop_back(); |
|||
} |
|||
|
|||
// lowercase it
|
|||
rc = psl_str_to_utf8lower( |
|||
cleaned.c_str(), |
|||
"utf-8", "en", |
|||
&lower |
|||
); |
|||
|
|||
if (rc == PSL_SUCCESS) { |
|||
result = psl_registrable_domain(psl, lower); |
|||
if (result) { |
|||
output[i] = std::string(result); |
|||
} else { |
|||
output[i] = NA_STRING; |
|||
} |
|||
} else { |
|||
output[i] = NA_STRING; |
|||
} |
|||
|
|||
psl_free_string(lower); |
|||
|
|||
} |
|||
|
|||
return(output); |
|||
|
|||
} |
|||
|
|||
//' Return the public suffix from a vector of domains
|
|||
//'
|
|||
//' @md
|
|||
//' @param domains character vector of domains
|
|||
//' @return character vector
|
|||
//' @export
|
|||
// [[Rcpp::export]]
|
|||
CharacterVector public_suffix(CharacterVector domains) { |
|||
|
|||
unsigned int input_size = domains.size(); |
|||
CharacterVector output(input_size); |
|||
char *lower = NULL; |
|||
int rc; |
|||
const char * result; |
|||
const psl_ctx_t *psl = psl_builtin(); |
|||
|
|||
for (unsigned int i = 0; i < input_size; i++) { |
|||
|
|||
// remove trailing period if any
|
|||
std::string cleaned = Rcpp::as<std::string>(domains[i]); |
|||
if (cleaned.length() > 0) { |
|||
if (cleaned.at(cleaned.length()-1) == '.') cleaned.pop_back(); |
|||
} |
|||
|
|||
// lowercase it
|
|||
rc = psl_str_to_utf8lower( |
|||
cleaned.c_str(), |
|||
"utf-8", "en", |
|||
&lower |
|||
); |
|||
|
|||
if (rc == PSL_SUCCESS) { |
|||
result = psl_unregistrable_domain(psl, lower); |
|||
if (result) { |
|||
output[i] = std::string(result); |
|||
} else { |
|||
output[i] = NA_STRING; |
|||
} |
|||
} else { |
|||
output[i] = NA_STRING; |
|||
} |
|||
|
|||
psl_free_string(lower); |
|||
|
|||
} |
|||
|
|||
return(output); |
|||
|
|||
} |
|||
|
|||
//' Test whether a domain is a public suffix
|
|||
//'
|
|||
//' @md
|
|||
//' @param domains character vector of domains
|
|||
//' @return character vector
|
|||
//' @export
|
|||
// [[Rcpp::export]]
|
|||
std::vector< bool > is_public_suffix(CharacterVector domains) { |
|||
|
|||
unsigned int input_size = domains.size(); |
|||
std::vector < bool > output(input_size); |
|||
char *lower = NULL; |
|||
int rc; |
|||
const psl_ctx_t *psl = psl_builtin(); |
|||
|
|||
for (unsigned int i = 0; i < input_size; i++) { |
|||
|
|||
// remove trailing period if any
|
|||
std::string cleaned = Rcpp::as<std::string>(domains[i]); |
|||
if (cleaned.length() > 0) { |
|||
if (cleaned.at(cleaned.length()-1) == '.') cleaned.pop_back(); |
|||
} |
|||
|
|||
// lowercase it
|
|||
rc = psl_str_to_utf8lower( |
|||
cleaned.c_str(), |
|||
"utf-8", "en", |
|||
&lower |
|||
); |
|||
|
|||
if (rc == PSL_SUCCESS) { |
|||
output[i] = (psl_is_public_suffix(psl, lower) == 1); |
|||
} else { |
|||
output[i] = NA_LOGICAL; |
|||
} |
|||
|
|||
psl_free_string(lower); |
|||
|
|||
} |
|||
|
|||
return(output); |
|||
|
|||
} |
|||
|
|||
//' Separate a domain into component parts
|
|||
//'
|
|||
//' @md
|
|||
//' @param domains character vector of domains
|
|||
//' @return data frame
|
|||
//' @export
|
|||
// [[Rcpp::export]]
|
|||
DataFrame suffix_extract(CharacterVector domains) { |
|||
|
|||
unsigned int input_size = domains.size(); |
|||
|
|||
CharacterVector normalized(input_size); |
|||
CharacterVector subdomain(input_size); |
|||
CharacterVector apex(input_size); |
|||
CharacterVector domain(input_size); |
|||
CharacterVector suffix(input_size); |
|||
|
|||
char *lower = NULL; |
|||
int rc; |
|||
const char * result; |
|||
const psl_ctx_t *psl = psl_builtin(); |
|||
|
|||
for (unsigned int i = 0; i < input_size; i++) { |
|||
|
|||
// remove trailing period if any
|
|||
std::string cleaned = Rcpp::as<std::string>(domains[i]); |
|||
if (cleaned.length() > 0) { |
|||
if (cleaned.at(cleaned.length()-1) == '.') cleaned.pop_back(); |
|||
} |
|||
|
|||
// lowercase it
|
|||
rc = psl_str_to_utf8lower( |
|||
cleaned.c_str(), |
|||
"utf-8", "en", |
|||
&lower |
|||
); |
|||
|
|||
if (rc == PSL_SUCCESS) { |
|||
|
|||
// no dots at end and lowercased
|
|||
normalized[i] = std::string(lower); |
|||
|
|||
// try to get the suffix
|
|||
result = psl_unregistrable_domain(psl, lower); |
|||
if (result) { |
|||
suffix[i] = std::string(result); |
|||
} else { |
|||
suffix[i] = NA_STRING; |
|||
} |
|||
|
|||
// try to get the apex
|
|||
result = psl_registrable_domain(psl, lower); |
|||
if (result) { |
|||
apex[i] = std::string(result); |
|||
} else { |
|||
apex[i] = NA_STRING; |
|||
} |
|||
|
|||
if ((suffix[i] != NA_STRING) && (apex[i] != NA_STRING)) { |
|||
|
|||
std::regex trail_suf("[\\.]*" + Rcpp::as<std::string>(suffix[i]) + "$"); |
|||
domain[i] = std::regex_replace( |
|||
Rcpp::as<std::string>(apex[i]), |
|||
trail_suf, "" |
|||
); |
|||
|
|||
std::regex apex_suf("[\\.]*" + Rcpp::as<std::string>(apex[i]) + "$"); |
|||
subdomain[i] = std::regex_replace( |
|||
Rcpp::as<std::string>(normalized[i]), |
|||
apex_suf, "" |
|||
); |
|||
|
|||
} else { |
|||
domain[i] = NA_STRING; |
|||
subdomain[i] = NA_STRING; |
|||
} |
|||
|
|||
} else { |
|||
normalized[i] = NA_STRING; |
|||
subdomain[i] = NA_STRING; |
|||
apex[i] = NA_STRING; |
|||
domain[i] = NA_STRING; |
|||
suffix[i] = NA_STRING; |
|||
} |
|||
|
|||
psl_free_string(lower); |
|||
|
|||
} |
|||
|
|||
DataFrame out = DataFrame::create( |
|||
_["orig"] = domains, |
|||
_["normalized"] = normalized, |
|||
_["subdomain"] = subdomain, |
|||
_["apex"] = apex, |
|||
_["domain"] = domain, |
|||
_["suffix"] = suffix, |
|||
_["stringsAsFactors"] = false |
|||
); |
|||
|
|||
out.attr("class") = CharacterVector::create("tbl_df", "tbl", "data.frame"); |
|||
|
|||
return(out); |
|||
|
|||
} |
|||
|
|||
//' Separate a domain into component parts
|
|||
//'
|
|||
//' Compatibility function for those using `urltools::suffix_extract()`
|
|||
//'
|
|||
//' @md
|
|||
//' @param domains character vector of domains
|
|||
//' @return data frame
|
|||
//' @export
|
|||
// [[Rcpp::export]]
|
|||
DataFrame suffix_extract2(CharacterVector domains) { |
|||
|
|||
unsigned int input_size = domains.size(); |
|||
|
|||
CharacterVector subdomain(input_size); |
|||
CharacterVector domain(input_size); |
|||
CharacterVector suffix(input_size); |
|||
|
|||
char *lower = NULL; |
|||
int rc; |
|||
const char * result; |
|||
const psl_ctx_t *psl = psl_builtin(); |
|||
|
|||
for (unsigned int i = 0; i < input_size; i++) { |
|||
|
|||
std::string cleaned = Rcpp::as<std::string>(domains[i]); |
|||
if (cleaned.length() > 0) { |
|||
if (cleaned.at(cleaned.length()-1) == '.') cleaned.pop_back(); |
|||
} |
|||
|
|||
// lowercase it
|
|||
rc = psl_str_to_utf8lower( |
|||
cleaned.c_str(), |
|||
"utf-8", "en", |
|||
&lower |
|||
); |
|||
|
|||
if (rc == PSL_SUCCESS) { |
|||
|
|||
std::string normalized(lower); |
|||
|
|||
// try to get the suffix
|
|||
result = psl_unregistrable_domain(psl, lower); |
|||
|
|||
if (result) { |
|||
|
|||
std::string suf = std::string(result); |
|||
suffix[i] = suf; |
|||
|
|||
result = psl_registrable_domain(psl, lower); |
|||
|
|||
if (result) { |
|||
|
|||
std::string apex(result); |
|||
|
|||
std::regex trail_suf("[\\.]*" + suf + "$"); |
|||
std::regex apex_suf("[\\.]*" + apex + "$"); |
|||
|
|||
domain[i] = std::regex_replace(apex, trail_suf, ""); |
|||
subdomain[i] = std::regex_replace(normalized, apex_suf, ""); |
|||
|
|||
} else { |
|||
subdomain[i] = NA_STRING; |
|||
domain[i] = NA_STRING; |
|||
} |
|||
} else { |
|||
subdomain[i] = NA_STRING; |
|||
suffix[i] = NA_STRING; |
|||
domain[i] = NA_STRING; |
|||
} |
|||
|
|||
} else { |
|||
subdomain[i] = NA_STRING; |
|||
domain[i] = NA_STRING; |
|||
suffix[i] = NA_STRING; |
|||
} |
|||
|
|||
psl_free_string(lower); |
|||
|
|||
} |
|||
|
|||
DataFrame out = DataFrame::create( |
|||
_["host"] = domains, |
|||
_["subdomain"] = subdomain, |
|||
_["domain"] = domain, |
|||
_["suffix"] = suffix, |
|||
_["stringsAsFactors"] = false |
|||
); |
|||
|
|||
out.attr("class") = CharacterVector::create("tbl_df", "tbl", "data.frame"); |
|||
|
|||
return(out); |
|||
|
|||
} |
File diff suppressed because it is too large
File diff suppressed because it is too large
Loading…
Reference in new issue