Bladeren bron

Faster than urltools

latest
boB Rudis 2 jaar geleden
bovenliggende
commit
ce1b5c0bc5
Geen bekende sleutel gevonden voor deze handtekening in de database. GPG-sleutel-ID: 1D7529BE14E2BBA9
5 gewijzigde bestanden met 107 toevoegingen en 103 verwijderingen
  1. +1
    -1
      R/RcppExports.R
  2. +61
    -14
      README.md
  3. BIN
      README_files/figure-gfm/bench-1.png
  4. +1
    -1
      src/RcppExports.cpp
  5. +44
    -87
      src/psl-main.cpp

+ 1
- 1
R/RcppExports.R Bestand weergeven

@@ -41,7 +41,7 @@ suffix_extract <- function(domains) {
.Call('_psl_suffix_extract', PACKAGE = 'psl', domains)
}

#' Separate a domain into component parts
#' Separate a domain into component parts (`urltools` compatibility function)
#'
#' Compatibility function for those using `urltools::suffix_extract()`
#'


+ 61
- 14
README.md Bestand weergeven

@@ -116,20 +116,67 @@ suffix_extract(doms)
## # ... with 50 more rows

suffix_extract2(doms) # urltools compatible output
## # A tibble: 60 x 4
## host subdomain domain suffix
## <chr> <chr> <chr> <chr>
## 1 "" <NA> <NA> ""
## 2 com <NA> <NA> com
## 3 example.com "" example com
## 4 www.example.com www example com
## 5 .com <NA> <NA> .com
## 6 .example <NA> <NA> .example
## 7 .example.com <NA> <NA> com
## 8 .example.example <NA> <NA> example
## 9 example <NA> <NA> example
## 10 example.example "" example example
## # ... with 50 more rows
## host subdomain domain suffix
## 1 <NA> <NA>
## 2 com <NA> <NA> com
## 3 example.com example com
## 4 www.example.com www example com
## 5 .com <NA> <NA> .com
## 6 .example <NA> <NA> .example
## 7 .example.com <NA> <NA> com
## 8 .example.example <NA> <NA> example
## 9 example <NA> <NA> example
## 10 example.example example example
## 11 b.example.example b example example
## 12 a.b.example.example a.b example example
## 13 biz <NA> <NA> biz
## 14 domain.biz domain biz
## 15 b.domain.biz b domain biz
## 16 a.b.domain.biz a.b domain biz
## 17 com <NA> <NA> com
## 18 example.com example com
## 19 b.example.com b example com
## 20 a.b.example.com a.b example com
## 21 uk.com <NA> <NA> uk.com
## 22 example.uk.com example uk.com
## 23 b.example.uk.com b example uk.com
## 24 a.b.example.uk.com a.b example uk.com
## 25 test.ac test ac
## 26 cy <NA> <NA> cy
## 27 c.cy c cy
## 28 b.c.cy b c cy
## 29 a.b.c.cy a.b c cy
## 30 jp <NA> <NA> jp
## 31 test.jp test jp
## 32 www.test.jp www test jp
## 33 ac.jp <NA> <NA> ac.jp
## 34 test.ac.jp test ac.jp
## 35 www.test.ac.jp www test ac.jp
## 36 kyoto.jp <NA> <NA> kyoto.jp
## 37 test.kyoto.jp test kyoto.jp
## 38 ide.kyoto.jp <NA> <NA> ide.kyoto.jp
## 39 b.ide.kyoto.jp b ide.kyoto.jp
## 40 a.b.ide.kyoto.jp a b ide.kyoto.jp
## 41 c.kobe.jp <NA> <NA> c.kobe.jp
## 42 b.c.kobe.jp b c.kobe.jp
## 43 a.b.c.kobe.jp a b c.kobe.jp
## 44 city.kobe.jp city kobe.jp
## 45 www.city.kobe.jp www city kobe.jp
## 46 ck <NA> <NA> ck
## 47 test.ck <NA> <NA> test.ck
## 48 b.test.ck b test.ck
## 49 a.b.test.ck a b test.ck
## 50 www.ck www ck
## 51 www.www.ck www www ck
## 52 us <NA> <NA> us
## 53 test.us test us
## 54 www.test.us www test us
## 55 ak.us <NA> <NA> ak.us
## 56 test.ak.us test ak.us
## 57 www.test.ak.us www test ak.us
## 58 k12.ak.us <NA> <NA> k12.ak.us
## 59 test.k12.ak.us test k12.ak.us
## 60 www.test.k12.ak.us www test k12.ak.us
```

``` r


BIN
README_files/figure-gfm/bench-1.png Bestand weergeven

Before After
Width: 1920  |  Height: 960  |  Size: 74KB Width: 1920  |  Height: 960  |  Size: 79KB

+ 1
- 1
src/RcppExports.cpp Bestand weergeven

@@ -28,7 +28,7 @@ BEGIN_RCPP
END_RCPP
}
// is_public_suffix
std::vector< bool > is_public_suffix(CharacterVector domains);
LogicalVector is_public_suffix(CharacterVector domains);
RcppExport SEXP _psl_is_public_suffix(SEXP domainsSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;


+ 44
- 87
src/psl-main.cpp Bestand weergeven

@@ -1,5 +1,4 @@
#include <Rcpp.h>
#include <regex>
#include <libpsl.h>

using namespace Rcpp;
@@ -24,24 +23,14 @@ CharacterVector apex_domain(CharacterVector domains) {

// remove trailing period if any
std::string cleaned = Rcpp::as<std::string>(domains[i]);
if (cleaned.length() > 0) {
if (cleaned.at(cleaned.length()-1) == '.') cleaned.pop_back();
}
if ((cleaned.length() > 0) && (cleaned.at(cleaned.length()-1) == '.')) cleaned.pop_back();

// lowercase it
rc = psl_str_to_utf8lower(
cleaned.c_str(),
"utf-8", "en",
&lower
);
rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);

if (rc == PSL_SUCCESS) {
result = psl_registrable_domain(psl, lower);
if (result) {
output[i] = std::string(result);
} else {
output[i] = NA_STRING;
}
output[i] = (result) ? String(result) : NA_STRING;
} else {
output[i] = NA_STRING;
}
@@ -74,24 +63,14 @@ CharacterVector public_suffix(CharacterVector domains) {

// remove trailing period if any
std::string cleaned = Rcpp::as<std::string>(domains[i]);
if (cleaned.length() > 0) {
if (cleaned.at(cleaned.length()-1) == '.') cleaned.pop_back();
}
if ((cleaned.length() > 0) && (cleaned.at(cleaned.length()-1) == '.')) cleaned.pop_back();

// lowercase it
rc = psl_str_to_utf8lower(
cleaned.c_str(),
"utf-8", "en",
&lower
);
rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);

if (rc == PSL_SUCCESS) {
result = psl_unregistrable_domain(psl, lower);
if (result) {
output[i] = std::string(result);
} else {
output[i] = NA_STRING;
}
output[i] = (result) ? String(result) : NA_STRING;
} else {
output[i] = NA_STRING;
}
@@ -111,10 +90,10 @@ CharacterVector public_suffix(CharacterVector domains) {
//' @return logical vector (`NA` where the input could not be lowercased/normalized)
//' @export
// [[Rcpp::export]]
std::vector< bool > is_public_suffix(CharacterVector domains) {
LogicalVector is_public_suffix(CharacterVector domains) {

unsigned int input_size = domains.size();
std::vector < bool > output(input_size);
LogicalVector output(input_size);
char *lower = NULL;
int rc;
const psl_ctx_t *psl = psl_builtin();
@@ -123,22 +102,12 @@ std::vector< bool > is_public_suffix(CharacterVector domains) {

// remove trailing period if any
std::string cleaned = Rcpp::as<std::string>(domains[i]);
if (cleaned.length() > 0) {
if (cleaned.at(cleaned.length()-1) == '.') cleaned.pop_back();
}
if ((cleaned.length() > 0) && (cleaned.at(cleaned.length()-1) == '.')) cleaned.pop_back();

// lowercase it
rc = psl_str_to_utf8lower(
cleaned.c_str(),
"utf-8", "en",
&lower
);
rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);

if (rc == PSL_SUCCESS) {
output[i] = (psl_is_public_suffix(psl, lower) == 1);
} else {
output[i] = NA_LOGICAL;
}
output[i] =(rc == PSL_SUCCESS) ? (psl_is_public_suffix(psl, lower) == 1) : NA_LOGICAL;

psl_free_string(lower);

@@ -174,51 +143,41 @@ DataFrame suffix_extract(CharacterVector domains) {

// remove trailing period if any
std::string cleaned = Rcpp::as<std::string>(domains[i]);
if (cleaned.length() > 0) {
if (cleaned.at(cleaned.length()-1) == '.') cleaned.pop_back();
}
if ((cleaned.length() > 0) && (cleaned.at(cleaned.length()-1) == '.')) cleaned.pop_back();

// lowercase it
rc = psl_str_to_utf8lower(
cleaned.c_str(),
"utf-8", "en",
&lower
);
rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);

if (rc == PSL_SUCCESS) {

// no dots at end and lowercased
normalized[i] = std::string(lower);
std::string normd = std::string(lower);
normalized[i] = normd;

// try to get the suffix
result = psl_unregistrable_domain(psl, lower);
if (result) {
suffix[i] = std::string(result);
} else {
suffix[i] = NA_STRING;
}
std::string suf = std::string(result);
suffix[i] = (result) ? String(result) : NA_STRING;

// try to get the apex
result = psl_registrable_domain(psl, lower);
apex[i] = (result) ? String(result) : NA_STRING;

if (result) {
apex[i] = std::string(result);
} else {
apex[i] = NA_STRING;
}

if ((suffix[i] != NA_STRING) && (apex[i] != NA_STRING)) {
std::string apx = std::string(result);

int suf_pos = apx.rfind(suf);
std::string dom = apx.substr(0, suf_pos);

int apex_pos = normd.rfind(apx);
std::string subdom = (apex_pos == 0) ? "" : normd.substr(0, apex_pos);

std::regex trail_suf("[\\.]*" + Rcpp::as<std::string>(suffix[i]) + "$");
domain[i] = std::regex_replace(
Rcpp::as<std::string>(apex[i]),
trail_suf, ""
);
if ((dom.length() > 0) && (dom.at(dom.length()-1) == '.')) dom.pop_back();
if ((subdom.length() > 0) && (subdom.at(subdom.length()-1) == '.')) subdom.pop_back();

std::regex apex_suf("[\\.]*" + Rcpp::as<std::string>(apex[i]) + "$");
subdomain[i] = std::regex_replace(
Rcpp::as<std::string>(normalized[i]),
apex_suf, ""
);
domain[i] = dom;
subdomain[i] = subdom;

} else {
domain[i] = NA_STRING;
@@ -253,7 +212,7 @@ DataFrame suffix_extract(CharacterVector domains) {

}

//' Separate a domain into component parts
//' Separate a domain into component parts (`urltools` compatibility function)
//'
//' Compatibility function for those using `urltools::suffix_extract()`
//'
@@ -272,22 +231,16 @@ DataFrame suffix_extract2(CharacterVector domains) {

char *lower = NULL;
int rc;
const char * result;
const char *result;
const psl_ctx_t *psl = psl_builtin();

for (unsigned int i = 0; i < input_size; i++) {

std::string cleaned = Rcpp::as<std::string>(domains[i]);
if (cleaned.length() > 0) {
if (cleaned.at(cleaned.length()-1) == '.') cleaned.pop_back();
}
if ((cleaned.length() > 0) && (cleaned.at(cleaned.length()-1) == '.')) cleaned.pop_back();

// lowercase it
rc = psl_str_to_utf8lower(
cleaned.c_str(),
"utf-8", "en",
&lower
);
rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);

if (rc == PSL_SUCCESS) {

@@ -307,11 +260,17 @@ DataFrame suffix_extract2(CharacterVector domains) {

std::string apex(result);

std::regex trail_suf("[\\.]*" + suf + "$");
std::regex apex_suf("[\\.]*" + apex + "$");
int suf_pos = apex.rfind(suf);
std::string dom = apex.substr(0, suf_pos);

int apex_pos = normalized.rfind(apex);
std::string subdom = (apex_pos == 0) ? "" : normalized.substr(0, apex_pos);

domain[i] = std::regex_replace(apex, trail_suf, "");
subdomain[i] = std::regex_replace(normalized, apex_suf, "");
if ((dom.length() > 0) && (dom.at(dom.length()-1) == '.')) dom.pop_back();
if ((subdom.length() > 0) && (subdom.at(subdom.length()-1) == '.')) subdom.pop_back();

domain[i] = (dom);
subdomain[i] = (subdom);

} else {
subdomain[i] = NA_STRING;
@@ -341,8 +300,6 @@ DataFrame suffix_extract2(CharacterVector domains) {
_["stringsAsFactors"] = false
);

out.attr("class") = CharacterVector::create("tbl_df", "tbl", "data.frame");

return(out);

}

Laden…
Annuleren
Opslaan