diff --git a/R/RcppExports.R b/R/RcppExports.R index 91d718b..d895ae3 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -41,7 +41,7 @@ suffix_extract <- function(domains) { .Call('_psl_suffix_extract', PACKAGE = 'psl', domains) } -#' Separate a domain into component parts +#' Separate a domain into component parts (`urltools` compatibility function) #' #' Compatibility function for those using `urltools::suffix_extract()` #' diff --git a/README.md b/README.md index fec7905..c3d7514 100644 --- a/README.md +++ b/README.md @@ -116,20 +116,67 @@ suffix_extract(doms) ## # ... with 50 more rows suffix_extract2(doms) # urltools compatible output -## # A tibble: 60 x 4 -## host subdomain domain suffix -## -## 1 "" "" -## 2 com com -## 3 example.com "" example com -## 4 www.example.com www example com -## 5 .com .com -## 6 .example .example -## 7 .example.com com -## 8 .example.example example -## 9 example example -## 10 example.example "" example example -## # ... with 50 more rows +## host subdomain domain suffix +## 1 +## 2 com com +## 3 example.com example com +## 4 www.example.com www example com +## 5 .com .com +## 6 .example .example +## 7 .example.com com +## 8 .example.example example +## 9 example example +## 10 example.example example example +## 11 b.example.example b example example +## 12 a.b.example.example a.b example example +## 13 biz biz +## 14 domain.biz domain biz +## 15 b.domain.biz b domain biz +## 16 a.b.domain.biz a.b domain biz +## 17 com com +## 18 example.com example com +## 19 b.example.com b example com +## 20 a.b.example.com a.b example com +## 21 uk.com uk.com +## 22 example.uk.com example uk.com +## 23 b.example.uk.com b example uk.com +## 24 a.b.example.uk.com a.b example uk.com +## 25 test.ac test ac +## 26 cy cy +## 27 c.cy c cy +## 28 b.c.cy b c cy +## 29 a.b.c.cy a.b c cy +## 30 jp jp +## 31 test.jp test jp +## 32 www.test.jp www test jp +## 33 ac.jp ac.jp +## 34 test.ac.jp test ac.jp +## 35 www.test.ac.jp www test ac.jp +## 36 kyoto.jp kyoto.jp +## 37 test.kyoto.jp test kyoto.jp +## 38 ide.kyoto.jp ide.kyoto.jp +## 39 b.ide.kyoto.jp b ide.kyoto.jp +## 40 a.b.ide.kyoto.jp a b ide.kyoto.jp +## 41 c.kobe.jp c.kobe.jp +## 42 b.c.kobe.jp b c.kobe.jp +## 43 a.b.c.kobe.jp a b c.kobe.jp +## 44 city.kobe.jp city kobe.jp +## 45 www.city.kobe.jp www city kobe.jp +## 46 ck ck +## 47 test.ck test.ck +## 48 b.test.ck b test.ck +## 49 a.b.test.ck a b test.ck +## 50 www.ck www ck +## 51 www.www.ck www www ck +## 52 us us +## 53 test.us test us +## 54 www.test.us www test us +## 55 ak.us ak.us +## 56 test.ak.us test ak.us +## 57 www.test.ak.us www test ak.us +## 58 k12.ak.us k12.ak.us +## 59 test.k12.ak.us test k12.ak.us +## 60 www.test.k12.ak.us www test k12.ak.us ``` ``` r diff --git a/README_files/figure-gfm/bench-1.png b/README_files/figure-gfm/bench-1.png index c95c841..94c6c4f 100644 Binary files a/README_files/figure-gfm/bench-1.png and b/README_files/figure-gfm/bench-1.png differ diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index a0d259b..293320e 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -28,7 +28,7 @@ BEGIN_RCPP END_RCPP } // is_public_suffix -std::vector< bool > is_public_suffix(CharacterVector domains); +LogicalVector is_public_suffix(CharacterVector domains); RcppExport SEXP _psl_is_public_suffix(SEXP domainsSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; diff --git a/src/psl-main.cpp b/src/psl-main.cpp index 9e68e45..f9d0883 100644 --- a/src/psl-main.cpp +++ b/src/psl-main.cpp @@ -1,5 +1,4 @@ #include -#include #include using namespace Rcpp; @@ -24,24 +23,14 @@ CharacterVector apex_domain(CharacterVector domains) { // remove trailing period if any std::string cleaned = Rcpp::as(domains[i]); - if (cleaned.length() > 0) { - if (cleaned.at(cleaned.length()-1) == '.') cleaned.pop_back(); - } + if ((cleaned.length() > 0) && (cleaned.at(cleaned.length()-1) == '.')) cleaned.pop_back(); // lowercase it - rc = psl_str_to_utf8lower( - cleaned.c_str(), - "utf-8", "en", - &lower - ); + rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower); if (rc == PSL_SUCCESS) { result = psl_registrable_domain(psl, lower); - if (result) { - output[i] = std::string(result); - } else { - output[i] = NA_STRING; - } + output[i] = (result) ? String(result) : NA_STRING; } else { output[i] = NA_STRING; } @@ -74,24 +63,14 @@ CharacterVector public_suffix(CharacterVector domains) { // remove trailing period if any std::string cleaned = Rcpp::as(domains[i]); - if (cleaned.length() > 0) { - if (cleaned.at(cleaned.length()-1) == '.') cleaned.pop_back(); - } + if ((cleaned.length() > 0) && (cleaned.at(cleaned.length()-1) == '.')) cleaned.pop_back(); // lowercase it - rc = psl_str_to_utf8lower( - cleaned.c_str(), - "utf-8", "en", - &lower - ); + rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower); if (rc == PSL_SUCCESS) { result = psl_unregistrable_domain(psl, lower); - if (result) { - output[i] = std::string(result); - } else { - output[i] = NA_STRING; - } + output[i] = (result) ? String(result) : NA_STRING; } else { output[i] = NA_STRING; } @@ -111,10 +90,10 @@ CharacterVector public_suffix(CharacterVector domains) { //' @return character vector //' @export // [[Rcpp::export]] -std::vector< bool > is_public_suffix(CharacterVector domains) { +LogicalVector is_public_suffix(CharacterVector domains) { unsigned int input_size = domains.size(); - std::vector < bool > output(input_size); + LogicalVector output(input_size); char *lower = NULL; int rc; const psl_ctx_t *psl = psl_builtin(); @@ -123,22 +102,12 @@ std::vector< bool > is_public_suffix(CharacterVector domains) { // remove trailing period if any std::string cleaned = Rcpp::as(domains[i]); - if (cleaned.length() > 0) { - if (cleaned.at(cleaned.length()-1) == '.') cleaned.pop_back(); - } + if ((cleaned.length() > 0) && (cleaned.at(cleaned.length()-1) == '.')) cleaned.pop_back(); // lowercase it - rc = psl_str_to_utf8lower( - cleaned.c_str(), - "utf-8", "en", - &lower - ); + rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower); - if (rc == PSL_SUCCESS) { - output[i] = (psl_is_public_suffix(psl, lower) == 1); - } else { - output[i] = NA_LOGICAL; - } + output[i] =(rc == PSL_SUCCESS) ? (psl_is_public_suffix(psl, lower) == 1) : NA_LOGICAL; psl_free_string(lower); @@ -174,51 +143,41 @@ DataFrame suffix_extract(CharacterVector domains) { // remove trailing period if any std::string cleaned = Rcpp::as(domains[i]); - if (cleaned.length() > 0) { - if (cleaned.at(cleaned.length()-1) == '.') cleaned.pop_back(); - } + if ((cleaned.length() > 0) && (cleaned.at(cleaned.length()-1) == '.')) cleaned.pop_back(); // lowercase it - rc = psl_str_to_utf8lower( - cleaned.c_str(), - "utf-8", "en", - &lower - ); + rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower); if (rc == PSL_SUCCESS) { // no dots at end and lowercased - normalized[i] = std::string(lower); + std::string normd = std::string(lower); + normalized[i] = normd; // try to get the suffix result = psl_unregistrable_domain(psl, lower); - if (result) { - suffix[i] = std::string(result); - } else { - suffix[i] = NA_STRING; - } + std::string suf = std::string(result); + suffix[i] = (result) ? String(result) : NA_STRING; // try to get the apex result = psl_registrable_domain(psl, lower); + apex[i] = (result) ? String(result) : NA_STRING; + if (result) { - apex[i] = std::string(result); - } else { - apex[i] = NA_STRING; - } - if ((suffix[i] != NA_STRING) && (apex[i] != NA_STRING)) { + std::string apx = std::string(result); + + int suf_pos = apx.rfind(suf); + std::string dom = apx.substr(0, suf_pos); + + int apex_pos = normd.rfind(apx); + std::string subdom = (apex_pos == 0) ? "" : normd.substr(0, apex_pos); - std::regex trail_suf("[\\.]*" + Rcpp::as(suffix[i]) + "$"); - domain[i] = std::regex_replace( - Rcpp::as(apex[i]), - trail_suf, "" - ); + if ((dom.length() > 0) && (dom.at(dom.length()-1) == '.')) dom.pop_back(); + if ((subdom.length() > 0) && (subdom.at(subdom.length()-1) == '.')) subdom.pop_back(); - std::regex apex_suf("[\\.]*" + Rcpp::as(apex[i]) + "$"); - subdomain[i] = std::regex_replace( - Rcpp::as(normalized[i]), - apex_suf, "" - ); + domain[i] = dom; + subdomain[i] = subdom; } else { domain[i] = NA_STRING; @@ -253,7 +212,7 @@ DataFrame suffix_extract(CharacterVector domains) { } -//' Separate a domain into component parts +//' Separate a domain into component parts (`urltools` compatibility function) //' //' Compatibility function for those using `urltools::suffix_extract()` //' @@ -272,22 +231,16 @@ DataFrame suffix_extract2(CharacterVector domains) { char *lower = NULL; int rc; - const char * result; + const char *result; const psl_ctx_t *psl = psl_builtin(); for (unsigned int i = 0; i < input_size; i++) { std::string cleaned = Rcpp::as(domains[i]); - if (cleaned.length() > 0) { - if (cleaned.at(cleaned.length()-1) == '.') cleaned.pop_back(); - } + if ((cleaned.length() > 0) && (cleaned.at(cleaned.length()-1) == '.')) cleaned.pop_back(); // lowercase it - rc = psl_str_to_utf8lower( - cleaned.c_str(), - "utf-8", "en", - &lower - ); + rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower); if (rc == PSL_SUCCESS) { @@ -307,11 +260,17 @@ DataFrame suffix_extract2(CharacterVector domains) { std::string apex(result); - std::regex trail_suf("[\\.]*" + suf + "$"); - std::regex apex_suf("[\\.]*" + apex + "$"); + int suf_pos = apex.rfind(suf); + std::string dom = apex.substr(0, suf_pos); + + int apex_pos = normalized.rfind(apex); + std::string subdom = (apex_pos == 0) ? "" : normalized.substr(0, apex_pos); - domain[i] = std::regex_replace(apex, trail_suf, ""); - subdomain[i] = std::regex_replace(normalized, apex_suf, ""); + if ((dom.length() > 0) && (dom.at(dom.length()-1) == '.')) dom.pop_back(); + if ((subdom.length() > 0) && (subdom.at(subdom.length()-1) == '.')) subdom.pop_back(); + + domain[i] = (dom); + subdomain[i] = (subdom); } else { subdomain[i] = NA_STRING; @@ -341,8 +300,6 @@ DataFrame suffix_extract2(CharacterVector domains) { _["stringsAsFactors"] = false ); - out.attr("class") = CharacterVector::create("tbl_df", "tbl", "data.frame"); - return(out); }