diff --git a/README.Rmd b/README.Rmd index 5f23f74..dbe9de8 100644 --- a/README.Rmd +++ b/README.Rmd @@ -34,8 +34,8 @@ The following functions are implemented: You need a recent `libpsl`. - macOS: `brew install libpsl` -- Debian/Ubuntu-ish: Many repos have old versions so build from source and run `ldconfig` afterwards -- Windows: Just use `urltools::suffix_extract()` +- Debian/Ubuntu-ish: Many repos have old versions so it's _highly_ suggested that you build from source and ensure the library & header files are accessible +- Windows: Just use `urltools::suffix_extract()` until winlibs are available for psl ## Installation @@ -83,7 +83,7 @@ is_public_suffix(doms) suffix_extract(doms) -suffix_extract2(doms) # urltools compatible output +str(suffix_extract2(doms)) # urltools compatible output ``` ```{r bench, message=FALSE, warning=FALSE, error=FALSE, fig.width=10, fig.retina=2} diff --git a/README.md b/README.md index c3d7514..1390f93 100644 --- a/README.md +++ b/README.md @@ -32,9 +32,11 @@ The following functions are implemented: You need a recent `libpsl`. - macOS: `brew install libpsl` - - Debian/Ubuntu-ish: Many repos have old versions so build from source - and run `ldconfig` afterwards - - Windows: Just use `urltools::suffix_extract()` + - Debian/Ubuntu-ish: Many repos have old versions so it’s *highly* + suggested that you build from source and ensure the library & header + files are accessible + - Windows: Just use `urltools::suffix_extract()` until winlibs are + available for psl ## Installation @@ -83,7 +85,7 @@ apex_domain(doms) ## [55] NA "test.ak.us" "test.ak.us" NA "test.k12.ak.us" "test.k12.ak.us" public_suffix(doms) -## [1] "" "com" "com" "com" ".com" ".example" "com" +## [1] "" "com" "com" "com" "com" "example" "com" ## [8] "example" "example" "example" "example" "example" "biz" "biz" ## [15] "biz" "biz" "com" "com" "com" "com" "uk.com" ## [22] "uk.com" "uk.com" "uk.com" "ac" "cy" "cy" "cy" @@ -101,82 +103,26 @@ is_public_suffix(doms) suffix_extract(doms) ## # A tibble: 60 x 6 -## orig normalized subdomain apex domain suffix -## -## 1 "" "" "" -## 2 com com com -## 3 example.com example.com "" example.com example com -## 4 www.example.com www.example.com www example.com example com -## 5 .com .com .com -## 6 .example .example .example -## 7 .example.com .example.com com -## 8 .example.example .example.example example -## 9 example example example -## 10 example.example example.example "" example.example example example +## orig normalized subdomain apex domain suffix +## +## 1 "" "" "" +## 2 com com com +## 3 example.com example.com "" example.com example com +## 4 www.example.com www.example.com www example.com example com +## 5 .com .com com +## 6 .example .example example +## 7 .example.com .example.com com +## 8 .example.example .example.example example +## 9 example example example +## 10 example.example example.example "" example.example example example ## # ... with 50 more rows -suffix_extract2(doms) # urltools compatible output -## host subdomain domain suffix -## 1 -## 2 com com -## 3 example.com example com -## 4 www.example.com www example com -## 5 .com .com -## 6 .example .example -## 7 .example.com com -## 8 .example.example example -## 9 example example -## 10 example.example example example -## 11 b.example.example b example example -## 12 a.b.example.example a.b example example -## 13 biz biz -## 14 domain.biz domain biz -## 15 b.domain.biz b domain biz -## 16 a.b.domain.biz a.b domain biz -## 17 com com -## 18 example.com example com -## 19 b.example.com b example com -## 20 a.b.example.com a.b example com -## 21 uk.com uk.com -## 22 example.uk.com example uk.com -## 23 b.example.uk.com b example uk.com -## 24 a.b.example.uk.com a.b example uk.com -## 25 test.ac test ac -## 26 cy cy -## 27 c.cy c cy -## 28 b.c.cy b c cy -## 29 a.b.c.cy a.b c cy -## 30 jp jp -## 31 test.jp test jp -## 32 www.test.jp www test jp -## 33 ac.jp ac.jp -## 34 test.ac.jp test ac.jp -## 35 www.test.ac.jp www test ac.jp -## 36 kyoto.jp kyoto.jp -## 37 test.kyoto.jp test kyoto.jp -## 38 ide.kyoto.jp ide.kyoto.jp -## 39 b.ide.kyoto.jp b ide.kyoto.jp -## 40 a.b.ide.kyoto.jp a b ide.kyoto.jp -## 41 c.kobe.jp c.kobe.jp -## 42 b.c.kobe.jp b c.kobe.jp -## 43 a.b.c.kobe.jp a b c.kobe.jp -## 44 city.kobe.jp city kobe.jp -## 45 www.city.kobe.jp www city kobe.jp -## 46 ck ck -## 47 test.ck test.ck -## 48 b.test.ck b test.ck -## 49 a.b.test.ck a b test.ck -## 50 www.ck www ck -## 51 www.www.ck www www ck -## 52 us us -## 53 test.us test us -## 54 www.test.us www test us -## 55 ak.us ak.us -## 56 test.ak.us test ak.us -## 57 www.test.ak.us www test ak.us -## 58 k12.ak.us k12.ak.us -## 59 test.k12.ak.us test k12.ak.us -## 60 www.test.k12.ak.us www test k12.ak.us +str(suffix_extract2(doms)) # urltools compatible output +## 'data.frame': 60 obs. of 4 variables: +## $ host : chr "" "com" "example.com" "www.example.com" ... +## $ subdomain: chr NA NA "" "www" ... +## $ domain : chr NA NA "example" "example" ... +## $ suffix : chr "" "com" "com" "com" ... ``` ``` r diff --git a/README_files/figure-gfm/bench-1.png b/README_files/figure-gfm/bench-1.png index 94c6c4f..f1a4513 100644 Binary files a/README_files/figure-gfm/bench-1.png and b/README_files/figure-gfm/bench-1.png differ diff --git a/src/psl-main.cpp b/src/psl-main.cpp index f9d0883..d4ba237 100644 --- a/src/psl-main.cpp +++ b/src/psl-main.cpp @@ -70,7 +70,13 @@ CharacterVector public_suffix(CharacterVector domains) { if (rc == PSL_SUCCESS) { result = psl_unregistrable_domain(psl, lower); - output[i] = (result) ? String(result) : NA_STRING; + if (result) { + std::string res(result); + if ((res.length() > 0) && (res.at(0) == '.')) res.erase(0, 1); + output[i] = res; + } else { + output[i] = NA_STRING; + } } else { output[i] = NA_STRING; } @@ -157,7 +163,12 @@ DataFrame suffix_extract(CharacterVector domains) { // try to get the suffix result = psl_unregistrable_domain(psl, lower); std::string suf = std::string(result); - suffix[i] = (result) ? String(result) : NA_STRING; + if (result) { + if ((suf.length() > 0) && (suf.at(0) == '.')) suf.erase(0, 1); + suffix[i] = suf; + } else { + suffix[i] = NA_STRING; + } // try to get the apex result = psl_registrable_domain(psl, lower); @@ -252,6 +263,7 @@ DataFrame suffix_extract2(CharacterVector domains) { if (result) { std::string suf = std::string(result); + if ((suf.length() > 0) && (suf.at(0) == '.')) suf.erase(0, 1); suffix[i] = suf; result = psl_registrable_domain(psl, lower);