Browse Source

Fixed edge case where the input starts with a leading dot (e.g. '.suffix'): the leading dot is now stripped from the returned suffix

latest
boB Rudis 2 years ago
parent
commit
49bac3af55
No known key found for this signature in database. GPG Key ID: 1D7529BE14E2BBA9
4 changed files with 41 additions and 83 deletions
  1. +3
    -3
      README.Rmd
  2. +24
    -78
      README.md
  3. BIN
      README_files/figure-gfm/bench-1.png
  4. +14
    -2
      src/psl-main.cpp

+ 3
- 3
README.Rmd View File

@@ -34,8 +34,8 @@ The following functions are implemented:
You need a recent `libpsl`.

- macOS: `brew install libpsl`
- Debian/Ubuntu-ish: Many repos have old versions so build from source and run `ldconfig` afterwards
- Windows: Just use `urltools::suffix_extract()`
- Debian/Ubuntu-ish: Many repos have old versions so it's _highly_ suggested that you build from source and ensure the library & header files are accessible
- Windows: Just use `urltools::suffix_extract()` until winlibs are available for psl

## Installation

@@ -83,7 +83,7 @@ is_public_suffix(doms)

suffix_extract(doms)

suffix_extract2(doms) # urltools compatible output
str(suffix_extract2(doms)) # urltools compatible output
```

```{r bench, message=FALSE, warning=FALSE, error=FALSE, fig.width=10, fig.retina=2}


+ 24
- 78
README.md View File

@@ -32,9 +32,11 @@ The following functions are implemented:
You need a recent `libpsl`.

- macOS: `brew install libpsl`
- Debian/Ubuntu-ish: Many repos have old versions so build from source
and run `ldconfig` afterwards
- Windows: Just use `urltools::suffix_extract()`
- Debian/Ubuntu-ish: Many repos have old versions so it’s *highly*
suggested that you build from source and ensure the library & header
files are accessible
- Windows: Just use `urltools::suffix_extract()` until winlibs are
available for psl

## Installation

@@ -83,7 +85,7 @@ apex_domain(doms)
## [55] NA "test.ak.us" "test.ak.us" NA "test.k12.ak.us" "test.k12.ak.us"

public_suffix(doms)
## [1] "" "com" "com" "com" ".com" ".example" "com"
## [1] "" "com" "com" "com" "com" "example" "com"
## [8] "example" "example" "example" "example" "example" "biz" "biz"
## [15] "biz" "biz" "com" "com" "com" "com" "uk.com"
## [22] "uk.com" "uk.com" "uk.com" "ac" "cy" "cy" "cy"
@@ -101,82 +103,26 @@ is_public_suffix(doms)

suffix_extract(doms)
## # A tibble: 60 x 6
## orig normalized subdomain apex domain suffix
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 "" "" <NA> <NA> <NA> ""
## 2 com com <NA> <NA> <NA> com
## 3 example.com example.com "" example.com example com
## 4 www.example.com www.example.com www example.com example com
## 5 .com .com <NA> <NA> <NA> .com
## 6 .example .example <NA> <NA> <NA> .example
## 7 .example.com .example.com <NA> <NA> <NA> com
## 8 .example.example .example.example <NA> <NA> <NA> example
## 9 example example <NA> <NA> <NA> example
## 10 example.example example.example "" example.example example example
## orig normalized subdomain apex domain suffix
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 "" "" <NA> <NA> <NA> ""
## 2 com com <NA> <NA> <NA> com
## 3 example.com example.com "" example.com example com
## 4 www.example.com www.example.com www example.com example com
## 5 .com .com <NA> <NA> <NA> com
## 6 .example .example <NA> <NA> <NA> example
## 7 .example.com .example.com <NA> <NA> <NA> com
## 8 .example.example .example.example <NA> <NA> <NA> example
## 9 example example <NA> <NA> <NA> example
## 10 example.example example.example "" example.example example example
## # ... with 50 more rows

suffix_extract2(doms) # urltools compatible output
## host subdomain domain suffix
## 1 <NA> <NA>
## 2 com <NA> <NA> com
## 3 example.com example com
## 4 www.example.com www example com
## 5 .com <NA> <NA> .com
## 6 .example <NA> <NA> .example
## 7 .example.com <NA> <NA> com
## 8 .example.example <NA> <NA> example
## 9 example <NA> <NA> example
## 10 example.example example example
## 11 b.example.example b example example
## 12 a.b.example.example a.b example example
## 13 biz <NA> <NA> biz
## 14 domain.biz domain biz
## 15 b.domain.biz b domain biz
## 16 a.b.domain.biz a.b domain biz
## 17 com <NA> <NA> com
## 18 example.com example com
## 19 b.example.com b example com
## 20 a.b.example.com a.b example com
## 21 uk.com <NA> <NA> uk.com
## 22 example.uk.com example uk.com
## 23 b.example.uk.com b example uk.com
## 24 a.b.example.uk.com a.b example uk.com
## 25 test.ac test ac
## 26 cy <NA> <NA> cy
## 27 c.cy c cy
## 28 b.c.cy b c cy
## 29 a.b.c.cy a.b c cy
## 30 jp <NA> <NA> jp
## 31 test.jp test jp
## 32 www.test.jp www test jp
## 33 ac.jp <NA> <NA> ac.jp
## 34 test.ac.jp test ac.jp
## 35 www.test.ac.jp www test ac.jp
## 36 kyoto.jp <NA> <NA> kyoto.jp
## 37 test.kyoto.jp test kyoto.jp
## 38 ide.kyoto.jp <NA> <NA> ide.kyoto.jp
## 39 b.ide.kyoto.jp b ide.kyoto.jp
## 40 a.b.ide.kyoto.jp a b ide.kyoto.jp
## 41 c.kobe.jp <NA> <NA> c.kobe.jp
## 42 b.c.kobe.jp b c.kobe.jp
## 43 a.b.c.kobe.jp a b c.kobe.jp
## 44 city.kobe.jp city kobe.jp
## 45 www.city.kobe.jp www city kobe.jp
## 46 ck <NA> <NA> ck
## 47 test.ck <NA> <NA> test.ck
## 48 b.test.ck b test.ck
## 49 a.b.test.ck a b test.ck
## 50 www.ck www ck
## 51 www.www.ck www www ck
## 52 us <NA> <NA> us
## 53 test.us test us
## 54 www.test.us www test us
## 55 ak.us <NA> <NA> ak.us
## 56 test.ak.us test ak.us
## 57 www.test.ak.us www test ak.us
## 58 k12.ak.us <NA> <NA> k12.ak.us
## 59 test.k12.ak.us test k12.ak.us
## 60 www.test.k12.ak.us www test k12.ak.us
str(suffix_extract2(doms)) # urltools compatible output
## 'data.frame': 60 obs. of 4 variables:
## $ host : chr "" "com" "example.com" "www.example.com" ...
## $ subdomain: chr NA NA "" "www" ...
## $ domain : chr NA NA "example" "example" ...
## $ suffix : chr "" "com" "com" "com" ...
```

``` r


BIN
README_files/figure-gfm/bench-1.png View File

Before: Width: 1920  |  Height: 960  |  Size: 79KB
After: Width: 1920  |  Height: 960  |  Size: 77KB

+ 14
- 2
src/psl-main.cpp View File

@@ -70,7 +70,13 @@ CharacterVector public_suffix(CharacterVector domains) {

if (rc == PSL_SUCCESS) {
result = psl_unregistrable_domain(psl, lower);
output[i] = (result) ? String(result) : NA_STRING;
if (result) {
std::string res(result);
if ((res.length() > 0) && (res.at(0) == '.')) res.erase(0, 1);
output[i] = res;
} else {
output[i] = NA_STRING;
}
} else {
output[i] = NA_STRING;
}
@@ -157,7 +163,12 @@ DataFrame suffix_extract(CharacterVector domains) {
// try to get the suffix
result = psl_unregistrable_domain(psl, lower);
std::string suf = std::string(result);
suffix[i] = (result) ? String(result) : NA_STRING;
if (result) {
if ((suf.length() > 0) && (suf.at(0) == '.')) suf.erase(0, 1);
suffix[i] = suf;
} else {
suffix[i] = NA_STRING;
}

// try to get the apex
result = psl_registrable_domain(psl, lower);
@@ -252,6 +263,7 @@ DataFrame suffix_extract2(CharacterVector domains) {
if (result) {

std::string suf = std::string(result);
if ((suf.length() > 0) && (suf.at(0) == '.')) suf.erase(0, 1);
suffix[i] = suf;

result = psl_registrable_domain(psl, lower);


Loading…
Cancel
Save