Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.
 
 
 

305 linhas
7.5 KiB

#include <Rcpp.h>
#include <libpsl.h>
using namespace Rcpp;
//' Return the apex/top-private domain from a vector of domains
//'
//' @md
//' @param domains character vector of domains
//' @return character vector the same length as `domains`; `NA` where the
//'   input is `NA`, could not be lowercased, or has no registrable domain
//' @export
// [[Rcpp::export]]
CharacterVector apex_domain(CharacterVector domains) {

  const R_xlen_t input_size = domains.size();
  CharacterVector output(input_size);

  const psl_ctx_t *psl = psl_builtin();

  for (R_xlen_t i = 0; i < input_size; i++) {

    // propagate missing values; as<std::string>() would otherwise turn
    // NA_character_ into the literal text "NA" and look that up
    if (CharacterVector::is_na(domains[i])) {
      output[i] = NA_STRING;
      continue;
    }

    // remove trailing period if any
    std::string cleaned = Rcpp::as<std::string>(domains[i]);
    if (!cleaned.empty() && (cleaned.back() == '.')) cleaned.pop_back();

    // Start from a null pointer on every element: if the conversion fails
    // without writing the out-pointer, the psl_free_string() below must not
    // see the (already freed) buffer of the previous element.
    char *lower = nullptr;

    // lowercase it
    const int rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);

    if (rc == PSL_SUCCESS) {
      // result points into `lower`, so it must be copied before the free
      const char *result = psl_registrable_domain(psl, lower);
      if (result) {
        output[i] = String(result);
      } else {
        output[i] = NA_STRING;
      }
    } else {
      output[i] = NA_STRING;
    }

    psl_free_string(lower);

  }

  return(output);

}
//' Return the public suffix from a vector of domains
//'
//' @md
//' @param domains character vector of domains
//' @return character vector the same length as `domains`; `NA` where the
//'   input is `NA`, could not be lowercased, or no suffix was found
//' @export
// [[Rcpp::export]]
CharacterVector public_suffix(CharacterVector domains) {

  const R_xlen_t input_size = domains.size();
  CharacterVector output(input_size);

  const psl_ctx_t *psl = psl_builtin();

  for (R_xlen_t i = 0; i < input_size; i++) {

    // propagate missing values; as<std::string>() would otherwise turn
    // NA_character_ into the literal text "NA" and look that up
    if (CharacterVector::is_na(domains[i])) {
      output[i] = NA_STRING;
      continue;
    }

    // remove trailing period if any
    std::string cleaned = Rcpp::as<std::string>(domains[i]);
    if (!cleaned.empty() && (cleaned.back() == '.')) cleaned.pop_back();

    // Start from a null pointer on every element: if the conversion fails
    // without writing the out-pointer, the psl_free_string() below must not
    // see the (already freed) buffer of the previous element.
    char *lower = nullptr;

    // lowercase it
    const int rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);

    if (rc == PSL_SUCCESS) {
      // result points into `lower`, so it must be copied before the free
      const char *result = psl_unregistrable_domain(psl, lower);
      if (result) {
        output[i] = String(result);
      } else {
        output[i] = NA_STRING;
      }
    } else {
      output[i] = NA_STRING;
    }

    psl_free_string(lower);

  }

  return(output);

}
//' Test whether a domain is a public suffix
//'
//' @md
//' @param domains character vector of domains
//' @return logical vector the same length as `domains`; `NA` where the
//'   input is `NA` or could not be lowercased
//' @export
// [[Rcpp::export]]
LogicalVector is_public_suffix(CharacterVector domains) {

  const R_xlen_t input_size = domains.size();
  LogicalVector output(input_size);

  const psl_ctx_t *psl = psl_builtin();

  for (R_xlen_t i = 0; i < input_size; i++) {

    // propagate missing values; as<std::string>() would otherwise turn
    // NA_character_ into the literal text "NA" and test that
    if (CharacterVector::is_na(domains[i])) {
      output[i] = NA_LOGICAL;
      continue;
    }

    // remove trailing period if any
    std::string cleaned = Rcpp::as<std::string>(domains[i]);
    if (!cleaned.empty() && (cleaned.back() == '.')) cleaned.pop_back();

    // Start from a null pointer on every element: if the conversion fails
    // without writing the out-pointer, the psl_free_string() below must not
    // see the (already freed) buffer of the previous element.
    char *lower = nullptr;

    // lowercase it
    const int rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);

    if (rc == PSL_SUCCESS) {
      output[i] = (psl_is_public_suffix(psl, lower) == 1);
    } else {
      output[i] = NA_LOGICAL;
    }

    psl_free_string(lower);

  }

  return(output);

}
//' Separate a domain into component parts
//'
//' @md
//' @param domains character vector of domains
//' @return data frame (tibble-classed) with columns `orig`, `normalized`,
//'   `subdomain`, `apex`, `domain` and `suffix`; components that cannot be
//'   determined (including for `NA` input) are `NA`
//' @export
// [[Rcpp::export]]
DataFrame suffix_extract(CharacterVector domains) {

  const R_xlen_t input_size = domains.size();

  CharacterVector normalized(input_size);
  CharacterVector subdomain(input_size);
  CharacterVector apex(input_size);
  CharacterVector domain(input_size);
  CharacterVector suffix(input_size);

  const psl_ctx_t *psl = psl_builtin();

  for (R_xlen_t i = 0; i < input_size; i++) {

    // propagate missing values; as<std::string>() would otherwise turn
    // NA_character_ into the literal text "NA" and split that
    if (CharacterVector::is_na(domains[i])) {
      normalized[i] = NA_STRING;
      subdomain[i] = NA_STRING;
      apex[i] = NA_STRING;
      domain[i] = NA_STRING;
      suffix[i] = NA_STRING;
      continue;
    }

    // remove trailing period if any
    std::string cleaned = Rcpp::as<std::string>(domains[i]);
    if (!cleaned.empty() && (cleaned.back() == '.')) cleaned.pop_back();

    // Start from a null pointer on every element: if the conversion fails
    // without writing the out-pointer, the psl_free_string() below must not
    // see the (already freed) buffer of the previous element.
    char *lower = nullptr;

    // lowercase it
    const int rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);

    if ((rc == PSL_SUCCESS) && (lower != nullptr)) {

      // no dots at end and lowercased
      const std::string normd(lower);
      normalized[i] = normd;

      // try to get the suffix; the pointer may be NULL, so it is only
      // turned into a std::string after it has been checked
      const char *suffix_ptr = psl_unregistrable_domain(psl, lower);
      if (suffix_ptr) {
        suffix[i] = String(suffix_ptr);
      } else {
        suffix[i] = NA_STRING;
      }

      // try to get the apex
      const char *apex_ptr = psl_registrable_domain(psl, lower);
      if (apex_ptr) {
        apex[i] = String(apex_ptr);
      } else {
        apex[i] = NA_STRING;
      }

      // domain/subdomain are derived from both the apex and the suffix
      if (apex_ptr && suffix_ptr) {

        const std::string suf(suffix_ptr);
        const std::string apx(apex_ptr);

        // domain = apex with the suffix (and the joining dot) removed
        const std::string::size_type suf_pos = apx.rfind(suf);
        std::string dom = apx.substr(0, suf_pos);

        // subdomain = whatever precedes the apex in the normalized host
        const std::string::size_type apex_pos = normd.rfind(apx);
        std::string subdom = (apex_pos == 0) ? "" : normd.substr(0, apex_pos);

        if (!dom.empty() && (dom.back() == '.')) dom.pop_back();
        if (!subdom.empty() && (subdom.back() == '.')) subdom.pop_back();

        domain[i] = dom;
        subdomain[i] = subdom;

      } else {
        domain[i] = NA_STRING;
        subdomain[i] = NA_STRING;
      }

    } else {
      normalized[i] = NA_STRING;
      subdomain[i] = NA_STRING;
      apex[i] = NA_STRING;
      domain[i] = NA_STRING;
      suffix[i] = NA_STRING;
    }

    psl_free_string(lower);

  }

  DataFrame out = DataFrame::create(
    _["orig"] = domains,
    _["normalized"] = normalized,
    _["subdomain"] = subdomain,
    _["apex"] = apex,
    _["domain"] = domain,
    _["suffix"] = suffix,
    _["stringsAsFactors"] = false
  );

  out.attr("class") = CharacterVector::create("tbl_df", "tbl", "data.frame");

  return(out);

}
//' Separate a domain into component parts (`urltools` compatibility function)
//'
//' Compatibility function for those using `urltools::suffix_extract()`
//'
//' @md
//' @param domains character vector of domains
//' @return data frame with columns `host`, `subdomain`, `domain` and
//'   `suffix`; components that cannot be determined (including for `NA`
//'   input) are `NA`
//' @export
// [[Rcpp::export]]
DataFrame suffix_extract2(CharacterVector domains) {

  const R_xlen_t input_size = domains.size();

  CharacterVector subdomain(input_size);
  CharacterVector domain(input_size);
  CharacterVector suffix(input_size);

  const psl_ctx_t *psl = psl_builtin();

  for (R_xlen_t i = 0; i < input_size; i++) {

    // propagate missing values; as<std::string>() would otherwise turn
    // NA_character_ into the literal text "NA" and split that
    if (CharacterVector::is_na(domains[i])) {
      subdomain[i] = NA_STRING;
      domain[i] = NA_STRING;
      suffix[i] = NA_STRING;
      continue;
    }

    // remove trailing period if any
    std::string cleaned = Rcpp::as<std::string>(domains[i]);
    if (!cleaned.empty() && (cleaned.back() == '.')) cleaned.pop_back();

    // Start from a null pointer on every element: if the conversion fails
    // without writing the out-pointer, the psl_free_string() below must not
    // see the (already freed) buffer of the previous element.
    char *lower = nullptr;

    // lowercase it
    const int rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);

    if ((rc == PSL_SUCCESS) && (lower != nullptr)) {

      const std::string normalized(lower);

      // try to get the suffix
      const char *suffix_ptr = psl_unregistrable_domain(psl, lower);

      if (suffix_ptr) {

        const std::string suf(suffix_ptr);
        suffix[i] = suf;

        // try to get the apex
        const char *apex_ptr = psl_registrable_domain(psl, lower);

        if (apex_ptr) {

          const std::string apex(apex_ptr);

          // domain = apex with the suffix (and the joining dot) removed
          const std::string::size_type suf_pos = apex.rfind(suf);
          std::string dom = apex.substr(0, suf_pos);

          // subdomain = whatever precedes the apex in the normalized host
          const std::string::size_type apex_pos = normalized.rfind(apex);
          std::string subdom = (apex_pos == 0) ? "" : normalized.substr(0, apex_pos);

          if (!dom.empty() && (dom.back() == '.')) dom.pop_back();
          if (!subdom.empty() && (subdom.back() == '.')) subdom.pop_back();

          domain[i] = dom;
          subdomain[i] = subdom;

        } else {
          subdomain[i] = NA_STRING;
          domain[i] = NA_STRING;
        }

      } else {
        subdomain[i] = NA_STRING;
        suffix[i] = NA_STRING;
        domain[i] = NA_STRING;
      }

    } else {
      subdomain[i] = NA_STRING;
      domain[i] = NA_STRING;
      suffix[i] = NA_STRING;
    }

    psl_free_string(lower);

  }

  DataFrame out = DataFrame::create(
    _["host"] = domains,
    _["subdomain"] = subdomain,
    _["domain"] = domain,
    _["suffix"] = suffix,
    _["stringsAsFactors"] = false
  );

  return(out);

}