You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

306 lines
7.5KB

  1. #include <Rcpp.h>
  2. #include <libpsl.h>
  3. using namespace Rcpp;
  4. //' Return the apex/top-private domain from a vector of domains
  5. //'
  6. //' @md
  7. //' @param domains character vector of domains
  8. //' @return character vector
  9. //' @export
  10. // [[Rcpp::export]]
  11. CharacterVector apex_domain(CharacterVector domains) {
  12. unsigned int input_size = domains.size();
  13. CharacterVector output(input_size);
  14. char *lower = NULL;
  15. int rc;
  16. const char * result;
  17. const psl_ctx_t *psl = psl_builtin();
  18. for (unsigned int i = 0; i < input_size; i++) {
  19. // remove trailing period if any
  20. std::string cleaned = Rcpp::as<std::string>(domains[i]);
  21. if ((cleaned.length() > 0) && (cleaned.at(cleaned.length()-1) == '.')) cleaned.pop_back();
  22. // lowercase it
  23. rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);
  24. if (rc == PSL_SUCCESS) {
  25. result = psl_registrable_domain(psl, lower);
  26. output[i] = (result) ? String(result) : NA_STRING;
  27. } else {
  28. output[i] = NA_STRING;
  29. }
  30. psl_free_string(lower);
  31. }
  32. return(output);
  33. }
  34. //' Return the public suffix from a vector of domains
  35. //'
  36. //' @md
  37. //' @param domains character vector of domains
  38. //' @return character vector
  39. //' @export
  40. // [[Rcpp::export]]
  41. CharacterVector public_suffix(CharacterVector domains) {
  42. unsigned int input_size = domains.size();
  43. CharacterVector output(input_size);
  44. char *lower = NULL;
  45. int rc;
  46. const char * result;
  47. const psl_ctx_t *psl = psl_builtin();
  48. for (unsigned int i = 0; i < input_size; i++) {
  49. // remove trailing period if any
  50. std::string cleaned = Rcpp::as<std::string>(domains[i]);
  51. if ((cleaned.length() > 0) && (cleaned.at(cleaned.length()-1) == '.')) cleaned.pop_back();
  52. // lowercase it
  53. rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);
  54. if (rc == PSL_SUCCESS) {
  55. result = psl_unregistrable_domain(psl, lower);
  56. output[i] = (result) ? String(result) : NA_STRING;
  57. } else {
  58. output[i] = NA_STRING;
  59. }
  60. psl_free_string(lower);
  61. }
  62. return(output);
  63. }
  64. //' Test whether a domain is a public suffix
  65. //'
  66. //' @md
  67. //' @param domains character vector of domains
  68. //' @return character vector
  69. //' @export
  70. // [[Rcpp::export]]
  71. LogicalVector is_public_suffix(CharacterVector domains) {
  72. unsigned int input_size = domains.size();
  73. LogicalVector output(input_size);
  74. char *lower = NULL;
  75. int rc;
  76. const psl_ctx_t *psl = psl_builtin();
  77. for (unsigned int i = 0; i < input_size; i++) {
  78. // remove trailing period if any
  79. std::string cleaned = Rcpp::as<std::string>(domains[i]);
  80. if ((cleaned.length() > 0) && (cleaned.at(cleaned.length()-1) == '.')) cleaned.pop_back();
  81. // lowercase it
  82. rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);
  83. output[i] =(rc == PSL_SUCCESS) ? (psl_is_public_suffix(psl, lower) == 1) : NA_LOGICAL;
  84. psl_free_string(lower);
  85. }
  86. return(output);
  87. }
  88. //' Separate a domain into component parts
  89. //'
  90. //' @md
  91. //' @param domains character vector of domains
  92. //' @return data frame
  93. //' @export
  94. // [[Rcpp::export]]
  95. DataFrame suffix_extract(CharacterVector domains) {
  96. unsigned int input_size = domains.size();
  97. CharacterVector normalized(input_size);
  98. CharacterVector subdomain(input_size);
  99. CharacterVector apex(input_size);
  100. CharacterVector domain(input_size);
  101. CharacterVector suffix(input_size);
  102. char *lower = NULL;
  103. int rc;
  104. const char * result;
  105. const psl_ctx_t *psl = psl_builtin();
  106. for (unsigned int i = 0; i < input_size; i++) {
  107. // remove trailing period if any
  108. std::string cleaned = Rcpp::as<std::string>(domains[i]);
  109. if ((cleaned.length() > 0) && (cleaned.at(cleaned.length()-1) == '.')) cleaned.pop_back();
  110. // lowercase it
  111. rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);
  112. if (rc == PSL_SUCCESS) {
  113. // no dots at end and lowercased
  114. std::string normd = std::string(lower);
  115. normalized[i] = normd;
  116. // try to get the suffix
  117. result = psl_unregistrable_domain(psl, lower);
  118. std::string suf = std::string(result);
  119. suffix[i] = (result) ? String(result) : NA_STRING;
  120. // try to get the apex
  121. result = psl_registrable_domain(psl, lower);
  122. apex[i] = (result) ? String(result) : NA_STRING;
  123. if (result) {
  124. std::string apx = std::string(result);
  125. int suf_pos = apx.rfind(suf);
  126. std::string dom = apx.substr(0, suf_pos);
  127. int apex_pos = normd.rfind(apx);
  128. std::string subdom = (apex_pos == 0) ? "" : normd.substr(0, apex_pos);
  129. if ((dom.length() > 0) && (dom.at(dom.length()-1) == '.')) dom.pop_back();
  130. if ((subdom.length() > 0) && (subdom.at(subdom.length()-1) == '.')) subdom.pop_back();
  131. domain[i] = dom;
  132. subdomain[i] = subdom;
  133. } else {
  134. domain[i] = NA_STRING;
  135. subdomain[i] = NA_STRING;
  136. }
  137. } else {
  138. normalized[i] = NA_STRING;
  139. subdomain[i] = NA_STRING;
  140. apex[i] = NA_STRING;
  141. domain[i] = NA_STRING;
  142. suffix[i] = NA_STRING;
  143. }
  144. psl_free_string(lower);
  145. }
  146. DataFrame out = DataFrame::create(
  147. _["orig"] = domains,
  148. _["normalized"] = normalized,
  149. _["subdomain"] = subdomain,
  150. _["apex"] = apex,
  151. _["domain"] = domain,
  152. _["suffix"] = suffix,
  153. _["stringsAsFactors"] = false
  154. );
  155. out.attr("class") = CharacterVector::create("tbl_df", "tbl", "data.frame");
  156. return(out);
  157. }
  158. //' Separate a domain into component parts (`urltools` compatibility function)
  159. //'
  160. //' Compatibility function for those using `urltools::suffix_extract()`
  161. //'
  162. //' @md
  163. //' @param domains character vector of domains
  164. //' @return data frame
  165. //' @export
  166. // [[Rcpp::export]]
  167. DataFrame suffix_extract2(CharacterVector domains) {
  168. unsigned int input_size = domains.size();
  169. CharacterVector subdomain(input_size);
  170. CharacterVector domain(input_size);
  171. CharacterVector suffix(input_size);
  172. char *lower = NULL;
  173. int rc;
  174. const char *result;
  175. const psl_ctx_t *psl = psl_builtin();
  176. for (unsigned int i = 0; i < input_size; i++) {
  177. std::string cleaned = Rcpp::as<std::string>(domains[i]);
  178. if ((cleaned.length() > 0) && (cleaned.at(cleaned.length()-1) == '.')) cleaned.pop_back();
  179. // lowercase it
  180. rc = psl_str_to_utf8lower(cleaned.c_str(), "utf-8", "en", &lower);
  181. if (rc == PSL_SUCCESS) {
  182. std::string normalized(lower);
  183. // try to get the suffix
  184. result = psl_unregistrable_domain(psl, lower);
  185. if (result) {
  186. std::string suf = std::string(result);
  187. suffix[i] = suf;
  188. result = psl_registrable_domain(psl, lower);
  189. if (result) {
  190. std::string apex(result);
  191. int suf_pos = apex.rfind(suf);
  192. std::string dom = apex.substr(0, suf_pos);
  193. int apex_pos = normalized.rfind(apex);
  194. std::string subdom = (apex_pos == 0) ? "" : normalized.substr(0, apex_pos);
  195. if ((dom.length() > 0) && (dom.at(dom.length()-1) == '.')) dom.pop_back();
  196. if ((subdom.length() > 0) && (subdom.at(subdom.length()-1) == '.')) subdom.pop_back();
  197. domain[i] = (dom);
  198. subdomain[i] = (subdom);
  199. } else {
  200. subdomain[i] = NA_STRING;
  201. domain[i] = NA_STRING;
  202. }
  203. } else {
  204. subdomain[i] = NA_STRING;
  205. suffix[i] = NA_STRING;
  206. domain[i] = NA_STRING;
  207. }
  208. } else {
  209. subdomain[i] = NA_STRING;
  210. domain[i] = NA_STRING;
  211. suffix[i] = NA_STRING;
  212. }
  213. psl_free_string(lower);
  214. }
  215. DataFrame out = DataFrame::create(
  216. _["host"] = domains,
  217. _["subdomain"] = subdomain,
  218. _["domain"] = domain,
  219. _["suffix"] = suffix,
  220. _["stringsAsFactors"] = false
  221. );
  222. return(out);
  223. }