Browse Source
custom function to retrieve all crawl_delay settings for all user agents
master
No known key found for this signature in database
GPG Key ID: 1D7529BE14E2BBA9
7 changed files with
73 additions and
1 deletions
-
NAMESPACE
-
R/RcppExports.R
-
R/cd.r
-
man/crawl_delays.Rd
-
src/RcppExports.cpp
-
src/repmain.cpp
-
src/robots.h
|
|
@ -2,6 +2,7 @@ |
|
|
|
|
|
|
|
S3method(print,robxp) |
|
|
|
export(can_fetch) |
|
|
|
export(crawl_delays) |
|
|
|
export(robxp) |
|
|
|
importFrom(Rcpp,sourceCpp) |
|
|
|
useDynLib(rep, .registration=TRUE) |
|
|
|
|
|
@ -9,6 +9,14 @@ rep_parse <- function(content) { |
|
|
|
.Call(`_rep_rep_parse`, content) |
|
|
|
} |
|
|
|
|
|
|
|
#' Get delays
#'
#' Thin wrapper that hands `xp` to the compiled `_rep_rep_crawl_delays`
#' routine and returns its result unchanged.
#'
#' @param xp value forwarded as-is to the native routine
#' @noRd
#'
rep_crawl_delays <- function(xp) .Call(`_rep_rep_crawl_delays`, xp)
|
|
|
|
|
|
|
#' Path allowed |
|
|
|
#' |
|
|
|
#' @noRd |
|
|
|
|
|
@ -0,0 +1,15 @@ |
|
|
|
#' Get all crawl_delay
#'
#' Retrieves the crawl delay settings of all user agents held by a
#' `robxp` object.
#'
#' @md
#' @param obj `robxp` object
#' @return the result of the internal `rep_crawl_delays()` call when `obj`
#'   inherits from `robxp`; `NULL` for any other input
#' @export
crawl_delays <- function(obj) {

  # Anything that is not a parsed robots.txt object yields NULL.
  if (!inherits(obj, "robxp")) {
    return(NULL)
  }

  rep_crawl_delays(obj)

}
|
|
@ -0,0 +1,14 @@ |
|
|
|
% Generated by roxygen2: do not edit by hand |
|
|
|
% Please edit documentation in R/cd.r |
|
|
|
\name{crawl_delays} |
|
|
|
\alias{crawl_delays} |
|
|
|
\title{Get all crawl_delay} |
|
|
|
\usage{ |
|
|
|
crawl_delays(obj) |
|
|
|
} |
|
|
|
\arguments{ |
|
|
|
\item{obj}{\code{robxp} object} |
|
|
|
} |
|
|
|
\description{ |
|
|
|
Get all crawl_delay |
|
|
|
} |
|
|
@ -16,6 +16,17 @@ BEGIN_RCPP |
|
|
|
return rcpp_result_gen; |
|
|
|
END_RCPP |
|
|
|
} |
|
|
|
// rep_crawl_delays
// C entry point for rep_crawl_delays(): takes the single SEXP argument,
// passes it to rep_crawl_delays(), and returns the std::vector<float>
// result converted with Rcpp::wrap(). The body sits between the
// BEGIN_RCPP / END_RCPP macros.
|
|
|
|
std::vector<float> rep_crawl_delays(SEXP xp); |
|
|
|
RcppExport SEXP _rep_rep_crawl_delays(SEXP xpSEXP) { |
|
|
|
BEGIN_RCPP |
|
|
|
Rcpp::RObject rcpp_result_gen; |
|
|
|
Rcpp::RNGScope rcpp_rngScope_gen; |
|
|
|
Rcpp::traits::input_parameter< SEXP >::type xp(xpSEXP); |
|
|
|
rcpp_result_gen = Rcpp::wrap(rep_crawl_delays(xp)); |
|
|
|
return rcpp_result_gen; |
|
|
|
END_RCPP |
|
|
|
} |
|
|
|
// rep_path_allowed
|
|
|
|
bool rep_path_allowed(SEXP xp, std::string path, std::string agent); |
|
|
|
RcppExport SEXP _rep_rep_path_allowed(SEXP xpSEXP, SEXP pathSEXP, SEXP agentSEXP) { |
|
|
@ -32,6 +43,7 @@ END_RCPP |
|
|
|
|
|
|
|
// Table of .Call entry points: each row gives the symbol name, the address
// of the C wrapper, and the number of arguments that wrapper takes. The
// final {NULL, NULL, 0} row terminates the table.
static const R_CallMethodDef CallEntries[] = { |
|
|
|
{"_rep_rep_parse", (DL_FUNC) &_rep_rep_parse, 1}, |
|
|
|
{"_rep_rep_crawl_delays", (DL_FUNC) &_rep_rep_crawl_delays, 1}, |
|
|
|
{"_rep_rep_path_allowed", (DL_FUNC) &_rep_rep_path_allowed, 3}, |
|
|
|
{NULL, NULL, 0} |
|
|
|
}; |
|
|
|
|
|
@ -2,6 +2,8 @@ |
|
|
|
using namespace Rcpp; |
|
|
|
|
|
|
|
#include "url.h" |
|
|
|
#include "agent.h" |
|
|
|
#include "directive.h" |
|
|
|
#include "robots.h" |
|
|
|
|
|
|
|
//' Parse robots.txt
|
|
|
@ -14,6 +16,26 @@ SEXP rep_parse(std::string content) { |
|
|
|
return(ptr); |
|
|
|
} |
|
|
|
|
|
|
|
//' Get delays
|
|
|
|
//'
|
|
|
|
//' @noRd
|
|
|
|
//'
|
|
|
|
// [[Rcpp::export]]
|
|
|
|
std::vector<float> rep_crawl_delays(SEXP xp) { |
|
|
|
|
|
|
|
Rcpp::XPtr<Rep::Robots> ptr(xp); |
|
|
|
|
|
|
|
std::vector<float> vals; |
|
|
|
vals.reserve(ptr->agents_.size()); |
|
|
|
|
|
|
|
for(auto kv : ptr->agents_) { |
|
|
|
vals.push_back(kv.second.delay()); |
|
|
|
} |
|
|
|
|
|
|
|
return(vals); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
//' Path allowed
|
|
|
|
//'
|
|
|
|
|
|
@ -15,6 +15,7 @@ namespace Rep |
|
|
|
public: |
|
|
|
typedef std::unordered_map<std::string, Agent> agent_map_t; |
|
|
|
typedef std::vector<std::string> sitemaps_t; |
|
|
|
agent_map_t agents_; |
|
|
|
|
|
|
|
/**
|
|
|
|
* Create a robots.txt from a utf-8-encoded string. |
|
|
@ -60,7 +61,6 @@ namespace Rep |
|
|
|
static bool getpair( |
|
|
|
std::istringstream& stream, std::string& key, std::string& value); |
|
|
|
|
|
|
|
agent_map_t agents_; |
|
|
|
sitemaps_t sitemaps_; |
|
|
|
Agent& default_; |
|
|
|
}; |
|
|
|