Browse Source

custom function to retrieve all crawl_delay settings for all user agents

master
boB Rudis 7 years ago
parent
commit
4848ff5e68
No known key found for this signature in database GPG Key ID: 1D7529BE14E2BBA9
  1. 1
      NAMESPACE
  2. 8
      R/RcppExports.R
  3. 15
      R/cd.r
  4. 14
      man/crawl_delays.Rd
  5. 12
      src/RcppExports.cpp
  6. 22
      src/repmain.cpp
  7. 2
      src/robots.h

1
NAMESPACE

@@ -2,6 +2,7 @@
S3method(print,robxp)
export(can_fetch)
export(crawl_delays)
export(robxp)
importFrom(Rcpp,sourceCpp)
useDynLib(rep, .registration=TRUE)

8
R/RcppExports.R

@@ -9,6 +9,14 @@ rep_parse <- function(content) {
.Call(`_rep_rep_parse`, content)
}
# NOTE(review): this lives in RcppExports.R, which is auto-generated by
# Rcpp::compileAttributes(); hand edits here are overwritten on regeneration.
#' Get delays
#'
#' @noRd
#'
# Thin wrapper over the registered native routine. `xp` is the `robxp`
# external pointer produced by rep_parse() (see repmain.cpp, which wraps it
# in an Rcpp::XPtr<Rep::Robots>); returns the vector of crawl delays
# computed on the C++ side.
rep_crawl_delays <- function(xp) {
.Call(`_rep_rep_crawl_delays`, xp)
}
#' Path allowed
#'
#' @noRd

15
R/cd.r

@@ -0,0 +1,15 @@
#' Get all crawl_delay values
#'
#' Retrieves the `Crawl-delay` setting recorded for every user agent in a
#' parsed `robots.txt` object.
#'
#' @md
#' @param obj `robxp` object
#' @return a numeric vector with one crawl delay per user agent (order
#'   unspecified), or `NULL` if `obj` is not a `robxp` object
#' @export
#' @examples
#' \dontrun{
#' rt <- robxp("User-agent: *\nCrawl-delay: 10\n")
#' crawl_delays(rt)
#' }
crawl_delays <- function(obj) {
  # Guard clause: anything that is not a robxp external-pointer wrapper gets
  # NULL back (best-effort, not an error) to preserve the original contract.
  if (!inherits(obj, "robxp")) {
    return(NULL)
  }
  rep_crawl_delays(obj)
}

14
man/crawl_delays.Rd

@@ -0,0 +1,14 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/cd.r
\name{crawl_delays}
\alias{crawl_delays}
\title{Get all crawl_delay}
\usage{
crawl_delays(obj)
}
\arguments{
\item{obj}{\code{robxp} object}
}
\description{
Get all crawl_delay
}

12
src/RcppExports.cpp

@@ -16,6 +16,17 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// rep_crawl_delays
// NOTE(review): RcppExports.cpp is auto-generated by Rcpp::compileAttributes();
// hand edits here are overwritten the next time attributes are compiled.
std::vector<float> rep_crawl_delays(SEXP xp);
RcppExport SEXP _rep_rep_crawl_delays(SEXP xpSEXP) {
// BEGIN_RCPP/END_RCPP fence converts any C++ exception thrown below into an
// R error rather than letting it unwind through the R API.
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
// Marshal the incoming SEXP, invoke the exported implementation, and wrap
// the std::vector<float> result back into an R object for return.
Rcpp::traits::input_parameter< SEXP >::type xp(xpSEXP);
rcpp_result_gen = Rcpp::wrap(rep_crawl_delays(xp));
return rcpp_result_gen;
END_RCPP
}
// rep_path_allowed
bool rep_path_allowed(SEXP xp, std::string path, std::string agent);
RcppExport SEXP _rep_rep_path_allowed(SEXP xpSEXP, SEXP pathSEXP, SEXP agentSEXP) {
@@ -32,6 +43,7 @@ END_RCPP
// Native-routine registration table: maps each R-visible symbol to its
// compiled entry point; the trailing integer is the routine's argument count.
static const R_CallMethodDef CallEntries[] = {
{"_rep_rep_parse", (DL_FUNC) &_rep_rep_parse, 1},
{"_rep_rep_crawl_delays", (DL_FUNC) &_rep_rep_crawl_delays, 1},
{"_rep_rep_path_allowed", (DL_FUNC) &_rep_rep_path_allowed, 3},
{NULL, NULL, 0}  // sentinel row terminating the table
};

22
src/repmain.cpp

@@ -2,6 +2,8 @@
using namespace Rcpp;
#include "url.h"
#include "agent.h"
#include "directive.h"
#include "robots.h"
//' Parse robots.txt
@@ -14,6 +16,26 @@ SEXP rep_parse(std::string content) {
return(ptr);
}
//' Get delays
//'
//' @noRd
//'
// [[Rcpp::export]]
std::vector<float> rep_crawl_delays(SEXP xp) {
  // External pointer produced by rep_parse(); dereferencing an XPtr that does
  // not actually hold a Rep::Robots is undefined, so callers must pass a robxp.
  Rcpp::XPtr<Rep::Robots> ptr(xp);

  // Collect one Crawl-delay value per user-agent entry in the parsed
  // robots.txt. NOTE: agents_ is an unordered_map, so the order of the
  // returned delays is unspecified and they are not paired with agent names.
  std::vector<float> vals;
  vals.reserve(ptr->agents_.size());
  // const& avoids copying each (name, Agent) pair on every iteration
  // (the original `auto kv` deep-copied the Agent per element).
  for (const auto& kv : ptr->agents_) {
    vals.push_back(kv.second.delay());
  }
  return(vals);
}
//' Path allowed
//'

2
src/robots.h

@@ -15,6 +15,7 @@ namespace Rep
public:
typedef std::unordered_map<std::string, Agent> agent_map_t;
typedef std::vector<std::string> sitemaps_t;
agent_map_t agents_;
/**
* Create a robots.txt from a utf-8-encoded string.
@@ -60,7 +61,6 @@
static bool getpair(
std::istringstream& stream, std::string& key, std::string& value);
agent_map_t agents_;
sitemaps_t sitemaps_;
Agent& default_;
};

Loading…
Cancel
Save