You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
188 lines
5.1 KiB
188 lines
5.1 KiB
#include <algorithm>
|
|
#include <functional>
|
|
#include <cctype>
|
|
#include <locale>
|
|
#include <sstream>
|
|
#include <iostream>
|
|
#include <unordered_map>
|
|
|
|
#include "url.h"
|
|
|
|
#include "robots.h"
|
|
#include <Rcpp.h>
|
|
|
|
namespace Rep
|
|
{
|
|
|
|
// Remove leading and trailing whitespace from `string`, in place.
//
// Whitespace is whatever std::isspace reports for a single byte. A string that
// consists only of whitespace (or is already empty) ends up empty.
void Robots::strip(std::string& string)
{
    // std::isspace is only defined for values representable as unsigned char
    // (or EOF). Plain char may be signed, so bytes >= 0x80 (e.g. UTF-8 text)
    // must be converted before the call.
    const auto not_space = [](char c)
    {
        return !std::isspace(static_cast<unsigned char>(c));
    };

    // Drop everything before the first non-space character.
    string.erase(string.begin(),
                 std::find_if(string.begin(), string.end(), not_space));

    // Drop everything after the last non-space character: base() of the
    // reverse iterator points one past that character.
    string.erase(std::find_if(string.rbegin(), string.rend(), not_space).base(),
                 string.end());
}
|
|
|
|
bool Robots::getpair(std::istringstream& stream, std::string& key, std::string& value)
|
|
{
|
|
while (getline(stream, key))
|
|
{
|
|
size_t index = key.find('#');
|
|
if (index != std::string::npos)
|
|
{
|
|
key.resize(index);
|
|
}
|
|
|
|
// Find the colon and divide it into key and value, skipping malformed lines
|
|
index = key.find(':');
|
|
if (index == std::string::npos)
|
|
{
|
|
continue;
|
|
}
|
|
|
|
value.assign(key.begin() + index + 1, key.end());
|
|
key.resize(index);
|
|
|
|
// Strip whitespace off of each
|
|
strip(key);
|
|
strip(value);
|
|
|
|
// Lowercase the key
|
|
std::transform(key.begin(), key.end(), key.begin(), ::tolower);
|
|
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// Build the per-agent rule sets and the sitemap list from the text of a
// robots.txt file.
//
// - A leading UTF-8 byte-order mark is skipped.
// - Directives that appear before any "user-agent" line apply to the "*" entry.
// - In a run of consecutive "user-agent" lines, the first one selects (creating
//   it if needed) the entry that following directives modify; the remaining
//   names are queued and receive a copy of that entry when the next run starts
//   or when the input ends. Any other directive ends the run.
// - "sitemap" values are appended to the sitemap list; "allow", "disallow" and
//   "crawl-delay" are forwarded to the current entry; other keys are ignored.
// - A "crawl-delay" value that std::stof cannot parse is reported on
//   Rcpp::Rcout and otherwise ignored.
// Agent names are stored lower-cased.
Robots::Robots(const std::string& content): agents_(), sitemaps_(), default_(agents_["*"])
{
    std::string agent_name("*");
    std::istringstream input(content);
    if (content.compare(0, 3, "\xEF\xBB\xBF") == 0)
    {
        // Skip the UTF-8 byte-order mark.
        input.ignore(3);
    }

    std::string key, value;
    std::vector<std::string> group;
    bool last_agent = false;
    agent_map_t::iterator current = agents_.find("*");

    // Copy the current entry to every queued agent name.
    const auto copy_current_to_group = [&]()
    {
        // Bind the source entry once, before inserting: operator[] may add
        // elements, and if agent_map_t is an unordered_map a resulting rehash
        // invalidates the `current` iterator, while references to existing
        // elements stay valid.
        // NOTE(review): assumes agent_map_t is std::map or std::unordered_map.
        const Agent& source = current->second;
        for (const std::string& other : group)
        {
            agents_[other] = source;
        }
    };

    while (Robots::getpair(input, key, value))
    {
        if (key == "user-agent")
        {
            // Store the user agent string as lowercased. std::tolower needs a
            // value representable as unsigned char.
            std::transform(value.begin(), value.end(), value.begin(),
                [](unsigned char c) { return static_cast<char>(std::tolower(c)); });

            if (last_agent)
            {
                // Another name in the same run: shares the rules of `current`.
                group.push_back(value);
            }
            else
            {
                // A new run starts: finish the previous one first.
                if (!agent_name.empty())
                {
                    copy_current_to_group();
                    group.clear();
                }
                agent_name = value;
                // emplace leaves an already existing entry untouched.
                current = agents_.emplace(agent_name, Agent()).first;
            }
            last_agent = true;
            continue;
        }

        last_agent = false;

        if (key == "sitemap")
        {
            sitemaps_.push_back(value);
        }
        else if (key == "disallow")
        {
            current->second.disallow(value);
        }
        else if (key == "allow")
        {
            current->second.allow(value);
        }
        else if (key == "crawl-delay")
        {
            try
            {
                current->second.delay(std::stof(value));
            }
            catch (const std::exception&)
            {
                Rcpp::Rcout << "Could not parse " << value << " as float." << std::endl;
            }
        }
    }

    // Finish the last run.
    if (!agent_name.empty())
    {
        copy_current_to_group();
    }
}
|
|
|
|
// Look up the rule set for the user agent `name`.
//
// The lookup is done on the lower-cased name. When no entry exists for it,
// the default entry is returned instead. The returned reference refers to
// data held by this Robots object.
const Agent& Robots::agent(const std::string& name) const
{
    // Lowercase the agent. std::tolower is only defined for values
    // representable as unsigned char, so each byte is converted first.
    std::string lowered(name);
    std::transform(lowered.begin(), lowered.end(), lowered.begin(),
        [](unsigned char c) { return static_cast<char>(std::tolower(c)); });

    const auto it = agents_.find(lowered);
    if (it == agents_.end())
    {
        return default_;
    }
    return it->second;
}
|
|
|
|
bool Robots::allowed(const std::string& path, const std::string& name) const
|
|
{
|
|
return agent(name).allowed(path);
|
|
}
|
|
|
|
std::string Robots::str() const
|
|
{
|
|
std::stringstream out;
|
|
// TODO: include sitepath info
|
|
out << '{';
|
|
auto begin = agents_.begin();
|
|
auto end = agents_.end();
|
|
if (begin != end)
|
|
{
|
|
out << '"' << begin->first << '"' << ": " << begin->second.str();
|
|
++begin;
|
|
}
|
|
for (; begin != end; ++begin)
|
|
{
|
|
out << ", \"" << begin->first << '"' << ": " << begin->second.str();
|
|
}
|
|
out << '}';
|
|
return out.str();
|
|
}
|
|
|
|
std::string Robots::robotsUrl(const std::string& url)
|
|
{
|
|
return Url::Url(url)
|
|
.setUserinfo("")
|
|
.setPath("robots.txt")
|
|
.setParams("")
|
|
.setQuery("")
|
|
.setFragment("")
|
|
.remove_default_port()
|
|
.str();
|
|
}
|
|
}
|
|
|