You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
985 lines
27 KiB
985 lines
27 KiB
/*
|
|
Copyright (c) 2016-2017 SEOmoz, Inc.
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining
|
|
a copy of this software and associated documentation files (the
|
|
"Software"), to deal in the Software without restriction, including
|
|
without limitation the rights to use, copy, modify, merge, publish,
|
|
distribute, sublicense, and/or sell copies of the Software, and to
|
|
permit persons to whom the Software is furnished to do so, subject to
|
|
the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be
|
|
included in all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
*/
|
|
|
|
#include <algorithm>
|
|
#include <string>
|
|
#include <iterator>
|
|
#include <unordered_map>
|
|
#include <unordered_set>
|
|
#include <iostream>
|
|
#include <iterator>
|
|
#include <sstream>
|
|
|
|
#include "url.h"
|
|
#include "punycode.h"
|
|
|
|
namespace Url
|
|
{
|
|
|
|
/* Character classes */
|
|
const CharacterClass Url::GEN_DELIMS(":/?#[]@");
|
|
const CharacterClass Url::SUB_DELIMS("!$&'()*+,;=");
|
|
const CharacterClass Url::DIGIT("0123456789");
|
|
const CharacterClass Url::ALPHA(
|
|
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
|
|
const CharacterClass Url::UNRESERVED(
|
|
Url::ALPHA.chars() + Url::DIGIT.chars() + "-._~");
|
|
const CharacterClass Url::RESERVED(
|
|
Url::GEN_DELIMS.chars() + Url::SUB_DELIMS.chars());
|
|
const CharacterClass Url::PCHAR(
|
|
Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":@");
|
|
const CharacterClass Url::PATH(
|
|
Url::PCHAR.chars() + "/");
|
|
const CharacterClass Url::QUERY(
|
|
Url::PCHAR.chars() + "/?");
|
|
const CharacterClass Url::FRAGMENT(
|
|
Url::PCHAR.chars() + "/?");
|
|
const CharacterClass Url::USERINFO(
|
|
Url::UNRESERVED.chars() + Url::SUB_DELIMS.chars() + ":");
|
|
const CharacterClass Url::HEX("0123456789ABCDEF");
|
|
const CharacterClass Url::SCHEME(
|
|
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+-.");
|
|
// Lookup table with one entry per byte value (256 entries). The entry is the
// numeric value of the byte when it is a hexadecimal digit ('0'-'9', 'A'-'F'
// or 'a'-'f') and -1 for every other byte.
//
// The table is filled programmatically instead of being spelled out; on an
// ASCII execution character set this yields exactly the same 256 values as a
// hand-written table indexed by ASCII code.
const std::vector<signed char> Url::HEX_TO_DEC = []
{
    std::vector<signed char> table(256, static_cast<signed char>(-1));

    // '0'..'9' -> 0..9
    for (int digit = 0; digit < 10; ++digit)
    {
        table['0' + digit] = static_cast<signed char>(digit);
    }

    // 'A'..'F' and 'a'..'f' -> 10..15
    for (int offset = 0; offset < 6; ++offset)
    {
        const signed char value = static_cast<signed char>(10 + offset);
        table['A' + offset] = value;
        table['a' + offset] = value;
    }

    return table;
}();
|
|
// Default port for each scheme that has one; remove_default_port() consults
// this map to decide whether an explicit port is redundant.
const std::unordered_map<std::string, int> Url::PORTS = {
    {"http", 80}, {"https", 443}
};
|
|
// Schemes (including the empty scheme) for which relative_to() performs
// relative resolution; URLs with any other scheme are returned unchanged.
const std::unordered_set<std::string> Url::USES_RELATIVE = {
    "", "file", "ftp", "gopher", "http", "https", "imap", "mms", "nntp",
    "prospero", "rtsp", "rtspu", "sftp", "shttp", "svn", "svn+ssh", "wais"
};
|
|
// Schemes (including the empty scheme) that str() serialises with a "://"
// separator; every other scheme is followed by a bare ":".
const std::unordered_set<std::string> Url::USES_NETLOC = {
    "", "file", "ftp", "git", "git+ssh", "gopher", "http", "https", "imap",
    "mms", "nfs", "nntp", "prospero", "rsync", "rtsp", "rtspu", "sftp",
    "shttp", "snews", "svn", "svn+ssh", "telnet", "wais"
};
|
|
// Schemes (including the empty scheme) for which the constructor splits
// ';'-introduced params off the end of the path.
const std::unordered_set<std::string> Url::USES_PARAMS = {
    "", "ftp", "hdl", "http", "https", "imap", "mms", "prospero", "rtsp",
    "rtspu", "sftp", "shttp", "sip", "sips", "tel"
};
|
|
// Schemes the constructor accepts even when everything after the first ':' is
// numeric (input that would otherwise look like "host:port").
const std::unordered_set<std::string> Url::KNOWN_PROTOCOLS = {
    "", "file", "ftp", "git", "git+ssh", "gopher", "hdl", "http", "https",
    "imap", "mms", "nfs", "nntp", "prospero", "rsync", "rtsp", "rtspu",
    "sftp", "shttp", "sip", "sips", "sms", "snews", "svn", "svn+ssh", "tel",
    "telnet", "wais"
};
|
|
|
|
// Parse `url` into scheme, userinfo, host, port, path, params, query and
// fragment.
//
// The scheme and the host are lower-cased; every other component is stored
// exactly as it appears in the input (no escaping or normalisation happens
// here).
//
// Throws UrlParseException when a port is present but is not a number, does
// not fit in an int, is negative, or is greater than 65535.
//
// NOTE(review): the port is located with the first ':' in the host, so a
// bracketed IPv6 literal such as "[::1]" is not recognised as a host.
Url::Url(const std::string& url): port_(0), has_params_(false), has_query_(false)
{
    // Lower-cases `text` in place. Each byte is routed through unsigned char
    // because passing a negative char (any byte >= 0x80 where char is signed,
    // e.g. a UTF-8 hostname that has not been punycoded yet) to ::tolower is
    // undefined behaviour.
    const auto lowercase = [](std::string& text)
    {
        std::transform(text.begin(), text.end(), text.begin(),
            [](char c)
            {
                return static_cast<char>(::tolower(static_cast<unsigned char>(c)));
            });
    };

    size_t position = 0;
    size_t index = url.find(':');
    if (index != std::string::npos)
    {
        // All the characters in our would-be scheme must be in SCHEME
        if (std::all_of(
                url.begin(),
                url.begin() + index,
                [](char c) { return SCHEME(c); } ))
        {
            scheme_.assign(url, 0, index);
            lowercase(scheme_);

            // When something follows the ':' and all of it is digits, the
            // input looks like "host:port"; in that case the prefix only
            // counts as a scheme if it is a known protocol.
            const bool rest_is_numeric =
                (index + 1) < url.length()
                && std::all_of(
                    url.begin() + index + 1,
                    url.end(),
                    [](char c) { return DIGIT(c); });

            if (!rest_is_numeric
                || KNOWN_PROTOCOLS.find(scheme_) != KNOWN_PROTOCOLS.end())
            {
                position = index + 1;
            }
            else
            {
                scheme_.clear();
            }
        }
    }

    // Search for the netloc. compare() only looks at the characters that are
    // actually there, so a URL ending right after a single '/' is handled
    // without reading past the end of the string.
    if (url.compare(position, 2, "//") == 0)
    {
        // Skip the '//'
        position += 2;
        index = url.find_first_of("/?#", position);
        // When no terminator is found index is npos, which makes the count
        // below cover the remainder of the string.
        host_.assign(url, position, index - position);
        position = index;

        // Extract any userinfo if there is any
        index = host_.find('@');
        if (index != std::string::npos)
        {
            userinfo_.assign(host_, 0, index);
            host_.assign(host_, index + 1, std::string::npos);
        }

        // Lowercase the hostname
        lowercase(host_);

        // Try to find a port
        index = host_.find(':');
        if (index != std::string::npos)
        {
            std::string portText(host_, index + 1, std::string::npos);
            host_.resize(index);

            if (portText.empty())
            {
                // "host:" -- treated the same as no port at all
                port_ = 0;
            }
            else
            {
                try
                {
                    port_ = std::stoi(portText, &index);

                    if (index != portText.length())
                    {
                        // Malformed port
                        throw UrlParseException("Port not a number: " + portText);
                    }

                    if (port_ > 65535)
                    {
                        throw UrlParseException("Port too high: " + portText);
                    }
                    else if (port_ < 0)
                    {
                        throw UrlParseException("Port negative: " + portText);
                    }
                }
                catch (const std::invalid_argument&)
                {
                    // Malformed port
                    throw UrlParseException("Port not a number: " + portText);
                }
                catch (const std::out_of_range&)
                {
                    throw UrlParseException("Port out of integer range: " + portText);
                }
            }
        }
    }

    // position is npos when the netloc ran to the end of the input, in which
    // case there is no path, query or fragment to extract.
    if (position != std::string::npos)
    {
        path_.assign(url, position, std::string::npos);

        // Moves everything after the first `marker` in path_ into `tail` and
        // truncates path_ at the marker. Returns whether the marker was found.
        const auto split_off = [this](char marker, std::string& tail)
        {
            const size_t at = path_.find(marker);
            if (at == std::string::npos)
            {
                return false;
            }
            tail.assign(path_, at + 1, std::string::npos);
            path_.resize(at);
            return true;
        };

        // Order matters: the fragment is removed first so that a '?' or ';'
        // inside it is not mistaken for a query or params marker.
        split_off('#', fragment_);
        has_query_ = split_off('?', query_);

        if (USES_PARAMS.find(scheme_) != USES_PARAMS.end())
        {
            has_params_ = split_off(';', params_);
        }
    }
}
|
|
|
|
// Replace the contents of this URL with those of `other`; this is a named
// alias for copy assignment and returns whatever the assignment returns.
Url& Url::assign(const Url& other)
{
    return operator=(other);
}
|
|
|
|
bool Url::operator==(const Url& other) const
|
|
{
|
|
return (
|
|
(scheme_ == other.scheme_ ) &&
|
|
(userinfo_ == other.userinfo_ ) &&
|
|
(host_ == other.host_ ) &&
|
|
(port_ == other.port_ ) &&
|
|
(path_ == other.path_ ) &&
|
|
(params_ == other.params_ ) &&
|
|
(query_ == other.query_ ) &&
|
|
(fragment_ == other.fragment_ ) &&
|
|
(has_params_ == other.has_params_) &&
|
|
(has_query_ == other.has_query_ )
|
|
);
|
|
}
|
|
|
|
bool Url::operator!=(const Url& other) const
|
|
{
|
|
return !operator==(other);
|
|
}
|
|
|
|
bool Url::equiv(const Url& other)
|
|
{
|
|
Url self_(*this);
|
|
Url other_(other);
|
|
|
|
self_.strip()
|
|
.sort_query()
|
|
.defrag()
|
|
.deuserinfo()
|
|
.abspath()
|
|
.escape()
|
|
.punycode()
|
|
.remove_default_port();
|
|
other_.strip()
|
|
.sort_query()
|
|
.defrag()
|
|
.deuserinfo()
|
|
.abspath()
|
|
.escape()
|
|
.punycode()
|
|
.remove_default_port();
|
|
return self_ == other_;
|
|
}
|
|
|
|
// Compact `str` in place so that runs of `chr` collapse to a single
// occurrence and no `chr` remains at the very beginning or the very end.
// Returns a reference to `str`.
std::string& Url::remove_repeats(std::string& str, const char chr)
{
    size_t kept = 0;

    // Starting as if a `chr` had just been seen makes leading occurrences get
    // dropped by the same rule that collapses repeats.
    bool previous_was_chr = true;

    for (const char current : str)
    {
        const bool is_chr = (current == chr);
        if (!(previous_was_chr && is_chr))
        {
            // The write position never overtakes the read position, so no
            // unread character is overwritten.
            str[kept++] = current;
        }
        previous_was_chr = is_chr;
    }

    // After collapsing at most one trailing `chr` can be left; drop it.
    if (kept > 0 && str[kept - 1] == chr)
    {
        --kept;
    }

    str.resize(kept);
    return str;
}
|
|
|
|
std::string Url::fullpath() const
|
|
{
|
|
std::string result;
|
|
if (path_.empty() || path_[0] != '/')
|
|
{
|
|
result.append(1, '/');
|
|
}
|
|
result.append(path_);
|
|
|
|
if (has_params_)
|
|
{
|
|
result.append(";");
|
|
result.append(params_);
|
|
}
|
|
|
|
if (has_query_)
|
|
{
|
|
result.append("?");
|
|
result.append(query_);
|
|
}
|
|
|
|
if (!fragment_.empty())
|
|
{
|
|
result.append("#");
|
|
result.append(fragment_);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
std::string Url::str() const
|
|
{
|
|
std::string result;
|
|
|
|
if (!scheme_.empty())
|
|
{
|
|
result.append(scheme_);
|
|
if (USES_NETLOC.find(scheme_) == USES_NETLOC.end())
|
|
{
|
|
result.append(":");
|
|
}
|
|
else
|
|
{
|
|
result.append("://");
|
|
}
|
|
}
|
|
else if (!host_.empty())
|
|
{
|
|
result.append("//");
|
|
}
|
|
|
|
if (!userinfo_.empty())
|
|
{
|
|
result.append(userinfo_);
|
|
result.append("@");
|
|
}
|
|
|
|
if (!host_.empty())
|
|
{
|
|
result.append(host_);
|
|
}
|
|
|
|
if (port_)
|
|
{
|
|
result.append(":");
|
|
result.append(std::to_string(port_));
|
|
}
|
|
|
|
if (path_.empty())
|
|
{
|
|
if (!result.empty())
|
|
{
|
|
result.append("/");
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (!host_.empty() && path_[0] != '/')
|
|
{
|
|
result.append(1, '/');
|
|
}
|
|
result.append(path_);
|
|
}
|
|
|
|
if (has_params_)
|
|
{
|
|
result.append(";");
|
|
result.append(params_);
|
|
}
|
|
|
|
if (has_query_)
|
|
{
|
|
result.append("?");
|
|
result.append(query_);
|
|
}
|
|
|
|
if (!fragment_.empty())
|
|
{
|
|
result.append("#");
|
|
result.append(fragment_);
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
// Remove superfluous separators: leading '?' characters of the query, and
// leading, trailing and repeated '&' in the query and ';' in the params.
// The cleaned values are stored through setQuery() / setParams().
Url& Url::strip()
{
    // Erase every leading '?'. When the query consists solely of '?' the
    // search yields npos and erase() removes the whole string.
    query_.erase(0, query_.find_first_not_of('?'));

    setQuery(remove_repeats(query_, '&'));
    setParams(remove_repeats(params_, ';'));
    return *this;
}
|
|
|
|
// Normalise path_ in place: empty segments are dropped, "." segments are
// removed and ".." segments remove the previously emitted segment (if any).
// The result keeps a trailing '/' when the input path ended in '/', "." or
// "..", and an empty path becomes "/".
//
// NOTE(review): for an absolute path the leading '/' is recorded as a
// removable offset (0), so a ".." directly below the root also strips the
// leading '/' from the rebuilt path (e.g. "/../a" becomes "a").
Url& Url::abspath()
{
    // True when the `length`-byte segment of path_ starting at `begin` is ".".
    const auto is_current = [this](size_t begin, size_t length)
    {
        return length == 1 && path_[begin] == '.';
    };

    // True when the `length`-byte segment of path_ starting at `begin` is "..".
    const auto is_parent = [this](size_t begin, size_t length)
    {
        return length == 2 && path_[begin] == '.' && path_[begin + 1] == '.';
    };

    std::string rebuilt;

    // For every segment emitted so far, the offset in `rebuilt` at which it
    // starts; ".." truncates `rebuilt` back to the most recent offset.
    std::vector<size_t> offsets;

    if (!path_.empty() && path_.front() == '/')
    {
        rebuilt.push_back('/');
        offsets.push_back(0);
    }

    bool directory = false;
    size_t begin = 0;

    // Every segment that is terminated by a '/'
    for (size_t slash = path_.find('/')
        ; slash != std::string::npos
        ; begin = slash + 1, slash = path_.find('/', begin))
    {
        const size_t length = slash - begin;

        // Skip empty segments
        if (length == 0)
        {
            continue;
        }

        if (is_parent(begin, length))
        {
            if (!offsets.empty())
            {
                rebuilt.resize(offsets.back());
                offsets.pop_back();
            }
            directory = true;
        }
        else if (is_current(begin, length))
        {
            directory = true;
        }
        else
        {
            offsets.push_back(rebuilt.length());
            rebuilt.append(path_, begin, length);
            rebuilt.push_back('/');
            directory = false;
        }
    }

    // The final segment: whatever follows the last '/', possibly nothing
    const size_t length = path_.length() - begin;
    if (length == 0 || is_current(begin, length))
    {
        directory = true;
    }
    else if (is_parent(begin, length))
    {
        if (!offsets.empty())
        {
            rebuilt.resize(offsets.back());
        }
        directory = true;
    }
    else
    {
        rebuilt.append(path_, begin, length);
        rebuilt.push_back('/');
        directory = false;
    }

    if (directory)
    {
        // A directory that normalised to nothing is the root
        if (rebuilt.empty())
        {
            rebuilt.push_back('/');
        }
    }
    else if (!rebuilt.empty())
    {
        // Not a directory: drop the '/' appended after the final segment
        rebuilt.pop_back();
    }

    path_.assign(rebuilt);
    return *this;
}
|
|
|
|
// Resolve this URL against the base URL `other`, modifying this URL in place.
// Components that this URL lacks are taken from `other`; a relative path is
// appended to the directory portion of the base path. The resulting path is
// not normalised -- call abspath() for that.
Url& Url::relative_to(const Url& other)
{
    // Schemes that do not support relative resolution are left untouched
    if (USES_RELATIVE.count(scheme_) == 0)
    {
        return *this;
    }

    // Scheme-relative URLs inherit the scheme of the base
    if (scheme_.empty())
    {
        scheme_ = other.scheme_;
    }

    // With a host of its own the URL is absolute (or scheme-relative) and
    // nothing else is inherited
    if (!host_.empty())
    {
        return *this;
    }

    // No host: inherit the whole authority from the base
    host_ = other.host_;
    port_ = other.port_;
    userinfo_ = other.userinfo_;

    // An absolute path replaces the base path entirely
    if (!path_.empty() && path_.front() == '/')
    {
        return *this;
    }

    if (!path_.empty())
    {
        // Relative path: prepend the directory portion of the base path
        const size_t slash = other.path_.rfind('/');
        if (slash != std::string::npos)
        {
            const std::string directory(other.path_, 0, slash + 1);
            path_.insert(0, directory);
        }
        else if (!host_.empty())
        {
            path_.insert(0, 1, '/');
        }
        return *this;
    }

    // Empty path from here on
    if (params_.empty())
    {
        // Neither path nor params: take both from the base, and the query
        // too unless this URL brought its own
        path_ = other.path_;
        params_ = other.params_;
        has_params_ = other.has_params_;
        if (query_.empty())
        {
            query_ = other.query_;
            has_query_ = other.has_query_;
        }
    }
    else
    {
        // Params but no path: keep the base directory. When the base path has
        // no '/', rfind() yields npos and npos + 1 wraps to 0, i.e. the path
        // stays empty.
        path_.assign(other.path_, 0, other.path_.rfind('/') + 1);
    }

    if (fragment_.empty())
    {
        fragment_ = other.fragment_;
    }

    return *this;
}
|
|
|
|
// Percent-escape the path, query, params and userinfo of this URL in place,
// each with the character class that is safe for that component. `strict` is
// forwarded unchanged to the per-string escape().
Url& Url::escape(bool strict)
{
    escape(path_,     PATH,     strict);
    escape(query_,    QUERY,    strict);
    // Params are escaped with the same class as the query
    escape(params_,   QUERY,    strict);
    escape(userinfo_, USERINFO, strict);
    return *this;
}
|
|
|
|
std::string& Url::escape(std::string& str, const CharacterClass& safe, bool strict)
|
|
{
|
|
std::string copy(str);
|
|
size_t dest = 0;
|
|
// Allocate space pessimistically -- if every entity is expanded, it will take 3x
|
|
// the space.
|
|
str.resize(str.length() * 3);
|
|
for (size_t src = 0; src < copy.length(); ++src)
|
|
{
|
|
if (copy[src] == '%' && (copy.length() - src) >= 2)
|
|
{
|
|
// Read ahead to see if there's a valid escape sequence. If not, treat
|
|
// this like a normal character.
|
|
if (HEX_TO_DEC[copy[src+1]] != -1 && HEX_TO_DEC[copy[src+2]] != -1)
|
|
{
|
|
int value = (
|
|
HEX_TO_DEC[copy[src+1]] * 16 + HEX_TO_DEC[copy[src+2]]);
|
|
|
|
// In strict mode, we can only unescape parameters if they are both
|
|
// safe and node reserved
|
|
if (!strict || (strict && safe(value) && !RESERVED(value)))
|
|
{
|
|
// Replace src + 2 with that byte, advance src to consume it and
|
|
// continue.
|
|
src += 2;
|
|
copy[src] = value;
|
|
}
|
|
else
|
|
{
|
|
str[dest++] = copy[src++];
|
|
str[dest++] = ::toupper(copy[src++]);
|
|
str[dest++] = ::toupper(copy[src]);
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!safe(copy[src]))
|
|
{
|
|
// Not safe -- replace with %XX
|
|
str[dest++] = '%';
|
|
str[dest++] = HEX.chars()[(copy[src] >> 4) & 0xF];
|
|
str[dest++] = HEX.chars()[copy[src] & 0xF];
|
|
}
|
|
else
|
|
{
|
|
str[dest++] = copy[src];
|
|
}
|
|
}
|
|
str.resize(dest);
|
|
return str;
|
|
}
|
|
|
|
// Decode percent-escapes in the path, query, params and userinfo of this URL,
// each in place.
Url& Url::unescape()
{
    unescape(path_);
    unescape(query_);
    unescape(params_);
    unescape(userinfo_);

    return *this;
}
|
|
|
|
std::string& Url::unescape(std::string& str)
|
|
{
|
|
std::string copy(str);
|
|
size_t dest = 0;
|
|
for (size_t src = 0; src < copy.length(); ++src, ++dest)
|
|
{
|
|
if (copy[src] == '%' && (copy.length() - src) >= 2)
|
|
{
|
|
// Read ahead to see if there's a valid escape sequence. If not, treat
|
|
// this like a normal character.
|
|
if (HEX_TO_DEC[copy[src+1]] != -1 && HEX_TO_DEC[copy[src+2]] != -1)
|
|
{
|
|
int value = (
|
|
HEX_TO_DEC[copy[src+1]] * 16 + HEX_TO_DEC[copy[src+2]]);
|
|
|
|
// Replace src + 2 with that byte, advance src to consume it and
|
|
// continue.
|
|
src += 2;
|
|
str[dest] = value;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// Either not a % or an incomplete entity
|
|
str[dest] = copy[src];
|
|
}
|
|
str.resize(dest);
|
|
return str;
|
|
}
|
|
|
|
// Remove every query ('&'-separated) and params (';'-separated) entry whose
// name, after lower-casing, is contained in `blacklist`. The values of the
// entries are not inspected.
Url& Url::deparam(const std::unordered_set<std::string>& blacklist)
{
    // Predicate is if it's present in the blacklist. The set is captured by
    // reference: the predicate is only used during this call, so there is no
    // need to copy the whole set into the closure.
    auto predicate = [&blacklist](std::string& name, const std::string& /* value */)
    {
        // Go through unsigned char; passing a negative char (a byte >= 0x80
        // where char is signed) to ::tolower is undefined behaviour.
        std::transform(name.begin(), name.end(), name.begin(),
            [](char c)
            {
                return static_cast<char>(::tolower(static_cast<unsigned char>(c)));
            });
        return blacklist.find(name) != blacklist.end();
    };

    setQuery(remove_params(query_, predicate, '&'));
    setParams(remove_params(params_, predicate, ';'));
    return *this;
}
|
|
|
|
// Remove every query ('&'-separated) and params (';'-separated) entry for
// which `predicate(name, value)` returns true.
Url& Url::deparam(const deparam_predicate& predicate)
{
    remove_params(query_, predicate, '&');
    setQuery(query_);

    remove_params(params_, predicate, ';');
    setParams(params_);

    return *this;
}
|
|
|
|
std::string& Url::remove_params(std::string& str,
|
|
const deparam_predicate& predicate,
|
|
char sep)
|
|
{
|
|
std::string copy;
|
|
std::string piece;
|
|
std::string name;
|
|
std::string value;
|
|
size_t previous = 0;
|
|
for (size_t index = str.find(sep)
|
|
; index != std::string::npos
|
|
; previous = index + 1, index = str.find(sep, previous))
|
|
{
|
|
piece.assign(str, previous, index - previous);
|
|
size_t position = piece.find('=');
|
|
name.assign(piece, 0, position);
|
|
value.clear();
|
|
if (position != std::string::npos)
|
|
{
|
|
value.assign(piece, position + 1, std::string::npos);
|
|
}
|
|
|
|
if (!predicate(name, value))
|
|
{
|
|
copy.append(copy.empty() ? 0 : 1, sep);
|
|
copy.append(piece);
|
|
}
|
|
}
|
|
|
|
if (previous < str.length())
|
|
{
|
|
piece.assign(str, previous, std::string::npos);
|
|
size_t position = piece.find('=');
|
|
name.assign(piece, 0, position);
|
|
value.clear();
|
|
if (position != std::string::npos)
|
|
{
|
|
value.assign(piece, position + 1, std::string::npos);
|
|
}
|
|
|
|
if (!predicate(name, value))
|
|
{
|
|
copy.append(copy.empty() ? 0 : 1, sep);
|
|
copy.append(piece);
|
|
}
|
|
}
|
|
|
|
str.assign(copy);
|
|
return str;
|
|
}
|
|
|
|
// Sort the '&'-separated entries of the query and the ';'-separated entries
// of the params, each in place.
Url& Url::sort_query()
{
    split_sort_join(query_,  '&');
    split_sort_join(params_, ';');

    return *this;
}
|
|
|
|
// Split `str` at every `glue`, sort the pieces and join them with `glue`
// again, all in place; returns a reference to `str`. An empty string, and a
// string that yields a single piece, are returned unchanged.
std::string& Url::split_sort_join(std::string& str, const char glue)
{
    if (str.empty())
    {
        return str;
    }

    // Split. getline() does not report an empty piece after a trailing
    // delimiter, so such a delimiter does not survive the re-join below.
    std::vector<std::string> pieces;
    {
        std::stringstream input(str);
        for (std::string piece; getline(input, piece, glue); )
        {
            pieces.push_back(piece);
        }
    }

    // Nothing to sort
    if (pieces.size() == 1)
    {
        return str;
    }

    std::sort(pieces.begin(), pieces.end());

    // Join: `glue` goes between the pieces, not after the last one
    std::string joined;
    for (size_t i = 0; i < pieces.size(); ++i)
    {
        if (i != 0)
        {
            joined += glue;
        }
        joined += pieces[i];
    }

    str.assign(joined);
    return str;
}
|
|
|
|
// Reset the port to 0 ("no explicit port") when it equals the default port
// that PORTS lists for the scheme of this URL.
Url& Url::remove_default_port()
{
    // Nothing to do without an explicit port or without a scheme
    if (port_ == 0 || scheme_.empty())
    {
        return *this;
    }

    const auto known = PORTS.find(scheme_);
    if (known != PORTS.end() && known->second == port_)
    {
        port_ = 0;
    }
    return *this;
}
|
|
|
|
// Drop the userinfo component.
Url& Url::deuserinfo()
{
    userinfo_.erase();
    return *this;
}
|
|
|
|
// Drop the fragment component.
Url& Url::defrag()
{
    fragment_.erase();
    return *this;
}
|
|
|
|
// Replace the host with the result of Punycode::encodeHostname(). The host is
// validated with check_hostname() both before and after encoding, so whatever
// check_hostname() or the encoder throws propagates to the caller; the host
// is only replaced when all three steps succeed.
Url& Url::punycode()
{
    check_hostname(host_);

    std::string encoded(Punycode::encodeHostname(host_));
    check_hostname(encoded);

    host_.assign(encoded);
    return *this;
}
|
|
|
|
// Replace the host with the result of Punycode::decodeHostname().
Url& Url::unpunycode()
{
    host_ = Punycode::decodeHostname(host_);

    return *this;
}
|
|
|
|
// Reverse the order of the '.'-separated labels of the host in place, e.g.
// "www.example.com" becomes "com.example.www".
Url& Url::host_reversed()
{
    // Reversing the whole string puts the labels in the desired order but
    // spells each of them backwards ...
    std::reverse(host_.begin(), host_.end());

    // ... so reverse every label once more to restore its spelling.
    auto label = host_.begin();
    while (label != host_.end())
    {
        const auto dot = std::find(label, host_.end(), '.');
        std::reverse(label, dot);
        if (dot == host_.end())
        {
            break;
        }
        label = dot + 1;
    }
    return *this;
}
|
|
|
|
// Validate the '.'-separated labels of `host`. An empty host is accepted as
// it is. Throws std::invalid_argument when a label is longer than 63
// characters or when a label that is followed by a '.' is empty. A single
// trailing '.' is tolerated and removed from `host`.
void Url::check_hostname(std::string& host)
{
    // Skip empty hostnames -- they are valid
    if (host.empty())
    {
        return;
    }

    size_t start = 0;

    // Every label that is terminated by a '.'
    for (size_t dot = host.find('.')
        ; dot != std::string::npos
        ; dot = host.find('.', start))
    {
        const size_t length = dot - start;
        if (length > 63)
        {
            throw std::invalid_argument("Label too long.");
        }
        if (length == 0)
        {
            throw std::invalid_argument("Empty label.");
        }
        start = dot + 1;
    }

    // The final label, i.e. whatever follows the last '.'
    const size_t length = host.size() - start;
    if (length > 63)
    {
        throw std::invalid_argument("Label too long.");
    }
    if (length == 0 && start > 1)
    {
        // The host ended in '.': remove the trailing empty label
        host.resize(start - 1);
    }
}
|
|
|
|
};
|
|
|