You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
409 lines
14 KiB
409 lines
14 KiB
#include <algorithm>
|
|
#include <string>
|
|
#include <iostream>
|
|
|
|
#include "punycode.h"
|
|
#include "utf8.h"
|
|
|
|
namespace Url
|
|
{
|
|
|
|
std::string& Punycode::encode(std::string& str)
|
|
{
|
|
// Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.3
|
|
//
|
|
// let n = initial_n
|
|
// let delta = 0
|
|
// let bias = initial_bias
|
|
punycode_uint n = INITIAL_N;
|
|
punycode_uint delta = 0;
|
|
punycode_uint bias = INITIAL_BIAS;
|
|
std::string output;
|
|
|
|
// Accumulate the non-basic codepoints
|
|
std::vector<punycode_uint> codepoints;
|
|
for (auto it = str.cbegin(); it != str.cend(); )
|
|
{
|
|
Utf8::codepoint_t value = Utf8::readCodepoint(it, str.cend());
|
|
if (value < 0x80)
|
|
{
|
|
// copy them to the output in order
|
|
output.append(1, static_cast<char>(value));
|
|
}
|
|
codepoints.push_back(value);
|
|
}
|
|
|
|
// let h = b = the number of basic code points in the input
|
|
size_t h = output.size();
|
|
size_t b = h;
|
|
|
|
// copy a delimiter if b > 0
|
|
if (b > 0)
|
|
{
|
|
output.append(1, '-');
|
|
}
|
|
|
|
// while h < length(input) do begin
|
|
while (h < codepoints.size())
|
|
{
|
|
// let m = the minimum {non-basic} code point >= n in the input
|
|
punycode_uint m = MAX_PUNYCODE_UINT;
|
|
for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
|
|
{
|
|
if ((*it >= n) && (*it < m))
|
|
{
|
|
m = *it;
|
|
}
|
|
}
|
|
|
|
// let delta = delta + (m - n) * (h + 1), fail on overflow
|
|
if ((m - n) > ((MAX_PUNYCODE_UINT - delta) / (h + 1)))
|
|
{
|
|
throw std::invalid_argument("Overflow delta update.");
|
|
}
|
|
delta += (m - n) * (h + 1);
|
|
|
|
// let n = m
|
|
n = m;
|
|
|
|
// for each code point c in the input (in order) do begin
|
|
for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
|
|
{
|
|
// if c < n {or c is basic} then increment delta, fail on overflow
|
|
if (*it < n)
|
|
{
|
|
if (delta == MAX_PUNYCODE_UINT)
|
|
{
|
|
throw std::invalid_argument("Overflow delta increment.");
|
|
}
|
|
++delta;
|
|
}
|
|
|
|
// if c == n then begin
|
|
if (*it == n)
|
|
{
|
|
// let q = delta
|
|
punycode_uint q = delta;
|
|
|
|
// for k = base to infinity in steps of base do begin
|
|
for (punycode_uint k = BASE; ; k += BASE)
|
|
{
|
|
// let t = tmin if k <= bias {+ tmin}, or
|
|
// tmax if k >= bias + tmax, or k - bias otherwise
|
|
punycode_uint t = k <= bias ? TMIN :
|
|
k >= bias + TMAX ? TMAX : k - bias;
|
|
|
|
// if q < t then break
|
|
if (q < t)
|
|
{
|
|
break;
|
|
}
|
|
|
|
// output the code point for digit t + ((q - t) mod (base - t))
|
|
output.append(1, DIGIT_TO_BASIC[t + ((q - t) % (BASE - t))]);
|
|
|
|
// let q = (q - t) div (base - t)
|
|
q = (q - t) / (BASE - t);
|
|
}
|
|
|
|
// output the code point for digit q
|
|
output.append(1, DIGIT_TO_BASIC[q]);
|
|
|
|
// let bias = adapt(delta, h + 1, test h equals b?)
|
|
bias = adapt(delta, h + 1, h == b);
|
|
|
|
// let delta = 0
|
|
delta = 0;
|
|
|
|
// increment h
|
|
++h;
|
|
|
|
}
|
|
}
|
|
|
|
// increment delta and n
|
|
++delta;
|
|
++n;
|
|
}
|
|
|
|
str.assign(output);
|
|
return str;
|
|
}
|
|
|
|
std::string Punycode::encode(const std::string& str)
|
|
{
|
|
std::string result(str);
|
|
encode(result);
|
|
return result;
|
|
}
|
|
|
|
std::string Punycode::encodeHostname(const std::string& hostname)
|
|
{
|
|
// Avoid any punycoding at all if none is needed
|
|
if (!needsPunycoding(hostname))
|
|
{
|
|
return hostname;
|
|
}
|
|
|
|
std::string encoded;
|
|
|
|
size_t start = 0;
|
|
size_t end = hostname.find('.');
|
|
while(true)
|
|
{
|
|
std::string segment = hostname.substr(start, end - start);
|
|
if (needsPunycoding(segment))
|
|
{
|
|
encoded.append("xn--");
|
|
encoded.append(Punycode::encode(segment));
|
|
}
|
|
else
|
|
{
|
|
encoded.append(segment);
|
|
}
|
|
|
|
if (end == std::string::npos)
|
|
{
|
|
break;
|
|
}
|
|
else
|
|
{
|
|
encoded.append(1, '.');
|
|
start = end + 1;
|
|
end = hostname.find('.', start);
|
|
}
|
|
}
|
|
|
|
return encoded;
|
|
}
|
|
|
|
std::string& Punycode::decode(std::string& str)
|
|
{
|
|
// Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.2
|
|
//
|
|
// let n = initial_n
|
|
// let i = 0
|
|
// let bias = initial_bias
|
|
// let output = an empty string indexed from 0
|
|
punycode_uint n = INITIAL_N;
|
|
punycode_uint i = 0;
|
|
punycode_uint bias = INITIAL_BIAS;
|
|
std::vector<punycode_uint> codepoints;
|
|
|
|
size_t index = str.rfind('-');
|
|
if (index == std::string::npos)
|
|
{
|
|
index = 0;
|
|
}
|
|
|
|
// consume all code points before the last delimiter (if there is one)
|
|
// and copy them to output, fail on any non-basic code point
|
|
for (auto it = str.begin(); it != (str.begin() + index); ++it)
|
|
{
|
|
if (static_cast<unsigned char>(*it) > 127U)
|
|
{
|
|
throw std::invalid_argument("Argument has non-basic code points.");
|
|
}
|
|
codepoints.push_back(*it);
|
|
}
|
|
|
|
// if more than zero code points were consumed then consume one more
|
|
// (which will be the last delimiter)
|
|
if (index > 0)
|
|
{
|
|
index += 1;
|
|
}
|
|
|
|
// while the input is not exhausted do begin
|
|
for (auto it = (str.begin() + index); it != str.end(); ++it)
|
|
{
|
|
// let oldi = i
|
|
// let w = 1
|
|
punycode_uint oldi = i;
|
|
punycode_uint w = 1;
|
|
|
|
// for k = base to infinity in steps of base do begin
|
|
for (punycode_uint k = BASE; ; k += BASE, ++it)
|
|
{
|
|
// consume a code point, or fail if there was none to consume
|
|
if (it == str.end())
|
|
{
|
|
throw std::invalid_argument("Premature termination");
|
|
}
|
|
|
|
// let digit = the code point's digit-value, fail if it has none
|
|
int lookup = BASIC_TO_DIGIT[static_cast<size_t>(*it)];
|
|
if (lookup == -1)
|
|
{
|
|
throw std::invalid_argument("Invalid base 36 character.");
|
|
}
|
|
unsigned char digit = static_cast<unsigned char>(lookup);
|
|
|
|
// let i = i + digit * w, fail on overflow
|
|
if (digit > ((MAX_PUNYCODE_UINT - i) / w))
|
|
{
|
|
throw std::invalid_argument("Overflow on i.");
|
|
}
|
|
i += digit * w;
|
|
|
|
// let t = tmin if k <= bias {+ tmin}, or
|
|
// tmax if k >= bias + tmax, or k - bias otherwise
|
|
punycode_uint t = k <= bias ? TMIN :
|
|
k >= bias + TMAX ? TMAX : k - bias;
|
|
|
|
// if digit < t then break
|
|
if (digit < t)
|
|
{
|
|
break;
|
|
}
|
|
|
|
// let w = w * (base - t), fail on overflow
|
|
if (w > (MAX_PUNYCODE_UINT / (BASE - t)))
|
|
{
|
|
// I believe this line is unreachable without first overflowing i.
|
|
// Since 'i' is updated above as i += digit * w, and w is updated as
|
|
// w = w * (BASE - t), we should like to keep (BASE - t) > digit to
|
|
// give 'w' a chance to overflow first. To keep t minimized, we must
|
|
// have 'bias' maximized. `bias` is driven by the 'adapt' function
|
|
// below.
|
|
//
|
|
// The value returned by 'adapt' increases with the input delta, and
|
|
// decreases with the input size. The delta is a function of the input
|
|
// size as well, on the order of (delta_n * input size), and
|
|
// legitimate delta_n values are limited to 0x10FFFF (the maximum
|
|
// unicode codepoint). Even setting that aside, the maximum value that
|
|
// adapt() can return is adapt(2 ** 32 - 1, 1, false) = 204.
|
|
//
|
|
// Using this bias, we could use the input (HERE) to get iterations:
|
|
//
|
|
// digit = b = 1, i = 2, k = 36, t = 1, w = 35
|
|
// digit = b = 1, i = 37, k = 72, t = 1, w = 1225
|
|
// digit = b = 1, i = 1262, k = 108, t = 1, w = 42875
|
|
// digit = b = 1, i = 44137, k = 144, t = 1, w = 1500625
|
|
// digit = b = 1, i = 1544762, k = 180, t = 1, w = 52521875
|
|
//
|
|
// At this point, t now becomes TMAX (26) because k exceeds the bias
|
|
// (since the maximum bias is 204). As such, the minimum continuation
|
|
// value is 26:
|
|
//
|
|
// digit = 0 = 26, i = 1367113512, k = 216, t = 26, w = 525218750
|
|
//
|
|
// However, the next iteration now overflows i before we can get to
|
|
// the w update.
|
|
throw std::invalid_argument("Overflow on w."); // LCOV_EXCL_LINE
|
|
}
|
|
w *= (BASE - t);
|
|
}
|
|
|
|
// let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?)
|
|
bias = adapt(i - oldi, codepoints.size() + 1, oldi == 0);
|
|
|
|
// let n = n + i div (length(output) + 1), fail on overflow
|
|
if ((i / (codepoints.size() + 1)) > (MAX_PUNYCODE_UINT - n))
|
|
{
|
|
throw std::invalid_argument("Overflow on n.");
|
|
}
|
|
n += i / (codepoints.size() + 1);
|
|
|
|
// let i = i mod (length(output) + 1)
|
|
i %= (codepoints.size() + 1);
|
|
|
|
// insert n into output at position i
|
|
codepoints.insert(codepoints.begin() + i, n);
|
|
|
|
// increment i
|
|
++i;
|
|
}
|
|
|
|
std::string output;
|
|
for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
|
|
{
|
|
Utf8::writeCodepoint(output, *it);
|
|
}
|
|
str.assign(output);
|
|
|
|
return str;
|
|
}
|
|
|
|
std::string Punycode::decode(const std::string& str)
|
|
{
|
|
std::string result(str);
|
|
decode(result);
|
|
return result;
|
|
}
|
|
|
|
std::string Punycode::decodeHostname(const std::string& hostname)
|
|
{
|
|
std::string unencoded;
|
|
|
|
size_t start = 0;
|
|
size_t end = hostname.find('.');
|
|
while(true)
|
|
{
|
|
std::string segment = hostname.substr(start, end - start);
|
|
if (segment.substr(0, 4).compare("xn--") == 0)
|
|
{
|
|
segment = segment.substr(4);
|
|
unencoded.append(Punycode::decode(segment));
|
|
}
|
|
else
|
|
{
|
|
unencoded.append(segment);
|
|
}
|
|
|
|
if (end == std::string::npos)
|
|
{
|
|
break;
|
|
}
|
|
else
|
|
{
|
|
unencoded.append(1, '.');
|
|
start = end + 1;
|
|
end = hostname.find('.', start);
|
|
}
|
|
}
|
|
|
|
return unencoded;
|
|
}
|
|
|
|
bool Punycode::needsPunycoding(const std::string& str)
|
|
{
|
|
return std::any_of(
|
|
str.begin(),
|
|
str.end(),
|
|
[](char i){ return static_cast<unsigned char>(i) & 0x80; });
|
|
}
|
|
|
|
Punycode::punycode_uint Punycode::adapt(
|
|
punycode_uint delta, punycode_uint numpoints, bool firsttime)
|
|
{
|
|
// Psuedocode from https://tools.ietf.org/html/rfc3492#section-6.1
|
|
//
|
|
// It does not matter whether the modifications to delta and k inside
|
|
// adapt() affect variables of the same name inside the
|
|
// encoding/decoding procedures, because after calling adapt() the
|
|
// caller does not read those variables before overwriting them.
|
|
//
|
|
// if firsttime then let delta = delta div damp
|
|
// else let delta = delta div 2
|
|
delta = firsttime ? delta / DAMP : delta >> 1;
|
|
|
|
// let delta = delta + (delta div numpoints)
|
|
delta += (delta / numpoints);
|
|
|
|
// let k = 0
|
|
punycode_uint k = 0;
|
|
|
|
// while delta > ((base - tmin) * tmax) div 2 do begin
|
|
for (; delta > ((BASE - TMIN) * TMAX) / 2; k += BASE)
|
|
{
|
|
// let delta = delta div (base - tmin)
|
|
// let k = k + base
|
|
delta /= (BASE - TMIN);
|
|
}
|
|
|
|
// return k + (((base - tmin + 1) * delta) div (delta + skew))
|
|
return k + (((BASE - TMIN + 1) * delta) / (delta + SKEW));
|
|
}
|
|
|
|
};
|
|
|