You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

409 lines
14 KiB

#include <algorithm>
#include <string>
#include <iostream>
#include "punycode.h"
#include "utf8.h"
namespace Url
{
std::string& Punycode::encode(std::string& str)
{
// Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.3
//
// let n = initial_n
// let delta = 0
// let bias = initial_bias
punycode_uint n = INITIAL_N;
punycode_uint delta = 0;
punycode_uint bias = INITIAL_BIAS;
std::string output;
// Accumulate the non-basic codepoints
std::vector<punycode_uint> codepoints;
for (auto it = str.cbegin(); it != str.cend(); )
{
Utf8::codepoint_t value = Utf8::readCodepoint(it, str.cend());
if (value < 0x80)
{
// copy them to the output in order
output.append(1, static_cast<char>(value));
}
codepoints.push_back(value);
}
// let h = b = the number of basic code points in the input
size_t h = output.size();
size_t b = h;
// copy a delimiter if b > 0
if (b > 0)
{
output.append(1, '-');
}
// while h < length(input) do begin
while (h < codepoints.size())
{
// let m = the minimum {non-basic} code point >= n in the input
punycode_uint m = MAX_PUNYCODE_UINT;
for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
{
if ((*it >= n) && (*it < m))
{
m = *it;
}
}
// let delta = delta + (m - n) * (h + 1), fail on overflow
if ((m - n) > ((MAX_PUNYCODE_UINT - delta) / (h + 1)))
{
throw std::invalid_argument("Overflow delta update.");
}
delta += (m - n) * (h + 1);
// let n = m
n = m;
// for each code point c in the input (in order) do begin
for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
{
// if c < n {or c is basic} then increment delta, fail on overflow
if (*it < n)
{
if (delta == MAX_PUNYCODE_UINT)
{
throw std::invalid_argument("Overflow delta increment.");
}
++delta;
}
// if c == n then begin
if (*it == n)
{
// let q = delta
punycode_uint q = delta;
// for k = base to infinity in steps of base do begin
for (punycode_uint k = BASE; ; k += BASE)
{
// let t = tmin if k <= bias {+ tmin}, or
// tmax if k >= bias + tmax, or k - bias otherwise
punycode_uint t = k <= bias ? TMIN :
k >= bias + TMAX ? TMAX : k - bias;
// if q < t then break
if (q < t)
{
break;
}
// output the code point for digit t + ((q - t) mod (base - t))
output.append(1, DIGIT_TO_BASIC[t + ((q - t) % (BASE - t))]);
// let q = (q - t) div (base - t)
q = (q - t) / (BASE - t);
}
// output the code point for digit q
output.append(1, DIGIT_TO_BASIC[q]);
// let bias = adapt(delta, h + 1, test h equals b?)
bias = adapt(delta, h + 1, h == b);
// let delta = 0
delta = 0;
// increment h
++h;
}
}
// increment delta and n
++delta;
++n;
}
str.assign(output);
return str;
}
std::string Punycode::encode(const std::string& str)
{
std::string result(str);
encode(result);
return result;
}
std::string Punycode::encodeHostname(const std::string& hostname)
{
// Avoid any punycoding at all if none is needed
if (!needsPunycoding(hostname))
{
return hostname;
}
std::string encoded;
size_t start = 0;
size_t end = hostname.find('.');
while(true)
{
std::string segment = hostname.substr(start, end - start);
if (needsPunycoding(segment))
{
encoded.append("xn--");
encoded.append(Punycode::encode(segment));
}
else
{
encoded.append(segment);
}
if (end == std::string::npos)
{
break;
}
else
{
encoded.append(1, '.');
start = end + 1;
end = hostname.find('.', start);
}
}
return encoded;
}
std::string& Punycode::decode(std::string& str)
{
// Pseudocode copied from https://tools.ietf.org/html/rfc3492#section-6.2
//
// let n = initial_n
// let i = 0
// let bias = initial_bias
// let output = an empty string indexed from 0
punycode_uint n = INITIAL_N;
punycode_uint i = 0;
punycode_uint bias = INITIAL_BIAS;
std::vector<punycode_uint> codepoints;
size_t index = str.rfind('-');
if (index == std::string::npos)
{
index = 0;
}
// consume all code points before the last delimiter (if there is one)
// and copy them to output, fail on any non-basic code point
for (auto it = str.begin(); it != (str.begin() + index); ++it)
{
if (static_cast<unsigned char>(*it) > 127U)
{
throw std::invalid_argument("Argument has non-basic code points.");
}
codepoints.push_back(*it);
}
// if more than zero code points were consumed then consume one more
// (which will be the last delimiter)
if (index > 0)
{
index += 1;
}
// while the input is not exhausted do begin
for (auto it = (str.begin() + index); it != str.end(); ++it)
{
// let oldi = i
// let w = 1
punycode_uint oldi = i;
punycode_uint w = 1;
// for k = base to infinity in steps of base do begin
for (punycode_uint k = BASE; ; k += BASE, ++it)
{
// consume a code point, or fail if there was none to consume
if (it == str.end())
{
throw std::invalid_argument("Premature termination");
}
// let digit = the code point's digit-value, fail if it has none
int lookup = BASIC_TO_DIGIT[static_cast<size_t>(*it)];
if (lookup == -1)
{
throw std::invalid_argument("Invalid base 36 character.");
}
unsigned char digit = static_cast<unsigned char>(lookup);
// let i = i + digit * w, fail on overflow
if (digit > ((MAX_PUNYCODE_UINT - i) / w))
{
throw std::invalid_argument("Overflow on i.");
}
i += digit * w;
// let t = tmin if k <= bias {+ tmin}, or
// tmax if k >= bias + tmax, or k - bias otherwise
punycode_uint t = k <= bias ? TMIN :
k >= bias + TMAX ? TMAX : k - bias;
// if digit < t then break
if (digit < t)
{
break;
}
// let w = w * (base - t), fail on overflow
if (w > (MAX_PUNYCODE_UINT / (BASE - t)))
{
// I believe this line is unreachable without first overflowing i.
// Since 'i' is updated above as i += digit * w, and w is updated as
// w = w * (BASE - t), we should like to keep (BASE - t) > digit to
// give 'w' a chance to overflow first. To keep t minimized, we must
// have 'bias' maximized. `bias` is driven by the 'adapt' function
// below.
//
// The value returned by 'adapt' increases with the input delta, and
// decreases with the input size. The delta is a function of the input
// size as well, on the order of (delta_n * input size), and
// legitimate delta_n values are limited to 0x10FFFF (the maximum
// unicode codepoint). Even setting that aside, the maximum value that
// adapt() can return is adapt(2 ** 32 - 1, 1, false) = 204.
//
// Using this bias, we could use the input (HERE) to get iterations:
//
// digit = b = 1, i = 2, k = 36, t = 1, w = 35
// digit = b = 1, i = 37, k = 72, t = 1, w = 1225
// digit = b = 1, i = 1262, k = 108, t = 1, w = 42875
// digit = b = 1, i = 44137, k = 144, t = 1, w = 1500625
// digit = b = 1, i = 1544762, k = 180, t = 1, w = 52521875
//
// At this point, t now becomes TMAX (26) because k exceeds the bias
// (since the maximum bias is 204). As such, the minimum continuation
// value is 26:
//
// digit = 0 = 26, i = 1367113512, k = 216, t = 26, w = 525218750
//
// However, the next iteration now overflows i before we can get to
// the w update.
throw std::invalid_argument("Overflow on w."); // LCOV_EXCL_LINE
}
w *= (BASE - t);
}
// let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?)
bias = adapt(i - oldi, codepoints.size() + 1, oldi == 0);
// let n = n + i div (length(output) + 1), fail on overflow
if ((i / (codepoints.size() + 1)) > (MAX_PUNYCODE_UINT - n))
{
throw std::invalid_argument("Overflow on n.");
}
n += i / (codepoints.size() + 1);
// let i = i mod (length(output) + 1)
i %= (codepoints.size() + 1);
// insert n into output at position i
codepoints.insert(codepoints.begin() + i, n);
// increment i
++i;
}
std::string output;
for (auto it = codepoints.begin(); it != codepoints.end(); ++it)
{
Utf8::writeCodepoint(output, *it);
}
str.assign(output);
return str;
}
std::string Punycode::decode(const std::string& str)
{
std::string result(str);
decode(result);
return result;
}
std::string Punycode::decodeHostname(const std::string& hostname)
{
std::string unencoded;
size_t start = 0;
size_t end = hostname.find('.');
while(true)
{
std::string segment = hostname.substr(start, end - start);
if (segment.substr(0, 4).compare("xn--") == 0)
{
segment = segment.substr(4);
unencoded.append(Punycode::decode(segment));
}
else
{
unencoded.append(segment);
}
if (end == std::string::npos)
{
break;
}
else
{
unencoded.append(1, '.');
start = end + 1;
end = hostname.find('.', start);
}
}
return unencoded;
}
bool Punycode::needsPunycoding(const std::string& str)
{
return std::any_of(
str.begin(),
str.end(),
[](char i){ return static_cast<unsigned char>(i) & 0x80; });
}
Punycode::punycode_uint Punycode::adapt(
punycode_uint delta, punycode_uint numpoints, bool firsttime)
{
// Psuedocode from https://tools.ietf.org/html/rfc3492#section-6.1
//
// It does not matter whether the modifications to delta and k inside
// adapt() affect variables of the same name inside the
// encoding/decoding procedures, because after calling adapt() the
// caller does not read those variables before overwriting them.
//
// if firsttime then let delta = delta div damp
// else let delta = delta div 2
delta = firsttime ? delta / DAMP : delta >> 1;
// let delta = delta + (delta div numpoints)
delta += (delta / numpoints);
// let k = 0
punycode_uint k = 0;
// while delta > ((base - tmin) * tmax) div 2 do begin
for (; delta > ((BASE - TMIN) * TMAX) / 2; k += BASE)
{
// let delta = delta div (base - tmin)
// let k = k + base
delta /= (BASE - TMIN);
}
// return k + (((base - tmin + 1) * delta) div (delta + skew))
return k + (((BASE - TMIN + 1) * delta) / (delta + SKEW));
}
};