mirror of https://gitlab.com/hrbrmstr/tdigest.git
boB Rudis
5 years ago
14 changed files with 779 additions and 15 deletions
@ -1,25 +1,37 @@ |
|||
Package: tdigest |
|||
Type: Package |
|||
Title: tdigest title goes here otherwise CRAN checks fail |
|||
Title: Accurate Quantiles Using 't-Digests' |
|||
Version: 0.1.0 |
|||
Date: 2019-02-15 |
|||
Authors@R: c( |
|||
person("Bob", "Rudis", email = "bob@rud.is", role = c("aut", "cre"), |
|||
comment = c(ORCID = "0000-0001-5670-2640")) |
|||
comment = c(ORCID = "0000-0001-5670-2640")), |
|||
person("Gabriel", "Pichot", role = "aut", comment = "Original C++ code") |
|||
) |
|||
Maintainer: Bob Rudis <bob@rud.is> |
|||
Description: A good description goes here otherwise CRAN checks fail. |
|||
Description: The t-digest construction algorithm uses a variant of 1-dimensional |
|||
k-means clustering to produce a very compact data structure that allows |
|||
accurate estimation of quantiles. This t-digest data structure can be used |
|||
to estimate quantiles, compute other rank statistics or even to estimate |
|||
related measures like trimmed means. The advantage of the t-digest over |
|||
previous digests for this purpose is that the t-digest handles data with |
|||
full floating point resolution. With small changes, the t-digest can handle |
|||
values from any ordered set for which we can compute something akin to a mean. |
|||
The accuracy of quantile estimates produced by t-digests can be orders of |
|||
magnitude more accurate than those produced by previous digest algorithms. |
|||
URL: https://gitlab.com/hrbrmstr/tdigest |
|||
BugReports: https://gitlab.com/hrbrmstr/tdigest/issues |
|||
SystemRequirements: C++11 |
|||
Encoding: UTF-8 |
|||
License: AGPL |
|||
License: MIT + file LICENSE |
|||
Suggests: |
|||
testthat, |
|||
covr |
|||
Depends: |
|||
R (>= 3.2.0) |
|||
Imports: |
|||
httr, |
|||
jsonlite |
|||
Rcpp |
|||
Roxygen: list(markdown = TRUE) |
|||
RoxygenNote: 6.1.1 |
|||
LinkingTo: |
|||
Rcpp |
|||
|
@ -0,0 +1,2 @@ |
|||
YEAR: 2019 |
|||
COPYRIGHT HOLDER: Bob Rudis |
@ -0,0 +1,21 @@ |
|||
# MIT License |
|||
|
|||
Copyright (c) 2019 Bob Rudis |
|||
|
|||
Permission is hereby granted, free of charge, to any person obtaining a copy |
|||
of this software and associated documentation files (the "Software"), to deal |
|||
in the Software without restriction, including without limitation the rights |
|||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
|||
copies of the Software, and to permit persons to whom the Software is |
|||
furnished to do so, subject to the following conditions: |
|||
|
|||
The above copyright notice and this permission notice shall be included in all |
|||
copies or substantial portions of the Software. |
|||
|
|||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
|||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
|||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
|||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
|||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
|||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
|||
SOFTWARE. |
@ -1,4 +1,4 @@ |
|||
# Generated by roxygen2: do not edit by hand |
|||
|
|||
import(httr) |
|||
importFrom(jsonlite,fromJSON) |
|||
importFrom(Rcpp,sourceCpp) |
|||
useDynLib(tdigest, .registration = TRUE) |
|||
|
@ -1,12 +1,23 @@ |
|||
#' ... |
|||
#' |
|||
#' Accurate Quantiles Using 't-Digests' |
|||
#' |
|||
#' The t-digest construction algorithm uses a variant of 1-dimensional |
|||
#' k-means clustering to produce a very compact data structure that allows |
|||
#' accurate estimation of quantiles. This t-digest data structure can be used |
|||
#' to estimate quantiles, compute other rank statistics or even to estimate |
|||
#' related measures like trimmed means. The advantage of the t-digest over |
|||
#' previous digests for this purpose is that the t-digest handles data with |
|||
#' full floating point resolution. With small changes, the t-digest can handle |
|||
#' values from any ordered set for which we can compute something akin to a mean. |
|||
#' The accuracy of quantile estimates produced by t-digests can be orders of |
|||
#' magnitude more accurate than those produced by previous digest algorithms. |
|||
#' |
|||
#' - URL: <https://gitlab.com/hrbrmstr/tdigest> |
|||
#' - BugReports: <https://gitlab.com/hrbrmstr/tdigest/issues> |
|||
#' |
|||
#' |
|||
#' @md |
|||
#' @name tdigest |
|||
#' @docType package |
|||
#' @author Bob Rudis (bob@@rud.is) |
|||
#' @import httr |
|||
#' @importFrom jsonlite fromJSON |
|||
NULL |
|||
#' @useDynLib tdigest, .registration = TRUE |
|||
#' @importFrom Rcpp sourceCpp |
|||
NULL |
@ -0,0 +1,3 @@ |
|||
*.o |
|||
*.so |
|||
*.dll |
@ -0,0 +1,3 @@ |
|||
CXX_STD = CXX11 |
|||
PKG_CXXFLAGS = |
|||
PKG_LIBS = |
@ -0,0 +1,256 @@ |
|||
#include "avltree.h" |
|||
|
|||
|
|||
AvlTree::AvlTree(): _root(NIL) {
  // The NIL sentinel (node id 0) must carry zeroed fields: depth 0 terminates
  // depth/balance computations, and count/aggregatedCount 0 keep subtree
  // weight sums correct whenever a child link is NIL.
  // BUG FIX: the original left _count[NIL] and _aggregatedCount[NIL]
  // uninitialized, so every aggregate derived from a NIL child (in
  // updateAggregates, floorSum, ceilSum) read indeterminate memory.
  _depth[NIL] = 0;
  _parent[NIL] = 0;
  _left[NIL] = 0;
  _right[NIL] = 0;
  _count[NIL] = 0;
  _aggregatedCount[NIL] = 0;
  _values[NIL] = 0.0;
}
|||
|
|||
int AvlTree::first(int node) const { |
|||
if(node == NIL) { |
|||
return NIL; |
|||
} |
|||
|
|||
while(true) { |
|||
const int left = leftNode(node); |
|||
if(left == NIL) { |
|||
break; |
|||
} |
|||
node = left; |
|||
} |
|||
return node; |
|||
} |
|||
|
|||
int AvlTree::last(int node) const { |
|||
while(true) { |
|||
const int right = rightNode(node); |
|||
if(right == NIL) { |
|||
break; |
|||
} |
|||
node = right; |
|||
} |
|||
return node; |
|||
} |
|||
|
|||
int AvlTree::nextNode(int node) const { |
|||
const int right = rightNode(node); |
|||
if(right != NIL) { |
|||
return first(right); |
|||
} else { |
|||
int parent = parentNode(node); |
|||
while(parent != NIL && node == rightNode(parent)) { |
|||
node = parent; |
|||
parent = parentNode(parent); |
|||
} |
|||
return parent; |
|||
} |
|||
} |
|||
|
|||
int AvlTree::prevNode(int node) const { |
|||
const int left = leftNode(node); |
|||
if(left != NIL) { |
|||
return last(left); |
|||
} else { |
|||
int parent = parentNode(node); |
|||
while(parent != NIL && node == leftNode(parent)) { |
|||
node = parent; |
|||
parent = parentNode(parent); |
|||
} |
|||
return parent; |
|||
} |
|||
} |
|||
|
|||
bool AvlTree::add(double x, int w) { |
|||
if(_root == NIL) { |
|||
_root = ++_n; |
|||
_values[_root] = x; |
|||
_count[_root] = w; |
|||
_left[_root] = NIL; |
|||
_right[_root] = NIL; |
|||
_parent[_root] = NIL; |
|||
// Update depth and aggregates
|
|||
updateAggregates(_root); |
|||
} else { |
|||
int node = _root; |
|||
int parent = NIL; |
|||
int cmp; |
|||
do { |
|||
cmp = compare(node, x); |
|||
if(cmp < 0) { |
|||
parent = node; |
|||
node = leftNode(node); |
|||
} else if (cmp > 0) { |
|||
parent = node; |
|||
node = rightNode(node); |
|||
} else { |
|||
// we merge the node
|
|||
merge(node, x, w); |
|||
return false; |
|||
} |
|||
} while(node != NIL); |
|||
|
|||
node = ++_n; |
|||
_values[node] = x; |
|||
_count[node] = w; |
|||
_left[node] = NIL; |
|||
_right[node] = NIL; |
|||
_parent[node] = parent; |
|||
if(cmp < 0) { |
|||
_left[parent] = node; |
|||
} else { |
|||
assert(cmp > 0); |
|||
_right[parent] = node; |
|||
} |
|||
|
|||
rebalance(node); |
|||
|
|||
return true; |
|||
} |
|||
} |
|||
|
|||
int AvlTree::find(double x) const { |
|||
for(int node = _root; node != NIL;) { |
|||
const int cmp = compare(node, x); |
|||
if(cmp < 0) { |
|||
node = leftNode(node); |
|||
} else if(cmp > 0) { |
|||
node = rightNode(node); |
|||
} else { |
|||
return node; |
|||
} |
|||
} |
|||
return NIL; |
|||
} |
|||
|
|||
|
|||
int AvlTree::floor(double x) const { |
|||
int f = NIL; |
|||
for(int node = _root; node != NIL; ) { |
|||
const int cmp = compare(node, x); |
|||
if(cmp <= 0) { |
|||
node = leftNode(node); |
|||
} else { |
|||
f = node; |
|||
node = rightNode(node); |
|||
} |
|||
} |
|||
return f; |
|||
} |
|||
|
|||
int AvlTree::floorSum(long sum) const { |
|||
int f = NIL; |
|||
for(int node = _root; node != NIL; ) { |
|||
const int left = leftNode(node); |
|||
const long leftCount = aggregatedCount(left); |
|||
if(leftCount <= sum) { |
|||
f = node; |
|||
sum -= leftCount + count(node); |
|||
node = rightNode(node); |
|||
} else { |
|||
node = leftNode(node); |
|||
} |
|||
} |
|||
return f; |
|||
} |
|||
|
|||
long AvlTree::ceilSum(int node) const { |
|||
const int left = leftNode(node); |
|||
long sum = aggregatedCount(left); |
|||
int n = node; |
|||
for(int p = parentNode(node); p != NIL; p = parentNode(n)) { |
|||
if(n == rightNode(p)) { |
|||
const int leftP = leftNode(p); |
|||
sum += count(p) + aggregatedCount(leftP); |
|||
} |
|||
n = p; |
|||
} |
|||
return sum; |
|||
} |
|||
|
|||
void AvlTree::rebalance(int node) { |
|||
for(int n = node; n != NIL; ) { |
|||
const int p = parentNode(n); |
|||
|
|||
updateAggregates(n); |
|||
|
|||
switch(balanceFactor(n)) { |
|||
case -2: { |
|||
const int right = rightNode(n); |
|||
if(balanceFactor(right) == 1) { |
|||
rotateRight(right); |
|||
} |
|||
rotateLeft(n); |
|||
break; |
|||
} |
|||
case 2: { |
|||
const int left = leftNode(n); |
|||
if(balanceFactor(left) == -1) { |
|||
rotateLeft(left); |
|||
} |
|||
rotateRight(n); |
|||
break; |
|||
} |
|||
case -1: |
|||
case 1: |
|||
case 0: |
|||
break; |
|||
default: |
|||
// We should throw an error
|
|||
assert(true == false); |
|||
} |
|||
|
|||
n = p; |
|||
} |
|||
} |
|||
|
|||
void AvlTree::rotateLeft(int node) { |
|||
const int r = rightNode(node); |
|||
const int lr = leftNode(r); |
|||
|
|||
_right[node] = lr; |
|||
if(lr != NIL) { |
|||
_parent[lr] = node; |
|||
} |
|||
|
|||
const int p = parentNode(node); |
|||
_parent[r] = p; |
|||
if(p == NIL) { |
|||
_root = r; |
|||
} else if(leftNode(p) == node) { |
|||
_left[p] = r; |
|||
} else { |
|||
assert(rightNode(p) == node); |
|||
_right[p] = r; |
|||
} |
|||
_left[r] = node; |
|||
_parent[node] = r; |
|||
updateAggregates(node); |
|||
updateAggregates(parentNode(node)); |
|||
} |
|||
|
|||
void AvlTree::rotateRight(int node) { |
|||
const int l = leftNode(node); |
|||
const int rl = rightNode(l); |
|||
|
|||
_left[node] = rl; |
|||
if(rl != NIL) { |
|||
_parent[rl] = node; |
|||
} |
|||
|
|||
const int p = parentNode(node); |
|||
_parent[l] = p; |
|||
if(p == NIL) { |
|||
_root = l; |
|||
} else if(rightNode(p) == node) { |
|||
_right[p] = l; |
|||
} else { |
|||
assert(leftNode(p) == node); |
|||
_left[p] = l; |
|||
} |
|||
_right[l] = node; |
|||
_parent[node] = l; |
|||
updateAggregates(node); |
|||
updateAggregates(parentNode(node)); |
|||
} |
@ -0,0 +1,257 @@ |
|||
#ifndef HEADER_AVLTREE |
|||
#define HEADER_AVLTREE |
|||
|
|||
#include <cassert> |
|||
#include <cmath> |
|||
#include <iostream> |
|||
|
|||
|
|||
using namespace std; |
|||
|
|||
|
|||
class AvlTree { |
|||
|
|||
private: |
|||
int _root; |
|||
int _n = 0; |
|||
// TODO We should reallocate tables (ie allow dynamic arrays)
|
|||
int _parent[1000]; |
|||
int _left[1000]; |
|||
int _right[1000]; |
|||
char _depth[1000]; |
|||
int _count[1000]; |
|||
double _values[1000]; |
|||
int _aggregatedCount[1000]; |
|||
|
|||
public: |
|||
static const int NIL = 0; |
|||
|
|||
AvlTree(); |
|||
|
|||
//
|
|||
// Node comparison
|
|||
//
|
|||
|
|||
// O(1)
|
|||
inline int compare(int node, double x) const { |
|||
if(value(node) < x) { |
|||
return 1; |
|||
} else if(value(node) == x) { |
|||
return 0; |
|||
} else { |
|||
return -1; |
|||
} |
|||
} |
|||
|
|||
// O(1)
|
|||
inline int compare(int nodeA, int nodeB) const { |
|||
return compare(nodeA, value(nodeB)); |
|||
} |
|||
|
|||
//
|
|||
// Tree accessors
|
|||
//
|
|||
|
|||
// O(1)
|
|||
inline int root() const { |
|||
return _root; |
|||
} |
|||
// O(1)
|
|||
inline int size() const { |
|||
return _n; |
|||
} |
|||
|
|||
//
|
|||
// Node accessors
|
|||
//
|
|||
|
|||
// O(1)
|
|||
inline int parentNode(int node) const { |
|||
return _parent[node]; |
|||
} |
|||
// O(1)
|
|||
inline int leftNode(int node) const { |
|||
return _left[node]; |
|||
} |
|||
// O(1)
|
|||
inline int rightNode(int node) const { |
|||
return _right[node]; |
|||
} |
|||
// O(1)
|
|||
inline int depth(int node) const { |
|||
return _depth[node]; |
|||
} |
|||
// O(1)
|
|||
inline int count(int node) const { |
|||
return _count[node]; |
|||
} |
|||
// O(1)
|
|||
inline int aggregatedCount(int node) const { |
|||
return _aggregatedCount[node]; |
|||
} |
|||
// O(1)
|
|||
inline double value(int node) const { |
|||
return _values[node]; |
|||
} |
|||
|
|||
//
|
|||
// Tree accessors
|
|||
//
|
|||
|
|||
// O(log(n))
|
|||
int first(int node) const; |
|||
|
|||
// O(log(n))
|
|||
inline int first() const { |
|||
return first(_root); |
|||
} |
|||
|
|||
// O(log(n))
|
|||
int last(int node) const; |
|||
|
|||
// O(log(n))
|
|||
int nextNode(int node) const; |
|||
|
|||
// O(log(n))
|
|||
int prevNode(int node) const; |
|||
|
|||
//
|
|||
// Mutators
|
|||
//
|
|||
|
|||
// O(1)
|
|||
inline void updateAggregates(int node) { |
|||
// Updating depth
|
|||
_depth[node] = 1 + max(depth(leftNode(node)), depth(rightNode(node))); |
|||
_aggregatedCount[node] = count(node) + aggregatedCount(leftNode(node)) + aggregatedCount(rightNode(node)); |
|||
} |
|||
|
|||
// O(log(n))
|
|||
void update(int node, double x, int w) { |
|||
_values[node] += w * (x - value(node)) / count(node); |
|||
_count[node] += w; |
|||
|
|||
for(int n = node; n != NIL; n = parentNode(n)) { |
|||
updateAggregates(n); |
|||
} |
|||
} |
|||
|
|||
// O(log(n))
|
|||
void merge(int node, double x, int w) { |
|||
assert(value(node) == x); |
|||
_count[node] += w; |
|||
|
|||
for(int n = node; n != NIL; n = parentNode(n)) { |
|||
updateAggregates(n); |
|||
} |
|||
} |
|||
|
|||
// O(log(n))
|
|||
bool add(double x, int w); |
|||
|
|||
// O(log(n))
|
|||
int find(double x) const; |
|||
|
|||
// O(log(n))
|
|||
int floor(double x) const; |
|||
|
|||
// O(log(n))
|
|||
int floorSum(long sum) const; |
|||
|
|||
// O(log(n))
|
|||
long ceilSum(int node) const; |
|||
|
|||
private: |
|||
// O(1)
|
|||
inline int balanceFactor(int node) const { |
|||
return depth(leftNode(node)) - depth(rightNode(node)); |
|||
} |
|||
|
|||
// (O(log(n)^2)
|
|||
void rebalance(int node); |
|||
|
|||
// O(log(n))
|
|||
void rotateLeft(int node); |
|||
|
|||
// O(log(n))
|
|||
// TODO to factor with rotateLeft
|
|||
void rotateRight(int node); |
|||
|
|||
public: |
|||
//
|
|||
// For test or debugging purposes
|
|||
//
|
|||
|
|||
// Check balance integrity
|
|||
bool checkBalance(int node) const { |
|||
if(node == NIL) { |
|||
return depth(node) == 0; |
|||
} else { |
|||
return depth(node) == 1 + max(depth(leftNode(node)), depth(rightNode(node))) |
|||
&& abs(depth(leftNode(node)) - depth(rightNode(node))) <= 1 |
|||
&& checkBalance(leftNode(node)) |
|||
&& checkBalance(rightNode(node)) |
|||
; |
|||
} |
|||
} |
|||
inline bool checkBalance() const { |
|||
return checkBalance(_root); |
|||
} |
|||
|
|||
// Check aggregates integrity
|
|||
bool checkAggregates(int node) const { |
|||
if(node == NIL) { |
|||
return count(node) == 0; |
|||
} else { |
|||
return _aggregatedCount[node] == _count[node] + _aggregatedCount[leftNode(node)] + _aggregatedCount[rightNode(node)] |
|||
&& checkAggregates(leftNode(node)) |
|||
&& checkAggregates(rightNode(node)) |
|||
; |
|||
} |
|||
} |
|||
inline bool checkAggregates() const { |
|||
return checkAggregates(_root); |
|||
} |
|||
|
|||
// Check integrity (order of nodes)
|
|||
bool checkIntegrity(int node) const { |
|||
if(node == NIL) { |
|||
return true; |
|||
} else { |
|||
bool ok = true; |
|||
if(leftNode(node) != NIL) { |
|||
ok &= _values[node] >= _values[leftNode(node)]; |
|||
ok &= checkIntegrity(leftNode(node)); |
|||
} |
|||
if(rightNode(node) != NIL) { |
|||
ok &= _values[node] <= _values[rightNode(node)]; |
|||
ok &= checkIntegrity(rightNode(node)); |
|||
} |
|||
return ok; |
|||
} |
|||
} |
|||
inline bool checkIntegrity() const { |
|||
return checkIntegrity(_root); |
|||
} |
|||
|
|||
// Print as rows
|
|||
void print(int node) const { |
|||
if(node == NIL) |
|||
return; |
|||
cout << "Node " << node << "=> "; |
|||
cout << "Value:" << _values[node] << " "; |
|||
cout << "(" << _values[leftNode(node)] << ";"; |
|||
cout << "" << _values[rightNode(node)] << ") "; |
|||
cout << "Depth: " << depth(node) << " "; |
|||
cout << "Count: " <<_count[node] << " "; |
|||
cout << "Aggregate: " << _aggregatedCount[node] << endl; |
|||
print(leftNode(node)); |
|||
print(rightNode(node)); |
|||
} |
|||
void print() const { |
|||
print(_root); |
|||
} |
|||
|
|||
}; |
|||
|
|||
#endif |
@ -0,0 +1,62 @@ |
|||
#include "tdigest.h" |
|||
|
|||
|
|||
// Re-cluster the centroids to bound their number.
// Intentionally a no-op for now: the digest currently grows unbounded.
// TODO: implement this one
void TDigest::compress() {
}
|||
|
|||
|
|||
double TDigest::quantile(double q) { |
|||
if(q < 0 || q > 1) { |
|||
return 0; // TODO
|
|||
} |
|||
|
|||
if(_centroids->size() == 0) { |
|||
return 0; // TODO
|
|||
} else if(_centroids->size() == 1) { |
|||
return _centroids->value(_centroids->first()); |
|||
} |
|||
|
|||
const double index = q * (_count - 1); |
|||
|
|||
double previousMean = NAN; |
|||
double previousIndex = 0; |
|||
int next = _centroids->floorSum(index); |
|||
assert(next != AvlTree::NIL); |
|||
long total = _centroids->ceilSum(next); |
|||
const int prev = _centroids->prevNode(next); |
|||
if(prev != AvlTree::NIL) { |
|||
previousMean = _centroids->value(prev); |
|||
previousIndex = total - (_centroids->count(prev) + 1.0) / 2; |
|||
} |
|||
|
|||
while(true) { |
|||
const double nextIndex = total + (_centroids->count(next) - 1.) / 2; |
|||
if(nextIndex >= index) { |
|||
if(previousMean == NAN) { |
|||
// Index is before first centroid
|
|||
assert(total == 0); |
|||
if(nextIndex == previousIndex) { |
|||
return _centroids->value(next); |
|||
} |
|||
// We assume a linear increase
|
|||
int next2 = _centroids->value(next); |
|||
const double nextIndex2 = total + _centroids->count(next) + (_centroids->count(next2) - 1.) / 2; |
|||
previousMean = (nextIndex2 * _centroids->value(next) - nextIndex * _centroids->value(next2)) / (nextIndex2 - nextIndex); |
|||
} |
|||
return quantile(previousIndex, index, nextIndex, previousMean, _centroids->value(next)); |
|||
|
|||
} else if(_centroids->value(next) == AvlTree::NIL) { |
|||
// Beyond last centroid
|
|||
const double nextIndex2 = _count - 1; |
|||
const double nextMean2 = (_centroids->value(next) * (nextIndex2 - previousIndex ) - previousMean * (nextIndex2 - nextIndex)) / (nextIndex - previousIndex); |
|||
return quantile(nextIndex, index, nextIndex2, _centroids->value(next), nextMean2); |
|||
} |
|||
total += _centroids->count(next); |
|||
previousMean = _centroids->value(next); |
|||
previousIndex = nextIndex; |
|||
next = _centroids->nextNode(next); |
|||
} |
|||
|
|||
} |
|||
|
@ -0,0 +1,120 @@ |
|||
#ifndef HEADER_TDIGEST |
|||
#define HEADER_TDIGEST |
|||
|
|||
#include <cfloat> |
|||
|
|||
#include "avltree.h" |
|||
|
|||
using namespace std; |
|||
|
|||
class TDigest { |
|||
|
|||
private: |
|||
double _compression = 100; |
|||
double _count = 0; |
|||
AvlTree* _centroids = new AvlTree(); |
|||
|
|||
public: |
|||
TDigest (double compression): _compression(compression) {} |
|||
|
|||
inline long size() const { |
|||
return _count; |
|||
} |
|||
|
|||
inline void add(double x) { |
|||
add(x, 1); |
|||
} |
|||
|
|||
inline void add(double x, int w) { |
|||
|
|||
int start = _centroids->floor(x); |
|||
if(start == AvlTree::NIL) { |
|||
start = _centroids->first(); |
|||
} |
|||
|
|||
if(start == AvlTree::NIL) { |
|||
assert(_centroids->size() == 0); |
|||
_centroids->add(x, w); |
|||
_count += w; |
|||
} else { |
|||
double minDistance = DBL_MAX; |
|||
int lastNeighbor = AvlTree::NIL; |
|||
for(int neighbor = start; start != AvlTree::NIL; neighbor = _centroids->nextNode(neighbor)) { |
|||
double z = abs(_centroids->value(neighbor) - x); |
|||
if(z < minDistance) { |
|||
start = neighbor; |
|||
minDistance = z; |
|||
} else { |
|||
lastNeighbor = neighbor; |
|||
break; |
|||
} |
|||
|
|||
} |
|||
|
|||
int closest = AvlTree::NIL; |
|||
long sum = _centroids->ceilSum(start); |
|||
double n = 0; |
|||
for(int neighbor = start; neighbor != lastNeighbor; neighbor = _centroids->nextNode(neighbor)) { |
|||
assert(minDistance == abs(_centroids->value(neighbor) - x)); |
|||
double q = _count == 1 |
|||
? 0.5 |
|||
: (sum + (_centroids->count(neighbor) - 1 / 2. )) / (_count - 10) |
|||
; |
|||
double k = 4 * _count * q * (1 - q) / _compression; |
|||
|
|||
if(_centroids->count(neighbor) + w <= k) { |
|||
n++; |
|||
if((float)rand() / RAND_MAX < 1 / n) { |
|||
closest = neighbor; |
|||
} |
|||
} |
|||
sum += _centroids->count(neighbor); |
|||
|
|||
} |
|||
|
|||
if(closest == AvlTree::NIL) { |
|||
_centroids->add(x, w); |
|||
} else { |
|||
_centroids->update(closest, x, w); |
|||
} |
|||
_count += w; |
|||
|
|||
if(_centroids->size() > 20 * _compression) { |
|||
cout << "Compress:" << _centroids->size() << endl; |
|||
compress(); |
|||
} |
|||
} |
|||
|
|||
} |
|||
|
|||
inline static double interpolate(double x, double a, double b) { |
|||
return (x - a) / (b - a); |
|||
} |
|||
|
|||
inline static double quantile( |
|||
double previousIndex, double index, double nextIndex, |
|||
double previousMean, double nextMean |
|||
) { |
|||
const double delta = nextIndex - previousIndex; |
|||
const double previousWeight = (nextIndex - index) / delta; |
|||
const double nextWeight = (index - previousIndex) / delta; |
|||
return previousMean * previousWeight + nextMean * nextWeight; |
|||
} |
|||
|
|||
inline AvlTree* centroids() const { |
|||
return _centroids; |
|||
} |
|||
|
|||
inline void merge(TDigest* digest) { |
|||
AvlTree* centroids = digest->centroids(); |
|||
for(int n = centroids->first(); n != AvlTree::NIL; n = centroids->nextNode(n)) { |
|||
add(centroids->value(n), centroids->count(n)); |
|||
} |
|||
} |
|||
|
|||
void compress(); |
|||
double quantile(double q); |
|||
|
|||
}; |
|||
|
|||
#endif |
Loading…
Reference in new issue