package org.streaminer.stream.quantile;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.CopyOnWriteArrayList;

/**
 * An implementation of the Greenwald-Khanna (GK) algorithm for computing
 * epsilon-approximate quantiles over a sequence of values.
 *
 * <p>The implementation adapts the idea published by <i>Michael Greenwald</i> and
 * <i>Sanjeev Khanna</i> in <i>"Space-Efficient Online Computation of Quantile Summaries"</i>.
 * Contrary to the paper, the summary is kept in a sorted list rather than a tree.
 *
 * <p>The summary is a list of {@link Tuple}s sorted by value. The first tuple always carries
 * the smallest value seen so far, the last tuple the greatest one; these two tuples are never
 * merged away during compression.
 *
 * @author Markus Kokott, Carsten Przyluczky
 */
public class GKQuantiles implements IQuantiles<Double> {

    /**
     * Orders tuples by value using plain numeric comparison (so {@code 0.0} and {@code -0.0}
     * are ties). NaN values never enter the summary, hence the ordering is total.
     */
    private static final Comparator<Tuple> BY_VALUE = new Comparator<Tuple>() {
        @Override
        public int compare(Tuple left, Tuple right) {
            double l = left.getValue();
            double r = right.getValue();
            if (l < r) {
                return -1;
            }
            if (l > r) {
                return 1;
            }
            return 0;
        }
    };

    /** The quantile summary, sorted ascending by value. */
    private List<Tuple> summary;

    /** Smallest value offered so far (positive infinity while empty). */
    private double minimum;

    /** Greatest value offered so far (negative infinity while empty). */
    private double maximum;

    /**
     * Number of offered elements between two compressions; also the number of elements
     * that make up the initial phase. Computed as floor(1 / (2 * epsilon)), at least 1.
     */
    private int stepsUntilMerge;

    /** <code>true</code> until {@link #stepsUntilMerge} elements have been seen. */
    private boolean initialPhase;

    /** Number of elements inserted into the summary so far. */
    private int count;

    /**
     * This value specifies the error bound.
     */
    protected double epsilon;

    /**
     * Creates a new GKQuantiles object with an error bound of 0.05.
     */
    public GKQuantiles() {
        this(0.05);
    }

    /**
     * Creates a new GKQuantiles object that computes epsilon-approximate quantiles.
     *
     * @param epsilon The maximum error bound for quantile estimation; must lie strictly
     *                between 0 and 1.
     * @throws IllegalArgumentException if <code>epsilon</code> is not in (0, 1)
     */
    public GKQuantiles(double epsilon) {
        reset(epsilon);
    }

    /**
     * Sets a new error bound and discards everything that has been summarized so far.
     *
     * @param epsilon The maximum error bound for quantile estimation; must lie strictly
     *                between 0 and 1.
     * @throws IllegalArgumentException if <code>epsilon</code> is not in (0, 1)
     */
    public void setEpsilon(double epsilon) {
        reset(epsilon);
    }

    /**
     * Validates <code>epsilon</code> and (re-)initializes all state. Kept private so the
     * constructor does not have to call an overridable method.
     */
    private void reset(double epsilon) {
        // written as a negated range check so that NaN is rejected as well
        if (!(epsilon > 0 && epsilon < 1)) {
            throw new IllegalArgumentException("An appropriate epsilon value must lay between 0 and 1.");
        }

        this.epsilon = epsilon;
        // sentinels only; the first offered element overwrites both of them
        this.minimum = Double.POSITIVE_INFINITY;
        this.maximum = Double.NEGATIVE_INFINITY;
        // for epsilon > 0.5 the floor would be 0, which would make "count % stepsUntilMerge"
        // divide by zero, so the interval is clamped to at least one element
        this.stepsUntilMerge = Math.max(1, (int) Math.floor(1.0 / (2.0 * epsilon)));
        this.summary = new CopyOnWriteArrayList<Tuple>();
        this.count = 0;
        this.initialPhase = true;
    }

    /**
     * Adds a value to the summary and compresses the summary every
     * {@link #stepsUntilMerge} elements once the initial phase is over.
     *
     * <p>NaN values cannot be ordered and are ignored: they are neither stored nor counted.
     *
     * @param value the value to add; must not be <code>null</code>
     */
    @Override
    public void offer(Double value) {
        if (value.isNaN()) {
            return;
        }

        insertItem(value);
        incrementCount();

        if (!initialPhase && count % stepsUntilMerge == 0) {
            compress();
        }
    }

    /**
     * Estimates an appropriate quantile (i.e. a value that holds epsilon accuracy). Note that
     * if the query parameter doesn't lie in [0,1] <code>Double.NaN</code> is returned! The same
     * result will be returned if an empty instance of GK is queried.
     *
     * @param q the requested quantile as a <code>double</code> value in [0,1]
     * @return an estimated quantile represented by a {@link Double}. Will return
     * {@link Double#NaN} if <code>q</code> isn't between 0 and 1 (or is NaN) or this instance
     * of <code>GKQuantiles</code> is empty.
     */
    @Override
    public Double getQuantile(double q) {
        // negated range check so that a NaN query yields NaN, too
        if (count == 0 || !(q >= 0 && q <= 1)) {
            return Double.NaN;
        }

        // special cases for queries in a very early state
        if (count == 1) {
            return minimum;
        }
        if (count == 2) {
            return q < 0.5 ? minimum : maximum;
        }

        // computed in double precision; a float cannot represent counts above 2^24 exactly
        int wantedRank = (int) (q * count);
        double tolerance = epsilon * count;

        // if the wanted rank is at most epsilon * count ranks smaller than the maximum's rank,
        // the maximum will always be an appropriate estimate
        if (wantedRank > count - tolerance) {
            return maximum;
        }

        // if the wanted rank is at most epsilon * count ranks greater than the minimum's rank,
        // the minimum will always be an appropriate estimate
        if (wantedRank < tolerance) {
            return minimum;
        }

        // all reads below go through one snapshot of the summary
        Tuple[] snapshot = summary.toArray(new Tuple[0]);
        if (snapshot.length == 0) {
            return Double.NaN;
        }

        Tuple lastCandidate = snapshot[0];
        int currentMinRank = 0;

        // usually a tuple whose rank interval is within tolerance of the wanted rank is found
        // during this loop and its value is returned
        for (Tuple tuple : snapshot) {
            currentMinRank += tuple.getOffset();
            int currentMaxRank = currentMinRank + tuple.getRange();

            if (currentMaxRank - wantedRank <= tolerance) {
                lastCandidate = tuple;
                if (wantedRank - currentMinRank <= tolerance) {
                    return tuple.getValue();
                }
            }
        }

        return lastCandidate.getValue();
    }

    /**
     * Checks whether <code>item</code> is a new extreme value (i.e. minimum or maximum) or lies
     * between those values and calls the appropriate insert method. The very first element
     * becomes minimum and maximum at the same time.
     *
     * @param item {@link Double} value of current element
     */
    private void insertItem(Double item) {
        if (summary.isEmpty()) {
            minimum = item;
            maximum = item;
            summary.add(new Tuple(item, 1, 0));
            return;
        }

        if (item < minimum) {
            insertAsNewMinimum(item);
            return;
        }

        if (item >= maximum) {
            insertAsNewMaximum(item);
            return;
        }

        insertInBetween(item);
    }

    /**
     * This method will be called every time an element arrives whose value is smaller than the
     * value of the current minimum. Contrary to "normal" elements, the minimum's range has to
     * be zero.
     *
     * @param item - new element with a {@link Double} value smaller than the current minimum
     *             of the summary.
     */
    private void insertAsNewMinimum(Double item) {
        minimum = item;
        summary.add(0, new Tuple(item, 1, 0));
    }

    /**
     * This method will be called every time an element arrives whose value is greater than or
     * equal to the value of the current maximum. A strictly greater element becomes the new
     * last tuple with a range of zero. An element equal to the maximum is inserted directly in
     * front of the last tuple, so the summary stays sorted and the last tuple keeps range zero.
     *
     * @param item - new element with a {@link Double} value greater than or equal to the
     *             current maximum of the summary.
     */
    private void insertAsNewMaximum(Double item) {
        if (item == maximum) {
            int lastIndex = summary.size() - 1;
            Tuple duplicate = new Tuple(item, 1, computeRangeForNewTuple(summary.get(lastIndex)));
            summary.add(lastIndex, duplicate);
        } else {
            maximum = item;
            summary.add(new Tuple(item, 1, 0));
        }
    }

    /**
     * Inserts an element whose value lies between the current minimum (inclusive) and the
     * current maximum (exclusive). During the initial phase the element's range is zero. After
     * this phase the new element's range depends on its successor.
     *
     * @param item - a newly arrived element represented by a {@link Double} value.
     */
    private void insertInBetween(Double item) {
        for (int i = 0; i < summary.size() - 1; i++) {
            Tuple current = summary.get(i);
            Tuple next = summary.get(i + 1);

            if (item >= current.getValue() && item < next.getValue()) {
                // computeRangeForNewTuple yields 0 while GK is still in its initial phase
                summary.add(i + 1, new Tuple(item, 1, computeRangeForNewTuple(next)));
                return;
            }
        }
    }

    /**
     * Increments <code>count</code> and ends the initial phase if enough elements have been
     * seen.
     */
    private void incrementCount() {
        count++;

        if (count == stepsUntilMerge) {
            initialPhase = false;
        }
    }

    /**
     * Due to space efficiency the summary is compressed periodically. The tuples holding the
     * minimum and the maximum are never merged; with three or fewer tuples there is at most
     * one inner tuple and therefore nothing to merge.
     */
    private void compress() {
        if (summary.size() <= 3) {
            return;
        }

        List<List<Tuple>> partitions = getPartitionsOfSummary();
        int last = partitions.size() - 1;

        List<Tuple> mergedSummary = new ArrayList<Tuple>(summary.size());

        // The minimum goes first and the maximum last, so that the stable sort below keeps
        // them at the head and tail even if inner tuples carry the same value.
        mergedSummary.addAll(partitions.get(0));
        // merge tuples per partition and concatenate the resulting working sets
        for (int i = last - 1; i > 0; i--) {
            mergedSummary.addAll(mergeWorkingSet(partitions.get(i)));
        }
        mergedSummary.addAll(partitions.get(last));

        summary = sortWorkingSet(mergedSummary);
    }

    /**
     * Merges a whole partition and therefore saves space.
     *
     * @param workingSet a partition (created by {@link #getPartitionsOfSummary()}) or parts
     *                   of it
     * @return a {@link List} of {@link Tuple} containing the merged working set (not
     * necessarily sorted).
     */
    private List<Tuple> mergeWorkingSet(List<Tuple> workingSet) {
        // recursion stops here
        if (workingSet.size() < 2) {
            return workingSet;
        }

        List<Tuple> items = new ArrayList<Tuple>(workingSet);                 // random access view
        LinkedList<Tuple> mergedWorkingSet = new LinkedList<Tuple>();         // resulting working set
        LinkedList<Tuple> currentWorkingSet = new LinkedList<Tuple>();        // elements for this step
        LinkedList<Tuple> remainingWorkingSet = new LinkedList<Tuple>(items); // elements for later steps

        int bandOfChildren = computeBandOfTuple(items.get(0));
        currentWorkingSet.add(remainingWorkingSet.removeFirst());

        // we are looking for the next tuple that has a different band than the first element
        // because that element will be the limit for the first element to get merged into
        int index = 1;
        int bandOfParent = computeBandOfTuple(items.get(index));
        while (bandOfChildren == bandOfParent && index < items.size() - 1) {
            // the head of the remaining working set is always items.get(index)
            currentWorkingSet.add(remainingWorkingSet.removeFirst());
            index++;
            bandOfParent = computeBandOfTuple(items.get(index));
        }

        Tuple parent = items.get(index);

        // there is no real parent. all elements have the same band
        if (bandOfParent == bandOfChildren) {
            currentWorkingSet.add(parent);
            mergedWorkingSet.addAll(mergeSiblings(currentWorkingSet));
            return mergedWorkingSet;
        }

        // children are merged into the parent from right to left for as long as the parent's
        // remaining capacity exceeds the child's offset
        while (!currentWorkingSet.isEmpty()
                && computeCapacityOfTuple(parent) > currentWorkingSet.getLast().getOffset()) {
            merge(currentWorkingSet.removeLast(), parent);
        }

        // if there are some children left, some of them can probably be merged into siblings.
        // in that case parent can't be merged into any other tuple, so it is taken out of the
        // remaining working set. if all children were absorbed, parent stays in the remaining
        // working set and takes part in the next step of the recursion.
        if (!currentWorkingSet.isEmpty()) {
            remainingWorkingSet.remove(parent);
            mergedWorkingSet.addAll(mergeSiblings(currentWorkingSet));
            mergedWorkingSet.add(parent);
        }
        mergedWorkingSet.addAll(mergeWorkingSet(remainingWorkingSet));

        return mergedWorkingSet;
    }

    /**
     * This method merges elements that have the same band. The given list is consumed.
     *
     * @param workingSet - a {@link LinkedList} of {@link Tuple}
     * @return a {@link LinkedList} of {@link Tuple} with smallest possible size in respect to
     * GKs merging operation. The result starts with the rightmost surviving element.
     */
    private LinkedList<Tuple> mergeSiblings(LinkedList<Tuple> workingSet) {
        LinkedList<Tuple> mergedSiblings = new LinkedList<Tuple>();

        // it is only possible to merge an element into a sibling, if this sibling is the
        // element's direct neighbor to the right
        while (workingSet.size() >= 2) {
            Tuple lastSibling = workingSet.removeLast();

            // as long as the rightmost element can absorb elements, it will absorb its
            // sibling to the left
            while (!workingSet.isEmpty() && areMergeable(workingSet.getLast(), lastSibling)) {
                merge(workingSet.removeLast(), lastSibling);
            }

            mergedSiblings.add(lastSibling);
        }

        // at most one element is left over and has nothing to merge into
        mergedSiblings.addAll(workingSet);

        return mergedSiblings;
    }

    /**
     * Call this method to merge the element <code>left</code> into the element
     * <code>right</code>. Please note, that only elements with smaller value and a band not
     * greater than <code>right</code> can be element <code>left</code>. The caller is
     * responsible for removing <code>left</code> from its list.
     *
     * @param left  - element that will be deleted after merging
     * @param right - element that will contain the offset of element <code>left</code> after
     *              merging
     */
    private void merge(Tuple left, Tuple right) {
        right.setOffset(right.getOffset() + left.getOffset());
    }

    /**
     * The range of an element depends on range and offset of its succeeding element.
     * This method computes the range for a tuple that is inserted in front of
     * <code>successor</code>.
     *
     * @param successor the tuple that will directly follow the new tuple
     * @return range of the new element as {@link Integer} value; 0 during the initial phase
     */
    private Integer computeRangeForNewTuple(Tuple successor) {
        if (initialPhase) {
            return 0;
        }

        // this is the more adequate version presented at section "empirical measurements"
        int inheritedRange = successor.getRange() + successor.getOffset() - 1;
        if (inheritedRange >= 0) {
            return inheritedRange;
        }

        // this is how it's done during algorithm detail in the paper
        return (int) Math.floor(2.0 * epsilon * count);
    }

    /**
     * Partitions a snapshot of the summary into lists of {@link Tuple}, so that bands of
     * elements in a single list are monotonically increasing. The summary itself is not
     * modified. Must only be called with at least two tuples in the summary.
     *
     * @return a {@link List} of partitions: the first one contains only the minimum, the last
     * one only the maximum, and the partitions of the inner tuples lie in between, ordered
     * from the rightmost to the leftmost partition.
     */
    private List<List<Tuple>> getPartitionsOfSummary() {
        List<Tuple> snapshot = new ArrayList<Tuple>(summary);
        int lastIndex = snapshot.size() - 1;

        List<List<Tuple>> partitions = new LinkedList<List<Tuple>>();

        // assuring that the minimum and maximum won't appear in a partition with other elements
        LinkedList<Tuple> minimumPartition = new LinkedList<Tuple>();
        minimumPartition.add(snapshot.get(0));
        partitions.add(minimumPartition);

        // we process the inner tuples from the very last element to the very first one
        LinkedList<Tuple> currentPartition = new LinkedList<Tuple>();
        for (int i = lastIndex - 1; i >= 1; i--) {
            Tuple current = snapshot.get(i);
            currentPartition.addFirst(current);

            // every time the preceding inner element has a greater band than the current one,
            // the current partition ends and a new one starts
            if (i > 1 && isPartitionBorder(snapshot.get(i - 1), current)) {
                partitions.add(currentPartition);
                currentPartition = new LinkedList<Tuple>();
            }
        }
        // the partition containing the leftmost inner tuple (may be empty if there are no
        // inner tuples at all)
        partitions.add(currentPartition);

        // adding the maximum as a partition of its own at the very last position
        LinkedList<Tuple> maximumPartition = new LinkedList<Tuple>();
        maximumPartition.add(snapshot.get(lastIndex));
        partitions.add(maximumPartition);

        return partitions;
    }

    /**
     * Call this method to get the remaining capacity of an element, i.e. how much offset it
     * can still absorb: floor(2 * epsilon * count) minus the tuple's offset and range.
     *
     * @param tuple - a {@link Tuple}
     * @return {@link Integer} value representing the <code>tuple</code>'s remaining capacity
     * (may be zero or negative if the tuple is full)
     */
    private Integer computeCapacityOfTuple(Tuple tuple) {
        int currentMaxCapacity = (int) Math.floor(2.0 * epsilon * count);
        return currentMaxCapacity - tuple.getOffset() - tuple.getRange();
    }

    /**
     * A tuple's band depends on the number of seen elements (<code>count</code>) and the
     * tuple's range.
     * <ul>
     * <li> If floor(2*epsilon*count) and range are logarithmically equal (see
     *      {@link #areLogarithmicallyEqual(Double, Double)}) the tuple's band will be 0
     * <li> Else, tuples with a range of 0 (e.g. those inserted during the initial phase) are
     *      put into band -1.
     * <li> Else the tuple's band will be a value between 1 and <i>log(2*epsilon*count)</i>
     * </ul>
     * Please refer to the paper if you are interested in the formula for computing bands.
     *
     * @param tuple - a {@link Tuple}
     * @return {@link Integer} value specifying <code>tuple</code>'s band
     */
    private Integer computeBandOfTuple(Tuple tuple) {
        double p = Math.floor(2 * epsilon * count);
        int range = tuple.getRange();

        // this will be true for new tuples
        if (areLogarithmicallyEqual(p, (double) range)) {
            return 0;
        }

        // initial phase
        if (range == 0) {
            return -1;
        }

        double log2OfP = Math.log(p) / Math.log(2);
        double alpha = 0;

        while (alpha < log2OfP) {
            alpha++;

            double lowerBound = p - Math.pow(2, alpha) - (p % Math.pow(2, alpha));
            if (lowerBound <= range) {
                double upperBound = p - Math.pow(2, alpha - 1) - (p % Math.pow(2, alpha - 1));
                if (upperBound >= range) {
                    return (int) alpha;
                }
            }
        }

        return (int) alpha;
    }

    /**
     * Checks if two given values are logarithmically equal, i.e. the floored natural logarithm
     * of <code>valueOne</code> equals the floored natural logarithm of <code>valueTwo</code>.
     *
     * @param valueOne - a {@link Double} value
     * @param valueTwo - a {@link Double} value
     * @return <code>true</code> if both values are logarithmically equal
     */
    private boolean areLogarithmicallyEqual(Double valueOne, Double valueTwo) {
        return Math.floor(Math.log(valueOne)) == Math.floor(Math.log(valueTwo));
    }

    /**
     * To check whether a pair of elements are mergeable or not you should use this method. Its
     * decision takes into account the bands of the given elements and the parent's capacity.
     *
     * @param tuple  The element that will be deleted after merging.
     * @param parent The element that will absorb <code>tuple</code> during merge.
     * @return <code>true</code> if given elements are mergeable or <code>false</code> else.
     */
    private boolean areMergeable(Tuple tuple, Tuple parent) {
        // parent's capacity must suffice to absorb tuple and tuple's band must not be greater
        // than parent's
        return computeCapacityOfTuple(parent) > tuple.getOffset()
                && computeBandOfTuple(parent) >= computeBandOfTuple(tuple);
    }

    /**
     * Bands of elements in a partition are monotonically increasing from the first to the last
     * element. So a partition border is found if a preceding element has a greater band than
     * the current element. This method checks this condition for given elements.
     *
     * @param left  preceding element.
     * @param right current element.
     * @return <code>true</code> if a partition border exists between the given elements or
     * <code>false</code> else.
     */
    private boolean isPartitionBorder(Tuple left, Tuple right) {
        return computeBandOfTuple(left) > computeBandOfTuple(right);
    }

    /**
     * Sorts a list of {@link Tuple} by value. The sort is stable, i.e. tuples with equal
     * values keep their relative order. The given list is left untouched.
     *
     * @param workingSet - the tuples to sort.
     * @return a new list containing the given tuples in ascending order.
     */
    private List<Tuple> sortWorkingSet(List<Tuple> workingSet) {
        List<Tuple> sorted = new ArrayList<Tuple>(workingSet);
        Collections.sort(sorted, BY_VALUE);
        return new CopyOnWriteArrayList<Tuple>(sorted);
    }

    /**
     * @return the number of elements that have been inserted into the summary so far
     */
    public Integer getCount() {
        return this.count;
    }

    @Override
    public String toString() {
        StringBuilder s = new StringBuilder();
        s.append(getClass().getCanonicalName());
        s.append(" {");
        s.append(" epsilon=");
        s.append(epsilon);
        s.append(" }");
        return s.toString();
    }

    /**
     * This is just a wrapper class to hold all needed information of an element. It contains
     * the following information:
     * <ul>
     * <li><b>value</b>: the value of the element</li>
     * <li><b>offset</b>: the difference between the least rank of this element and the rank of
     * the preceding element.</li>
     * <li><b>range</b>: the span between this element's least and most rank</li>
     * </ul>
     *
     * Declared <code>static</code> so that a tuple does not hold a reference to the enclosing
     * {@link GKQuantiles} instance.
     */
    private static class Tuple implements Serializable {

        private static final long serialVersionUID = 1L;

        private Double value;
        private Integer offset;
        private Integer range;

        public Tuple(Double value, Integer offset, Integer range) {
            this.value = value;
            this.offset = offset;
            this.range = range;
        }

        public Double getValue() {
            return value;
        }

        public Integer getOffset() {
            return offset;
        }

        public void setOffset(Integer offset) {
            this.offset = offset;
        }

        public Integer getRange() {
            return range;
        }

        public void setRange(Integer range) {
            this.range = range;
        }

        @Override
        public String toString() {
            return "( " + value + ", " + offset + ", " + range + " )";
        }
    }
}