/*
 * Copyright (c) 2015 Spotify AB.
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

// THIS COMPONENT WAS ADAPTED FROM THE HADOOP PROJECT:

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file to you under the Apache
 * License, Version 2.0 (the "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the
 * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package com.spotify.heroic.aggregation.simple;

import com.spotify.heroic.aggregation.AbstractBucket;
import com.spotify.heroic.metric.Point;
import lombok.AllArgsConstructor;
import lombok.RequiredArgsConstructor;

import java.util.Arrays;
import java.util.LinkedList;
import java.util.ListIterator;
import java.util.Map;

/**
 * Implementation of the Cormode, Korn, Muthukrishnan, and Srivastava algorithm for streaming
 * calculation of targeted high-percentile epsilon-approximate quantiles.
 * <p>
 * This is a generalization of the earlier work by Greenwald and Khanna (GK): it allows
 * different error bounds on the targeted quantiles, which makes the calculation of high
 * percentiles far more efficient.
 * <p>
 * See: Cormode, Korn, Muthukrishnan, and Srivastava, "Effective Computation of Biased
 * Quantiles over Data Streams", ICDE 2005.
 * <p>
 * Greenwald and Khanna, "Space-efficient online computation of quantile summaries",
 * SIGMOD 2001.
 */
@RequiredArgsConstructor
public class QuantileBucket extends AbstractBucket {
    private final long timestamp;
    private final double quantile;
    private final double error;

    /**
     * Set of active samples.
     */
    private final LinkedList<SampleItem> samples = new LinkedList<SampleItem>();

    /**
     * Total count of samples gathered.
     */
    private long count = 0;

    /**
     * Current batch to insert and the corresponding write index.
     */
    private double[] batch = new double[500];
    private int index = 0;

    /**
     * Add a new data point from the stream.
     *
     * @param key key of the time series the point belongs to (ignored by this bucket).
     * @param d data point to add.
     */
    @Override
    public synchronized void updatePoint(Map<String, String> key, Point d) {
        batch[index] = d.getValue();
        index++;
        count++;

        if (index == batch.length) {
            compact();
        }
    }

    @Override
    public long timestamp() {
        return timestamp;
    }

    public synchronized double value() {
        if (index > 0) {
            compact();
        }

        return query(quantile);
    }

    public synchronized int getSampleSize() {
        return samples.size();
    }

    private void compact() {
        insertBatch();
        compressSamples();
    }

    /**
     * Merges items from buffer into the samples array in one pass. This is more efficient than
     * doing an insert on every item.
     */
    private void insertBatch() {
        if (index == 0) {
            return;
        }

        Arrays.sort(batch, 0, index);

        // Base case: no samples
        int start = 0;

        if (samples.isEmpty()) {
            samples.add(new SampleItem(batch[0], 0, 1));
            start++;
        }

        final ListIterator<SampleItem> it = samples.listIterator();

        SampleItem prev = it.next();

        for (int i = start; i < index; i++) {
            final double value = batch[i];

            while (it.nextIndex() < samples.size() && prev.value < value) {
                prev = it.next();
            }

            // If we found that bigger item, back up so we insert ourselves before it
            if (prev.value > value) {
                it.previous();
            }

            // We use different indexes for the edge comparisons, because of the above
            // if statement that adjusts the iterator
            final int delta = calculateDelta(it.previousIndex(), it.nextIndex());

            final SampleItem next = new SampleItem(value, delta, 1);
            it.add(next);
            prev = next;
        }

        index = 0;
    }

    /**
     * Try to remove extraneous items from the set of sampled items. This checks if an item is
     * unnecessary based on the desired error bounds, and merges it with the adjacent item if it
     * is.
     */
    private void compressSamples() {
        if (samples.size() < 2) {
            return;
        }

        final ListIterator<SampleItem> it = samples.listIterator();

        SampleItem next = it.next();

        while (it.hasNext()) {
            final SampleItem prev = next;
            next = it.next();

            if (prev.g + next.g + next.delta > allowableError(it.previousIndex())) {
                continue;
            }

            next.g += prev.g;
            // Remove prev. it.remove() kills the last thing returned.
            it.previous();
            it.previous();
            it.remove();
            // it.next() is now equal to next, skip it back forward again
            it.next();
        }
    }

    /**
     * Specifies the allowable error for this rank, depending on which quantiles are being
     * targeted.
     * <p>
     * This is the f(r_i, n) function from the CKMS paper. It's basically how wide the range of
     * this rank can be.
     *
     * @param rank the index in the list of samples
     */
    private double allowableError(int rank) {
        int size = samples.size();

        final double error = calculateError(rank, size);
        final double minError = size + 1;

        if (error < minError) {
            return error;
        }

        return minError;
    }

    private double calculateError(int rank, int size) {
        if (rank <= quantile * size) {
            return (2.0 * this.error * (size - rank)) / (1.0 - quantile);
        }

        return (2.0 * this.error * rank) / quantile;
    }
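
    // Illustrative arithmetic for the error bound above (added commentary, not from the
    // original source): with quantile = 0.99, error = 0.01 and size = 1000 samples, a rank
    // of 995 (above quantile * size = 990) gets calculateError = 2 * 0.01 * 995 / 0.99,
    // roughly 20.1, while a rank of 100 gets 2 * 0.01 * (1000 - 100) / (1 - 0.99) = 1800,
    // which allowableError then caps at size + 1 = 1001. Samples far below the targeted
    // quantile may therefore drift over a much wider rank range than samples near it.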
    private int calculateDelta(int previousIndex, int nextIndex) {
        if (previousIndex == 0 || nextIndex == samples.size()) {
            return 0;
        }

        return ((int) Math.floor(allowableError(nextIndex))) - 1;
    }

    /**
     * Get the estimated value at the specified quantile.
     *
     * @param quantile Queried quantile, e.g. 0.50 or 0.99.
     * @return Estimated value at that quantile.
     */
    private double query(double quantile) {
        if (samples.isEmpty()) {
            throw new IllegalStateException("no data in estimator");
        }

        int rankMin = 0;
        int desired = (int) (quantile * count);

        ListIterator<SampleItem> it = samples.listIterator();
        SampleItem next = it.next();

        for (int i = 1; i < samples.size(); i++) {
            final SampleItem prev = next;
            next = it.next();

            rankMin += prev.g;

            if (rankMin + next.g + next.delta > desired + (allowableError(i) / 2)) {
                return prev.value;
            }
        }

        // edge case of wanting max value
        return samples.get(samples.size() - 1).value;
    }

    @AllArgsConstructor
    private static class SampleItem {
        /**
         * Value of the sampled item.
         */
        public final double value;

        /**
         * Difference between the greatest possible rank of this item and its lowest possible
         * rank (delta in the GK/CKMS papers).
         */
        public final int delta;

        /**
         * Difference between the lowest possible rank of this item and that of its predecessor
         * (g in the GK/CKMS papers).
         */
        public int g;
    }
}
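
// A minimal usage sketch (added commentary, not part of the original source). It assumes
// that com.spotify.heroic.metric.Point exposes a (timestamp, value) constructor; the exact
// constructor may differ in the heroic codebase.
//
//     final QuantileBucket bucket = new QuantileBucket(1000L, 0.99, 0.01);
//
//     for (int i = 1; i <= 10000; i++) {
//         bucket.updatePoint(java.util.Collections.emptyMap(), new Point(1000L, i));
//     }
//
//     // Roughly 9900: the estimate stays within the configured error bound around the
//     // targeted 0.99 quantile.
//     final double p99 = bucket.value();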