/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.tools.rumen;

import java.io.PrintStream;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;

/**
 * {@link Histogram} represents an ordered summary of a sequence of
 * {@code long}s which can be queried to produce a discrete approximation of
 * its cumulative distribution function.
 */
class Histogram implements Iterable<Map.Entry<Long, Long>> {
  private TreeMap<Long, Long> content = new TreeMap<Long, Long>();

  private String name;

  private long totalCount;

  public Histogram() {
    this("(anonymous)");
  }

  public Histogram(String name) {
    super();

    this.name = name;

    totalCount = 0L;
  }

  public void dump(PrintStream stream) {
    stream.print("dumping Histogram " + name + ":\n");

    Iterator<Map.Entry<Long, Long>> iter = iterator();

    while (iter.hasNext()) {
      Map.Entry<Long, Long> ent = iter.next();

      stream.print("val/count pair: " + (long) ent.getKey() + ", "
          + (long) ent.getValue() + "\n");
    }

    stream.print("*** end *** \n");
  }

  @Override
  public Iterator<Map.Entry<Long, Long>> iterator() {
    return content.entrySet().iterator();
  }

  public long get(long key) {
    Long result = content.get(key);

    return result == null ? 0 : result;
  }

  public long getTotalCount() {
    return totalCount;
  }

  public void enter(long value) {
    Long existingValue = content.get(value);

    if (existingValue == null) {
      content.put(value, 1L);
    } else {
      content.put(value, existingValue + 1L);
    }

    ++totalCount;
  }
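  // A quick illustration of the counting behavior above (the values are
  // hypothetical, chosen for this sketch only): entering the same value
  // twice increments that value's count rather than storing a duplicate.
  //
  //   Histogram h = new Histogram("example");
  //   h.enter(7L);
  //   h.enter(7L);
  //   h.enter(3L);
  //   // h.get(7L) == 2, h.get(3L) == 1, h.get(8L) == 0
  //   // h.getTotalCount() == 3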
  /**
   * Produces a discrete approximation of the CDF. The caller provides the
   * points on the {@code Y} axis they want, and we give the corresponding
   * points on the {@code X} axis, plus the minimum and maximum from the data.
   *
   * @param scale
   *          the denominator applied to every element of buckets. For example,
   *          if {@code scale} is {@code 1000}, a {@code buckets} element of
   *          500 will specify the median in that output slot.
   * @param buckets
   *          an array of int, all less than scale and each strictly greater
   *          than its predecessor if any. We don't check these requirements.
   * @return a {@code long[]}, with two more elements than {@code buckets} has.
   *         The first and last elements are the minimum and maximum values,
   *         respectively, that were ever {@code enter}ed. The remaining
   *         elements correspond to the elements of {@code buckets} and carry
   *         the first element whose rank is no less than
   *         {@code #content elements * bucket / scale}.
   */
  public long[] getCDF(int scale, int[] buckets) {
    if (totalCount == 0) {
      return null;
    }

    long[] result = new long[buckets.length + 2];

    // fill in the min and the max
    result[0] = content.firstEntry().getKey();

    result[buckets.length + 1] = content.lastEntry().getKey();

    Iterator<Map.Entry<Long, Long>> iter = content.entrySet().iterator();
    long cumulativeCount = 0;
    int bucketCursor = 0;

    // Loop invariant: the item at buckets[bucketCursor] can still be reached
    // from iter, and the number of logged elements no longer available from
    // iter is cumulativeCount.
    //
    // cumulativeCount/totalCount is therefore strictly less than
    // buckets[bucketCursor]/scale.
    while (iter.hasNext()) {
      long targetCumulativeCount = buckets[bucketCursor] * totalCount / scale;

      Map.Entry<Long, Long> elt = iter.next();

      cumulativeCount += elt.getValue();

      while (cumulativeCount >= targetCumulativeCount) {
        result[bucketCursor + 1] = elt.getKey();

        ++bucketCursor;

        if (bucketCursor < buckets.length) {
          targetCumulativeCount = buckets[bucketCursor] * totalCount / scale;
        } else {
          break;
        }
      }

      if (bucketCursor == buckets.length) {
        break;
      }
    }

    return result;
  }
}
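
// A minimal, self-contained usage sketch, not part of the original Rumen
// source: the class name HistogramDemo and the sample values below are
// assumptions made for illustration. With scale = 100, the buckets 25, 50
// and 75 request the quartiles, so getCDF returns { min, q1, median, q3, max }.
class HistogramDemo {
  public static void main(String[] args) {
    Histogram h = new Histogram("demo");

    for (long v : new long[] { 3, 1, 4, 1, 5, 9, 2, 6 }) {
      h.enter(v);
    }

    long[] cdf = h.getCDF(100, new int[] { 25, 50, 75 });

    // For the eight values above this prints 1, 1, 3, 5, 9: each quartile is
    // the first recorded value whose cumulative count reaches
    // bucket * totalCount / scale.
    System.out.println("min=" + cdf[0] + ", q1=" + cdf[1] + ", median="
        + cdf[2] + ", q3=" + cdf[3] + ", max=" + cdf[4]);
  }
}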