/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.tools.rumen;

import java.io.PrintStream;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;

/**
 * {@link Histogram} represents an ordered summary of a sequence of
 * {@code long}s which can be queried to produce a discrete approximation of
 * its cumulative distribution function.
 */
class Histogram implements Iterable<Map.Entry<Long, Long>> {
  private TreeMap<Long, Long> content = new TreeMap<Long, Long>();

  private String name;

  private long totalCount;

  public Histogram() {
    this("(anonymous)");
  }

  public Histogram(String name) {
    super();

    this.name = name;

    totalCount = 0L;
  }

  public void dump(PrintStream stream) {
    stream.print("dumping Histogram " + name + ":\n");

    Iterator<Map.Entry<Long, Long>> iter = iterator();

    while (iter.hasNext()) {
      Map.Entry<Long, Long> ent = iter.next();

      stream.print("val/count pair: " + (long) ent.getKey() + ", "
          + (long) ent.getValue() + "\n");
    }

    stream.print("*** end *** \n");
  }

  @Override
  public Iterator<Map.Entry<Long, Long>> iterator() {
    return content.entrySet().iterator();
  }

  public long get(long key) {
    Long result = content.get(key);

    return result == null ? 0 : result;
  }

  public long getTotalCount() {
    return totalCount;
  }

  public void enter(long value) {
    Long existingValue = content.get(value);

    if (existingValue == null) {
      content.put(value, 1L);
    } else {
      content.put(value, existingValue + 1L);
    }

    ++totalCount;
  }
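  // A quick illustration of the counting behavior above (the values are
  // hypothetical, chosen for this sketch only): entering the same value
  // twice increments that value's count rather than storing a duplicate.
  //
  //   Histogram h = new Histogram("example");
  //   h.enter(7L);
  //   h.enter(7L);
  //   h.enter(3L);
  //   // h.get(7L) == 2, h.get(3L) == 1, h.get(8L) == 0
  //   // h.getTotalCount() == 3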
  /**
   * Produces a discrete approximation of the CDF. The caller provides the
   * points on the {@code Y} axis they want, and we give the corresponding
   * points on the {@code X} axis, plus the minimum and maximum from the data.
   *
   * @param scale
   *          the denominator applied to every element of buckets. For example,
   *          if {@code scale} is {@code 1000}, a {@code buckets} element of
   *          500 will specify the median in that output slot.
   * @param buckets
   *          an array of int, all less than scale and each strictly greater
   *          than its predecessor if any. We don't check these requirements.
   * @return a {@code long[]}, with two more elements than {@code buckets} has.
   *         The first and last elements are the minimum and maximum values,
   *         respectively, that were ever {@code enter}ed. The remaining
   *         elements correspond to the elements of {@code buckets} and carry
   *         the first element whose rank is no less than
   *         {@code #content elements * bucket / scale}.
   */
  public long[] getCDF(int scale, int[] buckets) {
    if (totalCount == 0) {
      return null;
    }

    long[] result = new long[buckets.length + 2];

    // fill in the min and the max
    result[0] = content.firstEntry().getKey();

    result[buckets.length + 1] = content.lastEntry().getKey();

    Iterator<Map.Entry<Long, Long>> iter = content.entrySet().iterator();
    long cumulativeCount = 0;
    int bucketCursor = 0;

    // Loop invariant: the item at buckets[bucketCursor] can still be reached
    // from iter, and the number of logged elements no longer available from
    // iter is cumulativeCount.
    //
    // cumulativeCount/totalCount is therefore strictly less than
    // buckets[bucketCursor]/scale.
    while (iter.hasNext()) {
      long targetCumulativeCount = buckets[bucketCursor] * totalCount / scale;

      Map.Entry<Long, Long> elt = iter.next();

      cumulativeCount += elt.getValue();

      while (cumulativeCount >= targetCumulativeCount) {
        result[bucketCursor + 1] = elt.getKey();

        ++bucketCursor;

        if (bucketCursor < buckets.length) {
          targetCumulativeCount = buckets[bucketCursor] * totalCount / scale;
        } else {
          break;
        }
      }

      if (bucketCursor == buckets.length) {
        break;
      }
    }

    return result;
  }
}
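
// A minimal, self-contained usage sketch, not part of the original Rumen
// source: the class name HistogramDemo and the sample values below are
// assumptions made for illustration. With scale = 100, the buckets 25, 50
// and 75 request the quartiles, so getCDF returns { min, q1, median, q3, max }.
class HistogramDemo {
  public static void main(String[] args) {
    Histogram h = new Histogram("demo");

    for (long v : new long[] { 3, 1, 4, 1, 5, 9, 2, 6 }) {
      h.enter(v);
    }

    long[] cdf = h.getCDF(100, new int[] { 25, 50, 75 });

    // For the eight values above this prints 1, 1, 3, 5, 9: each quartile is
    // the first recorded value whose cumulative count reaches
    // bucket * totalCount / scale.
    System.out.println("min=" + cdf[0] + ", q1=" + cdf[1] + ", median="
        + cdf[2] + ", q3=" + cdf[3] + ", max=" + cdf[4]);
  }
}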