/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package hivemall.ftvec.binning;

import hivemall.utils.lang.SizeOf;

import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;

/**
 * **THIS CLASS IS IMPORTED FROM HIVE 2.1.0 FOR COMPATIBILITY**
 *
 * A generic, re-usable histogram class that supports partial aggregations. The algorithm is a
 * heuristic adapted from the following paper: Yael Ben-Haim and Elad Tom-Tov,
 * "A streaming parallel decision tree algorithm", J. Machine Learning Research 11 (2010), pp.
 * 849--872. Although there are no approximation guarantees, it appears to work well with adequate
 * data and a large (e.g., 20-80) number of histogram bins.
 */
public final class NumericHistogram {

    /**
     * The Coord class defines a histogram bin, which is just an (x,y) pair: x is the bin's
     * centroid and y its (possibly fractional, after merging) count. Bins are ordered by x.
     */
    static final class Coord implements Comparable<Coord> {
        double x;
        double y;

        Coord() {}

        @Override
        public int compareTo(Coord other) {
            return Double.compare(x, other.x);
        }
    }

    // Class variables
    private int nbins; // target (maximum) number of bins; 0 means "not yet allocated"
    private int nusedbins; // number of bins currently populated (<= nbins, except mid-insert)
    private ArrayList<Coord> bins; // sorted by Coord.x; null until allocate()/merge()
    private Random prng; // used only to break ties when trimming

    /**
     * Creates a new histogram object. Note that the allocate() or merge() method must be called
     * before the histogram can be used.
     */
    public NumericHistogram() {
        nbins = 0;
        nusedbins = 0;
        bins = null;

        // init the RNG for breaking ties in histogram merging. A fixed seed is specified here
        // to aid testing, but can be eliminated to use a time-based seed (which would
        // make the algorithm non-deterministic).
        prng = new Random(31183);
    }

    /**
     * Resets a histogram object to its initial state. allocate() or merge() must be called again
     * before use.
     */
    public void reset() {
        bins = null;
        nbins = nusedbins = 0;
    }

    /**
     * Returns the number of bins currently being used by the histogram.
     */
    public int getUsedBins() {
        return nusedbins;
    }

    /**
     * Returns true if this histogram object has been initialized by calling merge() or allocate().
     */
    public boolean isReady() {
        return nbins != 0;
    }

    /**
     * Returns a particular histogram bin.
     */
    public Coord getBin(int b) {
        return bins.get(b);
    }

    /**
     * Sets the number of histogram bins to use for approximating data.
     *
     * @param num_bins Number of non-uniform-width histogram bins to use
     */
    public void allocate(int num_bins) {
        nbins = num_bins;
        // Presize to the known target capacity; add() may briefly hold nbins+1 entries
        // before trim(), which ArrayList absorbs without a visible behavior change.
        bins = new ArrayList<Coord>(num_bins);
        nusedbins = 0;
    }

    /**
     * Takes a serialized histogram created by the serialize() method and merges it with the
     * current histogram object.
     *
     * The serialized layout is: element 0 = nbins, followed by (x,y) pairs for each used bin.
     *
     * @param other A serialized histogram created by the serialize() method
     * @param doi object inspector used to read the double values out of {@code other}
     * @see #serialize
     */
    public void merge(List<?> other, DoubleObjectInspector doi) {
        if (other == null) {
            return;
        }

        if (nbins == 0 || nusedbins == 0) {
            // Our aggregation buffer has nothing in it, so just copy over 'other'
            // by deserializing the ArrayList of (x,y) pairs into an array of Coord objects
            nbins = (int) doi.get(other.get(0));
            nusedbins = (other.size() - 1) / 2;
            bins = new ArrayList<Coord>(nusedbins);
            for (int i = 1; i < other.size(); i += 2) {
                Coord bin = new Coord();
                bin.x = doi.get(other.get(i));
                bin.y = doi.get(other.get(i + 1));
                bins.add(bin);
            }
        } else {
            // The aggregation buffer already contains a partial histogram. Therefore, we need
            // to merge histograms using Algorithm #2 from the Ben-Haim and Tom-Tov paper.
            ArrayList<Coord> tmp_bins = new ArrayList<Coord>(nusedbins + (other.size() - 1) / 2);

            // Copy all the histogram bins from us and 'other' into an overstuffed histogram
            for (int i = 0; i < nusedbins; i++) {
                Coord bin = new Coord();
                bin.x = bins.get(i).x;
                bin.y = bins.get(i).y;
                tmp_bins.add(bin);
            }
            for (int j = 1; j < other.size(); j += 2) {
                Coord bin = new Coord();
                bin.x = doi.get(other.get(j));
                bin.y = doi.get(other.get(j + 1));
                tmp_bins.add(bin);
            }
            Collections.sort(tmp_bins);

            // Now trim the overstuffed histogram down to the correct number of bins
            bins = tmp_bins;
            nusedbins += (other.size() - 1) / 2;
            trim();
        }
    }

    /**
     * Adds a new data point to the histogram approximation. Make sure you have called either
     * allocate() or merge() first. This method implements Algorithm #1 from Ben-Haim and Tom-Tov,
     * "A Streaming Parallel Decision Tree Algorithm", JMLR 2010.
     *
     * @param v The data point to add to the histogram approximation.
     */
    public void add(double v) {
        // Binary search to find the closest bucket that v should go into.
        // 'bin' should be interpreted as the bin to shift right in order to accomodate
        // v. As a result, bin is in the range [0,N], where N means that the value v is
        // greater than all the N bins currently in the histogram. It is also possible that
        // a bucket centered at 'v' already exists, so this must be checked in the next step.
        int bin = 0;
        for (int l = 0, r = nusedbins; l < r;) {
            // unsigned-shift midpoint avoids int overflow for very large l+r
            bin = (l + r) >>> 1;
            if (bins.get(bin).x > v) {
                r = bin;
            } else {
                if (bins.get(bin).x < v) {
                    l = ++bin;
                } else {
                    break; // break loop on equal comparator
                }
            }
        }

        // If we found an exact bin match for value v, then just increment that bin's count.
        // Otherwise, we need to insert a new bin and trim the resulting histogram back to size.
        // A possible optimization here might be to set some threshold under which 'v' is just
        // assumed to be equal to the closest bin -- if fabs(v-bins[bin].x) < THRESHOLD, then
        // just increment 'bin'. This is not done now because we don't want to make any
        // assumptions about the range of numeric data being analyzed.
        if (bin < nusedbins && bins.get(bin).x == v) {
            bins.get(bin).y++;
        } else {
            Coord newBin = new Coord();
            newBin.x = v;
            newBin.y = 1;
            bins.add(bin, newBin);

            // Trim the bins down to the correct number of bins.
            if (++nusedbins > nbins) {
                trim();
            }
        }
    }

    /**
     * Trims a histogram down to 'nbins' bins by iteratively merging the closest bins. If two
     * pairs of bins are equally close to each other, decide uniformly at random which pair to
     * merge, based on a PRNG.
     */
    private void trim() {
        while (nusedbins > nbins) {
            // Find the closest pair of bins in terms of x coordinates. Break ties randomly.
            double smallestdiff = bins.get(1).x - bins.get(0).x;
            int smallestdiffloc = 0, smallestdiffcount = 1;
            for (int i = 1; i < nusedbins - 1; i++) {
                double diff = bins.get(i + 1).x - bins.get(i).x;
                if (diff < smallestdiff) {
                    smallestdiff = diff;
                    smallestdiffloc = i;
                    smallestdiffcount = 1;
                } else {
                    // reservoir-style tie break: each of the k tied pairs ends up selected
                    // with probability 1/k
                    if (diff == smallestdiff && prng.nextDouble() <= (1.0 / ++smallestdiffcount)) {
                        smallestdiffloc = i;
                    }
                }
            }

            // Merge the two closest bins into their average x location, weighted by their
            // heights. The height of the new bin is the sum of the heights of the old bins.
            // NOTE: the operation order below is kept exactly as in the Hive original so that
            // floating-point results stay bit-identical.
            double d = bins.get(smallestdiffloc).y + bins.get(smallestdiffloc + 1).y;
            Coord smallestdiffbin = bins.get(smallestdiffloc);
            smallestdiffbin.x *= smallestdiffbin.y / d;
            smallestdiffbin.x += bins.get(smallestdiffloc + 1).x / d
                    * bins.get(smallestdiffloc + 1).y;
            smallestdiffbin.y = d;

            // Shift the remaining bins left one position
            bins.remove(smallestdiffloc + 1);
            nusedbins--;
        }
    }

    /**
     * Gets an approximate quantile value from the current histogram. Some popular quantiles are
     * 0.5 (median), 0.95, and 0.98.
     *
     * @param q The requested quantile, must be strictly within the range (0,1).
     * @return The quantile value.
     */
    public double quantile(double q) {
        assert (bins != null && nusedbins > 0 && nbins > 0);
        double sum = 0, csum = 0;
        int b;
        for (b = 0; b < nusedbins; b++) {
            sum += bins.get(b).y;
        }

        for (b = 0; b < nusedbins; b++) {
            csum += bins.get(b).y;

            if (csum / sum >= q) {
                if (b == 0) {
                    return bins.get(b).x;
                }

                // linear interpolation between the previous bin's centroid and this one's,
                // proportional to where q*sum falls inside this bin's mass
                csum -= bins.get(b).y;
                double r = bins.get(b - 1).x + (q * sum - csum)
                        * (bins.get(b).x - bins.get(b - 1).x) / (bins.get(b).y);

                return r;
            }
        }

        return -1; // for Xlint, code will never reach here
    }

    /**
     * In preparation for a Hive merge() call, serializes the current histogram object into an
     * ArrayList of DoubleWritable objects. This list is deserialized and merged by the merge
     * method.
     *
     * @return An ArrayList of Hadoop DoubleWritable objects that represents the current
     *         histogram.
     * @see #merge
     */
    public ArrayList<DoubleWritable> serialize() {
        ArrayList<DoubleWritable> result = new ArrayList<DoubleWritable>();

        // Return a single ArrayList where the first element is the number of bins bins,
        // and subsequent elements represent bins (x,y) pairs.
        result.add(new DoubleWritable(nbins));
        if (bins != null) {
            for (int i = 0; i < nusedbins; i++) {
                result.add(new DoubleWritable(bins.get(i).x));
                result.add(new DoubleWritable(bins.get(i).y));
            }
        }

        return result;
    }

    /**
     * Returns the number of Coord entries currently held (0 when not yet allocated).
     */
    public int getNumBins() {
        return bins == null ? 0 : bins.size();
    }

    /**
     * Estimates the in-memory footprint (in bytes) of this histogram instance, using fixed
     * per-object overhead constants. Used for Hive memory accounting.
     */
    public int lengthFor() {
        final int sizeOfObject = 16;
        final int sizeOfPrimitive1 = 4;
        final int sizeOfPrimitive2 = 8;
        final int sizeOfArrayList = 44; // JAVA32_OBJECT + PRIMITIVES1 * 2 + JAVA32_ARRAY
        final int sizeOfLengthForRandom = sizeOfObject + sizeOfPrimitive1 + sizeOfPrimitive2
                + sizeOfObject + sizeOfPrimitive2;

        int length = sizeOfObject;
        length += SizeOf.INT * 2; // two int
        int numBins = getNumBins();
        if (numBins > 0) {
            length += sizeOfArrayList; // List<Coord>
            // Coord holds two doubles
            length += numBins * (sizeOfObject + SizeOf.DOUBLE * 2);
        }
        length += sizeOfLengthForRandom; // Random
        return length;
    }
}