package mil.nga.giat.geowave.core.store.adapter.statistics; import java.nio.ByteBuffer; import mil.nga.giat.geowave.core.index.ByteArrayId; import mil.nga.giat.geowave.core.index.Mergeable; import mil.nga.giat.geowave.core.store.adapter.statistics.histogram.ByteUtils; import mil.nga.giat.geowave.core.store.adapter.statistics.histogram.MinimalBinDistanceHistogram.MinimalBinDistanceHistogramFactory; import mil.nga.giat.geowave.core.store.adapter.statistics.histogram.NumericHistogram; import mil.nga.giat.geowave.core.store.adapter.statistics.histogram.NumericHistogramFactory; import mil.nga.giat.geowave.core.store.base.DataStoreEntryInfo; import net.sf.json.JSONArray; import net.sf.json.JSONException; import net.sf.json.JSONObject; /** * Dynamic histogram provide very high accuracy for CDF and quantiles over the a * numeric attribute. * */ public class RowRangeHistogramStatistics<T> extends AbstractDataStatistics<T> { public static final ByteArrayId STATS_TYPE = new ByteArrayId( "ROW_RANGE_HISTOGRAM"); private static final NumericHistogramFactory HistFactory = new MinimalBinDistanceHistogramFactory(); NumericHistogram histogram = HistFactory.create(1024); protected RowRangeHistogramStatistics() { super(); } public RowRangeHistogramStatistics( final ByteArrayId dataAdapterId, final ByteArrayId statisticsId ) { super( dataAdapterId, composeId(statisticsId)); } public RowRangeHistogramStatistics( final ByteArrayId dataAdapterId, final ByteArrayId indexId, NumericHistogramFactory factory, int bins ) { super( dataAdapterId, composeId(indexId)); histogram = factory.create(bins); } public RowRangeHistogramStatistics( final ByteArrayId dataAdapterId, final ByteArrayId statisticsId, int bins ) { super( dataAdapterId, composeId(statisticsId)); histogram = HistFactory.create(bins); } public static ByteArrayId composeId( ByteArrayId statisticsId ) { return composeId( STATS_TYPE.getString(), statisticsId.getString()); } @Override public DataStatistics<T> duplicate() { return new RowRangeHistogramStatistics<T>( dataAdapterId, decomposeFromId(statisticsId), this.histogram.getNumBins());// indexId } public static ByteArrayId decomposeFromId( final ByteArrayId id ) { // Need to account for length of type and of the separator int lengthOfNonId = STATS_TYPE.getBytes().length + STATS_ID_SEPARATOR.length(); int idLength = id.getBytes().length - lengthOfNonId; byte[] idBytes = new byte[idLength]; System.arraycopy( id.getBytes(), lengthOfNonId, idBytes, 0, idLength); return new ByteArrayId( idBytes); } public boolean isSet() { return false; } public double cardinality( byte[] start, byte[] end ) { return this.histogram.sum( ByteUtils.toDouble(end), true) - this.histogram.sum( ByteUtils.toDouble(start), false); } public double[] quantile( final int bins ) { final double[] result = new double[bins]; final double binSize = 1.0 / bins; for (int bin = 0; bin < bins; bin++) { result[bin] = quantile(binSize * (bin + 1)); } return result; } public long[] count( final int bins ) { return histogram.count(bins); } public double cdf( final byte[] id ) { return cdf(ByteUtils.toDouble(id)); } private double cdf( double val ) { return histogram.cdf(val); } public double quantile( final double percentage ) { return histogram.quantile((percentage)); } public double percentPopulationOverRange( final byte[] start, final byte[] stop ) { return cdf(stop) - cdf(start); } public long getLeftMostCount() { return (long) Math.ceil(histogram.sum( histogram.getMinValue(), true)); } public long totalSampleSize() { return histogram.getTotalCount(); } @Override public void merge( final Mergeable mergeable ) { if (mergeable instanceof RowRangeHistogramStatistics) { histogram.merge(((RowRangeHistogramStatistics<?>) mergeable).histogram); } } @Override public byte[] toBinary() { final ByteBuffer buffer = super.binaryBuffer(histogram.bufferSize() + 5); // buffer out an histogram.toBinary(buffer); return buffer.array(); } @Override public void fromBinary( final byte[] bytes ) { final ByteBuffer buffer = super.binaryBuffer(bytes); histogram.fromBinary(buffer); } @Override public void entryIngested( final DataStoreEntryInfo entryInfo, final T entry ) { for (final ByteArrayId ids : entryInfo.getRowIds()) { final byte[] idBytes = ids.getBytes(); add(ByteUtils.toDouble(idBytes)); } } protected void add( double num ) { histogram.add( 1, num); } public String toString() { StringBuffer buffer = new StringBuffer(); buffer.append( "histogram[index=").append( super.statisticsId.getString()); buffer.append(", bins={"); for (double v : this.quantile(10)) { buffer.append(v); buffer.append(' '); } buffer.deleteCharAt(buffer.length() - 1); buffer.append(", counts={"); for (long v : this.count(10)) { buffer.append( v).append( ' '); } buffer.deleteCharAt(buffer.length() - 1); buffer.append("}]"); buffer.append("}]"); return buffer.toString(); } /** * Convert Row Range Numeric statistics to a JSON object */ public JSONObject toJSONObject() throws JSONException { JSONObject jo = new JSONObject(); jo.put( "type", STATS_TYPE.getString()); jo.put( "statisticsID", statisticsId.getString()); jo.put( "range_min", histogram.getMinValue()); jo.put( "range_max", histogram.getMaxValue()); jo.put( "totalCount", histogram.getTotalCount()); JSONArray binsArray = new JSONArray(); for (final double v : this.quantile(10)) { binsArray.add(v); } jo.put( "bins", binsArray); JSONArray countsArray = new JSONArray(); for (final long v : count(10)) { countsArray.add(v); } jo.put( "counts", countsArray); return jo; } }