package quickml.supervised.inspection;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableRangeMap;
import com.google.common.collect.Maps;
import com.google.common.collect.Range;
import quickml.data.AttributesMap;
import quickml.data.instances.Instance;

import java.io.Serializable;
import java.util.*;

/**
 * Builds a binned histogram of one numeric attribute over a list of instances and draws
 * random values from that histogram.
 *
 * <p>On the first update the bin boundaries are derived from a sorted sample of attribute
 * values taken from the tail of the instance list. Every update then adds the bin counts
 * of the last {@code samplesToDraw} instances to the histogram and rebuilds a cumulative
 * probability map that {@link #sampleHistogram()} uses to pick a bin.
 *
 * <p>A missing ({@code null}) attribute value is counted as {@code 0}.
 *
 * Created by alexanderhawk on 11/14/14.
 */
public class NumericDistributionSampler {

    /** Bin index -> number of sampled values that fell into that bin (cumulative over all updates). */
    Map<Integer, Long> histogramOfCountsForValues = Maps.newHashMap();
    /** Sub-intervals of (0, 1] -> bin index; interval width is the bin's share of all counted samples. */
    ImmutableRangeMap<Double, Integer> attributeValueRangeMap;
    /** Shared random source used by every sampler instance. */
    public static Random rand = new Random();
    /** Total number of values counted into the histogram so far. */
    double actualSamples = 0;
    int numBins;
    double lowerBound, upperBound;
    private int samplesToDetermineBinWidths;
    /** Number of values that were collected to determine the bin boundaries. */
    int realSizeOfSplitList = 0;
    /** Bin i covers the values between element i and element i + 1 of this array. */
    double[] attributeValuesAtBinBoundaries;

    /**
     * Uses every instance to determine the bin boundaries.
     *
     * @param instances     instances to read the attribute from
     * @param samplesToDraw number of instances (taken from the end of the list) to count
     * @param attribute     name of the numeric attribute
     * @param numBins       number of histogram bins, at least 1
     */
    public NumericDistributionSampler(List<Instance<AttributesMap, Serializable>> instances, int samplesToDraw,
                                      String attribute, int numBins) {
        samplesToDetermineBinWidths = instances.size();
        updateDistributionSampler(instances, samplesToDraw, attribute, numBins);
    }

    /**
     * Uses every instance to determine the bin boundaries.
     *
     * @param instances                  instances to read the attribute from
     * @param percentageOfAllSamplesToUse multiplied by {@code instances.size()} and truncated to get
     *                                   the number of instances to count
     * @param attribute                  name of the numeric attribute
     * @param numBins                    number of histogram bins, at least 1
     */
    public NumericDistributionSampler(List<Instance<AttributesMap, Serializable>> instances,
                                      double percentageOfAllSamplesToUse, String attribute, int numBins) {
        samplesToDetermineBinWidths = instances.size();
        updateDistributionSampler(instances, percentageOfAllSamplesToUse, attribute, numBins);
    }

    /**
     * @param instances                   instances to read the attribute from
     * @param samplesToDraw               number of instances (taken from the end of the list) to count
     * @param attribute                   name of the numeric attribute
     * @param numBins                     number of histogram bins, at least 1
     * @param samplesToDetermineBinWidths number of instances used to determine the bin boundaries;
     *                                    capped at {@code instances.size()}
     */
    public NumericDistributionSampler(List<Instance<AttributesMap, Serializable>> instances, int samplesToDraw,
                                      String attribute, int numBins, int samplesToDetermineBinWidths) {
        this.samplesToDetermineBinWidths = Math.min(samplesToDetermineBinWidths, instances.size());
        updateDistributionSampler(instances, samplesToDraw, attribute, numBins);
    }

    /**
     * @param instances                   instances to read the attribute from
     * @param percentageOfAllSamplesToUse multiplied by {@code instances.size()} and truncated to get
     *                                    the number of instances to count
     * @param attribute                   name of the numeric attribute
     * @param numBins                     number of histogram bins, at least 1
     * @param samplesToDetermineBinWidths number of instances used to determine the bin boundaries;
     *                                    capped at {@code instances.size()}
     */
    public NumericDistributionSampler(List<Instance<AttributesMap, Serializable>> instances,
                                      double percentageOfAllSamplesToUse, String attribute, int numBins,
                                      int samplesToDetermineBinWidths) {
        this.samplesToDetermineBinWidths = Math.min(samplesToDetermineBinWidths, instances.size());
        updateDistributionSampler(instances, percentageOfAllSamplesToUse, attribute, numBins);
    }

    /**
     * Returns the live histogram map (bin index -> count); it is not a copy.
     */
    public Map<Integer, Long> getHistogramOfCountsForValues() {
        return histogramOfCountsForValues;
    }

    /**
     * Adds a fraction of {@code newInstances} to the histogram and rebuilds the sampling map.
     *
     * @param newInstances               instances to read the attribute from
     * @param percentageOfAllSamplesToUse multiplied by {@code newInstances.size()} and truncated to
     *                                   get the number of instances to count
     * @param attribute                  name of the numeric attribute
     * @param numBins                    number of bins; only used while the histogram is still empty
     */
    public void updateDistributionSampler(List<Instance<AttributesMap, Serializable>> newInstances,
                                          double percentageOfAllSamplesToUse, String attribute, int numBins) {
        int samplesToDraw = (int) (percentageOfAllSamplesToUse * newInstances.size());
        updateDistributionSampler(newInstances, samplesToDraw, attribute, numBins);
    }

    /**
     * Adds the last {@code samplesToDraw} of {@code newInstances} to the histogram and rebuilds
     * the sampling map.
     *
     * @param newInstances  instances to read the attribute from
     * @param samplesToDraw number of instances (taken from the end of the list) to count
     * @param attribute     name of the numeric attribute
     * @param numBins       number of bins; only used while the histogram is still empty
     * @throws IllegalArgumentException if the bin boundaries still have to be determined and
     *                                  {@code numBins < 1} or there are no instances to derive them from
     */
    public void updateDistributionSampler(List<Instance<AttributesMap, Serializable>> newInstances,
                                          int samplesToDraw, String attribute, int numBins) {
        updateHistogramOfCountsForValues(newInstances, samplesToDraw, attribute, numBins);
        createAttributeValueRangeMap();
    }

    /**
     * Rebuilds the map from cumulative-probability intervals to bin indices.
     *
     * The histogram counts and {@code actualSamples} are cumulative, so the map is rebuilt from
     * scratch on every call. Carrying the previous ranges over would produce overlapping ranges,
     * which the range-map builder rejects.
     */
    private void createAttributeValueRangeMap() {
        ImmutableRangeMap.Builder<Double, Integer> rangeBuilder = ImmutableRangeMap.builder();
        double cumulativeCount = 0;
        for (Map.Entry<Integer, Long> binCount : histogramOfCountsForValues.entrySet()) {
            // The interval for this bin starts where the previous bin's interval ended.
            double intervalStart = cumulativeCount / actualSamples;
            cumulativeCount += binCount.getValue().doubleValue();
            double intervalEnd = cumulativeCount / actualSamples;
            rangeBuilder.put(Range.openClosed(intervalStart, intervalEnd), binCount.getKey());
        }
        attributeValueRangeMap = rangeBuilder.build();
    }

    /**
     * Reads the attribute of one instance as a double; a missing value is read as 0.
     * (The original author considered using Double.MAX_VALUE for missing values instead.)
     */
    private static Number attributeValueOrZero(Instance<AttributesMap, Serializable> instance, String attribute) {
        Number val = (Number) instance.getAttributes().get(attribute);
        return val == null ? Double.valueOf(0) : val;
    }

    /**
     * Determines the bin boundaries from the last {@code samplesToDetermineBinWidths} instances.
     */
    private void initializeBinBoundaries(List<Instance<AttributesMap, Serializable>> instances, String attribute,
                                         int numBins) {
        Preconditions.checkArgument(numBins >= 1, "numBins must be at least 1 but was %s", numBins);
        if (samplesToDetermineBinWidths > instances.size()) {
            samplesToDetermineBinWidths = instances.size();
        }
        Preconditions.checkArgument(samplesToDetermineBinWidths > 0,
                "need at least one instance to determine bin boundaries but have %s", samplesToDetermineBinWidths);

        // Collect the values of the last samplesToDetermineBinWidths instances into a dense array.
        // The write position is independent of the instance index so the array cannot be overrun.
        final int lastIndex = instances.size() - 1;
        double[] splitList = new double[samplesToDetermineBinWidths];
        for (int k = 0; k < splitList.length; k++) {
            splitList[k] = attributeValueOrZero(instances.get(lastIndex - k), attribute).doubleValue();
        }
        realSizeOfSplitList = splitList.length;
        Arrays.sort(splitList, 0, realSizeOfSplitList);

        // Outer boundaries are the smallest and largest sampled value; inner boundaries are taken
        // at evenly spaced positions of the sorted sample.
        attributeValuesAtBinBoundaries = new double[numBins + 1];
        attributeValuesAtBinBoundaries[0] = splitList[0];
        attributeValuesAtBinBoundaries[numBins] = splitList[realSizeOfSplitList - 1];
        final int indexMultiplier = realSizeOfSplitList / numBins;
        for (int x = 1; x < numBins; x++) {
            // With fewer samples than bins indexMultiplier is 0; clamp so the index never goes negative.
            attributeValuesAtBinBoundaries[x] = splitList[Math.max(0, x * indexMultiplier - 1)];
        }
    }

    /**
     * Determines the bin boundaries if the histogram is still empty, then counts the last
     * {@code samplesToDraw} instances (all of them if there are fewer) into the histogram.
     */
    private void updateHistogramOfCountsForValues(List<Instance<AttributesMap, Serializable>> instances,
                                                  int samplesToDraw, String attribute, int numBins) {
        if (histogramOfCountsForValues.size() == 0) {
            initializeBinBoundaries(instances, attribute, numBins);
        }

        // Walk backwards from the end of the list. A request for more samples than there are
        // instances simply uses every instance; a non-positive request counts nothing.
        final int lastIndex = instances.size() - 1;
        final int drawCount = Math.max(0, Math.min(samplesToDraw, instances.size()));
        for (int k = 0; k < drawCount; k++) {
            Number val = attributeValueOrZero(instances.get(lastIndex - k), attribute);
            updateHistogram(val, histogramOfCountsForValues);
            actualSamples++;
        }
    }

    /**
     * Increments the count of the bin that {@code val} falls into.
     */
    private void updateHistogram(Number val, Map<Integer, Long> localHstogramOfCountsForValues) {
        Preconditions.checkState(attributeValuesAtBinBoundaries != null && attributeValuesAtBinBoundaries.length >= 1);
        int binIndex = getBinIndex(val);
        Long previousCount = localHstogramOfCountsForValues.get(binIndex);
        long updatedCount = (previousCount == null) ? 1L : previousCount.longValue() + 1L;
        localHstogramOfCountsForValues.put(binIndex, Long.valueOf(updatedCount));
    }

    /**
     * Finds the bin for a value by climbing the bin boundaries until one is not exceeded.
     *
     * @param val value to locate
     * @return index of the first bin whose upper boundary is {@code >= val}; values above every
     *         boundary are assigned to the last bin
     */
    public int getBinIndex(Number val) {
        double valDouble = val.doubleValue();
        int lastBin = attributeValuesAtBinBoundaries.length - 2;
        for (int bin = 0; bin < lastBin; bin++) {
            // attributeValuesAtBinBoundaries[bin + 1] is the top of this bin
            if (valDouble <= attributeValuesAtBinBoundaries[bin + 1]) {
                return bin;
            }
        }
        return Math.max(0, lastBin);
    }

    /**
     * Draws a uniformly distributed value between the lower and upper boundary of {@code bin}.
     */
    private double getRandomDoubleInBin(int bin) {
        double lower = attributeValuesAtBinBoundaries[bin];
        double upper = attributeValuesAtBinBoundaries[bin + 1];
        return rand.nextDouble() * (upper - lower) + lower;
    }

    /**
     * Picks a bin with probability proportional to its count and returns a random value from it.
     *
     * @return a double inside the chosen bin's boundaries
     * @throws IllegalStateException if no values have been counted yet
     */
    public Number sampleHistogram() {
        Preconditions.checkState(attributeValueRangeMap != null && !attributeValueRangeMap.asMapOfRanges().isEmpty(),
                "cannot sample: the histogram is empty");
        // nextDouble() may return exactly 0.0, which no open-closed interval contains;
        // nudge it into the first interval instead of unboxing a null.
        double draw = Math.max(rand.nextDouble(), Double.MIN_VALUE);
        int randBin = attributeValueRangeMap.get(draw);
        return getRandomDoubleInBin(randBin);
    }
}