package water.api.dsl.util; import water.AutoBuffer; import water.Iced; import java.util.PriorityQueue; /** * Simple utility to perform reservoir sampling in order to make huge data small enough to * bring in to local memory and display/etc. * I did this in Java instead of Scala, because I wasn't sure about Serialization/transient issues in Scala. * * Not Entirely Sure about threading/synchronization in this model */ public class Reservoir extends Iced { private transient final PriorityQueue<SampleItem> minHeap = new PriorityQueue<SampleItem>(); public final int reservoirSize; //Needed For Serialization? public Reservoir() { this.reservoirSize = -1; } public Reservoir(int reservoirSize) { this.reservoirSize = reservoirSize; } @Override public water.AutoBuffer write(AutoBuffer bb) { int[] order = new int[minHeap.size()]; double[] vals = new double[minHeap.size()]; //TODO: Till we figure out nulls int x=0; for(SampleItem item : minHeap) { order[x] = item.getRandomOrder(); vals[x] = item.getValue();//==null?0:item.getValue(); //TODO: till we figure out nulls x++; } bb.put4(reservoirSize); bb.putA4(order); bb.putA8d(vals); return( bb ); } @SuppressWarnings("unchecked") @Override public water.api.dsl.util.Reservoir read(AutoBuffer bb) { int rSize = bb.get4(); int[] order = bb.getA4(); double[] vals = bb.getA8d(); Reservoir reservoir = new Reservoir(rSize); for( int x=0;x<order.length;x++) { reservoir.minHeap.add(new SampleItem(order[x], vals[x])); } return(reservoir); } public void add(double item) { add( new SampleItem(item) ); } synchronized public void add(SampleItem item) { if( item != null ) { if( minHeap.size() < reservoirSize) { minHeap.add(item); }else { SampleItem head = minHeap.peek(); //If Item is > than the lest item in the heap.. then swap them out. if( item.getRandomOrder() > head.getRandomOrder() ) { minHeap.poll(); minHeap.add(item); } } } } // synchronized public void merge( Reservoir other ) { // if( other != null ) { // for( SampleItem item : other.minHeap) { // add(item); // } // } // } synchronized public Reservoir merge( Reservoir other ) { Reservoir result = new Reservoir(this.reservoirSize); for(SampleItem item : this.minHeap ) { result.add(item); } if( other != null ) { for(SampleItem item : other.minHeap ) { result.add(item); } } return( result ); } synchronized public double[] getValues() { double[] result = new double[minHeap.size()]; int x=0; for(SampleItem item : minHeap) { result[x] = item.getValue(); x++; } return( result ); } synchronized public int getNumValues(){ return( minHeap.size() ); } // public static void main(String[] args) { // Reservoir reservoir = new Reservoir(10); // for( int x=0;x<15;x++) { // reservoir.add((double)x); // } // } }