package water.api; import hex.Quantiles; import water.*; import water.util.RString; import water.util.Log; import water.fvec.*; public class QuantilesPage extends Func { static final int API_WEAVER=1; // This file has auto-gen'd doc & json fields static public DocGen.FieldDoc[] DOC_FIELDS; // Initialized from Auto-Gen code. // This Request supports the HTML 'GET' command, and this is the help text // for GET. static final String DOC_GET = "Returns a summary of a fluid-vec frame"; @API(help="An existing H2O Frame key.", required=true, filter=Default.class) public Frame source_key; @API(help="Column to calculate quantile for", required=true, filter=responseFilter.class) public Vec column; class responseFilter extends VecClassSelect { responseFilter() { super("source_key"); } } @API(help = "Quantile desired (0.0-1.0). Median is 0.5. 0 and 1 are min/max", filter = Default.class, dmin = 0, dmax = 1) public double quantile = 0.5; @API(help = "Number of bins used (1-1000000). 1000 recommended", filter = Default.class, lmin = 1, lmax = 1000000) public int max_qbins = 1000; @API(help = "1: Exact result (iterate max 16). 0: One pass approx. 2: Provide both results", filter = Default.class, lmin = 0, lmax = 2) public int multiple_pass = 1; @API(help = "Interpolation between rows. Type 2 (mean) or 7 (linear).", filter = Default.class) public int interpolation_type = 7; // this isn't used yet. column_name is // class colsFilter1 extends MultiVecSelect { public colsFilter1() { super("source_key");} } // @API(help = "Not supported yet (Select columns)", filter=colsFilter1.class) // int[] cols; @API(help = "Maximum number of columns to show quantile", filter = Default.class, lmin = 1) int max_ncols = 1000; @API(help = "Column name.") String column_name; @API(help = "Quantile requested.") double quantile_requested; @API(help = "Interpolation type used.") int interpolation_type_used; @API(help = "False if an exact result is provided, True if the answer is interpolated.") boolean interpolated; @API(help = "Number of iterations actually performed.") int iterations; @API(help = "Result.") public double result; @API(help = "Single pass Result.") double result_single; public static String link(Key k, String content) { RString rs = new RString("<a href='QuantilesPage.query?source=%$key'>"+content+"</a>"); rs.replace("key", k.toString()); return rs.toString(); } @Override protected void init() throws IllegalArgumentException { super.init(); if( source_key == null ) throw new IllegalArgumentException("Source key is missing"); if( column == null ) throw new IllegalArgumentException("Column is missing"); if( column.isEnum() ) throw new IllegalArgumentException("Column is an enum"); if(! ((interpolation_type == 2) || (interpolation_type == 7)) ) { throw new IllegalArgumentException("Unsupported interpolation type. Currently only allow 2 or 7"); } } @Override protected void execImpl() { String[] names = new String[1]; Futures fs = new Futures(); column.rollupStats(fs); fs.blockForPending(); boolean multiPass; Quantiles[] qbins; // just take one here. // it's array because summary2 might use with a single pass list // and an exec single pass approx could pass a threshold list double [] quantiles_to_do = new double[1]; quantiles_to_do[0] = quantile; double approxResult; double exactResult; result_single = Double.NaN; result = Double.NaN; boolean done = false; // approx (fully independent from the multipass) qbins = null; if ( multiple_pass == 0 || multiple_pass == 2 ) { multiPass = false; result_single = Double.NaN; if ( multiple_pass == 0) result = Double.NaN; // These are used as initial params, and setup for the next iteration // be sure to set again if multiple qbins are created double valStart = column.min(); double valEnd = column.max(); // quantile doesn't matter for the map/reduce binning qbins = new Quantiles.BinTask2(max_qbins, valStart, valEnd).doAll(column)._qbins; Log.debug("Q_ for approx. valStart: "+valStart+" valEnd: "+valEnd); // Have to get this internal state, and copy this state for the next iteration // in order to multipass // I guess forward as params to next iteration // while ( (iteration <= maxIterations) && !done ) { // valStart = newValStart; // valEnd = newValEnd; // These 3 are available for viewing, but not necessary to iterate // valRange = newValRange; // valBinSize = newValBinSize; // valLowCnt = newValLowCnt; interpolation_type_used = interpolation_type; quantile_requested = quantiles_to_do[0]; if ( qbins != null ) { // if it's enum it will be null? qbins[0].finishUp(column, quantiles_to_do, interpolation_type, multiPass); column_name = names[0]; // the string name, not the param iterations = 1; done = qbins[0]._done; approxResult = qbins[0]._pctile[0]; interpolated = qbins[0]._interpolated; } else { column_name = ""; iterations = 0; done = false; approxResult = Double.NaN; interpolated = false; } result_single = approxResult; // only the best result if we only ran the approx if ( multiple_pass == 0 ) result = approxResult; // if max_qbins is set to 2? hmm. we won't resolve if max_qbins = 1 // interesting to see how we resolve (should we disallow < 1000? (accuracy issues) but good for test) } if ( multiple_pass == 1 || multiple_pass == 2 ) { final int MAX_ITERATIONS = 16; multiPass = true; exactResult = Double.NaN; double valStart = column.min(); double valEnd = column.max(); for (int b = 0; b < MAX_ITERATIONS; b++) { // we did an approximation pass above we could reuse it for the first pass here? // quantile doesn't matter for the map/reduce binning // cleaned up things so no multipass behavior in qbins..all in finishUp:w // so can reuse the qbins from the approx pass above (if done) if ( !(multiple_pass==2 && b==0) ) { qbins = new Quantiles.BinTask2(max_qbins, valStart, valEnd).doAll(column)._qbins; } iterations = b + 1; if ( qbins == null ) break; else { qbins[0].finishUp(column, quantiles_to_do, interpolation_type, multiPass); Log.debug("\nQ_ multipass iteration: "+iterations+" valStart: "+valStart+" valEnd: "+valEnd); double valBinSize = qbins[0]._valBinSize; Log.debug("Q_ valBinSize: "+valBinSize); valStart = qbins[0]._newValStart; valEnd = qbins[0]._newValEnd; done = qbins[0]._done; if ( done ) break; } } interpolation_type_used = interpolation_type; quantile_requested = quantiles_to_do[0]; if ( qbins != null ) { // if it's enum it will be null? column_name = names[0]; // string name, not the param done = qbins[0]._done; exactResult = qbins[0]._pctile[0]; interpolated = qbins[0]._interpolated; } else { // enums must come this way. Right now we don't seem // to create everything for the normal response, if we reject an enum col. // should fix that. For now, just hack it to not look for stuff column_name = ""; iterations = 0; done = false; exactResult = Double.NaN; interpolated = false; } // all done with it qbins = null; // always the best result if we ran here result = exactResult; } } }