package water;

import static water.util.JCodeGen.toStaticVar;
import hex.ConfusionMatrix;
import hex.VarImp;
import java.util.*;
import javassist.*;
import water.api.*;
import water.api.Request.API;
import water.fvec.*;
import water.serial.AutoBufferSerializer;
import water.util.*;
import water.util.Log.Tag.Sys;

/**
 * A Model models reality (hopefully).
 * A model can be used to 'score' a row, or a collection of rows on any
 * compatible dataset - meaning the row has all the columns with the same names
 * as used to build the mode.
 */
public abstract class Model extends Lockable<Model> {
  static final int API_WEAVER = 1; // This file has auto-gen'd doc & json fields
  static public DocGen.FieldDoc[] DOC_FIELDS; // Initialized from Auto-Gen code.

  /** Dataset key used to *build* the model, for models for which this makes
   *  sense, or null otherwise.  Not all models are built from a dataset (eg
   *  artificial models), or are built from a single dataset (various ensemble
   *  models), so this key has no *mathematical* significance in the model but
   *  is handy during common model-building and for the historical record. */
  @API(help="Datakey used to *build* the model")
  public final Key _dataKey;

  /** Columns used in the model and are used to match up with scoring data
   *  columns.  The last name is the response column name. */
  @API(help="Column names used to build the model")
  public final String _names[];

  /** Categorical/factor/enum mappings, per column.  Null for non-enum cols.
   *  The last column holds the response col enums.
   *  NOTE(review): the @API help text below duplicates the one on _names
   *  (looks like a copy-paste slip); the annotation string is part of the
   *  public REST schema so it is left untouched here — confirm before fixing. */
  @API(help="Column names used to build the model")
  public final String _domains[][];

  @API(help = "Relative class distribution factors in original data")
  public final float[] _priorClassDist;
  @API(help = "Relative class distribution factors used for model building")
  protected float[] _modelClassDist;

  // WARNING: be really careful to modify this POJO because
  // modification does not involve update in DKV
  public void setModelClassDistribution(float[] classdist) {
    _modelClassDist = classdist.clone();
  }

  // Stable identifier for this model instance; also emitted into generated
  // Java scoring code (see toJavaSuper).
  private final UniqueId uniqueId;

  /** The start time in mS since the epoch for model training. */
  public long training_start_time = 0L;
  /** The duration in mS for model training. */
  public long training_duration_in_ms = 0L;

  /** Any warnings thrown during model building. */
  @API(help="warnings")
  public String[] warnings = new String[0];

  /** Whether or not this model has cross-validated results stored. */
  protected boolean _have_cv_results;

  /** Full constructor from frame: Strips out the Vecs to just the names needed
   *  to match columns later for future datasets. */
  public Model( Key selfKey, Key dataKey, Frame fr, float[] priorClassDist ) {
    this(selfKey,dataKey,fr.names(),fr.domains(), priorClassDist, null, 0, 0);
  }

  /** Constructor without model class distribution or training-time bookkeeping. */
  public Model( Key selfKey, Key dataKey, String names[], String domains[][], float[] priorClassDist, float[] modelClassDist) {
    this(selfKey,dataKey,names,domains,priorClassDist,modelClassDist,0,0);
  }

  /** Full constructor.
   *  @param selfKey key under which this model lives in the DKV
   *  @param dataKey key of the frame the model was built from (may be null)
   *  @param names   column names; the last entry is the response column name
   *  @param domains per-column enum domains (null entries for numeric cols);
   *                 the last entry is the response domain */
  public Model( Key selfKey, Key dataKey, String names[], String domains[][], float[] priorClassDist, float[] modelClassDist, long training_start_time, long training_duration_in_ms ) {
    super(selfKey);
    this.uniqueId = new UniqueId(_key);
    // NOTE(review): when domains==null it is allocated with names.length+1
    // entries, which contradicts the very next assert (names.length) under
    // -ea.  Left as-is; confirm which invariant is intended.
    if( domains == null ) domains=new String[names.length+1][];
    assert domains.length==names.length;
    assert names.length >= 1;
    assert names[names.length-1] != null; // Have a valid response-column name?
    _dataKey = dataKey;
    _names = names;
    _domains = domains;
    _priorClassDist = priorClassDist;
    _modelClassDist = modelClassDist;
    this.training_duration_in_ms = training_duration_in_ms;
    this.training_start_time = training_start_time;
  }

  // Currently only implemented by GLM2, DeepLearning, GBM and DRF:
  public Request2 get_params() { throw new UnsupportedOperationException("get_params() has not yet been implemented in class: " + this.getClass()); }

  // NOTE: this is a local copy of the Job; to get the real state you need to get it from the DKV.
  // Currently only implemented by GLM2, DeepLearning, GBM and DRF:
  public Request2 job() { throw new UnsupportedOperationException("job() has not yet been implemented in class: " + this.getClass()); }

  /** Broad category of problem this model solves. */
  public enum ModelCategory {
    Unknown,
    Binomial,
    Multinomial,
    Regression,
    Clustering;
  }

  // TODO: override in KMeansModel once that's rewritten on water.Model
  public ModelCategory getModelCategory() {
    return (isClassifier() ?
            (nclasses() > 2 ? ModelCategory.Multinomial : ModelCategory.Binomial) :
            ModelCategory.Regression);
  }

  /** Remove any Model internal Keys */
  @Override public Futures delete_impl(Futures fs) { return fs; /* None in the default Model */ }
  @Override public String errStr() { return "Model"; }

  /** Append a warning to the (immutable-array) warnings list on the local POJO.
   *  Note: does NOT push the change to the DKV. */
  public void addWarning(String warning) {
    if(this.warnings == null || this.warnings.length == 0)
      this.warnings = new String[]{warning};
    else {
      this.warnings = Arrays.copyOf(this.warnings,this.warnings.length+1);
      this.warnings[this.warnings.length-1] = warning;
    }
  }

  /** Overridden by unsupervised models (e.g. clustering). */
  public boolean isSupervised() { return true; }

  public UniqueId getUniqueId() { return this.uniqueId; }

  /** Record the training start time, both on this local copy and — via an
   *  atomic update — on the authoritative copy in the DKV. */
  public void start_training(long training_start_time) {
    Log.info("setting training_start_time to: " + training_start_time + " for Model: " + this._key.toString() + " (" + this.getClass().getSimpleName() + "@" + System.identityHashCode(this) + ")");
    final long t = training_start_time;
    new TAtomic<Model>() {
      @Override public Model atomic(Model m) {
        if (m != null) {
          m.training_start_time = t;
        }
        return m;
      }
    }.invoke(_key);
    this.training_start_time = training_start_time;
  }

  /** Restart training from a checkpoint: accumulate the previous model's
   *  training duration and stamp a fresh start time (locally and in the DKV). */
  public void start_training(Model previous) {
    training_start_time = System.currentTimeMillis();
    Log.info("setting training_start_time to: " + training_start_time + " for Model: " + this._key.toString() + " (" + this.getClass().getSimpleName() + "@" + System.identityHashCode(this) + ") [checkpoint case]");
    if (null != previous)
      training_duration_in_ms += previous.training_duration_in_ms;
    final long t = training_start_time;
    final long d = training_duration_in_ms;
    new TAtomic<Model>() {
      @Override public Model atomic(Model m) {
        if (m != null) {
          m.training_start_time = t;
          m.training_duration_in_ms = d;
        }
        return m;
      }
    }.invoke(_key);
  }

  /** Accumulate elapsed wall-clock time into training_duration_in_ms,
   *  locally and in the DKV copy. */
  public void stop_training() {
    training_duration_in_ms += (System.currentTimeMillis() - training_start_time);
    Log.info("setting training_duration_in_ms to: " + training_duration_in_ms + " for Model: " + this._key.toString() + " (" + this.getClass().getSimpleName() + "@" + System.identityHashCode(this) + ")");
    final long d = training_duration_in_ms;
    new TAtomic<Model>() {
      @Override public Model atomic(Model m) {
        if (m != null) {
          m.training_duration_in_ms = d;
        }
        return m;
      }
    }.invoke(_key);
  }

  /** Name of the response column (by convention the last entry of _names). */
  public String responseName() { return _names[ _names.length-1]; }
  /** Enum levels of the response column, or null for regression models. */
  public String[] classNames() { return _domains[_domains.length-1]; }
  /** A model is a classifier iff its response column has an enum domain. */
  public boolean isClassifier() { return classNames() != null ; }
  /** Number of response classes; 1 for regression. */
  public int nclasses() {
    String cns[] = classNames();
    return cns==null ? 1 : cns.length;
  }
  /** Returns number of input features */
  public int nfeatures() { return _names.length - 1; }

  /** For classifiers, confusion matrix on validation set. */
  public ConfusionMatrix cm() { return null; }
  /** Returns mse for validation set. */
  public double mse() { return Double.NaN; }
  /** Variable importance of individual input features measured by this model. */
  public VarImp varimp() { return null; }

  public boolean hasCrossValModels() { return _have_cv_results; }

  /** Bulk score for given <code>fr</code> frame.
   * The frame is always adapted to this model.
   *
   * @param fr frame to be scored
   * @return frame holding predicted values
   *
   * @see #score(Frame, boolean)
   */
  public Frame score(Frame fr) {
    return score(fr, true);
  }

  /** Bulk score the frame <code>fr</code>, producing a Frame result; the 1st Vec is the
   * predicted class, the remaining Vecs are the probability distributions.
   * For Regression (single-class) models, the 1st and only Vec is the
   * prediction value.
   *
   * @param fr frame which should be scored
   * @param adapt a flag enforcing an adaptation of <code>fr</code> to this model. If flag
   *        is <code>false</code> scoring code expect that <code>fr</code> is already adapted.
   * @return a new frame containing a predicted values. For classification it contains a column with
   *         prediction and distribution for all response classes. For regression it contains only
   *         one column with predicted values.
   */
  public final Frame score(Frame fr, boolean adapt) {
    // The response column (if present) is dropped before scoring.
    if (isSupervised()) {
      int ridx = fr.find(responseName());
      if (ridx != -1) { // drop the response for scoring!
        fr = new Frame(fr);
        fr.remove(ridx);
      }
    }
    // Adapt the Frame layout - returns adapted frame and frame containing only
    // newly created vectors
    Frame[] adaptFrms = adapt ? adapt(fr,false) : null;
    // Adapted frame containing all columns - mix of original vectors from fr
    // and newly created vectors serving as adaptors
    Frame adaptFrm = adapt ? adaptFrms[0] : fr;
    // Contains only newly created vectors. The frame eases deletion of these vectors.
    Frame onlyAdaptFrm = adapt ? adaptFrms[1] : null;
    // Invoke scoring
    Frame output = scoreImpl(adaptFrm);
    // Be nice to DKV and delete vectors which i created :-)
    if (adapt) onlyAdaptFrm.delete();
    return output;
  }

  /** Score an already adapted frame.
   *
   * @param adaptFrm frame already adapted to this model's columns/domains,
   *                 WITHOUT a response column
   * @return a new frame with "predict" + per-class probability columns
   *         (classifiers) or a single prediction column (regression)
   */
  protected Frame scoreImpl(Frame adaptFrm) {
    if (isSupervised()) {
      int ridx = adaptFrm.find(responseName());
      assert ridx == -1 : "Adapted frame should not contain response in scoring method!";
      assert nfeatures() == adaptFrm.numCols() : "Number of model features " + nfeatures() + " != number of test set columns: " + adaptFrm.numCols();
      assert adaptFrm.vecs().length == nfeatures() : "Scoring data set contains wrong number of columns: " + adaptFrm.vecs().length + " instead of " + nfeatures();
    }
    // Create a new vector for response
    // If the model produces a classification/enum, copy the domain into the
    // result vector.
    int nc = nclasses();
    Vec [] newVecs = new Vec[]{adaptFrm.anyVec().makeZero(classNames())};
    if(nc > 1)
      newVecs = Utils.join(newVecs,adaptFrm.anyVec().makeZeros(nc));
    String [] names = new String[newVecs.length];
    names[0] = "predict";
    for(int i = 1; i < names.length; ++i) names[i] = classNames()[i-1];
    final int num_features = nfeatures();
    // Input chunks are the feature columns; output chunks (prediction +
    // per-class probabilities) follow them at index num_features+c.
    new MRTask2() {
      @Override public void map( Chunk chks[] ) {
        double tmp [] = new double[num_features]; // We do not need the last field representing response
        float preds[] = new float [nclasses()==1?1:nclasses()+1];
        int len = chks[0]._len;
        for( int row=0; row<len; row++ ) {
          float p[] = score0(chks,row,tmp,preds);
          for( int c=0; c<preds.length; c++ )
            chks[num_features+c].set0(row,p[c]);
        }
      }
    }.doAll(Utils.join(adaptFrm.vecs(),newVecs));
    // Return just the output columns
    return new Frame(names,newVecs);
  }

  /** Single row scoring, on a compatible Frame. */
  public final float[] score( Frame fr, boolean exact, int row ) {
    double tmp[] = new double[fr.numCols()];
    for( int i=0; i<tmp.length; i++ )
      tmp[i] = fr.vecs()[i].at(row);
    return score(fr.names(),fr.domains(),exact,tmp);
  }

  /** Single row scoring, on a compatible set of data.  Fairly expensive to adapt. */
  public final float[] score( String names[], String domains[][], boolean exact, double row[] ) {
    return score(adapt(names,domains,exact),row,new float[nclasses()]);
  }

  /** Single row scoring, on a compatible set of data, given an adaption vector.
   *  NOTE(review): the implementation is commented out behind a FIXME and this
   *  currently always returns null — callers must not rely on it. */
  public final float[] score( int map[][][], double row[], float[] preds ) {
    /*FIXME final int[][] colMap = map[map.length-1]; // Response column mapping is the last array
    assert colMap.length == _names.length-1 : " "+Arrays.toString(colMap)+" "+Arrays.toString(_names);
    double tmp[] = new double[colMap.length]; // The adapted data
    for( int i=0; i<colMap.length; i++ ) {
      // Column mapping, or NaN for missing columns
      double d = colMap[i]==-1 ? Double.NaN : row[colMap[i]];
      if( map[i] != null ) { // Enum mapping
        int e = (int)d;
        if( e < 0 || e >= map[i].length ) d = Double.NaN; // User data is out of adapt range
        else {
          e = map[i][e];
          d = e==-1 ? Double.NaN : (double)e;
        }
      }
      tmp[i] = d;
    }
    return score0(tmp,preds);   // The results. */
    return null;
  }

  /** Build an adaption array.  The length is equal to the Model's vector length.
   *  Each inner 2D-array is a
   *  compressed domain map from data domains to model domains - or null for non-enum
   *  columns, or null for identity mappings.  The extra final int[] is the
   *  column mapping itself, mapping from model columns to data columns.  or -1
   *  if missing.
   *  If 'exact' is true, will throw if there are:
   *    any columns in the model but not in the input set;
   *    any enums in the data that the model does not understand
   *    any enums returned by the model that the data does not have a mapping for.
   *  If 'exact' is false, these situations will use or return NA's instead.
   */
  private int[][][] adapt( String names[], String domains[][], boolean exact) {
    int maplen = names.length;
    int map[][][] = new int[maplen][][];
    // Make sure all are compatible
    for( int c=0; c<names.length;++c) {
      // Now do domain mapping
      String ms[] = _domains[c];  // Model enum
      String ds[] = domains[c];   // Data enum
      if( ms == ds ) { // Domains trivially equal?
      } else if( ms == null ) {
        throw new IllegalArgumentException("Incompatible column: '" + _names[c] + "', expected (trained on) numeric, was passed a categorical");
      } else if( ds == null ) {
        if( exact )
          throw new IllegalArgumentException("Incompatible column: '" + _names[c] + "', expected (trained on) categorical, was passed a numeric");
        throw H2O.unimpl(); // Attempt an asEnum?
      } else if( !Arrays.deepEquals(ms, ds) ) {
        map[c] = getDomainMapping(_names[c], ms, ds, exact);
      } // null mapping is equal to identity mapping
    }
    return map;
  }

  /**
   * Type of missing columns during adaptation between train/test datasets
   * Overload this method for models that have sparse data handling.
   * Otherwise, NaN is used.
   * @return real-valued number (can be NaN)
   */
  protected double missingColumnsType() { return Double.NaN; }

  /** Build an adapted Frame from the given Frame. Useful for efficient bulk
   *  scoring of a new dataset to an existing model.  Same adaption as above,
   *  but expressed as a Frame instead of as an int[][].  The returned Frame
   *  does not have a response column.
   *  It returns a <b>two element array</b> containing an adapted frame and a
   *  frame which contains only vectors which where adapted (the purpose of the
   *  second frame is to delete all adapted vectors with deletion of the
   *  frame).
   */
  public Frame[] adapt( final Frame fr, boolean exact) {
    return adapt(fr, exact, true);
  }

  /** As {@link #adapt(Frame, boolean)}, with control over whether the input
   *  frame is expected to carry a response column.
   *  @param haveResponse if true (and supervised), a response column in fr is
   *                      moved to the last position before adaptation */
  public Frame[] adapt( final Frame fr, boolean exact, boolean haveResponse) {
    Frame vfr = new Frame(fr); // To avoid modification of original frame fr
    int n = _names.length;
    if (haveResponse && isSupervised()) {
      int ridx = vfr.find(_names[_names.length - 1]);
      if (ridx != -1 && ridx != vfr._names.length - 1) { // Unify frame - put response to the end
        String name = vfr._names[ridx];
        vfr.add(name, vfr.remove(ridx));
      }
      // If the frame lacks a response, adapt only the feature columns.
      n = ridx == -1 ? _names.length - 1 : _names.length;
    }
    String [] names = isSupervised() ? Arrays.copyOf(_names, n) : _names.clone();
    Frame [] subVfr;
    // replace missing columns with NaNs (or 0s for DeepLearning with sparse data)
    subVfr = vfr.subframe(names, missingColumnsType());
    vfr = subVfr[0]; // extract only subframe but keep the rest for delete later
    Vec[] frvecs = vfr.vecs();
    boolean[] toEnum = new boolean[frvecs.length];
    if(!exact) for(int i = 0; i < n;++i)
      if(_domains[i] != null && !frvecs[i].isEnum()) {// if model expects domain but input frame does not have domain => switch vector to enum
        frvecs[i] = frvecs[i].toEnum();
        toEnum[i] = true;
      }
    int[][][] map = adapt(names,vfr.domains(),exact);
    assert map.length == names.length; // Be sure that adapt call above do not skip any column
    ArrayList<Vec> avecs = new ArrayList<Vec>(); // adapted vectors
    ArrayList<String> anames = new ArrayList<String>(); // names for adapted vector
    for( int c=0; c<map.length; c++ ) // Iterate over columns
      if(map[c] != null) { // Column needs adaptation
        Vec adaptedVec;
        if (toEnum[c]) { // Vector was flipped to column already, compose transformation
          adaptedVec = TransfVec.compose( (TransfVec) frvecs[c], map[c], vfr.domains()[c], false);
        } else adaptedVec = frvecs[c].makeTransf(map[c], vfr.domains()[c]);
        avecs.add(frvecs[c] = adaptedVec);
        anames.add(names[c]); // Collect right names
      } else if (toEnum[c]) { // Vector was transformed to enum domain, but does not need adaptation we need to record it
        avecs.add(frvecs[c]);
        anames.add(names[c]);
      }
    // Fill trash bin by vectors which need to be deleted later by the caller.
    Frame vecTrash = new Frame(anames.toArray(new String[anames.size()]), avecs.toArray(new Vec[avecs.size()]));
    if (subVfr[1]!=null) vecTrash.add(subVfr[1], true);
    return new Frame[] { new Frame(names,frvecs), vecTrash };
  }

  /** Returns a mapping between values of model domains (<code>modelDom</code>) and given column domain.
   * @see #getDomainMapping(String, String[], String[], boolean)
   */
  public static int[][] getDomainMapping(String[] modelDom, String[] colDom, boolean exact) {
    return getDomainMapping(null, modelDom, colDom, exact);
  }

  /**
   * Returns a mapping for given column according to given <code>modelDom</code>.
   *
   * @param colName name of column which is mapped, can be null.
   * @param modelDom the model's enum domain for the column
   * @param colDom the data's enum domain for the column
   * @param logNonExactMapping if true, log warnings for factors present on
   *        only one side of the mapping instead of failing
   * @return packed parallel arrays {values, mappings}, sorted by value
   *         (see Utils.pack / Utils.sortWith)
   */
  public static int[][] getDomainMapping(String colName, String[] modelDom, String[] colDom, boolean logNonExactMapping) {
    int emap[] = new int[modelDom.length];
    boolean bmap[] = new boolean[modelDom.length];
    HashMap<String,Integer> md = new HashMap<String, Integer>((int) ((colDom.length/0.75f)+1));
    for( int i = 0; i < colDom.length; i++) md.put(colDom[i], i);
    for( int i = 0; i < modelDom.length; i++) {
      Integer I = md.get(modelDom[i]);
      if (I == null && logNonExactMapping)
        Log.warn(Sys.SCORM, "Domain mapping: target domain contains the factor '"+modelDom[i]+"' which DOES NOT appear in input domain " + (colName!=null?"(column: " + colName+")":""));
      if (I!=null) {
        emap[i] = I;
        bmap[i] = true;
      }
    }
    if (logNonExactMapping) { // Inform about additional values in column domain which do not appear in model domain
      for (int i=0; i<colDom.length; i++) {
        boolean found = false;
        for (int j=0; j<emap.length; j++)
          if (emap[j]==i) { found=true; break; }
        if (!found)
          Log.warn(Sys.SCORM, "Domain mapping: target domain DOES NOT contain the factor '"+colDom[i]+"' which appears in input domain "+ (colName!=null?"(column: " + colName+")":""));
      }
    }
    // produce packed values
    int[][] res = Utils.pack(emap, bmap);
    // Sort values in numeric order to support binary search in TransfVec
    Utils.sortWith(res[0], res[1]);
    return res;
  }

  /** Bulk scoring API for one row.  Chunks are all compatible with the model,
   *  and expect the last Chunks are for the final distribution and prediction.
   *  Default method is to just load the data into the tmp array, then call
   *  subclass scoring logic. */
  protected float[] score0( Chunk chks[], int row_in_chunk, double[] tmp, float[] preds ) {
    assert chks.length>=_names.length; // Last chunk is for the response
    for( int i=0; i<nfeatures(); i++ ) // Do not include last value since it can contains a response
      tmp[i] = chks[i].at0(row_in_chunk);
    float[] scored = score0(tmp,preds);
    // Correct probabilities obtained from training on oversampled data back to original distribution
    // C.f. http://gking.harvard.edu/files/0s.pdf Eq.(27)
    if (isClassifier() && _priorClassDist != null && _modelClassDist != null) {
      assert(scored.length == nclasses()+1); //1 label + nclasses probs
      ModelUtils.correctProbabilities(scored, _priorClassDist, _modelClassDist);
      //set label based on corrected probabilities (max value wins, with deterministic tie-breaking)
      scored[0] = ModelUtils.getPrediction(scored, tmp);
    }
    return scored;
  }

  /**
   * Compute the model error for a given test data set
   * For multi-class classification, this is the classification error based on assigning labels for the highest predicted per-class probability.
   * For binary classification, this is the classification error based on assigning labels using the optimal threshold for maximizing the F1 score.
   * For regression, this is the mean squared error (MSE).
   * @param ftest Frame containing test data
   * @param vactual The response column Vec
   * @param fpreds Frame containing ADAPTED (domain labels from train+test data) predicted data (classification: label + per-class probabilities, regression: target)
   * @param hitratio_fpreds Frame containing predicted data (domain labels from test data) (classification: label + per-class probabilities, regression: target)
   * @param label Name for the scored data set to be printed
   * @param printMe Whether to print the scoring results to Log.info
   * @param max_conf_mat_size Largest size of Confusion Matrix (#classes) for it to be printed to Log.info
   * @param cm Confusion Matrix object to populate for multi-class classification (also used for regression)
   * @param auc AUC object to populate for binary classification
   * @param hr HitRatio object to populate for classification
   * @return model error, see description above
   */
  public double calcError(final Frame ftest, final Vec vactual, final Frame fpreds, final Frame hitratio_fpreds, final String label, final boolean printMe, final int max_conf_mat_size, final water.api.ConfusionMatrix cm, final AUC auc, final HitRatio hr) {
    StringBuilder sb = new StringBuilder();
    double error = Double.POSITIVE_INFINITY;
    // populate AUC
    if (auc != null) {
      assert(isClassifier());
      assert(nclasses() == 2);
      auc.actual = ftest;
      auc.vactual = vactual;
      auc.predict = fpreds;
      auc.vpredict = fpreds.vecs()[2]; //binary classifier (label, prob0, prob1 (THIS ONE), adaptedlabel)
      auc.invoke();
      auc.toASCII(sb);
      error = auc.data().err(); //using optimal threshold for F1
    }
    // populate CM
    if (cm != null) {
      cm.actual = ftest;
      cm.vactual = vactual;
      cm.predict = fpreds;
      cm.vpredict = fpreds.vecs()[0]; // prediction (either label or regression target)
      cm.invoke();
      if (isClassifier()) {
        if (auc != null) {
          AUCData aucd = auc.data();
          //override the CM with the one computed by AUC (using optimal threshold)
          //Note: must still call invoke above to set the domains etc.
          cm.cm = new long[3][3]; // 1 extra layer for NaNs (not populated here, since AUC skips them)
          cm.cm[0][0] = aucd.cm()[0][0];
          cm.cm[1][0] = aucd.cm()[1][0];
          cm.cm[0][1] = aucd.cm()[0][1];
          cm.cm[1][1] = aucd.cm()[1][1];
          double cm_err = new hex.ConfusionMatrix(cm.cm).err();
          double auc_err = aucd.err();
          if (! (Double.isNaN(cm_err) && Double.isNaN(auc_err))) // NOTE: NaN != NaN
            assert(cm_err == auc_err); //check consistency with AUC-computed error
        } else {
          error = new hex.ConfusionMatrix(cm.cm).err(); //only set error if AUC didn't already set the error
        }
        if (cm.cm.length <= max_conf_mat_size+1) cm.toASCII(sb);
      } else {
        assert(auc == null);
        error = cm.mse;
        cm.toASCII(sb);
      }
    }
    // populate HitRatio
    if (hr != null) {
      assert(isClassifier());
      hr.actual = ftest;
      hr.vactual = vactual;
      hr.predict = hitratio_fpreds;
      hr.invoke();
      hr.toASCII(sb);
    }
    if (printMe && sb.length() > 0) {
      Log.info("Scoring on " + label + " data:");
      for (String s : sb.toString().split("\n")) Log.info(s);
    }
    return error;
  }

  /** Subclasses implement the scoring logic.  The data is pre-loaded into a
   *  re-used temp array, in the order the model expects.  The predictions are
   *  loaded into the re-used temp array, which is also returned. */
  protected abstract float[] score0(double data[/*ncols*/], float preds[/*nclasses+1*/]);

  // Version where the user has just ponied-up an array of data to be scored.
  // Data must be in proper order.  Handy for JUnit tests.
  public double score(double [] data){ return Utils.maxIndex(score0(data,new float[nclasses()])); }

  /** Debug flag to generate benchmark code */
  protected static final boolean GEN_BENCHMARK_CODE = false;

  /** Return a String which is a valid Java program representing a class that
   *  implements the Model.  The Java is of the form:
   *  <pre>
   *    class UUIDxxxxModel {
   *      public static final String NAMES[] = { ....column names... }
   *      public static final String DOMAINS[][] = { ....domain names... }
   *      // Pass in data in a double[], pre-aligned to the Model's requirements.
   *      // Jam predictions into the preds[] array; preds[0] is reserved for the
   *      // main prediction (class for classifiers or value for regression),
   *      // and remaining columns hold a probability distribution for classifiers.
   *      float[] predict( double data[], float preds[] );
   *      double[] map( HashMap &lt; String,Double &gt; row, double data[] );
   *      // Does the mapping lookup for every row, no allocation
   *      float[] predict( HashMap &lt; String,Double &gt; row, double data[], float preds[] );
   *      // Allocates a double[] for every row
   *      float[] predict( HashMap &lt; String,Double &gt; row, float preds[] );
   *      // Allocates a double[] and a float[] for every row
   *      float[] predict( HashMap &lt; String,Double &gt; row );
   *    }
   *  </pre>
   */
  public String toJava() { return toJava(new SB()).toString(); }

  /** Emit the standalone-scoring Java source into the given string buffer. */
  public SB toJava( SB sb ) {
    SB fileContextSB = new SB(); // preserve file context
    String modelName = JCodeGen.toJavaId(_key.toString());
    // HEADER
    sb.p("import java.util.Map;").nl();
    sb.p("import water.genmodel.GenUtils.*;").nl().nl();
    sb.p("// AUTOGENERATED BY H2O at ").p(new Date().toString()).nl();
    sb.p("// ").p(H2O.getBuildVersion().toString()).nl();
    sb.p("//").nl();
    sb.p("// Standalone prediction code with sample test data for ").p(this.getClass().getSimpleName()).p(" named ").p(modelName).nl();
    sb.p("//").nl();
    sb.p("// How to download, compile and execute:").nl();
    sb.p("// mkdir tmpdir").nl();
    sb.p("// cd tmpdir").nl();
    sb.p("// curl http:/").p(H2O.SELF.toString()).p("/h2o-model.jar > h2o-model.jar").nl();
    sb.p("// curl http:/").p(H2O.SELF.toString()).p("/2/").p(this.getClass().getSimpleName()).p("View.java?_modelKey=").pobj(_key).p(" > ").p(modelName).p(".java").nl();
    sb.p("// javac -cp h2o-model.jar -J-Xmx2g -J-XX:MaxPermSize=128m ").p(modelName).p(".java").nl();
    if (GEN_BENCHMARK_CODE)
      sb.p("// java -cp h2o-model.jar:. -Xmx2g -XX:MaxPermSize=256m -XX:ReservedCodeCacheSize=256m ").p(modelName).nl();
    sb.p("//").nl();
    sb.p("// (Note: Try java argument -XX:+PrintCompilation to show runtime JIT compiler behavior.)").nl();
    sb.nl();
    sb.p("public class ").p(modelName).p(" extends water.genmodel.GeneratedModel {").nl(); // or extends GenerateModel
    toJavaInit(sb, fileContextSB).nl();
    toJavaNAMES(sb, fileContextSB);
    toJavaNCLASSES(sb);
    toJavaDOMAINS(sb, fileContextSB);
    toJavaPROB(sb);
    toJavaSuper(sb); //
    toJavaPredict(sb, fileContextSB);
    sb.p("}").nl();
    sb.p(fileContextSB).nl(); // Append file
    return sb;
  }

  /** Generate implementation for super class. */
  protected SB toJavaSuper( SB sb ) {
    sb.nl();
    sb.ii(1);
    sb.i().p("public String[] getNames() { return NAMES; } ").nl();
    sb.i().p("public String[][] getDomainValues() { return DOMAINS; }").nl();
    String uuid = this.uniqueId != null ? this.uniqueId.getId() : this._key.toString();
    sb.i().p("public String getUUID() { return ").ps(uuid).p("; }").nl();
    return sb;
  }

  private SB toJavaNAMES(SB sb, SB fileContextSB) {
    // NOTE(review): the generated line hard-codes "NamesHolder" instead of
    // using the namesHolderClassName variable; they currently agree, but keep
    // them in sync if the class name ever changes.
    String namesHolderClassName = "NamesHolder";
    sb.i().p("// ").p("Names of columns used by model.").nl();
    sb.i().p("public static final String[] NAMES = NamesHolder.VALUES;").nl();
    // Generate class which fills the names into array
    fileContextSB.i().p("// The class representing training column names ").nl();
    JCodeGen.toClassWithArray(fileContextSB, null, namesHolderClassName, _names);
    return sb;
  }

  protected SB toJavaNCLASSES( SB sb ) {
    return isClassifier() ? JCodeGen.toStaticVar(sb, "NCLASSES", nclasses(), "Number of output classes included in training data response column.") : sb;
  }

  private SB toJavaDOMAINS( SB sb, SB fileContextSB ) {
    sb.nl();
    sb.ii(1);
    sb.i().p("// Column domains. The last array contains domain of response column.").nl();
    sb.i().p("public static final String[][] DOMAINS = new String[][] {").nl();
    for (int i=0; i<_domains.length; i++) {
      String[] dom = _domains[i];
      String colInfoClazz = "ColInfo_"+i;
      sb.i(1).p("/* ").p(_names[i]).p(" */ ");
      if (dom != null) sb.p(colInfoClazz).p(".VALUES");
      else sb.p("null");
      if (i!=_domains.length-1) sb.p(',');
      sb.nl();
      // Emit the per-column domain holder class into the file context.
      if (dom != null) {
        fileContextSB.i().p("// The class representing column ").p(_names[i]).nl();
        JCodeGen.toClassWithArray(fileContextSB, null, colInfoClazz, dom);
      }
    }
    return sb.i().p("};").nl();
  }

  private SB toJavaPROB( SB sb) {
    sb.di(1);
    toStaticVar(sb, "PRIOR_CLASS_DISTRIB", _priorClassDist, "Prior class distribution");
    toStaticVar(sb, "MODEL_CLASS_DISTRIB", _modelClassDist, "Class distribution used for model building");
    return sb;
  }

  // Override in subclasses to provide some top-level model-specific goodness
  protected SB toJavaInit(SB sb, SB fileContextSB) { return sb; }
  protected void toJavaInit(CtClass ct) { }

  // Override in subclasses to provide some inside 'predict' call goodness
  // Method returns code which should be appended into generated top level class after
  // predict method.
  protected void toJavaPredictBody(SB bodySb, SB classCtxSb, SB fileCtxSb) {
    throw new IllegalArgumentException("This model type does not support conversion to Java");
  }

  // Wrapper around the main predict call, including the signature and return value
  private SB toJavaPredict(SB ccsb, SB fileCtxSb) { // ccsb = classContext
    ccsb.nl();
    ccsb.p(" // Pass in data in a double[], pre-aligned to the Model's requirements.").nl();
    ccsb.p(" // Jam predictions into the preds[] array; preds[0] is reserved for the").nl();
    ccsb.p(" // main prediction (class for classifiers or value for regression),").nl();
    ccsb.p(" // and remaining columns hold a probability distribution for classifiers.").nl();
    ccsb.p(" public final float[] predict( double[] data, float[] preds) { preds = predict( data, preds, "+toJavaDefaultMaxIters()+"); return preds; }").nl();
    // ccsb.p(" public final float[] predict( double[] data, float[] preds) { return predict( data, preds, "+toJavaDefaultMaxIters()+"); }").nl();
    ccsb.p(" public final float[] predict( double[] data, float[] preds, int maxIters ) {").nl();
    SB classCtxSb = new SB();
    toJavaPredictBody(ccsb.ii(1), classCtxSb, fileCtxSb);
    ccsb.di(1);
    ccsb.p(" return preds;").nl();
    ccsb.p(" }").nl();
    ccsb.p(classCtxSb);
    return ccsb;
  }

  protected String toJavaDefaultMaxIters() { return "-1"; }

  /** Generates code which unify preds[1,...NCLASSES] */
  protected void toJavaUnifyPreds(SB bodySb) {
  }

  /** Fill preds[0] based on already filled and unified preds[1,..NCLASSES]. */
  protected void toJavaFillPreds0(SB bodySb) {
    // Pick max index as a prediction
    if (isClassifier()) {
      if (_priorClassDist!=null && _modelClassDist!=null) {
        bodySb.i().p("water.util.ModelUtils.correctProbabilities(preds, PRIOR_CLASS_DISTRIB, MODEL_CLASS_DISTRIB);").nl();
      }
      bodySb.i().p("preds[0] = water.util.ModelUtils.getPrediction(preds,data);").nl();
    } else {
      bodySb.i().p("preds[0] = preds[1];").nl();
    }
  }

  /**
   * Compute the cross validation error from an array of predictions for N folds.
   * Also stores the results in the model for display/query.
   * @param source Full training data
   * @param response Full response
   * @param cv_preds N Frames containing predictions made by N-fold CV runs on disjoint contiguous holdout pieces of the training data
   * @param offsets Starting row numbers for the N CV pieces (length = N+1, first element: 0, last element: #rows)
   */
  public final void scoreCrossValidation(Job.ValidatedJob job, Frame source, Vec response, Frame[] cv_preds, long[] offsets) {
    assert(offsets[0] == 0);
    assert(offsets[offsets.length-1] == source.numRows());
    //Hack to make a frame with the correct dimensions and vector group
    Frame cv_pred = score(source);
    // Stitch together the content of cv_pred from cv_preds
    for (int i=0; i<cv_preds.length; ++i) {
      // stitch probabilities (or regression values)
      for (int c=(isClassifier() ? 1 : 0); c<cv_preds[i].numCols(); ++c) {
        Vec.Writer vw = cv_pred.vec(c).open();
        try {
          for (long r=0; r < cv_preds[i].numRows(); ++r) {
            vw.set(offsets[i] + r, cv_preds[i].vec(c).at(r));
          }
        } finally {
          vw.close();
        }
      }
      if (isClassifier()) {
        // make labels
        float[] probs = new float[cv_preds[i].numCols()];
        Vec.Writer vw = cv_pred.vec(0).open();
        try {
          for (long r = 0; r < cv_preds[i].numRows(); ++r) {
            //probs[0] stays 0, is not used in getPrediction
            for (int c = 1; c < cv_preds[i].numCols(); ++c) {
              probs[c] = (float) cv_preds[i].vec(c).at(r);
            }
            // Row index seeds the deterministic tie-breaking in getPrediction.
            final int label = ModelUtils.getPrediction(probs, (int)r);
            vw.set(offsets[i] + r, label);
          }
        } finally {
          vw.close();
        }
      }
    }
    // Now score the model on the N folds
    try {
      AUC auc = nclasses() == 2 ? new AUC() : null;
      water.api.ConfusionMatrix cm = new water.api.ConfusionMatrix();
      HitRatio hr = isClassifier() ? new HitRatio() : null;
      double cv_error = calcError(source, response, cv_pred, cv_pred, "cross-validated", true, 10, cm, auc, hr);
      setCrossValidationError(job, cv_error, cm, auc == null ? null : auc.data(), hr);
    } finally {
      // cleanup temporary frame with predictions
      cv_pred.delete();
    }
  }

  /** Store CV results on the model; must be overridden by subclasses that
   *  support cross-validation. */
  protected void setCrossValidationError(Job.ValidatedJob job, double cv_error, water.api.ConfusionMatrix cm, AUCData auc, HitRatio hr) { throw H2O.unimpl(); }

  /** Append an HTML table of this model's cross-validation sub-models. */
  protected void printCrossValidationModelsHTML(StringBuilder sb) {
    if (job() == null) return;
    Job.ValidatedJob job = (Job.ValidatedJob)job();
    if (job.xval_models != null && job.xval_models.length > 0) {
      sb.append("<h4>Cross Validation Models</h4>");
      sb.append("<table class='table table-bordered table-condensed'>");
      sb.append("<tr><th>Model</th></tr>");
      for (Key k : job.xval_models) {
        Model m = UKV.get(k);
        Job j = m != null ? (Job)m.job() : null;
        sb.append("<tr>");
        sb.append("<td>" + (m != null ? Inspector.link(k.toString(), k.toString()) : "Pending") + (j != null ? ", Progress: " + Utils.formatPct(j.progress()) : "") + "</td>");
        sb.append("</tr>");
      }
      sb.append("</table>");
    }
  }

  /** Helper type for serialization */
  protected static class ModelAutobufferSerializer extends AutoBufferSerializer<Model> { }

  /** Returns a model serializer into AutoBuffer. */
  public AutoBufferSerializer<Model> getModelSerializer() {
    return new ModelAutobufferSerializer();
  }
}