package water;
import static water.util.JCodeGen.toStaticVar;
import hex.ConfusionMatrix;
import hex.VarImp;
import java.util.*;
import javassist.*;
import water.api.*;
import water.api.Request.API;
import water.fvec.*;
import water.serial.AutoBufferSerializer;
import water.util.*;
import water.util.Log.Tag.Sys;
/**
* A Model models reality (hopefully).
* A model can be used to 'score' a row, or a collection of rows on any
* compatible dataset - meaning the row has all the columns with the same names
* as used to build the model.
*/
public abstract class Model extends Lockable<Model> {
// Marker constant for the API weaver.
// NOTE(review): presumably consumed reflectively by the auto-generation step -- confirm.
static final int API_WEAVER = 1; // This file has auto-gen'd doc & json fields
// Field documentation table; never assigned in this file, filled in by generated code.
static public DocGen.FieldDoc[] DOC_FIELDS; // Initialized from Auto-Gen code.
/** Dataset key used to *build* the model, for models for which this makes
* sense, or null otherwise. Not all models are built from a dataset (e.g.
* artificial models), or are built from a single dataset (various ensemble
* models), so this key has no *mathematical* significance in the model but
* is handy during common model-building and for the historical record. */
@API(help="Datakey used to *build* the model")
public final Key _dataKey;
/** Names of the columns used by the model; they are matched against the
* columns of scoring data. The last name is the response column name. */
@API(help="Column names used to build the model")
public final String _names[];
/** Categorical/factor/enum mappings, per column. Null for non-enum cols.
* The array is parallel to {@link #_names}; the last entry holds the
* response column enums (null for a non-categorical response). */
@API(help="Categorical domains (factor levels) per column, null for non-enum columns")
public final String _domains[][];
/** Class distribution of the original data, as passed to the constructor; may be null.
* Together with {@link #_modelClassDist} it is used during scoring to correct
* predicted probabilities back to the original distribution. */
@API(help = "Relative class distribution factors in original data")
public final float[] _priorClassDist;
/** Class distribution the model was actually built on; may be null, in which
* case no probability correction is applied during scoring. */
@API(help = "Relative class distribution factors used for model building")
protected float[] _modelClassDist;
// WARNING: this only mutates the local POJO. The change is NOT written back
// to the DKV, so be really careful when calling it.
/** Stores a private copy of the given class distribution as the distribution
* used for model building.
* @param classdist class distribution to copy; must not be null */
public void setModelClassDistribution(float[] classdist) {
  _modelClassDist = Arrays.copyOf(classdist, classdist.length);
}
/** Identifier created from this model's key in the constructor; exposed via getUniqueId(). */
private final UniqueId uniqueId;
/** The start time in mS since the epoch for model training. */
public long training_start_time = 0L;
/** The duration in mS for model training. */
public long training_duration_in_ms = 0L;
/** Any warnings thrown during model building; appended to by addWarning(). */
@API(help="warnings")
public String[] warnings = new String[0];
/** Whether or not this model has cross-validated results stored. */
protected boolean _have_cv_results;
/** Builds a model description from a frame: only the frame's column names and
* domains are retained, so that columns of future datasets can be matched up.
* No model class distribution is set and both training timestamps start at 0.
*/
public Model( Key selfKey, Key dataKey, Frame fr, float[] priorClassDist ) {
  this(selfKey,
       dataKey,
       fr.names(),
       fr.domains(),
       priorClassDist,
       null,
       0,
       0);
}
/** Constructor taking explicit names, domains and class distributions; both
* training timestamps start at 0. */
public Model( Key selfKey, Key dataKey, String names[], String domains[][], float[] priorClassDist, float[] modelClassDist) {
  this(selfKey,
       dataKey,
       names,
       domains,
       priorClassDist,
       modelClassDist,
       0,
       0);
}
/** Full constructor.
*
* @param selfKey key of this model
* @param dataKey key of the dataset used to build the model, may be null
* @param names column names; the last one is the response column name and must not be null
* @param domains per-column domains, parallel to <code>names</code>; when null,
*                an all-null domain array of the same length as <code>names</code> is used
* @param priorClassDist class distribution in the original data, may be null
* @param modelClassDist class distribution used for model building, may be null
* @param training_start_time training start in mS since the epoch
* @param training_duration_in_ms training duration in mS
*/
public Model( Key selfKey, Key dataKey, String names[], String domains[][], float[] priorClassDist, float[] modelClassDist, long training_start_time, long training_duration_in_ms ) {
  super(selfKey);
  this.uniqueId = new UniqueId(_key);
  // The domains array is parallel to names (one entry per column, response
  // last), so the default must have exactly names.length entries. A longer
  // array would violate the assert below and break every consumer that
  // indexes _names by a _domains index.
  if( domains == null ) domains = new String[names.length][];
  assert domains.length==names.length;
  assert names.length >= 1;
  assert names[names.length-1] != null; // Have a valid response-column name?
  _dataKey = dataKey;
  _names = names;
  _domains = domains;
  _priorClassDist = priorClassDist;
  _modelClassDist = modelClassDist;
  this.training_duration_in_ms = training_duration_in_ms;
  this.training_start_time = training_start_time;
}
/** Parameters the model was built with. The base implementation always throws;
* currently only overridden by GLM2, DeepLearning, GBM and DRF. */
public Request2 get_params() {
  final String message = "get_params() has not yet been implemented in class: " + this.getClass();
  throw new UnsupportedOperationException(message);
}
/** Job that built this model. NOTE: this is a local copy of the Job; to get
* the real state you need to get it from the DKV. The base implementation
* always throws; currently only overridden by GLM2, DeepLearning, GBM and DRF. */
public Request2 job() {
  final String message = "job() has not yet been implemented in class: " + this.getClass();
  throw new UnsupportedOperationException(message);
}
/** Coarse kind of problem a model solves. */
public enum ModelCategory {
  Unknown, Binomial, Multinomial, Regression, Clustering
}
// TODO: override in KMeansModel once that's rewritten on water.Model
/** Derives the category from the response: non-classifiers are Regression,
* classifiers with more than two classes are Multinomial, otherwise Binomial. */
public ModelCategory getModelCategory() {
  if (!isClassifier()) {
    return ModelCategory.Regression;
  }
  if (nclasses() > 2) {
    return ModelCategory.Multinomial;
  }
  return ModelCategory.Binomial;
}
/** Removes any Model-internal Keys. The default Model owns none, so the
* passed Futures is handed back unchanged. */
@Override
public Futures delete_impl(Futures fs) {
  // None in the default Model
  return fs;
}
/** Short label for this kind of object; always "Model". */
@Override
public String errStr() {
  return "Model";
}
/** Appends a warning to {@link #warnings}, replacing the array with a copy
* that is one element longer (a null or empty array becomes a single-element one). */
public void addWarning(String warning) {
  final int existing = (this.warnings == null) ? 0 : this.warnings.length;
  final String[] grown = new String[existing + 1];
  if (existing > 0) {
    System.arraycopy(this.warnings, 0, grown, 0, existing);
  }
  grown[existing] = warning;
  this.warnings = grown;
}
/** Whether the model has a response column; the default is true. */
public boolean isSupervised() {
  return true;
}
/** Returns the identifier created for this model at construction time. */
public UniqueId getUniqueId() {
  return uniqueId;
}
/** Records the given training start time, both on the model found under this
* model's key and on this local instance.
* @param training_start_time training start in mS since the epoch */
public void start_training(long training_start_time) {
Log.info("setting training_start_time to: " + training_start_time + " for Model: " + this._key.toString() + " (" + this.getClass().getSimpleName() + "@" + System.identityHashCode(this) + ")");
final long t = training_start_time;
// Update the model stored under _key; nothing is changed when no model is found there.
// NOTE(review): TAtomic presumably applies this atomically to the DKV copy -- confirm.
new TAtomic<Model>() {
@Override public Model atomic(Model m) {
if (m != null) {
m.training_start_time = t;
} return m;
}
}.invoke(_key);
// The local instance is updated separately, after the stored copy.
this.training_start_time = training_start_time;
}
/** Starts training now (checkpoint case): sets the start time to the current
* time and, when a previous model is given, adds its training duration to this
* model's accumulated duration. Both values are then written to the model found
* under this model's key.
* @param previous model training is continued from, or null */
public void start_training(Model previous) {
training_start_time = System.currentTimeMillis();
Log.info("setting training_start_time to: " + training_start_time + " for Model: " + this._key.toString() + " (" + this.getClass().getSimpleName() + "@" + System.identityHashCode(this) + ") [checkpoint case]");
if (null != previous)
training_duration_in_ms += previous.training_duration_in_ms;
// Copy into finals so the anonymous task below can capture them.
final long t = training_start_time;
final long d = training_duration_in_ms;
// Update the model stored under _key; nothing is changed when no model is found there.
new TAtomic<Model>() {
@Override public Model atomic(Model m) {
if (m != null) {
m.training_start_time = t;
m.training_duration_in_ms = d;
} return m;
}
}.invoke(_key);
}
/** Stops the training clock: adds the time elapsed since
* {@link #training_start_time} to the accumulated duration and writes the new
* duration to the model found under this model's key. */
public void stop_training() {
training_duration_in_ms += (System.currentTimeMillis() - training_start_time);
Log.info("setting training_duration_in_ms to: " + training_duration_in_ms + " for Model: " + this._key.toString() + " (" + this.getClass().getSimpleName() + "@" + System.identityHashCode(this) + ")");
final long d = training_duration_in_ms;
// Update the model stored under _key; nothing is changed when no model is found there.
new TAtomic<Model>() {
@Override public Model atomic(Model m) {
if (m != null) {
m.training_duration_in_ms = d;
} return m;
}
}.invoke(_key);
}
/** Name of the response column, i.e. the last entry of {@link #_names}. */
public String responseName() {
  final int last = _names.length - 1;
  return _names[last];
}
/** Domain of the response column, i.e. the last entry of {@link #_domains};
* null when the response has no domain. */
public String[] classNames() {
  final int last = _domains.length - 1;
  return _domains[last];
}
/** A model is a classifier exactly when its response column has a domain. */
public boolean isClassifier() {
  return null != classNames();
}
/** Number of response classes: the size of the response domain, or 1 when
* the response has no domain. */
public int nclasses() {
  final String[] labels = classNames();
  if (labels == null) {
    return 1;
  }
  return labels.length;
}
/** Number of input features: every named column except the last one (the response). */
public int nfeatures() {
  final int columns = _names.length;
  return columns - 1;
}
/** For classifiers, the confusion matrix on the validation set. The default has none and returns null. */
public ConfusionMatrix cm() {
  return null;
}
/** Mean squared error on the validation set. The default has none and returns NaN. */
public double mse() {
  return Double.NaN;
}
/** Variable importance of the individual input features as measured by this
* model. The default has none and returns null. */
public VarImp varimp() {
  return null;
}
/** Whether cross-validated results are stored with this model. */
public boolean hasCrossValModels() {
  return _have_cv_results;
}
/** Bulk scores the frame <code>fr</code>, always adapting it to this model first.
*
* @param fr frame to be scored
* @return frame holding predicted values
*
* @see #score(Frame, boolean)
*/
public Frame score(Frame fr) {
  final boolean adaptFirst = true;
  return score(fr, adaptFirst);
}
/** Bulk score the frame <code>fr</code>, producing a Frame result; the 1st Vec is the
* predicted class, the remaining Vecs are the probability distributions.
* For Regression (single-class) models, the 1st and only Vec is the
* prediction value.
*
* For supervised models a response column present in <code>fr</code> is
* dropped (on a shallow copy of the frame) before scoring.
*
* @param fr frame which should be scored
* @param adapt a flag enforcing an adaptation of <code>fr</code> to this model. If the flag
*              is <code>false</code> the scoring code expects that <code>fr</code> is already adapted.
* @return a new frame containing predicted values. For classification it contains a column with
*         the prediction and the distribution over all response classes. For regression it contains only
*         one column with predicted values.
*/
public final Frame score(Frame fr, boolean adapt) {
  if (isSupervised()) {
    int ridx = fr.find(responseName());
    if (ridx != -1) { // drop the response for scoring!
      fr = new Frame(fr); // copy, so the caller's frame is not modified
      fr.remove(ridx);
    }
  }
  // Caller guarantees the layout already matches the model: score directly.
  if (!adapt) return scoreImpl(fr);
  // Adapt the Frame layout - returns the adapted frame and a frame containing
  // only the newly created vectors.
  Frame[] adaptFrms = adapt(fr, false);
  // Adapted frame containing all columns - a mix of original vectors from fr
  // and newly created vectors serving as adaptors.
  Frame adaptFrm = adaptFrms[0];
  // Contains only newly created vectors. The frame eases deletion of these vectors.
  Frame onlyAdaptFrm = adaptFrms[1];
  try {
    return scoreImpl(adaptFrm);
  } finally {
    // Be nice to DKV and delete the vectors created here - also when scoring
    // fails, otherwise the temporary adapter vectors would be leaked.
    onlyAdaptFrm.delete();
  }
}
/** Score already adapted frame.
*
* Creates the output vectors, runs a distributed task over the input and
* output vectors that calls score0 for every row, and returns a frame holding
* only the output vectors.
*
* @param adaptFrm frame already laid out as the model expects; for supervised
*                 models it must not contain the response column
* @return a new frame whose first column is named "predict", followed by one
*         column per response class when the model has more than one class
*/
protected Frame scoreImpl(Frame adaptFrm) {
if (isSupervised()) {
int ridx = adaptFrm.find(responseName());
assert ridx == -1 : "Adapted frame should not contain response in scoring method!";
assert nfeatures() == adaptFrm.numCols() : "Number of model features " + nfeatures() + " != number of test set columns: " + adaptFrm.numCols();
assert adaptFrm.vecs().length == nfeatures() : "Scoring data set contains wrong number of columns: " + adaptFrm.vecs().length + " instead of " + nfeatures();
}
// Create a new vector for response
// If the model produces a classification/enum, copy the domain into the
// result vector.
int nc = nclasses();
Vec [] newVecs = new Vec[]{adaptFrm.anyVec().makeZero(classNames())};
// Classifiers additionally get one output vector per class.
if(nc > 1)
newVecs = Utils.join(newVecs,adaptFrm.anyVec().makeZeros(nc));
String [] names = new String[newVecs.length];
names[0] = "predict";
// Remaining output columns are named after the response classes.
for(int i = 1; i < names.length; ++i)
names[i] = classNames()[i-1];
final int num_features = nfeatures();
new MRTask2() {
@Override public void map( Chunk chks[] ) {
double tmp [] = new double[num_features]; // We do not need the last field representing response
// One slot for regression; prediction plus one slot per class otherwise.
float preds[] = new float [nclasses()==1?1:nclasses()+1];
int len = chks[0]._len;
for( int row=0; row<len; row++ ) {
float p[] = score0(chks,row,tmp,preds);
// Output chunks follow the num_features input chunks.
for( int c=0; c<preds.length; c++ )
chks[num_features+c].set0(row,p[c]);
}
}
}.doAll(Utils.join(adaptFrm.vecs(),newVecs));
// Return just the output columns
return new Frame(names,newVecs);
}
/** Scores a single row of a compatible Frame: the row's values are read from
* every column and passed on together with the frame's names and domains. */
public final float[] score( Frame fr, boolean exact, int row ) {
  final double[] values = new double[fr.numCols()];
  int col = 0;
  while (col < values.length) {
    values[col] = fr.vecs()[col].at(row);
    col++;
  }
  return score(fr.names(), fr.domains(), exact, values);
}
/** Scores a single row given as raw values plus the names and domains of its
* columns. An adaption map is built on every call, which is fairly expensive. */
public final float[] score( String names[], String domains[][], boolean exact, double row[] ) {
  final int[][][] adaption = adapt(names, domains, exact);
  final float[] preds = new float[nclasses()];
  return score(adaption, row, preds);
}
/** Single row scoring, on a compatible set of data, given an adaption vector.
* NOTE: the implementation is currently disabled (see the FIXME block below),
* so this method always returns null. */
public final float[] score( int map[][][], double row[], float[] preds ) {
/*FIXME final int[][] colMap = map[map.length-1]; // Response column mapping is the last array
assert colMap.length == _names.length-1 : " "+Arrays.toString(colMap)+" "+Arrays.toString(_names);
double tmp[] = new double[colMap.length]; // The adapted data
for( int i=0; i<colMap.length; i++ ) {
// Column mapping, or NaN for missing columns
double d = colMap[i]==-1 ? Double.NaN : row[colMap[i]];
if( map[i] != null ) { // Enum mapping
int e = (int)d;
if( e < 0 || e >= map[i].length ) d = Double.NaN; // User data is out of adapt range
else {
e = map[i][e];
d = e==-1 ? Double.NaN : (double)e;
}
}
tmp[i] = d;
}
return score0(tmp,preds); // The results. */
return null;
}
/** Builds the per-column domain mappings between a dataset and this model.
* The result has one entry per passed column name. An entry is null when the
* data domain and the model domain are the same array or hold equal values
* (identity mapping); otherwise it is the mapping produced by
* {@link #getDomainMapping(String, String[], String[], boolean)}.
* A categorical data column where the model expects a numeric one is always
* rejected with an IllegalArgumentException. A numeric data column where the
* model expects a categorical one is rejected with an IllegalArgumentException
* when 'exact' is true and is unimplemented otherwise.
*/
private int[][][] adapt( String names[], String domains[][], boolean exact) {
  final int[][][] mappings = new int[names.length][][];
  for (int col = 0; col < names.length; col++) {
    final String[] modelDomain = _domains[col]; // Model enum
    final String[] dataDomain = domains[col];   // Data enum
    // Same array (including both null): trivially equal, identity mapping.
    if (modelDomain == dataDomain) continue;
    if (modelDomain == null) {
      throw new IllegalArgumentException("Incompatible column: '" + _names[col] + "', expected (trained on) numeric, was passed a categorical");
    }
    if (dataDomain == null) {
      if (exact) {
        throw new IllegalArgumentException("Incompatible column: '" + _names[col] + "', expected (trained on) categorical, was passed a numeric");
      }
      throw H2O.unimpl(); // Attempt an asEnum?
    }
    // Equal contents keep the null entry, which stands for the identity mapping.
    if (!Arrays.deepEquals(modelDomain, dataDomain)) {
      mappings[col] = getDomainMapping(_names[col], modelDomain, dataDomain, exact);
    }
  }
  return mappings;
}
/**
* Value used to fill columns that the model expects but the scored dataset
* lacks. Override this method for models that have sparse data handling;
* the default is NaN.
* @return real-valued number (can be NaN)
*/
protected double missingColumnsType() {
  return Double.NaN;
}
/** Builds an adapted Frame from the given Frame, for efficient bulk scoring
* of a new dataset against an existing model. Delegates to
* {@link #adapt(Frame, boolean, boolean)} with <code>haveResponse</code> set
* to true.
* It returns a <b>two element array</b>: the adapted frame, and a frame
* holding only the vectors which were created during adaptation (deleting
* the second frame deletes all of those vectors). */
public Frame[] adapt( final Frame fr, boolean exact) {
  final boolean haveResponse = true;
  return adapt(fr, exact, haveResponse);
}
/** Adapts the given frame to this model's column layout and domains.
* Works on a copy of <code>fr</code>. Columns are selected by the model's
* names, missing ones are filled with {@link #missingColumnsType()}, and
* columns whose domain differs from the model's get a transforming vector.
*
* @param fr frame to adapt; not modified
* @param exact passed on to the domain adaption; when false, columns for which
*              the model expects a domain but which are not enums are converted to enums first
* @param haveResponse when true (and the model is supervised) a response column
*                     found in the frame is moved to the end and kept; when it is
*                     not found the response is left out of the adapted frame
* @return two element array: the adapted frame, and a frame with the vectors
*         the caller should delete once done
*/
public Frame[] adapt( final Frame fr, boolean exact, boolean haveResponse) {
Frame vfr = new Frame(fr); // To avoid modification of original frame fr
int n = _names.length;
if (haveResponse && isSupervised()) {
int ridx = vfr.find(_names[_names.length - 1]);
if (ridx != -1 && ridx != vfr._names.length - 1) { // Unify frame - put response to the end
String name = vfr._names[ridx];
vfr.add(name, vfr.remove(ridx));
}
// Without a response column in the data only the feature columns are adapted.
n = ridx == -1 ? _names.length - 1 : _names.length;
}
String [] names = isSupervised() ? Arrays.copyOf(_names, n) : _names.clone();
Frame [] subVfr;
// replace missing columns with NaNs (or 0s for DeepLearning with sparse data)
subVfr = vfr.subframe(names, missingColumnsType());
vfr = subVfr[0]; // extract only subframe but keep the rest for delete later
Vec[] frvecs = vfr.vecs();
// Tracks which vectors were converted to enums below.
boolean[] toEnum = new boolean[frvecs.length];
if(!exact) for(int i = 0; i < n;++i)
if(_domains[i] != null && !frvecs[i].isEnum()) {// if model expects domain but input frame does not have domain => switch vector to enum
frvecs[i] = frvecs[i].toEnum();
toEnum[i] = true;
}
int[][][] map = adapt(names,vfr.domains(),exact);
assert map.length == names.length; // Be sure that adapt call above do not skip any column
ArrayList<Vec> avecs = new ArrayList<Vec>(); // adapted vectors
ArrayList<String> anames = new ArrayList<String>(); // names for adapted vector
for( int c=0; c<map.length; c++ ) // Iterate over columns
if(map[c] != null) { // Column needs adaptation
Vec adaptedVec;
if (toEnum[c]) { // Vector was already converted to an enum above, compose transformation
adaptedVec = TransfVec.compose( (TransfVec) frvecs[c], map[c], vfr.domains()[c], false);
} else adaptedVec = frvecs[c].makeTransf(map[c], vfr.domains()[c]);
avecs.add(frvecs[c] = adaptedVec);
anames.add(names[c]); // Collect right names
} else if (toEnum[c]) { // Vector was transformed to enum domain, but does not need adaptation we need to record it
avecs.add(frvecs[c]);
anames.add(names[c]);
}
// Fill trash bin by vectors which need to be deleted later by the caller.
Frame vecTrash = new Frame(anames.toArray(new String[anames.size()]), avecs.toArray(new Vec[avecs.size()]));
// The second subframe result, when present, is added to the trash as well.
if (subVfr[1]!=null) vecTrash.add(subVfr[1], true);
return new Frame[] { new Frame(names,frvecs), vecTrash };
}
/** Returns a mapping between the values of the model domain (<code>modelDom</code>)
* and the given column domain, for a column without a known name.
* @see #getDomainMapping(String, String[], String[], boolean) */
public static int[][] getDomainMapping(String[] modelDom, String[] colDom, boolean exact) {
  final String unnamedColumn = null;
  return getDomainMapping(unnamedColumn, modelDom, colDom, exact);
}
/**
* Returns a mapping for the given column between the model domain
* (<code>modelDom</code>) and the column's own domain (<code>colDom</code>).
* Every model level that also occurs in <code>colDom</code> is paired with the
* index of that level in <code>colDom</code>; the pairs are packed and sorted
* so that the result supports binary search in TransfVec.
*
* @param colName name of column which is mapped, can be null (only used in log messages).
* @param modelDom domain the model was built with
* @param colDom domain of the column being mapped
* @param logNonExactMapping when true, a warning is logged for every level that
*                           appears in only one of the two domains
* @return the packed mapping, sorted by its first array
*/
public static int[][] getDomainMapping(String colName, String[] modelDom, String[] colDom, boolean logNonExactMapping) {
  int emap[] = new int[modelDom.length];
  boolean bmap[] = new boolean[modelDom.length];
  // Marks the column levels that some model level maps to. A dedicated flag
  // array is needed because unmapped slots of emap keep the default value 0
  // and would otherwise be mistaken for a mapping to column level 0.
  boolean used[] = new boolean[colDom.length];
  HashMap<String,Integer> md = new HashMap<String, Integer>((int) ((colDom.length/0.75f)+1));
  for( int i = 0; i < colDom.length; i++) md.put(colDom[i], i);
  for( int i = 0; i < modelDom.length; i++) {
    Integer I = md.get(modelDom[i]);
    if (I == null && logNonExactMapping)
      Log.warn(Sys.SCORM, "Domain mapping: target domain contains the factor '"+modelDom[i]+"' which DOES NOT appear in input domain " + (colName!=null?"(column: " + colName+")":""));
    if (I!=null) {
      emap[i] = I;
      bmap[i] = true;
      used[I] = true;
    }
  }
  if (logNonExactMapping) { // Inform about additional values in column domain which do not appear in model domain
    for (int i=0; i<colDom.length; i++) {
      if (!used[i])
        Log.warn(Sys.SCORM, "Domain mapping: target domain DOES NOT contain the factor '"+colDom[i]+"' which appears in input domain "+ (colName!=null?"(column: " + colName+")":""));
    }
  }
  // produce packed values
  int[][] res = Utils.pack(emap, bmap);
  // Sort values in numeric order to support binary search in TransfVec
  Utils.sortWith(res[0], res[1]);
  return res;
}
/** Bulk scoring API for one row. The feature values of the row are loaded
* from the leading chunks into <code>tmp</code> and handed to the subclass
* scoring logic. For classifiers with both a prior and a model class
* distribution, the probabilities are then corrected and the label in
* slot 0 is recomputed from the corrected probabilities. */
protected float[] score0( Chunk chks[], int row_in_chunk, double[] tmp, float[] preds ) {
  assert chks.length>=_names.length; // Last chunk is for the response
  // Only the features are loaded; the last value can contain a response.
  int col = 0;
  while (col < nfeatures()) {
    tmp[col] = chks[col].at0(row_in_chunk);
    col++;
  }
  final float[] scored = score0(tmp, preds);
  final boolean needsCorrection = isClassifier() && _priorClassDist != null && _modelClassDist != null;
  if (!needsCorrection) {
    return scored;
  }
  // Correct probabilities obtained from training on oversampled data back to original distribution
  // C.f. http://gking.harvard.edu/files/0s.pdf Eq.(27)
  assert(scored.length == nclasses()+1); //1 label + nclasses probs
  ModelUtils.correctProbabilities(scored, _priorClassDist, _modelClassDist);
  //set label based on corrected probabilities (max value wins, with deterministic tie-breaking)
  scored[0] = ModelUtils.getPrediction(scored, tmp);
  return scored;
}
/**
* Compute the model error for a given test data set.
* For multi-class classification, this is the classification error based on assigning labels for the highest predicted per-class probability.
* For binary classification, this is the classification error based on assigning labels using the optimal threshold for maximizing the F1 score.
* For regression, this is the mean squared error (MSE).
* When neither <code>auc</code> nor <code>cm</code> is given, positive infinity is returned.
* @param ftest Frame containing test data
* @param vactual The response column Vec
* @param fpreds Frame containing ADAPTED (domain labels from train+test data) predicted data (classification: label + per-class probabilities, regression: target)
* @param hitratio_fpreds Frame containing predicted data (domain labels from test data) (classification: label + per-class probabilities, regression: target)
* @param label Name for the scored data set to be printed
* @param printMe Whether to print the scoring results to Log.info
* @param max_conf_mat_size Largest size of Confusion Matrix (#classes) for it to be printed to Log.info
* @param cm Confusion Matrix object to populate for multi-class classification (also used for regression), may be null
* @param auc AUC object to populate for binary classification, may be null
* @param hr HitRatio object to populate for classification, may be null
* @return model error, see description above
*/
public double calcError(final Frame ftest, final Vec vactual,
final Frame fpreds, final Frame hitratio_fpreds,
final String label, final boolean printMe,
final int max_conf_mat_size, final water.api.ConfusionMatrix cm,
final AUC auc,
final HitRatio hr)
{
// Collects the ASCII reports of all populated objects for optional logging.
StringBuilder sb = new StringBuilder();
double error = Double.POSITIVE_INFINITY;
// populate AUC
if (auc != null) {
assert(isClassifier());
assert(nclasses() == 2);
auc.actual = ftest;
auc.vactual = vactual;
auc.predict = fpreds;
auc.vpredict = fpreds.vecs()[2]; //binary classifier (label, prob0, prob1 (THIS ONE), adaptedlabel)
auc.invoke();
auc.toASCII(sb);
error = auc.data().err(); //using optimal threshold for F1
}
// populate CM
if (cm != null) {
cm.actual = ftest;
cm.vactual = vactual;
cm.predict = fpreds;
cm.vpredict = fpreds.vecs()[0]; // prediction (either label or regression target)
cm.invoke();
if (isClassifier()) {
if (auc != null) {
AUCData aucd = auc.data();
//override the CM with the one computed by AUC (using optimal threshold)
//Note: must still call invoke above to set the domains etc.
cm.cm = new long[3][3]; // 1 extra layer for NaNs (not populated here, since AUC skips them)
cm.cm[0][0] = aucd.cm()[0][0];
cm.cm[1][0] = aucd.cm()[1][0];
cm.cm[0][1] = aucd.cm()[0][1];
cm.cm[1][1] = aucd.cm()[1][1];
double cm_err = new hex.ConfusionMatrix(cm.cm).err();
double auc_err = aucd.err();
if (! (Double.isNaN(cm_err) && Double.isNaN(auc_err))) // NOTE: NaN != NaN
assert(cm_err == auc_err); //check consistency with AUC-computed error
} else {
error = new hex.ConfusionMatrix(cm.cm).err(); //only set error if AUC didn't already set the error
}
// Only matrices up to the requested size are added to the report.
if (cm.cm.length <= max_conf_mat_size+1) cm.toASCII(sb);
} else {
// Regression: the error is the MSE computed by the confusion matrix object.
assert(auc == null);
error = cm.mse;
cm.toASCII(sb);
}
}
// populate HitRatio
if (hr != null) {
assert(isClassifier());
hr.actual = ftest;
hr.vactual = vactual;
hr.predict = hitratio_fpreds;
hr.invoke();
hr.toASCII(sb);
}
// Log the collected report line by line.
if (printMe && sb.length() > 0) {
Log.info("Scoring on " + label + " data:");
for (String s : sb.toString().split("\n")) Log.info(s);
}
return error;
}
/** The scoring logic proper, supplied by subclasses. The input row arrives
* pre-loaded in a re-used temp array, in the order the model expects. The
* predictions are written into the re-used <code>preds</code> array, which
* is also returned. */
protected abstract float[] score0(double data[/*ncols*/],
                                  float preds[/*nclasses+1*/]);
// Version where the user has just ponied-up an array of data to be scored.
// Data must be in proper order. Handy for JUnit tests.
// NOTE(review): the predictions array is sized nclasses() here, while the bulk
// scoring path sizes it nclasses()+1 for classifiers -- confirm this is intended.
public double score(double [] data){
  final float[] preds = new float[nclasses()];
  final float[] scored = score0(data, preds);
  return Utils.maxIndex(scored);
}
/** Debug flag to generate benchmark code. */
protected static final boolean GEN_BENCHMARK_CODE = false;
/** Renders this model as the source text of a standalone Java class.
* The generated Java is of the form:
* <pre>
* class UUIDxxxxModel {
* public static final String NAMES[] = { ....column names... }
* public static final String DOMAINS[][] = { ....domain names... }
* // Pass in data in a double[], pre-aligned to the Model's requirements.
* // Jam predictions into the preds[] array; preds[0] is reserved for the
* // main prediction (class for classifiers or value for regression),
* // and remaining columns hold a probability distribution for classifiers.
* float[] predict( double data[], float preds[] );
* double[] map( HashMap < String,Double > row, double data[] );
* // Does the mapping lookup for every row, no allocation
* float[] predict( HashMap < String,Double > row, double data[], float preds[] );
* // Allocates a double[] for every row
* float[] predict( HashMap < String,Double > row, float preds[] );
* // Allocates a double[] and a float[] for every row
* float[] predict( HashMap < String,Double > row );
* }
* </pre>
* @return the generated source, built by {@link #toJava(SB)} on a fresh buffer
*/
public String toJava() {
  final SB generated = toJava(new SB());
  return generated.toString();
}
/** Appends the generated Java source for this model to <code>sb</code>: a
* comment header with build/download instructions, the generated class with
* its names, domains, class distributions and predict methods, and finally
* the helper classes collected in a separate file-level buffer.
* @param sb buffer to append to
* @return the same buffer
*/
public SB toJava( SB sb ) {
SB fileContextSB = new SB(); // preserve file context
String modelName = JCodeGen.toJavaId(_key.toString());
// HEADER
sb.p("import java.util.Map;").nl();
sb.p("import water.genmodel.GenUtils.*;").nl().nl();
sb.p("// AUTOGENERATED BY H2O at ").p(new Date().toString()).nl();
sb.p("// ").p(H2O.getBuildVersion().toString()).nl();
sb.p("//").nl();
sb.p("// Standalone prediction code with sample test data for ").p(this.getClass().getSimpleName()).p(" named ").p(modelName).nl();
sb.p("//").nl();
sb.p("// How to download, compile and execute:").nl();
sb.p("// mkdir tmpdir").nl();
sb.p("// cd tmpdir").nl();
sb.p("// curl http:/").p(H2O.SELF.toString()).p("/h2o-model.jar > h2o-model.jar").nl();
sb.p("// curl http:/").p(H2O.SELF.toString()).p("/2/").p(this.getClass().getSimpleName()).p("View.java?_modelKey=").pobj(_key).p(" > ").p(modelName).p(".java").nl();
sb.p("// javac -cp h2o-model.jar -J-Xmx2g -J-XX:MaxPermSize=128m ").p(modelName).p(".java").nl();
// The run command is only emitted when benchmark code generation is enabled.
if (GEN_BENCHMARK_CODE)
sb.p("// java -cp h2o-model.jar:. -Xmx2g -XX:MaxPermSize=256m -XX:ReservedCodeCacheSize=256m ").p(modelName).nl();
sb.p("//").nl();
sb.p("// (Note: Try java argument -XX:+PrintCompilation to show runtime JIT compiler behavior.)").nl();
sb.nl();
sb.p("public class ").p(modelName).p(" extends water.genmodel.GeneratedModel {").nl(); // or extends GenerateModel
// Class body: each helper appends its part to sb; some also emit helper
// classes into fileContextSB.
toJavaInit(sb, fileContextSB).nl();
toJavaNAMES(sb, fileContextSB);
toJavaNCLASSES(sb);
toJavaDOMAINS(sb, fileContextSB);
toJavaPROB(sb);
toJavaSuper(sb); //
toJavaPredict(sb, fileContextSB);
sb.p("}").nl();
sb.p(fileContextSB).nl(); // Append file
return sb;
}
/** Emits the accessor methods of the generated class: getNames(),
* getDomainValues() and getUUID(). The UUID is this model's unique id, or
* the model key when no unique id is available. */
protected SB toJavaSuper( SB sb ) {
  final String uuid;
  if (this.uniqueId != null) {
    uuid = this.uniqueId.getId();
  } else {
    uuid = this._key.toString();
  }
  sb.nl();
  sb.ii(1);
  sb.i().p("public String[] getNames() { return NAMES; } ").nl();
  sb.i().p("public String[][] getDomainValues() { return DOMAINS; }").nl();
  sb.i().p("public String getUUID() { return ").ps(uuid).p("; }").nl();
  return sb;
}
/** Emits the NAMES constant into the class body and, into the file-level
* buffer, the holder class that carries the training column names. */
private SB toJavaNAMES(SB sb, SB fileContextSB) {
  final String holder = "NamesHolder";
  sb.i().p("// ").p("Names of columns used by model.").nl();
  sb.i().p("public static final String[] NAMES = NamesHolder.VALUES;").nl();
  // The holder class fills the names into an array.
  fileContextSB.i().p("// The class representing training column names ").nl();
  JCodeGen.toClassWithArray(fileContextSB, null, holder, _names);
  return sb;
}
/** Emits the NCLASSES constant for classifiers; for other models nothing is emitted. */
protected SB toJavaNCLASSES( SB sb ) {
  if (!isClassifier()) {
    return sb;
  }
  return JCodeGen.toStaticVar(sb, "NCLASSES", nclasses(), "Number of output classes included in training data response column.");
}
/** Emits the DOMAINS constant into the class body. Every column with a
* domain refers to a ColInfo_<index> holder class, which is emitted into the
* file-level buffer; columns without a domain are written as null. */
private SB toJavaDOMAINS( SB sb, SB fileContextSB ) {
  sb.nl();
  sb.ii(1);
  sb.i().p("// Column domains. The last array contains domain of response column.").nl();
  sb.i().p("public static final String[][] DOMAINS = new String[][] {").nl();
  final int lastIdx = _domains.length - 1;
  for (int col = 0; col <= lastIdx; col++) {
    final String[] levels = _domains[col];
    final boolean hasDomain = levels != null;
    final String holder = "ColInfo_"+col;
    sb.i(1).p("/* ").p(_names[col]).p(" */ ");
    if (hasDomain) {
      sb.p(holder).p(".VALUES");
    } else {
      sb.p("null");
    }
    if (col != lastIdx) {
      sb.p(',');
    }
    sb.nl();
    if (hasDomain) {
      fileContextSB.i().p("// The class representing column ").p(_names[col]).nl();
      JCodeGen.toClassWithArray(fileContextSB, null, holder, levels);
    }
  }
  return sb.i().p("};").nl();
}
/** Emits the prior and the model class distribution as static variables,
* after decreasing the indentation by one level. */
private SB toJavaPROB( SB sb) {
  sb.di(1);
  JCodeGen.toStaticVar(sb, "PRIOR_CLASS_DISTRIB", _priorClassDist, "Prior class distribution");
  JCodeGen.toStaticVar(sb, "MODEL_CLASS_DISTRIB", _modelClassDist, "Class distribution used for model building");
  return sb;
}
// Hooks for subclasses to provide some top-level model-specific goodness.
/** Source-generation hook; the default emits nothing and returns <code>sb</code>. */
protected SB toJavaInit(SB sb, SB fileContextSB) {
  return sb;
}
/** Class-level hook; the default does nothing. */
protected void toJavaInit(CtClass ct) {
}
// Override in subclasses to provide some inside 'predict' call goodness.
// Code written to the class-context buffer is appended into the generated
// top level class after the predict method.
/** Default: Java code generation is unsupported, so this always throws. */
protected void toJavaPredictBody(SB bodySb, SB classCtxSb, SB fileCtxSb) {
  final String message = "This model type does not support conversion to Java";
  throw new IllegalArgumentException(message);
}
// Wrapper around the main predict call, including the signature and return value
/** Emits both predict methods. The body comes from
* {@link #toJavaPredictBody(SB, SB, SB)}; whatever that call writes into its
* class-context buffer is appended after the predict method. */
private SB toJavaPredict(SB ccsb, SB fileCtxSb) { // ccsb = classContext
  ccsb.nl();
  ccsb.p(" // Pass in data in a double[], pre-aligned to the Model's requirements.").nl();
  ccsb.p(" // Jam predictions into the preds[] array; preds[0] is reserved for the").nl();
  ccsb.p(" // main prediction (class for classifiers or value for regression),").nl();
  ccsb.p(" // and remaining columns hold a probability distribution for classifiers.").nl();
  // Convenience overload delegating with the default iteration limit.
  ccsb.p(" public final float[] predict( double[] data, float[] preds) { preds = predict( data, preds, "+toJavaDefaultMaxIters()+"); return preds; }").nl();
  ccsb.p(" public final float[] predict( double[] data, float[] preds, int maxIters ) {").nl();
  final SB classCtxSb = new SB();
  // The body is generated one indentation level deeper.
  final SB bodySb = ccsb.ii(1);
  toJavaPredictBody(bodySb, classCtxSb, fileCtxSb);
  ccsb.di(1);
  ccsb.p(" return preds;").nl();
  ccsb.p(" }").nl();
  ccsb.p(classCtxSb);
  return ccsb;
}
/** Source text of the default maxIters argument used by the generated
* two-argument predict method; "-1" by default. */
protected String toJavaDefaultMaxIters() {
  return "-1";
}
/** Hook generating code which unifies preds[1,...NCLASSES]; the default generates nothing. */
protected void toJavaUnifyPreds(SB bodySb) {
  // Nothing to generate by default.
}
/** Generates the code filling preds[0] from the already filled and unified
* preds[1,..NCLASSES]. Regression copies preds[1]; classifiers pick the
* prediction from the probabilities, after correcting them when both class
* distributions are known. */
protected void toJavaFillPreds0(SB bodySb) {
  if (!isClassifier()) {
    bodySb.i().p("preds[0] = preds[1];").nl();
    return;
  }
  final boolean correct = _priorClassDist != null && _modelClassDist != null;
  if (correct) {
    bodySb.i().p("water.util.ModelUtils.correctProbabilities(preds, PRIOR_CLASS_DISTRIB, MODEL_CLASS_DISTRIB);").nl();
  }
  // Pick max index as a prediction
  bodySb.i().p("preds[0] = water.util.ModelUtils.getPrediction(preds,data);").nl();
}
/**
* Compute the cross validation error from an array of predictions for N folds.
* Also stores the results in the model for display/query.
* The temporary frame holding the stitched predictions is always deleted,
* including when stitching or scoring fails.
* @param job job passed on to setCrossValidationError
* @param source Full training data
* @param response Full response
* @param cv_preds N Frames containing predictions made by N-fold CV runs on disjoint contiguous holdout pieces of the training data
* @param offsets Starting row numbers for the N CV pieces (length = N+1, first element: 0, last element: #rows)
*/
public final void scoreCrossValidation(Job.ValidatedJob job, Frame source, Vec response, Frame[] cv_preds, long[] offsets) {
  assert(offsets[0] == 0);
  assert(offsets[offsets.length-1] == source.numRows());
  //Hack to make a frame with the correct dimensions and vector group
  Frame cv_pred = score(source);
  // Everything that uses cv_pred lives inside the try, so that the temporary
  // frame cannot leak when stitching or scoring throws.
  try {
    // Stitch together the content of cv_pred from cv_preds
    for (int i=0; i<cv_preds.length; ++i) {
      // stitch probabilities (or regression values)
      for (int c=(isClassifier() ? 1 : 0); c<cv_preds[i].numCols(); ++c) {
        Vec.Writer vw = cv_pred.vec(c).open();
        try {
          for (long r=0; r < cv_preds[i].numRows(); ++r) {
            vw.set(offsets[i] + r, cv_preds[i].vec(c).at(r));
          }
        } finally {
          vw.close();
        }
      }
      if (isClassifier()) {
        // make labels
        float[] probs = new float[cv_preds[i].numCols()];
        Vec.Writer vw = cv_pred.vec(0).open();
        try {
          for (long r = 0; r < cv_preds[i].numRows(); ++r) {
            //probs[0] stays 0, is not used in getPrediction
            for (int c = 1; c < cv_preds[i].numCols(); ++c) {
              probs[c] = (float) cv_preds[i].vec(c).at(r);
            }
            final int label = ModelUtils.getPrediction(probs, (int)r);
            vw.set(offsets[i] + r, label);
          }
        } finally {
          vw.close();
        }
      }
    }
    // Now score the model on the N folds
    AUC auc = nclasses() == 2 ? new AUC() : null;
    water.api.ConfusionMatrix cm = new water.api.ConfusionMatrix();
    HitRatio hr = isClassifier() ? new HitRatio() : null;
    double cv_error = calcError(source, response, cv_pred, cv_pred, "cross-validated", true, 10, cm, auc, hr);
    setCrossValidationError(job, cv_error, cm, auc == null ? null : auc.data(), hr);
  } finally {
    // cleanup temporary frame with predictions
    cv_pred.delete();
  }
}
/** Stores the cross-validation results in the model. Unimplemented in the
* base class: always throws. */
protected void setCrossValidationError(Job.ValidatedJob job,
                                       double cv_error,
                                       water.api.ConfusionMatrix cm,
                                       AUCData auc,
                                       HitRatio hr) {
  throw H2O.unimpl();
}
/** Appends an HTML table listing the cross-validation models of this model's
* job: one row per model key, showing a link to the model (or "Pending" when
* it cannot be loaded yet) and, when the model has a job, that job's progress.
* Nothing is appended when there is no job or no cross-validation models. */
protected void printCrossValidationModelsHTML(StringBuilder sb) {
  if (job() == null) return;
  final Job.ValidatedJob job = (Job.ValidatedJob)job();
  final boolean hasModels = job.xval_models != null && job.xval_models.length > 0;
  if (!hasModels) return;
  sb.append("<h4>Cross Validation Models</h4>");
  sb.append("<table class='table table-bordered table-condensed'>");
  sb.append("<tr><th>Model</th></tr>");
  for (Key k : job.xval_models) {
    Model m = UKV.get(k);
    Job j = null;
    if (m != null) {
      j = (Job)m.job();
    }
    sb.append("<tr>");
    sb.append("<td>" + (m != null ? Inspector.link(k.toString(), k.toString()) : "Pending") + (j != null ? ", Progress: " + Utils.formatPct(j.progress()) : "") + "</td>");
    sb.append("</tr>");
  }
  sb.append("</table>");
}
/** Helper type for serialization: an AutoBufferSerializer bound to Model. */
protected static class ModelAutobufferSerializer extends AutoBufferSerializer<Model> {
}
/** Creates a new serializer that writes models into an AutoBuffer. */
public AutoBufferSerializer<Model> getModelSerializer() {
  final AutoBufferSerializer<Model> serializer = new ModelAutobufferSerializer();
  return serializer;
}
}