package hex.tree;
import hex.*;
import hex.genmodel.GenModel;
import hex.genmodel.utils.DistributionFamily;
import water.Key;
import water.MRTask;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.Vec;
/** Score the tree columns, and produce a confusion matrix and AUC
*/
public class Score extends MRTask<Score> {
  /** Tree builder whose model, response chunk and helper columns are used for scoring. */
  final SharedTree _bldr;
  /** Scoring on pre-scored training data vs full-score data. */
  final boolean _is_train;
  /** Metrics are computed on OOB rows only. */
  final boolean _oob;
  /** Response vector key (might be either train or validation). */
  final Key<Vec> _kresp;
  /** Model category (Binomial, Regression, etc). */
  final ModelCategory _mcat;
  /** Metric builder; created in {@link #map} and merged in {@link #reduce}. */
  ModelMetrics.MetricBuilder _mb;
  /** Whether a prediction frame should be produced for binomial models in {@link #makeModelMetrics}. */
  final boolean _computeGainsLift;

  /**
   * Compute ModelMetrics on the testing dataset.
   * It expects an already adapted validation dataset, i.e. one that is adapted to the model
   * and contains a response which is adapted to the confusion matrix domain.
   *
   * @param bldr             tree builder providing the model and per-chunk helpers
   * @param is_train         true when scoring directly on the (pre-scored) training columns
   * @param oob              true when rows with a zero OOB-column value must be skipped
   * @param kresp            key of the response vector (train or validation)
   * @param mcat             model category
   * @param computeGainsLift true when predictions are needed for binomial models
   */
  public Score(SharedTree bldr, boolean is_train, boolean oob, Key<Vec> kresp, ModelCategory mcat, boolean computeGainsLift) {
    _bldr = bldr;
    _is_train = is_train;
    _oob = oob;
    _kresp = kresp;
    _mcat = mcat;
    _computeGainsLift = computeGainsLift;
  }

  /**
   * Scores every usable row of the chunk set and accumulates it into a fresh metric builder.
   * Rows are skipped when the response is NA, when OOB scoring is requested and the OOB column
   * holds 0 for the row, or when the row weight is 0.
   */
  @Override public void map( Chunk chks[] ) {
    Chunk ys = _bldr.chk_resp(chks); // Response
    Model m = _bldr._model;
    Chunk weightsChunk = m._output.hasWeights() ? chks[m._output.weightsIdx()] : null;
    Chunk offsetChunk = m._output.hasOffset() ? chks[m._output.offsetIdx()] : null;
    final int nclass = _bldr.nclasses();
    // Because of adaption - the validation training set has at least as many
    // classes as the training set (it may have more). The Confusion Matrix
    // needs to be at least as big as the training set domain.
    String[] domain = _kresp.get().domain();
    // If this is a score-on-train AND DRF, then oobColIdx makes sense,
    // otherwise this field is unused.
    final int oobColIdx = _bldr.idx_oobt();
    _mb = m.makeMetricBuilder(domain);
    final double[] cdists = _mb._work; // Temp working array for class distributions
    // Scoring on train with no trees built yet: the feature row is copied into tmp
    // so that prediction tie breaking has data to work with.
    final boolean nullModelOnTrain = _is_train && _bldr._ntrees == 0;
    // If working a validation set, need to push thru official model scoring
    // logic which requires a temp array to hold the features.
    // tmp stays null when scoring on train with at least one tree.
    final double[] tmp = _is_train && _bldr._ntrees > 0 ? null : new double[_bldr._ncols];
    final float[] val = new float[1]; // Reused single-element holder for the actual response
    // Score all Rows
    for( int row = 0; row < ys._len; row++ ) {
      if( ys.isNA(row) ) continue; // Ignore missing response vars only if it was actual NA
      // With OOB scoring, skip rows whose OOB column value is 0
      // (presumably rows that were never out-of-bag and thus have no OOB prediction - confirm against DRF).
      if( _oob && chks[oobColIdx].atd(row) == 0 ) continue;
      final double weight = weightsChunk != null ? weightsChunk.atd(row) : 1;
      if( weight == 0 ) continue; // Ignore holdout rows
      final double offset = offsetChunk != null ? offsetChunk.atd(row) : 0;
      if( _is_train ) { // Passed in the model-specific columns
        _bldr.score2(chks, weight, offset, cdists, row); // Use the training data directly (per-row predictions already made)
      } else { // Must score "the hard way"
        m.score0(chks, weight, offset, row, tmp, cdists);
      }
      // Fill tmp with training data for null model - to have proper tie breaking
      if( nullModelOnTrain ) {
        for( int i = 0; i < tmp.length; i++ )
          tmp[i] = chks[i].atd(row);
      }
      if( nclass > 1 ) // Fill in prediction
        cdists[0] = GenModel.getPrediction(cdists, m._output._priorClassDist, tmp, m.defaultThreshold());
      val[0] = (float) ys.atd(row);
      _mb.perRow(cdists, val, weight, offset, m);
    }
  }

  /** Merges the metric builder of another task instance into this one. */
  @Override public void reduce( Score t ) {
    _mb.reduce(t._mb);
  }

  /**
   * Run after the doAll scoring to convert the MetricsBuilder to a ModelMetrics.
   * A prediction frame is produced by {@code model.score(fr)} only for two-class models when
   * gains/lift was requested, or when the distribution is huber; it is removed again before
   * returning, including when building the metrics throws.
   *
   * @param model model the metrics are built for
   * @param fr    frame the metrics are built on
   * @return the supervised model metrics created by the metric builder
   */
  ModelMetricsSupervised makeModelMetrics(SharedTreeModel model, Frame fr) {
    final boolean needsPreds = (model._output.nclasses() == 2 && _computeGainsLift)
        || model._parms._distribution == DistributionFamily.huber;
    final Frame preds = needsPreds ? model.score(fr) : null;
    try {
      return (ModelMetricsSupervised) _mb.makeModelMetrics(model, fr, null, preds);
    } finally {
      // Always drop the temporary prediction frame so it does not leak on failure.
      if( preds != null ) preds.remove();
    }
  }
}