package hex;
import water.*;
import water.H2O.H2OCountedCompleter;
import water.Job.JobCancelledException;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.NewChunk;
import water.fvec.Vec;
import water.util.Log;
import water.util.Utils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Random;
public abstract class FrameTask<T extends FrameTask<T>> extends MRTask2<T>{
/** Description of the column layout and transformations of the frame this task runs over. */
public final DataInfo _dinfo;
/** Key of the owning job; when non-null, map() throws JobCancelledException if that job is not running. */
final protected Key _jobKey;
/**
 * Amount of each chunk to process per map() call: map() makes ceil(_useFraction) passes over the
 * chunk and, when the per-pass fraction is below 1, randomly skips rows.
 */
protected float _useFraction = 1.0f;
/** When true, map() visits the rows of each chunk in a randomly shuffled order. */
protected boolean _shuffle = false;
/** @return true if map() should skip any row in which at least one column is NA (default: true). */
protected boolean skipMissing() { return true; }
/**
 * Creates a task without a completer.
 *
 * @param jobKey key of the owning job (may be null, in which case map() does no cancellation check)
 * @param dinfo  layout/transformation description of the frame to process
 */
public FrameTask(Key jobKey, DataInfo dinfo) {
  this(jobKey, dinfo, null);
}

/**
 * Creates a task that passes the given completer on to the superclass constructor.
 *
 * @param jobKey key of the owning job (may be null, in which case map() does no cancellation check)
 * @param dinfo  layout/transformation description of the frame to process
 * @param cmp    completer handed to the superclass constructor (may be null)
 */
public FrameTask(Key jobKey, DataInfo dinfo, H2OCountedCompleter cmp) {
  super(cmp);
  _dinfo = dinfo;
  _jobKey = jobKey;
}
/**
 * Copy constructor: shares the source task's job key and DataInfo (by reference) and copies
 * its row-sampling settings.
 *
 * @param ft task to copy the configuration from
 */
protected FrameTask(FrameTask ft) {
  _jobKey = ft._jobKey;
  _dinfo = ft._dinfo;
  _shuffle = ft._shuffle;
  _useFraction = ft._useFraction;
}
/** @return per-numeric-predictor multipliers from the DataInfo (null when no scaling is applied). */
public final double [] normMul() { return _dinfo._normMul; }

/** @return per-numeric-predictor offsets subtracted before scaling (null when no centering is applied). */
public final double [] normSub() { return _dinfo._normSub; }

/**
 * @return per-response multipliers (null when the response is not scaled).
 *         Previously this returned the predictor multipliers by mistake.
 */
public final double [] normRespMul() { return _dinfo._normRespMul; }

/**
 * @return per-response offsets (null when the response is not centered).
 *         Previously this returned the predictor offsets by mistake.
 */
public final double [] normRespSub() { return _dinfo._normRespSub; }
/**
 * Callback invoked by map() for every row that is not skipped; subclasses override the overload they need.
 * Numeric values, categorical values and the response are handed over separately.
 *
 * Categoricals are passed as absolute indexes into the expanded coefficient (beta) vector, whose
 * leading entries belong to the categorical columns. Unless all factor levels are used, level 0 of
 * a column produces no index, so the number of passed categoricals varies from row to row.
 *
 * Example with two categorical columns, each with levels A,B,C (each expanded to 2 binary slots):
 *   A,A - ncats = 0, nothing is passed
 *   A,B - ncats = 1, indexes = [2]
 *   B,B - ncats = 2, indexes = [0,2]
 *
 * @param gid      id computed by map() from the chunk start, the pass number and the row index
 * @param nums     numeric values of this row (after the predictor transform)
 * @param ncats    number of valid entries in cats
 * @param cats     indexes of the active categorical levels in the expanded vector
 * @param response response values of this row (null when the DataInfo has no responses)
 */
protected void processRow(long gid, double [] nums, int ncats, int [] cats, double [] response) {
  throw new RuntimeException("should've been overriden!");
}

/**
 * Variant of processRow used by map() when output chunks are present.
 *
 * @param outputs output chunks passed through from map()
 */
protected void processRow(long gid, double [] nums, int ncats, int [] cats, double [] response, NewChunk [] outputs) {
  throw new RuntimeException("should've been overriden!");
}
public static class DataInfo extends Iced {
// Frame this DataInfo describes. The public constructors lay it out as: categorical predictors,
// then numeric predictors, then the response column(s).
public Frame _adaptedFrame;
public int _responses; // number of responses
// Column transformations, as computed by the constructors:
//   NONE        - values are used as they are
//   STANDARDIZE - subtract the mean, multiply by 1/sigma (multiplier 1 when sigma == 0)
//   NORMALIZE   - subtract the mean, multiply by 1/(max-min) (multiplier 1 when max == min)
//   DEMEAN      - subtract the mean only
//   DESCALE     - multiply by 1/sigma only
public enum TransformType { NONE, STANDARDIZE, NORMALIZE, DEMEAN, DESCALE };
public TransformType _predictor_transform;
public TransformType _response_transform;
// When false, level 0 of every categorical column gets no slot in the expanded vector.
public boolean _useAllFactorLevels;
// Number of numeric predictor columns.
public int _nums;
// Number of categorical predictor columns.
public int _cats;
// Length _cats+1: entry i is the first expanded index of categorical column i,
// the last entry is the total expanded width of all categoricals.
public int [] _catOffsets;
// Per categorical column: 1 if the column had NAs when the public constructor ran, else 0.
public int [] _catMissing;
// Multipliers / offsets for the numeric predictors (either may be null, depending on the transform).
public double [] _normMul;
public double [] _normSub;
// Multipliers / offsets for the responses (either may be null, depending on the transform).
public double [] _normRespMul;
public double [] _normRespSub;
// Cross-validation fold: when _nfolds > 0, FrameTask.map() skips rows whose global index
// modulo _nfolds equals _foldId.
public int _foldId;
public int _nfolds;
// Not assigned anywhere in this file.
public Key _frameKey;
public boolean _hasIntercept;
/**
 * Creates a copy of this object by writing it to an AutoBuffer and reading it back
 * into a fresh DataInfo.
 *
 * @return the deserialized copy
 */
public DataInfo deep_clone() {
  final AutoBuffer buf = new AutoBuffer();
  write(buf);
  buf.flipForReading();
  final DataInfo copy = new DataInfo();
  return copy.read(buf);
}
/** Creates an empty instance (no level filtering, intercept enabled); used by deep_clone(). */
private DataInfo() {
  _hasIntercept = true;
  _catLvls = null;
}
/**
 * Creates a view of an existing DataInfo restricted to one cross-validation fold.
 * All arrays and the frame are shared with the source (no copies are made).
 *
 * @param dinfo  source description; must not carry filtered categorical levels
 * @param foldId id of the fold
 * @param nfolds total number of folds
 */
private DataInfo(DataInfo dinfo, int foldId, int nfolds) {
  assert dinfo._catLvls == null:"Should not be called with filtered levels (assuming the selected levels may change with fold id) ";
  // fold selection
  _nfolds = nfolds;
  _foldId = foldId;
  // column layout
  _adaptedFrame = dinfo._adaptedFrame;
  _responses = dinfo._responses;
  _cats = dinfo._cats;
  _nums = dinfo._nums;
  _catOffsets = dinfo._catOffsets;
  _catMissing = dinfo._catMissing;
  _catLvls = null;
  _useAllFactorLevels = dinfo._useAllFactorLevels;
  _hasIntercept = dinfo._hasIntercept;
  // transformations
  _predictor_transform = dinfo._predictor_transform;
  _response_transform = dinfo._response_transform;
  _normSub = dinfo._normSub;
  _normMul = dinfo._normMul;
  _normRespSub = dinfo._normRespSub;
  _normRespMul = dinfo._normRespMul;
}
public DataInfo(Frame fr, int hasResponses, boolean hasIntercept, boolean useAllFactorLvls, double [] normSub, double [] normMul, TransformType predictor_transform, double [] normRespSub, double [] normRespMul){
this(fr, hasResponses, hasIntercept, useAllFactorLvls,
normMul != null && normSub != null ? predictor_transform : TransformType.NONE, //just allocate, doesn't matter whether standardize or normalize is used (will be overwritten below)
normRespMul != null && normRespSub != null ? TransformType.STANDARDIZE : TransformType.NONE);
assert (normSub == null) == (normMul == null);
assert (normRespSub == null) == (normRespMul == null);
if(normSub != null) {
System.arraycopy(normSub, 0, _normSub, 0, normSub.length);
System.arraycopy(normMul, 0, _normMul, 0, normMul.length);
}
if(normRespSub != null) {
System.arraycopy(normRespSub, 0, _normRespSub, 0, normRespSub.length);
System.arraycopy(normRespMul, 0, _normRespMul, 0, normRespMul.length);
}
}
// Per categorical column, the level ids retained after filterExpandedColumns(); null when no
// filtering is in effect. FrameTask.map() looks levels up with Arrays.binarySearch, so each
// inner array is expected to be sorted ascending.
final int [][] _catLvls;
/**
 * Apply data transformation on the given column.
 *
 * Only numeric columns are transformed; indexes that fall into the categorical part of the
 * expanded vector are returned unchanged.
 *
 * @param c - index into fully expanded vector
 * @param v - value of the column to be transformed
 * @return v transformed by the transformation (e.g. standardization) defined by this dataset for this column
 */
public double applyTransform(int c, double v){
  // First expanded index that belongs to a numeric column.
  final int numStart = _catOffsets[_catOffsets.length-1];
  if (c >= numStart) {
    // The norm arrays are indexed by numeric column, so subtract the expanded width of the
    // categoricals (not the number of categorical columns, which differs once they are expanded).
    final int idx = c - numStart;
    if (_normSub != null) v -= _normSub[idx];
    if (_normMul != null) v *= _normMul[idx];
  }
  return v;
}
/**
 * Prepare a Frame with at most one response column for use by a FrameTask.
 * Wraps the response into a one-element array (or passes null when there is none) and
 * delegates to the multi-response overload.
 *
 * @param source A frame to be expanded and sanity checked
 * @param response response column (should be part of source), or null
 * @param ignored_cols indexes of columns to drop from the result, or null
 * @param toEnum Whether or not to turn a non-enum response into an enum
 * @param dropConstantCols Whether or not to drop constant columns
 * @param dropNACols Whether or not to apply the missing-value column filter
 * @return Frame to be used by FrameTask
 */
public static Frame prepareFrame(Frame source, Vec response, int[] ignored_cols, boolean toEnum, boolean dropConstantCols, boolean dropNACols) {
  final Vec[] responses;
  if (response == null) {
    responses = null;
  } else {
    responses = new Vec[]{response};
  }
  return prepareFrame(source, responses, ignored_cols, toEnum, dropConstantCols, dropNACols);
}
/**
 * Prepare a Frame with the given response columns for use by a FrameTask.
 * Works on a new Frame built from clones of the source's name and vec arrays:
 * drops the ignored columns, moves the responses to the end (optionally converting
 * them to enums) and optionally drops constant / NA-heavy columns.
 *
 * Side effect: ignored_cols is sorted in place if it is not already sorted.
 *
 * @param source frame to prepare
 * @param response response columns (should be part of source), or null
 * @param ignored_cols indexes of columns to drop, or null
 * @param toEnum whether to convert non-enum responses to enums
 * @param dropConstantCols whether to drop columns whose min equals their max
 * @param dropNACols whether to apply the missing-value filter
 * @return the prepared frame
 * @throws IllegalArgumentException if a response column is also listed in ignored_cols
 */
public static Frame prepareFrame(Frame source, Vec [] response, int[] ignored_cols, boolean toEnum, boolean dropConstantCols, boolean dropNACols) {
Frame fr = new Frame(Key.makeSystem(Key.make().toString()), source._names.clone(), source.vecs().clone());
// binarySearch below needs a sorted array; note this sorts the caller's array
if(ignored_cols != null && !Utils.isSorted(ignored_cols))
Arrays.sort(ignored_cols);
if(response != null && ignored_cols != null)
for(Vec v:response){
int id = source.find(v);
if(Arrays.binarySearch(ignored_cols,id) >= 0)
throw new IllegalArgumentException("Column can not be both ignored and used as a response.");
}
if (ignored_cols != null) fr.remove(ignored_cols);
final Vec[] vecs = fr.vecs();
// compute rollupstats in parallel
Futures fs = new Futures();
for (Vec v : vecs) v.rollupStats(fs);
fs.blockForPending();
// put response to the end (if not already)
// NOTE(review): `vecs` was captured before the responses are moved; whether its indexes still
// line up with fr._names afterwards depends on how Frame.remove/add treat the backing array - verify.
if (response != null) {
for(Vec v:response){
int id = fr.find(v);
final String n = fr._names[id];
if (toEnum && !vecs[id].isEnum()) fr.add(n, fr.remove(id).toEnum()); //convert int classes to enums
else fr.add(n, fr.remove(id));
}
}
ArrayList<Integer> constantOrNAs = new ArrayList<Integer>();
{
ArrayList<Integer> constantCols = new ArrayList<Integer>();
ArrayList<Integer> NACols = new ArrayList<Integer>();
// the last entry of vecs is not examined (presumably the response - confirm for 0 or >1 responses)
for(int i = 0; i < vecs.length-1; ++i) {
// remove constant cols and cols with too many NAs
final boolean dropconstant = dropConstantCols && vecs[i].min() == vecs[i].max();
// NOTE(review): with a factor of 1 this requires naCnt() > length(), which looks unsatisfiable,
// so dropNACols appears to have no effect here; the response-less overload uses 0.2 - confirm intent.
final boolean droptoomanyNAs = dropNACols && vecs[i].naCnt() > vecs[i].length()*1;
if(dropconstant) {
constantCols.add(i);
} else if (droptoomanyNAs) {
NACols.add(i);
}
}
constantOrNAs.addAll(constantCols);
constantOrNAs.addAll(NACols);
// Report what is dropped
String msg = "";
if (constantCols.size() > 0) msg += "Dropping constant column(s): ";
for (int i : constantCols) msg += fr._names[i] + " ";
if (NACols.size() > 0) msg += "Dropping column(s) with too many missing values: ";
for (int i : NACols) msg += fr._names[i] + " (" + String.format("%.2f", vecs[i].naCnt() * 100. / vecs[i].length()) + "%) ";
for (String s : msg.split("\n")) Log.info(s);
}
// drop the collected columns (constant ones first, then NA-heavy ones)
if(!constantOrNAs.isEmpty()){
int [] cols = new int[constantOrNAs.size()];
for(int i = 0; i < cols.length; ++i)
cols[i] = constantOrNAs.get(i);
fr.remove(cols);
}
return fr;
}
/**
 * Prepare a Frame without a response column for use by a FrameTask.
 * Works on a new Frame built from clones of the source's name and vec arrays: drops the
 * ignored columns, then optionally drops constant columns and columns with more than 20% NAs,
 * logging what was dropped.
 *
 * @param source frame to prepare
 * @param ignored_cols indexes of columns to drop, or null
 * @param dropConstantCols whether to drop columns whose min equals their max
 * @param dropNACols whether to drop columns with more than 20% missing values
 * @return the prepared frame
 */
public static Frame prepareFrame(Frame source, int[] ignored_cols, boolean dropConstantCols, boolean dropNACols) {
  final Frame fr = new Frame(Key.makeSystem(Key.make().toString()), source._names.clone(), source.vecs().clone());
  if (ignored_cols != null) fr.remove(ignored_cols);
  final Vec[] vecs = fr.vecs();

  // kick off the rollup statistics of all columns, then wait for them
  final Futures pending = new Futures();
  for (Vec v : vecs) v.rollupStats(pending);
  pending.blockForPending();

  // classify columns; a constant column is never also reported as NA-heavy
  final ArrayList<Integer> constantCols = new ArrayList<Integer>();
  final ArrayList<Integer> naCols = new ArrayList<Integer>();
  for (int col = 0; col < vecs.length; ++col) {
    final boolean isConstant = dropConstantCols && vecs[col].min() == vecs[col].max();
    final boolean tooManyNAs = dropNACols && vecs[col].naCnt() > vecs[col].length()*0.2;
    if (isConstant) {
      constantCols.add(col);
    } else if (tooManyNAs) {
      naCols.add(col);
    }
  }

  // report what is dropped
  final StringBuilder report = new StringBuilder();
  if (!constantCols.isEmpty()) report.append("Dropping constant column(s): ");
  for (int col : constantCols) report.append(fr._names[col]).append(" ");
  if (!naCols.isEmpty()) report.append("Dropping column(s) with too many missing values: ");
  for (int col : naCols) {
    final String pct = String.format("%.2f", vecs[col].naCnt() * 100. / vecs[col].length());
    report.append(fr._names[col]).append(" (").append(pct).append("%) ");
  }
  for (String line : report.toString().split("\n")) Log.info(line);

  // remove constant columns first, then NA-heavy ones, in a single call
  final int dropped = constantCols.size() + naCols.size();
  if (dropped > 0) {
    final int[] cols = new int[dropped];
    int at = 0;
    for (int col : constantCols) cols[at++] = col;
    for (int col : naCols) cols[at++] = col;
    fr.remove(cols);
  }
  return fr;
}
/**
 * Convenience overload of the single-response prepareFrame with the missing-value
 * column filter switched off.
 */
public static Frame prepareFrame(Frame source, Vec response, int[] ignored_cols, boolean toEnum, boolean dropConstantCols) {
  final boolean dropNACols = false;
  return prepareFrame(source, response, ignored_cols, toEnum, dropConstantCols, dropNACols);
}
/**
 * Builds a DataInfo that transforms the predictors as requested and leaves the
 * responses untransformed (TransformType.NONE).
 */
public DataInfo(Frame fr, int nResponses, boolean hasIntercept, boolean useAllFactors, TransformType predictor_transform) {
  this(fr,
       nResponses,
       hasIntercept,
       useAllFactors,
       predictor_transform,
       TransformType.NONE);
}
/**
 * Builds a DataInfo over an already laid-out frame with an explicit set of retained levels per
 * categorical column (used by filterExpandedColumns()).
 *
 * The frame is expected to hold catLevels.length categorical columns first, then the numeric
 * columns, then `responses` response columns. Normalization constants are computed here from
 * the frame's column statistics. _useAllFactorLevels is forced to false and _catMissing is
 * allocated but not filled in.
 *
 * @param fr frame in adapted layout
 * @param catLevels retained level ids for each categorical column
 * @param responses number of response columns at the end of the frame
 * @param hasIntercept stored in _hasIntercept
 * @param predictor_transform transform for the numeric predictors
 * @param response_transform transform for the responses
 * @param foldId stored in _foldId
 * @param nfolds stored in _nfolds
 */
private DataInfo(Frame fr, int[][] catLevels, int responses, boolean hasIntercept, TransformType predictor_transform, TransformType response_transform, int foldId, int nfolds){
_hasIntercept = hasIntercept;
_adaptedFrame = fr;
_catOffsets = MemoryManager.malloc4(catLevels.length+1);
_catMissing = new int[catLevels.length];
int s = 0;
// compute rollupstats in parallel
Futures fs = new Futures();
for (Vec v : fr.vecs()) v.rollupStats(fs);
fs.blockForPending();
// each categorical column occupies as many expanded slots as it has retained levels
for(int i = 0; i < catLevels.length; ++i){
_catOffsets[i] = s;
s += catLevels[i].length;
}
_catLvls = catLevels;
_catOffsets[_catOffsets.length-1] = s;
_responses = responses;
_cats = catLevels.length;
_nums = fr.numCols()-_cats - responses;
_predictor_transform = predictor_transform;
// predictor normalization; numeric column i lives at frame index catLevels.length+i
if(_nums > 0){
switch(_predictor_transform) {
case STANDARDIZE:
_normMul = MemoryManager.malloc8d(_nums);
_normSub = MemoryManager.malloc8d(_nums);
for (int i = 0; i < _nums; ++i) {
Vec v = fr.vec(catLevels.length+i);
_normMul[i] = (v.sigma() != 0)?1.0/v.sigma():1.0;
_normSub[i] = v.mean();
}
break;
case NORMALIZE:
_normMul = MemoryManager.malloc8d(_nums);
_normSub = MemoryManager.malloc8d(_nums);
for (int i = 0; i < _nums; ++i) {
Vec v = fr.vec(catLevels.length+i);
_normMul[i] = (v.max() - v.min() > 0)?1.0/(v.max() - v.min()):1.0;
_normSub[i] = v.mean();
}
break;
case DEMEAN:
_normMul = null;
_normSub = MemoryManager.malloc8d(_nums);
for (int i = 0; i < _nums; ++i) {
Vec v = fr.vec(catLevels.length+i);
_normSub[i] = v.mean();
}
break;
case DESCALE:
_normSub = null;
_normMul = MemoryManager.malloc8d(_nums);;
for (int i = 0; i < _nums; ++i) {
Vec v = fr.vec(catLevels.length+i);
_normMul[i] = (v.sigma() != 0)?1.0/v.sigma():1.0;
}
break;
case NONE:
_normMul = null;
_normSub = null;
break;
default:
throw H2O.unimpl();
}
}
_response_transform = response_transform;
// response normalization; response i lives at frame index numCols()-responses+i
if(responses > 0){
switch(_response_transform) {
case STANDARDIZE:
_normRespMul = MemoryManager.malloc8d(responses);
_normRespSub = MemoryManager.malloc8d(responses);
for (int i = 0; i < responses; ++i) {
Vec v = fr.vec(fr.numCols()-responses+i);
_normRespMul[i] = (v.sigma() != 0)?1.0/v.sigma():1.0;
_normRespSub[i] = v.mean();
}
break;
case NORMALIZE:
_normRespMul = MemoryManager.malloc8d(responses);
_normRespSub = MemoryManager.malloc8d(responses);
for (int i = 0; i < responses; ++i) {
Vec v = fr.vec(fr.numCols()-responses+i);
_normRespMul[i] = (v.max() - v.min() > 0)?1.0/(v.max() - v.min()):1.0;
_normRespSub[i] = v.mean();
}
break;
case DEMEAN:
_normRespMul = null;
_normRespSub = MemoryManager.malloc8d(responses);
for (int i = 0; i < responses; ++i) {
Vec v = fr.vec(fr.numCols()-responses+i);
_normRespSub[i] = v.mean();
}
break;
case DESCALE:
_normRespMul = MemoryManager.malloc8d(responses);
_normRespSub = null;
for (int i = 0; i < responses; ++i) {
Vec v = fr.vec(fr.numCols()-responses+i);
_normRespMul[i] = (v.sigma() != 0)?1.0/v.sigma():1.0;
}
break;
case NONE:
_normRespMul = null;
_normRespSub = null;
break;
default:
throw H2O.unimpl();
}
}
// with an explicit level list, level 0 is never implicitly included
_useAllFactorLevels = false;
_adaptedFrame.reloadVecs();
_nfolds = nfolds;
_foldId = foldId;
}
/**
 * Builds a DataInfo from a frame whose last nResponses columns are the responses.
 *
 * The predictor columns are split into categorical (enum) and numeric ones; the categoricals
 * are ordered by decreasing domain size and placed first, followed by the numerics and the
 * responses, and the result is stored as a new Frame in _adaptedFrame. Expanded offsets of the
 * categoricals and the normalization constants of numerics and responses are computed from
 * the column statistics.
 *
 * @param fr source frame (responses last)
 * @param nResponses number of response columns
 * @param hasIntercept stored in _hasIntercept
 * @param useAllFactorLevels if false, each categorical gets one expanded slot fewer than it has levels
 * @param predictor_transform transform for the numeric predictors
 * @param response_transform transform for the responses
 * @throws IllegalArgumentException if the frame has no predictor column
 */
public DataInfo(Frame fr, int nResponses, boolean hasIntercept, boolean useAllFactorLevels, TransformType predictor_transform, TransformType response_transform) {
_nfolds = _foldId = 0;
_predictor_transform = predictor_transform;
_response_transform = response_transform;
_responses = nResponses;
_useAllFactorLevels = useAllFactorLevels;
_catLvls = null;
_hasIntercept = hasIntercept;
final Vec [] vecs = fr.vecs();
// compute rollupstats in parallel
Futures fs = new Futures();
for (Vec v : vecs) v.rollupStats(fs);
fs.blockForPending();
// n = number of predictor columns
final int n = vecs.length-_responses;
if (n < 1) throw new IllegalArgumentException("Training data must have at least one column.");
// split predictor column indexes into categorical and numeric ones
int [] nums = MemoryManager.malloc4(n);
int [] cats = MemoryManager.malloc4(n);
int nnums = 0, ncats = 0;
for(int i = 0; i < n; ++i){
if(vecs[i].isEnum())
cats[ncats++] = i;
else
nums[nnums++] = i;
}
_nums = nnums;
_cats = ncats;
// sort the cats in the decreasing order according to their size
for(int i = 0; i < ncats; ++i)
for(int j = i+1; j < ncats; ++j)
if(vecs[cats[i]].domain().length < vecs[cats[j]].domain().length){
int x = cats[i];
cats[i] = cats[j];
cats[j] = x;
}
// vecs2/names hold the re-ordered layout: cats, nums, responses
Vec [] vecs2 = vecs.clone();
String [] names = fr._names.clone();
_catOffsets = MemoryManager.malloc4(ncats+1);
_catMissing = new int[ncats];
int len = _catOffsets[0] = 0;
for(int i = 0; i < ncats; ++i){
Vec v = (vecs2[i] = vecs[cats[i]]);
names[i] = fr._names[cats[i]];
_catMissing[i] = v.naCnt() > 0 ? 1 : 0; //needed for test time
_catOffsets[i+1] = (len += v.domain().length - (useAllFactorLevels?0:1) + (v.naCnt()>0?1:0)); //missing values turn into a new factor level
}
// allocate the predictor normalization arrays; the values are filled in by the loop below
switch(predictor_transform) {
case STANDARDIZE:
case NORMALIZE:
_normSub = MemoryManager.malloc8d(nnums);
_normMul = MemoryManager.malloc8d(nnums); Arrays.fill(_normMul, 1);
break;
case DEMEAN:
_normSub = MemoryManager.malloc8d(nnums);
_normMul = null;
break;
case DESCALE:
_normSub = null;
_normMul = MemoryManager.malloc8d(nnums);
break;
case NONE:
_normSub = _normMul = null;
break;
default:
break;
}
// place the numeric columns after the categoricals and compute their constants
for(int i = 0; i < nnums; ++i){
Vec v = (vecs2[i+ncats] = vecs[nums[i]]);
names[i+ncats] = fr._names[nums[i]];
switch(predictor_transform){
case STANDARDIZE:
_normSub[i] = v.mean();
_normMul[i] = v.sigma() != 0 ? 1.0/v.sigma() : 1.0;
break;
case NORMALIZE:
_normSub[i] = v.mean();
_normMul[i] = (v.max() - v.min() > 0)?1.0/(v.max() - v.min()):1.0;
break;
case DEMEAN:
_normSub[i] = v.mean();
break;
case DESCALE:
_normMul[i] = (v.sigma() != 0)?1.0/v.sigma():1.0;
break;
case NONE:
break;
default:
break;
}
}
// responses keep their positions at the end of the frame
if (_responses > 0) {
switch(response_transform){
case STANDARDIZE:
case NORMALIZE:
_normRespSub = MemoryManager.malloc8d(_responses);
_normRespMul = MemoryManager.malloc8d(_responses); Arrays.fill(_normRespMul, 1);
break;
case DEMEAN:
_normRespSub = MemoryManager.malloc8d(_responses);
_normRespMul = null;
break;
case DESCALE:
_normRespSub = null;
_normRespMul = MemoryManager.malloc8d(_responses);
break;
case NONE:
_normRespSub = _normRespMul = null;
break;
default:
throw H2O.unimpl();
}
for(int i = 0; i < _responses; ++i){
Vec v = (vecs2[nnums+ncats+i] = vecs[nnums+ncats+i]);
switch(response_transform){
case STANDARDIZE:
_normRespSub[i] = v.mean();
_normRespMul[i] = v.sigma() != 0 ? 1.0/v.sigma() : 1.0;
break;
case NORMALIZE:
_normRespSub[i] = v.mean();
_normRespMul[i] = (v.max() - v.min() > 0)?1.0/(v.max() - v.min()):1.0;
break;
case DEMEAN:
_normRespSub[i] = v.mean();
break;
case DESCALE:
_normRespMul[i] = v.sigma() != 0 ? 1.0/v.sigma() : 1.0;
break;
case NONE:
break;
default:
throw H2O.unimpl();
}
}
}
_adaptedFrame = new Frame(names,vecs2);
_adaptedFrame.reloadVecs();
}
/**
 * Creates a DataInfo restricted to the given columns of the expanded vector.
 *
 * Categorical columns keep only the levels listed in cols; a categorical column with no listed
 * level, and every numeric column not listed, is removed from a copy of the adapted frame.
 * The result is built with the private level-filtering constructor, which recomputes the
 * normalization constants from the remaining columns.
 *
 * @param cols indexes into the expanded vector to keep, or null to keep everything
 *             (the loops below assume ascending order - TODO confirm with callers)
 * @return this if cols is null, otherwise a new filtered DataInfo
 */
public DataInfo filterExpandedColumns(int [] cols){
if(cols == null)return this;
// i walks cols, j walks the original columns, ignoredCnt counts frame columns to remove
int i = 0, j = 0, ignoredCnt = 0;
// catLvls[c] stays null for a categorical column that has no retained level
int [][] catLvls = new int[_cats][];
int [] ignoredCols = MemoryManager.malloc4(_nums + _cats);
// first do categoricals...
if(_catOffsets != null)
while(i < cols.length && cols[i] < _catOffsets[_catOffsets.length-1]){
int [] levels = MemoryManager.malloc4(_catOffsets[j+1] - _catOffsets[j]);
int k = 0;
// collect the entries of cols that fall into column j, relative to the column's first slot
while(i < cols.length && cols[i] < _catOffsets[j+1])
levels[k++] = cols[i++]-_catOffsets[j];
if(k > 0)
catLvls[j] = Arrays.copyOf(levels, k);
++j;
}
// categorical columns without any retained level are dropped entirely
for(int k =0; k < catLvls.length; ++k)
if(catLvls[k] == null)ignoredCols[ignoredCnt++] = k;
if(ignoredCnt > 0){
// compact catLvls so it only describes the surviving categorical columns
int [][] c = new int[_cats-ignoredCnt][];
int y = 0;
for (int[] catLvl : catLvls) if (catLvl != null) c[y++] = catLvl;
assert y == c.length;
catLvls = c;
}
// now numerics
int prev = j = 0;
for(; i < cols.length; ++i){
// every numeric column between the previously kept one and cols[i] is ignored
for(int k = prev; k < (cols[i]-numStart()); ++k ){
ignoredCols[ignoredCnt++] = k+_cats;
++j;
}
prev = ++j;
}
// numeric columns after the last kept one are ignored as well
for(int k = prev; k < _nums; ++k)
ignoredCols[ignoredCnt++] = k+_cats;
Frame f = new Frame(_adaptedFrame.names().clone(),_adaptedFrame.vecs().clone());
if(ignoredCnt > 0) f.remove(Arrays.copyOf(ignoredCols,ignoredCnt));
assert catLvls.length < f.numCols():"cats = " + catLvls.length + " numcols = " + f.numCols();
return new DataInfo(f,catLvls, _responses, _hasIntercept, _predictor_transform, _response_transform, _foldId, _nfolds);
}
/** @return always the empty string. */
@Override
public String toString() {
  return "";
}
/**
 * Creates a DataInfo for one cross-validation fold that shares this object's frame and arrays.
 *
 * @param foldId id of the fold
 * @param nfolds total number of folds
 * @return the fold-specific DataInfo
 */
public DataInfo getFold(int foldId, int nfolds) {
  final DataInfo fold = new DataInfo(this, foldId, nfolds);
  return fold;
}
/** @return width of the fully expanded vector: all expanded categorical slots plus the numeric columns. */
public final int fullN() {
  final int expandedCats = _catOffsets[_cats];
  return expandedCats + _nums;
}

/** @return expanded width of the first categorical column, or 0 if there are no categoricals. */
public final int largestCat() {
  if (_cats > 0) {
    return _catOffsets[1];
  }
  return 0;
}

/** @return index of the first numeric column in the expanded vector. */
public final int numStart() {
  return _catOffsets[_cats];
}
/**
 * Builds the names of the entries of the expanded vector: "column.level" for every categorical
 * level (level 0 is omitted unless all factor levels are used), "column.missing(NA)" for
 * categorical columns containing NAs, followed by the names of the numeric columns.
 *
 * @return array of fullN() names
 */
public final String [] coefNames() {
  final int total = fullN();
  final String[] coefs = new String[total];
  final Vec[] cols = _adaptedFrame.vecs();
  final int firstLevel = _useAllFactorLevels ? 0 : 1;
  int next = 0;
  for (int c = 0; c < _cats; ++c) {
    for (int lvl = firstLevel; lvl < cols[c]._domain.length; ++lvl) {
      coefs[next++] = _adaptedFrame._names[c] + "." + cols[c]._domain[lvl];
    }
    if (cols[c].naCnt() > 0) {
      coefs[next++] = _adaptedFrame._names[c] + ".missing(NA)";
    }
  }
  // whatever is left belongs to the numeric columns, which follow the categoricals in the frame
  final int remaining = total - next;
  System.arraycopy(_adaptedFrame._names, _cats, coefs, next, remaining);
  return coefs;
}
/**
 * Normalize horizontalized categoricals to become probabilities per factor level.
 * This is done with the SoftMax function, applied separately to the block of expanded
 * slots of each categorical column (including the extra "missing" slot of columns with NAs).
 * Entries belonging to numeric columns are not touched.
 *
 * @param in input values
 * @param out output values (can be the same as input)
 * @throws UnsupportedOperationException if not all factor levels are used
 */
public final void softMaxCategoricals(float[] in, float[] out) {
  if (_cats == 0) return;
  if (!_useAllFactorLevels) throw new UnsupportedOperationException("All factor levels must be present for re-scaling with SoftMax.");
  assert (in.length == out.length);
  assert (in.length == fullN());
  int k = 0;
  for (int i = 0; i < _cats; ++i) {
    // Width of this column's block in the expanded vector. Taken from the offsets rather than
    // the domain size, because columns with NAs own one additional slot.
    final int factors = _catOffsets[i+1] - _catOffsets[i];
    // subtract the block maximum before exponentiating to avoid overflow
    final float max = Utils.maxValue(in, k, k + factors);
    float scale = 0;
    for (int j = 0; j < factors; ++j) {
      out[k + j] = (float) Math.exp(in[k + j] - max);
      scale += out[k + j];
    }
    for (int j = 0; j < factors; ++j)
      out[k + j] /= scale;
    k += factors;
  }
  assert(k == numStart());
}
/**
 * Undo the standardization/normalization of numerical columns.
 * Only the numeric part of the expanded vector is written; a missing multiplier or offset
 * array (transforms DEMEAN, DESCALE, NONE) means that step was never applied and is skipped.
 *
 * @param in input values
 * @param out output values (can be the same as input)
 */
public final void unScaleNumericals(float[] in, float[] out) {
  if (_nums == 0) return;
  assert (in.length == out.length);
  assert (in.length == fullN());
  final int first = numStart();
  final int end = fullN();
  for (int k = first; k < end; ++k) {
    float v = in[k];
    // inverse of "subtract offset, then multiply": divide first, then add the offset back
    if (_normMul != null) v /= (float)_normMul[k-first];
    if (_normSub != null) v += (float)_normSub[k-first];
    out[k] = v;
  }
}
}
/**
 * Forks this task over the given frame, which must be the adapted frame of this task's DataInfo
 * (checked by assertion only).
 */
@Override
public T dfork(Frame fr) {
  assert _dinfo._adaptedFrame == fr;
  return super.dfork(fr);
}
/**
 * Hook called by map() once per chunk before any row is processed.
 * The default implementation does nothing.
 */
protected void chunkInit() {
}
/**
 * Hook called by map() once per chunk after all rows were processed.
 * The default implementation does nothing.
 *
 * @param n Number of processed rows, as counted by map()
 */
protected void chunkDone(long n) {
}
/**
 * Walks the rows of one chunk group and hands each accepted row to processRow().
 * For every row it extracts the values, applies the predictor/response transforms to numerics
 * and responses, and turns categorical values into indexes into the expanded vector.
 *
 * Rows are skipped when they belong to the held-out fold, when they lose the random
 * sub-sampling draw, when skipMissing() is true and any column is NA, or when a
 * (transformed) response is NaN.
 *
 * @param chunks input chunks in the layout described by _dinfo (cats, nums, responses)
 * @param outputs output chunks; when non-null and non-empty the processRow overload taking
 *                outputs is called
 * @throws JobCancelledException if a job key is set and that job is no longer running
 */
@Override public final void map(Chunk [] chunks, NewChunk [] outputs){
if(_jobKey != null && !Job.isRunning(_jobKey))throw new JobCancelledException();
final int nrows = chunks[0]._len;
final long offset = chunks[0]._start;
chunkInit();
// per-row scratch buffers, reused for every row of this chunk
double [] nums = MemoryManager.malloc8d(_dinfo._nums);
int [] cats = MemoryManager.malloc4(_dinfo._cats);
double [] response = _dinfo._responses == 0 ? null : MemoryManager.malloc8d(_dinfo._responses);
int start = 0;
int end = nrows;
Random skip_rng = null; //random generator for skipping rows
//Example:
// _useFraction = 0.8 -> 1 repeat with fraction = 0.8
// _useFraction = 1.0 -> 1 repeat with fraction = 1.0
// _useFraction = 1.1 -> 2 repeats with fraction = 0.55
// _useFraction = 2.1 -> 3 repeats with fraction = 0.7
// _useFraction = 3.0 -> 3 repeats with fraction = 1.0
final int repeats = (int)Math.ceil(_useFraction);
final float fraction = _useFraction / repeats;
// the generator is seeded from a fresh java.util.Random
if (fraction < 1.0) skip_rng = water.util.Utils.getDeterRNG(new Random().nextLong());
// optional shuffled order of the row indexes of this chunk
long[] shuf_map = null;
if (_shuffle) {
shuf_map = new long[end-start];
for (int i=0;i<shuf_map.length;++i)
shuf_map[i] = start + i;
Utils.shuffleArray(shuf_map, new Random().nextLong());
}
long num_processed_rows = 0;
for(int rrr = 0; rrr < repeats; ++rrr) {
OUTER:
for(int rr = start; rr < end; ++rr){
// r = row index within the chunk, lr = global row index
final int r = shuf_map != null ? (int)shuf_map[rr-start] : rr;
final long lr = r + chunks[0]._start;
// skip rows of the held-out fold and rows that lose the sub-sampling draw
if ((_dinfo._nfolds > 0 && (lr % _dinfo._nfolds) == _dinfo._foldId)
|| (skip_rng != null && skip_rng.nextFloat() > fraction))continue;
++num_processed_rows; //count rows with missing values even if they are skipped
for(Chunk c:chunks)if(skipMissing() && c.isNA0(r))continue OUTER; // skip rows with NAs!
int i = 0, ncats = 0;
// categoricals: translate each level into its index in the expanded vector
for(; i < _dinfo._cats; ++i){
int c;
if (chunks[i].isNA0(r)) {
cats[ncats++] = (_dinfo._catOffsets[i+1]-1); //missing value turns into extra (last) factor
} else {
c = (int) chunks[i].at80(r);
if (_dinfo._catLvls != null) { // some levels are ignored?
c = Arrays.binarySearch(_dinfo._catLvls[i], c);
if (c >= 0)
cats[ncats++] = c + _dinfo._catOffsets[i];
} else if (_dinfo._useAllFactorLevels)
cats[ncats++] = c + _dinfo._catOffsets[i];
else if (c != 0)
// level 0 has no slot, so the remaining levels shift down by one
cats[ncats++] = c + _dinfo._catOffsets[i] - 1;
}
}
// numerics: i continues from the first numeric chunk
final int n = chunks.length- _dinfo._responses;
for(;i < n;++i){
double d = chunks[i].at0(r); //can be NA if skipMissing() == false
if(_dinfo._normSub != null) d -= _dinfo._normSub[i- _dinfo._cats];
if(_dinfo._normMul != null) d *= _dinfo._normMul[i- _dinfo._cats];
nums[i- _dinfo._cats] = d;
}
// responses are the last _responses chunks
for(i = 0; i < _dinfo._responses; ++i) {
response[i] = chunks[chunks.length- _dinfo._responses + i].at0(r);
if (_dinfo._normRespSub != null) response[i] -= _dinfo._normRespSub[i];
if (_dinfo._normRespMul != null) response[i] *= _dinfo._normRespMul[i];
if(Double.isNaN(response[i]))continue OUTER; // skip rows without a valid response (no supervised training possible)
}
// id handed to processRow: chunk start + pass number * chunk length + row index
long seed = offset + rrr*(end-start) + r;
if (outputs != null && outputs.length > 0)
processRow(seed, nums, ncats, cats, response, outputs);
else
processRow(seed, nums, ncats, cats, response);
}
}
chunkDone(num_processed_rows);
}
}