package hex;

import water.*;
import water.H2O.H2OCountedCompleter;
import water.Job.JobCancelledException;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.NewChunk;
import water.fvec.Vec;
import water.util.Log;
import water.util.Utils;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Random;

public abstract class FrameTask<T extends FrameTask<T>> extends MRTask2<T> {
  public final DataInfo _dinfo;
  protected final Key _jobKey;
  // double _ymu = Double.NaN; // mean of the response
  // size of the expanded vector of parameters
  protected float _useFraction = 1.0f;
  protected boolean _shuffle = false;

  protected boolean skipMissing() { return true; }

  public FrameTask(Key jobKey, DataInfo dinfo) {
    this(jobKey, dinfo, null);
  }
  public FrameTask(Key jobKey, DataInfo dinfo, H2OCountedCompleter cmp) {
    super(cmp);
    _jobKey = jobKey;
    _dinfo = dinfo;
  }
  protected FrameTask(FrameTask ft) {
    _dinfo = ft._dinfo;
    _jobKey = ft._jobKey;
    _useFraction = ft._useFraction;
    _shuffle = ft._shuffle;
  }

  public final double[] normMul()     { return _dinfo._normMul; }
  public final double[] normSub()     { return _dinfo._normSub; }
  public final double[] normRespMul() { return _dinfo._normRespMul; }
  public final double[] normRespSub() { return _dinfo._normRespSub; }

  /**
   * Method to process one row of the data for GLM functions.
   * Numeric and categorical values are passed separately, as is the response.
   * Categoricals are passed as absolute indexes into the expanded beta vector; 0-levels are skipped
   * (so the number of passed categoricals will not be the same for every row).
   *
   * Categorical expansion/indexing:
   * Categoricals are placed at the beginning of the beta vector.
   * Each cat variable with n levels is expanded into n-1 independent binary variables.
   * Indexes in cats[] will point to the appropriate coefficient in the beta vector. For example,
   * assume we have 2 categorical columns, both with values A,B,C; then the following rows will have the following indexes:
   *   A,A - ncats = 0, we do not pass any categorical here
   *   A,B - ncats = 1, indexes = [2]
   *   B,B - ncats = 2, indexes = [0,2]
   * and so on.
   *
   * @param gid - global id of this row, in [0,_adaptedFrame.numRows())
   * @param nums - numeric values of this row
   * @param ncats - number of passed (non-zero) categoricals
   * @param cats - indexes of categoricals into the expanded beta-vector.
   * @param response - numeric value for the response
   */
  protected void processRow(long gid, double[] nums, int ncats, int[] cats, double[] response) {
    throw new RuntimeException("should've been overridden!");
  }
  protected void processRow(long gid, double[] nums, int ncats, int[] cats, double[] response, NewChunk[] outputs) {
    throw new RuntimeException("should've been overridden!");
  }
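
  // Worked indexing sketch for the javadoc above, assuming _useAllFactorLevels == false: with
  // the two A,B,C columns, each column expands to 2 coefficients (levels B and C; the 0-level A
  // is skipped), so the categorical offsets are {0, 2}. For the row "B,C": column 0 has level
  // index 1 -> 1 + 0 - 1 = 0, column 1 has level index 2 -> 2 + 2 - 1 = 3, giving ncats == 2
  // and cats == [0, 3].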

  public static class DataInfo extends Iced {
    public Frame _adaptedFrame;
    public int _responses; // number of responses

    public enum TransformType { NONE, STANDARDIZE, NORMALIZE, DEMEAN, DESCALE }

    public TransformType _predictor_transform;
    public TransformType _response_transform;
    public boolean _useAllFactorLevels;
    public int _nums;
    public int _cats;
    public int[] _catOffsets;
    public int[] _catMissing;
    public double[] _normMul;
    public double[] _normSub;
    public double[] _normRespMul;
    public double[] _normRespSub;
    public int _foldId;
    public int _nfolds;
    public Key _frameKey;
    public boolean _hasIntercept;

    public DataInfo deep_clone() {
      AutoBuffer ab = new AutoBuffer();
      this.write(ab);
      ab.flipForReading();
      return new DataInfo().read(ab);
    }

    private DataInfo() { _catLvls = null; _hasIntercept = true; }

    private DataInfo(DataInfo dinfo, int foldId, int nfolds) {
      assert dinfo._catLvls == null : "Should not be called with filtered levels (assuming the selected levels may change with fold id)";
      _predictor_transform = dinfo._predictor_transform;
      _response_transform = dinfo._response_transform;
      _responses = dinfo._responses;
      _nums = dinfo._nums;
      _cats = dinfo._cats;
      _adaptedFrame = dinfo._adaptedFrame;
      _catOffsets = dinfo._catOffsets;
      _catMissing = dinfo._catMissing;
      _normMul = dinfo._normMul;
      _normSub = dinfo._normSub;
      _normRespMul = dinfo._normRespMul;
      _normRespSub = dinfo._normRespSub;
      _foldId = foldId;
      _nfolds = nfolds;
      _useAllFactorLevels = dinfo._useAllFactorLevels;
      _catLvls = null;
      _hasIntercept = dinfo._hasIntercept;
    }

    public DataInfo(Frame fr, int hasResponses, boolean hasIntercept, boolean useAllFactorLvls,
                    double[] normSub, double[] normMul, TransformType predictor_transform,
                    double[] normRespSub, double[] normRespMul) {
      this(fr, hasResponses, hasIntercept, useAllFactorLvls,
          normMul != null && normSub != null ? predictor_transform : TransformType.NONE,
          // just allocate; it doesn't matter whether standardize or normalize is used (will be overwritten below)
          normRespMul != null && normRespSub != null ? TransformType.STANDARDIZE : TransformType.NONE);
      assert (normSub == null) == (normMul == null);
      assert (normRespSub == null) == (normRespMul == null);
      if (normSub != null) {
        System.arraycopy(normSub, 0, _normSub, 0, normSub.length);
        System.arraycopy(normMul, 0, _normMul, 0, normMul.length);
      }
      if (normRespSub != null) {
        System.arraycopy(normRespSub, 0, _normRespSub, 0, normRespSub.length);
        System.arraycopy(normRespMul, 0, _normRespMul, 0, normRespMul.length);
      }
    }

    final int[][] _catLvls;

    /**
     * Apply the data transformation on the given column.
     *
     * @param c - index into the fully expanded vector
     * @param v - value of the column to be transformed
     * @return v transformed by the transformation (e.g. standardization) defined by this dataset for this column
     */
    public double applyTransform(int c, double v) {
      if (c >= _catOffsets[_catOffsets.length - 1]) { // only numeric columns are transformed
        c -= _catOffsets[_catOffsets.length - 1];     // re-base the expanded index into the numeric (sub, mul) arrays
        if (_normSub != null) v -= _normSub[c];
        if (_normMul != null) v *= _normMul[c];
      }
      return v;
    }
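
    // For instance, with STANDARDIZE and a first numeric column of mean 5.0 and sigma 2.0,
    // applyTransform(numStart(), 9.0) returns (9.0 - 5.0) * (1/2.0) = 2.0, while an index that
    // falls inside the categorical section is returned unchanged.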

    /**
     * Prepare a Frame (with a single response) to be processed by the FrameTask:
     *   1) Place the response at the end.
     *   2) (Optionally) Remove columns with constant values or with more than 20% NaNs.
     *   3) Possibly turn integer categoricals into enums.
     *
     * @param source A frame to be expanded and sanity checked
     * @param response The response column (should be part of source)
     * @param toEnum Whether or not to turn categoricals into enums
     * @param dropConstantCols Whether or not to drop constant columns
     * @return Frame to be used by FrameTask
     */
    public static Frame prepareFrame(Frame source, Vec response, int[] ignored_cols, boolean toEnum, boolean dropConstantCols, boolean dropNACols) {
      return prepareFrame(source, response != null ? new Vec[]{response} : null, ignored_cols, toEnum, dropConstantCols, dropNACols);
    }

    public static Frame prepareFrame(Frame source, Vec[] response, int[] ignored_cols, boolean toEnum, boolean dropConstantCols, boolean dropNACols) {
      Frame fr = new Frame(Key.makeSystem(Key.make().toString()), source._names.clone(), source.vecs().clone());
      if (ignored_cols != null && !Utils.isSorted(ignored_cols)) Arrays.sort(ignored_cols);
      if (response != null && ignored_cols != null)
        for (Vec v : response) {
          int id = source.find(v);
          if (Arrays.binarySearch(ignored_cols, id) >= 0)
            throw new IllegalArgumentException("Column can not be both ignored and used as a response.");
        }
      if (ignored_cols != null) fr.remove(ignored_cols);
      final Vec[] vecs = fr.vecs();
      // compute rollup stats in parallel
      Futures fs = new Futures();
      for (Vec v : vecs) v.rollupStats(fs);
      fs.blockForPending();
      // put the response at the end (if not already)
      if (response != null) {
        for (Vec v : response) {
          int id = fr.find(v);
          final String n = fr._names[id];
          if (toEnum && !vecs[id].isEnum()) fr.add(n, fr.remove(id).toEnum()); // convert int classes to enums
          else fr.add(n, fr.remove(id));
        }
      }
      ArrayList<Integer> constantOrNAs = new ArrayList<Integer>();
      {
        ArrayList<Integer> constantCols = new ArrayList<Integer>();
        ArrayList<Integer> NACols = new ArrayList<Integer>();
        for (int i = 0; i < vecs.length - 1; ++i) { // drop constant cols and cols with too many NAs
          final boolean dropconstant = dropConstantCols && vecs[i].min() == vecs[i].max();
          final boolean droptoomanyNAs = dropNACols && vecs[i].naCnt() > vecs[i].length() * 0.2;
          if (dropconstant) {
            constantCols.add(i);
          } else if (droptoomanyNAs) {
            NACols.add(i);
          }
        }
        constantOrNAs.addAll(constantCols);
        constantOrNAs.addAll(NACols);
        // report what is dropped
        String msg = "";
        if (constantCols.size() > 0) msg += "Dropping constant column(s): ";
        for (int i : constantCols) msg += fr._names[i] + " ";
        if (NACols.size() > 0) msg += "\nDropping column(s) with too many missing values: ";
        for (int i : NACols) msg += fr._names[i] + " (" + String.format("%.2f", vecs[i].naCnt() * 100. / vecs[i].length()) + "%) ";
        for (String s : msg.split("\n")) Log.info(s);
      }
      if (!constantOrNAs.isEmpty()) {
        int[] cols = new int[constantOrNAs.size()];
        for (int i = 0; i < cols.length; ++i) cols[i] = constantOrNAs.get(i);
        fr.remove(cols);
      }
      return fr;
    }
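
    // Minimal usage sketch (hypothetical frame and column names; not part of the original file):
    //   Frame source = ...;                  // parsed training data
    //   Vec response = source.vec("label");  // response column inside source
    //   Frame fr = DataInfo.prepareFrame(source, response, null /*ignored_cols*/,
    //                                    true /*toEnum*/, true /*dropConstantCols*/, true /*dropNACols*/);
    //   // fr now has the response as its last column, with constant and NA-heavy predictors dropped.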

    public static Frame prepareFrame(Frame source, int[] ignored_cols, boolean dropConstantCols, boolean dropNACols) {
      Frame fr = new Frame(Key.makeSystem(Key.make().toString()), source._names.clone(), source.vecs().clone());
      if (ignored_cols != null) fr.remove(ignored_cols);
      final Vec[] vecs = fr.vecs();
      // compute rollup stats in parallel
      Futures fs = new Futures();
      for (Vec v : vecs) v.rollupStats(fs);
      fs.blockForPending();
      ArrayList<Integer> constantOrNAs = new ArrayList<Integer>();
      {
        ArrayList<Integer> constantCols = new ArrayList<Integer>();
        ArrayList<Integer> NACols = new ArrayList<Integer>();
        for (int i = 0; i < vecs.length; ++i) { // drop constant cols and cols with too many NAs
          final boolean dropconstant = dropConstantCols && vecs[i].min() == vecs[i].max();
          final boolean droptoomanyNAs = dropNACols && vecs[i].naCnt() > vecs[i].length() * 0.2;
          if (dropconstant) {
            constantCols.add(i);
          } else if (droptoomanyNAs) {
            NACols.add(i);
          }
        }
        constantOrNAs.addAll(constantCols);
        constantOrNAs.addAll(NACols);
        // report what is dropped
        String msg = "";
        if (constantCols.size() > 0) msg += "Dropping constant column(s): ";
        for (int i : constantCols) msg += fr._names[i] + " ";
        if (NACols.size() > 0) msg += "\nDropping column(s) with too many missing values: ";
        for (int i : NACols) msg += fr._names[i] + " (" + String.format("%.2f", vecs[i].naCnt() * 100. / vecs[i].length()) + "%) ";
        for (String s : msg.split("\n")) Log.info(s);
      }
      if (!constantOrNAs.isEmpty()) {
        int[] cols = new int[constantOrNAs.size()];
        for (int i = 0; i < cols.length; ++i) cols[i] = constantOrNAs.get(i);
        fr.remove(cols);
      }
      return fr;
    }

    public static Frame prepareFrame(Frame source, Vec response, int[] ignored_cols, boolean toEnum, boolean dropConstantCols) {
      return prepareFrame(source, response, ignored_cols, toEnum, dropConstantCols, false);
    }

    public DataInfo(Frame fr, int nResponses, boolean hasIntercept, boolean useAllFactors, TransformType predictor_transform) {
      this(fr, nResponses, hasIntercept, useAllFactors, predictor_transform, TransformType.NONE);
    }

    // new DataInfo(f, catLvls, _responses, _standardize, _response_transform);
    private DataInfo(Frame fr, int[][] catLevels, int responses, boolean hasIntercept,
                     TransformType predictor_transform, TransformType response_transform,
                     int foldId, int nfolds) {
      _hasIntercept = hasIntercept;
      _adaptedFrame = fr;
      _catOffsets = MemoryManager.malloc4(catLevels.length + 1);
      _catMissing = new int[catLevels.length];
      int s = 0;
      // compute rollup stats in parallel
      Futures fs = new Futures();
      for (Vec v : fr.vecs()) v.rollupStats(fs);
      fs.blockForPending();
      for (int i = 0; i < catLevels.length; ++i) {
        _catOffsets[i] = s;
        s += catLevels[i].length;
      }
      _catLvls = catLevels;
      _catOffsets[_catOffsets.length - 1] = s;
      _responses = responses;
      _cats = catLevels.length;
      _nums = fr.numCols() - _cats - responses;
      _predictor_transform = predictor_transform;
      if (_nums > 0) {
        switch (_predictor_transform) {
          case STANDARDIZE:
            _normMul = MemoryManager.malloc8d(_nums);
            _normSub = MemoryManager.malloc8d(_nums);
            for (int i = 0; i < _nums; ++i) {
              Vec v = fr.vec(catLevels.length + i);
              _normMul[i] = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0;
              _normSub[i] = v.mean();
            }
            break;
          case NORMALIZE:
            _normMul = MemoryManager.malloc8d(_nums);
            _normSub = MemoryManager.malloc8d(_nums);
            for (int i = 0; i < _nums; ++i) {
              Vec v = fr.vec(catLevels.length + i);
              _normMul[i] = (v.max() - v.min() > 0) ? 1.0 / (v.max() - v.min()) : 1.0;
              _normSub[i] = v.mean();
            }
            break;
          case DEMEAN:
            _normMul = null;
            _normSub = MemoryManager.malloc8d(_nums);
            for (int i = 0; i < _nums; ++i) {
              Vec v = fr.vec(catLevels.length + i);
              _normSub[i] = v.mean();
            }
            break;
          case DESCALE:
            _normSub = null;
            _normMul = MemoryManager.malloc8d(_nums);
            for (int i = 0; i < _nums; ++i) {
              Vec v = fr.vec(catLevels.length + i);
              _normMul[i] = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0;
            }
            break;
          case NONE:
            _normMul = null;
            _normSub = null;
            break;
          default:
            throw H2O.unimpl();
        }
      }
      _response_transform = response_transform;
      if (responses > 0) {
        switch (_response_transform) {
          case STANDARDIZE:
            _normRespMul = MemoryManager.malloc8d(responses);
            _normRespSub = MemoryManager.malloc8d(responses);
            for (int i = 0; i < responses; ++i) {
              Vec v = fr.vec(fr.numCols() - responses + i);
              _normRespMul[i] = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0;
              _normRespSub[i] = v.mean();
            }
            break;
          case NORMALIZE:
            _normRespMul = MemoryManager.malloc8d(responses);
            _normRespSub = MemoryManager.malloc8d(responses);
            for (int i = 0; i < responses; ++i) {
              Vec v = fr.vec(fr.numCols() - responses + i);
              _normRespMul[i] = (v.max() - v.min() > 0) ? 1.0 / (v.max() - v.min()) : 1.0;
              _normRespSub[i] = v.mean();
            }
            break;
          case DEMEAN:
            _normRespMul = null;
            _normRespSub = MemoryManager.malloc8d(responses);
            for (int i = 0; i < responses; ++i) {
              Vec v = fr.vec(fr.numCols() - responses + i);
              _normRespSub[i] = v.mean();
            }
            break;
          case DESCALE:
            _normRespMul = MemoryManager.malloc8d(responses);
            _normRespSub = null;
            for (int i = 0; i < responses; ++i) {
              Vec v = fr.vec(fr.numCols() - responses + i);
              _normRespMul[i] = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0;
            }
            break;
          case NONE:
            _normRespMul = null;
            _normRespSub = null;
            break;
          default:
            throw H2O.unimpl();
        }
      }
      _useAllFactorLevels = false;
      _adaptedFrame.reloadVecs();
      _nfolds = nfolds;
      _foldId = foldId;
    }
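
    // The transforms above populate (sub, mul) pairs that are later applied as v' = (v - sub) * mul
    // (see map() below). A summary sketch, restating the cases of the switch:
    //   STANDARDIZE: sub = mean, mul = 1/sigma       (z-score; mul falls back to 1 when sigma == 0)
    //   NORMALIZE:   sub = mean, mul = 1/(max - min) (mul falls back to 1 for a constant column)
    //   DEMEAN:      sub = mean, mul absent (null)
    //   DESCALE:     sub absent (null), mul = 1/sigma
    // For example, a column with mean 10 and sigma 2 maps the value 14 to (14 - 10) * 0.5 = 2 under STANDARDIZE.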

    public DataInfo(Frame fr, int nResponses, boolean hasIntercept, boolean useAllFactorLevels,
                    TransformType predictor_transform, TransformType response_transform) {
      _nfolds = _foldId = 0;
      _predictor_transform = predictor_transform;
      _response_transform = response_transform;
      _responses = nResponses;
      _useAllFactorLevels = useAllFactorLevels;
      _catLvls = null;
      _hasIntercept = hasIntercept;
      final Vec[] vecs = fr.vecs();
      // compute rollup stats in parallel
      Futures fs = new Futures();
      for (Vec v : vecs) v.rollupStats(fs);
      fs.blockForPending();
      final int n = vecs.length - _responses;
      if (n < 1) throw new IllegalArgumentException("Training data must have at least one column.");
      int[] nums = MemoryManager.malloc4(n);
      int[] cats = MemoryManager.malloc4(n);
      int nnums = 0, ncats = 0;
      for (int i = 0; i < n; ++i) {
        if (vecs[i].isEnum()) cats[ncats++] = i;
        else nums[nnums++] = i;
      }
      _nums = nnums;
      _cats = ncats;
      // sort the cats in decreasing order of their domain size
      for (int i = 0; i < ncats; ++i)
        for (int j = i + 1; j < ncats; ++j)
          if (vecs[cats[i]].domain().length < vecs[cats[j]].domain().length) {
            int x = cats[i];
            cats[i] = cats[j];
            cats[j] = x;
          }
      Vec[] vecs2 = vecs.clone();
      String[] names = fr._names.clone();
      _catOffsets = MemoryManager.malloc4(ncats + 1);
      _catMissing = new int[ncats];
      int len = _catOffsets[0] = 0;
      for (int i = 0; i < ncats; ++i) {
        Vec v = (vecs2[i] = vecs[cats[i]]);
        names[i] = fr._names[cats[i]];
        _catMissing[i] = v.naCnt() > 0 ? 1 : 0; // needed for test time
        _catOffsets[i + 1] = (len += v.domain().length - (useAllFactorLevels ? 0 : 1) + (v.naCnt() > 0 ? 1 : 0)); // missing values turn into a new factor level
      }
      switch (predictor_transform) {
        case STANDARDIZE:
        case NORMALIZE:
          _normSub = MemoryManager.malloc8d(nnums);
          _normMul = MemoryManager.malloc8d(nnums);
          Arrays.fill(_normMul, 1);
          break;
        case DEMEAN:
          _normSub = MemoryManager.malloc8d(nnums);
          _normMul = null;
          break;
        case DESCALE:
          _normSub = null;
          _normMul = MemoryManager.malloc8d(nnums);
          break;
        case NONE:
          _normSub = _normMul = null;
          break;
        default:
          break;
      }
      for (int i = 0; i < nnums; ++i) {
        Vec v = (vecs2[i + ncats] = vecs[nums[i]]);
        names[i + ncats] = fr._names[nums[i]];
        switch (predictor_transform) {
          case STANDARDIZE:
            _normSub[i] = v.mean();
            _normMul[i] = v.sigma() != 0 ? 1.0 / v.sigma() : 1.0;
            break;
          case NORMALIZE:
            _normSub[i] = v.mean();
            _normMul[i] = (v.max() - v.min() > 0) ? 1.0 / (v.max() - v.min()) : 1.0;
            break;
          case DEMEAN:
            _normSub[i] = v.mean();
            break;
          case DESCALE:
            _normMul[i] = (v.sigma() != 0) ? 1.0 / v.sigma() : 1.0;
            break;
          case NONE:
            break;
          default:
            break;
        }
      }
      if (_responses > 0) {
        switch (response_transform) {
          case STANDARDIZE:
          case NORMALIZE:
            _normRespSub = MemoryManager.malloc8d(_responses);
            _normRespMul = MemoryManager.malloc8d(_responses);
            Arrays.fill(_normRespMul, 1);
            break;
          case DEMEAN:
            _normRespSub = MemoryManager.malloc8d(_responses);
            _normRespMul = null;
            break;
          case DESCALE:
            _normRespSub = null;
            _normRespMul = MemoryManager.malloc8d(_responses);
            break;
          case NONE:
            _normRespSub = _normRespMul = null;
            break;
          default:
            throw H2O.unimpl();
        }
        for (int i = 0; i < _responses; ++i) {
          Vec v = (vecs2[nnums + ncats + i] = vecs[nnums + ncats + i]);
          switch (response_transform) {
            case STANDARDIZE:
              _normRespSub[i] = v.mean();
              _normRespMul[i] = v.sigma() != 0 ? 1.0 / v.sigma() : 1.0;
              break;
            case NORMALIZE:
              _normRespSub[i] = v.mean();
              _normRespMul[i] = (v.max() - v.min() > 0) ? 1.0 / (v.max() - v.min()) : 1.0;
              break;
            case DEMEAN:
              _normRespSub[i] = v.mean();
              break;
            case DESCALE:
              _normRespMul[i] = v.sigma() != 0 ? 1.0 / v.sigma() : 1.0;
              break;
            case NONE:
              break;
            default:
              throw H2O.unimpl();
          }
        }
      }
      _adaptedFrame = new Frame(names, vecs2);
      _adaptedFrame.reloadVecs();
    }
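
    // Offset sketch for the constructor above: given categorical columns with domain sizes {4, 3}
    // (already sorted largest first), no missing values and useAllFactorLevels == false, each
    // column contributes domain-size - 1 slots, so _catOffsets == {0, 3, 5}: column 0 owns
    // expanded slots [0,3) and column 1 owns [3,5). A column containing NAs gets one extra
    // slot, since a missing value becomes its own factor level at test time.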

    public DataInfo filterExpandedColumns(int[] cols) {
      if (cols == null) return this;
      int i = 0, j = 0, ignoredCnt = 0;
      // public DataInfo(Frame fr, int hasResponses, boolean useAllFactorLvls, double [] normSub, double [] normMul, double [] normRespSub, double [] normRespMul){
      int[][] catLvls = new int[_cats][];
      int[] ignoredCols = MemoryManager.malloc4(_nums + _cats);
      // first do the categoricals...
      if (_catOffsets != null)
        while (i < cols.length && cols[i] < _catOffsets[_catOffsets.length - 1]) {
          int[] levels = MemoryManager.malloc4(_catOffsets[j + 1] - _catOffsets[j]);
          int k = 0;
          while (i < cols.length && cols[i] < _catOffsets[j + 1])
            levels[k++] = cols[i++] - _catOffsets[j];
          if (k > 0) catLvls[j] = Arrays.copyOf(levels, k);
          ++j;
        }
      for (int k = 0; k < catLvls.length; ++k)
        if (catLvls[k] == null) ignoredCols[ignoredCnt++] = k;
      if (ignoredCnt > 0) {
        int[][] c = new int[_cats - ignoredCnt][];
        int y = 0;
        for (int[] catLvl : catLvls)
          if (catLvl != null) c[y++] = catLvl;
        assert y == c.length;
        catLvls = c;
      }
      // now the numerics
      int prev = j = 0;
      for (; i < cols.length; ++i) {
        for (int k = prev; k < (cols[i] - numStart()); ++k) {
          ignoredCols[ignoredCnt++] = k + _cats;
          ++j;
        }
        prev = ++j;
      }
      for (int k = prev; k < _nums; ++k)
        ignoredCols[ignoredCnt++] = k + _cats;
      Frame f = new Frame(_adaptedFrame.names().clone(), _adaptedFrame.vecs().clone());
      if (ignoredCnt > 0) f.remove(Arrays.copyOf(ignoredCols, ignoredCnt));
      assert catLvls.length < f.numCols() : "cats = " + catLvls.length + " numcols = " + f.numCols();
      return new DataInfo(f, catLvls, _responses, _hasIntercept, _predictor_transform, _response_transform, _foldId, _nfolds);
    }
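
    // Hypothetical walk-through of filterExpandedColumns: with 2 categoricals owning expanded
    // slots [0,2) and [2,4) and 3 numeric columns at slots 4..6, cols == {0, 1, 4, 6} keeps both
    // levels of the first categorical, drops the second categorical entirely (no level selected),
    // and keeps numerics 0 and 2 while numeric 1 is removed from the new adapted frame.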

    public String toString() {
      return "";
    }

    public DataInfo getFold(int foldId, int nfolds) {
      return new DataInfo(this, foldId, nfolds);
    }

    public final int fullN()      { return _nums + _catOffsets[_cats]; }
    public final int largestCat() { return _cats > 0 ? _catOffsets[1] : 0; }
    public final int numStart()   { return _catOffsets[_cats]; }

    public final String[] coefNames() {
      int k = 0;
      final int n = fullN();
      String[] res = new String[n];
      final Vec[] vecs = _adaptedFrame.vecs();
      for (int i = 0; i < _cats; ++i) {
        for (int j = _useAllFactorLevels ? 0 : 1; j < vecs[i]._domain.length; ++j)
          res[k++] = _adaptedFrame._names[i] + "." + vecs[i]._domain[j];
        if (vecs[i].naCnt() > 0) res[k++] = _adaptedFrame._names[i] + ".missing(NA)";
      }
      final int nums = n - k;
      System.arraycopy(_adaptedFrame._names, _cats, res, k, nums);
      return res;
    }

    /**
     * Normalize horizontalized categoricals to become probabilities per factor level.
     * This is done with the SoftMax function.
     * @param in input values
     * @param out output values (can be the same as input)
     */
    public final void softMaxCategoricals(float[] in, float[] out) {
      if (_cats == 0) return;
      if (!_useAllFactorLevels)
        throw new UnsupportedOperationException("All factor levels must be present for re-scaling with SoftMax.");
      assert (in.length == out.length);
      assert (in.length == fullN());
      final Vec[] vecs = _adaptedFrame.vecs();
      int k = 0;
      for (int i = 0; i < _cats; ++i) {
        final int factors = vecs[i]._domain.length;
        final float max = Utils.maxValue(in, k, k + factors);
        float scale = 0;
        for (int j = 0; j < factors; ++j) {
          out[k + j] = (float) Math.exp(in[k + j] - max);
          scale += out[k + j];
        }
        for (int j = 0; j < factors; ++j) out[k + j] /= scale;
        k += factors;
      }
      assert (k == numStart());
    }

    /**
     * Undo the standardization/normalization of numerical columns.
     * @param in input values
     * @param out output values (can be the same as input)
     */
    public final void unScaleNumericals(float[] in, float[] out) {
      if (_nums == 0) return;
      assert (in.length == out.length);
      assert (in.length == fullN());
      for (int k = numStart(); k < fullN(); ++k)
        out[k] = in[k] / (float) _normMul[k - numStart()] + (float) _normSub[k - numStart()];
    }
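
    // Numeric sketch of the two helpers above: for a single categorical with 3 levels and
    // inputs in = {1, 2, 3}, softMaxCategoricals computes exp(in[j] - 3) / sum, roughly
    // {0.09, 0.24, 0.67}, which sums to 1. unScaleNumericals inverts the (sub, mul) transform,
    // mapping a standardized value z back to z / mul + sub = z * sigma + mean.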
  }

  @Override public T dfork(Frame fr) {
    assert fr == _dinfo._adaptedFrame;
    return super.dfork(fr);
  }

  /**
   * Override this to initialize at the beginning of chunk processing.
   */
  protected void chunkInit() {}

  /**
   * Override this to do post-chunk processing work.
   * @param n Number of processed rows
   */
  protected void chunkDone(long n) {}

  /**
   * Extracts the values, applies standardization/normalization to numerics, adds appropriate
   * offsets to categoricals, and adapts the response according to the CaseMode/CaseValue if set.
   */
  @Override public final void map(Chunk[] chunks, NewChunk[] outputs) {
    if (_jobKey != null && !Job.isRunning(_jobKey)) throw new JobCancelledException();
    final int nrows = chunks[0]._len;
    final long offset = chunks[0]._start;
    chunkInit();
    double[] nums = MemoryManager.malloc8d(_dinfo._nums);
    int[] cats = MemoryManager.malloc4(_dinfo._cats);
    double[] response = _dinfo._responses == 0 ? null : MemoryManager.malloc8d(_dinfo._responses);
    int start = 0;
    int end = nrows;

    Random skip_rng = null; // random generator for skipping rows
    // Example:
    //   _useFraction = 0.8 -> 1 repeat  with fraction = 0.8
    //   _useFraction = 1.0 -> 1 repeat  with fraction = 1.0
    //   _useFraction = 1.1 -> 2 repeats with fraction = 0.55
    //   _useFraction = 2.1 -> 3 repeats with fraction = 0.7
    //   _useFraction = 3.0 -> 3 repeats with fraction = 1.0
    final int repeats = (int) Math.ceil(_useFraction);
    final float fraction = _useFraction / repeats;
    if (fraction < 1.0) skip_rng = water.util.Utils.getDeterRNG(new Random().nextLong());

    long[] shuf_map = null;
    if (_shuffle) {
      shuf_map = new long[end - start];
      for (int i = 0; i < shuf_map.length; ++i) shuf_map[i] = start + i;
      Utils.shuffleArray(shuf_map, new Random().nextLong());
    }

    long num_processed_rows = 0;
    for (int rrr = 0; rrr < repeats; ++rrr) {
      OUTER:
      for (int rr = start; rr < end; ++rr) {
        final int r = shuf_map != null ? (int) shuf_map[rr - start] : rr;
        final long lr = r + chunks[0]._start;
        if ((_dinfo._nfolds > 0 && (lr % _dinfo._nfolds) == _dinfo._foldId)
            || (skip_rng != null && skip_rng.nextFloat() > fraction)) continue;
        ++num_processed_rows; // count rows with missing values even if they are skipped
        for (Chunk c : chunks)
          if (skipMissing() && c.isNA0(r)) continue OUTER; // skip rows with NAs!
        int i = 0, ncats = 0;
        for (; i < _dinfo._cats; ++i) {
          int c;
          if (chunks[i].isNA0(r)) {
            cats[ncats++] = (_dinfo._catOffsets[i + 1] - 1); // missing value turns into the extra (last) factor
          } else {
            c = (int) chunks[i].at80(r);
            if (_dinfo._catLvls != null) { // some levels are ignored?
              c = Arrays.binarySearch(_dinfo._catLvls[i], c);
              if (c >= 0) cats[ncats++] = c + _dinfo._catOffsets[i];
            } else if (_dinfo._useAllFactorLevels)
              cats[ncats++] = c + _dinfo._catOffsets[i];
            else if (c != 0)
              cats[ncats++] = c + _dinfo._catOffsets[i] - 1;
          }
        }
        final int n = chunks.length - _dinfo._responses;
        for (; i < n; ++i) {
          double d = chunks[i].at0(r); // can be NA if skipMissing() == false
          if (_dinfo._normSub != null) d -= _dinfo._normSub[i - _dinfo._cats];
          if (_dinfo._normMul != null) d *= _dinfo._normMul[i - _dinfo._cats];
          nums[i - _dinfo._cats] = d;
        }
        for (i = 0; i < _dinfo._responses; ++i) {
          response[i] = chunks[chunks.length - _dinfo._responses + i].at0(r);
          if (_dinfo._normRespSub != null) response[i] -= _dinfo._normRespSub[i];
          if (_dinfo._normRespMul != null) response[i] *= _dinfo._normRespMul[i];
          if (Double.isNaN(response[i])) continue OUTER; // skip rows without a valid response (no supervised training possible)
        }
        long seed = offset + rrr * (end - start) + r;
        if (outputs != null && outputs.length > 0)
          processRow(seed, nums, ncats, cats, response, outputs);
        else
          processRow(seed, nums, ncats, cats, response);
      }
    }
    chunkDone(num_processed_rows);
  }
}
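
// A minimal sketch of a concrete subclass (hypothetical class and field names, for illustration
// only; not part of the original file): sum the first transformed numeric predictor across rows.
//
//   public class SumTask extends FrameTask<SumTask> {
//     double _sum; // per-chunk partial sum, combined by reduce()
//     public SumTask(Key jobKey, DataInfo dinfo) { super(jobKey, dinfo); }
//     @Override protected void processRow(long gid, double[] nums, int ncats, int[] cats, double[] response) {
//       if (nums.length > 0) _sum += nums[0]; // nums already has the DataInfo transform applied
//     }
//     @Override public void reduce(SumTask other) { _sum += other._sum; }
//   }
//
// Assuming MRTask2's usual invocation style, it would run as
//   new SumTask(jobKey, dinfo).doAll(dinfo._adaptedFrame)
// with the result read from _sum after the task completes.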