package hex;

import water.*;
import water.fvec.*;
import water.util.ArrayUtils;

import java.util.ArrayList;
import java.util.Arrays;

/**
 * Created by tomasnykodym on 1/29/15.
 *
 * Provides a higher-level interface for accessing data row-wise.
 *
 * Performs on-the-fly auto-expansion of categorical variables (to one-hot encoding) and
 * transformation (standardize/normalize/demean/descale/none) of predictors and response.
 * Supports sparse data; sparse columns can be transformed to sparse rows on the fly with some
 * (significant) memory overhead, as the data of the whole chunk(s) will be copied.
 */
public class DataInfo extends Keyed<DataInfo> {
  public int [] _activeCols;
  public Frame _adaptedFrame; // the modified DataInfo frame (categorical columns sorted by cardinality, largest to smallest, followed by all numerical columns)
  public int _responses;      // number of responses
  public int _outputs;        // number of outputs

  public Vec setWeights(String name, Vec vec) {
    if(_weights)
      return _adaptedFrame.replace(weightChunkId(),vec);
    _adaptedFrame.insertVec(weightChunkId(),name,vec);
    _weights = true;
    return null;
  }

  public void dropWeights() {
    if(!_weights) return;
    _adaptedFrame.remove(weightChunkId());
    _weights = false;
  }

  public void dropInteractions() { // only called to cleanup the InteractionWrappedVecs!
    if(_interactions!=null) {
      Vec[] vecs = _adaptedFrame.remove(_interactionVecs);
      for(Vec v:vecs) v.remove();
      _interactions = null;
    }
  }

  public int[] activeCols() {
    if(_activeCols != null) return _activeCols;
    int [] res = new int[fullN()+1];
    for(int i = 0; i < res.length; ++i)
      res[i] = i;
    return res;
  }

  public void addResponse(String [] names, Vec[] vecs) {
    _adaptedFrame.add(names,vecs);
    _responses += vecs.length;
  }

  public int[] catNAFill() { return _catNAFill; }
  public int catNAFill(int cid) { return _catNAFill[cid]; }

  public void setCatNAFill(int[] catNAFill) {
    _catNAFill = catNAFill;
  }

  public enum TransformType {
    NONE, STANDARDIZE, NORMALIZE, DEMEAN, DESCALE;

    public boolean isMeanAdjusted(){
      switch(this){
        case NONE:
        case DESCALE:
        case NORMALIZE:
          return false;
        case STANDARDIZE:
        case DEMEAN:
          return true;
        default:
          throw H2O.unimpl();
      }
    }

    public boolean isSigmaScaled(){
      switch(this){
        case NONE:
        case DEMEAN:
        case NORMALIZE:
          return false;
        case STANDARDIZE:
        case DESCALE:
          return true;
        default:
          throw H2O.unimpl();
      }
    }
  }

  public TransformType _predictor_transform;
  public TransformType _response_transform;
  public boolean _useAllFactorLevels;
  public int _nums;              // "raw" number of numerical columns as they exist in the frame
  public int _cats;              // "raw" number of categorical columns as they exist in the frame
  public int [] _catOffsets;     // offset column indices for the 1-hot expanded values (includes enum-enum interaction)
  public boolean [] _catMissing; // bucket for missing levels
  private int [] _catNAFill;     // majority class of each categorical col (or last bucket if _catMissing[i] is true)
  public int [] _permutation;    // permutation array mapping input col indices to adaptedFrame
  public double [] _normMul;     // scale the predictor column by this value
  public double [] _normSub;     // subtract this value from the predictor column
  public double [] _normRespMul; // scale the response column by this value
  public double [] _normRespSub; // subtract this value from the response column
  public double [] _numMeans;
  public boolean _intercept = true;
  public boolean _offset;
  public boolean _weights;
  public boolean _fold;
  public Model.InteractionPair[] _interactions; // raw set of interactions
  public String[] _interactionColumns;          // the names of the columns to interact
  public int[] _interactionVecs;                // the interaction columns appearing in _adaptedFrame
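  // The adapted frame lays columns out as:
  //   [categoricals (sorted by cardinality, descending)] [numericals] [weight?] [offset?] [fold?] [response(s)] [output(s)?]
  // The chunk-id accessors below simply walk this layout. A minimal sketch of the arithmetic,
  // assuming 2 categorical columns, 3 numerical columns, a weight column and one response
  // (the counts are illustrative, not taken from any particular dataset):
  //
  //   weightChunkId()    == 2 + 3             == 5
  //   responseChunkId(0) == 0 + 2 + 3 + 1+0+0 == 6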
  public int[] _numOffsets; // offset column indices used by numerical interactions: total number of numerical columns is given by _numOffsets[_nums] - _numOffsets[0]

  public int responseChunkId(int n) { return n + _cats + _nums + (_weights?1:0) + (_offset?1:0) + (_fold?1:0); }
  public int foldChunkId()          { return _cats + _nums + (_weights?1:0) + (_offset?1:0); }
  public int offsetChunkId()        { return _cats + _nums + (_weights?1:0); }
  public int weightChunkId()        { return _cats + _nums; }
  public int outputChunkId()        { return outputChunkId(0); }
  public int outputChunkId(int n)   { return n + _cats + _nums + (_weights?1:0) + (_offset?1:0) + (_fold?1:0) + _responses; }

  public void addOutput(String name, Vec v) { _adaptedFrame.add(name,v); }
  public Vec getOutputVec(int i) { return _adaptedFrame.vec(outputChunkId(i)); }
  public void setResponse(String name, Vec v) { setResponse(name,v,0); }
  public void setResponse(String name, Vec v, int n) { _adaptedFrame.insertVec(responseChunkId(n),name,v); }

  public final boolean _skipMissing;
  public final boolean _imputeMissing;
  public boolean _valid; // DataInfo over validation data set, can have unseen (unmapped) categorical levels
  public final int [][] _catLvls; // cat lvls post filter (e.g. by strong rules)
  public final int [][] _intLvls; // interaction lvls post filter (e.g. by strong rules)

  private DataInfo() {
    _intLvls = null;
    _catLvls = null;
    _skipMissing = true;
    _imputeMissing = false;
    _valid = false;
    _offset = false;
    _weights = false;
    _fold = false;
  }

  public String[] _coefNames;

  @Override protected long checksum_impl() { throw H2O.unimpl(); } // don't really need checksum

  // Modify the train & valid frames directly; sort the categorical columns
  // up front according to size; compute the mean/sigma for each column for
  // later normalization.
  public DataInfo(Frame train, Frame valid, boolean useAllFactorLevels, TransformType predictor_transform, boolean skipMissing, boolean imputeMissing, boolean missingBucket) {
    this(train, valid, 0, useAllFactorLevels, predictor_transform, TransformType.NONE, skipMissing, imputeMissing, missingBucket,
         /* weight */ false, /* offset */ false, /* fold */ false, /* intercept */ false);
  }

  public DataInfo(Frame train, Frame valid, int nResponses, boolean useAllFactorLevels, TransformType predictor_transform, TransformType response_transform, boolean skipMissing, boolean imputeMissing, boolean missingBucket, boolean weight, boolean offset, boolean fold) {
    this(train,valid,nResponses,useAllFactorLevels,predictor_transform,response_transform,skipMissing,imputeMissing,missingBucket,weight,offset,fold,null);
  }

  /**
   * The train/valid Frame instances are sorted by categorical (themselves sorted by
   * cardinality greatest to least) with all numerical columns following. The response
   * column(s) are placed at the end.
   *
   * Interactions:
   *   1. Num-Num  (Note: N(0,1) * N(0,1) ~ N(0,1) )
   *   2. Num-Enum
   *   3. Enum-Enum
   *
   * Interactions are produced on the fly and are dense (in all 3 cases). Consumers of
   * DataInfo should not have to care how these interactions are generated. Any heuristic
   * using the fullN value should continue functioning the same.
   *
   * Interactions are specified in two ways:
   *   A. As a list of pairs of column indices.
   *   B. As a list of pairs of column indices with limited enums.
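   *
   * A hypothetical usage sketch (column names are illustrative, not from any particular dataset):
   * <pre>{@code
   *   // interact the "AGE" and "GENDER" columns; pairwise interactions are generated from the list
   *   DataInfo dinfo = new DataInfo(train, valid, 1, true,
   *                                 TransformType.STANDARDIZE, TransformType.NONE,
   *                                 true,   // skipMissing
   *                                 false,  // imputeMissing
   *                                 false,  // missingBucket
   *                                 false, false, false,
   *                                 new String[]{"AGE","GENDER"});
   * }</pre>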
   */
  public DataInfo(Frame train, Frame valid, int nResponses, boolean useAllFactorLevels, TransformType predictor_transform, TransformType response_transform, boolean skipMissing, boolean imputeMissing, boolean missingBucket, boolean weight, boolean offset, boolean fold, String[] interactions) {
    super(Key.<DataInfo>make());
    _valid = valid != null;
    assert predictor_transform != null;
    assert response_transform != null;
    _offset = offset;
    _weights = weight;
    _fold = fold;
    assert !(skipMissing && imputeMissing) : "skipMissing and imputeMissing cannot both be true";
    _skipMissing = skipMissing;
    _imputeMissing = imputeMissing;
    _predictor_transform = predictor_transform;
    _response_transform = response_transform;
    _responses = nResponses;
    _useAllFactorLevels = useAllFactorLevels;
    _interactionColumns = interactions;
    int[] interactionIDs = null;
    if( null!=interactions ) {
      interactionIDs = new int[interactions.length];
      for(int i=0;i<interactions.length;++i) {
        interactionIDs[i] = train.find(interactions[i]);
        if( interactionIDs[i]==-1 ) { interactionIDs=null; break; }
        // throw new IllegalArgumentException("missing column from the dataset, could not make interaction: " + interactions[i]);
      }
    }
    _interactions = Model.InteractionPair.generatePairwiseInteractionsFromList(interactionIDs);

    // create dummy InteractionWrappedVecs and shove them onto the front
    if( _interactions!=null ) {
      _interactionVecs = new int[_interactions.length];
      train = Model.makeInteractions(train, false, _interactions, _useAllFactorLevels, _skipMissing, predictor_transform==TransformType.STANDARDIZE).add(train);
      if( valid!=null )
        valid = Model.makeInteractions(valid, true, _interactions, _useAllFactorLevels, _skipMissing, predictor_transform==TransformType.STANDARDIZE).add(valid); // FIXME: should be using the training subs/muls!
    }
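    // Illustrative note (hypothetical columns, not from any particular dataset): if train held
    // [num1, cat1, cat2] and interactions = {cat1 x num1}, the InteractionWrappedVec for
    // cat1 x num1 is prepended above, so the counting and sorting below treat it like any other column.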
    _permutation = new int[train.numCols()];
    final Vec[] tvecs = train.vecs();

    // Count categorical-vs-numerical
    final int n = tvecs.length - _responses - (offset?1:0) - (weight?1:0) - (fold?1:0);
    int [] nums = MemoryManager.malloc4(n);
    int [] cats = MemoryManager.malloc4(n);
    int nnums = 0, ncats = 0;
    for(int i = 0; i < n; ++i)
      if (tvecs[i].isCategorical())
        cats[ncats++] = i;
      else
        nums[nnums++] = i;
    _nums = nnums;
    _cats = ncats;
    _catLvls = new int[ncats][];

    // sort the cats in decreasing order of cardinality
    for(int i = 0; i < ncats; ++i)
      for(int j = i+1; j < ncats; ++j)
        if( tvecs[cats[i]].domain().length < tvecs[cats[j]].domain().length ) {
          int x = cats[i];
          cats[i] = cats[j];
          cats[j] = x;
        }

    String[] names = new String[train.numCols()];
    Vec[] tvecs2 = new Vec[train.numCols()];

    // Compute the cardinality of each cat
    _catNAFill = new int[ncats];
    _catOffsets = MemoryManager.malloc4(ncats+1);
    _catMissing = new boolean[ncats];
    int len = _catOffsets[0] = 0;
    int interactionIdx = 0; // simple index into the _interactionVecs array
    ArrayList<Integer> interactionIds;
    if( _interactions==null ) {
      interactionIds = new ArrayList<>();
      for(int i=0;i<tvecs.length;++i)
        if( tvecs[i] instanceof InteractionWrappedVec )
          interactionIds.add(i);
      if( interactionIds.size() > 0 ) {
        _interactionVecs = new int[interactionIds.size()];
        for (int i = 0; i < _interactionVecs.length; ++i)
          _interactionVecs[i] = interactionIds.get(i);
      }
    }
    for(int i = 0; i < ncats; ++i) {
      names[i] = train._names[cats[i]];
      Vec v = (tvecs2[i] = tvecs[cats[i]]);
      _catMissing[i] = missingBucket; // needed for test time
      if( v instanceof InteractionWrappedVec ) {
        if( _interactions!=null ) _interactions[interactionIdx].vecIdx = i;
        _interactionVecs[interactionIdx++] = i; // i (and not cats[i]) because this is the index in _adaptedFrame
        _catOffsets[i+1] = (len += v.domain().length + (missingBucket ? 1 : 0));
      } else
        _catOffsets[i+1] = (len += v.domain().length - (useAllFactorLevels?0:1) + (missingBucket ? 1 : 0)); // missing values turn into a new factor level
      _catNAFill[i] = imputeMissing
              ? imputeCat(train.vec(cats[i]),_useAllFactorLevels)
              : _catMissing[i] ? v.domain().length - (_useAllFactorLevels || isInteractionVec(i)?0:1) : -100;
      _permutation[i] = cats[i];
    }
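    // Example of the bookkeeping above (hypothetical cardinalities; useAllFactorLevels=false,
    // missingBucket=false): two cats with domains of size 5 and 3 yield
    //   _catOffsets = {0, 4, 6}
    // i.e., each categorical contributes (cardinality - 1) one-hot columns, and the numeric
    // offsets computed below start at len == 6.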
    _numOffsets = MemoryManager.malloc4(nnums+1);
    _numOffsets[0] = len;
    boolean isIWV; // is InteractionWrappedVec?
    for(int i = 0; i < nnums; ++i) {
      names[i+ncats] = train._names[nums[i]];
      Vec v = train.vec(nums[i]);
      tvecs2[i+ncats] = v;
      isIWV = v instanceof InteractionWrappedVec;
      if( isIWV ) {
        if( null!=_interactions ) _interactions[interactionIdx].vecIdx = i+ncats;
        _interactionVecs[interactionIdx++] = i+ncats;
      }
      _numOffsets[i+1] = (len += (isIWV ? ((InteractionWrappedVec) v).expandedLength() : 1));
      _permutation[i+ncats] = nums[i];
    }
    _numMeans = new double[numNums()];
    int meanIdx = 0;
    for(int i=0;i<nnums;++i) {
      Vec v = train.vec(nums[i]);
      if( v instanceof InteractionWrappedVec ) {
        InteractionWrappedVec iwv = (InteractionWrappedVec)v;
        double[] means = iwv.getMeans();
        int start = iwv._useAllFactorLevels?0:1;
        int length = iwv.expandedLength();
        System.arraycopy(means,start,_numMeans,meanIdx,length);
        meanIdx += length;
      } else
        _numMeans[meanIdx++] = v.mean();
    }
    for(int i = names.length - nResponses - (weight?1:0) - (offset?1:0) - (fold?1:0); i < names.length; ++i) {
      names[i] = train._names[i];
      tvecs2[i] = train.vec(i);
    }
    _adaptedFrame = new Frame(names,tvecs2);
    train.restructure(names,tvecs2);
    if (valid != null)
      valid.restructure(names,valid.vecs(names));
    // _adaptedFrame = train;
    setPredictorTransform(predictor_transform);
    if(_responses > 0)
      setResponseTransform(response_transform);
    _intLvls = new int[_interactionVecs==null?0:_interactionVecs.length][];
  }

  public DataInfo(Frame train, Frame valid, int nResponses, boolean useAllFactorLevels, TransformType predictor_transform, TransformType response_transform, boolean skipMissing, boolean imputeMissing, boolean missingBucket, boolean weight, boolean offset, boolean fold, boolean intercept) {
    this(train, valid, nResponses, useAllFactorLevels, predictor_transform, response_transform, skipMissing, imputeMissing, missingBucket, weight, offset, fold);
    _intercept = intercept;
  }

  public DataInfo validDinfo(Frame valid) {
    DataInfo res = new DataInfo(_adaptedFrame,null,1,_useAllFactorLevels,TransformType.NONE,TransformType.NONE,_skipMissing,_imputeMissing,!(_skipMissing || _imputeMissing),_weights,_offset,_fold);
    res._interactions = _interactions;
    res._interactionColumns = _interactionColumns;
    if( _interactionColumns!=null ) {
      int[] interactions = new int[_interactionColumns.length];
      for(int i=0;i<interactions.length;++i)
        interactions[i] = valid.find(_interactionColumns[i]);
      valid = Model.makeInteractions(valid, true, _interactions, _useAllFactorLevels, _skipMissing, false).add(valid);
    }
    res._adaptedFrame = new Frame(_adaptedFrame.names(),valid.vecs(_adaptedFrame.names()));
    res._valid = true;
    return res;
  }

  public double[] denormalizeBeta(double [] beta) {
    int N = fullN()+1;
    assert (beta.length % N) == 0 : "beta len = " + beta.length + " expected multiple of " + N;
    int nclasses = beta.length/N;
    beta = MemoryManager.arrayCopyOf(beta,beta.length);
    if (_predictor_transform == DataInfo.TransformType.STANDARDIZE) {
      for(int c = 0; c < nclasses; ++c) {
        int off = N*c;
        double norm = 0.0; // Reverse any normalization on the intercept
        // denormalize only the numeric coefs (categoricals are not normalized)
        final int numoff = numStart();
        for (int i = numoff; i < N-1; i++) {
          double b = beta[off + i] * _normMul[i - numoff];
          norm += b * _normSub[i - numoff]; // Also accumulate the intercept adjustment
          beta[off + i] = b;
        }
        beta[off + N - 1] -= norm;
      }
    }
    return beta;
  }
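  // The denormalization above undoes standardization of the coefficients: for each standardized
  // numeric predictor i, _normMul[i] == 1/sigma_i and _normSub[i] == mean_i, so
  //   beta_denorm_i = beta_i * _normMul[i]
  // and the intercept absorbs the accumulated shift
  //   intercept -= sum_i beta_denorm_i * mean_i.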
  private int [] _fullCatOffsets;
  private int [][] _catMap;

  protected int [] fullCatOffsets() { return _fullCatOffsets == null ? _catOffsets : _fullCatOffsets; }

  // private constructor called by filterExpandedColumns
  private DataInfo(DataInfo dinfo, Frame fr, double [] normMul, double [] normSub, int[][] catLevels, int[][] intLvls, int [] catModes, int[] activeCols) {
    _activeCols = activeCols;
    _fullCatOffsets = dinfo._catOffsets;
    if(!dinfo._useAllFactorLevels) {
      _fullCatOffsets = dinfo._catOffsets.clone();
      for (int i = 0; i < _fullCatOffsets.length; ++i)
        _fullCatOffsets[i] += i; // add for the skipped zeros
    }
    _cats = catLevels.length;
    _catMap = new int[_cats][];
    _offset = dinfo._offset;
    _weights = dinfo._weights;
    _fold = dinfo._fold;
    _valid = false;
    _interactions = null;
    ArrayList<Integer> interactionVecs = new ArrayList<>();
    for(int i=0;i<fr.numCols();++i)
      if( fr.vec(i) instanceof InteractionWrappedVec )
        interactionVecs.add(i);
    if( interactionVecs.size() > 0 ) {
      _interactionVecs = new int[interactionVecs.size()];
      for (int i = 0; i < _interactionVecs.length; ++i)
        _interactionVecs[i] = interactionVecs.get(i);
    }
    assert dinfo._predictor_transform != null;
    assert dinfo._response_transform != null;
    _predictor_transform = dinfo._predictor_transform;
    _response_transform = dinfo._response_transform;
    _skipMissing = dinfo._skipMissing;
    _imputeMissing = dinfo._imputeMissing;
    _adaptedFrame = fr;
    _catOffsets = MemoryManager.malloc4(catLevels.length + 1);
    _catMissing = new boolean[catLevels.length];
    Arrays.fill(_catMissing, !(dinfo._imputeMissing || dinfo._skipMissing));
    int s = 0;
    for(int i = 0; i < catLevels.length; ++i){
      if(catLevels[i] != null) {
        _catMap[i] = new int[_adaptedFrame.vec(i).cardinality()];
        Arrays.fill(_catMap[i],-1);
        for (int j = 0; j < catLevels[i].length; j++)
          _catMap[i][catLevels[i][j]] = j;
      }
      _catOffsets[i] = s;
      s += catLevels[i].length;
    }
    _catOffsets[_catOffsets.length-1] = s;
    _catLvls = catLevels;
    _intLvls = intLvls;
    _responses = dinfo._responses;
    _useAllFactorLevels = true; // dinfo._useAllFactorLevels;
    _normMul = normMul;
    _normSub = normSub;
    _catNAFill = catModes;
  }

  public static int imputeCat(Vec v) { return imputeCat(v,true); }
  public static int imputeCat(Vec v, boolean useAllFactorLevels) {
    if(v.isCategorical()) {
      if (useAllFactorLevels) return v.mode();
      long[] bins = v.bins();
      return ArrayUtils.maxIndex(bins,1);
    }
    return (int)Math.round(v.mean());
  }

  /**
   * Filter the _adaptedFrame so that it contains only the Vecs referenced by the cols
   * parameter.
   *
   * @param cols Array of the expanded column indices to keep.
   * @return A DataInfo with _activeCols specifying the active columns
   */
  public DataInfo filterExpandedColumns(int [] cols){
    assert _activeCols==null;
    assert _predictor_transform != null;
    assert _response_transform != null;
    if(cols == null) return IcedUtils.deepCopy(this); // keep all columns
    int hasIcpt = (cols.length > 0 && cols[cols.length-1] == fullN()) ? 1 : 0;
    int i = 0, j = 0, ignoredCnt = 0;
    int [][] catLvls = new int[_cats][]; // categorical levels to keep (used in the getCategoricalId binary search)
    int [][] intLvls = new int[_interactionVecs==null?0:_interactionVecs.length][]; // interaction levels to keep (used in the getCategoricalIdFromInteraction binary search)
    int [] ignoredCols = MemoryManager.malloc4(_nums + _cats); // capital 'V' Vec indices to be frame.remove'd
    // first do categoricals...
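    // Hypothetical walk-through of the loop below (assuming _useAllFactorLevels so coff == 0):
    // with _catOffsets = {0, 4, 6} and cols = {1, 3, 4}, the first cat keeps expanded levels
    // {1, 3} and the second keeps {4 - 4} = {0}; a cat whose catLvls slot stays null is slated
    // for removal via ignoredCols.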
    if(_catOffsets != null) {
      int coff = _useAllFactorLevels?0:1;
      while (i < cols.length && cols[i] < numStart()) { // iterate over categorical cols
        int[] levels = MemoryManager.malloc4(_catOffsets[j+1] - _catOffsets[j]);
        int k = 0; // keep track of how many levels we have (so we can "trim" the levels array when inserting into catLvls)
        while (i < cols.length && cols[i] < _catOffsets[j+1])
          levels[k++] = (cols[i++] - _catOffsets[j]) + coff;
        if (k > 0)
          catLvls[j] = Arrays.copyOf(levels, k);
        ++j;
      }
    }
    int [] catModes = _catNAFill;
    for(int k = 0; k < catLvls.length; ++k)
      if(catLvls[k] == null) ignoredCols[ignoredCnt++] = k;
    if(ignoredCnt > 0){
      int [][] cs = new int[_cats-ignoredCnt][];
      catModes = new int[_cats-ignoredCnt];
      int y = 0;
      for (int c = 0; c < catLvls.length; ++c)
        if (catLvls[c] != null) {
          catModes[y] = _catNAFill[c];
          cs[y++] = catLvls[c];
        }
      assert y == cs.length;
      catLvls = cs;
    }
    // now do the interaction vecs -- these happen to always sit first in the "nums" section of _adaptedFrame
    // also, these have the exact same filtering logic as the categoricals above
    int prev = j = 0; // reset j for _numOffsets
    if( _interactionVecs!=null ) {
      while( i < cols.length && cols[i] < _numOffsets[intLvls.length] ) {
        int[] lvls = MemoryManager.malloc4(_numOffsets[j+1] - _numOffsets[j]);
        int k = 0; // same as above
        while(i < cols.length && cols[i] < _numOffsets[j+1])
          lvls[k++] = (cols[i++] - _numOffsets[j]); // no useAllFactorLevels offset since it's tucked away in the count already
        if( k > 0 )
          intLvls[j] = Arrays.copyOf(lvls,k);
        ++j;
      }
      int preIgnoredCnt = ignoredCnt;
      for(int k=0;k<intLvls.length;++k)
        if( null==intLvls[k] )
          ignoredCols[ignoredCnt++] = k+_cats;
      if( ignoredCnt > preIgnoredCnt ) { // got more ignored, trim out the nulls
        int[][] is = new int[_interactionVecs.length - (ignoredCnt-preIgnoredCnt)][];
        int y = 0;
        for (int[] intLvl : intLvls)
          if (intLvl != null)
            is[y++] = intLvl;
        intLvls = is;
      }
    }
    // now numerics
    prev = j = _interactionVecs==null?0:_interactionVecs.length;
    for(;i<cols.length;++i){
      int numsToIgnore = (cols[i]-_numOffsets[j]);
      for(int k=0;k<numsToIgnore;++k){
        ignoredCols[ignoredCnt++] = _cats+prev++;
        ++j;
      }
      prev = ++j;
    }
    for(int k = prev; k < _nums; ++k)
      ignoredCols[ignoredCnt++] = k+_cats;
    Frame f = new Frame(_adaptedFrame.names().clone(),_adaptedFrame.vecs().clone());
    if(ignoredCnt > 0)
      f.remove(Arrays.copyOf(ignoredCols,ignoredCnt));
    assert catLvls.length < f.numCols() : "cats = " + catLvls.length + " numcols = " + f.numCols();
    double [] normSub = null;
    double [] normMul = null;
    int id = Arrays.binarySearch(cols,numStart());
    if(id < 0) id = -id-1;
    int nnums = cols.length - id - hasIcpt;
    int off = numStart();
    if(_normSub != null) {
      normSub = new double[nnums];
      for(int k = id; k < (id + nnums); ++k)
        normSub[k-id] = _normSub[cols[k]-off];
    }
    if(_normMul != null) {
      normMul = new double[nnums];
      for(int k = id; k < (id + nnums); ++k)
        normMul[k-id] = _normMul[cols[k]-off];
    }
    DataInfo dinfo = new DataInfo(this,f,normMul,normSub,catLvls,intLvls,catModes,cols);
    dinfo._nums = f.numCols() - dinfo._cats - dinfo._responses - (dinfo._offset?1:0) - (dinfo._weights?1:0) - (dinfo._fold?1:0);
    dinfo._numMeans = new double[nnums];
    for(int k=id; k < (id+nnums); ++k)
      dinfo._numMeans[k-id] = _numMeans[cols[k]-off];
    return dinfo;
  }
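  // Minimal usage sketch (hypothetical caller): a strong-rules style filter might keep only
  // expanded columns {0, 2, 7} plus the intercept slot at fullN():
  //
  //   DataInfo active = dinfo.filterExpandedColumns(new int[]{0, 2, 7, dinfo.fullN()});
  //
  // The returned DataInfo records the kept expanded indices in _activeCols and rebuilds
  // _normSub/_normMul for just the surviving numeric columns.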
  public void updateWeightedSigmaAndMean(double [] sigmas, double [] mean) {
    int sub = numNums() - _nums; // numNums() > _nums in the case of numerical interactions
    if(_predictor_transform.isSigmaScaled()) {
      if(sigmas.length + sub != _normMul.length)
        throw new IllegalArgumentException("Length of sigmas does not match number of scaled columns.");
      for(int i = 0; i < _normMul.length; ++i)
        _normMul[i] = i<sub ? _normMul[i] : (sigmas[i-sub] != 0 ? 1.0/sigmas[i-sub] : 1);
    }
    if(_predictor_transform.isMeanAdjusted()) {
      if(mean.length + sub != _normSub.length)
        throw new IllegalArgumentException("Length of means does not match number of scaled columns.");
      for(int i=0;i<_normSub.length;++i)
        _normSub[i] = i<sub ? _normSub[i] : mean[i-sub];
    }
  }

  public void updateWeightedSigmaAndMeanForResponse(double [] sigmas, double [] mean) {
    if(_response_transform.isSigmaScaled()) {
      if(sigmas.length != _normRespMul.length)
        throw new IllegalArgumentException("Length of sigmas does not match number of scaled columns.");
      for(int i = 0; i < sigmas.length; ++i)
        _normRespMul[i] = sigmas[i] != 0 ? 1.0/sigmas[i] : 1;
    }
    if(_response_transform.isMeanAdjusted()) {
      if(mean.length != _normRespSub.length)
        throw new IllegalArgumentException("Length of means does not match number of scaled columns.");
      System.arraycopy(mean,0,_normRespSub,0,mean.length);
    }
  }

  private void setTransform(TransformType t, double [] normMul, double [] normSub, int vecStart, int n) {
    int idx = 0; // idx != i when interactions are in play; otherwise it's just 'i'
    for (int i = 0; i < n; ++i) {
      Vec v = _adaptedFrame.vec(vecStart + i);
      boolean isIWV = v instanceof InteractionWrappedVec;
      switch (t) {
        case STANDARDIZE:
          if( isIWV ) {
            InteractionWrappedVec iwv = (InteractionWrappedVec)v;
            for(int offset=0;offset<iwv.expandedLength();++offset) {
              normMul[idx+offset] = iwv.getMul(offset+(_useAllFactorLevels?0:1));
              normSub[idx+offset] = iwv.getSub(offset+(_useAllFactorLevels?0:1));
            }
          } else {
            normMul[idx] = (v.sigma() != 0) ? 1.0/v.sigma() : 1.0;
            normSub[idx] = v.mean();
          }
          break;
        case NORMALIZE:
          if( isIWV ) throw H2O.unimpl();
          normMul[idx] = (v.max() - v.min() > 0) ? 1.0/(v.max() - v.min()) : 1.0;
          normSub[idx] = v.mean();
          break;
        case DEMEAN:
          if( isIWV ) throw H2O.unimpl();
          normMul[idx] = 1;
          normSub[idx] = v.mean();
          break;
        case DESCALE:
          if( isIWV ) throw H2O.unimpl();
          normMul[idx] = (v.sigma() != 0) ? 1.0/v.sigma() : 1.0;
          normSub[idx] = 0;
          break;
        default:
          throw H2O.unimpl();
      }
      assert !Double.isNaN(normMul[idx]);
      assert !Double.isNaN(normSub[idx]);
      idx = isIWV ? (idx+nextNumericIdx(i)) : (idx+1);
    }
  }

  public void setPredictorTransform(TransformType t){
    _predictor_transform = t;
    if(t == TransformType.NONE) {
      _normMul = null;
      _normSub = null;
    } else {
      _normMul = MemoryManager.malloc8d(numNums());
      _normSub = MemoryManager.malloc8d(numNums());
      setTransform(t,_normMul,_normSub,_cats,_nums);
    }
  }

  public void setResponseTransform(TransformType t){
    _response_transform = t;
    if(t == TransformType.NONE) {
      _normRespMul = null;
      _normRespSub = null;
    } else {
      _normRespMul = MemoryManager.malloc8d(_responses);
      _normRespSub = MemoryManager.malloc8d(_responses);
      setTransform(t,_normRespMul,_normRespSub,_adaptedFrame.numCols()-_responses,_responses);
    }
  }

  public boolean isInteractionVec(int colid) {
    if( null==_interactions && null==_interactionVecs ) return false;
    if( _adaptedFrame!=null )
      return _adaptedFrame.vec(colid) instanceof InteractionWrappedVec;
    else
      return Arrays.binarySearch(_interactionVecs,colid) >= 0;
  }
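  // The transforms above store per-column statistics so that each numeric value x is later
  // mapped to (x - _normSub[i]) * _normMul[i] during row extraction. Under the enum semantics:
  //   STANDARDIZE: sub = mean, mul = 1/sigma
  //   NORMALIZE:   sub = mean, mul = 1/(max - min)
  //   DEMEAN:      sub = mean, mul = 1
  //   DESCALE:     sub = 0,    mul = 1/sigma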
  /**
   * Get the fully expanded number of predictor columns.
   * Note that this value does not include:
   *   response column(s)
   *   weight column
   *   offset column
   *   fold column
   *
   * @return expanded number of columns in the underlying frame
   */
  public final int fullN()      { return numNums() + numCats(); }
  public final int largestCat() { return _cats > 0 ? _catOffsets[1] : 0; }
  public final int numStart()   { return _catOffsets[_cats]; }
  public final int numCats()    { return _catOffsets[_cats]; }
  public final int numNums()    {
    int nnums = 0;
    if( _numOffsets==null && _intLvls.length>0 ) { // filtered columns?
      for (int[] _intLvl : _intLvls)
        nnums += _intLvl==null ? 0 : _intLvl.length-1; // minus 1 for the fact that we get a +1 from the dummy interaction vec sitting in the frame!
      return nnums+_nums;
    }
    return _interactionVecs!=null && _numOffsets!=null ? (_numOffsets[_numOffsets.length-1]-numStart()) : _nums;
  }

  /**
   * Get the next expanded number-column index.
   */
  public final int nextNumericIdx(int currentColIdx) {
    if( _numOffsets==null ) {
      if( currentColIdx < _interactionVecs.length ) // currently sitting on an interaction vec, return the number of levels
        return _intLvls[currentColIdx].length;
      else
        return 1;
    }
    if( currentColIdx+1 >= _numOffsets.length )
      return fullN() - _numOffsets[currentColIdx];
    return _numOffsets[currentColIdx+1] - _numOffsets[currentColIdx];
  }
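  // Illustrative expansion for coefNames() below (hypothetical frame): a categorical "color"
  // with domain {red, green, blue} plus a numeric "age" produce coefficient names
  //   color.red, color.green, color.blue, age   (with _useAllFactorLevels)
  //   color.green, color.blue, age              (without; reference level dropped)
  // plus "color.missing(NA)" when a missing bucket is kept.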
  public final String[] coefNames() {
    if (_coefNames != null) return _coefNames; // already computed
    int k = 0;
    final int n = fullN(); // total number of columns to compute
    String [] res = new String[n];
    final Vec [] vecs = _adaptedFrame.vecs();

    // first do all of the expanded categorical names
    for(int i = 0; i < _cats; ++i) {
      for (int j = (_useAllFactorLevels || vecs[i] instanceof InteractionWrappedVec) ? 0 : 1; j < vecs[i].domain().length; ++j) {
        int jj = getCategoricalId(i, j);
        if(jj < 0)
          continue;
        res[k++] = _adaptedFrame._names[i] + "." + vecs[i].domain()[j];
      }
      if (_catMissing[i] && getCategoricalId(i, -1) >= 0)
        res[k++] = _adaptedFrame._names[i] + ".missing(NA)";
      if( vecs[i] instanceof InteractionWrappedVec ) {
        InteractionWrappedVec iwv = (InteractionWrappedVec)vecs[i];
        if( null!=iwv.missingDomains() ) {
          for(String s: iwv.missingDomains() )
            res[k++] = s+".missing(NA)";
        }
      }
    }

    // now loop over the numerical columns, collecting up any expanded InteractionVec names
    if( _interactions==null ) {
      final int nums = n-k;
      System.arraycopy(_adaptedFrame._names, _cats, res, k, nums);
    } else {
      for (int i = 0; i <= _nums; i++) {
        InteractionWrappedVec v;
        if( i+_cats >= n || k >= n ) break;
        if (vecs[i+_cats] instanceof InteractionWrappedVec && ((v = (InteractionWrappedVec) vecs[i+_cats]).domain() != null)) { // in this case, get the categorical offset
          for (int j = _useAllFactorLevels?0:1; j < v.domain().length; ++j) {
            if (getCategoricalIdFromInteraction(_cats+i, j) < 0)
              continue;
            res[k++] = _adaptedFrame._names[i+_cats] + "." + v.domain()[j];
          }
        } else
          res[k++] = _adaptedFrame._names[i+_cats];
      }
    }
    _coefNames = res;
    return res;
  }

  // Return the permutation array mapping input names to adaptedFrame colnames
  public int[] mapNames(String[] names) {
    assert names.length == _adaptedFrame._names.length : "Names must be the same length!";
    int[] idx = new int[names.length];
    Arrays.fill(idx, -1);
    for(int i = 0; i < _adaptedFrame._names.length; i++) {
      for(int j = 0; j < names.length; j++) {
        if( names[j].equals(_adaptedFrame.name(i)) ) {
          idx[i] = j;
          break;
        }
      }
    }
    return idx;
  }

  /**
   * Undo the standardization/normalization of numerical columns
   * @param in input values
   * @param out output values (can be the same as input)
   */
  public final void unScaleNumericals(double[] in, double[] out) {
    if (_nums == 0) return;
    assert (in.length == out.length);
    assert (in.length == fullN());
    for (int k = numStart(); k < fullN(); ++k) {
      double m = _normMul == null ? 1f : _normMul[k-numStart()];
      double s = _normSub == null ? 0f : _normSub[k-numStart()];
      out[k] = in[k] / m + s;
    }
  }

  public final class Row extends Iced {
    public boolean predictors_bad; // should the row be skipped (GLM skips NAs, for example)
    public boolean response_bad;
    public boolean isBad() { return predictors_bad || response_bad; }
    public double [] numVals;  // the backing data of the row
    public double [] response;
    public int [] numIds;      // location of next sparse value
    public int [] binIds;      // location of categorical
    public long rid;           // row number (sometimes within chunk, or absolute)
    public int cid;            // categorical id
    public int nBins;          // number of enum    columns (not expanded)
    public int nNums;          // number of numeric columns (not expanded)
    public int nOutputs;
    public double offset = 0;
    public double weight = 1;
    private C8DChunk [] _outputs;

    public void setOutput(int i, double v) { _outputs[i].set8D(cid,v); }
    public double getOutput(int i) { return _outputs[i].get8D(cid); }
    public final boolean isSparse() { return numIds != null; }

    public double[] mtrxMul(double [][] m, double [] res) {
      for(int i = 0; i < m.length; ++i)
        res[i] = innerProduct(m[i],false);
      return res;
    }

    public Row(boolean sparse, int nNums, int nBins, int nresponses, int i, long start) {
      binIds = MemoryManager.malloc4(nBins);
      numVals = MemoryManager.malloc8d(nNums);
      response = MemoryManager.malloc8d(nresponses);
      if(sparse)
        numIds = MemoryManager.malloc4(nNums);
      this.nNums = sparse ? 0 : nNums;
      cid = i;
      rid = start + i;
    }

    public Row(boolean sparse, double[] numVals, int[] binIds, double[] response, int i, long start) {
      int nNums = numVals == null ? 0 : numVals.length;
      this.numVals = numVals;
      if(sparse)
        numIds = MemoryManager.malloc4(nNums);
      this.nNums = sparse ? 0 : nNums;
      this.nBins = binIds == null ? 0 : binIds.length;
      this.binIds = binIds;
      this.response = response;
      cid = i;
      rid = start + i;
    }

    public Row(double [] nums) {
      numVals = nums;
      nNums = nums.length;
    }

    public double response(int i) { return response[i]; }

    public double get(int i) {
      int off = numStart();
      if(i >= off) { // numbers
        if(numIds == null)
          return numVals[i-off];
        int j = Arrays.binarySearch(numIds,0,nNums,i);
        return j >= 0 ? numVals[j] : 0;
      } else { // categoricals
        int j = Arrays.binarySearch(binIds,0,nBins,i);
        return j >= 0 ? 1 : 0;
      }
    }

    public void addNum(int id, double val) {
      if(numIds.length == nNums) {
        int newSz = Math.max(4,numIds.length + (numIds.length >> 1));
        numIds = Arrays.copyOf(numIds, newSz);
        numVals = Arrays.copyOf(numVals, newSz);
      }
      int i = nNums++;
      numIds[i] = id;
      numVals[i] = val;
    }
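    // Sparse-row bookkeeping sketch (values are illustrative): a row whose expanded form is
    //   [0, 1, 0 | 0.0, 2.5, 0.0]  with numStart() == 3
    // is stored as binIds = {1} (the hot categorical), numIds = {4}, numVals = {2.5};
    // get(4) binary-searches numIds and returns 2.5, while get(3) returns 0.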
    /*
     * This method performs an inner product of rows. It can handle categorical data as well
     * as numerical data; however, the two rows must have exactly the same column types.
     * This is used in situations where the rows come from the same dataset.
     */
    public final double dotSame(Row rowj) {
      // nums
      double elementij = 0.0;
      for(int i = 0; i < this.nNums; ++i)
        elementij += this.numVals[i]*rowj.numVals[i]; // multiply numerical parts of columns
      // cat X cat
      if (this.binIds.length > 0) { // categorical columns exist
        for (int j = 0; j < this.nBins; ++j)
          if (this.binIds[j] == rowj.binIds[j])
            elementij += 1;
      }
      return elementij*this.weight*rowj.weight;
    }

    public final double innerProduct(double [] vec) { return innerProduct(vec,false); }

    public final double innerProduct(double [] vec, boolean icptFirst) {
      double res = 0;
      int off = 0;
      if(icptFirst) {
        off = 1;
        res = vec[0];
      }
      int numStart = off + numStart();
      for(int i = 0; i < nBins; ++i)
        res += vec[off+binIds[i]];
      if(numIds == null) {
        for (int i = 0; i < numVals.length; ++i)
          res += numVals[i] * vec[numStart + i];
      } else {
        for (int i = 0; i < nNums; ++i)
          res += numVals[i] * vec[off+numIds[i]];
      }
      if(_intercept && !icptFirst)
        res += vec[vec.length-1];
      return res;
    }

    public double[] expandCats() {
      if(isSparse() || _responses > 0) throw H2O.unimpl();
      int N = fullN();
      int numStart = numStart();
      double[] res = new double[N + (_intercept ? 1 : 0)];
      for(int i = 0; i < nBins; ++i)
        res[binIds[i]] = 1;
      if(numIds == null) {
        System.arraycopy(numVals,0,res,numStart,numVals.length);
      } else {
        for(int i = 0; i < nNums; ++i)
          res[numIds[i]] = numVals[i];
      }
      if(_intercept)
        res[res.length-1] = 1;
      return res;
    }

    public String toString() {
      return this.rid + Arrays.toString(Arrays.copyOf(binIds,nBins)) + ", " + Arrays.toString(numVals);
    }

    public void setResponse(int i, double z) { response[i] = z; }

    public void standardize(double[] normSub, double[] normMul) {
      if(numIds == null){
        for(int i = 0; i < numVals.length; ++i)
          numVals[i] = (numVals[i] - normSub[i])*normMul[i];
      } else
        for(int i = 0; i < nNums; ++i) {
          int j = numIds[i];
          numVals[i] = (numVals[i] - normSub[j])*normMul[j];
        }
    }
  }

  public final int getCategoricalId(int cid, double val) {
    if(Double.isNaN(val))
      return getCategoricalId(cid, -1);
    int ival = (int)val;
    if(ival != val)
      throw new IllegalArgumentException("Categorical id must be an integer or NA (missing).");
    return getCategoricalId(cid,ival);
  }
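  // Mapping sketch for getCategoricalId below (hypothetical offsets, _useAllFactorLevels=true):
  // with _catOffsets = {0, 4, 6}, level 1 of categorical column 1 lands at expanded index
  // 1 + _catOffsets[1] == 5; an NA is first remapped to _catNAFill[1]. A return of -1 means
  // "skip this column" (e.g., the dropped reference level).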
  /**
   * Get the offset into the expanded categorical
   * @param cid the column id
   * @param val the integer representation of the categorical level
   * @return offset into the fullN set of columns
   */
  public final int getCategoricalId(int cid, int val) {
    boolean isIWV = isInteractionVec(cid);
    if(val == -1) { // NA
      val = _catNAFill[cid];
    } else if( !_useAllFactorLevels && !isIWV ) // categorical interaction vecs drop the reference level in a special way
      val -= 1;
    if(val < 0) return -1; // column is to be skipped
    int [] offs = fullCatOffsets();
    int expandedVal = val + offs[cid];
    if(expandedVal >= offs[cid+1]) { // previously unseen level
      assert _valid : "Categorical value out of bounds, got " + val + ", next cat starts at " + fullCatOffsets()[cid+1];
      if(_skipMissing) return -1;
      val = _catNAFill[cid];
    }
    if (_catMap != null && _catMap[cid] != null) { // some levels are ignored?
      val = _catMap[cid][val];
      assert _useAllFactorLevels;
    }
    return val < 0 ? -1 : val + _catOffsets[cid];
  }

  public final int getCategoricalIdFromInteraction(int cid, int val) {
    InteractionWrappedVec v;
    if( (v = (InteractionWrappedVec)_adaptedFrame.vec(cid)).isCategorical() )
      return getCategoricalId(cid,val);
    assert v.domain()!=null : "No domain levels found for interactions! cid: " + cid + " val: " + val;
    cid -= _cats;
    if( val >= _numOffsets[cid+1] ) { // previously unseen interaction (aka new domain level)
      assert _valid : "interaction value out of bounds, got " + val + ", next cat starts at " + _numOffsets[cid+1];
      val = v.mode();
    }
    if( cid < _intLvls.length && _intLvls[cid]!=null ) {
      assert _useAllFactorLevels;
      val = Arrays.binarySearch(_intLvls[cid],val);
    }
    return val < 0 ? -1 : val+_numOffsets[cid];
  }

  public final Row extractDenseRow(Chunk[] chunks, int rid, Row row) {
    row.predictors_bad = false;
    row.response_bad = false;
    row.rid = rid + chunks[0].start();
    row.cid = rid;
    if(_weights)
      row.weight = chunks[weightChunkId()].atd(rid);
    if(row.weight == 0) return row;
    if (_skipMissing) {
      int N = _cats + _nums;
      for (int i = 0; i < N; ++i)
        if (chunks[i].isNA(rid)) {
          row.predictors_bad = true;
          return row;
        }
    }
    int nbins = 0;
    for (int i = 0; i < _cats; ++i) {
      int cid = getCategoricalId(i, chunks[i].isNA(rid) ? _catNAFill[i] : (int)chunks[i].at8(rid));
      if(cid >= 0)
        row.binIds[nbins++] = cid;
    }
    row.nBins = nbins;
    final int n = _nums;
    int numValsIdx = 0; // since we're dense, need a second index to track interaction nums
    for( int i = 0; i < n; i++ ) {
      if( isInteractionVec(_cats + i) ) { // categorical-categorical interaction is handled as plain categorical (above)... so if we have interactions either v1 is categorical, v2 is categorical, or neither are categorical
        InteractionWrappedVec iwv = (InteractionWrappedVec)_adaptedFrame.vec(_cats+i);
        int interactionOffset = getInteractionOffset(chunks,_cats+i,rid);
        for(int offset = 0; offset < iwv.expandedLength(); ++offset) {
          if( i < _intLvls.length && _intLvls[i]!=null && Arrays.binarySearch(_intLvls[i],offset) < 0 ) continue; // skip the filtered-out interactions
          double d = 0;
          if( offset==interactionOffset )
            d = chunks[_cats + i].atd(rid);
          if( Double.isNaN(d) )
            d = _numMeans[numValsIdx];
          if( _normMul != null && _normSub != null )
            d = (d - _normSub[numValsIdx]) * _normMul[numValsIdx];
          row.numVals[numValsIdx++] = d;
        }
      } else {
        double d = chunks[_cats + i].atd(rid); // can be NA if skipMissing() == false
        if (Double.isNaN(d))
          d = _numMeans[numValsIdx];
        if (_normMul != null && _normSub != null)
          d = (d - _normSub[numValsIdx]) * _normMul[numValsIdx];
        row.numVals[numValsIdx++] = d;
      }
    }
    for (int i = 0; i < _responses; ++i) {
      row.response[i] = chunks[responseChunkId(i)].atd(rid);
      if(Double.isNaN(row.response[i])) {
        row.response_bad = true;
        break;
      }
      if (_normRespMul != null)
        row.response[i] = (row.response[i] - _normRespSub[i]) * _normRespMul[i];
    }
    if(_offset)
      row.offset = chunks[offsetChunkId()].atd(rid);
    return row;
  }

  public int getInteractionOffset(Chunk[] chunks, int cid, int rid) {
    boolean useAllFactors = ((InteractionWrappedVec)chunks[cid].vec())._useAllFactorLevels;
    InteractionWrappedVec.InteractionWrappedChunk c = (InteractionWrappedVec.InteractionWrappedChunk)chunks[cid];
    if( c._c1IsCat )      return (int)c._c[0].at8(rid)-(useAllFactors?0:1);
    else if( c._c2IsCat ) return (int)c._c[1].at8(rid)-(useAllFactors?0:1);
    return 0;
  }

  public Vec getWeightsVec() { return _adaptedFrame.vec(weightChunkId()); }
  public Vec getOffsetVec()  { return _adaptedFrame.vec(offsetChunkId()); }
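  // Typical consumption sketch inside an MRTask.map (hypothetical caller; beta is an
  // illustrative coefficient vector of length fullN()+1):
  //
  //   Row row = dinfo.newDenseRow();
  //   for (int r = 0; r < chunks[0]._len; ++r) {
  //     dinfo.extractDenseRow(chunks, r, row);
  //     if (row.isBad() || row.weight == 0) continue;
  //     double eta = row.innerProduct(beta); // expanded categoricals and scaled nums included
  //   }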
  public Row newDenseRow() { // TODO: _nums => numNums since currently extracting out interactions into dense
    return new Row(false,numNums(),_cats,_responses,0,0);
  }

  public Row newDenseRow(double[] numVals, long start) {
    return new Row(false, numVals, null, null, 0, start);
  }

  public final class Rows {
    public final int _nrows;
    private final Row _denseRow;
    private final Row [] _sparseRows;
    public final boolean _sparse;
    private final Chunk [] _chks;

    private Rows(Chunk [] chks, boolean sparse) {
      _nrows = chks[0]._len;
      _sparse = sparse;
      long start = chks[0].start();
      if(sparse) {
        _denseRow = null;
        _chks = null;
        _sparseRows = extractSparseRows(chks);
      } else {
        _denseRow = DataInfo.this.newDenseRow();
        _chks = chks;
        _sparseRows = null;
      }
    }

    public Row row(int i) { return _sparse ? _sparseRows[i] : extractDenseRow(_chks,i,_denseRow); }
  }

  public Rows rows(Chunk [] chks) {
    int cnt = 0;
    for(Chunk c:chks)
      if(c.isSparseZero())
        ++cnt;
    return rows(chks, cnt > (chks.length >> 1));
  }

  public Rows rows(Chunk [] chks, boolean sparse) { return new Rows(chks,sparse); }
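  // The parameterless rows(Chunk[]) above uses a simple density heuristic: it takes the sparse
  // path when more than half of the chunks report isSparseZero(). Callers that already know the
  // desired representation can force it via rows(chks, sparse).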
  /**
   * Extract (sparse) rows from given chunks.
   * Note: 0 remains 0 - _normSub of DataInfo isn't used (the mean shift during standardization
   * is not reverted) - UNLESS an offset is specified (for GLM only).
   * Essentially turns the dataset 90 degrees.
   * @param chunks - chunk of dataset
   * @return array of sparse rows
   */
  public final Row[] extractSparseRows(Chunk [] chunks) {
    Row[] rows = new Row[chunks[0]._len];
    long startOff = chunks[0].start();
    for (int i = 0; i < rows.length; ++i) {
      rows[i] = new Row(true, Math.min(_nums, 16), _cats, _responses, i, startOff); // if sparse, _nums is the correct number of nonzero values! i.e., do not use numNums()
      rows[i].rid = chunks[0].start() + i;
      if(_offset) {
        rows[i].offset = chunks[offsetChunkId()].atd(i);
        if(Double.isNaN(rows[i].offset)) {
          rows[i].predictors_bad = true;
          continue;
        }
      }
      if(_weights) {
        rows[i].weight = chunks[weightChunkId()].atd(i);
        if(Double.isNaN(rows[i].weight))
          rows[i].predictors_bad = true;
      }
    }
    // categoricals
    for (int i = 0; i < _cats; ++i) {
      for (int r = 0; r < chunks[0]._len; ++r) {
        Row row = rows[r];
        boolean isMissing = chunks[i].isNA(r);
        if(_skipMissing && isMissing){
          row.predictors_bad = true;
          continue;
        }
        int cid = getCategoricalId(i, isMissing ? -1 : (int)chunks[i].at8(r));
        if(cid >= 0)
          row.binIds[row.nBins++] = cid;
      }
    }
    // generic numbers + interactions
    int interactionOffset = 0;
    for (int cid = 0; cid < _nums; ++cid) {
      Chunk c = chunks[_cats + cid];
      int oldRow = -1;
      if (c instanceof InteractionWrappedVec.InteractionWrappedChunk) { // for each row, only 1 value in an interaction is 'hot'; all other values are off (i.e., are 0)
        InteractionWrappedVec iwv = (InteractionWrappedVec)c.vec();
        for(int r = 0; r < c._len; ++r) { // the vec is "vertically" dense and "horizontally" sparse (i.e., every row has one, and only one, value)
          Row row = rows[r];
          if( c.isNA(r) && _skipMissing )
            row.predictors_bad = true;
          if(row.predictors_bad) continue;
          int cidVirtualOffset = getInteractionOffset(chunks,_cats+cid,r); // the "virtual" offset into the hot-expanded interaction
          if( cidVirtualOffset >= 0 ) {
            if( cid < _intLvls.length && _intLvls[cid]!=null && Arrays.binarySearch(_intLvls[cid],cidVirtualOffset) < 0 ) continue; // skip the filtered-out interactions
            if( c.atd(r)==0 ) continue;
            double d = c.atd(r);
            if( Double.isNaN(d) )
              d = _numMeans[interactionOffset+cidVirtualOffset]; // FIXME: if this produces a "true" NA then should sub with mean? with?
            if (_normMul != null)
              d *= _normMul[interactionOffset+cidVirtualOffset];
            row.addNum(numStart()+interactionOffset+cidVirtualOffset, d);
          }
        }
        interactionOffset += nextNumericIdx(cid);
      } else {
        for (int r = c.nextNZ(-1); r < c._len; r = c.nextNZ(r)) {
          if (c.atd(r) == 0) continue;
          assert r > oldRow;
          oldRow = r;
          Row row = rows[r];
          if (c.isNA(r) && _skipMissing)
            row.predictors_bad = true;
          if (row.predictors_bad) continue;
          double d = c.atd(r);
          if (Double.isNaN(d))
            d = _numMeans[cid];
          if (_normMul != null)
            d *= _normMul[interactionOffset];
          row.addNum(numStart()+interactionOffset, d);
        }
        interactionOffset++;
      }
    }
    // response(s)
    for (int i = 1; i <= _responses; ++i) {
      int rid = responseChunkId(i-1);
      Chunk rChunk = chunks[rid];
      for (int r = 0; r < chunks[0]._len; ++r) {
        Row row = rows[r];
        row.response[i-1] = rChunk.atd(r);
        if(Double.isNaN(row.response[i-1]))
          row.response_bad = true;
        if (_normRespMul != null)
          row.response[i-1] = (row.response[i-1] - _normRespSub[i-1]) * _normRespMul[i-1];
      }
    }
    return rows;
  }

  public DataInfo scoringInfo(String [] names, Frame adaptFrame){
    DataInfo res = IcedUtils.deepCopy(this);
    res._normMul = null;
    res._normSub = null;
    res._normRespMul = null;
    res._normRespSub = null;
    res._predictor_transform = TransformType.NONE;
    res._response_transform = TransformType.NONE;
    res._adaptedFrame = adaptFrame;
    res._weights = _weights && adaptFrame.find(names[weightChunkId()]) != -1;
    res._offset = _offset && adaptFrame.find(names[offsetChunkId()]) != -1;
    res._fold = _fold && adaptFrame.find(names[foldChunkId()]) != -1;
    int resId = adaptFrame.find(names[responseChunkId(0)]);
    if(resId == -1 || adaptFrame.vec(resId).isBad())
      res._responses = 0;
    else // NOTE: DataInfo can have extra columns encoded as response, e.g. helper columns when doing Multinomial IRLSM; don't need those for scoring!
      res._responses = 1;
    res._valid = true;
    res._interactions = _interactions;
    res._interactionColumns = _interactionColumns;
    // ensure that vecs are in the DKV, may have been swept up in the Scope.exit call
    for( Vec v: res._adaptedFrame.vecs() )
      if( v instanceof InteractionWrappedVec ) {
        ((InteractionWrappedVec)v)._useAllFactorLevels = _useAllFactorLevels;
        ((InteractionWrappedVec)v)._skipMissing = _skipMissing;
        DKV.put(v);
      }
    return res;
  }
}