package ai.h2o.automl;
import ai.h2o.automl.UserFeedbackEvent.*;
import ai.h2o.automl.collectors.MetaCollector;
import ai.h2o.automl.colmeta.ColMeta;
import ai.h2o.automl.utils.AutoMLUtils;
import hex.tree.DHistogram;
import water.*;
import water.fvec.*;
import water.rapids.Rapids;
import water.rapids.Val;
import water.rapids.ast.prims.mungers.AstNaOmit;
import water.util.ArrayUtils;
import water.util.Log;
import java.util.*;
import static ai.h2o.automl.utils.AutoMLUtils.intListToA;
/**
* Cache common questions asked upon the frame.
*/
public class FrameMetadata extends Iced {
  final String _datasetName;        // dataset display name, reported in the meta-feature table
  public final Frame _fr;           // the frame being profiled
  public int[] _catFeats;           // indexes of non-ignored, non-response categorical features
  public int[] _numFeats;           // indexes of non-ignored, non-response numeric features
  public int[] _intCols;            // indexes of integer-valued numeric features
  public int[] _dblCols;            // indexes of real-valued (non-integer) numeric features
  public int[] _binaryCols;         // indexes of binary numeric features
  public int[] _intNotBinaryCols;   // indexes of integer features that are not binary
  public int _response;             // index of the response column; -1 => resolved lazily by response()
  private boolean _isClassification;  // set from the response column's metadata in computeFrameMetaPass1()
  private String[] _ignoredCols;    // lazily-built names of ignored columns
  private String[] _includeCols;    // names of columns used as predictors
  private long _naCnt=-1; // count of nas across whole frame; -1 => not yet computed (sentinel used throughout)
  private int _numFeat=-1; // count of numerical features
  private int _catFeat=-1; // count of categorical features
  private long _nclass=-1; // number of classes if classification problem
  private double[][] _dummies=null; // dummy predictions
  public ColMeta[] _cols;           // per-column metadata, aligned with _fr's columns
  public Vec[] _trainTestWeight; // weight vecs for train/test splits; built lazily by weights(), may be null
  private long _featsWithNa=-1; // count of features with nas
  private long _rowsWithNa=-1; // count of rows with nas
  // NOTE: -1 doubles as the "not yet computed" sentinel for all summary stats below,
  // so a legitimately computed value of exactly -1 would be recomputed on next access.
  private double _minSkewness=-1; // minimum skewness across all numeric features
  private double _maxSkewness=-1; // maximum skewness across all numeric features
  private double _meanSkewness=-1; // mean skewness across all numeric features
  private double _stdSkewness=-1; // standard deviation in skewness across all numeric features
  private double _medianSkewness=-1; // median skewness across all numeric features
  private double _minKurtosis=-1; // minimum kurtosis across all numeric features
  private double _maxKurtosis=-1; // maximum kurtosis across all numeric features
  private double _meanKurtosis=-1; // mean kurtosis across all numeric features
  private double _stdKurtosis=-1; // standard deviation in kurtosis across all numeric features
  private double _medianKurtosis=-1; // median kurtosis across all numeric features
  private double _minCardinality=-1; // minimum count of symbols across all categorical features
  private double _maxCardinality=-1; // maximum count of symbols across all categorical features
  private double _meanCardinality=-1; // mean count of symbols across all categorical features
  private double _stdCardinality=-1; // standard deviation in count of symbols across all categorical features
  private double _medianCardinality=-1; // median count of symbols across all categorical features
  private UserFeedback _userFeedback;   // sink for progress/info events during analysis
  private AstNaOmit astNaOmit;          // presumably reserved for NA omission; unused in this chunk -- TODO confirm
  public static final double SQLNAN = -99999;  // sentinel written to the meta table when a value is undefined
/** Remove the lazily-created train/test weight Vecs from the cluster, if any were built. */
public void delete() {
  // BUG FIX: _trainTestWeight is only assigned by weights(); if that was never
  // called, the field is null and the old loop threw a NullPointerException.
  if( null==_trainTestWeight ) return;
  for(Vec v: _trainTestWeight)
    if( null!=v ) v.remove();
}
// TODO: UGH: use reflection!
// Ordered list of meta-feature names. makeEmptyFrameMeta() keys its map off this
// list, and fillSimpleMeta() must stay in sync with it.
public final static String[] METAVALUES = new String[]{
  "DatasetName", "NRow", "NCol", "LogNRow", "LogNCol", "NACount", "NAFraction",
  "NumberNumericFeat", "NumberCatFeat", "RatioNumericToCatFeat", "RatioCatToNumericFeat",
  "DatasetRatio", "LogDatasetRatio", "InverseDatasetRatio", "LogInverseDatasetRatio",
  "Classification", "DummyStratMSE", "DummyStratLogLoss", "DummyMostFreqMSE",
  "DummyMostFreqLogLoss", "DummyRandomMSE", "DummyRandomLogLoss", "DummyMedianMSE",
  "DummyMeanMSE", "NClass", "FeatWithNAs","RowsWithNAs","MinSkewness","MaxSkewness",
  "MeanSkewness","StdSkewness","MedianSkewness","MinKurtosis","MaxKurtosis",
  "MeanKurtosis","StdKurtosis","MedianKurtosis", "MinCardinality","MaxCardinality",
  "MeanCardinality","StdCardinality","MedianCardinality"
};
// TODO: UGH: use reflection!
/** @return a map with one null-valued entry per METAVALUES key, preserving that order. */
public static HashMap<String, Object> makeEmptyFrameMeta() {
  HashMap<String, Object> result = new LinkedHashMap<>(); // LinkedHashMap: callers rely on METAVALUES ordering
  for (String metaKey : METAVALUES)
    result.put(metaKey, null);
  return result;
}
// Takes empty frame meta hashmap and fills in the metadata not requiring MRTask
// TODO: make helper functions so that it's possible to iterate over METAVALUES only
/**
 * Fill the cheap (non-MRTask) meta-features into {@code fm}.
 * Keys must match METAVALUES; values are Doubles except "DatasetName" (String)
 * and "Classification" (Integer 0/1 -- NOTE(review): inconsistent with the
 * Double values elsewhere, confirm consumers before changing).
 * @param fm map created by makeEmptyFrameMeta()
 */
public void fillSimpleMeta(HashMap<String, Object> fm) {
  fm.put("DatasetName", _datasetName);
  fm.put("NRow", (double)_fr.numRows());
  fm.put("NCol", (double)_fr.numCols());
  fm.put("LogNRow", Math.log((double)fm.get("NRow")));
  fm.put("LogNCol", Math.log((double)fm.get("NCol")));
  fm.put("NACount", _fr.naCount());
  fm.put("NAFraction", _fr.naFraction());
  double numFeat = (double) numberOfNumericFeatures();
  double catFeat = (double) numberOfCategoricalFeatures();
  fm.put("NumberNumericFeat", numFeat);
  fm.put("NumberCatFeat", catFeat);
  // BUG FIX: the old guards tested Double.isInfinite(count), which is never true
  // for a finite feature count, so a zero denominator silently produced Infinity.
  // Guard on the denominator itself and emit the SQLNAN sentinel instead.
  fm.put("RatioNumericToCatFeat", catFeat == 0 ? SQLNAN : numFeat / catFeat);
  fm.put("RatioCatToNumericFeat", numFeat == 0 ? SQLNAN : catFeat / numFeat);
  fm.put("DatasetRatio", (double) _fr.numCols() / (double) _fr.numRows());
  fm.put("LogDatasetRatio", Math.log((double) fm.get("DatasetRatio")));
  fm.put("InverseDatasetRatio", (double)_fr.numRows() / (double) _fr.numCols() );
  fm.put("LogInverseDatasetRatio", Math.log((double)fm.get("InverseDatasetRatio")));
  fm.put("Classification", _isClassification?1:0);
  fm.put("FeatWithNAs", (double)na_FeatureCount());
  fm.put("RowsWithNAs",(double)rowsWithNa());
  fm.put("NClass",(double)nClass());
  // summary stats are {min, max, mean, sd, median} -- see skewness()/kurtosis()/cardinality()
  double[] skew = skewness();
  fm.put("MinSkewness", skew[0]);
  fm.put("MaxSkewness", skew[1]);
  fm.put("MeanSkewness", skew[2]);
  fm.put("StdSkewness", skew[3]);
  fm.put("MedianSkewness", skew[4]);
  double[] kurt = kurtosis();
  fm.put("MinKurtosis", kurt[0]);
  fm.put("MaxKurtosis", kurt[1]);
  fm.put("MeanKurtosis", kurt[2]);
  fm.put("StdKurtosis", kurt[3]);
  fm.put("MedianKurtosis", kurt[4]);
  double[] card = cardinality();
  fm.put("MinCardinality", card[0]);
  fm.put("MaxCardinality", card[1]);
  fm.put("MeanCardinality", card[2]);
  fm.put("StdCardinality", card[3]);
  fm.put("MedianCardinality", card[4]);
}
/**
 * Get the non-ignored columns that are not in the filter; do not include the response.
 * @param filterThese remove these columns
 * @return an int[] of the non-ignored column indexes
 */
public int[] diffCols(int[] filterThese) {
  HashSet<Integer> excluded = new HashSet<>();
  for (int col : filterThese) excluded.add(col);
  ArrayList<Integer> keep = new ArrayList<>();
  for (int col = 0; col < _cols.length; ++col) {
    boolean skip = _cols[col]._ignored || _cols[col]._response || excluded.contains(col);
    if (!skip) keep.add(col);
  }
  return intListToA(keep);
}
/** @return cached count of non-ignored, non-response features containing at least one NA. */
public long na_FeatureCount() {
  if (_featsWithNa != -1) return _featsWithNa;
  long featuresWithNa = 0;
  for (ColMeta c : _cols) {
    // TODO(review): confirm constant columns are already folded into _ignored
    if (!c._ignored && !c._response && c._percentNA != 0)
      featuresWithNa++;
  }
  return (_featsWithNa = featuresWithNa);
}
/** @return cached count of rows containing at least one NA, computed via a Rapids na.omit call. */
public long rowsWithNa() {
  if (_rowsWithNa != -1) return _rowsWithNa;
  Val omitted = Rapids.exec(String.format("(na.omit %s)", _fr._key));
  Frame complete = omitted.getFrame();
  long naRows = _fr.numRows() - complete.numRows(); // rows dropped == rows that had an NA
  complete.delete();
  return (_rowsWithNa = naRows);
}
/**
 * Number of classes if this is a classification problem, 0 otherwise (cached).
 * Uses the domain size of the response column.
 */
public long nClass() {
  if( _nclass!=-1 ) return _nclass;
  if( !_isClassification ) return (_nclass=0);
  return (_nclass = _fr.vec(_response).domain().length);
}
/**
 * Summary of skewness over all non-ignored, non-response numeric features (cached).
 * @return {min, max, mean, sd, median}; all NaN when the frame has no numeric features.
 */
public double[] skewness() {
  double[] ar = new double[]{_minSkewness, _maxSkewness, _meanSkewness, _stdSkewness, _medianSkewness};
  // NOTE: -1 doubles as the "uncomputed" sentinel, so a genuine value of -1 is recomputed
  if( _minSkewness!=-1 && _maxSkewness!=-1 && _meanSkewness!=-1 && _stdSkewness!=-1 && _medianSkewness!=-1) return ar;
  if( !isAnyNumeric() ) {
    Arrays.fill(ar, Double.NaN);
    _minSkewness = _maxSkewness = _meanSkewness = _stdSkewness = _medianSkewness = Double.NaN;
    return ar;
  }
  // gather the per-column skewness values computed in MetaPass1
  double[] perCol = new double[numberOfNumericFeatures()];
  int ind = 0;
  for (ColMeta c : _cols)
    if (!c._ignored && !c._response && c._isNumeric)
      perCol[ind++] = c._skew;
  // BUG FIX: use a fresh unique key (was the fixed key "keyD", which concurrent
  // metadata computations could clobber in the DKV) and guarantee frame cleanup.
  Frame dr = new Frame(Key.<Frame>make(), new String[]{"vd1"}, new Vec[]{dvec(perCol)});
  DKV.put(dr);
  try {
    ar[0] = _minSkewness    = rapidMin(dr);
    ar[1] = _maxSkewness    = rapidMax(dr);
    ar[2] = _meanSkewness   = rapidMean(dr);
    ar[3] = _stdSkewness    = rapidSd(dr);
    ar[4] = _medianSkewness = rapidMedian(dr);
  } finally {
    dr.remove();
  }
  return ar;
}
/**
 * Summary of kurtosis over all non-ignored, non-response numeric features (cached).
 * @return {min, max, mean, sd, median}; all NaN when the frame has no numeric features.
 */
public double[] kurtosis() {
  double[] ar = new double[]{_minKurtosis, _maxKurtosis, _meanKurtosis, _stdKurtosis, _medianKurtosis};
  // NOTE: -1 doubles as the "uncomputed" sentinel, so a genuine value of -1 is recomputed
  if( _minKurtosis!=-1 && _maxKurtosis!=-1 && _meanKurtosis!=-1 && _stdKurtosis!=-1 && _medianKurtosis!=-1) return ar;
  if( !isAnyNumeric() ) {
    Arrays.fill(ar, Double.NaN);
    _minKurtosis = _maxKurtosis = _meanKurtosis = _stdKurtosis = _medianKurtosis = Double.NaN;
    return ar;
  }
  // gather the per-column kurtosis values computed in MetaPass1
  double[] perCol = new double[numberOfNumericFeatures()];
  int ind = 0;
  for (ColMeta c : _cols)
    if (!c._ignored && !c._response && c._isNumeric)
      perCol[ind++] = c._kurtosis;
  // BUG FIX: use a fresh unique key (was the fixed key "keyD", which concurrent
  // metadata computations could clobber in the DKV) and guarantee frame cleanup.
  Frame dr = new Frame(Key.<Frame>make(), new String[]{"vd1"}, new Vec[]{dvec(perCol)});
  DKV.put(dr);
  try {
    ar[0] = _minKurtosis    = rapidMin(dr);
    ar[1] = _maxKurtosis    = rapidMax(dr);
    ar[2] = _meanKurtosis   = rapidMean(dr);
    ar[3] = _stdKurtosis    = rapidSd(dr);
    ar[4] = _medianKurtosis = rapidMedian(dr);
  } finally {
    dr.remove();
  }
  return ar;
}
/**
 * Summary of cardinality over all non-ignored, non-response categorical features (cached).
 * @return {min, max, mean, sd, median}; all NaN when the frame has no categorical features.
 */
public double[] cardinality() {
  double[] ar = new double[]{_minCardinality, _maxCardinality, _meanCardinality, _stdCardinality, _medianCardinality};
  // NOTE: -1 doubles as the "uncomputed" sentinel, so a genuine value of -1 is recomputed
  if( _minCardinality!=-1 && _maxCardinality!=-1 && _meanCardinality!=-1 && _stdCardinality!=-1 && _medianCardinality!=-1) return ar;
  if( !isAnyCategorical() ) {
    Arrays.fill(ar, Double.NaN);
    _minCardinality = _maxCardinality = _meanCardinality = _stdCardinality = _medianCardinality = Double.NaN;
    return ar;
  }
  // gather the per-column cardinalities computed in MetaPass1
  double[] perCol = new double[numberOfCategoricalFeatures()];
  int ind = 0;
  for (ColMeta c : _cols)
    if (!c._ignored && !c._response && c._isCategorical)
      perCol[ind++] = c._cardinality;
  // BUG FIX: use a fresh unique key (was the fixed key "keyD", which concurrent
  // metadata computations could clobber in the DKV) and guarantee frame cleanup.
  Frame dr = new Frame(Key.<Frame>make(), new String[]{"vd1"}, new Vec[]{dvec(perCol)});
  DKV.put(dr);
  try {
    ar[0] = _minCardinality    = rapidMin(dr);
    ar[1] = _maxCardinality    = rapidMax(dr);
    ar[2] = _meanCardinality   = rapidMean(dr);
    ar[3] = _stdCardinality    = rapidSd(dr);
    ar[4] = _medianCardinality = rapidMedian(dr);
  } finally {
    dr.remove();
  }
  return ar;
}
/**
 * Build a numeric Vec from an array of doubles.
 * The values are written into a single chunk, so this is intended for small
 * arrays (here: one value per feature).
 */
public static Vec dvec(double...rows) {
  Key<Vec> k = Vec.VectorGroup.VG_LEN1.addVec();  // fresh key in the length-1 vector group
  Futures fs = new Futures();
  AppendableVec avec = new AppendableVec(k, Vec.T_NUM);
  NewChunk chunk = new NewChunk(avec, 0);         // chunk index 0 holds all values
  for (double r : rows)
    chunk.addNum(r);
  chunk.close(0, fs);
  Vec vec = avec.layout_and_close(fs);            // finalize chunk layout into a readable Vec
  fs.blockForPending();                           // wait for pending writes before handing the Vec out
  return vec;
}
/** @return true iff at least one non-ignored, non-response numeric feature exists. */
public boolean isAnyNumeric() {
  // idiom cleanup: return directly instead of the old flag-variable-and-break dance
  for (ColMeta c : _cols)
    if (!c._ignored && !c._response && c._isNumeric)
      return true;
  return false;
}
/** @return true iff at least one non-ignored, non-response categorical feature exists. */
public boolean isAnyCategorical() {
  // idiom cleanup: return directly instead of the old flag-variable-and-break dance
  for (ColMeta c : _cols)
    if (!c._ignored && !c._response && c._isCategorical)
      return true;
  return false;
}
/** Minimum over the (single-column) frame via a Rapids call. */
public double rapidMin(Frame dr) {
  return Rapids.exec("(min " + dr._key + ")").getNum();
}
/** Maximum over the (single-column) frame via a Rapids call. */
public double rapidMax(Frame dr) {
  return Rapids.exec("(max " + dr._key + ")").getNum();
}
/**
 * Mean via a Rapids call. AstMean accepts a flag to treat NAs as 0 or ignore them completely.
 * @param ignore_na when true, NAs are excluded from the mean
 */
public double rapidMean(Frame dr, boolean ignore_na) {
  Val meanVal = Rapids.exec("(mean " + dr._key + " " + ignore_na + " false)");
  return meanVal.getRow()[0];
}
/** Mean with the default of ignore_na = true. */
public double rapidMean(Frame dr) {
  return rapidMean(dr, true);
}
/** Standard deviation via a Rapids call (NAs ignored). */
public double rapidSd(Frame dr) {
  return Rapids.exec("(sd " + dr._key + " true)").getNums()[0];
}
/** Median via a Rapids call (NAs ignored). */
public double rapidMedian(Frame dr) {
  return Rapids.exec("(median " + dr._key + " true)").getNums()[0];
}
/**
 * Count (and cache) the non-ignored, non-response numeric features. As a side
 * effect also caches the index arrays _numFeats, _intCols, _dblCols, _binaryCols
 * and _intNotBinaryCols. If predictors were passed to the constructor, then any
 * values computed/cached are based on those predictors.
 * @return the number of numeric features
 */
public int numberOfNumericFeatures() {
  if( _numFeat!=-1 ) return _numFeat;
  ArrayList<Integer> idxs = new ArrayList<>();
  ArrayList<Integer> intCols = new ArrayList<>();
  ArrayList<Integer> dblCols = new ArrayList<>();
  ArrayList<Integer> binCols = new ArrayList<>();
  ArrayList<Integer> intNotBinCols = new ArrayList<>();
  int cnt=0;
  int idx=0;
  for(Vec v: _fr.vecs()) {
    boolean ignored = _cols[idx]._ignored;
    boolean response= _cols[idx]._response;
    if( v.isNumeric() && !ignored && !response) {
      cnt += 1;
      idxs.add(idx);
      boolean isInt = v.isInt();
      boolean isBin = v.isBinary();
      if( isInt ) intCols.add(idx);
      if( isBin ) binCols.add(idx);
      if( isInt && !isBin ) intNotBinCols.add(idx);
      if( !isInt ) dblCols.add(idx);  // cleanup: dropped redundant v.isNumeric() re-check (already true here)
    }
    idx++;
  }
  _numFeats = intListToA(idxs);
  _intCols = intListToA(intCols);
  _dblCols = intListToA(dblCols);
  _binaryCols = intListToA(binCols);
  _intNotBinaryCols = intListToA(intNotBinCols);
  return (_numFeat=cnt);
}
/**
 * Count (and cache) the non-ignored, non-response categorical features;
 * also caches their index array in _catFeats.
 */
public int numberOfCategoricalFeatures() {
  if (_catFeat != -1) return _catFeat;
  ArrayList<Integer> catIdxs = new ArrayList<>();
  int idx = 0;
  for (Vec v : _fr.vecs()) {
    boolean usable = v.isCategorical() && !_cols[idx]._ignored && !_cols[idx]._response;
    if (usable) catIdxs.add(idx);
    idx++;
  }
  _catFeats = intListToA(catIdxs);
  return (_catFeat = catIdxs.size());
}
/** Base constructor: records the frame/response and allocates the (empty) per-column metadata slots. */
public FrameMetadata(UserFeedback userFeedback, Frame fr, int response, String datasetName) {
  _datasetName=datasetName;
  _fr=fr;
  _response=response;
  _cols = new ColMeta[_fr.numCols()];  // filled in by the predictor-aware ctor or computeFrameMetaPass1()
  _userFeedback = userFeedback;
}
/** As above, with the classification/regression flag supplied up front. */
public FrameMetadata(UserFeedback userFeedback, Frame fr, int response, String datasetName, boolean isClassification) {
  this(userFeedback, fr,response,datasetName);
  _isClassification=isClassification;
}
/** Convenience: predictors given as column indexes rather than names. */
public FrameMetadata(UserFeedback userFeedback, Frame fr, int response, int[] predictors, String datasetName, boolean isClassification) {
  this(userFeedback, fr, response, intAtoStringA(predictors, fr.names()), datasetName, isClassification);
}
/**
 * Full constructor: builds ColMeta for every column; when a predictor list is
 * given, columns not in it are marked ignored.
 * @param predictors names of columns to use as predictors; null means all columns
 */
public FrameMetadata(UserFeedback userFeedback, Frame fr, int response, String[] predictors, String datasetName, boolean isClassification) {
  this(userFeedback, fr, response, datasetName, isClassification);
  _includeCols = predictors;
  if( null==_includeCols )
    for (int i = 0; i < _fr.numCols(); ++i)
      _cols[i] = new ColMeta(_fr.vec(i),_fr.name(i),i,i==_response);
  else {
    HashSet<String> preds = new HashSet<>();
    Collections.addAll(preds,_includeCols);
    for(int i=0;i<_fr.numCols();++i)
      _cols[i] = new ColMeta(_fr.vec(i),_fr.name(i),i,i==_response,!preds.contains(_fr.name(i)));  // last arg: ignored if not a listed predictor
  }
}
/** @return whether the response makes this a classification problem. */
public boolean isClassification() { return _isClassification; }

/** Lazily build and publish the names of all ignored columns. */
public String[] ignoredCols() { // publishes private field
  if( _ignoredCols==null ) {
    ArrayList<String> names = new ArrayList<>();
    for (ColMeta c : _cols)
      if (c._ignored)
        names.add(_fr.name(c._idx));
    _ignoredCols = names.toArray(new String[0]);
  }
  return _ignoredCols;
}
/** @return names of the columns actually used as predictors (all names minus the ignored ones), cached. */
public String[] includedCols() {
  if( _includeCols==null ) {
    // NOTE(review): ignoredCols() always assigns and returns a (possibly empty)
    // array, so this null branch looks dead -- confirm before removing.
    if( null==ignoredCols() ) return _includeCols = _fr.names();
    _includeCols = ArrayUtils.difference(_fr.names(), ignoredCols()); // clones _fr.names, so line above avoids one more copy
  }
  return _includeCols;
}
/**
 * Metadata of the response column, resolving and caching its index on first use.
 * @throws IllegalStateException if no column is marked as the response
 */
public ColMeta response() {
  if( -1==_response ) {
    for(int i=0;i<_cols.length;++i)
      if(_cols[i]._response) {
        _response=i; break;
      }
    // ROBUSTNESS: previously fell through to _cols[-1] (ArrayIndexOutOfBoundsException);
    // fail with an explicit message instead.
    if( -1==_response )
      throw new IllegalStateException("No response column found in frame " + _datasetName);
  }
  return _cols[_response];
}
/** @return whether train/test weights should be stratified on the response. */
public boolean stratify() { return response()._stratify; }

/** Lazily build the train/test weight Vecs (80/20 split), stratified when the response calls for it. */
public Vec[] weights() {
  if( _trainTestWeight != null ) return _trainTestWeight;
  ColMeta resp = response();
  if( resp._stratify )
    _trainTestWeight = AutoMLUtils.makeStratifiedWeights(resp._v, 0.8, resp._weightMult);
  else
    _trainTestWeight = AutoMLUtils.makeWeights(resp._v, 0.8, resp._weightMult);
  return _trainTestWeight;
}
// blocking call to compute 1st pass of column metadata
/**
 * Launch one MetaPass1 task per column, run them in parallel, and collect the
 * resulting ColMeta. Columns the pass marks ignored are dropped from the frame.
 * Blocks until all tasks complete.
 * @return this, for chaining
 */
public FrameMetadata computeFrameMetaPass1() {
  MetaPass1[] tasks = new MetaPass1[_fr.numCols()];
  for(int i=0; i<tasks.length; ++i)
    tasks[i] = new MetaPass1(i,this);
  // the response column's task decides classification vs regression
  _isClassification = tasks[_response]._isClassification;
  MetaCollector.ParallelTasks metaCollector = new MetaCollector.ParallelTasks<>(tasks);
  long start = System.currentTimeMillis();
  H2O.submitTask(metaCollector).join();  // block until every per-column task finishes
  _userFeedback.info(Stage.FeatureAnalysis,
    "Frame metadata analyzer pass 1 completed in " +
    (System.currentTimeMillis()-start)/1000. +
    " seconds");
  double sumTimeToMRTaskPerCol=0;
  ArrayList<Integer> dropCols=new ArrayList<>();
  for(MetaPass1 cmt: tasks) {
    // ignored columns are collected for removal; the rest land in _cols by index
    if( cmt._colMeta._ignored ) dropCols.add(cmt._colMeta._idx);
    else _cols[cmt._colMeta._idx] = cmt._colMeta;
    sumTimeToMRTaskPerCol+= cmt._elapsed;
  }
  _userFeedback.info(Stage.FeatureAnalysis,
    "Average time to analyze each column: " +
    String.format("%.5f", (sumTimeToMRTaskPerCol/tasks.length) / 1000.0) +
    " seconds");
  if( dropCols.size()>0 )
    dropIgnoredCols(intListToA(dropCols));
  return this;
}
/**
 * Remove the given column indexes from the frame, compact _cols to match the
 * frame's new layout, and invalidate every cached value.
 * @param dropCols indexes (into the pre-drop frame) of columns to remove
 */
private void dropIgnoredCols(int[] dropCols) {
  _userFeedback.info(Stage.FeatureAnalysis, "AutoML dropping " + dropCols.length + " ignored columns");
  Vec[] vecsToRemove = _fr.remove(dropCols);
  for(Vec v: vecsToRemove) v.remove();
  // compact _cols: dropped slots were left null by computeFrameMetaPass1, so
  // skip over nulls while copying the surviving ColMeta into a dense array
  ColMeta cm[] = new ColMeta[_fr.numCols()];
  int idx=0;
  for(int i=0;i<_fr.numCols();++i) {
    while(null==_cols[idx]) idx++;
    cm[i]=_cols[idx++];
  }
  _cols=cm;
  flushCachedItems();  // indexes and counts are stale after the drop
}
/** Reset every lazily-computed cache to its "not yet computed" sentinel. */
private void flushCachedItems() {
  // column-index caches
  _catFeats = null;
  _numFeats = null;
  _intCols = null;
  _dblCols = null;
  _binaryCols = null;
  _intNotBinaryCols = null;
  // name caches and scalar counters
  _ignoredCols = null;
  _includeCols = null;
  _response = -1;
  _naCnt = -1;
  _numFeat = -1;
  _catFeat = -1;
  _nclass = -1;
  _featsWithNa = -1;
  _rowsWithNa = -1;
  // per-frame summary statistics (-1 == uncomputed sentinel)
  _minSkewness = _maxSkewness = _meanSkewness = _stdSkewness = _medianSkewness = -1;
  _minKurtosis = _maxKurtosis = _meanKurtosis = _stdKurtosis = _medianKurtosis = -1;
  _minCardinality = _maxCardinality = _meanCardinality = _stdCardinality = _medianCardinality = -1;
}
/**
 * Map column indexes to their names.
 * @param select column indexes to look up
 * @param names full array of column names
 * @return names[select[0]], names[select[1]], ...
 */
public static String[] intAtoStringA(int[] select, String[] names) {
  String[] selected = new String[select.length];
  for (int i = 0; i < select.length; ++i)
    selected[i] = names[select[i]];
  return selected;
}
/**
 * Per-column first-pass analysis task: computes skewness/kurtosis via Rapids in
 * the constructor, then runs a histogram/moment MRTask when scheduled.
 */
private static class MetaPass1 extends H2O.H2OCountedCompleter<MetaPass1> {
  private final boolean _response; // compute class distribution & more granular histo
  private boolean _isClassification; // is this a classification problem?
  private double _mean; // mean of the column, passed to the HistTask for central moments
  private final ColMeta _colMeta; // result; also holds onto the DHistogram
  private long _elapsed; // time to mrtask, millis
  // log2 with a tiny epsilon; used for Sturges' bin-count rule below
  static double log2(double numerator) { return (Math.log(numerator))/Math.log(2)+1e-10; }
public MetaPass1(int idx, FrameMetadata fm) {
Vec v = fm._fr.vec(idx);
_response=fm._response==idx;
String colname = fm._fr.name(idx);
_colMeta = new ColMeta(v, colname, idx, _response);
if( _response ) _isClassification = _colMeta.isClassification();
_mean = v.mean();
if(v.isCategorical()){
_colMeta._cardinality = v.cardinality();
}else{
_colMeta._cardinality = 0;
}
int nbins = (int) Math.ceil(1 + log2(v.length())); // Sturges nbins
int xbins = (char) ((long) v.max() - (long) v.min());
if(!(_colMeta._ignored) && !(_colMeta._v.isBad()) && xbins > 0) {
_colMeta._histo = MetaCollector.DynamicHisto.makeDHistogram(colname, nbins, nbins, (byte) (v.isCategorical() ? 2 : (v.isInt() ? 1 : 0)), v.min(), v.max());
}
//skewness, kurtosis using rapids call
Key<Frame> key = Key.make("keyW");
String[] names= new String[1];
names[0] = "num1";
Vec[] vecs = new Vec[1];
vecs[0] = v;
Frame vec_tofr = new Frame(key,names, vecs);
DKV.put(vec_tofr);
//Skewness
String x = String.format("(skewness %s %s )",vec_tofr._key,true);
Val res1 = Rapids.exec(x);
double[] darray1 = res1.getNums();
_colMeta._skew = darray1[0];
//Kurtosis
String y = String.format("(kurtosis %s %s )",vec_tofr._key,true);
Val res2 = Rapids.exec(y);
double[] darray2 = res2.getNums();
_colMeta._kurtosis = darray2[0];
DKV.remove(vec_tofr._key);
//vec_tofr.remove();
}
  /** @return the per-column metadata produced by this pass. */
  public ColMeta meta() { return _colMeta; }
  /** @return wall-clock millis spent in the MRTask (0 when the histogram pass was skipped). */
  public long elapsed() { return _elapsed; }
  /** Runs the histogram/central-moment MRTask; skipped for ignored, bad, or constant columns. */
  @Override public void compute2() {
    long start = System.currentTimeMillis();
    int xbins = (char) ((long) _colMeta._v.max() - (long) _colMeta._v.min());  // 0 => constant column
    if (!(_colMeta._ignored) && !(_colMeta._v.isBad()) && xbins > 0) {
      HistTask t = new HistTask(_colMeta._histo, _mean).doAll(_colMeta._v);
      _elapsed = System.currentTimeMillis() - start;
      // normalize raw moment sums by (non-NA count - 1)
      _colMeta._thirdMoment = t._thirdMoment / ((_colMeta._v.length() - _colMeta._v.naCnt()) - 1);
      _colMeta._fourthMoment = t._fourthMoment / ((_colMeta._v.length() - _colMeta._v.naCnt()) - 1);
      _colMeta._MRTaskMillis = _elapsed;
      Log.info("completed MetaPass1 for col number: " + _colMeta._idx);
      //_colMeta._skew = _colMeta._thirdMoment / Math.sqrt(_colMeta._variance*_colMeta._variance*_colMeta._variance);
      //_colMeta._kurtosis = _colMeta._fourthMoment / (_colMeta._variance * _colMeta._variance);
    }
    tryComplete();  // required to complete this H2OCountedCompleter
  }
  /**
   * MRTask that fills the column's DHistogram and accumulates raw third/fourth
   * central-moment sums about {@code _mean} (normalization happens in compute2).
   */
  private static class HistTask extends MRTask<HistTask> {
    private DHistogram _h;        // shared histogram, updated atomically per bin
    private double _thirdMoment;  // sum of (x-mean)^3; used for skew/kurtosis; NaN if not numeric
    private double _fourthMoment; // sum of (x-mean)^4; used for skew/kurtosis; NaN if not numeric
    private double _mean;         // column mean, supplied by the caller
    HistTask(DHistogram h, double mean) { _h = h; _mean=mean; }
    @Override public void setupLocal() { _h.init(); }
    @Override public void map( Chunk C ) {
      double min = _h.find_min();
      double max = _h.find_maxIn();
      double[] bins = new double[_h._nbin];  // chunk-local bin counts, merged atomically below
      double colData;
      for(int r=0; r<C._len; ++r) {
        if( Double.isNaN(colData=C.atd(r)) ) continue;  // NAs contribute nothing
        if( colData < min ) min = colData;
        if( colData > max ) max = colData;
        bins[_h.bin(colData)]++; double delta = colData - _mean;
        double threeDelta = delta*delta*delta;
        _thirdMoment += threeDelta;
        _fourthMoment += threeDelta*delta;
      }
      _h.setMin(min); _h.setMaxIn(max);
      // fold local counts into the shared histogram
      for(int b=0; b<bins.length; ++b)
        if( bins[b]!=0 )
          _h.addWAtomic(b, bins[b]);
    }
    @Override public void reduce(HistTask t) {
      if( _h==t._h ) return;  // same shared histogram on this node -- nothing to merge
      if( _h==null ) _h=t._h;
      else if( t._h!=null )
        _h.add(t._h);
      // moments: treat NaN as "no data" so it never poisons the running sums
      if( !Double.isNaN(t._thirdMoment) ) {
        if( Double.isNaN(_thirdMoment) ) _thirdMoment = t._thirdMoment;
        else _thirdMoment += t._thirdMoment;
      }
      if( !Double.isNaN(t._fourthMoment) ) {
        if( Double.isNaN(_fourthMoment) ) _fourthMoment = t._fourthMoment;
        else _fourthMoment += t._fourthMoment;
      }
    }
  }
}
}