package ai.h2o.automl.colmeta;
import hex.tree.DHistogram;
import water.Iced;
import water.fvec.Vec;
import java.util.HashMap;
/**
 * Column Meta Data
 *
 * Holds the usual rollup stats and additional interesting bits of information
 * about a single column ({@link Vec}).
 */
public class ColMeta extends Iced {
  /**
   * Keys of the metadata map created by {@link #makeEmptyColMeta()} and populated by
   * {@link #fillColMeta(HashMap, int)}.
   */
  public static final String[] METAVALUES = new String[]{
      "idFrame", "ColumnName", "ColumnType", "Min", "Max", "Mean", "Median",
      "Variance", "Cardinality", "Kurtosis", "Skew", "VIF", "FractionNA",
      "TimeToMRTaskMillis"};

  /** Index of the median in {@code Vec.pctiles()} (p=0.5 pctile; see Vec.PERCENTILES). */
  private static final int MEDIAN_PCTILE_IDX = 8;

  //private static transient Set<Class<? extends Guesser>> _guesserClasses;
  // Not referenced anywhere in this class. ('transient' has no effect on a static field, so it is omitted.)
  private static String[] _guessers;

  public final Vec _v;              // the column
  public byte _nameType;            // guessed at by ColNameScanner
  public final String _name;        // column name
  public final int _idx;            // index into the input frame from automl
  public boolean _ignored;          // should this column be ignored outright
  public boolean _response;         // is this a response column?
  public double _percentNA;         // fraction of NAs in the column (naCnt / length)
  public double _variance;          // variance of the column, computed as _sigma * _sigma
  public double _sigma;             // pulled from vec rollups
  public boolean _stratify;         // do stratified sampling when building weight columns
  public double[] _dist;            // distribution of classes
  public double[] _weightMult;      // weight multipliers for each class to even out distributions
  public boolean _isNumeric;        // numeric vec that is neither ignored nor the response
  public boolean _isCategorical;    // categorical vec that is neither ignored nor the response

  /** Sentinel stored by {@link #fillColMeta(HashMap, int)} for statistics that do not apply to the column type. */
  public static final double SQLNAN = -99999;

  public boolean _isClass;          // is a classification problem, only valid to ask when _response is true

  /**
   * Reports whether the problem is a classification problem.
   *
   * @return the value of {@code _isClass}
   * @throws IllegalArgumentException if this metadata does not describe the response column
   */
  public boolean isClassification() {
    if (!_response) {
      throw new IllegalArgumentException("Cannot ask non-response metadata if problem is classification");
    }
    return _isClass;
  }

  /*
   * FIRST PASS
   *
   * Meta data collected on the first pass over this column.
   *
   * This is done by computing a very coarse histogram (~500 bins) for numeric vectors,
   * and building a table of occurrences for enum/string vectors (see NB below).
   *
   * Additionally, the time it takes to MRTask over the column is collected and stored.
   * This will be helpful in understanding the time it takes to build DTree instances in
   * the tree-based models.
   *
   * NB: For any table'ing done on a vec, there's a limit of 10K uniques before a more
   *     efficient (i.e., MDowle-style radix sort) scheme should be engaged.
   */
  // (discussion) need to have guardrails around data and possibly some warning or error
  // about data that doesn't fit the distribution that was trained. Will want to compare
  // histos from training and testing data to see if they "match". WTM
  public DHistogram _histo;
  public long _MRTaskMillis;
  public double _thirdMoment;       // used for skew/kurtosis; NaN if not numeric
  public double _fourthMoment;      // used for skew/kurtosis; NaN if not numeric
  public double _kurtosis;          // the sharpness of the peak of a frequency-distribution curve
  public double _skew;              // measure of the asymmetry of a distribution; < 0 means shifted to the right; > 0 means shifted to the left
  public int _cardinality;          // length of domain

  // VIF
  public double _vif;               // vifs computed by FrameMeta; the constructor initialises it to -1

  // SECOND PASS
  // https://0xdata.atlassian.net/browse/STEAM-41 --column metadata to gather
  public long _numUniques;
  public double _avgUniquesPerChunk;             // number of uniques per chunk divided by number of chunks
  public boolean _chunksMonotonicallyIncreasing; // indicates some weak ordering in the dataset (by this column)
  public double[] _chunkBoundaries;              // boundary values for each chunk [c1.min, c1.max, c2.min, c2.max, ...]; only for numeric vecs
  public double _median;

  // is this an ID for IID data?
  //   - all unique
  //   - increasing ints
  public boolean _isRowBasedId;

  // is this an ID for non-IID data, like timeseries?
  public boolean _isNonIidId;

  // is it a date column?
  //   - has date info (possibly a month column since values are all 1-12, or 0-11)
  //   - possibly next to other date columns?
  public boolean _isDate;

  /*
   * Design note (nothing in this class computes these metrics):
   *
   * Build metrics against the response column for this vec.
   *
   * Four scenarios:
   *      this vec     response vec
   *   1. Categorical, Categorical - build int[this_vec.domain()][counts_per_response_cat]
   *   2. Categorical, Numerical   - build double[this_vec.domain()][mean,sd]
   *   3. Numerical,   Categorical - build double[response_vec.domain()][mean,sd]
   *   4. Numerical,   Numerical   - not defined, could do a stupid thing with quantile
   */

  /**
   * Creates the metadata for one column and seeds it from the vec's rollups:
   * NA fraction ({@code naCnt / length}), sigma and variance ({@code sigma * sigma}).
   * {@code _vif} starts at -1.
   *
   * @param v        the column
   * @param colname  column name
   * @param idx      index of the column in the input frame
   * @param response whether this column is the response
   * @param ignored  whether this column should be ignored outright
   */
  public ColMeta(Vec v, String colname, int idx, boolean response, boolean ignored) {
    _v = v;
    _name = colname;
    _ignored = ignored;
    //if(_ignored) _ignoredReason = IgnoreReason.user_specified;
    _idx = idx;
    //_nameType=ColNameScanner.IGNORED;
    _response = response;
    _percentNA = (double) v.naCnt() / (double) v.length();
    _sigma = v.sigma();
    _variance = _sigma * _sigma;
    _vif = -1;
    // Only columns that are neither ignored nor the response are flagged by type.
    _isNumeric = v.isNumeric() && !ignored && !response;
    _isCategorical = v.isCategorical() && !ignored && !response;
  }

  /**
   * Creates the metadata for a column that is not ignored.
   *
   * @param v        the column
   * @param colname  column name
   * @param idx      index of the column in the input frame
   * @param response whether this column is the response
   */
  public ColMeta(Vec v, String colname, int idx, boolean response) {
    this(v, colname, idx, response, false);
  }

  /**
   * Builds a map holding every key of {@link #METAVALUES}, each mapped to {@code null}.
   *
   * @return a new, mutable map ready to be passed to {@link #fillColMeta(HashMap, int)}
   */
  public static HashMap<String, Object> makeEmptyColMeta() {
    HashMap<String, Object> hm = new HashMap<>();
    for (String key : ColMeta.METAVALUES) {
      hm.put(key, null);
    }
    return hm;
  }

  // TODO: enum, please!
  /**
   * Picks the name of a basic transform for this column. The checks run in order and the
   * first match wins.
   *
   * @return one of {@code "ignored"}, {@code "none"}, {@code "time"}, {@code "log"} or {@code "recip"}
   */
  public String selectBasicTransform() {
    if (_ignored) {
      return "ignored";
    }
    if (_v.isBinary()) {
      return "none";
    }
    if (_v.isTime() || _isDate) {
      return "time";  // actually we have a time/date column, so apply some time transforms
    }
    // Take a log when the value range (max - min) exceeds 1e4.
    // NOTE(review): this is an absolute range test, not a ratio, and it is reached by every
    // non-binary, non-time column regardless of type -- confirm that this is intended.
    if (_v.max() - _v.min() > 1e4) {
      return "log";
    }
    if (_v.isNumeric() && !_v.isInt()) {
      return "recip"; // try the reciprocal!
    }
    return "none";    // no transform if not interesting
  }

  /**
   * Writes this column's metadata into {@code cm} under the keys listed in {@link #METAVALUES}.
   *
   * For numeric vecs the statistics are filled in and "Cardinality" is {@link #SQLNAN};
   * for all other vecs the statistics are {@link #SQLNAN} and "Cardinality" is the vec's cardinality.
   *
   * @param cm      map to populate; existing entries for these keys are overwritten
   * @param idFrame value stored under the "idFrame" key
   */
  public void fillColMeta(HashMap<String, Object> cm, int idFrame) {
    cm.put("idFrame", idFrame);
    cm.put("ColumnName", _name);
    cm.put("ColumnType", _v.get_type_str()); // TODO:
    if (_v.isNumeric()) {
      cm.put("Min", _v.min());
      cm.put("Max", _v.max());
      cm.put("Mean", _v.mean());
      cm.put("Median", _v.pctiles()[MEDIAN_PCTILE_IDX]);
      cm.put("Variance", _v.sigma() * _v.sigma());
      cm.put("Cardinality", SQLNAN);
      cm.put("Kurtosis", _kurtosis);
      cm.put("Skew", _skew);
      cm.put("VIF", _vif);
    } else {
      cm.put("Min", SQLNAN);
      cm.put("Max", SQLNAN);
      cm.put("Mean", SQLNAN);
      cm.put("Median", SQLNAN);
      cm.put("Variance", SQLNAN);
      cm.put("Cardinality", _v.cardinality());
      cm.put("Kurtosis", SQLNAN);
      cm.put("Skew", SQLNAN);
      cm.put("VIF", SQLNAN);
    }
    cm.put("FractionNA", (double) _v.naCnt() / (double) _v.length());
    cm.put("TimeToMRTaskMillis", _MRTaskMillis);
  }
}