// DataAdapter.java example
//
// Explorer
// h2o-2-master
package hex.singlenoderf;


import jsr166y.ForkJoinTask;
import jsr166y.RecursiveAction;
import water.*;
import water.fvec.Frame;
import water.fvec.Vec;
import water.util.Log;
import water.util.Log.Tag.Sys;

import java.text.DecimalFormat;
import java.util.Arrays;

/** A DataAdapter maintains an encoding of the original data. Every raw value (of type float)
 * is represented by a short value. When the number of unique raw values is larger than binLimit,
 * the DataAdapter will perform binning on the data and use the same short encoded value to
 * represent several consecutive raw values.
 *
 * Columns holding only small non-negative integers (see {@link #isByteCol}) are not binned at
 * all; they are stored as unsigned bytes, with the byte value 255 marking a bad entry.
 *
 * Missing values, NaNs and Infinity are treated as BAD data. */
final class DataAdapter  {

  /** Place holder for missing data, NaN, Inf in short encoding.*/
  static final short BAD = Short.MIN_VALUE;

  /** Number of classes. */
  private final int _numClasses;
  /** Columns; the last one is the class (response) column. */
  final Col[]    _c;
  /** Seed for sampling */
  private final long     _seed;
  /** Number of rows */
  public  final int      _numRows;
  /** Class weights, or null when no weights were given or all of them equal 1.0 */
  public  final double[] _classWt;
  /** Use regression */
  public final boolean _regression;

  /** Key of the job this adapter works for; may be null, in which case no cancellation checks are done. */
  public Key _jobKey;

  /** Allocates one (still empty) column per column of the frame.
   *
   * @param fr           frame providing column names and vectors
   * @param model        model providing the job key, the regression flag and the number of classes
   * @param modelDataMap model-to-data column mapping; only consulted by an assertion here
   * @param rows         number of rows to allocate in every column
   * @param unique       value mixed (shifted left by 16 bits) into the sampling seed
   * @param seed         base seed for sampling
   * @param binLimit     maximum number of bins for a binned column
   * @param classWt      class weights, may be null
   * @throws Job.JobCancelledException if the model's job stops running while columns are set up */
  DataAdapter(Frame fr, SpeeDRFModel model, int[] modelDataMap, int rows,
              long unique, long seed, int binLimit, double[] classWt) {
//    assert model._dataKey == fr._key;
    _seed       = seed+(unique<<16); // This is important to preserve sampling selection!!!
    _numRows    = rows;
    _jobKey     = model.jobKey;
    _numClasses = model.regression ? 1 : model.classes();
    _regression = model.regression;
    _c = new Col[fr.numCols()];
    for( int i = 0; i < _c.length; i++ ) {
      if(model.jobKey != null && !Job.isRunning(model.jobKey)) throw new Job.JobCancelledException();
      assert fr._names[modelDataMap[i]].equals(fr._names[i]);
      Vec v = fr.vecs()[i];
      boolean isClass = i == _c.length-1;
      if( isByteCol(v, rows, isClass, _regression) ) // we do not bin for small values
        _c[i] = new Col(fr._names[i], rows, isClass);
      else
        _c[i] = new Col(fr._names[i], rows, isClass, binLimit, !(v.isEnum() || v.isInt()));
    }
    // Weights that are all 1.0 carry no information => drop them.
    boolean trivial = true;
    if (classWt != null) for(double f: classWt) if (f != 1.0) trivial = false;
    _classWt = trivial ?  null : classWt;
  }

  /** Decides whether a column can be stored as raw unsigned bytes instead of being binned.
   * That is the case for a non-class integer/enum column that has exactly {@code rows} rows
   * and whose values all lie in [0,255].
   *
   * The {@code regression} flag does not influence the result (both modes always applied the
   * same test); the parameter is kept for the callers.
   *
   * NOTE(review): a maximum of exactly 255 is accepted although {@link Col} uses the byte 255
   * as its bad-value marker, so such values read back as bad - confirm this is intended. */
  static boolean isByteCol( Vec C, int rows, boolean isClass, boolean regression) {
    return !isClass && (C.isInt() || C.isEnum()) && C.min() >= 0 && C.length() == rows && C.max() < 256;
  }

  /** Given a value in enum format, returns:  the value in the original format if no
   * binning was applied,  or if binning was applied a value that is in between
   * the value of bin idx and the value of the next bin.  If idx is the last bin the
   * result is the bin's value + 0.5.  Returns NaN for {@link #BAD}. */
  public float unmap(int col, int idx){ return _c[col].rawSplit(idx); }
  public boolean isFloat(int col) { return _c[col].isFloat(); }
  public long seed()          { return _seed; }
  public int columns()        { return _c.length;}
  /** Encoded value of the class column in the given row. */
  public int classOf(int idx) { return _c[_c.length-1].get(idx); }
  /** The number of possible prediction classes. */
  public int classes()        { return _numClasses; }
  /** Transforms given binned index (short) from class column into a value from interval [0..N-1]
   * corresponding to a particular predictor class */
  public int unmapClass(int clazz) {
    Col c = _c[_c.length-1];
    if (c._isByte)
      return clazz;
    else {
      // OK, this is not fully correct: it badly handles corner-cases, for example a dataset that uses
      // only classes 0 and 3. Our API reports that there are 4 classes but in fact there are only 2 classes.
      if (clazz >= c._binned2raw.length) clazz = c._binned2raw.length - 1;
      return (int) (c.raw(clazz) - c._min);
    }
  }

  /** Returns the number of bins, i.e. the number of distinct values in the column
   * (-1 for an ignored column).  */
  public int columnArity(int col) { return _c[col].arity(); }
  public int columnArityOfClassCol() { return _c[_c.length - 1].arity(); }

  /** Return a short that represents the binned value of the original row,column value.  */
  public short getEncodedColumnValue(int row, int col) { return _c[col].get(row); }
  public short getEncodedClassColumnValue(int row) { return _c[_c.length-1].get(row); }
  public double getRawColumnValue(int row, int col) { return _c[col].getRaw(row); }
  /** Returns the value of the class column in the given row: the stored unsigned byte for a
   * byte column, otherwise the raw value representing the row's bin. */
  public float getRawClassColumnValueFromBin(int row) {
    Col c = _c[_c.length-1];
    if (c._binned == null) // no binned encoding => values live in the byte array
      return (float)(0xFF & c._rawB[row]);
    return c._binned2raw[c.get(row)];
  }

  /** Encodes all columns (see {@link Col#shrink()}), one fork-join task per column.
   * @throws Job.JobCancelledException if the job is not running anymore */
  public void shrink() {
    if(_jobKey != null && !Job.isRunning(_jobKey)) throw new Job.JobCancelledException();
//    for ( Col c: _c) c.shrink();
    // sort columns in parallel: c.shrink() calls single-threaded Arrays.sort()
    RecursiveAction [] ras = new RecursiveAction[_c.length];
    int i=0;
    for ( final Col c: _c) {
      ras[i++] = new RecursiveAction() {
        @Override public void compute() { c.shrink(); }
      };
    }
    ForkJoinTask.invokeAll(ras);
  }

  public String columnName(int i) { return _c[i].name(); }

  /** A value is invalid only if it is infinite and the column is a float column. */
  public boolean isValid(int col, float f) {
    return !_c[col].isFloat() || !Float.isInfinite(f);
  }

  /** Stores a raw float value (binned columns only). */
  public final void    add(float v, int row, int col) {
    _c[col].add (row,v); }
  /** Stores a raw byte value (byte columns only). */
  public final void    add1(int  v, int row, int col) {
    _c[col].add1(row,v); }
  public final void    addBad(int row, int col)       { _c[col].addBad(row); }
  public final boolean hasBadValue(int row, int col)  { return _c[col].isBad(row); }
  /** True if the encoded class value of the row is bad. */
  public final boolean isBadRow(int row)              { return _c[_c.length-1].isBad(row); }
  /** True if the raw (not yet encoded) class value of the row is bad. */
  public final boolean isBadRowRaw(int row)           { return _c[_c.length-1].isBadRaw(row); }
  public final boolean isIgnored(int col)             { return _c[col].isIgnored(); }
  /** Marks a row as ignored by storing a bad value into its class column. */
  public final void    markIgnoredRow(int row)        { _c[_c.length-1].addBad(row);  }
  public final int     classColIdx()                  { return _c.length - 1; }
  public final boolean hasAnyInvalid(int col)         { return _c[col]._invalidValues!=0; }

  /** A single column, stored either as raw unsigned bytes or as floats that {@link #shrink()}
   * replaces by short bin indices. */
  static class Col {
    /** Encoded values*/
    short[] _binned;
    /** Original values, kept only during inhale*/
    float[] _raw;
    /** Original values which we do not want to bin */
    byte[]  _rawB;
    /** Map from binned to original: the largest raw value of every bin, ascending */
    float[] _binned2raw;
    final boolean _isClass, _isFloat, _isByte;
    final int _colBinLimit;
    final String _name;
    /** Total number of bad values in the column. */
    int _invalidValues;
    float _min, _max;
    int _arity;

    /** Shared formatter; DecimalFormat is not thread-safe, so synchronize on it when formatting. */
    static final DecimalFormat df = new  DecimalFormat ("0.##");
    boolean _ignored;

    /** Creates a byte column (values are never binned). */
    Col(String s, int rows, boolean isClass) {
      _name = s; _isClass = isClass;
      _rawB = MemoryManager.malloc1(rows);
      _isFloat = false;
      _isByte  = true;
      _colBinLimit = 0;
    }

    /** Creates a float-backed column that {@link #shrink()} encodes into at most binLimit bins. */
    Col(String s, int rows, boolean isClass, int binLimit, boolean isFloat) {
      _name = s; _isFloat = isFloat; _isClass = isClass; _colBinLimit = binLimit; _isByte = false;
      _raw = MemoryManager.malloc4f(rows);
      _ignored = false;
    }

    boolean isFloat()      { return _isFloat; }
    boolean isIgnored()    { return _ignored; }
    int arity()            { return _ignored ? -1 : _arity; }
    String name()          { return _name;        }
    short get(int row)     { return (short) (_isByte ? (_rawB[row]&0xFF) : _binned[row]); }
    double getRaw(int row) { return (double)(_isByte ? (_rawB[row]&0xFF) : _binned2raw[_binned[row]]);}

    void add(int row, float val) {
      _raw [row] = val; }
    void add1(int row, int  val) {
      _rawB[row] = (byte)val; }
    void addBad(int row)         { if (!_isByte) _raw[row] = Float.NaN; else _rawB[row] = (byte)255; }

    /** Bad raw values are represented by NaN. */
    private static boolean isBadValue(float f) { return Float.isNaN(f); }

    /** True if the raw value stored in the given row is bad. Falls back to the encoded value
     * once the raw floats were released by {@link #shrink()}; an ignored column has only bad values. */
    boolean isBadRaw(int row) {
      if (_isByte) return (_rawB[row]&0xFF)==255;
      if (_raw != null) return isBadValue(_raw[row]);
      return _ignored || _binned[row] == BAD;
    }

    boolean isBad(int row)            {
      return _isByte ? (_rawB[row]&0xFF)==255 : _binned[row] == BAD;
    }

    /** For all columns - encode all floats as unique shorts.
     * Byte columns are left untouched. A column holding only NaNs is marked as ignored.
     * Otherwise the distinct values are spread over at most _colBinLimit bins, every row gets
     * the index of its bin (or BAD for NaN) and the raw floats are released. */
    void shrink() {
      if (_isByte) {
        _arity = 256;
        return ; // do not shrink byte columns
      }
      float[] vs = _raw.clone();
      // Arrays.sort/binarySearch order -0.0f before 0.0f while == treats them as equal;
      // adding +0.0f turns -0.0f into 0.0f and leaves every other value (NaN included) unchanged.
      for (int k = 0; k < vs.length; k++) vs[k] += 0.0f;
      Arrays.sort(vs); // Sort puts all Float.NaN at the end of the array (according Float.NaN doc)
      int nans = 0;    // Counter of all NaNs
      while (nans < vs.length && isBadValue(vs[vs.length - 1 - nans])) ++nans;
      final int valid = vs.length - nans; // vs[0..valid) holds the sorted non-NaN values
      _invalidValues = nans;
      if ( valid == 0 ) {
        // too many NaNs in the column => ignore it
        _ignored = true;
        _raw     = null;
        Log.info(Sys.RANDF, "Ignore column: " + this);
        return;
      }
      int n = 1; // number of distinct valid values
      for (int k = 1; k < valid; k++) if (vs[k] != vs[k-1]) ++n;
      int rem = n % _colBinLimit;
      // With binning, the first 'rem' bins take one value more than the remaining ones.
      int maxBinSize = (n > _colBinLimit) ? (n / _colBinLimit + Math.min(rem,1)) : 1;
      // Assign shorts to floats, with binning.
      _binned2raw = MemoryManager.malloc4f(Math.min(n, _colBinLimit)); // if n is smaller than bin limit no need to compact
      int smax = 0, cntCurBin = 1;
      _binned2raw[0] = vs[0];
      for (int k = 1; k < valid; ++k) {
        if(vs[k] == _binned2raw[smax]) continue; // remove dups
        if( ++cntCurBin > maxBinSize ) {
          if(rem > 0 && --rem == 0)--maxBinSize; // check if we can reduce the bin size
          ++smax;
          cntCurBin = 1;
        }
        _binned2raw[smax] = vs[k]; // a bin is represented by its largest value
      }
      ++smax;
      // All Float.NaN are at the end of vs => min is stored in vs[0], max right before the NaNs
      _min = vs[0];
      _max = vs[valid - 1];
      vs = null; // GCed
      _binned = MemoryManager.malloc2(_raw.length);
      // Find the bin value by lookup in bin2raw array which is sorted so we can do binary lookup.
      for (int k = 0; k < _raw.length; k++) {
        if (isBadValue(_raw[k])) {
          _binned[k] = BAD;
          continue;
        }
        int idx = Arrays.binarySearch(_binned2raw, _raw[k] + 0.0f);
        // Not found: the value lies inside a bin; the insertion point is the index of the
        // first bin whose largest value exceeds it, i.e. the bin containing the value.
        if (idx < 0) idx = -idx - 1;
        _binned[k] = (short) idx;
        assert _binned[k] < _binned2raw.length;
      }
      if( n > _colBinLimit )   Log.info(Sys.RANDF,this+" this column's arity was cut from "+n+" to "+smax);
      _arity = _binned2raw.length;
      _raw = null; // GCed
    }

    /**Given an encoded short value, return the original float*/
    public float raw(int idx) { return _binned2raw[idx]; }

    /**Given an encoded short value, return the float that splits that value with the next.
     * Byte columns return idx itself, BAD returns NaN and the last bin returns its value + 0.5. */
    public float rawSplit(int idx){
      if (_isByte) return idx; // treat index as value
      if (idx == BAD) return Float.NaN;
      float flo = _binned2raw[idx]; // Convert to the original values
      float fhi = (idx+1 < _binned2raw.length)? _binned2raw[idx+1] : flo+1.f;
      //assert flo < fmid && fmid < fhi : "Values " + flo +","+fhi ; // Assert that the float will properly split
      return (flo+fhi)/2.0f;
    }

    int rows() { return _isByte ? _rawB.length : _binned.length; }

    @Override public String toString() {
      String res = "Column("+_name+"){";
      if (_ignored) res+="IGNORED";
      else {
        // Columns are shrunk (and logged) in parallel while DecimalFormat is not thread-safe.
        synchronized (df) {
          res+= " ["+df.format(_min) +","+df.format(_max)+"]";
        }
        res+=",bad values=" + _invalidValues + "/" + rows();
        if (_isClass) res+= " CLASS ";
      }
      res += "}";
      return res;
    }
  }
}