package hex.singlenoderf;

import hex.singlenoderf.SpeeDRF.DRFParams;
import jsr166y.ForkJoinTask;
import jsr166y.RecursiveAction;
import water.Job;
import water.Key;
import water.Timer;
import water.UKV;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.util.Log;

import java.util.ArrayList;

public class DABuilder {

  protected final DRFParams _rfParams;
  protected final Key _rfModel;

  static DABuilder create(final DRFParams rfParams, final Key rfModel) {
    switch( rfParams.sampling_strategy ) {
      case RANDOM:
      default:
        return new DABuilder(rfParams, rfModel);
    }
  }

  DABuilder(final DRFParams rfparams, final Key rfmodel) {
    _rfParams = rfparams;
    _rfModel = rfmodel;
  }

  final DataAdapter build(Frame fr, boolean useNonLocal) {
    return inhaleData(fr, useNonLocal);
  }

  /** Check that we have a proper number of valid columns vs. features selected; if not, cap the feature count. */
  private void checkAndLimitFeatureUsedPerSplit(Frame fr) {
    int validCols = fr.numCols() - 1; // for classIdx column
    if (validCols < _rfParams.num_split_features) {
      Log.info(Log.Tag.Sys.RANDF, "Limiting features from " + _rfParams.num_split_features +
          " to " + validCols + " because there are no more valid columns in the dataset");
      _rfParams.num_split_features = validCols;
    }
  }

  /** Return the number of rows on this node. */
  private int getRowCount(Frame fr) {
    return (int) fr.numRows();
  }

  /** Return the chunk index of the first chunk on this node. Used to identify the trees built here. */
  private long getChunkId(final Frame fr) {
    Key[] keys = new Key[fr.anyVec().nChunks()];
    for (int i = 0; i < fr.anyVec().nChunks(); ++i) {
      keys[i] = fr.anyVec().chunkKey(i);
    }
    for (int i = 0; i < keys.length; ++i) {
      if (keys[i].home()) return i;
    }
    return -99999; //throw new Error("No key on this node");
  }

  private static int find(String n, String[] names) {
    if (n == null) return -1;
    for (int j = 0; j < names.length; j++)
      if (n.equals(names[j])) return j;
    return -1;
  }

  public static int[] colMap(String[] frame_names, String[] model_names) {
    int[] mapping = new int[frame_names.length];
    for (int i = 0; i < mapping.length; i++)
      mapping[i] = find(frame_names[i], model_names);
    return mapping;
  }

  /** Build the data adapter for the given frame. */
  protected DataAdapter inhaleData(Frame fr, boolean useNonLocal) {
    Log.info("Prepping for data inhale.");
    long id = getChunkId(fr);
    if (id == -99999) {
      return null;
    }
    Timer t_inhale = new Timer();
    final SpeeDRFModel rfmodel = UKV.get(_rfModel);

    // Record which columns can be stored as single bytes and how many NAs each column has.
    boolean[] _isByteCol = new boolean[fr.numCols()];
    long[] _naCnts = new long[fr.numCols()];
    for (int i = 0; i < _isByteCol.length; ++i) {
      _isByteCol[i] = DataAdapter.isByteCol(fr.vecs()[i], (int) fr.numRows(), i == _isByteCol.length - 1, rfmodel.regression);
      _naCnts[i] = fr.vecs()[i].naCnt();
    }

    // The model columns are dense packed - but there will be columns in the
    // data being ignored. This is a map from the model's columns to the
    // building dataset's columns.
    final int[] modelDataMap = colMap(fr._names, rfmodel._names);
    final int totalRows = getRowCount(fr);
    final DataAdapter dapt = new DataAdapter(fr, rfmodel, modelDataMap, totalRows, getChunkId(fr),
        _rfParams.seed, _rfParams.bin_limit, _rfParams.class_weights);

    // Check that we have a proper number of valid columns vs. features selected; if not, cap.
    checkAndLimitFeatureUsedPerSplit(fr);

    // Collect jobs loading local (and, if requested, remote) chunks.
    ArrayList<RecursiveAction> dataInhaleJobs = new ArrayList<RecursiveAction>();
    Log.info("\n\nTotal Number of Chunks: " + fr.anyVec().nChunks() + "\n\n");
    int cnter_local = 0;
    int cnter_remote = 0;
    for (int i = 0; i < fr.anyVec().nChunks(); ++i) {
      if (useNonLocal) {
        if (fr.anyVec().chunkKey(i).home()) {
          cnter_local++;
        } else {
          cnter_remote++;
        }
        dataInhaleJobs.add(loadChunkAction(dapt, fr, i, _isByteCol, _naCnts, rfmodel.regression));
      } else if (fr.anyVec().chunkKey(i).home()) {
        cnter_local++;
        dataInhaleJobs.add(loadChunkAction(dapt, fr, i, _isByteCol, _naCnts, rfmodel.regression));
      }
    }
    Log.info("\n\nTotal local chunks to load: " + cnter_local + "\n\nTotal remote chunks to load: " + cnter_remote);

    SpeeDRF.DRFTask.updateRFModelStatus(_rfModel, "Inhaling Data.");
    Log.info(Log.Tag.Sys.RANDF, "Beginning Random Forest Inhale.");
    ForkJoinTask.invokeAll(dataInhaleJobs);
    if (dapt._jobKey != null && !Job.isRunning(dapt._jobKey))
      throw new Job.JobCancelledException();

    // Shrink data
    dapt.shrink();
    if (dapt._jobKey != null && !Job.isRunning(dapt._jobKey))
      throw new Job.JobCancelledException();
    Log.info(Log.Tag.Sys.RANDF, "Inhale done in " + t_inhale);
    return dapt;
  }

  static RecursiveAction loadChunkAction(final DataAdapter dapt, final Frame fr, final int cidx,
                                         final boolean[] isByteCol, final long[] naCnts, boolean regression) {
    return new RecursiveAction() {
      @Override protected void compute() {
        if (dapt._jobKey != null && !Job.isRunning(dapt._jobKey))
          throw new Job.JobCancelledException();
        try {
          Chunk[] chks = new Chunk[fr.numCols()];
          int ncolumns = chks.length;
          for (int i = 0; i < chks.length; ++i) {
            chks[i] = fr.vecs()[i].chunkForChunkIdx(cidx);
          }
          for (int j = 0; j < chks[0]._len; ++j) {
            if (dapt._jobKey != null && !Job.isRunning(dapt._jobKey))
              throw new Job.JobCancelledException();
            int rowNum = (int) chks[0]._start + j;
            boolean rowIsValid = false;
            for (int c = 0; c < chks.length; ++c) {
              if (naCnts[c] > 0) {
                if (chks[c].isNA0(j)) {
                  if (c == ncolumns - 1) rowIsValid = false; // NA in the response column invalidates the row
                  dapt.addBad(rowNum, c);
                  continue;
                }
              }
              if (isByteCol[c]) {
                int val = (int) chks[c].at8(rowNum);
                dapt.add1(val, rowNum, c);
              } else {
                float f = (float) chks[c].at(rowNum);
                if (!dapt.isValid(c, f)) {
                  dapt.addBad(rowNum, c);
                  continue;
                }
                dapt.add(f, rowNum, c);
              }
              if (c != ncolumns - 1) {
                rowIsValid |= true; // at least one valid predictor value was seen for this row
              }
            }
            if (!rowIsValid) dapt.markIgnoredRow(j);
          }
        } catch (Throwable t) {
          // Ignored: a chunk that fails to load is silently skipped.
        }
      }
    };
  }
}