package hex.deeplearning;

import hex.genmodel.utils.DistributionFamily;
import hex.deeplearning.DeepLearningModel.DeepLearningParameters;
import hex.DataInfo;
import hex.FrameTask;
import water.DKV;
import water.H2O;
import water.IcedUtils;
import water.Key;
import water.util.Log;
import water.util.RandomUtils;

import java.util.Arrays;
import java.util.Random;

public class DeepLearningTask extends FrameTask<DeepLearningTask> {
  final private boolean _training;
  private DeepLearningModelInfo _localmodel; //per-node state (to be reduced)
  private DeepLearningModelInfo _sharedmodel; //input/output
  transient Neurons[] _neurons;
  transient Random _dropout_rng;
  int _chunk_node_count = 1;

  /**
   * Accessor to the object containing the (final) state of the Deep Learning model.
   * Should only be queried after calling this.doAll(Frame training).
   * @return "The" final model after one Map/Reduce iteration
   */
  final public DeepLearningModelInfo model_info() {
    assert(_sharedmodel != null);
    return _sharedmodel;
  }

  /**
   * Convenience constructor (delegates to the full constructor below)
   * @param jobKey Key of the Job driving this training run
   * @param inputModel Initial model state
   * @param fraction Fraction of rows of the training frame to train with
   * @param iteration Map/Reduce iteration counter
   */
  public DeepLearningTask(Key jobKey, DeepLearningModelInfo inputModel, float fraction, int iteration){
    this(jobKey, inputModel, fraction, iteration, null);
  }

  public DeepLearningTask(Key jobKey, DeepLearningModelInfo inputModel, float fraction, int iteration, H2O.H2OCountedCompleter cmp){
    super(jobKey, inputModel.data_info(), inputModel.get_params()._seed + inputModel.get_processed_global(), iteration, inputModel.get_params()._sparse, cmp);
    assert(inputModel.get_processed_local() == 0);
    _training = true;
    _sharedmodel = inputModel;
//    if (model_info().get_params()._elastic_averaging)
//      DKV.put(_sharedmodel.elasticAverageModelInfoKey(), _sharedmodel);
    _useFraction = fraction;
    _shuffle = model_info().get_params()._shuffle_training_data;
  }

  /**
   * Transfer ownership from the global (shared) model to the local model, which will be worked on
   */
  @Override protected void setupLocal(){
    assert(_localmodel == null);
    super.setupLocal();
    if (model_info().get_params()._elastic_averaging) {
      //Load my local model from DKV, to continue training
      _localmodel = DKV.getGet(_sharedmodel.localModelInfoKey(H2O.SELF));
      if (_localmodel != null) {
        if (!Arrays.equals(_localmodel.units, _sharedmodel.units)) {
          _localmodel = IcedUtils.deepCopy(_sharedmodel);
        } else {
          //Make sure that the local model has the right global (shared) parameters after checkpoint restart!
          _localmodel.set_params(_sharedmodel.get_params(), _sharedmodel._model_id);
          _localmodel.set_processed_global(_sharedmodel.get_processed_global());
        }
      } else {
        // first time around - use the randomized initial weights and don't spread the shared (random) model
        _localmodel = IcedUtils.deepCopy(_sharedmodel);
        _sharedmodel = null;
      }
    } else {
      _localmodel = _sharedmodel;
      _sharedmodel = null;
    }
    _localmodel.set_processed_local(0);
  }

  // Create local workspace (neurons) and link them to shared weights
  @Override protected boolean chunkInit(){
    if (_localmodel.get_processed_local() >= _useFraction * _fr.numRows()) return false;
    _neurons = makeNeuronsForTraining(_localmodel);
    _dropout_rng = RandomUtils.getRNG(System.currentTimeMillis());
    return true;
  }

  /**
   * Process one training row at a time (online learning)
   * @param seed Seed is only used if reproducible mode is enabled
   * @param r Row (must be dense for now)
   * @param mb mini-batch internal index
   */
  @Override public final void processRow(long seed, DataInfo.Row r, int mb) {
    if (_localmodel.get_params()._reproducible) {
      seed += _localmodel.get_processed_global(); //avoid periodicity
    } else {
      seed = _dropout_rng.nextLong(); // non-reproducible case - make a fast & good random number
    }
    _localmodel.checkMissingCats(r.binIds);
    ((Neurons.Input) _neurons[0]).setInput(seed, r.isSparse() ? r.numIds : null, r.numVals, r.nBins, r.binIds, mb);
  }

  /**
   * Apply the gradient to update the weights
   * @param seed Seed for dropout (only deterministic in reproducible mode)
   * @param responses Target values, one per row of the mini-batch
   * @param offsets Per-row offsets (in link space), or null if not used
   * @param n number of trained examples in this last mini batch (usually == mini_batch_size, but can be less)
   */
  @Override public void processMiniBatch(long seed, double[] responses, double[] offsets, int n) {
    assert(_training);
    if (_localmodel.get_params()._reproducible) {
      seed += _localmodel.get_processed_global(); //avoid periodicity
    } else {
      seed = _dropout_rng.nextLong(); // non-reproducible case - make a fast & good random number
    }
    fpropMiniBatch(seed, _neurons, _localmodel, _localmodel.get_params()._elastic_averaging ? _sharedmodel : null, _training, responses, offsets, n);
    bpropMiniBatch(_neurons, n);
  }

  /**
   * Helper to apply back-propagation without clearing out the gradients afterwards
   * Used for gradient checking
   * @param neurons Neuron layers (thread-local workspace)
   * @param n number of trained examples in this last mini batch (usually == mini_batch_size, but can be less)
   */
  static public void bpropMiniBatch(Neurons[] neurons, int n) {
    neurons[neurons.length - 1].bpropOutputLayer(n);
    for (int i = neurons.length - 2; i > 0; --i)
      neurons[i].bprop(n);

    for (int mb = 0; mb < n; ++mb) {
      // all errors are reset to 0
      for (int i = 0; i < neurons.length; ++i) {
        Storage.DenseVector e = neurons[i]._e == null ? null : neurons[i]._e[mb];
        if (e == null) continue;
        Arrays.fill(e.raw(), 0);
      }
    }
  }

  @Override protected int getMiniBatchSize() {
    return _localmodel.get_params()._mini_batch_size;
  }

  /**
   * After each chunk, add the number of processed rows to the counter
   * @param n Number of processed rows
   */
  @Override protected void chunkDone(long n) {
    if (_training) _localmodel.add_processed_local(n);
  }

  /**
   * After all maps are done on a node, this is called to store the per-node model into DKV (for elastic averaging).
   * Otherwise, do nothing.
   */
  @Override protected void closeLocal() {
    if (_localmodel.get_params()._elastic_averaging) {
      // store the local model, as it will be reduced in the following, and hence averaged with other models
      DKV.put(_localmodel.localModelInfoKey(H2O.SELF), _localmodel, _fs);
    }
    _sharedmodel = null; //avoid serialization overhead
  }

  /**
   * Average the per-node models (for elastic averaging, they were already written to DKV in closeLocal())
   * This is a no-op between F/J worker threads (they operate on the same weights/biases)
   * @param other Task holding another node's local model
   */
  @Override public void reduce(DeepLearningTask other){
    if (_localmodel != null && other._localmodel != null && other._localmodel.get_processed_local() > 0 //other DLTask was active (its model_info should be used for averaging)
            && other._localmodel != _localmodel) //other DLTask worked on a different model_info
    {
      // avoid adding remote model info to unprocessed local data, still random
      // (this can happen if we have no chunks on the master node)
      if (_localmodel.get_processed_local() == 0) {
        _localmodel = other._localmodel;
        _chunk_node_count = other._chunk_node_count;
      } else {
        _localmodel.add(other._localmodel);
        _chunk_node_count += other._chunk_node_count;
      }
      if (other._localmodel.isUnstable()) _localmodel.setUnstable();
    }
  }

  static long _lastWarn;
  static long _warnCount;

  /**
   * After all reduces are done, the driver node calls this method to clean up.
   * This is only needed if we're not inside a DeepLearningTask2 (which will do the reduction between replicated data workers).
   * So if replication is disabled, and every node works on partial data, then we have work to do here (model averaging).
   */
  @Override protected void postGlobal(){
    DeepLearningParameters dlp = _localmodel.get_params();
    if (H2O.CLOUD.size() > 1 && !dlp._replicate_training_data) {
      long now = System.currentTimeMillis();
      if (_chunk_node_count < H2O.CLOUD.size() && (now - _lastWarn > 5000) && _warnCount < 3) {
//        Log.info("Synchronizing across " + _chunk_node_count + " H2O node(s).");
        Log.warn(H2O.CLOUD.size() - _chunk_node_count + " node(s) (out of " + H2O.CLOUD.size()
                + ") are not contributing to model updates. Consider setting replicate_training_data to true or using a larger training dataset (or fewer H2O nodes).");
        _lastWarn = now;
        _warnCount++;
      }
    }
    // Check that we're not inside a DeepLearningTask2
    assert ((!dlp._replicate_training_data || H2O.CLOUD.size() == 1) == !_run_local);
    if (!_run_local) {
      _localmodel.add_processed_global(_localmodel.get_processed_local()); //move local sample counts to global ones
      _localmodel.set_processed_local(0L);
      // model averaging
      if (_chunk_node_count > 1)
        _localmodel.div(_chunk_node_count);
      if (_localmodel.get_params()._elastic_averaging)
        _sharedmodel = DeepLearningModelInfo.timeAverage(_localmodel);
    } else {
      //Get ready for reduction in DeepLearningTask2
      //Just swap the local and global models
      _sharedmodel = _localmodel;
    }
    if (_sharedmodel == null)
      _sharedmodel = _localmodel;
    _localmodel = null;
  }

  public static Neurons[] makeNeuronsForTraining(final DeepLearningModelInfo minfo) {
    return makeNeurons(minfo, true);
  }
  public static Neurons[] makeNeuronsForTesting(final DeepLearningModelInfo minfo) {
    return makeNeurons(minfo, false);
  }

  // Helper to build the neuron layers (input + hidden + output) for a given model state
  private static Neurons[] makeNeurons(final DeepLearningModelInfo minfo, boolean training) {
    DataInfo dinfo = minfo.data_info();
    final DeepLearningParameters params = minfo.get_params();
    final int[] h = params._hidden;
    Neurons[] neurons = new Neurons[h.length + 2]; // input + hidden + output
    // input
    neurons[0] = new Neurons.Input(params, minfo.units[0], dinfo);
    // hidden (plus the reconstruction layer for autoencoders)
    for( int i = 0; i < h.length + (params._autoencoder ? 1 : 0); i++ ) {
      int n = params._autoencoder && i == h.length ? minfo.units[0] : h[i];
      switch( params._activation ) {
        case Tanh:
          neurons[i+1] = new Neurons.Tanh(n);
          break;
        case TanhWithDropout:
          neurons[i+1] = params._autoencoder && i == h.length ? new Neurons.Tanh(n) : new Neurons.TanhDropout(n);
          break;
        case Rectifier:
          neurons[i+1] = new Neurons.Rectifier(n);
          break;
        case RectifierWithDropout:
          neurons[i+1] = params._autoencoder && i == h.length ? new Neurons.Rectifier(n) : new Neurons.RectifierDropout(n);
          break;
        case Maxout:
          neurons[i+1] = new Neurons.Maxout(params, (short)2, n);
          break;
        case MaxoutWithDropout:
          neurons[i+1] = params._autoencoder && i == h.length ? new Neurons.Maxout(params, (short)2, n) : new Neurons.MaxoutDropout(params, (short)2, n);
          break;
        case ExpRectifier:
          neurons[i+1] = new Neurons.ExpRectifier(n);
          break;
        case ExpRectifierWithDropout:
          neurons[i+1] = params._autoencoder && i == h.length ?
              new Neurons.ExpRectifier(n) : new Neurons.ExpRectifierDropout(n);
          break;
      }
    }
    // output layer (skipped for autoencoders, whose reconstruction layer was built above)
    if(!params._autoencoder) {
      if (minfo._classification && minfo.get_params()._distribution != DistributionFamily.modified_huber)
        neurons[neurons.length - 1] = new Neurons.Softmax(minfo.units[minfo.units.length - 1]);
      else
        neurons[neurons.length - 1] = new Neurons.Linear();
    }

    //copy parameters from NN, and set previous/input layer links
    for( int i = 0; i < neurons.length; i++ ) {
      neurons[i].init(neurons, i, params, minfo, training);
      neurons[i]._input = neurons[0];
    }

//    // debugging
//    for (Neurons n : neurons)
//      Log.info(n.toString());

    return neurons;
  }

  /**
   * Forward propagation
   * assumption: layer 0 has _a filled with (horizontalized categoricals) double values
   * @param seed Random seed (used for dropout)
   * @param neurons Neuron layers (thread-local workspace)
   * @param minfo Local model (weights and biases) being trained
   * @param consensus_minfo Consensus model for elastic averaging, or null if disabled
   * @param training Whether to compute output-layer gradients for back-propagation
   * @param responses Target values, one per row of the mini-batch
   * @param offset Per-row offsets (in link space), or null if not used
   * @param n Number of actually trained samples in this mini-batch
   */
  public static void fpropMiniBatch(long seed, Neurons[] neurons, DeepLearningModelInfo minfo, DeepLearningModelInfo consensus_minfo, boolean training, double[] responses, double[] offset, int n) {
    // Forward propagation
    for (int i = 1; i < neurons.length; ++i)
      neurons[i].fprop(seed, training, n);

    // Add offset (in link space) if applicable
    for (int mb = 0; mb < n; ++mb) {
      if (offset != null && offset[mb] > 0) {
        assert (!minfo._classification); // Regression
        double[] m = minfo.data_info()._normRespMul;
        double[] s = minfo.data_info()._normRespSub;
        double mul = m == null ? 1 : m[0];
        double sub = s == null ? 0 : s[0];
        neurons[neurons.length - 1]._a[mb].add(0, ((offset[mb] - sub) * mul));
      }
      if (training) {
        // Compute the gradient at the output layer
        // auto-encoder: pass a dummy "response" (ignored)
        // otherwise: class label or regression target
        neurons[neurons.length - 1].setOutputLayerGradient(responses[mb], mb, n);

        // Elastic Averaging - set up helpers needed during back-propagation
        if (consensus_minfo != null) {
          for (int i = 1; i < neurons.length; i++) {
            neurons[i]._wEA = consensus_minfo.get_weights(i - 1);
            neurons[i]._bEA = consensus_minfo.get_biases(i - 1);
          }
        }
      }
    }
  }
}
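
/*
 * Usage sketch (illustration only; nothing in this file calls it): one training iteration is
 * typically driven by constructing the task, running it over the training Frame with doAll(),
 * and reading back the reduced model state via model_info(). The identifiers jobKey, modelInfo,
 * iteration and train below are assumed to exist in the caller (e.g. DeepLearning's training
 * loop); the actual call site may pass different arguments.
 *
 *   DeepLearningTask task = new DeepLearningTask(jobKey, modelInfo, 1.0f, iteration);
 *   task.doAll(train);                                  // map/reduce over the training Frame
 *   DeepLearningModelInfo updated = task.model_info();  // reduced state after this iteration
 */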