package hex.deeplearning;

import hex.DataInfo;
import hex.genmodel.utils.DistributionFamily;
import static java.lang.Double.isNaN;
import hex.Model;
import hex.deeplearning.DeepLearningModel.DeepLearningParameters;
import water.*;
import water.fvec.Frame;
import water.util.*;

import java.util.Arrays;
import java.util.Random;

/**
 * This class contains the state of the Deep Learning model
 * This will be shared: one per node
 */
final public class DeepLearningModelInfo extends Iced<DeepLearningModelInfo> {

  public TwoDimTable summaryTable;

  public DataInfo data_info;
  public DataInfo data_info() { return data_info; }

  // model is described by parameters and the following arrays
  private Storage.DenseRowMatrix[] dense_row_weights; //one 2D weight matrix per layer (stored as a 1D array each)
  private Storage.DenseVector[] biases; //one 1D bias array per layer
  private Storage.DenseVector[] avg_activations; //one 1D array per hidden layer

  // helpers for storing previous step deltas
  // Note: These two arrays *could* be made transient and then initialized freshly in makeNeurons() and in DeepLearningTask.initLocal()
  // But then, after each reduction, the weights would be lost and would have to restart afresh -> not *exactly* right, but close...
  private Storage.DenseRowMatrix[] dense_row_weights_momenta;
  private Storage.DenseVector[] biases_momenta;

  // helpers for AdaDelta
  private Storage.DenseRowMatrix[] dense_row_ada_dx_g;
  private Storage.DenseVector[] biases_ada_dx_g;

  private boolean[] _saw_missing_cats;  // whether missing value was encountered for each categorical predictor - needed for varimp

  // compute model size (number of model parameters required for making predictions)
  // momenta are not counted here, but they are needed for model building
  public long size() {
    long siz = 0;
    for (Storage.DenseRowMatrix w : dense_row_weights) if (w != null) siz += w.size();
    for (Storage.Vector b : biases) siz += b.size();
    return siz;
  }

  /**
   * Check whether a missing value was found for every categorical predictor
   * @param cats activation of categorical buckets for a given row
   */
  void checkMissingCats(int[] cats) {
    if (cats == null) return;
    if (_saw_missing_cats == null) return;
    for (int i = 0; i < cats.length; ++i) {
      assert (data_info._catMissing[i]); //have a missing bucket for each categorical
      if (_saw_missing_cats[i]) continue;
      _saw_missing_cats[i] = (cats[i] == data_info._catOffsets[i + 1] - 1);
    }
  }
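  // Illustrative example (assumed numbers, not from the original source): for a hypothetical
  // network with 784 inputs, hidden=[200,200] and 10 output classes (no Maxout), size() above
  // counts
  //   weights: 784*200 + 200*200 + 200*10 = 198,800
  //   biases:  200 + 200 + 10 = 410
  // for a total of 199,210 parameters; momenta and AdaDelta state are excluded from the count.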
  // accessors to (shared) weights and biases - those will be updated racily (cf. Hogwild!)
  boolean has_momenta() { return get_params()._momentum_start != 0 || get_params()._momentum_stable != 0; }
  boolean adaDelta() { return get_params()._adaptive_rate; }
  public final Storage.DenseRowMatrix get_weights(int i) { return dense_row_weights[i]; }
  public final Storage.DenseVector get_biases(int i) { return biases[i]; }
  public final Storage.DenseRowMatrix get_weights_momenta(int i) { return dense_row_weights_momenta[i]; }
  public final Storage.DenseVector get_biases_momenta(int i) { return biases_momenta[i]; }
  public final Storage.DenseRowMatrix get_ada_dx_g(int i) { return dense_row_ada_dx_g[i]; }
  public final Storage.DenseVector get_biases_ada_dx_g(int i) { return biases_ada_dx_g[i]; }

  //accessor to shared parameter defining avg activations
  public final Storage.DenseVector get_avg_activations(int i) { return avg_activations[i]; }

  public DeepLearningParameters parameters;
  Key<Model> _model_id;
  public final DeepLearningParameters get_params() { return parameters; }
  public final void set_params(DeepLearningParameters p, Key<Model> model_id) {
    parameters = (DeepLearningParameters) p.clone();
    _model_id = model_id;
  }

  private double[] mean_rate;
  private double[] rms_rate;
  private double[] mean_bias;
  private double[] rms_bias;
  private double[] mean_weight;
  public double[] rms_weight;
  public double[] mean_a;

  private volatile boolean unstable = false;
  public boolean isUnstable() { return unstable; }
  public void setUnstable() {
    if (!unstable) computeStats();
    unstable = true;
  }

  private long processed_global;
  public synchronized long get_processed_global() { return processed_global; }
  public synchronized void set_processed_global(long p) { processed_global = p; }
  public synchronized void add_processed_global(long p) { processed_global += p; }

  private long processed_local;
  public synchronized long get_processed_local() { return processed_local; }
  public synchronized void set_processed_local(long p) { processed_local = p; }
  public synchronized void add_processed_local(long p) { processed_local += p; }

  public synchronized long get_processed_total() { return processed_global + processed_local; }
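  // Editorial note (an interpretation, not stated in the original source): processed_global
  // tracks training samples already folded into the shared model by previous reductions,
  // while processed_local counts samples this copy has seen since the last reduction;
  // add() below merges only the local counts, and get_processed_total() reports their sum.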
  // package local helpers
  int[] units; //number of neurons per layer, extracted from parameters and from datainfo

  final boolean _classification; // Classification cache (nclasses>1)
  final Frame _train;            // Prepared training frame
  final Frame _valid;            // Prepared validation frame

  /**
   * Dummy constructor, only to be used for deserialization from autobuffer
   */
  private DeepLearningModelInfo() {
    super(); // key is null
    _classification = false;
    _train = _valid = null;
  }

  /**
   * Main constructor
   * @param params Model parameters
   * @param dinfo Data Info
   * @param nClasses number of classes (1 for regression, 0 for autoencoder)
   * @param train User-given training data frame, prepared by AdaptTestTrain
   * @param valid User-specified validation data frame, prepared by AdaptTestTrain
   */
  public DeepLearningModelInfo(final DeepLearningParameters params, Key model_id, final DataInfo dinfo, int nClasses, Frame train, Frame valid) {
    _classification = nClasses > 1;
    _train = train;
    _valid = valid;
    data_info = dinfo;
    parameters = (DeepLearningParameters) params.clone(); //make a copy, don't change model's parameters
    _model_id = model_id;
    DeepLearningParameters.Sanity.modifyParms(parameters, parameters, nClasses); //sanitize the model_info's parameters

    final int num_input = dinfo.fullN();
    final int num_output = get_params()._autoencoder ? num_input :
            (_classification && parameters._distribution != DistributionFamily.modified_huber ?
                    train.vec(parameters._response_column).cardinality() : 1);
    if (!get_params()._autoencoder)
      assert (num_output == nClasses || parameters._distribution == DistributionFamily.modified_huber);

    _saw_missing_cats = dinfo._cats > 0 ? new boolean[data_info._cats] : null;
    assert (num_input > 0);
    assert (num_output > 0);
    if (has_momenta() && adaDelta())
      throw new IllegalArgumentException("Cannot have non-zero momentum and adaptive rate at the same time.");
    final int layers = get_params()._hidden.length;
    // units (# neurons for each layer)
    units = new int[layers + 2];
    if (get_params()._max_categorical_features <= Integer.MAX_VALUE - dinfo._nums)
      units[0] = Math.min(dinfo._nums + get_params()._max_categorical_features, num_input);
    else
      units[0] = num_input;
    System.arraycopy(get_params()._hidden, 0, units, 1, layers);
    units[layers + 1] = num_output;

    boolean printLevels = units[0] > 1000L;
    boolean warn = units[0] > 100000L;
    if (printLevels) {
      final String[][] domains = dinfo._adaptedFrame.domains();
      if (warn) {
        Log.warn("===================================================================================================================================");
        Log.warn(num_input + " input features" + (dinfo._cats > 0 ? " (after categorical one-hot encoding)" : "") + ". Can be slow and require a lot of memory.");
      }
      FrameUtils.printTopCategoricalLevels(dinfo._adaptedFrame, warn, 10);
      if (warn) {
        Log.warn("Suggestions:");
        Log.warn(" *) Limit the size of the first hidden layer");
        if (dinfo._cats > 0) {
          Log.warn(" *) Limit the total number of one-hot encoded features with the parameter 'max_categorical_features'");
          Log.warn(" *) Run h2o.interaction(...,pairwise=F) on high-cardinality categorical columns to limit the factor count, see http://learn.h2o.ai");
        }
        Log.warn("===================================================================================================================================");
      }
    }

    int[] mult = new int[layers + 1];
    for (int i = 0; i < layers; ++i) {
      mult[i] = (get_params()._activation == DeepLearningParameters.Activation.Maxout ||
                 get_params()._activation == DeepLearningParameters.Activation.MaxoutWithDropout) ? 2 : 1;
    }
    mult[layers] = 1; //Output is never Maxout

    // weights (to connect layers)
    dense_row_weights = new Storage.DenseRowMatrix[layers + 1];
    dense_row_weights[0] = new Storage.DenseRowMatrix(mult[0] * units[1], units[0]);
    for (int i = 1; i <= layers; ++i)
      dense_row_weights[i] = new Storage.DenseRowMatrix(mult[i] * units[i + 1] /*rows*/, units[i] /*cols*/);

    // biases (only for hidden layers and output layer)
    biases = new Storage.DenseVector[layers + 1];
    for (int i = 0; i <= layers; ++i) biases[i] = new Storage.DenseVector(mult[i] * units[i + 1]);
    // average activation (only for hidden layers)
    if (get_params()._autoencoder && get_params()._sparsity_beta > 0) {
      avg_activations = new Storage.DenseVector[layers];
      mean_a = new double[layers];
      for (int i = 0; i < layers; ++i) avg_activations[i] = new Storage.DenseVector(mult[i] * units[i + 1]);
    }
    allocateHelperArrays();
    // for diagnostics
    mean_rate = new double[units.length - 1];
    rms_rate = new double[units.length - 1];
    mean_bias = new double[units.length - 1];
    rms_bias = new double[units.length - 1];
    mean_weight = new double[units.length - 1];
    rms_weight = new double[units.length - 1];
  }
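  // Illustrative example (assumed numbers, not from the original source): with 100 numeric
  // inputs, hidden=[5, 3] and 2 output classes under Maxout, units = {100, 5, 3, 2} and
  // mult = {2, 2, 1}, so the constructor above allocates
  //   dense_row_weights[0]: (2*5) x 100,  dense_row_weights[1]: (2*3) x 5,
  //   dense_row_weights[2]: (1*2) x 3,
  // i.e. Maxout doubles the rows of every matrix feeding a hidden layer, never the output.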
  /**
   * Allocate helper arrays for momentum/learning rate, etc.
   */
  void allocateHelperArrays() {
    int[] mult = new int[units.length - 1];
    for (int i = 0; i < units.length - 1; ++i) {
      mult[i] = (get_params()._activation == DeepLearningParameters.Activation.Maxout ||
                 get_params()._activation == DeepLearningParameters.Activation.MaxoutWithDropout) ? 2 : 1;
    }
    mult[units.length - 2] = 1; //Output is never Maxout

    if (has_momenta()) {
      dense_row_weights_momenta = new Storage.DenseRowMatrix[dense_row_weights.length];
      if (dense_row_weights[0] != null)
        dense_row_weights_momenta[0] = new Storage.DenseRowMatrix(mult[0] * units[1], units[0]);
      for (int i = 1; i < dense_row_weights_momenta.length; ++i)
        dense_row_weights_momenta[i] = new Storage.DenseRowMatrix(mult[i] * units[i + 1], units[i]);
      biases_momenta = new Storage.DenseVector[biases.length];
      for (int i = 0; i < biases_momenta.length; ++i)
        biases_momenta[i] = new Storage.DenseVector(mult[i] * units[i + 1]);
    } else if (adaDelta()) {
      dense_row_ada_dx_g = new Storage.DenseRowMatrix[dense_row_weights.length]; //AdaDelta state, two accumulators per weight
      dense_row_ada_dx_g[0] = new Storage.DenseRowMatrix(mult[0] * 2 * units[1], units[0]);
      for (int i = 1; i < dense_row_ada_dx_g.length; ++i) {
        dense_row_ada_dx_g[i] = new Storage.DenseRowMatrix(mult[i] * units[i + 1], 2 * units[i]);
      }
      biases_ada_dx_g = new Storage.DenseVector[biases.length];
      for (int i = 0; i < biases_ada_dx_g.length; ++i) {
        biases_ada_dx_g[i] = new Storage.DenseVector(mult[i] * 2 * units[i + 1]);
      }
    }
  }
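  // Editorial note (inferred from computeStats() below, not stated in the original source):
  // each AdaDelta buffer holds two accumulators per weight, stored interleaved in the raw
  // array - raw()[2*u] is the running average of squared deltas E[dx^2] and raw()[2*u+1] is
  // the running average of squared gradients E[g^2] - which is why every buffer above is
  // allocated with twice as many elements as its corresponding weight matrix or bias vector.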
  /**
   * Create a summary table
   * @return TwoDimTable with the summary of the model
   */
  TwoDimTable createSummaryTable() {
    computeStats();
    Neurons[] neurons = DeepLearningTask.makeNeuronsForTesting(this);
    long byte_size = new AutoBuffer().put(this).buf().length;
    TwoDimTable table = new TwoDimTable(
            "Status of Neuron Layers",
            (!get_params()._autoencoder ? ("predicting " + data_info._adaptedFrame.lastVecName() + ", ") : "") +
                    (get_params()._autoencoder ? "auto-encoder" :
                            _classification ? (units[units.length - 1] + "-class classification") : "regression")
                    + ", " + get_params()._distribution + " distribution, " + get_params()._loss + " loss, "
                    + String.format("%,d", size()) + " weights/biases, " + PrettyPrint.bytes(byte_size) + ", "
                    + String.format("%,d", get_processed_global()) + " training samples, "
                    + "mini-batch size " + String.format("%,d", get_params()._mini_batch_size),
            new String[neurons.length],
            new String[]{"Layer", "Units", "Type", "Dropout", "L1", "L2",
                    "Mean Rate", "Rate RMS", "Momentum",
                    "Mean Weight", "Weight RMS",
                    "Mean Bias", "Bias RMS"
            },
            new String[]{"int", "int", "string", "double", "double", "double",
                    "double", "double", "double",
                    "double", "double",
                    "double", "double"
            },
            new String[]{"%d", "%d", "%s", "%2.2f %%", "%5f", "%5f", "%5f", "%5f", "%5f", "%5f", "%5f", "%5f", "%5f"},
            "");

    for (int i = 0; i < neurons.length; ++i) {
      table.set(i, 0, i + 1);
      table.set(i, 1, neurons[i].units);
      table.set(i, 2, neurons[i].getClass().getSimpleName());

      if (i == 0) {
        table.set(i, 3, neurons[i].params._input_dropout_ratio * 100);
        continue;
      } else if (i < neurons.length - 1) {
        if (neurons[i].params._hidden_dropout_ratios == null) {
          table.set(i, 3, 0);
        } else {
          table.set(i, 3, neurons[i].params._hidden_dropout_ratios[i - 1] * 100);
        }
      }
      table.set(i, 4, neurons[i].params._l1);
      table.set(i, 5, neurons[i].params._l2);
      table.set(i, 6, (get_params()._adaptive_rate ? mean_rate[i - 1] : neurons[i].rate(get_processed_total())));
      table.set(i, 7, (get_params()._adaptive_rate ? rms_rate[i - 1] : 0));
      table.set(i, 8, get_params()._adaptive_rate ? 0 : neurons[i].momentum(get_processed_total()));
      table.set(i, 9, mean_weight[i - 1]);
      table.set(i, 10, rms_weight[i - 1]);
      table.set(i, 11, mean_bias[i - 1]);
      table.set(i, 12, rms_bias[i - 1]);
    }
    summaryTable = table;
    return summaryTable;
  }

  /**
   * Print a summary table
   * @return String containing ASCII version of summary table
   */
  @Override public String toString() {
    StringBuilder sb = new StringBuilder();
    if (!get_params()._quiet_mode) {
      if (get_params()._sparsity_beta > 0) {
        for (int k = 0; k < get_params()._hidden.length; k++) {
          sb.append("Average activation in hidden layer ").append(k).append(" is ").append(mean_a[k]).append(" \n");
        }
      }
      createSummaryTable();
      sb.append(summaryTable.toString(1));
    }
    return sb.toString();
  }

  /**
   * Debugging printout
   * @return String with useful info
   */
  public String toStringAll() {
    StringBuilder sb = new StringBuilder();
    sb.append(toString());

    for (int i = 0; i < units.length - 1; ++i)
      sb.append("\nweights[").append(i).append("][]=").append(Arrays.toString(get_weights(i).raw()));
    for (int i = 0; i < units.length - 1; ++i) {
      sb.append("\nbiases[").append(i).append("][]=").append(Arrays.toString(get_biases(i).raw()));
    }
    if (has_momenta()) {
      for (int i = 0; i < units.length - 1; ++i)
        sb.append("\nweights_momenta[").append(i).append("][]=").append(Arrays.toString(get_weights_momenta(i).raw()));
    }
    if (biases_momenta != null) {
      for (int i = 0; i < units.length - 1; ++i) {
        sb.append("\nbiases_momenta[").append(i).append("][]=").append(Arrays.toString(biases_momenta[i].raw()));
      }
    }
    sb.append("\nunits[]=").append(Arrays.toString(units));
    sb.append("\nprocessed global: ").append(get_processed_global());
    sb.append("\nprocessed local: ").append(get_processed_local());
    sb.append("\nprocessed total: ").append(get_processed_total());
    sb.append("\n");
    return sb.toString();
  }
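  // Illustrative example (assumed shapes, not from the original source): for a network with
  // 3 input features, hidden=[4] and 2 output classes (no Maxout), initializeMembers() below
  // expects user-given frames with these dimensions:
  //   initial_weights[0]: 4 rows x 3 cols,  initial_biases[0]: 4 rows x 1 col
  //   initial_weights[1]: 2 rows x 4 cols,  initial_biases[1]: 2 rows x 1 col
  // i.e. weight frame i must match get_weights(i).rows() x get_weights(i).cols().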
  /**
   * Initialize weights/biases
   */
  void initializeMembers(Key<Frame>[] initial_weights, Key<Frame>[] initial_biases) {
    randomizeWeights();
    //TODO: determine good/optimal/best initialization scheme for biases
    // hidden layers
    for (int i = 0; i < get_params()._hidden.length; ++i) {
      if (get_params()._activation == DeepLearningParameters.Activation.Rectifier
              || get_params()._activation == DeepLearningParameters.Activation.RectifierWithDropout
              || get_params()._activation == DeepLearningParameters.Activation.Maxout
              || get_params()._activation == DeepLearningParameters.Activation.MaxoutWithDropout
      ) {
//        Arrays.fill(biases[i], 1.); //old behavior
        Arrays.fill(biases[i].raw(), i == 0 ? 0.5f : 1f); //new behavior, might be slightly better
      } else if (get_params()._activation == DeepLearningParameters.Activation.Tanh
              || get_params()._activation == DeepLearningParameters.Activation.TanhWithDropout) {
        Arrays.fill(biases[i].raw(), 0f);
      }
    }
    Arrays.fill(biases[biases.length - 1].raw(), 0f); //output layer

    if (initial_weights != null || initial_biases != null) {
      Log.info("Initializing initial model state from user-given weights/biases.");
      for (int i = 0; i < get_params()._hidden.length + 1; ++i) {
        if (initial_weights[i] == null) {
          Log.info("No user-given weight matrix given for weights #" + (i + 1) + ". Initializing those weights randomly.");
          continue;
        }
        if (initial_biases[i] == null) {
          Log.info("No user-given bias vector given for biases #" + (i + 1) + ". Initializing those biases randomly.");
          continue;
        }
        Frame w = initial_weights[i].get();
        if (w == null) {
          throw new IllegalArgumentException("User-given weight matrix for weights #" + (i + 1) + " '" + initial_weights[i].toString() + "' not found.");
        }
        if (w.numRows() != get_weights(i).rows() || w.numCols() != get_weights(i).cols()) {
          throw new IllegalArgumentException("Dimensionality mismatch: initial_weights matrix #" + i
                  + " should have " + get_weights(i).rows() + " rows and " + get_weights(i).cols()
                  + " columns, but has " + w.numRows() + " rows and " + w.numCols() + " columns.");
        }
        Frame b = initial_biases[i].get();
        if (b == null) {
          throw new IllegalArgumentException("User-given bias vector for biases #" + (i + 1) + " '" + initial_biases[i].toString() + "' not found.");
        }
        if (b.numRows() != get_biases(i).size() || b.numCols() != 1) {
          throw new IllegalArgumentException("Dimensionality mismatch: initial_biases vector #" + i
                  + " should have " + get_biases(i).size() + " rows and 1 column, but has "
                  + b.numRows() + " rows and " + b.numCols() + " column(s).");
        }
        for (int c = 0; c < w.numCols(); ++c)
          for (int r = 0; r < w.numRows(); ++r)
            get_weights(i).set(r, c, (float) w.vec(c).at(r));
        for (int r = 0; r < w.numRows(); ++r)
          get_biases(i).set(r, (float) b.vec(0).at(r));
      }
    } else {
      Log.info("Created random initial model state.");
    }
  }

  /**
   * Fill weights and biases from a pretrained autoencoder model
   * @param autoencoder Autoencoder DL model with matching inputs and hidden layers
   */
  void initializeFromPretrainedModel(DeepLearningModelInfo autoencoder) {
    assert (autoencoder.parameters._autoencoder);
    randomizeWeights();
    // now overwrite the weights with those from the pretrained model
    for (int w = 0; w < dense_row_weights.length - 1 /*skip output layer*/; ++w) {
      if (get_weights(w).rows() != autoencoder.get_weights(w).rows())
        throw new IllegalArgumentException("Mismatch between weights in pretrained model and this model: rows in layer " + w + ": "
                + autoencoder.get_weights(w).rows() + " vs " + get_weights(w).rows()
                + ". Enable ignored_const_cols for both models and/or check categorical levels for consistency.");
      if (get_weights(w).cols() != autoencoder.get_weights(w).cols())
        throw new IllegalArgumentException("Mismatch between weights in pretrained model and this model: cols in layer " + w + ": "
                + autoencoder.get_weights(w).cols() + " vs " + get_weights(w).cols()
                + ". Enable ignored_const_cols for both models and/or check categorical levels for consistency.");
      for (int i = 0; i < get_weights(w).rows(); i++) {
        for (int j = 0; j < get_weights(w).cols(); j++) {
          get_weights(w).set(i, j, autoencoder.get_weights(w).get(i, j));
        }
      }
    }
    for (int i = 0; i < get_params()._hidden.length; ++i) {
      for (int j = 0; j < biases[i].raw().length; ++j) {
        biases[i].set(j, autoencoder.biases[i].get(j));
      }
    }
    Arrays.fill(biases[biases.length - 1].raw(), 0f); //output layer
  }
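  // Editorial note (reasoning not stated in the original source): the copy loop above
  // deliberately stops before the last weight matrix - the autoencoder's final layer
  // reconstructs the inputs, so its weights have the wrong shape and meaning for the
  // supervised output layer, which is instead left randomly initialized with zeroed biases.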
  /**
   * Add another model info into this
   * This will add the weights/biases/learning rate helpers, and the number of processed training samples
   * Note: It will NOT add the elastic averaging helpers, which are always kept constant (they already are the result of a reduction)
   * @param other Other DeepLearningModelInfo to add into this one
   */
  public void add(DeepLearningModelInfo other) {
    for (int i = 0; i < dense_row_weights.length; ++i)
      ArrayUtils.add(get_weights(i).raw(), other.get_weights(i).raw());
    for (int i = 0; i < biases.length; ++i)
      ArrayUtils.add(biases[i].raw(), other.biases[i].raw());
    if (avg_activations != null)
      for (int i = 0; i < avg_activations.length; ++i)
        ArrayUtils.add(avg_activations[i].raw(), other.avg_activations[i].raw());
    if (has_momenta()) {
      assert (other.has_momenta());
      for (int i = 0; i < dense_row_weights_momenta.length; ++i)
        ArrayUtils.add(get_weights_momenta(i).raw(), other.get_weights_momenta(i).raw());
      for (int i = 0; i < biases_momenta.length; ++i)
        ArrayUtils.add(biases_momenta[i].raw(), other.biases_momenta[i].raw());
    }
    if (adaDelta()) {
      assert (other.adaDelta());
      for (int i = 0; i < dense_row_ada_dx_g.length; ++i) {
        ArrayUtils.add(get_ada_dx_g(i).raw(), other.get_ada_dx_g(i).raw());
      }
    }
    add_processed_local(other.get_processed_local());
  }

  /**
   * Multiply all weights/biases by a real-valued number
   * @param N multiplication factor
   */
  protected void mult(double N) {
    div(1 / N);
  }

  /**
   * Divide all weights/biases by a real-valued number
   * @param N divisor
   */
  protected void div(double N) {
    for (int i = 0; i < dense_row_weights.length; ++i)
      ArrayUtils.div(get_weights(i).raw(), (float) N);
    for (Storage.Vector bias : biases) ArrayUtils.div(bias.raw(), N);
    if (avg_activations != null)
      for (Storage.Vector avgac : avg_activations)
        ArrayUtils.div(avgac.raw(), N);
    if (has_momenta()) {
      for (int i = 0; i < dense_row_weights_momenta.length; ++i)
        ArrayUtils.div(get_weights_momenta(i).raw(), (float) N);
      for (Storage.Vector bias_momenta : biases_momenta) ArrayUtils.div(bias_momenta.raw(), N);
    }
    if (adaDelta()) {
      for (int i = 0; i < dense_row_ada_dx_g.length; ++i) {
        ArrayUtils.div(get_ada_dx_g(i).raw(), (float) N);
      }
    }
  }

  double uniformDist(Random rand, double min, double max) {
    return min + rand.nextFloat() * (max - min);
  }

  /**
   * Initialization of neural net weights
   * cf. http://machinelearning.wustl.edu/mlpapers/paper_files/AISTATS2010_GlorotB10.pdf
   */
  private void randomizeWeights() {
    for (int w = 0; w < dense_row_weights.length; ++w) {
      final Random rng = water.util.RandomUtils.getRNG(get_params()._seed + 0xBAD5EED + w + 1); //to match NeuralNet behavior
      final double range = Math.sqrt(6. / (units[w] + units[w + 1]));
      for (int i = 0; i < get_weights(w).rows(); i++) {
        for (int j = 0; j < get_weights(w).cols(); j++) {
          if (get_params()._initial_weight_distribution == DeepLearningParameters.InitialWeightDistribution.UniformAdaptive) {
            // cf. http://machinelearning.wustl.edu/mlpapers/paper_files/AISTATS2010_GlorotB10.pdf
            if (w == dense_row_weights.length - 1 && _classification)
              get_weights(w).set(i, j, (float) (4. * uniformDist(rng, -range, range))); //Softmax might need an extra factor 4, since it's like a sigmoid
            else
              get_weights(w).set(i, j, (float) uniformDist(rng, -range, range));
          } else if (get_params()._initial_weight_distribution == DeepLearningParameters.InitialWeightDistribution.Uniform) {
            get_weights(w).set(i, j, (float) uniformDist(rng, -get_params()._initial_weight_scale, get_params()._initial_weight_scale));
          } else if (get_params()._initial_weight_distribution == DeepLearningParameters.InitialWeightDistribution.Normal) {
            get_weights(w).set(i, j, (float) (rng.nextGaussian() * get_params()._initial_weight_scale));
          }
        }
      }
    }
  }

  // TODO: Add "subset randomize" function
//  int count = Math.min(15, _previous.units);
//  double min = -.1f, max = +.1f;
//  //double min = -1f, max = +1f;
//  for( int o = 0; o < units; o++ ) {
//    for( int n = 0; n < count; n++ ) {
//      int i = rand.nextInt(_previous.units);
//      int w = o * _previous.units + i;
//      _w[w] = uniformDist(rand, min, max);
//    }
//  }
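  // Illustrative example (assumed numbers, not from the original source): for a layer with
  // fan-in units[w] = 100 and fan-out units[w+1] = 50, the UniformAdaptive (Glorot) range
  // used in randomizeWeights() above is
  //   range = sqrt(6 / (100 + 50)) = sqrt(0.04) = 0.2,
  // so weights are drawn uniformly from [-0.2, 0.2]; a final softmax layer widens this to
  // [-0.8, 0.8] via the extra factor of 4.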
  /**
   * Compute Variable Importance, based on
   * GEDEON: DATA MINING OF INPUTS: ANALYSING MAGNITUDE AND FUNCTIONAL MEASURES
   *
   * @return variable importances for input features
   */
  public float[] computeVariableImportances() {
    float[] vi = new float[units[0]];
    Arrays.fill(vi, 0f);

    float[][] Qik = new float[units[0]][units[2]]; //importance of input i on output k
    float[] sum_wj = new float[units[1]]; //sum of incoming weights into first hidden layer
    float[] sum_wk = new float[units[2]]; //sum of incoming weights into output layer (or second hidden layer)
    for (float[] Qi : Qik) Arrays.fill(Qi, 0f);
    Arrays.fill(sum_wj, 0f);
    Arrays.fill(sum_wk, 0f);

    // compute sum of absolute incoming weights
    for (int j = 0; j < units[1]; j++) {
      for (int i = 0; i < units[0]; i++) {
        float wij = get_weights(0).get(j, i);
        sum_wj[j] += Math.abs(wij);
      }
    }
    for (int k = 0; k < units[2]; k++) {
      for (int j = 0; j < units[1]; j++) {
        float wjk = get_weights(1).get(k, j);
        sum_wk[k] += Math.abs(wjk);
      }
    }
    // compute importance of input i on output k as product of connecting weights going through j
    for (int i = 0; i < units[0]; i++) {
      for (int k = 0; k < units[2]; k++) {
        for (int j = 0; j < units[1]; j++) {
          float wij = get_weights(0).get(j, i);
          float wjk = get_weights(1).get(k, j);
          //Qik[i][k] += Math.abs(wij)/sum_wj[j] * wjk; //Wong,Gedeon,Taggart '95
          Qik[i][k] += Math.abs(wij) / sum_wj[j] * Math.abs(wjk) / sum_wk[k]; //Gedeon '97
        }
      }
    }
    // normalize Qik over all outputs k
    for (int k = 0; k < units[2]; k++) {
      float sumQk = 0;
      for (int i = 0; i < units[0]; i++) sumQk += Qik[i][k];
      for (int i = 0; i < units[0]; i++) Qik[i][k] /= sumQk;
    }
    // importance for feature i is the sum over k of i->k importances
    for (int i = 0; i < units[0]; i++) vi[i] = ArrayUtils.sum(Qik[i]);

    //normalize importances such that max(vi) = 1
    ArrayUtils.div(vi, ArrayUtils.maxValue(vi));

    // zero out missing categorical variables if they were never seen
    if (_saw_missing_cats != null) {
      for (int i = 0; i < _saw_missing_cats.length; ++i) {
        assert (data_info._catMissing[i]); //have a missing bucket for each categorical
        if (!_saw_missing_cats[i]) vi[data_info._catOffsets[i + 1] - 1] = 0;
      }
    }
    return vi;
  }
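  // Editorial note (restating the Gedeon '97 measure computed above, not original text):
  // the importance of input i on output k aggregates the normalized absolute weights along
  // every path through hidden unit j,
  //   Q_ik = sum_j ( |w_ij| / sum_i' |w_i'j| ) * ( |w_jk| / sum_j' |w_j'k| ),
  // and only the first two weight layers enter the measure, so for deeper nets this acts
  // as a first-hidden-layer approximation of importance.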
  /**
   * Compute statistics about this model on all nodes
   */
  public void computeStats() {
    float[][] rate = get_params()._adaptive_rate ? new float[units.length - 1][] : null;
    if (get_params()._autoencoder && get_params()._sparsity_beta > 0) {
      for (int k = 0; k < get_params()._hidden.length; k++) {
        mean_a[k] = 0;
        for (int j = 0; j < avg_activations[k].size(); j++)
          mean_a[k] += avg_activations[k].get(j);
        mean_a[k] /= avg_activations[k].size();
      }
    }

    for (int y = 0; y < units.length - 1; y++) {
      mean_rate[y] = rms_rate[y] = 0;
      mean_bias[y] = rms_bias[y] = 0;
      mean_weight[y] = rms_weight[y] = 0;
      for (int u = 0; u < biases[y].size(); u++) {
        mean_bias[y] += biases[y].get(u);
      }
      if (rate != null) rate[y] = new float[get_weights(y).raw().length];
      for (int u = 0; u < get_weights(y).raw().length; u++) {
        mean_weight[y] += get_weights(y).raw()[u];
        if (rate != null) {
//          final float RMS_dx = (float)Math.sqrt(ada[y][2*u]+(float)get_params().epsilon);
//          final float invRMS_g = (float)(1/Math.sqrt(ada[y][2*u+1]+(float)get_params().epsilon));
          final float RMS_dx = MathUtils.approxSqrt(get_ada_dx_g(y).raw()[2 * u] + (float) get_params()._epsilon);
          final float invRMS_g = MathUtils.approxInvSqrt(get_ada_dx_g(y).raw()[2 * u + 1] + (float) get_params()._epsilon);
          rate[y][u] = RMS_dx * invRMS_g; //not exactly right, RMS_dx should be from the previous time step -> but close enough for diagnostics.
          mean_rate[y] += rate[y][u];
        }
      }

      mean_bias[y] /= biases[y].size();
      mean_weight[y] /= get_weights(y).size();
      if (rate != null) mean_rate[y] /= rate[y].length;

      for (int u = 0; u < biases[y].size(); u++) {
        final double db = biases[y].get(u) - mean_bias[y];
        rms_bias[y] += db * db;
      }
      for (int u = 0; u < get_weights(y).size(); u++) {
        final double dw = get_weights(y).raw()[u] - mean_weight[y];
        rms_weight[y] += dw * dw;
        if (rate != null) {
          final double drate = rate[y][u] - mean_rate[y];
          rms_rate[y] += drate * drate;
        }
      }
      rms_bias[y] = MathUtils.approxSqrt(rms_bias[y] / biases[y].size());
      rms_weight[y] = MathUtils.approxSqrt(rms_weight[y] / get_weights(y).size());
      if (rate != null) rms_rate[y] = MathUtils.approxSqrt(rms_rate[y] / rate[y].length);
//      rms_bias[y] = (float)Math.sqrt(rms_bias[y]/biases[y].length);
//      rms_weight[y] = (float)Math.sqrt(rms_weight[y]/weights[y].length);
//      if (rate != null) rms_rate[y] = (float)Math.sqrt(rms_rate[y]/rate[y].length);

      // Abort the run if weights or biases are unreasonably large (Note that all input values are normalized upfront)
      // This can happen with Rectifier units when L1/L2/max_w2 are all set to 0, especially when using more than 1 hidden layer.
      final double thresh = 1e10;
      final double bthresh = 1e5;
      unstable |= isNaN(mean_bias[y]) || isNaN(rms_bias[y]) || isNaN(mean_weight[y]) || isNaN(rms_weight[y])
              // large weights
              || Math.abs(mean_weight[y]) > thresh || rms_weight[y] > thresh
              // large biases
              || Math.abs(mean_bias[y]) > bthresh || rms_bias[y] > bthresh;
    }
  }

  /**
   * Unique identifier for this model's state, based on raw numbers
   */
  protected long checksum_impl() {
    computeStats();
    Random rng = new Random(0xDECAFBBB);
    double cs = Double.longBitsToDouble(get_params()._seed);
    cs += size() * get_processed_total();
    for (double d : mean_bias) cs += (rng.nextDouble() * (d + 123.23));
    for (double d : rms_bias) cs += (rng.nextDouble() * (d + 123.23));
    for (double d : mean_weight) cs += (rng.nextDouble() * (d + 123.23));
    for (double d : rms_weight) cs += (rng.nextDouble() * (d + 123.23));
    for (double d : mean_rate) cs += (rng.nextDouble() * (d + 123.23));
    for (double d : rms_rate) cs += (rng.nextDouble() * (d + 123.23));
    return Double.doubleToRawLongBits(cs);
  }

  /**
   * TimeAveraging as part of Elastic Averaging Algorithm
   * Cf. equation 6 of arXiv:1412.6651v5
   * @param nodeAverageModel current average of per-node models
   * @return Time-average of node-averages (consensus model, "the" model)
   */
  public static DeepLearningModelInfo timeAverage(DeepLearningModelInfo nodeAverageModel) {
    float pa = (float) nodeAverageModel.get_params()._elastic_averaging_moving_rate;
    assert (pa > 0 && pa <= 1);
    DeepLearningModelInfo elasticAverage = DKV.getGet(nodeAverageModel.elasticAverageModelInfoKey()); //get latest version from DKV
    if (elasticAverage == null || pa == 1) {
      elasticAverage = IcedUtils.deepCopy(nodeAverageModel);
    } else {
      nodeAverageModel.mult(pa);
      elasticAverage.mult(1 - pa);
      elasticAverage.add(nodeAverageModel); //ignore processed local value set here
      elasticAverage.set_processed_global(nodeAverageModel.get_processed_global());
    }
    elasticAverage.set_processed_local(0);
    DKV.put(elasticAverage.elasticAverageModelInfoKey(), elasticAverage);

//    nodeAverageModel.computeStats();
//    elasticAverage.computeStats();
//    Log.info("Local Model :\n" + nodeAverageModel.toString());
//    Log.info("Elastic Average:\n" + elasticAverage.toString());
    return elasticAverage;
  }
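  // Editorial note (restating the update implemented above, not original text): with moving
  // rate pa, timeAverage() computes the exponential moving average
  //   elasticAverage <- (1 - pa) * elasticAverage + pa * nodeAverageModel,
  // expressed via mult()/add() because mult(x) is implemented as div(1/x); cf. equation 6
  // of arXiv:1412.6651v5.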
  public Key localModelInfoKey(H2ONode node) {
    return Key.make(_model_id + ".node" + node.index(), (byte) 1 /*replica factor*/, (byte) 31 /*hidden user-key*/, true, node);
  }

  public Key elasticAverageModelInfoKey() {
    return Key.make(_model_id + ".elasticaverage", (byte) 1 /*replica factor*/, (byte) 31 /*hidden user-key*/, true, H2O.CLOUD._memary[0]);
  }

  static public class GradientCheck {
    GradientCheck(int l, int r, int c) { layer = l; row = r; col = c; gradient = 0; }
    int layer;
    int row;
    int col;
    double gradient;
    void apply(int l, int r, int c, double g) {
      if (r == row && c == col && l == layer) {
        gradient += g;
      }
    }
  }
  static public GradientCheck gradientCheck = null;
  static public GradientCheck gradientCheckBias = null;
}