ModelBag.java example

Explorer
LPmade-master
- weka
  - src
    - weka
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    EnsembleSelection.java
 *    Copyright (C) 2006 David Michael
 *
 */

package weka.classifiers.meta.ensembleSelection;

import weka.classifiers.Evaluation;
import weka.core.Instances;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;

import java.util.Random;

/**
 * This class is responsible for the duties of a bag of models. It is designed
 * for use with the EnsembleSelection meta classifier. It handles shuffling the
 * models, doing sort initialization, performing forward selection/ backwards
 * elimination, etc.
 * <p/>
 * We utilize a simple "virtual indexing" scheme inside. If we shuffle and/or
 * sort the models, we change the "virtual" order around. The elements of the
 * bag are always those elements with virtual index 0..(m_bagSize-1). Each
 * "virtual" index maps to some real index in m_models. Not every model in
 * m_models gets a virtual index... the virtual indexing is what defines the
 * subset of models of which our Bag is composed. This makes it easy to refer to
 * models in the bag, by their virtual index, while maintaining the original
 * indexing for our clients.
 * 
 * @author  David Michael
 * @version $Revision: 1.2 $
 */
public class ModelBag
  implements RevisionHandler {
  
  /**
   * The "models", as a multidimensional array of predictions for the
   * validation set. The first index is the model index, the second index is
   * the index of the instance, and the third is the typical "class" index for
   * a prediction's distribution. This is given to us in the constructor, and
   * we never change it.
   */
  private double m_models[][][];
  
  /**
   * Maps each model in our virtual indexing scheme to its original index as
   * it is in m_models. The first m_bag_size elements here are considered our
   * bag. Throughout the code, we use the index in to this array to refer to a
   * model. When we shuffle the models, we really simply shuffle this array.
   * When we want to refer back to the original model, it is easily looked up
   * in this array. That is, if j = m_model_index[i], then m_models[j] is the
   * model referred to by "virtual index" i. Models can easily be accessed by
   * their virtual index using the "model()" method.
   */
  private int m_modelIndex[];
  
  /**
   * The number of models in our bag. 1 <= m_bag_size <= m_models.length
   */
  private int m_bagSize;
  
  /**
   * The total number of models chosen thus far for this bag. This value is
   * important when calculating the predictions for the bag. (See
   * computePredictions).
   */
  private int m_numChosen;
  
  /**
   * The number of times each model has been chosen. Also can be thought of as
   * the weight for each model. Indexed by the "virtual index".
   */
  private int m_timesChosen[];
  
  /**
   * If true, print out debug information.
   */
  private boolean m_debug;
  
  /**
   * Double representing the best performance achieved thus far in this bag.
   * This Must be updated each time we make a change to the bag that improves
   * performance. This is so that after all hillclimbing is completed, we can
   * go back to the best ensemble that we encountered during hillclimbing.
   */
  private double m_bestPerformance;
  
  /**
   * Array representing the weights for all the models which achieved the best
   * performance thus far for the bag (i.e., the weights that achieved
   * m_bestPerformance. This Must be updated each time we make a change to the
   * bag (that improves performance, by calling updateBestTimesChosen. This is
   * so that after all hillclimbing is completed, we can go back to the best
   * ensemble that we encountered during hillclimbing. This array, unlike
   * m_timesChosen, uses the original indexing as taken from m_models. That
   * way, any time getModelWeights is called (which returns this array), the
   * array is in the correct format for our client.
   */
  private int m_bestTimesChosen[];
  
  /**
   * Constructor for ModelBag.
   * 
   * @param models
   *            The complete set of models from which to draw our bag. First
   *            index is for the model, second is for the instance. The last
   *            is a prediction distribution for that instance. Models are
   *            represented by this array of predictions for validation data,
   *            since that's all ensemble selection needs to know.
   * @param bag_percent
   *            The percentage of the set of given models that should be used
   *            in the Model Bag.
   * @param debug
   *            Whether the ModelBag should print debug information.
   * 
   */
  public ModelBag(double models[][][], double bag_percent, boolean debug) {
    m_debug = debug;
    if (models.length == 0) {
      throw new IllegalArgumentException(
      "ModelBag needs at least 1 model.");
    }
    m_bagSize = (int) ((double) models.length * bag_percent);
    m_models = models;
    m_modelIndex = new int[m_models.length];
    m_timesChosen = new int[m_models.length];
    m_bestTimesChosen = m_timesChosen;
    m_bestPerformance = 0.0;
    
    // Initially, no models are chosen.
    m_numChosen = 0;
    // Prepare our virtual indexing scheme. Initially, the indexes are
    // the same as the original.
    for (int i = 0; i < m_models.length; ++i) {
      m_modelIndex[i] = i;
      m_timesChosen[i] = 0;
    }
  }
  
  /**
   * Swap model at virtual index i with model at virtual index j. This is used
   * to shuffle the models. We do not change m_models, only the arrays which
   * use the virtual indexing; m_modelIndex and m_timesChosen.
   * 
   * @param i	first index
   * @param j	second index
   */
  private void swap(int i, int j) {
    if (i != j) {
      int temp_index = m_modelIndex[i];
      m_modelIndex[i] = m_modelIndex[j];
      m_modelIndex[j] = temp_index;
      
      int tempWeight = m_timesChosen[i];
      m_timesChosen[i] = m_timesChosen[j];
      m_timesChosen[j] = tempWeight;
    }
  }
  
  /**
   * Shuffle the models. The order in m_models is preserved, but we change our
   * virtual indexes around.
   * 
   * @param rand	the random number generator to use
   */
  public void shuffle(Random rand) {
    if (m_models.length < 2)
      return;
    
    for (int i = 0; i < m_models.length; ++i) {
      int swap_index = rand.nextInt(m_models.length - 1);
      if (swap_index >= i)
	++swap_index; // don't swap with itself
      swap(i, swap_index);
    }
  }
  
  /**
   * Convert an array of weights using virtual indices to an array of weights
   * using real indices.
   * 
   * @param virtual_weights	the virtual indices
   * @return			the real indices
   */
  private int[] virtualToRealWeights(int virtual_weights[]) {
    int real_weights[] = new int[virtual_weights.length];
    for (int i = 0; i < real_weights.length; ++i) {
      real_weights[m_modelIndex[i]] = virtual_weights[i];
    }
    return real_weights;
  }
  
  /**
   * 
   */
  private void updateBestTimesChosen() {
    m_bestTimesChosen = virtualToRealWeights(m_timesChosen);
  }
  
  /**
   * Sort initialize the bag.
   * 
   * @param num
   *            the Maximum number of models to initialize with
   * @param greedy
   *            True if we do greedy addition, up to num. Greedy sort
   *            initialization adds models (up to num) in order of best to
   *            worst performance until performance no longer improves.
   * @param instances
   *            the data set (needed for performance evaluation)
   * @param metric
   *            metric for which to optimize. See EnsembleMetricHelper
   * @return returns an array of indexes which were selected, in order
   *         starting from the model with best performance.
   * @throws Exception if something goes wrong
   */
  public int[] sortInitialize(int num, boolean greedy, Instances instances,
      int metric) throws Exception {
    
    // First, get the performance of each model
    double performance[] = new double[m_bagSize];
    for (int i = 0; i < m_bagSize; ++i) {
      performance[i] = evaluatePredictions(instances, model(i), metric);
    }
    int bestModels[] = new int[num]; // we'll use this to save model info
    // Now sort the models by their performance... note we only need the
    // first "num",
    // so we don't actually bother to sort the whole thing... instead, we
    // pick the num best
    // by running num iterations of selection sort.
    for (int i = 0; i < num; ++i) {
      int max_index = i;
      double max_value = performance[i];
      for (int j = i + 1; j < m_bagSize; ++j) {
	// Find the best model which we haven't already selected
	if (performance[j] > max_value) {
	  max_value = performance[j];
	  max_index = j;
	}
      }
      // Swap ith model in to the ith position (selection sort)
      this.swap(i, max_index);
      // swap performance numbers, too
      double temp_perf = performance[i];
      performance[i] = performance[max_index];
      performance[max_index] = temp_perf;
      
      bestModels[i] = m_modelIndex[i];
      if (!greedy) {
	// If we're not being greedy, we just throw the model in
	// no matter what
	++m_timesChosen[i];
	++m_numChosen;
      }
    }
    // Now the best "num" models are all sorted and in position.
    if (greedy) {
      // If the "greedy" option was specified, do a smart sort
      // initialization
      // that adds models only so long as they help overall performance.
      // This is what was done in the original Caruana paper.
      double[][] tempPredictions = null;
      double bestPerformance = 0.0;
      if (num > 0) {
	++m_timesChosen[0];
	++m_numChosen;
	updateBestTimesChosen();
      }
      for (int i = 1; i < num; ++i) {
	tempPredictions = computePredictions(i, true);
	double metric_value = evaluatePredictions(instances,
	    tempPredictions, metric);
	if (metric_value > bestPerformance) {
	  // If performance improved, update the appropriate info.
	  bestPerformance = metric_value;
	  ++m_timesChosen[i];
	  ++m_numChosen;
	  updateBestTimesChosen();
	} else {
	  // We found a model that doesn't help performance, so we
	  // stop adding models.
	  break;
	}
      }
    }
    updateBestTimesChosen();
    if (m_debug) {
      System.out.println("Sort Initialization added best " + m_numChosen
	  + " models to the bag.");
    }
    return bestModels;
  }
  
  /**
   * Add "weight" to the number of times each model in the bag was chosen.
   * Typically for use with backward elimination.
   * 
   * @param weight	the weight to add
   */
  public void weightAll(int weight) {
    for (int i = 0; i < m_bagSize; ++i) {
      m_timesChosen[i] += weight;
      m_numChosen += weight;
    }
    updateBestTimesChosen();
  }
  
  /**
   * Forward select one model. Will add the model which has the best effect on
   * performance. If replacement is false, and all models are chosen, no
   * action is taken. If a model can be added, one always is (even if it hurts
   * performance).
   * 
   * @param withReplacement
   *            whether a model can be added more than once.
   * @param instances
   *            The dataset, for calculating performance.
   * @param metric
   *            The metric to which we will optimize. See EnsembleMetricHelper
   * @throws Exception if something goes wrong
   */
  public void forwardSelect(boolean withReplacement, Instances instances,
      int metric) throws Exception {
    
    double bestPerformance = -1.0;
    int bestIndex = -1;
    double tempPredictions[][];
    for (int i = 0; i < m_bagSize; ++i) {
      // For each model in the bag
      if ((m_timesChosen[i] == 0) || withReplacement) {
	// If the model has not been chosen, or we're allowing
	// replacement
	// Get the predictions we would have if we add this model to the
	// ensemble
	tempPredictions = computePredictions(i, true);
	// And find out how the hypothetical ensemble would perform.
	double metric_value = evaluatePredictions(instances,
	    tempPredictions, metric);
	if (metric_value > bestPerformance) {
	  // If it's better than our current best, make it our NEW
	  // best.
	  bestIndex = i;
	  bestPerformance = metric_value;
	}
      }
    }
    if (bestIndex == -1) {
      // Replacement must be false, with more hillclimb iterations than
      // models. Do nothing and return.
      if (m_debug) {
	System.out.println("Couldn't add model.  No action performed.");
      }
      return;
    }
    // We picked bestIndex as our best model. Update appropriate info.
    m_timesChosen[bestIndex]++;
    m_numChosen++;
    if (bestPerformance > m_bestPerformance) {
      // We find the peak of our performance over all hillclimb
      // iterations.
      // If this forwardSelect step improved our overall performance,
      // update
      // our best ensemble info.
      updateBestTimesChosen();
      m_bestPerformance = bestPerformance;
    }
  }
  
  /**
   * Find the model whose removal will help the ensemble's performance the
   * most, and remove it. If there is only one model left, we leave it in. If
   * we can remove a model, we always do, even if it hurts performance.
   * 
   * @param instances
   *            The data set, for calculating performance
   * @param metric
   *            Metric to optimize for. See EnsembleMetricHelper.
   * @throws Exception if something goes wrong
   */
  public void backwardEliminate(Instances instances, int metric)
  throws Exception {
    
    // Find the best model to remove. I.e., model for which removal improves
    // performance the most (or hurts it least), and remove it.
    if (m_numChosen <= 1) {
      // If we only have one model left, keep it, as a bag
      // which chooses no models doesn't make much sense.
      return;
    }
    double bestPerformance = -1.0;
    int bestIndex = -1;
    double tempPredictions[][];
    for (int i = 0; i < m_bagSize; ++i) {
      // For each model in the bag
      if (m_timesChosen[i] > 0) {
	// If the model has been chosen at least once,
	// Get the predictions we would have if we remove this model
	tempPredictions = computePredictions(i, false);
	// And find out how the hypothetical ensemble would perform.
	double metric_value = evaluatePredictions(instances,
	    tempPredictions, metric);
	if (metric_value > bestPerformance) {
	  // If it's better than our current best, make it our NEW
	  // best.
	  bestIndex = i;
	  bestPerformance = metric_value;
	}
      }
    }
    if (bestIndex == -1) {
      // The most likely cause of this is that we didn't have any models
      // we could
      // remove. Do nothing & return.
      if (m_debug) {
	System.out
	.println("Couldn't remove model.  No action performed.");
      }
      return;
    }
    // We picked bestIndex as our best model. Update appropriate info.
    m_timesChosen[bestIndex]--;
    m_numChosen--;
    if (m_debug) {
      System.out.println("Removing model " + m_modelIndex[bestIndex]
                                                          + " (" + bestIndex + ") " + bestPerformance);
    }
    if (bestPerformance > m_bestPerformance) {
      // We find the peak of our performance over all hillclimb
      // iterations.
      // If this forwardSelect step improved our overall performance,
      // update
      // our best ensemble info.
      updateBestTimesChosen();
      m_bestPerformance = bestPerformance;
    }
    // return m_model_index[best_index]; //translate to original indexing
    // and return
  }
  
  /**
   * Find the best action to perform, be it adding a model or removing a
   * model, and perform it. Some action is always performed, even if it hurts
   * performance.
   * 
   * @param with_replacement
   *            whether we can add a model more than once
   * @param instances
   *            The dataset, for determining performance.
   * @param metric
   *            The metric for which to optimize. See EnsembleMetricHelper.
   * @throws Exception if something goes wrong
   */
  public void forwardSelectOrBackwardEliminate(boolean with_replacement,
      Instances instances, int metric) throws Exception {
    
    // Find the best action to perform, be it adding a model or removing a
    // model,
    // and do it.
    double bestPerformance = -1.0;
    int bestIndex = -1;
    boolean added = true;
    double tempPredictions[][];
    for (int i = 0; i < m_bagSize; ++i) {
      // For each model in the bag:
      // Try removing the model
      if (m_timesChosen[i] > 0) {
	// If the model has been chosen at least once,
	// Get the predictions we would have if we remove this model
	tempPredictions = computePredictions(i, false);
	// And find out how the hypothetical ensemble would perform.
	double metric_value = evaluatePredictions(instances,
	    tempPredictions, metric);
	if (metric_value > bestPerformance) {
	  // If it's better than our current best, make it our NEW
	  // best.
	  bestIndex = i;
	  bestPerformance = metric_value;
	  added = false;
	}
      }
      if ((m_timesChosen[i] == 0) || with_replacement) {
	// If the model hasn't been chosen, or if we can choose it more
	// than once, try adding it:
	// Get the predictions we would have if we added the model
	tempPredictions = computePredictions(i, true);
	// And find out how the hypothetical ensemble would perform.
	double metric_value = evaluatePredictions(instances,
	    tempPredictions, metric);
	if (metric_value > bestPerformance) {
	  // If it's better than our current best, make it our NEW
	  // best.
	  bestIndex = i;
	  bestPerformance = metric_value;
	  added = true;
	}
      }
    }
    if (bestIndex == -1) {
      // Shouldn't really happen. Possible (I think) if the model bag is
      // empty. Just return.
      if (m_debug) {
	System.out.println("Couldn't add or remove model.  No action performed.");
      }
      return;
    }
    // Now we've found the best change to make:
    // * bestIndex is the (virtual) index of the model we should change
    // * added is true if the model should be added (false if should be
    // removed)
    int changeInWeight = added ? 1 : -1;
    m_timesChosen[bestIndex] += changeInWeight;
    m_numChosen += changeInWeight;
    if (bestPerformance > m_bestPerformance) {
      // We find the peak of our performance over all hillclimb
      // iterations.
      // If this forwardSelect step improved our overall performance,
      // update
      // our best ensemble info.
      updateBestTimesChosen();
      m_bestPerformance = bestPerformance;
    }
  }
  
  /**
   * returns the model weights
   * 
   * @return		the model weights
   */
  public int[] getModelWeights() {
    return m_bestTimesChosen;
  }
  
  /**
   * Returns the "model" at the given virtual index. Here, by "model" we mean
   * its predictions with respect to the validation set. This is just a
   * convenience method, since we use the "virtual" index more than the real
   * one inside this class.
   * 
   * @param index
   *            the "virtual" index - the one for internal use
   * @return the predictions for the model for all validation instances.
   */
  private double[][] model(int index) {
    return m_models[m_modelIndex[index]];
  }
  
  /**
   * Compute predictions based on the current model, adding (or removing) the
   * model at the given (internal) index.
   * 
   * @param index_to_change
   *            index of model we're adding or removing
   * @param add
   *            whether we add it. If false, we remove it.
   * @return the predictions for all validation instances
   */
  private double[][] computePredictions(int index_to_change, boolean add) {
    double[][] predictions = new double[m_models[0].length][m_models[0][0].length];
    for (int i = 0; i < m_bagSize; ++i) {
      if (m_timesChosen[i] > 0) {
	for (int j = 0; j < m_models[0].length; ++j) {
	  for (int k = 0; k < m_models[0][j].length; ++k) {
	    predictions[j][k] += model(i)[j][k] * m_timesChosen[i];
	  }
	}
      }
    }
    for (int j = 0; j < m_models[0].length; ++j) {
      int change = add ? 1 : -1;
      for (int k = 0; k < m_models[0][j].length; ++k) {
	predictions[j][k] += change * model(index_to_change)[j][k];
	predictions[j][k] /= (m_numChosen + change);
      }
    }
    return predictions;
  }
  
  /**
   * Return the performance of the given predictions on the given instances
   * with respect to the given metric (see EnsembleMetricHelper).
   * 
   * @param instances
   *            the validation data
   * @param temp_predictions
   *            the predictions to evaluate
   * @param metric
   *            the metric for which to optimize (see EnsembleMetricHelper)
   * @return the performance
   * @throws Exception if something goes wrong
   */
  private double evaluatePredictions(Instances instances,
      double[][] temp_predictions, int metric) throws Exception {
    
    Evaluation eval = new Evaluation(instances);
    for (int i = 0; i < instances.numInstances(); ++i) {
      eval.evaluateModelOnceAndRecordPrediction(temp_predictions[i],
	  instances.instance(i));
    }
    return EnsembleMetricHelper.getMetric(eval, metric);
  }
  
  /**
   * Gets the individual performances of all the models in the bag.
   * 
   * @param instances
   *            The validation data, for which we want performance.
   * @param metric
   *            The desired metric (see EnsembleMetricHelper).
   * @return the performance
   * @throws Exception if something goes wrong
   */
  public double[] getIndividualPerformance(Instances instances, int metric)
    throws Exception {
    
    double[] performance = new double[m_bagSize];
    for (int i = 0; i < m_bagSize; ++i) {
      performance[i] = evaluatePredictions(instances, model(i), metric);
    }
    return performance;
  }
  
  /**
   * Returns the revision string.
   * 
   * @return		the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 1.2 $");
  }
}