/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* ModelBag.java
* Copyright (C) 2006 David Michael
*
*/
package weka.classifiers.meta.ensembleSelection;
import weka.classifiers.Evaluation;
import weka.core.Instances;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import java.util.Random;
/**
 * This class is responsible for the duties of a bag of models. It is designed
 * for use with the EnsembleSelection meta classifier. It handles shuffling the
 * models, doing sort initialization, performing forward selection/backward
 * elimination, etc.
 * <p>
 * We utilize a simple "virtual indexing" scheme inside. If we shuffle and/or
 * sort the models, we change the "virtual" order around. The elements of the
 * bag are always those elements with virtual index 0..(m_bagSize-1). Each
 * "virtual" index maps to some real index in m_models. Not every model in
 * m_models belongs to the bag... the first m_bagSize virtual indexes are what
 * define the subset of models of which our bag is composed. This makes it easy
 * to refer to models in the bag, by their virtual index, while maintaining the
 * original indexing for our clients.
 * <p>
 * Throughout this class a larger metric value is treated as better.
 *
 * @author David Michael
 * @version $Revision: 1.2 $
 */
public class ModelBag
  implements RevisionHandler {

  /**
   * The "models", as a multidimensional array of predictions for the
   * validation set. The first index is the model index, the second index is
   * the index of the instance, and the third is the typical "class" index for
   * a prediction's distribution. This is given to us in the constructor, and
   * we never change it.
   */
  private final double[][][] m_models;

  /**
   * Maps each model in our virtual indexing scheme to its original index as
   * it is in m_models. The first m_bagSize elements here are considered our
   * bag. Throughout the code, we use the index in to this array to refer to a
   * model. When we shuffle the models, we really simply shuffle this array.
   * That is, if j = m_modelIndex[i], then m_models[j] is the model referred to
   * by "virtual index" i. Models can easily be accessed by their virtual index
   * using the "model()" method.
   */
  private final int[] m_modelIndex;

  /**
   * The number of models in our bag. 1 <= m_bagSize <= m_models.length
   * (enforced by the constructor).
   */
  private final int m_bagSize;

  /**
   * The total number of models chosen thus far for this bag, i.e. the sum of
   * the weights of the models in the bag. This value is the divisor used when
   * averaging the predictions of the bag (see computePredictions).
   */
  private int m_numChosen;

  /**
   * The number of times each model has been chosen. Also can be thought of as
   * the weight for each model. Indexed by the "virtual index".
   */
  private final int[] m_timesChosen;

  /**
   * If true, print out debug information.
   */
  private final boolean m_debug;

  /**
   * The best performance recorded thus far by a hillclimbing step on this
   * bag. It lets us go back, after all hillclimbing is completed, to the best
   * ensemble that we encountered during hillclimbing.
   */
  private double m_bestPerformance;

  /**
   * The weights of the best ensemble recorded thus far (see
   * updateBestTimesChosen). This array, unlike m_timesChosen, uses the
   * original indexing as taken from m_models, which is the format handed out
   * by getModelWeights. It is always replaced as a whole and never shares
   * storage with m_timesChosen.
   */
  private int[] m_bestTimesChosen;

  /**
   * Constructor for ModelBag.
   *
   * @param models
   *          The complete set of models from which to draw our bag. First
   *          index is for the model, second is for the instance. The last
   *          is a prediction distribution for that instance. Models are
   *          represented by this array of predictions for validation data,
   *          since that's all ensemble selection needs to know.
   * @param bag_percent
   *          The fraction of the given models that should be used in the
   *          bag (the bag size is models.length * bag_percent, truncated).
   *          The resulting size is clamped to the range 1..models.length so
   *          that the bag is never empty and never larger than the set of
   *          models.
   * @param debug
   *          Whether the ModelBag should print debug information.
   * @throws IllegalArgumentException if no model is given
   */
  public ModelBag(double models[][][], double bag_percent, boolean debug) {
    if (models.length == 0) {
      throw new IllegalArgumentException(
        "ModelBag needs at least 1 model.");
    }
    m_debug = debug;
    m_models = models;

    // Clamp so that an out-of-range bag_percent can neither produce an empty
    // bag nor make later loops run past the end of our arrays.
    int requestedSize = (int) ((double) models.length * bag_percent);
    m_bagSize = Math.max(1, Math.min(models.length, requestedSize));

    // Prepare our virtual indexing scheme. Initially, the indexes are
    // the same as the original.
    m_modelIndex = new int[models.length];
    for (int i = 0; i < m_modelIndex.length; ++i) {
      m_modelIndex[i] = i;
    }

    // Initially, no models are chosen (new int arrays are zero-filled). The
    // best weights get their own array: they use the real indexing, whereas
    // m_timesChosen uses the virtual one, so the two must never be aliased.
    m_timesChosen = new int[models.length];
    m_bestTimesChosen = new int[models.length];
    m_bestPerformance = 0.0;
    m_numChosen = 0;
  }

  /**
   * Swap model at virtual index i with model at virtual index j. This is used
   * to shuffle the models. We do not change m_models, only the arrays which
   * use the virtual indexing; m_modelIndex and m_timesChosen.
   *
   * @param i first index
   * @param j second index
   */
  private void swap(int i, int j) {
    if (i == j) {
      return;
    }
    int tempIndex = m_modelIndex[i];
    m_modelIndex[i] = m_modelIndex[j];
    m_modelIndex[j] = tempIndex;

    int tempWeight = m_timesChosen[i];
    m_timesChosen[i] = m_timesChosen[j];
    m_timesChosen[j] = tempWeight;
  }

  /**
   * Shuffle the models. The order in m_models is preserved, but we change our
   * virtual indexes around. Every virtual position is swapped with a randomly
   * drawn different position. The sequence of random draws is deliberately
   * kept exactly as it always was, so that results for a given seed do not
   * change.
   *
   * @param rand the random number generator to use
   */
  public void shuffle(Random rand) {
    int numModels = m_models.length;
    if (numModels < 2) {
      return;
    }
    for (int i = 0; i < numModels; ++i) {
      // Draw from the numModels - 1 "other" positions, skipping i itself.
      int swapIndex = rand.nextInt(numModels - 1);
      if (swapIndex >= i) {
        ++swapIndex; // don't swap with itself
      }
      swap(i, swapIndex);
    }
  }

  /**
   * Convert an array of weights using virtual indices to a new array of
   * weights using real indices.
   *
   * @param virtual_weights the weights, indexed by virtual index
   * @return a new array holding the same weights, indexed by real index
   */
  private int[] virtualToRealWeights(int virtual_weights[]) {
    int[] realWeights = new int[virtual_weights.length];
    for (int i = 0; i < realWeights.length; ++i) {
      realWeights[m_modelIndex[i]] = virtual_weights[i];
    }
    return realWeights;
  }

  /**
   * Remember the current weights as the best ensemble seen so far, converted
   * to the real indexing.
   */
  private void updateBestTimesChosen() {
    m_bestTimesChosen = virtualToRealWeights(m_timesChosen);
  }

  /**
   * If the given performance beats the best one recorded so far, remember
   * the current weights as the best ensemble together with that performance.
   *
   * @param performance the performance of the ensemble as it is right now
   */
  private void recordIfBest(double performance) {
    if (performance > m_bestPerformance) {
      updateBestTimesChosen();
      m_bestPerformance = performance;
    }
  }

  /**
   * Evaluate the hypothetical ensemble obtained by adding the given model to,
   * or removing it from, the current ensemble. The ensemble itself is not
   * modified.
   *
   * @param index virtual index of the model to add or remove
   * @param add true to add the model, false to remove it
   * @param instances the validation data
   * @param metric the metric to evaluate (see EnsembleMetricHelper)
   * @return the performance of the hypothetical ensemble
   * @throws Exception if something goes wrong
   */
  private double evaluateChange(int index, boolean add, Instances instances,
    int metric) throws Exception {
    return evaluatePredictions(instances, computePredictions(index, add),
      metric);
  }

  /**
   * Evaluate every model of the bag on its own.
   *
   * @param instances the validation data
   * @param metric the metric to evaluate (see EnsembleMetricHelper)
   * @return the performance of each model, indexed by virtual index
   * @throws Exception if something goes wrong
   */
  private double[] individualPerformance(Instances instances, int metric)
    throws Exception {
    double[] performance = new double[m_bagSize];
    for (int i = 0; i < m_bagSize; ++i) {
      performance[i] = evaluatePredictions(instances, model(i), metric);
    }
    return performance;
  }

  /**
   * Sort initialize the bag.
   *
   * @param num
   *          the maximum number of models to initialize with. Values larger
   *          than the bag size are treated as the bag size, negative values
   *          as 0.
   * @param greedy
   *          True if we do greedy addition, up to num. Greedy sort
   *          initialization adds models (up to num) in order of best to
   *          worst performance until performance no longer improves.
   * @param instances
   *          the data set (needed for performance evaluation)
   * @param metric
   *          metric for which to optimize. See EnsembleMetricHelper
   * @return the original indexes (as in the array given to the constructor)
   *         of the best models of the bag, starting from the model with best
   *         performance. In greedy mode this lists the candidates that were
   *         sorted in to position, whether or not all of them were added.
   * @throws Exception if something goes wrong
   */
  public int[] sortInitialize(int num, boolean greedy, Instances instances,
    int metric) throws Exception {
    // We cannot initialize with more models than the bag holds.
    int limit = Math.max(0, Math.min(num, m_bagSize));

    // First, get the performance of each model
    double[] performance = individualPerformance(instances, metric);
    int[] bestModels = new int[limit];

    // Now sort the models by their performance... note we only need the
    // first "limit", so we don't bother to sort the whole thing... instead,
    // we pick the best ones by running that many iterations of selection
    // sort.
    for (int i = 0; i < limit; ++i) {
      // Find the best model which we haven't already selected
      int maxIndex = i;
      for (int j = i + 1; j < m_bagSize; ++j) {
        if (performance[j] > performance[maxIndex]) {
          maxIndex = j;
        }
      }
      // Swap it in to the ith position, performance numbers included
      swap(i, maxIndex);
      double tempPerformance = performance[i];
      performance[i] = performance[maxIndex];
      performance[maxIndex] = tempPerformance;

      bestModels[i] = m_modelIndex[i];
      if (!greedy) {
        // If we're not being greedy, we just throw the model in
        // no matter what
        ++m_timesChosen[i];
        ++m_numChosen;
      }
    }

    // Now the best models are all sorted and in position.
    if (greedy && limit > 0) {
      // Do a smart sort initialization that adds models only so long as they
      // help overall performance. The best model is always added; the
      // performance of the ensemble right after adding it is the baseline
      // every further addition has to beat. For a bag with nothing chosen
      // yet that ensemble is just the best model, whose performance we
      // already know.
      double bestPerformance = (m_numChosen == 0)
        ? performance[0]
        : evaluateChange(0, true, instances, metric);
      ++m_timesChosen[0];
      ++m_numChosen;

      for (int i = 1; i < limit; ++i) {
        double metricValue = evaluateChange(i, true, instances, metric);
        if (metricValue > bestPerformance) {
          // Performance improved, so keep the model.
          bestPerformance = metricValue;
          ++m_timesChosen[i];
          ++m_numChosen;
        } else {
          // We found a model that doesn't help performance, so we
          // stop adding models.
          break;
        }
      }
    }

    // Whatever was chosen above is our best ensemble so far.
    updateBestTimesChosen();
    if (m_debug) {
      System.out.println("Sort Initialization added best " + m_numChosen
        + " models to the bag.");
    }
    return bestModels;
  }

  /**
   * Add "weight" to the number of times each model in the bag was chosen.
   * Typically for use with backward elimination.
   *
   * @param weight the weight to add
   */
  public void weightAll(int weight) {
    for (int i = 0; i < m_bagSize; ++i) {
      m_timesChosen[i] += weight;
      m_numChosen += weight;
    }
    updateBestTimesChosen();
  }

  /**
   * Forward select one model. Will add the model which has the best effect on
   * performance. If replacement is false, and all models are chosen, no
   * action is taken. If a model can be added, one always is (even if it hurts
   * performance).
   *
   * @param withReplacement
   *          whether a model can be added more than once.
   * @param instances
   *          The dataset, for calculating performance.
   * @param metric
   *          The metric to which we will optimize. See EnsembleMetricHelper
   * @throws Exception if something goes wrong
   */
  public void forwardSelect(boolean withReplacement, Instances instances,
    int metric) throws Exception {
    // Start below every possible metric value so that the best candidate is
    // taken however badly it performs.
    double bestPerformance = Double.NEGATIVE_INFINITY;
    int bestIndex = -1;

    for (int i = 0; i < m_bagSize; ++i) {
      // A model is a candidate if it has not been chosen yet, or if we're
      // allowing replacement.
      if (withReplacement || m_timesChosen[i] == 0) {
        double metricValue = evaluateChange(i, true, instances, metric);
        if (metricValue > bestPerformance) {
          bestIndex = i;
          bestPerformance = metricValue;
        }
      }
    }

    if (bestIndex == -1) {
      // Replacement must be false, with more hillclimb iterations than
      // models. Do nothing and return.
      if (m_debug) {
        System.out.println("Couldn't add model. No action performed.");
      }
      return;
    }

    // We picked bestIndex as our best model. Update appropriate info.
    m_timesChosen[bestIndex]++;
    m_numChosen++;
    // Keep track of the peak of our performance over all hillclimb
    // iterations.
    recordIfBest(bestPerformance);
  }

  /**
   * Find the model whose removal will help the ensemble's performance the
   * most, and remove it. If there is only one model left, we leave it in. If
   * we can remove a model, we always do, even if it hurts performance.
   *
   * @param instances
   *          The data set, for calculating performance
   * @param metric
   *          Metric to optimize for. See EnsembleMetricHelper.
   * @throws Exception if something goes wrong
   */
  public void backwardEliminate(Instances instances, int metric)
    throws Exception {
    if (m_numChosen <= 1) {
      // If we only have one model left, keep it, as a bag
      // which chooses no models doesn't make much sense.
      return;
    }

    // Find the model for which removal improves performance the most (or
    // hurts it least).
    double bestPerformance = Double.NEGATIVE_INFINITY;
    int bestIndex = -1;
    for (int i = 0; i < m_bagSize; ++i) {
      // Only models chosen at least once can be removed.
      if (m_timesChosen[i] > 0) {
        double metricValue = evaluateChange(i, false, instances, metric);
        if (metricValue > bestPerformance) {
          bestIndex = i;
          bestPerformance = metricValue;
        }
      }
    }

    if (bestIndex == -1) {
      // The most likely cause of this is that we didn't have any models
      // we could remove. Do nothing & return.
      if (m_debug) {
        System.out
          .println("Couldn't remove model. No action performed.");
      }
      return;
    }

    // We picked bestIndex as our best model. Update appropriate info.
    m_timesChosen[bestIndex]--;
    m_numChosen--;
    if (m_debug) {
      System.out.println("Removing model " + m_modelIndex[bestIndex]
        + " (" + bestIndex + ") " + bestPerformance);
    }
    // Keep track of the peak of our performance over all hillclimb
    // iterations.
    recordIfBest(bestPerformance);
  }

  /**
   * Find the best action to perform, be it adding a model or removing a
   * model, and perform it. If any action is possible, one is always
   * performed, even if it hurts performance. A model is never removed when
   * that would leave the ensemble without any model.
   *
   * @param with_replacement
   *          whether we can add a model more than once
   * @param instances
   *          The dataset, for determining performance.
   * @param metric
   *          The metric for which to optimize. See EnsembleMetricHelper.
   * @throws Exception if something goes wrong
   */
  public void forwardSelectOrBackwardEliminate(boolean with_replacement,
    Instances instances, int metric) throws Exception {
    double bestPerformance = Double.NEGATIVE_INFINITY;
    int bestIndex = -1;
    boolean added = true;

    // Removing the last chosen model would average over zero models (a
    // division by zero in computePredictions), so removal is only an option
    // while more than one model is chosen.
    boolean canRemove = m_numChosen > 1;

    for (int i = 0; i < m_bagSize; ++i) {
      // Try removing the model, if it has been chosen at least once
      if (canRemove && m_timesChosen[i] > 0) {
        double metricValue = evaluateChange(i, false, instances, metric);
        if (metricValue > bestPerformance) {
          bestIndex = i;
          bestPerformance = metricValue;
          added = false;
        }
      }
      // Try adding the model, if it hasn't been chosen or if we can choose
      // it more than once
      if (with_replacement || m_timesChosen[i] == 0) {
        double metricValue = evaluateChange(i, true, instances, metric);
        if (metricValue > bestPerformance) {
          bestIndex = i;
          bestPerformance = metricValue;
          added = true;
        }
      }
    }

    if (bestIndex == -1) {
      // No model could be added or removed. Just return.
      if (m_debug) {
        System.out.println("Couldn't add or remove model. No action performed.");
      }
      return;
    }

    // Now we've found the best change to make:
    // * bestIndex is the (virtual) index of the model we should change
    // * added is true if the model should be added (false if should be
    // removed)
    int changeInWeight = added ? 1 : -1;
    m_timesChosen[bestIndex] += changeInWeight;
    m_numChosen += changeInWeight;
    // Keep track of the peak of our performance over all hillclimb
    // iterations.
    recordIfBest(bestPerformance);
  }

  /**
   * Returns the weights of the best ensemble recorded so far.
   *
   * @return a copy of the model weights, indexed like the array of models
   *         given to the constructor
   */
  public int[] getModelWeights() {
    return (int[]) m_bestTimesChosen.clone();
  }

  /**
   * Returns the "model" at the given virtual index. Here, by "model" we mean
   * its predictions with respect to the validation set. This is just a
   * convenience method, since we use the "virtual" index more than the real
   * one inside this class.
   *
   * @param index
   *          the "virtual" index - the one for internal use
   * @return the predictions for the model for all validation instances.
   */
  private double[][] model(int index) {
    return m_models[m_modelIndex[index]];
  }

  /**
   * Compute the predictions of the current ensemble after adding (or
   * removing) the model at the given (internal) index: the sum of the
   * predictions of all chosen models, each multiplied by its weight, plus or
   * minus the predictions of the changed model, divided by the resulting
   * total weight. The ensemble itself is not modified. The caller must make
   * sure the resulting total weight is not zero.
   *
   * @param index_to_change
   *          index of model we're adding or removing
   * @param add
   *          whether we add it. If false, we remove it.
   * @return the predictions for all validation instances
   */
  private double[][] computePredictions(int index_to_change, boolean add) {
    // The first model determines the dimensions used for every model.
    double[][] reference = m_models[0];
    int numInstances = reference.length;
    double[][] predictions = new double[numInstances][reference[0].length];

    // Weighted sum over the models currently chosen
    for (int i = 0; i < m_bagSize; ++i) {
      int weight = m_timesChosen[i];
      if (weight > 0) {
        double[][] member = model(i);
        for (int j = 0; j < numInstances; ++j) {
          for (int k = 0; k < reference[j].length; ++k) {
            predictions[j][k] += member[j][k] * weight;
          }
        }
      }
    }

    // Apply the hypothetical change and turn the sum in to an average
    int change = add ? 1 : -1;
    int totalWeight = m_numChosen + change;
    double[][] changed = model(index_to_change);
    for (int j = 0; j < numInstances; ++j) {
      for (int k = 0; k < reference[j].length; ++k) {
        predictions[j][k] += change * changed[j][k];
        predictions[j][k] /= totalWeight;
      }
    }
    return predictions;
  }

  /**
   * Return the performance of the given predictions on the given instances
   * with respect to the given metric (see EnsembleMetricHelper).
   *
   * @param instances
   *          the validation data
   * @param temp_predictions
   *          the predictions to evaluate, one distribution per instance, in
   *          the order of the instances
   * @param metric
   *          the metric for which to optimize (see EnsembleMetricHelper)
   * @return the performance
   * @throws Exception if something goes wrong
   */
  private double evaluatePredictions(Instances instances,
    double[][] temp_predictions, int metric) throws Exception {
    Evaluation eval = new Evaluation(instances);
    int numInstances = instances.numInstances();
    for (int i = 0; i < numInstances; ++i) {
      eval.evaluateModelOnceAndRecordPrediction(temp_predictions[i],
        instances.instance(i));
    }
    return EnsembleMetricHelper.getMetric(eval, metric);
  }

  /**
   * Gets the individual performances of all the models in the bag.
   *
   * @param instances
   *          The validation data, for which we want performance.
   * @param metric
   *          The desired metric (see EnsembleMetricHelper).
   * @return the performance of each model in the bag, in the bag's current
   *         (virtual) order
   * @throws Exception if something goes wrong
   */
  public double[] getIndividualPerformance(Instances instances, int metric)
    throws Exception {
    return individualPerformance(instances, metric);
  }

  /**
   * Returns the revision string.
   *
   * @return the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 1.2 $");
  }
}