GreedyLabelClustering.java example

Explorer
sad-analyzer-master
- SADAnalyzer
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    GreedyLabelClustering.java
 *    Copyright (C) 2009-2010 Aristotle University of Thessaloniki, Thessaloniki, Greece
 */
package mulan.data;

import java.io.Serializable;
import java.util.Arrays;
import java.util.HashMap;

import mulan.classifier.MultiLabelLearner;
import mulan.classifier.meta.SubsetLearner;
import mulan.evaluation.Evaluator;
import mulan.evaluation.MultipleEvaluation;
import mulan.evaluation.measure.Measure;
import mulan.evaluation.measure.SubsetAccuracy;
import weka.classifiers.Classifier;

/**
 * A class for clustering dependent label pairs into disjoint subsets. <br>
 * <br>
 * The type of the learned dependencies is determined by the
 * {@link mulan.data.LabelPairsDependenceIdentifier} supplied to the class constructor. The
 * clustering process is straightforward: initially all labels are assumed to be independent. Then
 * label pairs are grouped in the order in which the identifier returns them (expected to be from
 * most to least dependent). A SubsetLearner is built for each new partition and its accuracy is
 * evaluated in terms of the {@link #measure}. The process of grouping labels continues as long as
 * the accuracy improves (or at least is not reduced). Up to {@link #allowedNonImprovementSteps}
 * consecutive steps that reduce the accuracy are tolerated. Such "non-useful" partitions are
 * discarded and the algorithm continues to evaluate subsequent pairs of dependent labels until
 * one of the stop conditions is reached. The possible stop conditions are: <br>
 * - no more label pairs to consider; <br>
 * - all labels are clustered into one single group; <br>
 * - pair dependence score is below the specified {@link #criticalValue}; <br>
 * - the number of {@link #allowedNonImprovementSteps} is exceeded. <br>
 * <br>
 * Progress messages are written to {@code System.out}.
 * 
 * @author Lena Chekina (lenat@bgu.ac.il)
 * @version 05.05.2011
 */
public class GreedyLabelClustering implements LabelClustering, Serializable {
	/** Classifier that will be used for single label training and predictions */
	private Classifier singleLabelLearner;
	/** Classifier that will be used for multi-label training and predictions */
	private MultiLabelLearner multiLabelLearner;
	/** Defines the type of dependence identification process. */
	private LabelPairsDependenceIdentifier depLabelsIdentifier;
	/**
	 * Critical value below which the label pairs are considered independent. If set to 0 - the
	 * critical value returned by
	 * {@link mulan.data.LabelPairsDependenceIdentifier#getCriticalValue()} is used.
	 */
	private double criticalValue = 0;
	/** Number of folds for evaluation of SubsetLearner models */
	private int numFolds = 10;
	/** Number of allowed consecutive steps with reduced accuracy */
	private int allowedNonImprovementSteps = 10;
	/** Measure by which models are compared */
	private Measure measure = new SubsetAccuracy();
	/** Enable SubsetLearner caching mechanism */
	private boolean useSubsetLearnerCache = true;
	/** Enable debug output of the internal SubsetLearner */
	private boolean internalSubsetLearnerDebug = true;

	/**
	 * Initialize the GreedyLabelClustering with multilabel and single label learners and a method
	 * for labels dependence identification.
	 * 
	 * @param aMultiLabelLearner - a learner for multilabel classification
	 * @param aSingleLabelLearner - a learner for single label classification
	 * @param dependenceIdentifier - a method for label pairs dependence identification
	 */
	public GreedyLabelClustering(MultiLabelLearner aMultiLabelLearner,
			Classifier aSingleLabelLearner, LabelPairsDependenceIdentifier dependenceIdentifier) {
		multiLabelLearner = aMultiLabelLearner;
		depLabelsIdentifier = dependenceIdentifier;
		singleLabelLearner = aSingleLabelLearner;
	}

	/**
	 * Determines labels partitioning into dependent sets. It clusters label pairs according to
	 * their dependence score and evaluates the related models. The clustering process continues as
	 * long as the accuracy improves (or stays equal). The finally selected labels partition is
	 * returned. <br>
	 * <br>
	 * Side effect: if {@link #criticalValue} is 0 when this method is called, it is replaced by
	 * the critical value of the dependence identifier and stays set after the call.
	 * 
	 * @param trainingSet the training data set
	 * @return the selected partition; each inner array holds the labels of one group
	 * @throws IllegalArgumentException if a label pair returned by the dependence identifier
	 *             refers to a label that is not part of the current partition
	 */
	@Override
	public int[][] determineClusters(MultiLabelInstances trainingSet) {
		// only the keys are used: they record which partitions were already evaluated
		HashMap<String, int[][]> evaluatedSubsets = new HashMap<String, int[][]>();
		Evaluator eval = new Evaluator();
		if (criticalValue == 0) { // 0 means "not configured" -> fall back to the identifier
			criticalValue = depLabelsIdentifier.getCriticalValue();
		}
		// compute dependency level between all label pairs
		LabelsPair[] labelPairs = depLabelsIdentifier.calculateDependence(trainingSet);
		// build initial combination set (each label in a separate group)
		int[][] currClusters = buildInitialSet(trainingSet.getNumLabels());
		String currSubsetsStr = partitionToString(currClusters);
		System.out.println("Evaluating initial model: " + currSubsetsStr);
		SubsetLearner currClassif = new SubsetLearner(currClusters, multiLabelLearner,
				singleLabelLearner);
		currClassif.setDebug(internalSubsetLearnerDebug);
		currClassif.setUseCache(useSubsetLearnerCache);
		// cross-validate initial combination
		double currAcc = crossValidatedScore(eval, currClassif, trainingSet);
		System.out.println("Model's " + measure.getName() + " = " + currAcc);
		evaluatedSubsets.put(currSubsetsStr, currClusters);
		int noImprovementCounter = 0;

		// take next labels pair, create new combination and build a model
		for (LabelsPair pair : labelPairs) {
			double score = pair.getScore();
			if (score < criticalValue) {
				System.out.println("Pairs dependence score: " + score
						+ " is below the criticalValue: " + criticalValue
						+ ". Stop the clustering process!");
				break; // stop the process
			}
			if (noImprovementCounter > allowedNonImprovementSteps) {
				System.out.println("noImprovementCounter: " + noImprovementCounter
						+ " is above the allowed: " + allowedNonImprovementSteps
						+ ". Stop the clustering process!");
				break; // stop the process
			}
			if (currClusters.length == 1) {
				System.out
						.println("All labels are in the same group. Stop the clustering process!");
				break; // no more combinations possible - stop the process
			}
			// construct new label set partition
			int[][] newClusters = buildCombinationSet(currClusters, pair.getPair());
			for (int[] newCluster : newClusters) { // sort the labels within each group
				Arrays.sort(newCluster);
			}
			String newSubsetsStr = partitionToString(newClusters);
			if (evaluatedSubsets.containsKey(newSubsetsStr)) {
				// already evaluated (this includes "pair already in one group") -> skip; such
				// steps do not touch the non-improvement counter
				continue;
			}
			System.out.println("Evaluating model:" + newSubsetsStr);
			currClassif.resetSubsets(newClusters);
			// cross-validate new model
			double newAcc = crossValidatedScore(eval, currClassif, trainingSet);
			evaluatedSubsets.put(newSubsetsStr, newClusters);
			System.out.println("Model's " + measure.getName() + " = " + newAcc);
			if (newAcc >= currAcc) { // make a decision
				currClusters = newClusters; // accept the new partition
				currAcc = newAcc;
				noImprovementCounter = 0; // and reset the counter
			} else {
				noImprovementCounter++;
			}
		}
		System.out.println("Returning  the final labels partition: "
				+ partitionToString(currClusters) + '\n');
		return currClusters;
	}

	/**
	 * Cross-validates the learner with {@link #numFolds} folds and returns the mean of the
	 * configured {@link #measure}.
	 * 
	 * @param eval the evaluator performing the cross-validation
	 * @param learner the learner to evaluate
	 * @param data the data set to cross-validate on
	 * @return the mean value reported for the measure's name
	 */
	private double crossValidatedScore(Evaluator eval, SubsetLearner learner,
			MultiLabelInstances data) {
		MultipleEvaluation results = eval.crossValidate(learner, data, numFolds);
		results.calculateStatistics();
		return results.getMean(measure.getName());
	}

	/**
	 * Returns a string representation of the labels partition. Every group is rendered with
	 * {@link Arrays#toString(int[])} and followed by {@code ", "}, so the result ends with a
	 * trailing separator, e.g. {@code "[0], [1, 2], "}.
	 * 
	 * @param partition - a label set partition
	 * @return a string representation of the labels partition
	 */
	public static String partitionToString(int[][] partition) {
		StringBuilder result = new StringBuilder();
		for (int[] aGroup : partition) {
			result.append(Arrays.toString(aGroup)).append(", ");
		}
		return result.toString();
	}

	/**
	 * Build initial label set partition - each label in a separate group. For example for
	 * numLabels=4 , it returns an array {{0},{1},{2},{3}}
	 * 
	 * @param numLabels number of labels in the trainingSet
	 * @return two dimensional array of size numLabels, when each inner array is of size 1
	 */
	private static int[][] buildInitialSet(int numLabels) {
		int[][] res = new int[numLabels][1];
		for (int i = 0; i < numLabels; i++) {
			res[i][0] = i;
		}
		return res;
	}

	/**
	 * Clusters a new pair of labels and integrates the new group into the given labels partition.
	 * The two groups holding the labels of the pair are merged; the merged group takes the
	 * position of the group with the smaller index and the other group is removed. The given
	 * partition array is not modified; unchanged groups are shared with the returned partition.
	 * 
	 * @param partition - label set partition
	 * @param pair - labels pair
	 * @return a new partition with clustered labels of the pair, or the given partition itself if
	 *         both labels are already in the same group
	 * @throws IllegalArgumentException if a label of the pair is not contained in the partition
	 */
	private static int[][] buildCombinationSet(int[][] partition, int[] pair) {
		int i1 = -1;
		int i2 = -1;
		// identify indexes of the groups holding the pair values
		for (int i = 0; i < partition.length; i++) {
			for (int j = 0; j < partition[i].length; j++) {
				if (partition[i][j] == pair[0]) {
					i1 = i;
				}
				if (partition[i][j] == pair[1]) {
					i2 = i;
				}
			}
		}
		if (i1 < 0 || i2 < 0) {
			// without this check a single missing label would silently delete the group of the
			// other label from the result
			throw new IllegalArgumentException("Labels pair " + Arrays.toString(pair)
					+ " refers to a label that is not in the partition: "
					+ partitionToString(partition));
		}
		if (i1 == i2) { // both labels already in the same set -> there is no change
			return partition;
		}
		int keep = Math.min(i1, i2); // the merged group stays at the first occurrence
		int drop = Math.max(i1, i2); // this group is absorbed and removed
		int[] merged = new int[partition[keep].length + partition[drop].length];
		System.arraycopy(partition[keep], 0, merged, 0, partition[keep].length);
		System.arraycopy(partition[drop], 0, merged, partition[keep].length,
				partition[drop].length);

		int[][] newClusters = new int[partition.length - 1][];
		// copy the groups before the removed one ...
		System.arraycopy(partition, 0, newClusters, 0, drop);
		// ... and move all groups appearing after it into one index smaller
		System.arraycopy(partition, drop + 1, newClusters, drop, newClusters.length - drop);
		newClusters[keep] = merged; // keep < drop, so its index did not shift
		return newClusters;
	}

	/**
	 * @return the number of folds used for cross-validation of SubsetLearner models
	 */
	public int getNumFolds() {
		return numFolds;
	}

	/**
	 * @param numFolds the number of folds used for cross-validation of SubsetLearner models
	 */
	public void setNumFolds(int numFolds) {
		this.numFolds = numFolds;
	}

	/**
	 * @return the measure by which models are compared
	 */
	public Measure getMeasure() {
		return measure;
	}

	/**
	 * @param measure the measure by which models are compared
	 */
	public void setMeasure(Measure measure) {
		this.measure = measure;
	}

	/**
	 * @return the number of allowed consecutive steps with reduced accuracy
	 */
	public int getAllowedNonImprovementSteps() {
		return allowedNonImprovementSteps;
	}

	/**
	 * @param allowedNonImprovementSteps the number of allowed consecutive steps with reduced
	 *            accuracy
	 */
	public void setAllowedNonImprovementSteps(int allowedNonImprovementSteps) {
		this.allowedNonImprovementSteps = allowedNonImprovementSteps;
	}

	/**
	 * @return the critical value below which label pairs are considered independent; 0 means that
	 *         the value of the dependence identifier will be used
	 */
	public double getCriticalValue() {
		return criticalValue;
	}

	/**
	 * @param criticalValue the critical value below which label pairs are considered independent;
	 *            0 selects the value of the dependence identifier
	 */
	public void setCriticalValue(double criticalValue) {
		this.criticalValue = criticalValue;
	}

	/**
	 * @return the classifier used for single label training and predictions
	 */
	public Classifier getSingleLabelLearner() {
		return singleLabelLearner;
	}

	/**
	 * @return the learner used for multi-label training and predictions
	 */
	public MultiLabelLearner getMultiLabelLearner() {
		return multiLabelLearner;
	}

	/**
	 * @return whether the SubsetLearner caching mechanism is enabled
	 */
	public boolean isUseSubsetLearnerCache() {
		return useSubsetLearnerCache;
	}

	/**
	 * @param useSubsetLearnerCache whether to enable the SubsetLearner caching mechanism
	 */
	public void setUseSubsetLearnerCache(boolean useSubsetLearnerCache) {
		this.useSubsetLearnerCache = useSubsetLearnerCache;
	}

	/**
	 * @return whether debug output of the internal SubsetLearner is enabled
	 */
	public boolean isInternalSubsetLearnerDebug() {
		return internalSubsetLearnerDebug;
	}

	/**
	 * @param internalSubsetLearnerDebug whether to enable debug output of the internal
	 *            SubsetLearner
	 */
	public void setInternalSubsetLearnerDebug(boolean internalSubsetLearnerDebug) {
		this.internalSubsetLearnerDebug = internalSubsetLearnerDebug;
	}
}