package mulan.data; import weka.classifiers.AbstractClassifier; import weka.classifiers.Classifier; import weka.classifiers.meta.FilteredClassifier; import weka.core.Instances; import weka.filters.unsupervised.attribute.Remove; import java.io.Serializable; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.Random; import java.util.logging.Level; import java.util.logging.Logger; /** * A class for identification of conditional dependence between each pair of labels. The conditional dependence between each pair of labels * is estimated by evaluating the advantage gained from exploiting this dependence for binary classification of each one of the labels. * Following the definition of conditional independence, for two conditionally independent labels, predictions of a label by probability-based classification * models trained once on a regular features space and second on the features space augmented by the second label should be at least very similar. For this * estimation two binary classifiers are trained and their accuracy is estimated using k-fold cross-validation. If the accuracy of the model trained on the * features space augmented by the second label is significantly higher, the labels are considered conditionally dependent. The statistical significance of * the difference between both classifiers is determined using a paired t-test. This procedure is performed for all possible label pairs considering the label * order in the pair . Among the two pairs with the same labels, the pair with maximal t-statistic value is added to the resulting list of dependent pairs. Finally, * the resultant label pairs are sorted according to their t-statistic value in descending order (i.e., from the most to the least dependent pairs). * * @author Lena Chekina (lenat@bgu.ac.il) * @version 30.11.2010 */ public class ConditionalDependenceIdentifier implements LabelPairsDependenceIdentifier, Serializable { /** A default t-critical value, corresponds to significance level 0.01. Label pairs with dependence value below the critical are considered as independent.*/ private double criticalValue = 3.25; /** A single-label classifier used to perform dependence test between labels. */ private Classifier baseLearner; /** Number of folds used for cross validation. */ private int numFolds = 10; /** Seed for replication of random experiments*/ protected int seed; /** A caching mechanism for reusing once constructed models. */ private static HashMap<String, FilteredClassifier> existingModels=null; /** * Initializes a single-label classifier used to perform dependence test between labels and a caching mechanism for reusing constructed models. * @param classifier - a single-label classifier used to perform dependence test between labels. */ public ConditionalDependenceIdentifier(Classifier classifier) { baseLearner = classifier; if (existingModels==null){ existingModels = new HashMap<String,FilteredClassifier>(); } } /** * Calculates t-statistic value for each pair of labels. * * @param mlInstances the {@link mulan.data.MultiLabelInstances} dataset on which dependencies should be calculated * @return an array of label pairs sorted in descending order of the t-statistic value */ public LabelsPair[] calculateDependence(MultiLabelInstances mlInstances){ int numLabels = mlInstances.getNumLabels(); int numPairs = numLabels*(numLabels-1)/2; LabelsPair[] pairs = new LabelsPair[numPairs]; int ind=0; for(int i=0; i<numLabels-1; i++){ //for each pair of labels i and j for(int j=i+1; j<numLabels;j++){ int[] comb1 = new int[2]; //will store a pair [i,j] int[] comb2 = new int[2]; //will store a pair [j,i] comb1[0] = i; comb1[1] = j; comb2[0] = j; comb2[1] = i; double val1 = testDependence(comb1, mlInstances, numFolds); // dependency test for classes i and j double val2 = testDependence(comb2, mlInstances, numFolds); // dependency test for classes j and i if(val1>=val2){ //add a pair with MAXIMAL value to the results object pairs[ind++] = new LabelsPair(comb1, val1); } else{ pairs[ind++] = new LabelsPair(comb2, val2); } } } Arrays.sort(pairs, Collections.reverseOrder()); return pairs; } /** * Performs dependency test between two labels. * * @param comb an array with indexes of the two labels for the test * @param mlData the {@link mulan.data.MultiLabelInstances} dataset on which dependencies should be calculated * @param numFolds number of folds used for cross validation * @return a value indicating the level of dependence between two labels. As higher is value as more conditionally dependent are the labels. For independent labels "-1" is returned. */ private double testDependence(int[] comb, MultiLabelInstances mlData, int numFolds) { double[] acc1 = null; double[] acc2 = null; double val; try{ int numLabels = mlData.getNumLabels(); int[] indecesToRemove1; int[] indecesToRemove2; int classIndex; final int[] labelIndices = mlData.getLabelIndices(); Instances[] trainSets = new Instances[numFolds]; Instances[] testSets = new Instances[numFolds]; weka.classifiers.Evaluation[] eval = new weka.classifiers.Evaluation[numFolds]; weka.classifiers.Evaluation[] eval2 = new weka.classifiers.Evaluation[numFolds]; acc1 = new double[numFolds]; acc2 = new double[numFolds]; Instances workingSet = new Instances(mlData.getDataSet()); Random random = new Random(seed); workingSet.randomize(random); //randomize numFolds train-test pairs for (int i=0; i<numFolds; i++) //build dependent and independent models on each fold { trainSets[i] = workingSet.trainCV(numFolds, i, random); testSets[i] = workingSet.testCV(numFolds, i); classIndex = labelIndices[comb[0]]; indecesToRemove1 = new int[numLabels-1]; //prepare indexes to build independent model (x -> L1) int counter2 = 0; for (int counter1 = 0; counter1<numLabels; counter1++){ if(counter1!=comb[0]){ indecesToRemove1[counter2] = labelIndices[counter1]; counter2++; } } FilteredClassifier indepModel; int foldHash = trainSets[i].toString().hashCode(); String modelKey = createKey(indecesToRemove1, foldHash); if (existingModels.containsKey(modelKey)) { indepModel=existingModels.get(modelKey); //Retrieving model from cache } else{ indepModel = buildModel(indecesToRemove1,classIndex, trainSets[i]); //Building independent model for L1 } indecesToRemove2 = new int[numLabels-2]; //prepare indexes to build dependent model (x, L2 - > L1) counter2 = 0; for (int counter1 = 0; counter1<numLabels; counter1++){ if((counter1!=comb[0]) && (counter1!=comb[1])){ indecesToRemove2[counter2] = labelIndices[counter1]; counter2++; } } FilteredClassifier depModel = buildModel(indecesToRemove2, //Building depend model for the L1 label classIndex, trainSets[i]); //evaluate independent model Instances filteredTrainData = prepareDatSet(indecesToRemove1,classIndex,trainSets[i]); Instances filteredTestData = prepareDatSet(indecesToRemove1,classIndex,testSets[i]); eval[i] = new weka.classifiers.Evaluation(filteredTrainData); eval[i].evaluateModel(indepModel, filteredTestData); acc1[i] = eval[i]. pctCorrect(); //evaluate dependent model Instances filteredTrainData2 = prepareDatSet(indecesToRemove2,classIndex,trainSets[i]); Instances filteredTestData2 = prepareDatSet(indecesToRemove2,classIndex,testSets[i]); eval2[i] = new weka.classifiers.Evaluation(filteredTrainData2); eval2[i].evaluateModel(depModel, filteredTestData2); acc2[i] = eval2[i]. pctCorrect(); } } catch (Exception e) { Logger.getLogger(ConditionalDependenceIdentifier.class.getSimpleName()).log(Level.SEVERE, null, e); } finally{ if(acc1==null || acc2==null){ val = -1; } else{ val = applyTtest(acc1,acc2); // /t-test on evaluation results } } return val; } /** * Performs paired t-test with same variances. * * @param val1 an array holding accuracy values of model1 * @param val2 an array holding accuracy values of model2 * @return t-statistic representing result of the t-test applied on the arrays values. Return "-1" if average accuracy of model1 is higher than that of model2 */ private double applyTtest(double[] val1, double[] val2) { double sum1=0; double sum2=0; final int count = val1.length; //compute Average for (int i=0; i< count; i++) { sum1+=val1[i]; sum2+=val2[i]; } double avg1=sum1/count; double avg2=sum2/count; if(avg1>avg2){ // If average accuracy of independent model is higher than average accuracy return -1; // of conditionally dependent model -> no need to model dependence! } //compute Variance double var1; double var2; double varDiff=0; for (int i=0; i< count; i++) { var1=val1[i]-avg1; var2=val2[i]-avg2; varDiff+=Math.pow(var1-var2,2); } //apply t-test double m =0; if(varDiff!=0){ m= Math.sqrt(count*(count-1) / varDiff); } double tValue = (avg1 - avg2) * m; if (tValue < 0){ tValue = tValue * (-1); } return tValue; } /** * Creating classification model. * * @param indicesToRemove indexes of labels to be removed from dataset * @param classIndex index of the label tested as class * @param trainDataset the {@link weka.core.Instances} dataset on which the model should be learned * @return {@link weka.classifiers.meta.FilteredClassifier} classification model * @throws Exception */ private FilteredClassifier buildModel(int[] indicesToRemove, int classIndex, Instances trainDataset) throws Exception { FilteredClassifier model = new FilteredClassifier(); model.setClassifier( AbstractClassifier.makeCopy(baseLearner)); Remove remove = new Remove(); remove.setAttributeIndicesArray(indicesToRemove); remove.setInputFormat(trainDataset); remove.setInvertSelection(false); model.setFilter(remove); trainDataset.setClassIndex(classIndex); model.buildClassifier(trainDataset); int foldHash = trainDataset.toString().hashCode(); String modelKey = createKey(indicesToRemove, foldHash); existingModels.put(modelKey, model); return model; } /** * Concatenate all integers from an array with additional integer into a single string. * * @param set an array representing labels subset * @param fold a hash code of the current training set * @return a string in the form: "_l1_l2_ ... ln_fold" */ private String createKey(int[] set, int fold) { StringBuilder sb = new StringBuilder("_"); for (int i : set){ sb.append(i); sb.append("_"); } sb.append(fold); return sb.toString(); } /** * Removes the specified features from the supplied dataset, and set the specified feature as class. * * @param indicesToRemove indexes of labels to be removed from the initial dataset * @param classIndex index of the class label * @param dataset the initial {@link weka.core.Instances} dataset * @return {@link weka.core.Instances} filtered dataset with set classIndex * @throws Exception */ private Instances prepareDatSet(int[] indicesToRemove, int classIndex, Instances dataset) throws Exception { Remove remove = new Remove(); remove.setAttributeIndicesArray(indicesToRemove); remove.setInputFormat(dataset); remove.setInvertSelection(false); dataset.setClassIndex(classIndex); return dataset; } public void setCriticalValue(double criticalValue) { this.criticalValue = criticalValue; } public double getCriticalValue() { return criticalValue; } public int getSeed() { return seed; } public void setSeed(int seed) { this.seed = seed; } public int getNumFolds() { return numFolds; } public void setNumFolds(int numFolds) { this.numFolds = numFolds; } }