/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    Statistics.java
 *    Copyright (C) 2009-2010 Aristotle University of Thessaloniki, Thessaloniki, Greece
 */
package mulan.data;

import java.io.Serializable;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Set;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;

/**
 <!-- globalinfo-start -->
 * Class for calculating statistics of a multilabel dataset <p>
 * <br>
 * For more information, see<br>
 * <br>
 * G. Tsoumakas, I. Katakis (2007). Multi-Label Classification: An Overview.
 * International Journal of Data Warehousing and Mining, 3(3):1-13.
 * </p>
 <!-- globalinfo-end -->
 *
 <!-- technical-bibtex-start -->
 * BibTeX:
 * <pre>
 * &#64;article{tsoumakas+katakis:2007,
 *    author = {G. Tsoumakas, I. Katakis},
 *    journal = {International Journal of Data Warehousing and Mining},
 *    pages = {1-13},
 *    title = {Multi-Label Classification: An Overview},
 *    volume = {3},
 *    number = {3},
 *    year = {2007}
 * }
 * </pre>
 * <p>
 <!-- technical-bibtex-end -->
 *
 <!-- options-start -->
 * Valid options are: <p>
 *
 * <pre> -F &lt;filename&gt;
 *  The filename (including full path) of the multilabel data set.</pre>
 *
 * <pre> -L &lt;number of labels&gt;
 *  Number of labels. </pre>
 *
 <!-- options-end -->
 *
 * @author Grigorios Tsoumakas
 * @author Robert Friberg
 * @version $Revision: 0.03 $
 */
public class Statistics implements Serializable {

    private static final long serialVersionUID = 1206845794397561633L;

    /** the number of instances */
    private int numInstances;

    /** the number of predictive attributes */
    private int numPredictors = 0;

    /** the number of nominal predictive attributes */
    private int numNominal = 0;

    /** the number of numeric attributes */
    private int numNumeric = 0;

    /** the number of labels */
    private int numLabels;

    /** the label density */
    private double labelDensity;

    /** the label cardinality */
    private double labelCardinality;

    /** fraction of instances (count divided by the number of instances) per label */
    private double[] examplesPerLabel;

    /**
     * number of examples per cardinality, <br><br>
     * note that this array has size equal to the number of labels plus one, <br>
     * because the first element is the number of examples for cardinality=0
     */
    private double[] cardinalityDistribution;

    /** labelsets and their frequency */
    private HashMap<LabelSet, Integer> labelsets;

    /** the array holding the phi correlations; populated by {@link #calculatePhi} */
    double[][] phi;

    /**
     * returns the HashMap containing the distinct labelsets and their frequencies
     *
     * @return HashMap with distinct labelsets and their frequencies
     */
    public HashMap<LabelSet, Integer> labelCombCount() {
        return labelsets;
    }

    /**
     * This method calculates and prints a matrix with the co-occurrences of <br>
     * pairs of labels. Only the upper triangle (row index &lt; column index) is
     * filled; the diagonal and the lower triangle stay at zero.
     * <p>
     * Labels are located through {@code mdata.getLabelIndices()}, so they do not
     * have to be the trailing attributes of the data set.
     *
     * @param mdata a multi-label data set
     * @return a matrix of co-occurrences
     */
    public double[][] calculateCoocurrence(MultiLabelInstances mdata) {
        Instances data = mdata.getDataSet();
        int labels = mdata.getNumLabels();
        int[] labelIndices = mdata.getLabelIndices();
        double[][] coocurrenceMatrix = new double[labels][labels];

        // Retained side effect: this method has always refreshed the predictor count.
        numPredictors = data.numAttributes() - labels;

        boolean[] present = new boolean[labels];
        for (int k = 0; k < data.numInstances(); k++) {
            Instance temp = data.instance(k);
            // Read each label value once per instance instead of once per pair.
            for (int i = 0; i < labels; i++) {
                present[i] = temp.stringValue(labelIndices[i]).equals("1");
            }
            for (int i = 0; i < labels; i++) {
                if (!present[i]) {
                    continue;
                }
                for (int j = i + 1; j < labels; j++) {
                    if (present[j]) {
                        coocurrenceMatrix[i][j]++;
                    }
                }
            }
        }

        for (int i = 0; i < labels; i++) {
            for (int j = 0; j < labels; j++) {
                System.out.print(coocurrenceMatrix[i][j] + "\t");
            }
            System.out.println();
        }

        return coocurrenceMatrix;
    }

    /**
     * calculates various multilabel statistics, such as label cardinality, <br>
     * label density and the set of distinct labels along with their frequency
     * <p>
     * A label counts as present for an instance when its string value is "1".
     * For a data set without instances the averaged values are the result of a
     * floating point division by zero.
     *
     * @param mlData a multi-label dataset
     */
    public void calculateStats(MultiLabelInstances mlData) {
        // initialize statistics
        Instances data = mlData.getDataSet();
        numLabels = mlData.getNumLabels();
        int[] labelIndices = mlData.getLabelIndices();
        int[] featureIndices = mlData.getFeatureIndices();
        numPredictors = featureIndices.length;

        labelCardinality = 0;
        numNominal = 0;
        numNumeric = 0;
        examplesPerLabel = new double[numLabels];
        cardinalityDistribution = new double[numLabels + 1];
        labelsets = new HashMap<LabelSet, Integer>();

        // gather statistics about the predictive attributes
        for (int i = 0; i < featureIndices.length; i++) {
            if (data.attribute(featureIndices[i]).isNominal()) {
                numNominal++;
            }
            if (data.attribute(featureIndices[i]).isNumeric()) {
                numNumeric++;
            }
        }

        // gather statistics about the labels
        numInstances = data.numInstances();
        for (int i = 0; i < numInstances; i++) {
            Instance instance = data.instance(i);
            int exampleCardinality = 0;
            double[] dblLabels = new double[numLabels];
            for (int j = 0; j < numLabels; j++) {
                if (instance.stringValue(labelIndices[j]).equals("1")) {
                    dblLabels[j] = 1;
                    exampleCardinality++;
                    labelCardinality++;
                    examplesPerLabel[j]++;
                } else {
                    dblLabels[j] = 0;
                }
            }
            cardinalityDistribution[exampleCardinality]++;

            LabelSet labelSet = new LabelSet(dblLabels);
            if (labelsets.containsKey(labelSet)) {
                labelsets.put(labelSet, labelsets.get(labelSet) + 1);
            } else {
                labelsets.put(labelSet, 1);
            }
        }

        // turn the accumulated counts into averages
        labelCardinality /= numInstances;
        labelDensity = labelCardinality / numLabels;
        for (int j = 0; j < numLabels; j++) {
            examplesPerLabel[j] /= numInstances;
        }
    }

    /**
     * Calculates phi correlation between every pair of labels and stores the
     * result in the {@code phi} field.
     * <p>
     * For a pair (i, l) a 2x2 contingency table is built over all instances,
     * where a label value is treated as "off" when its string value is "0" and
     * as "on" otherwise. When one of the table's marginals is zero (a label
     * that is always on or always off) both nominator and denominator are zero
     * and the resulting entry is {@code NaN}.
     *
     * @param dataSet a multi-label dataset
     * @return a matrix containing phi correlations
     * @throws java.lang.Exception if the label attributes cannot be extracted
     *         with the {@link Remove} filter
     */
    public double[][] calculatePhi(MultiLabelInstances dataSet) throws Exception {
        // the indices of the label attributes
        int[] labelIndices = dataSet.getLabelIndices();
        numLabels = dataSet.getNumLabels();
        phi = new double[numLabels][numLabels];

        // keep only the label attributes (inverted selection)
        Remove remove = new Remove();
        remove.setInvertSelection(true);
        remove.setAttributeIndicesArray(labelIndices);
        remove.setInputFormat(dataSet.getDataSet());
        Instances result = Filter.useFilter(dataSet.getDataSet(), remove);
        result.setClassIndex(result.numAttributes() - 1);

        int instances = result.numInstances();
        for (int i = 0; i < numLabels; i++) {
            // contingency counts of label i against every label l:
            // a: both off, b: i on / l off, c: i off / l on, d: both on
            int[] a = new int[numLabels];
            int[] b = new int[numLabels];
            int[] c = new int[numLabels];
            int[] d = new int[numLabels];

            for (int j = 0; j < instances; j++) {
                Instance instance = result.instance(j);
                // the value of label i does not depend on l, so read it once
                boolean iOff = instance.stringValue(i).equals("0");
                for (int l = 0; l < numLabels; l++) {
                    boolean lOff = instance.stringValue(l).equals("0");
                    if (iOff) {
                        if (lOff) {
                            a[l]++;
                        } else {
                            c[l]++;
                        }
                    } else {
                        if (lOff) {
                            b[l]++;
                        } else {
                            d[l]++;
                        }
                    }
                }
            }

            for (int l = 0; l < numLabels; l++) {
                double e = a[l] + b[l];
                double f = c[l] + d[l];
                double g = a[l] + c[l];
                double h = b[l] + d[l];
                double denominator = Math.sqrt(e * f * g * h);
                // multiply as doubles: the int products can overflow on large data sets
                double nominator = (double) a[l] * d[l] - (double) b[l] * c[l];
                phi[i][l] = nominator / denominator;
            }
        }
        return phi;
    }

    /**
     * Prints out phi correlations, one matrix row per line, formatted with
     * two decimal places. Requires a previous call to {@link #calculatePhi}.
     */
    public void printPhiCorrelations() {
        String pattern = "0.00";
        DecimalFormat myFormatter = new DecimalFormat(pattern);

        for (int i = 0; i < numLabels; i++) {
            for (int j = 0; j < numLabels; j++) {
                System.out.print(myFormatter.format(phi[i][j]) + " ");
            }
            System.out.println("");
        }
    }

    /**
     * Calculates a histogram of phi correlations: the upper triangle of the
     * phi matrix (without the diagonal) flattened row by row.
     *
     * @return an array with phi correlations
     */
    public double[] getPhiHistogram() {
        double[] pairs = new double[numLabels * (numLabels - 1) / 2];
        int counter = 0;
        for (int i = 0; i < numLabels - 1; i++) {
            for (int j = i + 1; j < numLabels; j++) {
                pairs[counter] = phi[i][j];
                counter++;
            }
        }
        return pairs;
    }

    /**
     * returns the indices of the labels whose phi coefficient values lie
     * between -bound &lt;= phi &lt;= bound. Labels whose phi value is
     * {@code NaN} are never included, because the comparison is false for them.
     *
     * @param labelIndex the index of the label to compare the others against
     * @param bound the maximum absolute phi value for a label to be reported
     * @return the indices of the labels whose phi coefficient values lie
     *         between -bound &lt;= phi &lt;= bound
     */
    public int[] uncorrelatedLabels(int labelIndex, double bound) {
        ArrayList<Integer> indiceslist = new ArrayList<Integer>();
        for (int i = 0; i < numLabels; i++) {
            if (Math.abs(phi[labelIndex][i]) <= bound) {
                indiceslist.add(i);
            }
        }
        int[] indices = new int[indiceslist.size()];
        for (int i = 0; i < indiceslist.size(); i++) {
            indices[i] = indiceslist.get(i);
        }
        return indices;
    }

    /**
     * Returns the indices of the labels that have the strongest phi correlation
     * with the label which is given as a parameter. The second parameter is
     * the number of labels that will be returned.
     * <p>
     * The returned array has {@code k + 1} elements: the k label indices in
     * order of decreasing absolute correlation, followed by the value
     * {@code numLabels} as an extra entry for the class. {@code k} must not
     * exceed the number of labels, otherwise an
     * {@link ArrayIndexOutOfBoundsException} is thrown.
     *
     * @param labelIndex the index of the label to compare the others against
     * @param k the number of correlated labels to return
     * @return the indices of the k most correlated labels
     */
    public int[] topPhiCorrelatedLabels(int labelIndex, int k) {
        // create a new array containing the absolute values of the original array
        double[] absCorrelations = new double[numLabels];
        for (int i = 0; i < numLabels; i++) {
            absCorrelations[i] = Math.abs(phi[labelIndex][i]);
        }
        // sort the array of correlations
        int[] sorted = Utils.stableSort(absCorrelations);

        int[] topPhiCorrelated = new int[k + 1];
        // the k last values of the sorted array are the indices of the top k correlated labels
        for (int i = 0; i < k; i++) {
            topPhiCorrelated[i] = sorted[numLabels - 1 - i];
        }
        // one more for the class
        topPhiCorrelated[k] = numLabels;

        return topPhiCorrelated;
    }

    /**
     * This method prints data, useful for the visualization of Phi per dataset.
     * It prints int(1/step) + 1 pairs of values. The first value of each pair
     * is the phi value and the second is the average number of labels that
     * correlate to the rest of the labels with correlation higher than the
     * specified phi value;
     *
     * @param step
     *            the phi value increment step; must be greater than zero
     * @throws IllegalArgumentException if {@code step} is not greater than zero
     */
    public void printPhiDiagram(double step) {
        // a non-positive step would never let the loop below terminate
        if (!(step > 0)) {
            throw new IllegalArgumentException("step must be greater than zero: " + step);
        }
        String pattern = "0.00";
        DecimalFormat myFormatter = new DecimalFormat(pattern);

        System.out.println("Phi      AvgCorrelated");
        double tempPhi = 0;
        // 1.001 instead of 1 tolerates the rounding error accumulated by repeated addition
        while (tempPhi <= 1.001) {
            double avgCorrelated = 0;
            for (int i = 0; i < numLabels; i++) {
                int[] temp = uncorrelatedLabels(i, tempPhi);
                avgCorrelated += (numLabels - temp.length);
            }
            avgCorrelated /= numLabels;
            // print the current threshold, not the phi matrix
            System.out.println(myFormatter.format(tempPhi) + "     " + avgCorrelated);
            tempPhi += step;
        }
    }

    /**
     * returns various multilabel statistics in textual representation.
     * Requires a previous call to {@link #calculateStats}.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("Examples: " + numInstances + "\n");
        sb.append("Predictors: " + numPredictors + "\n");
        sb.append("--Nominal: " + numNominal + "\n");
        sb.append("--Numeric: " + numNumeric + "\n");

        sb.append("Labels: " + numLabels + "\n");

        sb.append("\n");
        sb.append("Cardinality: " + labelCardinality + "\n");
        sb.append("Density: " + labelDensity + "\n");
        sb.append("Distinct Labelsets: " + labelsets.size() + "\n");

        sb.append("\n");
        for (int j = 0; j < numLabels; j++) {
            sb.append("Percentage of examples with label " + (j + 1) + ": " + examplesPerLabel[j] + "\n");
        }

        sb.append("\n");
        for (int j = 0; j <= numLabels; j++) {
            sb.append("Examples of cardinality " + j + ": " + cardinalityDistribution[j] + "\n");
        }

        sb.append("\n");
        for (LabelSet set : labelsets.keySet()) {
            sb.append("Examples of combination " + set + ": " + labelsets.get(set) + "\n");
        }

        return sb.toString();
    }

    /**
     * returns the prior probabilities of the labels
     *
     * @return array of prior probabilities of labels
     */
    public double[] priors() {
        return examplesPerLabel;
    }

    /**
     * returns the label cardinality of the dataset
     *
     * @return label cardinality
     */
    public double cardinality() {
        return labelCardinality;
    }

    /**
     * returns the label density of the dataset
     *
     * @return label density
     */
    public double density() {
        return labelDensity;
    }

    /**
     * returns a set with the distinct labelsets of the dataset
     *
     * @return set of distinct labelsets
     */
    public Set<LabelSet> labelSets() {
        return labelsets.keySet();
    }

    /**
     * returns the frequency of a labelset in the dataset
     *
     * @param x a labelset
     * @return the frequency of the given labelset, or 0 if the labelset does
     *         not occur in the dataset
     */
    public int labelFrequency(LabelSet x) {
        Integer frequency = labelsets.get(x);
        if (frequency == null) {
            // unknown labelset: report "never seen" instead of failing on unboxing
            return 0;
        }
        return frequency;
    }
}