package cc.mallet.cluster.evaluate;

import java.util.HashSet;

import cc.mallet.cluster.Clustering;

/**
 * Evaluate a Clustering using the MUC evaluation metric. See Marc
 * Vilain, John Burger, John Aberdeen, Dennis Connolly, and Lynette
 * Hirschman. 1995. A model-theoretic coreference scoring scheme. In
 * Proceedings of the 6th Message Understanding Conference
 * (MUC6). 45--52. Morgan Kaufmann.
 *
 * Note that MUC more or less ignores singleton clusters.
 *
 * Each call to {@link #getEvaluationScores(Clustering, Clustering)} (and
 * therefore to {@link #evaluate(Clustering, Clustering)}) also adds its
 * numerators and denominators to running totals, which
 * {@link #evaluateTotals()} reports.
 *
 * @author "Aron Culotta" <culotta@degas.cs.umass.edu>
 * @version 1.0
 * @since 1.0
 * @see ClusteringEvaluator
 */
public class MUCEvaluator extends ClusteringEvaluator {

    /** Running sum of precision numerators over all scored clustering pairs. */
    int precisionNumerator;

    /** Running sum of precision denominators over all scored clustering pairs. */
    int precisionDenominator;

    /** Running sum of recall numerators over all scored clustering pairs. */
    int recallNumerator;

    /** Running sum of recall denominators over all scored clustering pairs. */
    int recallDenominator;

    /** Creates an evaluator whose running totals all start at zero. */
    public MUCEvaluator () {
        precisionNumerator = precisionDenominator = recallNumerator = recallDenominator = 0;
    }

    /**
     * Scores one predicted clustering against the truth and formats the result.
     * Updates the running totals as a side effect (via
     * {@link #getEvaluationScores(Clustering, Clustering)}).
     *
     * @param truth the reference clustering
     * @param predicted the clustering being evaluated
     * @return a string of the form {@code pr=<precision> re=<recall> f1=<f1>}
     */
    public String evaluate (Clustering truth, Clustering predicted) {
        double[] vals = getEvaluationScores(truth, predicted);
        return "pr=" + vals[0] + " re=" + vals[1] + " f1=" + vals[2];
    }

    /**
     * Formats precision, recall and F1 computed from the running totals
     * accumulated by every scoring call made so far.
     *
     * Precision and recall are plain floating-point ratios, so a zero
     * denominator (for example before any scoring call) does not produce a
     * finite value; 0/0 yields NaN.
     *
     * @return a string of the form {@code pr=<precision> re=<recall> f1=<f1>}
     */
    public String evaluateTotals () {
        double precision = (double) precisionNumerator / precisionDenominator;
        double recall = (double) recallNumerator / recallDenominator;
        return "pr=" + precision + " re=" + recall + " f1=" + f1(precision, recall);
    }

    /**
     * Computes MUC precision, recall and F1 for one pair of clusterings and
     * adds the underlying counts to the running totals.
     *
     * Precision and recall are plain floating-point ratios, so a zero
     * denominator does not produce a finite value; 0/0 yields NaN. F1 is 0
     * when precision and recall are both exactly 0.
     *
     * @param truth the reference clustering
     * @param predicted the clustering being evaluated
     * @return a three-element array {@code {precision, recall, f1}}
     */
    @Override
    public double[] getEvaluationScores(Clustering truth, Clustering predicted) {
        // Precision = \sum_i [ |siprime| - |pOfsiprime| ] / \sum_i [ |siprime| - 1 ]
        // where siprime is a predicted cluster, pOfsiprime is the set of
        // true clusters that contain elements of siprime.
        int[] precisionCounts = linkCounts(predicted, truth);
        precisionNumerator += precisionCounts[0];
        precisionDenominator += precisionCounts[1];
        double precision = (double) precisionCounts[0] / precisionCounts[1];

        // Recall = \sum_i [ |si| - |pOfsi| ] / \sum_i [ |si| - 1 ]
        // where si is a true cluster, pOfsi is the set of predicted
        // clusters that contain elements of si.
        int[] recallCounts = linkCounts(truth, predicted);
        recallNumerator += recallCounts[0];
        recallDenominator += recallCounts[1];
        double recall = (double) recallCounts[0] / recallCounts[1];

        return new double[]{precision, recall, f1(precision, recall)};
    }

    /**
     * Sums, over every cluster of {@code partition}, the MUC numerator
     * (cluster size minus the number of distinct {@code reference} labels
     * among its members) and denominator (cluster size minus one).
     *
     * @param partition the clustering whose clusters are iterated
     * @param reference the clustering used to label each member
     * @return a two-element array {@code {numerator, denominator}}
     */
    private static int[] linkCounts(Clustering partition, Clustering reference) {
        int numerator = 0;
        int denominator = 0;
        for (int i = 0; i < partition.getNumClusters(); i++) {
            int[] members = partition.getIndicesWithLabel(i);
            // Distinct reference clusters that this cluster's members fall into.
            HashSet<Integer> referenceLabels = new HashSet<Integer>();
            for (int j = 0; j < members.length; j++) {
                referenceLabels.add(reference.getLabel(members[j]));
            }
            numerator += members.length - referenceLabels.size();
            denominator += members.length - 1;
        }
        return new int[]{numerator, denominator};
    }

    /**
     * Harmonic mean of precision and recall.
     *
     * @param precision the precision value
     * @param recall the recall value
     * @return {@code 2 * precision * recall / (precision + recall)}, or 0 when
     *         both inputs are exactly 0
     */
    private static double f1(double precision, double recall) {
        // With both inputs at zero the raw formula is 0/0 = NaN; the harmonic
        // mean is conventionally 0 in that case.
        if (precision == 0.0 && recall == 0.0) {
            return 0.0;
        }
        return 2 * precision * recall / (precision + recall);
    }
}