/*
 * This file is part of ELKI:
 * Environment for Developing KDD-Applications Supported by Index-Structures
 *
 * Copyright (C) 2017
 * ELKI Development Team
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package de.lmu.ifi.dbs.elki.evaluation.clustering;

import java.util.Collection;
import java.util.List;

import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDList;
import de.lmu.ifi.dbs.elki.evaluation.Evaluator;
import de.lmu.ifi.dbs.elki.evaluation.scores.ScoreEvaluation;
import de.lmu.ifi.dbs.elki.evaluation.scores.adapter.DBIDsTest;
import de.lmu.ifi.dbs.elki.evaluation.scores.adapter.DistanceResultAdapter;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.math.MeanVariance;
import de.lmu.ifi.dbs.elki.result.EvaluationResult;
import de.lmu.ifi.dbs.elki.result.Result;
import de.lmu.ifi.dbs.elki.result.ResultHierarchy;
import de.lmu.ifi.dbs.elki.result.ResultUtil;
import de.lmu.ifi.dbs.elki.utilities.Alias;
import de.lmu.ifi.dbs.elki.utilities.io.FormatUtil;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;

/**
 * Evaluate a clustering result by comparing it to an existing cluster label.
 * <p>
 * For every clustering found in a new result, a {@link ClusterContingencyTable}
 * against a reference clustering is computed and attached to the result
 * hierarchy as a {@link ScoreResult}. The reference clustering is either an
 * already existing one (recognized by its short name) or is produced by running
 * the configured reference algorithm.
 *
 * @author Erich Schubert
 * @since 0.4.0
 *
 * @apiviz.landmark
 * @apiviz.uses ClusterContingencyTable
 * @apiviz.has EvaluateClustering.ScoreResult oneway - - «create»
 */
@Alias("de.lmu.ifi.dbs.elki.evaluation.paircounting.EvaluatePairCountingFMeasure")
public class EvaluateClustering implements Evaluator {
  /**
   * Logger for debug output.
   */
  private static final Logging LOG = Logging.getLogger(EvaluateClustering.class);

  /**
   * Parameter to obtain the reference clustering. Defaults to a flat label
   * clustering.
   */
  public static final OptionID REFERENCE_ID = new OptionID("paircounting.reference", "Reference clustering to compare with. Defaults to a by-label clustering.");

  /**
   * Parameter flag for special noise handling.
   */
  public static final OptionID NOISE_ID = new OptionID("paircounting.noisespecial", "Use special handling for noise clusters.");

  /**
   * Parameter flag controlling self-pairing.
   * <p>
   * NOTE(review): the option text says "Enable", but
   * {@link Parameterizer#makeInstance()} negates the flag value before passing
   * it to the constructor; confirm which of the two reflects the intended
   * semantics.
   */
  public static final OptionID SELFPAIR_ID = new OptionID("paircounting.selfpair", "Enable self-pairing for cluster comparison.");

  /**
   * Reference algorithm, run when no existing reference clustering is found.
   */
  private final ClusteringAlgorithm<?> referencealg;

  /**
   * Apply special handling to noise "clusters".
   */
  private final boolean noiseSpecialHandling;

  /**
   * Use self-pairing in pair-counting measures
   */
  private final boolean selfPairing;

  /**
   * Constructor.
   *
   * @param referencealg Reference clustering
   * @param noiseSpecialHandling Noise handling flag
   * @param selfPairing Self-pairing flag
   */
  public EvaluateClustering(ClusteringAlgorithm<?> referencealg, boolean noiseSpecialHandling, boolean selfPairing) {
    super();
    this.referencealg = referencealg;
    this.noiseSpecialHandling = noiseSpecialHandling;
    this.selfPairing = selfPairing;
  }

  /**
   * Evaluate given a cluster (of positive elements) and a scoring list.
   *
   * @param eval Evaluation method
   * @param clus Cluster object
   * @param ranking Object ranking
   * @return Score
   */
  public static double evaluateRanking(ScoreEvaluation eval, Cluster<?> clus, DoubleDBIDList ranking) {
    return eval.evaluate(new DBIDsTest(DBIDUtil.ensureSet(clus.getIDs())), new DistanceResultAdapter(ranking.iter()));
  }

  /**
   * Evaluate all clusterings contained in a newly added result against a
   * reference clustering.
   * <p>
   * Nothing is done when the new result itself is a reference clustering, or
   * when it does not contain any clustering.
   *
   * @param hier Result hierarchy
   * @param newResult Newly added result
   */
  @Override
  public void processNewResult(ResultHierarchy hier, Result newResult) {
    // We may just have added this result.
    if(newResult instanceof Clustering && isReferenceResult((Clustering<?>) newResult)) {
      return;
    }
    Database db = ResultUtil.findDatabase(hier);
    List<Clustering<?>> crs = Clustering.getClusteringResults(newResult);
    if(crs == null || crs.isEmpty()) {
      return;
    }
    // Try to find an existing reference clustering: first globally (below the
    // database), then locally (below the new result).
    Clustering<?> refc = findReferenceClustering(hier, db);
    if(refc == null) {
      refc = findReferenceClustering(hier, newResult);
    }
    if(refc == null) {
      // No existing reference: compute one.
      LOG.debug("Generating a new reference clustering.");
      Result refres = referencealg.run(db);
      List<Clustering<?>> refcrs = Clustering.getClusteringResults(refres);
      // Same null-or-empty guard as used for the evaluated result above.
      if(refcrs == null || refcrs.isEmpty()) {
        LOG.warning("Reference algorithm did not return a clustering result!");
        return;
      }
      if(refcrs.size() > 1) {
        LOG.warning("Reference algorithm returned more than one result!");
      }
      // Only the first clustering is used as reference.
      refc = refcrs.get(0);
    }
    else {
      LOG.debug("Using existing clustering: " + refc.getLongName() + " " + refc.getShortName());
    }
    for(Clustering<?> c : crs) {
      // Never compare the reference with itself.
      if(c == refc) {
        continue;
      }
      evaluteResult(db, c, refc);
    }
  }

  /**
   * Search the clusterings found below a result for the first one that is
   * accepted by {@link #isReferenceResult}.
   *
   * @param hier Result hierarchy
   * @param parent Result to search
   * @return First reference clustering found, or {@code null} if there is none
   */
  private Clustering<?> findReferenceClustering(ResultHierarchy hier, Result parent) {
    Collection<Clustering<?>> cs = ResultUtil.filterResults(hier, parent, Clustering.class);
    for(Clustering<?> test : cs) {
      if(isReferenceResult(test)) {
        return test;
      }
    }
    return null;
  }

  /**
   * Evaluate a clustering result: build the contingency table of the reference
   * clustering versus the given clustering, and attach the resulting
   * {@link ScoreResult} to the clustering in the database's result hierarchy.
   *
   * @param db Database
   * @param c Clustering
   * @param refc Reference clustering
   */
  protected void evaluteResult(Database db, Clustering<?> c, Clustering<?> refc) {
    ClusterContingencyTable contmat = new ClusterContingencyTable(selfPairing, noiseSpecialHandling);
    contmat.process(refc, c);

    ScoreResult sr = new ScoreResult(contmat);
    sr.addHeader(c.getLongName());
    db.getHierarchy().add(c, sr);
  }

  /**
   * Test if a clustering result is a valid reference result. The decision is
   * made purely on the short name of the clustering.
   *
   * @param t Clustering to test.
   * @return {@code true} if it is considered to be a reference result.
   */
  private boolean isReferenceResult(Clustering<?> t) {
    // FIXME: don't hard-code strings
    final String name = t.getShortName();
    // Literal-first equals: a null short name simply does not match.
    return "bylabel-clustering".equals(name) //
        || "bymodel-clustering".equals(name) //
        || "allinone-clustering".equals(name) //
        || "allinnoise-clustering".equals(name);
  }

  /**
   * Result object holding the measures of a clustering comparison, derived from
   * a cluster contingency table.
   *
   * @author Erich Schubert
   *
   * @apiviz.composedOf ClusterContingencyTable
   */
  public static class ScoreResult extends EvaluationResult {
    /**
     * Cluster contingency table
     */
    protected ClusterContingencyTable contmat;

    /**
     * Constructor. Populates the measurement groups from the contingency table.
     *
     * @param contmat score result
     */
    public ScoreResult(ClusterContingencyTable contmat) {
      super("Cluster-Evaluation", "cluster-evaluation");
      this.contmat = contmat;

      PairCounting paircount = contmat.getPaircount();
      MeasurementGroup g = newGroup("Pair counting measures");
      g.addMeasure("Jaccard", paircount.jaccard(), 0, 1, false);
      g.addMeasure("F1-Measure", paircount.f1Measure(), 0, 1, false);
      g.addMeasure("Precision", paircount.precision(), 0, 1, false);
      g.addMeasure("Recall", paircount.recall(), 0, 1, false);
      g.addMeasure("Rand", paircount.randIndex(), 0, 1, false);
      // NOTE(review): an adjusted Rand index can presumably become negative;
      // confirm that a lower bound of 0 is intended here.
      g.addMeasure("ARI", paircount.adjustedRandIndex(), 0, 1, false);
      g.addMeasure("FowlkesMallows", paircount.fowlkesMallows(), 0, 1, false);

      Entropy entropy = contmat.getEntropy();
      g = newGroup("Entropy based measures");
      g.addMeasure("NMI Joint", entropy.entropyNMIJoint(), 0, 1, false);
      g.addMeasure("NMI Sqrt", entropy.entropyNMISqrt(), 0, 1, false);

      BCubed bcubed = contmat.getBCubed();
      g = newGroup("BCubed-based measures");
      g.addMeasure("F1-Measure", bcubed.f1Measure(), 0, 1, false);
      g.addMeasure("Recall", bcubed.recall(), 0, 1, false);
      g.addMeasure("Precision", bcubed.precision(), 0, 1, false);

      SetMatchingPurity setm = contmat.getSetMatching();
      g = newGroup("Set-Matching-based measures");
      g.addMeasure("F1-Measure", setm.f1Measure(), 0, 1, false);
      g.addMeasure("Purity", setm.purity(), 0, 1, false);
      g.addMeasure("Inverse Purity", setm.inversePurity(), 0, 1, false);

      EditDistance edit = contmat.getEdit();
      g = newGroup("Editing-distance measures");
      g.addMeasure("F1-Measure", edit.f1Measure(), 0, 1, false);
      g.addMeasure("Precision", edit.editDistanceFirst(), 0, 1, false);
      g.addMeasure("Recall", edit.editDistanceSecond(), 0, 1, false);

      MeanVariance gini = contmat.averageSymmetricGini();
      g = newGroup("Gini measures");
      // The label carries the sample standard deviation; it is reported as 0
      // unless more than one value was aggregated.
      g.addMeasure("Mean +-" + FormatUtil.NF4.format(gini.getCount() > 1. ? gini.getSampleStddev() : 0.), gini.getMean(), 0, 1, false);
    }

    /**
     * Get the contingency table
     *
     * @return the contingency table
     */
    public ClusterContingencyTable getContingencyTable() {
      return contmat;
    }

    @Override
    public boolean visualizeSingleton() {
      return true;
    }
  }

  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   *
   * @apiviz.exclude
   */
  public static class Parameterizer extends AbstractParameterizer {
    /**
     * Reference algorithm.
     */
    protected ClusteringAlgorithm<?> referencealg = null;

    /**
     * Value of the noise handling flag.
     */
    protected boolean noiseSpecialHandling = false;

    /**
     * Value of the self-pairing flag, as given by the user (not yet negated).
     */
    protected boolean selfPairing = false;

    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      ObjectParameter<ClusteringAlgorithm<?>> referencealgP = new ObjectParameter<>(REFERENCE_ID, ClusteringAlgorithm.class, ByLabelOrAllInOneClustering.class);
      if(config.grab(referencealgP)) {
        referencealg = referencealgP.instantiateClass(config);
      }

      Flag noiseSpecialHandlingF = new Flag(NOISE_ID);
      if(config.grab(noiseSpecialHandlingF)) {
        noiseSpecialHandling = noiseSpecialHandlingF.getValue();
      }

      Flag selfPairingF = new Flag(SELFPAIR_ID);
      if(config.grab(selfPairingF)) {
        selfPairing = selfPairingF.getValue();
      }
    }

    @Override
    protected EvaluateClustering makeInstance() {
      // NOTE(review): the flag value is passed negated, although the option
      // description reads "Enable self-pairing". Behavior kept as-is; confirm
      // whether the negation or the description is the intended one.
      return new EvaluateClustering(referencealg, noiseSpecialHandling, !selfPairing);
    }
  }
}