/* * This file is part of ELKI: * Environment for Developing KDD-Applications Supported by Index-Structures * * Copyright (C) 2017 * ELKI Development Team * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package de.lmu.ifi.dbs.elki.evaluation.clustering; import java.util.Iterator; import java.util.List; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.math.MeanVariance; import de.lmu.ifi.dbs.elki.utilities.datastructures.BitsUtil; /** * Class storing the contingency table and related data on two clusterings. * * @author Erich Schubert * @since 0.5.0 * * @apiviz.landmark * * @apiviz.uses Clustering * @apiviz.composedOf PairCounting * @apiviz.composedOf Entropy * @apiviz.composedOf EditDistance * @apiviz.composedOf BCubed * @apiviz.composedOf SetMatchingPurity */ public class ClusterContingencyTable { /** * Noise cluster handling */ protected boolean breakNoiseClusters = false; /** * Self pairing */ protected boolean selfPairing = true; /** * Number of clusters. */ protected int size1 = -1, size2 = -1; /** * Contingency matrix */ protected int[][] contingency = null; /** * Noise flags */ protected long[] noise1 = null, noise2 = null; /** * Pair counting measures */ protected PairCounting paircount = null; /** * Entropy-based measures */ protected Entropy entropy = null; /** * Set matching purity measures */ protected SetMatchingPurity smp = null; /** * Edit-Distance measures */ protected EditDistance edit = null; /** * BCubed measures */ protected BCubed bcubed = null; /** * Constructor. * * @param selfPairing Build self-pairs * @param breakNoiseClusters Break noise clusters into individual objects */ public ClusterContingencyTable(boolean selfPairing, boolean breakNoiseClusters) { super(); this.selfPairing = selfPairing; this.breakNoiseClusters = breakNoiseClusters; } /** * Process two clustering results. * * @param result1 First clustering * @param result2 Second clustering */ public void process(Clustering<?> result1, Clustering<?> result2) { // Get the clusters final List<? extends Cluster<?>> cs1 = result1.getAllClusters(); final List<? extends Cluster<?>> cs2 = result2.getAllClusters(); // Initialize size1 = cs1.size(); size2 = cs2.size(); contingency = new int[size1 + 2][size2 + 2]; noise1 = BitsUtil.zero(size1); noise2 = BitsUtil.zero(size2); // Fill main part of matrix { final Iterator<? extends Cluster<?>> it2 = cs2.iterator(); for(int i2 = 0; it2.hasNext(); i2++) { final Cluster<?> c2 = it2.next(); if(c2.isNoise()) { BitsUtil.setI(noise2, i2); } contingency[size1 + 1][i2] = c2.size(); contingency[size1 + 1][size2] += c2.size(); } } final Iterator<? extends Cluster<?>> it1 = cs1.iterator(); for(int i1 = 0; it1.hasNext(); i1++) { final Cluster<?> c1 = it1.next(); if(c1.isNoise()) { BitsUtil.setI(noise1, i1); } final DBIDs ids = DBIDUtil.ensureSet(c1.getIDs()); contingency[i1][size2 + 1] = c1.size(); contingency[size1][size2 + 1] += c1.size(); final Iterator<? extends Cluster<?>> it2 = cs2.iterator(); for(int i2 = 0; it2.hasNext(); i2++) { final Cluster<?> c2 = it2.next(); int count = DBIDUtil.intersectionSize(ids, c2.getIDs()); contingency[i1][i2] = count; contingency[i1][size2] += count; contingency[size1][i2] += count; contingency[size1][size2] += count; } } } @Override public String toString() { StringBuilder buf = new StringBuilder(); if(contingency != null) { for(int i1 = 0; i1 < size1 + 2; i1++) { if(i1 >= size1) { buf.append("------\n"); } for(int i2 = 0; i2 < size2 + 2; i2++) { if(i2 >= size2) { buf.append("| "); } buf.append(contingency[i1][i2]).append(' '); } buf.append('\n'); } } return buf.toString(); } /** * Get (compute) the pair counting measures. * * @return Pair counting measures */ public PairCounting getPaircount() { if(paircount == null) { paircount = new PairCounting(this); } return paircount; } /** * Get (compute) the entropy based measures * * @return Entropy based measures */ public Entropy getEntropy() { if(entropy == null) { entropy = new Entropy(this); } return entropy; } /** * Get (compute) the edit-distance based measures * * @return Edit-distance based measures */ public EditDistance getEdit() { if(edit == null) { edit = new EditDistance(this); } return edit; } /** * The BCubed based measures * * @return BCubed measures */ public BCubed getBCubed() { if(bcubed == null) { bcubed = new BCubed(this); } return bcubed; } /** * The set-matching measures * * @return Set-Matching measures */ public SetMatchingPurity getSetMatching() { if(smp == null) { smp = new SetMatchingPurity(this); } return smp; } /** * Compute the average Gini for each cluster (in both clusterings - * symmetric). * * @return Mean and variance of Gini */ public MeanVariance averageSymmetricGini() { MeanVariance mv = new MeanVariance(); for(int i1 = 0; i1 < size1; i1++) { double purity = 0.0; if(contingency[i1][size2] > 0) { final double cs = contingency[i1][size2]; // sum, as double. for(int i2 = 0; i2 < size2; i2++) { double rel = contingency[i1][i2] / cs; purity += rel * rel; } mv.put(purity, cs); } } for(int i2 = 0; i2 < size2; i2++) { double purity = 0.0; if(contingency[size1][i2] > 0) { final double cs = contingency[size1][i2]; // sum, as double. for(int i1 = 0; i1 < size1; i1++) { double rel = contingency[i1][i2] / cs; purity += rel * rel; } mv.put(purity, cs); } } return mv; } /** * Utility class. * * @author Erich Schubert * * @apiviz.exclude */ public static final class Util { /** * Private constructor. Static methods only. */ private Util() { // Do not use. } /** * F-Measure * * @param precision Precision * @param recall Recall * @param beta Beta value * @return F-Measure */ public static double fMeasure(double precision, double recall, double beta) { final double beta2 = beta * beta; return (1 + beta2) * precision * recall / (beta2 * precision + recall); } /** * F1-Measure (F-Measure with beta = 1) * * @param precision Precision * @param recall Recall * @return F-Measure */ public static double f1Measure(double precision, double recall) { return 2 * precision * recall / (precision + recall); } } }