/* * This file is part of ELKI: * Environment for Developing KDD-Applications Supported by Index-Structures * * Copyright (C) 2017 * ELKI Development Team * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package de.lmu.ifi.dbs.elki.evaluation.clustering; import de.lmu.ifi.dbs.elki.logging.LoggingUtil; import de.lmu.ifi.dbs.elki.utilities.datastructures.BitsUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import net.jafama.FastMath; /** * Pair-counting measures. * * @author Erich Schubert * @since 0.5.0 */ public class PairCounting { /** * This is the maximum size this implementation can support. * * Note: this is approximately sqrt(2) * Integer.MAX_VALUE as long = 63 bits * (+unused sign bit), int = 31 bits (+unused sign bit) */ public static final long MAX_SIZE = (long) Math.floor(Math.sqrt(Long.MAX_VALUE)); /** * Pair counting confusion matrix (flat: inBoth, inFirst, inSecond, inNone) */ protected long[] pairconfuse = null; /** * Constructor. */ protected PairCounting(ClusterContingencyTable table) { super(); // Aggregations long inBoth = 0, in1 = 0, in2 = 0, total = 0; // Process first clustering: { for(int i1 = 0; i1 < table.size1; i1++) { final int size = table.contingency[i1][table.size2 + 1]; if(table.breakNoiseClusters && BitsUtil.get(table.noise1, i1)) { if(table.selfPairing) { in1 += size; } // else: 0 } else { in1 += size * (long) (table.selfPairing ? size : (size - 1)); } } } // Process second clustering: { for(int i2 = 0; i2 < table.size2; i2++) { final int size = table.contingency[table.size1 + 1][i2]; if(table.breakNoiseClusters && BitsUtil.get(table.noise2, i2)) { if(table.selfPairing) { in2 += size; } // else: 0 } else { in2 += size * (long) (table.selfPairing ? size : (size - 1)); } } } // Process combinations for(int i1 = 0; i1 < table.size1; i1++) { for(int i2 = 0; i2 < table.size2; i2++) { final int size = table.contingency[i1][i2]; if(table.breakNoiseClusters && (BitsUtil.get(table.noise1, i1) || BitsUtil.get(table.noise2, i2))) { if(table.selfPairing) { inBoth += size; } // else: 0 } else { inBoth += size * (long) (table.selfPairing ? size : (size - 1)); } } } // The official sum int tsize = table.contingency[table.size1][table.size2]; if(table.contingency[table.size1][table.size2 + 1] != tsize || table.contingency[table.size1 + 1][table.size2] != tsize) { LoggingUtil.warning("PairCounting F-Measure is not well defined for overlapping and incomplete clusterings. The number of elements are: " + table.contingency[table.size1][table.size2 + 1] + " != " + table.contingency[table.size1 + 1][table.size2] + " elements."); } if(tsize < 0 || tsize >= MAX_SIZE) { LoggingUtil.warning("Your data set size probably is too big for this implementation, which uses only long precision."); } total = tsize * (long) (table.selfPairing ? tsize : (tsize - 1)); long inFirst = in1 - inBoth, inSecond = in2 - inBoth; long inNone = total - (inBoth + inFirst + inSecond); pairconfuse = new long[] { inBoth, inFirst, inSecond, inNone }; } /** * Get the pair-counting F-Measure * * @param beta Beta value. * @return F-Measure */ public double fMeasure(double beta) { final double beta2 = beta * beta; double fmeasure = ((1 + beta2) * pairconfuse[0]) / ((1 + beta2) * pairconfuse[0] + beta2 * pairconfuse[1] + pairconfuse[2]); return fmeasure; } /** * Get the pair-counting F1-Measure. * * @return F1-Measure */ public double f1Measure() { return fMeasure(1.0); } /** * Computes the pair-counting precision. * * @return pair-counting precision */ public double precision() { return pairconfuse[0] / (double) (pairconfuse[0] + pairconfuse[2]); } /** * Computes the pair-counting recall. * * @return pair-counting recall */ public double recall() { return pairconfuse[0] / (double) (pairconfuse[0] + pairconfuse[1]); } /** * Computes the pair-counting Fowlkes-mallows (flat only, non-hierarchical!) * * <p> * Fowlkes, E.B. and Mallows, C.L.<br /> * A method for comparing two hierarchical clusterings<br /> * In: Journal of the American Statistical Association, Vol. 78 Issue 383 * </p> * * @return pair-counting Fowlkes-mallows */ // TODO: implement for non-flat clusterings! @Reference(authors = "Fowlkes, E.B. and Mallows, C.L.", // title = "A method for comparing two hierarchical clusterings", // booktitle = "Journal of the American Statistical Association, Vol. 78 Issue 383") public double fowlkesMallows() { return FastMath.sqrt(precision() * recall()); } /** * Computes the Rand index (RI). * * <p> * Rand, W. M.<br /> * Objective Criteria for the Evaluation of Clustering Methods<br /> * Journal of the American Statistical Association, Vol. 66 Issue 336 * </p> * * @return The Rand index (RI). */ @Reference(authors = "Rand, W. M.", // title = "Objective Criteria for the Evaluation of Clustering Methods", // booktitle = "Journal of the American Statistical Association, Vol. 66 Issue 336", // url = "http://www.jstor.org/stable/10.2307/2284239") public double randIndex() { final double sum = pairconfuse[0] + pairconfuse[1] + pairconfuse[2] + pairconfuse[3]; return (pairconfuse[0] + pairconfuse[3]) / sum; } /** * Computes the adjusted Rand index (ARI). * * @return The adjusted Rand index (ARI). */ public double adjustedRandIndex() { double d = FastMath.sqrt(pairconfuse[0] + pairconfuse[1] + pairconfuse[2] + pairconfuse[3]); // Note: avoid (a+b)*(a+c) as this will cause long overflows easily // Because we have O(N^2) pairs, and thus this value is temporarily O(N^4) double exp = (pairconfuse[0] + pairconfuse[1]) / d * (pairconfuse[0] + pairconfuse[2]) / d; double opt = pairconfuse[0] + 0.5 * (pairconfuse[1] + pairconfuse[2]); return (pairconfuse[0] - exp) / (opt - exp); } /** * Computes the Jaccard index * * @return The Jaccard index */ public double jaccard() { final double sum = pairconfuse[0] + pairconfuse[1] + pairconfuse[2]; return pairconfuse[0] / sum; } /** * Computes the Mirkin index * * @return The Mirkin index */ public long mirkin() { return 2 * (pairconfuse[1] + pairconfuse[2]); } }