/*
* File: CommunityComparisons.java
* Authors: Jeremy D. Wendt
* Company: Sandia National Laboratories
* Project: Cognitive Foundry
*
* Copyright 2016, Sandia Corporation.
* Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive
* license for use of this work by or on behalf of the U.S. Government.
* Export of this program may require a license from the United States
* Government. See CopyrightHistory.txt for complete details.
*
*/
package gov.sandia.cognition.graph.community;
import gov.sandia.cognition.annotation.PublicationReference;
import gov.sandia.cognition.annotation.PublicationType;
import java.util.HashSet;
import java.util.Set;
/**
* This class implements various comparisons of partitionings of a graph (or the
* resulting communities)
*
* @author jdwendt
* @param <NodeNameType> The type used to name nodes
*/
@PublicationReference(author = "Nguyen Xuan Vinh, Julien Epps, and James Bailey",
    title
    = "Information Theoretic Measures of Clusterings Comparison: Variants, Properties, Normalization and Correction for Chance",
    type = PublicationType.Journal, year = 2010, publication
    = "Journal of Machine Learning Research", pages = {2837, 2854})
public class CommunityComparisons<NodeNameType>
{
    /**
     * Sentinel marking a lazily computed quantity as not-yet-computed. All of
     * the information-theoretic values cached by this class are finite and far
     * smaller than this, so no legitimately computed value can collide with it.
     */
    private static final double UNCOMPUTED = Double.MAX_VALUE;
    /**
     * The u-index/v-index contingency table that shows the overlap of u's
     * clustering and v's clustering for each pair of clusters
     */
    private final int[][] contingencyTable;
    /**
     * The size of each of u's clusters. Also the sum across contingency table's
     * rows.
     */
    private final int[] uSums;
    /**
     * The size of each of v's clusters. Also the sum across contingency table's
     * columns.
     */
    private final int[] vSums;
    /**
     * The number of nodes in the original set (also the sum of uSums or vSums
     * or all entries in contingencyTable)
     */
    private final int n;
    /**
     * The number of rows (nu) and columns (nv). uSums.length and vSums.length
     */
    private final int nu, nv;
    /**
     * Once computed, the entropy of u (h(u)) is stored here
     */
    private double entropyU;
    /**
     * Once computed, the entropy of v (h(v)) is stored here
     */
    private double entropyV;
    /**
     * Once computed, the joint entropy (h(u, v)) is stored here
     */
    private double jointEntropy;
    /**
     * Once computed, h(u|v) is stored here
     */
    private double condEntropyUGivenV;
    /**
     * Once computed, h(v|u) is stored here
     */
    private double condEntropyVGivenU;
    /**
     * Once computed, the mutual information (I(u, v)) is stored here
     */
    private double mutualInformation;
    /**
     * Once computed, the expected mutual information (E(I(u, v))) is stored
     * here
     */
    private double expectedMutualInformation;
    /**
     * Initializes this with the input partitionings of the same underlying set
     *
     * @param u The first partitioning
     * @param v The second partitioning
     * @throws IllegalArgumentException if u and v do not partition exactly the
     * same set of nodes
     */
    public CommunityComparisons(NodePartitioning<NodeNameType> u,
        NodePartitioning<NodeNameType> v)
    {
        // First, make sure they partition the same space
        Set<NodeNameType> su = u.getAllMembers();
        Set<NodeNameType> sv = v.getAllMembers();
        if (su.size() != sv.size())
        {
            throw new IllegalArgumentException(
                "Input partitionings do not contain the same number of members: "
                + su.size() + " != " + sv.size());
        }
        Set<NodeNameType> tmp = new HashSet<>(su);
        tmp.retainAll(sv);
        if (tmp.size() != sv.size())
        {
            throw new IllegalArgumentException(
                "Input partitionings don't contain the same set of nodes: original size = "
                + su.size() + " != intersection size " + tmp.size());
        }
        // All cached quantities start unset and are computed lazily on first
        // request by their respective getters
        entropyU = entropyV = jointEntropy = condEntropyUGivenV
            = condEntropyVGivenU = mutualInformation = expectedMutualInformation
            = UNCOMPUTED;
        n = sv.size();
        nu = u.getNumPartitions();
        nv = v.getNumPartitions();
        // Java arrays are zero-initialized, so no explicit clearing is needed
        contingencyTable = new int[nu][nv];
        uSums = new int[nu];
        vSums = new int[nv];
        // Fill the contingency table and its row/column marginals in one pass
        for (NodeNameType node : su)
        {
            int i = u.getPartition(node);
            int j = v.getPartition(node);
            ++contingencyTable[i][j];
            ++uSums[i];
            ++vSums[j];
        }
    }
    /**
     * This returns 0 if 0 is passed in. This is valid only because every time
     * this is called with zero, the returned value will be multiplied by zero
     * and the result should be zero.
     *
     * @param d The double whose log is sought
     * @return the natural log of d unless d is zero (then zero)
     */
    private static double safeLog(double d)
    {
        return (d == 0) ? 0 : Math.log(d);
    }
    /**
     * Helper that computes the entropy of the input marginals where N is the
     * sum of the entries in marginals
     *
     * @param N The sum of the entries in marginals
     * @param marginals The partitioning of the values
     * @return the entropy of the input marginals
     */
    private static double getEntropy(int N,
        int[] marginals)
    {
        double ret = 0;
        for (int marginal : marginals)
        {
            double f = ((double) marginal) / ((double) N);
            ret += f * safeLog(f);
        }
        // Entropy is -sum(p * log(p))
        return -ret;
    }
    /**
     * Returns the entropy of the u partitioning (computing and storing it if
     * this is the first time called)
     *
     * @return the entropy of the u partitioning
     */
    public double getEntropyU()
    {
        if (entropyU == UNCOMPUTED)
        {
            entropyU = getEntropy(n, uSums);
        }
        return entropyU;
    }
    /**
     * Returns the entropy of the v partitioning (computing and storing it if
     * this is the first time called)
     *
     * @return the entropy of the v partitioning
     */
    public double getEntropyV()
    {
        if (entropyV == UNCOMPUTED)
        {
            entropyV = getEntropy(n, vSums);
        }
        return entropyV;
    }
    /**
     * Returns the joint entropy (H(u,v)), computing it if this is the first
     * time called
     *
     * @return the joint entropy
     */
    public double getJointEntropy()
    {
        if (jointEntropy == UNCOMPUTED)
        {
            jointEntropy = 0;
            for (int[] row : contingencyTable)
            {
                for (int nij : row)
                {
                    double f = ((double) nij) / (double) n;
                    jointEntropy += f * safeLog(f);
                }
            }
            // H(u,v) = -sum(p(i,j) * log(p(i,j)))
            jointEntropy *= -1;
        }
        return jointEntropy;
    }
    /**
     * Returns H(u|v), computing and storing it if this is the first time called
     *
     * @return H(u|v)
     */
    public double getConditionalEntropyUGivenV()
    {
        if (condEntropyUGivenV == UNCOMPUTED)
        {
            condEntropyUGivenV = 0;
            for (int i = 0; i < nu; ++i)
            {
                for (int j = 0; j < nv; ++j)
                {
                    double numer = ((double) contingencyTable[i][j])
                        / (double) n;
                    double denom = ((double) vSums[j]) / (double) n;
                    condEntropyUGivenV += numer * safeLog(numer / denom);
                }
            }
            // H(u|v) = -sum(p(i,j) * log(p(i,j) / p(j)))
            condEntropyUGivenV *= -1;
        }
        return condEntropyUGivenV;
    }
    /**
     * Returns H(v|u), computing and storing it if this is the first time called
     *
     * @return H(v|u)
     */
    public double getConditionalEntropyVGivenU()
    {
        if (condEntropyVGivenU == UNCOMPUTED)
        {
            condEntropyVGivenU = 0;
            for (int i = 0; i < nu; ++i)
            {
                for (int j = 0; j < nv; ++j)
                {
                    double numer = ((double) contingencyTable[i][j])
                        / (double) n;
                    double denom = ((double) uSums[i]) / (double) n;
                    condEntropyVGivenU += numer * safeLog(numer / denom);
                }
            }
            // H(v|u) = -sum(p(i,j) * log(p(i,j) / p(i)))
            condEntropyVGivenU *= -1;
        }
        return condEntropyVGivenU;
    }
    /**
     * Returns the mutual information (I(u, v)) for the two partitionings,
     * computing and storing it if this is the first time called
     *
     * @return the mutual information
     */
    public double getMutualInformation()
    {
        if (mutualInformation == UNCOMPUTED)
        {
            mutualInformation = 0;
            for (int i = 0; i < nu; ++i)
            {
                for (int j = 0; j < nv; ++j)
                {
                    double numer = ((double) contingencyTable[i][j])
                        / (double) n;
                    double denom = ((double) uSums[i] * vSums[j]) / (double) (n
                        * n);
                    mutualInformation += numer * safeLog(numer / denom);
                }
            }
            // Note: unlike the entropies above there is no trailing negation.
            // I(u,v) = sum(p(i,j) * log(p(i,j) / (p(i)p(j)))) is already
            // non-negative as written.
        }
        return mutualInformation;
    }
    /**
     * Returns the joint normalized mutual information (see Table 2 of cited
     * paper)
     *
     * @return the joint normalized mutual information
     */
    public double getNmiJoint()
    {
        return getMutualInformation() / getJointEntropy();
    }
    /**
     * Returns the max normalized mutual information (see Table 2 of cited
     * paper)
     *
     * @return the max normalized mutual information
     */
    public double getNmiMax()
    {
        return getMutualInformation() / Math.max(getEntropyU(), getEntropyV());
    }
    /**
     * Returns the sum normalized mutual information (see Table 2 of cited
     * paper)
     *
     * @return the sum normalized mutual information
     */
    public double getNmiSum()
    {
        return 2 * getMutualInformation() / (getEntropyU() + getEntropyV());
    }
    /**
     * Returns the sqrt normalized mutual information (see Table 2 of cited
     * paper)
     *
     * @return the sqrt normalized mutual information
     */
    public double getNmiSqrt()
    {
        return getMutualInformation() / Math.sqrt(getEntropyU() * getEntropyV());
    }
    /**
     * Returns the min normalized mutual information (see Table 2 of cited
     * paper)
     *
     * @return the min normalized mutual information
     */
    public double getNmiMin()
    {
        return getMutualInformation() / Math.min(getEntropyU(), getEntropyV());
    }
    /**
     * Returns the approximate expected value of the mutual information for the
     * two partitionings. Specifically, the middle value from Equation 4 of the
     * cited paper. Computes and stores the value if this is the first time
     * called.
     *
     * @return the approximate expected value of the mutual information
     */
    public double getExpectedMutualInformation()
    {
        if (expectedMutualInformation == UNCOMPUTED)
        {
            expectedMutualInformation = 0;
            double N = n;
            for (int i = 0; i < nu; ++i)
            {
                double ai = uSums[i];
                for (int j = 0; j < nv; ++j)
                {
                    double bj = vSums[j];
                    expectedMutualInformation += (ai * bj) / (N * N) * Math.log(
                        (N * (ai - 1) * (bj - 1)) / ((N - 1) * ai * bj) + (N
                        / (ai * bj)));
                }
            }
            // Note: the Equation 4 approximation carries no leading minus
            // sign (unlike the entropy computations), so no negation is
            // performed here.
        }
        return expectedMutualInformation;
    }
    /**
     * Returns the max adjusted-for-chance mutual information (see Table 2 of
     * cited paper)
     *
     * @return the max adjusted-for-chance mutual information
     */
    public double getAmiMax()
    {
        return (getMutualInformation() - getExpectedMutualInformation())
            / (Math.max(getEntropyU(), getEntropyV())
            - getExpectedMutualInformation());
    }
    /**
     * Returns the sum adjusted-for-chance mutual information (see Table 2 of
     * cited paper)
     *
     * @return the sum adjusted-for-chance mutual information
     */
    public double getAmiSum()
    {
        return (getMutualInformation() - getExpectedMutualInformation()) / (0.5
            * (getEntropyU() + getEntropyV()) - getExpectedMutualInformation());
    }
    /**
     * Returns the sqrt adjusted-for-chance mutual information (see Table 2 of
     * cited paper)
     *
     * @return the sqrt adjusted-for-chance mutual information
     */
    public double getAmiSqrt()
    {
        return (getMutualInformation() - getExpectedMutualInformation())
            / (Math.sqrt(getEntropyU() * getEntropyV())
            - getExpectedMutualInformation());
    }
    /**
     * Returns the min adjusted-for-chance mutual information (see Table 2 of
     * cited paper)
     *
     * @return the min adjusted-for-chance mutual information
     */
    public double getAmiMin()
    {
        return (getMutualInformation() - getExpectedMutualInformation())
            / (Math.min(getEntropyU(), getEntropyV())
            - getExpectedMutualInformation());
    }
}