CommunityComparisons.java example

Explorer
Foundry-master
- Components
/*
 * File:                CommunityComparisons.java
 * Authors:             Jeremy D. Wendt
 * Company:             Sandia National Laboratories
 * Project:             Cognitive Foundry
 * 
 * Copyright 2016, Sandia Corporation.
 * Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive
 * license for use of this work by or on behalf of the U.S. Government. 
 * Export of this program may require a license from the United States
 * Government. See CopyrightHistory.txt for complete details.
 * 
 */

package gov.sandia.cognition.graph.community;

import gov.sandia.cognition.annotation.PublicationReference;
import gov.sandia.cognition.annotation.PublicationType;
import java.util.HashSet;
import java.util.Set;

/**
 * This class implements various comparisons of two partitionings of the same
 * set of graph nodes (or the resulting communities). At construction it builds
 * the contingency table of the two partitionings; every information-theoretic
 * measure (entropies, mutual information, and the normalized and
 * adjusted-for-chance variants) is then derived from that table. All
 * logarithms are natural logarithms. The expensive quantities are computed on
 * first request and cached in instance fields.
 *
 * @author jdwendt
 * @param <NodeNameType> The type used to name nodes
 */
@PublicationReference(author = "Nguyen Xuan Vinh, Julien Epps, and James Bailey",
    title
    = "Information Theoretic Measures of Clusterings Comparison: Variants, Properties, Normalization and Correction for Chance",
    type = PublicationType.Journal, year = 2010, publication
    = "Journal of Machine Learning Research", pages = {2837, 2854})
public class CommunityComparisons<NodeNameType>
{

    /**
     * Sentinel stored in a cache field until its value has been computed
     */
    private static final double NOT_COMPUTED = Double.MAX_VALUE;

    /**
     * The u-index/v-index contingency table that shows the overlap of u's
     * clustering and v's clustering for each pair of clusters
     */
    private final int[][] contingencyTable;

    /**
     * The size of each of u's clusters. Also the sum across contingency table's
     * rows.
     */
    private final int[] uSums;

    /**
     * The size of each of v's clusters. Also the sum across contingency table's
     * columns.
     */
    private final int[] vSums;

    /**
     * The number of nodes in the original set (also the sum of uSums or vSums
     * or all entries in contingencyTable)
     */
    private final int n;

    /**
     * The number of rows (nu) and columns (nv). uSums.length and vSums.length
     */
    private final int nu, nv;

    /**
     * Once computed, the entropy of u (h(u)) is stored here
     */
    private double entropyU;

    /**
     * Once computed, the entropy of v (h(v)) is stored here
     */
    private double entropyV;

    /**
     * Once computed, the joint entropy (h(u, v)) is stored here
     */
    private double jointEntropy;

    /**
     * Once computed, h(u|v) is stored here
     */
    private double condEntropyUGivenV;

    /**
     * Once computed, h(v|u) is stored here
     */
    private double condEntropyVGivenU;

    /**
     * Once computed, the mutual information (I(u, v)) is stored here
     */
    private double mutualInformation;

    /**
     * Once computed, the expected mutual information (E(I(u, v))) is stored
     * here
     */
    private double expectedMutualInformation;

    /**
     * Initializes this with the input partitionings of the same underlying set.
     * Builds the contingency table and both sets of marginal sums in a single
     * pass over the nodes.
     *
     * @param u The first partitioning
     * @param v The second partitioning
     * @throws IllegalArgumentException if the two partitionings do not contain
     * exactly the same set of members
     */
    public CommunityComparisons(NodePartitioning<NodeNameType> u,
        NodePartitioning<NodeNameType> v)
    {
        // First, make sure they partition the same space
        Set<NodeNameType> su = u.getAllMembers();
        Set<NodeNameType> sv = v.getAllMembers();
        if (su.size() != sv.size())
        {
            throw new IllegalArgumentException(
                "Input partitionings do not contain the same number of members: "
                + su.size() + " != " + sv.size());
        }
        // Sizes agree, so the sets are equal exactly when the intersection
        // is as large as either of them
        Set<NodeNameType> tmp = new HashSet<>(su);
        tmp.retainAll(sv);
        if (tmp.size() != sv.size())
        {
            throw new IllegalArgumentException(
                "Input partitionings don't contain the same set of nodes: original size = "
                + su.size() + " != intersection size " + tmp.size());
        }
        entropyU = entropyV = jointEntropy = condEntropyUGivenV
            = condEntropyVGivenU = mutualInformation = expectedMutualInformation
            = NOT_COMPUTED;
        n = sv.size();
        nu = u.getNumPartitions();
        nv = v.getNumPartitions();
        // Java zero-initializes int arrays, so no explicit clearing is needed
        contingencyTable = new int[nu][nv];
        uSums = new int[nu];
        vSums = new int[nv];

        for (NodeNameType node : su)
        {
            int i = u.getPartition(node);
            int j = v.getPartition(node);
            ++contingencyTable[i][j];
            ++uSums[i];
            ++vSums[j];
        }
    }

    /**
     * This returns 0 if 0 is passed in. This is valid only because every time
     * this is called with zero, the returned value will be multiplied by zero
     * and the result should be zero.
     *
     * @param d The double whose log is sought
     * @return the natural log of d unless d is zero (then zero)
     */
    private static double safeLog(double d)
    {
        return (d == 0) ? 0 : Math.log(d);
    }

    /**
     * Helper that computes the entropy of the input marginals where N is the
     * sum of the entries in marginals. Zero entries are skipped: they
     * contribute nothing to the entropy, and skipping them avoids a 0/0 when N
     * is itself zero.
     *
     * @param N The sum of the entries in marginals
     * @param marginals The partitioning of the values
     * @return the entropy of the input marginals
     */
    private static double getEntropy(int N,
        int[] marginals)
    {
        double ret = 0;
        for (int marginal : marginals)
        {
            if (marginal == 0)
            {
                continue;
            }
            double f = ((double) marginal) / ((double) N);
            ret += f * safeLog(f);
        }

        return -ret;
    }

    /**
     * Helper shared by both conditional entropies. Sums
     * p(i,j) * log(p(i,j) / p(given)) over all non-empty cells and negates
     * the result.
     *
     * @param givenSums The marginal sums of the partitioning conditioned on
     * @param givenIsV True if givenSums is indexed by the column (v) index,
     * false if it is indexed by the row (u) index
     * @return the conditional entropy
     */
    private double computeConditionalEntropy(int[] givenSums,
        boolean givenIsV)
    {
        double sum = 0;
        for (int i = 0; i < nu; ++i)
        {
            for (int j = 0; j < nv; ++j)
            {
                int nij = contingencyTable[i][j];
                if (nij == 0)
                {
                    // An empty cell has zero weight; skipping it also avoids
                    // 0/0 = NaN when the conditioning cluster is itself empty
                    continue;
                }
                double numer = ((double) nij) / (double) n;
                // A non-empty cell implies a non-empty marginal, so denom > 0
                double denom = ((double) givenSums[givenIsV ? j : i])
                    / (double) n;
                sum += numer * safeLog(numer / denom);
            }
        }

        return -sum;
    }

    /**
     * Returns the entropy of the u partitioning (computing and storing it if
     * this is the first time called)
     *
     * @return the entropy of the u partitioning
     */
    public double getEntropyU()
    {
        if (entropyU == NOT_COMPUTED)
        {
            entropyU = getEntropy(n, uSums);
        }

        return entropyU;
    }

    /**
     * Returns the entropy of the v partitioning (computing and storing it if
     * this is the first time called)
     *
     * @return the entropy of the v partitioning
     */
    public double getEntropyV()
    {
        if (entropyV == NOT_COMPUTED)
        {
            entropyV = getEntropy(n, vSums);
        }

        return entropyV;
    }

    /**
     * Returns the joint entropy (H(u,v)), computing it if this is the first
     * time called
     *
     * @return the joint entropy
     */
    public double getJointEntropy()
    {
        if (jointEntropy == NOT_COMPUTED)
        {
            double sum = 0;
            for (int[] row : contingencyTable)
            {
                for (int nij : row)
                {
                    if (nij == 0)
                    {
                        // Zero-weight cell; skipping avoids 0/0 when n is 0
                        continue;
                    }
                    double f = ((double) nij) / (double) n;
                    sum += f * safeLog(f);
                }
            }
            jointEntropy = -sum;
        }
        return jointEntropy;
    }

    /**
     * Returns H(u|v), computing and storing it if this is the first time called
     *
     * @return H(u|v)
     */
    public double getConditionalEntropyUGivenV()
    {
        if (condEntropyUGivenV == NOT_COMPUTED)
        {
            condEntropyUGivenV = computeConditionalEntropy(vSums, true);
        }

        return condEntropyUGivenV;
    }

    /**
     * Returns H(v|u), computing and storing it if this is the first time called
     *
     * @return H(v|u)
     */
    public double getConditionalEntropyVGivenU()
    {
        if (condEntropyVGivenU == NOT_COMPUTED)
        {
            condEntropyVGivenU = computeConditionalEntropy(uSums, false);
        }

        return condEntropyVGivenU;
    }

    /**
     * Returns the mutual information (I(u, v)) for the two partitionings,
     * computing and storing it if this is the first time called
     *
     * @return the mutual information
     */
    public double getMutualInformation()
    {
        if (mutualInformation == NOT_COMPUTED)
        {
            double sum = 0;
            // Multiply in double: an int n * n overflows once n exceeds 46340
            double nSquared = (double) n * n;
            for (int i = 0; i < nu; ++i)
            {
                for (int j = 0; j < nv; ++j)
                {
                    int nij = contingencyTable[i][j];
                    if (nij == 0)
                    {
                        // Zero-weight cell; skipping it also avoids 0/0 = NaN
                        // when one of the clusters is empty
                        continue;
                    }
                    double numer = ((double) nij) / (double) n;
                    double denom = ((double) uSums[i] * vSums[j]) / nSquared;
                    sum += numer * safeLog(numer / denom);
                }
            }
            // No negation here (unlike the entropies): the summand is
            // p(i,j) * log(p(i,j) / (p(i) * p(j))), which is the mutual
            // information itself
            mutualInformation = sum;
        }

        return mutualInformation;
    }

    /**
     * Returns the joint normalized mutual information (see Table 2 of cited
     * paper). There is no guard against a zero joint entropy.
     *
     * @return the joint normalized mutual information
     */
    public double getNmiJoint()
    {
        return getMutualInformation() / getJointEntropy();
    }

    /**
     * Returns the max normalized mutual information (see Table 2 of cited
     * paper). There is no guard against a zero denominator.
     *
     * @return the max normalized mutual information
     */
    public double getNmiMax()
    {
        return getMutualInformation() / Math.max(getEntropyU(), getEntropyV());
    }

    /**
     * Returns the sum normalized mutual information (see Table 2 of cited
     * paper). There is no guard against a zero denominator.
     *
     * @return the sum normalized mutual information
     */
    public double getNmiSum()
    {
        return 2 * getMutualInformation() / (getEntropyU() + getEntropyV());
    }

    /**
     * Returns the sqrt normalized mutual information (see Table 2 of cited
     * paper). There is no guard against a zero denominator.
     *
     * @return the sqrt normalized mutual information
     */
    public double getNmiSqrt()
    {
        return getMutualInformation() / Math.sqrt(getEntropyU() * getEntropyV());
    }

    /**
     * Returns the min normalized mutual information (see Table 2 of cited
     * paper). There is no guard against a zero denominator.
     *
     * @return the min normalized mutual information
     */
    public double getNmiMin()
    {
        return getMutualInformation() / Math.min(getEntropyU(), getEntropyV());
    }

    /**
     * Returns the approximate expected value of the mutual information for the
     * two partitionings. Specifically, the middle value from Equation 4 of the
     * cited paper. Computes and stores the value if this is the first time
     * called. Empty clusters are skipped because their terms carry zero
     * weight. The formula divides by (N - 1), so the result is NaN when the
     * partitioned set has exactly one node.
     *
     * @return the approximate expected value of the mutual information
     */
    public double getExpectedMutualInformation()
    {
        if (expectedMutualInformation == NOT_COMPUTED)
        {
            double sum = 0;
            double N = n;
            for (int i = 0; i < nu; ++i)
            {
                double ai = uSums[i];
                if (ai == 0)
                {
                    continue;
                }
                for (int j = 0; j < nv; ++j)
                {
                    double bj = vSums[j];
                    if (bj == 0)
                    {
                        // Weight ai * bj is zero, but the log argument would
                        // be Infinity - Infinity = NaN and poison the sum
                        continue;
                    }
                    sum += (ai * bj) / (N * N) * Math.log(
                        (N * (ai - 1) * (bj - 1)) / ((N - 1) * ai * bj) + (N
                        / (ai * bj)));
                }
            }
            // As with the mutual information, the sum is used without negation
            expectedMutualInformation = sum;
        }

        return expectedMutualInformation;
    }

    /**
     * Returns the max adjusted-for-chance mutual information (see Table 2 of
     * cited paper)
     *
     * @return the max adjusted-for-chance mutual information
     */
    public double getAmiMax()
    {
        return (getMutualInformation() - getExpectedMutualInformation())
            / (Math.max(getEntropyU(), getEntropyV())
            - getExpectedMutualInformation());
    }

    /**
     * Returns the sum adjusted-for-chance mutual information (see Table 2 of
     * cited paper)
     *
     * @return the sum adjusted-for-chance mutual information
     */
    public double getAmiSum()
    {
        return (getMutualInformation() - getExpectedMutualInformation()) / (0.5
            * (getEntropyU() + getEntropyV()) - getExpectedMutualInformation());
    }

    /**
     * Returns the sqrt adjusted-for-chance mutual information (see Table 2 of
     * cited paper)
     *
     * @return the sqrt adjusted-for-chance mutual information
     */
    public double getAmiSqrt()
    {
        return (getMutualInformation() - getExpectedMutualInformation())
            / (Math.sqrt(getEntropyU() * getEntropyV())
            - getExpectedMutualInformation());
    }

    /**
     * Returns the min adjusted-for-chance mutual information (see Table 2 of
     * cited paper)
     *
     * @return the min adjusted-for-chance mutual information
     */
    public double getAmiMin()
    {
        return (getMutualInformation() - getExpectedMutualInformation())
            / (Math.min(getEntropyU(), getEntropyV())
            - getExpectedMutualInformation());
    }

}