ChiSquaredSimilarity.java example

Explorer
Foundry-master
- Components
/*
 * File:                ChiSquaredSimilarity.java
 * Authors:             Tom Brounstein
 * Company:             Sandia National Laboratories
 * Project:             Cognitive Foundry
 * 
 * Copyright July 1, 2014, Sandia Corporation.
 * Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive
 * license for use of this work by or on behalf of the U.S. Government.
 * Export of this program may require a license from the United States
 * Government. See CopyrightHistory.txt for complete details.
 * 
 */

package gov.sandia.cognition.statistics;

import gov.sandia.cognition.annotation.PublicationReference;
import gov.sandia.cognition.annotation.PublicationType;
import gov.sandia.cognition.math.matrix.Vector;
import gov.sandia.cognition.statistics.distribution.ChiSquareDistribution;

/**
 * Computes the chi-squared similarity between two vectors.  A chi-squared
 * test requires frequency vectors (typically term counts of documents), so
 * the absolute value of every element is used in the computation.  The test
 * treats one vector as a document from a known category and the other as a
 * document being tested to see whether it is (likely) drawn from the same
 * distribution.  The statistic is symmetric in the two vectors, so the choice
 * of which vector is the categorized one and which is the testing one is
 * somewhat arbitrary.
 * <p>
 * The vectors handed to the constructor and setters are cloned; the getters
 * return the internally held instances.
 *
 * @author trbroun
 * @since  3.4.2
 */
@PublicationReference(
    author="Yao-Tsung Chen, Meng Chang Chen",
    title="Using chi-square statistics to measure similarities for text categorization",
    type=PublicationType.Journal,
    year=2011,
    url="http://www.sciencedirect.com/science/article/pii/S0957417410008961#"
)
public class ChiSquaredSimilarity
{
    /** Private copy of the vector that comes from a known category. */
    private Vector categorizedVector;

    /** Private copy of the vector being tested against the category. */
    private Vector testingVector;

    /**
     * Creates a new similarity computation over clones of the two given
     * vectors, so later changes to the arguments do not affect this object.
     *
     * @param categorizedVector  The vector from a known category.
     * @param testingVector The vector which is being tested to see if it
     *      comes from the same category.
     */
    public ChiSquaredSimilarity(Vector categorizedVector, Vector testingVector) {
        this.categorizedVector = categorizedVector.clone();
        this.testingVector = testingVector.clone();
    }

    /**
     * Replaces the categorized vector with a clone of the given vector.
     *
     * @param newCategorizedVector The new vector from a known category.
     */
    public void setCategorizedVector(Vector newCategorizedVector) {
        categorizedVector = newCategorizedVector.clone();
    }

    /**
     * Replaces the testing vector with a clone of the given vector.
     *
     * @param newTestVector The new vector to test against the category.
     */
    public void setTestVector(Vector newTestVector) {
        testingVector = newTestVector.clone();
    }

    /**
     * Gets the categorized vector.  The returned object is the instance held
     * by this object, not a fresh copy.
     *
     * @return The categorized vector.
     */
    public Vector getCategorizedVector() {
        return categorizedVector;
    }

    /**
     * Gets the testing vector.  The returned object is the instance held by
     * this object, not a fresh copy.
     *
     * @return The test vector.
     */
    public Vector getTestVector() {
        return testingVector;
    }

    /**
     * Computes the chi-squared statistic of the two vectors.  This is a raw
     * number and needs to be fed into a chi-squared distribution to compute a
     * probability.  Both vectors must be non-zero and of equal dimensionality.
     * Indices at which both vectors are zero contribute nothing to the
     * statistic.
     *
     * @return The chi-squared statistic, never negative.
     * @throws IllegalArgumentException If the two vectors differ in
     *      dimensionality, or if either of them has a 1-norm of zero.
     */
    public double compute() {
        final int dimensionality = categorizedVector.getDimensionality();
        if (dimensionality != testingVector.getDimensionality()) {
            throw new IllegalArgumentException("Dimension mismatch.");
        }

        final double sumCat = categorizedVector.norm1();
        final double sumTest = testingVector.norm1();

        if (sumCat == 0 || sumTest == 0) {
            throw new IllegalArgumentException("One of the vectors is the 0 vector.");
        }

        // Grand total of both vectors.
        final double h = sumCat + sumTest;

        double total = 0.0;
        for (int ii = 0; ii < dimensionality; ii++) {
            final double catii = Math.abs(categorizedVector.getElement(ii));
            final double testii = Math.abs(testingVector.getElement(ii));
            final double pairSum = catii + testii;

            // An index that is empty in both vectors would divide by zero
            // and carries no information, so it is skipped.
            if (pairSum == 0) {
                continue;
            }

            final double firstTerm = (catii * catii) / (sumCat * pairSum);
            final double secondTerm = (testii * testii) / (sumTest * pairSum);

            total += firstTerm + secondTerm;
        }

        // Mathematically total >= 1, so the statistic is >= 0.  Rounding can
        // push total marginally below 1 when the vectors are (nearly)
        // proportional, which would yield a tiny negative statistic; clamp it.
        // Math.max propagates NaN, so NaN inputs still produce NaN.
        return Math.max(0.0, h * total - h);
    }

    /**
     * Computes the chi-squared similarity statistic, then uses that to compute
     * a cumulative probability.  Returns the probability that a chi-squared
     * statistic falls between 0 and the critical value (the computed
     * chi-squared statistic for the two supplied vectors).  Naturally, a large
     * chi-squared value generates a large cumulative probability value.
     * <p>
     * The distribution is built with (dimensionality - 1) degrees of freedom.
     * NOTE(review): this counts every dimension, including indices that
     * {@link #compute()} skips because both vectors are zero there; confirm
     * that this is the intended degrees of freedom for sparse vectors.
     * NOTE(review): for one-dimensional vectors this requests zero degrees of
     * freedom; presumably ChiSquareDistribution rejects that -- verify.
     *
     * @return The probability of a chi-squared statistic being lower than the
     *      value of the chi-squared similarity of the given vectors.
     * @throws IllegalArgumentException If {@link #compute()} rejects the
     *      current vectors.
     */
    public double computeCumulativeProbabilityValue() {
        final double chiSquaredStat = compute();

        final ChiSquareDistribution dist =
            new ChiSquareDistribution(categorizedVector.getDimensionality() - 1);

        return dist.getCDF().evaluate(chiSquaredStat);
    }

}