/*
 * File:                EntropyGlobalTermWeighter.java
 * Authors:             Justin Basilico
 * Company:             Sandia National Laboratories
 * Project:             Cognitive Foundry
 *
 * Copyright April 22, 2009, Sandia Corporation.
 * Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive
 * license for use of this work by or on behalf of the U.S. Government. Export
 * of this program may require a license from the United States Government.
 * See CopyrightHistory.txt for complete details.
 */

package gov.sandia.cognition.text.term.vector.weighter.global;

import gov.sandia.cognition.annotation.PublicationReference;
import gov.sandia.cognition.annotation.PublicationType;
import gov.sandia.cognition.math.matrix.SparseVectorFactory;
import gov.sandia.cognition.math.matrix.Vector;
import gov.sandia.cognition.math.matrix.VectorEntry;
import gov.sandia.cognition.math.matrix.VectorFactory;
import gov.sandia.cognition.math.matrix.VectorUtil;
import gov.sandia.cognition.util.ObjectUtil;

/**
 * Implements the entropy global term weighting scheme. This weighting scheme
 * has been shown to work well with Latent Semantic Analysis (Dumais, 1991).
 *
 * For a term i, the global weight (W(i)) is:
 *     W(i) = 1 - E(i) / log(n)
 *     E(i) = - sum_j (p_ij * log(p_ij))
 *     p_ij = tf_ij / gf_i
 *
 * where
 *     n = The total number of documents
 *     gf_i = The total number of times that term i appears
 *     tf_ij = The number of times that term i appears in document j
 *
 * This class uses an optimization for computing E(i):
 *     E(i) = - (sum_j (tf_ij * log(tf_ij))) / gf_i + log(gf_i)
 * which allows sum_j (tf_ij * log(tf_ij)) to be incrementally computed and
 * then divided by gf_i when needed, instead of needing to compute p_ij each
 * time.
 *
 * @author  Justin Basilico
 * @since   3.0
 */
@PublicationReference(
    author={"Susan T. Dumais"},
    title="Improving the retrieval of information from external sources",
    year=1991,
    type=PublicationType.Journal,
    publication="Behavior Research Methods, Instruments, and Computers",
    pages={229, 236},
    url="http://www.google.com/url?sa=t&source=web&ct=res&cd=1&url=http%3A%2F%2Fwww.psychonomic.org%2Fsearch%2Fview.cgi%3Fid%3D5145&ei=o7joSdGEHY-itgPLre3tAQ&usg=AFQjCNEvm6PZEL6_Hk3XThI6DQ-gGx9EnQ&sig2=-gjFzNroJQirwGtwjaJvgQ"
)
public class EntropyGlobalTermWeighter
    extends AbstractEntropyBasedGlobalTermWeighter
{

    /**
     * A vector caching the global entropy weight of the document collection.
     * It may be null. Use getEntropy() to compute the proper value if it has
     * not been updated yet.
     */
    protected Vector entropy;

    /**
     * Creates a new {@code EntropyGlobalTermWeighter}.
     */
    public EntropyGlobalTermWeighter()
    {
        this(SparseVectorFactory.getDefault());
    }

    /**
     * Creates a new {@code EntropyGlobalTermWeighter}.
     *
     * @param   vectorFactory
     *      The vector factory to use.
     */
    public EntropyGlobalTermWeighter(
        final VectorFactory<? extends Vector> vectorFactory)
    {
        super(vectorFactory);

        this.setEntropy(null);
    }

    @Override
    public EntropyGlobalTermWeighter clone()
    {
        final EntropyGlobalTermWeighter clone =
            (EntropyGlobalTermWeighter) super.clone();
        clone.entropy = ObjectUtil.cloneSafe(this.entropy);
        return clone;
    }

    @Override
    public void add(
        final Vector counts)
    {
        super.add(counts);
        this.setEntropy(null);
    }

    @Override
    public boolean remove(
        final Vector counts)
    {
        final boolean result = super.remove(counts);

        if (result)
        {
            this.setEntropy(null);
        }

        return result;
    }

    public int getDimensionality()
    {
        return VectorUtil.safeGetDimensionality(
            this.getTermGlobalFrequencies());
    }

    public Vector getGlobalWeights()
    {
        return this.getEntropy();
    }

    /**
     * Gets the entropy weight (global weight) vector for all of the terms.
     *
     * @return
     *      The entropy weight (global weight) vector for all of the terms.
     */
    public Vector getEntropy()
    {
        // We cache the entropy.
        if (this.entropy == null && this.termGlobalFrequencies != null)
        {
            // Need to update the entropy. Start by creating an empty vector
            // to hold it.
            final int dimensionality = this.getDimensionality();
            final Vector newEntropy =
                this.getVectorFactory().createVector(dimensionality);
            final double logDocumentCount = Math.log(this.documentCount);
            for (VectorEntry entry : this.termGlobalFrequencies)
            {
                final int index = entry.getIndex();
                final double termEntropySum =
                    this.termEntropiesSum.getElement(index);
                final double termOccurrences = entry.getValue();

                // Calculate the actual entropy values.
                double value = 1.0;
                if ((termOccurrences != 0.0) && (logDocumentCount != 0.0))
                {
                    value += (termEntropySum / termOccurrences
                        - Math.log(termOccurrences)) / logDocumentCount;
                }

                newEntropy.setElement(index, value);
            }

            this.setEntropy(newEntropy);
        }

        return this.entropy;
    }

    /**
     * Sets the cached entropy weight vector.
     *
     * @param   entropy
     *      The cached entropy weight vector.
     */
    protected void setEntropy(
        final Vector entropy)
    {
        this.entropy = entropy;
    }

}
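// Usage sketch (illustrative only, not part of the original file). It shows
// how the weighter above might be fed per-document term-count vectors and
// asked for the entropy-based global weights. The three-term documents are
// hypothetical; the calls used (createVector, setElement, add,
// getGlobalWeights) are the ones this class itself defines or invokes.
//
//     EntropyGlobalTermWeighter weighter = new EntropyGlobalTermWeighter();
//
//     // Document 1: term 0 appears once, term 2 appears twice.
//     Vector document1 = SparseVectorFactory.getDefault().createVector(3);
//     document1.setElement(0, 1.0);
//     document1.setElement(2, 2.0);
//     weighter.add(document1);
//
//     // Document 2: term 1 appears three times, term 2 appears once.
//     Vector document2 = SparseVectorFactory.getDefault().createVector(3);
//     document2.setElement(1, 3.0);
//     document2.setElement(2, 1.0);
//     weighter.add(document2);
//
//     // One entropy weight per term; computed lazily, cached, and
//     // invalidated whenever documents are added or removed.
//     Vector globalWeights = weighter.getGlobalWeights();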