/* * File: DominanceGlobalTermWeighter.java * Authors: Justin Basilico * Company: Sandia National Laboratories * Project: Cognitive Foundry * * Copyright April 22, 2009, Sandia Corporation. * Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive * license for use of this work by or on behalf of the U.S. Government. Export * of this program may require a license from the United States Government. * See CopyrightHistory.txt for complete details. * */ package gov.sandia.cognition.text.term.vector.weighter.global; import gov.sandia.cognition.math.matrix.SparseVectorFactory; import gov.sandia.cognition.math.matrix.Vector; import gov.sandia.cognition.math.matrix.VectorEntry; import gov.sandia.cognition.math.matrix.VectorFactory; import gov.sandia.cognition.math.matrix.VectorUtil; import gov.sandia.cognition.util.ObjectUtil; /** * Implements the dominance term gloal weighting scheme. It is based on the * entropy global weighting scheme, but instead the global weight favors terms * with high entropy instead of discounting them, which is called the term * dominance. The formula for weighting is given as: * * For term i, the global weight (D(i)) is: * D(i) = exp(H(i)) / n * H(i) = - sum_j { p_ij log(p_ij) } * p_ij = tf_ij / gf_i * * where * n = The total number of documents * gf_i = The total number of times that term i appears * tf_ij = The number of times that term i appears in document j * * This class uses an optimization for computing H(i): * H(i) = - (sum_j (tf_ij log(tf_ij))) / fg_i + log(fg_i) * which allows sum_j (tf_ij log(tf_ij)) to be incrementally computed and then * divided by gf_i when needed, instead of needing to compute p_ij each time. * * @author Justin Basilico * @since 3.0 */ public class DominanceGlobalTermWeighter extends AbstractEntropyBasedGlobalTermWeighter { /** A vector caching the global dominance weight of the document collection. * It may be null. Use getDominance() to compute the proper value if it has * not been updated yet. */ protected Vector dominance; /** * Creates a new {@code DominanceGlobalTermWeighter}. */ public DominanceGlobalTermWeighter() { this(SparseVectorFactory.getDefault()); } /** * Creates a new {@code DominanceGlobalTermWeighter}. * * @param vectorFactory * The vector factory. */ public DominanceGlobalTermWeighter( final VectorFactory<? extends Vector> vectorFactory) { super(vectorFactory); this.setDominance(null); } @Override public DominanceGlobalTermWeighter clone() { DominanceGlobalTermWeighter clone = (DominanceGlobalTermWeighter) super.clone(); clone.dominance = ObjectUtil.cloneSafe(this.dominance); return clone; } @Override public void add( final Vector counts) { super.add(counts); this.setDominance(null); } @Override public boolean remove( final Vector counts) { final boolean result = super.remove(counts); if (result) { this.setDominance(null); } return result; } public Vector getGlobalWeights() { return this.getDominance(); } public int getDimensionality() { return VectorUtil.safeGetDimensionality(this.getTermGlobalFrequencies()); } /** * Gets the dominance weight (global weight) vector for all of the terms. * * @return * The dominance weight (global weight) vector for all of the terms. */ public Vector getDominance() { // We cache the dominance. if (this.dominance == null && this.termGlobalFrequencies != null) { // Need to update the dominance. Start by creating an empty vector to // hold it. final int dimensionality = this.getDimensionality(); final Vector newDominance = this.getVectorFactory().createVector( dimensionality); for (VectorEntry entry : this.termGlobalFrequencies) { final int index = entry.getIndex(); final double termEntropySum = this.termEntropiesSum.getElement(index); final double termOccurrences = entry.getValue(); // Calculate the actual dominance values. double value = 0.0; if (termOccurrences != 0.0) { value = Math.exp(-(termEntropySum / termOccurrences - Math.log(termOccurrences))) / this.documentCount; } newDominance.setElement(index, value); } this.setDominance(newDominance); } return this.dominance; } /** * Sets the cached dominance weight vector. * * @param dominance * The cached dominance weight vector. */ protected void setDominance( final Vector dominance) { this.dominance = dominance; } }