/*
* File: AbstractEntropyBasedGlobalTermWeighter.java
* Authors: Justin Basilico
* Company: Sandia National Laboratories
* Project: Cognitive Foundry
*
* Copyright April 20, 2009, Sandia Corporation.
* Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive
* license for use of this work by or on behalf of the U.S. Government. Export
* of this program may require a license from the United States Government.
* See CopyrightHistory.txt for complete details.
*
*/
package gov.sandia.cognition.text.term.vector.weighter.global;
import gov.sandia.cognition.math.matrix.SparseVectorFactory;
import gov.sandia.cognition.math.matrix.Vector;
import gov.sandia.cognition.math.matrix.VectorEntry;
import gov.sandia.cognition.math.matrix.VectorFactory;
import gov.sandia.cognition.util.ObjectUtil;
/**
* An abstract implementation of a global term weighting scheme that keeps track
* of the sum of the entropy term (f_ij * log(f_ij)) over all documents. It is
* used as a speed-up for global term weighting methods that are based on
* entropy so that they can be computed incrementally.
*
* @author Justin Basilico
* @since 3.0
*/
public abstract class AbstractEntropyBasedGlobalTermWeighter
    extends AbstractFrequencyBasedGlobalTermWeighter
{

    /** A vector containing the sum of the entropy term (f_ij * log(f_ij)) over
     *  each document in the collection for each term.
     */
    protected Vector termEntropiesSum;

    /**
     * Creates a new {@code AbstractEntropyBasedGlobalTermWeighter} that uses
     * the default sparse vector factory.
     */
    public AbstractEntropyBasedGlobalTermWeighter()
    {
        this(SparseVectorFactory.getDefault());
    }

    /**
     * Creates a new {@code AbstractEntropyBasedGlobalTermWeighter}. The term
     * entropies sum starts out as null; it is allocated by
     * {@link #initializeVectors(int)}.
     *
     * @param   vectorFactory
     *      The vector factory to use.
     */
    public AbstractEntropyBasedGlobalTermWeighter(
        final VectorFactory<? extends Vector> vectorFactory)
    {
        super(vectorFactory);

        this.setTermEntropiesSum(null);
    }

    /**
     * Creates a copy of this weighter. The term entropies sum vector is
     * cloned through {@code ObjectUtil.cloneSafe} rather than copied by
     * reference.
     *
     * @return
     *      A clone of this object.
     */
    @Override
    public AbstractEntropyBasedGlobalTermWeighter clone()
    {
        final AbstractEntropyBasedGlobalTermWeighter clone =
            (AbstractEntropyBasedGlobalTermWeighter) super.clone();
        clone.termEntropiesSum = ObjectUtil.cloneSafe(this.termEntropiesSum);
        return clone;
    }

    /**
     * Adds a document's term counts to the statistics. After delegating to
     * the superclass, the entropy term (f_ij * log(f_ij)) of every strictly
     * positive count is added to the running sum for that term.
     *
     * @param   counts
     *      The term counts of the document to add.
     */
    @Override
    public void add(
        final Vector counts)
    {
        super.add(counts);

        // Update the term entropies sum.
        for (VectorEntry entry : counts)
        {
            final int index = entry.getIndex();
            final double count = entry.getValue();

            // Only positive counts contribute; this also avoids log(0).
            if (count > 0.0)
            {
                final double termEntropySum = count * Math.log(count)
                    + this.termEntropiesSum.getElement(index);
                this.termEntropiesSum.setElement(index, termEntropySum);
            }
        }
    }

    /**
     * Removes a document's term counts from the statistics. The entropy sums
     * are only touched when the superclass call returns true; in that case
     * the entropy term (f_ij * log(f_ij)) of every strictly positive count is
     * subtracted from the running sum for that term, undoing what
     * {@link #add(Vector)} accumulated for the same counts.
     *
     * @param   counts
     *      The term counts of the document to remove.
     * @return
     *      The value returned by the superclass removal.
     */
    @Override
    public boolean remove(
        final Vector counts)
    {
        final boolean result = super.remove(counts);

        if (result)
        {
            // Update the term entropies sum.
            for (VectorEntry entry : counts)
            {
                final int index = entry.getIndex();
                final double count = entry.getValue();

                // Only positive counts were accumulated by add, so only
                // those are taken back out.
                if (count > 0.0)
                {
                    // Subtract this document's contribution from the running
                    // sum (not the other way around, which would negate the
                    // sum contributed by all the other documents).
                    final double termEntropySum =
                        this.termEntropiesSum.getElement(index)
                        - count * Math.log(count);
                    this.termEntropiesSum.setElement(index, termEntropySum);
                }
            }
        }

        return result;
    }

    /**
     * Initializes the statistics vectors, including a new term entropies sum
     * vector created by the vector factory.
     *
     * @param   dimensionality
     *      The dimensionality of the vectors to create.
     */
    @Override
    protected void initializeVectors(
        final int dimensionality)
    {
        super.initializeVectors(dimensionality);

        this.termEntropiesSum = this.getVectorFactory().createVector(
            dimensionality);
    }

    /**
     * Grows the statistics vectors to a new dimensionality. The term
     * entropies sum is extended by stacking a newly created vector, sized to
     * the difference in dimensionality, after the existing entries.
     *
     * @param   newDimensionality
     *      The dimensionality to grow to.
     */
    @Override
    protected void growVectors(
        final int newDimensionality)
    {
        super.growVectors(newDimensionality);

        this.termEntropiesSum = this.termEntropiesSum.stack(
            this.getVectorFactory().createVector(
                newDimensionality - this.termEntropiesSum.getDimensionality()));
    }

    /**
     * Gets the vector containing the sum of the term entropies.
     *
     * @return
     *      The term entropies sum.
     */
    public Vector getTermEntropiesSum()
    {
        return this.termEntropiesSum;
    }

    /**
     * Sets the vector containing the sum of the term entropies.
     *
     * @param   termEntropiesSum
     *      The term entropies sum.
     */
    protected void setTermEntropiesSum(
        final Vector termEntropiesSum)
    {
        this.termEntropiesSum = termEntropiesSum;
    }

}