/* * File: InverseDocumentFrequencyGlobalWeighter.java * Authors: Justin Basilico * Company: Sandia National Laboratories * Project: Cognitive Foundry * * Copyright April 21, 2009, Sandia Corporation. * Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive * license for use of this work by or on behalf of the U.S. Government. Export * of this program may require a license from the United States Government. * See CopyrightHistory.txt for complete details. * */ package gov.sandia.cognition.text.term.vector.weighter.global; import gov.sandia.cognition.annotation.PublicationReference; import gov.sandia.cognition.annotation.PublicationType; import gov.sandia.cognition.math.matrix.SparseVectorFactory; import gov.sandia.cognition.math.matrix.Vector; import gov.sandia.cognition.math.matrix.VectorEntry; import gov.sandia.cognition.math.matrix.VectorFactory; import gov.sandia.cognition.math.matrix.VectorUtil; import gov.sandia.cognition.util.ObjectUtil; /** * Implements the inverse-document-frequency (IDF) term global weighting scheme. * It is a commonly used term weighting approach that gives a higher weight to * terms that appear in a small number of documents in the collection. Its * formula is: * * idf_i = log(n / df_i) * * where n is the total number of documents and df_i is the number of documents * that term i appears in. * * @author Justin Basilico * @since 3.0 */ @PublicationReference ( author="Wikipedia", title="tf-idf", type=PublicationType.WebPage, url="http://en.wikipedia.org/wiki/tf-idf", year=2009 ) public class InverseDocumentFrequencyGlobalTermWeighter extends AbstractFrequencyBasedGlobalTermWeighter { /** The (cached) value of the inverse document frequency. The cached value * is cleared out whenever a document is added or removed. It is recomputed * from the other state values on request. */ protected Vector inverseDocumentFrequency; /** * Creates a new {@code InverseDocumentFrequencyGlobalTermWeighter}. */ public InverseDocumentFrequencyGlobalTermWeighter() { this(SparseVectorFactory.getDefault()); } /** * Creates a new {@code InverseDocumentFrequencyGlobalTermWeighter}. * * @param vectorFactory * The vector factory to use. */ public InverseDocumentFrequencyGlobalTermWeighter( final VectorFactory<? extends Vector> vectorFactory) { super(vectorFactory); } @Override public InverseDocumentFrequencyGlobalTermWeighter clone() { final InverseDocumentFrequencyGlobalTermWeighter clone = (InverseDocumentFrequencyGlobalTermWeighter) super.clone(); clone.inverseDocumentFrequency = ObjectUtil.cloneSafe(this.inverseDocumentFrequency); return clone; } @Override public void add( final Vector counts) { super.add(counts); this.setInverseDocumentFrequency(null); } @Override public boolean remove( final Vector counts) { final boolean result = super.remove(counts); if (result) { this.setInverseDocumentFrequency(null); } return result; } public int getDimensionality() { return VectorUtil.safeGetDimensionality(this.getTermDocumentFrequencies()); } public Vector getGlobalWeights() { return this.getInverseDocumentFrequency(); } /** * Gets the inverse-document-frequency (IDF) global weight values. * * @return * The inverse-document-frequency (IDF) values. */ public Vector getInverseDocumentFrequency() { // We cache the inverse document frequency. if ( this.inverseDocumentFrequency == null && this.termDocumentFrequencies != null) { // Need to update the IDF. Start by copying the term occurrence // counts vector since that is what we will use to compute the IDF. Vector newIDFs = this.getVectorFactory().copyVector(this.termDocumentFrequencies); for (VectorEntry entry : newIDFs) { // Get the number of documents this term occurrs in. final double count = entry.getValue(); if (count > 0.0) { // Compute the inverse-document frequency and use that as // the value. final double idf = Math.log(this.documentCount / count); entry.setValue(idf); } } this.setInverseDocumentFrequency(newIDFs); } return this.inverseDocumentFrequency; } /** * Sets the cached inverse-document-frequency (IDF) global weight values. * * @param inverseDocumentFrequency * The cached inverse-document-frequency (IDF) global weight values. */ protected void setInverseDocumentFrequency( final Vector inverseDocumentFrequency) { this.inverseDocumentFrequency = inverseDocumentFrequency; } }