/*******************************************************************************
 * Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package tml.vectorspace;

import org.apache.log4j.Logger;

import tml.corpus.Corpus;
import tml.utils.Stats;
import Jama.Matrix;

/**
 * The TermWeighting filter transforms a basic Term/Document frequency matrix
 * into a weighted Term/Document matrix. At the moment we support a combination
 * of local weights and global weights: the final value of each cell is the
 * product of a local and a global weight.
 * <p>
 * Local weights can be:
 * <ul>
 * <li>Binary: 1 if the term appears in the document, 0 otherwise.</li>
 * <li>TF: the raw term frequency.</li>
 * <li>TFn: the raw term frequency normalised by the maximum frequency in the
 * document.</li>
 * <li>LOGTF: log(1 + tf), the log of one plus the raw term frequency.</li>
 * </ul>
 * Global weights can be None, Normal, GfIdf, Idf or Entropy. For example, the
 * log-entropy scheme combines the local weight log(1 + tf) with the global
 * weight
 *
 * <pre>
 * 1 + Sum_j ((tf_j / gf) * log(tf_j / gf)) / log(N)
 * </pre>
 *
 * where tf_j is the raw frequency of the term in document j, gf is the global
 * term frequency (the number of times the term appears in the corpus, i.e.
 * Sum_j tf_j), and N is the number of documents (or parts) in the corpus. More
 * details in Dumais, Susan (1990), "Enhancing Performance in Latent Semantic
 * Indexing (LSI) Retrieval".
 *
 * @author Jorge Villalon
 */
public class TermWeighting {

    /**
     * Implemented global weight functions
     */
    public enum GlobalWeight {
        /** Just 1, so it leaves the local weight as it is */
        None,
        /** One over the norm of the term's frequency vector, 1 / sqrt(Sum_j tf_j^2) */
        Normal,
        /** Global frequency divided by document frequency, gf / df */
        GfIdf,
        /** Inverse document frequency, log2(N / df) + 1 */
        Idf,
        /** One plus the accumulated entropy of the term across documents */
        Entropy
    }

    /**
     * Implemented local weight functions
     */
    public enum LocalWeight {
        /** 1 if the term is in the document, 0 otherwise */
        Binary,
        /** The frequency of the term in the document */
        TF,
        /**
         * The normalised frequency, i.e. divided by the maximum frequency in
         * the doc
         */
        TFn,
        /** The log of 1 plus the term frequency */
        LOGTF
    }
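
    /*
     * Worked example (illustrative only, not part of the class): the numbers
     * below simply apply the log-entropy formula from the class comment, i.e.
     * LocalWeight.LOGTF combined with GlobalWeight.Entropy, on a 2-document corpus.
     *
     * A term that appears twice in document 0 and nowhere else has tf = {2, 0}
     * and gf = 2, so p_0 = 1 and the entropy sum is (1 * log2(1)) / log2(2) = 0;
     * its global weight is 1 + 0 = 1 and the cell value 2 becomes
     * log(1 + 2) * 1 = log(3).
     *
     * A term spread evenly over both documents has tf = {1, 1}, gf = 2 and
     * p_j = 0.5 in each document, so the entropy sum is
     * 2 * (0.5 * log2(0.5)) / log2(2) = -1; its global weight is 1 - 1 = 0 and
     * its cells are weighted down to 0, the intended penalty for terms that
     * carry little information about which document they come from.
     */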
    public TermWeighting(Corpus corpus) {
        this.corpus = corpus;
    }

    private Corpus corpus = null;
    private static Logger logger = Logger.getLogger(TermWeighting.class);

    private void calculateGlobalValues(Matrix termdoc) throws TermWeightingException {
        // if(this.corpus.getParameters().getTermWeightGlobal() == GlobalWeight.None)
        // return;

        // If the corpus is a projection, then we don't have to calculate
        // the term statistics, because they are defined by the corpus
        // on which we are projecting.
        if (this.corpus.isProjection()) {
            if (this.corpus.getTermEntropies() == null
                    || this.corpus.getTermStats() == null
                    || this.corpus.getDocStats() != null)
                throw new TermWeightingException(new Exception(
                        "The projected corpus should have entropies and term stats, and shouldn't have doc stats calculated."));
        }

        // First, calculate global statistics for both terms and documents
        Stats[] termStats = null;
        if (!this.corpus.isProjection()) // Ask first to save a little bit of memory and time
            termStats = new Stats[termdoc.getRowDimension()];

        // Doc stats are always calculated, for a normal corpus and a projected one.
        Stats[] docStats = new Stats[termdoc.getColumnDimension()];

        if (!this.corpus.isProjection())
            for (int term = 0; term < termdoc.getRowDimension(); term++)
                termStats[term] = new Stats();
        for (int doc = 0; doc < termdoc.getColumnDimension(); doc++)
            docStats[doc] = new Stats();

        // Accumulate the non-zero frequencies into the term and document statistics
        for (int doc = 0; doc < termdoc.getColumnDimension(); doc++) {
            for (int term = 0; term < termdoc.getRowDimension(); term++) {
                if (termdoc.get(term, doc) != 0) {
                    if (!this.corpus.isProjection())
                        termStats[term].add(termdoc.get(term, doc));
                    docStats[doc].add(termdoc.get(term, doc));
                }
            }
        }

        for (int doc = 0; doc < termdoc.getColumnDimension(); doc++) {
            docStats[doc].calculateDerived();
        }

        if (!this.corpus.isProjection()) {
            for (int term = 0; term < termdoc.getRowDimension(); term++) {
                termStats[term].calculateDerived();
            }
            this.corpus.setTermStats(termStats);
        }

        if (!this.corpus.isProjection()) {
            // Now calculate the entropy of each term across documents
            double[] termEntropies = new double[termdoc.getRowDimension()];
            for (int doc = 0; doc < termdoc.getColumnDimension(); doc++) {
                for (int term = 0; term < termdoc.getRowDimension(); term++) {
                    Stats stats = termStats[term];
                    double p = 0;
                    if (stats.sum > 0)
                        p = termdoc.get(term, doc) / stats.sum;
                    validateValue("p for " + term + "," + doc, p);
                    double entropy = 0;
                    double n = termdoc.getColumnDimension();
                    if (p != 0 && n > 1) {
                        entropy = (p * (Math.log(p) / Math.log(2)))
                                / (Math.log(n) / Math.log(2));
                    }
                    validateValue("entropy for " + term + "," + doc + " and p=" + p, entropy);
                    termEntropies[term] += entropy;
                }
            }
            this.corpus.setTermEntropies(termEntropies);
        }

        this.corpus.setDocStats(docStats);
    }

    private double getGlobalValue(Matrix termdoc, int doc, int term) throws TermWeightingException {
        if (corpus.getParameters().getTermWeightGlobal() == GlobalWeight.None)
            return 1;

        double df = corpus.getTermStats()[term].count;
        if (df <= 0)
            throw new TermWeightingException(new Exception(
                    "Invalid document frequency, this should be impossible!"));
        validateValue("df", df);

        double ndocs = termdoc.getColumnDimension();
        validateValue("ndocs", ndocs);

        double gf = corpus.getTermStats()[term].sum;
        validateValue("gf", gf);

        double sumsq = corpus.getTermStats()[term].sumSq;
        if (sumsq <= 0)
            throw new TermWeightingException(new Exception(
                    "Invalid term frequency, this should be impossible!"));
        validateValue("sumsq", sumsq);

        double entropy = corpus.getTermEntropies()[term];
        validateValue("entropy", entropy);

        double value;
        switch (this.corpus.getParameters().getTermWeightGlobal()) {
        case Entropy:
            value = 1 + entropy;
            break;
        case GfIdf:
            value = gf / df;
            break;
        case Idf:
            value = (Math.log(ndocs / df) / Math.log(2)) + 1;
            break;
        case None:
            value = 1;
            break;
        case Normal:
            value = 1 / Math.sqrt(sumsq);
            break;
        default:
            value = 0;
        }
        validateValue("global value", value);
        return value;
    }

    private double getLocalValue(Matrix termdoc, int doc, int term) throws TermWeightingException {
        double value = termdoc.get(term, doc);
        validateValue("local value", value);

        switch (this.corpus.getParameters().getTermWeightLocal()) {
        case Binary:
            if (value > 0)
                return 1;
            return 0;
        case TF:
            return value;
        case TFn:
            if (corpus.getDocStats()[doc].count == 0)
                return 0;
            return value / corpus.getDocStats()[doc].max;
        case LOGTF:
            return Math.log(1 + value);
        default:
            return 0;
        }
    }
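
    /*
     * Usage sketch (illustrative only): "corpus" is assumed to be an already
     * loaded, non-projected Corpus whose parameters select the desired local
     * and global weights; building the Corpus itself is outside this class.
     *
     *     Matrix termdoc = new Matrix(new double[][] {
     *             { 2, 0 },    // term 0: twice in document 0, absent from document 1
     *             { 1, 1 } }); // term 1: once in each document
     *     Matrix weighted = new TermWeighting(corpus).process(termdoc);
     *
     * Note that process(Matrix) updates the matrix in place and also returns it.
     */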
    /**
     * Applies the configured local and global weights to a raw Term/Document
     * frequency matrix. The matrix is modified in place and returned. If the
     * corpus is a projection, the matrix is returned unchanged.
     *
     * @param termdoc the raw Term/Document frequency matrix (terms in rows,
     *            documents in columns)
     * @return the weighted matrix
     * @throws TermWeightingException if any weight is infinite or NaN
     */
    public Matrix process(Matrix termdoc) throws TermWeightingException {
        if (this.corpus.isProjection()) {
            logger.debug("Corpus is projection, no term weighting applied.");
            return termdoc;
        }

        logger.debug("Term weighting. Local: " + this.corpus.getParameters().getTermWeightLocal()
                + " Global: " + this.corpus.getParameters().getTermWeightGlobal());

        calculateGlobalValues(termdoc);

        logger.debug("Updating weights");
        for (int doc = 0; doc < termdoc.getColumnDimension(); doc++) {
            for (int term = 0; term < termdoc.getRowDimension(); term++) {
                double localValue = getLocalValue(termdoc, doc, term);
                double globalValue = getGlobalValue(termdoc, doc, term);
                double value = localValue * globalValue;
                if (Double.isInfinite(value))
                    throw new TermWeightingException(new Exception(
                            "Infinite weight for term " + term + ", document " + doc));
                if (Double.isNaN(value))
                    throw new TermWeightingException(new Exception(
                            "NaN weight for term " + term + ", document " + doc));
                termdoc.set(term, doc, value);
            }
        }

        return termdoc;
    }

    private void validateValue(String name, double value) throws TermWeightingException {
        if (Double.isInfinite(value))
            throw new TermWeightingException(new Exception(name + ":" + value + " is invalid - Infinite"));
        if (Double.isNaN(value))
            throw new TermWeightingException(new Exception(name + ":" + value + " is invalid - NaN"));
    }
}