/*******************************************************************************
 *  Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  	http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *******************************************************************************/
package tml.corpus;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Hashtable;
import java.util.List;

import org.apache.log4j.Logger;

/**
 * This class represents a group of {@link Term}s or words/symbols, usually
 * obtained from a set of documents or text passages. It is the common set of
 * words for a group of documents.
 *
 * The dictionary can filter {@link Term}s based on a selection criterion and
 * its specific threshold. By default the {@link TermSelection} criterion is a
 * minimum DF or Document Frequency, i.e. the {@link Term} must appear in at
 * least a certain number of different {@link TextPassage}s indicated by the
 * threshold.
 *
 * A {@link Dictionary} also maintains the list of {@link Term}s inside a
 * {@link TextPassage}. When the {@link TermSelection} criterion is applied,
 * the {@link Dictionary} removes the unused {@link Term}s from the
 * {@link TextPassage}s that contain those {@link Term}s.
 *
 * @author Jorge Villalon
 *
 */
public class Dictionary {

	/** Logger */
	private static final Logger logger = Logger.getLogger(Dictionary.class);

	/** Terms indexed by their lexical representation (String) */
	private final Hashtable<String, Term> termsByText;

	/**
	 * The lexical representations of the terms, in insertion order. Kept in
	 * sync with {@link #termsByText}: every entry here is a key of the index.
	 */
	private List<String> terms;

	/** The corpus to which the dictionary belongs */
	private final Corpus corpus;

	/**
	 * Basic constructor of a {@link Dictionary}, initialises the list and index
	 * of {@link Term}s
	 *
	 * @param corpus
	 *            the corpus that owns this dictionary, must not be null
	 */
	public Dictionary(Corpus corpus) {
		assert (corpus != null);

		this.termsByText = new Hashtable<String, Term>();
		this.terms = new ArrayList<String>();
		this.corpus = corpus;

		logger.debug("Creating dictionary for corpus " + corpus);
	}

	/**
	 * Adds an array of {@link Term}s to the {@link Dictionary} and their
	 * frequencies. Both must come from a specific {@link TextPassage}.
	 *
	 * For each text in <code>newTerms</code> the matching {@link Term} is
	 * looked up in the dictionary and created if it is not there yet. The
	 * frequency at the same position of <code>termFreqs</code> is then
	 * registered both on the term and on the document.
	 *
	 * @param newTerms
	 *            the lexical representations of the terms
	 * @param termFreqs
	 *            the frequency of each term, in the same order as
	 *            <code>newTerms</code>
	 * @param document
	 *            the passage in which the terms appear
	 */
	public void addTerms(String[] newTerms, int[] termFreqs,
			TextPassage document) {

		assert (document != null);
		assert (newTerms != null);
		assert (termFreqs != null);

		int currentTerm = 0;

		// For each lexical representation of a term (word or symbol)
		for (String t : newTerms) {

			// Its correspondent Term is obtained, if it doesn't exist it is
			// created with the next free index
			Term term = this.termsByText.get(t);
			if (term == null) {
				term = new Term(t, this.termsByText.size());
				this.termsByText.put(term.getTerm(), term);
				this.terms.add(term.getTerm());
			}

			int frequency = termFreqs[currentTerm];

			// The document is added to the term's list of documents
			term.addTermAppearance(document, frequency);

			// The term is added to the list of terms in the document
			document.addTerm(term, frequency);

			currentTerm++;
		}
	}

	/**
	 * Removes the {@link Term}s from the {@link Dictionary} that don't meet
	 * the {@link TermSelection} criterion according to the threshold.
	 *
	 * Every rejected term is removed from the index, from the list of terms
	 * and from each {@link TextPassage} it reports. The surviving terms are
	 * re-indexed consecutively starting at 0, following insertion order.
	 */
	public void removeTerms() {

		int originalNumberOfTerms = this.terms.size();

		// Only the surviving terms are copied here; it replaces the list at
		// the end so no stale text is left behind for a later call to
		// dereference.
		List<String> keptTerms = new ArrayList<String>(originalNumberOfTerms);
		int termIndex = 0;

		// For each term in the dictionary
		for (int currentIndex = 0; currentIndex < originalNumberOfTerms; currentIndex++) {

			String text = this.terms.get(currentIndex);
			Term term = this.termsByText.get(text);
			double termValue = selectionValue(term);

			// Now validate if the term value meets the criterion
			if (termValue < this.corpus.getParameters()
					.getTermSelectionThreshold()) {
				// Remove the term from the dictionary and from its documents
				this.termsByText.remove(term.getTerm());
				for (TextPassage document : term.getTextPassages()) {
					document.removeTerm(term);
				}
			} else {
				// Surviving terms get consecutive indices
				term.setIndex(termIndex);
				keptTerms.add(text);
				termIndex++;
			}
		}

		this.terms = keptTerms;

		logger.debug("Removal of terms finished, "
				+ (originalNumberOfTerms - termIndex) + " terms removed, "
				+ termIndex + " terms kept.");
	}

	/**
	 * Reads from a {@link Term} the statistic that corresponds to the term
	 * selection criterion configured in the corpus parameters.
	 *
	 * @param term
	 *            the term to evaluate
	 * @return the value to compare against the threshold; 0 (after logging an
	 *         error) when the criterion is not one of the handled ones
	 */
	private double selectionValue(Term term) {
		switch (this.corpus.getParameters().getTermSelectionCriterion()) {
		case AVG_TF:
			return term.getTermGlobalFrequencyMean();
		case DF:
			return term.getDocumentFrequency();
		case TF:
			return term.getTermGlobalFrequency();
		default:
			logger.error("Invalid term selection criteria");
			return 0;
		}
	}

	/**
	 * Returns the collection of {@link Term}s in the {@link Dictionary}
	 *
	 * @return a {@link Collection} of {@link Term}s
	 */
	public Collection<Term> getTerms() {
		return this.termsByText.values();
	}

	/**
	 * Gets the {@link Corpus} to which the {@link Dictionary} belongs
	 *
	 * @return a {@link Corpus}
	 */
	public Corpus getCorpus() {
		return this.corpus;
	}

	/**
	 * Returns a {@link Term} that represents a word, null if it is not in the
	 * {@link Dictionary}
	 *
	 * @param word
	 *            the word to look for
	 * @return the {@link Term}
	 */
	public Term getTermByText(String word) {
		return this.termsByText.get(word);
	}
}