/*******************************************************************************
 *  Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *******************************************************************************/
package tml.corpus;

import java.util.ArrayList;
import java.util.List;

import tml.utils.Stats;

/**
 * <p>
 * The {@link Term} class represents a unique word within a {@link Corpus}. It
 * is stored by the {@link Corpus}' {@link Dictionary} and it contains links to
 * all the {@link TextPassage}s that contain the {@link Term}.
 * </p>
 * <p>
 * The class also keeps some statistics, like the total number of times the
 * word appears in a {@link Corpus}.
 * </p>
 * <p>
 * The following code shows how to use the {@link Term}s from a
 * {@link TextDocument}:
 * </p>
 *
 * <pre>
 * ...
 * TextDocument document = .....;
 * List&lt;Term&gt; terms = document.getSentenceCorpus().getDictionary().getTerms();
 * for(Term term : terms) {
 *     System.out.println("Term:" + term.getTerm());
 *     System.out.println("DF:" + term.getDocumentFrequency());
 *     System.out.println("TF:" + term.getTermGlobalFrequency());
 * }
 * </pre>
 *
 * @author Jorge Villalon
 *
 */
public class Term {

    /** The word for the term */
    private final String term;
    /** The index within the dictionary */
    private int index;
    /** The text passages to which the term belongs */
    private final List<TextPassage> textPassages;
    /** The term statistics, fed with one value per registered appearance */
    private final Stats termStats;
    /**
     * Whether appearances were added since the derived statistics were last
     * calculated
     */
    private boolean dirty = true;
    /** Indicates if the term is considered a concept */
    private boolean isConcept = false;

    /**
     * Creates a new {@link Term}, with an index defined by an external source
     * (usually a {@link Dictionary}).
     *
     * @param term
     *            the word that the {@link Term} represents
     * @param index
     *            the index assigned to the {@link Term}
     */
    public Term(String term, int index) {
        this.term = term;
        this.index = index;
        this.termStats = new Stats();
        this.textPassages = new ArrayList<TextPassage>();
    }

    /**
     * Adds a new {@link TextPassage} to the {@link Term} and records the
     * frequency value in the term statistics. The derived statistics are
     * flagged for recalculation.
     *
     * @param textPassage
     *            the passage in which the term appears; it is expected not to
     *            have been added before
     * @param termFrequency
     *            the frequency value to record for that passage
     */
    public void addTermAppearance(TextPassage textPassage, double termFrequency) {
        // The duplicate check is an assertion: it only runs when the JVM is
        // started with assertions enabled (-ea).
        assert !this.textPassages.contains(textPassage);
        this.textPassages.add(textPassage);
        this.termStats.add(termFrequency);
        this.dirty = true;
    }

    /**
     * Recalculates the derived statistics if appearances were added since the
     * last calculation, then clears the dirty flag.
     */
    private void checkDirtyStats() {
        if (this.dirty) {
            this.termStats.calculateDerived();
            this.dirty = false;
        }
    }

    /**
     * @return the document frequency of the term, i.e. in how many documents
     *         it appears. The value is the statistics' count cast to
     *         <code>int</code>.
     */
    public int getDocumentFrequency() {
        this.checkDirtyStats();
        return (int) this.termStats.count;
    }

    /**
     * @return the index of the {@link Term} within its {@link Dictionary}
     */
    public int getIndex() {
        return index;
    }

    /**
     * @return the word that represents the {@link Term}
     */
    public String getTerm() {
        return term;
    }

    /**
     * @return the term frequency of the {@link Term}, i.e. how many times the
     *         word appears in the {@link Corpus}. The value is the sum of the
     *         recorded frequencies cast to <code>int</code>.
     */
    public int getTermGlobalFrequency() {
        this.checkDirtyStats();
        return (int) this.termStats.sum;
    }

    /**
     * @return the mean appearance of the {@link Term} along the
     *         {@link TextPassage}s of a {@link Corpus}.
     */
    public double getTermGlobalFrequencyMean() {
        this.checkDirtyStats();
        return this.termStats.mean;
    }

    /**
     * @return the list of {@link TextPassage}s to which the {@link Term}
     *         belongs. This is the internal list itself, not a copy.
     */
    public List<TextPassage> getTextPassages() {
        return textPassages;
    }

    /**
     * Changes the value of the index for the {@link Term} within a
     * {@link Corpus}, informing every {@link TextPassage} the term belongs to
     * of the change.
     *
     * @param index
     *            the new index of the {@link Term}
     */
    public void setIndex(int index) {
        // Passages must be notified before the field is overwritten, because
        // getIndex() is what supplies the old index to each of them.
        for (TextPassage document : this.getTextPassages()) {
            document.updateTermIndex(this, this.getIndex(), index);
        }
        this.index = index;
    }

    /**
     * @param isConcept
     *            if the {@link Term} is a Concept
     */
    public void setConcept(boolean isConcept) {
        this.isConcept = isConcept;
    }

    /**
     * @return if the Term corresponds to a potential concept
     */
    public boolean isConcept() {
        return isConcept;
    }

    /**
     * The default string for a Term is its own word
     */
    @Override
    public String toString() {
        return this.term;
    }
}