Dictionary.java example

Explorer

tml-master
- tml
  - src
    - main
      - java
        SemanticSpace.java
        TmlCommandLine.java
        tml
        Configuration.java
        annotators
        AbstractAnnotator.java
        Annotator.java
        AnnotatorManager.java
        PennTreeAnnotator.java
        corpus
        Corpus.java
        CorpusParameters.java
        Dictionary.java
        ParagraphCorpus.java
        RepositoryCorpus.java
        SearchResultsCorpus.java
        SentenceCorpus.java
        SimpleCorpus.java
        Term.java
        TextDocument.java
        TextPassage.java
        sql
        DbConnection.java
        storage
        DocumentAnnotator.java
        DocumentCleanup.java
        Repository.java
        RepositoryEvent.java
        RepositoryListener.java
        TmlAnnotatorTask.java
        TmlCleanupTask.java
        TmlIndexerTask.java
        importers
        AbstractImporter.java
        HtmlImporter.java
        Importer.java
        PdfImporter.java
        TextImporter.java
        test
        AbstractTmlIndexingTest.java
        utils
        DBUtils.java
        DistanceLib.java
        Highlighting.java
        JDBCUtils.java
        LanczosSVDLIBCUtils.java
        LanczosSVDPACKCUtils.java
        LuceneUtils.java
        MatrixUtils.java
        RegexUtils.java
        StanfordUtils.java
        Stats.java
        WordNetUtils.java
        vectorspace
        EmptyTextPassageException.java
        NoDocumentsInCorpusException.java
        NotEnoughTermsInCorpusException.java
        SVD.java
        SemanticSpace.java
        TermWeighting.java
        TermWeightingException.java
        factorisation
        MatrixFactorisation.java
        MultiDimensionalScalingNR.java
        NonnegativeMatrixFactorisationED.java
        NonnegativeMatrixFactorisationKL.java
        PrincipalCoordinateAnalysis.java
        ProbabilisticLatentSemanticAnalysis.java
        SingularValueDecomposition.java
        SpaceDecomposition.java
        operations
        AbstractOperation.java
        ClassDiscovery.java
        CompoundNounsSummarized.java
        ConceptExtraction.java
        FactorAnalysisPlot.java
        LastPassage.java
        LexiconAnalysis.java
        Operation.java
        OperationEvent.java
        OperationListener.java
        ParagraphCoherenceIndex.java
        PassageDistances.java
        PassageExtractionSummarization.java
        PassagesSimilarity.java
        RapidAutomaticKeywordExtraction.java
        Readability.java
        RelationshipExtraction.java
        Summary.java
        TagClouds.java
        TermExtractionSummarization.java
        results
        AbstractResult.java
        FactorAnalysisPlotResult.java
        LastPassageResult.java
        LexiconAnalysisResult.java
        NullResult.java
        ParagraphCoherenceIndexResult.java
        PassageClusteringLingoResult.java
        PassageDistancesResult.java
        PassageExtractionSummarizationResult.java
        PassageSimilarityResult.java
        RapidAutomaticKeywordExtractionResult.java
        ReadabilityResult.java
        RelationshipExtractionResult.java
        Summary.java
        SummaryResult.java
        TagCloudsResult.java
        TermRankedResult.java
        TermsExtractionSummarizationResult.java
        summarization
        AbstractSummarizationOperation.java
        LatentSemanticAnalysisSummarization.java
        SummarizationOperation.java
        VectorLengthSummarization.java
        visualizations
        AbstractVisualization.java
        TagClouds.java
        Visualization.java
    - test
      - java
        tml
        test
        DbConnectionTest.java
        FactorAnalysisPlotTest.java
        IndexingDocumentsTest.java
        IndexingHtmlTest.java
        IndexingInvalidDocumentsTest.java
        IndexingPlainTextTest.java
        LanczosTest.java
        LuceneSearchTest.java
        NonNegativeMatrixFactorizationTest.java
        RapidAutomaticKeywordExtractionTest.java
        ReadabilityTest.java
        SimpleCorpusTest.java
        StemmingTest.java
        TagCloudsTest.java
        ValidateBerryDumaisTest.java
        ValidateDistancesTest.java
        ValidateHandbookOfLSATest.java
        ValidateIntroToLSATest.java
        ValidateSameDistancesAllDimensions.java

/*******************************************************************************
 *  Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
 *  
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License. 
 *  You may obtain a copy of the License at 
 *  
 *  	http://www.apache.org/licenses/LICENSE-2.0 
 *  	
 *  Unless required by applicable law or agreed to in writing, software 
 *  distributed under the License is distributed on an "AS IS" BASIS, 
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 *  See the License for the specific language governing permissions and 
 *  limitations under the License.
 *******************************************************************************/
package tml.corpus;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Hashtable;
import java.util.List;

import org.apache.log4j.Logger;

/**
 * This class represents a group of {@link Term}s or words/symbols, usually
 * obtained from a set of documents or text passages. It is the common set of
 * words for a group of documents.
 * 
 * The dictionary can filter {@link Term}s based on a selection criteria and its
 * specific threshold. By default the {@link TermSelection} criteria is a
 * minimum DF or Document Frequency, i.e. the {@link Term} must appear in at
 * least a certain number of different {@link TextPassage}s indicated by the
 * threshold.
 * 
 * A {@link Dictionary} also maintains the list of {@link Term}s inside a
 * {@link TextPassage}. When the {@link TermSelection} criteria is applied, the
 * {@link Dictionary} removes the unused {@link Term}s from the
 * {@link TextPassage}s that contain those {@link Term}s.
 * 
 * @author Jorge Villalon
 * 
 */
public class Dictionary {

	/** Logger */
	private static Logger logger = Logger.getLogger(Dictionary.class);
	
	/** Terms indexed by their lexical representation (String) */
	private Hashtable<String, Term> termsByText;
	/** The list of lexical representation of the terms */
	private List<String> terms;
	/** The corpus to which the dictionary belongs */
	private Corpus corpus = null;

	/** Implemented local weight functions */

	/**
	 * Basic constructor of a {@link Dictionary}, initialises the list and index
	 * of {@link Term}s
	 * 
	 * @param corpus
	 */
	public Dictionary(Corpus corpus) {
		assert (corpus != null);

		this.termsByText = new Hashtable<String, Term>();
		this.terms = new ArrayList<String>();
		this.corpus = corpus;
		logger.debug("Creating dictionary for corpus " + corpus);
	}

	/**
	 * Adds an array of {@link Term}s to the {@link Dictionary} and their
	 * frequencies. Both must come from a specific {@link TextPassage}
	 * 
	 * @param newTerms
	 * @param termFreqs
	 * @param document
	 */
	public void addTerms(String[] newTerms, int[] termFreqs,
			TextPassage document) {
		assert (document != null);
		assert (newTerms != null);
		assert (termFreqs != null);

		int currentTerm = 0;
		// For each lexical representation of a term (word or symbol)
		for (String t : newTerms) {

			// Its correspondent Term is obtained, if it doesn't exist is
			// created
			Term term = this.termsByText.get(t);

			if (term == null) {
				term = new Term(t, this.termsByText.size());
				this.termsByText.put(term.getTerm(), term);
				this.terms.add(term.getTerm());
			}

			// The document is added to the term's list of documents
			term.addTermAppearance(document, termFreqs[currentTerm]);

			// The term is added to the list of terms in the document
			document.addTerm(term, termFreqs[currentTerm]);

			currentTerm++;
		}
	}

	/**
	 * Remove the {@link Term}s from the {@link Dictionary} that doesn't meet
	 * the {@link TermSelection} criteria according to the threshold.
	 */
	public void removeTerms() {

		int termIndex = 0;
		int originalNumberOfTerms = this.terms.size();

		// For each term in the dictionary
		for (int currentIndex = 0; currentIndex < originalNumberOfTerms; currentIndex++) {

			// Gets the term and its statistics
			Term term = this.termsByText.get(this.terms.get(currentIndex));

			double termValue = 0;

			// According to the criteria, a different value is read
			switch (this.corpus.getParameters().getTermSelectionCriterion()) {
			case AVG_TF:
				termValue = term.getTermGlobalFrequencyMean();
				break;
			case DF:
				termValue = term.getDocumentFrequency();
				break;
			case TF:
				termValue = term.getTermGlobalFrequency();
				break;
			default:
				logger.error("Invalid term selection criteria");
			}

			// Now validate if the term value meets the criteria
			if (termValue < this.corpus.getParameters().getTermSelectionThreshold()) {
				// Remove the term from the dictionary and from its documents
				this.termsByText.remove(term.getTerm());
				for (TextPassage document : term.getTextPassages()) {
					document.removeTerm(term);
				}
			} else {
				// Update the term index if necessary
				if (term.getIndex() != termIndex || true) {
					term.setIndex(termIndex);
				}
				termIndex++;
			}
		}

		logger.debug("Removal of terms finished, "
				+ (originalNumberOfTerms - termIndex) + " terms removed, "
				+ termIndex + " terms kept.");
	}

	/**
	 * Returns the collection of {@link Term}s in the {@link Dictionary}
	 * 
	 * @return a {@link Collection} of {@link Term}s
	 */
	public Collection<Term> getTerms() {
		return this.termsByText.values();
	}

	/**
	 * Gets the {@link Corpus} to which the {@link Dictionary} belongs
	 * 
	 * @return a {@link Corpus}
	 */
	public Corpus getCorpus() {
		return this.corpus;
	}

	/**
	 * Returns a {@link Term} that represents a word, null if it is not in the
	 * {@link Dictionary}
	 * 
	 * @param word
	 *            the word to look for
	 * @return the {@link Term}
	 */
	public Term getTermByText(String word) {
		return this.termsByText.get(word);
	}
}