package edu.northwestern.at.utils.corpuslinguistics.postagger.bigram; /* Please see the license information at the end of this file. */ import java.util.*; import edu.northwestern.at.utils.*; import edu.northwestern.at.utils.logger.*; import edu.northwestern.at.utils.corpuslinguistics.adornedword.*; import edu.northwestern.at.utils.corpuslinguistics.lexicon.*; import edu.northwestern.at.utils.corpuslinguistics.postagger.*; import edu.northwestern.at.utils.corpuslinguistics.postagger.smoothing.contextual.*; import edu.northwestern.at.utils.corpuslinguistics.postagger.smoothing.lexical.*; import edu.northwestern.at.utils.corpuslinguistics.tokenizer.*; import edu.northwestern.at.utils.math.*; /** Bigram Part of Speech tagger. * * <p> * The bigram part of speech tagger assigns tags to words in a sentence * assigning the most probable set of tags as determined by a bigram hidden * Markov model given the possible tags of the previous words. * The Viterbi algorithm is used to reduce the * amount of computation required to find the optimal tag assignments. * </p> */ public class BigramTagger extends AbstractPartOfSpeechTagger implements PartOfSpeechTagger { /** True for debug output. */ protected boolean debug = false; /** Contextual probabilities for a word in a sentence. */ protected Map2D<String, String, Probability> contextualProbabilities = Map2DFactory.createNewMap2D(); /** Total number of states rejected by beam search criterion. */ protected int beamSearchRejections = 0; /** Viterbi trellis for tags and probability scores. */ protected Viterbi viterbi = new Viterbi(); /** Create a bigram tagger. */ public BigramTagger() { super(); // Get a lexical smoother. lexicalSmoother = new LexicalSmootherFactory().newLexicalSmoother(); lexicalSmoother.setPartOfSpeechTagger( this ); // Get a contextual smoother. contextualSmoother = new ContextualSmootherFactory().newContextualSmoother(); contextualSmoother.setPartOfSpeechTagger( this ); } /** See if tagger uses a probability transition matrix. * * @return True since bigram tagger uses probability transition * matrix. */ public boolean usesTransitionProbabilities() { return true; } /** Tag a list of sentences. * * @param sentences The list of sentences. * * @return The sentences with words adorned with * parts of speech. * * <p> * The sentences are a {@link java.util.List} of * {@link java.util.List}s of words to be tagged. * Each sentence is represented as a list of * words. The output is a list of * {@link edu.northwestern.at.utils.corpuslinguistics.adornedword.AdornedWord}s. * </p> */ public List<List<AdornedWord>> tagSentences( List<List<String>> sentences ) { // Tag the words in the sentences. List<List<AdornedWord>> result = super.tagSentences( sentences ); // Report cache usage. if ( debug ) { logger.logDebug ( " # of cached lexical probabilties : " + lexicalSmoother.cachedProbabilitiesCount() ); logger.logDebug ( " # of cached contextual probabilties: " + contextualSmoother.cachedProbabilitiesCount() ); logger.logDebug ( " # of states rejected by beam search: " + beamSearchRejections ); } return result; } /** Tag a sentence. * * @param taggedSentence The sentence as an * {@link edu.northwestern.at.utils.corpuslinguistics.adornedword.AdornedWord}. * * @return An {@link edu.northwestern.at.utils.corpuslinguistics.adornedword.AdornedWord} * of the words in the sentence tagged with * parts of speech. * * <p> * The input sentence is a {@link java.util.List} of * string words to be tagged. The output is * {@link edu.northwestern.at.utils.corpuslinguistics.adornedword.AdornedWord} * of the words with parts of speech added. * </p> */ public<T extends AdornedWord> List<T> tagAdornedWordList ( List<T> taggedSentence ) { // Reset Viterbi trellis. viterbi.reset(); // Assume period as previous word tag. List<String> previousTags = ListFactory.createNewList(); previousTags.add( "." ); // Index of word in sentence. int wordIndex = 0; // Iterate over words in sentence. Iterator<? extends AdornedWord> wordsIter = taggedSentence.iterator(); List<String> tags = null; while ( wordsIter.hasNext() ) { // Get next word. AdornedWord word = wordsIter.next(); // Get part of speech tags for this // this word. tags = getTagsForWord( word.getStandardSpelling() ); // Process word. The returned tags // for the current word are those which // passed the Viterbi beam search // criterion. These possibly pruned tags // will be the previous tags for the // next word. previousTags = processWord ( wordIndex++ , word.getStandardSpelling() , previousTags , tags ); } // Retrieve optimal tags and // output (word,tag) . List<String> optimalTags = viterbi.optimalTags( wordIndex , tags ); wordIndex = 0; wordsIter = taggedSentence.iterator(); while ( wordsIter.hasNext() ) { // Get next word. AdornedWord word = (AdornedWord)wordsIter.next(); // Add word and tag to tagged sentence. word.setPartsOfSpeech( optimalTags.get( wordIndex++ ) ); } // Increment total count of states // rejections by beam search criterion. beamSearchRejections += viterbi.getBeamSearchRejections(); // We have a new finished sentence. return taggedSentence; } /** Process a single word. * * @param wordIndex Index of word in sentence (starts at 0). * @param word Word being processed. * @param previousTags The previous word's tags. * @param tags The current word's tags. * * @return Updated tag list. */ protected List<String> processWord ( int wordIndex , String word , List<String> previousTags , List<String> tags ) { // Find tag with largest probability // combined with previous word's tag. contextualProbabilities.clear(); int nTags = tags.size(); Probability[] lexicalProbs = new Probability[ nTags ]; for ( int i = 0 ; i < nTags ; i++ ) { lexicalProbs[ i ] = lexicalSmoother.lexicalProbability( word , (String)tags.get( i ) ); for ( int j = 0 ; j < previousTags.size() ; j++ ) { contextualProbabilities.put ( tags.get( i ) , previousTags.get( j ) , contextualSmoother.contextualProbability ( tags.get( i ) , previousTags.get( j ) ) ); } } return viterbi.updateScore ( wordIndex , lexicalProbs , contextualProbabilities , tags , previousTags ); } /** Set the logger. * * @param logger The logger. */ public void setLogger( Logger logger ) { this.logger = logger; ((UsesLogger)lexicalSmoother).setLogger( logger ); ((UsesLogger)contextualSmoother).setLogger( logger ); viterbi.setLogger( logger ); } /** Return tagger description. * * @return Tagger description. */ public String toString() { return "Bigram tagger"; } } /* Copyright (c) 2008, 2009 by Northwestern University. All rights reserved. Developed by: Academic and Research Technologies Northwestern University http://www.it.northwestern.edu/about/departments/at/ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal with the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimers. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimers in the documentation and/or other materials provided with the distribution. * Neither the names of Academic and Research Technologies, Northwestern University, nor the names of its contributors may be used to endorse or promote products derived from this Software without specific prior written permission. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. */