TrigramTagger.java example

Explorer
morphadorner-opensource-master
- src
package edu.northwestern.at.utils.corpuslinguistics.postagger.trigram;

/*	Please see the license information at the end of this file. */

import java.util.*;

import edu.northwestern.at.utils.*;
import edu.northwestern.at.utils.logger.*;
import edu.northwestern.at.utils.corpuslinguistics.adornedword.*;
import edu.northwestern.at.utils.corpuslinguistics.postagger.*;
import edu.northwestern.at.utils.corpuslinguistics.postagger.smoothing.contextual.*;
import edu.northwestern.at.utils.corpuslinguistics.postagger.smoothing.lexical.*;
import edu.northwestern.at.utils.corpuslinguistics.postagger.smoothing.*;
import edu.northwestern.at.utils.corpuslinguistics.tokenizer.*;
import edu.northwestern.at.utils.math.*;

/**	Trigram Part of Speech tagger.
 *
 *	<p>
 *	The tigram part of speech tagger assigns tags to words in a sentence
 *	assigning the most probable set of tags as determined by a trigram
 *	hidden Markov model given the possible tags of the previous words.
 *	The Viterbi algorithm is used to reduce the
 *	amount of computation required to find the optimal tag assignments.
 *	</p>
 */

public class TrigramTagger
	extends AbstractPartOfSpeechTagger
	implements PartOfSpeechTagger
{
	/**	True for debug output. */

	protected boolean debug		= false;

	/**	Contextual probabilities for a word in a sentence.
	 */

	protected Map3D<String, String, String, Probability>
		contextualProbabilities	= Map3DFactory.createNewMap3D();

	/**	Total number of states rejected by beam search criterion.
	 */

	protected int beamSearchRejections	= 0;

	/**	Viterbi trellis for tags and probability scores.
     */

	protected Viterbi viterbi	= new Viterbi();

	/**	Count of lines tagged. */

	protected int linesTagged	= 0;

	/**	Count of words tagged. */

	protected int wordsTagged	= 0;

	/**	Create a trigram tagger.
	 */

	public TrigramTagger()
	{
		super();
								//	Get a lexical smoother.
		lexicalSmoother	=
			new LexicalSmootherFactory().newLexicalSmoother();

		lexicalSmoother.setPartOfSpeechTagger( this );

								//	Get a contextual smoother.

		contextualSmoother	=
			new ContextualSmootherFactory().newContextualSmoother();

		contextualSmoother.setPartOfSpeechTagger( this );
	}

	/**	See if tagger uses a probability transition matrix.
	 *
	 *	@return		True since trigram tagger uses a probability
	 *					transition matrix.
	 */

	public boolean usesTransitionProbabilities()
	{
		return true;
	}

	/**	Report end of tagging statistics.
	 */

	protected void reportEndOfTaggingStats()
	{
								//	Report cache usage.
		if ( debug )
		{
			logger.logDebug
			(
				"      # of cached lexical probabilties   : " +
				lexicalSmoother.cachedProbabilitiesCount()
			);

			logger.logDebug
			(
				"      # of cached contextual probabilties: " +
				contextualSmoother.cachedProbabilitiesCount()
			);

			logger.logDebug
			(
				"      # of states rejected by beam search: " +
				beamSearchRejections
			);

			if ( retagger != null )
			{
				logger.logDebug
				(
					"      # of corrections applied by rules  : " +
					retagger.getRuleCorrections()
				);
			}
		}

		logger.logInfo
		(
			"      lines: " +
			Formatters.formatIntegerWithCommas( linesTagged ) +
			"; words: " +
			Formatters.formatIntegerWithCommas( wordsTagged )
		);
	}

	/**	Tag a list of sentences.
	 *
	 *	@param	sentences	The list of sentences.
	 *
	 *	<p>
	 *	The sentences are a {@link java.util.List} of
	 *	{@link java.util.List}s of words to be tagged.
	 *	Each sentence is represented as a list of
	 *	words.
	 *	</p>
	 */

	public List<List<AdornedWord>> tagSentences( List<List<String>> sentences )
	{
								//	Tag the words in the sentences.

		List<List<AdornedWord>> result	= super.tagSentences( sentences );

								//	Report end of tagging statistics.

		reportEndOfTaggingStats();

		return result;
	}

	/**	Tag a list of sentences containing adorned words.
	 *
	 *	@param	sentences	The list of sentences.
	 *
	 *	<p>
	 *	The sentences are a {@link java.util.List} of
	 *	{@link java.util.List}s of adorn words to be tagged.
	 *	Each sentence is represented as a list of
	 *	words.
	 *	</p>
	 */

	public<T extends AdornedWord> List<List<T>> tagAdornedWordSentences
	(
		List<List<T>> sentences
	)
	{
								//	Tag the words in the sentences.

		List<List<T>> result	=
			super.tagAdornedWordSentences( sentences );

								//	Report end of tagging statistics.

		reportEndOfTaggingStats();

		return result;
	}

	/**	Tag a sentence comprised of a list of adorned words.
	 *
	 *	@param	taggedSentence	The sentence as an
	 *							{@link edu.northwestern.at.utils.corpuslinguistics.adornedword.AdornedWord}.
	 *
	 *	@return					An {@link edu.northwestern.at.utils.corpuslinguistics.adornedword.AdornedWord}
	 *							of the words in the sentence tagged with
	 *							parts of speech.
	 *
	 *	<p>
	 *	The input sentence is a
	 *	{@link edu.northwestern.at.utils.corpuslinguistics.adornedword.AdornedWord}
	 *	of words to be tagged.  The output is the same list of words with
	 *	parts of speech added.
	 *	</p>
	 */

	public<T extends AdornedWord> List<T> tagAdornedWordList
	(
		List<T> taggedSentence
	)
	{
								//	Reset Viterbi trellis.
		viterbi.reset();
								//	Assume period as previous word tags.

		List<String> previousTags	=
			ListFactory.createNewList();

		previousTags.add( "." );

		List<String> previousPreviousTags	=
			ListFactory.createNewList();

		previousPreviousTags.add( "." );

								//	Part of speech tags for a word.

		List<String> tags	= null;
		AdornedWord word	= null;

								//	Loop over words in sentence.

		for ( int i = 0 ; i < taggedSentence.size() ; i++ )
		{
								//	Get next word.

			word	= taggedSentence.get( i );

								//	Get part of speech tags for this
								//	this word.

			tags	= getTagsForWord( word.getStandardSpelling() );

								//	Process word.  The returned tags
								//	for the current word are those which
								//	passed the Viterbi beam search
								//	criterion.  These possibly pruned tags
								//	will be the previous tags for the
								//	next word.
			tags	=
				processWord
				(
					i ,
					word.getStandardSpelling() ,
					previousPreviousTags ,
					previousTags ,
					tags
				);
								//	These tags will be previous tags
								//	for the next word.

			previousPreviousTags	= previousTags;
			previousTags			= tags;
		}
								//	Retrieve optimal part of speech tags and
								//	adorn each word with its proper tag.

		List<String> optimalTags	=
			viterbi.optimalTags( taggedSentence.size() , tags );

		for ( int i = 0 ; i < taggedSentence.size() ; i++ )
		{
								//	Get next word.

			word	= taggedSentence.get( i );

								//	Add part of speech tag to word.

			word.setPartsOfSpeech( (String)optimalTags.get( i ) );
		}
								//	Increment total count of states
								//	rejections by beam search criterion.

		beamSearchRejections	+= viterbi.getBeamSearchRejections();

								//	Increment counts of lines and
								//	word tagged.
		linesTagged++;
		wordsTagged	+= taggedSentence.size();

		if ( ( linesTagged % 1000 ) == 0 )
		{
			logger.logInfo
			(
				"      lines: " +
				Formatters.formatIntegerWithCommas( linesTagged ) +
				"; words: " +
				Formatters.formatIntegerWithCommas( wordsTagged )
			);
		}
								//	We have a new finished sentence.

		return taggedSentence;
	}

	/**	Process a single word.
	 *
	 *	@param	wordIndex				Index of word in sentence
	 *										(starts at 0).
	 *	@param	word					Word being processed.
	 *	@param	previousPreviousTags	The previous word's previous
	 *										word's tags.
 	 *	@param	previousTags			The previous word's tags.
	 *	@param	tags					The current word's tags.
	 *
	 *	@return							Updated tag list.
	 */

	protected List<String> processWord
	(
		int wordIndex ,
		String word ,
		List<String> previousPreviousTags ,
		List<String> previousTags ,
		List<String> tags
	)
	{
								//	Find tag with largest probability
								//	combined with previous word's tag.

		contextualProbabilities.clear();

		int nTags					= tags.size();
		Probability[] lexicalProbs	= new Probability[ nTags ];

		for ( int i = 0 ; i < nTags ; i++ )
		{
			String tagI	= (String)tags.get( i );

			lexicalProbs[ i ]	=
				lexicalSmoother.lexicalProbability( word , tagI );

			for ( int j = 0 ; j < previousTags.size() ; j++ )
			{
				String previousTagJ	= (String)previousTags.get( j );

				for ( int k = 0 ; k < previousPreviousTags.size() ; k++ )
				{
					contextualProbabilities.put
					(
						tagI,
						previousTagJ ,
						previousPreviousTags.get( k ) ,
						contextualSmoother.contextualProbability
						(
							tagI ,
							previousTagJ ,
							previousPreviousTags.get( k )
						)
					);
				}
			}
		}

		return viterbi.updateScore
		(
			wordIndex ,
			lexicalProbs ,
			contextualProbabilities ,
			tags ,
			previousTags ,
			previousPreviousTags
		);
	}

	/**	Set the logger.
	 *
	 *	@param	logger		The logger.
	 */

	public void setLogger( Logger logger )
	{
		this.logger	= logger;

		((UsesLogger)lexicalSmoother).setLogger( logger );
		((UsesLogger)contextualSmoother).setLogger( logger );
		viterbi.setLogger( logger );
	}

	/**	Return tagger description.
	 *
	 *	@return		Tagger description.
	 */

	public String toString()
	{
		return "Trigram tagger";
	}
}

/*
Copyright (c) 2008, 2009 by Northwestern University.
All rights reserved.

Developed by:
   Academic and Research Technologies
   Northwestern University
   http://www.it.northwestern.edu/about/departments/at/

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal with the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:

    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimers.

    * Redistributions in binary form must reproduce the above
      copyright notice, this list of conditions and the following
      disclaimers in the documentation and/or other materials provided
      with the distribution.

    * Neither the names of Academic and Research Technologies,
      Northwestern University, nor the names of its contributors may be
      used to endorse or promote products derived from this Software
      without specific prior written permission.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
*/