/*
 * AbstractPartOfSpeechTagger.java example
 *
 * Explorer
 * morphadorner-opensource-master
 * - src
 */
package edu.northwestern.at.utils.corpuslinguistics.postagger;

/*	Please see the license information at the end of this file. */

import java.util.*;

import edu.northwestern.at.utils.*;
import edu.northwestern.at.utils.logger.*;
import edu.northwestern.at.utils.corpuslinguistics.adornedword.*;
import edu.northwestern.at.utils.corpuslinguistics.lexicon.*;
import edu.northwestern.at.utils.corpuslinguistics.postagger.guesser.*;
import edu.northwestern.at.utils.corpuslinguistics.postagger.smoothing.contextual.*;
import edu.northwestern.at.utils.corpuslinguistics.postagger.smoothing.lexical.*;
import edu.northwestern.at.utils.corpuslinguistics.spellingstandardizer.*;
import edu.northwestern.at.utils.corpuslinguistics.postagger.transitionmatrix.*;
import edu.northwestern.at.utils.corpuslinguistics.tokenizer.*;

/**	Abstract Part of Speech tagger.
 *
 *	<p>
 *	Provides default implementations for the PartOfSpeechTagger
 *	interface methods.  To create a new part of speech tagger,
 *	extend this class and override methods as needed.  You must
 *	implement the abstract tagAdornedWordList method as a minimum.
 *	</p>
 */

abstract public class AbstractPartOfSpeechTagger
	extends IsCloseableObject
	implements PartOfSpeechTagger, IsCloseable, UsesLexicon, UsesLogger
{
	/**	Static lexicon used by tagger.
	 *
	 *	<p>
	 *	The constructor sets this to a new lexicon obtained from a
	 *	LexiconFactory; setLexicon replaces it.
	 *	</p>
	 */

	protected Lexicon lexicon;

	/**	Dynamic lexicon built on-the-fly for words not in static lexicon.
	 *
	 *	<p>
	 *	getTagsForWord adds the guesser's tags and counts here for
	 *	words found in neither lexicon.
	 *	</p>
	 */

	protected Lexicon dynamicLexicon;

	/**	Transition matrix used by tagger.
	 *
	 *	<p>
	 *	Not assigned by the constructor, so it stays null until
	 *	setTransitionMatrix (or a subclass) assigns it.
	 *	</p>
	 */

	protected TransitionMatrix transitionMatrix;

	/**	Context rules, as last passed to setContextRules. */

	protected String[] contextRules;

	/**	Lexical rules, as last passed to setLexicalRules. */

	protected String[] lexicalRules;

	/**	Lexical smoother.
	 *
	 *	<p>
	 *	Not referenced by any method of this class; presumably
	 *	managed by subclasses.
	 *	</p>
	 */

	protected LexicalSmoother lexicalSmoother;

	/**	Contextual smoother.
	 *
	 *	<p>
	 *	Not referenced by any method of this class; presumably
	 *	managed by subclasses.
	 *	</p>
	 */

	protected ContextualSmoother contextualSmoother;

	/**	Fixup retagger.
	 *
	 *	<p>
	 *	May be null.  When non-null, retagWords delegates to it and
	 *	the rule and lexicon setters forward their values to it.
	 *	</p>
	 */

	protected PartOfSpeechRetagger retagger;

	/**	Part of speech guesser for words not in lexicon.
	 *
	 *	<p>
	 *	Null until set; getTagsForWord asks createPartOfSpeechGuesser
	 *	to build a default one the first time it is needed.
	 *	</p>
	 */

	protected PartOfSpeechGuesser partOfSpeechGuesser;

	/**	PostTokenizer for mapping raw tokens to initial spellings.
	 *
	 *	<p>
	 *	Created by the constructor.  tagSentence and
	 *	tagAdornedWordSentence take element 0 of its result as the
	 *	spelling and element 1 as the standard spelling.
	 *	</p>
	 */

	protected PostTokenizer postTokenizer;

	/**	Number of corrections applied by rules.
	 *
	 *	<p>
	 *	Maintained through clearRuleCorrections and
	 *	incrementRuleCorrections.
	 *	</p>
	 */

	protected int ruleCorrections	= 0;

	/**	Logger used for output.
	 *
	 *	<p>
	 *	The constructor installs a DummyLogger; setLogger replaces it.
	 *	</p>
	 */

	protected Logger logger;

	/**	Create a tagger with default collaborators.
	 *
	 *	<p>
	 *	The static and the dynamic lexicon are each a new lexicon from
	 *	a LexiconFactory, the post tokenizer comes from a
	 *	PostTokenizerFactory, and logging goes to a DummyLogger until
	 *	setLogger is called.
	 *	</p>
	 */

	public AbstractPartOfSpeechTagger()
	{
								//	Both lexicons come from the same
								//	factory and start out empty.

		LexiconFactory factory	= new LexiconFactory();

		this.lexicon			= factory.newLexicon();
		this.dynamicLexicon		= factory.newLexicon();

								//	Post tokenizer used to derive
								//	spellings from raw tokens.

		this.postTokenizer		=
			new PostTokenizerFactory().newPostTokenizer();

								//	Start with a dummy logger.

		this.logger				= new DummyLogger();
	}

	/**	Return the logger currently used by this tagger.
	 *
	 *	@return		The logger most recently assigned.  The
	 *				constructor installs a DummyLogger.
	 */

	public Logger getLogger()
	{
		return this.logger;
	}

	/**	Replace the logger used by this tagger.
	 *
	 *	@param	logger		The logger to use from now on.  Stored
	 *						as given, without a null check.
	 */

	public void setLogger( Logger logger )
	{
		this.logger	= logger;
	}

	/**	Report whether this tagger uses context rules.
	 *
	 *	<p>
	 *	This base implementation always answers false; taggers that
	 *	apply context rules presumably override it.
	 *	</p>
	 *
	 *	@return		True if tagger uses context rules.
	 */

	public boolean usesContextRules()
	{
		return false;
	}

	/**	Report whether this tagger uses lexical rules.
	 *
	 *	<p>
	 *	This base implementation always answers false; taggers that
	 *	apply lexical rules presumably override it.
	 *	</p>
	 *
	 *	@return		True if tagger uses lexical rules.
	 */

	public boolean usesLexicalRules()
	{
		return false;
	}

	/**	Report whether this tagger uses a probability transition matrix.
	 *
	 *	<p>
	 *	This base implementation always answers false; taggers that
	 *	rely on a transition matrix presumably override it.
	 *	</p>
	 *
	 *	@return		True if tagger uses probability transition matrix.
	 */

	public boolean usesTransitionProbabilities()
	{
		return false;
	}

	/**	Set context rules for tagging.
	 *
	 *	<p>
	 *	The rules are always stored in this tagger.  When a retagger
	 *	is present they are also handed on to it.
	 *	</p>
	 *
	 *	@param	contextRules	String array of context rules.
	 *
	 *	@throws	InvalidRuleException if a rule is bad.  In this
	 *			implementation only the retagger call can raise it.
	 */

	public void setContextRules( String[] contextRules )
		throws InvalidRuleException
	{
		this.contextRules	= contextRules;

								//	No fixup retagger: nothing more
								//	to do.

		if ( retagger == null )
		{
			return;
		}
								//	Keep the fixup retagger's context
								//	rules in step with ours.

		retagger.setContextRules( contextRules );
	}

	/**	Set lexical rules for tagging.
	 *
	 *	<p>
	 *	The rules are always stored in this tagger.  When a retagger
	 *	is present they are also handed on to it.
	 *	</p>
	 *
	 *	@param	lexicalRules	String array of lexical rules.
	 *
	 *	@throws	InvalidRuleException if a rule is bad.  In this
	 *			implementation only the retagger call can raise it.
	 */

	public void setLexicalRules( String[] lexicalRules )
		throws InvalidRuleException
	{
		this.lexicalRules	= lexicalRules;

								//	No fixup retagger: nothing more
								//	to do.

		if ( retagger == null )
		{
			return;
		}
								//	Keep the fixup retagger's lexical
								//	rules in step with ours.

		retagger.setLexicalRules( lexicalRules );
	}

	/**	Return the static word lexicon.
	 *
	 *	@return		The static word lexicon currently in use.
	 */

	public Lexicon getLexicon()
	{
		return this.lexicon;
	}

	/**	Return the dynamic word lexicon.
	 *
	 *	@return		The dynamic lexicon, which getTagsForWord fills
	 *				with guessed entries for unknown words.
	 */

	public Lexicon getDynamicLexicon()
	{
		return this.dynamicLexicon;
	}

	/**	Get the lexicon associated with a specific word.
	 *
	 *	<p>
	 *	Without a part of speech guesser the main static lexicon is
	 *	returned.  With a guesser, the answer is whatever the guesser
	 *	has cached as the source lexicon for the word.
	 *	</p>
	 *
	 *	<p>
	 *	Most words do not have a source lexicon defined, in which
	 *	case they come from the main static word lexicon.
	 *	Usually only words derived by a suffix analysis have
	 *	a source lexicon defined, which will be the suffix lexicon.
	 *	</p>
	 *
	 *	@param		word	The word whose source lexicon is sought.
	 *
	 *	@return		The lexicon.
	 */

	public Lexicon getLexicon( String word )
	{
								//	Without a guesser the static
								//	lexicon is the only candidate.

		if ( partOfSpeechGuesser == null )
		{
			return lexicon;
		}
								//	NOTE(review): the guesser's answer
								//	is returned as is; confirm whether
								//	it can be null for words with no
								//	cached source lexicon.

		return partOfSpeechGuesser.getCachedLexiconForWord( word );
	}

	/**	Set the static lexicon.
	 *
	 *	<p>
	 *	The lexicon is stored in this tagger and, when a retagger is
	 *	present, handed on to the retagger as well.
	 *	</p>
	 *
	 *	@param	lexicon		Lexicon used for tagging.
	 */

	public void setLexicon( Lexicon lexicon )
	{
		this.lexicon	= lexicon;

								//	No fixup retagger: nothing more
								//	to do.

		if ( retagger == null )
		{
			return;
		}
								//	Keep the fixup retagger's lexicon
								//	in step with ours.

		retagger.setLexicon( lexicon );
	}

	/**	Return the tag transition probabilities matrix.
	 *
	 *	@return		Tag probabilities transition matrix.
	 *				Null when none has been assigned, as for
	 *				taggers which do not use a transition matrix.
	 */

	public TransitionMatrix getTransitionMatrix()
	{
		return this.transitionMatrix;
	}

	/**	Set the tag transition probabilities matrix.
	 *
	 *	<p>
	 *	This base implementation only stores the matrix.  Taggers
	 *	which do not use transition matrices simply never read it.
	 *	</p>
	 *
	 *	@param	transitionMatrix	Tag probabilities transition matrix.
	 */

	public void setTransitionMatrix( TransitionMatrix transitionMatrix )
	{
		this.transitionMatrix	= transitionMatrix;
	}

	/**	Return the part of speech guesser.
	 *
	 *	@return		The part of speech guesser.  Null if none has
	 *				been set or created yet.
	 */

	public PartOfSpeechGuesser getPartOfSpeechGuesser()
	{
		return partOfSpeechGuesser;
	}

	/**	Set the part of speech guesser.
	 *
	 *	@param	partOfSpeechGuesser		The part of speech guesser
	 *									used for words found in
	 *									neither lexicon.
	 */

	public void setPartOfSpeechGuesser
	(
		PartOfSpeechGuesser partOfSpeechGuesser
	)
	{
		this.partOfSpeechGuesser	= partOfSpeechGuesser;
	}

	/**	Return the part of speech retagger.
	 *
	 *	@return		The part of speech retagger.  May be null.
	 */

	public PartOfSpeechRetagger getRetagger()
	{
		return this.retagger;
	}

	/**	Set the part of speech retagger.
	 *
	 *	<p>
	 *	Only the reference is stored.  Rules and lexicon already set
	 *	on this tagger are not pushed to the new retagger here.
	 *	</p>
	 *
	 *	@param	retagger	The part of speech retagger.
	 */

	public void setRetagger( PartOfSpeechRetagger retagger )
	{
		this.retagger	= retagger;
	}

	/**	Get potential part of speech tags for a word.
	 *
	 *	<p>
	 *	The main lexicon is consulted first, then the dynamic lexicon.
	 *	When the word appears in neither, the part of speech guesser
	 *	is used to determine the tags based upon features of the word
	 *	(suffix analysis, etc.), and each guessed tag and its count
	 *	is recorded in the dynamic lexicon.
	 *	</p>
	 *
	 *	@param	word	The word whose part of speech tags we want.
	 *
	 *	@return			List of part of speech tags.
	 *					May be null or empty.
	 */

	public List<String> getTagsForWord( String word )
	{
								//	Tags found for this word.

		Set<String> tagSet;

								//	Word in main lexicon?

		if ( lexicon.containsEntry( word ) )
		{
			tagSet	= lexicon.getCategoriesForEntry( word );
		}
								//	Word in dynamic lexicon?

		else if ( dynamicLexicon.containsEntry( word ) )
		{
			tagSet	= dynamicLexicon.getCategoriesForEntry( word );
		}
								//	Word in neither lexicon.
								//	Ask the guesser and remember its
								//	answer in the dynamic lexicon.
		else
		{
								//	If we don't have a part of speech
								//	guesser, create one now.
								//	NOTE(review): if creation fails the
								//	guesser stays null and the call
								//	below throws NullPointerException.

			if ( partOfSpeechGuesser == null )
			{
				createPartOfSpeechGuesser();
			}

			Map<String, MutableInteger> tagMap	=
				partOfSpeechGuesser.guessPartsOfSpeech( word );

								//	Walk the entries directly so each
								//	tag and its count come from one
								//	lookup.

			for ( Map.Entry<String, MutableInteger> entry :
				tagMap.entrySet() )
			{
				dynamicLexicon.updateEntryCount
				(
					word ,
					entry.getKey() ,
					"*" ,
					entry.getValue().intValue()
				);
			}

			tagSet	= tagMap.keySet();
		}

		List<String> result	= ListFactory.createNewList( tagSet );

		return result;
	}

	/**	Get count of times a word appears with a given tag.
	 *
	 *	<p>
	 *	The main lexicon is used when it contains the word; otherwise
	 *	the dynamic lexicon is used.  When the word appears in neither,
	 *	getTagsForWord is called first so that the part of speech
	 *	guesser's counts (suffix analysis, etc.) are added to the
	 *	dynamic lexicon.
	 *	</p>
	 *
	 *	@param	word	The word.
	 *	@param	tag		The part of speech tag.
	 *
	 *	@return			The number of times the word appears
	 *					with the given tag, but never less than 1.
	 */

	public int getTagCount( String word , String  tag )
	{
								//	Word in main lexicon?

		if ( lexicon.containsEntry( word ) )
		{
			int staticCount	= lexicon.getCategoryCount( word , tag );

			return ( staticCount < 1 ) ? 1 : staticCount;
		}
								//	Word in neither lexicon?
								//	Have the guesser results added to
								//	the dynamic lexicon first.

		if ( !dynamicLexicon.containsEntry( word ) )
		{
			getTagsForWord( word );
		}

		int dynamicCount	= dynamicLexicon.getCategoryCount( word , tag );

								//	Counts below one are reported as one.

		return ( dynamicCount < 1 ) ? 1 : dynamicCount;
	}

	/**	Get the most common tag for a word.
	 *
	 *	<p>
	 *	The main lexicon is used when it contains the word; otherwise
	 *	the dynamic lexicon is used, after getTagsForWord has added
	 *	guessed entries for a word found in neither.
	 *	</p>
	 *
	 *	@param	word	The word.
	 *
	 *	@return			The most common part of speech tag for the word.
	 */

	public String getMostCommonTag( String word )
	{
								//	Word in main lexicon?

		if ( lexicon.containsEntry( word ) )
		{
			return lexicon.getLargestCategory( word );
		}
								//	Word in neither lexicon?
								//	Have the guesser results added to
								//	the dynamic lexicon first.

		if ( !dynamicLexicon.containsEntry( word ) )
		{
			getTagsForWord( word );
		}

		return dynamicLexicon.getLargestCategory( word );
	}

	/**	Tag a list of sentences given as string words.
	 *
	 *	<p>
	 *	The sentences are a {@link java.util.List} of
	 *	{@link java.util.List}s of words to be tagged.
	 *	Each sentence is tagged with tagSentence and the result is
	 *	passed through retagWords.  The output holds one list of
	 *	{@link edu.northwestern.at.utils.corpuslinguistics.adornedword.AdornedWord}s
	 *	per input sentence, in input order.
	 *	</p>
	 *
	 *	@param	sentences	The list of sentences.
	 *
	 *	@return				The sentences with words adorned with
	 *						parts of speech.
	 */

	public List<List<AdornedWord>> tagSentences( List<List<String>> sentences )
	{
								//	Collects the tagged sentences.

		List<List<AdornedWord>> taggedSentences	=
			ListFactory.createNewList();

								//	Tag, then retag, each sentence.

		for ( List<String> sentence : sentences )
		{
			List<AdornedWord> taggedSentence	= tagSentence( sentence );

			taggedSentences.add( retagWords( taggedSentence ) );
		}

		return taggedSentences;
	}

	/**	Tag a list of sentences given as adorned words.
	 *
	 *	<p>
	 *	The sentences are a {@link java.util.List} of
	 *	{@link java.util.List}s of
	 *	{@link edu.northwestern.at.utils.corpuslinguistics.adornedword.AdornedWord}s
	 *	to be tagged.  Each sentence is tagged with
	 *	tagAdornedWordSentence and the result is passed through
	 *	retagWords.  The input list itself is returned.
	 *	</p>
	 *
	 *	<p>
	 *	When retagging hands back a different list object than the
	 *	sentence it was given, that list replaces the sentence in
	 *	the input list, so the retagged result is not lost.  In that
	 *	case the input list must support element replacement.
	 *	</p>
	 *
	 *	@param	sentences	The list of sentences.
	 *
	 *	@return				The sentences with words adorned with
	 *						parts of speech.
	 */

	public<T extends AdornedWord> List<List<T>> tagAdornedWordSentences
	(
		List<List<T>> sentences
	)
	{
								//	List iterator so a sentence can be
								//	replaced in place if needed.

		ListIterator<List<T>> sentencesIter	= sentences.listIterator();

								//	Tag each sentence in list of sentences.

		while ( sentencesIter.hasNext() )
		{
								//	Get next sentence.

			List<T> sentence	= sentencesIter.next();

								//	Tag and retag the sentence.

			List<T> retagged	=
				retagWords( tagAdornedWordSentence( sentence ) );

								//	Retagging normally works in place.
								//	Store the result only when a new
								//	list came back.

			if ( ( retagged != null ) && ( retagged != sentence ) )
			{
				sentencesIter.set( retagged );
			}
		}

		return sentences;
	}

	/**	Retag words in a tagged sentence.
	 *
	 *	<p>
	 *	Delegates to the retagger when one is defined.  Without a
	 *	retagger the input tagged sentence is returned unchanged.
	 *	Override this method to add custom retagging without
	 *	the use of a retagger.
	 *	</p>
	 *
	 *	@param	taggedSentence	The tagged sentence.
	 *
	 *	@return					The retagged sentence.
	 */

	public<T extends AdornedWord> List<T> retagWords
	(
		List<T> taggedSentence
	)
	{
								//	No fixup retagger: hand the
								//	sentence back as is.

		if ( retagger == null )
		{
			return taggedSentence;
		}
								//	Let the fixup retagger correct the
								//	tagging produced by the tagger.

		return retagger.retagSentence( taggedSentence );
	}

	/**	Reset the count of successful rule applications to zero.
	 */

	public void clearRuleCorrections()
	{
		this.ruleCorrections	= 0;
	}

	/**	Add one to the count of successful rule applications.
	 */

	public void incrementRuleCorrections()
	{
		ruleCorrections	+= 1;
	}

	/**	Get count of successful rule applications.
	 *
	 *	@return		The number of rule corrections counted since
	 *				the count was last cleared.
	 */

	public int getRuleCorrections()
	{
		return this.ruleCorrections;
	}

	/**	Create a default part of speech guesser if none is set.
	 *
	 *	<p>
	 *	Does nothing when a guesser already exists.  Otherwise a
	 *	DefaultPartOfSpeechGuesser is built, given the static lexicon
	 *	(a DefaultWordLexicon is installed first if the lexicon is
	 *	null), a DefaultSuffixLexicon, and this tagger's logger, and
	 *	then installed with setPartOfSpeechGuesser.
	 *	</p>
	 *
	 *	<p>
	 *	Creation is best effort: a failure is reported to the logger
	 *	and no guesser is installed, so the guesser remains null.
	 *	</p>
	 */

	protected void createPartOfSpeechGuesser()
	{
								//	Nothing to do when a guesser is
								//	already installed.

		if ( partOfSpeechGuesser != null )
		{
			return;
		}

		try
		{
			AbstractPartOfSpeechGuesser guesser	=
				new DefaultPartOfSpeechGuesser();

								//	The guesser needs a word lexicon.

			if ( lexicon == null )
			{
				setLexicon( new DefaultWordLexicon() );
			}

			guesser.setWordLexicon( lexicon );
			guesser.setSuffixLexicon( new DefaultSuffixLexicon() );

			guesser.setLogger( logger );

								//	Install only a fully configured
								//	guesser.

			setPartOfSpeechGuesser( guesser );
		}
		catch ( Exception e )
		{
								//	Still best effort, but report the
								//	failure instead of hiding it.

			if ( logger != null )
			{
				logger.logError
				(
					"Unable to create default part of speech guesser: " +
					e
				);
			}
		}
	}

	/**	Tag a sentence given as string words.
	 *
	 *	<p>
	 *	The input sentence is a {@link java.util.List} of
	 *	string words to be tagged.  Each token becomes a
	 *	{@link edu.northwestern.at.utils.corpuslinguistics.adornedword.AdornedWord}
	 *	whose spelling and standard spelling come from the post
	 *	tokenizer, or are the token itself when there is no post
	 *	tokenizer.  The new list is then passed to tagAdornedWordList
	 *	to have parts of speech added.
	 *	</p>
	 *
	 *	@param	sentence	The sentence as a list of string words.
	 *
	 *	@return				A list of
	 *						{@link edu.northwestern.at.utils.corpuslinguistics.adornedword.AdornedWord}s
	 *						for the words in the sentence, tagged with
	 *						parts of speech.
	 */

	public List<AdornedWord> tagSentence( List<String> sentence )
	{
								//	Adorned words built from the tokens.

		List<AdornedWord> adornedWords	= ListFactory.createNewList();

		for ( String token : sentence )
		{
								//	Spellings default to the raw token.

			String spelling			= token;
			String standardSpelling	= token;

								//	Post tokenization yields the
								//	spelling and standard spelling.

			if ( postTokenizer != null )
			{
				String[] spellings	= postTokenizer.postTokenize( token );

				spelling			= spellings[ 0 ];
				standardSpelling	= spellings[ 1 ];
			}
								//	Build the adorned word and append
								//	it to the sentence.

			AdornedWord adornedWord	= new BaseAdornedWord( token );

			adornedWord.setSpelling( spelling );
			adornedWord.setStandardSpelling( standardSpelling );

			adornedWords.add( adornedWord );
		}
								//	Obtain part of speech tag for
								//	each word in sentence.

		tagAdornedWordList( adornedWords );

		return adornedWords;
	}

	/**	Tag a sentence given as adorned words.
	 *
	 *	<p>
	 *	The input sentence is a {@link java.util.List} of
	 *	adorned words to be tagged.  Each word's spelling and
	 *	standard spelling are reset from its token, using the post
	 *	tokenizer when there is one.  The list is then passed to
	 *	tagAdornedWordList.  The output is the same list with parts
	 *	of speech added/modified.
	 *	</p>
	 *
	 *	@param	sentence	The sentence as a list of adorned words.
	 *
	 *	@return				The input list, with its words tagged
	 *						with parts of speech.
	 */

	public<T extends AdornedWord> List<T> tagAdornedWordSentence
	(
		List<T> sentence
	)
	{
		for ( T word : sentence )
		{
								//	Spellings default to the word's
								//	raw token.

			String token			= word.getToken();
			String spelling			= token;
			String standardSpelling	= token;

								//	Post tokenization yields the
								//	spelling and standard spelling.

			if ( postTokenizer != null )
			{
				String[] spellings	= postTokenizer.postTokenize( token );

				spelling			= spellings[ 0 ];
				standardSpelling	= spellings[ 1 ];
			}
								//	Set spellings into adorned word.

			word.setSpelling( spelling );
			word.setStandardSpelling( standardSpelling );
		}
								//	Obtain part of speech tag for
								//	each word in sentence.

		tagAdornedWordList( sentence );

		return sentence;
	}

	/**	Tag a list of adorned words.
	 *
	 *	<p>
	 *	This is the one abstract method of the class; every concrete
	 *	tagger must implement it.  tagSentence and
	 *	tagAdornedWordSentence call it after the spellings of the
	 *	words have been set, and ignore its return value.
	 *	</p>
	 *
	 *	@param	sentence	The sentence as a list of
	 *						{@link edu.northwestern.at.utils.corpuslinguistics.adornedword.AdornedWord}s.
	 *
	 *	@return				The tagged sentence (same as input with
	 *						parts of speech added).
	 */

	abstract public<T extends AdornedWord> List<T> tagAdornedWordList
	(
		List<T> sentence
	);
}

/*
Copyright (c) 2008, 2009 by Northwestern University.
All rights reserved.

Developed by:
   Academic and Research Technologies
   Northwestern University
   http://www.it.northwestern.edu/about/departments/at/

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal with the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:

    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimers.

    * Redistributions in binary form must reproduce the above
      copyright notice, this list of conditions and the following
      disclaimers in the documentation and/or other materials provided
      with the distribution.

    * Neither the names of Academic and Research Technologies,
      Northwestern University, nor the names of its contributors may be
      used to endorse or promote products derived from this Software
      without specific prior written permission.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
*/