HeppleTagger.java example

Explorer
morphadorner-opensource-master
- src
package edu.northwestern.at.utils.corpuslinguistics.postagger.hepple;

/*	Please see the license information in the header below. */

import java.io.*;
import java.net.URL;
import java.util.*;
import java.text.*;

import edu.northwestern.at.utils.*;
import edu.northwestern.at.utils.corpuslinguistics.adornedword.*;
import edu.northwestern.at.utils.corpuslinguistics.tokenizer.*;
import edu.northwestern.at.utils.corpuslinguistics.postagger.*;
import edu.northwestern.at.utils.corpuslinguistics.postagger.hepple.rules.*;

/**	HeppleTagger: Mark Hepple's Part of Speech Tagger.
 *
 *	<p>
 *	Copyright (c) 2001-2005, The University of Sheffield.
 *	</p>
 *
 *	<p>
 *	This file is part of GATE (see http://gate.ac.uk/), and is free
 *	software, licenced under the GNU Library General Public License,
 *	Version 2, June 1991 (in the distribution as file licence.html,
 *	and also available at http://gate.ac.uk/gate/licence.html).
 *	</p>
 *
 *	<p>
 *	HeppleTagger was originally written by Mark Hepple.  The GATE version
 *	contains modifications by Valentin Tablan and Niraj Aswani.
 *	</p>
 *
 *	<p>
 *	This version also contains many modifications made at
 *	Northwestern University for use in the WordHoard project.
 *	</p>
 *
 *	<p>
 *	Comments:
 *	</p>
 *
 *	<p>
 *	Implements a version of the decision list based tagging method
 *	described in:
 *	</p>
 *
 *	<p>
 *	M. Hepple. 2000. Independence and Commitment: Assumptions for Rapid
 *	Training and Execution of Rule-based Part-of-Speech Taggers.
 *	Proceedings of the 38th Annual Meeting of the Association for
 *	Computational Linguistics (ACL-2000). Hong Kong, October 2000.
 *	</p>
 *
 *	<p>
 *	Modified by Philip R. Burns at Northwestern University to remove
 *	dependencies upon the Penn Treebank tag set, to allow pluggable
 *	handling of unknown words, to remove all input/output for
 *	tagged text and rules to calling classes, and to allow the
 *	Hepple tagger to be used as a retagger.
 *	</p>
 */

public class HeppleTagger extends AbstractPartOfSpeechTagger
	implements PartOfSpeechTagger, PartOfSpeechRetagger
{
	/**	Tagging rules.
	 *
	 *	<p>
	 *	The tagging rules are stored in a map.  The map keys
	 *	are parts of speech.  The value for each part of speech
	 *	key is a list of rules which apply to that part of speech.
	 *	</p>
	 *
	 *	<p>
	 *	Tagging rules are specified using the syntax proposed by
	 *	Eric Brill in his dissertation.  Rules take the general form:
	 *	</p>
	 *
	 *	<blockquote>
	 *	<p>
	 *	<code>
	 *	fromtag totag condition param1 param2
	 *	</code>
	 *	</p>
	 *	</blockquote>
	 *
	 *	<p>
	 *	where "fromtag" is the current tag for a word,
	 *	"totag" is the new tag to replace the current tag if the
	 *	"condition" is met, and "param1" and "param2" are optional
	 *	values for the condition test.  Each rule must specify at
	 *	least the fromtag, totag, and condition.  The fromtag
	 *	values are the keys for the rules map.
	 *	</p>
	 *
	 *	<p>
	 *	The map is cleared and refilled by <code>setContextRules</code>;
	 *	rules are kept in the order in which they were supplied, and the
	 *	tagging steps fire the first rule in a list that applies.
	 *	</p>
	 */

	protected Map<String, List<Rule>> rules	= MapFactory.createNewMap();

	/**	Sentinel string that marks unused (padding) positions in the
	 *	sliding buffers.
	 *
	 *	<p>
	 *	The tagging methods of this class test for the sentinel with
	 *	reference comparison (<code>==</code> / <code>!=</code>) rather
	 *	than <code>equals</code>.
	 *	NOTE(review): presumably deliberate, so that a real token spelled
	 *	"STAART" is not mistaken for padding -- confirm before changing.
	 *	</p>
	 */

	protected static final String staart				= "STAART";

	/**	Tag list used for padding positions: a single-element array
	 *	holding the sentinel.  Returned by <code>getPartsOfSpeech</code>
	 *	when it is asked about the sentinel itself.
	 */

	protected static final String[] staartLex			=
		new String[]{ staart };

	/**	Padding word constructed with the sentinel as both of its
	 *	arguments; <code>retagSentence</code> feeds it to the retagging
	 *	window to flush the remaining words out.
	 */

	protected static final AdornedWord staartWordAndTag	=
		new BaseAdornedWord( staart , staart );

	/**	Sliding word buffer.
	 *
	 *	<p>
	 *	Holds the spellings of the seven words in the current tagging
	 *	window.  Each step shifts the buffer left by one, stores the new
	 *	word at position 6, applies rules to position 3 and emits the
	 *	word at position 0.  Initially every position is padding.
	 *	</p>
	 *
	 *	<p>
	 *	The tagger passes itself to <code>Rule.apply</code>.
	 *	NOTE(review): the buffers are presumably public so that the rule
	 *	classes can read and update them -- confirm against the rules
	 *	package.
	 *	</p>
	 */

	public String[] wordBuff =
		{ staart, staart, staart, staart, staart, staart, staart };

	/**	Sliding tag buffer.
	 *
	 *	<p>
	 *	Holds the tag currently assigned to each word in the window,
	 *	position for position with <code>wordBuff</code>.  The tag at
	 *	position 0 is the one emitted together with the word at
	 *	position 0.
	 *	</p>
	 */

	public String[] tagBuff	=
		{ staart, staart, staart, staart, staart, staart, staart };

	/**	Sliding parts of speech buffer.
	 *
	 *	<p>
	 *	Holds, for each word in the window, the array of possible tags
	 *	returned by <code>getPartsOfSpeech</code>.  The first entry of
	 *	the array at position 3 is the key used to look up the rules
	 *	to apply.
	 *	</p>
	 */

	public String[][] lexBuff	=
		{	staartLex, staartLex, staartLex, staartLex, staartLex,
			staartLex, staartLex };

	/**	Debug flag.  When true, <code>oneStep</code> prints the word and
	 *	tag at window position 3 to standard output before rules are
	 *	applied.
	 */

	protected boolean debug	= false;

	/**	Create a Hepple part of speech tagger.
	 *
	 *	<p>
	 *	The new tagger has no context rules and a tagging window that
	 *	contains only padding.
	 *	</p>
	 */

	public HeppleTagger()
	{
		super();
	}

	/**	Report whether this tagger works from context rules.
	 *
	 *	@return		Always true: the Hepple tagger is driven by context
	 *				rules.
	 */

	public boolean usesContextRules()
	{
		return true;
	}

	/**	Install the context rules used for tagging.
	 *
	 *	<p>
	 *	Any rules loaded earlier are discarded.  Each rule string is
	 *	split on whitespace; its third part names the rule type and
	 *	selects the rule object to create.  The created rules are
	 *	grouped by their "from" tag, in the order supplied.
	 *	</p>
	 *
	 *	@param	contextRules	String array of context rules,
	 *							one rule per entry.
	 *
	 *	@throws	InvalidRuleException
	 *							if a rule has fewer than three parts
	 *							or its rule object cannot be created.
	 */

	public void setContextRules( String[] contextRules )
		throws InvalidRuleException
	{
		this.contextRules	= contextRules;
		this.rules.clear();

		for ( String ruleText : contextRules )
		{
								//	Split the rule text into its
								//	whitespace-separated parts.

			List<String> parts	= ListFactory.createNewList();

			for (	StringTokenizer tokenizer =
						new StringTokenizer( ruleText ) ;
					tokenizer.hasMoreTokens() ; )
			{
				parts.add( tokenizer.nextToken() );
			}
								//	A usable rule needs a "from" tag,
								//	a "to" tag, and an operation name.

			if ( parts.size() < 3 )
			{
				throw new InvalidRuleException( ruleText );
			}
								//	The operation name picks the
								//	rule type; the rule then reads
								//	its own settings from the parts.

			Rule rule	= createNewRule( parts.get( 2 ) );

			rule.initialise( parts );

								//	File the rule under its "from" tag,
								//	starting a new list for a tag
								//	seen for the first time.

			List<Rule> rulesForTag	= rules.get( rule.from );

			if ( rulesForTag == null )
			{
				rulesForTag	= ListFactory.createNewList();
				rules.put( rule.from , rulesForTag );
			}

			rulesForTag.add( rule );
		}
	}

	/**	Creates a new rule of the required type according to the provided ID.
	 *
	 *	<p>
	 *	The rule class name is built from the package name of this class
	 *	(as reported by <code>ClassUtils.packageName</code>) followed by
	 *	<code>.rules.Rule_</code> and the rule ID.  The class is loaded
	 *	by name, checked to be a <code>Rule</code>, and instantiated
	 *	through its no-argument constructor.
	 *	</p>
	 *
	 *	@param	ruleId		The ID for the rule to be created.
	 *
	 *	@return				A new rule object of the requested type.
	 *
	 *	@throws	InvalidRuleException
	 *						if the rule class cannot be found or loaded,
	 *						is not a <code>Rule</code>, or cannot
	 *						be instantiated.
	 */

	protected Rule createNewRule( String ruleId )
		throws InvalidRuleException
	{
		String ruleClassName	=
			ClassUtils.packageName( HeppleTagger.class.getName() ) +
			".rules.Rule_" + ruleId;

		try
		{
								//	asSubclass rejects a class that is
								//	not a Rule before any instance of
								//	it is created.

			Class<? extends Rule> ruleClass	=
				Class.forName( ruleClassName ).asSubclass( Rule.class );

								//	Class.newInstance() is deprecated;
								//	going through the constructor wraps
								//	anything the constructor throws in
								//	an InvocationTargetException.

			return ruleClass.getDeclaredConstructor().newInstance();
		}
		catch ( Exception e )
		{
			throw new InvalidRuleException(
				"Could not create rule " + ruleId + "!\n" + e.toString() );
		}
		catch ( LinkageError e )
		{
								//	Class loading can also fail with an
								//	Error (for example a class file whose
								//	name differs only in letter case, or
								//	a failing static initialiser).  Report
								//	it as a bad rule as well.

			throw new InvalidRuleException(
				"Could not create rule " + ruleId + "!\n" + e.toString() );
		}
	}

	/**	Tag an adorned word list.
	 *
	 *	@param	sentence	The sentence as a list of
	 *						{@link edu.northwestern.at.utils.corpuslinguistics.adornedword.AdornedWord}
	 *						entries.
	 *
	 *	@return				The same list, with the part of speech
	 *						of each word set to the tag chosen
	 *						by the tagger.
	 *
	 *	<p>
	 *	Each word is pushed through the seven position tagging window,
	 *	followed by six padding entries which flush the last words out.
	 *	The tags that come out of the window are then copied onto the
	 *	original words.  The list itself is neither lengthened nor
	 *	shortened.
	 *	</p>
	 */

	public<T extends AdornedWord> List<T> tagAdornedWordList
	(
		List<T> sentence
	)
	{
								//	The window emits its results into
								//	a scratch list.  The input list must
								//	not be used for this: appending to
								//	it while it is being traversed would
								//	duplicate its words and, for longer
								//	sentences, never terminate.

		List<AdornedWord> taggedWords	= ListFactory.createNewList();

		int wordCount		= sentence.size();
		boolean isFirstWord	= true;

								//	Loop over words in sentence.

		for ( int i = 0 ; i < wordCount ; i++ )
		{
			oneStep( sentence.get( i ) , isFirstWord , taggedWords );

			isFirstWord	= false;
		}
								//	Add six more "staarts"
								//	to flush all words out of the
								//	tagging buffer.

		for ( int i = 0 ; i < 6 ; i++ )
		{
			oneStep(
				new BaseAdornedWord( staart ) , isFirstWord , taggedWords );

			isFirstWord	= false;
		}
								//	Copy the tags onto the original
								//	words.  The window emits one result
								//	per word in sentence order; the
								//	minimum guards the unusual case in
								//	which a word was taken for padding.
								//
								//	NOTE(review): relies on
								//	AdornedWord.setPartsOfSpeech( String ),
								//	the setter paired with the
								//	getPartsOfSpeech() getter used
								//	elsewhere in this class -- confirm
								//	against the interface.

		int tagCount	= Math.min( wordCount , taggedWords.size() );

		for ( int i = 0 ; i < tagCount ; i++ )
		{
			sentence.get( i ).setPartsOfSpeech(
				taggedWords.get( i ).getPartsOfSpeech() );
		}

		return sentence;
	}

	/**	Adds a new word to the current tagging window.
	 *
	 *	@param	word				The new word to add.
	 *	@param	isFirstWord			True if word is first in sentence.
	 *								Currently not used: the lexicon
	 *								lookup is always made with false.
	 *	@param	taggedSentence		A List of adorned words
	 *									representing the results of tagging
	 *									the current sentence so far.  Left
	 *									as a raw list so that callers may
	 *									pass a list of any adorned word
	 *									type.
	 *
	 *	@return						true if the word just added to
	 *								taggedSentence is immediately followed
	 *								by padding in the window, i.e. a full
	 *								sentence is now tagged; false otherwise.
	 *
	 *	<p>
	 *	Adds a new word to the current window of 7 words (on
	 *	the last position) and tags the word currently in the
	 *	middle (i.e. on position 3).  The initial tag of the new word is
	 *	the first of its possible tags.  This function also reads the
	 *	word on the first position and, unless it is padding, adds it
	 *	with its tag to the taggedSentence structure as this word would
	 *	be lost at the next advance.
	 *	</p>
	 */

	@SuppressWarnings("unchecked")
	protected boolean oneStep
	(
		AdornedWord word ,
		boolean isFirstWord ,
		List taggedSentence
	)
	{
								//	Shift the window left by one to make
								//	room for the new word at the end.

		for ( int i = 1 ; i < 7 ; i++ )
		{
			wordBuff[ i - 1 ]	= wordBuff[ i ];
			tagBuff[ i - 1 ]	= tagBuff[ i ];
			lexBuff[ i - 1 ]	= lexBuff[ i ];
		}

		String spelling	= word.getSpelling();

		wordBuff[ 6 ]	= spelling;

								//	This tagger assumes the first
								//	part of speech tag is the most
								//	frequently occurring, so
								//	getPartsOfSpeech returns the
								//	most frequent tag first.

		lexBuff[ 6 ]	= getPartsOfSpeech( spelling , false );
		tagBuff[ 6 ]	= lexBuff[ 6 ][ 0 ];

		if ( debug )
		{
			System.out.println(
				"===> word=" + wordBuff[ 3 ] +
				" currently tagged " + tagBuff[ 3 ] );
		}
								//	Apply the rules to the word in the
								//	middle of the text window, in a
								//	single pass: fire the first rule
								//	that applies and stop.  It may be
								//	the case that no rule applies.

		List<Rule> rulesToApply	= rules.get( lexBuff[ 3 ][ 0 ] );

		if ( rulesToApply != null )
		{
			for ( Rule rule : rulesToApply )
			{
				if ( rule.apply( this ) )
				{
					break;
				}
			}
		}
								//	Save the tagged word from the
								//	first position.  The padding check
								//	compares references, not contents.

		String taggedWord	= wordBuff[ 0 ];

		if ( taggedWord != staart )
		{
			AdornedWord newWord	=
				new BaseAdornedWord( taggedWord , tagBuff[ 0 ] );

			taggedSentence.add( newWord );

			if ( wordBuff[ 1 ] == staart )
			{
								//	The word just saved was the end
								//	of a sentence.
				return true;
			}
		}

		return false;
	}

	/**	Retag one sentence.
	 *
	 *	<p>
	 *	Every word of the sentence is pushed through the retagging
	 *	window, followed by six padding entries which flush the last
	 *	words out.  The results are gathered in a new list; the input
	 *	list is not modified.
	 *	</p>
	 *
	 *	@param	sentence	List of adorned words to retag.
	 *
	 *	@return				List of retagged words.
	 */

	@SuppressWarnings("unchecked")
	public<T extends AdornedWord> List<T> retagSentence
	(
		List<T> sentence
	)
	{
								//	Collects the (word, tag) results.

		List<T> retagged		= ListFactory.createNewList();
		boolean atSentenceStart	= true;

								//	Feed each word to the window.

		for ( T currentWord : sentence )
		{
			oneRetagStep( currentWord , atSentenceStart , retagged );

			atSentenceStart	= false;
		}
								//	Six padding entries push the words
								//	still inside the window out into
								//	the result list.

		T padding		= (T)staartWordAndTag;
		int flushesLeft	= 6;

		while ( flushesLeft-- > 0 )
		{
			oneRetagStep( padding , atSentenceStart , retagged );

			atSentenceStart	= false;
		}

		return retagged;
	}

	/**	Advances the retagging window by one word.
	 *
	 *	@param	adornedWord		The new word and its tag.
	 *	@param	isFirstWord		True if word is first in sentence.
	 *	@param	taggedSentence	A List of adorned words
	 *							representing the results of tagging
	 *							the current sentence so far.
	 *
	 *	@return					true if a full sentence is now tagged,
	 *							false otherwise.
	 *
	 *	<p>
	 *	The window of 7 words is shifted left by one and the new word
	 *	is stored on the last position, keeping the tag it already
	 *	carries.  Rules are then applied to the word in the middle
	 *	(position 3).  Finally the word on the first position, which
	 *	would be lost at the next advance, is added with its tag to
	 *	taggedSentence unless it is padding.  The result is true when
	 *	that word is directly followed by padding.
	 *	</p>
	 */

	@SuppressWarnings("unchecked")
	protected<T extends AdornedWord> boolean oneRetagStep
	(
		T adornedWord ,
		boolean isFirstWord ,
		List<T> taggedSentence
	)
	{
								//	Move every window entry one
								//	position towards the front.

		for ( int target = 0 ; target < 6 ; target++ )
		{
			int source	= target + 1;

			wordBuff[ target ]	= wordBuff[ source ];
			tagBuff[ target ]	= tagBuff[ source ];
			lexBuff[ target ]	= lexBuff[ source ];
		}
								//	The last position takes the new word,
								//	its possible tags from the lexicon,
								//	and the tag it arrived with.

		String spelling	= adornedWord.getSpelling();

		wordBuff[ 6 ]	= spelling;
		lexBuff[ 6 ]	= getPartsOfSpeech( spelling , false );
		tagBuff[ 6 ]	= adornedWord.getPartsOfSpeech();

								//	Rules for the middle word are looked
								//	up under its first possible tag.
								//	The first rule that applies fires
								//	and ends the search; possibly none
								//	applies.

		List<Rule> candidates	= rules.get( lexBuff[ 3 ][ 0 ] );

		if ( candidates != null )
		{
			for ( Rule candidate : candidates )
			{
				if ( candidate.apply( this ) )
				{
					break;
				}
			}
		}
								//	Emit the word leaving the window,
								//	unless it is padding (reference
								//	comparison).

		String leavingWord	= wordBuff[ 0 ];

		if ( leavingWord == staart )
		{
			return false;
		}

		AdornedWord emitted	=
			new BaseAdornedWord( leavingWord , tagBuff[ 0 ] );

		taggedSentence.add( (T)emitted );

								//	Padding right behind the emitted
								//	word means it ended a sentence.

		return ( wordBuff[ 1 ] == staart );
	}

	/** Get parts of speech for a word.
	 *
	 *	@param	word			The word to be classified.
	 *	@param	isFirstWord		True if word is first word in sentence.
	 *							Not used by this implementation.
	 *
	 *	@return			String array of potential parts of speech, with
	 *					the most common tag for the word first.  The
	 *					padding tag list is returned for the padding
	 *					sentinel, and an empty array if the lexicon
	 *					returns no category list at all.
	 *
	 *	<p>
	 *	The lexicon must always return one or more parts of speech.
	 *	In addition, for this tagger, the most frequently occurring
	 *	tag must be the first one in the returned string array.
	 *	</p>
	 *
	 *	<p>
	 *	The result is built in a growable list, so a most common tag
	 *	which is missing from the category list, or which appears in it
	 *	more than once, neither overflows the result nor leaves null
	 *	entries at its end.
	 *	</p>
	 */

	protected String[] getPartsOfSpeech( String word , boolean isFirstWord )
	{
								//	Padding is recognised by reference,
								//	not by content.

		if ( word == staart ) return staartLex;

								//	Get all the categories for the word.

		List<String> categories	= getTagsForWord( word );

								//	Lexicon should never return null
								//	set of categories, but check anyway.

		if ( categories == null )
		{
			return new String[ 0 ];
		}
								//	Put largest (or only) category first.

		String largestCategory	= getMostCommonTag( word );

		List<String> ordered	=
			new ArrayList<String>( categories.size() + 1 );

		ordered.add( largestCategory );

								//	Copy remaining categories, if any,
								//	in lexicon order.

		if ( categories.size() > 1 )
		{
			for ( String category : categories )
			{
				if ( !category.equals( largestCategory ) )
				{
					ordered.add( category );
				}
			}
		}

		return ordered.toArray( new String[ ordered.size() ] );
	}

	/**	Report whether retagging may change the number of words.
	 *
	 *	@return		Always false: this retagger neither adds words to
	 *				nor deletes words from the original sentence.
	 */

	public boolean canAddOrDeleteWords()
	{
		return false;
	}

	/**	Describe this tagger.
	 *
	 *	@return		The fixed description "Hepple tagger".
	 */

	@Override
	public String toString()
	{
		return "Hepple tagger";
	}
}