package edu.northwestern.at.utils.corpuslinguistics.postagger.hepple;
/* Please see the license information in the header below. */
import java.io.*;
import java.net.URL;
import java.util.*;
import java.text.*;
import edu.northwestern.at.utils.*;
import edu.northwestern.at.utils.corpuslinguistics.adornedword.*;
import edu.northwestern.at.utils.corpuslinguistics.tokenizer.*;
import edu.northwestern.at.utils.corpuslinguistics.postagger.*;
import edu.northwestern.at.utils.corpuslinguistics.postagger.hepple.rules.*;
/** HeppleTagger: Mark Hepple's Part of Speech Tagger.
*
* <p>
* Copyright (c) 2001-2005, The University of Sheffield.
* </p>
*
* <p>
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 2, June 1991 (in the distribution as file licence.html,
* and also available at http://gate.ac.uk/gate/licence.html).
* </p>
*
* <p>
* HeppleTagger was originally written by Mark Hepple. The GATE version
* contains modifications by Valentin Tablan and Niraj Aswani.
* </p>
*
* <p>
* This version also contains many modifications made at
* Northwestern University for use in the WordHoard project.
* </p>
*
* <p>
* Comments:
* </p>
*
* <p>
* Implements a version of the decision list based tagging method
* described in:
* </p>
*
* <p>
* M. Hepple. 2000. Independence and Commitment: Assumptions for Rapid
* Training and Execution of Rule-based Part-of-Speech Taggers.
* Proceedings of the 38th Annual Meeting of the Association for
* Computational Linguistics (ACL-2000). Hong Kong, October 2000.
* </p>
*
* <p>
* Modified by Philip R. Burns at Northwestern University to remove
* dependencies upon the Penn Treebank tag set, to allow pluggable
* handling of unknown words, to remove all input/output for
* tagged text and rules to calling classes, and to allow the
* Hepple tagger to be used as a retagger.
* </p>
*/
public class HeppleTagger extends AbstractPartOfSpeechTagger
implements PartOfSpeechTagger, PartOfSpeechRetagger
{
/** Tagging rules.
*
* <p>
* The tagging rules are stored in a map. The map keys
* are parts of speech. The value for each part of speech
* key is a list of rules which apply to that part of speech.
* </p>
*
* <p>
* Tagging rules are specified using the syntax proposed by
* Eric Brill in his dissertation. Rules take the general form:
* </p>
*
* <blockquote>
* <p>
* <code>
* fromtag totag condition param1 param2
* </code>
* </p>
* </blockquote>
*
* <p>
* where "fromtag" is the current tag for a word,
* "totag" is the new tag to replace the current tag if the
* "condition" is met, and "param1" and "param2" are optional
* values for the condition test. Each rule must specify at
* least the fromtag, totag, and condition. The fromtag
* values are the keys for the rules map.
* </p>
*
* <p>
* The map is emptied and refilled each time
* <code>setContextRules</code> is called.
* </p>
*/
protected Map<String, List<Rule>> rules = MapFactory.createNewMap();
/** Marks unused positions in sliding word buffer.
*
* <p>
* The tagging code in this class recognises the marker by
* reference comparison (<code>==</code> / <code>!=</code>) against
* this constant rather than by string content.
* </p>
*/
protected static final String staart = "STAART";
/** Tag list used for the marker: a one-element array holding only
* the marker itself. Returned by <code>getPartsOfSpeech</code> when
* it is handed the marker, and used to pre-fill <code>lexBuff</code>.
*/
protected static final String[] staartLex =
new String[]{ staart };
/** Marker word built with the marker as both constructor arguments.
* <code>retagSentence</code> feeds it into the window to flush the
* remaining words at the end of a sentence.
*/
protected static final AdornedWord staartWordAndTag =
new BaseAdornedWord( staart , staart );
/** Sliding word buffer.
*
* <p>
* Seven slots. Each tagging step shifts every entry one slot towards
* index 0 and stores the incoming spelling at index 6; the entry at
* index 0 is the one emitted to the tagged sentence.
* </p>
*/
public String[] wordBuff =
{ staart, staart, staart, staart, staart, staart, staart };
/** Sliding tag buffer.
*
* <p>
* Runs parallel to <code>wordBuff</code>: slot <i>i</i> holds the tag
* currently assigned to the word in slot <i>i</i>.
* </p>
*/
public String[] tagBuff =
{ staart, staart, staart, staart, staart, staart, staart };
/** Sliding parts of speech buffer.
*
* <p>
* Runs parallel to <code>wordBuff</code>: slot <i>i</i> holds the array
* returned by <code>getPartsOfSpeech</code> for the word in slot <i>i</i>.
* The first tag of the array in slot 3 selects which rule list is tried.
* </p>
*/
public String[][] lexBuff =
{ staartLex, staartLex, staartLex, staartLex, staartLex,
staartLex, staartLex };
/** Debug flag. When true, <code>oneStep</code> prints the word and tag
* in slot 3 of the window to <code>System.out</code>.
*/
protected boolean debug = false;
/**
 * Creates a Hepple part of speech tagger.
 *
 * <p>
 * The constructor itself does no work; context rules are supplied
 * separately through {@code setContextRules}.
 * </p>
 */
public HeppleTagger()
{
    super();
}
/**
 * Reports whether this tagger works with context rules.
 *
 * @return Always {@code true}: the Hepple tagger is driven by
 *         context rules.
 */
public boolean usesContextRules()
{
    return true;
}
/**
 * Replaces the tagger's context rules.
 *
 * <p>
 * The rule texts are remembered, the rule map is emptied, and each rule
 * text is then split on whitespace, turned into a {@code Rule} object and
 * filed under its "from" tag. Rules are processed in array order, so rules
 * that share a "from" tag keep their relative order.
 * </p>
 *
 * <p>
 * Parsing stops at the first bad rule. Rules that were parsed before it
 * stay in the rule map.
 * </p>
 *
 * @param contextRules String array of context rules, one rule per entry.
 *
 * @throws InvalidRuleException if a rule has fewer than three
 *                              whitespace-separated parts, or if no rule
 *                              object can be created for its third part.
 */
public void setContextRules( String[] contextRules )
    throws InvalidRuleException
{
    this.contextRules = contextRules;
    this.rules.clear();

    for ( String ruleText : contextRules )
    {
        // Split the rule text into its whitespace-separated parts.
        List<String> fields = ListFactory.createNewList();

        for ( StringTokenizer splitter = new StringTokenizer( ruleText ) ;
              splitter.hasMoreTokens() ; )
        {
            fields.add( splitter.nextToken() );
        }

        // A usable rule needs a "from" tag, a "to" tag and an
        // operation name, in that order.
        if ( fields.size() < 3 )
        {
            throw new InvalidRuleException( ruleText );
        }

        // The third part names the rule type to instantiate.
        Rule parsedRule = createNewRule( fields.get( 2 ) );
        parsedRule.initialise( fields );

        // File the rule under its "from" tag, starting a new
        // list the first time that tag is seen.
        List<Rule> rulesForTag = rules.get( parsedRule.from );

        if ( rulesForTag == null )
        {
            rulesForTag = ListFactory.createNewList();
            rules.put( parsedRule.from , rulesForTag );
        }

        rulesForTag.add( parsedRule );
    }
}
/**
 * Creates a new rule of the required type according to the provided ID.
 *
 * <p>
 * The rule class name is formed by appending {@code ".rules.Rule_"} and
 * the rule ID to the package name that {@code ClassUtils.packageName}
 * reports for this class. The class is loaded reflectively and
 * instantiated through its no-argument constructor.
 * </p>
 *
 * @param ruleId The ID for the rule to be created.
 *
 * @return A new, uninitialised rule object of the requested type.
 *
 * @throws InvalidRuleException if the rule class cannot be found, is not
 *                              a {@code Rule}, or cannot be instantiated.
 *                              The message carries the rule ID and the
 *                              text of the underlying exception.
 */
protected Rule createNewRule( String ruleId )
    throws InvalidRuleException
{
    String ruleClassName =
        ClassUtils.packageName( HeppleTagger.class.getName() ) +
        ".rules.Rule_" + ruleId;

    try
    {
        // Check the type before instantiating, so a class that is
        // not a Rule is rejected without running its constructor.
        Class<? extends Rule> ruleClass =
            Class.forName( ruleClassName ).asSubclass( Rule.class );

        // Class.newInstance() is deprecated: it rethrows checked
        // exceptions from the constructor without any compile-time
        // checking. Going through the Constructor wraps them in an
        // InvocationTargetException instead, which is caught below.
        return ruleClass.getDeclaredConstructor().newInstance();
    }
    catch ( Exception e )
    {
        // NOTE(review): only InvalidRuleException(String) is visible
        // from here, so the cause is carried as text in the message.
        throw new InvalidRuleException(
            "Could not create rule " + ruleId + "!\n" + e.toString() );
    }
}
/**
 * Tags a sentence given as a list of adorned words.
 *
 * <p>
 * Every word is pushed through the seven-slot tagging window, followed
 * by six marker words that flush the window. The tags that come out are
 * then written back onto the original words, so the list that is returned
 * is the list that was passed in, with the same word objects.
 * </p>
 *
 * <p>
 * The window output is collected in a scratch list rather than in
 * {@code sentence}. {@code oneStep} appends one entry for every word that
 * leaves the window; collecting those entries in the list being iterated
 * would make it grow on every pass, so the loop would never end for a
 * sentence of seven or more words, and shorter sentences would come back
 * with duplicate entries.
 * </p>
 *
 * @param sentence The sentence as a list of
 *                 {@link edu.northwestern.at.utils.corpuslinguistics.adornedword.AdornedWord}.
 *
 * @return The same list, with a part of speech set on each word.
 */
public<T extends AdornedWord> List<T> tagAdornedWordList
(
    List<T> sentence
)
{
    // Number of words to tag, fixed before tagging starts.
    final int wordCount = sentence.size();

    // Receives one (spelling, tag) entry per word as it
    // leaves the tagging window.
    List<AdornedWord> taggedWords = ListFactory.createNewList();

    boolean isFirstWord = true;

    for ( int i = 0 ; i < wordCount ; i++ )
    {
        oneStep( sentence.get( i ) , isFirstWord , taggedWords );
        isFirstWord = false;
    }

    // Six more marker words push every real word out of
    // the seven-slot window.
    for ( int i = 0 ; i < 6 ; i++ )
    {
        oneStep( new BaseAdornedWord( staart ) , isFirstWord , taggedWords );
        isFirstWord = false;
    }

    // Words leave the window in the order they entered it,
    // so entry i carries the tag for word i. The bound
    // guards against a word that oneStep did not emit.
    // NOTE(review): relies on AdornedWord.setPartsOfSpeech(String),
    // which is declared outside this file.
    final int tagCount = Math.min( wordCount , taggedWords.size() );

    for ( int i = 0 ; i < tagCount ; i++ )
    {
        sentence.get( i ).setPartsOfSpeech(
            taggedWords.get( i ).getPartsOfSpeech() );
    }

    return sentence;
}
/**
 * Advances the tagging window by one word.
 *
 * <p>
 * All three buffers are shifted one slot towards index 0 and the new
 * word, its candidate tags and its first candidate tag are stored in the
 * last slot (index 6). The rules filed under the first candidate tag of
 * the word in the middle slot (index 3) are then tried in order until one
 * of them reports that it applied. Finally the word now sitting in slot 0,
 * unless it is the marker, is appended to {@code taggedSentence} together
 * with its tag, because the next step will shift it out of the window.
 * </p>
 *
 * @param word           The new word to add.
 * @param isFirstWord    True if word is first in sentence. Not consulted
 *                       by this method.
 * @param taggedSentence A List of adorned words representing the results
 *                       of tagging the current sentence so far.
 *
 * @return true if a word was appended and the slot behind it holds the
 *         marker, i.e. the appended word ended a sentence;
 *         false otherwise.
 */
@SuppressWarnings("unchecked")
protected boolean oneStep
(
    AdornedWord word ,
    boolean isFirstWord ,
    List taggedSentence
)
{
    // Slide the window: slots 1..6 move to 0..5.
    System.arraycopy( wordBuff , 1 , wordBuff , 0 , 6 );
    System.arraycopy( tagBuff , 1 , tagBuff , 0 , 6 );
    System.arraycopy( lexBuff , 1 , lexBuff , 0 , 6 );

    // Put the incoming word in the freed last slot. The
    // first candidate tag becomes its initial tag.
    String incomingSpelling = word.getSpelling();

    wordBuff[ 6 ] = incomingSpelling;
    lexBuff[ 6 ] = getPartsOfSpeech( incomingSpelling , false );
    tagBuff[ 6 ] = lexBuff[ 6 ][ 0 ];

    if ( debug )
    {
        System.out.println(
            "===> word=" + wordBuff[ 3 ] +
            " currently tagged " + tagBuff[ 3 ] );
    }

    // Try the rules for the middle word one by one and
    // stop at the first that applies. There may be none.
    List<Rule> candidateRules = rules.get( lexBuff[ 3 ][ 0 ] );

    if ( candidateRules != null )
    {
        for ( Rule candidateRule : candidateRules )
        {
            if ( candidateRule.apply( this ) )
            {
                break;
            }
        }
    }

    // Emit the word in slot 0 before the next step drops
    // it. The marker is recognised by reference.
    String departingWord = wordBuff[ 0 ];

    if ( departingWord == staart )
    {
        return false;
    }

    taggedSentence.add(
        new BaseAdornedWord( departingWord , tagBuff[ 0 ] ) );

    // A marker right behind the emitted word means that
    // word was the last one of a sentence.
    return ( wordBuff[ 1 ] == staart );
}
/**
 * Retags one sentence.
 *
 * <p>
 * Each word is fed through the retagging window, followed by six marker
 * words that flush the window. The results are collected in a new list;
 * the input list is only read.
 * </p>
 *
 * @param sentence List of adorned words to retag.
 *
 * @return New list holding the retagged words.
 */
@SuppressWarnings("unchecked")
public<T extends AdornedWord> List<T> retagSentence
(
    List<T> sentence
)
{
    // Collects the (word, tag) results.
    List<T> retaggedWords = ListFactory.createNewList();

    boolean isFirstWord = true;

    for ( T currentWord : sentence )
    {
        oneRetagStep( currentWord , isFirstWord , retaggedWords );
        isFirstWord = false;
    }

    // Six marker words push every remaining word out of
    // the seven-slot window.
    int flushesLeft = 6;

    while ( flushesLeft-- > 0 )
    {
        oneRetagStep( (T)staartWordAndTag , isFirstWord , retaggedWords );
        isFirstWord = false;
    }

    return retaggedWords;
}
/**
 * Advances the retagging window by one word.
 *
 * <p>
 * All three buffers are shifted one slot towards index 0. The new word's
 * spelling, its candidate tags and the tag it already carries are stored
 * in the last slot (index 6). The rules filed under the first candidate
 * tag of the word in the middle slot (index 3) are then tried in order
 * until one of them reports that it applied. Finally the word now sitting
 * in slot 0, unless it is the marker, is appended to
 * {@code taggedSentence} together with its tag, because the next step
 * will shift it out of the window.
 * </p>
 *
 * @param adornedWord    The new word and its tag.
 * @param isFirstWord    True if word is first in sentence. Not consulted
 *                       by this method.
 * @param taggedSentence A List of adorned words representing the results
 *                       of tagging the current sentence so far.
 *
 * @return true if a word was appended and the slot behind it holds the
 *         marker, i.e. the appended word ended a sentence;
 *         false otherwise.
 */
@SuppressWarnings("unchecked")
protected<T extends AdornedWord> boolean oneRetagStep
(
    T adornedWord ,
    boolean isFirstWord ,
    List<T> taggedSentence
)
{
    // Slide the window: slots 1..6 move to 0..5.
    System.arraycopy( wordBuff , 1 , wordBuff , 0 , 6 );
    System.arraycopy( tagBuff , 1 , tagBuff , 0 , 6 );
    System.arraycopy( lexBuff , 1 , lexBuff , 0 , 6 );

    // Put the incoming word in the freed last slot. Unlike
    // first-pass tagging, the word keeps the tag it came with.
    String incomingSpelling = adornedWord.getSpelling();

    wordBuff[ 6 ] = incomingSpelling;
    lexBuff[ 6 ] = getPartsOfSpeech( incomingSpelling , false );
    tagBuff[ 6 ] = adornedWord.getPartsOfSpeech();

    // Try the rules for the middle word one by one and
    // stop at the first that applies. There may be none.
    List<Rule> candidateRules = rules.get( lexBuff[ 3 ][ 0 ] );

    if ( candidateRules != null )
    {
        for ( Rule candidateRule : candidateRules )
        {
            if ( candidateRule.apply( this ) )
            {
                break;
            }
        }
    }

    // Emit the word in slot 0 before the next step drops
    // it. The marker is recognised by reference.
    String departingWord = wordBuff[ 0 ];

    if ( departingWord == staart )
    {
        return false;
    }

    AdornedWord emittedWord =
        new BaseAdornedWord( departingWord , tagBuff[ 0 ] );

    taggedSentence.add( (T)emittedWord );

    // A marker right behind the emitted word means that
    // word was the last one of a sentence.
    return ( wordBuff[ 1 ] == staart );
}
/**
 * Gets the candidate parts of speech for a word, most common tag first.
 *
 * <p>
 * The lexicon must always return one or more parts of speech.
 * In addition, for this tagger, the most frequently occurring
 * tag must be the first one in the returned string array.
 * </p>
 *
 * <p>
 * The result is assembled in a growable list, so inconsistent lexicon
 * data cannot overflow it: a most common tag that is missing from the
 * category list is still placed first, repeated occurrences of it are
 * dropped rather than leaving {@code null} entries at the end, and an
 * empty category list yields just the most common tag.
 * </p>
 *
 * @param word        The word to be classified.
 * @param isFirstWord True if word is first word in sentence. Not
 *                    consulted by this method.
 *
 * @return String array of potential parts of speech. For the window
 *         marker this is the marker's own tag array. The array is
 *         empty only when the lexicon returns no category list at all.
 */
protected String[] getPartsOfSpeech( String word , boolean isFirstWord )
{
    // The marker is recognised by reference, so a real word
    // that merely has the same spelling is still looked up.
    if ( word == staart )
    {
        return staartLex;
    }

    // Get all the categories for the word. The lexicon should
    // never return null, but check anyway.
    List<String> categories = getTagsForWord( word );

    if ( categories == null )
    {
        return new String[ 0 ];
    }

    String largestCategory = getMostCommonTag( word );

    // The most common tag always goes first.
    List<String> orderedCategories = ListFactory.createNewList();
    orderedCategories.add( largestCategory );

    // With a single category that category is taken to be the
    // most common tag, so there is nothing more to copy.
    if ( categories.size() > 1 )
    {
        for ( String category : categories )
        {
            boolean isLargest =
                ( category == null ) ?
                    ( largestCategory == null ) :
                    category.equals( largestCategory );

            if ( !isLargest )
            {
                orderedCategories.add( category );
            }
        }
    }

    return orderedCategories.toArray(
        new String[ orderedCategories.size() ] );
}
/**
 * Reports whether retagging may add words to, or delete words from,
 * the original sentence.
 *
 * @return Always {@code false} for this tagger.
 */
public boolean canAddOrDeleteWords()
{
    return false;
}
/**
 * Describes this tagger.
 *
 * @return The fixed description {@code "Hepple tagger"}.
 */
@Override
public String toString()
{
    return "Hepple tagger";
}
}