package edu.northwestern.at.utils.corpuslinguistics.stemmer;
/* Please see the license information in the header below. */
import java.io.*;
import java.lang.*;
import java.util.*;
import edu.northwestern.at.utils.CharUtils;
/** LancasterStemmer: Implements the Lancaster (Paice/Husk) word stemmer.
*
* <p>
* Paice/Husk Stemmer - License Statement.
* </p>
*
* <p>
* This software was designed and developed at Lancaster
* University, Lancaster, UK, under the supervision of Dr Chris Paice.
* It is fully in the public domain, and may be used or adapted by any
* organisation or individual. Neither Dr Paice nor Lancaster
* University accepts any responsibility whatsoever for its use by
* other parties, and makes no guarantees, expressed or implied, about
* its quality, reliability, or any other characteristic.
* </p>
*
* <p>
* It is assumed that, as a matter of professional courtesy, anyone
* who incorporates this software into a system of their own, whether
* for commercial or research purposes, will acknowledge the source of
* the code.
* </p>
*
* <p>
* Modified from the original Java programs written by Christopher O'Neill
* and Rob Hooper.
* </p>
*/
public class LancasterStemmer implements Stemmer
{
/** Prefixes to remove from words before stemming.
 *
 * <p>
 * {@code stripPrefixes} compares a lower-cased copy of the word against
 * these entries in array order and removes only the first entry that
 * matches, so every entry must itself be lower case.
 * </p>
 */
public static final String[] prefixes =
{
"intra" ,
"kilo" ,
"mega" ,
"micro" ,
"milli" ,
"nano" ,
"pico" ,
"pseudo" ,
"ultra" ,
};
/** Default stemming rules.
 *
 * <p>
 * {@code loadRules} removes the blanks from each entry;
 * {@code stripSuffixes} then reads an entry as follows:
 * </p>
 *
 * <ol>
 * <li>the ending to match, spelled in reverse (last letter first);</li>
 * <li>an optional asterisk: the rule applies only while the word is
 *     still intact, i.e. no rule has been applied to it yet;</li>
 * <li>a digit: the number of trailing letters to remove;</li>
 * <li>optional letters to append after the removal;</li>
 * <li>a terminator: "." ends stemming, "&gt;" continues stemming with
 *     the new ending.</li>
 * </ol>
 *
 * <p>
 * {@code stripSuffixes} does not read past the terminator, so the text
 * in braces only describes the rule for human readers.
 * </p>
 *
 * <p>
 * These rules MUST be stored in ascending order of the first character,
 * with all rules for one letter adjacent: {@code loadRules} builds its
 * per-letter index in a single forward pass.
 * </p>
 *
 * <p>
 * The last entry ("end0.") is a sentinel. {@code loadRules} leaves the
 * last entry out of the per-letter index, and {@code stripSuffixes}
 * relies on it as the entry found when it advances past the last rule
 * for 'z'.
 * </p>
 */
public static final String[] defaultStemmingRules =
new String[]
{
"ai*2. { -ia > - if intact }",
"a*1. { -a > - if intact }",
"bb1. { -bb > -b }",
"city3s. { -ytic > -ys }",
"ci2> { -ic > - }",
"cn1t> { -nc > -nt }",
"dd1. { -dd > -d }",
"dei3y> { -ied > -y }",
"deec2ss. { -ceed > -cess }",
"dee1. { -eed > -ee }",
"de2> { -ed > - }",
"dooh4> { -hood > - }",
"e1> { -e > - }",
"feil1v. { -lief > -liev }",
"fi2> { -if > - }",
"gni3> { -ing > - }",
"gai3y. { -iag > -y }",
"ga2> { -ag > - }",
"gg1. { -gg > -g }",
"ht*2. { -th > - if intact }",
"hsiug5ct. { -guish > -ct }",
"hsi3> { -ish > - }",
"i*1. { -i > - if intact }",
"i1y> { -i > -y }",
"ji1d. { -ij > -id -- see nois4j> & vis3j> }",
"juf1s. { -fuj > -fus }",
"ju1d. { -uj > -ud }",
"jo1d. { -oj > -od }",
"jeh1r. { -hej > -her }",
"jrev1t. { -verj > -vert }",
"jsim2t. { -misj > -mit }",
"jn1d. { -nj > -nd }",
"j1s. { -j > -s }",
"lbaifi6. { -ifiabl > - }",
"lbai4y. { -iabl > -y }",
"lba3> { -abl > - }",
"lbi3. { -ibl > - }",
"lib2l> { -bil > -bl }",
"lc1. { -cl > c }",
"lufi4y. { -iful > -y }",
"luf3> { -ful > - }",
"lu2. { -ul > - }",
"lai3> { -ial > - }",
"lau3> { -ual > - }",
"la2> { -al > - }",
"ll1. { -ll > -l }",
"mui3. { -ium > - }",
"mu*2. { -um > - if intact }",
"msi3> { -ism > - }",
"mm1. { -mm > -m }",
"nois4j> { -sion > -j }",
"noix4ct. { -xion > -ct }",
"noi3> { -ion > - }",
"nai3> { -ian > - }",
"na2> { -an > - }",
"nee0. { protect -een }",
"ne2> { -en > - }",
"nn1. { -nn > -n }",
"pihs4> { -ship > - }",
"pp1. { -pp > -p }",
"re2> { -er > - }",
"rae0. { protect -ear }",
"ra2. { -ar > - }",
"ro2> { -or > - }",
"ru2> { -ur > - }",
"rr1. { -rr > -r }",
"rt1> { -tr > -t }",
"rei3y> { -ier > -y }",
"sei3y> { -ies > -y }",
"sis2. { -sis > -s }",
"si2> { -is > - }",
"ssen4> { -ness > - }",
"ss0. { protect -ss }",
"suo3> { -ous > - }",
"su*2. { -us > - if intact }",
"s*1> { -s > - if intact }",
"s0. { -s > -s }",
"tacilp4y. { -plicat > -ply }",
"ta2> { -at > - }",
"tnem4> { -ment > - }",
"tne3> { -ent > - }",
"tna3> { -ant > - }",
"tpir2b. { -ript > -rib }",
"tpro2b. { -orpt > -orb }",
"tcud1. { -duct > -duc }",
"tpmus2. { -sumpt > -sum }",
"tpec2iv. { -cept > -ceiv }",
"tulo2v. { -olut > -olv }",
"tsis0. { protect -sist }",
"tsi3> { -ist > - }",
"tt1. { -tt > -t }",
"uqi3. { -iqu > - } ",
"ugo1. { -ogu > -og }",
"vis3j> { -siv > -j }",
"vie0. { protect -eiv }",
"vi2> { -iv > - }",
"ylb1> { -bly > -bl }",
"yli3y> { -ily > -y }",
"ylp0. { protect -ply }",
"yl2> { -ly > - }",
"ygo1. { -ogy > -og }",
"yhp1. { -phy > -ph }",
"ymo1. { -omy > -om }",
"ypo1. { -opy > -op }",
"yti3> { -ity > - }",
"yte3> { -ety > - }",
"ytl2. { -lty > -l }",
"yrtsi5. { -istry > - }",
"yra3> { -ary > - }",
"yro3> { -ory > - }",
"yfi3. { -ify > - }",
"ycn2t> { -ncy > -nt }",
"yca3> { -acy > - }",
"zi2> { -iz > - }",
"zy1s. { -yz > -ys }",
"end0."
};
/** Character for the "0" digit.
 *
 * <p>
 * Used by {@code stripSuffixes} to turn the digit character of a rule
 * into the number of letters that rule removes.
 * </p>
 */
protected final static char zeroDigit = '0';
/** Rule table.
 *
 * <p>
 * Filled by {@code loadRules} with the supplied rules, blanks removed,
 * in the order they were given.
 * </p>
 */
protected Vector<String> ruleTable;
/** Index to rule table.
 *
 * <p>
 * For each letter 'a' through 'z', contains the index in
 * ruleTable of the first rule beginning with the
 * corresponding letter. Position 0 is for letter 'a',
 * position 1 for letter 'b', and so on. In the default table above,
 * ruleTableIndex[ 0 ] = 0, ruleTableIndex[ 1 ] = 2, etc.
 * A letter without a rule is assigned the index of the first rule
 * of the next letter which has one. Letters that come after the
 * first letter of the last indexed rule keep the value 0.
 * </p>
 */
protected int[] ruleTableIndex;
/** True to remove prefixes before suffixes are stripped.
 *
 * <p>
 * {@code stem} only attempts prefix removal on strings longer than
 * three characters.
 * </p>
 */
protected boolean preStrip;
/** Create a Paice/Husk stemmer using the default stemming rules.
 *
 * <p>
 * Prefix removal is enabled: {@code stem} strips a known prefix from
 * any word longer than three characters before removing suffixes.
 * </p>
 */
public LancasterStemmer()
{
    // Same initialisation as before (preStrip = true, then loadRules),
    // delegated to the most general constructor.
    this( defaultStemmingRules , true );
}
/** Create a Paice/Husk stemmer from a string list of rules.
 *
 * @param rules The stemming rules as an array of String,
 *              in the format accepted by {@code loadRules}.
 *
 * <p>
 * Prefix removal is enabled: {@code stem} strips a known prefix from
 * any word longer than three characters before removing suffixes.
 * </p>
 */
public LancasterStemmer( String[] rules )
{
    // Same initialisation as before (preStrip = true, then loadRules),
    // delegated to the most general constructor.
    this( rules , true );
}
/** Create a Paice/Husk stemmer from a string list of rules.
 *
 * @param rules The stemming rules as an array of String.
 * @param preStrip True to remove prefixes before stripping suffixes.
 *                 {@code stem} only attempts prefix removal on
 *                 strings longer than three characters.
 *
 * <p>
 * When {@code preStrip} is false no prefixes are removed.
 * </p>
 */
public LancasterStemmer( String[] rules , boolean preStrip )
{
// Set the flag before loading: loadRules is protected and may be
// overridden, so keep this order.
this.preStrip = preStrip;
loadRules( rules );
}
/** Loads the stemming rules.
 *
 * <p>
 * Blanks are removed from every rule and the rules are stored, in the
 * given order, in {@code ruleTable}. {@code ruleTableIndex} is then
 * built so that each letter maps to the index of the first rule that
 * starts with it; a letter without rules maps to the first rule of the
 * next letter that has one, and letters after the last indexed rule
 * keep index 0.
 * </p>
 *
 * <p>
 * The last entry of {@code rules} is treated as a terminator (such as
 * "end0." in the default table) and is not indexed.
 * </p>
 *
 * @param rules String array of rules, sorted in ascending order of
 *              their first character.
 *
 * @throws NullPointerException if {@code rules} is null.
 * @throws IllegalArgumentException if a rule is empty once its blanks
 *         are removed, or if an indexed rule does not start with a
 *         letter 'a'..'z' or breaks the ascending order.
 */
protected void loadRules( String[] rules )
{
    if ( rules == null )
    {
        throw new NullPointerException( "rules must not be null" );
    }

    // Table of rules, blanks removed.
    Vector<String> table = new Vector<String>( rules.length );

    for ( int i = 0 ; i < rules.length ; i++ )
    {
        // Literal replacement; no regular expression is needed.
        String rule = rules[ i ].replace( " " , "" );

        if ( rule.length() == 0 )
        {
            throw new IllegalArgumentException(
                "Stemming rule " + i + " is empty" );
        }

        table.addElement( rule );
    }

    // Maps letter to index of first rule in the rule table starting
    // with that letter. A new int array is already filled with zeros,
    // so all 26 entries start at 0.
    int[] index = new int[ 'z' - 'a' + 1 ];

    // Walk the rules once, advancing "ch" whenever the first letter
    // changes. The last entry is the terminator and is skipped.
    char ch = 'a';

    for ( int i = 0 ; i < ( table.size() - 1 ) ; i++ )
    {
        char first = table.elementAt( i ).charAt( 0 );

        // A first character below "ch" or above 'z' could never be
        // reached by advancing "ch"; previously this ran off the end
        // of the index array.
        if ( ( first < ch ) || ( first > 'z' ) )
        {
            throw new IllegalArgumentException(
                "Stemming rule " + i + " (\"" + rules[ i ] +
                "\") must start with a letter 'a'..'z' and rules " +
                "must be sorted by first letter" );
        }

        while ( ch != first )
        {
            ch++;
            index[ charCode( ch ) ] = i;
        }
    }

    // Publish only after the whole table has been validated.
    ruleTable = table;
    ruleTableIndex = index;
}
/** Finds the first vowel in the leading part of a string.
 *
 * <p>
 * Only positions before {@code last} are examined. Each character is
 * tested with {@code vowel}, which is also given the character before
 * it; the first character is tested as if it followed an 'a'.
 * </p>
 *
 * @param s String to search for vowel.
 * @param last Position at which the search stops (not examined).
 *
 * @return Zero-based index of the first vowel found before
 *         {@code last}, or {@code last} when there is none.
 */
protected int firstVowel( String s , int last )
{
    char previous = 'a';
    int pos = 0;

    while ( pos < last )
    {
        char current = s.charAt( pos );

        if ( vowel( current , previous ) )
        {
            // Found one before the limit.
            return pos;
        }

        previous = current;
        pos++;
    }

    // No vowel before the limit.
    return last;
}
/** Strip suffixes from a string.
 *
 * <p>
 * The input is lower-cased and reduced to its letters. Rules for the
 * current final letter are then tried in table order. When a rule
 * matches and leaves an acceptable stem it is applied: a rule ending in
 * "." finishes stemming, a rule ending in "&gt;" restarts the search
 * with the new final letter. Stemming also finishes when no rule for
 * the current final letter can be applied.
 * </p>
 *
 * @param s The string from which to remove suffixes.
 *
 * @return The lower-cased, letters-only form of the string with
 *         suffixes removed.
 */
protected String stripSuffixes( String s )
{
    // "stem" holds the word as stemming proceeds. Lower-case with a
    // fixed locale: under a Turkish or Azeri default locale "I" would
    // otherwise become a dotless i, which is outside 'a'..'z' and so
    // can never match a rule.
    String stem = clean( s.toLowerCase( Locale.ENGLISH ) );

    // Index of the last letter of the stem. clean() has normally
    // removed every non-letter, so this is usually length - 1.
    int lastLetterPos = 0;

    while ( ( ( lastLetterPos + 1 ) < stem.length() ) &&
            isLetter( stem.charAt( lastLetterPos + 1 ) ) )
    {
        lastLetterPos++;
    }

    // Fewer than two letters: nothing to stem.
    if ( lastLetterPos < 1 )
    {
        return stem;
    }

    // Position of the first vowel; limits how far back a rule may
    // match and how short the stem may become.
    final int firstVowelPos = firstVowel( stem , lastLetterPos );

    // True until the first rule has been applied.
    boolean intact = true;

    // 0 = keep trying rules for the current final letter,
    // 1 = a continuing rule was applied, look at the new final letter,
    // -1 = stemming is complete.
    int done = 0;

    while ( done != -1 )
    {
        done = 0;

        final char lastLetter = stem.charAt( lastLetterPos );

        // Are there any possible rules for this letter?
        int currentRuleIndex = -1;

        if ( isLetter( lastLetter ) &&
             ( lastLetter >= 'a' ) &&
             ( lastLetter <= 'z' ) )
        {
            currentRuleIndex = ruleTableIndex[ charCode( lastLetter ) ];
        }

        // No rule available (or an empty rule table) -- stemming done.
        if ( ( currentRuleIndex < 0 ) ||
             ( currentRuleIndex >= ruleTable.size() ) )
        {
            break;
        }

        // Pick up first potentially matching rule.
        String rule = ruleTable.elementAt( currentRuleIndex );

        while ( done == 0 )
        {
            // Rule letter changed: no rules left for this letter.
            if ( rule.charAt( 0 ) != lastLetter )
            {
                done = -1;
                break;
            }

            // Index in rule: second character.
            int ruleCharPos = 1;

            // Index in stem: next to last letter.
            int wordCharPos = lastLetterPos - 1;

            // 0 = still comparing, 1 = rule matches, -1 = rejected.
            int ruleOK = 0;

            while ( ruleOK == 0 )
            {
                final char ruleChar = rule.charAt( ruleCharPos );

                if ( isDigit( ruleChar ) )
                {
                    // Reached the digit: the whole ending matched.
                    ruleOK = 1;
                }
                else if ( ruleChar == '*' )
                {
                    // Match only if word intact.
                    if ( intact )
                    {
                        // Step over the asterisk onto the digit.
                        ruleCharPos++;
                        ruleOK = 1;
                    }
                    else
                    {
                        ruleOK = -1;
                    }
                }
                else if ( ruleChar != stem.charAt( wordCharPos ) )
                {
                    // Mismatch of letters.
                    ruleOK = -1;
                }
                else if ( wordCharPos <= firstVowelPos )
                {
                    // Insufficient stem remaining.
                    ruleOK = -1;
                }
                else
                {
                    // Compare next pair of letters: forwards in the
                    // rule, backwards in the stem.
                    ruleCharPos++;
                    wordCharPos--;
                }
            }

            boolean applied = false;

            if ( ruleOK == 1 )
            {
                // Count the replacement letters between the digit and
                // the terminator. The terminator is the first
                // character in the range '.' to '>'.
                int appendCount = 0;
                char next = rule.charAt( ruleCharPos + 1 );

                while ( ( next < '.' ) || ( next > '>' ) )
                {
                    appendCount++;
                    next = rule.charAt( ruleCharPos + appendCount + 1 );
                }

                // Number of letters the rule removes.
                final int removeCount =
                    rule.charAt( ruleCharPos ) - zeroDigit;

                // Position of last letter if the rule were used.
                final int newLastLetterPos =
                    lastLetterPos + appendCount - removeCount;

                final boolean acceptable;

                if ( firstVowelPos == 0 )
                {
                    // Word starts with a vowel: minimal stem is
                    // 2 letters.
                    acceptable = ( newLastLetterPos >= 1 );
                }
                else
                {
                    // Word starts with a consonant: minimal stem is
                    // 3 letters including one or more vowels.
                    acceptable =
                        ( newLastLetterPos >= 2 ) &&
                        ( newLastLetterPos >= firstVowelPos );
                }

                if ( acceptable )
                {
                    // Input string is no longer intact.
                    intact = false;

                    // Cut the stem at the position given by the digit.
                    lastLetterPos -= removeCount;
                    ruleCharPos++;

                    StringBuilder newStem =
                        new StringBuilder(
                            stem.substring( 0 , lastLetterPos + 1 ) );

                    // Append any letters following the digit.
                    while ( ( ruleCharPos < rule.length() ) &&
                            isLetter( rule.charAt( ruleCharPos ) ) )
                    {
                        newStem.append( rule.charAt( ruleCharPos ) );
                        ruleCharPos++;
                        lastLetterPos++;
                    }

                    stem = newStem.toString();

                    // '.' ends stemming; anything else ('>') continues
                    // with the new final letter.
                    done = ( rule.charAt( ruleCharPos ) == '.' ) ? -1 : 1;

                    applied = true;
                }
            }

            if ( !applied )
            {
                // Rule did not match. Try next rule in rule table.
                currentRuleIndex++;

                if ( currentRuleIndex >= ruleTable.size() )
                {
                    // Ran off the end of a table that has no trailing
                    // terminator entry: no more rules to try.
                    done = -1;
                }
                else
                {
                    rule = ruleTable.elementAt( currentRuleIndex );

                    // When the initial letter changes, there are no
                    // more rules to try.
                    if ( rule.charAt( 0 ) != lastLetter )
                    {
                        done = -1;
                    }
                }
            }
        }
    }

    return stem;
}
/** Decide whether a character counts as a vowel.
 *
 * <p>
 * A character that {@code CharUtils.isEnglishVowel} accepts is a vowel.
 * Otherwise only "y" can still count as one, and only when the
 * previous character is not accepted by
 * {@code CharUtils.isEnglishVowel}.
 * </p>
 *
 * @param ch The potential vowel.
 * @param prev The previous character.
 *
 * @return true if the character is a vowel.
 */
protected boolean vowel( char ch , char prev )
{
    if ( CharUtils.isEnglishVowel( ch ) )
    {
        return true;
    }

    // Not a plain vowel: "y" counts only after a non-vowel.
    return ( ch == 'y' ) && !CharUtils.isEnglishVowel( prev );
}
/** Determine if character is a digit.
 *
 * <p>
 * Delegates to {@code CharUtils.isDigit}. {@code stripSuffixes} uses
 * this to find the digit that ends the matched part of a rule.
 * </p>
 *
 * @param ch The character to check.
 *
 * @return The result of {@code CharUtils.isDigit( ch )} -- presumably
 *         true for '0' .. '9'; the exact set is defined by CharUtils
 *         (TODO confirm).
 */
protected boolean isDigit( char ch )
{
return CharUtils.isDigit( ch );
}
/** Determine if character is a letter.
 *
 * <p>
 * Delegates to {@code CharUtils.isLetter}.
 * </p>
 *
 * @param ch The character to check.
 *
 * @return The result of {@code CharUtils.isLetter( ch )}.
 *         NOTE(review): {@code stripSuffixes} additionally checks
 *         'a' .. 'z' after calling this method, which suggests the
 *         accepted set may be wider than lower-case ASCII letters --
 *         confirm against CharUtils.
 */
protected boolean isLetter( char ch )
{
return CharUtils.isLetter( ch );
}
/** Maps a lower case letter to its zero-based alphabet position.
 *
 * @param ch The character. Must be in the range 'a' .. 'z' for the
 *           result to be a valid index; no range check is made.
 *
 * @return The distance of {@code ch} from 'a', so 'a' gives 0.
 */
protected int charCode( char ch )
{
    final int offsetFromA = ch - 'a';

    return offsetFromA;
}
/** Removes a prefix from a string.
 *
 * <p>
 * The entries of {@code prefixes} are tried in array order against a
 * lower-cased copy of the string. The first entry that matches and is
 * shorter than the string is removed; the remainder keeps its original
 * case. At most one prefix is removed.
 * </p>
 *
 * @param s The string from which to remove prefixes.
 *
 * @return The string with the prefix removed, or the string itself
 *         when no prefix applies.
 */
protected String stripPrefixes( String s )
{
    // Fixed locale so that matching does not depend on the JVM default
    // locale (a Turkish default would turn "I" into a dotless i and
    // "MILLI..." would no longer match "milli").
    final String lowered = s.toLowerCase( Locale.ENGLISH );

    for ( String prefix : prefixes )
    {
        // Remove the prefix only if something is left after it.
        if ( lowered.startsWith( prefix ) &&
             ( lowered.length() > prefix.length() ) )
        {
            return s.substring( prefix.length() );
        }
    }

    return s;
}
/** Remove non-letters from a string.
 *
 * <p>
 * Keeps, in order, every character for which {@code isLetter} returns
 * true.
 * </p>
 *
 * @param s String from which to remove non-letters.
 *
 * @return String with non-letters removed.
 */
protected String clean( String s )
{
    // The buffer never leaves this method, so the unsynchronized
    // StringBuilder is sufficient.
    final StringBuilder letters = new StringBuilder( s.length() );

    for ( int i = 0 ; i < s.length() ; i++ )
    {
        final char ch = s.charAt( i );

        if ( isLetter( ch ) )
        {
            letters.append( ch );
        }
    }

    return letters.toString();
}
/** Stem a specified string.
 *
 * <p>
 * Strings of three characters or fewer are returned unchanged. For
 * longer strings a prefix is removed first when prefix stripping was
 * requested; suffixes are then stripped if what remains is still
 * longer than three characters.
 * </p>
 *
 * @param s The string to stem.
 *
 * @return The stemmed string.
 */
public String stem( String s )
{
    String word = s;

    // Prefix removal: only for words longer than three characters
    // and only when requested.
    if ( ( word.length() > 3 ) && preStrip )
    {
        word = stripPrefixes( word );
    }

    // Too short for suffix removal: hand back what we have.
    if ( word.length() <= 3 )
    {
        return word;
    }

    return stripSuffixes( word );
}
}