package edu.northwestern.at.utils.corpuslinguistics.stemmer; /* Please see the license information in the header below. */ import java.io.*; import java.lang.*; import java.util.*; import edu.northwestern.at.utils.CharUtils; /** LancasterStemmer: Implements the Lancaster (Paice/Husk) word stemmer. * * <p> * Paice/Husk Stemmer - License Statement. * </p> * * <p> * This software was designed and developed at Lancaster * University, Lancaster, UK, under the supervision of Dr Chris Paice. * It is fully in the public domain, and may be used or adapted by any * organisation or individual. Neither Dr Paice nor Lancaster * University accepts any responsibility whatsoever for its use by * other parties, and makes no guarantees, expressed or implied, about * its quality, reliability, or any other characteristic. * </p> * * <p> * It is assumed that, as a matter of professional courtesy, anyone * who incorporates this software into a system of their own, whether * for commercial or research purposes, will acknowledge the source of * the code. * </p> * * <p> * Modified from the original Java programs written by Christopher O'Neill * and Rob Hooper. * </p> */ public class LancasterStemmer implements Stemmer { /** Prefixes to remove from words before stemming. */ public static final String[] prefixes = { "intra" , "kilo" , "mega" , "micro" , "milli" , "nano" , "pico" , "pseudo" , "ultra" , }; /** Default stemming rules. * * <p> * These rules MUST be stored in ascending alphanumeric order * of the first character. * </p> */ public static final String[] defaultStemmingRules = new String[] { "ai*2. { -ia > - if intact }", "a*1. { -a > - if intact }", "bb1. { -bb > -b }", "city3s. { -ytic > -ys }", "ci2> { -ic > - }", "cn1t> { -nc > -nt }", "dd1. { -dd > -d }", "dei3y> { -ied > -y }", "deec2ss. { -ceed > -cess }", "dee1. { -eed > -ee }", "de2> { -ed > - }", "dooh4> { -hood > - }", "e1> { -e > - }", "feil1v. { -lief > -liev }", "fi2> { -if > - }", "gni3> { -ing > - }", "gai3y. { -iag > -y }", "ga2> { -ag > - }", "gg1. { -gg > -g }", "ht*2. { -th > - if intact }", "hsiug5ct. { -guish > -ct }", "hsi3> { -ish > - }", "i*1. { -i > - if intact }", "i1y> { -i > -y }", "ji1d. { -ij > -id -- see nois4j> & vis3j> }", "juf1s. { -fuj > -fus }", "ju1d. { -uj > -ud }", "jo1d. { -oj > -od }", "jeh1r. { -hej > -her }", "jrev1t. { -verj > -vert }", "jsim2t. { -misj > -mit }", "jn1d. { -nj > -nd }", "j1s. { -j > -s }", "lbaifi6. { -ifiabl > - }", "lbai4y. { -iabl > -y }", "lba3> { -abl > - }", "lbi3. { -ibl > - }", "lib2l> { -bil > -bl }", "lc1. { -cl > c }", "lufi4y. { -iful > -y }", "luf3> { -ful > - }", "lu2. { -ul > - }", "lai3> { -ial > - }", "lau3> { -ual > - }", "la2> { -al > - }", "ll1. { -ll > -l }", "mui3. { -ium > - }", "mu*2. { -um > - if intact }", "msi3> { -ism > - }", "mm1. { -mm > -m }", "nois4j> { -sion > -j }", "noix4ct. { -xion > -ct }", "noi3> { -ion > - }", "nai3> { -ian > - }", "na2> { -an > - }", "nee0. { protect -een }", "ne2> { -en > - }", "nn1. { -nn > -n }", "pihs4> { -ship > - }", "pp1. { -pp > -p }", "re2> { -er > - }", "rae0. { protect -ear }", "ra2. { -ar > - }", "ro2> { -or > - }", "ru2> { -ur > - }", "rr1. { -rr > -r }", "rt1> { -tr > -t }", "rei3y> { -ier > -y }", "sei3y> { -ies > -y }", "sis2. { -sis > -s }", "si2> { -is > - }", "ssen4> { -ness > - }", "ss0. { protect -ss }", "suo3> { -ous > - }", "su*2. { -us > - if intact }", "s*1> { -s > - if intact }", "s0. { -s > -s }", "tacilp4y. { -plicat > -ply }", "ta2> { -at > - }", "tnem4> { -ment > - }", "tne3> { -ent > - }", "tna3> { -ant > - }", "tpir2b. { -ript > -rib }", "tpro2b. { -orpt > -orb }", "tcud1. { -duct > -duc }", "tpmus2. { -sumpt > -sum }", "tpec2iv. { -cept > -ceiv }", "tulo2v. { -olut > -olv }", "tsis0. { protect -sist }", "tsi3> { -ist > - }", "tt1. { -tt > -t }", "uqi3. { -iqu > - } ", "ugo1. { -ogu > -og }", "vis3j> { -siv > -j }", "vie0. { protect -eiv }", "vi2> { -iv > - }", "ylb1> { -bly > -bl }", "yli3y> { -ily > -y }", "ylp0. { protect -ply }", "yl2> { -ly > - }", "ygo1. { -ogy > -og }", "yhp1. { -phy > -ph }", "ymo1. { -omy > -om }", "ypo1. { -opy > -op }", "yti3> { -ity > - }", "yte3> { -ety > - }", "ytl2. { -lty > -l }", "yrtsi5. { -istry > - }", "yra3> { -ary > - }", "yro3> { -ory > - }", "yfi3. { -ify > - }", "ycn2t> { -ncy > -nt }", "yca3> { -acy > - }", "zi2> { -iz > - }", "zy1s. { -yz > -ys }", "end0." }; /** Character for "0" digit. */ protected final static char zeroDigit = '0'; /* Array of rules. */ protected Vector<String> ruleTable; /* Index to rule table. * * <p> * For each letter 'a' through 'z', contains the index in * ruleTable for the first rule beginning with the * corresponding letter. Position 0 is for letter 'a', * position 1 for letter 'b', and so on. In the default table above, * ruleTableIndex[ 0 ] = 0, ruleTableIndex[ 1 ] = 2, etc. * The index for Letters without a rule are assigned the index * of the next letter which has a rule. * </p> */ protected int[] ruleTableIndex; /* True to remove prefixes when word length is greater than two. */ protected boolean preStrip; /** Create a Paice/Husk stemmer using the default stemming rules. * * @throws StemmerException if something goes wrong. * * <p> * Prefixes are automatically removed from words with more than * two characters. * </p> */ public LancasterStemmer() { this.preStrip = true; loadRules( defaultStemmingRules ); } /** Create a Paice/Husk stemmer from a string list of rules. * * @param rules The stemming rules as an array of String. * * <p> * Prefixes are automatically removed from words with more than * two characters. * </p> */ public LancasterStemmer( String[] rules ) { this.preStrip = true; loadRules( rules ); } /** Create a Paice/Husk stemmer from a string list of rules. * * @param rules The stemming rules as an array of String. * @param preStrip True to remove prefixes from words with * more than two characters. * * <p> * Prefixes are automatically removed from words with more than * two characters. * </p> */ public LancasterStemmer( String[] rules , boolean preStrip ) { this.preStrip = preStrip; loadRules( rules ); } /** Loads the stemming rules. * * @param rules String array of rules. */ protected void loadRules( String[] rules ) { // Table of rules. ruleTable = new Vector<String>(); // Maps letter to index of first rule // in rule table starting with that letter. ruleTableIndex = new int[ 26 ]; for ( int i = 0 ; i < 25 ; i++ ) { ruleTableIndex[ i ] = 0; } // Loop over rules and add each // to rule table. for ( int i = 0 ; i < rules.length ; i++ ) { // Remove blanks from rule and add it // to rule table. ruleTable.addElement( rules[ i ].replaceAll( " " , "" ) ); } // Get starting index of rule // for each letter. Letters without // any rules get the index of the // next letter with a rule. char ch = 'a'; for ( int i = 0 ; i < ( rules.length - 1 ) ; i++ ) { while( ((String)ruleTable.elementAt( i )).charAt( 0 ) != ch ) { ch++; ruleTableIndex[ charCode( ch ) ] = i; } } } /** Returns index of first vowel in string. * * @param s String to search for vowel. * @param last Last position to search for vowel. * * @return Zero-based index of first vowel in string. */ protected int firstVowel( String s , int last ) { char prevChar = 'a'; int i; for ( i = 0 ; ( i < last ) && ( !( vowel( s.charAt( i ) , prevChar ) ) ) ; i++ ) { prevChar = s.charAt( i ); } return Math.min( i , last ); } /** Strip suffixes from a string. * * @param s The string from which to remove suffixes. * * @return The string with suffixes removed. */ protected String stripSuffixes( String s ) { // Is the current rule OK. int ruleOK = 0; // Are we done stemming a string. int done = 0; // Position of last letter in string. int lastLetterPos = 0; // Counter for number of characters // to be replaced and length of stemmed // string if rule was applied. int replacedCharCount = 0; // Position of first vowel in string. int firstVowelPos = 0; // Index into rule table. int currentRuleIndex = 0; // Index of current rule. int ruleCharPos = 0; // Index of word. int wordCharPos = 0; // Last letter in string. char lastLetter = 0; // Holds current stemming rule. String rule = ""; // True if the input string has not yet // been stemmed. boolean intact = true; // "stem" contains the stemmed input // string in as the stemming process // proceeds. // // Start by cleaning the input string // of non-letters. String stem = clean( s.toLowerCase() ); // Set lastLetterPos to the index of the // last letter in the string. Normally // we will have removed all non-letters // from the string before we get here, // so usually posLastletter will just be // one less than the length of the string. lastLetterPos = 0; while ( ( ( lastLetterPos + 1 ) < stem.length() ) && isLetter( stem.charAt( lastLetterPos + 1 ) ) ) { lastLetterPos++; } if ( lastLetterPos < 1 ) { done = -1; } else // Find position of first vowel in string. { firstVowelPos = firstVowel( stem , lastLetterPos ); wordCharPos = stem.length() - 1; } // Repeat rule processing until // no more rules apply, i.e., // stemming is complete. while ( done != -1 ) { // Look for rule for new final letter. done = 0; // Get last letter in string. lastLetter = stem.charAt( lastLetterPos ); // Are there are any possible rules // for stemming for this letter? if ( isLetter( lastLetter ) && ( lastLetter >= 'a' ) && ( lastLetter <= 'z' ) ) { currentRuleIndex = ruleTableIndex[ charCode( lastLetter ) ]; } else { currentRuleIndex = -1; } // No rule available -- stemming done. if ( currentRuleIndex == -1 ) { done = -1; continue; } // Pick up first pontentially matching // rule. rule = (String)ruleTable.elementAt( currentRuleIndex ); while ( done == 0 ) { ruleOK = 0; if ( rule.charAt( 0 ) != lastLetter ) { // Rule letter changed. We're done // with this letter. done = -1; ruleOK = -1; } // Index in rule: second character. ruleCharPos = 1; // Index in stemmed string: // next to last letter. wordCharPos = lastLetterPos - 1; // Loop over rules and try to find // a rule that is acceptable. while ( ruleOK == 0 ) { // Is rule fully matched? if ( isDigit( rule.charAt( ruleCharPos ) ) ) { ruleOK = 1; } else if ( rule.charAt( ruleCharPos ) == '*' ) { // Match only if word intact. if ( intact ) { // Move forwards in rule. ruleCharPos++; ruleOK = 1; } else { ruleOK = -1; } } // Mismatch of letters. else if ( rule.charAt( ruleCharPos ) != stem.charAt( wordCharPos ) ) { ruleOK = -1; } // Insufficient stem remaining. else if ( wordCharPos <= firstVowelPos ) { ruleOK = -1; } // Compare next pair of letters. // Move forwards in rule and // backwards in string. else { ruleCharPos++; wordCharPos--; } } // If the rule that has just been checked // is valid for the current stem value, // check the acceptability conditions // for the current stem value. if ( ruleOK == 1 ) { // Count replacement letters. replacedCharCount = 0; while ( !( ( rule.charAt( ruleCharPos + replacedCharCount + 1 ) >= '.' ) && ( rule.charAt( ruleCharPos + replacedCharCount + 1 ) <= '>' ) ) ) { replacedCharCount++; } replacedCharCount = lastLetterPos + replacedCharCount + zeroDigit - ( (int)( rule.charAt( ruleCharPos ) ) ); // Position of last letter if rule used. if ( firstVowelPos == 0 ) { // If word starts with vowel... if ( replacedCharCount < 1 ) { // ... minimal stem is 2 letters. ruleOK = -1; } } // If word starts with a consonant, // minimal stem is 3 letters // including one or more vowels. else if ( ( replacedCharCount < 2 ) || ( replacedCharCount < firstVowelPos ) ) { ruleOK = -1; } } // If using rule passes the assertion // tests, apply the matching rule. if ( ruleOK == 1 ) { // Input string is no longer intact. intact = false; // Move end of string marker to position // given by the numeral in the rule. lastLetterPos = lastLetterPos + zeroDigit - ((int)( rule.charAt( ruleCharPos ) ) ); ruleCharPos++; stem = stem.substring( 0 , ( lastLetterPos + 1 ) ); // Append any letters following numeral // to the string. while ( ( ruleCharPos < rule.length() ) && isLetter( rule.charAt( ruleCharPos ) ) ) { stem += rule.charAt( ruleCharPos ); ruleCharPos++; lastLetterPos++; } // Rule ends with '.'. We're done. if ( ( rule.charAt( ruleCharPos ) ) == '.' ) { done = -1; } else { // Here if rule ends with '>'. Continue. done = 1; } } else { // Rule did not match. // Try next rule in rule table. currentRuleIndex++; rule = (String)ruleTable.elementAt( currentRuleIndex ); // When the initial letter changes, // there are no more rules to try. if ( rule.charAt( 0 ) != lastLetter ) { done = -1; } } } } return stem; } /** Determine if character is a vowel or not. * * @param ch The potential vowel. * @param prev The previous character. * * @return true if the character is a vowel. * * <p> * When the character is a "y", the previous character is * checked to see if it is a vowel. If so, "y" is not considered * a vowel. * </p> */ protected boolean vowel( char ch , char prev ) { boolean result = CharUtils.isEnglishVowel( ch ); if ( !result && ( ch == 'y' ) ) { result = !CharUtils.isEnglishVowel( prev ); } return result; } /** Determine if character is a digit. * * @param ch The character to check. * * @return true if "ch" is a digit ('0' .. '9'). */ protected boolean isDigit( char ch ) { return CharUtils.isDigit( ch ); } /** Determine if character is a letter. * * @param ch The character to check. * * @return true if "ch" is a letter ('a' .. 'z'). */ protected boolean isLetter( char ch ) { return CharUtils.isLetter( ch ); } /** Converts a lower case letter to an index. * * @param ch The character. Must be in the range 'a' .. 'z'. * * @return The index, where 'a' = 0 . */ protected int charCode( char ch ) { return ( (int)ch ) - 'a'; } /** Removes prefixes from a string. * * @param s The string from which to remove prefixes. * * @return The string with prefixes removed. */ protected String stripPrefixes( String s ) { String result = s; String sLower = s.toLowerCase(); // Remove any prefix from string // as long as the string is longer // than the prefix. for ( int i = 0 ; i < prefixes.length ; i++ ) { if ( ( sLower.startsWith( prefixes[ i ] ) ) && ( sLower.length() > prefixes[ i ].length() ) ) { result = s.substring( prefixes[ i ].length() ); break; } } return result; } /** Remove non-letters from a string. * * @param s String from which to remove non-letters. * * @return String with non-letters removed. */ protected String clean( String s ) { StringBuffer result = new StringBuffer(); for ( int i = 0 ; i < s.length() ; i++ ) { if ( isLetter( s.charAt( i ) ) ) { result.append( s.charAt( i ) ); } } return result.toString(); } /** Stem a specified string. * * @param s The string to stem. * * @return The stemmed string. */ public String stem( String s ) { // Copy input string to be stemmed. String result = s; // Remove prefixes if the input string // is longer than three characters and // prefix stripping was requested. if ( ( result.length() > 3 ) && preStrip ) { result = stripPrefixes( result ); } // Remove suffixes if the string // is longer than three characters. if ( result.length() > 3 ) { result = stripSuffixes( result ); } return result; } }