package cc.mallet.share.upenn.ner; import java.util.regex.*; import cc.mallet.pipe.*; import cc.mallet.types.*; /** * Matches a regular expression which spans several tokens. */ public class LongRegexMatches extends Pipe implements java.io.Serializable { String name; Pattern regex; int min; // how many tokens to merge for a match int max; public LongRegexMatches (String featureName, Pattern regex, int min, int max) { this.name = featureName; this.regex = regex; this.min = min; this.max = max; } public Instance pipe (Instance carrier) { TokenSequence ts = (TokenSequence) carrier.getData(); boolean[] marked = new boolean[ts.size()]; // avoid setting features twice for (int i=0; i < ts.size(); i++) { // On reaching a new token, test all strings with at least // min tokens which end in the new token. StringBuffer sb = new StringBuffer(); // start by testing rightmost suffix, and grow leftward for (int length = 1; length <= max; length++) { int loc = i - length + 1; if (loc < 0) break; // take another token sb.insert(0, ts.get(loc).getText()); // else prepend token // On a match, mark all participating tokens. if (length >= min && regex.matcher(sb.toString()).matches()) { for (int j=0; j<length; j++) marked[loc+j] = true; } } } // Set feature on all tokens participating in any match for (int i=0; i < ts.size(); i++) if (marked[i]) ts.get(i).setFeatureValue(name, 1.0); return carrier; } }