/*
* Created on Feb 18, 2005
*
* TODO To change the template for this generated file go to
* Window - Preferences - Java - Code Style - Code Templates
*/
package iitb.Model;
import java.util.regex.Pattern;
import iitb.CRF.DataSequence;
/**
* @author imran
*
*/
public class RegexCountFeatures extends FeatureTypes {
String patternString[][] = {
{"isInitCapitalWord", "[A-Z][a-z]+" },
{"isAllCapitalWord", "[A-Z][A-Z]+" },
{"isAllSmallCase", "[a-z]+" },
//{"isWord", "[a-zA-Z][a-zA-Z]+" },
//{"isAlphaNumeric", "[a-zA-Z0-9]+" },
{"singleCapLetter", "[A-Z]" },
{"containsDashes", ".*--.*"},
{"containsDash", ".*\\-.*" },
//{"singlePunctuation", "\\p{Punct}" },
{"singleDot", "[.]" },
{"singleComma", "[,]" },
{"singleQuote", "[']" },
{"isSpecialCharacter", "[#;:\\-/<>'\"()&]"},
{"fourDigits", "\\d\\d\\d\\d" },
{"isDigits", "\\d+" },
{"containsDigit", ".*\\d+.*" },
{"endsWithDot", "\\p{Alnum}+\\." }
};
Pattern p[];
int patternOccurence[], index, maxSegmentLength;
/**
* @param m
*/
public RegexCountFeatures(FeatureGenImpl m, int maxSegmentLength) {
this(m,maxSegmentLength,null);
}
public RegexCountFeatures(FeatureGenImpl m, int maxSegmentLength, String patternFile) {
super(m);
this.maxSegmentLength = maxSegmentLength;
if ((patternFile != null)&& (patternFile.length()>0))
patternString = ConcatRegexFeatures.getPatterns(patternFile);
assert(patternString != null);
p = new Pattern[patternString.length];
for(int i = 0; i < patternString.length; i++)
p[i] = Pattern.compile(patternString[i][1]);
patternOccurence = new int[patternString.length];
}
public boolean startScanFeaturesAt(DataSequence data, int prevPos, int pos) {
int i, j;
for(j = 0; j < patternOccurence.length; j++)
patternOccurence[j] = 0;
for(i = prevPos + 1; i <= pos; i++){
for(j = 0; j < p.length; j++){
if(p[j].matcher(data.x(i).toString()).matches())
patternOccurence[j]++;
}
}
index = -1;
return advance();
}
protected boolean advance() {
while(++index < (patternOccurence.length) && patternOccurence[index] <= 0);
return index < patternOccurence.length;
}
public boolean hasNext() {
return index < patternOccurence.length;
}
public void next(FeatureImpl f) {
f.val = 1;
patternOccurence[index] = Math.min(maxSegmentLength,patternOccurence[index]);
f.strId.id = maxSegmentLength * (index+1) + patternOccurence[index];
f.ystart = -1;
if(featureCollectMode()){
f.strId.name = patternString[index][0] + "_Count_" + patternOccurence[index];
//System.out.println((String)f.strId.name +" " +index + " " + f.strId.id);
}
advance();
}
@Override
public int labelIndependentId(FeatureImpl f) {
return f.id;
}
@Override
public int maxFeatureId() {
return maxSegmentLength*(patternString.length+1);
}
@Override
public String name() {
return "RC";
}
}