/*
* Created on May 4, 2008
* @author sunita
*/
package iitb.Model;
import iitb.CRF.DataSequence;
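/**
* Feature type that emits one indicator feature per token describing the token's shape:
* the character class of each of its first {@code numPos} characters (three by default)
* plus a flag for whether the token is longer than that. Shapes that are not in the fixed
* list of allowed patterns all map to feature id 0, which is named "Other".
*
* @author Sunita Sarawagi
* @since 1.3
* @version 1.3
*/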
public class TokenShapeFeatures extends FeatureTypes {
    /** Serialization version identifier. */
    private static final long serialVersionUID = 7648807821467697829L;

    /**
     * Character classes, tested in row order. Each row is {low, high, symbol}; a character
     * belongs to a row when low &lt;= c &lt;= high. The final row is never range-tested: its
     * index is the fallback class for characters that no earlier row accepts. The symbol
     * column is not read by any active code in this class.
     */
    char pattern[][] = {
        {'a','z','x'},
        {'A','Z','X'},
        {'0','9','d'},
        {'.','.','.'},
        {',',',',','},
        {'-','-','-'},
        {'\'','\'','\''},
        {'"','"','"'},
        {'o','o','o'}
    };

    /**
     * Shapes that receive their own feature. Each entry is itself run through
     * {@link #getPatternId(String)} as if it were a token, so an entry stands for every token
     * that yields the same code. The entry at index i is assigned feature id i+1.
     */
    String allowedPatterns[]={
        "X", "XX", "XXX", "XXX+"
        ,"x","xx","xxx","xxx+"
        ,"Xx", "Xxx", "Xxx+"
        ,".", ",", "-"
        ,"X."
        ,"1", "11", "111", "111+"
        ,"1.1"
        ,"1,1+", "11,+", "1.1+", "11.+","11-+","11#"
        ,"'s", "\"", "'"
    };

    /**
     * Lookup table indexed by the code from {@link #getPatternId(String)}. Holds the 1-based
     * position of the matching entry in {@link #allowedPatterns}, or 0 when the shape is not
     * one of the allowed patterns.
     */
    int idToFeatureIdMap[];

    /** Number of leading characters of a token that contribute to its shape code. */
    int numPos=3;

    /** Token text waiting to be emitted as a feature; null when nothing is pending. */
    String word=null;

    /**
     * Builds the shape-code lookup table from the allowed patterns.
     *
     * @param fgen feature generator handed to the superclass constructor
     */
    public TokenShapeFeatures(FeatureGenImpl fgen) {
        super(fgen);
        // One slot per possible code: (classes + "no character") ^ numPos, doubled for the
        // trailing "longer than numPos" bit.
        final int classesPerPosition = pattern.length + 1;
        final int tableSize = (int) Math.pow(classesPerPosition, numPos) * 2;
        idToFeatureIdMap = new int[tableSize];

        int featureId = 1;
        for (String shape : allowedPatterns) {
            idToFeatureIdMap[getPatternId(shape)] = featureId;
            featureId++;
        }
    }

    /**
     * Encodes the shape of a token as an integer.
     * <p>
     * The first {@code numPos} characters are each mapped to a class index and combined as
     * digits of a base-{@code (pattern.length + 1)} number; positions past the end of the
     * token use the extra digit {@code pattern.length}. The result is then doubled, and 1 is
     * added when the token has more than {@code numPos} characters.
     *
     * @param word token text to encode
     * @return the shape code
     */
    private int getPatternId(String word) {
        final int length = word.length();
        final int radix = pattern.length + 1;

        int code = 0;
        for (int pos = 0; pos < numPos; pos++) {
            final int digit;
            if (pos < length) {
                digit = findMatchPosition(word.charAt(pos));
            } else {
                // No character at this position.
                digit = pattern.length;
            }
            code = code * radix + digit;
        }

        // Lowest bit records whether the token extends beyond the encoded prefix.
        final int shifted = code * 2;
        return (length > numPos) ? shifted + 1 : shifted;
    }

    /**
     * @return true while a scanned token has not yet been consumed by
     *         {@link #next(FeatureImpl)}
     */
    @Override
    public boolean hasNext() {
        return null != word;
    }

    /**
     * Fills in the shape feature for the pending token and marks the token as consumed.
     * The feature id is the table entry for the token's shape code (0 when the shape is not
     * an allowed pattern) and the value is always 1. The name is written only in
     * feature-collect mode.
     *
     * @param f feature object to populate
     */
    @Override
    public void next(FeatureImpl f) {
        f.strId.id = idToFeatureIdMap[getPatternId(word)];
        final int featureId = f.strId.id;

        if (featureCollectMode()) {
            f.strId.name = (featureId > 0) ? allowedPatterns[featureId - 1] : "Other";
        }

        f.val=1;
        word=null;
    }

    /**
     * Finds the class index of a character: the first row of {@link #pattern}, excluding the
     * last, whose [low, high] range contains it; otherwise the index of the last row.
     *
     * @param c character to classify
     * @return row index into {@link #pattern}
     */
    private int findMatchPosition(char c) {
        final int fallback = pattern.length - 1;
        int row = 0;
        while (row < fallback) {
            final char[] range = pattern[row];
            if ((range[0] <= c) && (range[1] >= c)) {
                return row;
            }
            row++;
        }
        return fallback;
    }

    /**
     * Stores the text of the token at {@code pos} as the pending word.
     *
     * @param data    sequence being scanned
     * @param prevPos previous position; asserted to be exactly one before {@code pos}
     * @param pos     position of the token to featurize
     * @return the result of {@link #hasNext()} after the token is stored
     */
    @Override
    public boolean startScanFeaturesAt(DataSequence data, int prevPos, int pos) {
        assert pos - prevPos == 1;
        word = data.x(pos).toString();
        return hasNext();
    }

    /**
     * @return one more than the number of allowed patterns
     */
    @Override
    public int maxFeatureId() {
        return 1 + allowedPatterns.length;
    }

    /**
     * @return the fixed name of this feature type, {@code "Shape_"}
     */
    @Override
    public String name() {
        return "Shape_";
    }
}