package cc.mallet.share.upenn.ner;
import java.util.*;
import cc.mallet.pipe.*;
import cc.mallet.types.*;
/**
* A feature approximating string length.
*/
public class LengthBins extends Pipe implements java.io.Serializable {
String name;
int[] bins;
String[] binNames;
/**
* <p>bins contains the maximum sizes of elements in each bin.
* <p>For example, passing in {1,3,7} would produce 4 bins, for strings
* of lengths 1, 2-3, 4-7, and 8+.
*/
public LengthBins (String featureName, int[] binMaxes) {
this.name = featureName;
this.bins = binMaxes;
Arrays.sort(bins);
int min = 1;
binNames = new String[bins.length+1];
for (int i=0; i<bins.length; i++) {
binNames[i] = (min == bins[i] ? "["+min+"]" :
"["+min+"-"+bins[i]+"]");
min = bins[i]+1;
}
binNames[bins.length] = "["+min+"+]";
}
public Instance pipe (Instance carrier) {
TokenSequence ts = (TokenSequence) carrier.getData();
tokens:
for (int i=0; i < ts.size(); i++) {
Token t = ts.get(i);
int length = t.getText().length();
for (int j=0; j<bins.length; j++)
if (length <= bins[j]) {
t.setFeatureValue(name+"="+binNames[j], 1.0);
continue tokens;
}
t.setFeatureValue(name+"="+binNames[bins.length], 1.0);
}
return carrier;
}
}