package edu.stanford.nlp.international.arabic.process;
import java.util.Collection;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.util.PaddedList;
/**
* Feature factory for the IOB clitic segmentation model described by
* Green and DeNero (2012).
*
* @author Spence Green
*
* @param <IN>
*/
public class StartAndEndArabicSegmenterFeatureFactory<IN extends CoreLabel> extends ArabicSegmenterFeatureFactory<IN> {
private static final long serialVersionUID = 6864940988019110930L;
public void init(SeqClassifierFlags flags) {
super.init(flags);
}
@Override
protected Collection<String> featuresCpC(PaddedList<IN> cInfo, int loc) {
Collection<String> features = super.featuresCpC(cInfo, loc);
CoreLabel c = cInfo.get(loc);
// "Wrapper" feature: identity of first and last two chars of the current word.
// This helps detect ma+_+sh in dialect, as well as avoiding segmenting possessive
// pronouns if the word starts with al-.
if (c.word().length() > 3) {
String start = c.word().substring(0, 2);
String end = c.word().substring(c.word().length() - 2);
if (c.index() == 2) {
features.add(start + "_" + end + "-begin-wrap");
}
if (c.index() == c.word().length() - 1) {
features.add(start + "_" + end + "-end-wrap");
}
}
return features;
}
}