package edu.stanford.nlp.international.arabic.process;
import java.util.ArrayList;
import java.util.Collection;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.sequences.Clique;
import edu.stanford.nlp.sequences.FeatureFactory;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.util.Characters;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PaddedList;
/**
* Feature factory for the IOB clitic segmentation model described by
* Green and DeNero (2012).
*
* @author Spence Green
*
* @param <IN>
*/
public class ArabicSegmenterFeatureFactory<IN extends CoreLabel> extends FeatureFactory<IN> {
private static final long serialVersionUID = -4560226365250020067L;
private static final String DOMAIN_MARKER = "@";
private static final int MAX_BEFORE = 5;
private static final int MAX_AFTER = 9;
private static final int MAX_LENGTH = 10;
public void init(SeqClassifierFlags flags) {
super.init(flags);
}
/**
* Extracts all the features from the input data at a certain index.
*
* @param cInfo The complete data set as a List of WordInfo
* @param loc The index at which to extract features.
*/
public Collection<String> getCliqueFeatures(PaddedList<IN> cInfo, int loc, Clique clique) {
Collection<String> features = Generics.newHashSet();
if (clique == cliqueC) {
addAllInterningAndSuffixing(features, featuresC(cInfo, loc), "C");
} else if (clique == cliqueCpC) {
addAllInterningAndSuffixing(features, featuresCpC(cInfo, loc), "CpC");
} else if (clique == cliqueCp2C) {
addAllInterningAndSuffixing(features, featuresCp2C(cInfo, loc), "Cp2C");
} else if (clique == cliqueCp3C) {
addAllInterningAndSuffixing(features, featuresCp3C(cInfo, loc), "Cp3C");
}
String domain = cInfo.get(loc).get(CoreAnnotations.DomainAnnotation.class);
if (domain != null) {
Collection<String> domainFeatures = Generics.newHashSet();
for (String feature : features) {
domainFeatures.add(feature + DOMAIN_MARKER + domain);
}
features.addAll(domainFeatures);
}
return features;
}
protected Collection<String> featuresC(PaddedList<IN> cInfo, int loc) {
Collection<String> features = new ArrayList<>();
CoreLabel c = cInfo.get(loc);
CoreLabel n = cInfo.get(loc + 1);
CoreLabel n2 = cInfo.get(loc + 2);
CoreLabel p = cInfo.get(loc - 1);
CoreLabel p2 = cInfo.get(loc - 2);
String charc = c.get(CoreAnnotations.CharAnnotation.class);
String charn = n.get(CoreAnnotations.CharAnnotation.class);
String charn2 = n2.get(CoreAnnotations.CharAnnotation.class);
String charp = p.get(CoreAnnotations.CharAnnotation.class);
String charp2 = p2.get(CoreAnnotations.CharAnnotation.class);
// Default feature set...a 5 character window
// plus a few other language-independent features
features.add(charc +"-c");
features.add(charn + "-n1");
features.add(charn2 + "-n2" );
features.add(charp + "-p");
features.add(charp2 + "-p2");
// Length feature
if (charc.length() > 1) {
features.add("length");
}
// Character-level class features
boolean seenPunc = false;
boolean seenDigit = false;
for (int i = 0, limit = charc.length(); i < limit; ++i) {
char charcC = charc.charAt(i);
seenPunc = seenPunc || Characters.isPunctuation(charcC);
seenDigit = seenDigit || Character.isDigit(charcC);
String cuBlock = Characters.unicodeBlockStringOf(charcC);
features.add(cuBlock + "-uBlock");
String cuType = String.valueOf(Character.getType(charcC));
features.add(cuType + "-uType");
}
if (seenPunc) features.add("haspunc");
if (seenDigit) features.add("hasdigit");
// Token-level features
String word = c.word();
int index = c.index();
features.add(Math.min(MAX_BEFORE, index) + "-before");
features.add(Math.min(MAX_AFTER, word.length() - charc.length() - index) + "-after");
features.add(Math.min(MAX_LENGTH, word.length()) + "-length");
// Indicator transition feature
features.add("cliqueC");
return features;
}
protected Collection<String> featuresCpC(PaddedList<IN> cInfo, int loc) {
Collection<String> features = new ArrayList<>();
CoreLabel c = cInfo.get(loc);
CoreLabel p = cInfo.get(loc - 1);
String charc = c.get(CoreAnnotations.CharAnnotation.class);
String charp = p.get(CoreAnnotations.CharAnnotation.class);
features.add(charc + charp + "-cngram");
// Indicator transition feature
features.add("cliqueCpC");
return features;
}
protected Collection<String> featuresCp2C(PaddedList<IN> cInfo, int loc) {
Collection<String> features = new ArrayList<>();
CoreLabel c = cInfo.get(loc);
CoreLabel p = cInfo.get(loc - 1);
CoreLabel p2 = cInfo.get(loc - 2);
String charc = c.get(CoreAnnotations.CharAnnotation.class);
String charp = p.get(CoreAnnotations.CharAnnotation.class);
String charp2 = p2.get(CoreAnnotations.CharAnnotation.class);
features.add(charc + charp + charp2 + "-cngram");
// Indicator transition feature
features.add("cliqueCp2C");
return features;
}
protected Collection<String> featuresCp3C(PaddedList<IN> cInfo, int loc) {
Collection<String> features = new ArrayList<>();
CoreLabel c = cInfo.get(loc);
CoreLabel p = cInfo.get(loc - 1);
CoreLabel p2 = cInfo.get(loc - 2);
CoreLabel p3 = cInfo.get(loc - 3);
String charc = c.get(CoreAnnotations.CharAnnotation.class);
String charp = p.get(CoreAnnotations.CharAnnotation.class);
String charp2 = p2.get(CoreAnnotations.CharAnnotation.class);
String charp3 = p3.get(CoreAnnotations.CharAnnotation.class);
features.add(charc + charp + charp2 + charp3 + "-cngram");
// Indicator transition feature
features.add("cliqueCp3C");
return features;
}
}