// ArabicSegmenterFeatureFactory.java example (CoreNLP-master)
package edu.stanford.nlp.international.arabic.process;

import java.util.ArrayList;
import java.util.Collection;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.sequences.Clique;
import edu.stanford.nlp.sequences.FeatureFactory;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.util.Characters;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PaddedList;

/**
 * Feature factory for the IOB clitic segmentation model described by
 * Green and DeNero (2012).
 * <p>
 * Features are derived from the {@code CharAnnotation} of the labels surrounding the
 * current position. When the current label also carries a {@code DomainAnnotation},
 * each feature is emitted a second time in a domain-tagged form.
 *
 * @author Spence Green
 *
 * @param <IN> the label type of the input sequence
 */
public class ArabicSegmenterFeatureFactory<IN extends CoreLabel> extends FeatureFactory<IN> {

  private static final long serialVersionUID = -4560226365250020067L;

  /** Separator inserted between a feature and the domain name in domain-tagged features. */
  private static final String DOMAIN_MARKER = "@";
  /** Upper bound on the value reported by the "-before" feature. */
  private static final int MAX_BEFORE = 5;
  /** Upper bound on the value reported by the "-after" feature. */
  private static final int MAX_AFTER = 9;
  /** Upper bound on the value reported by the "-length" feature. */
  private static final int MAX_LENGTH = 10;

  /**
   * Initializes this factory by delegating to the superclass.
   *
   * @param flags the classifier flags handed on to {@code super.init}
   */
  public void init(SeqClassifierFlags flags) {
    super.init(flags);
  }

  /**
   * Extracts the features for one clique at one position of the input sequence.
   * <p>
   * The extractor matching {@code clique} is run and its output is handed to
   * {@code addAllInterningAndSuffixing} together with the clique's name; a clique that
   * matches none of the four known ones contributes nothing. If the label at
   * {@code loc} has a non-null {@code DomainAnnotation}, a copy of every feature with
   * {@code "@"} and the domain appended is added as well.
   *
   * @param cInfo the padded input sequence
   * @param loc the index at which to extract features
   * @param clique the clique for which features are requested
   * @return the feature strings for this position and clique
   */
  public Collection<String> getCliqueFeatures(PaddedList<IN> cInfo, int loc, Clique clique) {
    Collection<String> result = Generics.newHashSet();

    // Select the extractor first; a non-null suffix marks that a clique matched.
    Collection<String> extracted = null;
    String suffix = null;
    if (clique == cliqueC) {
      extracted = featuresC(cInfo, loc);
      suffix = "C";
    } else if (clique == cliqueCpC) {
      extracted = featuresCpC(cInfo, loc);
      suffix = "CpC";
    } else if (clique == cliqueCp2C) {
      extracted = featuresCp2C(cInfo, loc);
      suffix = "Cp2C";
    } else if (clique == cliqueCp3C) {
      extracted = featuresCp3C(cInfo, loc);
      suffix = "Cp3C";
    }
    if (suffix != null) {
      addAllInterningAndSuffixing(result, extracted, suffix);
    }

    String domain = cInfo.get(loc).get(CoreAnnotations.DomainAnnotation.class);
    if (domain == null) {
      return result;
    }

    // Collect the tagged copies separately so the set is not modified while iterating it.
    String domainSuffix = DOMAIN_MARKER + domain;
    Collection<String> tagged = Generics.newHashSet();
    for (String feature : result) {
      tagged.add(feature + domainSuffix);
    }
    result.addAll(tagged);

    return result;
  }

  /**
   * Builds the features of the single-label clique at {@code loc}: a five-position
   * character window, character-class features for the current character annotation,
   * and capped position/length features relative to the enclosing word.
   *
   * @param cInfo the padded input sequence
   * @param loc the index at which to extract features
   * @return the features in the order they were generated
   */
  protected Collection<String> featuresC(PaddedList<IN> cInfo, int loc) {
    Collection<String> feats = new ArrayList<>();
    CoreLabel current = cInfo.get(loc);
    String curChar = current.get(CoreAnnotations.CharAnnotation.class);

    // Character window: current, next two, previous two.
    feats.add(curChar + "-c");
    feats.add(charAnnotationAt(cInfo, loc + 1) + "-n1");
    feats.add(charAnnotationAt(cInfo, loc + 2) + "-n2");
    feats.add(charAnnotationAt(cInfo, loc - 1) + "-p");
    feats.add(charAnnotationAt(cInfo, loc - 2) + "-p2");

    // Fires when the character annotation spans more than one char.
    if (curChar.length() > 1) {
      feats.add("length");
    }

    // Per-char Unicode block and general-category features, plus punctuation/digit flags.
    boolean sawPunctuation = false;
    boolean sawDigit = false;
    for (char ch : curChar.toCharArray()) {
      if (!sawPunctuation && Characters.isPunctuation(ch)) {
        sawPunctuation = true;
      }
      if (!sawDigit && Character.isDigit(ch)) {
        sawDigit = true;
      }
      feats.add(Characters.unicodeBlockStringOf(ch) + "-uBlock");
      feats.add(Character.getType(ch) + "-uType");
    }
    if (sawPunctuation) {
      feats.add("haspunc");
    }
    if (sawDigit) {
      feats.add("hasdigit");
    }

    // Position of this label within its word, each value capped by its constant.
    int wordLength = current.word().length();
    int offset = current.index();
    int remaining = wordLength - curChar.length() - offset;
    feats.add(Math.min(MAX_BEFORE, offset) + "-before");
    feats.add(Math.min(MAX_AFTER, remaining) + "-after");
    feats.add(Math.min(MAX_LENGTH, wordLength) + "-length");

    // Indicator transition feature
    feats.add("cliqueC");

    return feats;
  }

  /**
   * Builds the features of the clique spanning {@code loc} and the one preceding label.
   *
   * @param cInfo the padded input sequence
   * @param loc the index at which to extract features
   * @return the character n-gram feature followed by the clique indicator
   */
  protected Collection<String> featuresCpC(PaddedList<IN> cInfo, int loc) {
    return contextNgramFeatures(cInfo, loc, 1, "cliqueCpC");
  }

  /**
   * Builds the features of the clique spanning {@code loc} and the two preceding labels.
   *
   * @param cInfo the padded input sequence
   * @param loc the index at which to extract features
   * @return the character n-gram feature followed by the clique indicator
   */
  protected Collection<String> featuresCp2C(PaddedList<IN> cInfo, int loc) {
    return contextNgramFeatures(cInfo, loc, 2, "cliqueCp2C");
  }

  /**
   * Builds the features of the clique spanning {@code loc} and the three preceding labels.
   *
   * @param cInfo the padded input sequence
   * @param loc the index at which to extract features
   * @return the character n-gram feature followed by the clique indicator
   */
  protected Collection<String> featuresCp3C(PaddedList<IN> cInfo, int loc) {
    return contextNgramFeatures(cInfo, loc, 3, "cliqueCp3C");
  }

  /** Returns the {@code CharAnnotation} of the label at {@code position} in {@code seq}. */
  private static String charAnnotationAt(PaddedList<? extends CoreLabel> seq, int position) {
    return seq.get(position).get(CoreAnnotations.CharAnnotation.class);
  }

  /**
   * Concatenates the character annotations at {@code loc}, {@code loc - 1}, ...,
   * {@code loc - history} (in that order) into one "-cngram" feature and appends the
   * given indicator feature.
   */
  private static Collection<String> contextNgramFeatures(
      PaddedList<? extends CoreLabel> seq, int loc, int history, String indicator) {
    StringBuilder ngram = new StringBuilder();
    for (int back = 0; back <= history; ++back) {
      ngram.append(charAnnotationAt(seq, loc - back));
    }
    ngram.append("-cngram");

    Collection<String> feats = new ArrayList<>();
    feats.add(ngram.toString());
    // Indicator transition feature
    feats.add(indicator);
    return feats;
  }
}