MweFeature.java example

Explorer

kpe-master
- src
  - edu
    - stanford
      - nlp
        pipeline
        HunTokenizerAnnotator.java
        MweDictAnnotator.java
        MyCleanXmlAnnotator.java
        NormalizerAnnotator.java
        OwnMorphaAnnotator.java
        OwnPOSTaggerAnnotator.java
        StopWordAnnotator.java
        SzTEAnnotationPipeline.java
        SzTECoreNLP.java
        process
        HunPTBLexer.java
        HunTokenizer.java
        tagger
        maxent
        OwnMaxentTagger.java
        OwnTestSentence.java
  - hu
    - u_szeged

package hu.u_szeged.kpe.features;

import hu.u_szeged.kpe.candidates.NGram;
import hu.u_szeged.kpe.candidates.NGramStats;
import hu.u_szeged.kpe.readers.DocumentData;

import java.util.AbstractMap.SimpleEntry;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;

import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.pipeline.MweDictAnnotator.MWEAnnotation;
import edu.stanford.nlp.util.CoreMap;

/**
 * Feature that relies on a list of multiword expressions (MWEs) crawled from Wikipedia and flags NGrams
 * that can be mapped to elements of that list.
 * <p>
 * Three binary values are reported for every NGram: whether the whole NGram is a single MWE, whether it is
 * covered by overlapping MWEs (compound), and whether it contains at least one complete MWE.
 */
public class MweFeature extends Feature {

  /** Serialization version identifier. */
  private static final long serialVersionUID = 3440662396687971296L;

  /** POS tags accepted for the first and the last token of an MWE (noun-like or adjective-like tags). */
  private static final Pattern BOUNDARY_POS = Pattern.compile("(?i)nn.{0,2}|jj.?");

  /** POS tags accepted for the inner tokens of an MWE (boundary tags plus {@code in} and {@code pos}). */
  private static final Pattern INNER_POS = Pattern.compile("(?i)nn.{0,2}|jj.?|in|pos");

  /**
   * Sets the feature scale to {@code Scale.BINARY} and selects {@link HashSet} as the collection class for
   * the per-document values.
   */
  public MweFeature() {
    scale = Scale.BINARY;
    collectionToStoreDocVals = HashSet.class;
  }

  /**
   * Reads the {@link MWEAnnotation} of every token of the NGram and reports three values through
   * {@code updateFeatureVals}, each {@code 1.0} or {@code 0.0}:
   * <ul>
   * <li>{@code <class name>_NGram}: an MWE starts at the first token and ends at the last token,</li>
   * <li>{@code <class name>_compoundNGram}: two shorter MWEs overlap, one starting at the first token and
   * one ending at the last token,</li>
   * <li>{@code <class name>_Containment}: some MWE both starts and ends inside the NGram.</li>
   * </ul>
   * An MWE only counts when its POS sequence is acceptable (see
   * {@link #hasAcceptablePosSequence(NGram, int, int)}).
   * <p>
   * A token annotation is split on {@code '@'}; the first character of each part is its type. The code
   * distinguishes {@code 'B'}, {@code 'E'} and {@code 'O'} (presumably begin / end / outside; any other
   * type on the first token is treated as an MWE that was opened before the NGram). Tokens without an
   * annotation and empty annotation parts are skipped, i.e. they carry no MWE information.
   *
   * @param phrase not used by this implementation
   * @param length not used by this implementation
   * @param ngramForm entry whose key is the NGram to inspect
   * @param train not used by this implementation
   * @param docToCheck passed through to {@code updateFeatureVals}
   * @param listOfHashs not used by this implementation
   * @param sentences not used by this implementation
   * @param docs not used by this implementation
   */
  @Override
  public void value(String phrase, int[] length, Entry<NGram, NGramStats> ngramForm, boolean train, int docToCheck,
      List<Map<String, Map<NGram, NGramStats>>> listOfHashs, List<CoreMap> sentences, DocumentData... docs) {
    NGram ngram = ngramForm.getKey();
    int lastIndex = ngram.size() - 1;

    boolean isMWE = false;
    boolean containsMwe = false;
    // MWEs that are currently open: (type seen when opened, index of the token where it was opened).
    List<Entry<Character, Integer>> openMwes = new ArrayList<Entry<Character, Integer>>();
    // [start, end] token indices of complete MWEs that do not cover the whole NGram.
    List<int[]> spans = new ArrayList<int[]>();

    for (int i = 0; i <= lastIndex; ++i) {
      String mweAnnotation = ngram.get(i).get(MWEAnnotation.class);
      if (mweAnnotation == null) {
        // Token was never annotated: treat it as lying outside of every MWE.
        continue;
      }
      String[] parts = mweAnnotation.split("@");
      int removed = 0;
      for (int p = 0; p < parts.length; ++p) {
        if (parts[p].length() == 0) {
          continue;
        }
        char type = parts[p].charAt(0);
        if (i == 0) {
          // On the first token everything except an ending or an outside mark opens an entry.
          if (type != 'E' && type != 'O') {
            openMwes.add(new SimpleEntry<Character, Integer>(type, i));
          }
        } else if (type == 'E') {
          // NOTE(review): assumes the p-th annotation part lines up with the p-th open entry (shifted by
          // the entries already closed on this token) -- confirm against MweDictAnnotator's output.
          Entry<Character, Integer> opened = openMwes.remove(p - removed);
          removed++;

          int start = opened.getValue().intValue();
          // Only MWEs that were opened with 'B' (i.e. inside the NGram) are taken into account.
          if (opened.getKey().charValue() == 'B' && hasAcceptablePosSequence(ngram, start, i)) {
            containsMwe = true;
            if (start == 0 && i == lastIndex) {
              isMWE = true;
            } else {
              spans.add(new int[] { start, i });
            }
          }
        } else if (type == 'B') {
          openMwes.add(new SimpleEntry<Character, Integer>(type, i));
        }
      }
    }

    boolean isCompoundMwe = isCoveredByOverlappingSpans(spans, lastIndex);

    updateFeatureVals(this.getClass().getName() + "_NGram", isMWE ? 1.0d : 0.0d, docToCheck);
    updateFeatureVals(this.getClass().getName() + "_compoundNGram", isCompoundMwe ? 1.0d : 0.0d, docToCheck);
    updateFeatureVals(this.getClass().getName() + "_Containment", containsMwe ? 1.0d : 0.0d, docToCheck);
  }

  /**
   * Checks the POS tags of the tokens {@code first..last} (inclusive): the first and the last token have to
   * match {@link #BOUNDARY_POS}, every token in between has to match {@link #INNER_POS}. A token without a
   * POS tag makes the sequence unacceptable.
   */
  private static boolean hasAcceptablePosSequence(NGram ngram, int first, int last) {
    for (int token = first; token <= last; ++token) {
      String tag = ngram.get(token).get(PartOfSpeechAnnotation.class);
      if (tag == null) {
        return false;
      }
      Pattern allowed = (token == first || token == last) ? BOUNDARY_POS : INNER_POS;
      if (!allowed.matcher(tag).matches()) {
        return false;
      }
    }
    return true;
  }

  /**
   * Tells whether there are two spans such that one starts at token 0, the other ends at {@code lastIndex}
   * and the first one ends at or after the start of the second one. Fewer than two spans never qualify.
   */
  private static boolean isCoveredByOverlappingSpans(List<int[]> spans, int lastIndex) {
    if (spans.size() < 2) {
      return false;
    }
    for (int[] left : spans) {
      for (int[] right : spans) {
        if (left[0] == 0 && right[1] == lastIndex && left[1] >= right[0]) {
          return true;
        }
      }
    }
    return false;
  }
}