MweDictAnnotator.java example

Explorer
kpe-master
- src
  - edu
    - stanford
      - nlp
        pipeline
        HunTokenizerAnnotator.java
        MweDictAnnotator.java
        MyCleanXmlAnnotator.java
        NormalizerAnnotator.java
        OwnMorphaAnnotator.java
        OwnPOSTaggerAnnotator.java
        StopWordAnnotator.java
        SzTEAnnotationPipeline.java
        SzTECoreNLP.java
        process
        HunPTBLexer.java
        HunTokenizer.java
        tagger
        maxent
        OwnMaxentTagger.java
        OwnTestSentence.java
  - hu
    - u_szeged
package edu.stanford.nlp.pipeline;

import hu.u_szeged.utils.NLPUtils;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.ArraySet;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Timing;

public class MweDictAnnotator implements Annotator {

  private Timing timer;
  private boolean VERBOSE;
  private Map<List<String>, Map<String, Integer>> mweFreqs;

  public MweDictAnnotator(String file) {
    this(false, file);
  }

  public MweDictAnnotator(boolean verbose, String file) {
    timer = new Timing();
    this.VERBOSE = verbose;
    if (VERBOSE) {
      timer.start();
      System.err.print("Adding normalized token annotation...");
    }
    mweFreqs = new HashMap<List<String>, Map<String, Integer>>();
    List<String> lines = new LinkedList<String>();
    NLPUtils.readDocToCollection(file, lines);
    for (String line : lines) {
      String[] lineParts = line.split("\t");
      Map<String, Integer> frequencies = new HashMap<String, Integer>();
      for (int col = 1; col < lineParts.length; ++col) {
        frequencies.put(lineParts[col], Integer.parseInt(lineParts[++col]));
      }
      List<String> tokens = new LinkedList<String>();
      for (String token : lineParts[0].split(" ")) {
        tokens.add(token);
      }
      mweFreqs.put(tokens, frequencies);
    }
    if (VERBOSE) {
      System.err.print("MWE dictionary ");
      timer.done();
    }
  }

  private List<String> getLemmatizedListForm(List<CoreLabel> coreLabels) {
    List<String> lemmatized = new ArrayList<String>(coreLabels.size());
    for (int st = 0; st < coreLabels.size(); ++st) {
      if (st == coreLabels.size() - 1)
        lemmatized.add(coreLabels.get(st).getString(LemmaAnnotation.class).toLowerCase());
      else
        lemmatized.add(coreLabels.get(st).word().toLowerCase());
    }
    return lemmatized;
  }

  @Override
  public void annotate(Annotation annotation) {
    if (VERBOSE) {
      timer.start();
      System.err.print("Adding MWE token annotation...");
    }

    if (annotation.has(SentencesAnnotation.class)) {
      List<CoreMap> sentences = annotation.get(SentencesAnnotation.class);
      for (CoreMap sentence : sentences) {
        List<CoreLabel> successiveTokens = new ArrayList<CoreLabel>(4);
        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        for (int i = 0; i < tokens.size(); i++) {
          CoreLabel token = tokens.get(i);
          token.set(MWEAnnotation.class, "O");
          if (successiveTokens.size() == 4) {
            successiveTokens.remove(0);
          }
          successiveTokens.add(token);
          int size = successiveTokens.size();
          for (int p = 0; size > 1 && p < size - 1; ++p) {
            List<String> lemmatized = getLemmatizedListForm(successiveTokens.subList(p, size));
            Map<String, Integer> freqs = mweFreqs.get(lemmatized);
            if (freqs != null) {
              boolean firstFreq = true;
              for (Entry<String, Integer> freq : freqs.entrySet()) {
                String type = transformType(freq.getKey());
                for (int s = 0; s < lemmatized.size(); s++) {
                  CoreLabel prevToken = tokens.get(i - s);
                  String prevMweLabel = prevToken.get(MWEAnnotation.class).replace("O", "");
                  boolean insertDelimiter = prevMweLabel.length() > 0 && firstFreq;
                  if (s == 0) {
                    prevMweLabel += (prevMweLabel.length() == 0 ? "" : (insertDelimiter ? "@" : "|")) + "E-";
                  } else if (s == lemmatized.size() - 1) {
                    prevMweLabel += (prevMweLabel.length() == 0 ? "" : (insertDelimiter ? "@" : "|")) + "B-";
                  } else {
                    prevMweLabel += (prevMweLabel.length() == 0 ? "" : (insertDelimiter ? "@" : "|")) + "I-";
                  }
                  prevMweLabel += type + "_" + Integer.toString(freq.getValue());
                  tokens.get(i - s).set(MWEAnnotation.class, prevMweLabel);
                }
                firstFreq = false;
              }
            }
          }
        }
      }
    } else {
      throw new RuntimeException("unable to find words/tokens in: " + annotation);
    }

    if (VERBOSE)
      timer.stop("done.");
  }

  private String transformType(String type) {
    if (type.equals("link:")) {
      return "L";
    } else if (type.equals("italic:")) {
      return "I";
    } else if (type.equals("linkItalicBold:")) {
      return "LIB";
    } else if (type.equals("bold:")) {
      return "B";
    } else if (type.equals("linkBold:")) {
      return "LB";
    } else if (type.equals("linkItalic:")) {
      return "LI";
    } else if (type.equals("boldItalic:")) {
      return "IB";
    } else {
      return "N/A";
    }
  }

  public static class MWEAnnotation implements CoreAnnotation<String> {
    public Class<String> getType() {
      return String.class;
    }
  }

  @Override
  public Set<Requirement> requires() {
    return Collections.unmodifiableSet(new ArraySet<Requirement>());
  }

  @Override
  public Set<Requirement> requirementsSatisfied() {
    return Collections.singleton(TOKENIZE_REQUIREMENT);
  }

}