// MorphaAnnotator.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.Morphology;
import edu.stanford.nlp.util.ArraySet;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.logging.Redwood;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Set;


/**
 * This class adds the lemma of every word to the Annotation.
 * It expects the Annotation to hold a list of sentences under the
 * {@code SentencesAnnotation.class} key, each sentence carrying its tokens as
 * a {@code List<CoreLabel>} under the {@code TokensAnnotation.class} key.
 * For each token the text ({@code TextAnnotation}) and part-of-speech tag
 * ({@code PartOfSpeechAnnotation}) are read and the resulting lemma is stored
 * on the token under {@code LemmaAnnotation.class}.
 * <p>
 * A token without a part-of-speech tag (null or empty) is handled by
 * {@code Morphology.stem(word)}; a tagged token is handled by
 * {@code Morphology.lemma(word, tag)}, except for verb tokens of the form
 * {@code verb_particle}, where only the verb part is lemmatized.
 *
 * @author Jenny Finkel
 */
public class MorphaAnnotator implements Annotator {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(MorphaAnnotator.class);

  /** Words accepted as the particle half of an underscore-joined phrasal verb. */
  private static final List<String> PARTICLES = Collections.unmodifiableList(Arrays.asList(
      "abroad", "across", "after", "ahead", "along", "aside", "away", "around", "back", "down",
      "forward", "in", "off", "on", "over", "out", "round", "together", "through", "up"));

  /** Whether {@link #annotate(Annotation)} logs a progress message. */
  private final boolean verbose;

  /** Creates an annotator with verbose logging turned on. */
  public MorphaAnnotator() {
    this(true);
  }

  /**
   * Creates an annotator.
   *
   * @param verbose if true, a message is logged each time {@link #annotate(Annotation)} runs
   */
  public MorphaAnnotator(boolean verbose) {
    this.verbose = verbose;
  }

  /**
   * Sets {@code LemmaAnnotation} on every token of every sentence in the annotation.
   *
   * @param annotation the annotation to process; must contain {@code SentencesAnnotation}
   * @throws RuntimeException if the annotation has no {@code SentencesAnnotation}
   */
  @Override
  public void annotate(Annotation annotation) {
    if (verbose) {
      log.info("Finding lemmas ...");
    }
    if ( ! annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
      throw new RuntimeException("Unable to find words/tokens in: " +
                                 annotation);
    }
    // A fresh Morphology is created per call rather than shared across calls.
    Morphology morphology = new Morphology();
    for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      for (CoreLabel token : tokens) {
        String text = token.get(CoreAnnotations.TextAnnotation.class);
        String posTag = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
        addLemma(morphology, CoreAnnotations.LemmaAnnotation.class, token, text, posTag);
      }
    }
  }


  /**
   * Computes the lemma of {@code word} and stores it in {@code map} under {@code ann}.
   *
   * @param morpha the morphology instance used for lemmatizing / stemming
   * @param ann    the annotation key the lemma is written to
   * @param map    the map (token) receiving the lemma
   * @param word   the surface form of the token
   * @param tag    the part-of-speech tag; may be null or empty when the token is untagged
   */
  private static void addLemma(Morphology morpha,
                        Class<? extends CoreAnnotation<String>> ann,
                        CoreMap map, String word, String tag) {
    // A missing tag (null) is treated like an empty one: without a tag only the
    // tag-less stem(word) call is applicable. Previously a null tag caused an NPE.
    if (tag == null || tag.isEmpty()) {
      map.set(ann, morpha.stem(word));
      return;
    }
    String phrasalLemma = phrasalVerb(morpha, word, tag);
    if (phrasalLemma != null) {
      map.set(ann, phrasalLemma);
    } else {
      map.set(ann, morpha.lemma(word, tag));
    }
  }


  /**
   * If a token is a phrasal verb with an underscore between a verb and a
   * particle, return the phrasal verb with its verb part lemmatized.
   *
   * @param morpha the morphology instance used to lemmatize the verb part
   * @param word   the token text, e.g. {@code verb_particle}
   * @param tag    the part-of-speech tag of the token
   * @return the lemmatized verb joined to the particle by an underscore, or null when
   *         the tag does not start with {@code "VB"}, the word does not split into
   *         exactly two underscore-separated parts, or the second part is not a
   *         known particle
   */
  private static String phrasalVerb(Morphology morpha, String word, String tag) {
    assert(word != null);
    assert(tag != null);

    // must be a verb and contain an underscore
    if ( ! tag.startsWith("VB") || ! word.contains("_")) {
      return null;
    }

    // exactly one verb part and one particle part are required
    String[] parts = word.split("_");
    if (parts.length != 2) {
      return null;
    }

    // check whether the last part is a particle
    String particle = parts[1];
    if ( ! PARTICLES.contains(particle)) {
      return null;
    }

    // only the verb part is lemmatized; the particle is re-attached unchanged
    String base = parts[0];
    String lemma = morpha.lemma(base, tag);
    return lemma + '_' + particle;
  }


  /**
   * @return the annotation keys this annotator reads: text, tokens, sentences and
   *         part-of-speech tags
   */
  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.SentencesAnnotation.class,
        CoreAnnotations.PartOfSpeechAnnotation.class
    )));
  }

  /**
   * @return the single annotation key this annotator produces, {@code LemmaAnnotation}
   */
  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    return Collections.singleton(CoreAnnotations.LemmaAnnotation.class);
  }

}