// NumberAnnotator.java example (from the CoreNLP-master source tree)
package edu.stanford.nlp.pipeline; 
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.regexp.NumberSequenceClassifier;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.ArraySet;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;

import java.util.*;

/**
 * This annotator calls NumberSequenceClassifier, a rule-based classifier that
 * adds a NUMBER entity tag to numbers not already given another entity tag and
 * that also has additional rules for marking MONEY, TIME, and DATE. It assumes
 * that tokens already have a (POS) TagAnnotation and have been through an
 * original round of NER that covers MONEY and American DATE/TIME formats (such
 * as MUC NER in AnswerAnnotation), to which this annotator adds its labels.
 *
 * @author Jenny Finkel
 */

public class NumberAnnotator implements Annotator  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(NumberAnnotator.class);

  private final AbstractSequenceClassifier<CoreLabel> nsc;

  private boolean VERBOSE = true;
  private static final String DEFAULT_BACKGROUND_SYMBOL = "O";
  private final String BACKGROUND_SYMBOL;

  public static final String BACKGROUND_SYMBOL_PROPERTY = "background";

  public NumberAnnotator() {
    this(DEFAULT_BACKGROUND_SYMBOL, true, NumberSequenceClassifier.USE_SUTIME_DEFAULT);
  }

  public NumberAnnotator(boolean verbose) {
    this(DEFAULT_BACKGROUND_SYMBOL, verbose, NumberSequenceClassifier.USE_SUTIME_DEFAULT);
  }

  public NumberAnnotator(boolean verbose, boolean useSUTime) {
    this(DEFAULT_BACKGROUND_SYMBOL, verbose, useSUTime);
  }

  public NumberAnnotator(String backgroundSymbol, boolean verbose, boolean useSUTime) {
    BACKGROUND_SYMBOL = backgroundSymbol;
    VERBOSE = verbose;
    nsc = new NumberSequenceClassifier(useSUTime);
  }

  public NumberAnnotator(String name, Properties props) {
    String property = name + "." + BACKGROUND_SYMBOL_PROPERTY;
    BACKGROUND_SYMBOL = props.getProperty(property, DEFAULT_BACKGROUND_SYMBOL);
    boolean useSUTime = PropertiesUtils.getBool(props,
        NumberSequenceClassifier.USE_SUTIME_PROPERTY,
        NumberSequenceClassifier.USE_SUTIME_DEFAULT);
    VERBOSE = false;
    nsc = new NumberSequenceClassifier(useSUTime);
  }

  @Override
  public void annotate(Annotation annotation) {
    if (VERBOSE) {
      log.info("Adding number annotation ... ");
    }

    if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
      // classify tokens for each sentence
      for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        doOneSentenceNew(tokens, annotation, sentence);
      }
      if (VERBOSE) {
        log.info("done. Output: " + annotation.get(CoreAnnotations.SentencesAnnotation.class));
      }
    } else if (annotation.containsKey(CoreAnnotations.TokensAnnotation.class)) {
      List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
      doOneSentenceNew(tokens, annotation, null);
    } else {
      throw new RuntimeException("unable to find sentences in: " + annotation);
    }
  }

  private void doOneSentenceNew(List<CoreLabel> words, Annotation doc, CoreMap sentence) {
    List<CoreLabel> newWords = NumberSequenceClassifier.copyTokens(words, sentence);

    nsc.classifyWithGlobalInformation(newWords, doc, sentence);

    Iterator<? extends CoreLabel> newFLIter = newWords.iterator();
    for (CoreLabel origWord : words) {
      CoreLabel newWord = newFLIter.next();
      String before = origWord.ner();
      String newGuess = newWord.get(CoreAnnotations.AnswerAnnotation.class);
      // log.info(origWord.word());
      // log.info(origWord.ner());
      if (VERBOSE)
        log.info(newWord);
      // log.info("-------------------------------------");
      if ((before == null || before.equals(BACKGROUND_SYMBOL) || before.equals("MISC"))
          && !newGuess.equals(BACKGROUND_SYMBOL)) {
        origWord.setNER(newGuess);
      }

      // transfer other annotations generated by SUTime or NumberNormalizer
      NumberSequenceClassifier.transferAnnotations(newWord, origWord);
    }
  }


  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.SentencesAnnotation.class
        )));
  }

  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    // technically it adds some NER, but someone who wants full NER
    // labels will be very disappointed, so we do not claim to produce NER
    return Collections.singleton(CoreAnnotations.NumerizedTokensAnnotation.class);
  }
}