package edu.stanford.nlp.pipeline;

import java.util.*;
import java.util.function.Function;
import java.util.function.Predicate;

import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.parser.common.NoSuchParseException;
import edu.stanford.nlp.parser.common.ParserAnnotations;
import edu.stanford.nlp.parser.common.ParserConstraint;
import edu.stanford.nlp.parser.common.ParserGrammar;
import edu.stanford.nlp.parser.common.ParserQuery;
import edu.stanford.nlp.parser.common.ParserUtils;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.parser.lexparser.TreeBinarizer;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.logging.Redwood;

/**
 * This class will add parse information to an Annotation.
 * It assumes that the Annotation already contains the tokenized words
 * as a {@code List<CoreLabel>} in the TokensAnnotation under each
 * particular CoreMap in the SentencesAnnotation.
 * If the words have POS tags, they will be used.
 * <br>
 * Parse trees are added to each sentence's CoreMap (get with
 * {@code CoreAnnotations.SentencesAnnotation}) under
 * {@code CoreAnnotations.TreeAnnotation}).
 *
 * @author Jenny Finkel
 */
public class ParserAnnotator extends SentenceAnnotator {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(ParserAnnotator.class);

  private final boolean VERBOSE;
  private final boolean BUILD_GRAPHS;
  private final ParserGrammar parser;

  /** Optional post-processing transform applied to every parse tree before it is stored. */
  private final Function<Tree, Tree> treeMap;

  /** Do not parse sentences larger than this sentence length */
  private final int maxSentenceLength;

  /**
   * Stop parsing if we exceed this time limit, in milliseconds.
   * Use 0 for no limit.
   */
  private final long maxParseTime;

  /** Number of parses to produce per sentence; 1 means use {@code getBestParse}. */
  private final int kBest;

  /** Factory used to derive dependency graphs from trees; null when graphs are not built. */
  private final GrammaticalStructureFactory gsf;

  private final int nThreads;

  /** Whether to also store a binarized form of the best tree on each sentence. */
  private final boolean saveBinaryTrees;

  /** Whether to include punctuation dependencies in the output. Starting in 2015, the default is true. */
  private final boolean keepPunct;

  /** If true, don't re-annotate sentences that already have a tree annotation */
  private final boolean noSquash;

  private final GrammaticalStructure.Extras extraDependencies;

  public ParserAnnotator(boolean verbose, int maxSent) {
    this(System.getProperty("parse.model", LexicalizedParser.DEFAULT_PARSER_LOC),
         verbose, maxSent, StringUtils.EMPTY_STRING_ARRAY);
  }

  public ParserAnnotator(String parserLoc, boolean verbose, int maxSent, String[] flags) {
    this(loadModel(parserLoc, verbose, flags), verbose, maxSent);
  }

  public ParserAnnotator(ParserGrammar parser, boolean verbose, int maxSent) {
    this(parser, verbose, maxSent, null);
  }

  /**
   * Programmatic constructor with fixed defaults: single thread, no time limit,
   * 1-best parsing, punctuation kept, no binarized trees.
   *
   * @param parser  the already-loaded parser model
   * @param verbose whether to log sentences as they are parsed
   * @param maxSent skip sentences longer than this (non-positive means no limit)
   * @param treeMap optional transform applied to each tree, or null
   */
  public ParserAnnotator(ParserGrammar parser, boolean verbose, int maxSent,
                         Function<Tree, Tree> treeMap) {
    this.VERBOSE = verbose;
    this.BUILD_GRAPHS = parser.getTLPParams().supportsBasicDependencies();
    this.parser = parser;
    this.maxSentenceLength = maxSent;
    this.treeMap = treeMap;
    this.maxParseTime = 0;
    this.kBest = 1;
    this.keepPunct = true;
    if (this.BUILD_GRAPHS) {
      // NOTE: unlike the Properties constructor, this path always rejects
      // punctuation words when building the GrammaticalStructureFactory.
      TreebankLanguagePack tlp = parser.getTLPParams().treebankLanguagePack();
      this.gsf = tlp.grammaticalStructureFactory(tlp.punctuationWordRejectFilter(),
                                                 parser.getTLPParams().typedDependencyHeadFinder());
    } else {
      this.gsf = null;
    }
    this.nThreads = 1;
    this.saveBinaryTrees = false;
    this.noSquash = false;
    this.extraDependencies = GrammaticalStructure.Extras.NONE;
  }

  /**
   * Properties-driven constructor used by the StanfordCoreNLP pipeline.
   * All options are read from {@code annotatorName + ".<option>"} keys.
   *
   * @param annotatorName prefix for the property keys (typically "parse")
   * @param props pipeline properties
   */
  public ParserAnnotator(String annotatorName, Properties props) {
    String model = props.getProperty(annotatorName + ".model", LexicalizedParser.DEFAULT_PARSER_LOC);
    // NOTE(review): with a non-null default above, model can only be null if
    // DEFAULT_PARSER_LOC itself is null; check kept for safety.
    if (model == null) {
      throw new IllegalArgumentException("No model specified for Parser annotator " + annotatorName);
    }
    this.VERBOSE = PropertiesUtils.getBool(props, annotatorName + ".debug", false);

    String[] flags = convertFlagsToArray(props.getProperty(annotatorName + ".flags"));
    this.parser = loadModel(model, VERBOSE, flags);
    this.maxSentenceLength = PropertiesUtils.getInt(props, annotatorName + ".maxlen", -1);

    String treeMapClass = props.getProperty(annotatorName + ".treemap");
    if (treeMapClass == null) {
      this.treeMap = null;
    } else {
      this.treeMap = ReflectionLoading.loadByReflection(treeMapClass, props);
    }

    this.maxParseTime = PropertiesUtils.getLong(props, annotatorName + ".maxtime", -1);
    this.kBest = PropertiesUtils.getInt(props, annotatorName + ".kbest", 1);
    this.keepPunct = PropertiesUtils.getBool(props, annotatorName + ".keepPunct", true);

    String buildGraphsProperty = annotatorName + ".buildgraphs";
    if (!this.parser.getTLPParams().supportsBasicDependencies()) {
      // Warn if the user explicitly asked for graphs a parser can't produce.
      if (props.getProperty(buildGraphsProperty) != null && PropertiesUtils.getBool(props, buildGraphsProperty)) {
        log.info("WARNING: " + buildGraphsProperty + " set to true, but " +
                 this.parser.getTLPParams().getClass() + " does not support dependencies");
      }
      this.BUILD_GRAPHS = false;
    } else {
      this.BUILD_GRAPHS = PropertiesUtils.getBool(props, buildGraphsProperty, true);
    }

    if (this.BUILD_GRAPHS) {
      boolean generateOriginalDependencies =
          PropertiesUtils.getBool(props, annotatorName + ".originalDependencies", false);
      parser.getTLPParams().setGenerateOriginalDependencies(generateOriginalDependencies);
      TreebankLanguagePack tlp = parser.getTLPParams().treebankLanguagePack();
      Predicate<String> punctFilter = this.keepPunct ?
          Filters.acceptFilter() : tlp.punctuationWordRejectFilter();
      this.gsf = tlp.grammaticalStructureFactory(punctFilter,
                                                 parser.getTLPParams().typedDependencyHeadFinder());
    } else {
      this.gsf = null;
    }

    this.nThreads = PropertiesUtils.getInt(props, annotatorName + ".nthreads",
                                           PropertiesUtils.getInt(props, "nthreads", 1));
    boolean usesBinary = StanfordCoreNLP.usesBinaryTrees(props);
    this.saveBinaryTrees = PropertiesUtils.getBool(props, annotatorName + ".binaryTrees", usesBinary);
    this.noSquash = PropertiesUtils.getBool(props, annotatorName + ".nosquash", false);
    this.extraDependencies = MetaClass.cast(props.getProperty(annotatorName + ".extradependencies", "NONE"),
                                            GrammaticalStructure.Extras.class);
  }

  /**
   * Builds a string summarizing every property this annotator reads, used by the
   * pipeline to decide whether a cached annotator can be reused.
   */
  @SuppressWarnings("StringConcatenationInsideStringBufferAppend")
  public static String signature(String annotatorName, Properties props) {
    StringBuilder os = new StringBuilder();
    os.append(annotatorName + ".model:" +
              props.getProperty(annotatorName + ".model", LexicalizedParser.DEFAULT_PARSER_LOC));
    os.append(annotatorName + ".debug:" +
              props.getProperty(annotatorName + ".debug", "false"));
    os.append(annotatorName + ".flags:" +
              props.getProperty(annotatorName + ".flags", ""));
    os.append(annotatorName + ".maxlen:" +
              props.getProperty(annotatorName + ".maxlen", "-1"));
    os.append(annotatorName + ".treemap:" +
              props.getProperty(annotatorName + ".treemap", ""));
    os.append(annotatorName + ".maxtime:" +
              props.getProperty(annotatorName + ".maxtime", "-1"));
    os.append(annotatorName + ".originalDependencies:" +
              props.getProperty(annotatorName + ".originalDependencies", "false"));
    os.append(annotatorName + ".buildgraphs:" +
              props.getProperty(annotatorName + ".buildgraphs", "true"));
    os.append(annotatorName + ".nthreads:" +
              props.getProperty(annotatorName + ".nthreads", props.getProperty("nthreads", "")));
    os.append(annotatorName + ".nosquash:" +
              props.getProperty(annotatorName + ".nosquash", "false"));
    os.append(annotatorName + ".keepPunct:" +
              props.getProperty(annotatorName + ".keepPunct", "true"));
    // BUGFIX: key previously misspelled ".extradependences", so a user-set
    // ".extradependencies" value was never reflected in the signature.
    os.append(annotatorName + ".extradependencies:" +
              props.getProperty(annotatorName + ".extradependencies", "NONE").toLowerCase());
    boolean usesBinary = StanfordCoreNLP.usesBinaryTrees(props);
    boolean saveBinaryTrees = PropertiesUtils.getBool(props, annotatorName + ".binaryTrees", usesBinary);
    os.append(annotatorName + ".binaryTrees:" + saveBinaryTrees);
    return os.toString();
  }

  /** Splits a whitespace-separated flag string into an array; empty array for null/blank input. */
  private static String[] convertFlagsToArray(String parserFlags) {
    if (parserFlags == null || parserFlags.trim().isEmpty()) {
      return StringUtils.EMPTY_STRING_ARRAY;
    } else {
      return parserFlags.trim().split("\\s+");
    }
  }

  /** Loads a parser model from {@code parserLoc} and applies default plus user flags. */
  private static ParserGrammar loadModel(String parserLoc, boolean verbose, String[] flags) {
    if (verbose) {
      log.info("Loading Parser Model [" + parserLoc + "] ...");
      log.info("  Flags:");
      for (String flag : flags) {
        log.info("  " + flag);
      }
      log.info();
    }
    ParserGrammar result = ParserGrammar.loadModel(parserLoc);
    result.setOptionFlags(result.defaultCoreNLPFlags());
    result.setOptionFlags(flags);
    return result;
  }

  @Override
  protected int nThreads() {
    return nThreads;
  }

  @Override
  protected long maxTime() {
    return maxParseTime;
  }

  /**
   * Parses one sentence, honoring the noSquash flag, the length limit, and the
   * time limit; on any failure falls back to {@link #doOneFailedSentence}.
   */
  @Override
  protected void doOneSentence(Annotation annotation, CoreMap sentence) {
    // If "noSquash" is set, don't re-annotate sentences which already have a
    // (non-fallback, i.e. non-"X"-rooted) tree annotation
    if (noSquash
        && sentence.get(TreeCoreAnnotations.TreeAnnotation.class) != null
        && !"X".equalsIgnoreCase(sentence.get(TreeCoreAnnotations.TreeAnnotation.class).label().value())) {
      return;
    }

    final List<CoreLabel> words = sentence.get(CoreAnnotations.TokensAnnotation.class);
    if (VERBOSE) {
      log.info("Parsing: " + words);
    }
    List<Tree> trees = null;
    // generate the constituent tree
    if (maxSentenceLength <= 0 || words.size() <= maxSentenceLength) {
      try {
        final List<ParserConstraint> constraints = sentence.get(ParserAnnotations.ConstraintAnnotation.class);
        trees = doOneSentence(constraints, words);
      } catch (RuntimeInterruptedException e) {
        // thrown when the maxParseTime budget is exceeded
        if (VERBOSE) {
          log.info("Took too long parsing: " + words);
        }
        trees = null;
      }
    }
    // tree == null may happen if the parser takes too long or if
    // the sentence is longer than the max length
    if (trees == null || trees.size() < 1) {
      doOneFailedSentence(annotation, sentence);
    } else {
      finishSentence(sentence, trees);
    }
  }

  /**
   * Fallback for unparseable sentences: store a flat "X" tree and give any
   * untagged word the placeholder tag "XX" so downstream annotators don't break.
   */
  @Override
  public void doOneFailedSentence(Annotation annotation, CoreMap sentence) {
    final List<CoreLabel> words = sentence.get(CoreAnnotations.TokensAnnotation.class);
    Tree tree = ParserUtils.xTree(words);
    for (CoreLabel word : words) {
      if (word.tag() == null) {
        word.setTag("XX");
      }
    }

    List<Tree> trees = Generics.newArrayList(1);
    trees.add(tree);
    finishSentence(sentence, trees);
  }

  /**
   * Applies the optional treeMap, stores parse/dependency annotations, optionally
   * stores a binarized tree, and patches missing sentence indices in the graph.
   */
  private void finishSentence(CoreMap sentence, List<Tree> trees) {
    if (treeMap != null) {
      List<Tree> mappedTrees = Generics.newLinkedList();
      for (Tree tree : trees) {
        Tree mappedTree = treeMap.apply(tree);
        mappedTrees.add(mappedTree);
      }
      trees = mappedTrees;
    }

    ParserAnnotatorUtils.fillInParseAnnotations(VERBOSE, BUILD_GRAPHS, gsf, sentence, trees, extraDependencies);

    if (saveBinaryTrees) {
      TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(),
                                                                  parser.treebankLanguagePack());
      Tree binarized = binarizer.transformTree(trees.get(0));
      Trees.convertToCoreLabels(binarized);
      sentence.set(TreeCoreAnnotations.BinarizedTreeAnnotation.class, binarized);
    }

    // for some reason in some corner cases nodes aren't having sentenceIndex set
    // do a pass and make sure all nodes have sentenceIndex set
    SemanticGraph sg = sentence.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
    if (sg != null) {
      for (IndexedWord iw : sg.vertexSet()) {
        if (iw.get(CoreAnnotations.SentenceIndexAnnotation.class) == null
            && sentence.get(CoreAnnotations.SentenceIndexAnnotation.class) != null) {
          iw.setSentIndex(sentence.get(CoreAnnotations.SentenceIndexAnnotation.class));
        }
      }
    }
  }

  /**
   * Runs the parser on one tokenized sentence, returning 1 or kBest scored trees,
   * or an empty list when parsing fails.
   */
  private List<Tree> doOneSentence(List<ParserConstraint> constraints, List<CoreLabel> words) {
    ParserQuery pq = parser.parserQuery();
    pq.setConstraints(constraints);
    pq.parse(words);
    List<Tree> trees = Generics.newLinkedList();
    try {
      // Use bestParse if kBest is set to 1.
      if (this.kBest == 1) {
        Tree t = pq.getBestParse();
        if (t == null) {
          log.warn("Parsing of sentence failed.  " +
                   "Will ignore and continue: " + SentenceUtils.listToString(words));
        } else {
          double score = pq.getBestScore();
          t.setScore(score % -10000.0);
          trees.add(t);
        }
      } else {
        List<ScoredObject<Tree>> scoredObjects = pq.getKBestParses(this.kBest);
        if (scoredObjects == null || scoredObjects.size() < 1) {
          log.warn("Parsing of sentence failed.  " +
                   "Will ignore and continue: " + SentenceUtils.listToString(words));
        } else {
          for (ScoredObject<Tree> so : scoredObjects) {
            // -10000 denotes unknown words
            Tree tree = so.object();
            tree.setScore(so.score() % -10000.0);
            trees.add(tree);
          }
        }
      }
    } catch (OutOfMemoryError e) {
      log.error(e); // Beware that we can now get an OOM in logging, too.
      log.warn("Parsing of sentence ran out of memory (length=" + words.size() + ").  " +
               "Will ignore and try to continue.");
    } catch (NoSuchParseException e) {
      log.warn("Parsing of sentence failed, possibly because of out of memory.  " +
               "Will ignore and continue: " + SentenceUtils.listToString(words));
    }
    return trees;
  }

  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    // Common prerequisites; POS tags are additionally required only when the
    // underlying parser model cannot tag on its own.
    List<Class<? extends CoreAnnotation>> required = new ArrayList<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.ValueAnnotation.class,
        CoreAnnotations.OriginalTextAnnotation.class,
        CoreAnnotations.CharacterOffsetBeginAnnotation.class,
        CoreAnnotations.CharacterOffsetEndAnnotation.class,
        CoreAnnotations.IndexAnnotation.class,
        CoreAnnotations.SentencesAnnotation.class,
        CoreAnnotations.SentenceIndexAnnotation.class
    ));
    if (parser.requiresTags()) {
      required.add(CoreAnnotations.PartOfSpeechAnnotation.class);
    }
    return Collections.unmodifiableSet(new ArraySet<>(required));
  }

  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    // Always produced: POS tags, the parse tree, and category labels.
    List<Class<? extends CoreAnnotation>> satisfied = new ArrayList<>(Arrays.asList(
        CoreAnnotations.PartOfSpeechAnnotation.class,
        TreeCoreAnnotations.TreeAnnotation.class
    ));
    if (this.saveBinaryTrees) {
      satisfied.add(TreeCoreAnnotations.BinarizedTreeAnnotation.class);
    }
    if (this.BUILD_GRAPHS) {
      // Dependency graphs (and token span indices) only exist when graphs are built.
      satisfied.add(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
      satisfied.add(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
      satisfied.add(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class);
      satisfied.add(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
      satisfied.add(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class);
      satisfied.add(CoreAnnotations.BeginIndexAnnotation.class);
      satisfied.add(CoreAnnotations.EndIndexAnnotation.class);
    }
    satisfied.add(CoreAnnotations.CategoryAnnotation.class);
    return Collections.unmodifiableSet(new ArraySet<>(satisfied));
  }

}