package edu.stanford.nlp.parser.common;

import java.io.IOException;
import java.io.StringReader;
import java.util.List;
import java.util.Objects;
import java.util.function.Function;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.parser.metrics.Eval;
import edu.stanford.nlp.parser.metrics.ParserQueryEval;
import edu.stanford.nlp.process.Morphology;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.ReflectionLoading;
import edu.stanford.nlp.util.Timing;
import edu.stanford.nlp.util.logging.Redwood;

// TODO: it would be nice to move these to common, but that would
// wreck all existing models
import edu.stanford.nlp.parser.lexparser.Options;
import edu.stanford.nlp.parser.lexparser.TreebankLangParserParams;

/**
 * An interface for the classes which store the data for a parser.
 * Objects which inherit this interface have a way to produce
 * ParserQuery objects, have a general Options object, and return a
 * list of Evals to perform on a parser.  This helps classes such as
 * {@link edu.stanford.nlp.parser.lexparser.EvaluateTreebank}
 * analyze the performance of a parser.
 *
 * TODO: it would be nice to actually make this an interface again.
 * Perhaps Java 8 will allow that
 *
 * @author John Bauer
 */
public abstract class ParserGrammar implements Function<List<? extends HasWord>, Tree> {

  // Redwood logging channel for this class; final since it is a constant.
  private static final Redwood.RedwoodChannels logger = Redwood.channels(ParserGrammar.class);

  public abstract ParserQuery parserQuery();

  /**
   * Parses the list of HasWord.  If the parse fails for some reason,
   * an X tree is returned instead of barfing.
   *
   * @param words The input sentence (a List of words)
   * @return A Tree that is the parse tree for the sentence.  If the parser
   *         fails, a new Tree is synthesized which attaches all words to the
   *         root.
   */
  @Override
  public Tree apply(List<? extends HasWord> words) {
    return parse(words);
  }

  /**
   * Tokenize the text using the parser's tokenizer.
   *
   * @param sentence The raw text of a single sentence
   * @return The tokens produced by this parser's language-specific
   *         tokenizer factory
   */
  public List<? extends HasWord> tokenize(String sentence) {
    TokenizerFactory<? extends HasWord> tf = treebankLanguagePack().getTokenizerFactory();
    Tokenizer<? extends HasWord> tokenizer = tf.getTokenizer(new StringReader(sentence));
    List<? extends HasWord> tokens = tokenizer.tokenize();
    return tokens;
  }

  /**
   * Will parse the text in <code>sentence</code> as if it represented
   * a single sentence by first processing it with a tokenizer.
   * If the options request pretagging, the tokens are tagged before parsing.
   *
   * @param sentence The raw text of a single sentence
   * @return The parse tree for the tokenized sentence
   */
  public Tree parse(String sentence) {
    List<? extends HasWord> tokens = tokenize(sentence);
    if (getOp().testOptions.preTag) {
      Function<List<? extends HasWord>, List<TaggedWord>> tagger = loadTagger();
      tokens = tagger.apply(tokens);
    }
    return parse(tokens);
  }

  // Lazily-loaded tagger and the serialized file it was loaded from.
  // transient so serialized parser models do not drag the tagger along;
  // both fields are only accessed under the lock in loadTagger().
  private transient Function<List<? extends HasWord>, List<TaggedWord>> tagger;
  private transient String taggerPath;

  /**
   * Lazily loads (and caches) the POS tagger named by
   * <code>testOptions.taggerSerializedFile</code>.  The tagger is reloaded
   * if the configured path has changed since the last call.
   *
   * @return The tagger function, or null if pretagging is not requested
   */
  public Function<List<? extends HasWord>, List<TaggedWord>> loadTagger() {
    Options op = getOp();
    if (op.testOptions.preTag) {
      synchronized(this) { // TODO: rather coarse synchronization
        // Objects.equals: null-safe, so a null taggerSerializedFile cannot NPE here
        if (!Objects.equals(op.testOptions.taggerSerializedFile, taggerPath)) {
          taggerPath = op.testOptions.taggerSerializedFile;
          tagger = ReflectionLoading.loadByReflection("edu.stanford.nlp.tagger.maxent.MaxentTagger", taggerPath);
        }
        return tagger;
      }
    } else {
      return null;
    }
  }

  /**
   * Tokenizes the sentence and then lemmatizes the tokens.
   *
   * @param sentence The raw text of a single sentence
   * @return One CoreLabel per token, with word, tag, and lemma set
   */
  public List<CoreLabel> lemmatize(String sentence) {
    List<? extends HasWord> tokens = tokenize(sentence);
    return lemmatize(tokens);
  }

  /**
   * Only works on English, as it is hard coded for using the
   * Morphology class, which is English-only.
   * Tags are obtained either from the configured tagger (if preTag is set)
   * or by parsing the sentence and reading tags off the tree.
   *
   * @param tokens An already-tokenized sentence
   * @return One CoreLabel per token, with word, tag, and lemma set
   */
  public List<CoreLabel> lemmatize(List<? extends HasWord> tokens) {
    List<TaggedWord> tagged;
    if (getOp().testOptions.preTag) {
      Function<List<? extends HasWord>, List<TaggedWord>> tagger = loadTagger();
      tagged = tagger.apply(tokens);
    } else {
      Tree tree = parse(tokens);
      tagged = tree.taggedYield();
    }
    Morphology morpha = new Morphology();
    List<CoreLabel> lemmas = Generics.newArrayList();
    for (TaggedWord token : tagged) {
      CoreLabel label = new CoreLabel();
      label.setWord(token.word());
      label.setTag(token.tag());
      morpha.stem(label);
      lemmas.add(label);
    }
    return lemmas;
  }

  /**
   * Parses the list of HasWord.  If the parse fails for some reason,
   * an X tree is returned instead of barfing.
   *
   * @param words The input sentence (a List of words)
   * @return A Tree that is the parse tree for the sentence.  If the parser
   *         fails, a new Tree is synthesized which attaches all words to the
   *         root.
   */
  public abstract Tree parse(List<? extends HasWord> words);

  /**
   * Returns a list of extra Eval objects to use when scoring the parser.
   */
  public abstract List<Eval> getExtraEvals();

  /**
   * Return a list of Eval-style objects which care about the whole
   * ParserQuery, not just the finished tree
   */
  public abstract List<ParserQueryEval> getParserQueryEvals();

  public abstract Options getOp();

  public abstract TreebankLangParserParams getTLPParams();

  public abstract TreebankLanguagePack treebankLanguagePack();

  /**
   * Returns a set of options which should be set by default when used
   * in corenlp.  For example, the English PCFG/RNN models want
   * -retainTmpSubcategories, and the ShiftReduceParser models may
   * want -beamSize 4 depending on how they were trained.
   * <br>
   * TODO: right now completely hardcoded, should be settable as a training time option
   */
  public abstract String[] defaultCoreNLPFlags();

  public abstract void setOptionFlags(String ... flags);

  /**
   * The model requires text to be pretagged
   */
  public abstract boolean requiresTags();

  /**
   * Reads a serialized parser model from a URL, classpath entry, or
   * file, then applies any extra option flags to it.
   *
   * @param path Where to load the model from
   * @param extraFlags Option flags applied to the model after loading
   * @return The deserialized, configured parser
   * @throws RuntimeIOException if the model cannot be read or its class
   *         is not available
   */
  public static ParserGrammar loadModel(String path, String ... extraFlags) {
    ParserGrammar parser;
    try {
      Timing timing = new Timing();
      parser = IOUtils.readObjectFromURLOrClasspathOrFileSystem(path);
      timing.done(logger, "Loading parser from serialized file " + path);
    } catch (IOException | ClassNotFoundException e) {
      // wrap with cause preserved; callers treat model-load failure as fatal
      throw new RuntimeIOException(e);
    }
    if (extraFlags.length > 0) {
      parser.setOptionFlags(extraFlags);
    }
    return parser;
  }

}