package edu.stanford.nlp.coref.data;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Properties;

import edu.stanford.nlp.coref.CorefCoreAnnotations;
import edu.stanford.nlp.coref.CorefProperties;
import edu.stanford.nlp.coref.docreader.CoNLLDocumentReader;
import edu.stanford.nlp.coref.docreader.DocReader;
import edu.stanford.nlp.coref.md.CorefMentionFinder;
import edu.stanford.nlp.coref.md.DependencyCorefMentionFinder;
import edu.stanford.nlp.coref.md.RuleBasedCorefMentionFinder;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.trees.HeadFinder;
import edu.stanford.nlp.trees.SemanticHeadFinder;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.trees.international.pennchinese.ChineseSemanticHeadFinder;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;

/**
 * Class for creating {@link Document}s from raw {@link Annotation}s or from CoNLL input data.
 *
 * @author Heeyoung Lee
 * @author Kevin Clark
 */
public class DocumentMaker {

  private final Properties props;
  private final DocReader reader;
  private final HeadFinder headFinder;
  private final Dictionaries dict;
  private final CorefMentionFinder md;

  public DocumentMaker(Properties props, Dictionaries dictionaries)
      throws ClassNotFoundException, IOException {
    this.props = props;
    this.dict = dictionaries;
    reader = getDocumentReader(props);
    headFinder = getHeadFinder(props);
    // A rule-based mention finder is only needed to assign heads to gold mentions.
    md = CorefProperties.useGoldMentions(props)
        ? new RuleBasedCorefMentionFinder(headFinder, props) : null;
  }

  /** Builds a CoNLL document reader for the configured corpus path, or returns null if none is set. */
  private static DocReader getDocumentReader(Properties props) {
    String corpusPath = CorefProperties.getInputPath(props);
    if (corpusPath == null) {
      return null;
    }
    CoNLLDocumentReader.Options options = new CoNLLDocumentReader.Options();
    if (!PropertiesUtils.getBool(props, "coref.printConLLLoadingMessage", true)) {
      options.printConLLLoadingMessage = false;
    }
    options.annotateTokenCoref = false;
    String conllFileFilter = props.getProperty("coref.conllFileFilter", ".*_auto_conll$");
    options.setFilter(conllFileFilter);
    options.lang = CorefProperties.getLanguage(props);
    return new CoNLLDocumentReader(corpusPath, options);
  }

  /** Selects a language-appropriate semantic head finder (English or Chinese). */
  private static HeadFinder getHeadFinder(Properties props) {
    Locale lang = CorefProperties.getLanguage(props);
    if (lang == Locale.ENGLISH) {
      return new SemanticHeadFinder();
    } else if (lang == Locale.CHINESE) {
      return new ChineseSemanticHeadFinder();
    } else {
      throw new RuntimeException("Invalid language setting: cannot load HeadFinder");
    }
  }

  /** Creates a Document from an already-annotated Annotation, with no gold mention information. */
  public Document makeDocument(Annotation anno) throws Exception {
    return makeDocument(new InputDoc(anno, null, null));
  }

  public Document makeDocument(InputDoc input) throws Exception {
    List<List<Mention>> mentions = new ArrayList<>();
    if (CorefProperties.useGoldMentions(props)) {
      // Copy the gold mention spans into new Mention objects and assign their heads.
      List<CoreMap> sentences = input.annotation.get(CoreAnnotations.SentencesAnnotation.class);
      for (int i = 0; i < sentences.size(); i++) {
        CoreMap sentence = sentences.get(i);
        List<CoreLabel> sentenceWords = sentence.get(CoreAnnotations.TokensAnnotation.class);
        List<Mention> sentenceMentions = new ArrayList<>();
        mentions.add(sentenceMentions);
        for (Mention g : input.goldMentions.get(i)) {
          sentenceMentions.add(new Mention(-1, g.startIndex, g.endIndex, sentenceWords, null, null,
              new ArrayList<>(sentenceWords.subList(g.startIndex, g.endIndex))));
        }
        md.findHead(sentence, sentenceMentions);
      }
    } else {
      // Otherwise use the mentions produced by the mention-detection annotator.
      for (CoreMap sentence : input.annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        mentions.add(sentence.get(CorefCoreAnnotations.CorefMentionsAnnotation.class));
      }
    }
    Document doc = new Document(input, mentions);

    if (input.goldMentions != null) {
      findGoldMentionHeads(doc);
    }
    DocumentPreprocessor.preprocess(doc, dict, null, headFinder);

    return doc;
  }

  private void findGoldMentionHeads(Document doc) {
    List<CoreMap> sentences = doc.annotation.get(SentencesAnnotation.class);
    for (int i = 0; i < sentences.size(); i++) {
      DependencyCorefMentionFinder.findHeadInDependency(sentences.get(i), doc.goldMentions.get(i));
    }
  }

  /** Lazily constructed pipeline used to add the annotations still missing from the input. */
  private StanfordCoreNLP coreNLP;

  private StanfordCoreNLP getStanfordCoreNLP(Properties props) {
    if (coreNLP != null) {
      return coreNLP;
    }

    Properties pipelineProps = new Properties(props);
    if (CorefProperties.conll(props)) {
      // CoNLL input already contains most annotations; only run what is still needed.
      pipelineProps.put("annotators",
          (CorefProperties.getLanguage(props) == Locale.CHINESE ? "lemma, ner" : "lemma")
          + (CorefProperties.useGoldMentions(props) ? "" : ", mention"));
    } else {
      pipelineProps.put("annotators", "pos, lemma, ner, "
          + (CorefProperties.useConstituencyParse(props) ? "parse" : "depparse")
          + (CorefProperties.useGoldMentions(props) ? "" : ", mention"));
    }
    return (coreNLP = new StanfordCoreNLP(pipelineProps, false));
  }

  /** Reads the next CoNLL document, annotates it, and converts it into a coref Document. */
  public Document nextDoc() throws Exception {
    InputDoc input = reader.nextDoc();
    if (input == null) {
      return null;
    }

    if (!CorefProperties.useConstituencyParse(props)) {
      // Drop the gold constituency trees so dependency-based processing is used instead.
      for (CoreMap sentence : input.annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        sentence.remove(TreeCoreAnnotations.TreeAnnotation.class);
      }
    }
    getStanfordCoreNLP(props).annotate(input.annotation);
    if (CorefProperties.conll(props)) {
      // CoNLL data comes with marked speaker/discourse information.
      input.annotation.set(CoreAnnotations.UseMarkedDiscourseAnnotation.class, true);
    }
    return makeDocument(input);
  }

  public void resetDocs() {
    reader.reset();
  }

}
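// Usage sketch (not part of the original file): a minimal, hedged example of driving this
// class over CoNLL input. The property key "coref.inputPath" and the corpus path below are
// assumptions for illustration only; see CorefProperties.getInputPath for the key actually
// read in a given build.
//
//   Properties props = new Properties();
//   props.setProperty("coref.inputPath", "/path/to/conll-2012");  // hypothetical key and path
//   Dictionaries dict = new Dictionaries(props);
//   DocumentMaker docMaker = new DocumentMaker(props, dict);
//   Document doc;
//   while ((doc = docMaker.nextDoc()) != null) {
//     // each Document arrives annotated, with mentions detected and preprocessing applied
//   }
//   docMaker.resetDocs();  // rewind the reader to iterate over the corpus again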