package edu.stanford.nlp.coref.data;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import edu.stanford.nlp.coref.CorefCoreAnnotations;
import edu.stanford.nlp.coref.CorefProperties;
import edu.stanford.nlp.coref.docreader.CoNLLDocumentReader;
import edu.stanford.nlp.coref.docreader.DocReader;
import edu.stanford.nlp.coref.md.CorefMentionFinder;
import edu.stanford.nlp.coref.md.DependencyCorefMentionFinder;
import edu.stanford.nlp.coref.md.RuleBasedCorefMentionFinder;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.trees.HeadFinder;
import edu.stanford.nlp.trees.SemanticHeadFinder;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.trees.international.pennchinese.ChineseSemanticHeadFinder;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;
/**
 * Class for creating {@link Document}s from raw {@link Annotation}s or from CoNLL input data.
 *
 * <p>Two entry points exist: {@link #makeDocument(Annotation)} / {@link #makeDocument(InputDoc)}
 * turn an already-annotated input into a {@link Document}, while {@link #nextDoc()} pulls the next
 * input from the configured {@link DocReader}, annotates it, and then builds the document.
 *
 * <p>No synchronization is performed in this class; in particular the annotation pipeline is
 * created lazily on first use without any locking.
 *
 * @author Heeyoung Lee
 * @author Kevin Clark
 */
public class DocumentMaker {

  /** Configuration consulted through {@link CorefProperties} throughout this class. */
  private final Properties props;

  /** Corpus reader; {@code null} when {@link CorefProperties#getInputPath} returns {@code null}. */
  private final DocReader reader;

  /** Language-specific head finder passed to mention finding and preprocessing. */
  private final HeadFinder headFinder;

  /** Dictionaries handed to {@link DocumentPreprocessor#preprocess}. */
  private final Dictionaries dict;

  /** Mention finder used for gold mentions; {@code null} unless gold mentions are enabled. */
  private final CorefMentionFinder md;

  /** Lazily created annotation pipeline, see {@link #getStanfordCoreNLP(Properties)}. */
  private StanfordCoreNLP coreNLP;

  /**
   * Creates a document maker configured by {@code props}.
   *
   * @param props coref configuration; read via {@link CorefProperties} and {@link PropertiesUtils}
   * @param dictionaries dictionaries forwarded to document preprocessing
   * @throws ClassNotFoundException declared for callers; not thrown directly by code in this class
   * @throws IOException declared for callers; not thrown directly by code in this class
   * @throws RuntimeException if the configured language is neither English nor Chinese
   */
  public DocumentMaker(Properties props, Dictionaries dictionaries)
      throws ClassNotFoundException, IOException {
    this.props = props;
    this.dict = dictionaries;
    this.reader = getDocumentReader(props);
    this.headFinder = getHeadFinder(props);
    // The rule-based finder is only needed to assign heads to gold mentions.
    this.md = CorefProperties.useGoldMentions(props)
        ? new RuleBasedCorefMentionFinder(headFinder, props)
        : null;
  }

  /**
   * Builds the CoNLL reader for the configured input path.
   *
   * @param props coref configuration
   * @return a configured {@link CoNLLDocumentReader}, or {@code null} if no input path is set
   */
  private static DocReader getDocumentReader(Properties props) {
    String corpusPath = CorefProperties.getInputPath(props);
    if (corpusPath == null) {
      return null;
    }

    CoNLLDocumentReader.Options options = new CoNLLDocumentReader.Options();
    // Only override the option when the message has been explicitly switched off.
    if (!PropertiesUtils.getBool(props, "coref.printConLLLoadingMessage", true)) {
      options.printConLLLoadingMessage = false;
    }
    options.annotateTokenCoref = false;
    String conllFileFilter = props.getProperty("coref.conllFileFilter", ".*_auto_conll$");
    options.setFilter(conllFileFilter);
    options.lang = CorefProperties.getLanguage(props);
    return new CoNLLDocumentReader(corpusPath, options);
  }

  /**
   * Selects the head finder matching the configured language.
   *
   * @param props coref configuration
   * @return a {@link SemanticHeadFinder} for English or a {@link ChineseSemanticHeadFinder} for
   *     Chinese
   * @throws RuntimeException if the language is anything else (including {@code null})
   */
  private static HeadFinder getHeadFinder(Properties props) {
    Locale lang = CorefProperties.getLanguage(props);
    // Compare by value: identity (==) only works if the exact Locale constant instance is returned.
    if (Locale.ENGLISH.equals(lang)) {
      return new SemanticHeadFinder();
    }
    if (Locale.CHINESE.equals(lang)) {
      return new ChineseSemanticHeadFinder();
    }
    throw new RuntimeException("Invalid language setting: cannot load HeadFinder");
  }

  /**
   * Builds a {@link Document} from an annotation that carries no gold data.
   *
   * <p>The annotation is wrapped in an {@link InputDoc} whose remaining two constructor arguments
   * are {@code null}, then passed to {@link #makeDocument(InputDoc)}.
   *
   * @param anno the annotated text
   * @return the preprocessed document
   * @throws Exception propagated from {@link #makeDocument(InputDoc)}
   */
  public Document makeDocument(Annotation anno) throws Exception {
    return makeDocument(new InputDoc(anno, null, null));
  }

  /**
   * Builds a {@link Document} from {@code input} and runs preprocessing on it.
   *
   * <p>When gold mentions are enabled, one fresh {@link Mention} is created per gold mention of
   * each sentence (copying its start/end indices and token span) and heads are assigned by the
   * mention finder; {@code input.goldMentions} is dereferenced per sentence in that mode.
   * Otherwise the mentions already stored on each sentence under
   * {@link CorefCoreAnnotations.CorefMentionsAnnotation} are used as-is.
   *
   * @param input annotation plus optional gold mentions
   * @return the preprocessed document
   * @throws Exception declared for callers; anything thrown by the called components propagates
   */
  public Document makeDocument(InputDoc input) throws Exception {
    List<List<Mention>> mentions = new ArrayList<>();
    List<CoreMap> sentences = input.annotation.get(CoreAnnotations.SentencesAnnotation.class);

    if (CorefProperties.useGoldMentions(props)) {
      for (int i = 0; i < sentences.size(); i++) {
        CoreMap sentence = sentences.get(i);
        List<CoreLabel> sentenceWords = sentence.get(CoreAnnotations.TokensAnnotation.class);
        List<Mention> sentenceMentions = new ArrayList<>();
        mentions.add(sentenceMentions);
        for (Mention g : input.goldMentions.get(i)) {
          // Copy the token span so the new mention does not share a subList view of the sentence.
          List<CoreLabel> span = new ArrayList<>(sentenceWords.subList(g.startIndex, g.endIndex));
          sentenceMentions.add(
              new Mention(-1, g.startIndex, g.endIndex, sentenceWords, null, null, span));
        }
        md.findHead(sentence, sentenceMentions);
      }
    } else {
      for (CoreMap sentence : sentences) {
        mentions.add(sentence.get(CorefCoreAnnotations.CorefMentionsAnnotation.class));
      }
    }

    Document doc = new Document(input, mentions);
    if (input.goldMentions != null) {
      findGoldMentionHeads(doc);
    }
    DocumentPreprocessor.preprocess(doc, dict, null, headFinder);
    return doc;
  }

  /**
   * Runs dependency-based head finding over the gold mentions of every sentence in {@code doc}.
   *
   * @param doc document whose {@code goldMentions} list is indexed in parallel with its sentences
   */
  private void findGoldMentionHeads(Document doc) {
    List<CoreMap> sentences = doc.annotation.get(SentencesAnnotation.class);
    for (int i = 0; i < sentences.size(); i++) {
      DependencyCorefMentionFinder.findHeadInDependency(sentences.get(i), doc.goldMentions.get(i));
    }
  }

  /**
   * Returns the annotation pipeline, creating and caching it on the first call.
   *
   * <p>The {@code props} argument is only consulted when the pipeline is first built; later calls
   * return the cached instance regardless of the argument. The annotator list depends on whether
   * CoNLL input is used, on the language, on the parse type, and on whether gold mentions are
   * enabled (the {@code mention} annotator is omitted in that case).
   *
   * @param props coref configuration, used as defaults for the pipeline properties
   * @return the shared pipeline instance
   */
  private StanfordCoreNLP getStanfordCoreNLP(Properties props) {
    if (coreNLP != null) {
      return coreNLP;
    }

    // props become the defaults of the new object, so the caller's properties are not modified.
    Properties pipelineProps = new Properties(props);
    String mentionSuffix = CorefProperties.useGoldMentions(props) ? "" : ", mention";
    String annotators;
    if (CorefProperties.conll(props)) {
      // Compare by value rather than identity, consistent with getHeadFinder.
      boolean chinese = Locale.CHINESE.equals(CorefProperties.getLanguage(props));
      annotators = (chinese ? "lemma, ner" : "lemma") + mentionSuffix;
    } else {
      String parser = CorefProperties.useConstituencyParse(props) ? "parse" : "depparse";
      annotators = "pos, lemma, ner, " + parser + mentionSuffix;
    }
    pipelineProps.setProperty("annotators", annotators);

    // NOTE(review): the boolean presumably disables annotator-requirement enforcement — confirm.
    coreNLP = new StanfordCoreNLP(pipelineProps, false);
    return coreNLP;
  }

  /**
   * Reads, annotates and converts the next document from the configured reader.
   *
   * <p>When constituency parsing is not in use, any {@link TreeCoreAnnotations.TreeAnnotation} is
   * removed from each sentence before the pipeline runs. For CoNLL input,
   * {@link CoreAnnotations.UseMarkedDiscourseAnnotation} is set to {@code true} on the annotation.
   *
   * @return the next document, or {@code null} when the reader has no more input
   * @throws IllegalStateException if no reader was configured (no input path in the properties)
   * @throws Exception anything thrown by the reader, the pipeline or document construction
   */
  public Document nextDoc() throws Exception {
    InputDoc input = requireReader().nextDoc();
    if (input == null) {
      return null;
    }

    if (!CorefProperties.useConstituencyParse(props)) {
      for (CoreMap sentence : input.annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        sentence.remove(TreeCoreAnnotations.TreeAnnotation.class);
      }
    }

    getStanfordCoreNLP(props).annotate(input.annotation);
    if (CorefProperties.conll(props)) {
      input.annotation.set(CoreAnnotations.UseMarkedDiscourseAnnotation.class, true);
    }
    return makeDocument(input);
  }

  /**
   * Resets the underlying reader by delegating to {@link DocReader#reset()}.
   *
   * @throws IllegalStateException if no reader was configured (no input path in the properties)
   */
  public void resetDocs() {
    requireReader().reset();
  }

  /**
   * Returns the reader, failing with a descriptive error instead of a bare
   * {@link NullPointerException} when none was configured.
   */
  private DocReader requireReader() {
    if (reader == null) {
      throw new IllegalStateException(
          "No document reader is available: CorefProperties.getInputPath(props) returned null");
    }
    return reader;
  }
}