package edu.stanford.nlp.pipeline;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.paragraphs.ParagraphAnnotator;
import edu.stanford.nlp.quoteattribution.ChapterAnnotator;
import edu.stanford.nlp.quoteattribution.Person;
import edu.stanford.nlp.quoteattribution.QuoteAttributionUtils;
import edu.stanford.nlp.quoteattribution.Sieves.MSSieves.BaselineTopSpeakerSieve;
import edu.stanford.nlp.quoteattribution.Sieves.MSSieves.DeterministicSpeakerSieve;
import edu.stanford.nlp.quoteattribution.Sieves.MSSieves.LooseConversationalSpeakerSieve;
import edu.stanford.nlp.quoteattribution.Sieves.MSSieves.MSSieve;
import edu.stanford.nlp.quoteattribution.Sieves.MSSieves.MajoritySpeakerSieve;
import edu.stanford.nlp.quoteattribution.Sieves.QMSieves.*;
import edu.stanford.nlp.util.Timing;
import edu.stanford.nlp.util.logging.Redwood;
import edu.stanford.nlp.util.*;
import java.util.*;
/**
* An annotator that attributes quotes in a text to their speakers. It uses a two-stage process that first links quotes
* to mentions and then mentions to speakers. Each stage consists of a series of sieves that each try to make
* predictions on the quotes or mentions that have not been linked by previous sieves.
*
* The annotator will add the following annotations to each QuotationAnnotation:
* <ul>
* <li>MentionAnnotation : the text of the mention</li>
* <li>MentionBeginAnnotation : the beginning token index of the mention</li>
* <li>MentionEndAnnotation : the end token index of the mention</li>
* <li>MentionTypeAnnotation : the type of mention (pronoun, name, or animate noun)</li>
* <li>MentionSieveAnnotation : the sieve that made the mention prediction</li>
* <li>SpeakerAnnotation : the name of the speaker</li>
* <li>SpeakerSieveAnnotation : the name of the sieve that made the speaker prediction</li>
* </ul>
*
* The annotator has the following options:
* <ul>
* <li>quoteattribution.charactersPath (required): path to file containing the character names, aliases,
* and gender information.</li>
* <li>quoteattribution.booknlpCoref (required): path to tokens file generated from
* <a href="https://github.com/dbamman/book-nlp">book-nlp</a> containing coref information.</li>
* <li>quoteattribution.QMSieves: list of sieves to use in the quote to mention linking phase
* (default=tri,dep,onename,voc,paraend,conv,sup,loose). More information about the sieves can be found at our
* <a href="https://stanfordnlp.github.io/CoreNLP/quoteattribution.html">website</a>. </li>
* <li>quoteattribution.MSSieves: list of sieves to use in the mention to speaker linking phase
* (default=det,top).</li>
* <li>quoteattribution.modelPath: path to trained model file.</li>
* <li>quoteattribution.familyWordsFile: path to file with family words list.</li>
* <li>quoteattribution.animacyWordsFile: path to file with animacy words list.</li>
* <li>quoteattribution.genderNamesFile: path to file with names list with gender information.</li>
* </ul>
*
* @author Grace Muzny, Michael Fang
*/
public class QuoteAttributionAnnotator implements Annotator {

  /** Annotation key: the text of the mention linked to a quote. */
  public static class MentionAnnotation implements CoreAnnotation<String> {
    @Override
    public Class<String> getType() {
      return String.class;
    }
  }

  /** Annotation key: the beginning token index of the mention. */
  public static class MentionBeginAnnotation implements CoreAnnotation<Integer> {
    @Override
    public Class<Integer> getType() {
      return Integer.class;
    }
  }

  /** Annotation key: the end token index of the mention. */
  public static class MentionEndAnnotation implements CoreAnnotation<Integer> {
    @Override
    public Class<Integer> getType() {
      return Integer.class;
    }
  }

  /** Annotation key: the type of the mention (pronoun, name, or animate noun). */
  public static class MentionTypeAnnotation implements CoreAnnotation<String> {
    @Override
    public Class<String> getType() {
      return String.class;
    }
  }

  /** Annotation key: the sieve that made the mention prediction. */
  public static class MentionSieveAnnotation implements CoreAnnotation<String> {
    @Override
    public Class<String> getType() {
      return String.class;
    }
  }

  /** Annotation key: the name of the speaker. */
  public static class SpeakerAnnotation implements CoreAnnotation<String> {
    @Override
    public Class<String> getType() {
      return String.class;
    }
  }

  /** Annotation key: the name of the sieve that made the speaker prediction. */
  public static class SpeakerSieveAnnotation implements CoreAnnotation<String> {
    @Override
    public Class<String> getType() {
      return String.class;
    }
  }

  private static final Redwood.RedwoodChannels log = Redwood.channels(QuoteAttributionAnnotator.class);

  // settings
  public static final String DEFAULT_QMSIEVES = "tri,dep,onename,voc,paraend,conv,sup,loose";
  public static final String DEFAULT_MSSIEVES = "det,top";
  public static final String DEFAULT_MODEL_PATH = "edu/stanford/nlp/models/quoteattribution/quoteattribution_model.ser";

  // these paths go in the props file
  // NOTE(review): these fields are static but are overwritten by every constructor call, so the
  // most recently constructed annotator decides the values that all instances read. They are kept
  // public, static and non-final only for backward compatibility.
  public static String FAMILY_WORD_LIST = "edu/stanford/nlp/models/quoteattribution/family_words.txt";
  public static String ANIMACY_WORD_LIST = "edu/stanford/nlp/models/quoteattribution/animate.unigrams.txt";
  public static String GENDER_WORD_LIST = "edu/stanford/nlp/models/quoteattribution/gender_filtered.txt";
  public static String COREF_PATH = "";
  public static String MODEL_PATH = "edu/stanford/nlp/models/quoteattribution/quoteattribution_model.ser";
  public static String CHARACTERS_FILE = "";

  /** When true, {@link #annotate} rebuilds the character map from each document's entity mentions. */
  public boolean buildCharacterMapPerAnnotation = false;
  public static final Boolean VERBOSE = true;

  // fields
  private Set<String> animacyList;
  private Set<String> familyRelations;
  private Map<String, Person.Gender> genderMap;
  private Map<String, List<Person>> characterMap;
  private String qmSieveList;
  private String msSieveList;

  /**
   * Creates the annotator and loads its word lists and, when configured, its character map.
   *
   * <p>The following keys are read from {@code props}: {@code booknlpCoref}, {@code modelPath},
   * {@code charactersPath}, {@code QMSieves}, {@code MSSieves}, {@code familyWordsFile},
   * {@code animacyWordsFile} and {@code genderNamesFile}. If {@code charactersPath} is absent, no
   * character file is read and the character map is instead built per document from its entity
   * mentions (see {@link #entityMentionsToCharacterMap}).
   *
   * @param props configuration for this annotator
   */
  public QuoteAttributionAnnotator(Properties props) {
    Timing timer = null;

    COREF_PATH = props.getProperty("booknlpCoref", null);
    if (COREF_PATH == null) {
      log.err("Warning: no coreference map!");
    }
    MODEL_PATH = props.getProperty("modelPath", DEFAULT_MODEL_PATH);
    CHARACTERS_FILE = props.getProperty("charactersPath", null);
    if (CHARACTERS_FILE == null) {
      log.err("Warning: no characters file!");
    }
    qmSieveList = props.getProperty("QMSieves", DEFAULT_QMSIEVES);
    msSieveList = props.getProperty("MSSieves", DEFAULT_MSSIEVES);

    if (VERBOSE) {
      timer = new Timing();
      log.info("Loading QuoteAttribution coref [" + COREF_PATH + "]...");
      log.info("Loading QuoteAttribution characters [" + CHARACTERS_FILE + "]...");
    }

    // loading all our word lists
    FAMILY_WORD_LIST = props.getProperty("familyWordsFile", FAMILY_WORD_LIST);
    ANIMACY_WORD_LIST = props.getProperty("animacyWordsFile", ANIMACY_WORD_LIST);
    GENDER_WORD_LIST = props.getProperty("genderNamesFile", GENDER_WORD_LIST);
    familyRelations = QuoteAttributionUtils.readFamilyRelations(FAMILY_WORD_LIST);
    genderMap = QuoteAttributionUtils.readGenderedNounList(GENDER_WORD_LIST);
    animacyList = QuoteAttributionUtils.readAnimacyList(ANIMACY_WORD_LIST);

    // The decision must be based on the configured path: characterMap itself is still unassigned
    // at this point, so testing it would always skip loading the characters file.
    if (CHARACTERS_FILE != null) {
      characterMap = QuoteAttributionUtils.readPersonMap(CHARACTERS_FILE);
    } else {
      buildCharacterMapPerAnnotation = true;
    }

    if (timer != null) {
      timer.stop("done.");
    }
  }

  /**
   * If no character list is provided, produce a list of person names from the entity mentions
   * annotation. Replaces the current character map with one entry per mention whose named entity
   * tag is {@code PERSON}, keyed by the mention's {@code toString()} value.
   *
   * @param annotation document whose {@code MentionsAnnotation} is scanned; if that annotation is
   *     missing the character map is left empty
   */
  public void entityMentionsToCharacterMap(Annotation annotation) {
    characterMap = new HashMap<>();
    List<CoreMap> entityMentions = annotation.get(CoreAnnotations.MentionsAnnotation.class);
    if (entityMentions == null) {
      return;
    }
    for (CoreMap entityMention : entityMentions) {
      // constant-first comparison so a mention without an NER tag is skipped instead of throwing
      if ("PERSON".equals(entityMention.get(CoreAnnotations.NamedEntityTagAnnotation.class))) {
        String entityMentionString = entityMention.toString();
        // "UNK" and the empty list are presumably the gender and aliases — confirm against Person
        Person newPerson = new Person(entityMentionString, "UNK", new ArrayList<>());
        List<Person> newPersonList = new ArrayList<>();
        newPersonList.add(newPerson);
        characterMap.put(entityMentionString, newPersonList);
      }
    }
  }

  /**
   * Runs paragraph annotation and preprocessing on the document, then applies the configured
   * quote-to-mention sieves followed by the configured mention-to-speaker sieves, in the order
   * they are listed.
   *
   * @param annotation the document to annotate in place
   * @throws IllegalArgumentException if a configured sieve name is not a known sieve
   */
  @Override
  public void annotate(Annotation annotation) {
    // NOTE(review): when the document has no MentionsAnnotation the character map from a previous
    // call (or null on the first call) is reused — confirm whether it should be reset instead.
    if (buildCharacterMapPerAnnotation
        && annotation.containsKey(CoreAnnotations.MentionsAnnotation.class)) {
      entityMentionsToCharacterMap(annotation);
    }

    // 0. pre-preprocess the text with paragraph annotations
    // TODO: maybe move this out, definitely make it so that you can set paragraph breaks
    Properties propsPara = new Properties();
    propsPara.setProperty("paragraphBreak", "one");
    ParagraphAnnotator pa = new ParagraphAnnotator(propsPara, false);
    pa.annotate(annotation);

    // 1. preprocess the text
    // a) setup coref
    Map<Integer, String> pronounCorefMap =
        QuoteAttributionUtils.setupCoref(COREF_PATH, characterMap, annotation);

    // annotate chapter numbers in sentences. Useful for denoting chapter boundaries
    new ChapterAnnotator().annotate(annotation);

    // to incorporate sentences across paragraphs
    QuoteAttributionUtils.addEnhancedSentences(annotation);

    // annotate depparse of quote-removed sentences
    QuoteAttributionUtils.annotateForDependencyParse(annotation);

    // 2. Quote->Mention annotation
    Map<String, QMSieve> qmSieves = getQMMapping(annotation, pronounCorefMap);
    for (String sieveName : parseSieveNames(qmSieveList)) {
      requireSieve(qmSieves, sieveName, "QMSieves").doQuoteToMention(annotation);
    }

    // 3. Mention->Speaker annotation
    Map<String, MSSieve> msSieves = getMSMapping(annotation, pronounCorefMap);
    for (String sieveName : parseSieveNames(msSieveList)) {
      requireSieve(msSieves, sieveName, "MSSieves").doMentionToSpeaker(annotation);
    }
  }

  /** Splits a comma-separated sieve list into trimmed names, dropping blank entries and keeping order. */
  private static List<String> parseSieveNames(String sieveList) {
    List<String> names = new ArrayList<>();
    for (String rawName : sieveList.split(",")) {
      String name = rawName.trim();
      if (!name.isEmpty()) {
        names.add(name);
      }
    }
    return names;
  }

  /** Looks up a sieve by name, failing with a descriptive error instead of a bare NullPointerException. */
  private static <T> T requireSieve(Map<String, T> sieves, String sieveName, String propertyName) {
    T sieve = sieves.get(sieveName);
    if (sieve == null) {
      throw new IllegalArgumentException("Unknown sieve '" + sieveName + "' in " + propertyName
          + "; available sieves: " + new TreeSet<>(sieves.keySet()));
    }
    return sieve;
  }

  /** Builds every quote-to-mention sieve for the document, keyed by the name used in the QMSieves option. */
  private Map<String, QMSieve> getQMMapping(Annotation doc, Map<Integer, String> pronounCorefMap) {
    Map<String, QMSieve> map = new HashMap<>();
    map.put("tri", new TrigramSieve(doc, characterMap, pronounCorefMap, animacyList));
    map.put("dep", new DependencyParseSieve(doc, characterMap, pronounCorefMap, animacyList));
    map.put("onename", new OneNameSentenceSieve(doc, characterMap, pronounCorefMap, animacyList));
    map.put("voc", new VocativeSieve(doc, characterMap, pronounCorefMap, animacyList));
    map.put("paraend", new ParagraphEndQuoteClosestSieve(doc, characterMap, pronounCorefMap, animacyList));
    SupervisedSieve ss = new SupervisedSieve(doc, characterMap, pronounCorefMap, animacyList);
    ss.loadModel(MODEL_PATH);
    map.put("sup", ss);
    map.put("conv", new ConversationalSieve(doc, characterMap, pronounCorefMap, animacyList));
    map.put("loose", new LooseConversationalSieve(doc, characterMap, pronounCorefMap, animacyList));
    map.put("closest", new ClosestMentionSieve(doc, characterMap, pronounCorefMap, animacyList));
    return map;
  }

  /** Builds every mention-to-speaker sieve for the document, keyed by the name used in the MSSieves option. */
  private Map<String, MSSieve> getMSMapping(Annotation doc, Map<Integer, String> pronounCorefMap) {
    Map<String, MSSieve> map = new HashMap<>();
    map.put("det", new DeterministicSpeakerSieve(doc, characterMap, pronounCorefMap, animacyList));
    map.put("loose", new LooseConversationalSpeakerSieve(doc, characterMap, pronounCorefMap, animacyList));
    map.put("top", new BaselineTopSpeakerSieve(doc, characterMap, pronounCorefMap, animacyList, genderMap,
        familyRelations));
    map.put("maj", new MajoritySpeakerSieve(doc, characterMap, pronounCorefMap, animacyList));
    return map;
  }

  /** Returns the annotation keys this annotator declares as produced. */
  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    return new HashSet<>(Arrays.asList(
        MentionAnnotation.class,
        MentionBeginAnnotation.class,
        MentionEndAnnotation.class,
        MentionTypeAnnotation.class,
        MentionSieveAnnotation.class,
        SpeakerAnnotation.class,
        SpeakerSieveAnnotation.class,
        CoreAnnotations.ParagraphIndexAnnotation.class
    ));
  }

  /** Returns the annotation keys that must already be present before this annotator runs. */
  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    // ParagraphIndexAnnotation is intentionally not listed: annotate() runs its own
    // ParagraphAnnotator and this annotator declares that key as satisfied.
    return new HashSet<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.SentencesAnnotation.class,
        CoreAnnotations.CharacterOffsetBeginAnnotation.class,
        CoreAnnotations.CharacterOffsetEndAnnotation.class,
        CoreAnnotations.PartOfSpeechAnnotation.class,
        CoreAnnotations.LemmaAnnotation.class,
        CoreAnnotations.BeforeAnnotation.class,
        CoreAnnotations.AfterAnnotation.class,
        CoreAnnotations.TokenBeginAnnotation.class,
        CoreAnnotations.TokenEndAnnotation.class,
        CoreAnnotations.IndexAnnotation.class,
        CoreAnnotations.OriginalTextAnnotation.class
    ));
  }
}