package edu.stanford.nlp.quoteattribution;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer;
import edu.stanford.nlp.pipeline.QuoteAnnotator;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.XMLUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import java.io.*;
import java.util.*;
/**
* Created by mjfang on 12/18/16.
*/
public class XMLToAnnotation {
public static String getJustText(Node text)
{
StringBuilder sb = new StringBuilder();
NodeList textElems = text.getChildNodes();
for(int i = 0; i < textElems.getLength(); i++)
{
Node child = textElems.item(i);
String str = child.getTextContent();
//replace single occurrence of \n with " ", double occurrences with a single one.
str = str.replaceAll("\n(?!\n)", " ");
str = str.replaceAll("_", ""); //bug fix for sentence splitting
sb.append(str + " ");
}
return sb.toString();
}
//for standard annotations + quotes
public static Properties getProcessedCoreNLPProperties()
{
Properties props = new Properties();
props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, depparse, quote");
props.setProperty("ner.useSUTime","false");
props.setProperty("ner.applyNumericClassifiers","false");
props.setProperty("ssplit.newlineIsSentenceBreak","always");
props.setProperty("outputFormat","serialized");
props.setProperty("serializer","edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer");
props.setProperty("threads", "1");
return props;
}
public static void processCoreNLPIfDoesNotExist(File processedFile, Properties coreNLPProps, String text) {
if (!processedFile.exists()) {
try {
StanfordCoreNLP coreNLP = new StanfordCoreNLP(coreNLPProps);
Annotation processedAnnotation = coreNLP.process(text); //this document holds the split for paragraphs.
ProtobufAnnotationSerializer pas = new ProtobufAnnotationSerializer(true);
OutputStream fos = new BufferedOutputStream(new FileOutputStream(processedFile.getAbsolutePath()));
pas.write(processedAnnotation, fos);
} catch (IOException e) {
e.printStackTrace();
}
}
}
public static Annotation getAnnotatedFile(String text, String baseFileName, Properties props) throws IOException{
File processedFile = new File(baseFileName + ".ser.gz");
processCoreNLPIfDoesNotExist(processedFile, props, text);
Annotation doc = ExtractQuotesUtil.readSerializedProtobufFile(processedFile);
new QuoteAnnotator(new Properties()).annotate(doc); //important! Re-annotate to take into account that certain tokens are removed in the serialization process.
return doc;
}
public static List<Integer> readConnection(String connection) {
List<Integer> connectionList = new ArrayList<>();
if(connection.equals("")) {
return connectionList;
}
String[] connections = connection.split(",");
for(String c : connections) {
connectionList.add(Integer.parseInt(c.substring(1)));
}
return connectionList;
}
//return index of the token that ends this block of text.
//key assumption: blocks are delimited by tokens (i.e. no token spans two blocks.)
public static int getEndIndex(int startIndex, List<CoreLabel> tokens, String text)
{
text = text.trim(); //remove newlines that may throw off text length
int currIndex = startIndex;
CoreLabel token = tokens.get(startIndex);
int tokenBeginChar = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
int offset = text.indexOf(token.get(CoreAnnotations.OriginalTextAnnotation.class));
while(true) {
int tokenEndChar = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
if(tokenEndChar - tokenBeginChar == text.length()) {
return currIndex;
}
else if(tokenEndChar - tokenBeginChar > text.length()) {
return currIndex - 1;
}
currIndex++;
if(currIndex == tokens.size()) {
return currIndex - 1;
}
token = tokens.get(currIndex);
}
}
public static class GoldQuoteInfo {
public int mentionStartTokenIndex, mentionEndTokenIndex;
public String speaker, mention;
public GoldQuoteInfo(int mentionStartTokenIndex, int mentionEndTokenIndex, String speaker, String mention) {
this.mentionStartTokenIndex = mentionStartTokenIndex;
this.mentionEndTokenIndex = mentionEndTokenIndex;
this.speaker = speaker;
this.mention = mention;
}
}
public static class Data {
public List<GoldQuoteInfo> goldList; //the gold values (mention location and speaker name) of the quotes
public List<Person> personList;
public Annotation doc;
public Data(List<GoldQuoteInfo> goldList, List<Person> personList, Annotation doc) {
this.goldList = goldList;
this.personList = personList;
this.doc = doc;
}
}
public static List<Person> readXMLCharacterList(Document doc) {
List<Person> personList = new ArrayList<>();
NodeList characters = doc.getDocumentElement().getElementsByTagName("characters").item(0).getChildNodes();
for(int i = 0; i < characters.getLength(); i++)
{
Node child = characters.item(i);
if(child.getNodeName().equals("character")) {
String name = child.getAttributes().getNamedItem("name").getNodeValue();
char[] cName = name.toCharArray();
cName[0] = Character.toUpperCase(cName[0]);
name = new String(cName);
List<String> aliases = Arrays.asList(child.getAttributes().getNamedItem("aliases").getNodeValue().split(";"));
String gender = (child.getAttributes().getNamedItem("gender") == null) ? "" : child.getAttributes().getNamedItem("gender").getNodeValue();
personList.add(new Person(child.getAttributes().getNamedItem("name").getNodeValue(), gender, aliases));
}
}
return personList;
}
//write the character list to a file to work with the annotator
public static void writeCharacterList(String fileName, List<Person> personList) throws IOException {
StringBuilder text = new StringBuilder();
for(Person p : personList) {
String gender = "";
switch (p.gender) {
case MALE: gender = "M";
break;
case FEMALE: gender = "F";
break;
case UNK: gender = "";
break;
}
text.append(p.name + ";" + gender);
for (String alias : p.aliases) {
text.append(";" + alias);
}
text.append("\n");
}
PrintWriter pw = IOUtils.getPrintWriter(fileName);
pw.print(text);
pw.close();
}
protected static class Mention {
String text;
int begin, end;
public Mention(String text, int begin, int end) {
this.text = text;
this.begin = begin;
this.end = end;
}
}
public static Data readXMLFormat(String fileName) throws Exception {
//Extract character list, gold quote speaker and mention information from the XML document.
Document doc = XMLUtils.readDocumentFromFile(fileName);
Node text = doc.getDocumentElement().getElementsByTagName("text").item(0);
String docText = getJustText(text);
Annotation document = getAnnotatedFile(docText, fileName, getProcessedCoreNLPProperties());
List<CoreMap> quotes = document.get(CoreAnnotations.QuotationsAnnotation.class);
List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);
List<GoldQuoteInfo> goldList = new ArrayList<>();
Map<Integer, Mention> idToMention = new HashMap<>();
List<Person> personList = readXMLCharacterList(doc);
Map<String, List<Person>> personMap = QuoteAttributionUtils.readPersonMap(personList);
List<Pair<Integer, String>> mentionIdToSpeakerList = new ArrayList<>();
//there is at least 1 case in which the XML quote does not match up with the automatically-extracted quote. (Ex: quote by Mr. Collins that begins, "Hunsford, near Westerham, Kent, ...")
//as the dirty solution, we treat all quotes encapsulated within an XML quote as the same speaker (although this is not 100% accurate!)
int quoteIndex = 0;
NodeList textElems = text.getChildNodes();
int tokenIndex = 0;
for(int i = 0; i < textElems.getLength(); i++) {
Node chapterNode = textElems.item(i);
if(chapterNode.getNodeName().equals("chapter")) {
NodeList chapElems = chapterNode.getChildNodes();
for (int j = 0; j < chapElems.getLength(); j++) {
Node child = chapElems.item(j);
if (child.getNodeName().equals("quote")) {
//search for nested mentions
NodeList quoteChildren = child.getChildNodes();
for(int k = 0; k < quoteChildren.getLength(); k++)
{
Node quoteChild = quoteChildren.item(k);
if(quoteChild.getNodeName().equals("mention"))
{
String mentionText = quoteChild.getTextContent();
int id = Integer.parseInt(quoteChild.getAttributes().getNamedItem("id").getTextContent().substring(1));
List<Integer> connections = readConnection(quoteChild.getAttributes().getNamedItem("connection").getNodeValue());
int endIndex = getEndIndex(tokenIndex, tokens, mentionText);
// mentions.put(id, new XMLMention(quoteChild.getTextContent(), tokenIndex, endIndex, id, connections));
idToMention.put(id, new Mention(mentionText, tokenIndex, endIndex));
tokenIndex = endIndex + 1;
}
else{
String quoteText = quoteChild.getTextContent();
quoteText = quoteText.replaceAll("\n(?!\n)", " "); //trim unnecessarily newlines
quoteText = quoteText.replaceAll("_", "");
tokenIndex = getEndIndex(tokenIndex, tokens, quoteText) + 1;
}
}
String quoteText = child.getTextContent();
// tokenIndex = getEndIndex(tokenIndex, tokens, quoteText) + 1;
quoteText = quoteText.replaceAll("\n(?!\n)", " "); //trim unnecessarily newlines
quoteText = quoteText.replaceAll("_", "");
int quotationOffset = 1;
if (quoteText.startsWith("``"))
quotationOffset = 2;
List<Integer> connections = readConnection(child.getAttributes().getNamedItem("connection").getTextContent());
int id = Integer.parseInt(child.getAttributes().getNamedItem("id").getTextContent().substring(1));
Integer mention_id = null;
if (connections.size() > 0)
mention_id = connections.get(0);
else {
System.out.println("quote w/ no mention. ID: " + id);
}
// Pair<Integer, Integer> mentionPair = idToMentionPair.get(mention_id);
mentionIdToSpeakerList.add(new Pair<>(mention_id, child.getAttributes().getNamedItem("speaker").getTextContent()));
String annotatedQuoteText = quotes.get(quoteIndex).get(CoreAnnotations.TextAnnotation.class);
while(!quoteText.endsWith(annotatedQuoteText)) {
quoteIndex++;
annotatedQuoteText = quotes.get(quoteIndex).get(CoreAnnotations.TextAnnotation.class);
mentionIdToSpeakerList.add(new Pair<>(mention_id, child.getAttributes().getNamedItem("speaker").getTextContent()));
}
// idToMentionPair.put(id, new Pair<>(-1, -1));
// imention_id = connections.get(0);
// quotes.add(new XMLQuote(quoteText.substring(quotationOffset, quoteText.length() - quotationOffset), child.getAttributes().getNamedItem("speaker").getTextContent(), id, chapterIndex, mention_id));
quoteIndex++;
} else if (child.getNodeName().equals("mention")) {
String mentionText = child.getTextContent();
int id = Integer.parseInt(child.getAttributes().getNamedItem("id").getTextContent().substring(1));
List<Integer> connections = readConnection(child.getAttributes().getNamedItem("connection").getNodeValue());
int endIndex = getEndIndex(tokenIndex, tokens, mentionText);
idToMention.put(id, new Mention(mentionText, tokenIndex, endIndex));
// mentions.put(id, new XMLMention(child.getTextContent(), tokenIndex, endIndex, id, connections));
tokenIndex = endIndex + 1;
} else {//#text
String nodeText = child.getTextContent();
nodeText = nodeText.replaceAll("\n(?!\n)", " ");
nodeText = nodeText.replaceAll("_", "");
if(tokenIndex >= tokens.size()) {
continue;
}
tokenIndex = getEndIndex(tokenIndex, tokens, nodeText) + 1;
}
}
}
}
for(Pair<Integer, String> item : mentionIdToSpeakerList) {
Mention mention = idToMention.get(item.first);
if(mention == null) {
goldList.add(new GoldQuoteInfo(-1, -1, item.second, null));
} else {
goldList.add(new GoldQuoteInfo(mention.begin, mention.end, item.second, mention.text));
}
}
//verify
if(document.get(CoreAnnotations.QuotationsAnnotation.class).size() != goldList.size()) {
throw new RuntimeException("Quotes size and gold size don't match!");
}
return new Data(goldList, personList, document);
}
}