package edu.stanford.nlp.quoteattribution;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Created by michaelf on 12/30/15. Adapted from Grace Muzny's codebase
*/
public class BammanCorefReader {
/**
* The main output here is data/tokens/dickens.oliver.tokens, which contains the original book, one token per line, with part of speech, syntax, NER, coreference and other annotations. The (tab-separated) format is:
*
* Paragraph id
* Sentence id
* Token id
* Byte start
* Byte end
* Whitespace following the token (useful for pretty-printing the original text)
* Syntactic head id (-1 for the sentence root)
* Original token
* Normalized token (for quotes etc.)
* Lemma
* Penn Treebank POS tag
* NER tag (PERSON, NUMBER, DATE, DURATION, MISC, TIME, LOCATION, ORDINAL, MONEY, ORGANIZATION, SET, O)
* Stanford basic dependency label
* Within-quotation flag
* Character id (all coreferent tokens share the same character id)
*
* @param filename
*/
public static Map<Integer, List<CoreLabel>> readTokenFile(String filename, Annotation novel) {
List<String> lines = IOUtils.linesFromFile(filename);
Map<Integer, List<CoreLabel>> charsToTokens = new HashMap<>();
boolean first = true;
int tokenOffset = 0;
for (String line : lines) {
if (first) {
first = false;
continue;
}
String[] pieces = line.split("\t");
int tokenId = Integer.parseInt(pieces[2]) + tokenOffset;
String token = pieces[7];
String normalizedTok = pieces[8];
int characterId = Integer.parseInt(pieces[14]);
CoreLabel novelTok = novel.get(CoreAnnotations.TokensAnnotation.class).get(tokenId);
// CoreNLP sometimes splits ". . . ." as ". . ." and "." and sometimes lemmatizes it. (The Steppe)
if(pieces[7].equals(". . . .") && !novelTok.get(CoreAnnotations.OriginalTextAnnotation.class).equals(". . . .")) {
tokenOffset++;
}
if (characterId != -1) {
if (!novelTok.get(CoreAnnotations.TextAnnotation.class).equals(normalizedTok)) {
System.err.println(token + " != " + novelTok.get(CoreAnnotations.TextAnnotation.class));
} else {
if (!charsToTokens.containsKey(characterId)) {
charsToTokens.put(characterId, new ArrayList<>());
}
charsToTokens.get(characterId).add(novelTok);
}
}
}
return charsToTokens;
}
}