package arkref.data;
import java.util.*;
import java.io.*;
import arkref.analysis.ARKref;
import arkref.analysis.Preprocess;
import arkref.parsestuff.AlignedSub;
import arkref.parsestuff.AnalysisUtilities;
import arkref.parsestuff.TregexPatternFactory;
import arkref.parsestuff.U;
import arkref.sent.SentenceBreaker;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.util.StringUtils;
public class Document implements Serializable {
private static final long serialVersionUID = 55739275200700333L;
private ArrayList<Sentence> sentences;
private ArrayList<Mention> mentions;
public NodeHashMap<Mention> node2mention;
private RefGraph refGraph;
private Tree docTree = null; //tree that includes all the trees for the sentences, in order, under a dummy node
private EntityGraph entGraph;
public Document() {
sentences = new ArrayList<Sentence>();
mentions = new ArrayList<Mention>();
node2mention = new NodeHashMap<Mention>();
refGraph = new RefGraph();
}
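/** Builds a Document directly from a list of sentence parse trees and the corresponding per-sentence entity tag strings. */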
public Document(List<Tree> trees, List<String> entityStrings) {
sentences = new ArrayList<Sentence>();
mentions = new ArrayList<Mention>();
node2mention = new NodeHashMap<Mention>();
refGraph = new RefGraph();
for(int i=0; i<trees.size(); i++){
Sentence sent = new Sentence(i);
Tree t = trees.get(i);
String entityString = entityStrings.get(i);
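// a failed parse is represented by a dummy tree whose first child is labeled "."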
boolean parseSuccess = !t.getChild(0).label().toString().equals(".");
sent.setStuff(t, entityString, parseSuccess);
sentences.add(sent);
}
}
/**
 * NOT USED
 *
 * If there is no mention for the given node, this walks up the tree
 * to try to find one, as in H&K EMNLP 2009. Such a method is necessary
 * because the test-data coreference labels may not match up exactly
 * with parse constituents.
 *
 * @param sentenceIndex index of the sentence containing the node
 * @param node the parse tree node to start from
 * @return the mention for the nearest dominating node, or null if none is found
 */
public Mention findMentionDominatingNode(int sentenceIndex, Tree node) {
Mention res = null;
Tree tmpNode = node;
if (sentenceIndex >= sentences.size()){
return null;
}
Sentence s = sentences.get(sentenceIndex);
do {
res = node2mention.get(s, tmpNode);
tmpNode = tmpNode.parent(s.rootNode());
} while(res == null && tmpNode != null);
return res;
}
/**
 * Given a span defined by indexes for the sentence, start token, and end token,
 * this method returns the smallest node that covers that span.
 *
 * @param sentenceIndex index of the sentence containing the span
 * @param spanStart index of the first token in the span (inclusive)
 * @param spanEnd index of the last token in the span (inclusive)
 * @return the smallest covering node, or null if the indexes are out of range
 */
public Tree findNodeThatCoversSpan(int sentenceIndex, int spanStart, int spanEnd){
if(sentenceIndex >= sentences.size()) {
return null;
}
Sentence sent = sentences.get(sentenceIndex);
return findNodeThatCoversSpan(sent, spanStart, spanEnd);
}
public Tree findNodeThatCoversSpan(Sentence sent, int spanStart, int spanEnd) {
List<Tree> leaves = sent.rootNode().getLeaves();
if(spanStart < 0 || leaves.size() == 0 || spanEnd >= leaves.size()) {
return null;
}
Tree startLeaf = leaves.get(spanStart);
Tree endLeaf = leaves.get(spanEnd);
return findNodeThatCoversSpan(sent, startLeaf, endLeaf);
}
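/** Walks up from startLeaf until reaching a node that dominates both startLeaf and endLeaf, and returns it. */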
public Tree findNodeThatCoversSpan(Sentence sent, Tree startLeaf, Tree endLeaf) {
Tree cur = startLeaf;
while(cur != null) {
if (cur.dominates(startLeaf) && cur.dominates(endLeaf))
return cur;
cur = cur.parent(sent.rootNode());
}
assert false : "got to top without finding covering span";
return cur;
}
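/** Returns the leaf (token) node at the given index within the given sentence's parse tree. */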
public Tree getLeaf(int sentenceIndex, int leafIndex) {
Sentence sent = sentences.get(sentenceIndex);
List<Tree> leaves = sent.rootNode().getLeaves();
return leaves.get(leafIndex);
}
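/**
 * Loads a Document from the preprocessed files for the given path: the .parse file
 * (one parse per line) and the .sst file (entity/supersense tag strings, one line per
 * sentence), as produced by Preprocess.
 */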
public static Document loadFiles(String path) throws IOException {
Document d = new Document();
String shortpath = Preprocess.shortPath(path);
String parseFilename = shortpath + ".parse";
String neFilename = shortpath + ".sst";
BufferedReader parseR = new BufferedReader(new FileReader(parseFilename));
BufferedReader sstR = new BufferedReader(new FileReader(neFilename));
String parseLine, sst;
int curSentId = 0;
while ( (parseLine = parseR.readLine()) != null) {
Sentence sent = new Sentence(++curSentId);
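// strip "=H" markers (presumably head annotations from preprocessing) so the line reads as a plain bracketed parse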
parseLine = parseLine.replace("=H ", " ");
Tree tree = null;
String[] fields = parseLine.split("\t");
if (fields.length == 1) {
// old format: the line is just the parse
tree = AnalysisUtilities.getInstance().readTreeFromString(parseLine);
sent.hasParse = true;
} else {
// newer format: tab-separated fields, with the parse in the third field
// and "ERROR" in the first field when parsing failed
tree = AnalysisUtilities.getInstance().readTreeFromString(fields[2]);
sent.hasParse = !fields[0].equals("ERROR");
}
Document.addNPsAbovePossessivePronouns(tree);
Document.addInternalNPStructureForRoleAppositives(tree);
sst = sstR.readLine();
sent.setStuff(tree, sst, sent.hasParse);
d.sentences.add(sent);
}
parseR.close();
sstR.close();
return d;
}
/** Re-runs sentence breaking on the .txt file to recover surface (character-offset) information,
 * after the parses etc. have been loaded.
 * @throws FileNotFoundException if the .txt file does not exist **/
public void loadSurfaceSentences(String path) throws FileNotFoundException {
if (! new File(path+".txt").exists()) {
throw new FileNotFoundException("Need the .txt file to re-break");
}
int i=0;
for (SentenceBreaker.Sentence s : AnalysisUtilities.cleanAndBreakSentences(U.readFile(path+".txt"))) {
sentences.get(i).surfSent = s;
i++;
}
}
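/** Loads surface sentence information from the .txt file if it has not been loaded yet. */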
public void ensureSurfaceSentenceLoad(String path) throws FileNotFoundException {
if (sentences.size()>0 && sentences.get(0).surfSent == null) {
loadSurfaceSentences(path);
}
}
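/** Returns the sentence whose surface character span contains the given document-level character offset; requires surface sentences to have been loaded. */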
public Sentence getSentenceContaining(int charOffset) {
for (Sentence s : sentences) {
if (s.surfSent.charStart <= charOffset && charOffset < s.surfSent.charEnd) {
return s;
}
}
assert false : "no sentence for char offset "+charOffset;
return null;
}
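/**
 * Inserts an NP node above each possessive pronoun (PRP$) inside an NP so that
 * the pronoun has its own maximal projection,
 * e.g. (NP (PRP$ his) (NN dog)) becomes (NP (NP (PRP$ his)) (NN dog)).
 */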
public static void addNPsAbovePossessivePronouns(Tree tree) {
TreeFactory factory = new LabeledScoredTreeFactory(); //TODO might want to keep this around to save time
String patS = "NP=parentnp < /^PRP\\$/=pro"; //needs to be the maximum projection of a head word
TregexPattern pat = TregexPatternFactory.getPattern(patS);
TregexMatcher matcher = pat.matcher(tree);
while (matcher.find()) {
Tree parentNP = matcher.getNode("parentnp");
Tree pro = matcher.getNode("pro");
Tree newNP = factory.newTreeNode("NP", new ArrayList<Tree>());
int index = parentNP.indexOf(pro);
newNP.addChild(pro);
parentNP.removeChild(index);
parentNP.addChild(index, newNP);
}
}
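/**
 * Adds internal NP structure for role appositives: a common noun (NN/NNS) that
 * immediately precedes a proper noun inside an NP, together with any adjacent
 * DT/JJ/ADVP/NN material to its left, is grouped under its own inner NP node
 * (e.g. the "painter" in "(NP (NN painter) (NNP Picasso))").
 */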
public static void addInternalNPStructureForRoleAppositives(Tree tree) {
TreeFactory factory = new LabeledScoredTreeFactory(); //TODO might want to keep this around to save time
String patS = "NP=parentnp < (NN|NNS=role . NNP|NNPS)";
TregexPattern pat = TregexPatternFactory.getPattern(patS);
TregexMatcher matcher = pat.matcher(tree);
Tree newNode;
while (matcher.find()) {
Tree parentNP = matcher.getNode("parentnp");
Tree roleNP = matcher.getNode("role");
Tree tmpTree;
newNode = factory.newTreeNode("NP", new ArrayList<Tree>());
int i = parentNP.indexOf(roleNP);
while(i>=0){
tmpTree = parentNP.getChild(i);
if(!tmpTree.label().value().matches("^(NN|NNS|DT|JJ|ADVP)$")){
break;
}
newNode.addChild(0, tmpTree);
parentNP.removeChild(i);
i--;
}
parentNP.addChild(i+1, newNode);
}
}
/** Iterates backwards through the document over the mentions that precede the given mention, limited to ARKref.Opts.sentenceWindow sentences back. **/
public Iterable<Mention> prevMentions(final Mention start) {
return new Iterable<Mention>() {
public Iterator<Mention> iterator() {
return new MentionRevIterIter(start);
}
};
}
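/** Reverse iterator over the mentions that precede a given starting mention, stopping once ARKref.Opts.sentenceWindow sentences have been crossed. */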
public class MentionRevIterIter implements Iterator<Mention> {
int mi = -1;
int startingSentence = -1;
public MentionRevIterIter(Mention start) {
startingSentence = start.getSentence().ID();
for (int i=0; i < mentions.size(); i++) {
if (mentions.get(i) == start) {
this.mi = i;
break;
}
}
assert mi != -1;
}
@Override
public boolean hasNext() {
if (mi==0)
return false;
Mention mNext = mentions.get(mi-1);
if (startingSentence - mNext.getSentence().ID() > ARKref.Opts.sentenceWindow)
return false;
return true;
}
@Override
public Mention next() {
mi--;
assert mi != -1;
Mention m = mentions.get(mi);
return m;
}
@Override
public void remove() {
throw new UnsupportedOperationException("can't remove from the mention iterator!");
}
}
/**
 * Makes a right-branching tree out of all the sentence trees,
 * e.g., (DOCROOT T1 (DOCROOT T2 (DOCROOT T3))).
 * This ensures that nodes in T3 are farther from nodes in T1
 * than they are from nodes in T2.
 *
 * @return the document-level tree (built lazily and cached)
 */
public Tree getTree() {
if(docTree == null){
TreeFactory factory = new LabeledScoredTreeFactory();
docTree = factory.newTreeNode("DOCROOT", new ArrayList<Tree>());
Tree tmpTree1 = docTree;
Tree tmpTree2;
for(int i=0; i<sentences.size(); i++){
tmpTree1.addChild(sentences.get(i).rootNode());
if(i<sentences.size()-1){
tmpTree2 = factory.newTreeNode("DOCROOT", new ArrayList<Tree>());
tmpTree1.addChild(tmpTree2);
tmpTree1 = tmpTree2;
}
}
}
return docTree;
}
/**
 * Saves doc-level token alignments (character offsets) in the analysis.Word objects
 * by aligning the tokenized sentences against the raw text.
 * Requires surfSent to be set on each of the document's sentences.
 * (The docText parameter is currently unused.)
 **/
public void doTokenAlignments(String docText) {
U.pl("*** Stanford <-> Raw Text alignment ***\n");
for (Sentence s : sentences) {
U.pf("S%-2d\t%s\n", s.ID(), StringUtils.join(s.tokens()));
// U.pl("SENTENCE WORDS " + s.words);
// U.pl("" + s.surfSent);
// U.pl("" + s.surfSent.rawText);
AlignedSub cleanText = AnalysisUtilities.moreCleanup(s.surfSent.rawText);
int[] wordAlignsInSent = AnalysisUtilities.alignTokens(cleanText.text, s.words);
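// compose the alignments: token index -> offset in the cleaned text -> offset in the raw sentence text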
for (int i=0; i<wordAlignsInSent.length; i++)
if (wordAlignsInSent[i] != -1)
wordAlignsInSent[i] = cleanText.alignments[wordAlignsInSent[i]];
// adjust to doc position
for (int i=0; i < s.words.size(); i++) {
if (wordAlignsInSent[i]==-1) {
s.words.get(i).charStart = -1;
} else {
s.words.get(i).charStart = s.surfSent.alignments[ wordAlignsInSent[i] ];
}
}
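// fallback heuristics for tokens that could not be aligned: anchor the first token
// at the sentence start, and place any later unaligned token right after its predecessor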
if (s.words != null && s.words.size()>0 && s.words.get(0).charStart==-1) {
s.words.get(0).charStart = s.surfSent.alignments[0];
}
for (int i=1; i < s.words.size(); i++) {
if (s.words.get(i).charStart==-1) {
Word prev = s.words.get(i-1);
s.words.get(i).charStart = prev.charStart + prev.token.length();
}
}
}
}
public List<Word> allWords() {
List<Word> allWords = new ArrayList<Word>();
for (Sentence s : sentences) {
for (Word w : s.words){
allWords.add(w);
}
}
return allWords;
}
public List<Mention> mentions() {
return mentions;
}
public List<Sentence> sentences() {
return sentences;
}
public RefGraph refGraph() {
return refGraph;
}
public void setEntGraph(EntityGraph entGraph) {
this.entGraph = entGraph;
}
public EntityGraph entGraph() {
return entGraph;
}
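/** Creates a new Mention with the next available ID for the given sentence and parse subtree (which may be null), registers it in the mention list and the node-to-mention map, and returns it. */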
public Mention newMention(Sentence s, Tree subtree) {
Mention mention = new Mention(mentions.size()+1, s, subtree);
mentions.add(mention);
if (subtree != null)
node2mention.put(s, subtree, mention);
return mention;
}
}