package arkref.data;

import java.util.*;
import java.io.*;

import com.aliasi.util.Math;

import arkref.analysis.ARKref;
import arkref.analysis.FindMentions;
import arkref.analysis.Preprocess;
import arkref.parsestuff.AlignedSub;
import arkref.parsestuff.AnalysisUtilities;
import arkref.parsestuff.TregexPatternFactory;
import arkref.parsestuff.U;
import arkref.sent.SentenceBreaker;

import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.util.IntPair;
import edu.stanford.nlp.util.StringUtils;

public class Document implements Serializable {
	private static final long serialVersionUID = 55739275200700333L;
	private ArrayList<Sentence> sentences;
	private ArrayList<Mention> mentions;
	public NodeHashMap<Mention> node2mention;
	private RefGraph refGraph;
	private Tree docTree = null; // tree that includes all the trees for the sentences, in order, under a dummy node
	private EntityGraph entGraph;

	public Document() {
		sentences = new ArrayList<Sentence>();
		mentions = new ArrayList<Mention>();
		node2mention = new NodeHashMap<Mention>();
		refGraph = new RefGraph();
	}

	public Document(List<Tree> trees, List<String> entityStrings) {
		sentences = new ArrayList<Sentence>();
		mentions = new ArrayList<Mention>();
		node2mention = new NodeHashMap<Mention>();
		refGraph = new RefGraph();

		for (int i = 0; i < trees.size(); i++) {
			Sentence sent = new Sentence(i);
			Tree t = trees.get(i);
			String entityString = entityStrings.get(i);
			boolean parseSuccess = !t.getChild(0).label().toString().equals(".");
			sent.setStuff(t, entityString, parseSuccess);
			sentences.add(sent);
		}
	}

	/**
	 * NOT USED
	 *
	 * If there is no mention for the given node, this walks up the tree
	 * to try to find one, as in H&K EMNLP 09. Such a method is necessary
	 * because the test data's coref labels may not match up exactly with constituents.
	 *
	 * @param sentenceIndex
	 * @param node
	 * @return
	 */
	public Mention findMentionDominatingNode(int sentenceIndex, Tree node) {
		Mention res = null;
		Tree tmpNode = node;
		if (sentenceIndex >= sentences.size()) {
			return null;
		}
		Sentence s = sentences.get(sentenceIndex);
		do {
			res = node2mention.get(s, tmpNode);
			tmpNode = tmpNode.parent(s.rootNode());
		} while (res == null && tmpNode != null);
		return res;
	}
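	/* Illustrative sketch, not part of the original class: how the walk-up in
	 * findMentionDominatingNode() behaves when a labeled span does not line up with a
	 * constituent. Suppose a sentence parses "the old man" as
	 * (NP (DT the) (JJ old) (NN man)) and node2mention only has an entry for the full NP.
	 * Then a call such as
	 *
	 *     Mention m = doc.findMentionDominatingNode(sentIdx, manLeaf);  // hypothetical doc, index, and leaf
	 *
	 * misses on the (NN man) node, climbs to the NP via Tree.parent(), and returns that
	 * NP's Mention.
	 */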
	/**
	 * Given a span defined by indexes for the sentence, start token, and end token,
	 * this method returns the smallest node that includes that span.
	 *
	 * @param sentenceIndex
	 * @param spanStart inclusive
	 * @param spanEnd inclusive
	 * @return
	 */
	public Tree findNodeThatCoversSpan(int sentenceIndex, int spanStart, int spanEnd) {
		if (sentenceIndex >= sentences.size()) {
			return null;
		}
		Sentence sent = sentences.get(sentenceIndex);
		return findNodeThatCoversSpan(sent, spanStart, spanEnd);
	}

	public Tree findNodeThatCoversSpan(Sentence sent, int spanStart, int spanEnd) {
		List<Tree> leaves = sent.rootNode().getLeaves();
		if (spanStart < 0 || leaves.size() == 0 || spanEnd >= leaves.size()) {
			return null;
		}
		Tree startLeaf = leaves.get(spanStart);
		Tree endLeaf = leaves.get(spanEnd);
		return findNodeThatCoversSpan(sent, startLeaf, endLeaf);
	}

	public Tree findNodeThatCoversSpan(Sentence sent, Tree startLeaf, Tree endLeaf) {
		Tree cur = startLeaf;
		while (cur != null) {
			if (cur.dominates(startLeaf) && cur.dominates(endLeaf))
				return cur;
			cur = cur.parent(sent.rootNode());
		}
		assert false : "got to top without finding covering span";
		return cur;
	}

	public Tree getLeaf(int sentenceIndex, int leafIndex) {
		Sentence sent = sentences.get(sentenceIndex);
		List<Tree> leaves = sent.rootNode().getLeaves();
		return leaves.get(leafIndex);
	}

	public static Document loadFiles(String path) throws IOException {
		Document d = new Document();
		String shortpath = Preprocess.shortPath(path);
		String parseFilename = shortpath + ".parse";
		String neFilename = shortpath + ".sst";
		BufferedReader parseR = new BufferedReader(new FileReader(parseFilename));
		BufferedReader sstR = new BufferedReader(new FileReader(neFilename));
		String parseLine, sst;
		int curSentId = 0;

		while ((parseLine = parseR.readLine()) != null) {
			Sentence sent = new Sentence(++curSentId);
			parseLine = parseLine.replace("=H ", " ");
			Tree tree = null;
			if (parseLine.split("\t").length == 1) {
				// old version: just the parse
				tree = AnalysisUtilities.getInstance().readTreeFromString(parseLine);
				sent.hasParse = true;
			} else {
				tree = AnalysisUtilities.getInstance().readTreeFromString(parseLine.split("\t")[2]);
				sent.hasParse = !parseLine.split("\t")[0].equals("ERROR");
			}
			Document.addNPsAbovePossessivePronouns(tree);
			Document.addInternalNPStructureForRoleAppositives(tree);

			sst = sstR.readLine();
			sent.setStuff(tree, sst, sent.hasParse);
			d.sentences.add(sent);
		}
		parseR.close();
		sstR.close();
		return d;
	}
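	/* Illustrative note; the exact file layout is an assumption inferred from the parsing
	 * code above, not documented in the original source. loadFiles() expects
	 * shortpath + ".parse" and shortpath + ".sst" to be line-aligned, one sentence per line.
	 * A .parse line is either a bare tree, e.g.
	 *
	 *     (ROOT (S (NP (NNP John)) (VP (VBD slept)) (. .)))
	 *
	 * or a tab-separated record whose first field is a status flag ("ERROR" marks a failed
	 * parse) and whose third field is the tree; the matching .sst line carries the tags
	 * passed to Sentence.setStuff(). A typical call looks like
	 *
	 *     Document d = Document.loadFiles("data/mydoc");  // hypothetical path; filenames come from Preprocess.shortPath(path)
	 */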
	/**
	 * do sentence breaking (again) on the .txt file for surface info, after parses etc. have been loaded
	 * @throws FileNotFoundException
	 **/
	public void loadSurfaceSentences(String path) throws FileNotFoundException {
		if (!new File(path + ".txt").exists()) {
			throw new FileNotFoundException("Need the .txt file to re-break");
		}
		int i = 0;
		for (SentenceBreaker.Sentence s : AnalysisUtilities.cleanAndBreakSentences(U.readFile(path + ".txt"))) {
			sentences.get(i).surfSent = s;
			i++;
		}
	}

	public void ensureSurfaceSentenceLoad(String path) throws FileNotFoundException {
		if (sentences.size() > 0 && sentences.get(0).surfSent == null) {
			loadSurfaceSentences(path);
		}
	}

	public Sentence getSentenceContaining(int charOffset) {
		for (Sentence s : sentences) {
			if (s.surfSent.charStart <= charOffset && charOffset < s.surfSent.charEnd) {
				return s;
			}
		}
		assert false : "no sentence for char offset " + charOffset;
		return null;
	}

	public static void addNPsAbovePossessivePronouns(Tree tree) {
		TreeFactory factory = new LabeledScoredTreeFactory(); // TODO might want to keep this around to save time
		String patS = "NP=parentnp < /^PRP\\$/=pro"; // needs to be the maximum projection of a head word
		TregexPattern pat = TregexPatternFactory.getPattern(patS);
		TregexMatcher matcher = pat.matcher(tree);
		while (matcher.find()) {
			Tree parentNP = matcher.getNode("parentnp");
			Tree pro = matcher.getNode("pro");
			Tree newNP = factory.newTreeNode("NP", new ArrayList<Tree>());
			int index = parentNP.indexOf(pro);
			newNP.addChild(pro);
			parentNP.removeChild(index);
			parentNP.addChild(index, newNP);
		}
	}

	public static void addInternalNPStructureForRoleAppositives(Tree tree) {
		TreeFactory factory = new LabeledScoredTreeFactory(); // TODO might want to keep this around to save time
		String patS = "NP=parentnp < (NN|NNS=role . NNP|NNPS)";
		TregexPattern pat = TregexPatternFactory.getPattern(patS);
		TregexMatcher matcher = pat.matcher(tree);
		Tree newNode;
		while (matcher.find()) {
			Tree parentNP = matcher.getNode("parentnp");
			Tree roleNP = matcher.getNode("role");
			Tree tmpTree;
			newNode = factory.newTreeNode("NP", new ArrayList<Tree>());
			int i = parentNP.indexOf(roleNP);
			while (i >= 0) {
				tmpTree = parentNP.getChild(i);
				if (!tmpTree.label().value().matches("^NN|NNS|DT|JJ|ADVP$")) {
					break;
				}
				newNode.addChild(0, tmpTree);
				parentNP.removeChild(i);
				i--;
			}
			parentNP.addChild(i + 1, newNode);
		}
	}

	/** goes backwards through the document **/
	public Iterable<Mention> prevMentions(final Mention start) {
		return new Iterable<Mention>() {
			public Iterator<Mention> iterator() {
				return new MentionRevIterIter(start);
			}
		};
	}

	public class MentionRevIterIter implements Iterator<Mention> {
		int mi = -1;
		int startingSentence = -1;

		public MentionRevIterIter(Mention start) {
			startingSentence = start.getSentence().ID();
			for (int i = 0; i < mentions.size(); i++) {
				if (mentions.get(i) == start) {
					this.mi = i;
					break;
				}
			}
			assert mi != -1;
		}

		@Override
		public boolean hasNext() {
			if (mi == 0) return false;
			Mention mNext = mentions.get(mi - 1);
			if (startingSentence - mNext.getSentence().ID() > ARKref.Opts.sentenceWindow)
				return false;
			return true;
		}

		@Override
		public Mention next() {
			mi--;
			assert mi != -1;
			Mention m = mentions.get(mi);
			return m;
		}

		@Override
		public void remove() {
			throw new RuntimeException("can't remove from the mention iterator!");
		}
	}
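	/* Illustrative sketch, not part of the original class: prevMentions() yields candidate
	 * antecedents starting from the mention just before `start` and moving back toward the
	 * beginning of the document, stopping once a candidate's sentence is more than
	 * ARKref.Opts.sentenceWindow sentences earlier than the starting mention's sentence.
	 * A resolver loop over it might look like the following (the agreement check and the
	 * RefGraph call are assumptions, not APIs confirmed here):
	 *
	 *     for (Mention cand : doc.prevMentions(pronoun)) {
	 *         if (agrees(cand, pronoun)) {               // hypothetical compatibility test
	 *             doc.refGraph().setRef(pronoun, cand);  // assumed RefGraph method
	 *             break;
	 *         }
	 *     }
	 */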
	/**
	 * Makes a right-branching tree out of all the sentence trees,
	 * e.g., (DOCROOT T1 (DOCROOT T2 (DOCROOT T3))).
	 * This makes sure that nodes in T3 are further from nodes in T1
	 * than they are from nodes in T2.
	 *
	 * @return
	 */
	public Tree getTree() {
		if (docTree == null) {
			TreeFactory factory = new LabeledScoredTreeFactory();
			docTree = factory.newTreeNode("DOCROOT", new ArrayList<Tree>());
			Tree tmpTree1 = docTree;
			Tree tmpTree2;
			for (int i = 0; i < sentences.size(); i++) {
				tmpTree1.addChild(sentences.get(i).rootNode());
				if (i < sentences.size() - 1) {
					tmpTree2 = factory.newTreeNode("DOCROOT", new ArrayList<Tree>());
					tmpTree1.addChild(tmpTree2);
					tmpTree1 = tmpTree2;
				}
			}
		}
		return docTree;
	}

	/**
	 * Saves doc-level token alignments in the analysis.Word objects.
	 * Requires surfSent's in the document's sentences.
	 **/
	public void doTokenAlignments(String docText) {
		U.pl("*** Stanford <-> Raw Text alignment ***\n");
		for (Sentence s : sentences) {
			U.pf("S%-2d\t%s\n", s.ID(), StringUtils.join(s.tokens()));
			// U.pl("SENTENCE WORDS " + s.words);
			// U.pl("" + s.surfSent);
			// U.pl("" + s.surfSent.rawText);
			AlignedSub cleanText = AnalysisUtilities.moreCleanup(s.surfSent.rawText);
			int[] wordAlignsInSent = AnalysisUtilities.alignTokens(cleanText.text, s.words);
			for (int i = 0; i < wordAlignsInSent.length; i++)
				if (wordAlignsInSent[i] != -1)
					wordAlignsInSent[i] = cleanText.alignments[wordAlignsInSent[i]];

			// adjust to doc position
			for (int i = 0; i < s.words.size(); i++) {
				if (wordAlignsInSent[i] == -1) {
					s.words.get(i).charStart = -1;
				} else {
					s.words.get(i).charStart = s.surfSent.alignments[wordAlignsInSent[i]];
				}
			}
			if (s.words != null && s.words.size() > 0 && s.words.get(0).charStart == -1) {
				s.words.get(0).charStart = s.surfSent.alignments[0];
			}
			for (int i = 1; i < s.words.size(); i++) {
				if (s.words.get(i).charStart == -1) {
					Word prev = s.words.get(i - 1);
					s.words.get(i).charStart = prev.charStart + prev.token.length();
				}
			}
		}
	}

	public List<Word> allWords() {
		List<Word> allWords = new ArrayList<Word>();
		for (Sentence s : sentences) {
			for (Word w : s.words) {
				allWords.add(w);
			}
		}
		return allWords;
	}

	public List<Mention> mentions() {
		return mentions;
	}

	public List<Sentence> sentences() {
		return sentences;
	}

	public RefGraph refGraph() {
		return refGraph;
	}

	public void setEntGraph(EntityGraph entGraph) {
		this.entGraph = entGraph;
	}

	public EntityGraph entGraph() {
		return entGraph;
	}

	public Mention newMention(Sentence s, Tree subtree) {
		Mention mention = new Mention(mentions.size() + 1, s, subtree);
		mentions.add(mention);
		if (subtree != null)
			node2mention.put(s, subtree, mention);
		return mention;
	}
}
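/* Illustrative end-to-end sketch, not part of the original class; the ordering and the use
 * of FindMentions are assumptions about how callers typically drive this class:
 *
 *     Document doc = Document.loadFiles(path);           // reads .parse and .sst, via Preprocess.shortPath(path)
 *     doc.ensureSurfaceSentenceLoad(path);               // needs path + ".txt" on disk
 *     doc.doTokenAlignments(U.readFile(path + ".txt"));  // fills Word.charStart with doc-level offsets
 *     // Mention finding/resolution (e.g., arkref.analysis.FindMentions) is then assumed to
 *     // populate mentions(), refGraph(), and entGraph().
 */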