package arkref.analysis; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import arkref.data.Document; import arkref.data.Mention; import arkref.parsestuff.AnalysisUtilities; import arkref.parsestuff.TregexPatternFactory; import edu.stanford.nlp.trees.HeadFinder; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.trees.tregex.TregexMatcher; import edu.stanford.nlp.trees.tregex.TregexPattern; public class SyntacticPaths { /** * finds the closest candidate by looking at the syntactic path distance * * @param mention * @param candidates other mentions that appeared previously * @return */ public static Mention findBestCandidateByShortestPath(Mention mention, List<Mention> candidates, Document document) { int minLength = 1000000; int minIndex = 0; Mention res; List<Integer> pathLengths = scoreCandidatesByPathLength(mention, candidates, document); //Mention tmpCandidate; int tmp; for(int i=0; i<pathLengths.size(); i++){ tmp = pathLengths.get(i); String tmpS = ""; if(candidates.get(i).node() != null){ tmpS = candidates.get(i).node().yield().toString(); } // U.pl("distance:"+tmp+"\t"+tmpS); if(tmp < minLength){ minLength = tmp; minIndex = i; } } res = candidates.get(minIndex); return res; } public static List<Integer> scoreCandidatesByPathLength(Mention mention, List<Mention> candidates, Document doc) { List<Integer> pathLengths = new ArrayList<Integer>(); Iterator<Mention> iter = candidates.iterator(); Mention tmpCandidate; while(iter.hasNext()){ tmpCandidate = iter.next(); pathLengths.add(computePathLength(mention.node(), tmpCandidate.node(), doc.getTree())); } return pathLengths; } /** * * @param node1 * @param node2 * @param commonRoot should contain both node1 and node2 * @return */ public static int computePathLength(Tree node1, Tree node2, Tree commonRoot) { int res = 1000; /* //find the node in the tree that dominates both input nodes int len1 = 0; int len2 = 0; Tree tmpNode = node1; List<Tree> dominationPath; while(tmpNode != null){ dominationPath = tmpNode.dominationPath(node2); if(dominationPath != null){ len2 = dominationPath.size()-1; } tmpNode = tmpNode.parent(commonRoot); len1++; } //sum the distances from each input node to their common ancestor res = len1+len2;*/ List<Tree> path = commonRoot.pathNodeToNode(node1, node2); if(path != null){ res = path.size()-1; } //System.err.println(res+"\t"+node2.toString()); return res; } public static boolean aIsDominatedByB(Mention A, Mention B) { boolean bDominatesA = B.node().dominates(A.node()); return bDominatesA; } public static Tree getMaximalProjection(Tree parent, Tree root) { Tree res = parent; Tree tmp = parent; HeadFinder hf = AnalysisUtilities.getInstance().getHeadFinder(); Tree parentHead = parent.headTerminal(hf); while(tmp != null){ if(tmp.headTerminal(hf) == parentHead && tmp.parent(root) != null){ res = tmp; }else{ break; } tmp = res.parent(root); //System.err.println("\tp:"+parent.toString()+"\tpHead:"+parentHead+"\ttmp:"+tmp); } //System.err.println("node:"+parent.toString()+"\tmaxProjection:"+res.toString()); return res; } /** * Objects (and other verb arguments) can't refer with the subjects of the same clause, * unless the object is reflexive. * * e.g., in "The man gave him a book.", "him" != "man" * * @param m1 * @param m2 * @return */ public static boolean inSubjectObjectRelationship(Mention m1, Mention m2) { Tree t = m2.node(); Tree root = m2.getSentence().rootNode(); //return false if these mentions are not in the same sentence if(root != m1.getSentence().rootNode()){ return false; } Tree ancestor = t.parent(root); //find the subject of the clause that m2 is part of (try to do so even if there is embedding) while(ancestor != null && ancestor != root){ if(ancestor.label().value().equals("S")){ TregexPattern pat = TregexPatternFactory.getPattern("S < (NP=subject !$,, NP) < VP"); TregexMatcher matcher = pat.matcher(ancestor); if (matcher.find()) { Tree subj = matcher.getNode("subject"); return m1.node() == subj; } }else if(ancestor.label().value().equals("NP")){ //return false if m2 is not a maximally projected node. //This accounts for cases like Nintendo introduced its new console return false; } ancestor = ancestor.parent(root); } return false; } /** * Subjects cannot refer to NPs in non-finite subordinate clauses, prepositional phrases, etc. * modifying the same main clause * * e.g., in "To call John, he picked up the phone" he != John * in "To John, he was a stranger." he != John * in "Because John likes cars, he bought a Ferrari." he might be John * * @param m2 * @param m1 * @return */ public static boolean isSubjectAndMentionInAdjunctPhrase(Mention m1, Mention m2) { Tree t = m1.node(); Tree root = m1.getSentence().rootNode(); Tree clause = t.parent(root); if(!clause.label().value().equals("S")){ return false; } TregexPattern pat = TregexPatternFactory.getPattern("NP=np !>> (S < NP < VP >> S)"); TregexMatcher matcher = pat.matcher(clause); while(matcher.find()) { Tree np = matcher.getNode("np"); if(np == m2.node()) return true; } return false; } public static boolean isInQuotation(Mention m){ //find all quote nodes, and see if any of them c-command the mention node TregexPattern pat = TregexPatternFactory.getPattern("``"); TregexMatcher matcher = pat.matcher(m.getSentence().rootNode()); while(matcher.find()) { Tree quote = matcher.getMatch(); if(cCommands(quote, m.node(), m.getSentence().rootNode())){ return true; } } return false; } /** * There is a bug in the stanford Tree.cCommands method, I think * @return */ public static boolean cCommands(Tree n1, Tree n2, Tree root){ Tree n1Parent = n1.parent(root); for(Tree sibling: n1Parent.getChildrenAsList()){ if(sibling == n1){ continue; } if(sibling.dominates(n2)){ return true; } } return false; } }