package tathya.semantics;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.List;
import java.util.Vector;

import tathya.semantics.Word.Type;
import edu.stanford.nlp.trees.Tree;

/**
 * Extracts subject / predicate / object triples from a constituency parse tree.
 *
 * <p>The tree is walked breadth-first. Every node whose value starts with {@code "S"} and that
 * has both a child labelled {@code NP...} and a child labelled {@code VP...} produces one
 * {@link Triple}: the subject comes from the NP, the predicate and object from the VP.
 * Label tests are prefix tests on {@link Tree#nodeString()} and are applied to non-leaf nodes
 * only, so that a word in the sentence is never mistaken for a constituent label.
 */
public class TripletExtractor {

    public TripletExtractor() {
    }

    /**
     * Extracts one triple for every {@code S -> NP VP} pattern found anywhere in the tree.
     *
     * @param tree root of the parse tree; may be {@code null}
     * @return the triples in breadth-first discovery order (possibly empty), or {@code null}
     *         when {@code tree} is {@code null}
     */
    public ArrayList<Triple> extract(Tree tree) {
        if (tree == null) {
            return null;
        }

        ArrayList<Triple> triples = new ArrayList<Triple>();
        Deque<Tree> queue = new ArrayDeque<Tree>();
        queue.add(tree);

        while (!queue.isEmpty()) {
            Tree topNode = queue.poll();

            // Extract a triple if S -> NP VP is detected.
            if (isSentenceNode(topNode)) {
                Tree np = null;
                Tree vp = null;
                // When several NP/VP children exist, the last one of each kind wins.
                for (Tree child : topNode.children()) {
                    if (hasLabelPrefix(child, "NP")) {
                        np = child;
                    } else if (hasLabelPrefix(child, "VP")) {
                        vp = child;
                    }
                }
                // A clause missing either part yields no triple; its descendants are still visited.
                if (np != null && vp != null) {
                    triples.add(buildTriple(np, vp, tree));
                }
            }

            // Add all children to the queue regardless of whether a triple was produced.
            enqueueChildren(queue, topNode);
        }
        return triples;
    }

    /** Assembles a triple from the NP and VP children of a sentence node. */
    private Triple buildTriple(Tree np, Tree vp, Tree root) {
        Triple triple = new Triple();
        triple.setSubject(extractSubject(np, root));

        List<Word> predicateAndObj = extractPredicateAndObject(vp, root);
        if (predicateAndObj.size() > 0) {
            triple.setPredicate(predicateAndObj.get(0));
            if (predicateAndObj.size() > 1) {
                triple.setObject(predicateAndObj.get(1));
            }
        }
        return triple;
    }

    /**
     * Builds the subject word for a noun phrase.
     *
     * <p>Breadth-first search below {@code NP}: a nested {@code NP...} node restarts the search
     * inside it (discarding anything collected so far); {@code NN...} nodes contribute their word
     * to a space-separated subject; the search stops at the first other node met after at least
     * one noun was collected. Attributes are attached relative to the last noun node found.
     *
     * @param NP   subtree to search
     * @param root root of the whole parse tree, needed for parent look-ups
     * @return a {@code SUBJECT} word; its text is empty when no noun was found
     */
    private Word extractSubject(Tree NP, Tree root) {
        Tree subjectNode = null;
        StringBuilder text = new StringBuilder();
        boolean hasSubject = false;

        Deque<Tree> queue = new ArrayDeque<Tree>();
        enqueueChildren(queue, NP);

        while (!queue.isEmpty()) {
            Tree top = queue.poll();
            if (hasLabelPrefix(top, "NP")) {
                return extractSubject(top, root);
            } else if (hasLabelPrefix(top, "NN")) {
                subjectNode = top;
                text.append(firstChildValue(top)).append(' ');
                hasSubject = true;
            } else {
                if (hasSubject) {
                    break;
                }
                enqueueChildren(queue, top);
            }
        }

        Word subject = new Word(text.toString().trim(), Word.Type.SUBJECT);
        return extractAttributes(subjectNode, root, subject);
    }

    /**
     * Extracts the predicate and, when present, the object of a verb phrase.
     *
     * <p>Nested {@code VP...} nodes are expanded; the last {@code VB...} node met becomes the
     * predicate. {@code PP...}, {@code NP...} and {@code ADJ...} nodes seen after that verb are
     * object candidates, of which only the first is used.
     *
     * @param VP   verb-phrase subtree
     * @param root root of the whole parse tree, needed for parent look-ups
     * @return a two-element list: the predicate (built from a {@code null} string when no verb
     *         was found) followed by the object, which is {@code null} when there is none
     */
    private List<Word> extractPredicateAndObject(Tree VP, Tree root) {
        String predicateStr = null;
        Tree predicateNode = null;
        List<Tree> candidates = new ArrayList<Tree>();

        Deque<Tree> queue = new ArrayDeque<Tree>();
        enqueueChildren(queue, VP);

        while (!queue.isEmpty()) {
            Tree top = queue.poll();
            if (hasLabelPrefix(top, "VB")) {
                predicateNode = top;
                predicateStr = firstChildValue(top);
                // Candidates collected for an earlier verb do not belong to this one.
                candidates.clear();
            } else if (hasLabelPrefix(top, "VP")) {
                enqueueChildren(queue, top);
            } else if (hasLabelPrefix(top, "PP")
                    || hasLabelPrefix(top, "NP")
                    || hasLabelPrefix(top, "ADJ")) {
                candidates.add(top);
            }
        }

        Word predicate = new Word(predicateStr, Word.Type.PREDICATE);
        predicate = extractAttributes(predicateNode, root, predicate);

        Word object = null;
        if (!candidates.isEmpty()) {
            Tree first = candidates.get(0);
            if (hasLabelPrefix(first, "NP") || hasLabelPrefix(first, "PP")) {
                object = extractSubject(first, root);
            } else {
                object = extractAdj(first);
            }
        }
        if (object != null) {
            object.setType(Word.Type.OBJECT);
        }

        List<Word> result = new ArrayList<Word>();
        result.add(predicate);
        result.add(object);
        return result;
    }

    /**
     * Joins the words of all {@code JJ...} nodes below {@code subtree}, in breadth-first order,
     * into a single space-separated {@code OBJECT} word.
     */
    private Word extractAdj(Tree subtree) {
        StringBuilder adjectives = new StringBuilder();

        Deque<Tree> queue = new ArrayDeque<Tree>();
        enqueueChildren(queue, subtree);

        while (!queue.isEmpty()) {
            Tree top = queue.poll();
            if (hasLabelPrefix(top, "JJ")) {
                adjectives.append(firstChildValue(top)).append(' ');
            } else {
                enqueueChildren(queue, top);
            }
        }
        return new Word(adjectives.toString().trim(), Word.Type.OBJECT);
    }

    /**
     * Attaches attributes to {@code wrd} based on the neighbourhood of {@code node}.
     *
     * <ul>
     *   <li>{@code JJ...} node: the word of every {@code RB...} sibling.</li>
     *   <li>{@code VB...} node: every leaf word under each {@code ADV...} sibling.</li>
     *   <li>{@code NN...} node: every leaf word under each {@code DT, POS, JJ, PRP, CD, ADJ, QP}
     *       or {@code NP} prefixed sibling.</li>
     * </ul>
     * Afterwards, for any node, the leaf words of the first {@code PP...} child of the
     * grandparent are added as well.
     *
     * @param node node the word was taken from; may be {@code null}
     * @param root root of the whole parse tree, needed for parent look-ups
     * @param wrd  word to decorate
     * @return {@code wrd}, unchanged when {@code node} is {@code null} or has no parent in
     *         {@code root}
     */
    private Word extractAttributes(Tree node, Tree root, Word wrd) {
        if (node == null) {
            return wrd;
        }
        // Looked up once; without a parent there are neither siblings nor uncles to inspect.
        Tree parent = node.parent(root);
        if (parent == null) {
            return wrd;
        }

        Tree[] siblings = parent.children();
        if (hasLabelPrefix(node, "JJ")) {
            for (Tree sibling : siblings) {
                if (hasLabelPrefix(sibling, "RB")) {
                    wrd.addAttribute(new Word(firstChildValue(sibling), Word.Type.ATTRIBUTE));
                }
            }
        } else if (hasLabelPrefix(node, "VB")) {
            for (Tree sibling : siblings) {
                if (hasLabelPrefix(sibling, "ADV")) {
                    // The first child of an adverb phrase is itself a labelled node, so its
                    // value is a label rather than a word; collect the leaf words instead.
                    wrd.addAttributeAll(collectAllBFS(sibling, Word.Type.ATTRIBUTE));
                }
            }
        } else if (hasLabelPrefix(node, "NN")) {
            for (Tree sibling : siblings) {
                if (isNounModifier(sibling)) {
                    wrd.addAttributeAll(collectAllBFS(sibling, Word.Type.ATTRIBUTE));
                }
            }
        }

        // Search the uncles: only the first PP-prefixed one contributes.
        Tree grandParent = parent.parent(root);
        if (grandParent != null) {
            for (Tree uncle : grandParent.children()) {
                if (hasLabelPrefix(uncle, "PP")) {
                    wrd.addAttributeAll(collectAllBFS(uncle, Word.Type.ATTRIBUTE));
                    break;
                }
            }
        }
        return wrd;
    }

    /**
     * Collects every leaf strictly below {@code subtree}, in breadth-first order, as words of
     * type {@code t}.
     */
    private List<Word> collectAllBFS(Tree subtree, Type t) {
        List<Word> words = new ArrayList<Word>();

        Deque<Tree> queue = new ArrayDeque<Tree>();
        enqueueChildren(queue, subtree);

        while (!queue.isEmpty()) {
            Tree top = queue.poll();
            if (top.isLeaf()) {
                words.add(new Word(top.value(), t));
            }
            enqueueChildren(queue, top);
        }
        return words;
    }

    /** True for a non-leaf node whose value starts with {@code "S"}. */
    private static boolean isSentenceNode(Tree node) {
        return !node.isLeaf() && node.value() != null && node.value().startsWith("S");
    }

    /** True when {@code node} carries one of the label prefixes treated as noun modifiers. */
    private static boolean isNounModifier(Tree node) {
        return hasLabelPrefix(node, "DT")
                || hasLabelPrefix(node, "POS")
                || hasLabelPrefix(node, "JJ")
                || hasLabelPrefix(node, "PRP")
                || hasLabelPrefix(node, "CD")
                || hasLabelPrefix(node, "ADJ")
                || hasLabelPrefix(node, "QP")
                || hasLabelPrefix(node, "NP");
    }

    /**
     * True when {@code node} is not a leaf and its node string starts with {@code prefix}.
     * Leaves are excluded because their string is a word of the sentence, not a label.
     */
    private static boolean hasLabelPrefix(Tree node, String prefix) {
        return !node.isLeaf() && node.nodeString().startsWith(prefix);
    }

    /** Value of the first child of a non-leaf node. */
    private static String firstChildValue(Tree node) {
        return node.children()[0].value();
    }

    /** Appends the children of {@code node}, if any, to the end of {@code queue}. */
    private static void enqueueChildren(Deque<Tree> queue, Tree node) {
        Tree[] children = node.children();
        if (children == null) {
            return;
        }
        for (Tree child : children) {
            queue.add(child);
        }
    }
}