package arkref.analysis;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import arkref.data.Document;
import arkref.data.Sentence;
import arkref.parsestuff.TregexPatternFactory;
import arkref.parsestuff.U;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
/**
 * Locates candidate mention nodes (noun phrases) in parsed sentences and
 * records them on the owning {@link Document}.
 */
public class FindMentions {

    /**
     * Tregex pattern selecting mention nodes: an NP that is not the head of
     * another NP, i.e. it needs to be the maximum projection of a head word,
     * or a conjunction.
     */
    private static final String MENTION_PATTERN = "NP !>># NP";

    /** Data location used by {@link #main(String[])} when no argument is supplied. */
    private static final String DEFAULT_DATA_PATH = "/d/arkref/data/lcross";

    /**
     * Walks every sentence of the document and calls
     * {@code d.newMention(sentence, node)} for each node returned by
     * {@link #findMentionNodes(Tree)} on that sentence's root node.
     *
     * @param d the document whose sentences are scanned
     */
    public static void go(Document d) {
        U.pl("\n*** Find Mentions ***\n");
        for (Sentence s : d.sentences()) {
            for (Tree match : findMentionNodes(s.rootNode())) {
                d.newMention(s, match);
            }
        }
    }

    /**
     * Collects the nodes under {@code root} that match the mention pattern,
     * in the order the Tregex matcher reports them.
     *
     * @param root root of the parse tree to search
     * @return a new list of matching non-leaf nodes; empty when nothing matches
     */
    public static List<Tree> findMentionNodes(Tree root) {
        List<Tree> res = new ArrayList<Tree>();
        TregexPattern pat = TregexPatternFactory.getPattern(MENTION_PATTERN);
        TregexMatcher matcher = pat.matcher(root);
        while (matcher.find()) {
            Tree t = matcher.getMatch();
            // Skip matches with no children: this handles the case where the
            // "NP" is itself a word (i.e., a terminal/leaf node).
            if (t.numChildren() == 0) {
                continue;
            }
            res.add(t);
        }
        return res;
    }

    /**
     * Command-line entry point: loads a document and runs {@link #go(Document)}.
     *
     * @param args optional; {@code args[0]} is the path passed to
     *             {@code Document.loadFiles}. When absent, the built-in
     *             default path is used.
     * @throws IOException propagated from {@code Document.loadFiles}
     */
    public static void main(String[] args) throws IOException {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_DATA_PATH;
        Document d = Document.loadFiles(path);
        FindMentions.go(d);
    }
}