package edu.uncc.cs.watsonsim.scorers;
import java.util.HashSet;
import edu.stanford.nlp.trees.Tree;
import edu.uncc.cs.watsonsim.Answer;
import edu.uncc.cs.watsonsim.Passage;
import edu.uncc.cs.watsonsim.Phrase;
/* @author Wlodek
* @author Sean Gallagher
*
* Create a score based on how many parse trees the candidate answer and the
* passage have in common. The question is accepted by scorePassage but is
* not used in the computation.
*
* This scorer can be very slow.
*/
public class CommonConstituents extends PassageScorer {

    /**
     * Count the trees that two phrases have in common.
     *
     * Every tree returned by {@code getTrees()} for each phrase is rendered
     * with {@code toString()}, and the result is the number of distinct
     * renderings that occur for both phrases. Each shared tree contributes
     * exactly 1 to the result, regardless of how large it is.
     *
     * @param t1 the first phrase to compare
     * @param t2 the second phrase to compare
     * @return the number of distinct tree strings present in both phrases
     */
    public static double getCommonSubtreeCount(Phrase t1, Phrase t2) {
        // Distinct string forms of every tree of the first phrase.
        HashSet<String> firstForms = new HashSet<>();
        for (Tree tree : t1.getTrees()) {
            firstForms.add(tree.toString());
        }

        // Keep the forms of the second phrase that the first one also has;
        // collecting into a set means duplicates are counted only once.
        HashSet<String> sharedForms = new HashSet<>();
        for (Tree tree : t2.getTrees()) {
            String form = tree.toString();
            if (firstForms.contains(form)) {
                sharedForms.add(form);
            }
        }

        // Weighting each match by tree.getLeaves().size() is an untried
        // alternative; it is unknown whether it would score better.
        return sharedForms.size();
    }

    /**
     * Score a passage by the number of trees it shares with the candidate
     * answer's text.
     *
     * @param q the question; not used by this scorer
     * @param a the candidate answer, whose text is wrapped in a new Phrase
     * @param p the passage to compare against the answer
     * @return the common tree count of the passage and the answer text
     */
    public double scorePassage(Phrase q, Answer a, Passage p) {
        Phrase answerPhrase = new Phrase(a.text);
        return getCommonSubtreeCount(p, answerPhrase);
    }
}