package edu.uncc.cs.watsonsim.scorers;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import edu.uncc.cs.watsonsim.Answer;
import edu.uncc.cs.watsonsim.Passage;
import edu.uncc.cs.watsonsim.Phrase;
import edu.uncc.cs.watsonsim.StringUtils;
/**
 * Passage scorer that counts how many distinct bigrams and skip-bigrams the
 * candidate answer text has in common with a passage.
 *
 * <p>A bigram is a pair of adjacent tokens; a skip-bigram is a pair of tokens
 * with exactly one token between them. Both kinds share one key space, so the
 * adjacent pair "Jane Austen" in an answer also matches "Jane X Austen" in a
 * passage.
 *
 * @author Sean Gallagher
 */
public class SkipBigram extends PassageScorer {
    /**
     * Delimiter placed between the two tokens of a pair so that different
     * pairs cannot collapse into the same key (e.g. "ab"+"c" vs "a"+"bc").
     * NOTE(review): assumes tokens never contain a space themselves - confirm
     * against the tokenizer.
     */
    private static final String TOKEN_SEPARATOR = " ";

    /**
     * Counts the distinct (skip-)bigrams shared by the answer and the passage.
     *
     * @param q the question; not used by this scorer
     * @param a the candidate answer, whose {@code text} is tokenized
     * @param p the passage, whose tokens are compared against the answer's
     * @return the size of the intersection of the two (skip-)bigram sets
     */
    public double scorePassage(Phrase q, Answer a, Passage p) {
        // e.g. answer: "Jane Austen"
        Set<String> answerPairs = generateBigrams(StringUtils.tokenize(a.text));
        // e.g. passage: "Romantic novelist Jane Austen once wrote the book Emma."
        Set<String> passagePairs = generateBigrams(p.getTokens());
        // answerPairs is a fresh set owned by this call, so mutating it is safe.
        answerPairs.retainAll(passagePairs);
        return answerPairs.size();
    }

    /**
     * Builds the set of bigrams and one-token-gap skip-bigrams of a token list.
     *
     * @param terms the tokens, in order
     * @return a new mutable set of token pairs joined by {@link #TOKEN_SEPARATOR};
     *         empty when fewer than two tokens are given
     */
    private Set<String> generateBigrams(List<String> terms) {
        Set<String> pairs = new HashSet<>();
        int count = terms.size();
        for (int i = 0; i < count - 1; i++) {
            String first = terms.get(i);
            // Adjacent pair: (i, i+1).
            pairs.add(first + TOKEN_SEPARATOR + terms.get(i + 1));
            if (i + 2 < count) {
                // Skip pair: (i, i+2), only when a token exists two positions ahead.
                pairs.add(first + TOKEN_SEPARATOR + terms.get(i + 2));
            }
        }
        return pairs;
    }
}