package edu.uncc.cs.watsonsim.scorers;

import java.util.Collections;
import java.util.List;

import edu.uncc.cs.watsonsim.Answer;
import edu.uncc.cs.watsonsim.Passage;
import edu.uncc.cs.watsonsim.Phrase;
import edu.uncc.cs.watsonsim.StringUtils;

/**
 * The Passage Term Match scorer counts how many times the terms of the
 * question appear among the terms of a passage.
 *
 * "This assigns a score by matching question terms to passage terms,
 * regardless of grammatical relationship or word order."
 *
 * The score is the total number of passage tokens that are equal to some
 * question token (a question token that occurs several times in the question
 * is counted once per occurrence in the question).
 *
 * @author Jonathan Shuman
 */
public class PassageTermMatch extends PassageScorer {

	/**
	 * Scores a passage by the number of question-term occurrences in it.
	 *
	 * @param q the question; only its {@code text} is read
	 * @param a the candidate answer; not used by this scorer
	 * @param p the passage; only its {@code text} is read
	 * @return the number of term matches, widened to {@code double}
	 */
	public double scorePassage(Phrase q, Answer a, Passage p) {
		// e.g. passage: "Romantic novelist Jane Austen once wrote the book Emma."
		// NOTE(review): the exact effect of join() depends on the declared type
		// of Passage.text, which is not visible here; the call is kept as-is.
		String passageText = StringUtils.join(p.text, " ");
		String questionText = q.text;
		return generateNumberTerms(questionText, passageText);
	}

	/**
	 * Counts whole-token matches between the query and the passage.
	 *
	 * Both texts are split with {@code StringUtils.tokenize}, and every query
	 * token is compared for equality against every passage token. Comparing
	 * tokens (rather than searching the re-joined passage string for the term
	 * as a substring) keeps a term such as "art" from being counted inside
	 * unrelated words such as "party".
	 *
	 * @param queryText The text of the query to search passages
	 * @param passageText The text of the passage
	 * @return Number of occurrences of words in query in the passage
	 */
	private int generateNumberTerms(String queryText, String passageText) {
		// Separate the text of the query and passage into terms.
		// NOTE(review): presumably tokenize() also drops stopwords, as the
		// original author's comments implied -- confirm against StringUtils.
		List<String> qTerms = StringUtils.tokenize(queryText);
		List<String> pTerms = StringUtils.tokenize(passageText);

		int matches = 0;
		// Add up, for each query term, how many passage tokens equal it.
		for (String term : qTerms) {
			// A null or empty term never contributed to the score; keep it that way.
			if (term == null || term.isEmpty()) {
				continue;
			}
			matches += Collections.frequency(pTerms, term);
		}
		return matches;
	}
}