package info.ephyra.answerselection.filters;

import info.ephyra.nlp.NETagger;
import info.ephyra.nlp.SnowballStemmer;
import info.ephyra.nlp.indices.FunctionWords;
import info.ephyra.search.Result;
import info.ephyra.util.StringUtils;

import java.util.ArrayList;
import java.util.HashMap;

/**
 * <p>Prefers answer candidates for definitional questions that contain common
 * keywords.</p>
 * 
 * <p>This class extends the class <code>Filter</code>.</p>
 * 
 * @author Guido Sautter
 * @version 2008-02-15
 */
public class TermImportanceFilter extends Filter {
	/**
	 * Increments the score of each result snippet for each word in it,
	 * according to how often that word occurs across all result snippets.
	 * This is a simple centrality measure that favors snippets providing
	 * information which is given frequently and is thus likely to be more
	 * important with regard to the target.
	 * 
	 * @param results array of <code>Result</code> objects
	 * @return filtered array of <code>Result</code> objects
	 */
	public Result[] apply(Result[] results) {
		// raw results returned by the searchers
		HashMap<String, Integer> termCounters = new HashMap<String, Integer>();
		ArrayList<Result> rawResults = new ArrayList<Result>();
		int lengthSum = 0;
		
		// count how often each stemmed term occurs across all snippets
		for (Result r : results) {
			if (r.getScore() != Float.NEGATIVE_INFINITY) {
				String text = r.getAnswer();
				
				// tokenize snippet
				String[] sentence = NETagger.tokenize(text);
				lengthSum += sentence.length;
				
				// count stemmed terms
				for (int i = 0; i < sentence.length; i++) {
					String term = SnowballStemmer.stem(sentence[i].toLowerCase());
					if (term.length() > 1) {
						Integer count = (termCounters.containsKey(term)
								? termCounters.get(term)
								: new Integer(0));
						termCounters.put(term, new Integer(count.intValue() + 1));
					}
				}
			}
		}
		
		// score each snippet by the counts of its frequent terms
		for (Result r : results) {
			if (r.getScore() != Float.NEGATIVE_INFINITY) {
				String text = r.getAnswer();
				
				// tokenize snippet
				String[] sentence = NETagger.tokenize(text);
				float importance = 0;
				
				// sum up the counts of terms that are neither question keywords
				// nor function words
				for (int i = 0; i < sentence.length; i++) {
					String term = sentence[i];
					if ((term.length() > 1)
							&& !StringUtils.isSubsetKeywords(term,
									r.getQuery().getAnalyzedQuestion().getQuestion())
							&& !FunctionWords.lookup(term)) {
//					if (term.length() > 1) {
						term = SnowballStemmer.stem(term.toLowerCase());
						Integer count = (termCounters.containsKey(term)
								? termCounters.get(term)
								: new Integer(0));
						if (count.intValue() > Math.floor(Math.sqrt(results.length / 100)))
							importance += count.intValue();
//						if (count.intValue() > (results.length / 100))
//							importance += (((float) count.intValue()) / ((float) results.length));
					}
				}
				
				if (importance > 0) {
					r.incScore(importance);
					rawResults.add(r);
//					r.incScore((float) Math.sqrt(importance));
				}
			}
		}
		
		return rawResults.toArray(new Result[rawResults.size()]);
	}
}
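
/*
 * Minimal standalone sketch of the centrality scoring applied above, kept in this
 * file for illustration only; the class and method names are hypothetical and not
 * part of Ephyra. It assumes plain whitespace tokenization and lower-casing in
 * place of NETagger and SnowballStemmer, and it skips the question-keyword and
 * function-word checks: each snippet is simply scored by the summed corpus-wide
 * counts of its terms.
 */
class TermImportanceSketch {
	/** Scores each snippet by the total frequency of its terms across all snippets. */
	static float[] score(String[] snippets) {
		// count how often each term occurs across all snippets
		HashMap<String, Integer> counts = new HashMap<String, Integer>();
		for (String snippet : snippets)
			for (String token : snippet.toLowerCase().split("\\s+"))
				if (token.length() > 1) {
					int c = counts.containsKey(token) ? counts.get(token) : 0;
					counts.put(token, c + 1);
				}
		
		// a snippet's score is the sum of the counts of its terms
		float[] scores = new float[snippets.length];
		for (int i = 0; i < snippets.length; i++)
			for (String token : snippets[i].toLowerCase().split("\\s+"))
				if (token.length() > 1 && counts.containsKey(token))
					scores[i] += counts.get(token);
		return scores;
	}
}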