package info.ephyra.answerselection.filters;

import info.ephyra.nlp.NETagger;
import info.ephyra.nlp.SnowballStemmer;
import info.ephyra.nlp.indices.FunctionWords;
import info.ephyra.search.Result;
import info.ephyra.util.StringUtils;

import java.util.ArrayList;
import java.util.HashMap;

/**
 * <p>Prefers answer candidates for definitional questions that contain common
 * keywords.</p>
 * 
 * <p>This class extends the class <code>Filter</code>.</p>
 * 
 * @author Guido Sautter
 * @version 2008-02-15
 */
public class TermImportanceFilter extends Filter {
	/**
	 * Increments the score of each result snippet for each word in it,
	 * according to how often that word occurs across all result snippets.
	 * This is a simple centrality measure that favors snippets providing
	 * information which is given frequently and is thus likely to be more
	 * important with regard to the target.
	 * 
	 * @param results array of <code>Result</code> objects
	 * @return filtered array of <code>Result</code> objects
	 */
	public Result[] apply(Result[] results) {
		// raw results returned by the searchers
		HashMap<String, Integer> termCounters = new HashMap<String, Integer>();
		ArrayList<Result> rawResults = new ArrayList<Result>();
		int lengthSum = 0;
		
		// count how often each stemmed term occurs across all snippets
		for (Result r : results) {
			if (r.getScore() != Float.NEGATIVE_INFINITY) {
				String text = r.getAnswer();
				
				// tokenize snippet
				String[] sentence = NETagger.tokenize(text);
				lengthSum += sentence.length;
				
				// count stemmed terms
				for (int i = 0; i < sentence.length; i++) {
					String term = SnowballStemmer.stem(sentence[i].toLowerCase());
					if (term.length() > 1) {
						Integer count = (termCounters.containsKey(term)
								? termCounters.get(term)
								: new Integer(0));
						termCounters.put(term, new Integer(count.intValue() + 1));
					}
				}
			}
		}
		
		// score each snippet by the counts of its frequent terms
		for (Result r : results) {
			if (r.getScore() != Float.NEGATIVE_INFINITY) {
				String text = r.getAnswer();
				
				// tokenize snippet
				String[] sentence = NETagger.tokenize(text);
				float importance = 0;
				
				// sum up the counts of terms that are neither question keywords
				// nor function words
				for (int i = 0; i < sentence.length; i++) {
					String term = sentence[i];
					if ((term.length() > 1)
							&& !StringUtils.isSubsetKeywords(term,
									r.getQuery().getAnalyzedQuestion().getQuestion())
							&& !FunctionWords.lookup(term)) {
//					if (term.length() > 1) {
						term = SnowballStemmer.stem(term.toLowerCase());
						Integer count = (termCounters.containsKey(term)
								? termCounters.get(term)
								: new Integer(0));
						if (count.intValue() > Math.floor(Math.sqrt(results.length / 100)))
							importance += count.intValue();
//						if (count.intValue() > (results.length / 100))
//							importance += (((float) count.intValue()) / ((float) results.length));
					}
				}
				
				if (importance > 0) {
					r.incScore(importance);
					rawResults.add(r);
//					r.incScore((float) Math.sqrt(importance));
				}
			}
		}
		
		return rawResults.toArray(new Result[rawResults.size()]);
	}
}
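
/*
 * Minimal standalone sketch of the centrality scoring applied above, kept in this
 * file for illustration only; the class and method names are hypothetical and not
 * part of Ephyra. It assumes plain whitespace tokenization and lower-casing in
 * place of NETagger and SnowballStemmer, and it skips the question-keyword and
 * function-word checks: each snippet is simply scored by the summed corpus-wide
 * counts of its terms.
 */
class TermImportanceSketch {
	/** Scores each snippet by the total frequency of its terms across all snippets. */
	static float[] score(String[] snippets) {
		// count how often each term occurs across all snippets
		HashMap<String, Integer> counts = new HashMap<String, Integer>();
		for (String snippet : snippets)
			for (String token : snippet.toLowerCase().split("\\s+"))
				if (token.length() > 1) {
					int c = counts.containsKey(token) ? counts.get(token) : 0;
					counts.put(token, c + 1);
				}
		
		// a snippet's score is the sum of the counts of its terms
		float[] scores = new float[snippets.length];
		for (int i = 0; i < snippets.length; i++)
			for (String token : snippets[i].toLowerCase().split("\\s+"))
				if (token.length() > 1 && counts.containsKey(token))
					scores[i] += counts.get(token);
		return scores;
	}
}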