package info.ephyra.answerselection.filters;
import info.ephyra.nlp.NETagger;
import info.ephyra.nlp.indices.FunctionWords;
import info.ephyra.search.Result;
import java.util.ArrayList;
import java.util.Locale;
import java.util.regex.Pattern;
/**
* <p>Drops answer candidates for definitional questions that are enumerations
* of proper names.</p>
*
* <p>This class extends the class <code>Filter</code>.</p>
*
* @author Guido Sautter
* @version 2008-02-15
*/
public class ProperNameFilter extends Filter {
	/**
	 * Matches a token that starts with an upper case ASCII letter, followed by
	 * any characters other than line terminators.
	 */
	private static final Pattern UPPER_CASE_TOKEN =
			Pattern.compile("[A-Z]++.*+");
	
	/**
	 * Matches a token that starts with a lower case ASCII letter, followed by
	 * any characters other than line terminators.
	 */
	private static final Pattern LOWER_CASE_TOKEN =
			Pattern.compile("[a-z]++.*+");
	
	/** Matches a token that consists of ASCII digits only. */
	private static final Pattern NUMBER_TOKEN = Pattern.compile("[0-9]++");
	
	/**
	 * Additional weight that a capitalized function word adds to the upper
	 * case count, on top of the 1 that every capitalized token contributes.
	 */
	private static final int CAPITALIZED_FUNCTION_WORD_PENALTY = 2;
	
	/**
	 * Filter out result snippets that contain too many proper names. This is to
	 * get rid of enumerations of named entities that happen to include the
	 * target. This might, for instance, be the track list of a compilation LP,
	 * which has a song by the target artist on it.
	 * 
	 * <p>A result is kept only if the weighted number of capitalized tokens in
	 * its answer string is strictly smaller than the number of lower case and
	 * numeric tokens. Results whose score is
	 * <code>Float.NEGATIVE_INFINITY</code> are not inspected and are not part
	 * of the returned array.</p>
	 * 
	 * @param results array of <code>Result</code> objects
	 * @return array of the <code>Result</code> objects that passed the filter,
	 *         in their original order
	 */
	public Result[] apply(Result[] results) {
		// results that passed the filter
		ArrayList<Result> kept = new ArrayList<Result>();
		
		for (Result r : results) {
			// results with a score of negative infinity are dropped unseen
			if (r.getScore() == Float.NEGATIVE_INFINITY) continue;
			
			String[] tokens = NETagger.tokenize(r.getAnswer());
			if (isDominatedByLowerCase(tokens)) kept.add(r);
		}
		
		return kept.toArray(new Result[kept.size()]);
	}
	
	/**
	 * Compares the weighted number of capitalized tokens to the number of
	 * lower case and numeric tokens. Tokens that match none of the three
	 * patterns (e.g. punctuation) are ignored.
	 * 
	 * @param tokens tokenized answer string
	 * @return <code>true</code> iff the upper case count is strictly smaller
	 *         than the lower case count
	 */
	private static boolean isDominatedByLowerCase(String[] tokens) {
		int upperCase = 0;
		int lowerCase = 0;
		
		// The scan starts at the second token; presumably the first one is
		// skipped because a sentence-initial token is capitalized regardless
		// of whether it is a proper name.
		for (int i = 1; i < tokens.length; i++) {
			String term = tokens[i];
			if (UPPER_CASE_TOKEN.matcher(term).matches()) {
				upperCase++;
				// Lower-case with a fixed locale so that the function word
				// lookup does not depend on the default locale of the JVM
				// (e.g. "I" is not mapped to a dotless i under Turkish).
				if (FunctionWords.lookup(term.toLowerCase(Locale.ENGLISH)))
					upperCase += CAPITALIZED_FUNCTION_WORD_PENALTY;
			} else if (LOWER_CASE_TOKEN.matcher(term).matches()
					|| NUMBER_TOKEN.matcher(term).matches()) {
				// numbers count like lower case tokens
				lowerCase++;
			}
		}
		
		return upperCase < lowerCase;
	}
}