package info.ephyra.answerselection.filters;

import info.ephyra.nlp.NETagger;
import info.ephyra.nlp.SnowballStemmer;
import info.ephyra.nlp.indices.FunctionWords;
import info.ephyra.search.Result;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.regex.Pattern;

/**
 * <p>A variant of the <code>DuplicateFilter</code> for answer candidates that
 * are text snippets.</p>
 *
 * <p>Two snippets are treated as duplicates if, after normalization, they
 * yield the same set of keywords (see {@link #apply(Result[])}).</p>
 *
 * <p>This class extends the class <code>Filter</code>.</p>
 *
 * @author Guido Sautter
 * @version 2008-02-15
 */
public class DuplicateSnippetFilter extends Filter {
    /**
     * Characters that are stripped from a snippet before it is stemmed:
     * single quote, double quote, backtick and underscore. Compiled once
     * instead of once per result.
     */
    private static final Pattern STRIPPED_CHARS =
            Pattern.compile("(\\'|\\\"|\\`|\\_)");

    /**
     * Filters duplicate snippets. The results are first passed through
     * <code>ScoreSorterFilter</code>; they are then scanned in that order and
     * a result is kept only if no earlier result produced the same keyword
     * signature. Assuming the sorter orders by descending score, the
     * best-scored result of each group of duplicates is the one retained.
     *
     * <p>Dropped results are simply discarded: the scores of the remaining
     * results are <em>not</em> modified. Results whose answer is
     * <code>null</code> are dropped as well.</p>
     *
     * @param results array of <code>Result</code> objects
     * @return array of <code>Result</code> objects without duplicates
     */
    public Result[] apply(Result[] results) {
        // the scan order decides which member of a duplicate group survives
        results = (new ScoreSorterFilter()).apply(results);

        ArrayList<Result> rawResults = new ArrayList<Result>();
        HashSet<String> contained = new HashSet<String>();

        for (Result res : results) {
            String text = res.getAnswer();
            if (text == null) {
                continue;
            }

            // Set.add returns false if the signature has been seen before
            if (contained.add(keywordSignature(text))) {
                rawResults.add(res);
            }
        }

        return rawResults.toArray(new Result[rawResults.size()]);
    }

    /**
     * Reduces a snippet to a canonical string of its keywords: the text is
     * lower-cased, trimmed, stripped of quote-like characters, stemmed and
     * tokenized; tokens of length 1 or less and tokens for which
     * <code>FunctionWords.lookup</code> returns <code>true</code> are
     * discarded; the distinct remaining tokens are sorted and joined with
     * single blanks.
     *
     * @param text snippet text, must not be <code>null</code>
     * @return sorted, blank-separated keyword string (possibly empty)
     */
    private static String keywordSignature(String text) {
        // explicit locale: the default-locale variant maps 'I' to a dotless
        // 'i' under e.g. a Turkish locale, which would make the signature
        // depend on the machine the filter runs on
        String normalized = text.toLowerCase(Locale.ENGLISH).trim();
        normalized = STRIPPED_CHARS.matcher(normalized).replaceAll("");
        normalized = SnowballStemmer.stemAllTokens(normalized);

        // collect distinct keywords
        HashSet<String> keywords = new HashSet<String>();
        for (String term : NETagger.tokenize(normalized)) {
            if ((term.length() > 1) && !FunctionWords.lookup(term)) {
                keywords.add(term);
            }
        }

        // sort so that the signature is independent of the word order
        ArrayList<String> sortedKeywords = new ArrayList<String>(keywords);
        Collections.sort(sortedKeywords);

        StringBuilder signature = new StringBuilder();
        for (String term : sortedKeywords) {
            signature.append(' ').append(term);
        }

        // trim removes the leading blank introduced by the loop above
        return signature.toString().trim();
    }
}