package info.ephyra.answerselection.filters;
import info.ephyra.nlp.NETagger;
import info.ephyra.nlp.indices.Prepositions;
import info.ephyra.search.Result;
import info.ephyra.util.RegexConverter;
import info.ephyra.util.StringUtils;

import java.util.ArrayList;
import java.util.Hashtable;
import java.util.regex.Pattern;
/**
* <p>A filter that truncates the answer strings. It drops the following
* prefixes and suffixes:
* <ul>
* <li>blanks and some special characters</li>
* <li>articles</li>
* <li>"and", "or"</li>
* <li>prepositions</li>
* </ul>
* After truncation, similar answers are merged.</p>
*
* <p>This filter is not applied to answer strings that have been extracted with
* a rule- or list-based NE tagger since these answers are assumed to be
* properly formatted.</p>
*
* <p>This class extends the class <code>Filter</code>.</p>
*
* @author Nico Schlaefer
* @version 2007-05-28
*/
public class TruncationFilter extends Filter {
    /**
     * Special characters that are truncated from answer strings. The following
     * characters are excluded because they may be the first/last character of
     * an answer:
     * <ul>
     * <li>$������%~.</li>
     * <li>��������</li>
     * <li>��������������������������������������������������������������</li>
     * </ul>
     *
     * NOTE(review): several characters in this comment and in the literal
     * below show up as U+FFFD in this copy of the file, which suggests a
     * charset mismatch. Confirm against a correctly encoded copy before
     * editing the literal; it is reproduced here unchanged.
     */
    private static final String SPECIAL_CHARS =
        RegexConverter.strToRegex("-+�*��=_�|�\\/�:,;�?�!��\"���'�`" +
                                  "()[]{}<>#&�@���");

    /** Articles that are truncated from answer strings. */
    private static final String ARTICLES = "(an?|that|the|these|this|those)";

    /*
     * The regular expressions below are constant, so they are compiled once
     * here instead of on every iteration of the loop in truncate(String).
     * They must be declared after SPECIAL_CHARS and ARTICLES because static
     * initializers run in textual order.
     */

    /** A single leading blank or special character. */
    private static final Pattern LEADING_SPECIAL =
        Pattern.compile("^[\\s" + SPECIAL_CHARS + "]");

    /** A single trailing blank or special character. */
    private static final Pattern TRAILING_SPECIAL =
        Pattern.compile("[\\s" + SPECIAL_CHARS + "]$");

    /** A leading '.'. */
    private static final Pattern LEADING_DOT = Pattern.compile("^\\.");

    /**
     * Matches a whole phrase that ends in a '.' which is not preceded by an
     * upper-case letter A-Z (an upper-case letter indicates an acronym).
     */
    private static final Pattern ENDS_WITH_NON_ACRONYM_DOT =
        Pattern.compile(".*?(^|[^A-Z])\\.$");

    /** A trailing '.'. */
    private static final Pattern TRAILING_DOT = Pattern.compile("\\.$");

    /** A leading article followed by a blank (case-insensitive). */
    private static final Pattern LEADING_ARTICLE =
        Pattern.compile("(?i)^" + ARTICLES + " ");

    /** A trailing article preceded by a blank (case-insensitive). */
    private static final Pattern TRAILING_ARTICLE =
        Pattern.compile("(?i) " + ARTICLES + "$");

    /** A leading "and"/"or" followed by a blank (case-insensitive). */
    private static final Pattern LEADING_CONJUNCTION =
        Pattern.compile("(?i)^(and|or) ");

    /** A trailing "and"/"or" preceded by a blank (case-insensitive). */
    private static final Pattern TRAILING_CONJUNCTION =
        Pattern.compile("(?i) (and|or)$");

    /** The first blank-delimited token including the blank that follows it. */
    private static final Pattern FIRST_TOKEN =
        Pattern.compile("^[^ ]++($| )");

    /** The last blank-delimited token including the blank that precedes it. */
    private static final Pattern LAST_TOKEN =
        Pattern.compile("(^| )[^ ]++$");

    /**
     * Removes the first match of a pattern from a string.
     *
     * @param pattern pattern to look for
     * @param s string to remove the match from
     * @return <code>s</code> without the first match, or <code>s</code>
     *         unchanged if there is no match
     */
    private static String dropFirst(Pattern pattern, String s) {
        return pattern.matcher(s).replaceFirst("");
    }

    /**
     * Truncates a phrase. The truncation steps are repeated until a pass no
     * longer changes the phrase, so nested prefixes/suffixes (e.g. a blank
     * followed by an article followed by a preposition) are all removed. The
     * result may be the empty string.
     *
     * @param phrase phrase to truncate, must not be <code>null</code>
     * @return truncated phrase
     */
    public static String truncate(String phrase) {
        String old = "";
        while (!old.equals(phrase)) {
            old = phrase;

            // drop leading and trailing blanks and some special characters
            // (one character per pass, the loop takes care of the rest)
            phrase = dropFirst(LEADING_SPECIAL, phrase);
            phrase = dropFirst(TRAILING_SPECIAL, phrase);

            // drop leading '.' and trailing '.' if not preceded by an
            // upper-case character (which indicates an acronym)
            phrase = dropFirst(LEADING_DOT, phrase);
            if (ENDS_WITH_NON_ACRONYM_DOT.matcher(phrase).matches()) {
                phrase = dropFirst(TRAILING_DOT, phrase);
            }

            // drop leading and trailing articles
            phrase = dropFirst(LEADING_ARTICLE, phrase);
            phrase = dropFirst(TRAILING_ARTICLE, phrase);

            // drop leading and trailing "and", "or"
            phrase = dropFirst(LEADING_CONJUNCTION, phrase);
            phrase = dropFirst(TRAILING_CONJUNCTION, phrase);

            // drop leading and trailing prepositions; the tokens are taken
            // from the phrase as it is before either preposition is dropped
            String[] tokens = phrase.split(" ", -1);
            if (Prepositions.lookup(tokens[0])) {
                phrase = dropFirst(FIRST_TOKEN, phrase);
            }
            if (Prepositions.lookup(tokens[tokens.length - 1])) {
                phrase = dropFirst(LAST_TOKEN, phrase);
            }
        }

        return phrase;
    }

    /**
     * Filters a single <code>Result</code> object by truncating its answer
     * string in place. Named entities are left untouched unless
     * <code>NETagger.allModelType()</code> holds for their NE types, i.e.
     * answers extracted with a rule- or list-based tagger are not truncated.
     *
     * @param result result to filter
     * @return the given result; this implementation never returns
     *         <code>null</code>
     */
    public Result apply(Result result) {
        // do not apply the filter if the answer string is a NE that was
        // extracted with a rule- or list-based tagger
        if (result.isNamedEntity() &&
                !NETagger.allModelType(result.getNeTypes())) {
            return result;
        }

        result.setAnswer(truncate(result.getAnswer()));
        return result;
    }

    /**
     * Filters an array of <code>Result</code> objects. The results are sorted
     * with a <code>ScoreSorterFilter</code>, then each eligible result is
     * truncated and merged with an earlier result from the same extractor
     * that has the same normalized answer: the earlier result is kept and the
     * score of the later one is added to it.
     *
     * <p>Results with a score that is not positive or that is
     * <code>Float.POSITIVE_INFINITY</code>, and results that do not come from
     * exactly one extraction technique, are passed through unchanged.</p>
     *
     * @param results results to filter
     * @return filtered results
     */
    public Result[] apply(Result[] results) {
        // sort results by their scores in descending order
        results = (new ScoreSorterFilter()).apply(results);

        // all results that pass the filter
        ArrayList<Result> filtered = new ArrayList<Result>(results.length);
        // for each extractor, truncated answers and corresponding results
        Hashtable<String, Hashtable<String, Result>> truncated =
            new Hashtable<String, Hashtable<String, Result>>();

        for (Result result : results) {
            // only truncate factoid answers
            if (result.getScore() <= 0 ||
                    result.getScore() == Float.POSITIVE_INFINITY) {
                filtered.add(result);
                continue;
            }

            // make sure that answers come from a single extractor
            String[] extractors = result.getExtractionTechniques();
            if (extractors == null || extractors.length != 1) {
                filtered.add(result);
                continue;
            }
            String extractor = extractors[0];

            // truncate result; apply(Result) is documented as possibly
            // returning null (an overriding subclass may do so), in which
            // case the result is dropped instead of being dereferenced
            Result truncatedResult = apply(result);
            if (truncatedResult == null) {
                continue;
            }

            // merge with similar results from same extractor
            Hashtable<String, Result> truncatedT = truncated.get(extractor);
            if (truncatedT == null) {
                truncatedT = new Hashtable<String, Result>();
                truncated.put(extractor, truncatedT);
            }
            String norm = StringUtils.normalize(truncatedResult.getAnswer());
            Result similar = truncatedT.get(norm);
            if (similar == null) {
                filtered.add(truncatedResult);
                truncatedT.put(norm, truncatedResult);
            } else {
                similar.incScore(truncatedResult.getScore());
            }
        }

        return filtered.toArray(new Result[filtered.size()]);
    }
}