package info.ephyra.trec;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
/**
* A preprocessor for TREC target descriptions.
*
* @author Nico Schlaefer
* @version 2007-07-23
*/
public class TargetPreprocessor {
    /** Target type label for persons. */
    private static final String PERSON = "PERSON";
    /**
     * Target type label for organizations. The British spelling is the
     * runtime value (presumably matching <code>TRECTarget.TARGET_TYPES</code>
     * - verify before changing it).
     */
    private static final String ORGANISATION = "ORGANISATION";
    /** Target type label for events. */
    private static final String EVENT = "EVENT";
    /** Target type label for things. */
    private static final String THING = "THING";

    /**
     * Matches a parenthesised string together with the whitespace in front
     * of it, e.g. the " (IMF)" in "International Monetary Fund (IMF)".
     */
    private static final Pattern PARENTHESIZED =
            Pattern.compile("\\s*\\([^\\)]*\\)");

    /**
     * Matches descriptions that contain two or more directly adjacent
     * capitalized chunks (an upper case letter followed by lower case
     * letters, digits, '.' or '&amp;'), e.g. "IBM" or "AT&amp;T". Note that
     * this also fires on camel-case words.
     */
    private static final Pattern ACRONYM =
            Pattern.compile(".*([A-Z][a-z0-9\\.&]*){2,}.*");

    /** Matches descriptions that contain a standalone four-digit number. */
    private static final Pattern YEAR = Pattern.compile(".*\\b\\d{4}\\b.*");

    /** Matches descriptions that consist of one lower case word only. */
    private static final Pattern SINGLE_LOWER_CASE_WORD =
            Pattern.compile("[a-z]+");

    /**
     * Matches descriptions that contain a personal title. Spelled-out titles
     * must end at a word boundary. Dotted abbreviations end with the dot: a
     * word boundary after '.' would only exist if a word character followed
     * immediately, so "Dr. Smith" would never match.
     */
    // TODO include titles and name lists from GATE
    private static final Pattern PERSON_TITLE = Pattern.compile("(?i).*\\b(?:" +
            "(?:Doctor|Junior|Miss|Misses|Mister|Professor|Sir)\\b" +
            "|(?:Dr|Jr|Ms|Mrs|Mr|Prof|Sr)\\.).*");

    /**
     * Matches descriptions that contain a keyword typical for organizations,
     * optionally in plural form. The dotted abbreviations "Corp." and "Inc."
     * are matched without a trailing word boundary (see
     * {@link #PERSON_TITLE}).
     */
    private static final Pattern ORGANISATION_KEYWORD =
            Pattern.compile("(?i).*\\b(?:" +
            "(?:administration|agenc(?:y|ies)|association|authorit(?:y|ies)|bank" +
            "|board|brotherhood|bureau|center|centre|church|clinic|club" +
            "|college|commission|committee|communit(?:y|ies)" +
            "|corporation|council|department|directorate|division|federation" +
            "|foundation|fund|group|guild|hospital|hotel|incorporated" +
            "|institute|lab|laborator(?:y|ies)|ministr(?:y|ies)|museum|office" +
            "|school|societ(?:y|ies)|squadron|syndicate|universit(?:y|ies)|union)" +
            "(?:e?s)?\\b" +
            "|(?:corp|inc)\\.).*");

    /**
     * Matches descriptions that contain a keyword typical for events,
     * optionally in plural form.
     */
    // TODO think of more, go through TREC targets
    private static final Pattern EVENT_KEYWORD = Pattern.compile("(?i).*\\b" +
            "(?:championship|conference|cup|disaster|show|tournament" +
            "|traged(?:y|ies)|workshop)" +
            "(?:e?s)?\\b.*");

    /**
     * Creates the condensed target that is appended to queries for factoid
     * and list questions by dropping every parenthesised string from the
     * target description.
     * <p>
     * The target description itself and the noun phrase flag are not
     * modified. An earlier WordNet-based implementation that converted verbs
     * to simple past (normalized target), dropped verbs (condensed target)
     * and flagged targets containing verbs as non-noun-phrases is disabled.
     * <p>
     * A target without a description is left untouched.
     *
     * @param target a TREC target
     */
    private static void normalize(TRECTarget target) {
        String targetDesc = target.getTargetDesc();
        if (targetDesc == null) {
            return;
        }

        // drops any string in parenthesis (condensed target)
        String condensed = PARENTHESIZED.matcher(targetDesc).replaceAll("");

        // update target datastructure
        target.setCondensedTarget(condensed);
    }

    /**
     * Determines possible types for the target.
     * <p>
     * Starts with all types in <code>TRECTarget.TARGET_TYPES</code> and
     * removes those that are ruled out by surface cues in the target
     * description. If the rules contradict each other and no type is left,
     * the types of the target are not updated. A target without a
     * description is left untouched.
     *
     * @param target a TREC target
     */
    private static void determineTypes(TRECTarget target) {
        String targetDesc = target.getTargetDesc();
        if (targetDesc == null) {
            return;
        }

        // possible target types
        List<String> types = new ArrayList<String>();
        for (String type : TRECTarget.TARGET_TYPES) {
            types.add(type);
        }

        // not a noun phrase -> EVENT
        if (!target.isNounPhrase()) {
            exclude(types, PERSON, ORGANISATION, THING);
        }

        // disabled rule:
        // some upper case words and finally a lower case word -> NO PERSON
        // if (targetDesc.matches("([A-Z]([a-z])+\\b){2}([a-z])+"))
        //     types.remove("PERSON");

        // acronym -> ORGANISATION, EVENT or THING
        if (matches(ACRONYM, targetDesc)) {
            exclude(types, PERSON);
        }

        // year date -> EVENT
        if (matches(YEAR, targetDesc)) {
            exclude(types, PERSON, ORGANISATION, THING);
        }

        // one lower case word -> THING
        if (matches(SINGLE_LOWER_CASE_WORD, targetDesc)) {
            exclude(types, PERSON, ORGANISATION, EVENT);
        }

        // Sir, Prof., ... -> PERSON
        if (matches(PERSON_TITLE, targetDesc)) {
            exclude(types, ORGANISATION, EVENT, THING);
        }

        // Corporation, Inc., ... -> ORGANISATION
        if (matches(ORGANISATION_KEYWORD, targetDesc)) {
            exclude(types, PERSON, EVENT, THING);
        }

        // Show, Conference, ... -> EVENT
        if (matches(EVENT_KEYWORD, targetDesc)) {
            exclude(types, PERSON, ORGANISATION, THING);
        }

        // update target datastructure (only if at least one type survived)
        if (!types.isEmpty()) {
            target.setTargetTypes(types.toArray(new String[types.size()]));
        }
    }

    /**
     * Checks whether the whole text matches the pattern (same semantics as
     * <code>String.matches()</code>, but with a precompiled pattern).
     *
     * @param pattern a precompiled pattern
     * @param text the text to test
     * @return <code>true</code> iff the entire text matches
     */
    private static boolean matches(Pattern pattern, String text) {
        return pattern.matcher(text).matches();
    }

    /**
     * Removes the given types from the list of candidate types. Types that
     * are not in the list are ignored.
     *
     * @param types candidate types, modified in place
     * @param excluded types to remove
     */
    private static void exclude(List<String> types, String... excluded) {
        for (String type : excluded) {
            types.remove(type);
        }
    }

    /**
     * Preprocesses the target: first derives the condensed target, then
     * determines the possible target types.
     *
     * @param target a TREC target
     */
    public static void preprocess(TRECTarget target) {
        normalize(target);
        determineTypes(target);
    }
}