package info.ephyra.questionanalysis;
import info.ephyra.nlp.VerbFormConverter;
import info.ephyra.nlp.semantics.Predicate;
import info.ephyra.nlp.semantics.ontologies.Ontology;
import info.ephyra.nlp.semantics.ontologies.WordNet;
import info.ephyra.util.StringUtils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.didion.jwnl.data.POS;
/**
* Expands single- and multi-token terms by looking them up in one or more open-
* or specific-domain ontologies.
*
* @author Nico Schlaefer
* @version 2007-02-30
*/
public class TermExpander {
/**
 * Maximum number of expansions kept by the default cut-off
 * ({@link #cutOffExpansions(Map, boolean)}).
 */
public static final int MAX_EXPANSIONS = 100;
/**
 * Minimum weight of an expansion kept by the default cut-off
 * ({@link #cutOffExpansions(Map, boolean)}).
 */
public static final double MIN_EXPANSION_WEIGHT = 0;
/**
 * Maximum number of expansions used in a query
 * (applied by {@link #reduceExpansionsQuery(Map, boolean)}).
 */
public static final int MAX_EXPANSIONS_QUERY = 10;
/**
 * Minimum weight of an expansion used in a query
 * (applied by {@link #reduceExpansionsQuery(Map, boolean)}).
 */
public static final double MIN_EXPANSION_WEIGHT_QUERY = 0.9;
/**
 * Checks if the term is the target of one of the predicates.
 *
 * @param term a term
 * @param ps predicates in the same sentence
 * @return <code>true</code> iff the term is a predicate target
 */
private static boolean isTarget(Term term, Predicate[] ps) {
    String text = term.getText();
    String lemma = WordNet.getLemma(text, POS.VERB);
    // quote the term text: it may contain regex metacharacters (e.g. '.',
    // '(', '+') that would otherwise corrupt the pattern or throw a
    // PatternSyntaxException; compile once instead of per predicate
    Pattern textPattern =
            Pattern.compile("\\b" + Pattern.quote(text) + "\\b");
    Pattern lemmaPattern = (lemma != null)
            ? Pattern.compile("\\b" + Pattern.quote(lemma) + "\\b") : null;
    for (Predicate p : ps) {
        String verb = p.getVerb();
        if (textPattern.matcher(verb).find()) return true;
        // also try the infinitive in case the verb form was modified in the
        // term but not in the predicate (e.g. because the predicate was
        // built from an offset annotation)
        if (lemmaPattern != null && lemmaPattern.matcher(verb).find())
            return true;
    }
    return false;
}
/**
 * Drops expansions that do not contain any keywords and applies the given
 * thresholds for the maximum number of expansions and the minimum weight of
 * an expansion.
 *
 * @param expansions expansions along with their weights
 * @param maxExpansions maximum number of expansions
 * @param minExpansionWeight minimum weight of an expansion
 * @param strict iff <code>true</code>, the threshold for the maximum number
 *               of expansions is applied strictly even if that means that
 *               some of a number of expansion with equal weights have to be
 *               dropped randomly
 * @return minimum weight of an expansion in the reduced map
 */
public static double cutOffExpansions(Map<String, Double> expansions,
        int maxExpansions, double minExpansionWeight, boolean strict) {
    // phase 1: drop normalized duplicates and expansions without keywords
    HashSet<String> normalizedForms = new HashSet<String>();
    List<String> toRemove = new ArrayList<String>();
    for (String exp : expansions.keySet()) {
        // short-circuit: duplicates are dropped without a keyword check
        if (!normalizedForms.add(StringUtils.normalize(exp))
                || KeywordExtractor.getKeywords(exp).length == 0)
            toRemove.add(exp);
    }
    for (String exp : toRemove) expansions.remove(exp);
    if (expansions.isEmpty()) return 0;
    // phase 2: derive the weight hurdle that satisfies both thresholds;
    // sorted ascending, so the entry maxExpansions from the end is the
    // smallest weight that may survive
    Double[] sorted =
            expansions.values().toArray(new Double[expansions.size()]);
    Arrays.sort(sorted);
    int cutIndex = sorted.length - maxExpansions;
    if (cutIndex < 0) cutIndex = 0;
    double hurdle = Math.max(sorted[cutIndex], minExpansionWeight);
    // phase 3: drop every expansion strictly below the hurdle
    toRemove.clear();
    for (Map.Entry<String, Double> entry : expansions.entrySet())
        if (entry.getValue() < hurdle)
            toRemove.add(entry.getKey());
    for (String exp : toRemove) expansions.remove(exp);
    // phase 4 (strict only): ties at the hurdle may still leave more than
    // maxExpansions entries; trim the surplus in iteration order
    if (strict) {
        toRemove.clear();
        int surplus = expansions.size() - maxExpansions;
        for (Map.Entry<String, Double> entry : expansions.entrySet()) {
            if (surplus <= 0) break;
            if (entry.getValue() == hurdle) {
                toRemove.add(entry.getKey());
                surplus--;
            }
        }
        for (String exp : toRemove) expansions.remove(exp);
    }
    return hurdle;
}
/**
 * Drops expansions that do not contain any keywords and applies the default
 * thresholds <code>MAX_EXPANSIONS</code> and
 * <code>MIN_EXPANSION_WEIGHT</code>.
 *
 * @param expansions expansions along with their weights
 * @param strict iff <code>true</code>, the threshold for the maximum number
 *               of expansions is applied strictly even if that means that
 *               some of a number of expansion with equal weights have to be
 *               dropped randomly
 * @return minimum weight of an expansion in the reduced map
 */
public static double cutOffExpansions(Map<String, Double> expansions,
        boolean strict) {
    // delegate to the general version with the class-wide defaults
    return cutOffExpansions(
            expansions, MAX_EXPANSIONS, MIN_EXPANSION_WEIGHT, strict);
}
/**
 * Creates a new expansion map and applies the thresholds for the maximum
 * number of expansions (<code>MAX_EXPANSIONS_QUERY</code>) and the minimum
 * weight of an expansion (<code>MIN_EXPANSION_WEIGHT_QUERY</code>) for
 * queries. The input map is left untouched.
 *
 * @param expansions expansions along with their weights
 * @param strict iff <code>true</code>, the threshold for the maximum number
 *               of expansions is applied strictly even if that means that
 *               some of a number of expansion with equal weights have to be
 *               dropped randomly
 * @return new reduced expansion map for queries
 */
public static Map<String, Double> reduceExpansionsQuery(
        Map<String, Double> expansions, boolean strict) {
    // copy constructor instead of a manual put-loop; the cut-off then
    // operates on the copy only
    Map<String, Double> reduced = new HashMap<String, Double>(expansions);
    cutOffExpansions(reduced, MAX_EXPANSIONS_QUERY, MIN_EXPANSION_WEIGHT_QUERY,
            strict);
    return reduced;
}
/**
 * Expands a term by looking up related terms in ontologies. The term text
 * is lemmatized (if WordNet knows it), the lemma is expanded in each
 * ontology, the expansions are cut off at the default thresholds and, for
 * inflected verbs, the original verb form is restored on each expansion.
 * Results are stored in the term via <code>setExpansionLemmas()</code> and
 * <code>setExpansions()</code>.
 *
 * @param term a term
 * @param ps predicates in the same sentence
 * @param ontologies ontologies used to expand the term
 */
public static void expandTerm(Term term, Predicate[] ps,
        Ontology[] ontologies) {
    String text = term.getText();
    String pos = term.getPos();
    Map<String, Double> lemmas = new Hashtable<String, Double>();
    Map<String, Double> expansions = new Hashtable<String, Double>();
    // expand events, entities and modifiers
    if (isTarget(term, ps) || pos.startsWith("VB")) {
        // lemmatize verbs that are in WordNet
        String lemma = WordNet.getLemma(text, POS.VERB);
        if (lemma == null) lemma = text;
        // set lemma if the POS was misleading
        if (!pos.startsWith("VB")) term.setLemma(lemma);
        // expand event
        for (Ontology ontology : ontologies)
            lemmas.putAll(ontology.expandEvent(lemma));
        // ensure that there are at most MAX_EXPANSIONS expansions with
        // weights of at least MIN_EXPANSION_WEIGHT
        cutOffExpansions(lemmas, true);
        // restore the verb form of the original term on each expansion;
        // a null result means the tag needs no restoration and the
        // expansions map deliberately stays empty (see bottom of method)
        for (String exp : lemmas.keySet()) {
            double weight = lemmas.get(exp);
            String[] forms = restoreVerbForm(pos, exp);
            if (forms != null)
                for (String form : forms) expansions.put(form, weight);
        }
    } else if (pos.startsWith("JJ") || pos.startsWith("RB")) {
        // get modifier type
        POS modType = (pos.startsWith("JJ")) ? POS.ADJECTIVE : POS.ADVERB;
        // lemmatize adjectives and adverbs that are in WordNet
        String lemma = WordNet.getLemma(text, modType);
        if (lemma == null) lemma = text;
        // expand modifier
        for (Ontology ontology : ontologies)
            lemmas.putAll(ontology.expandModifier(lemma, modType));
        // ensure that there are at most MAX_EXPANSIONS expansions with
        // weights of at least MIN_EXPANSION_WEIGHT
        cutOffExpansions(lemmas, true);
    } else {
        // lemmatize nouns that are in WordNet
        String lemma;
        if (pos.startsWith("COMPOUND"))
            lemma = WordNet.getCompoundLemma(text, POS.NOUN); // compound
        else
            lemma = WordNet.getLemma(text, POS.NOUN); // single token
        if (lemma == null) lemma = text;
        // expand entity
        for (Ontology ontology : ontologies)
            lemmas.putAll(ontology.expandEntity(lemma));
        // ensure that there are at most MAX_EXPANSIONS expansions with
        // weights of at least MIN_EXPANSION_WEIGHT
        cutOffExpansions(lemmas, true);
        // TODO restore plural forms if possible
    }
    term.setExpansionLemmas(lemmas);
    // if no verb forms were restored, the lemmas double as the expansions
    term.setExpansions((expansions.size() > 0) ? expansions : lemmas);
}

/**
 * Converts an infinitive into the verb form indicated by a POS tag.
 * Extracted to avoid duplicating the restoration loop once per tag in
 * <code>expandTerm()</code>.
 *
 * @param pos Penn Treebank POS tag of the original verb
 * @param infinitive infinitive to convert
 * @return converted verb forms, or <code>null</code> if the tag requires
 *         no conversion (e.g. base form <code>VB</code> or a non-verb tag)
 */
private static String[] restoreVerbForm(String pos, String infinitive) {
    if (pos.equals("VBZ"))  // third person singular
        return new String[]
                {VerbFormConverter.infinitiveToThirdPersonS(infinitive)};
    if (pos.equals("VBG"))  // gerund
        return VerbFormConverter.infinitiveToGerund(infinitive);
    if (pos.equals("VBD"))  // simple past
        return VerbFormConverter.infinitiveToSimplePast(infinitive);
    if (pos.equals("VBN"))  // past participle
        return VerbFormConverter.infinitiveToPastParticiple(infinitive);
    return null;
}
/**
 * Expands all terms by looking up related terms in ontologies.
 *
 * @param terms the terms
 * @param ps predicates in the same sentence
 * @param ontologies ontologies used to expand the term
 */
public static void expandTerms(Term[] terms, Predicate[] ps,
        Ontology[] ontologies) {
    // expand each term in place, one at a time
    for (int i = 0; i < terms.length; i++)
        expandTerm(terms[i], ps, ontologies);
}
/**
 * Expands a phrase by replacing the terms that occur within the phrase by
 * their expansions. All possible combinations of expansions of the
 * individual terms are formed and each is assigned the product of the
 * weights of the expansions as a combined weight. The (at most)
 * <code>MAX_EXPANSIONS</code> resulting phrases with the highest weights
 * are returned.
 *
 * @param phrase phrase to expand
 * @param terms expanded terms that potentially occur within the phrase
 * @return expansions and their weights
 */
public static Map<String, Double> expandPhrase(String phrase,
        Term[] terms) {
    // regular expressions that match the terms; the term text is quoted so
    // that regex metacharacters in it are matched literally instead of
    // corrupting the pattern
    List<String> patterns = new ArrayList<String>();
    // maps the patterns (doubling as keys) to the terms' expansions
    Map<String, Map<String, Double>> expansionsMap =
            new Hashtable<String, Map<String, Double>>();
    for (Term term : terms) {
        Map<String, Double> expansions = term.getExpansions();
        if (expansions.size() > 0) {
            String pattern = "\\b" + Pattern.quote(term.getText()) + "\\b";
            patterns.add(pattern);
            expansionsMap.put(pattern, expansions);
        }
    }
    Map<String, Double> phraseExps = new Hashtable<String, Double>();
    phraseExps.put(phrase, 1d);
    // obtain phrase expansions by combining term expansions; a pattern is
    // retried on the grown phrase set until some phrase no longer contains
    // the term, which handles multiple occurrences of a term
    while (patterns.size() > 0) {
        String[] phrases =
                phraseExps.keySet().toArray(new String[phraseExps.size()]);
        String pattern = patterns.get(0);
        // compile once per pass instead of once per phrase
        Pattern compiled = Pattern.compile(pattern);
        Map<String, Double> expansions = expansionsMap.get(pattern);
        for (String phraseExp : phrases) {
            if (compiled.matcher(phraseExp).find()) {
                for (String expansion : expansions.keySet()) {
                    // quote the replacement so '$' and '\' in an expansion
                    // are inserted literally instead of being interpreted
                    // as group references
                    String expanded = phraseExp.replaceFirst(pattern,
                            Matcher.quoteReplacement(expansion));
                    Double weight = phraseExps.get(phraseExp) *
                            expansions.get(expansion);
                    phraseExps.put(expanded, weight);
                }
            } else {
                // no (further) occurrences of the term
                // NOTE(review): the pattern is dropped on the FIRST
                // non-matching phrase, so later phrases in this pass that
                // still match are not expanded for it; kept as-is since the
                // loop's termination depends on this — confirm intent
                patterns.remove(0);
                break;
            }
        }
    }
    // ensure that there are at most MAX_EXPANSIONS phrases with weights of
    // at least MIN_EXPANSION_WEIGHT; the original phrase itself is not an
    // expansion
    phraseExps.remove(phrase);
    cutOffExpansions(phraseExps, true);
    return phraseExps;
}
}