package info.ephyra.questionanalysis;
import info.ephyra.nlp.indices.FunctionWords;
import info.ephyra.nlp.semantics.ontologies.WordNet;
import info.ephyra.util.StringUtils;
import java.io.Serializable;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Map;
import java.util.Set;
import net.didion.jwnl.data.POS;
/**
* <p>A <code>Term</code> comprises one or more tokens of text that form a unit
* of meaning. It can be an individual word, a compound noun or a named entity.
* </p>
*
* <p>This class implements the interface <code>Serializable</code>.</p>
*
* @author Nico Schlaefer
* @version 2008-01-23
*/
public class Term implements Serializable {
/** Version number used during deserialization. */
private static final long serialVersionUID = 20070501;
/** Part of speech tag for terms that comprise multiple tokens */
public static final String COMPOUND = "COMPOUND";
/** The textual representation of the term. */
private String text;
/** The lemma of the term. */
private String lemma;
/**
* The part of speech of the term or <code>COMPOUND</code> to indicate that
* it comprises multiple tokens.
*/
private String pos;
/** The named entity types of the term (optional). */
private String[] neTypes = new String[0];
/** Relative frequency of the term. */
private double relFrequency;
/** Maps expansions of the term to their weights. */
private Map<String, Double> expansions;
/** Maps lemmas of the expansions to their weights. */
private Map<String, Double> expansionLemmas;
// Getters/Setters
public String getText() {return text;}
public String getLemma() {return lemma;}
public String getPos() {return pos;}
public String[] getNeTypes() {return neTypes;}
public void setNeTypes(String[] neTypes) {this.neTypes = neTypes;}
public double getRelFrequency() {return relFrequency;}
public void setRelFrequency(double relFrequency) {
this.relFrequency = relFrequency;}
public Map<String, Double> getExpansions() {return expansions;}
public void setExpansions(Map<String, Double> expansions) {
this.expansions = expansions;}
/**
* Constructs a term from the provided information.
*
* @param text textual representation
* @param pos part of speech
*/
public Term(String text, String pos) {
this.text = text;
this.pos = pos;
// derive the lemma
generateLemma();
}
/**
* Constructs a term from the provided information.
*
* @param text textual representation
* @param pos part of speech
* @param neTypes named entity types
*/
public Term(String text, String pos, String[] neTypes) {
this(text, pos);
this.neTypes = neTypes;
}
/**
* Generates the lemma of the term.
*/
private void generateLemma() {
String lemma;
if (pos.startsWith("VB")) {
// lemmatize verbs that are in WordNet
lemma = WordNet.getLemma(text, POS.VERB);
} else if (pos.startsWith("JJ")) {
// lemmatize adjectives that are in WordNet
lemma = WordNet.getLemma(text, POS.ADJECTIVE);
} else if (pos.startsWith("RB")) {
// lemmatize adverbs that are in WordNet
lemma = WordNet.getLemma(text, POS.ADVERB);
} else {
// lemmatize nouns that are in WordNet
if (pos.startsWith("COMPOUND"))
lemma = WordNet.getCompoundLemma(text, POS.NOUN); // compound
else
lemma = WordNet.getLemma(text, POS.NOUN); // single token
}
if (lemma == null) lemma = text;
setLemma(lemma);
}
/**
* Normalizes and sets the lemma of the term.
*
* @param lemma the lemma of the term
*/
public void setLemma(String lemma) {
this.lemma = StringUtils.normalize(lemma);
}
/**
* Normalizes and sets the lemmas of the expansions.
*
* @param expansionLemmas the lemmas of the expansions
*/
public void setExpansionLemmas(Map<String, Double> expansionLemmas) {
Map<String, Double> normalized = new Hashtable<String, Double>();
for (String lemma : expansionLemmas.keySet()) {
double weight = expansionLemmas.get(lemma);
String norm = StringUtils.normalize(lemma);
normalized.put(norm, weight);
}
this.expansionLemmas = normalized;
}
/**
* Gets the weight of the term or expansion with the given lemma.
*
* @param lemma the lemma
* @return the weight or <code>0</code> if there is no match
*/
public double getWeight(String lemma) {
if (lemma.equals(this.lemma)) return 1;
if (expansionLemmas == null) return 0;
Double weight = expansionLemmas.get(lemma);
return (weight != null) ? weight : 0;
}
/**
* Calculates similarity scores for the given lemma and the lemmas of the
* term and its expansions based on their weights and the number of common
* tokens. Gets the maximum of all these scores.
*
* @param lemma lemma to compare with
* @return similarity score
*/
public double simScore(String lemma) {
// tokenize lemma,
// eliminate duplicates, function words and tokens of length < 2
String[] tokens = lemma.split(" ");
Set<String> lookupSet = new HashSet<String>();
for (String token : tokens)
if (token.length() > 1 && !FunctionWords.lookup(token))
lookupSet.add(token);
if (lookupSet.size() == 0) return 0;
// calculate similarity score for the term
// (Jaccard coefficient)
tokens = this.lemma.split(" ");
Set<String> tokenSet = new HashSet<String>();
for (String token : tokens)
if (token.length() > 1 && !FunctionWords.lookup(token))
tokenSet.add(token);
double intersect = 0;
int union = lookupSet.size();
for (String token : tokenSet)
if (lookupSet.contains(token)) intersect++; else union++;
double simScore = intersect / union;
// calculate similarity scores for the expansions
// (Jaccard coefficient)
for (String expansionLemma : expansionLemmas.keySet()) {
tokens = expansionLemma.split(" ");
tokenSet.clear();
for (String token : tokens)
if (token.length() > 1 && !FunctionWords.lookup(token))
tokenSet.add(token);
double weight = expansionLemmas.get(expansionLemma);
intersect = 0;
union = lookupSet.size();
for (String token : tokenSet)
if (lookupSet.contains(token)) intersect++; else union++;
simScore = Math.max(simScore, weight * (intersect / union));
}
return simScore;
}
/**
* Creates a string representation of the term.
*
* @return string representation
*/
public String toString() {
String s = "{\"" + text + "\"; POS: " + pos;
if (neTypes.length > 0) {
s += "; NE types: " + neTypes[0];
for (int i = 1; i < neTypes.length; i++)
s += ", " + neTypes[i];
}
if (expansions != null && expansions.size() > 0) {
String[] texts =
expansions.keySet().toArray(new String[expansions.size()]);
s += "; Expansions: {" + texts[0] + "=" + expansions.get(texts[0]);
for (int i = 1; i < expansions.size(); i++)
s += ", " + texts[i] + "=" + expansions.get(texts[i]);
s += "}";
}
s += "}";
return s;
}
}