package info.ephyra.questionanalysis;
import info.ephyra.answerselection.filters.TruncationFilter;
import info.ephyra.nlp.NETagger;
import info.ephyra.nlp.OpenNLP;
import info.ephyra.util.Dictionary;
import info.ephyra.util.StringUtils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
* Extracts single- and multi-token terms from a sentence. Multi-token terms are
* named entities or compound terms found in dictionaries.
*
* @author Nico Schlaefer
* @version 2007-05-28
*/
public class TermExtractor {
/** Maximum length of a term in tokens. */
private static final int MAX_TERM_LENGTH = 4;
/**
* Checks if the given term is among the named entities and returns the
* types of the entities that match it.
*
* @param term a term, potentially a named entity
* @param nes named entities
* @return types of matching entities
*/
private static String[] getNeTypes(String term, String[][] nes) {
List<String> neTypes = new ArrayList<String>();
Set<String> neTypesSet = new HashSet<String>();
for (int neId = 0; neId < nes.length; neId++)
for (String ne : nes[neId])
if (term.equals(ne)) {
String neType = NETagger.getNeType(neId);
if (neTypesSet.add(neType))
// there may be multiple taggers (IDs) for one type
neTypes.add(neType);
break;
}
return neTypes.toArray(new String[neTypes.size()]);
}
/**
* Extracts named entities from the given sentence.
*
* @param sentence sentence to analyze
* @return named entities in the sentence
*/
public static String[][] getNes(String sentence) {
String[] tokens = NETagger.tokenize(sentence);
String[][] nes = NETagger.extractNes(new String[][] {tokens})[0];
// untokenize named entities
for (int i = 0; i < nes.length; i++)
for (int j = 0; j < nes[i].length; j++)
nes[i][j] = OpenNLP.untokenize(nes[i][j], sentence);
return nes;
}
/**
* Extracts named entities from the given sentence and context string.
*
* @param sentence sentence to analyze
* @param context context string
* @return named entities in the sentence and context string
*/
public static String[][] getNes(String sentence, String context) {
// extract NEs from sentence
String[][] sentenceNes = getNes(sentence);
if (context == null || context.length() == 0) return sentenceNes;
// extract NEs from context string
String[][] contextNes = getNes(context);
// merge NEs
String[][] nes = new String[sentenceNes.length][];
for (int i = 0; i < nes.length; i++) {
if (sentenceNes[i].length == 0) nes[i] = contextNes[i];
else if (contextNes[i].length == 0) nes[i] = sentenceNes[i];
else {
ArrayList<String> nesL = new ArrayList<String>();
for (String ne : sentenceNes[i]) nesL.add(ne);
for (String ne : contextNes[i]) nesL.add(ne);
nes[i] = nesL.toArray(new String[nesL.size()]);
}
}
return nes;
}
/**
* Extracts terms from the given sentence.
*
* @param sentence sentence to analyze
* @param dicts dictionaries with compound terms
* @return terms in the sentence
*/
public static Term[] getTerms(String sentence, Dictionary[] dicts) {
String[][] nes = getNes(sentence);
return getTerms(sentence, nes, dicts);
}
/**
* Extracts terms from the given sentence, reusing named entities that have
* been extracted before.
*
* @param sentence sentence to analyze
* @param nes named entities in the sentence
* @param dicts dictionaries with compound terms
* @return terms in the sentence
*/
public static Term[] getTerms(String sentence, String[][] nes,
Dictionary[] dicts) {
// extract tokens
String[] tokens = OpenNLP.tokenize(sentence);
// tag part of speech
String[] pos = OpenNLP.tagPos(tokens);
// tag phrase chunks
String[] chunks = OpenNLP.tagChunks(tokens, pos);
// mark tokens as not yet assigned to a term
boolean[] assigned = new boolean[tokens.length];
Arrays.fill(assigned, false);
// for each token a term that starts at that token or 'null'
Term[] terms = new Term[tokens.length];
// normalized terms (do identify duplicates)
Set<String> termSet = new HashSet<String>();
// construct multi-token terms
for (int length = MAX_TERM_LENGTH; length > 1; length--)
for (int id = 0; id < tokens.length - length + 1; id++) {
// one of the tokens is already assigned to a term?
boolean skip = false;
for (int offset = 0; offset < length; offset++)
if (assigned[id + offset]) {
skip = true;
continue;
}
if (skip) continue;
// get phrase spanning the tokens
String text = tokens[id];
for (int offset = 1; offset < length; offset++)
text += " " + tokens[id + offset];
text = OpenNLP.untokenize(text, sentence);
// phrase is a duplicate?
if (!termSet.add(StringUtils.normalize(text))) continue;
// phrase does not contain keywords?
if (KeywordExtractor.getKeywords(text).length == 0) continue;
// phrase is a named entity?
String[] neTypes = getNeTypes(text, nes);
if (neTypes.length > 0) {
// construct term
terms[id] = new Term(text, Term.COMPOUND, neTypes);
// mark tokens as assigned
for (int offset = 0; offset < length; offset++)
assigned[id + offset] = true;
continue;
}
for (Dictionary dict : dicts) {
// phrase is not a noun phrase or verb phrase?
if (!(chunks[id].endsWith("NP") && // look up noun phrases
chunks[id + length - 1].endsWith("NP"))/* &&
!(chunks[id].endsWith("VP") && // look up verb phrases
chunks[id + length - 1].endsWith("VP"))*/)
continue;
// phrase contains a special characters other than '.'?
if (text.matches(".*?[^\\w\\s\\.].*+")) continue;
// phrase can be truncated?
if (!text.equals(TruncationFilter.truncate(text))) continue;
// phrase is in the dictionary?
if (dict.contains(text)) {
// construct term
terms[id] = new Term(text, Term.COMPOUND);
// mark tokens as assigned
for (int offset = 0; offset < length; offset++)
assigned[id + offset] = true;
continue;
}
}
}
// construct single-token terms
for (int id = 0; id < tokens.length; id++) {
// token is part of a multi-token term?
if (assigned[id]) continue;
// token is a duplicate?
if (!termSet.add(StringUtils.normalize(tokens[id]))) continue;
// token does not contain keywords?
if (KeywordExtractor.getKeywords(tokens[id]).length == 0) continue;
// get named entity types and construct term
String[] neTypes = getNeTypes(tokens[id], nes);
terms[id] = new Term(tokens[id], pos[id], neTypes);
}
// get ordered list of terms
List<Term> termsL = new ArrayList<Term>();
for (Term term : terms)
if (term != null) termsL.add(term);
return termsL.toArray(new Term[termsL.size()]);
}
/**
* Extracts terms from the given sentence and context string.
*
* @param sentence sentence to analyze
* @param context context string
* @param nes named entities in the sentence and context string
* @param dicts dictionaries with compound terms
* @return terms in the sentence and context string
*/
public static Term[] getTerms(String sentence, String context,
String[][] nes, Dictionary[] dicts) {
// extract terms from sentence
Term[] sentenceTerms = getTerms(sentence, nes, dicts);
if (context == null || context.length() == 0) return sentenceTerms;
// extract terms from context string
Term[] contextTerms = getTerms(context, nes, dicts);
if (sentenceTerms.length == 0) return contextTerms;
if (contextTerms.length == 0) return sentenceTerms;
// merge terms, eliminate duplicates
List<Term> terms = new ArrayList<Term>();
Set<String> termSet = new HashSet<String>();
for (Term sentenceTerm : sentenceTerms)
if (termSet.add(StringUtils.normalize(sentenceTerm.getText())))
terms.add(sentenceTerm);
for (Term contextTerm : contextTerms)
if (termSet.add(StringUtils.normalize(contextTerm.getText())))
terms.add(contextTerm);
return terms.toArray(new Term[terms.size()]);
}
/**
* Extracts single-token terms from the given sentence.
*
* @param sentence sentence to analyze
* @return single-token terms in the sentence
*/
public static Term[] getSingleTokenTerms(String sentence) {
// extract tokens
String[] tokens = OpenNLP.tokenize(sentence);
// tag part of speech
String[] pos = OpenNLP.tagPos(tokens);
// extracted terms
ArrayList<Term> terms = new ArrayList<Term>();
// normalized terms (do identify duplicates)
Set<String> termSet = new HashSet<String>();
// construct single-token terms
for (int id = 0; id < tokens.length; id++) {
// token is a duplicate?
if (!termSet.add(StringUtils.normalize(tokens[id]))) continue;
// token does not contain keywords?
if (KeywordExtractor.getKeywords(tokens[id]).length == 0) continue;
// construct term
terms.add(new Term(tokens[id], pos[id]));
}
return terms.toArray(new Term[terms.size()]);
}
}