package info.ephyra.questionanalysis;
import info.ephyra.nlp.indices.FunctionWords;
import info.ephyra.nlp.indices.WordFrequencies;
import info.ephyra.util.StringUtils;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* <p>Extracts keywords from a question.</p>
*
* <p>The method <code>getKeywords()</code> tokenizes the question string and
* drops single characters as well as "bad" keywords (such as "name" or
* "give") that frequently occur in questions but should not be part of a
* query string. Furthermore, all words that appear in the
* <code>FunctionWords</code> dictionary as well as duplicates are dropped.</p>
*
* <p>The method <code>getInfrequentKeywords()</code> additionally drops the
* most frequent keywords if the number of keywords exceeds the threshold
* specified in <code>MAX_WORDS</code>.</p>
*
* @author Nico Schlaefer
* @version 2007-02-09
*/
public class KeywordExtractor {
/** Delimiters that are always separated with blanks. */
private static final Pattern DELIMS1 =
Pattern.compile("(\\!|\\?|;|\"|'|/|\\\\|\\(|\\)|\\[|\\]|\\{|\\})");
/** Delimiters that are only separated with blanks if not adjacent to a digit. */
private static final Pattern DELIMS2 =
Pattern.compile("(^|\\D)(,|\\:)($|\\D)");
/** Delimiters that are only separated with blanks if they are the last character of the string. */
private static final Pattern DELIMS3 = Pattern.compile("(\\.)$");
/** Words that should not be part of a query string. */
private static final String IGNORE = "(names?|give|tell|list)";
/** Maximum number of keywords that are extracted. */
private static final int MAX_WORDS = Integer.MAX_VALUE;
/**
* Drops single characters.
*
* @param words array of words
* @return array without single characters
*/
private static String[] dropSingleChars(String[] words) {
ArrayList<String> noChar = new ArrayList<String>();
for (String word : words)
if (word.length() > 1) noChar.add(word);
return noChar.toArray(new String[noChar.size()]);
}
/**
* Drops keywords that should not be part of a query string.
*
* @param words array of words
* @return array without bad keywords
*/
private static String[] dropBadKeywords(String[] words) {
ArrayList<String> goodKeywords = new ArrayList<String>();
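// matches() below requires the entire token to equal one of the IGNORE
// words (case-insensitive), so e.g. "listed" or "giver" would be kept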
for (String word : words)
if (!word.matches("(?i)" + IGNORE)) goodKeywords.add(word);
return goodKeywords.toArray(new String[goodKeywords.size()]);
}
/**
* Drops function words.
*
* @param words array of words
* @return array without function words
*/
private static String[] dropFunctionWords(String[] words) {
ArrayList<String> content = new ArrayList<String>();
for (String word : words)
if (!FunctionWords.lookup(word))
content.add(word);
return content.toArray(new String[content.size()]);
}
/**
* Drops duplicates.
*
* @param words array of words
* @return array without duplicates
*/
private static String[] dropDuplicates(String[] words) {
HashSet<String> normSet = new HashSet<String>();
ArrayList<String> wordList = new ArrayList<String>();
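// keep only the first occurrence of each normalized form, preserving the
// original spelling of that first occurrence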
for (String word : words) {
String norm = StringUtils.normalize(word);
if (normSet.add(norm)) // compare normalizations
wordList.add(word);
}
return wordList.toArray(new String[wordList.size()]);
}
/**
* Removes the most frequent words if the number of words exceeds the
* threshold specified in the <code>MAX_WORDS</code> field.
*
* @param words array of words
* @return array of at most <code>MAX_WORDS</code> words
*/
private static String[] dropFrequentWords(String[] words) {
if (words.length > MAX_WORDS) { // number of words exceeds threshold
// get word frequencies
int[] frequencies = new int[words.length];
for (int i = 0; i < words.length; i++)
frequencies[i] = WordFrequencies.lookup(words[i]);
// mark most frequent words
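// (repeatedly find the highest remaining frequency and mark that word by
// setting its frequency to -1; marked words are dropped below)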
int index = -1, max = -1;
for (int i = 0; i < words.length - MAX_WORDS; i++) {
for (int j = 0; j < words.length; j++)
if (frequencies[j] > max) {
index = j;
max = frequencies[j];
}
frequencies[index] = -1;
max = -1;
}
// create new array with rare words
String[] rare = new String[MAX_WORDS];
int pos = 0;
for (int i = 0; i < words.length; i++)
if (frequencies[i] >= 0) rare[pos++] = words[i];
return rare;
} else { // number of words is less than or equal to the threshold
return words;
}
}
/**
* A rule-based tokenizer used to extract keywords for a query. This
* tokenizer is conservative, e.g. it does not split "F16" or "1,000.00".
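* For example, <code>tokenizeWithSpaces("Who wrote \"Hamlet\"?")</code>
* should yield <code>Who wrote " Hamlet " ?</code> (an illustrative example;
* the exact behavior is defined by the delimiter patterns above).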
*
* @param text text to tokenize
* @return string of space-delimited tokens
*/
public static String tokenizeWithSpaces(String text) {
String rep;
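// note: each matcher scans the string it was created on, while replace()
// rewrites all occurrences in the current text; a delimiter can therefore
// be padded with blanks more than once, but the extra whitespace is
// collapsed at the end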
Matcher m1 = DELIMS1.matcher(text);
while (m1.find()) {
rep = " " + m1.group(0) + " ";
text = text.replace(m1.group(0), rep);
}
Matcher m2 = DELIMS2.matcher(text);
while (m2.find()) {
rep = m2.group(1) + " " + m2.group(2) + " " + m2.group(3);
text = text.replace(m2.group(0), rep);
}
Matcher m3 = DELIMS3.matcher(text);
if (m3.find()) {
rep = " " + m3.group(0);
text = text.substring(0, text.length() - 1) + rep;
}
text = text.replaceAll("\\s++", " ").trim();
return text;
}
/**
* Applies the rule-based tokenizer and splits the resulting string at
* whitespace.
*
* @param text text to tokenize
* @return array of tokens
*/
public static String[] tokenize(String text) {
text = tokenizeWithSpaces(text);
return text.split(" ");
}
/**
* Extracts keywords from a question.
*
* @param verbMod question string with modified verbs
* @return keywords
*/
public static String[] getKeywords(String verbMod) {
// split question into words
String[] words = tokenize(verbMod);
// drop single characters
words = dropSingleChars(words);
// drop bad keywords
words = dropBadKeywords(words);
// drop function words
words = dropFunctionWords(words);
// drop duplicates
words = dropDuplicates(words);
return words;
}
/**
* Extracts keywords from a question and a context string.
*
* @param verbMod question string with modified verbs
* @param context context string
* @return keywords
*/
public static String[] getKeywords(String verbMod, String context) {
return getKeywords(verbMod + " " + context);
}
/**
* Extracts at most <code>MAX_WORDS</code> of the least frequent keywords from
* a question.
*
* @param verbMod question string with modified verbs
* @return keywords
*/
public static String[] getInfrequentKeywords(String verbMod) {
// get all keywords
String[] words = getKeywords(verbMod);
// drop frequent keywords if the number of keywords exceeds 'MAX_WORDS'
words = dropFrequentWords(words);
return words;
}
/**
* Checks if the text contains one of the keywords.
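* For example, <code>containsKeyword("Who flew the F16?", new String[]
* {"F16"})</code> should return <code>true</code> (illustrative; the
* comparison is performed on the raw tokens and is case-sensitive).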
*
* @param text a text
* @param kws keywords
* @return <code>true</code> iff the text contains one of the keywords
*/
public static boolean containsKeyword(String text, String[] kws) {
HashSet<String> kwsSet = new HashSet<String>();
for (String kw : kws) kwsSet.add(kw);
String[] words = tokenize(text);
for (String word : words)
if (kwsSet.contains(word)) return true;
return false;
}
}