package info.ephyra.questionanalysis;

import info.ephyra.nlp.indices.FunctionWords;
import info.ephyra.nlp.indices.WordFrequencies;
import info.ephyra.util.StringUtils;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * <p>Extracts keywords from a question.</p>
 * 
 * <p>The method <code>getKeywords()</code> tokenizes the question string and
 * drops single characters and bad keywords that frequently appear in
 * questions. Furthermore, all words that appear in the
 * <code>FunctionWords</code> dictionary are dropped, as are duplicates.</p>
 * 
 * <p>The method <code>getInfrequentKeywords()</code> additionally drops the
 * most frequent keywords if the number of words exceeds the threshold
 * specified in <code>MAX_WORDS</code>.</p>
 * 
 * @author Nico Schlaefer
 * @version 2007-02-09
 */
public class KeywordExtractor {
	/** Tokens that are always separated with blanks. */
	private static final Pattern DELIMS1 =
		Pattern.compile("(\\!|\\?|;|\"|'|/|\\\\|\\(|\\)|\\[|\\]|\\{|\\})");
	/** Tokens that are only separated with blanks if they are not adjacent to
	 *  a digit. */
	private static final Pattern DELIMS2 =
		Pattern.compile("(^|\\D)(,|\\:)($|\\D)");
	/** Tokens that are only separated with blanks if they are the final
	 *  token. */
	private static final Pattern DELIMS3 = Pattern.compile("(\\.)$");
	/** Words that should not be part of a query string. */
	private static final String IGNORE = "(names?|give|tell|list)";
	/** Maximum number of keywords that are extracted. */
	private static final int MAX_WORDS = Integer.MAX_VALUE;
	
	/**
	 * Drops single characters.
	 * 
	 * @param words array of words
	 * @return array without single characters
	 */
	private static String[] dropSingleChars(String[] words) {
		ArrayList<String> noChar = new ArrayList<String>();
		
		for (String word : words)
			if (word.length() > 1) noChar.add(word);
		
		return noChar.toArray(new String[noChar.size()]);
	}
	
	/**
	 * Drops keywords that should not be part of a query string.
	 * 
	 * @param words array of words
	 * @return array without bad keywords
	 */
	private static String[] dropBadKeywords(String[] words) {
		ArrayList<String> goodKeywords = new ArrayList<String>();
		
		for (String word : words)
			if (!word.matches("(?i)" + IGNORE)) goodKeywords.add(word);
		
		return goodKeywords.toArray(new String[goodKeywords.size()]);
	}
	
	/**
	 * Drops function words.
	 * 
	 * @param words array of words
	 * @return array without function words
	 */
	private static String[] dropFunctionWords(String[] words) {
		ArrayList<String> content = new ArrayList<String>();
		
		for (String word : words)
			if (!FunctionWords.lookup(word)) content.add(word);
		
		return content.toArray(new String[content.size()]);
	}
	
	/**
	 * Drops duplicates.
	 * 
	 * @param words array of words
	 * @return array without duplicates
	 */
	private static String[] dropDuplicates(String[] words) {
		HashSet<String> normSet = new HashSet<String>();
		ArrayList<String> wordList = new ArrayList<String>();
		
		for (String word : words) {
			String norm = StringUtils.normalize(word);
			if (normSet.add(norm))  // compare normalizations
				wordList.add(word);
		}
		
		return wordList.toArray(new String[wordList.size()]);
	}
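	
	/**
	 * Illustrative sketch (not part of the original class): traces a sample
	 * question through the same filtering steps that
	 * <code>getKeywords()</code> applies. The comments assume that the
	 * <code>FunctionWords</code> dictionary has been loaded and lists "the"
	 * and "of" as function words.
	 * 
	 * @return keywords extracted from the sample question
	 */
	private static String[] traceSampleQuestion() {
		String question = "Name the capital of France?";
		
		String[] words = tokenize(question);  // {"Name", "the", "capital", "of", "France", "?"}
		words = dropSingleChars(words);       // drops "?"
		words = dropBadKeywords(words);       // drops "Name" (matches IGNORE)
		words = dropFunctionWords(words);     // drops "the" and "of" (if in the dictionary)
		words = dropDuplicates(words);        // nothing to drop in this example
		
		return words;  // expected: {"capital", "France"}
	}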
	
	/**
	 * Removes the most frequent words if the number of words exceeds the
	 * threshold specified in the <code>MAX_WORDS</code> field.
	 * 
	 * @param words array of words
	 * @return array of at most <code>MAX_WORDS</code> words
	 */
	private static String[] dropFrequentWords(String[] words) {
		if (words.length > MAX_WORDS) {  // number of words exceeds threshold
			// get word frequencies
			int[] frequencies = new int[words.length];
			for (int i = 0; i < words.length; i++)
				frequencies[i] = WordFrequencies.lookup(words[i]);
			
			// mark most frequent words
			int index = -1, max = -1;
			for (int i = 0; i < words.length - MAX_WORDS; i++) {
				for (int j = 0; j < words.length; j++)
					if (frequencies[j] > max) {
						index = j;
						max = frequencies[j];
					}
				frequencies[index] = -1;
				max = -1;
			}
			
			// create new array with the rare words
			String[] rare = new String[MAX_WORDS];
			int pos = 0;
			for (int i = 0; i < words.length; i++)
				if (frequencies[i] >= 0) rare[pos++] = words[i];
			
			return rare;
		} else {  // number of words less than or equal to the threshold
			return words;
		}
	}
	
	/**
	 * A rule-based tokenizer used to extract keywords for a query. This
	 * tokenizer is conservative, e.g. it does not split "F16" or "1,000.00".
	 * 
	 * @param text text to tokenize
	 * @return string of space-delimited tokens
	 */
	public static String tokenizeWithSpaces(String text) {
		String rep;
		
		Matcher m1 = DELIMS1.matcher(text);
		while (m1.find()) {
			rep = " " + m1.group(0) + " ";
			text = text.replace(m1.group(0), rep);
		}
		
		Matcher m2 = DELIMS2.matcher(text);
		while (m2.find()) {
			rep = m2.group(1) + " " + m2.group(2) + " " + m2.group(3);
			text = text.replace(m2.group(0), rep);
		}
		
		Matcher m3 = DELIMS3.matcher(text);
		if (m3.find()) {
			rep = " " + m3.group(0);
			text = text.substring(0, text.length() - 1) + rep;
		}
		
		text = text.replaceAll("\\s++", " ").trim();
		
		return text;
	}
	
	/**
	 * Applies the rule-based tokenizer and splits the resulting string along
	 * whitespaces.
	 * 
	 * @param text text to tokenize
	 * @return array of tokens
	 */
	public static String[] tokenize(String text) {
		text = tokenizeWithSpaces(text);
		return text.split(" ");
	}
	
	/**
	 * Extracts keywords from a question.
	 * 
	 * @param verbMod question string with modified verbs
	 * @return keywords
	 */
	public static String[] getKeywords(String verbMod) {
		// split question into words
		String[] words = tokenize(verbMod);
		
		// drop single characters
		words = dropSingleChars(words);
		// drop bad keywords
		words = dropBadKeywords(words);
		// drop function words
		words = dropFunctionWords(words);
		// drop duplicates
		words = dropDuplicates(words);
		
		return words;
	}
	
	/**
	 * Extracts keywords from a question and a context string.
	 * 
	 * @param verbMod question string with modified verbs
	 * @param context context string
	 * @return keywords
	 */
	public static String[] getKeywords(String verbMod, String context) {
		return getKeywords(verbMod + " " + context);
	}
	
	/**
	 * Extracts up to <code>MAX_WORDS</code> of the least frequent keywords
	 * from a question.
	 * 
	 * @param verbMod question string with modified verbs
	 * @return keywords
	 */
	public static String[] getInfrequentKeywords(String verbMod) {
		// get all keywords
		String[] words = getKeywords(verbMod);
		
		// drop frequent keywords if the number of keywords exceeds 'MAX_WORDS'
		words = dropFrequentWords(words);
		
		return words;
	}
	
	/**
	 * Checks if the text contains one of the keywords.
	 * 
	 * @param text a text
	 * @param kws keywords
	 * @return <code>true</code> iff the text contains one of the keywords
	 */
	public static boolean containsKeyword(String text, String[] kws) {
		HashSet<String> kwsSet = new HashSet<String>();
		for (String kw : kws) kwsSet.add(kw);
		
		String[] words = tokenize(text);
		for (String word : words)
			if (kwsSet.contains(word)) return true;
		
		return false;
	}
}
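
/**
 * Illustrative driver (not part of the original Ephyra source): exercises the
 * rule-based tokenizer and the keyword containment check on sample input.
 * <code>getKeywords()</code> is not called here because it requires the
 * <code>FunctionWords</code> dictionary to be loaded first.
 */
class KeywordExtractorDemo {
	public static void main(String[] args) {
		// "F16" and "1,000.00" stay intact; the "?" is split off
		System.out.println(KeywordExtractor.tokenizeWithSpaces(
				"Is the F16 worth 1,000.00 dollars?"));
		// expected: "Is the F16 worth 1,000.00 dollars ?"
		
		// keyword containment check on tokenized text
		String[] kws = {"F16", "dollars"};
		System.out.println(KeywordExtractor.containsKeyword(
				"The F16 is a fighter jet.", kws));
		// expected: true ("F16" occurs as a token)
	}
}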