QuestionNormalizer.java example

Explorer
lucida-master
- lucida
package info.ephyra.questionanalysis;

import info.ephyra.nlp.NETagger;
import info.ephyra.nlp.OpenNLP;
import info.ephyra.nlp.VerbFormConverter;
import info.ephyra.nlp.semantics.ontologies.WordNet;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javatools.PlingStemmer;

/**
 * This class provides methods that modify a question to facilitate pattern
 * matching and to anticipate the format of text passages that answer the
 * question.
 * 
 * @author Nico Schlaefer
 * @version 2006-06-18
 */
public class QuestionNormalizer {
	/**
	 * Matches an interrogative that is directly followed by the short form
	 * "'s" or "'re" (case-insensitive, whole words only).
	 */
	private static final Pattern SHORT_FORM_PATTERN = Pattern.compile(
			"(?i)\\b(how|what|which|when|where|who|why)'(s|re)\\b");
	
	/**
	 * Replaces short forms of "is" and "are" that occur in combination with
	 * interrogatives, e.g. "what's" -> "what is" and "who're" -> "who are".
	 * Every occurrence in the question is expanded, the interrogative keeps
	 * its original capitalization and the expansion is always lower case.
	 * Short forms attached to other words (e.g. "Bob's") are left untouched.
	 * 
	 * @param question the question string
	 * @return modified question string, or the unchanged string if it
	 *         contains no such short forms
	 */
	private static String replaceShortForms(String question) {
		Matcher m = SHORT_FORM_PATTERN.matcher(question);
		StringBuffer expanded = new StringBuffer(question.length() + 8);
		
		while (m.find()) {
			// group 1: interrogative as written, group 2: "s" or "re"
			String verb = m.group(2).equalsIgnoreCase("s") ? "is" : "are";
			m.appendReplacement(expanded,
					Matcher.quoteReplacement(m.group(1) + " " + verb));
		}
		m.appendTail(expanded);
		
		return expanded.toString();
	}
	
	/**
	 * Matches a filler word or phrase together with the blank that follows
	 * it. The leading word boundary keeps the pattern from firing inside
	 * longer words (e.g. "one of" inside "none of").
	 */
	private static final Pattern FILLER_PATTERN = Pattern.compile(
			"\\b(approximate|approximately|one of|so-called) ");
	
	/**
	 * Drops filler words from the question string. A filler is only removed
	 * if it starts at a word boundary and is followed by a blank; matching is
	 * case-sensitive.
	 * 
	 * @param question the question string
	 * @return modified question string
	 */
	private static String dropFillers(String question) {
		return FILLER_PATTERN.matcher(question).replaceAll("");
	}
	
	/**
	 * Matches a tagged question that contains a token "is", "are", "was" or
	 * "were" (group 2) followed later by a token (group 3) whose tag starts
	 * with "vbg" or "vbn".
	 */
	private static final Pattern AUX_IS_PATTERN = Pattern.compile(
			"(?i)(.* )?(is|are|was|were)/.*? (\\S*)/vb(g|n).*");
	
	/**
	 * <p>Modifies the question string by applying the following rule:</p>
	 * 
	 * <p><code>is/are/was/were [...] gerund / past participle ->
	 * is/are/was/were gerund / past participle</code></p>
	 * 
	 * <p>The auxiliary is inserted in front of the first whole-word
	 * occurrence of the verb and the first whole-word occurrence of the
	 * auxiliary is then removed. Both are looked up literally and
	 * case-insensitively, so regex metacharacters in a token and different
	 * capitalization in the question do not matter.</p>
	 * 
	 * @param question question string
	 * @param tagged tagged question, each token followed by "/" and its tag
	 * @return modified question strings or <code>null</code>, if the rule
	 * 		   does not apply
	 */
	private static String[] handleAuxIs(String question, String tagged) {
		Matcher m = AUX_IS_PATTERN.matcher(tagged);
		if (!m.matches()) return null;
		
		String aux = m.group(2);
		String verb = m.group(3);
		
		// the lookarounds restrict the match to whole words without requiring
		// the token itself to start or end with a word character
		Pattern verbWord = Pattern.compile(
				"(?i)(?<!\\w)" + Pattern.quote(verb) + "(?!\\w)");
		Pattern auxWord = Pattern.compile(
				"(?i)(?<!\\w)" + Pattern.quote(aux) + " ");
		
		// "$0" re-inserts the verb exactly as it is written in the question
		String moved = verbWord.matcher(question)
				.replaceFirst(Matcher.quoteReplacement(aux) + " $0");
		moved = auxWord.matcher(moved).replaceFirst("");
		
		return new String[] {moved};
	}
	
	/**
	 * Matches a tagged question that contains a modal auxiliary token
	 * (group 2) followed later by a token (group 3) tagged exactly "vb".
	 */
	private static final Pattern AUX_CAN_MAY_PATTERN = Pattern.compile(
			"(?i)(.* )?(can|could|will|would|shall" +
			"|should|may|might|must)/.*? " +
			"(\\S*)/vb(\\W.*)?");
	
	/**
	 * <p>Modifies the question string by applying the following rule:</p>
	 * 
	 * <p><code>can/could/will/would/shall/should/may/might/must [...]
	 * infinitive -> can/could/will/would/shall/should/may/might/must
	 * infinitive</code></p>
	 * 
	 * <p>The auxiliary is inserted in front of the first whole-word
	 * occurrence of the verb and the first whole-word occurrence of the
	 * auxiliary is then removed. Both are looked up literally and
	 * case-insensitively.</p>
	 * 
	 * @param question question string
	 * @param tagged tagged question, each token followed by "/" and its tag
	 * @return modified question strings or <code>null</code>, if the rule
	 * 		   does not apply
	 */
	private static String[] handleAuxCanMay(String question, String tagged) {
		Matcher m = AUX_CAN_MAY_PATTERN.matcher(tagged);
		if (!m.matches()) return null;
		
		String aux = m.group(2);
		String verb = m.group(3);
		
		// the lookarounds restrict the match to whole words without requiring
		// the token itself to start or end with a word character
		Pattern verbWord = Pattern.compile(
				"(?i)(?<!\\w)" + Pattern.quote(verb) + "(?!\\w)");
		Pattern auxWord = Pattern.compile(
				"(?i)(?<!\\w)" + Pattern.quote(aux) + " ");
		
		// "$0" re-inserts the verb exactly as it is written in the question
		String moved = verbWord.matcher(question)
				.replaceFirst(Matcher.quoteReplacement(aux) + " $0");
		moved = auxWord.matcher(moved).replaceFirst("");
		
		return new String[] {moved};
	}
	
	/**
	 * Matches a tagged question that contains a token "has", "have" or "had"
	 * (group 2) followed later by a token (group 3) whose tag starts with
	 * "vbn".
	 */
	private static final Pattern AUX_HAS_HAD_PATTERN = Pattern.compile(
			"(?i)(.* )?(has|have|had)/.*? (\\S*)/vbn.*");
	
	/**
	 * <p>Modifies the question string by applying the following rule:</p>
	 * 
	 * <p><code>have/has/had [...] past_participle -> has/have/had
	 * past_participle / simple_past</code></p>
	 * 
	 * <p>The first result moves the auxiliary in front of the past
	 * participle; each further result replaces the participle with one of
	 * the simple past forms supplied by <code>VerbFormConverter</code> and
	 * drops the auxiliary. Verb and auxiliary are looked up literally,
	 * case-insensitively and as whole words; only their first occurrences
	 * are modified.</p>
	 * 
	 * @param question question string
	 * @param tagged tagged question, each token followed by "/" and its tag
	 * @return modified question strings or <code>null</code>, if the rule
	 * 		   does not apply
	 */
	private static String[] handleAuxHasHad(String question, String tagged) {
		Matcher m = AUX_HAS_HAD_PATTERN.matcher(tagged);
		if (!m.matches()) return null;
		
		String aux = m.group(2);
		String verb = m.group(3);
		String[] sp = VerbFormConverter.pastParticipleToSimplePast(verb);
		
		// the lookarounds restrict the match to whole words without requiring
		// the token itself to start or end with a word character
		Pattern verbWord = Pattern.compile(
				"(?i)(?<!\\w)" + Pattern.quote(verb) + "(?!\\w)");
		Pattern auxWord = Pattern.compile(
				"(?i)(?<!\\w)" + Pattern.quote(aux) + " ");
		
		String[] results = new String[sp.length + 1];
		
		// variant 0: auxiliary moved directly in front of the participle
		// ("$0" re-inserts the verb exactly as written in the question)
		String moved = verbWord.matcher(question)
				.replaceFirst(Matcher.quoteReplacement(aux) + " $0");
		results[0] = auxWord.matcher(moved).replaceFirst("");
		
		// variants 1..n: participle replaced by a simple past form
		for (int i = 0; i < sp.length; i++) {
			String past = verbWord.matcher(question)
					.replaceFirst(Matcher.quoteReplacement(sp[i]));
			results[i + 1] = auxWord.matcher(past).replaceFirst("");
		}
		
		return results;
	}
	
	/**
	 * Matches a tagged question that contains the token "do" followed later
	 * by a token tagged exactly "vb".
	 */
	private static final Pattern AUX_DO_PATTERN = Pattern.compile(
			"(?i)(.* )?do/.*? (\\S*)/vb(\\W.*)?");
	
	/**
	 * Matches the word "do" plus the following blank; the lookbehind keeps
	 * it from matching the end of a longer word such as "avocado".
	 */
	private static final Pattern DO_WORD_PATTERN = Pattern.compile(
			"(?i)(?<!\\w)do ");
	
	/**
	 * <p>Modifies the question string by applying the following rule:</p>
	 * 
	 * <p><code>do [...] infinitive -> infinitive</code></p>
	 * 
	 * <p>The first whole-word occurrence of "do" (in any capitalization) is
	 * removed together with the blank that follows it.</p>
	 * 
	 * @param question question string
	 * @param tagged tagged question, each token followed by "/" and its tag
	 * @return modified question strings or <code>null</code>, if the rule
	 * 		   does not apply
	 */
	private static String[] handleAuxDo(String question, String tagged) {
		if (!AUX_DO_PATTERN.matcher(tagged).matches()) return null;
		
		String withoutDo = DO_WORD_PATTERN.matcher(question).replaceFirst("");
		return new String[] {withoutDo};
	}
	
	/**
	 * Matches a tagged question that contains the token "does" followed
	 * later by a token (group 2) tagged exactly "vb".
	 */
	private static final Pattern AUX_DOES_PATTERN = Pattern.compile(
			"(?i)(.* )?does/.*? (\\S*)/vb(\\W.*)?");
	
	/** Matches the whole word "does" plus the following blank. */
	private static final Pattern DOES_WORD_PATTERN = Pattern.compile(
			"(?i)(?<!\\w)does ");
	
	/**
	 * <p>Modifies the question string by applying the following rule:</p>
	 * 
	 * <p><code>does [...] infinitive -> 3rd person singular</code></p>
	 * 
	 * <p>The first whole-word occurrence of the infinitive is replaced by
	 * the form supplied by <code>VerbFormConverter</code>, then the first
	 * whole-word occurrence of "does" is removed. Both lookups are literal
	 * and case-insensitive, so e.g. the verb "do" is not found inside
	 * "does".</p>
	 * 
	 * @param question question string
	 * @param tagged tagged question, each token followed by "/" and its tag
	 * @return modified question strings or <code>null</code>, if the rule
	 * 		   does not apply
	 */
	private static String[] handleAuxDoes(String question, String tagged) {
		Matcher m = AUX_DOES_PATTERN.matcher(tagged);
		if (!m.matches()) return null;
		
		String verb = m.group(2);
		String tps = VerbFormConverter.infinitiveToThirdPersonS(verb);
		
		// the lookarounds restrict the match to whole words without requiring
		// the token itself to start or end with a word character
		Pattern verbWord = Pattern.compile(
				"(?i)(?<!\\w)" + Pattern.quote(verb) + "(?!\\w)");
		
		String rewritten = verbWord.matcher(question)
				.replaceFirst(Matcher.quoteReplacement(tps));
		rewritten = DOES_WORD_PATTERN.matcher(rewritten).replaceFirst("");
		
		return new String[] {rewritten};
	}
	
	/**
	 * Matches a tagged question that contains the token "did" followed later
	 * by a token (group 2) tagged exactly "vb".
	 */
	private static final Pattern AUX_DID_PATTERN = Pattern.compile(
			"(?i)(.* )?did/.*? (\\S*)/vb(\\W.*)?");
	
	/** Matches the whole word "did" plus the following blank. */
	private static final Pattern DID_WORD_PATTERN = Pattern.compile(
			"(?i)(?<!\\w)did ");
	
	/**
	 * <p>Modifies the question string by applying the following rule:</p>
	 * 
	 * <p><code>did [...] infinitive -> simple_past</code></p>
	 * 
	 * <p>One result is produced per simple past form supplied by
	 * <code>VerbFormConverter</code>. In each result the first whole-word
	 * occurrence of the infinitive is replaced by that form and the first
	 * whole-word occurrence of "did" is removed, in line with the other
	 * <code>handleAux...()</code> rules. A new array is returned, so the
	 * array obtained from the converter is not modified.</p>
	 * 
	 * @param question question string
	 * @param tagged tagged question, each token followed by "/" and its tag
	 * @return modified question strings or <code>null</code>, if the rule
	 * 		   does not apply
	 */
	private static String[] handleAuxDid(String question, String tagged) {
		Matcher m = AUX_DID_PATTERN.matcher(tagged);
		if (!m.matches()) return null;
		
		String verb = m.group(2);
		String[] pastForms = VerbFormConverter.infinitiveToSimplePast(verb);
		
		// the lookarounds restrict the match to whole words without requiring
		// the token itself to start or end with a word character
		Pattern verbWord = Pattern.compile(
				"(?i)(?<!\\w)" + Pattern.quote(verb) + "(?!\\w)");
		
		String[] results = new String[pastForms.length];
		for (int i = 0; i < pastForms.length; i++) {
			String rewritten = verbWord.matcher(question)
					.replaceFirst(Matcher.quoteReplacement(pastForms[i]));
			results[i] = DID_WORD_PATTERN.matcher(rewritten).replaceFirst("");
		}
		
		return results;
	}
	
	/**
	 * Removes the quotation marks and the final punctuation mark from the
	 * question string. Quotation marks are removed first, so a final
	 * punctuation mark that is followed by a closing quotation mark (as in
	 * <code>Who said "to be or not to be?"</code>) is dropped as well. At
	 * most one trailing '.', '?' or '!' is removed.
	 * 
	 * @param question the question string
	 * @return modified question string
	 */
	private static String dropPunctuationMarks(String question) {
		// drop quotation marks
		String unquoted = question.replace("\"", "");
		// drop final punctuation mark
		return unquoted.replaceAll("[.?!]$", "");
	}
	
//	/**
//	 * Converts the first letter of the question string to lower case.
//	 * 
//	 * @param question the question string
//	 * @return modified question string
//	 */
//	private static String lowerFirstLetter(String question) {
//		if (question.length() > 0) {
//			String upper = question.substring(0, 1);  // get first letter
//			String lower = upper.toLowerCase();
//		
//			if (!lower.equals(upper))
//				return question.replaceFirst(upper, lower);
//		}
//		
//		return question;
//	}
	
	/**
	 * Normalizes a question string: surrounding whitespace is trimmed, every
	 * run of whitespace is collapsed into a single blank, short forms of
	 * "is" and "are" after interrogatives are expanded and filler words are
	 * dropped, in that order.
	 * 
	 * @param question question string
	 * @return normalized question string
	 */
	public static String normalize(String question) {
		// trim first, then squeeze inner whitespace runs to one blank
		String collapsed = question.trim().replaceAll("\\s+", " ");
		
		// expand "'s" / "'re", then remove the filler words
		return dropFillers(replaceShortForms(collapsed));
	}
	
	/**
	 * Converts the verbs to infinitive and the nouns to their singular forms.
	 * The question is converted to lower case, and the final punctuation
	 * mark and quotation marks are dropped from the result.
	 * 
	 * <p>Each token is replaced at its own position in the question (tokens
	 * are located from left to right), so a replacement never touches other
	 * words that merely contain the token as a substring, such as "is"
	 * inside "this".</p>
	 * 
	 * @param qn normalized question string
	 * @return stemmed question string
	 */
	public static String stemVerbsAndNouns(String qn) {
		// tokenize and tag POS on the original string, then work in lower case
		String[] tokens = OpenNLP.tokenize(qn);
		String[] pos = OpenNLP.tagPos(tokens);
		
		String lower = qn.toLowerCase();
		StringBuilder stemmed = new StringBuilder(lower.length());
		int cursor = 0;  // first character of 'lower' not yet copied
		
		for (int i = 0; i < tokens.length; i++) {
			String token = tokens[i].toLowerCase();
			
			// locate this token behind the previously handled one
			int at = lower.indexOf(token, cursor);
			if (at < 0) continue;  // token text not found, leave as it is
			
			String rep = token;
			if (pos[i].startsWith("VB")) {
				String lemma = WordNet.getLemma(token, WordNet.VERB);
				if (lemma != null) rep = lemma;
			} else if (pos[i].startsWith("NN")) {
				String singular = PlingStemmer.stem(token);
				if (singular != null) rep = singular;
			}
			
			// copy the text between the tokens, then the (stemmed) token
			stemmed.append(lower, cursor, at).append(rep);
			cursor = at + token.length();
		}
		stemmed.append(lower, cursor, lower.length());
		
		// drop final punctuation mark and quotation marks
		return dropPunctuationMarks(stemmed.toString());
	}
	
	/**
	 * Unstems a substring of the stemmed question string by mapping it to the
	 * normalized question string. The mapping is done by token position: the
	 * tokens in front of the first occurrence of <code>sub</code> are
	 * counted, and the tokens at the same positions in the normalized
	 * question (after dropping its final punctuation mark and quotation
	 * marks) are returned.
	 * 
	 * <p><code>sub</code> is searched literally, so characters with a
	 * special meaning in regular expressions are safe.</p>
	 * 
	 * @param sub a substring of the stemmed question string
	 * @param stemmed the stemmed question string
	 * @param qn the normalized question string
	 * @return unstemmed string or <code>sub</code>, if it is empty, not a
	 * 		   substring of <code>stemmed</code>, or lies beyond the tokens of
	 * 		   <code>qn</code>
	 */
	public static String unstem(String sub, String stemmed, String qn) {
		if (sub.length() == 0) return sub;  // nothing to map
		
		int at = stemmed.indexOf(sub);
		if (at < 0) return sub;  // substring does not occur in stemmed string
		
		// preprocess the normalized question string
		// - drop final punctuation mark and quotation marks
		qn = dropPunctuationMarks(qn);
		
		// token positions covered by the substring
		int start = NETagger.tokenize(stemmed.substring(0, at)).length;
		int end = start + NETagger.tokenize(sub).length;
		String[] tokens = NETagger.tokenize(qn);
		
		// the two strings may tokenize differently; never read past the end
		if (start >= tokens.length) return sub;
		if (end > tokens.length) end = tokens.length;
		
		StringBuilder joined = new StringBuilder(tokens[start]);
		for (int i = start + 1; i < end; i++)
			joined.append(' ').append(tokens[i]);
		
		return OpenNLP.untokenize(joined.toString(), qn);
	}
	
	/**
	 * <p>Handles auxiliary verbs by applying the rules specified in the
	 * documentations of the <code>handleAux...()</code> methods. The rules
	 * are tried in a fixed order and only the first one that applies is
	 * used; if none applies, the question itself is the only result. The
	 * final punctuation mark and quotation marks are dropped from every
	 * result.</p>
	 * 
	 * @param qn normalized question string
	 * @return question strings with modified verbs
	 */
	public static String[] handleAuxiliaries(String qn) {
		// tokenize, tag POS and convert to lower case
		String spaced = OpenNLP.tokenizeWithSpaces(qn);
		String tagged = OpenNLP.tagPos(spaced).toLowerCase();
		
		String[] variants = applyFirstAuxRule(qn, tagged);
		
		// drop final punctuation mark and quotation marks
		for (int k = 0; k < variants.length; k++)
			variants[k] = dropPunctuationMarks(variants[k]);
		
		return variants;
	}
	
	/**
	 * Tries the auxiliary rules one after another and returns the output of
	 * the first rule that applies.
	 * 
	 * @param qn normalized question string
	 * @param tagged POS-tagged question in lower case
	 * @return output of the first applicable rule, or an array that only
	 * 		   contains <code>qn</code> if no rule applies
	 */
	private static String[] applyFirstAuxRule(String qn, String tagged) {
		String[] rewritten;
		
		// is/are/was/were [...] gerund / past participle
		rewritten = handleAuxIs(qn, tagged);
		if (rewritten != null) return rewritten;
		
		// can/could/will/would/shall/should/may/might/must [...] infinitive
		rewritten = handleAuxCanMay(qn, tagged);
		if (rewritten != null) return rewritten;
		
		// have/has/had [...] past participle
		rewritten = handleAuxHasHad(qn, tagged);
		if (rewritten != null) return rewritten;
		
		// do [...] infinitive
		rewritten = handleAuxDo(qn, tagged);
		if (rewritten != null) return rewritten;
		
		// does [...] infinitive
		rewritten = handleAuxDoes(qn, tagged);
		if (rewritten != null) return rewritten;
		
		// did [...] infinitive
		rewritten = handleAuxDid(qn, tagged);
		if (rewritten != null) return rewritten;
		
		// none of the above rules applies
		return new String[] {qn};
	}
	
	/**
	 * Turns a list question into a factoid question. The question is
	 * normalized first; if it then starts with one of the recognized list
	 * openers, the opener and any following quantifying expression are
	 * replaced by the opener alone, where "list", "give", "provide" and
	 * "identify" are mapped to "name".
	 * 
	 * @param question a list question
	 * @return transformed question, or the normalized question if it does
	 * 		   not start with a list opener
	 */
	public static String transformList(String question) {
		String normalized = normalize(question);
		
		String listPattern = ("(?i)^") +
			"(name|(what|which|who)( (is|are|was|were))?|list|give|provide|identify) " +
			"((a list of )?((the )?names of )?(all|every|a few|more|(the )?other|(the )?several|some( of)?|(the )?various) )?";
		Matcher m = Pattern.compile(listPattern).matcher(normalized);
		
		// not phrased as a list question: nothing to transform
		if (!m.find()) return normalized;
		
		// group 1 is the opener, group 0 the whole prefix to be replaced
		String opener = m.group(1);
		boolean isCommand = opener.matches("(?i)(list|give|provide|identify)");
		String rep = isCommand ? "name" : opener;
		
		return normalized.replaceFirst(m.group(0), rep + " ");
	}
}