package info.ephyra.questionanalysis; import info.ephyra.nlp.NETagger; import info.ephyra.nlp.OpenNLP; import info.ephyra.nlp.VerbFormConverter; import info.ephyra.nlp.semantics.ontologies.WordNet; import java.util.regex.Matcher; import java.util.regex.Pattern; import javatools.PlingStemmer; /** * This class provides methods that modify a question to facilitate pattern * matching and to anticipate the format of text passages that answer the * question. * * @author Nico Schlaefer * @version 2006-06-18 */ public class QuestionNormalizer { /** * Replaces short forms of "is" and "are" that occur in combination with * interrogatives. * * @param question the question string * @return modified question string */ private static String replaceShortForms(String question) { // only replace occurences of "'s" and "'re" in combination with // interrogatives Pattern p = Pattern.compile("(?i)(how|what|which|when|where|who|why)'" + "(s|re)"); Matcher m = p.matcher(question); if (m.find()) { String original = m.group(); String replaced = original.replace("'s", " is"); replaced = replaced.replace("'re", " are"); return question.replace(original, replaced); } return question; // no such short forms in the question } /** * Drops filler words from the question string. * * @param question the question string * @return modified question string */ private static String dropFillers(String question) { String fillers = "(approximate|approximately|one of|so-called)"; return question.replaceAll(fillers + " ", ""); } /** * <p>Modifies the question string by applying the following rule:</p> * * <p><code>is/are/was/were [...] gerund / past participle -> * is/are/was/were gerund / past participle</code></p> * * @param question question string * @param tagged tagged question * @return modified question strings */ private static String[] handleAuxIs(String question, String tagged) { Pattern p = Pattern.compile("(?i)(.* )?(is|are|was|were)/.*? " + "(\\S*)/vb(g|n).*"); Matcher m = p.matcher(tagged); String[] results; if (m.matches()) { String aux = m.group(2); String verb = m.group(3); results = new String[1]; results[0] = question.replaceFirst(verb, aux + " " + verb); results[0] = results[0].replaceFirst(aux + " ", ""); return results; } return null; } /** * <p>Modifies the question string by applying the following rule:</p> * * <p><code>can/could/will/would/shall/should/may/might/must [...] * infinitive -> can/could/will/would/shall/should/may/might/must * infinitive</code></p> * * @param question question string * @param tagged tagged question * @return modified question strings */ private static String[] handleAuxCanMay(String question, String tagged) { Pattern p = Pattern.compile("(?i)(.* )?(can|could|will|would|shall" + "|should|may|might|must)/.*? " + "(\\S*)/vb(\\W.*)?"); Matcher m = p.matcher(tagged); String[] results; if (m.matches()) { String aux = m.group(2); String verb = m.group(3); results = new String[1]; results[0] = question.replaceFirst(verb, aux + " " + verb); results[0] = results[0].replaceFirst(aux + " ", ""); return results; } return null; } /** * <p>Modifies the question string by applying the following rule:</p> * * <p><code>have/has/had [...] past_participle -> has/have/had * past_participle / simple_past</code></p> * * @param question question string * @param tagged tagged question * @return modified question strings */ private static String[] handleAuxHasHad(String question, String tagged) { Pattern p = Pattern.compile("(?i)(.* )?(has|have|had)/.*? " + "(\\S*)/vbn.*"); Matcher m = p.matcher(tagged); String[] results; if (m.matches()) { String aux = m.group(2); String verb = m.group(3); String[] sp = VerbFormConverter.pastParticipleToSimplePast(verb); results = new String[sp.length + 1]; results[0] = question.replaceFirst(verb, aux + " " + verb); results[0] = results[0].replaceFirst(aux + " ", ""); for (int i = 0; i < sp.length; i++) { results[i + 1] = question.replaceFirst(verb, sp[i]); results[i + 1] = results[i + 1].replaceFirst(aux + " ", ""); } return results; } return null; } /** * <p>Modifies the question string by applying the following rule:</p> * * <p><code>do [...] infinitive -> infinitive</code></p> * * @param question question string * @param tagged tagged question * @return modified question strings */ private static String[] handleAuxDo(String question, String tagged) { Pattern p = Pattern.compile("(?i)(.* )?do/.*? (\\S*)/vb(\\W.*)?"); Matcher m = p.matcher(tagged); String[] results; if (m.matches()) { results = new String[1]; results[0] = question.replaceFirst("do ", ""); return results; } return null; } /** * <p>Modifies the question string by applying the following rule:</p> * * <p><code>does [...] infinitive -> 3rd person singular</code></p> * * @param question question string * @param tagged tagged question * @return modified question strings */ private static String[] handleAuxDoes(String question, String tagged) { Pattern p = Pattern.compile("(?i)(.* )?does/.*? (\\S*)/vb(\\W.*)?"); Matcher m = p.matcher(tagged); String[] results; if (m.matches()) { String verb = m.group(2); String tps = VerbFormConverter.infinitiveToThirdPersonS(verb); results = new String[1]; results[0] = question.replaceFirst(verb, tps); results[0] = results[0].replaceFirst("does ", ""); return results; } return null; } /** * <p>Modifies the question string by applying the following rule:</p> * * <p><code>did [...] infinitive -> simple_past</code></p> * * @param question question string * @param tagged tagged question * @return modified question strings */ private static String[] handleAuxDid(String question, String tagged) { Pattern p = Pattern.compile("(?i)(.* )?did/.*? (\\S*)/vb(\\W.*)?"); Matcher m = p.matcher(tagged); String[] results; if (m.matches()) { String verb = m.group(2); results = VerbFormConverter.infinitiveToSimplePast(verb); for (int i = 0; i < results.length; i++) { results[i] = question.replace(verb, results[i]); results[i] = results[i].replace("did ", ""); } return results; } return null; } /** * Removes the final punctuation mark and quotation marks from the question * string. * * @param question the question string * @return modified question string */ private static String dropPunctuationMarks(String question) { // drop final punctuation mark question = question.replaceAll("(\\.|\\?|!)$", ""); // drop quotation marks return question.replaceAll("\"", ""); } // /** // * Converts the first letter of the question string to lower case. // * // * @param question the question string // * @return modified question string // */ // private static String lowerFirstLetter(String question) { // if (question.length() > 0) { // String upper = question.substring(0, 1); // get first letter // String lower = upper.toLowerCase(); // // if (!lower.equals(upper)) // return question.replaceFirst(upper, lower); // } // // return question; // } /** * Normalizes a question string by removing abundant whitespaces, replacing * short forms and dropping filler words. * * @param question question string * @return normalized question string */ public static String normalize(String question) { //remove leading and trailing whitespaces question = question.trim(); //replace multiple whitespaces by a single blank question = question.replaceAll("\\s+", " "); // replace short forms of "is" and "are" question = replaceShortForms(question); // drop filler words question = dropFillers(question); return question; } /** * Converts the verbs to infinitive and the nouns to their singular forms. * * @param qn normalized question string * @return stemmed question string */ public static String stemVerbsAndNouns(String qn) { // tokenize, tag POS and convert to lower case String[] tokens = OpenNLP.tokenize(qn); String[] pos = OpenNLP.tagPos(tokens); qn = qn.toLowerCase(); for (int i = 0; i < tokens.length; i++) tokens[i] = tokens[i].toLowerCase(); for (int i = 0; i < tokens.length; i++) { if (pos[i].startsWith("VB")) { String rep = WordNet.getLemma(tokens[i], WordNet.VERB); if (rep == null) rep = tokens[i]; qn = qn.replace(tokens[i], rep); } else if (pos[i].startsWith("NN")) { String rep = PlingStemmer.stem(tokens[i]); qn = qn.replace(tokens[i], rep); } } // drop final punctuation mark and quotation marks qn = dropPunctuationMarks(qn); // convert the first letter to lower case // qn = lowerFirstLetter(qn); return qn; } /** * Unstems a substring of the stemmed question string by mapping it to the * normalized question string. * * @param sub a substring of the stemmed question string * @param stemmed the stemmed question string * @param qn the normalized question string * @return unstemmed string or <code>sub</code>, if it is not a substring of * <code>stemmed</code> */ public static String unstem(String sub, String stemmed, String qn) { String result = sub; // preprocess the normalized question string // - drop final punctuation mark and quotation marks qn = dropPunctuationMarks(qn); // - convert the first letter to lower case // qn = lowerFirstLetter(qn); String[] truncs = stemmed.split(sub, -1); if (truncs.length > 1) { // substring occurs in stemmed string? int start = NETagger.tokenize(truncs[0]).length; int end = start + NETagger.tokenize(sub).length; String[] tokens = NETagger.tokenize(qn); result = tokens[start]; for (int i = start + 1; i < end; i++) result += " " + tokens[i]; result = OpenNLP.untokenize(result, qn); } return result; } /** * <p>Handles auxiliary verbs by applying the rules specified in the * documentations of the <code>handleAux...()</code> methods.</p> * * @param qn normalized question string * @return question strings with modified verbs */ public static String[] handleAuxiliaries(String qn) { String[] results = {qn}; // tokenize, tag POS and convert to lower case String tokens = OpenNLP.tokenizeWithSpaces(qn); String tagged = OpenNLP.tagPos(tokens).toLowerCase(); // is/are/was/were [...] gerund / past participle -> // is/are/was/were gerund / past participle results = handleAuxIs(qn, tagged); // can/could/will/would/shall/should/may/might/must [...] infinitive -> // can/could/will/would/shall/should/may/might/must infinitive if (results == null) results = handleAuxCanMay(qn, tagged); // have/has/had [...] past_participle -> // has/have/had past_participle / simple_past if (results == null) results = handleAuxHasHad(qn, tagged); // do [...] infinitive -> infinitive if (results == null) results = handleAuxDo(qn, tagged); // does [...] infinitive -> infinitive + "s" if (results == null) results = handleAuxDoes(qn, tagged); // did [...] infinitive -> simple_past if (results == null) results = handleAuxDid(qn, tagged); // none of the above rules applies if (results == null) results = new String[] {qn}; for (int i = 0; i < results.length; i++) { // drop final punctuation mark and quotation marks results[i] = dropPunctuationMarks(results[i]); // convert the first letter to lower case // results[i] = lowerFirstLetter(results[i]); } return results; } /** * Replaces certain expressions in a list question to transform it into a * factoid question. * * @param question a list question * @return transformed question */ public static String transformList(String question) { question = normalize(question); String listPattern = ("(?i)^") + "(name|(what|which|who)( (is|are|was|were))?|list|give|provide|identify) " + "((a list of )?((the )?names of )?(all|every|a few|more|(the )?other|(the )?several|some( of)?|(the )?various) )?"; Matcher m = Pattern.compile(listPattern).matcher(question); if (m.find()) { String match = m.group(0); String rep = m.group(1); if (rep.matches("(?i)(list|give|provide|identify)")) rep = "name"; question = question.replaceFirst(match, rep + " "); } return question; } }