package info.ephyra.questionanalysis; import info.ephyra.nlp.OpenNLP; import info.ephyra.nlp.semantics.ASSERT; import info.ephyra.nlp.semantics.Predicate; import info.ephyra.util.RegexConverter; import java.text.ParseException; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Extracts predicate-argument structures from a question. At first, the question string is transformed into a statement * with a dummy argument, then the predicates are extracted and finally the dummy argument is dropped. * * @author Nico Schlaefer * @version 2007-04-25 */ public class PredicateExtractor { /** Pattern that matches any form of 'to be'. */ private static final String BE_P = "(?i)\\b(be|is|are|was|were|been)\\b"; /** Pattern that matches any form of 'to do'. */ private static final String DO_P = "(?i)\\b(do|does|did|done)\\b"; /** Pattern that matches any form of 'to have'. */ private static final String HAVE_P = "(?i)\\b(have|has|had)\\b"; /** Pattern that matches other verb forms which should not be considered as predicates. */ private static final String IGNORE_P = "(?i)\\b(name|give|tell|list)\\b"; /** Pattern that matches any interrogative or 'that'. */ private static final String INTERROGATIVE_P = "(?i)\\b(who(m|se)?|what|which|when|where|why|how|that)\\b"; /** Pattern for questions seeking a person. */ private static final String PERSON_P = "(?i)\\bwho(m|se)?\\b"; /** Pattern for questions seeking a thing. */ private static final String THING_P = "(?i)\\b(what|which)\\b"; /** Pattern for questions seeking a date/time. */ private static final String DATE_TIME_P = "(?i)\\bwhen\\b"; /** Pattern for questions seeking a location. */ private static final String LOCATION_P = "(?i)\\bwhere\\b"; /** Pattern for questions seeking a purpose. */ private static final String PURPOSE_P = "(?i)\\bwhy\\b"; /** Pattern for questions seeking a manner. */ private static final String MANNER_P = "(?i)\\bhow\\b"; /** Replacement if question is seeking a person. */ private static final String PERSON_R = "a PERSON"; /** Replacement if question is seeking a thing. */ private static final String THING_R = "a THING"; /** Replacement if question is seeking a date/time. */ private static final String DATE_TIME_R = "in 1999"; /** Replacement if question is seeking a location. */ private static final String DURATION_R = "for one HOUR"; /** Replacement if question is seeking a location. */ private static final String LOCATION_R = "in AMERICA"; /** Replacement if question is seeking a purpose. */ private static final String PURPOSE_R = "for PURPOSE"; /** Replacement if question is seeking a manner. */ private static final String MANNER_R = "with MANNER"; /** Replacement if question is seeking a quantification. */ private static final String QUANTIFICATION_R = "NOT"; /** Replacement if question is seeking an entity of an unknown type. */ private static final String UNKNOWN_R = "a BIG"; /** * Checks if the question contains a predicate that can be labeled. * * @param qn normalized question string * @return <code>true</code> iff the question contains a predicate */ private static boolean containsPredicate(String qn) { // tag POS and phrase chunks String[] tokens = OpenNLP.tokenize(qn); String[] pos = OpenNLP.tagPos(tokens); String[] chunks = OpenNLP.tagChunks(tokens, pos); // check if there is a verb other than 'to be', 'to do' or 'to have' which is not on the ignore list for (int i = 0; i < tokens.length; i++) if ((pos[i].startsWith("VB") || chunks[i].endsWith("-VP")) && !(tokens[i].matches(BE_P) || tokens[i].matches(DO_P) || tokens[i].matches(HAVE_P) || tokens[i].matches(IGNORE_P))) return true; // check if there is a verb phrase that ends in a verb other than 'to be' // String lastToken = ""; // boolean lastTokenVerbPhrase = false; // last token was part of a verb phrase // for (int i = 0; i < tokens.length; i++) { // if (pos[i].startsWith("VB") || chunks[i].endsWith("-VP")) { // lastToken = tokens[i]; // lastTokenVerbPhrase = true; // } else { // if (lastTokenVerbPhrase) // if (!lastToken.matches(BE_P) && !lastToken.matches(IGNORE_P)) return true; // lastTokenVerbPhrase = false; // } // } // if (lastTokenVerbPhrase) // if (!lastToken.matches(BE_P) && !lastToken.matches(IGNORE_P)) return true; return false; } /** * Transforms a phrase into a regular expression that matches the phrase, allowing differences regarding * whitespaces and punctuation and quotation marks. * * @param phrase phrase to transform * @return regular expression */ private static String phraseToRegex(String phrase) { // transform into a regular expression phrase = RegexConverter.strToRegex(phrase); // allow an arbitrary number of whitespaces phrase = phrase.replace(" ", "\\s*+"); // make punctuation marks optional phrase = phrase.replaceAll("(\\.|\\?|!|\")", ".?"); return phrase; } // methods to replace phrases with interrogatives private static String handleIgnore(String qn, String verbMod, String[] tokens, String[] pos, String[] chunks, int i) { // get phrase from word on ignore-list to next interrogative String phrase = tokens[i]; int interrogative = 0; for (int j = i + 1; j < tokens.length; j++) { phrase += " " + tokens[j]; if (tokens[j].matches(INTERROGATIVE_P)) { interrogative = j; break; } } if (interrogative > i + 1) { // is the interrogative followed by a verb? boolean verb = (interrogative + 1 < tokens.length && (pos[interrogative + 1].startsWith("VB") || chunks[interrogative + 1].endsWith("-VP"))) ? true : false; // replace phrase phrase = phraseToRegex(phrase); Matcher m = Pattern.compile(phrase).matcher(qn); if (m.find()) { String replacement = m.group(0).replaceFirst(IGNORE_P, UNKNOWN_R).replaceFirst(INTERROGATIVE_P, ""); if (verb) { verbMod = verbMod.replaceFirst(phrase, replacement); } else { verbMod = verbMod.replaceFirst(phrase, "") + " " + replacement; } } } return verbMod; } private static String handlePerson(String verbMod) { verbMod = verbMod.replaceFirst(PERSON_P, PERSON_R); return verbMod; } private static String handleThing(String qn, String verbMod, String[] ats, String[] tokens, String[] pos, String[] chunks, int i) { if (i + 1 == tokens.length || !chunks[i + 1].endsWith("-NP")) { // interrogative is not followed by a noun phrase // is the interrogative followed by an auxiliary verb that has been shifted in verbMod? boolean auxiliary = (i + 2 < tokens.length && !verbMod.matches(".*?" + phraseToRegex(tokens[i + 1] + " " + tokens[i + 2]) + ".*+")) ? true : false; // replace interrogative if (auxiliary) verbMod = verbMod.replaceFirst(THING_P, "") + " " + THING_R; else verbMod = verbMod.replaceFirst(THING_P, THING_R); } else { // interrogative is followed by noun phrases... // get interrogative + noun phrases String phrase = tokens[i]; int j; for (j = i + 1; j < tokens.length; j++) if (!pos[j].startsWith("VB") && !chunks[j].endsWith("-VP") && !(j == tokens.length - 1 && pos[j].equals("."))) phrase += " " + tokens[j]; else break; if (i == 0 || !chunks[i - 1].endsWith("-PP")) { // ...and not preceded by prepositions // replace phrase phrase = phraseToRegex(phrase); // special handling for certain answer types boolean replaced = false; for (String at : ats) if (at.startsWith("NEdate") || at.startsWith("NEtime")) { verbMod = verbMod.replaceFirst(phrase, "") + " " + DATE_TIME_R; replaced = true; break; } else if (at.startsWith("NElocation")) { verbMod = verbMod.replaceFirst(phrase, "") + " " + LOCATION_R; replaced = true; break; } // general case if (!replaced) { Matcher m = Pattern.compile(phrase).matcher(qn); if (m.find()) { // is the phrase followed by an auxiliary verb that has been shifted in verbMod? boolean auxiliary = (j + 1 < tokens.length && !verbMod.matches(".*?" + phraseToRegex(tokens[j] + " " + tokens[j + 1]) + ".*+")) ? true : false; String replacement = m.group(0).replaceFirst(THING_P, UNKNOWN_R); if (auxiliary) verbMod = verbMod.replaceFirst(phrase, "") + " " + replacement; else verbMod = verbMod.replaceFirst(phrase, replacement); } } } else { // ...and preceded by prepositions // get prepositions + interrogative + noun phrases for (j = i - 1; j >= 0; j--) if (chunks[j].endsWith("-PP")) phrase = tokens[j] + " " + phrase; else break; // replace phrase phrase = phraseToRegex(phrase); // special handling for certain answer types boolean replaced = false; for (String at : ats) if (at.startsWith("NEdate") || at.startsWith("NEtime")) { verbMod = verbMod.replaceFirst(phrase, "") + " " + DATE_TIME_R; replaced = true; break; } else if (at.startsWith("NElocation")) { verbMod = verbMod.replaceFirst(phrase, "") + " " + LOCATION_R; replaced = true; break; } // general case if (!replaced) { Matcher m = Pattern.compile(phrase).matcher(qn); if (m.find()) { String replacement = m.group(0).replaceFirst(THING_P, UNKNOWN_R); verbMod = verbMod.replaceFirst(phrase, "") + " " + replacement; } } } } return verbMod; } private static String handleDateTime(String verbMod) { verbMod = verbMod.replaceFirst(DATE_TIME_P, "") + " " + DATE_TIME_R; return verbMod; } private static String handleLocation(String verbMod) { verbMod = verbMod.replaceFirst(LOCATION_P, "") + " " + LOCATION_R; return verbMod; } private static String handlePurpose(String verbMod) { verbMod = verbMod.replaceFirst(PURPOSE_P, "") + " " + PURPOSE_R; return verbMod; } private static String handleManner(String qn, String verbMod, String[] ats, String[] tokens, String[] pos, String[] chunks, int i) { if (i + 1 == tokens.length || !(pos[i + 1].matches("JJ") || pos[i + 1].matches("RB"))) { // interrogative is not followed by an adjective verbMod = verbMod.replaceFirst(MANNER_P, "") + " " + MANNER_R; } else { // interrogative is followed by an adjective // get interrogative + adjective + noun phrases String phrase = tokens[i]; int j; for (j = i + 1; j < tokens.length; j++) if (!pos[j].startsWith("VB") && !chunks[j].endsWith("-VP") && !(j == tokens.length - 1 && pos[j].equals("."))) phrase += " " + tokens[j]; else break; // replace phrase phrase = phraseToRegex(phrase); // special handling for answer type 'duration' boolean replaced = false; for (String at : ats) if (at.startsWith("NEduration")) { verbMod = verbMod.replaceFirst(phrase, "") + " " + DURATION_R; replaced = true; break; } // general case if (!replaced) { Matcher m = Pattern.compile(phrase).matcher(qn); if (m.find()) { String replacement = m.group(0).replaceFirst(MANNER_P, QUANTIFICATION_R); // is the phrase followed by an auxiliary verb that has been shifted in verbMod? boolean auxiliary = (j + 1 < tokens.length && !verbMod.matches(".*?" + phraseToRegex(tokens[j] + " " + tokens[j + 1]) + ".*+")) ? true : false; if (auxiliary) { verbMod = verbMod.replaceFirst(phrase, "") + " " + replacement; } else verbMod = verbMod.replaceFirst(phrase, replacement); } } } return verbMod; } /** * Transforms a question into a statement by replacing phrases with interrogatives. * * @param qn normalized question string * @param verbMod question string with modified verbs * @param ats expected answer types * @return statement */ private static String questionToStatement(String qn, String verbMod, String[] ats) { String[] tokens = OpenNLP.tokenize(qn); String[] pos = OpenNLP.tagPos(tokens); String[] chunks = OpenNLP.tagChunks(tokens, pos); for (int i = 0; i < tokens.length; i++) if (tokens[i].matches(IGNORE_P)) { verbMod = handleIgnore(qn, verbMod, tokens, pos, chunks, i); break; } else if (tokens[i].matches(PERSON_P)) { verbMod = handlePerson(verbMod); break; } else if (tokens[i].matches(THING_P)) { verbMod = handleThing(qn, verbMod, ats, tokens, pos, chunks, i); break; } else if (tokens[i].matches(DATE_TIME_P)) { verbMod = handleDateTime(verbMod); break; } else if (tokens[i].matches(LOCATION_P)) { verbMod = handleLocation(verbMod); break; } else if (tokens[i].matches(PURPOSE_P)) { verbMod = handlePurpose(verbMod); break; } else if (tokens[i].matches(MANNER_P)) { verbMod = handleManner(qn, verbMod, ats, tokens, pos, chunks, i); break; } verbMod = verbMod.replaceAll("\\s++", " ").trim(); // drop unnecessary whitespaces return verbMod; } /** * Extracts the predicates from a question string. * * @param qn normalized question string * @param verbMod question string with modified verbs * @param ats expected answer types * @param terms question terms * @return predicate-argument structures */ public static Predicate[] getPredicates(String qn, String verbMod, String[] ats, Term[] terms) { // check if question contains a predicate if (!containsPredicate(qn)) return new Predicate[0]; // transform question into statement String statement = questionToStatement(qn, verbMod, ats); // annotate and extract predicates String[][] ass = ASSERT.annotatePredicates(new String[] {statement}); String[] as = (ass.length > 0) ? ass[0] : new String[0]; List<Predicate> predicates = new ArrayList<Predicate>(); for (int i = 0; i < as.length; i++) { // build predicate Predicate predicate = null; try { predicate = new Predicate(statement, as[i], terms); } catch (ParseException e) { // MsgPrinter.printErrorMsg(e.getMessage()); // System.exit(1); continue; } predicates.add(predicate); } // drop placeholders boolean missingArgs = false; for (Predicate p : predicates) { if (p.dropArgs(PERSON_R) | p.dropArgs(THING_R) | p.dropArgs(DATE_TIME_R) | p.dropArgs(DURATION_R) | p.dropArgs(LOCATION_R) | p.dropArgs(PURPOSE_R) | p.dropArgs(MANNER_R) | p.dropArgs(QUANTIFICATION_R) | p.dropArgs(UNKNOWN_R)) missingArgs = true; } // only return predicates if at least one has a missing argument // (else the answer extraction does not work) return (missingArgs) ? predicates.toArray(new Predicate[predicates.size()]) : new Predicate[0]; } }