package info.ephyra.questionanalysis;
import info.ephyra.nlp.OpenNLP;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* A <code>QuestionPattern</code> is applied to a question to determine the
* TARGET of the question, CONTEXT information and the PROPERTY the question
* asks for.
*
* @author Nico Schlaefer
* @version 2006-04-20
*/
public class QuestionPattern {
    /** Maximum length of a TARGET object in tokens. */
    private static final int MAX_TARGET = 10;
    /** Maximum length of a CONTEXT object in tokens. */
    private static final int MAX_CONTEXT = 10;
    /** Tag in a pattern descriptor that marks the TARGET object. */
    private static final String TARGET_TAG = "<TO>";
    /** Tag in a pattern descriptor that marks a CONTEXT object. */
    private static final String CONTEXT_TAG = "<CO>";
    /** Reluctant capturing group that replaces a TARGET or CONTEXT tag. */
    private static final String OBJECT_GROUP = "(.*?)";
    /** Matches a leading article ("a", "an", "the") followed by a blank. */
    private static final Pattern LEADING_ARTICLE =
            Pattern.compile("(?i)^(an?|the) ");

    /** The <code>Pattern</code> that is applied to a question string. */
    private final Pattern pattern;
    /** The PROPERTY that a question which matches this pattern asks for. */
    private final String property;
    /** ID of the group that represents the TARGET of the question. */
    private int targetID;
    /** IDs of 0 to n groups that represent the CONTEXT of the question. */
    private int[] contextIDs;

    /**
     * Creates a <code>QuestionPattern</code> from a descriptor that is a
     * regular expression but additionally contains the following tags:
     * <ul>
     * <li><code>&lt;TO&gt;</code> - exactly one TARGET tag</li>
     * <li><code>&lt;CO&gt;</code> - an arbitrary number of CONTEXT tags</li>
     * </ul>
     * The descriptor is prefixed with a reluctant <code>.*?</code>, the tags
     * are replaced by reluctant capturing groups and the result is compiled
     * case-insensitively. Nothing is appended to the descriptor, so it has to
     * cover the question up to its end ({@link #apply(String, String)}
     * requires a full match).
     *
     * @param expr pattern descriptor
     * @param prop PROPERTY that a question which matches the pattern asks for
     */
    public QuestionPattern(String expr, String prop) {
        property = prop;

        // allow arbitrary text in front of the pattern (reluctant)
        String regex = ".*?" + expr;

        // replace tags; the TARGET tag must be replaced first because
        // replaceContextTags() counts its group as an ordinary '('
        regex = replaceTargetTag(regex);
        regex = replaceContextTags(regex);

        pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
    }

    /**
     * Counts the non-overlapping occurrences of a literal string.
     *
     * @param s string to search in
     * @param literal literal (non-regex) string to look for, not empty
     * @return number of occurrences of <code>literal</code> in <code>s</code>
     */
    private static int countOccurrences(String s, String literal) {
        int count = 0;
        int from = s.indexOf(literal);
        while (from >= 0) {
            count++;
            from = s.indexOf(literal, from + literal.length());
        }
        return count;
    }

    /**
     * Counts the opening parentheses in a descriptor fragment that start a
     * capturing group, i.e. every '(' that is neither preceded by '\' nor
     * followed by '?:'. Only these two cases are recognized; other special
     * constructs starting with '(?' are counted like capturing groups.
     *
     * @param s fragment of a pattern descriptor
     * @return number of capturing groups opened in the fragment
     */
    private static int countCapturingGroups(String s) {
        // an escaped '(' followed by '?:' is subtracted twice (once as
        // escaped, once as non-capturing), so it is added back once
        return countOccurrences(s, "(")
                - countOccurrences(s, "\\(")
                - countOccurrences(s, "(?:")
                + countOccurrences(s, "\\(?:");
    }

    /**
     * Sets the <code>targetID</code> field and replaces the TARGET tag by a
     * capturing group.
     *
     * @param expr pattern descriptor
     * @return descriptor without TARGET tag
     */
    private String replaceTargetTag(String expr) {
        // part of the descriptor in front of the TARGET tag (the whole
        // descriptor if there is no TARGET tag)
        int tagStart = expr.indexOf(TARGET_TAG);
        String prefix = (tagStart < 0) ? expr : expr.substring(0, tagStart);

        // every capturing group and every CONTEXT tag in front of the TARGET
        // tag occupies one group ID; the TARGET gets the next one.
        // Occurrences are counted directly so that a prefix ending in "(?:"
        // is handled correctly (String.split without a limit would drop the
        // trailing empty string and miscount).
        targetID = countCapturingGroups(prefix)
                + countOccurrences(prefix, CONTEXT_TAG) + 1;

        return expr.replace(TARGET_TAG, OBJECT_GROUP);
    }

    /**
     * Sets the <code>contextIDs</code> field and replaces the CONTEXT tags by
     * capturing groups. Expects that the TARGET tag has already been replaced
     * by its capturing group.
     *
     * @param expr pattern descriptor
     * @return descriptor without CONTEXT tags
     */
    private String replaceContextTags(String expr) {
        // fragments between the CONTEXT tags; limit -1 keeps trailing empty
        // fragments so that there is exactly one more fragment than tags
        String[] segments = expr.split(CONTEXT_TAG, -1);
        contextIDs = new int[segments.length - 1];

        // the ID of a CONTEXT group is the number of capturing groups and
        // CONTEXT tags in front of it plus one
        int groupID = 0;
        for (int i = 0; i < contextIDs.length; i++) {
            groupID += countCapturingGroups(segments[i]) + 1;
            contextIDs[i] = groupID;
        }

        return expr.replace(CONTEXT_TAG, OBJECT_GROUP);
    }

    /**
     * Checks whether a token has a chunk tag that does not belong to a noun
     * phrase.
     *
     * @param tagTable maps tokens of the question to their chunk tags
     * @param token token to look up
     * @return <code>true</code> iff the token is in the table and its chunk
     *         tag does not contain "NP"
     */
    private static boolean isOutsideNounPhrase(
            Hashtable<String, String> tagTable, String token) {
        String tag = tagTable.get(token);
        return tag != null && !tag.contains("NP");
    }

    /**
     * Ensures that the TARGET object is a noun phrase: the interpretation is
     * rejected if the first or the last token of the TARGET occurs in the
     * question with a chunk tag that does not contain "NP". Tokens that do
     * not occur in the tokenized question are not checked. CONTEXT objects
     * are currently neither checked nor modified, and objects are not split.
     *
     * @param qn normalized question string
     * @param qi question interpretation
     * @return the question interpretation or <code>null</code>, if the TARGET
     *         is not a noun phrase
     */
    private QuestionInterpretation ensureNounPhrases(String qn,
            QuestionInterpretation qi) {
        // tag phrase chunks of the whole question; if a token occurs more
        // than once, the chunk tag of its last occurrence is kept
        String[] tokens = OpenNLP.tokenize(qn);
        String[] pos = OpenNLP.tagPos(tokens);
        String[] chunks = OpenNLP.tagChunks(tokens, pos);
        Hashtable<String, String> tagTable = new Hashtable<String, String>();
        for (int i = 0; i < tokens.length; i++) {
            tagTable.put(tokens[i], chunks[i]);
        }

        String target = qi.getTarget();
        String[] targetTokens = OpenNLP.tokenize(target);
        // a TARGET without tokens (e.g. presumably a whitespace-only string)
        // cannot be a noun phrase; reject it instead of indexing into an
        // empty array
        if (targetTokens.length == 0) {
            return null;
        }

        // the TARGET must start and end inside a noun phrase
        String first = targetTokens[0];
        String last = targetTokens[targetTokens.length - 1];
        if (isOutsideNounPhrase(tagTable, first)
                || isOutsideNounPhrase(tagTable, last)) {
            return null;
        }

        // update interpretation
        qi.setTarget(target);
        return qi;
    }

    /**
     * Formats the extracted TARGET and CONTEXT objects.
     *
     * @param object TARGET or CONTEXT object
     * @return formatted object
     */
    private String formatObject(String object) {
        // drop preceding "a", "an", "the" and trim
        return LEADING_ARTICLE.matcher(object).replaceFirst("").trim();
    }

    /**
     * Returns the PROPERTY that a question which matches the pattern asks for.
     *
     * @return the PROPERTY
     */
    public String getProperty() {
        return property;
    }

    /**
     * Applies the pattern to a question. If the question matches the pattern,
     * a <code>QuestionInterpretation</code> is returned, else
     * <code>null</code>. The pattern is matched against the stemmed question;
     * the extracted objects are mapped back to the normalized question.
     * <code>null</code> is also returned if the TARGET or a CONTEXT object is
     * missing, empty (before or after formatting), longer than the maximum
     * number of tokens, or if the TARGET is not a noun phrase.
     *
     * @param qn normalized question string
     * @param stemmed stemmed question string
     * @return interpretation of the question or <code>null</code>, if the
     *         question does not match the pattern
     */
    public QuestionInterpretation apply(String qn, String stemmed) {
        Matcher m = pattern.matcher(stemmed);
        if (!m.matches()) {
            return null; // question does not match the pattern
        }

        // extract the TARGET; Matcher.group() yields null if the group did
        // not take part in the match (e.g. it is in an optional group)
        String target = m.group(targetID);
        if (target == null || target.length() == 0) {
            return null;
        }
        target = QuestionNormalizer.unstem(target, stemmed, qn);

        // extract the CONTEXT objects
        String[] context = new String[contextIDs.length];
        for (int i = 0; i < context.length; i++) {
            String object = m.group(contextIDs[i]);
            if (object == null || object.length() == 0) {
                return null;
            }
            context[i] = QuestionNormalizer.unstem(object, stemmed, qn);
        }

        // make sure that the TARGET is a noun phrase
        QuestionInterpretation qi =
                new QuestionInterpretation(target, context, property);
        qi = ensureNounPhrases(qn, qi);
        if (qi == null) {
            return null;
        }

        // format the TARGET and check its length
        target = formatObject(qi.getTarget());
        if (target.length() == 0
                || target.split(" ").length > MAX_TARGET) {
            return null;
        }
        qi.setTarget(target);

        // format the CONTEXT objects and check their lengths
        context = qi.getContext();
        for (int i = 0; i < context.length; i++) {
            context[i] = formatObject(context[i]);
            if (context[i].length() == 0
                    || context[i].split(" ").length > MAX_CONTEXT) {
                return null;
            }
        }
        qi.setContext(context);

        return qi;
    }
}