// QuestionPattern.java example
//
// Explorer
// lucida-master
// - lucida
package info.ephyra.questionanalysis;

import info.ephyra.nlp.OpenNLP;

import java.util.ArrayList;
import java.util.Hashtable;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A <code>QuestionPattern</code> is applied to a question to determine the
 * TARGET of the question, CONTEXT information and the PROPERTY the question
 * asks for.
 * 
 * @author Nico Schlaefer
 * @version 2006-04-20
 */
public class QuestionPattern {
	/** Maximum length of a TARGET object in tokens. */
	private static final int MAX_TARGET = 10;
	/** Maximum length of a CONTEXT object in tokens. */
	private static final int MAX_CONTEXT = 10;
	/** Capturing group that replaces a TARGET or CONTEXT tag (reluctant). */
	private static final String TAG_GROUP = "(.*?)";
	/** Matches a leading article ("a", "an", "the") followed by a blank. */
	private static final Pattern ARTICLE = Pattern.compile("(?i)^(an?|the) ");
	/** The <code>Pattern</code> that is applied to a question string. */
	private final Pattern pattern;
	/** The PROPERTY that a question which matches this pattern asks for. */
	private final String property;
	/** ID of the group that represents the TARGET of the question. */
	private int targetID;
	/** IDs of 0 to n groups that represent the CONTEXT of the question. */
	private int[] contextIDs;
	
	/**
	 * Creates a <code>QuestionPattern</code> from a descriptor that is a
	 * regular expression but additionally contains the following tags:
	 * <ul>
	 * <li><code>&lt;TO&gt;</code> - exactly one TARGET tag</li>
	 * <li><code>&lt;CO&gt;</code> - an arbitrary number of CONTEXT tags</li>
	 * </ul>
	 * The descriptor is prefixed with a reluctant <code>.*?</code>, the tags
	 * are replaced by capturing groups and the result is compiled as a case
	 * insensitive regular expression. Nothing is appended to the descriptor,
	 * and {@link #apply(String, String)} requires the whole question to
	 * match, so the descriptor itself has to cover the end of the question.
	 * 
	 * @param expr pattern descriptor
	 * @param prop PROPERTY that a question which matches the pattern asks for
	 */
	public QuestionPattern(String expr, String prop) {
		property = prop;  // PROPERTY that this pattern extracts
		
		// allow arbitrary text in front of the actual pattern (reluctant)
		expr = ".*?" + expr;
		
		// replace tags (the TARGET tag must be replaced first, the CONTEXT
		// group IDs are computed on the descriptor without TARGET tag)
		expr = replaceTargetTag(expr);
		expr = replaceContextTags(expr);
		
		// compile regular expression (case insensitive)
		pattern = Pattern.compile(expr, Pattern.CASE_INSENSITIVE);
	}
	
	/**
	 * Counts the capturing groups that are opened in a regular expression
	 * (or in a prefix of a regular expression).
	 * <p>
	 * A '(' opens a capturing group unless it is escaped by a backslash, part
	 * of a <code>\Q...\E</code> quotation, located inside a character class
	 * or the start of a special construct <code>(?...</code>. The only
	 * special construct that is counted is the named group
	 * <code>(?&lt;name&gt;...</code>.
	 * 
	 * @param regex regular expression or prefix of a regular expression
	 * @return number of capturing groups opened in <code>regex</code>
	 */
	private static int countCapturingGroups(String regex) {
		int groups = 0;
		int classDepth = 0;  // nesting depth of character classes "[...]"
		int len = regex.length();
		
		for (int i = 0; i < len; i++) {
			char c = regex.charAt(i);
			
			if (c == '\\') {
				if (i + 1 < len && regex.charAt(i + 1) == 'Q') {
					// quotation: everything up to "\E" is literal text
					int end = regex.indexOf("\\E", i + 2);
					if (end < 0) break;  // quoted up to the end
					i = end + 1;
				} else {
					// skip the escaped character, thus "\(" is a literal
					// while the '(' in "\\(" still opens a group
					i++;
				}
			} else if (c == '[') {
				classDepth++;
			} else if (c == ']') {
				if (classDepth > 0) classDepth--;
			} else if (c == '(' && classDepth == 0) {
				if (i + 1 < len && regex.charAt(i + 1) == '?') {
					// "(?<name>" captures, "(?<=" and "(?<!" are lookbehinds,
					// all other "(?" constructs do not capture
					if (i + 3 < len && regex.charAt(i + 2) == '<' &&
						regex.charAt(i + 3) != '=' &&
						regex.charAt(i + 3) != '!')
						groups++;
				} else {
					groups++;
				}
			}
		}
		
		return groups;
	}
	
	/**
	 * Sets the <code>targetID</code> field and replaces the TARGET tag by a
	 * capturing group.
	 * 
	 * @param expr pattern descriptor
	 * @return descriptor without TARGET tag
	 */
	private String replaceTargetTag(String expr) {
		// compute the ID of the group that represents the TARGET object
		// - get string before the (first) TARGET tag, or the whole
		//   descriptor if there is no TARGET tag
		int tagPos = expr.indexOf("<TO>");
		String prefix = (tagPos < 0) ? expr : expr.substring(0, tagPos);
		// - each CONTEXT tag in front of the TARGET tag becomes a capturing
		//   group, so it has to be counted as one
		prefix = prefix.replace("<CO>", TAG_GROUP);
		// - group IDs start at 1
		targetID = countCapturingGroups(prefix) + 1;
		
		// replace TARGET tag
		return expr.replace("<TO>", TAG_GROUP);
	}
	
	/**
	 * Sets the <code>contextIDs</code> field and replaces the CONTEXT tags by
	 * capturing groups. The TARGET tag must already have been replaced, so
	 * that the TARGET group is counted as a regular capturing group.
	 * 
	 * @param expr pattern descriptor
	 * @return descriptor without CONTEXT tags
	 */
	private String replaceContextTags(String expr) {
		// compute the IDs of the groups that represent the CONTEXT objects
		// - get strings between CONTEXT tags (keep trailing empty strings so
		//   that a tag at the very end is not lost)
		String[] segments = expr.split("<CO>", -1);
		contextIDs = new int[segments.length - 1];
		
		// - the ID of a CONTEXT group is the number of capturing groups that
		//   are opened in front of it (including preceding CONTEXT groups)
		//   plus 1
		StringBuilder prefix = new StringBuilder();
		for (int i = 0; i < contextIDs.length; i++) {
			prefix.append(segments[i]);
			contextIDs[i] = countCapturingGroups(prefix.toString()) + 1;
			prefix.append(TAG_GROUP);
		}
		
		// replace CONTEXT tags
		return expr.replace("<CO>", TAG_GROUP);
	}
	
	/**
	 * Ensures that the TARGET object starts and ends with a token that is
	 * part of a noun phrase.
	 * <p>
	 * The question is chunked and each token string is mapped to its chunk
	 * tag. If a token string occurs more than once in the question, the tag
	 * of the last occurrence is used. The first and the last token of the
	 * TARGET are then looked up: the interpretation is rejected if one of
	 * them is known and its chunk tag does not contain "NP". Tokens that do
	 * not occur in the question are accepted.
	 * <p>
	 * CONTEXT objects are not checked and objects are not split along
	 * prepositions or punctuation marks.
	 * 
	 * @param qn normalized question string
	 * @param qi question interpretation
	 * @return the question interpretation or <code>null</code>, if the TARGET
	 * 		   is not a noun phrase
	 */
	private QuestionInterpretation ensureNounPhrases(String qn,
			QuestionInterpretation qi) {
		// tag phrase chunks
		String[] tokens = OpenNLP.tokenize(qn);
		String[] pos = OpenNLP.tagPos(tokens);
		String[] chunks = OpenNLP.tagChunks(tokens, pos);
		Hashtable<String, String> tagTable = new Hashtable<String, String>();
		for (int i = 0; i < tokens.length; i++)
			tagTable.put(tokens[i], chunks[i]);
		
		// get TARGET object
		String target = qi.getTarget();
		String[] targetTokens = OpenNLP.tokenize(target);
		
		// a TARGET without any tokens (e.g. only whitespace) cannot be a noun
		// phrase, reject it instead of failing on the array access below
		if (targetTokens.length == 0) return null;
		
		// ensure that the object is a noun phrase
		String first = targetTokens[0];
		String last = targetTokens[targetTokens.length - 1];
		if (isKnownNonNounPhrase(tagTable, first) ||
			isKnownNonNounPhrase(tagTable, last))
			return null;
		
		// update interpretation (the TARGET is passed on unmodified)
		qi.setTarget(target);
		
		return qi;
	}
	
	/**
	 * Checks if a token has a chunk tag that does not belong to a noun
	 * phrase.
	 * 
	 * @param tagTable maps the tokens of the question to their chunk tags
	 * @param token token to look up
	 * @return <code>true</code> iff the token is in the table and its chunk
	 * 		   tag does not contain "NP"
	 */
	private static boolean isKnownNonNounPhrase(
			Hashtable<String, String> tagTable, String token) {
		String tag = tagTable.get(token);
		return tag != null && !tag.contains("NP");
	}
	
	/**
	 * Formats the extracted TARGET and CONTEXT objects.
	 * 
	 * @param object TARGET or CONTEXT object
	 * @return object without surrounding whitespace and without a preceding
	 * 		   "a", "an" or "the"
	 */
	private String formatObject(String object) {
		// trim first, otherwise the article is not found at the beginning of
		// an object with leading whitespace; trim again since more whitespace
		// may follow the dropped article
		return ARTICLE.matcher(object.trim()).replaceFirst("").trim();
	}
	
	/**
	 * Returns the PROPERTY that a question which matches the pattern asks for.
	 * 
	 * @return the PROPERTY
	 */
	public String getProperty() {
		return property;
	}
	
	/**
	 * Applies the pattern to a question. If the question matches the pattern,
	 * a <code>QuestionInterpretation</code> is returned, else
	 * <code>null</code>.
	 * <p>
	 * The pattern is matched against the whole stemmed question. The
	 * extracted objects are mapped back to the normalized question and
	 * formatted. <code>null</code> is also returned if the TARGET or a CONTEXT
	 * object is missing or empty, if the TARGET is not a noun phrase or if an
	 * object exceeds the maximum number of tokens.
	 * 
	 * @param qn normalized question string
	 * @param stemmed stemmed question string
	 * @return interpretation of the question or <code>null</code>, if the
	 * 		   question does not match the pattern
	 */
	public QuestionInterpretation apply(String qn, String stemmed) {
		Matcher m = pattern.matcher(stemmed);
		
		// question does not match the pattern
		if (!m.matches()) return null;
		
		// extract TARGET; group() returns null if the group did not
		// participate in the match (e.g. optional group or alternation)
		String target = m.group(targetID);
		if (target == null || target.length() == 0) return null;
		target = QuestionNormalizer.unstem(target, stemmed, qn);
		
		// extract CONTEXT objects
		String[] context = new String[contextIDs.length];
		for (int i = 0; i < context.length; i++) {
			String object = m.group(contextIDs[i]);
			if (object == null || object.length() == 0) return null;
			context[i] = QuestionNormalizer.unstem(object, stemmed, qn);
		}
		
		// make sure that the TARGET object is a noun phrase
		QuestionInterpretation qi =
			new QuestionInterpretation(target, context, property);
		qi = ensureNounPhrases(qn, qi);
		if (qi == null) return null;
		
		// format TARGET and check its length
		target = formatObject(qi.getTarget());
		if (target.length() == 0 ||
			target.split(" ").length > MAX_TARGET)
			return null;
		qi.setTarget(target);
		
		// format CONTEXT objects and check their lengths
		context = qi.getContext();
		for (int i = 0; i < context.length; i++) {
			context[i] = formatObject(context[i]);
			if (context[i].length() == 0 ||
				context[i].split(" ").length > MAX_CONTEXT)
				return null;
		}
		qi.setContext(context);
		
		return qi;
	}
}