package info.ephyra.answerselection;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * <p>An <code>AnswerPattern</code> is applied to a sentence to extract a
 * PROPERTY object of the type specified in the <code>property</code> field.
 * The sentence must contain a TARGET tag to indicate the object of which the
 * PROPERTY is wanted.</p>
 *
 * <p>This class implements the interface <code>Comparable</code>. Note: it has
 * a natural ordering that is inconsistent with <code>equals()</code>:
 * patterns are ordered by their number of correct applications, but they are
 * equal iff their descriptors are equal.</p>
 *
 * @author Nico Schlaefer
 * @version 2008-01-29
 */
public class AnswerPattern implements Comparable<AnswerPattern> {
    /** Maximum distance between TARGET and PROPERTY in tokens. */
    private static final int MAX_DIST = 20;
    /** Maximum length of a PROPERTY object in tokens. */
    private static final int MAX_PROP = 10;

    /** Locates the PROPERTY tag (with or without NE types) in a descriptor. */
    private static final Pattern PROPERTY_TAG = Pattern.compile("<PO.*?>");
    /** Matches a PROPERTY tag with NE types; group 1 holds the types. */
    private static final Pattern TYPED_PROPERTY_TAG =
            Pattern.compile("<PO_([^>]++)>");
    /** Matches an NE tag; group 1 holds the '_'-separated NE types. */
    private static final Pattern NE_TAG = Pattern.compile("<(NE[^>]*+)>");
    /**
     * Matches a run of reluctant <code>[^&lt;]*?</code> expressions and/or
     * optional groups of the form <code>(?:xyz )?</code>, followed by exactly
     * one more character.
     */
    private static final Pattern RELUCTANT_RUN = Pattern.compile(
            "(?:\\[\\^<\\]\\*\\?|" + "\\(\\?\\:\\S*+ \\)\\?)++.");

    /** The pattern descriptor from which the pattern is built. */
    private final String desc;
    /** The <code>Pattern</code> that is applied to a sentence. */
    private final Pattern pattern;
    /** The type of PROPERTY that is extracted with this pattern. */
    private final String property;
    /** ID of the group that represents the PROPERTY to be extracted. */
    private int propertyID;
    /** ID of the group that covers the string between TARGET and PROPERTY. */
    private int distID;
    /** Counter for the number of correct applications of the pattern. */
    private int correct = 0;
    /** Counter for the number of wrong applications of the pattern. */
    private int wrong = 0;

    /**
     * Creates an <code>AnswerPattern</code> from a descriptor that is a
     * regular expression but additionally contains the following tags:
     * <ul>
     * <li>&lt;TO&gt; - exactly one TARGET tag</li>
     * <li>&lt;CO&gt; - an arbitrary number of CONTEXT tags</li>
     * <li>&lt;PO(_NExyz)*&gt; - exactly one PROPERTY tag, optionally combined
     *     with NE types</li>
     * <li>&lt;NExyz(_NExyz)*&gt; - an arbitrary number of NE tags, which are
     *     combinations of one or more NE types</li>
     * </ul>
     *
     * @param expr pattern descriptor
     * @param prop PROPERTY that the pattern extracts
     * @throws IllegalStateException if the descriptor contains no well-formed
     *         PROPERTY tag
     * @throws java.util.regex.PatternSyntaxException if the expanded
     *         descriptor is not a valid regular expression
     */
    public AnswerPattern(String expr, String prop) {
        desc = expr;      // pattern descriptor
        property = prop;  // PROPERTY that this pattern extracts

        // add group that covers the string between TARGET and PROPERTY
        expr = addDistGroup(expr);

        // replace tags (the order matters: the PROPERTY tag may itself
        // expand to an NE tag, which is resolved by replaceNeTags())
        expr = replaceTargetTag(expr);
        expr = replaceContextTags(expr);
        expr = replacePropertyTag(expr);
        expr = replaceNeTags(expr);

        // optimize pattern
        expr = optimizePattern(expr);

        // compile regular expression (case insensitive)
        pattern = Pattern.compile(expr, Pattern.CASE_INSENSITIVE);
    }

    /**
     * <p>Creates an <code>AnswerPattern</code> from a descriptor by applying
     * the constructor <code>AnswerPattern(String expr, String prop)</code>.</p>
     *
     * <p>In addition, it sets the counters for the number of correct/wrong
     * applications of the pattern.</p>
     *
     * @param expr pattern descriptor
     * @param prop PROPERTY that the pattern extracts
     * @param correct number of correct applications
     * @param wrong number of wrong applications
     */
    public AnswerPattern(String expr, String prop, int correct, int wrong) {
        this(expr, prop);

        this.correct = correct;
        this.wrong = wrong;
    }

    /**
     * Adds a capturing group that covers the string between the TARGET and
     * the PROPERTY and sets the <code>distID</code> field. Required to measure
     * the distance between TARGET and PROPERTY.
     *
     * <p>NOTE: a descriptor that does not start with the TARGET tag is treated
     * as "PROPERTY comes before TARGET", and the group ID 2 used in that case
     * presumes that the PROPERTY group is the first capturing group of the
     * pattern.</p>
     *
     * @param expr pattern descriptor
     * @return descriptor with capturing group
     * @throws IllegalStateException if the descriptor contains no PROPERTY tag
     */
    private String addDistGroup(String expr) {
        Matcher m = PROPERTY_TAG.matcher(expr);
        if (!m.find()) {
            throw new IllegalStateException(
                    "Pattern descriptor contains no PROPERTY tag: " + expr);
        }
        String pTag = m.group(0);

        if (expr.startsWith("<TO>")) {
            // TARGET comes before PROPERTY: group opens right after TARGET
            distID = 1;
            return expr.replace("<TO>", "<TO>(").replace(pTag, ")" + pTag);
        } else {
            // PROPERTY comes before TARGET: group opens right after PROPERTY
            distID = 2;
            return expr.replace(pTag, pTag + "(").replace("<TO>", ")<TO>");
        }
    }

    /**
     * Replaces the TARGET tag by a regular expression that matches TARGET tags
     * with tag IDs.
     *
     * @param expr pattern descriptor
     * @return descriptor with a regular expression for TARGET tags
     */
    private String replaceTargetTag(String expr) {
        return expr.replace("<TO>", "<TO_\\d*+>");  // possessive
    }

    /**
     * Replaces CONTEXT tags by regular expressions that match CONTEXT tags with
     * tag IDs.
     *
     * @param expr pattern descriptor
     * @return descriptor with regular expressions for CONTEXT tags
     */
    private String replaceContextTags(String expr) {
        return expr.replace("<CO>", "<CO_\\d*+>");  // possessive
    }

    /**
     * Sets the <code>propertyID</code> field and replaces the PROPERTY tag by a
     * capturing group.
     *
     * @param expr pattern descriptor
     * @return descriptor without PROPERTY tag
     * @throws IllegalStateException if the PROPERTY tag is neither
     *         <code>&lt;PO&gt;</code> nor of the form
     *         <code>&lt;PO_...&gt;</code>
     */
    private String replacePropertyTag(String expr) {
        // compute the ID of the group that represents the PROPERTY object
        // - get string before PROPERTY tag
        String s = expr.split("<PO[^>]*+>")[0];
        // - count number of '(' not preceded by '\' or followed by '?:';
        //   every occurrence is counted the same way, so a prefix that ends
        //   with "(?:" is no longer miscounted
        propertyID = countOccurrences(s, "(")
                - countOccurrences(s, "\\(")
                - countOccurrences(s, "(?:")
                + countOccurrences(s, "\\(?:")
                + 1;

        // replace PROPERTY tag
        if (expr.contains("<PO>")) {
            // without NE types:
            // length of PROPERTY objects that are not NEs is restricted to
            // MAX_PROP tokens, reluctant
            // (Note: NEs within a PROPERTY object are counted as 1)
            return expr.replace("<PO>",
                    "([^ ]++(?: [^ ]++){0," + (MAX_PROP - 1) + "}?)");
        }

        // with NE types: wrap the corresponding NE tag in a capturing group
        Matcher m = TYPED_PROPERTY_TAG.matcher(expr);
        if (!m.find()) {
            throw new IllegalStateException(
                    "Malformed PROPERTY tag in pattern descriptor: " + expr);
        }
        // spliced in literally, so the NE types are never interpreted as
        // replacement-string syntax
        return expr.substring(0, m.start())
                + "(<" + m.group(1) + ">)"
                + expr.substring(m.end());
    }

    /**
     * Counts the non-overlapping occurrences of a literal string.
     *
     * @param text string to search in
     * @param literal non-empty string to search for
     * @return number of occurrences of <code>literal</code> in
     *         <code>text</code>
     */
    private static int countOccurrences(String text, String literal) {
        int count = 0;
        int from = text.indexOf(literal);
        while (from >= 0) {
            count++;
            from = text.indexOf(literal, from + literal.length());
        }
        return count;
    }

    /**
     * Replaces NE tags by regular expressions that match NE tags with at least
     * one of the NE types.
     *
     * @param expr pattern descriptor
     * @return descriptor with regular expressions for NE tags
     */
    private String replaceNeTags(String expr) {
        // the matcher iterates over the unmodified descriptor while the
        // replacements are applied to the evolving copy in 'expr'
        Matcher m = NE_TAG.matcher(expr);

        while (m.find()) {  // find next NE tag
            // get NE types
            String[] neTypes = m.group(1).split("_");
            boolean alternation = neTypes.length > 1;

            // build regular expression
            StringBuilder regex =
                    new StringBuilder("<(?:NE[a-zA-Z0-9]*+_)*?");  // reluctant
            if (alternation) {
                regex.append("(?:");
            }
            regex.append(neTypes[0]);
            for (int i = 1; i < neTypes.length; i++) {
                regex.append('|').append(neTypes[i]);
            }
            if (alternation) {
                regex.append(')');
            }
            regex.append("[^>]*+>");  // possessive

            // replace NE tag (all of its occurrences at once)
            expr = expr.replace(m.group(0), regex.toString());
        }

        return expr;
    }

    /**
     * Optimizes the pattern to improve its runtime performance.
     *
     * @param expr pattern descriptor
     * @return optimized pattern
     */
    private String optimizePattern(String expr) {
        // use possessive quantifiers whenever possible
        Matcher m = RELUCTANT_RUN.matcher(expr);
        while (m.find()) {
            String run = m.group(0);

            // only a run that is directly followed by '<' or ')' is made
            // possessive; any other run is left for the rewrite below
            if (run.endsWith("<") || run.endsWith(")")) {
                expr = expr.replace(run, run.replace("[^<]*?", "[^<]*+"));
            }
        }

        // eat whole tokens, reluctant
        return expr.replace("[^<]*?", "(?:[^< ]*+ )*?");
    }

    /**
     * Compares this object to another <code>AnswerPattern</code>. Two
     * <code>AnswerPattern</code> objects are equal, iff the pattern descriptors
     * are equal.
     *
     * @param o the reference object with which to compare
     * @return <code>true</code>, iff this object is the same as the
     *         <code>o</code> argument
     */
    @Override
    public boolean equals(Object o) {
        if (!(o instanceof AnswerPattern)) {
            return false;
        }
        return desc.equals(((AnswerPattern) o).getDesc());
    }

    /**
     * Compares two <code>AnswerPattern</code> objects by comparing the number
     * of correct applications.
     *
     * @param ap the <code>AnswerPattern</code> to be compared
     * @return a negative integer, zero or a positive integer as this
     *         <code>AnswerPattern</code> is less than, equal to or greater than
     *         the specified <code>AnswerPattern</code>
     */
    public int compareTo(AnswerPattern ap) {
        // explicit comparison instead of a subtraction, which could overflow
        int other = ap.getCorrect();
        if (correct < other) {
            return -1;
        }
        return (correct > other) ? 1 : 0;
    }

    /**
     * The hashcode of an <code>AnswerPattern</code> is the hashcode of its
     * descriptor.
     *
     * @return hashcode
     */
    @Override
    public int hashCode() {
        return desc.hashCode();
    }

    /**
     * Returns the pattern descriptor.
     *
     * @return pattern descriptor
     */
    public String getDesc() {
        return desc;
    }

    /**
     * Returns the type of PROPERTY that is extracted with this pattern.
     *
     * @return the PROPERTY
     */
    public String getProperty() {
        return property;
    }

    /**
     * Returns the number of correct applications of the pattern.
     *
     * @return number of correct applications
     */
    public int getCorrect() {
        return correct;
    }

    /**
     * Returns the number of wrong applications of the pattern.
     *
     * @return number of wrong applications
     */
    public int getWrong() {
        return wrong;
    }

    /**
     * Calculates a confidence measure for the pattern by applying the formula
     * <code>confidence = correct / (correct + wrong)</code>.
     *
     * @return confidence in the pattern; <code>NaN</code> if both counters
     *         are 0
     */
    public float getConfidence() {
        return ((float) correct) / (correct + wrong);
    }

    /**
     * Increments the number of correct applications by 1.
     */
    public void incCorrect() {
        correct++;
    }

    /**
     * Increments the number of wrong applications by 1.
     */
    public void incWrong() {
        wrong++;
    }

    /**
     * Returns the NE types that are allowed for a PROPERTY object to match the
     * pattern.
     *
     * @return NE types or <code>null</code> iff no specific types are expected
     */
    public String[] getPropertyTypes() {
        Matcher m = TYPED_PROPERTY_TAG.matcher(desc);
        if (!m.find()) {
            return null;
        }
        return m.group(1).split("_");
    }

    /**
     * Applies the pattern to a sentence of space-delimited tokens containing
     * a TARGET tag and optionally a number of CONTEXT and NE tags. For each
     * match, a PROPERTY object is extracted.
     *
     * @param sentence a sentence
     * @return array of PROPERTY objects or an empty array, if the sentence does
     *         not match the pattern
     */
    public String[] apply(String sentence) {
        Matcher m = pattern.matcher(sentence);
        ArrayList<String> results = new ArrayList<String>();

        while (m.find()) {
            // distance between TARGET and PROPERTY is restricted to
            // MAX_DIST tokens (Note: NEs are counted as 1)
            if (m.group(distID).split(" ").length <= MAX_DIST) {
                results.add(m.group(propertyID));
            }

            // continue search right after the beginning of this match, so
            // that overlapping matches are found as well
            m.region(m.start() + 1, sentence.length());
        }

        return results.toArray(new String[results.size()]);
    }
}