package info.ephyra.patternlearning;
import info.ephyra.answerselection.filters.AnswerPatternFilter;
import info.ephyra.nlp.NETagger;
import info.ephyra.nlp.OpenNLP;
import info.ephyra.questionanalysis.QuestionInterpretation;
import info.ephyra.questionanalysis.QuestionInterpreter;
import info.ephyra.search.Result;
import info.ephyra.util.RegexConverter;
import info.ephyra.util.StringUtils;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Extracts answer patterns from text passages and adds them to the
* <code>AnswerPatternFilter</code>.
*
* @author Nico Schlaefer
* @version 2006-04-04
*/
public class PatternExtractor {
/**
* Maximum number of NE and CONTEXT tags in a pattern (for time
* performance).
*/
private static final int MAX_TAGS = 5;
/**
* Replaces all TARGET objects in the sentence.
*
* @param sentence input sentence
* @param to the TARGET object of the question
* @param nes the NEs in the sentence
* @return sentence with TARGET tags or <code>null</code>, if the sentence
* does not contain the TARGET
*/
private static String replaceTarget(String sentence, String to,
String[][] nes) {
HashSet<String> reps = new HashSet<String>();
String result = sentence;
for (String[] neType : nes)
for (String ne : neType)
if (StringUtils.equalsCommonNorm(ne, to)) reps.add(ne);
reps.add(to);
// sort expressions by length
String[] sorted = reps.toArray(new String[reps.size()]);
StringUtils.sortByLengthDesc(sorted);
for (String rep : sorted) {
rep = RegexConverter.strToRegexWithBounds(rep);
result = result.replaceAll(rep, "<TO>");
}
return (result.equals(sentence)) ? null : result;
}
/**
* Replaces all PROPERTY objects in the sentence.
*
* @param sentence input sentence
* @param as the answer to the question
* @param nes the NEs in the sentence
* @return sentence with PROPERTY tags or <code>null</code>, if the sentence
* does not contain the answer
*/
private static String replaceProperty(String sentence, String as,
String[][] nes) {
Hashtable<String, String> reps = new Hashtable<String, String>();
String neType, tag, result = sentence;
for (int i = 0; i < nes.length ; i++){
neType = NETagger.getNeType(i);
for (String ne : nes[i])
if (StringUtils.equalsCommonNorm(ne, as)) {
tag = reps.get(ne);
if (tag == null) tag = "<PO_" + neType;
else if (!tag.contains(neType)) tag += "_" + neType;
reps.put(ne, tag);
}
}
if (!reps.containsKey(as)) reps.put(as, "<PO");
// sort expressions by length
String[] sorted = reps.keySet().toArray(new String[reps.size()]);
StringUtils.sortByLengthDesc(sorted);
for (String rep : sorted) {
tag = reps.get(rep) + ">";
rep = RegexConverter.strToRegexWithBounds(rep);
result = result.replaceAll(rep, tag);
}
return (result.equals(sentence)) ? null : result;
}
/**
* Replaces all CONTEXT objects in the sentence.
*
* @param sentence input sentence
* @param cos the CONTEXT objects of the question
* @param nes the NEs in the sentence
* @return sentence with CONTEXT tags
*/
private static String replaceContext(String sentence, String[] cos,
String[][] nes) {
HashSet<String> reps = new HashSet<String>();
for (String[] neType : nes)
for (String ne : neType)
for (String co : cos)
if (StringUtils.equalsCommonNorm(ne, co)) reps.add(ne);
for (String co : cos) reps.add(co);
// sort expressions by length
String[] sorted = reps.toArray(new String[reps.size()]);
StringUtils.sortByLengthDesc(sorted);
for (String rep : sorted) {
rep = RegexConverter.strToRegexWithBounds(rep);
sentence = sentence.replaceAll(rep, "<CO>");
}
return sentence;
}
/**
* Replaces all NEs in the sentence.
*
* @param sentence input sentence
* @param nes the NEs in the sentence
* @return sentence with NE tags
*/
private static String replaceNes(String sentence, String[][] nes) {
Hashtable<String, String> reps = new Hashtable<String, String>();
String neType, tag;
for (int i = 0; i < nes.length; i++) {
neType = NETagger.getNeType(i);
for (String ne : nes[i]) {
tag = reps.get(ne);
if (tag == null) tag = "<" + neType;
else if (!tag.contains(neType)) tag += "_" + neType;
reps.put(ne, tag);
}
}
// sort expressions by length
String[] sorted = reps.keySet().toArray(new String[reps.size()]);
StringUtils.sortByLengthDesc(sorted);
for (String rep : sorted) {
tag = reps.get(rep) + ">";
rep = RegexConverter.strToRegexWithBounds(rep);
sentence = sentence.replaceAll(rep, tag);
}
return sentence;
}
/**
* Prepares a sentence for pattern extraction.
*
* @param sentence input sentence
* @param to the TARGET object of the question
* @param cos the CONTEXT objects of the question
* @param po the answer to the question
* @param nes the NEs in the sentence
* @return sentence ready for pattern extraction or <code>null</code>, if
* there is no TARGET or PROPERTY object in the input sentence
*/
private static String prepSentence(String sentence, String to, String[] cos,
String po, String[][] nes) {
// replace TARGET, PROPERTY and CONTEXT objects and NEs
sentence = replaceTarget(sentence, to, nes);
if (sentence == null) return null;
sentence = replaceProperty(sentence, po, nes);
if (sentence == null) return null;
sentence = replaceContext(sentence, cos, nes);
sentence = replaceNes(sentence, nes);
// add '#' at beginning and end of sentence
sentence = "# " + sentence + " #";
// transform into regular expression
sentence = RegexConverter.strToRegex(sentence);
return sentence;
}
/**
* Extract basic answer patterns from the sentence.
*
* @param sentence input sentence
* @return basic answer patterns
*/
private static String[] extractPatterns(String sentence) {
String[] tokens = sentence.split(" ");
HashSet<String> patterns = new HashSet<String>();
// TARGET comes before PROPERTY
String ap = "";
for (int i = 0; i < tokens.length; i++) {
if (tokens[i].equals("<TO>")) {
ap = tokens[i];
} else if (ap.length() > 0) {
ap += " " + tokens[i]; // add to pattern
if (tokens[i].matches("<PO.*>")) {
ap += " " + tokens[i + 1];
if (ap.split("<TO>", -1).length == 2 &&
ap.split("<PO.*?>", -1).length == 2)
// exactly one TARGET and PROPERTY tag
patterns.add(ap);
ap = "";
}
}
}
// PROPERTY comes before TARGET
ap = "";
for (int i = 0; i < tokens.length; i++) {
if (tokens[i].matches("<PO.*>")) {
ap = tokens[i - 1] + " " + tokens[i];
} else if (ap.length() > 0) {
ap += " " + tokens[i]; // add to pattern
if (tokens[i].equals("<TO>")) {
if (ap.split("<TO>", -1).length == 2 &&
ap.split("<PO.*?>", -1).length == 2)
// exactly one TARGET and PROPERTY tag
patterns.add(ap);
ap = "";
}
}
}
return patterns.toArray(new String[patterns.size()]);
}
/**
* Generates more generic patterns from the initial patterns.
*
* @param patterns initial patterns
* @param prop PROPERTY that the patterns extract
* @return more generic patterns
*/
private static String[] generalizePatterns(String[] patterns, String prop) {
HashSet<String> gens = new HashSet<String>();
// if the PROPERTY tag is combined with NE types, replace the pattern
// by applying the following generalizations:
// - drop the token preceding/following the PROPERTY tag
// - drop the NE types
Pattern p = Pattern.compile("(<TO>.*?<PO_.*?>)|(<PO_.*?>.*?<TO>)");
for (String pattern : patterns) {
Matcher m = p.matcher(pattern);
if (m.find()) {
gens.add(m.group(0));
gens.add(pattern.replaceFirst("<PO_.*?>", "<PO>"));
} else gens.add(pattern);
}
// drop all tokens in between the TARGET and PROPERTY tag that are not
// keywords or tags and that are not adjacent to a PROPERTY tag without
// NE types and make tags optional
patterns = gens.toArray(new String[gens.size()]);
for (String pattern : patterns) {
String[] tokens = pattern.split(" ");
String gen = "";
int nOfTags = 0;
boolean dropped = false; // true, iff last token was dropped
// boolean keywords = false; // true, iff pattern contains keywords
for (int i = 0; i < tokens.length; i++) {
if (tokens[i].matches("<TO>") ||
tokens[i].matches("<PO.*>") ||
(i > 0 && tokens[i - 1].matches("<PO>")) ||
(i < tokens.length - 1 && tokens[i + 1].matches("<PO>"))) {
// keep TARGET and PROPERTY tags and tokens that are
// adjacent to a PROPERTY tag without NE types
gen += tokens[i] + " ";
dropped = false;
} else if (tokens[i].matches("<.*>")) {
// make tags optional
gen += "(?:" + tokens[i] + " )?"; // greedy
nOfTags++;
dropped = false;
} else if (QuestionInterpreter.lookupKeyword(tokens[i], prop)) {
// keep keywords
gen += tokens[i] + " ";
dropped = false;
// keywords = true;
} else {
// drop other tokens
if (!dropped) gen += "[^<]*?"; // reluctant
dropped = true;
}
}
// if (keywords) // patterns contains keywords
if (nOfTags <= MAX_TAGS) // at most MAX_TAGS NE or CONTEXT tags
gens.add(gen.trim());
}
return gens.toArray(new String[gens.size()]);
}
/**
* Extracts answer patterns from the answer string of a <code>Result</code>
* object and adds them to the <code>AnswerPatternFilter</code>.
*
* @param result <code>Result</code> object
* @param as the answer to the question
*/
public static void extract(Result result, String as) {
// get interpretation and answer string
QuestionInterpretation qi = result.getQuery().getInterpretation();
String to = qi.getTarget();
// String[] cos = qi.getContext();
String[] cos = new String[0]; // CONTEXT objects are ignored
String prop = qi.getProperty();
String answer = result.getAnswer();
// tokenize interpretation and provided answer, convert to lower-case
to = NETagger.tokenizeWithSpaces(to).toLowerCase();
for (int i = 0; i < cos.length; i++)
cos[i] = NETagger.tokenizeWithSpaces(cos[i]).toLowerCase();
as = NETagger.tokenizeWithSpaces(as).toLowerCase();
// split answer string into sentences and tokenize sentences
String[] sentences = OpenNLP.sentDetect(answer);
String[][] tokens = new String[sentences.length][];
for (int i = 0; i < sentences.length; i++) {
tokens[i] = NETagger.tokenize(sentences[i]);
sentences[i] = StringUtils.concatWithSpaces(tokens[i]);
}
// extract named entities
String[][][] nes = NETagger.extractNes(tokens);
// convert sentences and named entities to lower-case
for (int i = 0; i < nes.length; i++) {
sentences[i] = sentences[i].toLowerCase();
for (int j = 0; j < nes[i].length; j++)
for (int k = 0; k < nes[i][j].length; k++)
nes[i][j][k] = nes[i][j][k].toLowerCase();
}
for (int i = 0; i < sentences.length; i++) {
// prepare sentence for pattern extraction
sentences[i] = prepSentence(sentences[i], to, cos, as, nes[i]);
if (sentences[i] == null) continue;
// extract patterns
String[] patterns = extractPatterns(sentences[i]);
// generalize patterns
patterns = generalizePatterns(patterns, prop);
// add patterns
for (String pattern : patterns)
AnswerPatternFilter.addPattern(pattern, prop);
}
}
}