// PatternExtractor.java example
//
// Explorer
// lucida-master
// - lucida
package info.ephyra.patternlearning;

import info.ephyra.answerselection.filters.AnswerPatternFilter;
import info.ephyra.nlp.NETagger;
import info.ephyra.nlp.OpenNLP;
import info.ephyra.questionanalysis.QuestionInterpretation;
import info.ephyra.questionanalysis.QuestionInterpreter;
import info.ephyra.search.Result;
import info.ephyra.util.RegexConverter;
import info.ephyra.util.StringUtils;

import java.util.HashSet;
import java.util.Hashtable;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts answer patterns from text passages and adds them to the
 * <code>AnswerPatternFilter</code>.
 * 
 * <p>A passage is split into sentences. In each sentence the TARGET of the
 * question, the answer (PROPERTY), CONTEXT objects and named entities (NEs)
 * are replaced by tags. Token sequences that connect exactly one TARGET tag
 * with exactly one PROPERTY tag are then extracted, generalized and handed to
 * the <code>AnswerPatternFilter</code>.</p>
 * 
 * @author Nico Schlaefer
 * @version 2006-04-04
 */
public class PatternExtractor {
	/**
	 * Maximum number of NE and CONTEXT tags in a pattern (for time
	 * performance).
	 */
	private static final int MAX_TAGS = 5;
	
	/** Tag that replaces the TARGET object. */
	private static final String TARGET_TAG = "<TO>";
	
	/** Tag that replaces a CONTEXT object. */
	private static final String CONTEXT_TAG = "<CO>";
	
	/** PROPERTY tag that does not carry any NE types. */
	private static final String UNTYPED_PROPERTY_TAG = "<PO>";
	
	/** Matches a whole token that is a PROPERTY tag (with or without NE types). */
	private static final Pattern PROPERTY_TOKEN = Pattern.compile("<PO.*>");
	
	/** Finds PROPERTY tags inside a pattern (reluctant, used for counting). */
	private static final Pattern PROPERTY_TAG = Pattern.compile("<PO.*?>");
	
	/** Finds TARGET tags inside a pattern (used for counting). */
	private static final Pattern TARGET_TAG_PATTERN = Pattern.compile("<TO>");
	
	/** Finds a PROPERTY tag that is combined with NE types. */
	private static final Pattern TYPED_PROPERTY_TAG = Pattern.compile("<PO_.*?>");
	
	/**
	 * Finds the span from the TARGET tag to a PROPERTY tag with NE types (or
	 * vice versa), excluding the tokens outside of the two tags.
	 */
	private static final Pattern TYPED_SPAN =
		Pattern.compile("(<TO>.*?<PO_.*?>)|(<PO_.*?>.*?<TO>)");
	
	/** Matches a whole token that is any kind of tag. */
	private static final Pattern ANY_TAG_TOKEN = Pattern.compile("<.*>");
	
	/**
	 * Replaces all occurrences of the given expressions by a fixed tag.
	 * Expressions are processed in order of descending length, presumably so
	 * that an expression is replaced before any shorter expression it
	 * contains.
	 * 
	 * @param sentence input sentence
	 * @param expressions expressions to be replaced
	 * @param tag tag that is inserted literally for each occurrence
	 * @return sentence with the expressions replaced by the tag
	 */
	private static String replaceWithTag(String sentence,
										 HashSet<String> expressions,
										 String tag) {
		String[] sorted = expressions.toArray(new String[expressions.size()]);
		StringUtils.sortByLengthDesc(sorted);
		
		// the tag is plain text, not a replacement pattern
		String replacement = Matcher.quoteReplacement(tag);
		for (String expression : sorted) {
			String regex = RegexConverter.strToRegexWithBounds(expression);
			sentence = sentence.replaceAll(regex, replacement);
		}
		
		return sentence;
	}
	
	/**
	 * Replaces all occurrences of the given expressions by their individual
	 * tags. The tags in the table are stored without the closing
	 * <code>'&gt;'</code>, which is appended here. Expressions are processed
	 * in order of descending length.
	 * 
	 * @param sentence input sentence
	 * @param openTags maps each expression to its tag without the closing
	 * 				   <code>'&gt;'</code>
	 * @return sentence with the expressions replaced by their tags
	 */
	private static String replaceWithTags(String sentence,
										  Hashtable<String, String> openTags) {
		String[] sorted =
			openTags.keySet().toArray(new String[openTags.size()]);
		StringUtils.sortByLengthDesc(sorted);
		
		for (String expression : sorted) {
			// NE type names end up in the tag, so quote it to keep '$' and
			// '\' from being interpreted by replaceAll()
			String replacement =
				Matcher.quoteReplacement(openTags.get(expression) + ">");
			String regex = RegexConverter.strToRegexWithBounds(expression);
			sentence = sentence.replaceAll(regex, replacement);
		}
		
		return sentence;
	}
	
	/**
	 * Adds an NE type to a tag that is still missing its closing
	 * <code>'&gt;'</code>.
	 * 
	 * @param openTag tag built so far or <code>null</code>, if there is none
	 * @param prefix prefix of a new tag, used if <code>openTag</code> is
	 * 				 <code>null</code>
	 * @param neType NE type to be added
	 * @return new tag, extended tag, or the unchanged tag if it already
	 * 		   contains the NE type
	 */
	private static String addNeType(String openTag, String prefix,
									String neType) {
		if (openTag == null) return prefix + neType;
		if (openTag.contains(neType)) return openTag;
		return openTag + "_" + neType;
	}
	
	/**
	 * Checks if a token is a PROPERTY tag (with or without NE types).
	 * 
	 * @param token a single token
	 * @return <code>true</code> iff the whole token is a PROPERTY tag
	 */
	private static boolean isPropertyToken(String token) {
		return PROPERTY_TOKEN.matcher(token).matches();
	}
	
	/**
	 * Checks if a regular expression occurs exactly once in a string.
	 * 
	 * @param regex compiled regular expression
	 * @param s string to be searched
	 * @return <code>true</code> iff there is exactly one (non-overlapping)
	 * 		   match
	 */
	private static boolean occursOnce(Pattern regex, String s) {
		Matcher m = regex.matcher(s);
		return m.find() && !m.find();
	}
	
	/**
	 * Adds a candidate pattern to the set if it contains exactly one TARGET
	 * tag and exactly one PROPERTY tag.
	 * 
	 * @param patterns set of patterns collected so far
	 * @param candidate candidate pattern
	 */
	private static void addIfWellFormed(HashSet<String> patterns,
										String candidate) {
		if (occursOnce(TARGET_TAG_PATTERN, candidate) &&
			occursOnce(PROPERTY_TAG, candidate))
			patterns.add(candidate);
	}
	
	/**
	 * Replaces all TARGET objects in the sentence.
	 * 
	 * @param sentence input sentence
	 * @param to the TARGET object of the question
	 * @param nes the NEs in the sentence
	 * @return sentence with TARGET tags or <code>null</code>, if the sentence
	 * 		   does not contain the TARGET
	 */
	private static String replaceTarget(String sentence, String to,
										String[][] nes) {
		// the TARGET itself plus all NEs that equal it after normalization
		HashSet<String> reps = new HashSet<String>();
		for (String[] neType : nes)
			for (String ne : neType)
				if (StringUtils.equalsCommonNorm(ne, to)) reps.add(ne);
		reps.add(to);
		
		String result = replaceWithTag(sentence, reps, TARGET_TAG);
		
		// an unchanged sentence means that nothing was replaced
		return (result.equals(sentence)) ? null : result;
	}
	
	/**
	 * Replaces all PROPERTY objects in the sentence. An occurrence that is
	 * also an NE is replaced by a tag of the form
	 * <code>&lt;PO_type1_type2&gt;</code>, the plain answer string is replaced
	 * by <code>&lt;PO&gt;</code>.
	 * 
	 * @param sentence input sentence
	 * @param as the answer to the question
	 * @param nes the NEs in the sentence, <code>nes[i]</code> holds the NEs of
	 * 			  the type named by <code>NETagger.getNeType(i)</code>
	 * @return sentence with PROPERTY tags or <code>null</code>, if the sentence
	 * 		   does not contain the answer
	 */
	private static String replaceProperty(String sentence, String as,
										  String[][] nes) {
		Hashtable<String, String> reps = new Hashtable<String, String>();
		
		for (int i = 0; i < nes.length; i++) {
			String neType = NETagger.getNeType(i);
			
			for (String ne : nes[i])
				if (StringUtils.equalsCommonNorm(ne, as))
					reps.put(ne, addNeType(reps.get(ne), "<PO_", neType));
		}
		// the answer string itself gets an untyped tag unless it is an NE
		if (!reps.containsKey(as)) reps.put(as, "<PO");
		
		String result = replaceWithTags(sentence, reps);
		
		// an unchanged sentence means that nothing was replaced
		return (result.equals(sentence)) ? null : result;
	}
	
	/**
	 * Replaces all CONTEXT objects in the sentence.
	 * 
	 * @param sentence input sentence
	 * @param cos the CONTEXT objects of the question
	 * @param nes the NEs in the sentence
	 * @return sentence with CONTEXT tags
	 */
	private static String replaceContext(String sentence, String[] cos,
										 String[][] nes) {
		// the CONTEXT objects plus all NEs that equal one of them after
		// normalization
		HashSet<String> reps = new HashSet<String>();
		for (String[] neType : nes)
			for (String ne : neType)
				for (String co : cos)
					if (StringUtils.equalsCommonNorm(ne, co)) reps.add(ne);
		for (String co : cos) reps.add(co);
		
		return replaceWithTag(sentence, reps, CONTEXT_TAG);
	}
	
	/**
	 * Replaces all NEs in the sentence by tags of the form
	 * <code>&lt;type1_type2&gt;</code>.
	 * 
	 * @param sentence input sentence
	 * @param nes the NEs in the sentence, <code>nes[i]</code> holds the NEs of
	 * 			  the type named by <code>NETagger.getNeType(i)</code>
	 * @return sentence with NE tags
	 */
	private static String replaceNes(String sentence, String[][] nes) {
		Hashtable<String, String> reps = new Hashtable<String, String>();
		
		for (int i = 0; i < nes.length; i++) {
			String neType = NETagger.getNeType(i);
			
			for (String ne : nes[i])
				reps.put(ne, addNeType(reps.get(ne), "<", neType));
		}
		
		return replaceWithTags(sentence, reps);
	}
	
	/**
	 * Prepares a sentence for pattern extraction.
	 * 
	 * @param sentence input sentence
	 * @param to the TARGET object of the question
	 * @param cos the CONTEXT objects of the question
	 * @param po the answer to the question
	 * @param nes the NEs in the sentence
	 * @return sentence ready for pattern extraction or <code>null</code>, if
	 * 		   there is no TARGET or PROPERTY object in the input sentence
	 */
	private static String prepSentence(String sentence, String to, String[] cos,
									   String po, String[][] nes) {
		// TARGET and PROPERTY are mandatory, give up if either is missing
		String tagged = replaceTarget(sentence, to, nes);
		if (tagged == null) return null;
		tagged = replaceProperty(tagged, po, nes);
		if (tagged == null) return null;
		
		// CONTEXT objects and remaining NEs are optional
		tagged = replaceContext(tagged, cos, nes);
		tagged = replaceNes(tagged, nes);
		
		// add '#' at beginning and end of sentence, then transform into a
		// regular expression
		return RegexConverter.strToRegex("# " + tagged + " #");
	}
	
	/**
	 * Extract basic answer patterns from the sentence. A basic pattern is the
	 * token sequence between a TARGET tag and a PROPERTY tag (in either
	 * order), extended by the one token on the far side of the PROPERTY tag.
	 * Candidates without exactly one TARGET and one PROPERTY tag are dropped,
	 * as are candidates whose PROPERTY tag has no neighboring token on the far
	 * side (PROPERTY tag at the very beginning or end of the sentence).
	 * 
	 * @param sentence input sentence
	 * @return basic answer patterns
	 */
	private static String[] extractPatterns(String sentence) {
		String[] tokens = sentence.split(" ");
		HashSet<String> patterns = new HashSet<String>();
		StringBuilder ap = new StringBuilder();
		
		// TARGET comes before PROPERTY
		for (int i = 0; i < tokens.length; i++) {
			String token = tokens[i];
			
			if (token.equals(TARGET_TAG)) {
				// (re)start the pattern at the TARGET tag
				ap.setLength(0);
				ap.append(token);
			} else if (ap.length() > 0) {
				ap.append(' ').append(token);  // add to pattern
				
				if (isPropertyToken(token)) {
					// the pattern ends with the token following the tag
					if (i + 1 < tokens.length) {
						ap.append(' ').append(tokens[i + 1]);
						addIfWellFormed(patterns, ap.toString());
					}
					ap.setLength(0);
				}
			}
		}
		
		// PROPERTY comes before TARGET
		ap.setLength(0);
		for (int i = 0; i < tokens.length; i++) {
			String token = tokens[i];
			
			if (isPropertyToken(token)) {
				// (re)start the pattern at the token preceding the tag
				ap.setLength(0);
				if (i > 0) ap.append(tokens[i - 1]).append(' ').append(token);
			} else if (ap.length() > 0) {
				ap.append(' ').append(token);  // add to pattern
				
				if (token.equals(TARGET_TAG)) {
					addIfWellFormed(patterns, ap.toString());
					ap.setLength(0);
				}
			}
		}
		
		return patterns.toArray(new String[patterns.size()]);
	}
	
	/**
	 * Generates more generic patterns from the initial patterns. The result
	 * contains the patterns from the first generalization step as well as the
	 * patterns derived from them in the second step.
	 * 
	 * @param patterns initial patterns
	 * @param prop PROPERTY that the patterns extract
	 * @return more generic patterns
	 */
	private static String[] generalizePatterns(String[] patterns, String prop) {
		HashSet<String> gens = new HashSet<String>();
		
		// if the PROPERTY tag is combined with NE types, replace the pattern
		// by applying the following generalizations:
		// - drop the token preceding/following the PROPERTY tag
		// - drop the NE types
		for (String pattern : patterns) {
			Matcher m = TYPED_SPAN.matcher(pattern);
			if (m.find()) {
				gens.add(m.group(0));
				gens.add(TYPED_PROPERTY_TAG.matcher(pattern)
						 .replaceFirst(UNTYPED_PROPERTY_TAG));
			} else {
				gens.add(pattern);
			}
		}
		
		// drop all tokens in between the TARGET and PROPERTY tag that are not
		// keywords or tags and that are not adjacent to a PROPERTY tag without
		// NE types and make tags optional
		// (iterate over a snapshot since the set is extended in the loop)
		String[] snapshot = gens.toArray(new String[gens.size()]);
		for (String pattern : snapshot) {
			String[] tokens = pattern.split(" ");
			StringBuilder gen = new StringBuilder();
			int nOfTags = 0;
			boolean dropped = false;  // true, iff last token was dropped
			
			for (int i = 0; i < tokens.length; i++) {
				String token = tokens[i];
				boolean besideUntypedProperty =
					(i > 0 && tokens[i - 1].equals(UNTYPED_PROPERTY_TAG)) ||
					(i < tokens.length - 1 &&
					 tokens[i + 1].equals(UNTYPED_PROPERTY_TAG));
				
				if (token.equals(TARGET_TAG) || isPropertyToken(token) ||
					besideUntypedProperty) {
					// keep TARGET and PROPERTY tags and tokens that are
					// adjacent to a PROPERTY tag without NE types
					gen.append(token).append(' ');
					dropped = false;
				} else if (ANY_TAG_TOKEN.matcher(token).matches()) {
					// make tags optional
					gen.append("(?:").append(token).append(" )?");  // greedy
					nOfTags++;
					dropped = false;
				} else if (QuestionInterpreter.lookupKeyword(token, prop)) {
					// keep keywords
					gen.append(token).append(' ');
					dropped = false;
				} else {
					// drop other tokens, a run of dropped tokens is replaced
					// by a single wildcard
					if (!dropped) gen.append("[^<]*?");  // reluctant
					dropped = true;
				}
			}
			
			// at most MAX_TAGS NE or CONTEXT tags (patterns are not required
			// to contain keywords)
			if (nOfTags <= MAX_TAGS) gens.add(gen.toString().trim());
		}
		
		return gens.toArray(new String[gens.size()]);
	}
	
	/**
	 * Extracts answer patterns from the answer string of a <code>Result</code>
	 * object and adds them to the <code>AnswerPatternFilter</code>.
	 * 
	 * @param result <code>Result</code> object
	 * @param as the answer to the question
	 */
	public static void extract(Result result, String as) {
		// get interpretation and answer string
		QuestionInterpretation qi = result.getQuery().getInterpretation();
		String to = qi.getTarget();
		// CONTEXT objects are ignored, the CONTEXT of the interpretation is
		// deliberately not used
		String[] cos = new String[0];
		String prop = qi.getProperty();
		String answer = result.getAnswer();
		
		// tokenize interpretation and provided answer, convert to lower-case
		to = NETagger.tokenizeWithSpaces(to).toLowerCase();
		for (int i = 0; i < cos.length; i++)
			cos[i] = NETagger.tokenizeWithSpaces(cos[i]).toLowerCase();
		as = NETagger.tokenizeWithSpaces(as).toLowerCase();
		
		// split answer string into sentences and tokenize sentences, the
		// sentences are rebuilt from their tokens in lower-case while the
		// tokens keep their case for NE extraction
		String[] sentences = OpenNLP.sentDetect(answer);
		String[][] tokens = new String[sentences.length][];
		for (int i = 0; i < sentences.length; i++) {
			tokens[i] = NETagger.tokenize(sentences[i]);
			sentences[i] =
				StringUtils.concatWithSpaces(tokens[i]).toLowerCase();
		}
		
		// extract named entities and convert them to lower-case
		String[][][] nes = NETagger.extractNes(tokens);
		for (String[][] sentenceNes : nes)
			for (String[] typeNes : sentenceNes)
				for (int k = 0; k < typeNes.length; k++)
					typeNes[k] = typeNes[k].toLowerCase();
		
		for (int i = 0; i < sentences.length; i++) {
			// prepare sentence for pattern extraction
			String prepared = prepSentence(sentences[i], to, cos, as, nes[i]);
			if (prepared == null) continue;  // no TARGET or no answer
			
			// extract and generalize patterns
			String[] patterns =
				generalizePatterns(extractPatterns(prepared), prop);
			
			// add patterns
			for (String pattern : patterns)
				AnswerPatternFilter.addPattern(pattern, prop);
		}
	}
}