TREC13To16Parser.java example

Explorer
lucida-master
- lucida
package info.ephyra.trec;

import info.ephyra.io.MsgPrinter;
import info.ephyra.querygeneration.Query;
import info.ephyra.questionanalysis.QuestionInterpretation;
import info.ephyra.search.Result;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;
import java.util.ArrayList;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

/**
 * A parser for the TREC 13-16 QA tracks.
 * 
 * @author Nico Schlaefer
 * @version 2008-02-07
 */
public class TREC13To16Parser {
	/** Characters that have to be replaced before parsing an XML document. */
	private static final String[] SPECIALCHARS = {"&"};
	/** Replacements for the special characters. */
	private static final String[] REPLACEMENTS = {"&"};
	
	/** Cached log file entries of type "factoid". */
	private static ArrayList<String> factoidEntries;
	/** Cached log file entries of type "list". */
	private static ArrayList<String> listEntries;
	/** Cached log file entries of type "other". */
	private static ArrayList<String> otherEntries;
	
	/**
	 * Drops the cached entries of type "factoid".
	 */
	public static void dropCachedFactoidEntries() {
		factoidEntries = null;
	}
	
	/**
	 * Drops the cached entries of type "list".
	 */
	public static void dropCachedListEntries() {
		listEntries = null;
	}
	
	/**
	 * Drops the cached entries of type "other".
	 */
	public static void dropCachedOtherEntries() {
		otherEntries = null;
	}
	
	/**
	 * Drops the cached entries of all types.
	 */
	public static void dropAllCachedEntries() {
		dropCachedFactoidEntries();
		dropCachedListEntries();
		dropCachedOtherEntries();
	}
	
	/**
	 * Loads the target objects from a file.
	 * 
	 * @param filename file that contains the targets
	 * @return targets or <code>null</code>, if the file could not be parsed
	 */
	public static TRECTarget[] loadTargets(String filename) {
		try {
			// create factory object
			DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
			// create DOM parser
			DocumentBuilder parser = factory.newDocumentBuilder();
			// parse file and build tree
			Document trecD = parser.parse(new File(filename));
			
			NodeList targetL = trecD.getElementsByTagName("target");
			TRECTarget[] targets = new TRECTarget[targetL.getLength()];
			
			for (int i = 0; i < targets.length; i++) {
				Element targetE = (Element) targetL.item(i);
				
				String targetId = targetE.getAttribute("id").trim();
				String targetDesc = targetE.getAttribute("text").trim();
				
				NodeList questionL = targetE.getElementsByTagName("q");
				TRECQuestion[] questions = new TRECQuestion[questionL.getLength()];
				
				for (int j = 0; j < questions.length; j++) {
					Element questionE = (Element) questionL.item(j);
					
					String questionId = questionE.getAttribute("id").trim();
					String type = questionE.getAttribute("type").trim();
					String questionString =	questionE.getFirstChild().getNodeValue().trim();
					
					questions[j] = new TRECQuestion(questionId, type, questionString);
				}
				
				targets[i] = new TRECTarget(targetId, targetDesc, questions);
			}
			
			return targets;
		} catch (Exception e) {
			MsgPrinter.printErrorMsg("Failed to load or parse question file:");
			MsgPrinter.printErrorMsg(e.toString());
			
			return null;
		}
	}
	
	/**
	 * Loads patterns for the factoid or list questions from a file.
	 * 
	 * @param filename file that contains the patterns
	 * @return patterns or <code>null</code>, if the file could not be parsed
	 */
	public static TRECPattern[] loadPatterns(String filename) {
		File file = new File(filename);
		
		try {
			BufferedReader in = new BufferedReader(new FileReader(file));
			
			String[] line;
			String id, lastId = "";
			ArrayList<String> regexs = new ArrayList<String>();
			ArrayList<TRECPattern> patterns = new ArrayList<TRECPattern>();
			
			while (in.ready()) {
				line = in.readLine().split(" ", 2);
				
				id = line[0];
				if (!id.equals(lastId) && !lastId.equals("")) {
					// add pattern for previous question ID
					patterns.add(new TRECPattern(lastId,
							regexs.toArray(new String[regexs.size()])));
					regexs = new ArrayList<String>();
				}
				
				regexs.add("(?i)" + line[1].trim());
				lastId = id;
			}
			
			// add last pattern
			patterns.add(new TRECPattern(lastId,
					regexs.toArray(new String[regexs.size()])));
			
			in.close();
			
			return patterns.toArray(new TRECPattern[patterns.size()]);
		} catch (IOException e) {
			MsgPrinter.printErrorMsg("Failed to load or parse pattern file:");
			MsgPrinter.printErrorMsg(e.toString());
			
			return null;
		}
	}
	
	/**
	 * Loads the results for a question from a log file.
	 * 
	 * @param question the question
	 * @param type the type of question ("factoid", "list" or "other")
	 * @param logfile the log file
	 * @return array of results or <code>null</code> if the question could not
	 * 		   be found in the log file
	 */
	public static Result[] loadResults(String question, String type, String logfile) {
		try {
			// get cached entries for given question type
			ArrayList<String> entries;
			if (type.equals("FACTOID")) entries = factoidEntries;
			else if (type.equals("LIST")) entries = listEntries;
			else entries = otherEntries;
			
			// get entries from log file if not cached
			if (entries == null) {
				entries = new ArrayList<String>();
				String entry = "";
				BufferedReader in = new BufferedReader(new FileReader(logfile));
				while (in.ready()) {
					String line = in.readLine();
					
					// handle characters that are not allowed in XML
					for (int i = 0; i < SPECIALCHARS.length; i++)
						line = line.replace(SPECIALCHARS[i], REPLACEMENTS[i]);
//					if (!line.matches("\\s*+</?\\w++>\\s*+"))
//						line = "<![CDATA[" + line.trim() + "]]>";
					
					if (line.matches("<" + type.toLowerCase() + ">")) entry = "";
					entry += line + "\n";
					if (line.matches("</" + type.toLowerCase() + ">")) entries.add(entry);
				}
				
				// cache entries
				if (type.equals("FACTOID")) factoidEntries = entries;
				else if (type.equals("LIST")) listEntries = entries;
				else otherEntries = entries;
			}
			
			// traverse entries in reverse order
			for (int i = entries.size() - 1; i >= 0; i--) {
				// create factory object
				DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
				// create DOM parser
				DocumentBuilder parser = factory.newDocumentBuilder();
				// parse entry and build tree
				Document entryD = parser.parse(new InputSource(new StringReader(entries.get(i))));
				
				// Is this the question we are looking for?
				Element questionE = (Element) entryD.getElementsByTagName("question").item(0);
				String questionS = questionE.getFirstChild().getNodeValue().trim();
				if (!questionS.equals(question)) continue;
				
				// get results
				ArrayList<Result> results = new ArrayList<Result>();
				NodeList resultsL = entryD.getElementsByTagName("result");
				for (int j = 0; j < resultsL.getLength(); j++) {
					Element resultE = (Element) resultsL.item(j);
					
					Element answerE = (Element) resultE.getElementsByTagName("answer").item(0);
					String answerS = answerE.getFirstChild().getNodeValue().trim();
					
					Element scoreE = (Element) resultE.getElementsByTagName("score").item(0);
					float scoreF = Float.parseFloat(scoreE.getFirstChild().getNodeValue().trim());
					
					Element docidE = (Element) resultE.getElementsByTagName("docid").item(0);
					String docidS = docidE.getFirstChild().getNodeValue().trim();
					
					Element qiE = (Element) resultE.getElementsByTagName("interpretation").item(0);
					QuestionInterpretation qi = null;
					if (qiE != null) {
						Element propertyE = (Element) qiE.getElementsByTagName("property").item(0);
						String propertyS = propertyE.getFirstChild().getNodeValue().trim();
						
						Element targetE = (Element) qiE.getElementsByTagName("target").item(0);
						String targetS = targetE.getFirstChild().getNodeValue().trim();
						
						NodeList contextL = qiE.getElementsByTagName("context");
						String[] contextS = new String[contextL.getLength()];
						for (int k = 0; k < contextS.length; k++) {
							Element contextE = (Element) contextL.item(k);
							contextS[k] = contextE.getFirstChild().getNodeValue().trim();
						}
						
						qi = new QuestionInterpretation(targetS, contextS, propertyS);
					}
					
					Query query = new Query(null);
					query.setInterpretation(qi);
					Result result = new Result(answerS, query, docidS);
					result.setScore(scoreF);
					
					results.add(result);
				}
				
				return results.toArray(new Result[results.size()]);
			}
			
			return null;  // question not found
		} catch (Exception e) {
			MsgPrinter.printErrorMsg("Failed to load or parse log file:");
			MsgPrinter.printErrorMsg(e.toString());
			
			return null;
		}
	}
	
	/**
	 * Appends answers to an output file.
	 * 
	 * @param filename the output file
	 * @param answers the answers
	 * @param correct for each answer a flag that is true iff the answer is
	 * 		  correct or <code>null</code> if the answers were not evaluated
	 * @param runTag tag that uniquely identifies the run
	 * @return <code>true</code>, iff the answers could be saved
	 */
	public static boolean saveAnswers(String filename, TRECAnswer[] answers,
									  boolean[] correct, String runTag) {
		try {
			PrintWriter out =
				new PrintWriter(new FileOutputStream(filename, true));
			
			for (int i = 0; i < answers.length; i++) {
				String line = answers[i].getId();
				line += "   " + runTag;
				if (correct == null || correct.length < i + 1)
					line += "   ";
				else
					line += ((correct[i]) ? " + " : " - ");
				line += answers[i].getSupportDoc();
				if (answers[i].getAnswerString() != null)
					line += "   " + answers[i].getAnswerString();
				
				out.println(line);
			}
			
			out.close();
		} catch (IOException e) {
			MsgPrinter.printErrorMsg("Failed to save answers:");
			MsgPrinter.printErrorMsg(e.toString());
			return false;
		}
		
		return true;
	}
}