package info.ephyra.trec;
import info.ephyra.io.MsgPrinter;
import info.ephyra.querygeneration.Query;
import info.ephyra.questionanalysis.QuestionInterpretation;
import info.ephyra.search.Result;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Locale;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
/**
* A parser for the TREC 13-16 QA tracks.
*
* @author Nico Schlaefer
* @version 2008-02-07
*/
public class TREC13To16Parser {
/** Characters that have to be replaced before parsing an XML document. */
private static final String[] SPECIALCHARS = {"&"};
/** Replacements for the special characters. */
private static final String[] REPLACEMENTS = {"&"};
/** Cached log file entries of type "factoid". */
private static ArrayList<String> factoidEntries;
/** Cached log file entries of type "list". */
private static ArrayList<String> listEntries;
/** Cached log file entries of type "other". */
private static ArrayList<String> otherEntries;
/**
* Drops the cached entries of type "factoid".
*/
public static void dropCachedFactoidEntries() {
factoidEntries = null;
}
/**
* Drops the cached entries of type "list".
*/
public static void dropCachedListEntries() {
listEntries = null;
}
/**
* Drops the cached entries of type "other".
*/
public static void dropCachedOtherEntries() {
otherEntries = null;
}
/**
* Drops the cached entries of all types.
*/
public static void dropAllCachedEntries() {
dropCachedFactoidEntries();
dropCachedListEntries();
dropCachedOtherEntries();
}
/**
* Loads the target objects from a file.
*
* @param filename file that contains the targets
* @return targets or <code>null</code>, if the file could not be parsed
*/
public static TRECTarget[] loadTargets(String filename) {
try {
// create factory object
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
// create DOM parser
DocumentBuilder parser = factory.newDocumentBuilder();
// parse file and build tree
Document trecD = parser.parse(new File(filename));
NodeList targetL = trecD.getElementsByTagName("target");
TRECTarget[] targets = new TRECTarget[targetL.getLength()];
for (int i = 0; i < targets.length; i++) {
Element targetE = (Element) targetL.item(i);
String targetId = targetE.getAttribute("id").trim();
String targetDesc = targetE.getAttribute("text").trim();
NodeList questionL = targetE.getElementsByTagName("q");
TRECQuestion[] questions = new TRECQuestion[questionL.getLength()];
for (int j = 0; j < questions.length; j++) {
Element questionE = (Element) questionL.item(j);
String questionId = questionE.getAttribute("id").trim();
String type = questionE.getAttribute("type").trim();
String questionString = questionE.getFirstChild().getNodeValue().trim();
questions[j] = new TRECQuestion(questionId, type, questionString);
}
targets[i] = new TRECTarget(targetId, targetDesc, questions);
}
return targets;
} catch (Exception e) {
MsgPrinter.printErrorMsg("Failed to load or parse question file:");
MsgPrinter.printErrorMsg(e.toString());
return null;
}
}
/**
* Loads patterns for the factoid or list questions from a file.
*
* @param filename file that contains the patterns
* @return patterns or <code>null</code>, if the file could not be parsed
*/
public static TRECPattern[] loadPatterns(String filename) {
File file = new File(filename);
try {
BufferedReader in = new BufferedReader(new FileReader(file));
String[] line;
String id, lastId = "";
ArrayList<String> regexs = new ArrayList<String>();
ArrayList<TRECPattern> patterns = new ArrayList<TRECPattern>();
while (in.ready()) {
line = in.readLine().split(" ", 2);
id = line[0];
if (!id.equals(lastId) && !lastId.equals("")) {
// add pattern for previous question ID
patterns.add(new TRECPattern(lastId,
regexs.toArray(new String[regexs.size()])));
regexs = new ArrayList<String>();
}
regexs.add("(?i)" + line[1].trim());
lastId = id;
}
// add last pattern
patterns.add(new TRECPattern(lastId,
regexs.toArray(new String[regexs.size()])));
in.close();
return patterns.toArray(new TRECPattern[patterns.size()]);
} catch (IOException e) {
MsgPrinter.printErrorMsg("Failed to load or parse pattern file:");
MsgPrinter.printErrorMsg(e.toString());
return null;
}
}
/**
* Loads the results for a question from a log file.
*
* @param question the question
* @param type the type of question ("factoid", "list" or "other")
* @param logfile the log file
* @return array of results or <code>null</code> if the question could not
* be found in the log file
*/
public static Result[] loadResults(String question, String type, String logfile) {
try {
// get cached entries for given question type
ArrayList<String> entries;
if (type.equals("FACTOID")) entries = factoidEntries;
else if (type.equals("LIST")) entries = listEntries;
else entries = otherEntries;
// get entries from log file if not cached
if (entries == null) {
entries = new ArrayList<String>();
String entry = "";
BufferedReader in = new BufferedReader(new FileReader(logfile));
while (in.ready()) {
String line = in.readLine();
// handle characters that are not allowed in XML
for (int i = 0; i < SPECIALCHARS.length; i++)
line = line.replace(SPECIALCHARS[i], REPLACEMENTS[i]);
// if (!line.matches("\\s*+</?\\w++>\\s*+"))
// line = "<![CDATA[" + line.trim() + "]]>";
if (line.matches("<" + type.toLowerCase() + ">")) entry = "";
entry += line + "\n";
if (line.matches("</" + type.toLowerCase() + ">")) entries.add(entry);
}
// cache entries
if (type.equals("FACTOID")) factoidEntries = entries;
else if (type.equals("LIST")) listEntries = entries;
else otherEntries = entries;
}
// traverse entries in reverse order
for (int i = entries.size() - 1; i >= 0; i--) {
// create factory object
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
// create DOM parser
DocumentBuilder parser = factory.newDocumentBuilder();
// parse entry and build tree
Document entryD = parser.parse(new InputSource(new StringReader(entries.get(i))));
// Is this the question we are looking for?
Element questionE = (Element) entryD.getElementsByTagName("question").item(0);
String questionS = questionE.getFirstChild().getNodeValue().trim();
if (!questionS.equals(question)) continue;
// get results
ArrayList<Result> results = new ArrayList<Result>();
NodeList resultsL = entryD.getElementsByTagName("result");
for (int j = 0; j < resultsL.getLength(); j++) {
Element resultE = (Element) resultsL.item(j);
Element answerE = (Element) resultE.getElementsByTagName("answer").item(0);
String answerS = answerE.getFirstChild().getNodeValue().trim();
Element scoreE = (Element) resultE.getElementsByTagName("score").item(0);
float scoreF = Float.parseFloat(scoreE.getFirstChild().getNodeValue().trim());
Element docidE = (Element) resultE.getElementsByTagName("docid").item(0);
String docidS = docidE.getFirstChild().getNodeValue().trim();
Element qiE = (Element) resultE.getElementsByTagName("interpretation").item(0);
QuestionInterpretation qi = null;
if (qiE != null) {
Element propertyE = (Element) qiE.getElementsByTagName("property").item(0);
String propertyS = propertyE.getFirstChild().getNodeValue().trim();
Element targetE = (Element) qiE.getElementsByTagName("target").item(0);
String targetS = targetE.getFirstChild().getNodeValue().trim();
NodeList contextL = qiE.getElementsByTagName("context");
String[] contextS = new String[contextL.getLength()];
for (int k = 0; k < contextS.length; k++) {
Element contextE = (Element) contextL.item(k);
contextS[k] = contextE.getFirstChild().getNodeValue().trim();
}
qi = new QuestionInterpretation(targetS, contextS, propertyS);
}
Query query = new Query(null);
query.setInterpretation(qi);
Result result = new Result(answerS, query, docidS);
result.setScore(scoreF);
results.add(result);
}
return results.toArray(new Result[results.size()]);
}
return null; // question not found
} catch (Exception e) {
MsgPrinter.printErrorMsg("Failed to load or parse log file:");
MsgPrinter.printErrorMsg(e.toString());
return null;
}
}
/**
* Appends answers to an output file.
*
* @param filename the output file
* @param answers the answers
* @param correct for each answer a flag that is true iff the answer is
* correct or <code>null</code> if the answers were not evaluated
* @param runTag tag that uniquely identifies the run
* @return <code>true</code>, iff the answers could be saved
*/
public static boolean saveAnswers(String filename, TRECAnswer[] answers,
boolean[] correct, String runTag) {
try {
PrintWriter out =
new PrintWriter(new FileOutputStream(filename, true));
for (int i = 0; i < answers.length; i++) {
String line = answers[i].getId();
line += " " + runTag;
if (correct == null || correct.length < i + 1)
line += " ";
else
line += ((correct[i]) ? " + " : " - ");
line += answers[i].getSupportDoc();
if (answers[i].getAnswerString() != null)
line += " " + answers[i].getAnswerString();
out.println(line);
}
out.close();
} catch (IOException e) {
MsgPrinter.printErrorMsg("Failed to save answers:");
MsgPrinter.printErrorMsg(e.toString());
return false;
}
return true;
}
}