package info.ephyra.trec; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; /** * A parser for the TREC 8-12 QA tracks. * * @author Nico Schlaefer * @version 2007-05-25 */ public class TREC8To12Parser { /** Type of the TREC 8 to 11 questions. */ private static final String QTYPE = "FACTOID"; /** * Loads the questions from a file. * * @param filename file that contains the questions * @return questions or <code>null</code>, if the file could not be parsed */ public static TRECQuestion[] loadQuestions(String filename) { File file = new File(filename); try { BufferedReader in = new BufferedReader(new FileReader(file)); String id = ""; String type = ""; String line, questionString; TRECQuestion question; ArrayList<TRECQuestion> questions = new ArrayList<TRECQuestion>(); while (in.ready()) { line = in.readLine(); if (line.matches("<num>.*")) { id = line.split(": ")[1].trim(); type = QTYPE; // TREC 8 to 11 } else if (line.matches("<type>.*")) { type = line.split(": ")[1].trim().toUpperCase(); // TREC 12 } else if (line.matches("<desc>.*")) { questionString = in.readLine().trim(); question = new TRECQuestion(id, type, questionString); questions.add(question); } } in.close(); return questions.toArray(new TRECQuestion[questions.size()]); } catch (IOException e) { return null; // file could not be parsed } } /** * Loads the patterns from a file. * * @param filename file that contains the patterns * @return patterns or <code>null</code>, if the file could not be parsed */ public static TRECPattern[] loadPatterns(String filename) { TRECPattern[] aligned = loadPatternsAligned(filename); if (aligned == null) return null; // remove null-entries ArrayList<TRECPattern> patterns = new ArrayList<TRECPattern>(); for (TRECPattern pattern : aligned) if (pattern != null) patterns.add(pattern); return patterns.toArray(new TRECPattern[patterns.size()]); } /** * Loads the patterns from a file. For each skipped question ID in the input * file, a <code>null</code> entry is added to the array of patterns. * * @param filename file that contains the patterns * @return patterns aligned to question IDs or <code>null</code>, if the * file could not be parsed */ public static TRECPattern[] loadPatternsAligned(String filename) { File file = new File(filename); try { BufferedReader in = new BufferedReader(new FileReader(file)); String[] line; int id, lastId = -1; String regex = ""; TRECPattern pattern; ArrayList<TRECPattern> patterns = new ArrayList<TRECPattern>(); while (in.ready()) { line = in.readLine().split(" ", 2); id = Integer.parseInt(line[0]); if (id == lastId) // if still the same pattern, append the regular expression regex += "|" + line[1].trim(); else { // next pattern if (!(lastId == -1)) { // if not first pattern, add previous pattern to results regex += ")"; pattern = new TRECPattern(Integer.toString(lastId), new String[] {regex}); patterns.add(pattern); // some number might have been skipped for (int i = lastId + 1; i < id; i++) patterns.add(null); } // start new pattern lastId = id; regex = "(?i)(" + line[1].trim(); // case is ignored } } // add last pattern to results regex += ")"; pattern = new TRECPattern(Integer.toString(lastId), new String[] {regex}); patterns.add(pattern); in.close(); return patterns.toArray(new TRECPattern[patterns.size()]); } catch (IOException e) { return null; // file could not be parsed } } /** * Loads the answers to the TREC9 questions from a file. * * @param filename file that contains the answers * @return answers or <code>null</code>, if the file could not be parsed */ public static TRECAnswer[] loadTREC9Answers(String filename) { File file = new File(filename); try { BufferedReader in = new BufferedReader(new FileReader(file)); String id; String line, answerString; TRECAnswer answer; ArrayList<TRECAnswer> answers = new ArrayList<TRECAnswer>(); while (in.ready()) { line = in.readLine(); if (line.matches("Question.*")) { id = line.split(" ")[1]; in.readLine(); in.readLine(); answerString = in.readLine().trim(); answer = new TRECAnswer(id, answerString); answers.add(answer); } } in.close(); return answers.toArray(new TRECAnswer[answers.size()]); } catch (IOException e) { return null; // file could not be parsed } } }