package info.ephyra.indexing; import info.ephyra.io.MsgPrinter; import info.ephyra.util.FileUtils; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.util.ArrayList; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * A preprocessor for the AQUAINT-2 corpus: * <ul> * <li>adds paragraph tags for document types 'multi', 'advis' and 'other'</li> * <li>converts documents to 'trectext' format required by Indri</li> * </ul> * * @author Nico Schlaefer * @version 2007-07-14 */ public class AQUAINT2Preprocessor { /** Directory of the AQUAINT corpus */ private static String dir; /** * Adds paragraph tags to documents of type 'multi', 'advis' and 'other'. * Documents of type 'story' are usually already tagged. * * @return true, iff the preprocessing was successful */ private static boolean addParagraphTags() { File[] files = FileUtils.getFilesRec(dir); for (File file : files) { // only parse data files if (file.getName().contains(".")) { MsgPrinter.printStatusMsg("Ignoring " + file.getPath() + ".\n"); continue; } MsgPrinter.printStatusMsg("Parsing " + file.getName() + "..."); // read file content and modify ArrayList<String> doc = new ArrayList<String>(); try { FileInputStream fis = new FileInputStream(file); BufferedReader in = new BufferedReader(new InputStreamReader(fis, "UTF-8")); String line, prevLine; boolean text = false, paragraph = false; Pattern p = Pattern.compile("\\s*+<DOC\\s*+id=\"([^\"]*+)\"" + "\\s*+type=\"([^\"]*+)\"\\s*+>\\s*+"); String docNo = null, docType = null; boolean mod = false; while (in.ready()) { line = in.readLine(); prevLine = (doc.size() > 0) ? doc.get(doc.size() - 1) : ""; Matcher m = p.matcher(line); if (m.find()) { docNo = m.group(1); docType = m.group(2); } if (text) { if (paragraph) { if (line.contains("</P>")) { paragraph = false; } else if (line.contains("<P>")) { doc.add("</P>"); mod = true; } else if (line.matches("\\s*+<[^>]++>\\s*+")) { doc.add("</P>"); mod = true; paragraph = false; } else if (!prevLine.contains("<P>") && line.matches("\\s*+")) { doc.add("</P>"); doc.add("<P>"); mod = true; } } else { if (line.contains("<P>")) { paragraph = true; } else if (line.contains("</P>")) { doc.add("<P>"); mod = true; } else if (!line.matches("\\s*+<[^>]++>\\s*+")) { doc.add("<P>"); mod = true; paragraph = true; } } } if (line.contains("<TEXT>")) text = true; if (line.contains("</TEXT>")) { if (mod) { // print message if 'story' document was modified if (!(docType.equals("multi") || docType.equals("advis") || docType.equals("other"))) MsgPrinter.printStatusMsg("Document " + docNo + " of type '" + docType + "' modified."); } mod = false; text = false; } if (!line.matches("\\s*+")) doc.add(line); } in.close(); } catch (IOException e) { return false; } // write modified content try { FileOutputStream fos = new FileOutputStream(file); PrintWriter out = new PrintWriter(new OutputStreamWriter(fos, "UTF-8")); for (String line : doc) out.println(line); out.close(); } catch (IOException e) { return false; } MsgPrinter.printStatusMsg("...parsed.\n"); } return true; } /** * Converts the documents to the 'trectext' format required by Indri. * * @return true, iff the preprocessing was successful */ private static boolean convertToTrectext() { File[] files = FileUtils.getFilesRec(dir); for (File file : files) { // only parse data files if (file.getName().contains(".")) { MsgPrinter.printStatusMsg("Ignoring " + file.getPath() + ".\n"); continue; } MsgPrinter.printStatusMsg("Parsing " + file.getName() + "..."); // read file content and modify ArrayList<String> doc = new ArrayList<String>(); try { FileInputStream fis = new FileInputStream(file); BufferedReader in = new BufferedReader(new InputStreamReader(fis, "UTF-8")); String line; Pattern p = Pattern.compile("\\s*+<DOC\\s*+id=\"([^\"]*+)\"" + "\\s*+type=\"([^\"]*+)\"\\s*+>\\s*+"); String docNo = null, docType = null; while (in.ready()) { line = in.readLine(); if (line.matches("\\s*+<\\?xml .*+") || line.matches("\\s*+<!DOCTYPE .*+") || line.matches("\\s*+<DOCSTREAM>\\s*+") || line.matches("\\s*+</DOCSTREAM>\\s*+")) { System.out.println("Dropping line: " + line); } else if (line.matches("\\s*+<HEADLINE>\\s*+")) { doc.add("<TITLE>"); } else if (line.matches("\\s*+</HEADLINE>\\s*+")) { doc.add("</TITLE>"); } else { Matcher m = p.matcher(line); if (m.find()) { docNo = m.group(1); docType = m.group(2); doc.add("<DOC>"); doc.add("<DOCNO>" + docNo + "</DOCNO>"); doc.add("<DOCTYPE>" + docType + "</DOCTYPE>"); } else { doc.add(line); } } } in.close(); } catch (IOException e) { return false; } // write modified content try { FileOutputStream fos = new FileOutputStream(file); PrintWriter out = new PrintWriter(new OutputStreamWriter(fos, "UTF-8")); for (String line : doc) out.println(line); out.close(); } catch (IOException e) { return false; } MsgPrinter.printStatusMsg("...parsed.\n"); } return true; } /** * <p>Entry point of the program.</p> * * <p>Preprocesses the AQUAINT-2 corpus.</p> * * @param args argument 1: directory of the AQUAINT-2 corpus */ public static void main(String[] args) { if (args.length < 1) { MsgPrinter.printUsage("java AQUAINT2Preprocessor " + "AQUAINT2_directory"); System.exit(1); } dir = args[0]; // enable output of status and error messages MsgPrinter.enableStatusMsgs(true); MsgPrinter.enableErrorMsgs(true); // add paragraph tags if missing MsgPrinter.printStatusMsg("Adding paragraph tags:\n"); if (addParagraphTags()) MsgPrinter.printStatusMsg("Paragraph tags added successfully.\n"); else { MsgPrinter.printErrorMsg("Could not add paragraph tags."); System.exit(1); } // convert to 'trectext' MsgPrinter.printStatusMsg("Converting to 'trectext' format:\n"); if (convertToTrectext()) MsgPrinter.printStatusMsg("Documents converted successfully."); else { MsgPrinter.printErrorMsg("Could not convert documents."); System.exit(1); } } }