package info.ephyra.indexing;

import info.ephyra.io.MsgPrinter;
import info.ephyra.util.FileUtils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.regex.Pattern;

/**
 * A preprocessor for the AQUAINT corpus:
 * <ul>
 * <li>adds paragraph tags if missing</li>
 * <li>splits paragraphs, e.g. to separate publisher details</li>
 * </ul>
 *
 * <p>Both passes rewrite the corpus files in place, and only those files whose
 * content was actually modified.</p>
 *
 * <p>NOTE(review): files are read with {@link FileReader}, i.e. decoded with
 * the platform default charset, but written back as UTF-8. For content that is
 * not pure ASCII on a platform whose default charset is not UTF-8 this
 * transcodes the file - confirm that this is intended for the corpus.</p>
 *
 * @author Nico Schlaefer
 * @version 2006-04-30
 */
public class AQUAINTPreprocessor {
    /** Matches a line that consists of a single tag, optionally surrounded by whitespace. */
    private static final Pattern TAG_ONLY_LINE = Pattern.compile("\\s*+<[^>]++>\\s*+");

    /** Delimiters (" _ " or " -- ") at which a line is split into two paragraphs. */
    private static final Pattern PARAGRAPH_DELIMITER = Pattern.compile("( _ | -- )");

    /** Directory of the AQUAINT corpus */
    private static String dir;

    /**
     * Writes the given lines to the file, replacing its previous content.
     * The output is encoded as UTF-8 and each line is terminated with the
     * platform line separator.
     *
     * @param file file to overwrite
     * @param lines lines to write
     * @return true, iff all lines were written without an I/O error
     */
    private static boolean writeLines(File file, ArrayList<String> lines) {
        PrintWriter out;
        try {
            out = new PrintWriter(file, "UTF-8");
        } catch (IOException e) {
            return false;
        }

        try {
            for (String line : lines) out.println(line);
        } finally {
            out.close();
        }

        // PrintWriter never throws on write; failures (including those raised
        // while closing) are only visible through its error flag
        return !out.checkError();
    }

    /**
     * Adds paragraph tags if missing.
     *
     * <p>Only lines following a line that contains <code>&lt;TEXT&gt;</code>,
     * up to and including the line that contains <code>&lt;/TEXT&gt;</code>,
     * are examined. Missing <code>&lt;P&gt;</code> and <code>&lt;/P&gt;</code>
     * tags are inserted as separate lines in front of the line that reveals
     * them to be missing.</p>
     *
     * @return true, iff the preprocessing was successful
     */
    private static boolean addParagraphTags() {
        File[] files = FileUtils.getFilesRec(dir);

        for (File file : files) {
            // read file content and modify
            MsgPrinter.printStatusMsg("Parsing " + file.getName() + "...");
            ArrayList<String> doc = new ArrayList<String>();
            boolean mod = false;

            try {
                BufferedReader in = new BufferedReader(new FileReader(file));
                try {
                    String line, prevLine = "";
                    // text: inside a <TEXT> section, paragraph: inside an open <P>
                    boolean text = false, paragraph = false;

                    // readLine() returning null is the only reliable end-of-file
                    // signal; ready() merely tells whether a read would block
                    while ((line = in.readLine()) != null) {
                        if (text) {
                            if (paragraph) {
                                if (line.contains("</P>")) {
                                    // paragraph is closed properly
                                    paragraph = false;
                                } else if (line.contains("<P>")) {
                                    // new paragraph opened while the previous
                                    // one is still open: close the previous one
                                    doc.add("</P>");
                                    mod = true;
                                } else if (TAG_ONLY_LINE.matcher(line).matches()) {
                                    // some other tag on a line of its own ends
                                    // the open paragraph
                                    doc.add("</P>");
                                    mod = true;
                                    paragraph = false;
                                } else if (!prevLine.contains("<P>")
                                        && (line.startsWith("\t") || line.startsWith(" "))) {
                                    // an indented line that does not directly
                                    // follow a <P> line starts a new paragraph
                                    doc.add("</P>");
                                    doc.add("<P>");
                                    mod = true;
                                }
                            } else {
                                if (line.contains("<P>")) {
                                    // paragraph is opened properly
                                    paragraph = true;
                                } else if (line.contains("</P>")) {
                                    // closing tag without an opening tag
                                    doc.add("<P>");
                                    mod = true;
                                } else if (!TAG_ONLY_LINE.matcher(line).matches()) {
                                    // content outside of a paragraph: open one
                                    doc.add("<P>");
                                    mod = true;
                                    paragraph = true;
                                }
                            }
                        }

                        // the flag is updated after the line has been examined,
                        // so the <TEXT> line itself is not treated as content
                        // whereas the </TEXT> line still is
                        if (line.contains("<TEXT>")) text = true;
                        if (line.contains("</TEXT>")) text = false;

                        doc.add(line);
                        prevLine = line;
                    }
                } finally {
                    // release the file handle even if reading failed
                    in.close();
                }
            } catch (IOException e) {
                return false;
            }

            // write modified content
            if (mod) {
                if (!writeLines(file, doc)) return false;
                MsgPrinter.printStatusMsg("...modified");
            }
        }

        return true;
    }

    /**
     * Splits paragraphs, e.g. to separate publisher details.
     *
     * <p>After a line that contains <code>&lt;TEXT&gt;</code>, the following
     * lines are split at the first occurrence of " _ " or " -- " and
     * <code>&lt;/P&gt;</code> and <code>&lt;P&gt;</code> lines are inserted
     * between the two parts. Splitting stops after the first examined line
     * that does not contain <code>&lt;P&gt;</code> and resumes only after the
     * next <code>&lt;TEXT&gt;</code> line.</p>
     *
     * @return true, iff the preprocessing was successful
     */
    private static boolean splitParagraphs() {
        File[] files = FileUtils.getFilesRec(dir);

        for (File file : files) {
            // read file content and modify
            MsgPrinter.printStatusMsg("Parsing " + file.getName() + "...");
            ArrayList<String> doc = new ArrayList<String>();
            boolean mod = false;

            try {
                BufferedReader in = new BufferedReader(new FileReader(file));
                try {
                    String line;
                    // true while at the beginning of a <TEXT> section
                    boolean begin = false;

                    // readLine() returning null is the only reliable end-of-file
                    // signal; ready() merely tells whether a read would block
                    while ((line = in.readLine()) != null) {
                        if (begin) {
                            String[] split = PARAGRAPH_DELIMITER.split(line, 2);
                            if (split.length == 2) {
                                // end the paragraph at the delimiter and put
                                // the remainder into a paragraph of its own
                                doc.add(split[0]);
                                doc.add("</P>");
                                doc.add("<P>");
                                doc.add(split[1]);
                                mod = true;
                            } else {
                                doc.add(line);
                            }

                            // keep looking only as long as <P> lines are seen
                            if (!line.contains("<P>")) begin = false;
                        } else {
                            doc.add(line);
                            if (line.contains("<TEXT>")) begin = true;
                        }
                    }
                } finally {
                    // release the file handle even if reading failed
                    in.close();
                }
            } catch (IOException e) {
                return false;
            }

            // write modified content
            if (mod) {
                if (!writeLines(file, doc)) return false;
                MsgPrinter.printStatusMsg("...modified");
            }
        }

        return true;
    }

    /**
     * <p>Entry point of the program.</p>
     *
     * <p>Preprocesses the AQUAINT corpus. Exits with status 1 if the directory
     * argument is missing or if one of the preprocessing steps fails.</p>
     *
     * @param args argument 1: directory of the AQUAINT corpus
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            MsgPrinter.printUsage("java AQUAINTPreprocessor AQUAINT_directory");
            System.exit(1);
        }
        dir = args[0];

        // enable output of status and error messages
        MsgPrinter.enableStatusMsgs(true);
        MsgPrinter.enableErrorMsgs(true);

        // add paragraph tags if missing
        MsgPrinter.printStatusMsg("Adding paragraph tags...");
        if (addParagraphTags()) {
            MsgPrinter.printStatusMsg("Paragraph tags added successfully.");
        } else {
            MsgPrinter.printErrorMsg("Could not add paragraph tags.");
            System.exit(1);
        }

        // split paragraphs
        MsgPrinter.printStatusMsg("Splitting paragraphs...");
        if (splitParagraphs()) {
            MsgPrinter.printStatusMsg("Paragraphs splitted successfully.");
        } else {
            MsgPrinter.printErrorMsg("Could not split paragraphs.");
            System.exit(1);
        }
    }
}