package info.ephyra.indexing; import info.ephyra.io.MsgPrinter; import info.ephyra.util.FileUtils; import info.ephyra.util.HTMLConverter; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.util.ArrayList; /** * A preprocessor for the Blog06 corpus: * <ul> * <li>converts HTML to plain text</li> * <li>splits documents into paragraphs along structuring HTML tags</li> * </ul> * * @author Nico Schlaefer * @version 2007-05-16 */ public class Blog06Preprocessor { /** * Converts a single file. * * @param input corpus file */ private static void convertFile(File input) { MsgPrinter.printStatusMsg("Parsing " + input.getName() + "..."); // read document ArrayList<String> headers = new ArrayList<String>(); ArrayList<String> contents = new ArrayList<String>(); try { FileInputStream fis = new FileInputStream(input); BufferedReader reader = new BufferedReader(new InputStreamReader(fis, "UTF-8")); boolean content = false; while (reader.ready()) { if (content == false) { // read header StringBuilder sb = new StringBuilder(); while (reader.ready()) { String line = reader.readLine(); sb.append(line + "\n"); if (line.matches("\\s*?</DOCHDR>\\s*+")) break; } headers.add(sb.toString()); content = true; } else { // read content StringBuilder sb = new StringBuilder(); while (reader.ready()) { String line = reader.readLine(); sb.append(line + "\n"); if (line.matches("\\s*?</DOC>\\s*+")) break; } contents.add(sb.toString()); content = false; } } reader.close(); if (headers.size() == 0 || contents.size() == 0 || headers.size() != contents.size()) { MsgPrinter.printErrorMsg(input.getName() + " is malformatted."); System.exit(1); } } catch (IOException e) { MsgPrinter.printErrorMsg("Could not read from " + input.getName() + "."); System.exit(1); } // convert contents to plain text for (int i = 0; i < contents.size(); i++) { String text = HTMLConverter.html2text(contents.get(i)); if (text == null) { MsgPrinter.printErrorMsg(input.getName() + " could not be parsed."); System.exit(1); } contents.set(i, text); } // add paragraph tags along new lines for (int i = 0; i < contents.size(); i++) { String[] lines = contents.get(i).split("\\n"); StringBuilder sb = new StringBuilder(); for (int j = 0; j < lines.length; j++) sb.append("<P>" + lines[j] + "</P>\n"); contents.set(i, sb.toString()); } // write header and modified content File output = new File(input.getPath() + ".parsed"); try { FileOutputStream fos = new FileOutputStream(output); PrintWriter writer = new PrintWriter(new OutputStreamWriter(fos, "UTF-8")); for (int i = 0; i < headers.size(); i++) { writer.print(headers.get(i)); writer.print(contents.get(i)); writer.print("</DOC>\n"); } writer.close(); } catch (IOException e) { MsgPrinter.printErrorMsg("Could not write to " + output.getName() + "."); System.exit(1); } } /** * <p>Entry point of the program.</p> * * <p>Preprocesses the Blog06 corpus.</p> * * @param args argument 1: directory of the Blog06 corpus */ public static void main(String[] args) { if (args.length < 1) { MsgPrinter.printUsage("java Blog06Preprocessor Blog06_directory"); System.exit(1); } String dir = args[0]; // enable output of status and error messages MsgPrinter.enableStatusMsgs(true); MsgPrinter.enableErrorMsgs(true); // convert all content files in the corpus directory File[] files = FileUtils.getFilesRec(dir); for (File file : files) { String filename = file.getName(); // file must not be hidden and must end in "-" followed by a number if (!filename.startsWith(".") && filename.matches(".*?-\\d++")) convertFile(file); } MsgPrinter.printStatusMsg("...done."); } }