// AQUAINT2Preprocessor.java example
//
// Explorer
// lucida-master
// - lucida
package info.ephyra.indexing;

import info.ephyra.io.MsgPrinter;
import info.ephyra.util.FileUtils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A preprocessor for the AQUAINT-2 corpus:
 * <ul>
 * <li>adds paragraph tags for document types 'multi', 'advis' and 'other'</li>
 * <li>converts documents to 'trectext' format required by Indri</li>
 * </ul>
 * 
 * @author Nico Schlaefer
 * @version 2007-07-14
 */
public class AQUAINT2Preprocessor {
	/** Character encoding used for reading and writing the corpus files. */
	private static final String ENCODING = "UTF-8";
	
	/**
	 * Finds the opening tag of a document; group 1 is the value of the 'id'
	 * attribute, group 2 the value of the 'type' attribute.
	 */
	private static final Pattern DOC_TAG =
		Pattern.compile("\\s*+<DOC\\s*+id=\"([^\"]*+)\"" +
				"\\s*+type=\"([^\"]*+)\"\\s*+>\\s*+");
	
	/** Matches a line that is empty or consists of whitespace only. */
	private static final Pattern BLANK_LINE = Pattern.compile("\\s*+");
	
	/** Matches a line that consists of a single tag and nothing else. */
	private static final Pattern TAG_LINE =
		Pattern.compile("\\s*+<[^>]++>\\s*+");
	
	/**
	 * Matches the lines that are dropped by the 'trectext' conversion: the
	 * XML declaration, the DOCTYPE declaration and the DOCSTREAM tags.
	 */
	private static final Pattern DROPPED_LINE =
		Pattern.compile("\\s*+(?:<\\?xml .*+|<!DOCTYPE .*+|" +
				"<DOCSTREAM>\\s*+|</DOCSTREAM>\\s*+)");
	
	/** Matches a line that consists of an opening HEADLINE tag. */
	private static final Pattern HEADLINE_OPEN =
		Pattern.compile("\\s*+<HEADLINE>\\s*+");
	
	/** Matches a line that consists of a closing HEADLINE tag. */
	private static final Pattern HEADLINE_CLOSE =
		Pattern.compile("\\s*+</HEADLINE>\\s*+");
	
	/** Directory of the AQUAINT corpus */
	private static String dir;
	
	/**
	 * Adds paragraph tags to documents of type 'multi', 'advis' and 'other'.
	 * Documents of type 'story' are usually already tagged.
	 * 
	 * @return true, iff the preprocessing was successful
	 */
	private static boolean addParagraphTags() {
		return rewriteCorpus(true);
	}
	
	/**
	 * Converts the documents to the 'trectext' format required by Indri.
	 * 
	 * @return true, iff the preprocessing was successful
	 */
	private static boolean convertToTrectext() {
		return rewriteCorpus(false);
	}
	
	/**
	 * Rewrites every data file below the corpus directory in place. Files
	 * whose name contains a '.' are skipped. Processing stops at the first
	 * file that cannot be read or written.
	 * 
	 * @param paragraphPass <code>true</code> to add paragraph tags,
	 *                      <code>false</code> to convert to 'trectext'
	 * @return true, iff all data files were rewritten successfully
	 */
	private static boolean rewriteCorpus(boolean paragraphPass) {
		File[] files = FileUtils.getFilesRec(dir);
		
		for (File file : files) {
			// only parse data files
			if (file.getName().contains(".")) {
				MsgPrinter.printStatusMsg("Ignoring " + file.getPath() + ".\n");
				continue;
			}
			
			MsgPrinter.printStatusMsg("Parsing " + file.getName() + "...");
			
			try {
				// read file content and modify
				ArrayList<String> doc;
				BufferedReader in = openReader(file);
				try {
					doc = paragraphPass ? tagParagraphs(in) : toTrectext(in);
				} finally {
					// release the file handle even if reading failed
					in.close();
				}
				
				// write modified content
				writeLines(file, doc);
			} catch (IOException e) {
				// report the cause instead of failing silently
				MsgPrinter.printErrorMsg("Could not process " +
						file.getPath() + ": " + e.getMessage());
				return false;
			}
			
			MsgPrinter.printStatusMsg("...parsed.\n");
		}
		
		return true;
	}
	
	/**
	 * Opens a file for buffered, UTF-8 decoded reading.
	 * 
	 * @param file the file to read
	 * @return reader on the file, to be closed by the caller
	 * @throws IOException if the file cannot be opened
	 */
	private static BufferedReader openReader(File file) throws IOException {
		FileInputStream fis = new FileInputStream(file);
		try {
			return new BufferedReader(new InputStreamReader(fis, ENCODING));
		} catch (IOException e) {
			// do not leak the stream if the reader cannot be created
			fis.close();
			throw e;
		}
	}
	
	/**
	 * Overwrites a file with the given lines, UTF-8 encoded, one per line.
	 * 
	 * @param file  the file to overwrite
	 * @param lines the new content of the file
	 * @throws IOException if the file cannot be opened or written completely
	 */
	private static void writeLines(File file, ArrayList<String> lines)
			throws IOException {
		FileOutputStream fos = new FileOutputStream(file);
		try {
			PrintWriter out =
				new PrintWriter(new OutputStreamWriter(fos, ENCODING));
			for (String line : lines) out.println(line);
			
			// PrintWriter never throws on write errors; checkError() flushes
			// the stream and reports whether any write has failed
			if (out.checkError())
				throw new IOException("Failed to write " + file.getPath());
		} finally {
			fos.close();
		}
	}
	
	/**
	 * Reads documents line by line and inserts missing opening and closing
	 * paragraph tags inside of TEXT elements. Blank lines are not copied to
	 * the result. A status message is printed for each modified document
	 * whose type is not 'multi', 'advis' or 'other'.
	 * 
	 * @param in reader on the original content
	 * @return lines of the modified content
	 * @throws IOException if the content cannot be read
	 */
	private static ArrayList<String> tagParagraphs(BufferedReader in)
			throws IOException {
		ArrayList<String> doc = new ArrayList<String>();
		String docNo = null, docType = null;
		// text: inside a TEXT element, paragraph: inside a paragraph,
		// mod: tags have been inserted into the current TEXT element
		boolean text = false, paragraph = false, mod = false;
		
		String line;
		// readLine() returns null at the end of the stream
		while ((line = in.readLine()) != null) {
			// last line that was written to the result so far
			String prevLine = doc.isEmpty() ? "" : doc.get(doc.size() - 1);
			
			Matcher m = DOC_TAG.matcher(line);
			if (m.find()) {
				docNo = m.group(1);
				docType = m.group(2);
			}
			
			boolean blank = BLANK_LINE.matcher(line).matches();
			
			if (text) {
				boolean tagLine = TAG_LINE.matcher(line).matches();
				
				if (paragraph) {
					if (line.contains("</P>")) {
						paragraph = false;
					} else if (line.contains("<P>")) {
						// new paragraph starts, previous one was not closed
						doc.add("</P>");
						mod = true;
					} else if (tagLine) {
						// some other tag ends the open paragraph
						doc.add("</P>");
						mod = true;
						paragraph = false;
					} else if (!prevLine.contains("<P>") && blank) {
						// blank line separates two paragraphs, unless the
						// current paragraph has just been opened
						doc.add("</P>");
						doc.add("<P>");
						mod = true;
					}
				} else {
					if (line.contains("<P>")) {
						paragraph = true;
					} else if (line.contains("</P>")) {
						// closing tag without a matching opening tag
						doc.add("<P>");
						mod = true;
					} else if (!tagLine) {
						// anything but a single tag opens a paragraph
						doc.add("<P>");
						mod = true;
						paragraph = true;
					}
				}
			}
			
			// the TEXT tags are evaluated after the paragraph handling: the
			// opening tag itself is not treated as text, the closing tag is
			if (line.contains("<TEXT>")) text = true;
			if (line.contains("</TEXT>")) {
				// print message if 'story' document was modified; constants
				// first, as no DOC tag may have been seen yet
				if (mod && !("multi".equals(docType) ||
						"advis".equals(docType) ||
						"other".equals(docType)))
					MsgPrinter.printStatusMsg("Document " + docNo +
							" of type '" + docType + "' modified.");
				mod = false;
				text = false;
			}
			
			if (!blank) doc.add(line);
		}
		
		return doc;
	}
	
	/**
	 * Reads documents line by line and converts them to 'trectext': the XML
	 * declaration, the DOCTYPE declaration and the DOCSTREAM tags are dropped,
	 * HEADLINE tags become TITLE tags and each opening DOC tag is replaced by
	 * a plain DOC tag followed by DOCNO and DOCTYPE elements.
	 * 
	 * @param in reader on the original content
	 * @return lines of the converted content
	 * @throws IOException if the content cannot be read
	 */
	private static ArrayList<String> toTrectext(BufferedReader in)
			throws IOException {
		ArrayList<String> doc = new ArrayList<String>();
		
		String line;
		// readLine() returns null at the end of the stream
		while ((line = in.readLine()) != null) {
			if (DROPPED_LINE.matcher(line).matches()) {
				System.out.println("Dropping line: " + line);
			} else if (HEADLINE_OPEN.matcher(line).matches()) {
				doc.add("<TITLE>");
			} else if (HEADLINE_CLOSE.matcher(line).matches()) {
				doc.add("</TITLE>");
			} else {
				Matcher m = DOC_TAG.matcher(line);
				if (m.find()) {
					// attributes of the DOC tag become separate elements
					doc.add("<DOC>");
					doc.add("<DOCNO>" + m.group(1) + "</DOCNO>");
					doc.add("<DOCTYPE>" + m.group(2) + "</DOCTYPE>");
				} else {
					doc.add(line);
				}
			}
		}
		
		return doc;
	}
	
	/**
	 * <p>Entry point of the program.</p>
	 * 
	 * <p>Preprocesses the AQUAINT-2 corpus: first adds missing paragraph tags,
	 * then converts the documents to 'trectext'. Exits with status 1 if the
	 * directory argument is missing or one of the two steps fails.</p>
	 * 
	 * @param args argument 1: directory of the AQUAINT-2 corpus
	 */
	public static void main(String[] args) {
		if (args.length < 1) {
			MsgPrinter.printUsage("java AQUAINT2Preprocessor " +
					"AQUAINT2_directory");
			System.exit(1);
		}
		dir = args[0];
		
		// enable output of status and error messages
		MsgPrinter.enableStatusMsgs(true);
		MsgPrinter.enableErrorMsgs(true);
		
		// add paragraph tags if missing
		MsgPrinter.printStatusMsg("Adding paragraph tags:\n");
		if (addParagraphTags()) {
			MsgPrinter.printStatusMsg("Paragraph tags added successfully.\n");
		} else {
			MsgPrinter.printErrorMsg("Could not add paragraph tags.");
			System.exit(1);
		}
		
		// convert to 'trectext'
		MsgPrinter.printStatusMsg("Converting to 'trectext' format:\n");
		if (convertToTrectext()) {
			MsgPrinter.printStatusMsg("Documents converted successfully.");
		} else {
			MsgPrinter.printErrorMsg("Could not convert documents.");
			System.exit(1);
		}
	}
}