package info.ephyra.nlp.semantics;
import info.ephyra.io.MsgPrinter;
import info.ephyra.util.FileCache;
import info.ephyra.util.RegexConverter;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* A Wrapper for the ASSERT semantic role labeler.
*
* @author Andy Schlaikjer, Nico Schlaefer
* @version 2007-04-25
*/
public class ASSERT {
    /** Directory of ASSERT, read from the <code>ASSERT</code> environment variable. */
    private static final String ASSERT_DIR = System.getenv("ASSERT");
    /** Run ASSERT in client mode. */
    private static final boolean ASSERT_CLIENT_MODE = false;
    /** Enable caching of parses. */
    private static final boolean CACHING = true;
    /** Cache directory where parses are stored. */
    private static final String CACHE_DIR = "cache/assert";
    /** Pattern for extracting parses from ASSERT's output file ("sentenceId: parse"). */
    private static final Pattern PARSE_P = Pattern.compile("(\\d++): (.*+)");

    /**
     * Creates a temporary file containing the sentences to be processed by ASSERT,
     * one sentence per line, encoded in ISO-8859-1 (the encoding ASSERT expects).
     *
     * @param ss sentences to be parsed
     * @return input file
     * @throws Exception if the file could not be created or written
     */
    private static File createInputFile(String[] ss) throws Exception {
        try {
            // the file is created in ASSERT's scripts directory and is
            // intentionally NOT registered for deleteOnExit, so it can be
            // inspected for debugging
            File input = File.createTempFile("assert", ".input",
                    new File(ASSERT_DIR + "/scripts"));

            PrintWriter pw = new PrintWriter(new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(input), "ISO-8859-1")));
            try {
                for (String sentence : ss) {
                    pw.println(sentence);
                    // PrintWriter swallows IOExceptions, so poll the error flag
                    if (pw.checkError()) throw new IOException("Write error.");
                }
            } finally {
                pw.close();
            }
            if (pw.checkError()) throw new IOException("Write error on close.");

            return input;
        } catch (IOException e) {
            IOException ioe = new IOException("Failed to create input file.");
            ioe.initCause(e);  // preserve the cause for diagnosis
            throw ioe;
        }
    }

    /**
     * Instantiates an ASSERT process on the supplied input file and blocks until
     * it terminates. ASSERT's stdout and stderr are redirected to a log file
     * that shares the input file's base name.
     *
     * @param input an input file initialized previously with a call to
     *        <code>createInputFile()</code>
     * @return log file capturing ASSERT's output
     * @throws Exception if the process could not be started, was interrupted or
     *         returned a non-zero exit code
     */
    private static File execAssertProcess(File input) throws Exception {
        String basename = input.getCanonicalPath();
        basename = basename.substring(0, basename.lastIndexOf('.'));
        // the log file is intentionally kept after the JVM exits (debugging)
        File logf = new File(basename + ".log");

        // instantiate an annotator process
        Process process;
        try {
            // in order to pipe stdout and stderr from our subprocess, we start
            // an instance of bash and then exec assert from within this shell.
            process = Runtime.getRuntime().exec("bash");
            PrintWriter pw = new PrintWriter(process.getOutputStream());

            // assert generates its intermediate and final output files within
            // the current working directory, so we first must "cd" to the
            // directory we want output to be stored, making sure to test error
            // status of the "cd" operation.
            String cmd = "cd " + input.getParent() + " 2> "
                    + logf.getCanonicalPath()
                    + "; rv=\"$?\"; if [ \"$rv\" -ne \"0\" ]; then exit $rv; fi; "
                    + ASSERT_DIR + "/scripts/assert "
                    + (ASSERT_CLIENT_MODE ? "--mode=client " : "")
                    + input.getCanonicalPath() + " >> " + logf.getCanonicalPath()
                    + " 2>&1; exit";
            pw.println(cmd);

            // close stdin of the subprocess for good measure: we don't want the
            // subprocess thinking we've got something more to say to it.
            // stdout/stderr are already redirected to the log file by the shell
            // command, so no helper threads are needed to drain them.
            pw.close();
        } catch (Exception e) {
            throw new Exception("Failed to execute annotator process.", e);
        }

        // block until the annotator process finishes
        int rv;
        try {
            rv = process.waitFor();
        } catch (InterruptedException e) {
            // restore the interrupt status and kill the subprocess before
            // propagating the failure
            Thread.currentThread().interrupt();
            process.destroy();
            throw new Exception("Process timed out.");
        }

        // test return value
        if (rv != 0) throw new Exception("Process returned error code: " + rv);

        return logf;
    }

    /**
     * Reads the annotated sentences from the output file created by ASSERT.
     * Sentences that could not be parsed are represented by empty arrays.
     *
     * @param input an input file initialized previously with a call to
     *        <code>createInputFile()</code>
     * @param sentCount number of sentences that have been passed to ASSERT
     * @return annotated sentences, one array of parses per sentence
     * @throws Exception if the output file is missing or malformatted
     */
    private static String[][] readOutputFile(File input, int sentCount) throws Exception {
        try {
            // the output file shares the input file's base name
            String basename = input.getCanonicalPath();
            basename = basename.substring(0, basename.lastIndexOf('.'));
            File output = new File(basename + ".parses");

            BufferedReader br = new BufferedReader(new InputStreamReader(
                    new FileInputStream(output), "ISO-8859-1"));
            String[][] ass = new String[sentCount][];
            try {
                ArrayList<String> as = new ArrayList<String>();
                String line;
                int previous = -1;
                while ((line = br.readLine()) != null) {
                    Matcher parseM = PARSE_P.matcher(line);
                    if (parseM.find()) {
                        int sid = Integer.parseInt(parseM.group(1));
                        String annotation = parseM.group(2).trim();

                        // a new sentence id terminates the previous sentence's
                        // list of parses
                        if (sid != previous) {
                            if (as.size() > 0)
                                ass[previous] = as.toArray(new String[as.size()]);
                            as = new ArrayList<String>();
                            previous = sid;
                        }
                        as.add(annotation);
                    } else {
                        // ASSERT prints this single diagnostic for empty input;
                        // anything else unparseable is an error
                        if (!line.equals("ERROR: Found an empty input file... exiting."))
                            throw new IOException("Malformatted line: " + line);
                    }
                }
                if (as.size() > 0) ass[previous] = as.toArray(new String[as.size()]);
            } finally {
                // close the reader even if a line was malformatted
                br.close();
            }

            // empty array for sentences that could not be parsed
            for (int i = 0; i < ass.length; i++)
                if (ass[i] == null) ass[i] = new String[0];
            return ass;
        } catch (IOException e) {
            IOException ioe = new IOException("Failed to parse output file.");
            ioe.initCause(e);  // preserve the cause for diagnosis
            throw ioe;
        }
    }

    /**
     * Checks the log file for ASSERT failures. Returns <code>Integer.MAX_VALUE</code> if ASSERT successfully parsed the
     * sentences or the index of the last sentence that was parsed if ASSERT failed. -1 indicates that no sentence could
     * be parsed.
     *
     * @param logf log file
     * @return <code>Integer.MAX_VALUE</code> or index of last sentence that was parsed
     */
    private static int checkLogFile(File logf) {
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(
                    new FileInputStream(logf), "ISO-8859-1"));
            try {
                int lastIndex = -1;
                Pattern p = Pattern.compile("^(\\d++): ");
                while (br.ready()) {
                    String line = br.readLine();
                    Matcher m = p.matcher(line);
                    // track the id of the last sentence ASSERT reported on
                    if (m.find()) lastIndex = Integer.parseInt(m.group(1));
                    // this trace only appears in the log when ASSERT crashed
                    if (line.contains(" DOMAIN/FRAME/"))
                        return lastIndex;
                }
                return Integer.MAX_VALUE; // log file looks ok
            } finally {
                br.close();
            }
        } catch (IOException e) {
            return -1; // log file cannot be read
        }
    }

    /**
     * Annotates the predicates in an array of sentences. Parses are cached on
     * disk if <code>CACHING</code> is enabled; ASSERT is only invoked for
     * sentences missing from the cache and is restarted if it crashes.
     *
     * @param ss sentences to be parsed (note: the elements of this array are
     *        normalized in place before parsing)
     * @return annotated sentences; an empty array for each sentence that could
     *         not be parsed
     */
    public static String[][] annotatePredicates(String[] ss) {
        // drop special characters that ASSERT cannot handle
        Pattern p = Pattern.compile(".++");
        for (int i = 0; i < ss.length; i++) {
            // StringBuilder avoids quadratic string concatenation
            StringBuilder noSpecChar = new StringBuilder();
            Matcher m = p.matcher(ss[i]);
            while (m.find()) noSpecChar.append(" ").append(m.group(0));
            ss[i] = noSpecChar.toString().trim();
        }

        // if caching is enabled, try to read parses from cache
        String[][] allParses = new String[ss.length][]; // parses from both cache and ASSERT
        ArrayList<Integer> originalIndices = new ArrayList<Integer>(); // used to merge parses from cache and ASSERT
        if (CACHING) {
            FileCache cache = new FileCache(CACHE_DIR);
            ArrayList<String> notInCache = new ArrayList<String>(); // sentences that are not in the cache
            for (int i = 0; i < ss.length; i++) {
                String[] parses = cache.read(ss[i]);
                if (parses != null)
                    allParses[i] = parses;
                else {
                    notInCache.add(ss[i]);
                    originalIndices.add(i);
                }
            }
            // only the sentences missing from the cache go to ASSERT
            ss = notInCache.toArray(new String[notInCache.size()]);
        }

        // get missing parses from ASSERT
        String[][] parses = new String[ss.length][];
        if (ss.length > 0 && ASSERT_DIR != null && ASSERT_DIR.length() > 0) {
            try {
                MsgPrinter.printStatusMsgTimestamp("Parsing " + ss.length + " sentences with ASSERT...");

                int beginIndex = 0;
                while (beginIndex < ss.length) { // restart ASSERT if it crashed
                    // copy sentences that have not been parsed yet
                    String[] sentences = new String[ss.length - beginIndex];
                    for (int i = 0; i < sentences.length; i++)
                        sentences[i] = ss[i + beginIndex];

                    // parse these sentences
                    File input = createInputFile(sentences);
                    File logf = execAssertProcess(input);
                    String[][] output = readOutputFile(input, ss.length);

                    // determine how far ASSERT got before crashing (if at all)
                    int lastIndex = checkLogFile(logf);
                    if (lastIndex == Integer.MAX_VALUE) {
                        // all remaining sentences were parsed
                        lastIndex = sentences.length - 1;
                    } else {
                        // ASSERT crashed (or its log could not be read); if it
                        // failed before parsing the first sentence (lastIndex
                        // == -1), blame that first sentence and skip it —
                        // otherwise beginIndex would never advance and ASSERT
                        // would be restarted with the same input forever
                        if (lastIndex < 0) lastIndex = 0;
                        MsgPrinter.printErrorMsg("ASSERT could not parse sentence:\n" + sentences[lastIndex]);
                        output[lastIndex] = null;
                    }

                    // merge parses in one array
                    lastIndex = beginIndex + lastIndex;
                    for (int i = beginIndex; i <= lastIndex; i++)
                        parses[i] = output[i - beginIndex];
                    beginIndex = lastIndex + 1;
                }

                MsgPrinter.printStatusMsgTimestamp("...done");
            } catch (Exception e) {
                // log the full exception: getMessage() alone may be null
                MsgPrinter.printErrorMsg("\nCould not call ASSERT:\n" + e);
                System.exit(1);
            }
        }

        // if caching is enabled, write new parses to cache and merge parses from cache and ASSERT
        if (CACHING) {
            FileCache cache = new FileCache(CACHE_DIR);
            for (int i = 0; i < parses.length; i++) {
                if (parses[i] != null) cache.write(ss[i], parses[i]); // write to cache
                allParses[originalIndices.get(i)] = parses[i]; // merge with results from cache
            }
        } else {
            allParses = parses;
        }

        // return an empty array for sentences that could not be parsed
        for (int i = 0; i < allParses.length; i++)
            if (allParses[i] == null) allParses[i] = new String[0];
        return allParses;
    }

    /**
     * This untokenizer is tailored for ASSERT. It does not only remove abundant blanks but it also tries to restore
     * special characters that have been dropped by ASSERT.
     *
     * @param text text to untokenize
     * @param original string that contains the original text as a subsequence
     * @return subsequence of the original string or <code>null</code>, iff there is no such subsequence
     */
    public static String untokenize(String text, String original) {
        // try with boundary matchers: blanks inserted by the tokenizer may
        // correspond to any run of non-word characters in the original
        String regex = RegexConverter.strToRegexWithBounds(text);
        regex = regex.replace(" ", "\\W*");
        Matcher m = Pattern.compile(regex).matcher(original);
        if (m.find()) return m.group(0);

        // try without boundary matchers
        regex = RegexConverter.strToRegex(text);
        regex = regex.replace(" ", "\\W*");
        m = Pattern.compile(regex).matcher(original);
        if (m.find()) return m.group(0);

        // untokenization failed
        return null;
    }
}