ExtractTopCand.java example

Explorer
relax-decode-master
- third-party
/* This file is part of the Joshua Machine Translation System.
 * 
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */

package joshua.util;

import joshua.util.io.LineReader;
import joshua.util.io.IndexedReader;

import java.io.BufferedWriter;
import java.io.OutputStreamWriter;
import java.io.FileOutputStream;
import java.io.IOException;


/**
 * This program extracts the 1-best output translations from the
 * n-best output translations generated by
 * {@link joshua.decoder.JoshuaDecoder}.
 *
 * @author wren ng thornton <wren@users.sourceforge.net>
 * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $
 */
/* TODO: This class should be renamed, something like ExtractBestCandidates
 * or ExtractBestTranslations. Saying "top" implies more than one
 * (the top how many?) and "cand" is unnecessary abbreviation (also,
 * who cares about candidacy?). Once we rename this, the
 * ./example2/decode_example2.sh script will need updating (as will
 * the end-to-end code)
 */
public class ExtractTopCand {
	
	/**
	 * Usage: <code>java ExtractTopCand nbestInputFile 1bestOutputFile</code>.
	 * <p>
	 * If the input file name is "-" then input is read from
	 * <code>System.in</code>. If the output file name is "-"
	 * then output is directed to <code>System.out</code>. If
	 * a file already exists with the output file name, it is
	 * truncated before writing. The bulk of this program is
	 * implemented by
	 * {@link #extractOneBest(IndexedReader,BufferedWriter)}.
	 * <p>
	 * The process exits with status 1 when the wrong number of
	 * arguments is given or when an I/O error occurs (including
	 * a malformed input line), so that calling scripts can detect
	 * the failure.
	 *
	 * @param args the input file name followed by the output
	 *             file name; either may be "-"
	 */
	public static void main(String[] args) {
		if (2 != args.length) {
			System.err.println("Usage: ExtractTopCand nbestInputFile 1bestOutputFile\n       (use \"-\" for stdin/stdout)");
			System.exit(1);
		}
		
		try {
			// TODO: see documentation for extractOneBest
			// regarding using an n-best SegmentFileParser.
			IndexedReader<String> nbestReader =
				new IndexedReader<String>("line",
					"-".equals(args[0])
						? new LineReader(System.in)
						: new LineReader(args[0]));
			
			/* TODO: This duplicates FileUtility.getWriteFileStream
			 * but with the addition of defaulting to System.out;
			 * should fix that (without breaking other clients
			 * of that method). We ultimately want something which
			 * autochecks for errors (like Writer); has a newLine
			 * method (like BufferedWriter); can wrap System.out;
			 * can autoflush; and it'd be handy to have the
			 * print/println methods of PrintStream/PrintWriter
			 * to boot. PrintWriter *almost* gives us all this,
			 * but it swallows errors and gives no way to
			 * retrieve them >:(
			 */
			BufferedWriter onebestWriter = null;
			try {
				onebestWriter =
					new BufferedWriter(
						new OutputStreamWriter(
							("-".equals(args[1])
								? System.out
								: new FileOutputStream(args[1], false)
							), "UTF-8"));
			} finally {
				// extractOneBest is what normally closes the
				// reader. If the writer could not be created we
				// never get that far, so close the reader here
				// rather than leaking it.
				if (null == onebestWriter) {
					nbestReader.close();
				}
			}
			
			extractOneBest(nbestReader, onebestWriter);
			
		} catch (IOException ioe) {
			// NOTE: if our onebest was System.out, then it may
			// already have been closed by the finally block in
			// extractOneBest. Printing to a closed PrintStream
			// generates no exceptions. We should be printing to
			// System.err anyways, but this is something subtle
			// to be aware of.
			System.err.println("There was an error: " + ioe.getMessage());
			
			// Report the failure through the exit status too,
			// matching the usage-error path above.
			System.exit(1);
		}
	}
	
	
	/**
	 * Prints the one-best translation for each segment ID from
	 * the reader as a line on the writer, and closes both
	 * before exiting (whether normally or by exception). The
	 * translations for a segment are printed in the order of
	 * the first occurrence of the segment ID. Any information
	 * about the segment other than the translation (including
	 * segment ID) is not printed to the writer.
	 * 
	 * <h4>Developer Notes</h4>
	 * This implementation assumes:
	 * <ol>
	 * <li>all translations for a segment are contiguous</li>
	 * <li>the 1-best translation is the first one encountered.</li>
	 * </ol>
	 * We will need to alter the implementation if these
	 * assumptions no longer hold for the output of JoshuaDecoder
	 * (or any sensible n-best format passed to this method).
	 * <p>
	 * We should switch to using an n-best
	 * {@link joshua.decoder.segment_file.SegmentFileParser}
	 * to ensure future compatibility with being able to configure
	 * the output format of the decoder. The MERT code needs
	 * such a SegmentFileParser anyways, so that will reduce
	 * the code duplication between these two classes.
	 *
	 * @param nbestReader   source of n-best lines, each of the
	 *                      form <code>id ||| translation ||| ...</code>
	 * @param onebestWriter destination for the one-best
	 *                      translations, one per line
	 * @throws IOException if a line has no segment ID or fewer
	 *                     than three "|||"-separated columns, or
	 *                     if reading, writing or closing fails
	 */
	protected static void extractOneBest(
		IndexedReader<String> nbestReader, BufferedWriter onebestWriter)
	throws IOException {
		
		try {
			// ID of the segment whose one-best was most recently
			// written; null until the first line has been seen.
			String prevID = null;
			for (String line : nbestReader) {
				
				String[] columns = Regex.threeBarsWithSpace.split(line);
				
				// We allow non-integer segment IDs because the
				// Segment interface does, and we have no reason
				// to add new restrictions.
				String newID = columns[0].trim();
				
				// We want to give the same error message
				// regardless of whether there's a leading space
				// or not. And, we don't want to accidentally
				// accept lines with lots and lots of columns.
				if ("".equals(newID) || newID.startsWith("|||")) {
					throw nbestReader.wrapIOException(
						new IOException("Malformed line, missing segment ID:\n" + line));
				}
				
				// Make sure there's a translation there too
				// TODO: good error message for when the second
				// "|||" doesn't have a following field, m/\|{3}\s*$/
				if (3 > columns.length) {
					throw nbestReader.wrapIOException(
						new IOException("Malformed line, should have at least two \" ||| \":\n" + line));
				}
				
				// Only the first line seen for each run of equal
				// IDs is emitted; later lines for the same segment
				// are the lower-ranked candidates.
				if (null == prevID || ! prevID.equals(newID)) {
					onebestWriter.write(columns[1], 0, columns[1].length());
					onebestWriter.newLine();
					// Flush per translation so output appears
					// promptly when the writer is a pipe/stdout.
					onebestWriter.flush();
					
					prevID = newID;
				}
			}
		} finally {
			// Nested so the writer is closed even when closing
			// the reader throws.
			try {
				nbestReader.close();
			} finally {
				onebestWriter.close();
			}
		}
	}
}