Compile.java example

Explorer
relax-decode-master
- third-party
/* This file is part of the Joshua Machine Translation System.
 * 
 * Joshua is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free
 * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA
 */
package joshua.corpus.suffix_array;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutput;
import java.io.ObjectOutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.util.Scanner;
import java.util.logging.Level;
import java.util.logging.Logger;

import joshua.corpus.AlignedParallelCorpus;
import joshua.corpus.CorpusArray;
import joshua.corpus.ParallelCorpus;
import joshua.corpus.alignment.AlignmentGrids;
import joshua.corpus.lexprob.LexProbs;
import joshua.corpus.lexprob.LexicalProbabilities;
import joshua.corpus.vocab.Vocabulary;
import joshua.decoder.JoshuaConfiguration;
import joshua.util.Cache;
import joshua.util.io.BinaryOut;

/**
 * Compiles a parallel corpus into binary data files.
 * <p>
 * Given a source corpus, a target corpus and an alignments file, 
 * {@link #execute()} writes the following files into the output 
 * directory: <code>README.txt</code>, <code>common.vocab</code>, 
 * <code>source.corpus</code>, <code>target.corpus</code>, 
 * <code>alignment.grids</code>, <code>lexicon.counts</code>, 
 * <code>lexprobs.txt</code>, <code>target.suffixes</code>, 
 * <code>source.suffixes</code> and <code>frequentPhrases</code>.
 * <p>
 * The source corpus, target corpus, alignments file and output 
 * directory must be set before calling {@link #execute()}.
 * 
 * @author Lane Schwartz
 * @version $LastChangedDate: 2010-02-11 15:47:37 -0600 (Thu, 11 Feb 2010) $
 */
public class Compile {

	/** Logger for this class. */
	private static final Logger logger =
		Logger.getLogger(Compile.class.getName());
	
	
	/** Path to the source language corpus file. */
	private String sourceCorpusFileName;
	
	/** Path to the target language corpus file. */
	private String targetCorpusFileName;
	
	/** Path to the source-target alignments file. */
	private String alignmentsFileName;
	
	/** Path to the directory into which all output files are written. */
	private String outputDirName;
	
	/** 
	 * Character encoding handed to the vocabulary 
	 * and used when writing <code>lexprobs.txt</code>. 
	 */
	private String charset = "UTF-8";
	
	/** Minimum frequency passed to {@link FrequentPhrases}. */
	private int minFrequency = 0;
	
	/** Maximum number of phrases passed to {@link FrequentPhrases}. */
	private short maxPhrases = 100;

	/** Maximum phrase length passed to {@link FrequentPhrases}. */
	private int maxPhraseLength = JoshuaConfiguration.sa_max_phrase_length;
	
	/** Maximum phrase span passed to {@link FrequentPhrases}. */
	private int maxPhraseSpan = JoshuaConfiguration.sa_max_phrase_span;
	
	/** Minimum nonterminal span passed to {@link FrequentPhrases}. */
	private int minNonterminalSpan = JoshuaConfiguration.sa_min_nonterminal_span;
	
	/**
	 * Sets the minimum nonterminal span 
	 * used when precomputing frequent phrases.
	 * 
	 * @param minNonterminalSpan minimum nonterminal span
	 */
	public void setMinNonterminalSpan(int minNonterminalSpan) {
		this.minNonterminalSpan = minNonterminalSpan;
	}
	
	/**
	 * Sets the maximum phrase span 
	 * used when precomputing frequent phrases.
	 * 
	 * @param maxPhraseSpan maximum phrase span
	 */
	public void setMaxPhraseSpan(int maxPhraseSpan) {
		this.maxPhraseSpan = maxPhraseSpan;
	}
	
	/**
	 * Sets the minimum frequency 
	 * used when precomputing frequent phrases.
	 * 
	 * @param minFrequency minimum frequency
	 */
	public void setMinFrequency(int minFrequency) {
		this.minFrequency = minFrequency;
	}
	
	/**
	 * Sets the maximum number of phrases 
	 * used when precomputing frequent phrases.
	 * 
	 * @param maxPhrases maximum number of phrases
	 */
	public void setMaxPhrases(short maxPhrases) {
		this.maxPhrases = maxPhrases;
	}
	
	/**
	 * Sets the maximum phrase length 
	 * used when precomputing frequent phrases.
	 * 
	 * @param maxPhraseLength maximum phrase length
	 */
	public void setMaxPhraseLength(int maxPhraseLength) {
		this.maxPhraseLength = maxPhraseLength;
	}
	
	/**
	 * Sets the path of the source language corpus file.
	 * 
	 * @param sourceCorpusFileName path to the source corpus
	 */
	public void setSourceCorpus(String sourceCorpusFileName) {
		this.sourceCorpusFileName = sourceCorpusFileName;
	}
	
	/**
	 * Sets the path of the target language corpus file.
	 * 
	 * @param targetCorpusFileName path to the target corpus
	 */
	public void setTargetCorpus(String targetCorpusFileName) {
		this.targetCorpusFileName = targetCorpusFileName;
	}
	
	/**
	 * Sets the path of the alignments file.
	 * 
	 * @param alignmentsFileName path to the alignments file
	 */
	public void setAlignments(String alignmentsFileName) {
		this.alignmentsFileName = alignmentsFileName;
	}
	
	/**
	 * Sets the directory into which output files are written.
	 * By convention the name ends in <code>.josh</code>.
	 * 
	 * @param outputDirName path to the output directory
	 */
	public void setOutputDir(String outputDirName) {
		this.outputDirName = outputDirName;
	}
	
	/**
	 * Sets the character encoding (defaults to UTF-8).
	 * 
	 * @param charset name of the character encoding
	 */
	public void setEncoding(String charset) {
		this.charset = charset;
	}
	
	/**
	 * Compiles the configured corpus and writes all output files.
	 * <p>
	 * The JVM is terminated via <code>System.exit</code> 
	 * if the output directory cannot be created (-1), 
	 * if the output path exists but is not a directory (-2), 
	 * or if the source and target corpora 
	 * have different numbers of sentences (-3).
	 * 
	 * @throws IOException if reading an input file 
	 *         or writing an output file fails
	 * @throws IllegalStateException if the source corpus, 
	 *         target corpus, alignments file, or output directory 
	 *         has not been set
	 */
	public void execute() throws IOException {
		
		// Fail early, with a clear message, if a required property is missing
		requireConfigured(sourceCorpusFileName, "Source corpus");
		requireConfigured(targetCorpusFileName, "Target corpus");
		requireConfigured(alignmentsFileName, "Alignments file");
		requireConfigured(outputDirName, "Output directory");
		
		prepareOutputDirectory();
		
		// Construct common vocabulary
		Vocabulary symbolTable = new Vocabulary();
		if (logger.isLoggable(Level.INFO)) {
			logger.info("Adding terminal tokens from file " + sourceCorpusFileName + " to common vocabulary");
		}
		int[] sourceLengths = Vocabulary.initializeVocabulary(sourceCorpusFileName, symbolTable, false);
		if (logger.isLoggable(Level.INFO)) {
			logger.info("Adding terminal tokens from file " + targetCorpusFileName + " to common vocabulary");
		}
		int[] targetLengths = Vocabulary.initializeVocabulary(targetCorpusFileName, symbolTable, true);
		
		// Index 1 of the returned arrays is compared as the sentence count
		if (sourceLengths[1] != targetLengths[1]) {
			logger.severe("Source corpus and target corpus have different number of sentences (" + sourceLengths[1] + " vs " + targetLengths[1] + ")");
			System.exit(-3);
		}
		int numberOfSentences = sourceLengths[1];
		
		// The README is written incrementally as each file is produced
		String readmeFilename = outputPath("README.txt");
		PrintStream out = new PrintStream(readmeFilename);
		try {
			out.println("This directory contains the following binary files:");
			out.println();
			
			writeVocabulary(symbolTable, out);
			
			// Construct and write source language corpus
			if (logger.isLoggable(Level.INFO)) {
				logger.info("Constructing corpus array from file " + sourceCorpusFileName);
			}
			CorpusArray sourceCorpusArray = SuffixArrayFactory.createCorpusArray(sourceCorpusFileName, symbolTable, sourceLengths[0], sourceLengths[1]);
			
			String binarySourceCorpusFilename = outputPath("source.corpus");
			if (logger.isLoggable(Level.INFO)) {
				logger.info("Writing binary source corpus to disk at " + binarySourceCorpusFilename);
			}
			writeCorpusArray(sourceCorpusArray, binarySourceCorpusFilename);
			out.println("Source language corpus: " + binarySourceCorpusFilename);
			
			// Construct and write target language corpus
			if (logger.isLoggable(Level.INFO)) {
				logger.info("Constructing corpus array from file " + targetCorpusFileName);
			}
			CorpusArray targetCorpusArray = SuffixArrayFactory.createCorpusArray(targetCorpusFileName, symbolTable, targetLengths[0], targetLengths[1]);
			
			String binaryTargetCorpusFilename = outputPath("target.corpus");
			if (logger.isLoggable(Level.INFO)) {
				logger.info("Writing binary target corpus to disk at " + binaryTargetCorpusFilename);
			}
			writeCorpusArray(targetCorpusArray, binaryTargetCorpusFilename);
			out.println("Target language corpus: " + binaryTargetCorpusFilename);
			
			// Each remaining step lives in its own method so that its 
			// large intermediate data structures go out of scope 
			// before the next step starts.
			writeAlignmentsAndLexProbs(sourceCorpusArray, targetCorpusArray, numberOfSentences, out);
			writeTargetSuffixArray(targetCorpusArray, out);
			writeSourceSuffixArrayAndFrequentPhrases(sourceCorpusArray, out);
			
			out.flush();
			
			// PrintStream never throws on write failure; it only sets an error flag
			if (out.checkError()) {
				logger.warning("An error occurred while writing " + readmeFilename);
			}
		} finally {
			out.close();
		}
		
		if (logger.isLoggable(Level.INFO)) {
			logger.info("Completed writing binary files to disk");
		}
		
	}
	
	/**
	 * Verifies that a required property has been set.
	 * 
	 * @param value       the property value to check
	 * @param description human-readable name of the property
	 * @throws IllegalStateException if the value is <code>null</code>
	 */
	private static void requireConfigured(String value, String description) {
		if (value == null) {
			throw new IllegalStateException(description + " must be set before calling execute()");
		}
	}
	
	/**
	 * Gets the path of a file inside the output directory.
	 * 
	 * @param fileName simple name of the file
	 * @return the output directory name, the platform 
	 *         file separator, and the file name, concatenated
	 */
	private String outputPath(String fileName) {
		return outputDirName + File.separator + fileName;
	}
	
	/**
	 * Verifies that the output directory exists or can be created.
	 * Terminates the JVM if it cannot be used, and logs a warning 
	 * if its name does not end in <code>.josh</code>.
	 */
	private void prepareOutputDirectory() {
		File outputDir = new File(outputDirName);
		if (! outputDir.exists()) {
			if (! outputDir.mkdirs()) {
				logger.severe("Output directory does not exist, and could not be successfully created: " + outputDirName);
				System.exit(-1);
			}
		} else if (! outputDir.isDirectory()) {
			logger.severe("Output directory exists, but is not a directory: " + outputDirName);
			System.exit(-2);
		}
		
		// Checked independently of the branches above so that 
		// a freshly created directory is subject to the same warning.
		if (! outputDirName.endsWith(".josh")) {
			logger.warning("By convention, the output directory should end in .josh");
		}
	}
	
	/**
	 * Writes the common vocabulary to <code>common.vocab</code>.
	 * 
	 * @param symbolTable vocabulary to write
	 * @param out         README stream to which the file is reported
	 * @throws IOException if the file cannot be written
	 */
	private void writeVocabulary(Vocabulary symbolTable, PrintStream out) throws IOException {
		String binaryVocabFilename = outputPath("common.vocab");
		if (logger.isLoggable(Level.INFO)) {
			logger.info("Writing binary common vocabulary to disk at " + binaryVocabFilename);
		}
		
		ObjectOutput vocabOut =
			new BinaryOut(new FileOutputStream(binaryVocabFilename), true);
		try {
			symbolTable.setExternalizableEncoding(charset);
			symbolTable.writeExternal(vocabOut);
			vocabOut.flush();
		} finally {
			vocabOut.close();
		}
		
		out.println("Common symbol table for source and target language: " + binaryVocabFilename);
	}
	
	/**
	 * Writes a corpus array to the specified file.
	 * 
	 * @param corpusArray corpus to write
	 * @param fileName    path of the file to write
	 * @throws IOException if the file cannot be written
	 */
	private static void writeCorpusArray(CorpusArray corpusArray, String fileName) throws IOException {
		BinaryOut corpusOut = new BinaryOut(new FileOutputStream(fileName), false);
		try {
			corpusArray.writeExternal(corpusOut);
			corpusOut.flush();
		} finally {
			corpusOut.close();
		}
	}
	
	/**
	 * Writes a suffix array to the specified file.
	 * 
	 * @param suffixArray suffix array to write
	 * @param fileName    path of the file to write
	 * @throws IOException if the file cannot be written
	 */
	private static void writeSuffixArray(SuffixArray suffixArray, String fileName) throws IOException {
		BinaryOut suffixesOut = new BinaryOut(new FileOutputStream(fileName), false);
		try {
			suffixArray.writeExternal(suffixesOut);
			suffixesOut.flush();
		} finally {
			suffixesOut.close();
		}
	}
	
	/**
	 * Reads the alignments file, then writes 
	 * <code>alignment.grids</code>, <code>lexicon.counts</code>, 
	 * and <code>lexprobs.txt</code>.
	 * 
	 * @param sourceCorpusArray source language corpus
	 * @param targetCorpusArray target language corpus
	 * @param numberOfSentences number of sentences in each corpus
	 * @param out               README stream to which files are reported
	 * @throws IOException if the alignments cannot be read 
	 *         or an output file cannot be written
	 */
	private void writeAlignmentsAndLexProbs(CorpusArray sourceCorpusArray, CorpusArray targetCorpusArray, int numberOfSentences, PrintStream out) throws IOException {
		
		Scanner alignmentsScanner = new Scanner(new File(alignmentsFileName));
		try {
			// Construct alignments data structure
			AlignmentGrids grids = new AlignmentGrids(
					alignmentsScanner, 
					sourceCorpusArray, 
					targetCorpusArray,
					numberOfSentences);
			
			// Write alignments to disk
			String binaryAlignmentsFilename = outputPath("alignment.grids");
			if (logger.isLoggable(Level.INFO)) {
				logger.info("Writing binary alignment grids to disk at " + binaryAlignmentsFilename);
			}
			
			BinaryOut alignmentsOut = new BinaryOut(binaryAlignmentsFilename);
			try {
				grids.writeExternal(alignmentsOut);
				alignmentsOut.flush();
			} finally {
				alignmentsOut.close();
			}
			out.println("Source-target alignment grids: " + binaryAlignmentsFilename);
			
			// Construct lexprobs
			ParallelCorpus parallelCorpus = new AlignedParallelCorpus(sourceCorpusArray, targetCorpusArray, grids);
			
			if (logger.isLoggable(Level.INFO)) {
				logger.info("Constructing lexprob table");
			}
			LexicalProbabilities lexProbs = 
				new LexProbs(parallelCorpus, Float.MIN_VALUE);
			
			// Write binary lexicon counts to disk
			String binaryLexCountFilename = outputPath("lexicon.counts");
			if (logger.isLoggable(Level.INFO)) {
				logger.info("Writing binary lexicon counts to disk at " + binaryLexCountFilename);
			}
			
			FileOutputStream lexCountStream = new FileOutputStream(binaryLexCountFilename);
			try {
				ObjectOutput lexCountOut = new ObjectOutputStream(lexCountStream);
				lexProbs.writeExternal(lexCountOut);
				lexCountOut.close();
			} finally {
				// Already closed on the success path; this releases 
				// the file only when an exception was thrown above.
				lexCountStream.close();
			}
			out.println("Lexicon counts: " + binaryLexCountFilename);
			
			// Write textual lexprobs to disk
			String lexprobsFilename = outputPath("lexprobs.txt");
			String lexprobsText = lexProbs.toString();
			if (logger.isLoggable(Level.INFO)) {
				logger.info("Writing lexprobs at " + lexprobsFilename);
			}
			
			FileOutputStream lexprobsStream = new FileOutputStream(lexprobsFilename);
			try {
				OutputStreamWriter lexprobsOut = new OutputStreamWriter(lexprobsStream, charset);
				lexprobsOut.write(lexprobsText);
				lexprobsOut.flush();
				lexprobsOut.close();
			} finally {
				// Already closed on the success path; this releases 
				// the file only when an exception was thrown above.
				lexprobsStream.close();
			}
			out.println("Lexprobs at " + lexprobsFilename);
			
		} finally {
			// Closed only once the alignment grids are no longer in use, 
			// in case they read from the scanner after construction.
			alignmentsScanner.close();
		}
	}
	
	/**
	 * Constructs the target language suffix array 
	 * and writes it to <code>target.suffixes</code>.
	 * 
	 * @param targetCorpusArray target language corpus
	 * @param out               README stream to which the file is reported
	 * @throws IOException if the file cannot be written
	 */
	private void writeTargetSuffixArray(CorpusArray targetCorpusArray, PrintStream out) throws IOException {
		if (logger.isLoggable(Level.INFO)) {
			logger.info("Constructing suffix array from file " + targetCorpusFileName);
		}
		SuffixArray targetSuffixArray = SuffixArrayFactory.createSuffixArray(targetCorpusArray, Cache.DEFAULT_CAPACITY);
		
		String binaryTargetSuffixesFilename = outputPath("target.suffixes");
		if (logger.isLoggable(Level.INFO)) {
			logger.info("Writing binary target suffix array to disk at " + binaryTargetSuffixesFilename);
		}
		writeSuffixArray(targetSuffixArray, binaryTargetSuffixesFilename);
		
		out.println("Target language suffix array: " + binaryTargetSuffixesFilename);
	}
	
	/**
	 * Constructs the source language suffix array, 
	 * writes it to <code>source.suffixes</code>, 
	 * then precomputes frequent phrases from it 
	 * and writes them to <code>frequentPhrases</code>.
	 * 
	 * @param sourceCorpusArray source language corpus
	 * @param out               README stream to which files are reported
	 * @throws IOException if a file cannot be written
	 */
	private void writeSourceSuffixArrayAndFrequentPhrases(CorpusArray sourceCorpusArray, PrintStream out) throws IOException {
		if (logger.isLoggable(Level.INFO)) {
			logger.info("Constructing suffix array from file " + sourceCorpusFileName);
		}
		SuffixArray sourceSuffixArray = SuffixArrayFactory.createSuffixArray(sourceCorpusArray, Cache.DEFAULT_CAPACITY);
		
		// Write source language suffix array to disk
		String binarySourceSuffixesFilename = outputPath("source.suffixes");
		if (logger.isLoggable(Level.INFO)) {
			logger.info("Writing binary source suffix array to disk at " + binarySourceSuffixesFilename);
		}
		writeSuffixArray(sourceSuffixArray, binarySourceSuffixesFilename);
		out.println("Source language suffix array: " + binarySourceSuffixesFilename);
		
		// Precompute and write frequent phrase locations to disk
		if (logger.isLoggable(Level.INFO)) {
			logger.info("Precomputing indices for most frequent phrases");
		}
		// NOTE(review): maxPhraseLength is passed for two consecutive 
		// parameters; presumably intentional - confirm against 
		// the FrequentPhrases constructor.
		FrequentPhrases frequentPhrases = 
			new FrequentPhrases(sourceSuffixArray, minFrequency, maxPhrases, maxPhraseLength, maxPhraseLength, maxPhraseSpan, minNonterminalSpan);
		
		String frequentPhrasesFilename = outputPath("frequentPhrases");
		if (logger.isLoggable(Level.INFO)) {
			logger.info("Writing precomputed indices for most frequent phrases at " + frequentPhrasesFilename);
		}
		BinaryOut frequentPhrasesOut = new BinaryOut(frequentPhrasesFilename);
		try {
			frequentPhrases.writeExternal(frequentPhrasesOut);
		} finally {
			frequentPhrasesOut.close();
		}
		out.println("Frequent phrases: " + frequentPhrasesFilename);
	}
	
	/**
	 * Command line entry point.
	 * <p>
	 * Expects the source corpus, target corpus, alignments file, 
	 * and output directory as the first four arguments; 
	 * an optional fifth argument names the character encoding.
	 * 
	 * @param args command line arguments
	 * @throws IOException if compilation fails 
	 *         while reading or writing a file
	 */
	public static void main(String[] args) throws IOException {
		
		if (args.length < 4) {
			System.err.println("Usage: java " + Compile.class.getName() + " sourceCorpus targetCorpus alignmentsFile outputDir.josh [encoding]");
			// NOTE(review): exit status 0 on a usage error is retained 
			// unchanged so that existing callers see the same status.
			System.exit(0);
		}
		
		Compile compiler = new Compile();
		compiler.setSourceCorpus(args[0]);
		compiler.setTargetCorpus(args[1]);
		compiler.setAlignments(args[2]);
		compiler.setOutputDir(args[3]);
		if (args.length > 4) {
			compiler.setEncoding(args[4]);
		}
		
		compiler.execute();
	}
	
}