/* This file is part of the Joshua Machine Translation System. * * Joshua is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 * of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, * MA 02111-1307 USA */ package joshua.corpus.lexprob; import java.io.FileOutputStream; import java.io.IOException; import java.io.ObjectInput; import java.io.OutputStreamWriter; import joshua.corpus.AlignedParallelCorpus; import joshua.corpus.Corpus; import joshua.corpus.ParallelCorpus; import joshua.corpus.alignment.Alignments; import joshua.corpus.alignment.mm.MemoryMappedAlignmentGrids; import joshua.corpus.mm.MemoryMappedCorpusArray; import joshua.corpus.vocab.Vocabulary; import joshua.util.io.BinaryIn; /** * Ant task to export a human-readable lexical probabilities table * to disk from a binary josh directory. * * @author Lane Schwartz */ public class WriteLexProbs { private String encoding = "UTF-8"; private int cacheSize = 1000; private String joshDir; private String output; public void setEncoding(String encoding) { this.encoding = encoding; } public void setCacheSize(int cacheSize) { this.cacheSize = cacheSize; } public void setJoshDir(String joshDir) { System.out.println("Setting " + joshDir); this.joshDir = joshDir; } public void setOutput(String output) { this.output = output; } public void execute() throws IOException, ClassNotFoundException { System.out.println("Getting parallel corpus"); ParallelCorpus parallelCorpus = getParallelCorpus(joshDir, cacheSize); System.out.println("Getting lexprobs"); LexicalProbabilities lexProbs = new LexProbs(parallelCorpus, Float.MIN_VALUE); FileOutputStream stream = new FileOutputStream(output); OutputStreamWriter out = new OutputStreamWriter(stream, encoding); try { String s = lexProbs.toString(); System.out.println("Writing lexprobs from " + joshDir + " to " + output); out.write(s); } catch (IOException e) { System.out.println("Failure"); } finally { out.close(); } } private static ParallelCorpus getParallelCorpus(String joshDir, int cacheSize) throws IOException, ClassNotFoundException { Vocabulary commonVocab = new Vocabulary(); String binaryVocabFileName = joshDir + "/common.vocab"; ObjectInput in = BinaryIn.vocabulary(binaryVocabFileName); commonVocab.readExternal(in); String sourceFileName = joshDir + "/source.corpus"; Corpus sourceCorpusArray = new MemoryMappedCorpusArray(commonVocab, sourceFileName); String targetFileName = joshDir + "/target.corpus"; Corpus targetCorpusArray = new MemoryMappedCorpusArray(commonVocab, targetFileName); String alignmentFileName = joshDir + "/alignment.grids"; Alignments alignments = new MemoryMappedAlignmentGrids(alignmentFileName, sourceCorpusArray, targetCorpusArray); return new AlignedParallelCorpus(sourceCorpusArray, targetCorpusArray, alignments); } /** * Takes a directory containing a compiled suffix array and writes LexProb file to disk. * @param args * @throws IOException */ public static void main(String[] args) throws IOException, ClassNotFoundException { if(args.length != 2) { System.err.println("Usage: java LexProbs joshDir outputFile"); System.exit(0); } String joshDir = args[0]; String outputFile = args[1]; WriteLexProbs lexProbWriter = new WriteLexProbs(); lexProbWriter.setJoshDir(joshDir); lexProbWriter.setOutput(outputFile); lexProbWriter.execute(); } }