/* This file is part of the Joshua Machine Translation System. * * Joshua is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 2.1 * of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, * MA 02111-1307 USA */ package joshua.corpus.suffix_array; import java.io.IOException; import java.io.ObjectInput; import java.util.logging.Logger; import joshua.corpus.CorpusArray; import joshua.corpus.vocab.Vocabulary; import joshua.util.io.BinaryIn; /** * Given a corpus and an existing symbol table, read the corpus, * and create a binary representation of the corpus using the * provided symbol table. * * @author Lane Schwartz * @version $LastChangedDate: 2009-05-22 23:31:12 -0500 (Fri, 22 May 2009) $ */ public class ConvertCorpus { /** Logger for this class. */ private static final Logger logger = Logger.getLogger(ConvertCorpus.class.getName()); /** * Given a corpus and an existing symbol table, read the * corpus, and create a binary representation of the corpus * using the provided symbol table. * * @param args Command line arguments * @throws ClassNotFoundException * @throws IOException */ public static void main(String[] args) throws IOException, ClassNotFoundException { // Read the command line arguments if (args.length < 3) { System.err.println( "Usage: java " + SuffixArray.class.getName() + " target_corpus tgt.lm.vocab tgt.corpus"); System.exit(-1); } String corpusFileName = args[0]; String binaryVocabFilename = args[1]; String binaryCorpusFilename = args[2]; String charset = (args.length > 3) ? args[3] : "UTF-8"; // Read the provided symbol table logger.info("Reading provided symbol table"); Vocabulary symbolTable = new Vocabulary(); ObjectInput in = BinaryIn.vocabulary(binaryVocabFilename); symbolTable.readExternal(in); // Read the provided corpus logger.info("Reading provided corpus"); Vocabulary oldSymbolTable = new Vocabulary(); int[] lengths = Vocabulary.initializeVocabulary(corpusFileName, oldSymbolTable, true); CorpusArray corpusArray = SuffixArrayFactory.createCorpusArray(corpusFileName, oldSymbolTable, lengths[0], lengths[1]); // Change the internal integer-string mappings // of the corpus to use those provided by the given symbol table. logger.info("Converting corpus to use new symbol mappings"); corpusArray.setSymbolTable(symbolTable); // Write the corpus to disk in binary format logger.info("Writing corpus to disk in binary format, using new symbol mappings"); corpusArray.write(binaryCorpusFilename, binaryVocabFilename, charset); } }