package edu.berkeley.nlp.lm.io;
import java.util.ArrayList;
import java.util.List;
import edu.berkeley.nlp.lm.NgramLanguageModel;
import edu.berkeley.nlp.lm.util.Logger;
/**
* Given a language model in ARPA format, builds a binary representation of the
* language model and writes it to disk. Language model binaries are
* significantly smaller and faster to load than ARPA files.
* <p>
* Note that if the input/output files have a <code>.gz</code> suffix, they will
* be unzipped/zipped as necessary.
*
* @author adampauls
*
*/
public class MakeLmBinaryFromArpa
{

	/**
	 * Command-line switches that choose which {@link LmReaders} factory is
	 * used to build the model from the ARPA file. Every constant carries the
	 * flag text typed on the command line (also what {@link #toString()}
	 * returns) and the one-line description shown in the usage message.
	 */
	private enum Opts
	{
		HASH_OPT("-h", "build an array-encoded hash-table LM (the default)")
		{
			@Override
			public NgramLanguageModel<String> makeLm(final String file) {
				return LmReaders.readArrayEncodedLmFromArpa(file, false);
			}
		},
		CONTEXT_OPT("-e", "build a context-encoded LM instead of the default hash table")
		{
			@Override
			public NgramLanguageModel<String> makeLm(final String file) {
				return LmReaders.readContextEncodedLmFromArpa(file);
			}
		},
		COMPRESS_OPT("-c", "build a compressed hash-table LM instead of the array encoding")
		{
			@Override
			public NgramLanguageModel<String> makeLm(final String file) {
				// Same reader as HASH_OPT, but with the boolean argument set to true.
				return LmReaders.readArrayEncodedLmFromArpa(file, true);
			}
		};

		/** Flag text as it appears on the command line, e.g. <code>-h</code>. */
		private final String flag;

		/** Help text printed next to the flag by the usage message. */
		private final String description;

		Opts(final String flag, final String description) {
			this.flag = flag;
			this.description = description;
		}

		/**
		 * @return the command-line flag for this option
		 */
		@Override
		public String toString() {
			return flag;
		}

		/**
		 * @return the help text for this option
		 */
		public String docString() {
			return description;
		}

		/**
		 * Builds the language model selected by this option.
		 *
		 * @param file
		 *            path of the ARPA file, passed through to {@link LmReaders}
		 * @return the model produced by the corresponding reader
		 */
		public abstract NgramLanguageModel<String> makeLm(String file);
	}

	/**
	 * Looks up the option whose flag text equals the given argument.
	 *
	 * @param arg
	 *            a command-line argument
	 * @return the matching option, or <code>null</code> if no option uses
	 *         that flag
	 */
	private static Opts findOpt(final String arg) {
		final Opts[] known = Opts.values();
		for (int i = 0; i < known.length; ++i) {
			if (arg.equals(known[i].toString())) {
				return known[i];
			}
		}
		return null;
	}

	/**
	 * Prints the usage line followed by one line per supported option to
	 * standard error, then terminates the JVM with exit status 1.
	 */
	private static void usage() {
		System.err.println("Usage: [opts] <ARPA lm file> <outputfile>");
		final Opts[] known = Opts.values();
		for (int i = 0; i < known.length; ++i) {
			final Opts current = known[i];
			System.err.println("\t" + current.toString() + ": " + current.docString());
		}
		System.exit(1);
	}

	/**
	 * Command-line entry point. Arguments beginning with <code>-</code> are
	 * treated as options (when several are given, the last one wins); all
	 * other arguments are collected as file names. Exactly two file names are
	 * required: the ARPA input followed by the binary output. Anything else,
	 * or an unknown option, prints the usage message and exits.
	 *
	 * @param argv
	 *            raw command-line arguments
	 */
	public static void main(final String[] argv) {
		Opts selected = Opts.HASH_OPT;
		final List<String> paths = new ArrayList<String>();
		for (int i = 0; i < argv.length; ++i) {
			final String token = argv[i];
			if (!token.startsWith("-")) {
				paths.add(token);
				continue;
			}
			final Opts match = findOpt(token);
			if (match == null) {
				System.err.println("Unrecognized opts: " + token);
				usage();
			} else {
				selected = match;
			}
		}
		if (paths.size() != 2) {
			usage();
		}

		Logger.setGlobalLogger(new Logger.SystemLogger(System.out, System.err));
		final String lmFile = paths.get(0);
		final String outFile = paths.get(1);

		// Step 1: load the ARPA file into the representation picked above.
		Logger.startTrack("Reading Lm File " + lmFile + " . . . ");
		final NgramLanguageModel<String> lm = selected.makeLm(lmFile);
		Logger.endTrack();

		// Step 2: write the loaded model out in binary form.
		Logger.startTrack("Writing to file " + outFile + " . . . ");
		LmReaders.writeLmBinary(lm, outFile);
		Logger.endTrack();
	}
}