package edu.berkeley.nlp.lm.io;

import edu.berkeley.nlp.lm.NgramLanguageModel;
import edu.berkeley.nlp.lm.util.Logger;

/**
 * Given a directory in Google n-grams format, builds a binary representation of
 * a stupid-backoff language model and writes it to disk. Language model
 * binaries are significantly smaller and faster to load. Note: actually
 * running this code on the full Google-ngrams corpus can be very slow and
 * memory intensive -- on our machines, it takes about 32GB of memory and 15
 * hours.
 * <p>
 * Note that if the input/output files have a <code>.gz</code> suffix, they
 * will be unzipped/zipped as necessary.
 *
 * @author adampauls
 */
public class MakeLmBinaryFromGoogle {

	/** Utility entry-point class; not instantiable. */
	private MakeLmBinaryFromGoogle() {
	}

	/**
	 * Prints a usage message to stderr and terminates the JVM with a
	 * non-zero exit status.
	 */
	private static void usage() {
		System.err.println("Usage: <Google n-grams dir> <outputfile>");
		System.exit(1);
	}

	/**
	 * Reads a Google n-grams directory, builds a stupid-backoff language
	 * model, and serializes it to a binary file.
	 *
	 * @param argv
	 *            argv[0] is the Google n-grams input directory; argv[1] is
	 *            the output binary file. Any other argument count exits via
	 *            {@link #usage()}.
	 */
	public static void main(final String[] argv) {
		if (argv.length != 2) {
			usage();
		}
		Logger.setGlobalLogger(new Logger.SystemLogger(System.out, System.err));

		Logger.startTrack("Reading Lm File " + argv[0] + " . . . ");
		final String googleDir = argv[0];
		// true/false flags select the stupid-backoff variant -- see
		// LmReaders.readLmFromGoogleNgramDir for their exact semantics.
		final NgramLanguageModel<String> lm = LmReaders.readLmFromGoogleNgramDir(googleDir, true, false);
		Logger.endTrack();

		final String outFile = argv[1];
		Logger.startTrack("Writing to file " + outFile + " . . . ");
		LmReaders.writeLmBinary(lm, outFile);
		Logger.endTrack();
	}
}