package edu.berkeley.nlp.lm.io;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import edu.berkeley.nlp.lm.ConfigOptions;
import edu.berkeley.nlp.lm.StringWordIndexer;
import edu.berkeley.nlp.lm.util.Logger;

/**
 * Estimates a Kneser-Ney language model from raw text, and writes the language
 * model out in ARPA-format. This is meant to closely resemble the functionality
 * of SRILM's
 * <code>ngram-count -text &lt;text file&gt; -ukndiscount -lm &lt;outputfile&gt;</code>
 * , with two main exceptions: <br>
 * (a) rather than calculating the discount for each n-gram order from counts,
 * we use a constant discount of 0.75 for all orders <br>
 * (b) Count thresholding is currently not implemented (SRILM by default
 * thresholds counts for n-grams with n &gt; 3).
 * <p>
 * Note that if the input/output files have a .gz suffix, they will be
 * unzipped/zipped as necessary. If no input files are given (or "-" is
 * specified), lines will be read from standard input.
 *
 * @author adampauls
 *
 */
public class MakeKneserNeyArpaFromText
{

	/**
	 * Prints command-line usage to stderr and terminates the JVM.
	 */
	private static void usage() {
		System.err.println("Usage: <lmOrder> <ARPA lm output file> <textfiles>*");
		System.exit(1);
	}

	/**
	 * Entry point. Expects at least two arguments: the n-gram order and the
	 * output ARPA file; any remaining arguments are input text files ("-" or
	 * no files means standard input).
	 *
	 * @param argv
	 *            command-line arguments: &lt;lmOrder&gt; &lt;output file&gt; &lt;textfiles&gt;*
	 */
	public static void main(final String[] argv) {
		if (argv.length < 2) {
			usage();
		}
		final int lmOrder;
		try {
			lmOrder = Integer.parseInt(argv[0]);
		} catch (final NumberFormatException e) {
			// A non-numeric order should produce the usage message, not a stack trace.
			System.err.println("lmOrder must be an integer, got: " + argv[0]);
			usage();
			return; // unreachable (usage() exits), but keeps the compiler's definite-assignment analysis happy
		}
		if (lmOrder < 1) {
			System.err.println("lmOrder must be >= 1, got: " + lmOrder);
			usage();
		}
		final String outputFile = argv[1];
		final List<String> inputFiles = new ArrayList<String>();
		for (int i = 2; i < argv.length; ++i) {
			inputFiles.add(argv[i]);
		}
		// No explicit inputs: read from standard input, signalled by "-".
		if (inputFiles.isEmpty()) inputFiles.add("-");
		Logger.setGlobalLogger(new Logger.SystemLogger(System.out, System.err));
		Logger.startTrack("Reading text files " + inputFiles + " and writing to file " + outputFile);
		final StringWordIndexer wordIndexer = new StringWordIndexer();
		// Sentence-boundary and OOV symbols must match what the ARPA reader expects.
		wordIndexer.setStartSymbol(ArpaLmReader.START_SYMBOL);
		wordIndexer.setEndSymbol(ArpaLmReader.END_SYMBOL);
		wordIndexer.setUnkSymbol(ArpaLmReader.UNK_SYMBOL);
		LmReaders.createKneserNeyLmFromTextFiles(inputFiles, wordIndexer, lmOrder, new File(outputFile), new ConfigOptions());
		Logger.endTrack();
	}
}