package edu.berkeley.nlp.lm.io;
import java.io.File;
import java.io.IOException;
import java.util.List;
import edu.berkeley.nlp.lm.ArrayEncodedNgramLanguageModel;
import edu.berkeley.nlp.lm.ArrayEncodedProbBackoffLm;
import edu.berkeley.nlp.lm.ConfigOptions;
import edu.berkeley.nlp.lm.ContextEncodedNgramLanguageModel;
import edu.berkeley.nlp.lm.ContextEncodedProbBackoffLm;
import edu.berkeley.nlp.lm.NgramLanguageModel;
import edu.berkeley.nlp.lm.StringWordIndexer;
import edu.berkeley.nlp.lm.StupidBackoffLm;
import edu.berkeley.nlp.lm.WordIndexer;
import edu.berkeley.nlp.lm.array.LongArray;
import edu.berkeley.nlp.lm.cache.ArrayEncodedCachingLmWrapper;
import edu.berkeley.nlp.lm.cache.ContextEncodedCachingLmWrapper;
import edu.berkeley.nlp.lm.collections.LongRepresentable;
import edu.berkeley.nlp.lm.map.AbstractNgramMap;
import edu.berkeley.nlp.lm.map.CompressedNgramMap;
import edu.berkeley.nlp.lm.map.ContextEncodedNgramMap;
import edu.berkeley.nlp.lm.map.HashNgramMap;
import edu.berkeley.nlp.lm.map.NgramMap;
import edu.berkeley.nlp.lm.map.NgramMapWrapper;
import edu.berkeley.nlp.lm.util.Logger;
import edu.berkeley.nlp.lm.util.LongRef;
import edu.berkeley.nlp.lm.values.CompressibleValueContainer;
import edu.berkeley.nlp.lm.values.CompressibleProbBackoffValueContainer;
import edu.berkeley.nlp.lm.values.ProbBackoffPair;
import edu.berkeley.nlp.lm.values.UncompressedProbBackoffValueContainer;
import edu.berkeley.nlp.lm.values.CountValueContainer;
import edu.berkeley.nlp.lm.values.UnrankedUncompressedProbBackoffValueContainer;
import edu.berkeley.nlp.lm.values.ValueContainer;
/**
* This class contains a number of static methods for reading/writing/estimating
* n-gram language models. Since most users of this software will interact with
* this class, I will use this space to document the software as a whole.
* <p>
* This software provides three main pieces of functionality: <br>
* (a) estimation of language models from text inputs <br>
* (b) data structures for efficiently storing large collections of n-grams in
* memory <br>
* (c) an API for efficiently querying language models derived from n-gram
* collections. Most of the techniques used here are described in
* "Faster and Smaller N-gram Language Models" (Pauls and Klein 2011).
* <p>
* This software supports the estimation of two types of language models:
* Kneser-Ney language models (Kneser and Ney, 1995) and Stupid Backoff language
* models (Brants et al. 2007). Kneser-Ney language models can be estimated from
* raw text by calling
* {@link #createKneserNeyLmFromTextFiles(List, WordIndexer, int, File, ConfigOptions)}. This
* can also be done from the command-line by calling <code>main()</code> in
* {@link MakeKneserNeyArpaFromText}. See the <code>examples</code> folder for a
* script which demonstrates its use. A Stupid Backoff language model can be
* read from a directory containing n-gram counts in the format used by Google's
* Web1T corpus by calling {@link #readLmFromGoogleNgramDir(String, boolean, boolean)}.
* Note that this software does not (yet) support building Google count
* directories from raw text, though this can be done using SRILM.
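* As a minimal sketch (the file and directory names here are hypothetical):
* <pre>{@code
* // estimate a 5-gram Kneser-Ney LM from raw text and write it in ARPA format
* List<String> inputFiles = Arrays.asList("corpus1.txt", "corpus2.txt");
* LmReaders.createKneserNeyLmFromTextFiles(inputFiles, new StringWordIndexer(), 5, new File("kneser_ney.arpa"), new ConfigOptions());
* // read a Stupid Backoff LM from a Google Web1T-style count directory
* ArrayEncodedNgramLanguageModel<String> stupidLm = LmReaders.readLmFromGoogleNgramDir("/path/to/google_ngrams_dir", false, false);
* }</pre>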
* <p>
* Loading/estimating language models from text files can be very slow. This
* software can use Java's built-in serialization to build language model
* binaries which are both smaller and faster to load.
* {@link MakeLmBinaryFromArpa} and {@link MakeLmBinaryFromGoogle} provide
* <code>main()</code> methods for doing this. See the <code>examples</code>
* folder for scripts which demonstrate their use.
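* The same round trip can be done programmatically with
* {@link #writeLmBinary(NgramLanguageModel, String)} and {@link #readLmBinary(String)};
* a minimal sketch (paths are hypothetical):
* <pre>{@code
* // read an ARPA file once, then save a binary that loads much faster
* NgramLanguageModel<String> lm = LmReaders.readArrayEncodedLmFromArpa("lm.arpa", false);
* LmReaders.writeLmBinary(lm, "lm.binary");
* NgramLanguageModel<String> reloaded = LmReaders.readLmBinary("lm.binary");
* }</pre>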
* <p>
* Language models can be read into memory from ARPA formats using
* {@link #readArrayEncodedLmFromArpa(String, boolean)} and
* {@link #readContextEncodedLmFromArpa(String)}. The "array encoding" versus
* "context encoding" distinction is discussed in Section 4.2 of Pauls and Klein
* (2011). Again, since loading language models from textual representations can
* be very slow, they can be read from binaries using
* {@link #readLmBinary(String)}. The interfaces for these language models can
* be found in {@link ArrayEncodedNgramLanguageModel} and
* {@link ContextEncodedNgramLanguageModel}. For examples of these interfaces in
* action, you can have a look at {@link PerplexityTest}.
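* As a brief sketch (the ARPA path is hypothetical, and the query assumes the
* {@code getLogProb(List)} convenience method on the LM interfaces):
* <pre>{@code
* ArrayEncodedProbBackoffLm<String> arrayLm = LmReaders.readArrayEncodedLmFromArpa("lm.arpa", false);
* ContextEncodedProbBackoffLm<String> contextLm = LmReaders.readContextEncodedLmFromArpa("lm.arpa");
* // score a single n-gram with the array-encoded LM
* float logProb = arrayLm.getLogProb(Arrays.asList("the", "quick", "brown"));
* }</pre>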
* <p>
* This release implements the HASH, HASH+SCROLL, and COMPRESSED language model
* representations described in Pauls and Klein (2011). The
* SORTED implementation may be added later. See {@link HashNgramMap} and
* {@link CompressedNgramMap} for the implementations of the HASH and COMPRESSED
* representations.
* <p>
* To speed up queries, you can wrap language models with caches (
* {@link ContextEncodedCachingLmWrapper} and
* {@link ArrayEncodedCachingLmWrapper}). These caches are described in Section
* 4.1 of Pauls and Klein (2011). You should more or less always use these
* caches, since they speed up queries considerably at a modest memory cost.
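* A minimal sketch of wrapping an LM in a cache (the static factory name
* {@code wrapWithCacheNotThreadSafe} is assumed here; check the wrapper classes
* for the exact factory methods, and use a thread-safe variant if the LM is
* queried from multiple threads):
* <pre>{@code
* ArrayEncodedProbBackoffLm<String> rawLm = LmReaders.readArrayEncodedLmFromArpa("lm.arpa", false);
* ArrayEncodedNgramLanguageModel<String> cachedLm = ArrayEncodedCachingLmWrapper.wrapWithCacheNotThreadSafe(rawLm);
* }</pre>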
* <p>
* This software also supports a Java Map wrapper around an n-gram collection.
* You can read a map wrapper using
* {@link #readNgramMapFromGoogleNgramDir(String, boolean, WordIndexer)}.
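* A short sketch, assuming the wrapper exposes the {@code java.util.Map}
* interface keyed on a {@code List} of words (the directory path and n-gram are
* hypothetical):
* <pre>{@code
* NgramMapWrapper<String, LongRef> counts = LmReaders.readNgramMapFromGoogleNgramDir("/path/to/google_ngrams_dir", false);
* LongRef count = counts.get(Arrays.asList("the", "dog"));
* }</pre>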
* <p>
* {@link ComputeLogProbabilityOfTextStream} provides a <code>main()</code> method for computing the log probability of raw text.
* <p>
* Some example scripts can be found in the <code>examples/</code> directory.
*
* @author adampauls
*
*/
public class LmReaders
{
public static ContextEncodedProbBackoffLm<String> readContextEncodedLmFromArpa(final String lmFile) {
return readContextEncodedLmFromArpa(lmFile, new StringWordIndexer());
}
public static <W> ContextEncodedProbBackoffLm<W> readContextEncodedLmFromArpa(final String lmFile, final WordIndexer<W> wordIndexer) {
return readContextEncodedLmFromArpa(lmFile, wordIndexer, new ConfigOptions(), Integer.MAX_VALUE);
}
/**
* Reads a context-encoded language model from an ARPA lm file.
* Context-encoded language models allow faster queries, but require an
* extra 4 bytes of storage per n-gram for the suffix offsets (as compared
* to array-encoded language models).
*
* @param <W>
* @param lmFile
* @param compress
* @param wordIndexer
* @param opts
* @param lmOrder
* @return
*/
public static <W> ContextEncodedProbBackoffLm<W> readContextEncodedLmFromArpa(final String lmFile, final WordIndexer<W> wordIndexer,
final ConfigOptions opts, final int lmOrder) {
return readContextEncodedLmFromArpa(new ArpaLmReader<W>(lmFile, wordIndexer, lmOrder), wordIndexer, opts);
}
public static <W> ContextEncodedProbBackoffLm<W> readContextEncodedLmFromArpa(
final LmReader<ProbBackoffPair, ArpaLmReaderCallback<ProbBackoffPair>> lmFile, final WordIndexer<W> wordIndexer, final ConfigOptions opts) {
final FirstPassCallback<ProbBackoffPair> valueAddingCallback = firstPassArpa(lmFile, false);
final LongArray[] numNgramsForEachWord = valueAddingCallback.getNumNgramsForEachWord();
return secondPassContextEncoded(opts, lmFile, wordIndexer, valueAddingCallback, numNgramsForEachWord);
}
public static ArrayEncodedProbBackoffLm<String> readArrayEncodedLmFromArpa(final String lmFile, final boolean compress) {
return readArrayEncodedLmFromArpa(lmFile, compress, new StringWordIndexer());
}
public static <W> ArrayEncodedProbBackoffLm<W> readArrayEncodedLmFromArpa(final String lmFile, final boolean compress, final WordIndexer<W> wordIndexer) {
return readArrayEncodedLmFromArpa(lmFile, compress, wordIndexer, new ConfigOptions(), Integer.MAX_VALUE);
}
public static <W> ArrayEncodedProbBackoffLm<W> readArrayEncodedLmFromArpa(final String lmFile, final boolean compress, final WordIndexer<W> wordIndexer,
final ConfigOptions opts, final int lmOrder) {
return readArrayEncodedLmFromArpa(new ArpaLmReader<W>(lmFile, wordIndexer, lmOrder), compress, wordIndexer, opts);
}
/**
* Reads an array-encoded language model from an ARPA lm file.
*
* @param <W>
* @param lmFile
* @param compress
* If true, compress the LM using block compression. The resulting
* LM is smaller but slower to query.
* @param wordIndexer
* @param opts
* @param lmOrder
* @return
*/
public static <W> ArrayEncodedProbBackoffLm<W> readArrayEncodedLmFromArpa(final LmReader<ProbBackoffPair, ArpaLmReaderCallback<ProbBackoffPair>> lmFile,
final boolean compress, final WordIndexer<W> wordIndexer, final ConfigOptions opts) {
final boolean reverse = true;
final FirstPassCallback<ProbBackoffPair> valueAddingCallback = firstPassArpa(lmFile, reverse);
final LongArray[] numNgramsForEachWord = valueAddingCallback.getNumNgramsForEachWord();
return secondPassArrayEncoded(opts, lmFile, wordIndexer, valueAddingCallback, numNgramsForEachWord, reverse, compress);
}
public static NgramMapWrapper<String, LongRef> readNgramMapFromGoogleNgramDir(final String dir, final boolean compress) {
return readNgramMapFromGoogleNgramDir(dir, compress, new StringWordIndexer());
}
public static <W> NgramMapWrapper<W, LongRef> readNgramMapFromGoogleNgramDir(final String dir, final boolean compress, final WordIndexer<W> wordIndexer) {
final StupidBackoffLm<W> lm = (StupidBackoffLm<W>) readLmFromGoogleNgramDir(dir, compress, false, wordIndexer, new ConfigOptions());
return new NgramMapWrapper<W, LongRef>(lm.getNgramMap(), lm.getWordIndexer());
}
public static NgramMapWrapper<String, LongRef> readNgramMapFromBinary(final String binary, final String vocabFile) {
return readNgramMapFromBinary(binary, vocabFile, new StringWordIndexer());
}
/**
*
* @param sortedVocabFile
* should be the vocab_cs.gz file from the Google n-gram corpus.
* @return
*/
public static <W> NgramMapWrapper<W, LongRef> readNgramMapFromBinary(final String binary, final String sortedVocabFile, final WordIndexer<W> wordIndexer) {
GoogleLmReader.addToIndexer(wordIndexer, sortedVocabFile);
wordIndexer.trimAndLock();
@SuppressWarnings("unchecked")
final NgramMap<LongRef> map = (NgramMap<LongRef>) IOUtils.readObjFileHard(binary);
return new NgramMapWrapper<W, LongRef>(map, wordIndexer);
}
public static ArrayEncodedNgramLanguageModel<String> readLmFromGoogleNgramDir(final String dir, final boolean compress, final boolean kneserNey) {
return readLmFromGoogleNgramDir(dir, compress, kneserNey, new StringWordIndexer(), new ConfigOptions());
}
/**
* Reads a Stupid Backoff LM (or, if <code>kneserNey</code> is true, estimates a
* Kneser-Ney LM) from a directory with n-gram counts in the format used by
* Google n-grams.
*
* @param <W>
* @param dir
* @param compress
* @param kneserNey
* @param wordIndexer
* @param opts
* @return
*/
public static <W> ArrayEncodedNgramLanguageModel<W> readLmFromGoogleNgramDir(final String dir, final boolean compress, final boolean kneserNey,
final WordIndexer<W> wordIndexer, final ConfigOptions opts) {
final GoogleLmReader<W> googleLmReader = new GoogleLmReader<W>(dir, wordIndexer, opts);
if (kneserNey) {
GoogleLmReader.addSpecialSymbols(wordIndexer);
KneserNeyLmReaderCallback<W> kneserNeyReader = new KneserNeyLmReaderCallback<W>(wordIndexer, googleLmReader.getLmOrder(), opts);
googleLmReader.parse(kneserNeyReader);
return readArrayEncodedLmFromArpa(kneserNeyReader, compress, wordIndexer, opts);
} else {
final FirstPassCallback<LongRef> valueAddingCallback = firstPassGoogle(dir, wordIndexer, opts);
final LongArray[] numNgramsForEachWord = valueAddingCallback.getNumNgramsForEachWord();
return secondPassGoogle(opts, googleLmReader, wordIndexer, valueAddingCallback, numNgramsForEachWord, compress);
}
}
/**
* Builds a context-encoded LM from raw text. This call first builds and
* writes a (temporary) ARPA file by calling
* {@link #createKneserNeyLmFromTextFiles(List, WordIndexer, int, File, ConfigOptions)},
* and then reads the resulting file. Since the temp file can be quite large,
* it is important that the temp directory used by Java
* (<code>java.io.tmpdir</code>) has sufficient space.
*
* @param <W>
* @param files
* @param wordIndexer
* @param lmOrder
* @param opts
* @return
*/
public static <W> ContextEncodedProbBackoffLm<W> readContextEncodedKneserNeyLmFromTextFile(final List<String> files, final WordIndexer<W> wordIndexer,
final int lmOrder, final ConfigOptions opts) {
final File tmpFile = getTempFile();
return readContextEncodedKneserNeyLmFromTextFile(files, wordIndexer, lmOrder, opts, tmpFile);
}
/**
* Builds an array-encoded LM from raw text. This call first builds and
* writes a (temporary) ARPA file by calling
* {@link #createKneserNeyLmFromTextFiles(List, WordIndexer, int, File, ConfigOptions)},
* and then reads the resulting file. Since the temp file can be quite large,
* it is important that the temp directory used by Java
* (<code>java.io.tmpdir</code>) has sufficient space.
*
* @param <W>
* @param files
* @param wordIndexer
* @param lmOrder
* @param opts
* @return
*/
public static <W> ArrayEncodedProbBackoffLm<W> readKneserNeyLmFromTextFile(final List<String> files, final WordIndexer<W> wordIndexer, final int lmOrder,
final ConfigOptions opts, final boolean compress) {
final File tmpFile = getTempFile();
return readKneserNeyLmFromTextFile(files, wordIndexer, lmOrder, compress, opts, tmpFile);
}
public static <W> ContextEncodedProbBackoffLm<W> readContextEncodedKneserNeyLmFromTextFile(final List<String> files, final WordIndexer<W> wordIndexer,
final int lmOrder, final ConfigOptions opts, final File tmpFile) {
createKneserNeyLmFromTextFiles(files, wordIndexer, lmOrder, tmpFile, opts);
return readContextEncodedLmFromArpa(tmpFile.getPath(), wordIndexer, opts, lmOrder);
}
public static <W> ArrayEncodedProbBackoffLm<W> readKneserNeyLmFromTextFile(final List<String> files, final WordIndexer<W> wordIndexer, final int lmOrder,
final boolean compress, final ConfigOptions opts, final File tmpFile) {
createKneserNeyLmFromTextFiles(files, wordIndexer, lmOrder, tmpFile, opts);
return readArrayEncodedLmFromArpa(tmpFile.getPath(), compress, wordIndexer, opts, lmOrder);
}
/**
* Estimates a Kneser-Ney language model from raw text, and writes a file
* (in ARPA format). Probabilities are in log base 10 to match SRILM.
*
* @param <W>
* @param files
* Files of raw text (new-line separated).
* @param wordIndexer
* @param lmOrder
* @param arpaOutputFile
* @param opts
*/
public static <W> void createKneserNeyLmFromTextFiles(final List<String> files, final WordIndexer<W> wordIndexer, final int lmOrder,
final File arpaOutputFile, final ConfigOptions opts) {
final TextReader<W> reader = new TextReader<W>(files, wordIndexer);
KneserNeyLmReaderCallback<W> kneserNeyReader = new KneserNeyLmReaderCallback<W>(wordIndexer, lmOrder, opts);
reader.parse(kneserNeyReader);
kneserNeyReader.parse(new KneserNeyFileWritingLmReaderCallback<W>(arpaOutputFile, wordIndexer));
}
public static StupidBackoffLm<String> readGoogleLmBinary(final String file, final String sortedVocabFile) {
return readGoogleLmBinary(file, new StringWordIndexer(), sortedVocabFile);
}
/**
* Reads in a pre-built Google n-gram binary. The user must supply the
* <code>vocab_cs.gz</code> vocabulary file separately, since the binary does
* not store the vocabulary (and so the corpus cannot be reproduced unless the
* user already has the rights to do so).
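* <p>
* A brief usage sketch (the binary and vocabulary paths are hypothetical):
* <pre>{@code
* StupidBackoffLm<String> lm = LmReaders.readGoogleLmBinary("google.binary", "/path/to/vocab_cs.gz");
* }</pre>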
*
* @param <W>
* @param file
* The binary
* @param wordIndexer
* @param sortedVocabFile
* the <code>vocab_cs.gz</code> vocabulary file.
* @return
*/
public static <W> StupidBackoffLm<W> readGoogleLmBinary(final String file, final WordIndexer<W> wordIndexer, final String sortedVocabFile) {
GoogleLmReader.addToIndexer(wordIndexer, sortedVocabFile);
wordIndexer.trimAndLock();
@SuppressWarnings("unchecked")
final NgramMap<LongRef> map = (NgramMap<LongRef>) IOUtils.readObjFileHard(file);
return new StupidBackoffLm<W>(map.getMaxNgramOrder(), wordIndexer, map, new ConfigOptions());
}
/**
* Reads a binary file representing an LM. The returned model will need to be
* cast to either {@link ContextEncodedNgramLanguageModel} or
* {@link ArrayEncodedNgramLanguageModel} to be useful.
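* <p>
* For example (the path is hypothetical; the cast must match the encoding with
* which the stored LM was built):
* <pre>{@code
* NgramLanguageModel<String> lm = LmReaders.readLmBinary("lm.binary");
* ArrayEncodedNgramLanguageModel<String> arrayLm = (ArrayEncodedNgramLanguageModel<String>) lm;
* }</pre>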
*/
public static <W> NgramLanguageModel<W> readLmBinary(final String file) {
@SuppressWarnings("unchecked")
final NgramLanguageModel<W> lm = (NgramLanguageModel<W>) IOUtils.readObjFileHard(file);
return lm;
}
/**
* Writes a binary file representing the LM using the built-in
* serialization. These binaries should load much faster than ARPA files.
*
* @param <W>
* @param lm
* @param file
*/
public static <W> void writeLmBinary(final NgramLanguageModel<W> lm, final String file) {
IOUtils.writeObjFileHard(file, lm);
}
/**
* Creates a temporary file for the intermediate ARPA output. The file is
* deleted on JVM exit.
*/
private static File getTempFile() {
try {
final File tmpFile = File.createTempFile("berkeleylm", "arpa");
tmpFile.deleteOnExit();
return tmpFile;
} catch (final IOException e) {
throw new RuntimeException(e);
}
}
/**
* Second pass actually builds the lm.
*
* @param <W>
* @param opts
* @param lmFile
* @param wordIndexer
* @param valueAddingCallback
* @param numNgramsForEachWord
* @return
*/
private static <W> ContextEncodedProbBackoffLm<W> secondPassContextEncoded(final ConfigOptions opts,
final LmReader<ProbBackoffPair, ArpaLmReaderCallback<ProbBackoffPair>> lmFile, final WordIndexer<W> wordIndexer,
final FirstPassCallback<ProbBackoffPair> valueAddingCallback, final LongArray[] numNgramsForEachWord) {
final boolean contextEncoded = true;
final boolean reversed = false;
final boolean compress = false;
final NgramMap<ProbBackoffPair> map = buildMapArpa(opts, lmFile, wordIndexer, valueAddingCallback, numNgramsForEachWord, contextEncoded, reversed,
compress);
return new ContextEncodedProbBackoffLm<W>(map.getMaxNgramOrder(), wordIndexer, (ContextEncodedNgramMap<ProbBackoffPair>) map, opts);
}
/**
* Second pass actually builds the lm.
*
* @param <W>
* @param opts
* @param lmReader
* @param wordIndexer
* @param valueAddingCallback
* @param numNgramsForEachWord
* @param reversed
* @param compress
* @return
*/
private static <W> ArrayEncodedProbBackoffLm<W> secondPassArrayEncoded(final ConfigOptions opts,
final LmReader<ProbBackoffPair, ArpaLmReaderCallback<ProbBackoffPair>> lmReader, final WordIndexer<W> wordIndexer,
final FirstPassCallback<ProbBackoffPair> valueAddingCallback, final LongArray[] numNgramsForEachWord, final boolean reversed, final boolean compress) {
final boolean contextEncoded = false;
final NgramMap<ProbBackoffPair> map = buildMapArpa(opts, lmReader, wordIndexer, valueAddingCallback, numNgramsForEachWord, contextEncoded, reversed,
compress);
return new ArrayEncodedProbBackoffLm<W>(map.getMaxNgramOrder(), wordIndexer, map, opts);
}
private static <W> StupidBackoffLm<W> secondPassGoogle(final ConfigOptions opts, final LmReader<LongRef, NgramOrderedLmReaderCallback<LongRef>> lmReader,
final WordIndexer<W> wordIndexer, final FirstPassCallback<LongRef> valueAddingCallback, final LongArray[] numNgramsForEachWord, final boolean compress) {
final boolean contextEncoded = false;
final boolean reversed = true;
final CountValueContainer values = new CountValueContainer(valueAddingCallback.getValueCounter(), opts.valueRadix, contextEncoded,
new long[numNgramsForEachWord.length]);
final NgramMap<LongRef> map = buildMapCommon(opts, wordIndexer, numNgramsForEachWord, valueAddingCallback.getNumNgramsForEachOrder(), reversed,
lmReader, values, compress);
return new StupidBackoffLm<W>(numNgramsForEachWord.length, wordIndexer, map, opts);
}
/**
* @param <W>
* @param opts
* @param lmReader
* @param wordIndexer
* @param valueAddingCallback
* @param numNgramsForEachWord
* @param contextEncoded
* @param reversed
* @param compress
* @return
*/
private static <W> NgramMap<ProbBackoffPair> buildMapArpa(final ConfigOptions opts,
final LmReader<ProbBackoffPair, ArpaLmReaderCallback<ProbBackoffPair>> lmReader, final WordIndexer<W> wordIndexer,
final FirstPassCallback<ProbBackoffPair> valueAddingCallback, final LongArray[] numNgramsForEachWord, final boolean contextEncoded,
final boolean reversed, final boolean compress) {
if (contextEncoded && compress) throw new RuntimeException("Compression is not supported by context-encoded LMs");
final ValueContainer<ProbBackoffPair> values = compress ? new CompressibleProbBackoffValueContainer(valueAddingCallback.getValueCounter(),
opts.valueRadix, contextEncoded, valueAddingCallback.getNumNgramsForEachOrder())
: opts.storeRankedProbBackoffs ? new UncompressedProbBackoffValueContainer(valueAddingCallback.getValueCounter(), opts.valueRadix, contextEncoded,
valueAddingCallback.getNumNgramsForEachOrder())
: new UnrankedUncompressedProbBackoffValueContainer(contextEncoded, valueAddingCallback.getNumNgramsForEachOrder());
final NgramMap<ProbBackoffPair> map = buildMapCommon(opts, wordIndexer, numNgramsForEachWord, valueAddingCallback.getNumNgramsForEachOrder(), reversed,
lmReader, values, compress);
return map;
}
/**
* @param <W>
* @param <V>
* @param opts
* @param wordIndexer
* @param numNgramsForEachWord
* @param numNgramsForEachOrder
* @param reversed
* @param lmReader
* @param values
* @param compress
* @return
*/
private static <W, V extends Comparable<V>> NgramMap<V> buildMapCommon(final ConfigOptions opts, final WordIndexer<W> wordIndexer,
final LongArray[] numNgramsForEachWord, final long[] numNgramsForEachOrder, final boolean reversed,
final LmReader<V, ? super NgramMapAddingCallback<V>> lmReader, final ValueContainer<V> values, final boolean compress) {
Logger.startTrack("Adding n-grams");
NgramMap<V> map = createNgramMap(opts, numNgramsForEachWord, numNgramsForEachOrder, reversed, values, compress);
final List<int[]> failures = tryBuildingNgramMap(opts, wordIndexer, lmReader, map);
Logger.endTrack();
if (!failures.isEmpty()) {
Logger.startTrack(failures.size() + " missing suffixes or prefixes were found, doing another pass to add n-grams");
for (final int[] failure : failures) {
final int ngramOrder = failure.length - 1;
final int headWord = failure[reversed ? 0 : ngramOrder];
numNgramsForEachOrder[ngramOrder]++;
numNgramsForEachWord[ngramOrder].incrementCount(headWord, 1);
}
// try to clear some memory
for (int ngramOrder = 0; ngramOrder < numNgramsForEachOrder.length; ++ngramOrder) {
values.clearStorageForOrder(ngramOrder);
}
final ValueContainer<V> newValues = values.createFreshValues(numNgramsForEachOrder);
map.clearStorage();
map = createNgramMap(opts, numNgramsForEachWord, numNgramsForEachOrder, reversed, newValues, compress);
lmReader.parse(new NgramMapAddingCallback<V>(map, failures));
Logger.endTrack();
}
return map;
}
/**
* @param <V>
* @param <W>
* @param opts
* @param wordIndexer
* @param lmReader
* @param map
* @return
*/
private static <V, W> List<int[]> tryBuildingNgramMap(final ConfigOptions opts, final WordIndexer<W> wordIndexer,
final LmReader<V, ? super NgramMapAddingCallback<V>> lmReader, NgramMap<V> map) {
final NgramMapAddingCallback<V> ngramMapAddingCallback = new NgramMapAddingCallback<V>(map, null);
lmReader.parse(ngramMapAddingCallback);
if (opts.lockIndexer) wordIndexer.trimAndLock();
final List<int[]> failures = ngramMapAddingCallback.getFailures();
return failures;
}
/**
* @param <V>
* @param opts
* @param numNgramsForEachWord
* @param numNgramsForEachOrder
* @param reversed
* @param values
* @param compress
* @return
*/
private static <V> AbstractNgramMap<V> createNgramMap(final ConfigOptions opts, final LongArray[] numNgramsForEachWord, final long[] numNgramsForEachOrder,
final boolean reversed, final ValueContainer<V> values, final boolean compress) {
return compress ? new CompressedNgramMap<V>((CompressibleValueContainer<V>) values, numNgramsForEachOrder, opts) : HashNgramMap
.createImplicitWordHashNgramMap(values, opts, numNgramsForEachWord, reversed);
}
private static FirstPassCallback<ProbBackoffPair> firstPassArpa(final LmReader<ProbBackoffPair, ArpaLmReaderCallback<ProbBackoffPair>> arpaLmReader,
final boolean reverse) {
final FirstPassCallback<ProbBackoffPair> valueAddingCallback = firstPassCommon(arpaLmReader, reverse);
return valueAddingCallback;
}
private static <W> FirstPassCallback<LongRef> firstPassGoogle(final String rootDir, final WordIndexer<W> wordIndexer, final ConfigOptions opts) {
final GoogleLmReader<W> arpaLmReader = new GoogleLmReader<W>(rootDir, wordIndexer, opts);
final boolean reverse = true;
final FirstPassCallback<LongRef> valueAddingCallback = firstPassCommon(arpaLmReader, reverse);
return valueAddingCallback;
}
/**
* First pass over the file collects some statistics which help with memory
* allocation
*
* @param <W>
* @param arpaLmReader
* @return
*/
private static <V extends LongRepresentable<V>> FirstPassCallback<V> firstPassCommon(final LmReader<V, ? super FirstPassCallback<V>> arpaLmReader,
final boolean reverse) {
Logger.startTrack("Counting values");
final FirstPassCallback<V> valueAddingCallback = new FirstPassCallback<V>(reverse);
arpaLmReader.parse(valueAddingCallback);
Logger.endTrack();
return valueAddingCallback;
}
}