package edu.berkeley.nlp.lm;

import java.io.Serializable;

import edu.berkeley.nlp.lm.io.KneserNeyLmReaderCallback;
import edu.berkeley.nlp.lm.util.Annotations.Option;

/**
 * Stores some configuration options, with useful defaults.
 * 
 * @author adampauls
 * 
 */
public class ConfigOptions implements Serializable
{

	private static final long serialVersionUID = 1L;

	@Option(gloss = "Number of longs (8 bytes) used as a block for variable-length compression")
	public int compressedBlockSize = 16;

	@Option(gloss = "Parameter \"k\" which controls the base for variable-length compression of offset deltas")
	public int offsetDeltaRadix = 6;

	@Option(gloss = "Parameter \"k\" which controls the base for variable-length compression of value ranks")
	public int valueRadix = 6;

	@Option(gloss = "Fraction of the hash table array actually used for entries (lower means more memory but more speed)")
	public double hashTableLoadFactor = 1.0 / 1.5;

	@Option(gloss = "Probability returned when the last word of an n-gram is not in the vocabulary of the LM (this is *not* the probability of the <unk> tag)")
	public double unknownWordLogProb = -100.0;

	@Option(gloss = "Backoff constant used for stupid backoff")
	public double stupidBackoffAlpha = 0.4;

	@Option(gloss = "Discounts used in estimating Kneser-Ney language models (one for each order). If null, they are calculated automatically as c1/(c1 + 2*c2), where cn is the number of n-grams with count n.")
	public double[] kneserNeyDiscounts = null; //KneserNeyLmReaderCallback.defaultDiscounts();

	@Option(gloss = "Minimum token counts used in estimating Kneser-Ney language models (one for each order). Note that for internal reasons, these counts are *only* applied to the highest- and second-highest-order n-grams (for example, if you estimate a 5-gram language model, only 4- and 5-grams will be thresholded). Also, any n-gram orders beyond the length of this array are considered to have a minimum count of 0.")
	public double[] kneserNeyMinCounts = KneserNeyLmReaderCallback.defaultMinCounts();

	@Option(gloss = "Number of bits allocated for a word in a context encoding (the remaining bits of a long are used to encode an offset)")
	public int numWordBits = 26;

	@Option(gloss = "Whether to lock indexers after language model creation. This prevents the vocabulary from growing.")
	public boolean lockIndexer = true;

	@Option(gloss = "Number of bits to round the mantissa of floats to when reading from ARPA LM files. Note that the mantissa of a float is at most 24 bits long.")
	public static int roundBits = 24;

	@Option(gloss = "For (uncompressed) models that store probabilities and backoffs, store them by rank")
	public boolean storeRankedProbBackoffs = true;

	public ConfigOptions() {
	}

}
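
/*
 * Usage sketch: ConfigOptions is a plain field-based options holder, so callers construct it,
 * override the public fields they need, and pass it to an LM reader or builder. The fields
 * below are the ones defined in this class; the reader mentioned at the end is only an
 * assumed example of the surrounding API, not a confirmed signature.
 *
 *   ConfigOptions opts = new ConfigOptions();
 *   opts.stupidBackoffAlpha = 0.5;                            // backoff constant for stupid backoff
 *   opts.kneserNeyMinCounts = new double[] { 1, 1, 2, 2, 2 }; // per-order minimum token counts
 *   opts.unknownWordLogProb = -50.0;                          // log-prob for out-of-vocabulary last words
 *   // pass opts to a reader/builder in edu.berkeley.nlp.lm.io (e.g. an ARPA or Kneser-Ney reader)
 */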