package edu.berkeley.nlp.lm; import java.io.Serializable; import edu.berkeley.nlp.lm.map.ContextEncodedNgramMap; import edu.berkeley.nlp.lm.map.HashNgramMap; import edu.berkeley.nlp.lm.map.NgramMap; import edu.berkeley.nlp.lm.util.Annotations.OutputParameter; import edu.berkeley.nlp.lm.util.Logger; import edu.berkeley.nlp.lm.values.ProbBackoffPair; import edu.berkeley.nlp.lm.values.ProbBackoffValueContainer; import edu.berkeley.nlp.lm.values.UncompressedProbBackoffValueContainer; /** * Language model implementation which uses Kneser-Ney style backoff * computation. * * @author adampauls * * @param <W> */ public class ContextEncodedProbBackoffLm<W> extends AbstractContextEncodedNgramLanguageModel<W> implements ContextEncodedNgramLanguageModel<W>, Serializable { /** * */ private static final long serialVersionUID = 1L; private final HashNgramMap<ProbBackoffPair> map; private final ProbBackoffValueContainer values; private final long numWords; public ContextEncodedProbBackoffLm(final int lmOrder, final WordIndexer<W> wordIndexer, final ContextEncodedNgramMap<ProbBackoffPair> map, final ConfigOptions opts) { super(lmOrder, wordIndexer, (float) opts.unknownWordLogProb); this.map = (HashNgramMap<ProbBackoffPair>) map; this.values = (ProbBackoffValueContainer) map.getValues(); numWords = map.getNumNgrams(0); } /* * (non-Javadoc) * * @see * edu.berkeley.nlp.lm.AbstractContextEncodedNgramLanguageModel#getLogProb * (long, int, int, * edu.berkeley.nlp.lm.ContextEncodedNgramLanguageModel.LmContextInfo) */ @Override public float getLogProb(final long contextOffset, final int contextOrder, final int word, @OutputParameter final LmContextInfo outputContext) { if (word < 0 || word >= numWords) { return oovReturn(outputContext); } final HashNgramMap<ProbBackoffPair> localMap = map; long longestOffset = -2; int longestOrder = -2; float backoffSum = 0.0f; long currContextOffset = contextOffset; for (int currContextOrder = contextOrder; currContextOrder >= 0; --currContextOrder) { final int ngramOrder = currContextOrder + 1; final long offset = localMap.getOffset(currContextOffset, currContextOrder, word); if (offset >= 0) { if (longestOffset == -2) { longestOffset = offset; longestOrder = ngramOrder; } final float prob = values.getProb(ngramOrder, offset); if (!Float.isNaN(prob)) { setOutputContext(outputContext, longestOffset, longestOrder); return backoffSum + prob; } } final float backOff = values.getBackoff(currContextOrder, currContextOffset); backoffSum += (Float.isNaN(backOff) ? 0.0f : backOff); if (currContextOrder > 0) currContextOffset = values.getSuffixOffset(currContextOffset, currContextOrder); } // do unigram final long offset = word; final int ngramOrder = 0; final float prob = values.getProb(ngramOrder, offset); if (Float.isNaN(prob)) return oovReturn(outputContext); setOutputContext(outputContext, longestOffset == -2 ? offset : longestOffset, longestOffset == -2 ? ngramOrder : longestOrder); return backoffSum + prob; } /** * @param outputContext * @return */ private float oovReturn(final LmContextInfo outputContext) { if (outputContext != null) { outputContext.offset = 0; outputContext.order = -1; } return oovWordLogProb; } /* * (non-Javadoc) * * @see edu.berkeley.nlp.lm.AbstractContextEncodedNgramLanguageModel# * getOffsetForNgram(int[], int, int) */ @Override public LmContextInfo getOffsetForNgram(final int[] ngram, final int startPos, final int endPos) { return map.getOffsetForNgram(ngram, startPos, endPos); } /* * (non-Javadoc) * * @see edu.berkeley.nlp.lm.AbstractContextEncodedNgramLanguageModel# * getNgramForOffset(long, int, int) */ @Override public int[] getNgramForOffset(final long contextOffset, final int contextOrder, final int word) { return map.getNgramFromContextEncoding(contextOffset, contextOrder, word); } public NgramMap<ProbBackoffPair> getNgramMap() { return map; } private void setOutputContext(final LmContextInfo outputContext, final long offset, final int ngramOrder) { if (outputContext != null) { if (ngramOrder == lmOrder - 1) { final long suffixOffset = values.getSuffixOffset(offset, ngramOrder); outputContext.offset = suffixOffset; outputContext.order = ngramOrder - 1; } else { outputContext.offset = offset; outputContext.order = ngramOrder; } } } }