package edu.berkeley.nlp.lm; import java.util.List; import edu.berkeley.nlp.lm.collections.BoundedList; import edu.berkeley.nlp.lm.util.Annotations.OutputParameter; /** * Interface for language models which expose the internal context-encoding for * more efficient queries. (Note: language model implementations may internally * use a context-encoding without implementing this interface). A * context-encoding encodes an n-gram as a integer representing the last word, * and an offset which serves as a logical pointer to the (n-1) prefix words. * The integers represent words of type <code>W</code> in the vocabulary, and the mapping * from the vocabulary to integers is managed by an instance of the {@link WordIndexer} class. * * @author adampauls * * @param <W> */ public interface ContextEncodedNgramLanguageModel<W> extends NgramLanguageModel<W> { /** * Simple class for returning context offsets * * @author adampauls * */ public static class LmContextInfo { /** * Offset of context (prefix) of an n-gram */ public long offset = -1L; /** * The (0-based) length of <code>context</code> (i.e. * <code>order == 0</code> iff <code>context</code> refers to a * unigram). * * Use -1 for an empty context. */ public int order = -1; } /** * Get the score for an n-gram, and also get the context offset of the * n-gram's suffix. * * @param contextOffset * Offset of context (prefix) of an n-gram * @param contextOrder * The (0-based) length of <code>context</code> (i.e. * <code>order == 0</code> iff <code>context</code> refers to a * unigram). * @param word * Last word of the n-gram * @param outputContext * Offset of the suffix of the input n-gram. If the parameter is * <code>null</code> it will be ignored. This can be passed to * future queries for efficient access. * @return */ public float getLogProb(long contextOffset, int contextOrder, int word, @OutputParameter LmContextInfo outputContext); /** * Gets the offset which refers to an n-gram. If the n-gram is not in the * model, then it returns the shortest suffix of the n-gram which is. This * operation is not necessarily fast. * */ public LmContextInfo getOffsetForNgram(int[] ngram, int startPos, int endPos); /** * Gets the n-gram referred to by a context-encoding. This operation is not * necessarily fast. * */ public int[] getNgramForOffset(long contextOffset, int contextOrder, int word); public static class DefaultImplementations { public static <T> float scoreSentence(final List<T> sentence, final ContextEncodedNgramLanguageModel<T> lm) { final List<T> sentenceWithBounds = new BoundedList<T>(sentence, lm.getWordIndexer().getStartSymbol(), lm.getWordIndexer().getEndSymbol()); final int lmOrder = lm.getLmOrder(); float sentenceScore = 0.0f; for (int i = 1; i < lmOrder - 1 && i <= sentenceWithBounds.size() + 1; ++i) { final List<T> ngram = sentenceWithBounds.subList(-1, i); final float scoreNgram = lm.getLogProb(ngram); sentenceScore += scoreNgram; } for (int i = lmOrder - 1; i < sentenceWithBounds.size() + 2; ++i) { final List<T> ngram = sentenceWithBounds.subList(i - lmOrder, i); final float scoreNgram = lm.getLogProb(ngram); sentenceScore += scoreNgram; } return sentenceScore; } public static <T> float getLogProb(final List<T> ngram, final ContextEncodedNgramLanguageModel<T> lm) { final LmContextInfo contextOutput = new LmContextInfo(); final WordIndexer<T> wordIndexer = lm.getWordIndexer(); float score = Float.NaN; for (int i = 0; i < ngram.size(); ++i) { score = lm.getLogProb(contextOutput.offset, contextOutput.order, wordIndexer.getIndexPossiblyUnk(ngram.get(i)), contextOutput); } return score; } } }