/*
 * Copyright 2010 LIUM, based on Carnegie Mellon University previous work.
 * Portions Copyright 2002 Sun Microsystems, Inc.
 * Portions Copyright 2002 Mitsubishi Electric Research Laboratories.
 * Portions Copyright 2010 LIUM, University of Le Mans, France -> Yannick Esteve, Anthony Rousseau
 * All Rights Reserved. Use is subject to license terms.
 *
 * See the file "license.terms" for information on usage and
 * redistribution of this file, and for a DISCLAIMER OF ALL
 * WARRANTIES.
 */

package edu.cmu.sphinx.linguist.language.ngram.large;

import java.io.*;
import java.net.URL;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;

import edu.cmu.sphinx.linguist.WordSequence;
import edu.cmu.sphinx.linguist.dictionary.Dictionary;
import edu.cmu.sphinx.linguist.dictionary.Word;
import edu.cmu.sphinx.linguist.language.ngram.LanguageModel;
import edu.cmu.sphinx.linguist.util.LRUCache;
import edu.cmu.sphinx.util.LogMath;
import edu.cmu.sphinx.util.TimerPool;
import edu.cmu.sphinx.util.props.*;

/**
 * Language model that uses a binary N-gram language model file ("DMP file")
 * generated by the SphinxBase sphinx_lm_convert tool.
 */
public class LargeNGramModel implements LanguageModel {

    /**
     * The property for the name of the file that logs all the queried
     * N-grams. If this property is set to null, the queried N-grams are not
     * logged.
     */
    @S4String(mandatory = false)
    public static final String PROP_QUERY_LOG_FILE = "queryLogFile";

    /** The property that defines the maximum number of N-grams to be cached. */
    @S4Integer(defaultValue = 100000)
    public static final String PROP_NGRAM_CACHE_SIZE = "ngramCacheSize";

    /**
     * The property that controls whether the N-gram caches are cleared after
     * every utterance.
     */
    @S4Boolean(defaultValue = false)
    public static final String PROP_CLEAR_CACHES_AFTER_UTTERANCE = "clearCachesAfterUtterance";

    /** The property that defines the language weight for the search. */
    @S4Double(defaultValue = 1.0f)
    public final static String PROP_LANGUAGE_WEIGHT = "languageWeight";

    /**
     * The property that controls whether or not the language model will apply
     * the language weight and word insertion probability.
     */
    @S4Boolean(defaultValue = false)
    public final static String PROP_APPLY_LANGUAGE_WEIGHT_AND_WIP = "applyLanguageWeightAndWip";

    /** Word insertion probability property. */
    @S4Double(defaultValue = 1.0f)
    public final static String PROP_WORD_INSERTION_PROBABILITY = "wordInsertionProbability";

    /** If true, use full bigram information to determine smear. */
    @S4Boolean(defaultValue = false)
    public final static String PROP_FULL_SMEAR = "fullSmear";
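
    /*
     * Note on the two record-size constants below: judging by the accessors
     * used in this class (getWordID, getProbabilityID, getBackoffID,
     * getFirstNPlus1GramEntry), they count *fields* per N-gram record rather
     * than bytes. A record of a non-maximal order holds four fields (word ID,
     * probability ID, backoff ID, first follower index); a record of the
     * maximal order holds only two (word ID, probability ID). Byte sizes are
     * obtained by multiplying by loader.getBytesPerField(), which is 2 or 4
     * depending on whether the model is 16- or 32-bit (see is32bits()).
     */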
    /**
     * The number of fields per N-gram record in the LM file generated by the
     * CMU-Cambridge Statistical Language Modeling Toolkit (the byte size is
     * this value times the loader's bytes-per-field).
     */
    public static final int BYTES_PER_NGRAM = 4;
    public static final int BYTES_PER_NMAXGRAM = 2;

    private final static int SMEAR_MAGIC = 0xC0CAC01A; // things go better

    // ------------------------------
    // Configuration data
    // ------------------------------
    URL location;
    protected Logger logger;
    protected LogMath logMath;
    protected int maxDepth;
    protected int ngramCacheSize;
    protected boolean clearCacheAfterUtterance;
    protected boolean fullSmear;
    protected Dictionary dictionary;
    protected String format;
    protected boolean applyLanguageWeightAndWip;
    protected float languageWeight;
    protected float unigramWeight;
    protected double wip;

    // ------------------------------
    // Statistics
    // ------------------------------
    private int ngramMisses;
    private int ngramHits;
    private int smearTermCount;
    protected String ngramLogFile;

    // ------------------------------
    // Subcomponents
    // ------------------------------
    private BinaryLoader loader;
    private PrintWriter logFile;

    // ------------------------------
    // Working data
    // ------------------------------
    private Map<Word, UnigramProbability> unigramIDMap;
    private Map<WordSequence, NGramBuffer>[] loadedNGramBuffers;
    private LRUCache<WordSequence, Float> ngramProbCache;
    private Map<Long, Float> bigramSmearMap;
    private NGramBuffer[] loadedBigramBuffers;
    private UnigramProbability[] unigrams;
    private int[][] ngramSegmentTable;
    private float[][] ngramProbTable;
    private float[][] ngramBackoffTable;
    private float[] unigramSmearTerm;

    public LargeNGramModel(String format, URL location, String ngramLogFile,
            int maxNGramCacheSize, boolean clearCacheAfterUtterance,
            int maxDepth, Dictionary dictionary,
            boolean applyLanguageWeightAndWip, float languageWeight,
            double wip, float unigramWeight, boolean fullSmear) {
        logger = Logger.getLogger(getClass().getName());
        this.format = format;
        this.location = location;
        this.ngramLogFile = ngramLogFile;
        this.ngramCacheSize = maxNGramCacheSize;
        this.clearCacheAfterUtterance = clearCacheAfterUtterance;
        this.maxDepth = maxDepth;
        logMath = LogMath.getLogMath();
        this.dictionary = dictionary;
        this.applyLanguageWeightAndWip = applyLanguageWeightAndWip;
        this.languageWeight = languageWeight;
        this.wip = wip;
        this.unigramWeight = unigramWeight;
        this.fullSmear = fullSmear;
    }

    public LargeNGramModel() {
    }
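
    /*
     * A minimal construction sketch. The file name, format string and
     * parameter values below are illustrative assumptions, not part of this
     * class; only the call sequence (constructor, allocate, getProbability)
     * is taken from the code itself:
     *
     *   URL lmLocation = new File("models/lm.dmp").toURI().toURL();
     *   LargeNGramModel lm = new LargeNGramModel("dmp", lmLocation, null,
     *           100000, false, 3, dictionary, false, 1.0f, 1.0, 1.0f, false);
     *   lm.allocate();
     *   float logProb = lm.getProbability(wordSequence); // logMath log base
     */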
    /*
     * (non-Javadoc)
     *
     * @see edu.cmu.sphinx.util.props.Configurable#newProperties(edu.cmu.sphinx.util.props.PropertySheet)
     */
    public void newProperties(PropertySheet ps) throws PropertyException {
        logger = ps.getLogger();
        location = ConfigurationManagerUtils.getResource(PROP_LOCATION, ps);
        ngramLogFile = ps.getString(PROP_QUERY_LOG_FILE);
        ngramCacheSize = ps.getInt(PROP_NGRAM_CACHE_SIZE);
        clearCacheAfterUtterance = ps.getBoolean(PROP_CLEAR_CACHES_AFTER_UTTERANCE);
        maxDepth = ps.getInt(LanguageModel.PROP_MAX_DEPTH);
        dictionary = (Dictionary) ps.getComponent(PROP_DICTIONARY);
        applyLanguageWeightAndWip = ps.getBoolean(PROP_APPLY_LANGUAGE_WEIGHT_AND_WIP);
        languageWeight = ps.getFloat(PROP_LANGUAGE_WEIGHT);
        wip = ps.getDouble(PROP_WORD_INSERTION_PROBABILITY);
        unigramWeight = ps.getFloat(PROP_UNIGRAM_WEIGHT);
        fullSmear = ps.getBoolean(PROP_FULL_SMEAR);
    }

    /*
     * (non-Javadoc)
     *
     * @see edu.cmu.sphinx.linguist.language.ngram.LanguageModel#allocate()
     */
    @SuppressWarnings("unchecked")
    public void allocate() throws IOException {
        TimerPool.getTimer(this, "Load LM").start();
        logger.info("Loading n-gram language model from: " + location);

        // create the log file if specified
        if (ngramLogFile != null)
            logFile = new PrintWriter(new FileOutputStream(ngramLogFile));

        if (location.getProtocol() == null
                || location.getProtocol().equals("file")) {
            try {
                loader = new BinaryLoader(new File(location.toURI()), format,
                        applyLanguageWeightAndWip, languageWeight, wip,
                        unigramWeight);
            } catch (Exception e) {
                loader = new BinaryLoader(new File(location.getPath()), format,
                        applyLanguageWeightAndWip, languageWeight, wip,
                        unigramWeight);
            }
        } else {
            loader = new BinaryStreamLoader(location, format,
                    applyLanguageWeightAndWip, languageWeight, wip,
                    unigramWeight);
        }

        unigramIDMap = new HashMap<Word, UnigramProbability>();
        unigrams = loader.getUnigrams();
        loadedNGramBuffers = new Map[loader.getMaxDepth()];
        ngramProbTable = new float[loader.getMaxDepth()][];
        ngramBackoffTable = new float[loader.getMaxDepth()][];
        ngramSegmentTable = new int[loader.getMaxDepth()][];

        for (int i = 1; i <= loader.getMaxDepth(); i++) {
            loadedNGramBuffers[i - 1] = new HashMap<WordSequence, NGramBuffer>();
            if (i >= 2)
                ngramProbTable[i - 1] = loader.getNGramProbabilities(i);
            if (i > 2) {
                ngramBackoffTable[i - 1] = loader.getNGramBackoffWeights(i);
                ngramSegmentTable[i - 1] = loader.getNGramSegments(i);
            }
        }

        ngramProbCache = new LRUCache<WordSequence, Float>(ngramCacheSize);

        if (dictionary != null)
            buildUnigramIDMap(dictionary);
        else
            buildUnigramIDMap();

        loadedBigramBuffers = new NGramBuffer[unigrams.length];

        if (maxDepth <= 0 || maxDepth > loader.getMaxDepth())
            maxDepth = loader.getMaxDepth();

        for (int i = 1; i <= loader.getMaxDepth(); i++)
            logger.info(Integer.toString(i) + "-grams: "
                    + loader.getNumberNGrams(i));

        if (fullSmear) {
            System.out.println("Full Smear");
            try {
                System.out.println("... Reading ...");
                readSmearInfo("smear.dat");
                System.out.println("... Done ");
            } catch (IOException e) {
                System.out.println("... " + e);
                System.out.println("... Calculating");
                buildSmearInfo();
                System.out.println("... Writing");
                // writeSmearInfo("smear.dat");
                System.out.println("... Done");
            }
        }

        TimerPool.getTimer(this, "Load LM").stop();
    }

    /*
     * (non-Javadoc)
     *
     * @see edu.cmu.sphinx.linguist.language.ngram.LanguageModel#deallocate()
     */
    public void deallocate() throws IOException {
        loader.deallocate();
    }
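
    /*
     * When configured through a sphinx4 XML configuration file instead of the
     * constructor, the PROP_* constants used in newProperties() map onto
     * entries along these lines (the component name, the location value and
     * the literal names inherited from the LanguageModel interface are
     * assumptions shown for illustration only):
     *
     *   <component name="ngramModel"
     *              type="edu.cmu.sphinx.linguist.language.ngram.large.LargeNGramModel">
     *       <property name="location" value="file:models/lm.dmp"/>
     *       <property name="dictionary" value="dictionary"/>
     *       <property name="maxDepth" value="3"/>
     *       <property name="ngramCacheSize" value="100000"/>
     *       <property name="languageWeight" value="1.0"/>
     *   </component>
     */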
    /**
     * Builds the map from unigram to UnigramProbability, warning about words
     * in the language model that are missing from the dictionary.
     *
     * @param dictionary the dictionary to look the words up in
     */
    private void buildUnigramIDMap(Dictionary dictionary) {
        int missingWords = 0;
        String[] words = loader.getWords();
        for (int i = 0; i < words.length; i++) {
            Word word = dictionary.getWord(words[i]);
            if (word == null) {
                logger.warning("The dictionary is missing a phonetic transcription for the word '"
                        + words[i] + "'");
                missingWords++;
            }
            unigramIDMap.put(word, unigrams[i]);
            if (logger.isLoggable(Level.FINE))
                logger.fine("Word: " + word);
        }
        if (missingWords > 0)
            logger.warning("Dictionary is missing " + missingWords
                    + " words that are contained in the language model.");
    }

    private void buildUnigramIDMap() {
        String[] words = loader.getWords();
        for (int i = 0; i < words.length; i++) {
            Word word = new Word(words[i], null, false);
            unigramIDMap.put(word, unigrams[i]);
        }
    }

    @Override
    public void onUtteranceEnd() {
        clearCache();
        if (logFile != null) {
            logFile.println("<END_UTT>");
            logFile.flush();
        }
    }

    /** Clears the various N-gram caches. */
    private void clearCache() {
        for (int i = 0; i < loadedBigramBuffers.length; i++) {
            NGramBuffer buffer = loadedBigramBuffers[i];
            if (buffer != null) {
                if (!buffer.getUsed())
                    loadedBigramBuffers[i] = null; // free the BigramBuffer
                else
                    buffer.setUsed(false);
            }
        }
        loadedBigramBuffers = new NGramBuffer[unigrams.length];
        for (int i = 2; i <= loader.getMaxDepth(); i++) {
            loadedNGramBuffers[i - 1] = new HashMap<WordSequence, NGramBuffer>();
        }
        logger.info("LM Cache Size: " + ngramProbCache.size() + " Hits: "
                + ngramHits + " Misses: " + ngramMisses);
        if (clearCacheAfterUtterance) {
            ngramProbCache = new LRUCache<WordSequence, Float>(ngramCacheSize);
        }
    }

    /**
     * Gets the N-gram probability of the word sequence represented by the
     * word list.
     *
     * @param wordSequence the word sequence
     * @return the probability of the word sequence, in logMath log base
     */
    public float getProbability(WordSequence wordSequence) {
        int numberWords = wordSequence.size();
        Float probability;
        if (numberWords > maxDepth) {
            throw new Error("Unsupported NGram: " + wordSequence.size());
        }
        if (numberWords == maxDepth) {
            probability = ngramProbCache.get(wordSequence);
            if (probability != null) {
                ngramHits++;
                return probability;
            }
            ngramMisses++;
        }
        probability = getNGramProbability(wordSequence);
        if (numberWords == maxDepth)
            ngramProbCache.put(wordSequence, probability);
        if (logFile != null && probability != null)
            logFile.println(wordSequence.toString().replace("][", " ")
                    + " : " + Float.toString(probability));
        return probability;
    }

    private Float getNGramProbability(WordSequence wordSequence) {
        int numberWords = wordSequence.size();
        Word firstWord = wordSequence.getWord(0);
        if (loader.getNumberNGrams(numberWords) == 0 || !hasUnigram(firstWord))
            return getNGramProbability(wordSequence.getNewest());
        if (numberWords < 2) {
            return getUnigramProbability(wordSequence);
        }
        NGramProbability nGProbability = findNGram(wordSequence);
        if (nGProbability != null) {
            return ngramProbTable[numberWords - 1][nGProbability.getProbabilityID()];
        }
        if (numberWords == 2) {
            // bigram not found: back off to the second word's unigram,
            // weighted by the first word's backoff weight
            UnigramProbability unigramProb = getUnigram(firstWord);
            UnigramProbability unigramProb1 = getUnigram(wordSequence.getWord(1));
            return unigramProb.getLogBackoff()
                    + unigramProb1.getLogProbability();
        }
        NGramProbability nMinus1Gram = findNGram(wordSequence.getOldest());
        if (nMinus1Gram != null) {
            return ngramBackoffTable[numberWords - 1][nMinus1Gram.getBackoffID()]
                    + getProbability(wordSequence.getNewest());
        }
        return getProbability(wordSequence.getNewest());
    }
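
    /*
     * The recursion above is Katz-style backoff carried out in log space.
     * For example (illustrative numbers), querying P(w3 | w1 w2) when the
     * trigram "w1 w2 w3" is absent but the bigram "w1 w2" exists yields
     *
     *     log P(w3 | w1 w2) = logBackoff(w1 w2) + log P(w3 | w2)
     *
     * and if "w1 w2" is absent as well, the query falls through to
     * log P(w3 | w2) directly. With logBackoff(w1 w2) = -0.3 and
     * log P(w3 | w2) = -1.2, the result is -1.5 (all values in the logMath
     * log base).
     */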
    /**
     * Finds or loads the N-gram probability of the given N-gram.
     *
     * @param wordSequence the N-gram to load
     * @return the NGramProbability of the given N-gram, or null if it is not
     *         in the model
     */
    private NGramProbability findNGram(WordSequence wordSequence) {
        int numberWords = wordSequence.size();
        NGramProbability nGram = null;
        WordSequence oldest = wordSequence.getOldest();
        NGramBuffer nGramBuffer = loadedNGramBuffers[numberWords - 1].get(oldest);
        if (nGramBuffer == null) {
            nGramBuffer = getNGramBuffer(oldest);
            if (nGramBuffer != null)
                loadedNGramBuffers[numberWords - 1].put(oldest, nGramBuffer);
        }
        if (nGramBuffer != null) {
            int nthWordID = getWordID(wordSequence.getWord(numberWords - 1));
            nGram = nGramBuffer.findNGram(nthWordID);
        }
        return nGram;
    }

    /**
     * Tells whether the model is 16 or 32 bits.
     *
     * @return true if 32 bits, false otherwise
     */
    private boolean is32bits() {
        return loader.getBytesPerField() == 4;
    }

    /**
     * Loads into a buffer all the N-gram followers of the given (N-1)-gram.
     *
     * @param ws the (N-1)-gram whose followers to find
     * @return an NGramBuffer of all the N-gram followers of the given
     *         sequence
     */
    private NGramBuffer loadNGramBuffer(WordSequence ws) {
        int firstWordID = getWordID(ws.getWord(0));
        int firstCurrentNGramEntry = 0;
        int numberNGrams = 0;
        int size = 0;
        long position = 0;
        int orderBuffer = ws.size() + 1;
        NGramBuffer currentBuffer = null;
        NGramBuffer nMinus1Buffer = null;

        firstCurrentNGramEntry = unigrams[firstWordID].getFirstBigramEntry();
        numberNGrams = getNumberBigramFollowers(firstWordID) + 1;
        if (numberNGrams == 1)
            // 1 means that there is no bigram starting with firstWordID
            return null;

        if (orderBuffer == 2) {
            size = numberNGrams
                    * ((loader.getMaxDepth() == orderBuffer) ? BYTES_PER_NMAXGRAM
                            : BYTES_PER_NGRAM) * loader.getBytesPerField();
            position = (loader.getNGramOffset(orderBuffer)
                    + (firstCurrentNGramEntry
                            * ((loader.getMaxDepth() == orderBuffer) ? BYTES_PER_NMAXGRAM
                                    : BYTES_PER_NGRAM) * loader.getBytesPerField()));
        } else { // only for ws.size() >= 2
            int lastWordId = getWordID(ws.getWord(ws.size() - 1));
            nMinus1Buffer = getNGramBuffer(ws.getOldest());
            int index = nMinus1Buffer.findNGramIndex(lastWordId);
            if (index == -1)
                return null;
            int firstNMinus1GramEntry = nMinus1Buffer.getFirstNGramEntry();
            firstCurrentNGramEntry = getFirstNGramEntry(
                    nMinus1Buffer.getNGramProbability(index),
                    firstNMinus1GramEntry, orderBuffer);
            int firstNextNGramEntry = getFirstNGramEntry(
                    nMinus1Buffer.getNGramProbability(index + 1),
                    firstNMinus1GramEntry, orderBuffer);
            numberNGrams = firstNextNGramEntry - firstCurrentNGramEntry;
            if (numberNGrams == 0)
                return null;
            if (loader.getMaxDepth() != orderBuffer)
                numberNGrams++;
            size = numberNGrams
                    * ((loader.getMaxDepth() == orderBuffer) ? BYTES_PER_NMAXGRAM
                            : BYTES_PER_NGRAM) * loader.getBytesPerField();
            position = loader.getNGramOffset(orderBuffer)
                    + (long) firstCurrentNGramEntry
                    * (long) ((loader.getMaxDepth() == orderBuffer) ? BYTES_PER_NMAXGRAM
                            : BYTES_PER_NGRAM) * loader.getBytesPerField();
        }

        try {
            byte[] buffer = loader.loadBuffer(position, size);
            if (loader.getMaxDepth() == orderBuffer) {
                currentBuffer = new NMaxGramBuffer(buffer, numberNGrams,
                        loader.getBigEndian(), is32bits(), orderBuffer,
                        firstCurrentNGramEntry);
            } else {
                currentBuffer = new NGramBuffer(buffer, numberNGrams,
                        loader.getBigEndian(), is32bits(), orderBuffer,
                        firstCurrentNGramEntry);
            }
        } catch (IOException ioe) {
            ioe.printStackTrace();
            throw new Error("Error loading " + orderBuffer + "-Grams.");
        }
        return currentBuffer;
    }
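
    /*
     * Worked example of the size/position arithmetic above, with illustrative
     * numbers: in a 32-bit trigram model (bytesPerField = 4, maxDepth = 3), a
     * bigram record occupies BYTES_PER_NGRAM * 4 = 16 bytes, while a trigram
     * record, being of the maximal order, occupies BYTES_PER_NMAXGRAM * 4 = 8
     * bytes. If a unigram's followers start at bigram entry 5000 and it has
     * 12 followers, the read covers numberNGrams = 13 records (one extra to
     * delimit the last follower's trigram range), i.e. 13 * 16 = 208 bytes
     * starting at getNGramOffset(2) + 5000 * 16.
     */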
    /**
     * Returns the N-grams of the given word sequence.
     *
     * @param wordSequence the word sequence from which to get the buffer
     * @return the NGramBuffer of the word sequence
     */
    private NGramBuffer getNGramBuffer(WordSequence wordSequence) {
        NGramBuffer nGramBuffer = null;
        int order = wordSequence.size();
        if (order > 1)
            // better when using containsKey
            nGramBuffer = loadedNGramBuffers[order - 1].get(wordSequence);
        if (nGramBuffer == null) {
            // optimizable by adding an 'empty' nGramBuffer
            nGramBuffer = loadNGramBuffer(wordSequence);
            if (nGramBuffer != null)
                loadedNGramBuffers[order - 1].put(wordSequence, nGramBuffer);
        }
        return nGramBuffer;
    }

    /**
     * Returns the index of the first N-gram entry of the given (N-1)-gram.
     *
     * @param nMinus1Gram the (N-1)-gram whose first N-gram entry we are
     *            looking for
     * @param firstNMinus1GramEntry the index of the first (N-1)-gram entry of
     *            the (N-1)-gram in question
     * @param n the order of the N-gram
     * @return the index of the first N-gram entry of the given (N-1)-gram
     */
    private int getFirstNGramEntry(NGramProbability nMinus1Gram,
            int firstNMinus1GramEntry, int n) {
        int firstNGramEntry = ngramSegmentTable[n - 1][(firstNMinus1GramEntry
                + nMinus1Gram.getWhichFollower()) >> loader.getLogNGramSegmentSize()]
                + nMinus1Gram.getFirstNPlus1GramEntry();
        return firstNGramEntry;
    }
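
    /*
     * The segment table exists because the per-record "first follower" field
     * is stored with fewer bits than a full index, so the absolute index is
     * reconstructed as base + offset: the (N-1)-gram's absolute position is
     * shifted right by the log segment size to select a segment base from
     * ngramSegmentTable, and the record's truncated firstNPlus1GramEntry is
     * added to it. For instance (illustrative numbers), with a segment size
     * of 512 (log size 9), entry position 5000 selects segment 5000 >> 9 = 9;
     * if that segment's base is 120000 and the stored offset is 37, the
     * first follower index is 120037.
     */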
    /**
     * Returns the unigram probability of the given unigram.
     *
     * @param wordSequence the unigram word sequence
     * @return the unigram probability
     */
    private float getUnigramProbability(WordSequence wordSequence) {
        Word unigram = wordSequence.getWord(0);
        UnigramProbability unigramProb = getUnigram(unigram);
        if (unigramProb == null)
            throw new Error("Unigram not in LM: " + unigram);
        return unigramProb.getLogProbability();
    }

    /**
     * Returns the UnigramProbability of the given unigram if this language
     * model has it.
     *
     * @param unigram the unigram to find
     * @return the UnigramProbability, or null if this language model does
     *         not have the unigram
     */
    private UnigramProbability getUnigram(Word unigram) {
        return unigramIDMap.get(unigram);
    }

    /**
     * Returns true if this language model has the given unigram.
     *
     * @param unigram the unigram to find
     * @return true if this LM has this unigram, false otherwise
     */
    private boolean hasUnigram(Word unigram) {
        return (unigramIDMap.get(unigram) != null);
    }

    /**
     * Returns the ID of the given word.
     *
     * @param word the word to find the ID of
     * @return the ID of the word
     */
    public final int getWordID(Word word) {
        UnigramProbability probability = getUnigram(word);
        if (probability == null)
            throw new IllegalArgumentException("No word ID: " + word);
        else
            return probability.getWordID();
    }

    /**
     * Returns true if the language model contains the given word.
     *
     * @param w the word
     * @return true if the word is in the language model
     */
    public boolean hasWord(Word w) {
        return (unigramIDMap.get(new Word(w.toString(), null, false)) != null);
    }

    /**
     * Gets the smear term for the given word sequence.
     *
     * @param wordSequence the word sequence
     * @return the smear term associated with this word sequence
     */
    public float getSmearOld(WordSequence wordSequence) {
        float smearTerm = 0.0f;
        if (fullSmear) {
            int length = wordSequence.size();
            if (length > 0) {
                int wordID = getWordID(wordSequence.getWord(length - 1));
                smearTerm = unigramSmearTerm[wordID];
            }
        }
        if (fullSmear && logger.isLoggable(Level.FINE))
            logger.fine("SmearTerm: " + smearTerm);
        return smearTerm;
    }

    int smearCount;
    int smearBigramHit;

    public float getSmear(WordSequence wordSequence) {
        float smearTerm = 0.0f;
        if (fullSmear) {
            smearCount++;
            int length = wordSequence.size();
            if (length == 1) {
                int wordID = getWordID(wordSequence.getWord(0));
                smearTerm = unigramSmearTerm[wordID];
            } else if (length >= 2) {
                int size = wordSequence.size();
                int wordID1 = getWordID(wordSequence.getWord(size - 2));
                int wordID2 = getWordID(wordSequence.getWord(size - 1));
                Float st = getSmearTerm(wordID1, wordID2);
                if (st == null)
                    smearTerm = unigramSmearTerm[wordID2];
                else {
                    smearTerm = st;
                    smearBigramHit++;
                }
            }
            if (smearCount % 100000 == 0)
                System.out.println("Smear hit: " + smearBigramHit + " tot: "
                        + smearCount);
        }
        if (fullSmear && logger.isLoggable(Level.FINE))
            logger.fine("SmearTerm: " + smearTerm);
        return smearTerm;
    }

    /**
     * Returns the number of bigram followers of a word. The followers of
     * unigram i occupy the half-open range of bigram entries
     * [firstBigramEntry(i), firstBigramEntry(i + 1)).
     *
     * @param wordID the ID of the word
     * @return the number of bigram followers
     */
    private int getNumberBigramFollowers(int wordID) {
        if (wordID == unigrams.length - 1)
            return 0;
        else
            return unigrams[wordID + 1].getFirstBigramEntry()
                    - unigrams[wordID].getFirstBigramEntry();
    }

    /**
     * Returns the maximum depth of the language model.
     *
     * @return the maximum depth of the language model
     */
    public int getMaxDepth() {
        return maxDepth;
    }

    /**
     * Returns the set of words in the language model. The set is
     * unmodifiable.
     *
     * @return the unmodifiable set of words
     */
    public Set<String> getVocabulary() {
        Set<String> vocabulary = new HashSet<String>(Arrays.asList(loader.getWords()));
        return Collections.unmodifiableSet(vocabulary);
    }

    /**
     * Returns the number of times an N-gram was queried but absent from the
     * LM (in which case the backoff probabilities are used).
     *
     * @return the number of N-gram misses
     */
    public int getNGramMisses() {
        return ngramMisses;
    }

    /**
     * Returns the number of N-gram hits.
     *
     * @return the number of N-gram hits
     */
    public int getNGramHits() {
        return ngramHits;
    }

    /**
     * Returns the bigrams of the given word.
     *
     * @param firstWordID the ID of the word
     * @return the bigrams of the word
     */
    private NGramBuffer getBigramBuffer(int firstWordID) {
        Word[] wd = new Word[1];
        wd[0] = dictionary.getWord(loader.getWords()[firstWordID]);
        WordSequence ws = new WordSequence(wd);
        return loadNGramBuffer(ws);
    }
    /**
     * Loads into a buffer all the trigram followers of the given bigram.
     *
     * @param firstWordID the ID of the first word
     * @param secondWordID the ID of the second word
     * @return a TrigramBuffer of all the trigram followers of the given two
     *         words
     */
    private NGramBuffer loadTrigramBuffer(int firstWordID, int secondWordID) {
        Word[] wd = new Word[2];
        wd[0] = dictionary.getWord(loader.getWords()[firstWordID]);
        wd[1] = dictionary.getWord(loader.getWords()[secondWordID]);
        WordSequence ws = new WordSequence(wd);
        return loadNGramBuffer(ws);
    }

    private void buildSmearInfo() throws IOException {
        double S0 = 0;
        double R0 = 0;
        bigramSmearMap = new HashMap<Long, Float>();
        double[] ugNumerator = new double[unigrams.length];
        double[] ugDenominator = new double[unigrams.length];
        double[] ugAvgLogProb = new double[unigrams.length];
        unigramSmearTerm = new float[unigrams.length];

        for (UnigramProbability unigram : unigrams) {
            float logp = unigram.getLogProbability();
            double p = logMath.logToLinear(logp);
            S0 += p * logp;
            R0 += p * logp * logp;
        }
        System.out.println("R0 S0 " + R0 + ' ' + S0);

        for (int i = 0; i < loadedBigramBuffers.length; i++) {
            NGramBuffer bigram = getBigramBuffer(i);
            if (bigram == null) {
                unigramSmearTerm[i] = LogMath.LOG_ONE;
                continue;
            }
            ugNumerator[i] = 0.0;
            ugDenominator[i] = 0.0;
            ugAvgLogProb[i] = 0.0;
            float logugbackoff = unigrams[i].getLogBackoff();
            double ugbackoff = logMath.logToLinear(logugbackoff);

            for (int j = 0; j < bigram.getNumberNGrams(); j++) {
                int wordID = bigram.getWordID(j);
                NGramProbability bgProb = bigram.getNGramProbability(j);
                float logugprob = unigrams[wordID].getLogProbability();
                float logbgprob = ngramProbTable[1][bgProb.getProbabilityID()];
                double ugprob = logMath.logToLinear(logugprob);
                double bgprob = logMath.logToLinear(logbgprob);
                double backoffbgprob = ugbackoff * ugprob;
                double logbackoffbgprob = logMath.linearToLog(backoffbgprob);

                ugNumerator[i] += (bgprob * logbgprob - backoffbgprob
                        * logbackoffbgprob) * logugprob;
                ugDenominator[i] += (bgprob - backoffbgprob) * logugprob;
                // dumpProbs(ugNumerator, ugDenominator, i, j, logugprob,
                //         logbgprob, ugprob, bgprob, backoffbgprob,
                //         logbackoffbgprob);
            }
            ugNumerator[i] += ugbackoff * (logugbackoff * S0 + R0);
            ugAvgLogProb[i] = ugDenominator[i] + ugbackoff * S0;
            ugDenominator[i] += ugbackoff * R0;
            // System.out.println("n/d " + ugNumerator[i] + " " + ugDenominator[i]);
            unigramSmearTerm[i] = (float) (ugNumerator[i] / ugDenominator[i]);
            // unigramSmearTerm[i] = logMath.linearToLog(ugNumerator[i] / ugDenominator[i]);
            // System.out.println("ugs " + unigramSmearTerm[i]);
        }

        for (int i = 0; i < loadedBigramBuffers.length; i++) {
            System.out.println("Processed " + i + " of "
                    + loadedBigramBuffers.length);
            NGramBuffer bigram = getBigramBuffer(i);
            if (bigram == null)
                continue;
            for (int j = 0; j < bigram.getNumberNGrams(); j++) {
                float smearTerm;
                NGramProbability bgProb = bigram.getNGramProbability(j);
                float logbgbackoff = ngramBackoffTable[2][bgProb.getBackoffID()];
                double bgbackoff = logMath.logToLinear(logbgbackoff);
                int k = bigram.getWordID(j);
                NGramBuffer trigram = loadTrigramBuffer(i, k);

                if (trigram == null)
                    smearTerm = unigramSmearTerm[k];
                else {
                    double bg_numerator = 0;
                    double bg_denominator = 0;
                    for (int l = 0; l < trigram.getNumberNGrams(); l++) {
                        int m = trigram.getWordID(l);
                        float logtgprob = ngramProbTable[2][trigram.getProbabilityID(l)];
                        double tgprob = logMath.logToLinear(logtgprob);
                        float logbgprob = getBigramProb(k, m);
                        double bgprob = logMath.logToLinear(logbgprob);
                        float logugprob = unigrams[m].getLogProbability();
                        double backofftgprob = bgbackoff * bgprob;
                        double logbackofftgprob = logMath.linearToLog(backofftgprob);
                        bg_numerator += (tgprob * logtgprob - backofftgprob
                                * logbackofftgprob) * logugprob;
                        bg_denominator += (tgprob - backofftgprob) * logugprob
                                * logugprob;
                    }
                    bg_numerator += bgbackoff
                            * (logbgbackoff * ugAvgLogProb[k] - ugNumerator[k]);
                    bg_denominator += bgbackoff * ugDenominator[k];
                    // bigram.ugsmear = bg_numerator / bg_denominator;
                    smearTerm = (float) (bg_numerator / bg_denominator);
                    smearTermCount++;
                }
                putSmearTerm(i, k, smearTerm);
            }
        }
        System.out.println("Smear count is " + smearTermCount);
    }
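
    /*
     * Reading of buildSmearInfo() above, stated descriptively: for each
     * context word w, the unigram smear term is the ratio
     * ugNumerator / ugDenominator accumulated over the followers v of w,
     * comparing the true bigram score log P(v|w) against the backed-off
     * score log(backoff(w) * P(v)), with each follower weighted by its
     * linear probability mass. S0 and R0 aggregate sum(p * log p) and
     * sum(p * (log p)^2) over all unigrams, so followers not explicitly
     * listed as bigrams can be folded in through the backoff weight instead
     * of being enumerated. The second pass refines the estimate per bigram
     * context using trigram followers, falling back to the unigram term when
     * a bigram has no trigram followers.
     */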
    @SuppressWarnings("unused")
    private void dumpProbs(double[] ugNumerator, double[] ugDenominator,
            int i, int j, float logugprob, float logbgprob, double ugprob,
            double bgprob, double backoffbgprob, double logbackoffbgprob) {
        System.out.println("ubo " + ugprob + ' ' + bgprob + ' ' + backoffbgprob);
        System.out.println("logubo " + logugprob + ' ' + logbgprob + ' '
                + logbackoffbgprob);
        System.out.println("n/d " + j + ' ' + ugNumerator[i] + ' '
                + ugDenominator[i]);
        System.out.print(ugprob + " " + bgprob + ' ' + backoffbgprob);
        System.out.print(" " + logugprob + ' ' + logbgprob + ' '
                + logbackoffbgprob);
        System.out.println(" " + ugNumerator[i] + ' ' + ugDenominator[i]);
    }

    /**
     * Writes the smear info to the given file.
     *
     * @param filename the file to write the smear info to
     * @throws IOException if an error occurs on write
     */
    @SuppressWarnings("unused")
    private void writeSmearInfo(String filename) throws IOException {
        DataOutputStream out = new DataOutputStream(new FileOutputStream(filename));
        out.writeInt(SMEAR_MAGIC);
        System.out.println("writing " + unigrams.length);
        out.writeInt(unigrams.length);

        for (int i = 0; i < unigrams.length; i++)
            out.writeFloat(unigramSmearTerm[i]);

        for (int i = 0; i < unigrams.length; i++) {
            System.out.println("Writing " + i + " of " + unigrams.length);
            NGramBuffer bigram = getBigramBuffer(i);
            if (bigram == null) {
                out.writeInt(0);
                continue;
            }
            out.writeInt(bigram.getNumberNGrams());
            for (int j = 0; j < bigram.getNumberNGrams(); j++) {
                int k = bigram.getWordID(j);
                Float smearTerm = getSmearTerm(i, k);
                out.writeInt(k);
                out.writeFloat(smearTerm.floatValue());
            }
        }
        out.close();
    }

    /**
     * Reads the smear info from the given file.
     *
     * @param filename where to read the smear info from
     * @throws IOException if an inconsistent file is found or on any general
     *             I/O error
     */
    private void readSmearInfo(String filename) throws IOException {
        DataInputStream in = new DataInputStream(new FileInputStream(filename));

        if (in.readInt() != SMEAR_MAGIC) {
            in.close();
            throw new IOException("Bad smear format for " + filename);
        }
        if (in.readInt() != unigrams.length) {
            in.close();
            throw new IOException("Bad unigram length in " + filename);
        }

        bigramSmearMap = new HashMap<Long, Float>();
        unigramSmearTerm = new float[unigrams.length];

        System.out.println("Reading " + unigrams.length);
        for (int i = 0; i < unigrams.length; i++)
            unigramSmearTerm[i] = in.readFloat();

        for (int i = 0; i < unigrams.length; i++) {
            System.out.println("Processed " + i + " of "
                    + loadedBigramBuffers.length);
            int numBigrams = in.readInt();
            NGramBuffer bigram = getBigramBuffer(i);
            if (bigram.getNumberNGrams() != numBigrams) {
                in.close();
                throw new IOException("Bad ngrams for unigram " + i
                        + " Found " + numBigrams + " expected "
                        + bigram.getNumberNGrams());
            }
            for (int j = 0; j < numBigrams; j++) {
                int k = bigram.getWordID(j);
                putSmearTerm(i, k, in.readFloat());
            }
        }
        in.close();
    }
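
    /*
     * On-disk layout produced by writeSmearInfo() and consumed by
     * readSmearInfo():
     *
     *   int    SMEAR_MAGIC
     *   int    number of unigrams (U)
     *   float  unigramSmearTerm[0 .. U-1]
     *   then, for each unigram i in 0 .. U-1:
     *     int  number of bigram followers (B; 0 if none)
     *     B pairs of { int followerWordID, float smearTerm }
     */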
    /**
     * Puts the smear term for the two words.
     *
     * @param word1 the first word
     * @param word2 the second word
     * @param smearTerm the smear term
     */
    private void putSmearTerm(int word1, int word2, float smearTerm) {
        // pack the two word IDs into a single 64-bit map key
        long bigramID = (((long) word1) << 32) | word2;
        bigramSmearMap.put(bigramID, smearTerm);
    }

    /**
     * Retrieves the smear term for the two words.
     *
     * @param word1 the first word
     * @param word2 the second word
     * @return the smear term, or null if none is stored for this pair
     */
    private Float getSmearTerm(int word1, int word2) {
        long bigramID = (((long) word1) << 32) | word2;
        return bigramSmearMap.get(bigramID);
    }

    /**
     * Retrieves the bigram probability for the two given words.
     *
     * @param word1 the first word of the bigram
     * @param word2 the second word of the bigram
     * @return the log probability
     */
    private float getBigramProb(int word1, int word2) {
        NGramBuffer bigram = getBigramBuffer(word1);
        NGramProbability bigramProbability = bigram.findNGram(word2);
        return ngramProbTable[1][bigramProbability.getProbabilityID()];
    }
}