/* * Copyright 1999-2002 Carnegie Mellon University. * Portions Copyright 2002 Sun Microsystems, Inc. * Portions Copyright 2002 Mitsubishi Electric Research Laboratories. * Portions Copyright 2010 LIUM, University of Le Mans, France -> Yannick Esteve, Anthony Rousseau * All Rights Reserved. Use is subject to license terms. * * See the file "license.terms" for information on usage and * redistribution of this file, and for a DISCLAIMER OF ALL * WARRANTIES. * */ package edu.cmu.sphinx.linguist.language.ngram.large; /** * Implements a buffer that contains NGrams. It assumes that the first two bytes of each n-gram entry is the ID of the * n-gram. */ class NGramBuffer { private final byte[] buffer; private final int numberNGrams; private int position; private final boolean bigEndian; private final boolean is32bits; private final int n; private boolean used; private int firstNGramEntry; /** * Constructs a NGramBuffer object with the given byte[]. * * @param buffer the byte[] with NGrams * @param numberNGrams the number of N-gram * @param bigEndian the buffer's endianness * @param is32bits whether the buffer is 16 or 32 bits * @param n the buffer's order * @param firstNGramEntry the first NGram Entry */ public NGramBuffer(byte[] buffer, int numberNGrams, boolean bigEndian, boolean is32bits, int n, int firstNGramEntry) { this.buffer = buffer; this.numberNGrams = numberNGrams; this.bigEndian = bigEndian; this.is32bits = is32bits; this.position = 0; this.n = n; this.firstNGramEntry = firstNGramEntry; } /** * Returns the byte[] of n-grams. * * @return the byte[] of n-grams */ public byte[] getBuffer() { return buffer; } /** * Returns the firstNGramEntry * @return the firstNGramEntry of the buffer */ public int getFirstNGramEntry() { return firstNGramEntry; } /** * Returns the size of the buffer in bytes. * * @return the size of the buffer in bytes */ public int getSize() { return buffer.length; } /** * Returns the number of n-grams in this buffer. * * @return the number of n-grams in this buffer */ public int getNumberNGrams() { return numberNGrams; } /** * Returns the position of the buffer. * * @return the position of the buffer */ protected int getPosition() { return position; } protected int getN() { return n; } /** * Sets the position of the buffer. * * @param position new buffer position */ protected void setPosition(int position) { this.position = position; } /** * Returns the word ID of the nth follower, assuming that the ID is the first two bytes of the NGram entry. * * @param nthFollower starts from 0 to (numberFollowers - 1). * @return the word ID */ public final int getWordID(int nthFollower) { int nthPosition = nthFollower * (buffer.length / numberNGrams); setPosition(nthPosition); return readBytesAsInt(); } /** * Returns true if the NGramBuffer is big-endian. * * @return true if the NGramBuffer is big-endian, false if little-endian */ public final boolean isBigEndian() { return bigEndian; } /** * Returns true if the NGramBuffer is 32 bits. * * @return true if the NGramBuffer is 32 bits, false if 16 bits */ public final boolean is32bits() { return is32bits; } /** * Reads the next two bytes from the buffer's current position as an integer. * * @return the next two bytes as an integer */ public final int readBytesAsInt() { if (is32bits) { if (bigEndian) { int value = (0x000000ff & buffer[position++]); value <<= 8; value |= (0x000000ff & buffer[position++]); value <<= 8; value |= (0x000000ff & buffer[position++]); value <<= 8; value |= (0x000000ff & buffer[position++]); return value; } else { int value = (0x000000ff & buffer[position+3]); value <<= 8; value |= (0x000000ff & buffer[position+2]); value <<= 8; value |= (0x000000ff & buffer[position+1]); value <<= 8; value |= (0x000000ff & buffer[position]); position += 4; return value; } } else { if (bigEndian) { int value = (0x000000ff & buffer[position++]); value <<= 8; value |= (0x000000ff & buffer[position++]); return value; } else { int value = (0x000000ff & buffer[position + 1]); value <<= 8; value |= (0x000000ff & buffer[position]); position += 2; return value; } } } /** * Returns true if this buffer was used in the last utterance. * * @return true if this buffer was used in the last utterance */ public boolean getUsed() { return used; } /** * Sets whether this buffer was used in the last utterance * * @param used true if this buffer was used in the last utterance, false otherwise */ public void setUsed(boolean used) { this.used = used; } /** * Finds the NGram probability ID for the given nth word in a NGram. * * @param nthWordID the ID of the nth word * @return the NGram Probability ID of the given nth word */ public int findProbabilityID(int nthWordID) { int mid, start = 0, end = getNumberNGrams(); int nGram = -1; while ((end - start) > 0) { mid = (start + end) / 2; int midWordID = getWordID(mid); if (midWordID < nthWordID) { start = mid + 1; } else if (midWordID > nthWordID) { end = mid; } else { nGram = getProbabilityID(mid); break; } } return nGram; } /** * Returns the NGramProbability of the nth follower. * * @param nthFollower which follower * @return the NGramProbability of the nth follower */ public int getProbabilityID(int nthFollower) { int nthPosition = 0; nthPosition = nthFollower * LargeNGramModel.BYTES_PER_NGRAM * ((is32bits) ? 4 : 2); setPosition(nthPosition + ((is32bits) ? 4 : 2)); // to skip the word ID return readBytesAsInt(); } /** * Finds the NGram probabilities for the given nth word in a NGram. * * @param nthWordID the ID of the nth word * @return the NGramProbability of the given nth word */ public NGramProbability findNGram(int nthWordID) { int mid, start = 0, end = getNumberNGrams() - 1; NGramProbability ngram = null; while ((end - start) > 0) { mid = (start + end) / 2; int midWordID = getWordID(mid); if (midWordID < nthWordID) { start = mid + 1; } else if (midWordID > nthWordID) { end = mid; } else { ngram = getNGramProbability(mid); break; } } return ngram; } /** * Finds the NGram index for the given nth word in a NGram * * @param nthWordID the ID of the nth word * @return the NGramIndex of the given nth word */ public int findNGramIndex(int nthWordID) { int mid = -1, start = 0, end = getNumberNGrams() - 1; while ((end - start) > 0) { mid = (start + end) / 2; int midWordID = getWordID(mid); if (midWordID < nthWordID) { start = mid + 1; } else if (midWordID > nthWordID) { end = mid; } else { break; } } return mid; } /** * Returns the NGramProbability of the nth follower. * * @param nthFollower which follower * @return the NGramProbability of the nth follower */ public NGramProbability getNGramProbability(int nthFollower) { int nthPosition = 0, wordID = 0, probID = 0, backoffID = 0, firstNGram = 0; nthPosition = nthFollower * LargeNGramModel.BYTES_PER_NGRAM * ((is32bits) ? 4 : 2); setPosition(nthPosition); wordID = readBytesAsInt(); probID = readBytesAsInt(); backoffID = readBytesAsInt(); firstNGram = readBytesAsInt(); return (new NGramProbability(nthFollower, wordID, probID, backoffID, firstNGram)); } }