package edu.berkeley.cs.nlp.ocular.lm;
import tberg.murphy.indexer.Indexer;
/**
* Contains code for carrying out operations on trigrams encoded as longs.
* Can be instantiated, but also has static methods so that the code can be
* used without creating the object.
*
* Indices are packed into a long using BITS_PER_WORD bits per index,
* up to MAX_ORDER indices. BITS_PER_WORD * MAX_ORDER must be <= 64 (use = at your own risk...)
* When indices are in the long, 1 is added to each of them so that lower-order
* n-grams (with 0s) can be differentiated from n-grams with the first character in the indexer
* in them.
*
* @author Greg Durrett (gdurrett@cs.berkeley.edu)
*/
public class Ngram {
// 128 characters should be enough, this lets us do a 9-gram
public static final int BITS_PER_WORD = 7;
public static final int MAX_ORDER = 9;
public static final int[] CONVERTER = new int[MAX_ORDER];
private static int encodeWord(int rawWord) {
return rawWord + 1;
}
private static int decodeWord(int encodedWord) {
return encodedWord - 1;
}
public static long convertToLong(int[] ngram) {
return convertToLong(ngram, 0, ngram.length);
}
public static long convertToLong(int[] ngram, int start, int end) {
long l = 0;
for (int i = start; i < end; i++)
l = (l << BITS_PER_WORD) + encodeWord(ngram[i]);
return l;
}
public static int[] convertToIntArr(long ngram) {
// assert Ngram.getActualOrder(ngram) == MAX_ORDER : "Ngram of less than max order: "
// + Ngram.toString(ngram) + ", order: " + Ngram.getActualOrder(ngram);
int[] arr = new int[Ngram.getActualOrder(ngram)];
int i = 0;
long wordMask = (1L << BITS_PER_WORD) - 1;
while (ngram != 0) {
arr[arr.length - 1 - i] = decodeWord((int) (ngram & wordMask));
i++;
ngram = Ngram.getHistory(ngram);
}
return arr;
}
public static long getLowerOrder(long ngram) {
return Ngram.getLowerOrder(ngram, Ngram.getActualOrder(ngram));
}
public static long getLowerOrder(long ngram, int order) {
long mask = (1L << ((order - 1) * BITS_PER_WORD)) - 1L;
return mask & ngram;
}
public static long getHistory(long ngram) {
return Ngram.getHistory(ngram, Ngram.getActualOrder(ngram));
}
public static long getHistory(long ngram, int order) {
long mask = ((1L << (((long) order - 1) * BITS_PER_WORD)) - 1L) << BITS_PER_WORD;
return (mask & ngram) >> BITS_PER_WORD;
}
public static long getLowerOrderHistory(long ngram) {
return Ngram.getLowerOrderHistory(ngram, Ngram.getActualOrder(ngram));
}
public static long getLowerOrderHistory(long ngram, int order) {
return Ngram.getLowerOrder(Ngram.getHistory(ngram, order), order - 1);
}
// public static long addWordAndShift(long ngram, int word) {
// long mask = (1L << (((long) MAX_ORDER - 1) * BITS_PER_WORD)) - 1L << BITS_PER_WORD;
// return ((ngram << BITS_PER_WORD) & mask) + encodeWord(word);
// }
public static int getActualOrder(long ngram) {
for (int i = MAX_ORDER - 1; i >= 0; i--) {
long mask = (1L << (((long) i) * BITS_PER_WORD)) - 1L;
if ((ngram & mask) != ngram)
return i + 1;
}
return 0;
}
public static String toString(int[] ngram, Indexer<String> indexer) {
return Ngram.toString(Ngram.convertToLong(ngram), indexer);
}
public static String toString(long ngram, Indexer<String> indexer) {
return "[" + Ngram.getActualOrder(ngram) + ":" + getNgramStr(ngram, indexer) + "]";
}
public static String getNgramStr(long ngram, Indexer<String> indexer) {
String string = "";
int order = Ngram.getActualOrder(ngram);
for (int i = 0; i < order; i++) {
long mask = (1L << BITS_PER_WORD) - 1L;
string = indexer.getObject(decodeWord((int) (ngram & mask))) + string;
ngram = ngram >> BITS_PER_WORD;
}
return string;
}
}