package edu.berkeley.cs.nlp.ocular.lm;
import tberg.murphy.indexer.Indexer;
/**
* Contains code for carrying out operations on n-grams of arbitrary order encoded as
* arrays of longs. The class can be instantiated, but all of its methods are static so
* that the code can be used without creating the object.
*
* Indices are packed into each long using BITS_PER_WORD bits per index,
* up to MAX_ORDER indices per long. BITS_PER_WORD * MAX_ORDER must be <= 64 (use = at your own risk...)
* When indices are stored in a long, 1 is added to each of them so that an empty slot (0)
* in a lower-order n-gram can be differentiated from the first character in the indexer,
* whose index is 0.
*
* @author Greg Durrett (gdurrett@cs.berkeley.edu)
*/
public class LongNgram {
  // 128 characters should be enough, this lets us do a 9-gram
  // (7 bits * 9 indices = 63 bits, which fits in one long).
  // NOTE(review): because 1 is added to every index before it is packed, 7 bits
  // presumably only leave room for indices 0..126 (127 characters) -- confirm against Ngram.
  // NOTE(review): the per-long packing itself is delegated to Ngram; these values are
  // assumed to match the constants Ngram uses -- confirm.
  public static final int BITS_PER_WORD = 7;
  public static final int MAX_ORDER = 9;

  /**
   * Encodes a whole n-gram as an array of longs.
   *
   * @param ngram the indices making up the n-gram
   * @return the result of {@link #convertToLong(int[], int, int)} applied to the full array
   */
  public static long[] convertToLong(int[] ngram) {
    return convertToLong(ngram, 0, ngram.length);
  }

  /**
   * Encodes the slice {@code ngram[start, end)} as an array of longs.
   *
   * The slice is cut into chunks of at most MAX_ORDER indices, working backwards from
   * {@code end}. Every chunk except possibly the first therefore covers exactly MAX_ORDER
   * indices, and element 0 of the result covers whatever is left over at the front.
   * Each chunk is encoded with {@code Ngram.convertToLong}.
   *
   * @param ngram the array holding the indices
   * @param start first position of the slice (inclusive)
   * @param end   last position of the slice (exclusive)
   * @return ceil((end - start) / MAX_ORDER) longs; an empty array when start == end
   */
  public static long[] convertToLong(int[] ngram, int start, int end) {
    // Add MAX_ORDER-1 to round up
    int numLongs = (end - start + MAX_ORDER - 1) / MAX_ORDER;
    long[] longs = new long[numLongs];
    // Fill from the back so that any partial chunk ends up in longs[0].
    int longIdx = numLongs - 1;
    for (int i = end; i > start; i -= MAX_ORDER) {
      longs[longIdx] = Ngram.convertToLong(ngram, Math.max(start, i - MAX_ORDER), i);
      longIdx--;
    }
    return longs;
  }

  /**
   * Decodes an array of longs back into an array of indices.
   *
   * The result has {@link #getActualOrder(long[])} entries. The longs are decoded from
   * last to first with {@code Ngram.convertToIntArr} and written into the result from
   * right to left.
   *
   * NOTE(review): this relies on every long other than {@code ngram[0]} decoding to exactly
   * MAX_ORDER entries (the layout {@link #convertToLong(int[], int, int)} chunks for);
   * otherwise the write position would presumably run out of step with the result array --
   * confirm against Ngram.
   *
   * @param ngram the encoded n-gram
   * @return the decoded indices
   */
  public static int[] convertToIntArr(long[] ngram) {
    int[] arr = new int[LongNgram.getActualOrder(ngram)];
    int ngramIdx = arr.length - 1;
    for (int longIdx = ngram.length - 1; longIdx >= 0; longIdx--) {
      int[] curr = Ngram.convertToIntArr(ngram[longIdx]);
      for (int i = curr.length - 1; i >= 0; i--) {
        arr[ngramIdx] = curr[i];
        ngramIdx--;
      }
    }
    return arr;
  }

  // TODO: I think these methods work but they don't do clipping to arbitrary orders,
  // and I think it's easier to just
  // (the original note is truncated at this point)
  //
  // NOTE(review): in the disabled getHistory(long[], int) below, the expression
  //   ngram[i] >>> BITS_PER_WORD + carryOver << (BITS_PER_WORD * (MAX_ORDER - 1))
  // parses as (ngram[i] >>> (BITS_PER_WORD + carryOver)) << (...) because '+' binds tighter
  // than the shift operators. It presumably needs explicit parentheses before being re-enabled.
  //
  // public static long[] getLowerOrder(long[] ngram) {
  //   return LongNgram.getLowerOrder(ngram, LongNgram.getActualOrder(ngram));
  // }
  //
  // public static long[] getLowerOrder(long[] ngram, int order) {
  //   if (order % MAX_ORDER == 1) {
  //     long[] newNgram = new long[ngram.length-1];
  //     System.arraycopy(ngram, 1, newNgram, 0, ngram.length-1);
  //     return newNgram;
  //   } else {
  //     long[] newNgram = new long[ngram.length];
  //     System.arraycopy(ngram, 0, newNgram, 0, ngram.length);
  //     newNgram[0] = Ngram.getLowerOrder(ngram[0]);
  //     return newNgram;
  //   }
  // }
  //
  // public static long[] getHistory(long[] ngram) {
  //   return LongNgram.getHistory(ngram, LongNgram.getActualOrder(ngram));
  // }
  //
  // public static long[] getHistory(long[] ngram, int order) {
  //   long lowOrderMask = (1L << ((long)BITS_PER_WORD)) - 1L;
  //   long[] newNgram;
  //   int newNgramIdx;
  //   long carryOver;
  //   if (order % MAX_ORDER == 1) {
  //     newNgram = new long[ngram.length-1];
  //     newNgramIdx = 0;
  //     carryOver = ngram[0];
  //   } else {
  //     newNgram = new long[ngram.length];
  //     newNgramIdx = 1;
  //     carryOver = ngram[0] & lowOrderMask;
  //     newNgram[0] = ngram[0] >>> BITS_PER_WORD;
  //   }
  //   for (int i = 1; i < ngram.length; i++) {
  //     newNgram[newNgramIdx] = ngram[i] >>> BITS_PER_WORD + carryOver << (BITS_PER_WORD * (MAX_ORDER - 1));
  //     newNgramIdx++;
  //     carryOver = ngram[i] & lowOrderMask;
  //   }
  //   return newNgram;
  // }
  //
  // public static long[] getLowerOrderHistory(long[] ngram) {
  //   return LongNgram.getLowerOrderHistory(ngram, LongNgram.getActualOrder(ngram));
  // }
  //
  // public static long[] getLowerOrderHistory(long[] ngram, int order) {
  //   return LongNgram.getLowerOrder(LongNgram.getHistory(ngram, order), order - 1);
  // }

  /**
   * Returns the number of indices stored in an encoded n-gram.
   *
   * Every long after the first is counted as holding MAX_ORDER indices; the count for the
   * first long is obtained from {@code Ngram.getActualOrder}.
   *
   * @param ngram the encoded n-gram
   * @return the order of the n-gram, or 0 for an empty array
   */
  public static int getActualOrder(long[] ngram) {
    if (ngram.length == 0) {
      return 0;
    }
    return (ngram.length - 1) * MAX_ORDER + Ngram.getActualOrder(ngram[0]);
  }

  /**
   * Renders an n-gram given as raw indices by encoding it first and delegating to
   * {@link #toString(long[], Indexer)}.
   *
   * @param ngram   the indices making up the n-gram
   * @param indexer indexer passed through to the long-based overload
   * @return a string of the form {@code [order:contents]}
   */
  public static String toString(int[] ngram, Indexer<String> indexer) {
    return LongNgram.toString(LongNgram.convertToLong(ngram), indexer);
  }

  /**
   * Renders an encoded n-gram as {@code [order:contents]}, where {@code order} is
   * {@link #getActualOrder(long[])} and {@code contents} is the concatenation of
   * {@code Ngram.getNgramStr} for each long, in array order.
   *
   * @param ngram   the encoded n-gram
   * @param indexer indexer handed to {@code Ngram.getNgramStr} for each long
   * @return the string representation
   */
  public static String toString(long[] ngram, Indexer<String> indexer) {
    int order = LongNgram.getActualOrder(ngram);
    // Accumulate in a builder rather than with String += to avoid re-copying the
    // partial result on every iteration.
    StringBuilder ngramStr = new StringBuilder();
    for (long chunk : ngram) {
      ngramStr.append(Ngram.getNgramStr(chunk, indexer));
    }
    return "[" + order + ":" + ngramStr + "]";
  }
}