package edu.berkeley.cs.nlp.ocular.lm;
import java.io.Serializable;
import java.util.Arrays;
/**
* @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu)
*/
/**
 * A large, serializable count database for n-grams, implemented as an
 * open-addressed hash table with linear probing.
 *
 * <p>Invariants: {@code keys[i]} holds the n-gram stored in slot {@code i}
 * ({@code null} marks an empty slot), and {@code countVals[t][i]} holds the
 * count of type {@code t} for that same slot — the arrays are parallel.
 * Callers are expected to invoke {@link #maybeResize()} periodically so the
 * table never becomes completely full (a full table would make probing loop
 * forever).
 *
 * <p>NOTE(review): this class is not thread-safe; counters and the table are
 * mutated without synchronization — confirm single-threaded use at call sites.
 */
public class CountDbBig implements Serializable {
	private static final long serialVersionUID = 4457857637251200367L;

	/** Slot -> stored n-gram key; {@code null} marks an empty slot. */
	public long[][] keys;
	/** countVals[countType][slot]; parallel to {@link #keys}. */
	public int[][] countVals;
	/** Number of distinct count types tracked per n-gram. */
	public final int numCountTypes;
	/** Running total of TOKEN_INDEX increments (see incrementCount). */
	private long trainNumTokens;
	/** Externally driven counter (see {@link #incrementBigramTypes()}). */
	private int trainNumBigramTypes;
	/** Number of occupied slots in the table. */
	private int numEntries;
	/** Instrumentation: probes performed since the last reset. */
	private int numProbes;
	/** Instrumentation: queries performed since the last reset. */
	private int numQueries;

	/**
	 * Creates an empty table.
	 *
	 * @param numKeys       initial capacity (number of slots)
	 * @param numCountTypes number of parallel count arrays to maintain
	 */
	public CountDbBig(int numKeys, int numCountTypes) {
		this.keys = new long[numKeys][];
		// Java zero-initializes int arrays, so no explicit fill loop is needed.
		this.countVals = new int[numCountTypes][numKeys];
		this.numCountTypes = numCountTypes;
		this.trainNumBigramTypes = 0;
		this.numEntries = 0;
		this.numProbes = 0;
		this.numQueries = 0;
	}

	/** @return total number of TOKEN_INDEX increments seen so far */
	public long getNumTokens() {
		return trainNumTokens;
	}

	/** @return the externally maintained bigram-type counter */
	public int getNumBigramTypes() {
		return trainNumBigramTypes;
	}

	/** @return number of occupied slots (distinct n-grams stored) */
	public int currSize() {
		return numEntries;
	}

	/** @return total capacity of the table (occupied + empty slots) */
	public int totalSize() {
		return countVals[0].length;
	}

	/** @return the raw slot array; {@code null} entries are empty slots */
	public long[][] getKeys() {
		return keys;
	}

	/** Convenience overload delegating to {@link #getCount(long[], CountType)}. */
	public int getCount(NgramWrapper ngram, CountType countType) {
		return this.getCount(ngram.getLongerRep(), countType);
	}

	/**
	 * Looks up the count of the given type for an n-gram.
	 *
	 * @return the stored count, or 0 if the n-gram is absent (the probe then
	 *         lands on an empty slot, whose count arrays are still zero)
	 */
	public int getCount(long[] ngram, CountType countType) {
		int countTypeIndex = countType.getIndex();
		return countVals[countTypeIndex][find(ngram)];
	}

	/** Bumps the externally maintained bigram-type counter by one. */
	public void incrementBigramTypes() {
		trainNumBigramTypes++;
	}

	/** Convenience overload delegating to {@link #incrementCount(long[], CountType)}. */
	public int incrementCount(NgramWrapper ngram, CountType countType) {
		return this.incrementCount(ngram.getLongerRep(), countType);
	}

	/**
	 * Increments the given count type for {@code ngram}, inserting the n-gram
	 * into the table if it is not already present. Also bumps the global token
	 * total when {@code countType} is {@code TOKEN_INDEX}.
	 *
	 * @param ngram     the n-gram key
	 * @param countType which parallel count to increment
	 * @return the count of this type for the n-gram before the increment
	 */
	private int incrementCount(long[] ngram, CountType countType) {
		int countTypeIndex = countType.getIndex();
		int index = find(ngram);
		int oldCount = countVals[countTypeIndex][index];
		// Arrays.equals(null, ngram) is false, so a fresh (empty) slot counts
		// as a new entry here.
		if (!Arrays.equals(keys[index], ngram)) {
			numEntries++;
		}
		if (countType == CountType.TOKEN_INDEX) {
			trainNumTokens++;
		}
		keys[index] = ngram;
		countVals[countTypeIndex][index]++;
		return oldCount;
	}

	/**
	 * Linear probe starting from the key's hash slot.
	 *
	 * @return the slot containing {@code key}, or the first empty slot along
	 *         its probe sequence if absent. Precondition: the table must have
	 *         at least one empty slot (maintained via {@link #maybeResize()}),
	 *         otherwise the probe loop never terminates.
	 */
	private int find(long[] key) {
		// hashKey already normalizes into [0, totalSize()), so no negative
		// adjustment is needed here.
		int hashToArray = hashKey(key);
		numQueries++;
		numProbes++;
		// Probe until we find the key or a blank slot to put it in.
		while (!Arrays.equals(keys[hashToArray], key) && keys[hashToArray] != null) {
			numProbes++;
			hashToArray = (hashToArray + 1) % totalSize();
		}
		return hashToArray;
	}

	/**
	 * Hashes an n-gram key into a slot index.
	 *
	 * @return a value in {@code [0, totalSize())}; the explicit adjustment
	 *         handles Java's negative remainder for negative hash codes
	 */
	private int hashKey(long[] key) {
		int hc = 17;
		for (int i = 0; i < key.length; i++) {
			// Fold each long to an int (xor-shift of halves), then mix.
			hc = (hc + ((int) (key[i] ^ (key[i] >>> 32)))) * 3875239;
		}
		hc = hc % totalSize();
		if (hc < 0) {
			hc += totalSize();
		}
		return hc;
	}

	/**
	 * Grows the table when occupancy approaches capacity (load factor above
	 * roughly 1/1.08 ≈ 0.93). Growth is geometric (x1.6) for small tables and
	 * additive (+10M slots) once the table is already very large, to keep the
	 * expansion from blowing up memory.
	 */
	public void maybeResize() {
		if (numEntries * 1.08 > countVals[0].length) {
			// Rough heuristic estimate of the memory a resize needs; only used
			// to emit a warning, never to block the resize.
			if (Runtime.getRuntime().freeMemory() < countVals[0].length * (2 + countVals.length) * 4 * 0.6) {
				System.out.println("WARNING: need more than " + Runtime.getRuntime().freeMemory() / (1024 * 1024)
						+ " MB in order to expand");
			}
			// Resize additively because at this size, it will get too big otherwise
			if (totalSize() >= 50000000) {
				resizeDb(countVals[0].length + 10000000);
			} else {
				resizeDb((int) (countVals[0].length * 1.6));
			}
		}
	}

	/**
	 * Rebuilds the table with {@code newNumKeys} slots, rehashing every stored
	 * entry into the new arrays and resetting the probe instrumentation.
	 */
	public void resizeDb(int newNumKeys) {
		System.out.println("Resizing database to have " + newNumKeys + " keys");
		long[][] tempKeys = keys;
		int[][] tempCountVals = countVals;
		keys = new long[newNumKeys][];
		countVals = new int[numCountTypes][newNumKeys];
		for (int i = 0; i < tempCountVals[0].length; i++) {
			if (tempKeys[i] == null)
				continue;
			// find() now probes the NEW (larger) table, so this locates the
			// first empty slot along the rehashed key's probe sequence.
			int newDbInd = find(tempKeys[i]);
			keys[newDbInd] = tempKeys[i];
			for (int j = 0; j < numCountTypes; j++)
				countVals[j][newDbInd] = tempCountVals[j][i];
		}
		// The rehash above inflated the probe counters; reset so subsequent
		// analysis reflects only real queries.
		this.numProbes = 0;
		this.numQueries = 0;
		// Drop references to the old arrays and hint the VM to reclaim them
		// (System.gc() is advisory; one call is as effective as several).
		tempKeys = null;
		tempCountVals = null;
		System.gc();
	}

	/**
	 * Returns a human-readable summary of table occupancy: number and size
	 * distribution of contiguous occupied runs ("blocks") plus the average
	 * probes-per-query since the last reset. Resets the probe counters as a
	 * side effect.
	 */
	public String getStringAnalysis() {
		int maxBlockSize = 0;
		double avgBlockSize = 0;
		double numBlocks = 0;
		int currSize = 0;
		int[] blockDist = new int[15];
		for (int i = 0; i < totalSize(); i++) {
			if (keys[i] != null) {
				currSize++;
			} else {
				if (currSize > 0) {
					numBlocks++;
					avgBlockSize += currSize;
					if (currSize < 15)
						blockDist[currSize]++;
					maxBlockSize = Math.max(maxBlockSize, currSize);
					currSize = 0;
				}
			}
		}
		// Build with StringBuilder rather than String concatenation in a loop.
		StringBuilder retVal = new StringBuilder();
		retVal.append("Total size: ").append(totalSize())
				.append(", ").append(numEntries).append(" entries\n\t")
				.append((int) numBlocks).append(" blocks, avg size ")
				.append(avgBlockSize / numBlocks).append(", max size ").append(maxBlockSize)
				.append("\n\tAverage number of probes: ").append(((double) numProbes) / numQueries)
				.append("\n\tBlock dist (first few): ");
		for (int i = 0; i < blockDist.length; i++) {
			retVal.append(((double) blockDist[i]) / numBlocks).append(" ");
		}
		this.numQueries = 0;
		this.numProbes = 0;
		return retVal.toString();
	}
}