/* (c) 2014 LinkedIn Corp. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use
* this file except in compliance with the License. You may obtain a copy of the
* License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
*/
package com.linkedin.cubert.memory;
import com.linkedin.cubert.operator.cube.DimensionKey;
import com.linkedin.cubert.utils.Pair;
import java.util.Iterator;
import org.apache.commons.lang.NotImplementedException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * A compact hash table implementation with predictable memory consumption. This is
 * implemented to have precise control over the total amount of memory allocated. The key
 * idea is to store a key and return an "index" where the value would be stored if an
 * array of values of size expectedHTSize were to be allocated.
 *
 * Why is only the index returned instead of storing the actual values? This way, we can
 * avoid the cost of storing different types of objects via a generic Object array and let
 * the derived classes define what type of values they are allocating. The derived classes
 * just need to allocate an array and use the returned index to index into the array.
 *
 * Note: the {@link Pair} instances returned by {@link #lookupOrCreateIndex(DimensionKey)},
 * {@link #deserializedAndGetEntry(int)} and the iterator are single per-table objects that
 * are overwritten on every call; callers must copy the contents if they need to keep them.
 * No synchronization is performed anywhere in this class.
 *
 * @author Krishna Puttaswamy
 *
 */
public class CompactHashTableBase
{
    private static final Log LOG = LogFactory.getLog(CompactHashTableBase.class.getName());

    /** Number of hash slots allocated per expected key. */
    private static final int LOAD_FACTOR = 2;

    /** Size of one growth batch, expressed as a percentage of the baseline size. */
    private static final int BATCH_PCT = 10;

    /**
     * Marker stored in a hash slot that holds no key. Real hash codes are always
     * non-negative (see {@link #getValidHashCode(DimensionKey)}), so -1 cannot collide.
     */
    private static final Integer DEFAULT_VALUE = Integer.valueOf(-1);

    /** Constructor/reset argument used for the list that records hash table sizes. */
    private static final int GROWTH_TRACKER_SIZE = 10;

    private final int expectedSize;
    private final int keyHashtableBatchSize;

    // Every size the hash table has had, in order: [initial, after 1st growth, ...]
    private final IntArrayList batchGrowthTracker;
    private final IntArrayList keyHashcodes; // hashcodes
    private final IntArrayList offsetToData; // offset to data array
    private final IntArrayList dimensionKeyArray; // where data is stored in int format

    private final int dimensionKeyLength;
    private int dataArrayOffset = 0; // next free position in dimensionKeyArray
    private int numberOfHashtableEntries;
    private int putCount = 0;

    // Reusable result holders; overwritten on every call that returns them.
    private final Pair<Integer, Boolean> index = new Pair<Integer, Boolean>(-1, false);
    private final Pair<DimensionKey, Integer> entry =
            new Pair<DimensionKey, Integer>(null, null);
    private final OffsetsIterator offsetsIterator = new OffsetsIterator();

    /**
     * Iterator abstraction for offset polling. Produces the sequence of hash slots to
     * probe for a given hash code: from the seed slot to the end of the initial table,
     * then slot 0, then upwards through every size recorded in the growth tracker.
     */
    class OffsetsIterator
    {
        private boolean first;
        private int offset;
        private int batchPos;

        /**
         * Starts a new probe sequence.
         *
         * @param offset non-negative hash code; it is reduced modulo the initial table
         *               size to obtain the first slot to probe
         */
        public void setSeedOffset(int offset)
        {
            // Always look for offset starting in the first growth bucket.
            batchPos = 0;
            this.offset = offset % batchGrowthTracker.getInt(batchPos);
            first = true;
        }

        /**
         * @return the next slot to probe
         * @throws RuntimeException if the sequence has run past the last recorded
         *                          table size
         */
        public int getNext()
        {
            if (first)
            {
                // method called for the first time.
                first = false;
                return offset;
            }

            // Increment offset
            offset++;
            if (offset == batchGrowthTracker.getInt(batchPos))
            {
                // Offset has hit the max count for current growth bucket.
                boolean firstWrapAround = (batchPos == 0);
                batchPos++;

                if (firstWrapAround)
                {
                    // ONLY For the first wrap-around only allow looking at offset == 0.
                    // TODO: optimize such that if initial offset was zero this condition can be skipped.
                    offset = 0;
                    return offset;
                }

                if (batchPos >= batchGrowthTracker.size())
                {
                    // Gone past the last growth bucket. Error!!
                    throw new RuntimeException("Hashtable offset not found!");
                }
            }
            return offset;
        }
    }

    /**
     * Note: size the hash table appropriately to fit in memory. Too small a hash table can affect runtime performance
     * due to bucket conflicts.
     *
     * @param dimensionKeyLength number of ints stored per key
     * @param expectedHTSize     expected number of distinct keys; the table starts with
     *                           LOAD_FACTOR times this many slots
     */
    public CompactHashTableBase(int dimensionKeyLength, int expectedHTSize)
    {
        this.dimensionKeyLength = dimensionKeyLength;
        expectedSize = expectedHTSize;
        numberOfHashtableEntries = baselineHashtableSize();

        int valueArraySize = dimensionKeyLength * expectedSize;
        keyHashcodes = getGrowableArray(numberOfHashtableEntries, DEFAULT_VALUE);
        offsetToData = getGrowableArray(numberOfHashtableEntries, DEFAULT_VALUE);
        dimensionKeyArray = getGrowableArray(valueArraySize, null);

        keyHashtableBatchSize = batchSize(numberOfHashtableEntries);
        batchGrowthTracker = new IntArrayList(GROWTH_TRACKER_SIZE);
        batchGrowthTracker.add(numberOfHashtableEntries); // initial size
    }

    /** @return the initial number of hash slots */
    private int baselineHashtableSize()
    {
        return LOAD_FACTOR * expectedSize;
    }

    /**
     * Computes BATCH_PCT percent of the given size.
     *
     * The result is never smaller than 1: a zero batch would make the growth step in
     * {@link #lookupOrCreateIndex(DimensionKey)} a no-op, leaving a full table unable to
     * accept new keys. The product is computed as a long so that large baselines do not
     * overflow.
     *
     * @param baselineSize size the batch is derived from
     * @return batch size, at least 1
     */
    private int batchSize(int baselineSize)
    {
        long batch = ((long) baselineSize * BATCH_PCT) / 100;
        return (int) Math.max(1L, batch);
    }

    /**
     * Creates an int list able to hold at least baselineSize elements.
     *
     * @param baselineSize number of elements the list must be able to hold up front
     * @param defaultValue value to register as the list's default element, or null to
     *                     leave the list's default untouched
     */
    private IntArrayList getGrowableArray(int baselineSize, Integer defaultValue)
    {
        // Calculate size of each batch
        IntArrayList growableArray = new IntArrayList(batchSize(baselineSize));

        // [Optionally] Set default value of each element in the array
        if (defaultValue != null)
            growableArray.setDefaultValue(defaultValue.intValue());

        // Ensure that array can hold baselineSize # of elements
        growableArray.ensureCapacity(baselineSize);
        return growableArray;
    }

    /**
     * Walks the stored keys in insertion order. Every element returned is the table's
     * shared entry pair, refilled for the current key.
     */
    class KeyIndexIterator implements Iterator<Pair<DimensionKey, Integer>>
    {
        int count = 0;
        int offset = 0;

        /**
         * @param count number of keys to iterate over
         */
        public KeyIndexIterator(int count)
        {
            this.count = count;
        }

        @Override
        public boolean hasNext()
        {
            return count > 0;
        }

        /**
         * @throws java.util.NoSuchElementException if all keys have been returned
         */
        @Override
        public Pair<DimensionKey, Integer> next()
        {
            if (!hasNext())
                throw new java.util.NoSuchElementException();

            count--;
            Pair<DimensionKey, Integer> obj = deserializedAndGetEntry(offset);
            offset += dimensionKeyLength;
            return obj;
        }

        @Override
        public void remove()
        {
            throw new NotImplementedException();
        }
    }

    /**
     * @return an iterator over the keys stored at the time of the call, paired with
     *         their value index
     */
    public Iterator<Pair<DimensionKey, Integer>> getIterator()
    {
        return new KeyIndexIterator(putCount);
    }

    /** @return the number of distinct keys stored */
    public int size()
    {
        return putCount;
    }

    /**
     * Empties the table and shrinks the hash slot count back to its baseline. The key
     * store itself is not wiped; it is simply overwritten from position 0 again.
     */
    public void clear()
    {
        dataArrayOffset = 0;
        putCount = 0;
        numberOfHashtableEntries = baselineHashtableSize();

        keyHashcodes.reset(numberOfHashtableEntries);
        offsetToData.reset(numberOfHashtableEntries);
        batchGrowthTracker.reset(GROWTH_TRACKER_SIZE);
        batchGrowthTracker.add(numberOfHashtableEntries);
    }

    /**
     * @return a non-negative hash code for the key, so that it can never equal the
     *         empty-slot marker and can be used directly in a modulo
     */
    private int getValidHashCode(DimensionKey key)
    {
        int h = key.hashCode();
        // -Integer.MIN_VALUE is still negative, so map it explicitly.
        if (h == Integer.MIN_VALUE)
            h = Integer.MAX_VALUE;
        return h < 0 ? -h : h;
    }

    /**
     * Finds the value index of the key, inserting the key first if it is not present.
     *
     * Original author's note: no need to have this pair any more.
     *
     * @param key key to look up
     * @return the shared result pair: first is the value index of the key, second is
     *         true if the key was inserted by this call. The pair is overwritten by the
     *         next call.
     * @throws RuntimeException if the probe sequence is exhausted without finding the
     *                          key or an empty slot
     */
    public Pair<Integer, Boolean> lookupOrCreateIndex(DimensionKey key)
    {
        if (putCount >= numberOfHashtableEntries)
            growHashtable();

        final int emptySlot = DEFAULT_VALUE.intValue();
        final int hashcode = getValidHashCode(key);
        offsetsIterator.setSeedOffset(hashcode);

        boolean isNewKey;
        int offset;

        // Look through offset enumerator to find empty / matching slot for key
        while (true)
        {
            offset = offsetsIterator.getNext();
            int storedHashcode = keyHashcodes.getInt(offset);

            // Test: if the slot is empty
            if (storedHashcode == emptySlot)
            {
                // Key does not exist, else would have been found before empty slot
                isNewKey = true;
                break;
            }

            // Test: if key @ slot matches current key
            if (storedHashcode == hashcode
                    && deserializedAndCompare(offsetToData.getInt(offset), key))
            {
                // Key exists. Rewrite
                isNewKey = false;
                break;
            }
        }

        if (isNewKey)
        {
            // new put.
            putCount++; // count of the unique number of keys;
            keyHashcodes.updateInt(offset, hashcode);
            offsetToData.updateInt(offset, writeToStore(key));
        }

        int realIndex = offsetToData.getInt(offset) / dimensionKeyLength;
        index.setFirst(realIndex);
        index.setSecond(isNewKey);
        return index;
    }

    /** Adds one batch of hash slots and records the new table size. */
    private void growHashtable()
    {
        LOG.info("Bump size. putCount = " + putCount +
                 " currentSize = " + numberOfHashtableEntries +
                 " increase by = " + keyHashtableBatchSize);

        numberOfHashtableEntries += keyHashtableBatchSize;
        batchGrowthTracker.add(numberOfHashtableEntries); // track growth
        keyHashcodes.ensureCapacity(numberOfHashtableEntries);
        offsetToData.ensureCapacity(numberOfHashtableEntries);
    }

    /**
     * Appends the key's ints to the key store.
     *
     * NOTE(review): capacity is reserved for dimensionKeyLength ints but
     * key.getArray().length ints are written; this assumes the two are equal, which is
     * not verified here.
     *
     * @return the position in the key store where the key starts
     */
    private int writeToStore(DimensionKey key)
    {
        final int startOffset = dataArrayOffset;
        final int[] dimArray = key.getArray();

        dimensionKeyArray.ensureCapacity(dataArrayOffset + dimensionKeyLength);
        for (int value : dimArray)
            dimensionKeyArray.updateInt(dataArrayOffset++, value);

        return startOffset;
    }

    /**
     * Rebuilds the key stored at the given key-store position.
     *
     * @param offset position in the key store where the key starts
     * @return the shared entry pair: first is a newly allocated key, second is the
     *         key's value index (offset / dimensionKeyLength). The pair is overwritten
     *         by the next call.
     */
    public Pair<DimensionKey, Integer> deserializedAndGetEntry(int offset)
    {
        entry.setSecond(offset / dimensionKeyLength);

        int[] data = new int[dimensionKeyLength];
        for (int i = 0; i < data.length; i++)
            data[i] = dimensionKeyArray.getInt(offset + i);

        entry.setFirst(new DimensionKey(data));
        return entry;
    }

    /**
     * Compares the key stored at the given key-store position with the given key,
     * without materializing the stored key.
     *
     * @param offset position in the key store where the stored key starts
     * @param key    key to compare against
     * @return true if the key has dimensionKeyLength ints and all of them match the
     *         stored ones
     */
    public boolean deserializedAndCompare(int offset, DimensionKey key)
    {
        int[] tocompare = key.getArray();
        if (tocompare.length != dimensionKeyLength)
            return false;

        for (int i = 0; i < tocompare.length; i++)
        {
            if (tocompare[i] != dimensionKeyArray.getInt(offset + i))
                return false;
        }
        return true;
    }

    /**
     * @param offset position in the key store where a key starts
     * @return the key's value index (offset / dimensionKeyLength), boxed
     */
    public Object deserializedAndGetValue(int offset)
    {
        return offset / dimensionKeyLength;
    }
}