/*
* (c) 2014 LinkedIn Corp. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use
* this file except in compliance with the License. You may obtain a copy of the
* License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
*/
package com.linkedin.cubert.memory;
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.util.IndexedSortable;
import org.apache.hadoop.util.QuickSort;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.utils.TupleStore;
import com.linkedin.cubert.utils.print;
/**
* The LookUpTable is a memory efficient data structure for performing look ups.
* This is primarily useful while performing a HASH JOIN operation. The is designed to work in conjunction
* with the SerializedTupleStore.
*
* Constraints:
* The data structure needs to be initialized with all the data before any queries can be performed on the
* structure.
*
* Data Structures:
* A HashCode array is maintained where the index of the array would represent the HashCode % (size of the array)
* and the value would be the position or index of a sorted offset array.
* The sorted offset array is an array of offsets which point to a tuple in the SerializedTupleStore.
* The array is sorted by hashcode, hashkey.
*
* Algorithm:
* The algorithm is a 2 step fetch.
* Step 1> Compute Hash Code of Key
* Step 2> idx = HashCodeArray[HashCode] gives the index in the offset array to start looking
* Step 3> Linearly probe from idx onwards in the offset array and find the tuple which matches the hash key
*
* @author spyne on Oct 13, 2014
*/
public class LookUpTable implements Map<Tuple, List<Tuple>>
{
/* Used for obtaining the offset value */
private static final int SIGNBIT = Integer.MIN_VALUE;
private static final int MASK = Integer.MAX_VALUE;
private static final int SIZE_HASH_CODE_ARR = 1 << 22;
/* IndexSortable: Used for quick sort of offsets */
private final IndexedSortable sortable;
/* Tuple Store data */
private final TupleStore store;
/* Schema attributes */
private final BlockSchema schema;
private final int[] comparatorIndices; // Indices of Attributes that need to be Compared
/* Hash Table data structures */
private final int[] hashCodeArr;
private final int[] offsetArr;
public LookUpTable(TupleStore store,
String[] comparatorKeys) throws IOException
{
this.store = store;
schema = store.getSchema();
comparatorIndices = createComparatorIndices(comparatorKeys);
/* Hash Table Data Structures */
hashCodeArr = new int[SIZE_HASH_CODE_ARR];
offsetArr = store.getOffsets();
Arrays.fill(hashCodeArr, -1);
if (store instanceof ColumnarTupleStore)
sortable = new ColumnarIndexedSortable();
else
sortable = new CachedIndexedSortable();
buildTable();
}
private int[] createComparatorIndices(String[] comparatorKeys)
{
int[] keyIndices = new int[comparatorKeys.length];
for (int i = 0; i < keyIndices.length; i++)
keyIndices[i] = schema.getIndex(comparatorKeys[i]);
return keyIndices;
}
private void buildTable() throws IOException
{
QuickSort quickSort = new QuickSort();
long start, end;
/* Sort the offsets array */
start = System.currentTimeMillis();
if (offsetArr.length > 1)
{
quickSort.sort(sortable, 0, offsetArr.length);
}
end = System.currentTimeMillis();
print.f("LookUpTable: Sorted %d entries in %d ms", offsetArr.length, (end - start));
/* Fill in the HashCode array */
start = System.currentTimeMillis();
int prevHashCode = -1;
Tuple prevTuple = newTuple();
Tuple t = newTuple();
for (int i = 0; i < offsetArr.length; ++i)
{
t = store.getTuple(offsetArr[i], t);
int hashCode = tupleHashCode(t);
if (prevHashCode != hashCode)
{
hashCodeArr[hashCode] = i;
prevHashCode = hashCode;
}
if (i == 0 || !compareKeys(prevTuple, t))
{
offsetArr[i] = offsetArr[i] | SIGNBIT;
}
/* Object Reuse: Swap the tuples instead of creating new ones */
Tuple temp = t;
t = prevTuple;
prevTuple = temp;
}
end = System.currentTimeMillis();
print.f("LookUpTable: Created HashCode Array for %d entries in %d ms", offsetArr.length, (end - start));
}
private boolean compareKeys(Tuple t1, Tuple t2) throws ExecException
{
for (int idx : comparatorIndices)
{
if (!t1.get(idx).equals(t2.get(idx)))
{
return false;
}
}
return true;
}
public Tuple newTuple()
{
return TupleFactory.getInstance().newTuple(schema.getNumColumns());
}
@Override
public int size()
{
return offsetArr.length;
}
@Override
public boolean isEmpty()
{
return size() == 0;
}
@Override
public boolean containsKey(Object key)
{
try
{
return hashCodeArr[keyHashCode((Tuple) key)] != -1;
} catch (ExecException e)
{
e.printStackTrace();
throw new RuntimeException(e);
}
}
@Override
public boolean containsValue(Object value)
{
throw new UnsupportedOperationException();
}
@Override
public List<Tuple> get(Object keyObject)
{
Tuple key = (Tuple) keyObject;
try
{
/* Get the hashcode of the tuple considering only the comparator keys */
final int hashCode = keyHashCode(key);
/* This is the start index in the offset array */
final int startOffsetIdx = hashCodeArr[hashCode];
if (startOffsetIdx == -1) return null;
/* The end index is the start index of the next element unless its the last element */
int nextHashCode = hashCode + 1;
while (nextHashCode < hashCodeArr.length && hashCodeArr[nextHashCode] == -1)
{
++nextHashCode;
}
final int endOffsetIdx = nextHashCode < hashCodeArr.length ? hashCodeArr[nextHashCode] : offsetArr.length;
List<Tuple> tuples = new ArrayList<Tuple>();
boolean found = false;
for (int i = startOffsetIdx; i < endOffsetIdx; ++i)
{
int offset = offsetArr[i];
if (offset < 0)
{
if (found) break;
offset = offset & MASK;
final Tuple t = store.getTuple(offset, null);
if (matchesKey(key, t))
{
found = true;
tuples.add(t);
}
}
else if (found)
{
offset = offset & MASK;
tuples.add(store.getTuple(offset, null));
}
}
if( !found)
return null;
return tuples;
}
catch (Exception e)
{
e.printStackTrace();
throw new RuntimeException(e);
}
}
private boolean matchesKey(Tuple key, Tuple t) throws ExecException
{
for (int i = 0; i < comparatorIndices.length; ++i)
{
if (!key.get(i).equals(t.get(comparatorIndices[i])))
{
return false;
}
}
return true;
}
private int tupleHashCode(Tuple tuple) throws ExecException
{
final int PRIME = 31;
long hashCode = 17;
for (int idx : comparatorIndices)
{
hashCode = hashCode * PRIME + tuple.get(idx).hashCode();
}
if (hashCode < 0) hashCode = -hashCode;
return (int) (hashCode % hashCodeArr.length);
}
private int keyHashCode(Tuple key) throws ExecException
{
final int PRIME = 31;
long hashCode = 17;
for (int i = 0; i < key.size(); i++)
{
hashCode = hashCode * PRIME + key.get(i).hashCode();
}
if (hashCode < 0) hashCode = -hashCode;
return (int) (hashCode % hashCodeArr.length);
}
@Override
public List<Tuple> remove(Object key)
{
throw new UnsupportedOperationException();
}
@Override
public void clear()
{
throw new UnsupportedOperationException();
}
@SuppressWarnings("NullableProblems")
@Override
public Set<Tuple> keySet()
{
final int nKeyColumns = comparatorIndices.length;
final TupleFactory factory = TupleFactory.getInstance();
final Tuple reuse = newTuple();
final Set<Tuple> keys = new HashSet<Tuple>();
try
{
for (int offset : offsetArr)
{
/* For every new key the sign bit is set. Thus, ignore all others */
if (offset >= 0)
{
continue;
}
/* Mask out the offset and fetch from store */
offset = offset & MASK;
store.getTuple(offset, reuse);
/* Create a key tuple and add it to the set */
final Tuple t = factory.newTuple(nKeyColumns);
for (int c = 0; c < nKeyColumns; ++c)
{
t.set(c, reuse.get(comparatorIndices[c]));
}
keys.add(t);
}
}
catch (ExecException e)
{
throw new RuntimeException(e);
}
return keys;
}
@SuppressWarnings("NullableProblems")
@Override
public Collection<List<Tuple>> values()
{
throw new UnsupportedOperationException();
}
@SuppressWarnings("NullableProblems")
@Override
public Set<Entry<Tuple, List<Tuple>>> entrySet()
{
throw new UnsupportedOperationException("entrySet method not implemented");
}
@Override
public List<Tuple> put(Tuple key, List<Tuple> value)
{
throw new UnsupportedOperationException("put() called on read-only map");
}
@SuppressWarnings("NullableProblems")
@Override
public void putAll(Map<? extends Tuple, ? extends List<Tuple>> m)
{
throw new UnsupportedOperationException("putAll() called on read-only map");
}
public void printTuples()
{
final Tuple reuse = newTuple();
for (int offset : offsetArr)
{
System.out.println(store.getTuple(offset, reuse));
}
}
public IndexedSortable getSortable()
{
return sortable;
}
class ColumnarIndexedSortable implements IndexedSortable
{
private Tuple t1;
private Tuple t2;
@Override
public int compare(int i, int j)
{
try
{
final int offset1 = offsetArr[i] & MASK;
final int offset2 = offsetArr[j] & MASK;
t1 = store.getTuple(offset1, t1);
t2 = store.getTuple(offset2, t2);
/* t1 - t2 => ascending */
int result = tupleHashCode(t1) - tupleHashCode(t2);
if (result != 0) return result;
for (int idx : comparatorIndices)
{
Comparable a = (Comparable) t1.get(idx);
Comparable b = (Comparable) t2.get(idx);
/* ascending */
result = a.compareTo(b);
if (result != 0) return result;
}
return 0;
}
catch (IOException e)
{
e.printStackTrace();
throw new RuntimeException(e);
}
}
@Override
public void swap(int i, int j)
{
int temp = offsetArr[i];
offsetArr[i] = offsetArr[j];
offsetArr[j] = temp;
}
}
/**
* Created by spyne on 10/15/14.
*/
class CachedIndexedSortable implements IndexedSortable
{
/* Objects reused */
private final int[] offsets = { -1, -1, -1 };
private final Tuple[] tuples;
public CachedIndexedSortable()
{
tuples = new Tuple[]
{
newTuple(),
newTuple(),
newTuple()
};
}
/**
* Copied from SerializedStoreTupleComparator
*/
private Tuple getCached(final int offset) throws IOException
{
if (offsets[0] == offset)
return tuples[0];
if (offsets[1] == offset)
{
int tmp = offsets[0];
offsets[0] = offsets[1];
offsets[1] = tmp;
Tuple ttmp = tuples[0];
tuples[0] = tuples[1];
tuples[1] = ttmp;
return tuples[0];
}
int tmp0 = offsets[0];
int tmp1 = offsets[1];
offsets[0] = offsets[2];
offsets[1] = tmp0;
offsets[2] = tmp1;
Tuple ttmp0 = tuples[0];
Tuple ttmp1 = tuples[1];
tuples[0] = tuples[2];
tuples[1] = ttmp0;
tuples[2] = ttmp1;
if (offsets[0] != offset)
{
tuples[0] = store.getTuple(offset, tuples[0]);
offsets[0] = offset;
}
return tuples[0];
}
@SuppressWarnings("unchecked")
@Override
public int compare(int i, int j)
{
try
{
final int offset1 = offsetArr[i] & MASK;
final int offset2 = offsetArr[j] & MASK;
Tuple t1 = getCached(offset1);
Tuple t2 = getCached(offset2);
/* t1 - t2 => ascending */
int result = tupleHashCode(t1) - tupleHashCode(t2);
if (result != 0) return result;
for (int idx : comparatorIndices)
{
Comparable a = (Comparable) t1.get(idx);
Comparable b = (Comparable) t2.get(idx);
/* ascending */
result = a.compareTo(b);
if (result != 0) return result;
}
return 0;
}
catch (IOException e)
{
e.printStackTrace();
throw new RuntimeException(e);
}
}
@Override
public void swap(int i, int j)
{
int temp = offsetArr[i];
offsetArr[i] = offsetArr[j];
offsetArr[j] = temp;
}
}
}