package com.scaleunlimited.maps;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.io.Writable;
/**
* A {@code Map<String, String>} that uses fastutil for native type->native type mapping, and a byte array for
* storing the UTF-8 bytes for key/value pairs. This makes it much more efficient for storing lots of
* small strings, and it's very fast to serialize/deserialize.
*
*/
public class StringMap implements Map<String, String>, Writable {

    // Value returned by fastutil when we request an int that doesn't exist. Offsets into
    // the string data array are never negative, so -1 can't be confused with a real offset.
    private static final int MISSING_HASH_VALUE = -1;

    // Initial capacity requested for the hash -> offset table.
    private static final int DEFAULT_ENTRY_COUNT = 1000;

    // Initial size of the string data array, and the slack added every time it has to grow.
    private static final int STRING_DATA_BLOCKSIZE = 64 * 1024;

    // Upper bound for the string data array; stays a little below Integer.MAX_VALUE so the
    // size arithmetic can never wrap around to a negative int.
    private static final int MAX_STRING_DATA_SIZE = Integer.MAX_VALUE - 8;

    // FUTURE have multiple stringData arrays, each up to a max size, and determine which one via offset % block size.
    // That would avoid having one gigantic block of memory that we're expanding (and copying to).

    // FUTURE do in-place put if new key/value fit where old key/value was located.

    // FUTURE make it more efficient by skipping conversion of string to byte array, unless the key contains a
    // character > 0x7F (which means it's not something that fits in one byte in UTF-8)

    // FUTURE track empty space in data array due to removal/put that has to move. If it gets too big relative to
    // total file size, do a compaction. Walk data, generate up to say 10K offset/shift values (where shift
    // keeps increasing) - move the data as we do this. Then walk the map, and do binary search into offsets,
    // adjusting value by shift amount.

    // Design notes:
    //
    // - _hashToOffsets maps the hash of a key to the offset of that key's entry in _stringData.
    //   Only one key per hash value can live there (the "primary" entry for that hash).
    // - Any other key that hashes to an already occupied value is kept in _collisionMap.
    // - Invariant: every key in _collisionMap has a hash that is present in _hashToOffsets.
    //   readFields() enforces this, so remove() has to preserve it.
    // - An entry in _stringData is <key bytes> 0 <value bytes> 0. Removed entries are zero-filled.
    //
    // Limitations:
    //
    // - readFields() treats a zero-length key as a removed entry, so an empty key can't be
    //   round-tripped through write()/readFields().
    // - NOTE(review): a key or value containing U+0000 would presumably break the null-terminated
    //   layout (assuming HashUtils.getUTF8Bytes emits a 0 byte for it) - confirm against HashUtils.
    // - This class performs no synchronization of its own.

    private Int2IntOpenHashMap _hashToOffsets;
    private Map<String, String> _collisionMap;
    private byte[] _stringData;
    private int _curStringOffset;
    private boolean _smallHash; // for testing

    /**
     * Create an empty map that uses full 32-bit hash values.
     */
    public StringMap() {
        this(false);
    }

    /**
     * Create an empty map.
     *
     * @param smallHash if true, hashes are reduced to 256 distinct values so that collisions
     *                  are easy to trigger (for testing).
     */
    public StringMap(boolean smallHash) {
        reset(smallHash, DEFAULT_ENTRY_COUNT, 0, STRING_DATA_BLOCKSIZE);
    }

    /**
     * Throw away all current state and allocate fresh, empty storage.
     *
     * @param smallHash true to restrict hashes to 256 values (testing only)
     * @param numHashEntries expected number of entries in the hash table
     * @param numCollisionEntries initial capacity of the collision map
     * @param stringDataSize size in bytes of the string data array to allocate
     */
    private void reset(boolean smallHash, int numHashEntries, int numCollisionEntries, int stringDataSize) {
        _smallHash = smallHash;

        _hashToOffsets = new Int2IntOpenHashMap(numHashEntries);
        _hashToOffsets.defaultReturnValue(MISSING_HASH_VALUE);

        _collisionMap = new HashMap<String, String>(numCollisionEntries);

        // The key and value strings are stored as null-terminated UTF-8 bytes
        _stringData = new byte[stringDataSize];
        _curStringOffset = 0;
    }

    /**
     * Replace the contents of this map with data previously produced by {@link #write(DataOutput)}.
     * The hash table isn't serialized; it is rebuilt by walking the string data.
     *
     * If an exception is thrown, the map has already been reset and may be only partially populated.
     *
     * @param in source of the serialized data
     * @throws IOException if reading fails, or if the data is inconsistent (negative sizes,
     *                     unterminated strings, duplicate hashes, or collision entries without a
     *                     matching hash entry)
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        boolean smallHash = in.readBoolean();
        int numHashEntries = in.readInt();
        int numCollisionEntries = in.readInt();
        int stringDataSize = in.readInt();

        if ((numHashEntries < 0) || (numCollisionEntries < 0) || (stringDataSize < 0)) {
            throw new IOException("Data corruption - negative size in header!");
        }

        reset(smallHash, numHashEntries, numCollisionEntries, stringDataSize);
        in.readFully(_stringData, 0, stringDataSize);

        // The array is exactly as big as the serialized data, so it's completely in use.
        _curStringOffset = stringDataSize;

        // Now we have to rebuild the hash table from the data in _stringData.
        int offset = 0;
        while (offset < stringDataSize) {
            int keyLen = calcBoundedStringLength(offset, stringDataSize);
            if (keyLen > 0) {
                // only process strings we haven't deleted
                int hash = hash(_stringData, offset, keyLen);
                int oldOffset = _hashToOffsets.put(hash, offset);
                if (oldOffset != MISSING_HASH_VALUE) {
                    throw new IOException("Data corruption - hash already exists!");
                }

                offset += (keyLen + 1);

                // Skip over the value
                int valueLen = calcBoundedStringLength(offset, stringDataSize);
                offset += (valueLen + 1);
            } else {
                // A zero byte where a key should start is left over from a removed entry.
                offset += 1;
            }
        }

        // Now read in the collision values. For each, make sure we already have a
        // hash entry, otherwise it's an error.
        for (int i = 0; i < numCollisionEntries; i++) {
            String key = in.readUTF();
            String value = in.readUTF();

            int hash = hash(key);
            if (!_hashToOffsets.containsKey(hash)) {
                throw new IOException("Data corruption - collision entry doesn't exist in hash!");
            }

            if (_collisionMap.put(key, value) != null) {
                throw new IOException("Data corruption - collision entry already exists!");
            }
        }
    }

    /**
     * Serialize this map. The format is: the small-hash flag, the number of hash entries, the
     * number of collision entries, the number of used string data bytes, those bytes, and then
     * each collision entry as a pair of {@link DataOutput#writeUTF(String)} strings.
     *
     * @param out destination for the serialized data
     * @throws IOException if writing fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeBoolean(_smallHash);
        out.writeInt(_hashToOffsets.size());
        out.writeInt(_collisionMap.size());

        // Write out the key & value data info. We can re-build the hash table from
        // this array.
        out.writeInt(_curStringOffset);
        out.write(_stringData, 0, _curStringOffset);

        // Write out the entries we've saved in the collision set.
        for (Entry<String, String> entry : _collisionMap.entrySet()) {
            out.writeUTF(entry.getKey());
            out.writeUTF(entry.getValue());
        }
    }

    /**
     * Return the number of bytes between <code>startingOffset</code> and the next zero byte
     * in the string data. The caller must know that a terminator exists.
     *
     * @param startingOffset offset of the first byte of the string
     * @return length of the string in bytes, excluding the terminator
     */
    private int calcStringLength(int startingOffset) {
        int curOffset = startingOffset;
        while (_stringData[curOffset] != 0) {
            curOffset += 1;
        }

        return curOffset - startingOffset;
    }

    /**
     * Same as {@link #calcStringLength(int)}, but for data we don't trust yet: never reads at
     * or beyond <code>limit</code>.
     *
     * @param startingOffset offset of the first byte of the string
     * @param limit number of valid bytes in the string data
     * @return length of the string in bytes, excluding the terminator
     * @throws IOException if no terminator is found before <code>limit</code>
     */
    private int calcBoundedStringLength(int startingOffset, int limit) throws IOException {
        int curOffset = startingOffset;
        while ((curOffset < limit) && (_stringData[curOffset] != 0)) {
            curOffset += 1;
        }

        if (curOffset >= limit) {
            throw new IOException("Data corruption - unterminated string in string data!");
        }

        return curOffset - startingOffset;
    }

    /**
     * Decode a range of the string data as UTF-8.
     *
     * @param valueOffset offset of the first byte
     * @param valueLen number of bytes to decode
     * @return the decoded string
     */
    private String getValueString(int valueOffset, int valueLen) {
        try {
            return new String(_stringData, valueOffset, valueLen, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException("Impossible missing charset exception", e);
        }
    }

    /**
     * Generate a 32-bit JOAAT hash from the bytes of <code>phrase</code> (computed by
     * <code>HashUtils.getIntHash</code>). In small-hash mode only the low 8 bits are kept.
     *
     * @param phrase String to hash
     * @return 32-bit hash
     */
    public int hash(String phrase) {
        int result = HashUtils.getIntHash(phrase);
        if (_smallHash) {
            // only generate 256 unique hash values, for testing.
            result = result & 0x0FF;
        }

        return result;
    }

    /**
     * Hash a key that is already stored as bytes. readFields() relies on this producing the
     * same value as {@link #hash(String)} does for the corresponding string.
     *
     * @param b array containing the key bytes
     * @param offset offset of the first key byte
     * @param length number of key bytes
     * @return 32-bit hash
     */
    private int hash(byte[] b, int offset, int length) {
        int result = HashUtils.getIntHash(b, offset, length);
        if (_smallHash) {
            // only generate 256 unique hash values, for testing.
            result = result & 0x0FF;
        }

        return result;
    }

    /**
     * Find the primary entry for <code>key</code>.
     *
     * @param key key to look for
     * @param hash result of {@link #hash(String)} for <code>key</code>
     * @return offset of the key's entry in the string data, or MISSING_HASH_VALUE if the hash
     *         slot is empty or holds a different key
     */
    private int findKeyOffset(String key, int hash) {
        int keyOffset = _hashToOffsets.get(hash);
        if (keyOffset == MISSING_HASH_VALUE) {
            return MISSING_HASH_VALUE;
        }

        // We might have a match...need to see if the actual string matches our stored bytes.
        byte[] keyBytes = HashUtils.getUTF8Bytes(key);

        // A stored key of this length needs its terminator inside the used part of the array,
        // which also guarantees that the comparisons below stay in bounds.
        if (keyBytes.length >= (_curStringOffset - keyOffset)) {
            return MISSING_HASH_VALUE;
        }

        for (int i = 0; i < keyBytes.length; i++) {
            if (keyBytes[i] != _stringData[keyOffset + i]) {
                return MISSING_HASH_VALUE;
            }
        }

        // If it matched all of the string bytes, make sure we've got our terminating null byte,
        // otherwise the stored key is longer than the one we're looking for.
        if (_stringData[keyOffset + keyBytes.length] != 0) {
            return MISSING_HASH_VALUE;
        }

        return keyOffset;
    }

    /**
     * Append a key/value pair to the string data (growing the array if needed), and make
     * <code>hash</code> point to it, replacing whatever offset was registered before.
     *
     * @param hash result of {@link #hash(String)} for <code>key</code>
     * @param key key to store
     * @param value value to store
     * @throws IllegalStateException if the string data would exceed the maximum array size
     */
    private void appendEntry(int hash, String key, String value) {
        byte[] keyBytes = HashUtils.getUTF8Bytes(key);
        byte[] valueBytes = HashUtils.getUTF8Bytes(value);

        // Make sure we have enough space in the array. Calculate with longs so that a huge map
        // fails cleanly, before any state has been modified, instead of overflowing.
        long requiredSize = (long)_curStringOffset + keyBytes.length + 1 + valueBytes.length + 1;
        if (requiredSize > MAX_STRING_DATA_SIZE) {
            throw new IllegalStateException("String data would exceed the maximum size of " + MAX_STRING_DATA_SIZE + " bytes");
        }

        if (requiredSize > _stringData.length) {
            int newSize = (int)Math.min(requiredSize + STRING_DATA_BLOCKSIZE, (long)MAX_STRING_DATA_SIZE);
            byte[] newData = new byte[newSize];
            System.arraycopy(_stringData, 0, newData, 0, _curStringOffset);
            _stringData = newData;
        }

        _hashToOffsets.put(hash, _curStringOffset);

        System.arraycopy(keyBytes, 0, _stringData, _curStringOffset, keyBytes.length);
        _curStringOffset += keyBytes.length;
        _stringData[_curStringOffset++] = 0;

        System.arraycopy(valueBytes, 0, _stringData, _curStringOffset, valueBytes.length);
        _curStringOffset += valueBytes.length;
        _stringData[_curStringOffset++] = 0;
    }

    /**
     * Zero out the entry that starts at <code>keyOffset</code>, so it doesn't get re-added
     * when we de-serialize things. The hash table is not touched.
     *
     * @param keyOffset offset of the entry's key in the string data
     * @return the value the entry had
     */
    private String eraseEntry(int keyOffset) {
        int keyLen = calcStringLength(keyOffset);
        int valueOffset = keyOffset + keyLen + 1;
        int valueLen = calcStringLength(valueOffset);
        String oldValue = getValueString(valueOffset, valueLen);

        // FUTURE set up to reclaim space in string data block.
        // We'd want to save the offset somewhere
        Arrays.fill(_stringData, keyOffset, valueOffset + valueLen + 1, (byte)0);
        return oldValue;
    }

    /**
     * Called after the primary entry for <code>hash</code> was removed: if the collision map
     * holds a key with that hash, move it into the (now free) primary slot. Without this, the
     * collision entry would have no matching hash entry, which readFields() rejects.
     *
     * @param hash hash value whose primary entry was just removed
     */
    private void promoteCollision(int hash) {
        if (_collisionMap.isEmpty()) {
            return;
        }

        String promotedKey = null;
        for (String candidate : _collisionMap.keySet()) {
            if (hash(candidate) == hash) {
                promotedKey = candidate;
                break;
            }
        }

        if (promotedKey != null) {
            // Append first, so that the entry isn't lost if appending fails.
            appendEntry(hash, promotedKey, _collisionMap.get(promotedKey));
            _collisionMap.remove(promotedKey);
        }
    }

    /**
     * @return total number of entries (primary entries plus collision entries)
     */
    @Override
    public int size() {
        return _hashToOffsets.size() + _collisionMap.size();
    }

    @Override
    public boolean isEmpty() {
        return _hashToOffsets.isEmpty() && _collisionMap.isEmpty();
    }

    /**
     * Remove the entry for <code>key</code>, if there is one. A key that merely shares its
     * hash with a stored key doesn't affect the stored key.
     *
     * @param key key to remove
     * @return the value that was removed, or null if <code>key</code> isn't a String or isn't
     *         in the map
     */
    @Override
    public String remove(Object key) {
        if (!(key instanceof String)) {
            return null;
        }

        String stringKey = (String)key;
        String collisionValue = _collisionMap.remove(stringKey);
        if (collisionValue != null) {
            return collisionValue;
        }

        int hash = hash(stringKey);
        int keyOffset = findKeyOffset(stringKey, hash);
        if (keyOffset == MISSING_HASH_VALUE) {
            // Either nothing has this hash, or the slot belongs to a different key.
            return null;
        }

        _hashToOffsets.remove(hash);
        String result = eraseEntry(keyOffset);

        // Keep the "collision entries always have a hash entry" invariant intact.
        promoteCollision(hash);
        return result;
    }

    /**
     * Remove all entries. An oversized string data array is replaced by one of the default size.
     */
    @Override
    public void clear() {
        _hashToOffsets.clear();
        _collisionMap.clear();

        // Decrease size of byte arrays
        if (_stringData.length > STRING_DATA_BLOCKSIZE) {
            _stringData = new byte[STRING_DATA_BLOCKSIZE];
        }

        // Stale bytes in a re-used array are harmless, since put() always writes terminators
        // and nothing reads at or beyond _curStringOffset.
        _curStringOffset = 0;
    }

    /**
     * @param key key to look for
     * @return true if <code>key</code> is a String that is stored in the map
     */
    @Override
    public boolean containsKey(Object key) {
        if (!(key instanceof String)) {
            return false;
        }

        String stringKey = (String)key;
        if (findKeyOffset(stringKey, hash(stringKey)) != MISSING_HASH_VALUE) {
            return true;
        }

        // If it didn't match, see if it's in the collision set.
        return _collisionMap.containsKey(stringKey);
    }

    @Override
    public boolean containsValue(Object value) {
        throw new UnsupportedOperationException("Not yet implemented");
    }

    /**
     * @param key key to look up
     * @return the value stored for <code>key</code>, or null if <code>key</code> isn't a String
     *         or isn't in the map
     */
    @Override
    public String get(Object key) {
        if (!(key instanceof String)) {
            return null;
        }

        String stringKey = (String)key;

        // See if we have it in the collision map.
        String collisionValue = _collisionMap.get(stringKey);
        if (collisionValue != null) {
            return collisionValue;
        }

        int keyOffset = findKeyOffset(stringKey, hash(stringKey));
        if (keyOffset == MISSING_HASH_VALUE) {
            return null;
        }

        int valueOffset = keyOffset + calcStringLength(keyOffset) + 1;
        return getValueString(valueOffset, calcStringLength(valueOffset));
    }

    /**
     * Add an entry, or replace the value of an existing one. The first key with a given hash
     * is stored in the string data; further keys with the same hash go to the collision map.
     *
     * @param key key to store; must not be null
     * @param value value to store; must not be null
     * @return the previous value for <code>key</code>, or null if there was none
     * @throws NullPointerException if <code>key</code> or <code>value</code> is null
     * @throws IllegalStateException if the string data would exceed the maximum array size
     */
    @Override
    public String put(String key, String value) {
        if ((key == null) || (value == null)) {
            // A null that ended up in the collision map would make write() fail much later.
            throw new NullPointerException("StringMap does not support null keys or values");
        }

        int hash = hash(key);
        if (_hashToOffsets.get(hash) == MISSING_HASH_VALUE) {
            // We need to add it to the array and the hash set. There was no previous value.
            appendEntry(hash, key, value);
            return null;
        }

        int keyOffset = findKeyOffset(key, hash);
        if (keyOffset != MISSING_HASH_VALUE) {
            // We're updating something that's in our hash. Append the new entry (which
            // re-points the hash at it) before erasing the old one, so that a failure to
            // append leaves the old entry untouched.
            // FUTURE if new value length <= old value length, insert in-place and zero out the
            // remaining data.
            appendEntry(hash, key, value);
            return eraseEntry(keyOffset);
        }

        // We're adding a collision entry, or updating one that already exists.
        return _collisionMap.put(key, value);
    }

    @Override
    public void putAll(Map<? extends String, ? extends String> m) {
        throw new UnsupportedOperationException("Not yet implemented");
    }

    @Override
    public Set<String> keySet() {
        throw new UnsupportedOperationException("Not yet implemented");
    }

    @Override
    public Collection<String> values() {
        throw new UnsupportedOperationException("Not yet implemented");
    }

    @Override
    public Set<java.util.Map.Entry<String, String>> entrySet() {
        throw new UnsupportedOperationException("Not yet implemented");
    }
}