package com.scaleunlimited.maps;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import org.apache.hadoop.io.Writable;
public class StringSet implements Set<String>, Writable {
// Value returned by fastutil when we request an int that doesn't exist.
private static final int MISSING_HASH_VALUE = -1;
private static final int DEFAULT_ENTRY_COUNT = 1000;
private static final int STRING_DATA_BLOCKSIZE = 64 * 1024;
private Int2IntOpenHashMap _hashToOffset;
private Set<String> _collisionSet;
private byte[] _stringData;
private int _curOffset;
private boolean _smallHash; // for testing
public StringSet() {
this(false);
}
public StringSet(boolean smallHash) {
reset(smallHash, DEFAULT_ENTRY_COUNT, 0, STRING_DATA_BLOCKSIZE);
}
private void reset(boolean smallHash, int numHashEntries, int numCollisionEntries, int stringDataSize) {
_smallHash = smallHash;
_hashToOffset = new Int2IntOpenHashMap(numHashEntries);
_hashToOffset.defaultReturnValue(MISSING_HASH_VALUE);
_collisionSet = new HashSet<String>(numCollisionEntries);
_stringData = new byte[stringDataSize];
_curOffset = 0;
}
@Override
public void readFields(DataInput in) throws IOException {
boolean smallHash = in.readBoolean();
int numHashEntries = in.readInt();
int numCollisionEntries = in.readInt();
int stringDataSize = in.readInt();
reset(smallHash, numHashEntries, numCollisionEntries, stringDataSize);
in.readFully(_stringData, 0, stringDataSize);
// Now we have to rebuild the hash table from the data in _stringData.
for (; _curOffset < stringDataSize; ) {
int len = calcStringLength(_curOffset);
if (len > 0) {
// only process strings we haven't deleted
int hash = hash(_stringData, _curOffset, len);
int oldOffset = _hashToOffset.put(hash, _curOffset);
if (oldOffset != MISSING_HASH_VALUE) {
throw new IOException("Data corruption - hash already exists!");
}
}
// Skip over the null value.
_curOffset += (len + 1);
}
// Now read in the collision values. For each, make sure we already have a
// hash entry, otherwise it's an error.
for (int i = 0; i < numCollisionEntries; i++) {
String s = in.readUTF();
int hash = hash(s);
if (!_hashToOffset.containsKey(hash)) {
throw new IOException("Data corruption - collision entry doesn't exist in hash!");
}
if (!_collisionSet.add(s)) {
throw new IOException("Data corruption - collision entry already exists!");
}
}
}
@Override
public void write(DataOutput out) throws IOException {
out.writeBoolean(_smallHash);
out.writeInt(_hashToOffset.size());
out.writeInt(_collisionSet.size());
out.writeInt(_curOffset);
// Now just write out the string array. We can re-build the hash table from
// this array.
out.write(_stringData, 0, _curOffset);
// Write out the strings we've saved in the collision set.
Iterator<String> iter = _collisionSet.iterator();
while (iter.hasNext()) {
out.writeUTF(iter.next());
}
}
private int calcStringLength(int startingOffset) {
int curOffset = startingOffset;
while (_stringData[curOffset] != 0) {
curOffset += 1;
}
return curOffset - startingOffset;
}
/**
* Generate a 32-bit JOAAT hash from the bytes of <phrase>
*
* @param phrase String to hash
* @return 32-bit hash
*/
public int hash(String phrase) {
int result = HashUtils.getIntHash(phrase);
if (_smallHash) {
// only generate 256 unique hash values.
result = result & 0x0FF;
}
return result;
}
private int hash(byte[] b, int offset, int length) {
int result = HashUtils.getIntHash(b, offset, length);
if (_smallHash) {
// only generate 256 unique hash values.
result = result & 0x0FF;
}
return result;
}
@Override
public int size() {
return _hashToOffset.size() + _collisionSet.size();
}
@Override
public boolean isEmpty() {
return _hashToOffset.isEmpty() && _collisionSet.isEmpty();
}
@Override
public boolean contains(Object o) {
if (o instanceof String) {
int hash = hash((String)o);
int offset = _hashToOffset.get(hash);
if (offset == MISSING_HASH_VALUE) {
return false;
}
// We might have a match...need to see if the actual string matches our stored bytes.
// If not, then we check the collision set.
byte[] stringBytes = getUTF8Bytes((String)o);
boolean matches = true;
for (int i = 0; (i < stringBytes.length) && matches; i++) {
if (stringBytes[i] != _stringData[offset + i]) {
matches = false;
}
}
// If it matched all of the string bytes, make sure we've got our terminating null byte.a
matches = matches && _stringData[offset + stringBytes.length] == 0;
// If it didn't match, see if it's in the collision set.
return(matches || _collisionSet.contains((String)o));
} else {
return false;
}
}
private byte[] getUTF8Bytes(String str) {
try {
return str.getBytes("UTF-8");
} catch (UnsupportedEncodingException e) {
throw new RuntimeException("Impossible missing charset exception", e);
}
}
@Override
public Iterator<String> iterator() {
throw new UnsupportedOperationException("Not yet implemented");
// We'd need to create an iterator that iterates over all of the values, and
// also returns everything from the collision set.
}
@Override
public Object[] toArray() {
throw new UnsupportedOperationException("Not yet implemented");
}
@Override
public <T> T[] toArray(T[] a) {
throw new UnsupportedOperationException("Not yet implemented");
}
@Override
public boolean add(String e) {
if (contains(e)) {
return false;
}
int hash = hash(e);
int offset = _hashToOffset.get(hash);
if (offset == MISSING_HASH_VALUE) {
// We need to add it to the array and the hash set
byte[] stringBytes = getUTF8Bytes(e);
// Make sure we have enough space in the array.
int endOffset = _curOffset + stringBytes.length + 1;
if (endOffset > _stringData.length) {
byte[] newData = new byte[endOffset + STRING_DATA_BLOCKSIZE];
System.arraycopy(_stringData, 0, newData, 0, _curOffset);
_stringData = newData;
}
System.arraycopy(stringBytes, 0, _stringData, _curOffset, stringBytes.length);
_hashToOffset.put(hash, _curOffset);
_curOffset += stringBytes.length;
// And null-terminate it.
_stringData[_curOffset++] = 0;
} else {
_collisionSet.add(e);
}
return true;
}
@Override
public boolean remove(Object o) {
if (o instanceof String) {
if (_collisionSet.remove(o)) {
return true;
} else {
// FUTURE set up to reclaim space in string data block.
// We'd want to save the offset somewhere
int hash = hash((String)o);
int stringDataOffset = _hashToOffset.remove(hash);
if (stringDataOffset != MISSING_HASH_VALUE) {
// We need to clear out the entry so we don't re-add it as a string
// when we de-serialize things.
int len = calcStringLength(stringDataOffset);
Arrays.fill(_stringData, stringDataOffset, stringDataOffset + len, (byte)0);
return true;
} else {
return false;
}
}
} else {
return false;
}
}
@Override
public boolean containsAll(Collection<?> c) {
throw new UnsupportedOperationException("Not yet implemented");
}
@Override
public boolean addAll(Collection<? extends String> c) {
throw new UnsupportedOperationException("Not yet implemented");
}
@Override
public boolean retainAll(Collection<?> c) {
throw new UnsupportedOperationException("Not yet implemented");
}
@Override
public boolean removeAll(Collection<?> c) {
throw new UnsupportedOperationException("Not yet implemented");
}
@Override
public void clear() {
_hashToOffset.clear();
_collisionSet.clear();
// Decrease size of byte array
if (_stringData.length > STRING_DATA_BLOCKSIZE) {
_stringData = new byte[STRING_DATA_BLOCKSIZE];
}
_curOffset = 0;
}
}