/**
* Copyright 2011 LiveRamp
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.liveramp.hank.storage.cueball;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.util.Arrays;
import com.liveramp.commons.util.BytesUtils;
import com.liveramp.hank.compression.cueball.CueballCompressionCodec;
import com.liveramp.hank.hasher.Hasher;
import com.liveramp.hank.storage.Writer;
import com.liveramp.hank.util.EncodingHelper;
import com.liveramp.hank.util.IOStreamUtils;
/**
 * Sequential writer for Cueball partition files. Note that the current
 * implementation does not support writing more than 80000 entries into a
 * single block.
 */
public class CueballWriter implements Writer {

  // Capacity of the per-block staging buffer: at most this many
  // (hash + value) entries fit in one block before it must be flushed.
  private static final int DEFAULT_NUMBER_OF_ENTRIES = 80000;

  private final OutputStream stream;
  private final int keyHashSize;
  private final Hasher hasher;
  private final int valueSize;
  private final CueballCompressionCodec compressionCodec;

  // Uncompressed staging area for the current block, plus a scratch buffer
  // sized for its worst-case compressed form.
  private final byte[] uncompressedBuffer;
  private final byte[] compressedBuffer;

  // Scratch space for the current key's hash, and the previous key/hash so
  // the ordering and no-collision invariants can be enforced.
  private final byte[] keyHashBytes;
  private final byte[] previousKeyHashBytes;
  private ByteBuffer previousKey = null;

  // Maps each hash prefix to the byte offset of its block in the output
  // stream; -1 means no block was written for that prefix.
  private final long[] hashIndex;
  private final HashPrefixCalculator prefixer;
  private int lastHashPrefix = -1;

  private int uncompressedOffset = 0;
  private int numEntriesInBlock = 0;
  private long numBytesWritten = 0;
  private long numRecordsWritten = 0;

  // Largest block sizes observed so far; written into the footer as buffer
  // sizing hints for readers.
  private long maxUncompressedBlockSize;
  private long maxCompressedBlockSize;

  /**
   * @param outputStream destination stream; wrapped in a BufferedOutputStream
   * @param keyHashSize number of bytes of each key hash that are stored
   * @param hasher hash function applied to incoming keys
   * @param valueSize fixed size, in bytes, of every value
   * @param compressionCodec codec used to compress each block
   * @param hashIndexBits number of leading hash bits used as the block prefix;
   *     the footer index has 2^hashIndexBits slots
   */
  public CueballWriter(OutputStream outputStream,
                       int keyHashSize,
                       Hasher hasher,
                       int valueSize,
                       CueballCompressionCodec compressionCodec,
                       int hashIndexBits) {
    // Buffer output
    this.stream = new BufferedOutputStream(outputStream, IOStreamUtils.DEFAULT_BUFFER_SIZE);
    this.keyHashSize = keyHashSize;
    this.hasher = hasher;
    this.valueSize = valueSize;
    this.compressionCodec = compressionCodec;
    uncompressedBuffer = new byte[(keyHashSize + valueSize) * DEFAULT_NUMBER_OF_ENTRIES];
    compressedBuffer = new byte[compressionCodec.getMaxCompressBufferSize(uncompressedBuffer.length)];
    keyHashBytes = new byte[keyHashSize];
    previousKeyHashBytes = new byte[keyHashSize];
    // New byte arrays are already zero-filled by the JVM; the explicit fill is
    // kept to make the "previous hash starts at all zeroes" invariant obvious.
    Arrays.fill(previousKeyHashBytes, (byte) 0);
    prefixer = new HashPrefixCalculator(hashIndexBits);
    hashIndex = new long[1 << hashIndexBits];
    Arrays.fill(hashIndex, -1);
  }

  /**
   * Writes one record. Keys must be presented in strictly increasing key-hash
   * order and the value must be exactly {@code valueSize} bytes.
   *
   * @throws IOException if the value size is wrong, if the key equals the
   *     previous key, if two consecutive key hashes collide, or if key-hash
   *     ordering is violated
   */
  @Override
  public void write(ByteBuffer key, ByteBuffer value) throws IOException {
    // Check that value size is compatible
    if (value.remaining() != valueSize) {
      throw new IOException("Size of value to be written is: "
          + value.remaining() + ", but configured value size is: " + valueSize);
    }
    // Check that key is different from previous one
    if (previousKey != null && previousKey.remaining() == key.remaining()
        && 0 == BytesUtils.compareBytesUnsigned(key, previousKey)) {
      throw new IOException("Keys must be distinct but two consecutive (in terms of comparableKey) keys are equal."
          + " Offending key: " + BytesUtils.bytesToHexString(key));
    }
    // Hash key
    hasher.hash(key, keyHashSize, keyHashBytes);
    // Compare with previous key hash. Positive means the new hash is greater
    // than the previous one, which is the only legal case (besides the very
    // first record).
    int previousKeyHashComparison = BytesUtils.compareBytesUnsigned(keyHashBytes, 0, previousKeyHashBytes, 0, keyHashSize);
    // Check that there is not a key hash collision
    if (previousKey != null && 0 == previousKeyHashComparison) {
      throw new IOException("Collision: two consecutive keys have the same hash value."
          + "\nKey: "
          + BytesUtils.bytesToHexString(key)
          + "\nPrevious key: "
          + BytesUtils.bytesToHexString(previousKey)
          + "\nHash: "
          + BytesUtils.bytesToHexString(ByteBuffer.wrap(keyHashBytes)));
    }
    // Check key hash ordering
    if (0 > previousKeyHashComparison) {
      throw new IOException("Key ordering is incorrect. They should be ordered by increasing hash (comparableKey) value, but a decreasing sequence was detected."
          + "\nKey: "
          + BytesUtils.bytesToHexString(key)
          + "\nHash: "
          + BytesUtils.bytesToHexString(ByteBuffer.wrap(keyHashBytes))
          + "\nPrevious key: "
          + BytesUtils.bytesToHexString(previousKey)
          + "\nPrevious Hash: "
          + BytesUtils.bytesToHexString(ByteBuffer.wrap(previousKeyHashBytes)));
    }
    // Write hash
    writeHash(ByteBuffer.wrap(keyHashBytes), value);
    numRecordsWritten++;
    // Save current key and key hash for the next invariant checks
    System.arraycopy(keyHashBytes, 0, previousKeyHashBytes, 0, keyHashSize);
    previousKey = BytesUtils.byteBufferDeepCopy(key, previousKey);
  }

  /**
   * Appends an already-hashed key and its value to the current block, flushing
   * the block first if the key's hash prefix differs from the previous one.
   *
   * @throws IOException if the current block is full or the hashed key does
   *     not contain {@code keyHashSize} readable bytes
   */
  public void writeHash(ByteBuffer hashedKey, ByteBuffer value) throws IOException {
    // check the first hashIndexBits of the hashedKey
    int thisPrefix = prefixer.getHashPrefix(hashedKey.array(), hashedKey.arrayOffset()
        + hashedKey.position());
    // if this prefix and the last one don't match, then it's time to clear the
    // buffer.
    if (lastHashPrefix == -1 || thisPrefix != lastHashPrefix) {
      // clear the uncompressed buffer and start over
      clearUncompressed();
      lastHashPrefix = thisPrefix;
      // record the start index of the next block
      hashIndex[thisPrefix] = numBytesWritten;
    }
    // at this point, we're guaranteed to be ready to write to the buffer.
    // write a subsequence of the key hash's bytes
    if (uncompressedOffset + keyHashSize > uncompressedBuffer.length) {
      throw new IOException("Out of room to write to uncompressed buffer for block "
          + Integer.toString(thisPrefix, 16)
          + "! Buffer size: "
          + uncompressedBuffer.length
          + ", offset: "
          + uncompressedOffset
          + ", hash size: "
          + keyHashSize
          + ", num entries written in block: "
          + numEntriesInBlock);
    }
    if (hashedKey.arrayOffset() + hashedKey.position() + keyHashSize > hashedKey.array().length) {
      // Bug fix: the offset terms must be summed before concatenation;
      // previously the two ints were appended as adjacent digit strings.
      throw new IOException("Need to copy " + keyHashSize
          + " from key, but there weren't enough bytes left! key buffer size: "
          + hashedKey.array().length + ", offset: " + (hashedKey.arrayOffset()
          + hashedKey.position()) + ", num entries written in block: "
          + numEntriesInBlock);
    }
    System.arraycopy(hashedKey.array(), hashedKey.arrayOffset()
        + hashedKey.position(), uncompressedBuffer, uncompressedOffset, keyHashSize);
    // encode the value offset and write it out
    System.arraycopy(value.array(), value.arrayOffset() + value.position(), uncompressedBuffer, uncompressedOffset
        + keyHashSize, valueSize);
    uncompressedOffset += keyHashSize + valueSize;
    ++numEntriesInBlock;
  }

  // Compresses the staged block, writes it to the stream, records block-size
  // maxima for the footer hints, and resets the staging state.
  private void clearUncompressed() throws IOException {
    // compress the block
    int compressedSize = compressionCodec.compress(uncompressedBuffer, 0, uncompressedOffset, compressedBuffer, 0);
    // write the compressed block to the data stream
    stream.write(compressedBuffer, 0, compressedSize);
    numBytesWritten += compressedSize;
    // keep track of the max block sizes
    if (uncompressedOffset > maxUncompressedBlockSize) {
      maxUncompressedBlockSize = uncompressedOffset;
    }
    if (compressedSize > maxCompressedBlockSize) {
      maxCompressedBlockSize = compressedSize;
    }
    // Reset offset and counter
    uncompressedOffset = 0;
    numEntriesInBlock = 0;
  }

  /**
   * Flushes the final block, appends the footer (hash-prefix offset index
   * followed by two 4-byte buffer-size hints), and closes the stream.
   */
  @Override
  public void close() throws IOException {
    // clear the last block, if there is one
    if (uncompressedOffset > 0) {
      clearUncompressed();
    }
    // serialize the footer: 8 bytes per hash-index slot + 2 * 4-byte hints
    byte[] footer = new byte[8 * hashIndex.length + 4 + 4];
    for (int i = 0; i < hashIndex.length; i++) {
      EncodingHelper.encodeLittleEndianFixedWidthLong(hashIndex[i], footer, i * 8, 8);
    }
    // write the buffer size hints
    EncodingHelper.encodeLittleEndianFixedWidthLong(maxUncompressedBlockSize, footer, footer.length - 8, 4);
    EncodingHelper.encodeLittleEndianFixedWidthLong(maxCompressedBlockSize, footer, footer.length - 4, 4);
    stream.write(footer);
    numBytesWritten += footer.length;
    // flush everything and close
    stream.flush();
    stream.close();
  }

  /** @return total bytes written so far, including the footer after close() */
  @Override
  public long getNumBytesWritten() {
    return numBytesWritten;
  }

  /** @return number of records successfully written */
  @Override
  public long getNumRecordsWritten() {
    return numRecordsWritten;
  }

  @Override
  public String toString() {
    // Bug fix: previously reported "CurlyWriter" (copy-paste from the
    // sibling Curly storage engine), which was misleading in logs.
    return "CueballWriter ["
        + "numRecordsWritten=" + getNumRecordsWritten()
        + ", numBytesWritten=" + getNumBytesWritten()
        + "]";
  }
}