/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.hadoop.mergeutils;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Writable;
import org.commoncrawl.util.FlexBuffer;
/**
* Used to Generate optimized representations from complex key/values pairs.
* Basically a tradeoff between better sort performance at the expense of a
* little bit of increased (per record) memory footprint.
*
* @author rana
*
* @param <KeyType>
* @param <ValueType>
*/
public abstract class OptimizedKeyGeneratorAndComparator<KeyType extends Writable, ValueType extends Writable> {
/**
* OptimizedKey - used to encapsulate optimized key data generated via the
* generator
*
* @author rana
*
*/
public static final class OptimizedKey {
// key types ...
// a key that has a long component
public static final int KEY_TYPE_LONG = 1 << 0;
// a key that has a buffer component
public static final int KEY_TYPE_BUFFER = 1 << 1;
// a key that has a long and buffer component
public static final int KEY_TYPE_LONG_AND_BUFFER = KEY_TYPE_LONG | KEY_TYPE_BUFFER;
public static int writeVLong(DataOutput stream, long i) throws IOException {
int bytesUsed = 1;
if (i >= -112 && i <= 127) {
stream.writeByte((byte) i);
return bytesUsed;
}
int len = -112;
if (i < 0) {
i ^= -1L; // take one's complement'
len = -120;
}
long tmp = i;
while (tmp != 0) {
tmp = tmp >> 8;
len--;
}
stream.writeByte((byte) len);
bytesUsed++;
len = (len < -120) ? -(len + 120) : -(len + 112);
for (int idx = len; idx != 0; idx--) {
int shiftbits = (idx - 1) * 8;
long mask = 0xFFL << shiftbits;
stream.writeByte((byte) ((i & mask) >> shiftbits));
bytesUsed++;
}
return bytesUsed;
}
// key type
private int _keyType;
// header size (based on key type)
private int _headerSize = 0;
// long value if optimized key is a long
private long _longKeyValue = 0;
// serialized buffer size
private int _dataBufferSize = 0;
// serialized buffer offset
private int _dataBufferOffset = 0;
// data buffer
private FlexBuffer _dataBuffer = new FlexBuffer();
// data buffer used to generate complex keys
private DataOutputBuffer _outputStream = new DataOutputBuffer() {
@Override
public void close() throws IOException {
super.close();
_dataBuffer.set(this.getData(), 0, this.getLength());
_dataBufferSize = this.getLength();
}
};
private DataInputBuffer _inputStream = new DataInputBuffer();
public OptimizedKey(int keyType) {
_keyType = keyType;
_headerSize = 0;
if ((_keyType & KEY_TYPE_LONG) != 0)
_headerSize += 8;
if ((_keyType & KEY_TYPE_BUFFER) != 0)
_headerSize += 8;
}
/**
*
* @return the buffer key value
*/
public FlexBuffer getBufferKeyValue() {
return _dataBuffer;
}
/**
*
* @return a DataOutputStream - write into this stream and then close it to
* commit data
* @throws IOException
*/
public DataOutputStream getBufferKeyValueStream() throws IOException {
_outputStream.reset();
return _outputStream;
}
public int getDataBufferOffset() {
return _dataBufferOffset;
}
/**
* get optimized key size in bytes
*
*/
public int getDataBufferSize() {
return _dataBufferSize;
}
/**
* get header bytes size
*
*/
public int getHeaderSize() {
return _headerSize;
}
/**
*
* @return key type
*/
public int getKeyType() {
return _keyType;
}
/**
*
* @return the long key value
*/
public long getLongKeyValue() {
return _longKeyValue;
}
/**
* initialize the optimized key object from the passed in key/value data
*
* @param data
* @param offset
* @param length
* @return
* @throws IOException
*/
public int initFromKeyValuePair(byte[] keyBytes, int keyOffset, int keyLength, byte[] valueBytes, int valueOffset,
int valueLength) throws IOException {
_inputStream.reset(keyBytes, keyOffset, keyLength);
readHeader(_inputStream);
if ((_keyType & KEY_TYPE_BUFFER) != 0 && _dataBufferSize != 0) {
// initialize data buffer ...
_dataBuffer.set(valueBytes, valueOffset + valueLength - _dataBufferSize, _dataBufferSize);
}
return _dataBufferSize;
}
/**
*
*/
public int readHeader(DataInputStream stream) throws IOException {
// read header ...
if ((_keyType & KEY_TYPE_LONG) != 0) {
_longKeyValue = stream.readLong();
}
if ((_keyType & KEY_TYPE_BUFFER) != 0) {
_dataBufferSize = stream.readInt();
_dataBufferOffset = stream.readInt();
}
return _headerSize;
}
public void setDataBufferOffset(int dataBufferOffset) {
_dataBufferOffset = dataBufferOffset;
}
/**
*
*/
public void setLongKeyValue(long value) {
_longKeyValue = value;
}
/**
*
*
* @param outputStream
* @return
* @throws IOException
*/
public int writeBufferToStream(DataOutputStream outputStream) throws IOException {
_dataBufferSize = 0;
if ((_keyType & KEY_TYPE_BUFFER) != 0) {
outputStream.write(_dataBuffer.get(), _dataBuffer.getOffset(), _dataBuffer.getCount());
_dataBufferSize += _dataBuffer.getCount();
}
return _dataBufferSize;
}
public int writeHeaderToStream(DataOutputStream outputStream) throws IOException {
if ((_keyType & KEY_TYPE_LONG) != 0) {
outputStream.writeLong(_longKeyValue);
}
if ((_keyType & KEY_TYPE_BUFFER) != 0) {
outputStream.writeInt(_dataBuffer.getCount());
outputStream.writeInt(_dataBufferOffset);
}
return _headerSize;
}
}
/**
* compare two optimized key values (previously emitted as buffers by generate
* method)
*
* @param key1Data
* @param key1Offset
* @param key1Length
* @param key2Data
* @param key2Offset
* @param key2Length
* @return 0 if equal,-1 if lvalue is less than rvalue, and 1 if not
* @throws IOException
*/
public int compareOptimizedBufferKeys(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
int key2Offset, int key2Length) throws IOException {
// default throws exception
throw new IOException("compare optimized buffers not implemented in base class!");
}
/**
* Given a key object and a value object, produce an optimized key data
* structure
*
* @param key
* - the key value associated with this object
* @param value
* - the value associated with this object
* @param optimizedKeyOut
* - the optimized key value out
* @throws IOException
*/
public abstract void generateOptimizedKeyForPair(KeyType key, ValueType value, OptimizedKey optimizedKeyOut)
throws IOException;
/**
* Generate optimized key data given raw key/value buffers
*
* @param keyData
* @param keyOffset
* @param keyLength
* @param valueData
* @param valueOffset
* @param valueLength
* @return
* @throws IOException
*/
public long generateOptimizedKeyForRawPair(byte[] keyData, int keyOffset, int keyLength, byte[] valueData,
int valueOffset, int valueLength, OptimizedKey optimizedKeyOut) throws IOException {
throw new IOException("generate OptimizedKeyForRawPair not implemented in base class!");
}
/**
*
* @return the key type produced by the generator
*/
public abstract int getGeneratedKeyType();
}