package org.commoncrawl.util; import java.io.DataInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.ByteBuffer; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.DataInputBuffer; import org.apache.hadoop.io.DataOutputBuffer; import org.apache.hadoop.io.RawComparator; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableComparable; /** * Raw Key SequenceFileIndex * * @author rana * * @param <KeyType> */ public class KeyBasedSequenceFileIndex<KeyType extends WritableComparable> { static final Log LOG = LogFactory.getLog(KeyBasedSequenceFileIndex.class); byte[] _indexData; RawComparator<KeyType> _comparator; /** * * @param conf * @param indexFilePath * @param keyComparator * @throws IOException */ public KeyBasedSequenceFileIndex(Configuration conf,Path indexFilePath,RawComparator<KeyType> keyComparator)throws IOException { _comparator = keyComparator; FileSystem fs = FileSystem.get(indexFilePath.toUri(),conf); FileStatus fileStatus = fs.getFileStatus(indexFilePath); if (fileStatus == null) { throw new IOException("Null FileStats !"); } _indexData = new byte[(int)fileStatus.getLen()]; FSDataInputStream stream = fs.open(indexFilePath); try { // read full stream ... stream.readFully(_indexData); } finally { stream.close(); } } public static class IndexReader<KeyType extends WritableComparable> { KeyBasedSequenceFileIndex<KeyType> _index; DataInputStream _reader = null; ByteBuffer _buffer = null; IndexItemData _itemData = new IndexItemData(); long _keyDataOffset; int _itemCount; public IndexReader(KeyBasedSequenceFileIndex<KeyType> index)throws IOException { _index = index; DataInputBuffer temp = new DataInputBuffer(); temp.reset(index._indexData,0,index._indexData.length); // seek to very end ... temp.skip(index._indexData.length-12); // read key data pos ... _keyDataOffset = temp.readLong(); // read item count ... _itemCount = temp.readInt(); // set byte buffer _buffer = ByteBuffer.wrap(index._indexData,0,index._indexData.length - 12); // set reader ... _reader = new DataInputStream(wrapBufferInStream(_buffer)); // dumpIndex(); } private void dumpIndex()throws IOException { DataInputBuffer tempBuffer = new DataInputBuffer(); TextBytes textBytes = new TextBytes(); LOG.info("Index Item Count:"+ _itemCount); for (int i=0;i<_itemCount;++i) { getIndexItemDataAtPos(i, _buffer,_reader,_itemData); tempBuffer.reset(_index._indexData,(int)(_keyDataOffset+_itemData.keyDataOffset),_itemData.keyDataLen); textBytes.setFromRawTextBytes(tempBuffer); LOG.info("Pos:" + i + " Key:" + textBytes.toString() + " SeqFilePos:" + _itemData.seqFilePos); } } private static InputStream wrapBufferInStream(final ByteBuffer buf) { return new InputStream() { public synchronized int read() throws IOException { if (!buf.hasRemaining()) { return -1; } return buf.get() & 0xff; } public synchronized int read(byte[] bytes, int off, int len) throws IOException { // Read only what's left len = Math.min(len, buf.remaining()); buf.get(bytes, off, len); return len; } }; } private static class IndexItemData { long seqFilePos; int keyDataOffset; int keyDataLen; } private static void getIndexItemDataAtPos(int pos,ByteBuffer buffer,DataInputStream reader,IndexItemData indexData) throws IOException { buffer.position((pos * (IndexWriter.INDEX_ITEM_SIZE))); indexData.seqFilePos = reader.readLong(); indexData.keyDataOffset = reader.readInt(); indexData.keyDataLen = reader.readInt(); } /** * * @param keyData * @param keyDataOffset * @param keyDataLen * @return * @throws IOException */ public long findBestPositionForKey(byte[] keyData,int keyDataOffset,int keyDataLen)throws IOException { int low = 0; int high = _itemCount - 1; while (low <= high) { int mid = low + ((high - low) / 2); // get data at index ... getIndexItemDataAtPos(mid, _buffer, _reader, _itemData); // compare int comparisonResult = _index._comparator.compare( _index._indexData, (int) _keyDataOffset + _itemData.keyDataOffset, _itemData.keyDataLen, keyData, keyDataOffset, keyDataLen); if (comparisonResult > 0) high = mid - 1; else if (comparisonResult < 0) low = mid + 1; else { LOG.info("Match at IndexPos:" + mid + " SeqFilePos:" + _itemData.seqFilePos); return _itemData.seqFilePos; } } if (high == -1) return -1L; else { getIndexItemDataAtPos(high, _buffer, _reader, _itemData); LOG.info("Nearest Match at:" + high + " SeqFilePos:" + _itemData.seqFilePos); return _itemData.seqFilePos; } } } public static class IndexWriter<KeyType extends WritableComparable, ValueType extends Writable> implements SequenceFileIndexWriter<KeyType, ValueType> { FSDataOutputStream _outputStream; long _lastIndexedPos = -1; int _totalIndexKeys = 0; DataOutputBuffer _keyDataBuffer = new DataOutputBuffer(); public static final int INDEX_ITEM_SIZE = 8 + 4 + 4; public IndexWriter(Configuration conf,Path indexFilePath)throws IOException { FileSystem fs = FileSystem.get(indexFilePath.toUri(),conf); fs.delete(indexFilePath, false); _outputStream = fs.create(indexFilePath); } @Override public void close() throws IOException { try { long keyDataPos = _outputStream.getPos(); _outputStream.write(_keyDataBuffer.getData(),0,_keyDataBuffer.getLength()); _outputStream.writeLong(keyDataPos); _outputStream.writeInt(_totalIndexKeys); _outputStream.flush(); } finally { _outputStream.close(); } } @Override public void indexItem(byte[] keyData, int keyOffset, int keyLength, byte[] valueData, int valueOffset, int valueLength, long writerPosition) throws IOException { // check to see if block position changed ... if (writerPosition != _lastIndexedPos){ // establish new start index _lastIndexedPos = writerPosition; // increment index key count... _totalIndexKeys++; // write index item // stream pos _outputStream.writeLong(_lastIndexedPos); // key data offset _outputStream.writeInt(_keyDataBuffer.getLength()); // key len ... _outputStream.writeInt(keyLength); // key data _keyDataBuffer.write(keyData,keyOffset,keyLength); } } } }