/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.service.listcrawler;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.rmi.server.UID;
import java.security.MessageDigest;
import java.util.Collections;
import java.util.Comparator;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.StringUtils;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.protocol.CacheItem;
import org.commoncrawl.util.BloomFilter;
import org.commoncrawl.util.ByteStream;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.IntrusiveList.IntrusiveListElement;
import org.commoncrawl.util.RiceCoding;
import org.commoncrawl.util.URLFingerprint;
/**
 * An index over the items contained in an HDFS SequenceFile. The index pairs
 * a bloom filter (for fast negative lookups) with a sorted hint table and
 * rice-coded fingerprint/offset blocks (for locating individual records).
 *
 * @author rana
 *
 */
public class HDFSFileIndex {
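  // Typical usage (a sketch; the paths and directory below are illustrative,
  // not taken from this codebase):
  //
  //   FileSystem fs = FileSystem.get(CrawlEnvironment.getHadoopConfig());
  //   HDFSFileIndex index = new HDFSFileIndex(fs,
  //       new Path("crawl/cacheIndex-1234567890"),   // remote index file (hypothetical)
  //       new Path("crawl/cacheData-1234567890"),    // remote SequenceFile (hypothetical)
  //       new File("/tmp/indexCache"));              // local scratch directory (hypothetical)
  //   CacheItem hit = index.findItem(someUrlFingerprint, false /* checkOnly */);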
public static final Log LOG = LogFactory.getLog(HDFSFileIndex.class);
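  // every INDEX_HINT_RECORD_INTERVAL-th record gets a fixed-size hint entry:
  // an 8-byte fingerprint, a 4-byte SequenceFile offset, and a 4-byte offset
  // into the rice-coded index data that follows the hint table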
public static final int INDEX_HINT_RECORD_INTERVAL = 100;
public static final int INDEX_HINT_SIZE = 8 + 4 + 4;
private File _localIndexFilePath = null;
private FileSystem _remoteFileSystem = null;
private Path _remoteDataPath = null;
private BloomFilter _bloomFilter = null;
private ByteBuffer _indexHints = null;
private int _indexHintCount = -1;
private int _indexDataOffset = -1;
private int _indexDataSize = -1;
public HDFSFileIndex(FileSystem remoteFileSystem,Path remoteIndexFileLocation,Path remoteDataFileLocation,File localIndexDataDirectory) throws IOException {
_remoteFileSystem = remoteFileSystem;
_remoteDataPath = remoteDataFileLocation;
// create a local index file for the index
_localIndexFilePath = new File(localIndexDataDirectory,remoteIndexFileLocation.getName());
_localIndexFilePath.delete();
LOG.info("Copying Remote Index Location:" + remoteIndexFileLocation + " to Local File Location:" + _localIndexFilePath);
// copy over the index data file
remoteFileSystem.copyToLocalFile(remoteIndexFileLocation, new Path(_localIndexFilePath.getAbsolutePath()));
LOG.info("Done Copying Remote File. Loading Index");
// load the index
loadIndexFromLocalFile();
}
public HDFSFileIndex(FileSystem remoteFileSystem,File localIndexFileLocation,Path remoteDataFileLocation) throws IOException {
_remoteFileSystem = remoteFileSystem;
_remoteDataPath = remoteDataFileLocation;
_localIndexFilePath = localIndexFileLocation;
loadIndexFromLocalFile();
}
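  /**
   * Extract the timestamp from the remote data file's name, which is expected
   * to end in "-&lt;digits&gt;" (e.g. a name like "data-1234567890", hypothetical);
   * returns 0 if no such suffix is present.
   */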
public long getIndexTimestamp() {
try {
Matcher m = Pattern.compile(".*-([0-9]*)").matcher(_remoteDataPath.getName());
if (m.matches()) {
return Long.parseLong(m.group(1));
}
}
catch (Exception e) {
LOG.error(CCStringUtils.stringifyException(e));
}
return 0L;
}
public Path getIndexDataPath() {
return _remoteDataPath;
}
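  /**
   * Load the index into memory from the local index file. The on-disk layout,
   * as consumed below, is: serialized bloom filter, hint count (int), the
   * fixed-size hint records, and the rice-coded index data size (int). The
   * rice-coded data itself stays on disk; only its starting offset is captured
   * here, and blocks are demand-loaded later.
   */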
private void loadIndexFromLocalFile() throws IOException {
LOG.info("Loading Index from Local File:" + _localIndexFilePath);
// now open an input stream to the local file ...
FileInputStream fileInputStream = new FileInputStream(_localIndexFilePath);
DataInputStream dataStream = new DataInputStream(fileInputStream);
try {
// deserialize bloom filter
_bloomFilter = BloomFilter.serializer().deserialize(dataStream);
_indexHintCount = dataStream.readInt();
int indexHintDataSize = _indexHintCount * INDEX_HINT_SIZE;
// and deserialize index hints
_indexHints = ByteBuffer.allocate(indexHintDataSize);
dataStream.readFully(_indexHints.array());
// load index data buffer size
_indexDataSize = dataStream.readInt();
// and capture offset information
_indexDataOffset = (int) fileInputStream.getChannel().position();
}
finally {
if (fileInputStream != null) {
fileInputStream.close();
}
}
LOG.info("Successfully loaded Index");
}
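  /**
   * Look up a fingerprint. The lookup is staged: the bloom filter rejects most
   * misses cheaply, a binary search over the in-memory hints yields an exact
   * hit or the enclosing rice-coded block (which is then demand-loaded and
   * scanned), and on a hit the full item is read from the remote SequenceFile
   * unless checkOnly is set.
   */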
public CacheItem findItem(long targetFingerprint,boolean checkOnly) throws IOException {
// check bloom filter first ...
if (_bloomFilter.isPresent(targetFingerprint)) {
// find best hint ...
HDFSFileIndex.IndexItem itemOut = _findBestIndexHintForFingerprint(targetFingerprint);
// if non null result returned
if (itemOut != null) {
// if the hint is not an exact match, it is the closest preceding entry ...
if (itemOut.fingerprint != targetFingerprint) {
// demand load item data
HDFSFileIndex.IndexDataBlock dataBlock = demandLoadIndexDataBlock(itemOut.fingerprint,itemOut.indexDataOffset,itemOut.indexDataSize);
// and search within it ...
itemOut = dataBlock.searchBlockFor(targetFingerprint);
}
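        // in check-only mode, skip the remote read and return a stub item
        // carrying only the fingerprint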
if (itemOut != null && checkOnly) {
CacheItem item = new CacheItem();
item.setUrlFingerprint(targetFingerprint);
return item;
}
if (itemOut != null) {
LOG.info("Found Match in Index:" + _localIndexFilePath + " For FP:" + targetFingerprint + " Loading File:" + _remoteDataPath+ " at Offset:" + _indexDataOffset);
// open sequence file ...
SequenceFile.Reader reader = new SequenceFile.Reader(_remoteFileSystem,_remoteDataPath,CrawlEnvironment.getHadoopConfig());
try {
reader.seek(itemOut.dataOffset);
Text url = new Text();
CacheItem item = new CacheItem();
LOG.info("Reading Item and Data");
reader.next(url,item);
String strURL = url.toString();
LOG.info("Read returned url:" + strURL);
item.setUrl(strURL);
return item;
}
finally {
if (reader != null)
reader.close();
}
}
}
}
return null;
}
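  /**
   * Demand-load a single rice-coded data block from the local index file. The
   * block lives at _indexDataOffset (the start of the rice-coded region) plus
   * its block-relative offset.
   */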
private HDFSFileIndex.IndexDataBlock demandLoadIndexDataBlock(long fingerprint,int itemDataOffset,int itemDataSize) throws IOException {
// ok time to load this block ...
RandomAccessFile file = new RandomAccessFile(_localIndexFilePath,"r");
try {
      // ByteBuffer.allocate never returns null (it throws on failure), so no null check is needed
      ByteBuffer bufferOut = ByteBuffer.allocate(itemDataSize);
      file.seek(_indexDataOffset + itemDataOffset);
      file.readFully(bufferOut.array());
      return new IndexDataBlock(fingerprint,0,bufferOut);
}
finally {
if (file != null) {
file.close();
}
}
}
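  /**
   * Binary search the fixed-size hint records for the target fingerprint.
   * Returns an exact-match item if the target is itself a hint, otherwise the
   * closest preceding hint annotated with the offset and size of its
   * rice-coded data block, or null if the target sorts below the first hint.
   */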
private HDFSFileIndex.IndexItem _findBestIndexHintForFingerprint(long targetFP) throws IOException {
int low = 0;
int high = _indexHintCount - 1;
while (low <= high) {
int mid = low + ((high - low) / 2);
_indexHints.position(mid * (INDEX_HINT_SIZE));
long hintFP = _indexHints.getLong();
// compare to target
long comparisonResult = (hintFP > targetFP) ? 1 : (hintFP < targetFP) ? -1 : 0;
if (comparisonResult > 0)
high = mid - 1;
else if (comparisonResult < 0)
low = mid + 1;
else {
return new IndexItem(targetFP,_indexHints.getInt());
}
}
if (high >= 0 && low < _indexHintCount) {
_indexHints.position(high * INDEX_HINT_SIZE);
// create nearest match ...
HDFSFileIndex.IndexItem itemOut = new IndexItem(_indexHints.getLong(),_indexHints.getInt(),_indexHints.getInt(),-1);
// figure out this items data block size ...
if (high < (_indexHintCount - 1)) {
_indexHints.position(((high+1) * INDEX_HINT_SIZE) + 12);
itemOut.indexDataSize = _indexHints.getInt() - itemOut.indexDataOffset;
}
else {
itemOut.indexDataSize = _indexDataSize - itemOut.indexDataOffset;
}
return itemOut;
}
return null;
}
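  /**
   * A single index entry. In the hint table the fingerprint and dataOffset are
   * absolute; writeIndex reuses the same struct for sub-hints, where the
   * fingerprint field holds a delta from the previous record's fingerprint.
   */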
static class IndexItem {
public IndexItem(long fingerprint,int dataOffset) {
this.fingerprint = fingerprint;
this.dataOffset = dataOffset;
this.indexDataOffset = -1;
}
public IndexItem(long fingerprint,int dataOffset,int indexDataOffset,int indexDataSize) {
this.fingerprint = fingerprint;
this.dataOffset = dataOffset;
this.indexDataOffset = indexDataOffset;
this.indexDataSize = indexDataSize;
}
public long fingerprint;
public int dataOffset;
public int indexDataOffset = -1;
public int indexDataSize = -1;
}
public static class IndexDataBlock extends IntrusiveListElement<HDFSFileIndex.IndexDataBlock> {
public IndexDataBlock(long baseFingerprint,int dataOffset,ByteBuffer data) {
_dataOffset = dataOffset;
_buffer = data;
_lastUseTime = System.currentTimeMillis();
_baseFingerprint = baseFingerprint;
}
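    /**
     * Scan this rice-coded block for the given fingerprint. The block layout,
     * mirroring what writeIndex produces, is: [m (byte)][bit count (vlong)]
     * [rice-coded fingerprint deltas] followed by [m (byte)][bit count (vlong)]
     * [rice-coded data offsets]. Both streams store value + 1 because rice
     * coding cannot encode zero.
     */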
HDFSFileIndex.IndexItem searchBlockFor(long targetFingerprint) {
// reset cursor ...
_buffer.position(_dataOffset);
int fingerprintMValue = _buffer.get();
int fingerprintBits = (int)CacheManager.readVLongFromByteBuffer(_buffer);
RiceCoding.RiceCodeReader fingerprintReader
= new RiceCoding.RiceCodeReader(
(int)fingerprintMValue,
(int)fingerprintBits,
_buffer.array(),
_buffer.position());
// advance past fingerprint data to offset data
_buffer.position(_buffer.position() + ((fingerprintBits + 7) /8));
// and create offset data reader
RiceCoding.RiceCodeReader offsetReader
= new RiceCoding.RiceCodeReader(
(int)_buffer.get(),
(int)CacheManager.readVLongFromByteBuffer(_buffer),
_buffer.array(),
_buffer.position());
long fingerprintValue = _baseFingerprint;
while (fingerprintReader.hasNext()) {
fingerprintValue += fingerprintReader.nextValue();
// rice coded values are offset by one since rice coding cannot support zero values ....
fingerprintValue -= 1;
int offsetValue = (int)offsetReader.nextValue();
// now compare to target
if (fingerprintValue == targetFingerprint) {
return new IndexItem(fingerprintValue,offsetValue - 1 /*rice coder doesn't like zeros and Offset COULD be zero, so we have to offset by one to be safe*/);
}
}
return null;
}
public int _dataOffset = -1;
public ByteBuffer _buffer = null;
public long _lastUseTime = -1;
public long _baseFingerprint;
}
private static double lg(double value) {
return Math.log(value)/Math.log(2.0);
}
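  /**
   * Write an index over the given fingerprint/offset tuples. The output is a
   * bloom filter over all fingerprints, a fixed-size hint record for every
   * INDEX_HINT_RECORD_INTERVAL-th entry (plus the last one), and, per hint, a
   * rice-coded block holding the fingerprint deltas and data offsets of the
   * records between hints. The rice parameter m is chosen as
   * floor(lg(average value)), a common heuristic for rice/golomb coders.
   */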
public static void writeIndex(Vector<FingerprintAndOffsetTuple> offsetInfo,DataOutput indexFileOut) throws IOException {
long firstFingerprint = offsetInfo.get(0)._fingerprint;
BloomFilter bloomFilter = new BloomFilter(offsetInfo.size(),0.001201);
// sort the offset list by fingerprint
Collections.sort(offsetInfo,new Comparator<FingerprintAndOffsetTuple>() {
@Override
public int compare(FingerprintAndOffsetTuple o1, FingerprintAndOffsetTuple o2) {
return (o1._fingerprint < o2._fingerprint) ? -1 : o1._fingerprint > o2._fingerprint ? 1 :0;
}
});
// now we need to write the index out
// allocate working set buffers ...
ByteBuffer indexDataBuffer = ByteBuffer.allocate(offsetInfo.size() * 16);
ByteBuffer indexHintsBuffer = ByteBuffer.allocate(((((offsetInfo.size() + INDEX_HINT_RECORD_INTERVAL) / INDEX_HINT_RECORD_INTERVAL)+1) * INDEX_HINT_SIZE) + 4);
// build index hints placeholder
Vector<HDFSFileIndex.IndexItem> hints = new Vector<HDFSFileIndex.IndexItem>();
// take a hint at every INDEX_HINT_RECORD_INTERVAL-th record (0, 100, 200, ...) plus the final record
for (int i=0;i<offsetInfo.size();++i) {
if (i%INDEX_HINT_RECORD_INTERVAL == 0 || (i == (offsetInfo.size()-1))) {
HDFSFileIndex.IndexItem hint = new IndexItem(offsetInfo.get(i)._fingerprint,(int)offsetInfo.get(i)._offset);
hints.add(hint);
// add fingerprint to bloom filter
bloomFilter.add(hint.fingerprint);
}
}
// start off the index hints buffer with the hint record count
indexHintsBuffer.putInt(hints.size());
// track total bits used ...
int bitsUsedForHints = 0;
int bitsUsedForFingerprints = 0;
int bitsUsedForOffsets = 0;
// now start populating index data ...
for (int hintIdx=0;hintIdx<hints.size();++hintIdx) {
HDFSFileIndex.IndexItem hint = hints.get(hintIdx);
LOG.info("IndexWriter FP:" + hint.fingerprint);
indexHintsBuffer.putLong(hint.fingerprint);
indexHintsBuffer.putInt(hint.dataOffset);
indexHintsBuffer.putInt(indexDataBuffer.position());
// update stats
bitsUsedForHints += INDEX_HINT_SIZE * 8;
if (hintIdx < hints.size() - 1) {
        // track cumulative delta and offset values (for average calc later)
        double cumulativeDelta = 0;
        long cumulativeOffset = 0;
int subIndexItemCount = 0;
int nonZeroDeltaCount = 0;
Vector<HDFSFileIndex.IndexItem> subHints = new Vector<HDFSFileIndex.IndexItem>();
// initialize last fingerprint to indexed value ...
long lastFingerprint = hint.fingerprint;
// first collect values in between index hints
for (int nonIndexItem=(hintIdx*INDEX_HINT_RECORD_INTERVAL)+1;nonIndexItem < ((hintIdx+1)*INDEX_HINT_RECORD_INTERVAL);++nonIndexItem) {
if (nonIndexItem >= offsetInfo.size())
break;
// calculate fingerprint delta ...
long fingerprintDelta = offsetInfo.get(nonIndexItem)._fingerprint - lastFingerprint;
LOG.info("IndexWriter FP:" + offsetInfo.get(nonIndexItem)._fingerprint + " Delta:" + fingerprintDelta);
// offset delta
if (fingerprintDelta != 0) {
            cumulativeDelta += (double) fingerprintDelta;
            LOG.info("Cumulative Delta is:" + cumulativeDelta);
nonZeroDeltaCount++;
}
cumulativeOffset += offsetInfo.get(nonIndexItem)._offset;
++subIndexItemCount;
// add to collection vector
subHints.add(new IndexItem(fingerprintDelta,(int)offsetInfo.get(nonIndexItem)._offset));
// remember the last fingerprint ...
lastFingerprint = offsetInfo.get(nonIndexItem)._fingerprint;
// add item to bloom filter
bloomFilter.add(lastFingerprint);
}
// calculate average delta value
double averageDeltaValue = cumulativeDelta / (double)nonZeroDeltaCount;
// calculate m for fingerprint deltas
int mForFingerprints = (int) Math.floor(lg(averageDeltaValue));
LOG.info("Average Delta Value is:" + averageDeltaValue + " m is:" + mForFingerprints);
// calculate average offset value
double averageOffsetValue = (double)cumulativeOffset / (double)subIndexItemCount;
// calculate m for offsets
int mForOffsets = (int) Math.floor(lg(averageOffsetValue));
// calculate rice codes
RiceCoding riceCodeFP = new RiceCoding(mForFingerprints);
RiceCoding riceCodeOffsets = new RiceCoding(mForOffsets);
// populate bits
for (HDFSFileIndex.IndexItem subItemHint : subHints) {
if (subItemHint.fingerprint == 0) {
LOG.warn("Zero Delta for Fingerprint Detected.There are two duplicate entires in log!");
}
riceCodeFP.addItem(subItemHint.fingerprint + 1);
riceCodeOffsets.addItem(subItemHint.dataOffset+1);
}
// now track bits used ...
bitsUsedForFingerprints += riceCodeFP.getNumBits();
bitsUsedForOffsets += riceCodeOffsets.getNumBits();
// write out metadata
// save the current position
int currentPosition = indexDataBuffer.position();
// fingerprint data
indexDataBuffer.put((byte)mForFingerprints);
CacheManager.writeVLongToByteBuffer(indexDataBuffer,riceCodeFP.getNumBits());
indexDataBuffer.put(riceCodeFP.getBits(), 0, (riceCodeFP.getNumBits() + 7) /8);
// offset data
indexDataBuffer.put((byte)mForOffsets);
CacheManager.writeVLongToByteBuffer(indexDataBuffer,riceCodeOffsets.getNumBits());
indexDataBuffer.put(riceCodeOffsets.getBits(), 0, (riceCodeOffsets.getNumBits() + 7) /8);
System.out.println("Item Count:" + subIndexItemCount + "FP Bits:" + subIndexItemCount * 64 + " Compressed:" + riceCodeFP.getNumBits() +
" Offset Bits:" + subIndexItemCount * 32 + " Compressed:" + riceCodeOffsets.getNumBits());
LOG.info("Item Count:" + subIndexItemCount + "FP Bits:" + subIndexItemCount * 64 + " Compressed:" + riceCodeFP.getNumBits() +
" Offset Bits:" + subIndexItemCount * 32 + " Compressed:" + riceCodeOffsets.getNumBits());
if ((subIndexItemCount * 64) < riceCodeFP.getNumBits()) {
throw new RuntimeException("Compressed Size > UnCompressed Size!!!!");
}
validateIndexData(indexDataBuffer.array(),currentPosition,hint.fingerprint,subHints,bloomFilter);
}
}
    // sanity check: the fingerprint of the first input tuple must be present in the bloom filter
    if (!bloomFilter.isPresent(firstFingerprint)) {
      throw new RuntimeException("Bloom filter sanity check failed!");
    }
// serialize bloomfilter
ByteStream baos = new ByteStream(1 << 12);
BloomFilter.serializer().serialize(bloomFilter, new DataOutputStream(baos));
// spit out final stats
System.out.println(
" Bloomfilter Size:" + baos.size() +
" IndexHintBuffer Size:" + indexHintsBuffer.position() +
" IndexDataBuffer Size:" + indexDataBuffer.position());
// now write out the final index file ...
// bloom filter data ...
indexFileOut.write(baos.getBuffer(),0,baos.size());
// write hint data
indexFileOut.write(indexHintsBuffer.array(),0,indexHintsBuffer.position());
// write out rice code data size
indexFileOut.writeInt(indexDataBuffer.position());
// finally rice coded sub-index data
indexFileOut.write(indexDataBuffer.array(),0,indexDataBuffer.position());
}
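  /**
   * Sanity-check a freshly written block: re-decode it via searchBlockFor and
   * verify that every sub-item's fingerprint is findable both in the block and
   * in the bloom filter.
   */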
public static void validateIndexData(byte[] data,int offset,long baseFingerprint,Vector<HDFSFileIndex.IndexItem> subItems,BloomFilter filter) {
HDFSFileIndex.IndexDataBlock dataBlock = new IndexDataBlock(baseFingerprint,offset,ByteBuffer.wrap(data));
long fingerprintValue= baseFingerprint;
int itemIndex = 0;
for (HDFSFileIndex.IndexItem item : subItems) {
fingerprintValue += item.fingerprint;
long timeStart= System.currentTimeMillis();
if (dataBlock.searchBlockFor(fingerprintValue) == null) {
throw new RuntimeException("Unable to Find fingerprint in data block ! - Test Failed!");
}
if (!filter.isPresent(fingerprintValue)) {
throw new RuntimeException("Unable to Find fingerprint in bloom filter ! - Test Failed!");
}
CacheManager.LOG.info("Search for Item@" + itemIndex++ + " Took:" + (System.currentTimeMillis() - timeStart));
}
}
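  /**
   * Simple self-test: builds an index over 10,000 synthetic (MD5-derived)
   * fingerprints with spaced offsets, writes it to an in-memory stream via
   * writeIndex (which validates each block as it goes), and prints the
   * resulting index size.
   */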
public static void main(String[] args) {
try {
ByteStream outputStream = new ByteStream(8192);
Vector<FingerprintAndOffsetTuple> fpInfo = new Vector<FingerprintAndOffsetTuple>();
      // construct 10,000 entries with random fingerprints
      for (int i=0;i<10000;++i) {
        MessageDigest digester = MessageDigest.getInstance("MD5");
long time = System.currentTimeMillis();
digester.update((new UID()+"@"+time+":" + i).getBytes());
FingerprintAndOffsetTuple offsetInfo = new FingerprintAndOffsetTuple(URLFingerprint.generate64BitURLFPrint(StringUtils.byteToHexString(digester.digest())),i*10000);
fpInfo.add(offsetInfo);
}
// clone the vector
Vector<FingerprintAndOffsetTuple> fpInfoCloned = new Vector<FingerprintAndOffsetTuple>();
fpInfoCloned.addAll(fpInfo);
// now write out the index ...
writeIndex(fpInfoCloned, new DataOutputStream(outputStream));
// spit out some basic stats
System.out.println("output buffer size is:" + outputStream.size());
}
catch (Exception e) {
CacheManager.LOG.error(CCStringUtils.stringifyException(e));
}
}
}