/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.util;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel.MapMode;
import java.util.zip.CRC32;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.Compressor;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.io.compress.zlib.ZlibDecompressor;
import org.commoncrawl.protocol.URLFPV2;
/**
 * Builds and reads compact, block-compressed URL lists that map URLFPV2
 * fingerprints back to their URL text via a sorted sidecar index file.
 *
 * @author rana
 */
public class CompressURLListV2 {
public static final Log LOG = LogFactory
.getLog(CompressURLListV2.class);
public static final int INDEX_RECORD_SIZE = 40; // 4 fingerprint longs + 1 data-offset long
public static final int URL_DATA_FIXED_RECORD_HEADER = 41; // version byte + crc long + 4 fingerprint longs
public static final int DATA_BLOCK_SIZE = 256 * 1024;
public static final int MAX_DATA_BUFFER_SIZE = 245 * 1024; // kept below DATA_BLOCK_SIZE to leave header headroom
public static final int URL_ID_RECORD_SIZE = 8; // one url-hash long per url
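/**
 * Writer side: buffers (fingerprint, url) pairs, gzip-compresses each
 * block's url data, and emits one 40-byte index record (first/last
 * fingerprint plus data-file offset) per flushed block. A minimal usage
 * sketch, assuming items are added in fingerprint sort order; the
 * FileSystem setup, paths, and hash values below are hypothetical, and a
 * TextBytes(String) constructor is assumed:
 *
 * <pre>{@code
 * FileSystem fs = FileSystem.getLocal(new Configuration());
 * FSDataOutputStream indexOut = fs.create(new Path("/tmp/urls.index"));
 * FSDataOutputStream dataOut = fs.create(new Path("/tmp/urls.data"));
 * Builder builder = new Builder(indexOut, dataOut);
 * URLFPV2 fp = new URLFPV2();
 * fp.setDomainHash(someDomainHash); // hypothetical precomputed hash
 * fp.setUrlHash(someUrlHash);       // hypothetical precomputed hash
 * builder.addItem(fp, new TextBytes("http://example.com/"));
 * builder.close(); // flushes the final partial block, closes both streams
 * }</pre>
 */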
public static class Builder {
private FSDataOutputStream indexStream;
private FSDataOutputStream dataStream;
private static class BlockCompressor {
int desiredBlockSize = -1;
DataOutputBuffer urlIDStream;
DataOutputBuffer urlStream;
DataOutputBuffer metadataStream;
int urlCount;
int lastURLDataLength;
GzipCodec codec = new GzipCodec();
Compressor compressor = null;
URLFPV2 firstItem = null;
URLFPV2 lastItem = null;
public BlockCompressor(Configuration conf, int desiredBlockSize) {
conf.setInt("io.file.buffer.size", DATA_BLOCK_SIZE);
codec.setConf(conf);
compressor = codec.createCompressor();
this.desiredBlockSize = desiredBlockSize;
reset();
}
private void reset() {
compressor.reset();
urlIDStream = new DataOutputBuffer();
urlStream = new DataOutputBuffer();
metadataStream = new DataOutputBuffer();
urlCount = 0;
lastURLDataLength = 0;
firstItem = null;
lastItem = null;
}
public boolean addItem(URLFPV2 fingerprint, TextBytes urlBytes)
throws IOException {
if (firstItem == null) {
firstItem = new URLFPV2();
firstItem.setDomainHash(fingerprint.getDomainHash());
firstItem.setUrlHash(fingerprint.getUrlHash());
}
if (lastItem == null) {
lastItem = new URLFPV2();
}
// update last item pointer
lastItem.setDomainHash(fingerprint.getDomainHash());
lastItem.setUrlHash(fingerprint.getUrlHash());
// increment url count
urlCount++;
// write url id
urlIDStream.writeLong(fingerprint.getUrlHash());
// write out url length (delta)
WritableUtils.writeVInt(metadataStream, urlBytes.getLength()
- lastURLDataLength);
// update last url data length
lastURLDataLength = urlBytes.getLength();
// write url data
urlStream.write(urlBytes.getBytes(), 0, urlBytes.getLength());
// signal a flush once the estimated serialized size (30 bytes is a
// rough allowance for the block header) reaches the desired block size
if (30 + metadataStream.getLength() + urlIDStream.getLength()
+ urlStream.getLength() >= desiredBlockSize) {
return true;
}
return false;
}
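// Worked example of the delta encoding above (illustrative values):
// urls of length 20, 22, 21 added in sequence are written to the
// metadata stream as vints 20, +2, -1; the reader reconstructs each
// absolute length by accumulating the deltas (see mapURLFPToURL below).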
public void flush(FSDataOutputStream indexStream,
FSDataOutputStream finalDataStream) throws IOException {
if (urlCount > 0) {
// ok write out index
indexStream.writeLong(firstItem.getDomainHash());
indexStream.writeLong(firstItem.getUrlHash());
indexStream.writeLong(lastItem.getDomainHash());
indexStream.writeLong(lastItem.getUrlHash());
indexStream.writeLong(finalDataStream.getPos());
indexStream.flush();
DataOutputBuffer dataStream = new DataOutputBuffer();
// construct a crc object
CRC32 crc = new CRC32();
// and url data stream
dataStream.writeLong(firstItem.getDomainHash());
dataStream.writeLong(firstItem.getUrlHash());
dataStream.writeLong(lastItem.getDomainHash());
dataStream.writeLong(lastItem.getUrlHash());
// ok write out url count ...
WritableUtils.writeVInt(dataStream, urlCount);
// and lengths stream size
WritableUtils.writeVInt(dataStream, metadataStream.getLength());
// write url data uncompressed length
WritableUtils.writeVInt(dataStream, urlStream.getLength());
// ok now url data stream
dataStream.write(urlIDStream.getData(), 0, urlIDStream.getLength());
// now lengths
dataStream.write(metadataStream.getData(), 0, metadataStream
.getLength());
// now finally compress the url data
DataOutputBuffer urlDataCompressed = new DataOutputBuffer();
CompressionOutputStream compressionStream = codec.createOutputStream(
urlDataCompressed, compressor);
try {
compressionStream.write(urlStream.getData(), 0, urlStream
.getLength());
compressionStream.flush();
} finally {
compressionStream.close();
}
// ok compute crc up to this point
crc.update(dataStream.getData(), 0, dataStream.getLength());
// next compute crc for compressed data
crc.update(urlDataCompressed.getData(), 0, urlDataCompressed
.getLength());
// write out the format version byte
finalDataStream.writeByte(0); // version
// then the block checksum
finalDataStream.writeLong(crc.getValue());
// write out data
finalDataStream
.write(dataStream.getData(), 0, dataStream.getLength());
// and write out compressed data
finalDataStream.write(urlDataCompressed.getData(), 0,
urlDataCompressed.getLength());
finalDataStream.flush();
}
reset();
}
}
BlockCompressor compressor = null;
public Builder(FSDataOutputStream indexStream, FSDataOutputStream dataStream) {
this.indexStream = indexStream;
this.dataStream = dataStream;
this.compressor = new BlockCompressor(new Configuration(),
MAX_DATA_BUFFER_SIZE);
}
public void addItem(URLFPV2 fingerprint, TextBytes url) throws IOException {
if (compressor.addItem(fingerprint, url)) {
compressor.flush(indexStream, dataStream);
}
}
public void close() throws IOException {
compressor.flush(indexStream, dataStream);
indexStream.close();
dataStream.close();
}
}
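/**
 * Optional lookup cursor: caches the most recently decompressed url data
 * block, so consecutive lookups that land in the same block can skip the
 * decompression step in mapURLFPToURL.
 */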
public static class IndexCursor {
public IndexCursor() {
}
long dataOffset = -1;
byte decompressedBytes[] = null;
}
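/**
 * Reader side. Each IndexFile pairs a ".index" file of sorted 40-byte
 * records with its ".data" sibling and maps fingerprints back to url
 * text. A minimal lookup sketch; the path and fingerprint here are
 * hypothetical:
 *
 * <pre>{@code
 * Index.IndexFile indexFile = new Index.IndexFile(new File("/tmp/urls.index"));
 * IndexCursor cursor = new IndexCursor(); // reused to cache blocks
 * TextBytes url = indexFile.mapURLFPToURL(fp, cursor);
 * if (url != null) {
 *   System.out.println(url.toString());
 * }
 * }</pre>
 */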
public static class Index {
public static class IndexFile {
public File _localIndexFilePath;
public File _localDataFilePath;
public ByteBuffer _indexDataBuffer;
public int _recordCount = -1;
public IndexFile(File localIndexFilePath) throws IOException {
_localIndexFilePath = localIndexFilePath;
String baseName = _localIndexFilePath.getName();
baseName = baseName.substring(0, baseName.lastIndexOf('.'));
_localDataFilePath = new File(_localIndexFilePath.getParentFile(),
baseName + ".data");
// LOG.info("Index File:" + _localIndexFilePath + " DataFile:" +
// _localDataFilePath + " Loading");
loadIndex();
// LOG.info("Index File:" + _localIndexFilePath + " DataFile:" +
// _localDataFilePath + " Loaded");
}
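/**
 * Reads the entire index file into a heap ByteBuffer; the record count
 * is derived from the file length (one INDEX_RECORD_SIZE record per
 * data block).
 */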
private void loadIndex() throws IOException {
_indexDataBuffer = ByteBuffer.allocate((int) _localIndexFilePath
.length());
// LOG.info("Loading Index Buffer From File:" + _localIndexFilePath);
BufferedInputStream inputStream = new BufferedInputStream(
new FileInputStream(_localIndexFilePath));
try {
// read() may legally return fewer bytes than requested, so loop on
// the actual count and fail fast on premature EOF
for (int offset = 0; offset < _indexDataBuffer.capacity();) {
int bytesToRead = Math.min(16384, _indexDataBuffer.capacity()
- offset);
int bytesRead = inputStream.read(_indexDataBuffer.array(), offset,
bytesToRead);
if (bytesRead == -1) {
throw new IOException("Unexpected EOF reading index file:"
+ _localIndexFilePath);
}
offset += bytesRead;
}
_recordCount = (int) (_localIndexFilePath.length() / INDEX_RECORD_SIZE);
} finally {
if (inputStream != null) {
inputStream.close();
}
}
}
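/**
 * Binary search over the sorted index records. Each record stores the
 * first and last fingerprint of one data block plus the block's byte
 * offset in the data file; a hit is any record whose [first, last]
 * range contains the search term.
 *
 * @return the matching block's data-file offset, or -1 if not found
 */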
final long findDataOffsetInIndex(URLFPV2 searchTerm) {
ByteBuffer indexDataBuffer = _indexDataBuffer.asReadOnlyBuffer();
URLFPV2 indexFPLow = new URLFPV2();
URLFPV2 indexFPHigh = new URLFPV2();
int low = 0;
int high = _recordCount - 1;
while (low <= high) {
int mid = low + ((high - low) / 2);
indexFPLow.setDomainHash(indexDataBuffer.getLong(mid
* INDEX_RECORD_SIZE + (0 * 8)));
indexFPLow.setUrlHash(indexDataBuffer.getLong(mid * INDEX_RECORD_SIZE
+ (1 * 8)));
indexFPHigh.setDomainHash(indexDataBuffer.getLong(mid
* INDEX_RECORD_SIZE + (2 * 8)));
indexFPHigh.setUrlHash(indexDataBuffer.getLong(mid
* INDEX_RECORD_SIZE + (3 * 8)));
int result = indexFPLow.compareTo(searchTerm);
if (result <= 0) {
if (result == 0) {
// LOG.info("Matched Index Record for (DH):" +
// searchTerm.getDomainHash() + " (UH):" + searchTerm.getUrlHash()
// + " index:" + mid);
// LOG.info("fpLow is (DH):"+ indexFPLow.getDomainHash() + "(UH):"
// + indexFPLow.getUrlHash());
// LOG.info("fpHigh is (DH):"+ indexFPHigh.getDomainHash() +
// "(UH):" + indexFPHigh.getUrlHash());
return indexDataBuffer.getLong(mid * INDEX_RECORD_SIZE + (4 * 8));
} else {
result = indexFPHigh.compareTo(searchTerm);
if (result >= 0) {
// LOG.info("Matched Index Record for (DH):" +
// searchTerm.getDomainHash() + " (UH):" +
// searchTerm.getUrlHash() + " index:" + mid);
// LOG.info("fpLow is (DH):"+ indexFPLow.getDomainHash() +
// "(UH):" + indexFPLow.getUrlHash());
// LOG.info("fpHigh is (DH):"+ indexFPHigh.getDomainHash() +
// "(UH):" + indexFPHigh.getUrlHash());
return indexDataBuffer.getLong(mid * INDEX_RECORD_SIZE
+ (4 * 8));
}
}
}
if (result > 0)
high = mid - 1;
else if (result < 0)
low = mid + 1;
}
// LOG.info("Failed to find Index Record for (DH):" +
// searchTerm.getDomainHash() + " (UH):" + searchTerm.getUrlHash());
// LOG.info("fpLow is (DH):"+ indexFPLow.getDomainHash() + "(UH):" +
// indexFPLow.getUrlHash());
// LOG.info("fpHigh is (DH):"+ indexFPHigh.getDomainHash() + "(UH):" +
// indexFPHigh.getUrlHash());
return -1; // not found
}
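/**
 * ByteBuffer analogue of WritableUtils.readVLong: decodes the same
 * variable-length integer encoding, reading from the buffer's current
 * position.
 */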
private static long readVLongFromByteBuffer(ByteBuffer source) {
byte firstByte = source.get();
int len = WritableUtils.decodeVIntSize(firstByte);
if (len == 1) {
return firstByte;
}
long i = 0;
for (int idx = 0; idx < len - 1; idx++) {
byte b = source.get();
i = i << 8;
i = i | (b & 0xFF);
}
return (WritableUtils.isNegativeVInt(firstByte) ? (i ^ -1L) : i);
}
private static final byte[] decompressBytes(ByteBuffer buffer,
int decompressedBufferSize) throws IOException {
Configuration conf = new Configuration();
conf.setInt("io.file.buffer.size", DATA_BLOCK_SIZE);
GzipCodec codec = new GzipCodec();
codec.setConf(conf);
// ok time to decompress
CompressionInputStream compressionInput = codec.createInputStream(
new ByteBufferInputStream(buffer), new ZlibDecompressor(
ZlibDecompressor.CompressionHeader.AUTODETECT_GZIP_ZLIB,
DATA_BLOCK_SIZE));
try {
byte data[] = new byte[decompressedBufferSize];
// a single read() may return fewer bytes than requested, so
// accumulate until the buffer is full or the stream ends
int totalRead = 0;
while (totalRead < decompressedBufferSize) {
int bytesRead = compressionInput.read(data, totalRead,
decompressedBufferSize - totalRead);
if (bytesRead == -1) {
break;
}
totalRead += bytesRead;
}
if (totalRead != decompressedBufferSize) {
LOG.error("Decompress. Expected Uncompressed Size of:"
+ decompressedBufferSize + " Got:" + totalRead);
throw new IOException("Expected Uncompressed Size of:"
+ decompressedBufferSize + " Got:" + totalRead);
}
return data;
} finally {
compressionInput.close();
}
}
private static final TextBytes decodeURL(byte[] decompressedBytes,
int urlDataPos, int urlDataLength) throws IOException {
// allocate a buffer
TextBytes txtBytesOut = new TextBytes();
// copy corresponding url bytes into it
txtBytesOut.set(decompressedBytes, urlDataPos, urlDataLength);
// LOG.info("Returning Text Bytes At Pos:" + urlDataPos + " urlDataLen:"
// + urlDataLength);
// return the buffer
return txtBytesOut;
}
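/**
 * Maps a fingerprint to its url text: locates the owning block via the
 * index, memory-maps it, scans the url-hash array while accumulating
 * the delta-encoded url lengths, and on a match decompresses the
 * block's url data (or reuses the cursor's cached copy) to slice out
 * the url bytes.
 *
 * @param cursor optional block cache; may be null
 * @return the url text, or null if the fingerprint is not present
 */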
public TextBytes mapURLFPToURL(URLFPV2 fingerprint, IndexCursor cursor)
throws IOException {
long dataOffset = findDataOffsetInIndex(fingerprint);
// LOG.info("Data Offset for (DH):" + fingerprint.getDomainHash() +
// " (UH):" + fingerprint.getUrlHash() + " is:" + dataOffset);
// if found ...
if (dataOffset != -1) {
// ok search the url list ...
// open the data file ...
RandomAccessFile file = null;
try {
// LOG.info("Opening Data File At:" + _localDataFilePath);
file = new RandomAccessFile(_localDataFilePath, "r");
// LOG.info("Mapping Memory At:" + dataOffset);
// map the proper location
long mapSize = Math
.min(file.length() - dataOffset, DATA_BLOCK_SIZE);
MappedByteBuffer memoryBuffer = file.getChannel().map(
MapMode.READ_ONLY, dataOffset, mapSize);
// ok load the url list
memoryBuffer.position(URL_DATA_FIXED_RECORD_HEADER);
// LOG.info("Skipping Past:" + URL_DATA_FIXED_RECORD_HEADER +
// " bytes");
// read url count and metadata buffer length
int urlCount = (int) readVLongFromByteBuffer(memoryBuffer);
int metadataLength = (int) readVLongFromByteBuffer(memoryBuffer);
int urlBufferLength = (int) readVLongFromByteBuffer(memoryBuffer);
// LOG.info("URLCount:"+ urlCount + " metadataLength:" +
// metadataLength + " urlBufferLength:" + urlBufferLength);
memoryBuffer.mark();
memoryBuffer.position(memoryBuffer.position() + URL_ID_RECORD_SIZE
* urlCount);
ByteBuffer metadataAndURLDataReader = memoryBuffer.slice();
memoryBuffer.reset();
int urlDataPos = 0;
int lastURLDataLength = 0;
// ok start reading ...
for (int i = 0; i < urlCount; ++i) {
long urlFPValue = memoryBuffer.getLong();
int urlDataLength = lastURLDataLength
+ (int) readVLongFromByteBuffer(metadataAndURLDataReader);
int result = (urlFPValue < fingerprint.getUrlHash()) ? -1
: urlFPValue == fingerprint.getUrlHash() ? 0 : 1;
if (result == 0) {
// LOG.info("Found Match At Pos:"+ i + "urlDataLength:" +
// urlDataLength + " offset:" + urlDataPos + " urlBufferLength:"
// + urlBufferLength);
// ok match found ...
// time to decompress and return the result ...
metadataAndURLDataReader.position(metadataLength);
byte decompressedBytes[] = null;
if (cursor != null && cursor.dataOffset == dataOffset) {
decompressedBytes = cursor.decompressedBytes;
} else {
decompressedBytes = decompressBytes(metadataAndURLDataReader
.slice(), urlBufferLength);
}
if (cursor != null) {
cursor.decompressedBytes = decompressedBytes;
cursor.dataOffset = dataOffset;
}
return decodeURL(decompressedBytes, urlDataPos, urlDataLength);
} else {
// ok advance to next record ... but accumulate url data
urlDataPos += urlDataLength;
lastURLDataLength = urlDataLength;
}
}
} finally {
if (file != null) {
file.close();
}
}
}
// not found ...
return null;
}
}
Path _indexPaths[] = null;
FileSystem _fileSystem = null;
public Index(FileSystem fileSystem, Path[] indexPaths) {
_fileSystem = fileSystem;
_indexPaths = indexPaths;
}
}
}