/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.util;

import java.io.DataOutputStream;
import java.io.IOException;
import java.util.zip.CRC32;
import java.util.zip.CheckedOutputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.Compressor;
import org.apache.hadoop.io.compress.GzipCodec;

public class CompressedIndex {

  public static final Log LOG = LogFactory.getLog(CompressedIndex.class);

  public static final int DATA_BLOCK_SIZE = 256 * 1024;
  public static final int MAX_DATA_BUFFER_SIZE = 245 * 1024;

  public static class Builder {

    private FSDataOutputStream indexStream;
    private FSDataOutputStream dataStream;

    private static class BlockCompressor {

      int desiredBlockSize = -1;
      DataOutputBuffer keyDataStream;
      DataOutputBuffer valueDataStream;
      DataOutputBuffer metadataStream;
      int entryCount;
      int lastKeyLength;
      int lastDataLength;
      GzipCodec codec = new GzipCodec();
      Compressor compressor = null;
      DataOutputBuffer firstKeyBuffer = new DataOutputBuffer();
      DataOutputBuffer lastKeyBuffer = new DataOutputBuffer();
      DataOutputBuffer firstKey = null;
      DataOutputBuffer lastKey = null;

      public BlockCompressor(Configuration conf, int desiredBlockSize) {
        conf.setInt("io.file.buffer.size", DATA_BLOCK_SIZE);
        codec.setConf(conf);
        compressor = codec.createCompressor();
        this.desiredBlockSize = desiredBlockSize;
        reset();
      }

      private void reset() {
        compressor.reset();
        keyDataStream = new DataOutputBuffer();
        valueDataStream = new DataOutputBuffer();
        metadataStream = new DataOutputBuffer();
        entryCount = 0;
        lastKeyLength = 0;
        lastDataLength = 0;
        firstKey = null;
        lastKey = null;
      }

      public boolean addItem(FlexBuffer keyBytes, FlexBuffer dataBytes)
          throws IOException {

        // remember the first key of the block ...
        if (firstKey == null) {
          firstKeyBuffer.reset();
          firstKeyBuffer.write(keyBytes.get(), keyBytes.getOffset(),
              keyBytes.getCount());
          firstKey = firstKeyBuffer;
        }
        // ... and always track the last key seen
        if (lastKey == null) {
          lastKey = firstKeyBuffer;
        } else {
          lastKeyBuffer.reset();
          lastKeyBuffer.write(keyBytes.get(), keyBytes.getOffset(),
              keyBytes.getCount());
          lastKey = lastKeyBuffer;
        }

        // increment count
        entryCount++;
        // write out key length (delta against the previous key length) ...
        WritableUtils.writeVInt(metadataStream, keyBytes.getCount()
            - lastKeyLength);
        // update last key length
        lastKeyLength = keyBytes.getCount();
        // write key to block stream
        keyDataStream.write(keyBytes.get(), keyBytes.getOffset(),
            keyBytes.getCount());
        // write out data length (delta against the previous data length)
        WritableUtils.writeVInt(metadataStream, dataBytes.getCount()
            - lastDataLength);
        // update last url data length
        lastDataLength = dataBytes.getCount();
        // write url data
        valueDataStream.write(dataBytes.get(), dataBytes.getOffset(),
            dataBytes.getCount());

        // signal a flush once the buffers (plus header overhead) reach the
        // desired block size
        if (30 + metadataStream.getLength() + keyDataStream.getLength()
            + valueDataStream.getLength() >= desiredBlockSize) {
          return true;
        }
        return false;
      }

      /**
       * Write this block's index entry and the block itself (header, keys,
       * length metadata, and gzip-compressed values) to the output streams,
       * then reset for the next block.
       */
      public void flush(DataOutputBuffer indexPosStream,
          DataOutputBuffer indexDataStream, FSDataOutputStream finalDataStream)
          throws IOException {

        if (entryCount > 0) {
          // write out index position ...
          indexPosStream.writeLong(indexDataStream.getLength());
          // ok write out index entry: first key, last key, block offset
          WritableUtils.writeVInt(indexDataStream, firstKey.getLength());
          indexDataStream.write(firstKey.getData(), 0, firstKey.getLength());
          WritableUtils.writeVInt(indexDataStream, lastKey.getLength());
          indexDataStream.write(lastKey.getData(), 0, lastKey.getLength());
          indexDataStream.writeLong(finalDataStream.getPos());
          indexPosStream.flush();
          indexDataStream.flush();

          DataOutputBuffer dataStream = new DataOutputBuffer();
          // construct a crc object
          CRC32 crc = new CRC32();

          // ok write out entry count ...
          WritableUtils.writeVInt(dataStream, entryCount);
          // and lengths (metadata) stream size
          WritableUtils.writeVInt(dataStream, metadataStream.getLength());
          // write url data uncompressed length
          WritableUtils.writeVInt(dataStream, valueDataStream.getLength());
          // ok now key data stream
          dataStream.write(keyDataStream.getData(), 0,
              keyDataStream.getLength());
          // now lengths
          dataStream.write(metadataStream.getData(), 0,
              metadataStream.getLength());

          // now finally compress the url data
          DataOutputBuffer urlDataCompressed = new DataOutputBuffer();
          CompressionOutputStream compressionStream = codec.createOutputStream(
              urlDataCompressed, compressor);
          try {
            compressionStream.write(valueDataStream.getData(), 0,
                valueDataStream.getLength());
            compressionStream.flush();
          } finally {
            compressionStream.close();
          }

          // ok compute crc over the uncompressed header/key/metadata ...
          crc.update(dataStream.getData(), 0, dataStream.getLength());
          // ... and over the compressed data
          crc.update(urlDataCompressed.getData(), 0,
              urlDataCompressed.getLength());

          // write out version byte
          finalDataStream.writeByte(0); // version
          // ok now write out checksum
          finalDataStream.writeLong(crc.getValue());
          // write out data
          finalDataStream.write(dataStream.getData(), 0,
              dataStream.getLength());
          // and write out compressed data
          finalDataStream.write(urlDataCompressed.getData(), 0,
              urlDataCompressed.getLength());
          finalDataStream.flush();
        }
        reset();
      }
    }

    BlockCompressor compressor = null;
    DataOutputBuffer blockIndexPosStream = new DataOutputBuffer();
    DataOutputBuffer blockIndexDataStream = new DataOutputBuffer();

    public Builder(FSDataOutputStream indexDataStream,
        FSDataOutputStream dataStream) {
      this.indexStream = indexDataStream;
      this.dataStream = dataStream;
      this.compressor = new BlockCompressor(new Configuration(),
          MAX_DATA_BUFFER_SIZE);
    }

    public void addItem(FlexBuffer key, FlexBuffer value) throws IOException {
      // flush the current block to the data stream once the compressor
      // reports it full
      if (compressor.addItem(key, value)) {
        compressor.flush(blockIndexPosStream, blockIndexDataStream, dataStream);
      }
    }

    public void close() throws IOException {
      // flush any remaining entries in the current block
      compressor.flush(blockIndexPosStream, blockIndexDataStream, dataStream);
      CRC32 crc = new CRC32();
      // checksum everything written through the checked stream with the same
      // crc instance whose value is written out at the end
      DataOutputStream checkedOutputStream = new DataOutputStream(
          new CheckedOutputStream(indexStream, crc));

      // write out cumulative length
      indexStream.writeInt(blockIndexPosStream.getLength()
          + blockIndexDataStream.getLength());
      // write out data via checked stream
      WritableUtils.writeVInt(checkedOutputStream, blockIndexPosStream
          .getLength());
      checkedOutputStream.write(blockIndexPosStream.getData(), 0,
          blockIndexPosStream.getLength());
      WritableUtils.writeVInt(checkedOutputStream, blockIndexDataStream
          .getLength());
      checkedOutputStream.write(blockIndexDataStream.getData(), 0,
          blockIndexDataStream.getLength());
      checkedOutputStream.flush();
      // write out crc at end
      indexStream.writeLong(crc.getValue());
      indexStream.flush();
      indexStream.close();
      dataStream.close();
    }
  }
}
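
/*
 * Usage sketch (illustrative, not part of the original source): one way the
 * Builder above might be driven. The FileSystem/Path setup and the FlexBuffer
 * byte[] constructor are assumptions; any pair of FSDataOutputStreams (one for
 * the block index, one for the block data) and any way of wrapping key/value
 * bytes in FlexBuffers would exercise the same code path.
 *
 *   Configuration conf = new Configuration();
 *   org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.get(conf);
 *   FSDataOutputStream indexOut = fs.create(new org.apache.hadoop.fs.Path("/tmp/sample.index"));
 *   FSDataOutputStream dataOut = fs.create(new org.apache.hadoop.fs.Path("/tmp/sample.data"));
 *
 *   CompressedIndex.Builder builder = new CompressedIndex.Builder(indexOut, dataOut);
 *   // assumes FlexBuffer exposes a byte[] constructor
 *   builder.addItem(new FlexBuffer("key-0001".getBytes()), new FlexBuffer("value-0001".getBytes()));
 *   // ... further addItem calls; each full block is compressed and flushed automatically ...
 *   builder.close(); // flushes the final block and writes the checksummed block index
 */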