package org.commoncrawl.mapred.ec2.postprocess.crawldb;
import java.io.IOException;
import static org.mockito.Mockito.mock;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.InputBuffer;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.Counters.Counter;
import org.commoncrawl.hadoop.mergeutils.MergeSortSpillWriter;
import org.commoncrawl.hadoop.mergeutils.RawDataSpillWriter;
import org.commoncrawl.hadoop.mergeutils.RawKeyValueComparator;
import org.commoncrawl.hadoop.mergeutils.SequenceFileSpillWriter;
import org.commoncrawl.util.TextBytes;
import com.google.common.collect.Iterators;
public class SortPartitionData {
static class Comparator implements RawKeyValueComparator<TextBytes, TextBytes> {
CrawlDBKey.LinkKeyComparator comparator = new CrawlDBKey.LinkKeyComparator();
@Override
public int compare(TextBytes key1, TextBytes value1, TextBytes key2,
TextBytes value2) {
return comparator.compare(key1, key2);
}
@Override
public int compareRaw(byte[] key1Data, int key1Offset, int key1Length,
byte[] key2Data, int key2Offset, int key2Length, byte[] value1Data,
int value1Offset, int value1Length, byte[] value2Data,
int value2Offset, int value2Length) throws IOException {
return comparator.compare(key1Data, key1Offset, key1Length, key2Data, key2Offset, key2Length);
}
}
public static void main(String[] args) throws IOException {
Path inputPath = new Path(args[0]);
Path outputPath = new Path(args[1]);
Configuration conf = new Configuration();
final SequenceFile.Writer outputWriter = SequenceFile.createWriter(
FileSystem.get(outputPath.toUri(),conf),
conf,
outputPath,
TextBytes.class,
TextBytes.class,
CompressionType.BLOCK,
new GzipCodec());
final CrawlDBMergingReducer crawlDBWriter = new CrawlDBMergingReducer();
crawlDBWriter.configure(new JobConf(conf));
final OutputCollector<TextBytes,TextBytes> collector = new OutputCollector<TextBytes, TextBytes>() {
@Override
public void collect(TextBytes key, TextBytes value) throws IOException {
outputWriter.append(key, value);
}
};
RawDataSpillWriter<TextBytes, TextBytes> spillWriter = new RawDataSpillWriter<TextBytes, TextBytes>() {
TextBytes _key = new TextBytes();
TextBytes _value = new TextBytes();
DataInputBuffer _buffer = new DataInputBuffer();
@Override
public void spillRecord(TextBytes key, TextBytes value) throws IOException {
crawlDBWriter.reduce(key, Iterators.forArray(value), collector, mock(Reporter.class));
}
@Override
public void close() throws IOException {
crawlDBWriter.close();
outputWriter.close();
}
@Override
public void spillRawRecord(byte[] keyData, int keyOffset, int keyLength,
byte[] valueData, int valueOffset, int valueLength) throws IOException {
_buffer.reset(keyData,keyOffset,keyLength);
int realLength = WritableUtils.readVInt(_buffer);
_key.set(keyData, keyOffset + _buffer.getPosition(), realLength);
_buffer.reset(valueData,valueOffset,valueLength);
realLength = WritableUtils.readVInt(_buffer);
_value.set(valueData,valueOffset + _buffer.getPosition(),realLength);
spillRecord(_key, _value);
}
};
//SequenceFileSpillWriter<TextBytes, TextBytes> finalWriter
// = new SequenceFileSpillWriter<TextBytes, TextBytes>(FileSystem.get(outputPath.toUri(),conf), conf, outputPath, TextBytes.class, TextBytes.class, null, true);
conf.setInt(MergeSortSpillWriter.SPILL_INDEX_BUFFER_SIZE_PARAM, 100000000);
conf.setInt(SequenceFileSpillWriter.SPILL_WRITER_BUFFER_SIZE_PARAM,100000000);
MergeSortSpillWriter merger
= new MergeSortSpillWriter<TextBytes,TextBytes>(
conf,
spillWriter,
FileSystem.getLocal(conf),
new Path("/tmp"),
null,
new Comparator(),
TextBytes.class,
TextBytes.class,
true,
null);
try {
SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(inputPath.toUri(),conf), inputPath, conf);
TextBytes key = new TextBytes();
TextBytes value = new TextBytes();
while (reader.next(key, value)) {
merger.spillRecord(key, value);
}
}
finally {
merger.close();
spillWriter.close();
}
}
}