package org.archive.hadoop.mapreduce;
import java.io.IOException;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
/**
* RecordWriter which produces "zipnum" output format. This is fairly
* specific to the needs of the Wayback Machine CDX "clusters".
* <p>
* It only handles Text keys and values, and only outputs text records
* in "zipnum" format. The output text format is just the key and
* value that are passed in, delimited by DELIMITER (' ').
* <p>
* For "zipnum" format, the output records are compressed with the
* given Hadoop <code>codec</code>, and after <code>limit</code> lines
* are written, the current compression envelope is finished and a new
* one is started. This gives us the "catenated compression envelopes"
* format that is used frequently at Internet Archive.
* <p>
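* Whenever a compression envelope is closed, a summary line is
* written to an <code>*-idx</code> file. This summary/idx file
* records the summary key of the first record in the compression
* envelope (its first two space-delimited fields), the partition
* name, the starting byte-offset of the envelope, and the envelope
* size in bytes, e.g. (with &lt;TAB&gt; standing for a tab character)
* <pre>
* org,example)/ 20130101000000&lt;TAB&gt;part-00000&lt;TAB&gt;0&lt;TAB&gt;128
* </pre>
* The fields of the summary/idx file are delimited with tabs.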
* <p>
* The trick to make this work is to end each compression envelope
* without closing the underlying output stream.
* <p>
* When we finish one compression envelope, we call
* <code>flush()</code> and <code>finish()</code> on the compression
* stream to ensure that the compressed output is flushed and the
* compression footers are written to the underlying output stream.
* We deliberately do not call <code>close()</code> at that point,
* because closing the compression stream would also close the
* underlying output stream, and we still need it for the next
* compression envelope. Instead, we call <code>resetState()</code>
* and keep compressing on top of the existing underlying file
* output stream.
*/
public class ZipNumRecordWriter extends RecordWriter<Text, Text>
{
    // Since we are writing binary output, we just create some
    // <strong>int</strong> values for the literal characters we pass
    // to OutputStream.write(int) later.
    public static final int DELIMITER = ' ';
    public static final int NEWLINE = '\n';
    public static final int SUMMARY_DELIMITER = '\t';

    /** Underlying data stream; its position supplies envelope offsets and sizes. */
    public FSDataOutputStream out;
    /** Codec used to create the compression stream on top of {@link #out}. */
    public CompressionCodec codec;
    /** Compression stream that records are written to. */
    public CompressionOutputStream compressing;
    /** Stream receiving one summary (idx) line per compression envelope. */
    public FSDataOutputStream summary;
    /** Partition name written as the second field of every summary line. */
    public String partitionName;
    /** Summary key of the first record of the current envelope; null until a record is written. */
    public Text startKey;
    /** Position in {@link #out} at which the current envelope starts. */
    public long oldPos = 0;
    /** Number of records written to the current envelope so far. */
    public long count = 0;
    /** Number of records after which the current envelope is finished. */
    public long limit = 0;

    /**
     * Construct a ZipNumRecordWriter and open the compression stream on
     * top of <code>out</code>.
     *
     * @param codec         codec used to create the compression stream
     * @param out           stream the compressed envelopes are written to
     * @param summary       stream the summary (idx) lines are written to
     * @param partitionName name recorded in every summary line
     * @param limit         number of records per compression envelope; the
     *                      envelope is only rolled when the record count
     *                      becomes exactly equal to this value
     * @throws IOException if the compression stream cannot be created
     */
    public ZipNumRecordWriter( CompressionCodec codec, FSDataOutputStream out, FSDataOutputStream summary, String partitionName, long limit ) throws IOException
    {
        this.limit = limit;
        this.out = out;
        this.codec = codec;
        /* Create CompressionOutputStream once when starting */
        this.compressing = codec.createOutputStream( out );
        this.summary = summary;
        this.partitionName = partitionName;
    }

    /**
     * Get the key that will be written in the summary file.
     * Currently this is the first 2 cdx fields (the url key and date),
     * i.e. everything before the second DELIMITER. If the line has fewer
     * than two delimiters, the whole line is used and a warning is
     * printed to stderr.
     *
     * @param cdx the record line to derive the key from
     * @return the summary key, with any tab replaced by <code>%09</code>
     */
    protected String getCdxSummaryKey(String cdx)
    {
        int spaceIndex = cdx.indexOf(DELIMITER);
        if (spaceIndex >= 0) {
            spaceIndex = cdx.indexOf(DELIMITER, spaceIndex + 1);
        }

        String summaryKey;
        if (spaceIndex >= 0) {
            summaryKey = cdx.substring(0, spaceIndex);
        } else {
            System.err.println("POSSIBLY INVALID CDX LINE: " + cdx);
            summaryKey = cdx;
        }

        // Ensure no tabs are present in the key, since tab is the
        // field delimiter of the summary file.
        return summaryKey.replace("\t", "%09");
    }

    /**
     * Write the key,value pair to the compressed output stream. Once we write <code>limit</code>
     * records, finish the compression envelope and start another one; also write a summary line.
     * <p>
     * If either the key or the value is empty, only the non-empty one is
     * written; otherwise they are written separated by DELIMITER. Every
     * record is terminated by NEWLINE.
     *
     * @param key   record key
     * @param value record value
     * @throws IOException if writing to the data or summary stream fails
     */
    @Override
    public synchronized void write( Text key, Text value ) throws IOException
    {
        if ( count == 0 )
        {
            // NOTE: It's important to create a *new* Text here. The
            // 'key' passed-in is modified by the caller. So if we
            // just keep a reference to the 'key', then those
            // modifications will also apply to our 'startKey'.
            String summaryKey = getCdxSummaryKey(key.getLength() > 0 ? key.toString() : value.toString());
            startKey = new Text( summaryKey );
        }

        // Write the output record to the compressing stream.
        // Text.getBytes() may return a backing array longer than the
        // content, so always bound the write by getLength().
        if (value.getLength() == 0) {
            compressing.write( key.getBytes(), 0, key.getLength() );
        } else if (key.getLength() == 0) {
            compressing.write( value.getBytes(), 0, value.getLength() );
        } else {
            compressing.write( key.getBytes(), 0, key.getLength() );
            compressing.write( DELIMITER );
            compressing.write( value.getBytes(), 0, value.getLength() );
        }
        compressing.write( NEWLINE );
        count++;

        if ( count == limit )
        {
            // Flush and finish the current compression block/envelope so
            // that all of its bytes are in 'out' before we measure it.
            compressing.flush();
            compressing.finish();
            writeSummary();

            // Save the position and start the next compression envelope.
            oldPos = out.getPos();
            count = 0;

            // Reset Compression stream to begin compressing again w/o resetting underlying stream
            compressing.resetState();
        }
    }

    /**
     * Finish the last compression envelope, write its summary line if it
     * contains any records, then close the underlying output streams.
     * <p>
     * The summary line is skipped when no record has been written since
     * the last envelope was finished (including when no record was
     * written at all); otherwise a stale <code>startKey</code> would
     * produce a duplicate idx entry pointing at an empty envelope.
     * All streams are closed even if finishing the envelope fails.
     *
     * @param context the task attempt context (unused)
     * @throws IOException if finishing or closing any stream fails
     */
    @Override
    public synchronized void close(TaskAttemptContext context) throws IOException
    {
        try
        {
            // I'm paranoid about flushing :)
            compressing.flush();
            compressing.finish();

            // Only describe the final envelope if it actually holds
            // records. The summary is written before the streams are
            // closed, mirroring the order used in write().
            if ( startKey != null && count > 0 )
            {
                writeSummary();
            }
            out.flush();
        }
        finally
        {
            // Close everything regardless of earlier failures.
            try
            {
                compressing.close();
            }
            finally
            {
                try
                {
                    out.close();
                }
                finally
                {
                    summary.close();
                }
            }
        }
    }

    /**
     * Convenience method to write out a summary line for the current
     * envelope: start key, partition name, envelope start offset and
     * envelope length, separated by SUMMARY_DELIMITER and terminated
     * by NEWLINE. The length is the current position of <code>out</code>
     * minus <code>oldPos</code>, so the envelope must already be finished.
     *
     * @throws IOException if writing to the summary stream fails
     */
    public void writeSummary( ) throws IOException
    {
        summary.write( startKey.getBytes(), 0, startKey.getLength() );
        summary.write( SUMMARY_DELIMITER );
        summary.write( partitionName.getBytes("UTF-8") );
        summary.write( SUMMARY_DELIMITER );
        summary.write( Long.toString( oldPos ).getBytes("UTF-8") );
        summary.write( SUMMARY_DELIMITER );
        summary.write( Long.toString( out.getPos() - oldPos ).getBytes("UTF-8") );
        summary.write( NEWLINE );
        summary.flush();
    }
}