ZipNumRecordWriter.java example

Explorer

ia-hadoop-tools-master
- src
  - main
    - java
      - org
        archive
        cassandra
        BOP.java
        CDXImporter.java
        CDXToSSTable.java
        CassCDXOutputFormat.java
        CassCDXRecordWriter.java
        CassCDXStore.java
        hadoop
        cdx
        BlockLoader.java
        CDXCluster.java
        CDXClusterRangeDumper.java
        CDXConverterTool.java
        ClusterRange.java
        HDFSBlockLoader.java
        HDFSLSR.java
        HDFSRangeDumper.java
        ManifestAggregator.java
        SplitFile.java
        SummaryGenerator.java
        ZipNumBlock.java
        ZipNumBlockIterator.java
        fs
        CollectionIndexItemSearcher.java
        IAS3.java
        ItemSearcher.java
        MetaManagerItemSearcher.java
        PetaboxFileSystem.java
        SearchEngineItemSearcher.java
        io
        HDFSTouch.java
        MergeClusterRangesInputFormat.java
        MergeClusterRangesInputSplit.java
        jobs
        ArchiveFileExtractor.java
        BuildCluster.java
        CDXGenerator.java
        CDXTransformer.java
        HTTPImportJob.java
        JobDriver.java
        MergeCluster.java
        MergeClusterRangesJob.java
        MergeClusters.java
        ProgramDriver.java
        WARCMetadataRecordGenerator.java
        WATExtractorJob.java
        WATGenerator.java
        mapreduce
        AlphaPartitioner.java
        CDXInputFormat.java
        CDXMapper.java
        GZIPMembersLineInputFormat.java
        GZIPMembersLineRecordReader.java
        GZIPRangeLineDereferencingInputFormat.java
        GZIPRangeLineDereferencingRecordReader.java
        GlobalWaybackCDXReducer.java
        GlobalWaybackMergeMapper.java
        HTTPImportMapper.java
        HttpLineRecordReader.java
        HttpTextInputFormat.java
        IdentityTextReducer.java
        LFOnlyLineReader.java
        LFOnlyLineRecordReader.java
        LineDereferencingInputFormat.java
        LineDereferencingRecordReader.java
        SimpleTextMapper.java
        SortMergeInputFormat.java
        SortMergeInputSplit.java
        WATExtractorMapper.java
        ZipNumAllOutputFormat.java
        ZipNumAllShardRecordWriter.java
        ZipNumOutputFormat.java
        ZipNumPartitioner.java
        ZipNumRecordWriter.java
        ZipNumRecordWriterOld.java
        pig
        CDXLoader.java
        CrawlLogLoader.java
        DateFilter.java
        DateFilterLSRLoader.java
        DisablablePathFilter.java
        FirstPigJobOnlyFilter.java
        HBaseStorage.java
        HFileStorage.java
        HttpClusterInputSplit.java
        HttpInputLineRecordReader.java
        HttpTextLoader.java
        HttpZipNumDerefLineRecordReader.java
        LSRPigLoader.java
        PerMapTextLoader.java
        RecordReaderValueIterator.java
        SequenceFileStorage.java
        SortedDistinctBagFactory.java
        ZipNumInputFormat.java
        ZipNumLoader.java
        ZipNumPartitioner.java
        ZipNumRecordReader.java
        ZipNumStorage.java
        udf
        AccessControlAllowCapture.java
        DateTime14ToTimestamp.java
        FromJsonFunc.java
        Recanonicalize.java
        ToJsonFunc.java
        toSURT.java
        streaming
        CdxDedupReducer.java
        CdxFilterMap.java
        CombineZipNumInputFormat.java
        CombinedGzipInputFormat.java
        GzipInputFormat.java
        GzipSingleFileRecordReader.java
        NativeZipNumOutputFormat.java
        NativeZipNumRecordWriter.java
        RecordReaderValueIterator.java
        ZipNumInputFormat.java
        ZipNumOutputFormat.java
        ZipNumPartitioner.java
        ZipNumRecordReader.java
        ZipNumRecordWriter.java
        util
        FilenameInputFormat.java
        HDFSMove.java
        HDFSSync.java
        HDFSeeko.java
        HadoopUtil.java
        PartitionName.java
        io
        ZipNumWriterTool.java
        petabox
        CookieFilePetaboxCredentialProvider.java
        HMACPetaboxAuthProvider.java
        ItemFile.java
        ItemMetadata.java
        PetaboxAuthProvider.java
        PetaboxClient.java
        PetaboxClientConfig.java
        PetaboxCredentialProvider.java
        server
        FileBackedInputStream.java
        GZRangeClient.java
        GZRangeClientTool.java
        GZRangeServer.java
  - test
    - java
      - org
        archive
        hadoop
        cdx
        CDXClusterTest.java
        SplitFileTest.java
        fs
        PetaboxFileSystemTest.java
        func
        URLResolverFuncTest.java
        pig
        CDXLoaderTest.java
        udf
        DateTime14ToTimestampTest.java
        petabox
        ItemMetadataTest.java
        server
        GZRangeClientTest.java

package org.archive.hadoop.mapreduce;

import java.io.IOException;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

/**
 * RecordWriter which produces "zipnum" output format.  This is fairly
 * specific to the needs of the Wayback Machine CDX "clusters".
 * <p>
 * It only handles Text keys and values, and only outputs text records
 * in "zipnum" format.  Each output line is the key and the value that
 * are passed in, delimited by DELIMITER (' '); if either one is
 * empty, only the other one is written (without a delimiter).
 * <p>
 * For "zipnum" format, the output records are compressed with the
 * given Hadoop <code>codec</code>, and after <code>limit</code> lines
 * are written, the current compression envelope is finished and a new
 * one is started on the same underlying stream.  This gives us the
 * "catenated compression envelopes" format that is used frequently
 * at Internet Archive.
 * <p>
 * Whenever a compression envelope is finished, a summary line is
 * written to the <code>summary</code> stream.  The line records the
 * summary key of the first record in the envelope, the partition
 * name, the starting byte-offset of the envelope and the envelope
 * size in bytes.  The fields are delimited with tabs, e.g. (with
 * <code>\t</code> standing for a tab character)
 * <pre>
 *   org,example)/ 20130101000000\tpart-00000\t0\t128
 * </pre>
 * <p>
 * A single <code>CompressionOutputStream</code> is created in the
 * constructor and reused for every envelope: <code>finish()</code>
 * ends the current envelope without closing the underlying output
 * stream, and <code>resetState()</code> prepares the stream for the
 * next envelope.
 * <p>
 * <code>write</code> and <code>close</code> are synchronized;
 * <code>writeSummary</code> is not.
 */
public class ZipNumRecordWriter extends RecordWriter<Text, Text>
{
  // Since we are writing binary output, we just create some
  // <strong>int</strong> values for our literal characters we use
  // later.
  public static final int DELIMITER = ' ';
  public static final int NEWLINE   = '\n';
  public static final int SUMMARY_DELIMITER = '\t';

  /** Underlying stream that receives the compressed envelopes. */
  public FSDataOutputStream out;
  /** Codec used to create the compression stream. */
  public CompressionCodec codec;
  /** Compression stream layered on top of <code>out</code>; reused for every envelope. */
  public CompressionOutputStream compressing;
  /** Stream that receives one summary line per finished envelope. */
  public FSDataOutputStream summary;
  /** Name written as the second field of every summary line. */
  public String partitionName;

  /** Summary key of the first record of the current envelope; null until the first write. */
  public Text startKey;
  /** Position in <code>out</code> at which the current envelope starts. */
  public long oldPos = 0;
  /** Number of records written to the current envelope so far. */
  public long count  = 0;
  /**
   * Number of records per envelope.  The envelope is rotated when
   * <code>count == limit</code>, so a limit that is not positive
   * never triggers a rotation.
   */
  public long limit  = 0;

  /**
   * Construct a ZipNumRecordWriter.
   *
   * @param codec         codec used to create the compression stream over <code>out</code>
   * @param out           stream receiving the compressed envelopes
   * @param summary       stream receiving the summary lines
   * @param partitionName name written into every summary line
   * @param limit         number of records per compression envelope
   * @throws IOException  if the compression stream cannot be created
   */
  public ZipNumRecordWriter( CompressionCodec codec, FSDataOutputStream out, FSDataOutputStream summary, String partitionName, long limit ) throws IOException
  {
    this.limit = limit;

    this.out = out;

    this.codec = codec;

    /* Create CompressionOutputStream once when starting */
    this.compressing  = codec.createOutputStream( out );

    this.summary = summary;
    this.partitionName = partitionName;
  }

  /**
   * Get the key that will be written in the summary file.
   * Currently this is the first 2 space-delimited cdx fields, the
   * url key and date.  If the line has fewer than two delimiters the
   * whole line is used and a warning is printed to stderr.  Tabs in
   * the key are replaced with <code>%09</code> because tab is the
   * field delimiter of the summary file.
   *
   * @param cdx the cdx line
   * @return the summary key, free of tab characters
   */
  protected String getCdxSummaryKey(String cdx)
  {
    // Locate the second DELIMITER; -1 if there are fewer than two.
    int spaceIndex = cdx.indexOf(DELIMITER);

    if (spaceIndex >= 0) {
      spaceIndex = cdx.indexOf(DELIMITER, spaceIndex + 1);
    }

    String summaryKey;

    if (spaceIndex >= 0) {
      summaryKey = cdx.substring(0, spaceIndex);
    } else {
      summaryKey = cdx;
      System.err.println("POSSIBLY INVALID CDX LINE: " + cdx);
    }

    // Ensure no tabs are present in the key
    return summaryKey.replace("\t", "%09");
  }

  /**
   * Write the key,value pair to the compressed output stream.  Once we write <code>limit</code>
   * records, finish the compression envelope and start another one; also write a summary line.
   *
   * @param key   record key; may be empty, in which case only the value is written
   * @param value record value; may be empty, in which case only the key is written
   * @throws IOException if writing to either stream fails
   */
  @Override
  public synchronized void write( Text key, Text value ) throws IOException
  {
    if ( count == 0 )
      {
        // First record of a new envelope: remember its summary key.
        // NOTE: It's important to create a *new* Text here.  The
        //       'key' passed-in is modified by the caller.  So if we
        //       just keep a reference to the 'key', then those
        //       modifications will also apply to our 'startKey'.
        String summaryKey = getCdxSummaryKey(key.getLength() > 0 ? key.toString() : value.toString());
        startKey = new Text( summaryKey );
      }

    // Write the output record to the compressing stream.
    // Text.getBytes() returns the backing array, which may be longer
    // than the content, hence the explicit getLength().
    if (value.getLength() == 0) {
        compressing.write( key.getBytes(), 0, key.getLength() );
    } else if (key.getLength() == 0) {
        compressing.write( value.getBytes(), 0, value.getLength() );
    } else {
        compressing.write( key.getBytes(), 0, key.getLength() );
        compressing.write( DELIMITER );
        compressing.write( value.getBytes(), 0, value.getLength() );
    }

    compressing.write( NEWLINE );

    count++;
    if ( count == limit )
      {
        // Flush and finish the current compression block/envelope
        // without closing the underlying stream.
        compressing.flush();
        compressing.finish();

        writeSummary();

        // Save the position and start the next compression envelope.
        oldPos = out.getPos();
        count  = 0;

        // Reset Compression stream to begin compressing again w/o resetting underlying stream
        compressing.resetState();
      }
  }

  /**
   * Finish the last compression envelope, write its summary line if
   * it holds any records, then close the compression stream, the
   * underlying output stream and the summary stream.  The streams
   * are closed even if finishing or summarizing fails.
   *
   * @param context the task attempt context (unused)
   * @throws IOException if finishing, summarizing or closing fails
   */
  @Override
  public synchronized void close(TaskAttemptContext context) throws IOException
  {
    try
      {
        // I'm paranoid about flushing :)
        compressing.flush();
        compressing.finish();

        // Only index the envelope if it actually holds records.  This
        // covers both "no records were ever written" and "the last
        // write() landed exactly on the limit", where the envelope was
        // already indexed and startKey still names that previous
        // envelope.
        // NOTE(review): in those cases the codec may still emit an
        // empty trailing envelope on finish(); it is simply not indexed.
        if ( count > 0 )
          {
            writeSummary();
          }

        // Flush while 'out' is still open; closing the compression
        // stream below also closes the underlying stream.
        out.flush();
      }
    finally
      {
        try
          {
            compressing.close();
          }
        finally
          {
            try
              {
                out.close();
              }
            finally
              {
                summary.close();
              }
          }
      }
  }

  /**
   * Convenience method to write out a summary line for the current
   * envelope: <code>startKey</code>, <code>partitionName</code>, the
   * envelope's start offset (<code>oldPos</code>) and its length
   * (current position of <code>out</code> minus <code>oldPos</code>),
   * tab-delimited and terminated by a newline.  Must only be called
   * after at least one record has been written, since it
   * dereferences <code>startKey</code>.
   *
   * @throws IOException if writing to the summary stream fails
   */
  public void writeSummary( ) throws IOException
  {
    summary.write( startKey.getBytes(), 0, startKey.getLength() );
    summary.write( SUMMARY_DELIMITER );
    summary.write( partitionName.getBytes("UTF-8") );
    summary.write( SUMMARY_DELIMITER );
    summary.write( Long.toString( oldPos ).getBytes("UTF-8") );
    summary.write( SUMMARY_DELIMITER );
    summary.write( Long.toString( out.getPos() - oldPos ).getBytes("UTF-8") );
    summary.write( NEWLINE );
    summary.flush();
  }
}