package org.apache.hadoop.io.simpleseekableformat;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.util.concurrent.ConcurrentHashMap;
import java.util.zip.CRC32;

import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.Compressor;

/**
 * This class holds the data related to a single data segment.
 */
class DataSegmentWriter {

  // Empty string means no compression.
  private final String codecName;
  private final byte[] codecNameUTF8;
  // Either the uncompressed data or the compressed data, whichever is stored.
  private final SimpleSeekableFormat.Buffer storedData;
  // CRC32 value of the stored data (computed only when uncompressed).
  private final long crc32Value;

  /**
   * Creates a new data segment from uncompressed data and a codec.
   * This is called by the writer.
   * @param uncompressedData the raw data for this segment
   * @param codec the codec to try; null means no compression
   * @param compressor a Compressor to reuse; may be null
   */
  DataSegmentWriter(SimpleSeekableFormat.Buffer uncompressedData,
      CompressionCodec codec, Compressor compressor) throws IOException {
    // Try to compress.
    if (codec != null) {
      SimpleSeekableFormat.Buffer compressedData = new SimpleSeekableFormat.Buffer();
      OutputStream out;
      if (compressor == null) {
        compressor = codec.createCompressor();
      } else {
        compressor.reset();
      }
      out = codec.createOutputStream(compressedData, compressor);
      out.write(uncompressedData.getData(), 0, uncompressedData.getLength());
      out.close();
      // Store the uncompressed data if compression does not save space.
      // A compressed segment carries the codec class name; an uncompressed
      // segment carries an empty codec name plus an 8-byte CRC32, hence the
      // two sides of this comparison.
      if (compressedData.getLength() + codec.getClass().getName().length()
          < uncompressedData.getLength() + 8) {
        codecName = codec.getClass().getName();
        storedData = compressedData;
      } else {
        codecName = "";
        storedData = uncompressedData;
      }
    } else {
      // No compression.
      codecName = "";
      storedData = uncompressedData;
    }
    codecNameUTF8 = getCodecNameUTF8(codecName);

    // Calculate CRC32 only when there is no compression.
    if (codecName.length() == 0) {
      CRC32 crc32 = new CRC32();
      crc32.update(storedData.getData(), 0, storedData.getLength());
      crc32Value = crc32.getValue();
    } else {
      crc32Value = 0;
    }
  }

  /**
   * Returns the size of this data segment after (potential) compression.
   * This includes the length field itself.
   */
  int size() {
    return 4 /* length field */
        + 2 /* UTF-8 codec name length */
        + codecNameUTF8.length
        + (codecNameUTF8.length == 0 ? 8 : 0) /* optional CRC32 */
        + storedData.getLength();
  }

  /**
   * Writes this data segment to an OutputStream.
   */
  void writeTo(DataOutputStream out) throws IOException {
    // We do the UTF-8 conversion ourselves instead of relying on DataOutput,
    // both to strictly follow the UTF-8 standard and for better performance.
    // It also spares us the code to count the UTF-8 bytes separately (we need
    // that count to calculate the total length).
    int length = size() - 4;
    out.writeInt(length);
    out.writeShort(codecNameUTF8.length);
    out.write(codecNameUTF8);
    if (codecNameUTF8.length == 0) {
      out.writeLong(crc32Value);
    }
    out.write(storedData.getData(), 0, storedData.getLength());
  }

  /**
   * Cache from codec name to its UTF-8 bytes, shared across segments.
   */
  static final ConcurrentHashMap<String, byte[]> CODEC_NAME_CACHE =
      new ConcurrentHashMap<String, byte[]>();

  /**
   * Converts a String to a UTF-8 byte array, caching the result.
   */
  static byte[] getCodecNameUTF8(String compressionCodecName) {
    byte[] codecNameBytes = CODEC_NAME_CACHE.get(compressionCodecName);
    if (codecNameBytes == null) {
      try {
        codecNameBytes = compressionCodecName.getBytes("UTF-8");
      } catch (UnsupportedEncodingException e) {
        throw new RuntimeException(e);
      }
      CODEC_NAME_CACHE.put(compressionCodecName, codecNameBytes);
    }
    return codecNameBytes;
  }
}
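
// A minimal usage sketch, not part of the original class. It assumes
// SimpleSeekableFormat.Buffer exposes an OutputStream-style write(), which is
// consistent with the getData()/getLength() calls above, and that "payload",
// "codec", and "dataOut" are supplied by the caller:
//
//   SimpleSeekableFormat.Buffer buffer = new SimpleSeekableFormat.Buffer();
//   buffer.write(payload, 0, payload.length);
//   DataSegmentWriter segment = new DataSegmentWriter(buffer, codec, null);
//   segment.writeTo(dataOut);
//
// Per writeTo() above, the bytes emitted for one segment are:
//   int32  length of everything after this field
//   int16  UTF-8 byte length of the codec class name (0 means uncompressed)
//   bytes  codec class name in UTF-8
//   int64  CRC32 of the stored data (present only when uncompressed)
//   bytes  the stored (possibly compressed) data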