package org.apache.hadoop.io.simpleseekableformat;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.Compressor;
import org.apache.hadoop.io.simpleseekableformat.SimpleSeekableFormat.MetaData;
import org.apache.hadoop.io.simpleseekableformat.SimpleSeekableFormat.OffsetPair;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * Writes data in the Simple Seekable Format.
 * Data from a single write will be in a single data segment.
 *
 * See {@link SimpleSeekableFormat}
 */
public class SimpleSeekableFormatOutputStream extends CompressionOutputStream
    implements Configurable {

  /**
   * This is a hint. The actual max can go beyond this number if a lot of
   * data is sent via a single write. A single write will always be in the
   * same data segment.
   */
  private static final int DEFAULT_MAX_UNCOMPRESSED_SEGMENT_LENGTH = 1024 * 1024;

  /**
   * dataSegmentOut is a wrapper stream that automatically inserts
   * MetaDataBlocks while writing out data segments.
   */
  final InterleavedOutputStream dataSegmentOut;

  /**
   * dataSegmentDataOut is a DataOutputStream wrapping dataSegmentOut.
   */
  private final DataOutputStream dataSegmentDataOut;

  private final MetaData metadata;

  private Configuration conf;
  private Class<? extends CompressionCodec> codecClass;
  private CompressionCodec codec;
  private Compressor codecCompressor;
  private int thresholdUncompressedSegmentLength;

  private final SimpleSeekableFormat.Buffer currentDataSegmentBuffer =
      new SimpleSeekableFormat.Buffer();

  public SimpleSeekableFormatOutputStream(OutputStream out) {
    this(new DataOutputStream(out));
  }

  /**
   * A DataOutputStream allows easy writing of integers, strings, etc.
   */
  protected SimpleSeekableFormatOutputStream(DataOutputStream out) {
    // We don't use the inherited field "out" at all.
    super(null);
    metadata = new MetaData();
    // Start with a single offset pair at (0, 0): no uncompressed or
    // compressed bytes have been written yet.
    SortedMap<Long, Long> offsetPairs = new TreeMap<Long, Long>();
    offsetPairs.put(0L, 0L);
    metadata.setOffsetPairs(offsetPairs);
    this.dataSegmentOut = new InterleavedOutputStream(out,
        SimpleSeekableFormat.METADATA_BLOCK_LENGTH,
        SimpleSeekableFormat.DATA_BLOCK_LENGTH,
        new SimpleSeekableFormat.MetaDataProducer(metadata));
    this.dataSegmentDataOut = new DataOutputStream(dataSegmentOut);
  }

  @Override
  public Configuration getConf() {
    return conf;
  }

  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
    // Set the codec
    codecClass = conf.getClass(SimpleSeekableFormat.FILEFORMAT_SSF_CODEC_CONF,
        null, CompressionCodec.class);
    if (codecClass == null) {
      codec = null;
    } else {
      codec = ReflectionUtils.newInstance(codecClass, conf);
      codecCompressor = codec.createCompressor();
    }
    // Set the uncompressed segment length threshold
    thresholdUncompressedSegmentLength = conf.getInt(
        SimpleSeekableFormat.FILEFORMAT_SSF_MAX_UNCOMPRESSED_SEGMENT_LENGTH,
        DEFAULT_MAX_UNCOMPRESSED_SEGMENT_LENGTH);
  }
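  /*
   * Usage sketch (illustrative only; GzipCodec, recordBytes, and the file
   * name are assumptions for the example, not part of this class):
   *
   *   Configuration conf = new Configuration();
   *   conf.setClass(SimpleSeekableFormat.FILEFORMAT_SSF_CODEC_CONF,
   *       GzipCodec.class, CompressionCodec.class);
   *   SimpleSeekableFormatOutputStream out =
   *       new SimpleSeekableFormatOutputStream(new FileOutputStream("data.ssf"));
   *   out.setConf(conf);       // picks up the codec and segment threshold
   *   out.write(recordBytes);  // the whole write stays in one data segment
   *   out.close();             // flushes the final segment
   */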
  @Override
  public void write(int b) throws IOException {
    currentDataSegmentBuffer.write(b);
    flushIfNeeded();
  }

  /**
   * This method makes sure the whole buffer is written into the same data
   * segment.
   */
  @Override
  public void write(byte[] b, int start, int length) throws IOException {
    currentDataSegmentBuffer.write(b, start, length);
    flushIfNeeded();
  }

  @Override
  public void close() throws IOException {
    if (currentDataSegmentBuffer.size() > 0) {
      flush();
    }
    dataSegmentDataOut.close();
  }

  private void flushIfNeeded() throws IOException {
    if (currentDataSegmentBuffer.size() >= thresholdUncompressedSegmentLength) {
      flush();
    }
  }

  private void updateMetadata(long uncompressedSegmentSize,
      long compressedSegmentSize) {
    // The map always holds exactly one pair: the total uncompressed bytes
    // written so far, mapped to the total compressed bytes written so far.
    SortedMap<Long, Long> offsetPairs = metadata.getOffsetPairs();
    long lastUncompressedOffset = offsetPairs.firstKey();
    long lastCompressedOffset = offsetPairs.get(lastUncompressedOffset);
    // Advance both offsets past the segment that is about to be written.
    long uncompressedOffset = lastUncompressedOffset + uncompressedSegmentSize;
    long compressedOffset = lastCompressedOffset + compressedSegmentSize;
    offsetPairs.clear();
    offsetPairs.put(uncompressedOffset, compressedOffset);
  }

  /**
   * Takes the current data segment, optionally compresses it, calculates the
   * CRC32, and then writes it out.
   *
   * The method advances the stored offsets to the end of the file before it
   * starts writing. That means the offsets in the MetaDataBlock will be after
   * the end of the current data segment.
   */
  @Override
  public void flush() throws IOException {
    // Do not do anything if no data has been written
    if (currentDataSegmentBuffer.size() == 0) {
      return;
    }
    // Create the current DataSegment
    DataSegmentWriter currentDataSegment =
        new DataSegmentWriter(currentDataSegmentBuffer, codec, codecCompressor);
    // Update the metadata
    updateMetadata(currentDataSegmentBuffer.size(), currentDataSegment.size());
    // Write out the DataSegment
    currentDataSegment.writeTo(dataSegmentDataOut);
    // Clear out the current buffer. Note that this has to be done after
    // currentDataSegment.writeTo(...), because currentDataSegment can
    // keep a reference to the currentDataSegmentBuffer.
    currentDataSegmentBuffer.reset();
    // Flush out the underlying stream
    dataSegmentDataOut.flush();
  }

  @Override
  public void finish() throws IOException {
    // We don't need to do anything for finish().
  }

  @Override
  public void resetState() throws IOException {
    // We don't need to do anything for resetState().
  }
}
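/*
 * Offset bookkeeping sketch (hypothetical sizes): suppose flush() runs twice,
 * first for a segment of 100 uncompressed bytes whose serialized DataSegment
 * occupies 40 bytes, then for one of 50 uncompressed bytes occupying 25
 * bytes. The single entry kept in metadata.getOffsetPairs() evolves as
 *
 *   at construction:   (0, 0)
 *   after 1st flush:   (100, 40)
 *   after 2nd flush:   (150, 65)
 *
 * It always maps total uncompressed bytes to total compressed bytes written
 * so far, which is what the next MetaDataBlock records for seeking.
 */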