/*
 * The MIT License
 *
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package htsjdk.samtools.util;

import htsjdk.samtools.util.zip.DeflaterFactory;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.zip.CRC32;
import java.util.zip.Deflater;

/**
 * Writer for a file that is a series of gzip blocks (BGZF format). The caller just treats it as an
 * OutputStream, and under the covers a gzip block is written whenever the number of uncompressed,
 * as-yet-unwritten bytes reaches a threshold.
 *
 * The advantage of BGZF over conventional gzip is that BGZF allows for seeking without having to scan
 * through the entire file up to the position being sought.
 *
 * Note that the flush() method should not be called by clients unless they know what they are doing,
 * because it forces a gzip block to be written even if the number of buffered bytes has not reached
 * the threshold. close(), on the other hand, must be called when done writing in order to force the
 * last gzip block to be written.
 *
 * See http://samtools.sourceforge.net/SAM1.pdf for details of the BGZF file format.
 */
public class BlockCompressedOutputStream extends OutputStream implements LocationAware {

    private static int defaultCompressionLevel = BlockCompressedStreamConstants.DEFAULT_COMPRESSION_LEVEL;

    /**
     * Sets the GZip compression level for subsequently created BlockCompressedOutputStream objects
     * that do not specify a compression level.
     * @param compressionLevel 0 <= compressionLevel <= 9 (0, i.e. Deflater.NO_COMPRESSION, is accepted)
     */
    public static void setDefaultCompressionLevel(final int compressionLevel) {
        if (compressionLevel < Deflater.NO_COMPRESSION || compressionLevel > Deflater.BEST_COMPRESSION) {
            throw new IllegalArgumentException("Invalid compression level: " + compressionLevel);
        }
        defaultCompressionLevel = compressionLevel;
    }

    public static int getDefaultCompressionLevel() {
        return defaultCompressionLevel;
    }
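
    // Usage sketch (illustrative only, not part of this class). The default level set via
    // setDefaultCompressionLevel applies to any stream constructed afterwards without an explicit
    // level; the path "example.bgzf" is a hypothetical placeholder.
    //
    //   BlockCompressedOutputStream.setDefaultCompressionLevel(Deflater.BEST_SPEED);
    //   try (final BlockCompressedOutputStream out = new BlockCompressedOutputStream("example.bgzf")) {
    //       out.write("hello world".getBytes());
    //   } // close() flushes the final gzip block and appends the BGZF terminator block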

    private final BinaryCodec codec;
    private final byte[] uncompressedBuffer =
            new byte[BlockCompressedStreamConstants.DEFAULT_UNCOMPRESSED_BLOCK_SIZE];
    private int numUncompressedBytes = 0;
    private final byte[] compressedBuffer =
            new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE -
                    BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH];
    private final Deflater deflater;

    // A second deflater is created for the very unlikely case where the regular deflation actually makes
    // things bigger, and the compressed block is too big. It should be possible to downshift the
    // primary deflater to NO_COMPRESSION level, recompress, and then restore it to its original setting,
    // but in practice that doesn't work.
    // The motivation for deflating at NO_COMPRESSION level is that it will predictably produce compressed
    // output that is 10 bytes larger than the input, and the threshold at which a block is generated is
    // such that the size of the final gzip block will always be <= 64K. This is preferred over the
    // previous method, which would attempt to compress up to 64K bytes, and if the resulting compressed
    // block was too large, try compressing fewer input bytes (aka "downshifting"). The problem with
    // downshifting is that getFilePointer might return an inaccurate value.
    // I assume (AW 29-Oct-2013) that there is no value in using a hardware-assisted deflater for
    // no-compression mode, so just use the JDK standard one.
    private final Deflater noCompressionDeflater = new Deflater(Deflater.NO_COMPRESSION, true);

    private final CRC32 crc32 = new CRC32();
    private File file = null;
    private long mBlockAddress = 0;

    // Really a local variable, but allocate once to reduce GC burden.
    private final byte[] singleByteArray = new byte[1];

    /**
     * Uses default compression level, which is 5 unless changed by setDefaultCompressionLevel
     */
    public BlockCompressedOutputStream(final String filename) {
        this(filename, defaultCompressionLevel);
    }

    /**
     * Uses default compression level, which is 5 unless changed by setDefaultCompressionLevel
     */
    public BlockCompressedOutputStream(final File file) {
        this(file, defaultCompressionLevel);
    }

    /**
     * Prepare to compress at the given compression level
     * @param compressionLevel 1 <= compressionLevel <= 9
     */
    public BlockCompressedOutputStream(final String filename, final int compressionLevel) {
        this(new File(filename), compressionLevel);
    }

    /**
     * Prepare to compress at the given compression level
     * @param compressionLevel 1 <= compressionLevel <= 9
     */
    public BlockCompressedOutputStream(final File file, final int compressionLevel) {
        this.file = file;
        codec = new BinaryCodec(file, true);
        deflater = DeflaterFactory.makeDeflater(compressionLevel, true);
    }

    /**
     * Constructors that take output streams
     * file may be null
     */
    public BlockCompressedOutputStream(final OutputStream os, final File file) {
        this(os, file, defaultCompressionLevel);
    }

    public BlockCompressedOutputStream(final OutputStream os, final File file, final int compressionLevel) {
        this.file = file;
        codec = new BinaryCodec(os);
        if (file != null) {
            codec.setOutputFileName(file.getAbsolutePath());
        }
        deflater = DeflaterFactory.makeDeflater(compressionLevel, true);
    }

    /**
     * @param location May be null. Used for error messages, and for checking file termination.
     * @param output May or may not already be a BlockCompressedOutputStream.
     * @return A BlockCompressedOutputStream, either by wrapping the given OutputStream, or by casting
     *         if it already is a BCOS.
     */
    public static BlockCompressedOutputStream maybeBgzfWrapOutputStream(final File location, OutputStream output) {
        if (!(output instanceof BlockCompressedOutputStream)) {
            return new BlockCompressedOutputStream(output, location);
        } else {
            return (BlockCompressedOutputStream) output;
        }
    }
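
    // Illustrative call of the wrapper above (a sketch; outFile and rawOut are hypothetical
    // placeholders, and FileOutputStream would need to be imported by the caller):
    //
    //   final File outFile = new File("out.bgzf");
    //   final OutputStream rawOut = new FileOutputStream(outFile);
    //   final BlockCompressedOutputStream bcos =
    //           BlockCompressedOutputStream.maybeBgzfWrapOutputStream(outFile, rawOut);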

    /**
     * Writes b.length bytes from the specified byte array to this output stream. The general contract
     * for write(b) is that it should have exactly the same effect as the call write(b, 0, b.length).
     * @param bytes the data
     */
    @Override
    public void write(final byte[] bytes) throws IOException {
        write(bytes, 0, bytes.length);
    }

    /**
     * Writes len bytes from the specified byte array starting at offset off to this output stream. The
     * general contract for write(b, off, len) is that some of the bytes in the array b are written to
     * the output stream in order; element b[off] is the first byte written and b[off+len-1] is the last
     * byte written by this operation.
     *
     * @param bytes the data
     * @param startIndex the start offset in the data
     * @param numBytes the number of bytes to write
     */
    @Override
    public void write(final byte[] bytes, int startIndex, int numBytes) throws IOException {
        assert(numUncompressedBytes < uncompressedBuffer.length);
        while (numBytes > 0) {
            final int bytesToWrite = Math.min(uncompressedBuffer.length - numUncompressedBytes, numBytes);
            System.arraycopy(bytes, startIndex, uncompressedBuffer, numUncompressedBytes, bytesToWrite);
            numUncompressedBytes += bytesToWrite;
            startIndex += bytesToWrite;
            numBytes -= bytesToWrite;
            assert(numBytes >= 0);
            if (numUncompressedBytes == uncompressedBuffer.length) {
                deflateBlock();
            }
        }
    }

    /**
     * WARNING: flush() affects the output format, because it causes the current contents of
     * uncompressedBuffer to be compressed and written, even if it isn't full. Unless you know what
     * you're doing, don't call flush(). Instead, call close(), which will flush any unwritten data
     * before closing the underlying stream.
     */
    @Override
    public void flush() throws IOException {
        while (numUncompressedBytes > 0) {
            deflateBlock();
        }
        codec.getOutputStream().flush();
    }

    /**
     * close() must be called in order to flush any remaining buffered bytes. An unclosed file will
     * likely be defective.
     */
    @Override
    public void close() throws IOException {
        flush();
        // For debugging...
        // if (numberOfThrottleBacks > 0) {
        //     System.err.println("In BlockCompressedOutputStream, had to throttle back " +
        //             numberOfThrottleBacks + " times for file " + codec.getOutputFileName());
        // }
        codec.writeBytes(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
        codec.close();
        // Can't re-open something that is not a regular file, e.g. a named pipe or an output stream
        if (this.file == null || !this.file.isFile()) return;
        if (BlockCompressedInputStream.checkTermination(this.file) !=
                BlockCompressedInputStream.FileTermination.HAS_TERMINATOR_BLOCK) {
            throw new IOException("Terminator block not found after closing BGZF file " + this.file);
        }
    }

    /**
     * Writes the specified byte to this output stream. The general contract for write is that one byte
     * is written to the output stream. The byte to be written is the eight low-order bits of the
     * argument b. The 24 high-order bits of b are ignored.
     * @param bite the byte to write (only the low-order 8 bits of this int are used)
     * @throws IOException
     */
    public void write(final int bite) throws IOException {
        singleByteArray[0] = (byte) bite;
        write(singleByteArray);
    }

    /**
     * Encodes the current position as a virtual file pointer.
     * The upper 48 bits are the byte offset of the current block in the compressed stream.
     * The lower 16 bits are the byte offset into the uncompressed data inside the block.
     */
    public long getFilePointer() {
        return BlockCompressedFilePointerUtil.makeFilePointer(mBlockAddress, numUncompressedBytes);
    }

    @Override
    public long getPosition() {
        return getFilePointer();
    }
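
    // Worked example of the virtual file pointer layout described above. The shift/mask arithmetic
    // is a sketch of what BlockCompressedFilePointerUtil implements:
    //
    //   virtualFilePointer = (mBlockAddress << 16) | numUncompressedBytes
    //
    // e.g. a block starting at compressed offset 1_000_000 with 100 uncompressed bytes buffered
    // gives (1_000_000L << 16) | 100 == 65_536_000_100. Decoding reverses it:
    //   blockAddress = virtualFilePointer >> 16; blockOffset = (int) (virtualFilePointer & 0xffff);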

    /**
     * Attempt to write the data in uncompressedBuffer to the underlying stream as a gzip block.
     * If the data does not compress to within the maximum allowed block size, it is recompressed
     * at NO_COMPRESSION level, which predictably fits (see the comment on noCompressionDeflater above).
     * @return size of gzip block that was written.
     */
    private int deflateBlock() {
        if (numUncompressedBytes == 0) {
            return 0;
        }
        final int bytesToCompress = numUncompressedBytes;
        // Compress the input
        deflater.reset();
        deflater.setInput(uncompressedBuffer, 0, bytesToCompress);
        deflater.finish();
        int compressedSize = deflater.deflate(compressedBuffer, 0, compressedBuffer.length);

        // If it didn't all fit in compressedBuffer.length, set compression level to NO_COMPRESSION
        // and try again. This should always fit.
        if (!deflater.finished()) {
            noCompressionDeflater.reset();
            noCompressionDeflater.setInput(uncompressedBuffer, 0, bytesToCompress);
            noCompressionDeflater.finish();
            compressedSize = noCompressionDeflater.deflate(compressedBuffer, 0, compressedBuffer.length);
            if (!noCompressionDeflater.finished()) {
                throw new IllegalStateException(
                        "NO_COMPRESSION deflate overflowed the compressed buffer; this should be impossible");
            }
        }

        // Data compressed small enough, so write it out.
        crc32.reset();
        crc32.update(uncompressedBuffer, 0, bytesToCompress);

        final int totalBlockSize = writeGzipBlock(compressedSize, bytesToCompress, crc32.getValue());
        assert(bytesToCompress <= numUncompressedBytes);

        // Remove the written data from uncompressedBuffer. (Currently bytesToCompress always equals
        // numUncompressedBytes, so the else branch below is defensive.)
        if (bytesToCompress == numUncompressedBytes) {
            numUncompressedBytes = 0;
        } else {
            System.arraycopy(uncompressedBuffer, bytesToCompress, uncompressedBuffer, 0,
                    numUncompressedBytes - bytesToCompress);
            numUncompressedBytes -= bytesToCompress;
        }
        mBlockAddress += totalBlockSize;
        return totalBlockSize;
    }

    /**
     * Writes the entire gzip block, assuming the compressed data is stored in compressedBuffer.
     * @return size of gzip block that was written.
     */
    private int writeGzipBlock(final int compressedSize, final int uncompressedSize, final long crc) {
        // Init gzip header
        codec.writeByte(BlockCompressedStreamConstants.GZIP_ID1);
        codec.writeByte(BlockCompressedStreamConstants.GZIP_ID2);
        codec.writeByte(BlockCompressedStreamConstants.GZIP_CM_DEFLATE);
        codec.writeByte(BlockCompressedStreamConstants.GZIP_FLG);
        codec.writeInt(0); // Modification time
        codec.writeByte(BlockCompressedStreamConstants.GZIP_XFL);
        codec.writeByte(BlockCompressedStreamConstants.GZIP_OS_UNKNOWN);
        codec.writeShort(BlockCompressedStreamConstants.GZIP_XLEN);
        codec.writeByte(BlockCompressedStreamConstants.BGZF_ID1);
        codec.writeByte(BlockCompressedStreamConstants.BGZF_ID2);
        codec.writeShort(BlockCompressedStreamConstants.BGZF_LEN);
        final int totalBlockSize = compressedSize + BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH +
                BlockCompressedStreamConstants.BLOCK_FOOTER_LENGTH;

        // The spec defines BSIZE as the total block size minus 1, so that a full 64K block still
        // fits in this unsigned 16-bit field.
        codec.writeShort((short) (totalBlockSize - 1));
        codec.writeBytes(compressedBuffer, 0, compressedSize);
        codec.writeInt((int) crc);
        codec.writeInt(uncompressedSize);
        return totalBlockSize;
    }
}
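
/*
 * On-disk layout of each BGZF block produced by writeGzipBlock() above, following the SAM/BGZF
 * specification (http://samtools.sourceforge.net/SAM1.pdf). The byte values in parentheses are
 * those the spec mandates for the corresponding constants:
 *
 *   offset  size  field
 *   0       1     ID1   (31, gzip magic)
 *   1       1     ID2   (139, gzip magic)
 *   2       1     CM    (8, deflate)
 *   3       1     FLG   (4, FEXTRA set)
 *   4       4     MTIME (0)
 *   8       1     XFL
 *   9       1     OS
 *   10      2     XLEN  (6)
 *   12      1     SI1   ('B')
 *   13      1     SI2   ('C')
 *   14      2     SLEN  (2)
 *   16      2     BSIZE (total block size - 1)
 *   18      n     CDATA (deflate-compressed data)
 *   18+n    4     CRC32 (of the uncompressed data)
 *   22+n    4     ISIZE (uncompressed size)
 */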