/* * Copyright (c) 2012 The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ package htsjdk.variant.bcf2; import htsjdk.tribble.TribbleException; import htsjdk.variant.utils.GeneralUtils; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; public final class BCF2Decoder { byte[] recordBytes = null; ByteArrayInputStream recordStream = null; public BCF2Decoder() { // nothing to do } /** * Create a new decoder ready to read BCF2 data from the byte[] recordBytes, for testing purposes * * @param recordBytes */ protected BCF2Decoder(final byte[] recordBytes) { setRecordBytes(recordBytes); } // ---------------------------------------------------------------------- // // Routines to load, set, skip blocks of underlying data we are decoding // // ---------------------------------------------------------------------- /** * Reads the next record from input stream and prepare this decoder to decode values from it * * @param stream * @return */ public void readNextBlock(final int blockSizeInBytes, final InputStream stream) { if ( blockSizeInBytes < 0 ) throw new TribbleException("Invalid block size " + blockSizeInBytes); setRecordBytes(readRecordBytes(blockSizeInBytes, stream)); } /** * Skips the next record from input stream, invalidating current block data * * @param stream * @return */ public void skipNextBlock(final int blockSizeInBytes, final InputStream stream) { try { final int bytesRead = (int)stream.skip(blockSizeInBytes); validateReadBytes(bytesRead, 1, blockSizeInBytes); } catch ( IOException e ) { throw new TribbleException("I/O error while reading BCF2 file", e); } this.recordBytes = null; this.recordStream = null; } /** * Returns the byte[] for the block of data we are currently decoding * @return */ public byte[] getRecordBytes() { return recordBytes; } /** * The size of the current block in bytes * * @return */ public int getBlockSize() { return recordBytes.length; } public boolean blockIsFullyDecoded() { return recordStream.available() == 0; } /** * Use the recordBytes[] to read BCF2 records from now on * * @param recordBytes */ public void setRecordBytes(final byte[] recordBytes) { this.recordBytes = recordBytes; this.recordStream = new ByteArrayInputStream(recordBytes); } // ---------------------------------------------------------------------- // // High-level decoder // // ---------------------------------------------------------------------- public final Object decodeTypedValue() throws IOException { final byte typeDescriptor = readTypeDescriptor(); return decodeTypedValue(typeDescriptor); } public final Object decodeTypedValue(final byte typeDescriptor) throws IOException { final int size = decodeNumberOfElements(typeDescriptor); return decodeTypedValue(typeDescriptor, size); } public final Object decodeTypedValue(final byte typeDescriptor, final int size) throws IOException { if ( size == 0 ) { // missing value => null in java return null; } else { final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); if ( type == BCF2Type.CHAR ) { // special case string decoding for efficiency return decodeLiteralString(size); } else if ( size == 1 ) { return decodeSingleValue(type); } else { final ArrayList<Object> ints = new ArrayList<Object>(size); for ( int i = 0; i < size; i++ ) { final Object val = decodeSingleValue(type); if ( val == null ) continue; // auto-pruning. We remove trailing nulls ints.add(val); } return ints.isEmpty() ? null : ints; // return null when all of the values are null } } } public final Object decodeSingleValue(final BCF2Type type) throws IOException { // TODO -- decodeTypedValue should integrate this routine final int value = decodeInt(type); if ( value == type.getMissingBytes() ) return null; else { switch (type) { case INT8: case INT16: case INT32: return value; case FLOAT: return rawFloatToFloat(value); case CHAR: return value & 0xFF; // TODO -- I cannot imagine why we'd get here, as string needs to be special cased default: throw new TribbleException("BCF2 codec doesn't know how to decode type " + type ); } } } // ---------------------------------------------------------------------- // // Decode raw primitive data types (ints, floats, and strings) // // ---------------------------------------------------------------------- private final Object decodeLiteralString(final int size) { assert size > 0; // TODO -- assumes size > 0 final byte[] bytes = new byte[size]; // TODO -- in principle should just grab bytes from underlying array try { recordStream.read(bytes); int goodLength = 0; for ( ; goodLength < bytes.length ; goodLength++ ) if ( bytes[goodLength] == 0 ) break; if ( goodLength == 0 ) return null; else { final String s = new String(bytes, 0, goodLength); return BCF2Utils.isCollapsedString(s) ? BCF2Utils.explodeStringList(s) : s; } } catch ( IOException e ) { throw new TribbleException("readByte failure", e); } } public final int decodeNumberOfElements(final byte typeDescriptor) throws IOException { if ( BCF2Utils.sizeIsOverflow(typeDescriptor) ) // -1 ensures we explode immediately with a bad size if the result is missing return decodeInt(readTypeDescriptor(), -1); else // the size is inline, so just decode it return BCF2Utils.decodeSize(typeDescriptor); } /** * Decode an int from the stream. If the value in the stream is missing, * returns missingValue. Requires the typeDescriptor indicate an inline * single element event * * @param typeDescriptor * @return */ public final int decodeInt(final byte typeDescriptor, final int missingValue) throws IOException { final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); final int i = decodeInt(type); return i == type.getMissingBytes() ? missingValue : i; } public final int decodeInt(final BCF2Type type) throws IOException { return type.read(recordStream); } /** * Low-level reader for int[] * * Requires a typeDescriptor so the function knows how many elements to read, * and how they are encoded. * * If size == 0 => result is null * If size > 0 => result depends on the actual values in the stream * -- If the first element read is MISSING, result is null (all values are missing) * -- Else result = int[N] where N is the first N non-missing values decoded * * @param maybeDest if not null we'll not allocate space for the vector, but instead use * the externally allocated array of ints to store values. If the * size of this vector is < the actual size of the elements, we'll be * forced to use freshly allocated arrays. Also note that padded * int elements are still forced to do a fresh allocation as well. * @return see description */ public final int[] decodeIntArray(final int size, final BCF2Type type, int[] maybeDest) throws IOException { if ( size == 0 ) { return null; } else { if ( maybeDest != null && maybeDest.length < size ) maybeDest = null; // by nulling this out we ensure that we do fresh allocations as maybeDest is too small final int val1 = decodeInt(type); if ( val1 == type.getMissingBytes() ) { // fast path for first element being missing for ( int i = 1; i < size; i++ ) decodeInt(type); return null; } else { // we know we will have at least 1 element, so making the int[] is worth it final int[] ints = maybeDest == null ? new int[size] : maybeDest; ints[0] = val1; // we already read the first one for ( int i = 1; i < size; i++ ) { ints[i] = decodeInt(type); if ( ints[i] == type.getMissingBytes() ) { // read the rest of the missing values, dropping them for ( int j = i + 1; j < size; j++ ) decodeInt(type); // deal with auto-pruning by returning an int[] containing // only the non-MISSING values. We do this by copying the first // i elements, as i itself is missing return Arrays.copyOf(ints, i); } } return ints; // all of the elements were non-MISSING } } } public final int[] decodeIntArray(final byte typeDescriptor, final int size) throws IOException { final BCF2Type type = BCF2Utils.decodeType(typeDescriptor); return decodeIntArray(size, type, null); } private double rawFloatToFloat(final int rawFloat) { return (double)Float.intBitsToFloat(rawFloat); } // ---------------------------------------------------------------------- // // Utility functions // // ---------------------------------------------------------------------- /** * Read the size of the next block from inputStream * * @param inputStream * @return */ public final int readBlockSize(final InputStream inputStream) throws IOException { return BCF2Type.INT32.read(inputStream); } /** * Read all bytes for a BCF record block into a byte[], and return it * * Is smart about reading from the stream multiple times to fill the buffer, if necessary * * @param blockSizeInBytes number of bytes to read * @param inputStream the stream to read from * @return a non-null byte[] containing exactly blockSizeInBytes bytes from the inputStream */ private static byte[] readRecordBytes(final int blockSizeInBytes, final InputStream inputStream) { assert blockSizeInBytes >= 0; final byte[] record = new byte[blockSizeInBytes]; try { int bytesRead = 0; int nReadAttempts = 0; // keep track of how many times we've read // because we might not read enough bytes from the file in a single go, do it in a loop until we get EOF while ( bytesRead < blockSizeInBytes ) { final int read1 = inputStream.read(record, bytesRead, blockSizeInBytes - bytesRead); if ( read1 == -1 ) validateReadBytes(bytesRead, nReadAttempts, blockSizeInBytes); else bytesRead += read1; } if ( GeneralUtils.DEBUG_MODE_ENABLED && nReadAttempts > 1 ) { // TODO -- remove me System.err.println("Required multiple read attempts to actually get the entire BCF2 block, unexpected behavior"); } validateReadBytes(bytesRead, nReadAttempts, blockSizeInBytes); } catch ( IOException e ) { throw new TribbleException("I/O error while reading BCF2 file", e); } return record; } /** * Make sure we read the right number of bytes, or throw an error * * @param actuallyRead * @param nReadAttempts * @param expected */ private static void validateReadBytes(final int actuallyRead, final int nReadAttempts, final int expected) { assert expected >= 0; if ( actuallyRead < expected ) { throw new TribbleException( String.format("Failed to read next complete record: expected %d bytes but read only %d after %d iterations", expected, actuallyRead, nReadAttempts)); } } public final byte readTypeDescriptor() throws IOException { return BCF2Utils.readByte(recordStream); } }