/* This file is part of VoltDB.
 * Copyright (C) 2008-2017 VoltDB Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with VoltDB. If not, see <http://www.gnu.org/licenses/>.
 */
package org.voltdb.sysprocs.saverestore;

import java.io.EOFException;
import java.io.FileDescriptor;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.BufferOverflowException;
import java.nio.BufferUnderflowException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.Semaphore;
import java.util.zip.Checksum;

import org.apache.hadoop_voltpatches.util.PureJavaCrc32;
import org.apache.hadoop_voltpatches.util.PureJavaCrc32C;
import org.json_voltpatches.JSONArray;
import org.json_voltpatches.JSONException;
import org.json_voltpatches.JSONObject;
import org.voltcore.TransactionIdManager;
import org.voltcore.logging.VoltLogger;
import org.voltcore.utils.Bits;
import org.voltcore.utils.DBBPool;
import org.voltcore.utils.DBBPool.BBContainer;
import org.voltdb.EELibraryLoader;
import org.voltdb.messaging.FastDeserializer;
import org.voltdb.utils.CompressionService;
import org.voltdb.utils.PosixAdvise;

/**
 * An abstraction around a table's save file for restore. Deserializes the
 * metadata that was stored when the table was saved and makes it available
 * to clients. The metadata is stored as a length-prefixed JSON blob with a CRC,
 * as well as a byte that is set once the file is completely written and synced.
 * A VoltTable header describing the schema follows the JSON blob.
 */
public class TableSaveFile {

    public static enum ChecksumType {
        CRC32,
        CRC32C
    }

    public class Container extends BBContainer {
        public final int partitionId;
        private final BBContainer m_origin;
        private boolean discarded = false;

        Container(ByteBuffer b, BBContainer origin, int partitionId) {
            super(b);
            m_origin = origin;
            this.partitionId = partitionId;
        }

        @Override
        public void discard() {
            checkDoubleFree();
            discarded = true;
            if (m_hasMoreChunks == false) {
                m_origin.discard();
            } else {
                m_buffers.add(m_origin);
            }
        }
    }
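    /*
     * On-disk layout as interpreted by the constructor below. This is a sketch derived
     * from the reading code, not a separate format specification:
     *
     *   [4 bytes]  CRC32 covering everything from here through the end of the schema header
     *              (both length prefixes, the header blob, and the schema header bytes)
     *   [4 bytes]  length of the header blob
     *   [length]   header blob: 1 completion byte, 4 version ints, then either the legacy
     *              pre-1.3 fields (version[3] == 0) or a length-prefixed JSON object
     *   [4 bytes]  length of the VoltTable schema header
     *   [length]   VoltTable schema header (cached in m_tableHeader)
     *   [...]      table chunks, in the version 1 or version 2 chunk format read by ChunkReader
     */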
    /**
     * It is actually possible to make a bigger chunk than this if the table header is
     * big enough...
     */
    private static final int DEFAULT_CHUNKSIZE =
            org.voltdb.SnapshotSiteProcessor.m_snapshotBufferLength + (1024 * 256);

    public TableSaveFile(
            FileInputStream fis,
            int readAheadChunks,
            Integer[] relevantPartitionIds) throws IOException {
        this(fis, readAheadChunks, relevantPartitionIds, false);
    }

    // XXX maybe consider an IOException subclass at some point
    public TableSaveFile(
            FileInputStream fis,
            int readAheadChunks,
            Integer[] relevantPartitionIds,
            boolean continueOnCorruptedChunk) throws IOException {
        m_fd = fis.getFD();
        FileChannel dataIn = fis.getChannel();
        try {
            EELibraryLoader.loadExecutionEngineLibrary(true);
            if (relevantPartitionIds == null) {
                m_relevantPartitionIds = null;
            } else {
                m_relevantPartitionIds = new HashSet<Integer>();
                for (Integer i : relevantPartitionIds) {
                    m_relevantPartitionIds.add(i);
                }
            }
            m_chunkReads = new Semaphore(readAheadChunks);
            m_saveFile = dataIn;
            m_continueOnCorruptedChunk = continueOnCorruptedChunk;

            final PureJavaCrc32 crc = new PureJavaCrc32();
            /*
             * Second CRC, used to recognize the case where the CRC check fails
             * only because the file was never marked completed.
             */
            final PureJavaCrc32 secondCRC = new PureJavaCrc32();

            /*
             * Get the header with the save/restore specific information.
             */
            final ByteBuffer lengthBuffer = ByteBuffer.allocate(8);
            while (lengthBuffer.hasRemaining()) {
                final int read = m_saveFile.read(lengthBuffer);
                if (read == -1) { throw new EOFException(); }
            }
            lengthBuffer.flip();
            final int originalCRC = lengthBuffer.getInt();
            int length = lengthBuffer.getInt();
            crc.update(lengthBuffer.array(), 4, 4);
            secondCRC.update(lengthBuffer.array(), 4, 4);

            if (length < 0) {
                throw new IOException("Corrupted save file has negative header length");
            }
            if (length > 2097152) {
                throw new IOException("Corrupted save file has unreasonable header length > 2 megs");
            }

            final ByteBuffer saveRestoreHeader = ByteBuffer.allocate(length);
            while (saveRestoreHeader.hasRemaining()) {
                final int read = m_saveFile.read(saveRestoreHeader);
                if (read == -1 || read < length) { throw new EOFException(); }
            }
            saveRestoreHeader.flip();
            crc.update(saveRestoreHeader.array());
            secondCRC.update(new byte[] { 1 });
            secondCRC.update(saveRestoreHeader.array(), 1, saveRestoreHeader.array().length - 1);
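            /*
             * Note on the two CRCs (a reading of the comparison further below; the writer
             * side is not in this file, so the rationale is inferred): "crc" hashes the
             * header exactly as it appears on disk, while "secondCRC" hashes the same bytes
             * with the completion byte (the first byte of the header blob) forced to 1.
             * If the stored CRC matches secondCRC but not crc, the header is presumed intact
             * and the snapshot simply was never marked complete, so the file is treated as an
             * incomplete snapshot rather than a corrupt one.
             */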
            /*
             * Get the template for the VoltTable serialization header.
             * It will have an extra length value prepended to it so that
             * it can be sucked straight into a buffer. This will not
             * contain a row count since that varies from chunk to chunk
             * and is supplied by the chunk.
             */
            lengthBuffer.clear();
            lengthBuffer.limit(4);
            /*
             * Why a bare block and no while loop? Because the compiler complains
             * about an unrelated final elsewhere if a loop is used here.
             */
            {
                final int read = m_saveFile.read(lengthBuffer);
                if (read == -1) { throw new EOFException(); }
            }
            crc.update(lengthBuffer.array(), 0, 4);
            secondCRC.update(lengthBuffer.array(), 0, 4);
            lengthBuffer.flip();
            length = lengthBuffer.getInt();

            if (length < 4) {
                throw new IOException(
                        "Corrupted save file has negative length or too small length for VoltTable header");
            }
            if (length > 2097152) {
                throw new IOException(
                        "Corrupted save file has unreasonable VoltTable header length > 2 megs");
            }

            m_tableHeader = ByteBuffer.allocate(length + 4);
            m_tableHeader.putInt(length);
            while (m_tableHeader.hasRemaining()) {
                final int read = m_saveFile.read(m_tableHeader);
                if (read == -1) { throw new EOFException(); }
            }
            crc.update(m_tableHeader.array(), 4, length);
            secondCRC.update(m_tableHeader.array(), 4, length);

            boolean failedCRCDueToNotCompleted = false;
            final int actualCRC = (int)crc.getValue();
            if (originalCRC != actualCRC) {
                /*
                 * Check if the CRC mismatch is due to the snapshot not being completed.
                 */
                final int secondCRCValue = (int)secondCRC.getValue();
                if (secondCRCValue == originalCRC) {
                    failedCRCDueToNotCompleted = true;
                } else {
                    throw new IOException("Checksum mismatch");
                }
            }

            FastDeserializer fd = new FastDeserializer(saveRestoreHeader);
            byte completedByte = fd.readByte();
            m_completed = failedCRCDueToNotCompleted ? false : (completedByte == 1 ? true : false);
            for (int ii = 0; ii < 4; ii++) {
                m_versionNum[ii] = fd.readInt();
            }

            /*
             * Support the original pre-1.3 header format as well as the newer JSON format.
             * JSON makes it possible to add info to a snapshot header without
             * breaking backwards compatibility.
             */
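            /*
             * For illustration only, a version >= 1 header's JSON blob carries fields like the
             * following. The keys are exactly the ones the parser below reads; the values here
             * are made up:
             *
             *   {
             *     "txnId": 12345,
             *     "timestamp": 1400000000000,     // optional; absent pre-3.0
             *     "hostId": 0,
             *     "hostname": "host-1",
             *     "clusterName": "cluster",
             *     "databaseName": "database",
             *     "tableName": "WAREHOUSE",
             *     "isReplicated": false,
             *     "isCompressed": true,           // optional, defaults to false
             *     "checksumType": "CRC32C",       // optional, defaults to "CRC32"
             *     "partitionIds": [0, 1, 2],      // only when isReplicated == false
             *     "numPartitions": 3              // only when isReplicated == false
             *   }
             */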
            if (m_versionNum[3] == 0) {
                m_txnId = fd.readLong();
                m_timestamp = TransactionIdManager.getTimestampFromTransactionId(m_txnId);
                m_hostId = fd.readInt();
                m_hostname = fd.readString();
                m_clusterName = fd.readString();
                m_databaseName = fd.readString();
                m_tableName = fd.readString();
                m_isReplicated = fd.readBoolean();
                m_isCompressed = false;
                m_checksumType = ChecksumType.CRC32;
                if (!m_isReplicated) {
                    m_partitionIds = (int[])fd.readArray(int.class);
                    if (!m_completed) {
                        for (Integer partitionId : m_partitionIds) {
                            m_corruptedPartitions.add(partitionId);
                        }
                    }
                    m_totalPartitions = fd.readInt();
                } else {
                    m_partitionIds = new int[] {0};
                    m_totalPartitions = 1;
                    if (!m_completed) {
                        m_corruptedPartitions.add(0);
                    }
                }
                m_hasVersion2FormatChunks = false;
            } else {
                assert(m_versionNum[3] == 1 || m_versionNum[3] == 2);
                if (m_versionNum[3] >= 2) {
                    m_hasVersion2FormatChunks = true;
                } else {
                    m_hasVersion2FormatChunks = false;
                }
                int numJSONBytes = fd.readInt();
                byte jsonBytes[] = new byte[numJSONBytes];
                fd.readFully(jsonBytes);
                String jsonString = new String(jsonBytes, "UTF-8");
                JSONObject obj = new JSONObject(jsonString);

                m_txnId = obj.getLong("txnId");
                //Timestamp field added for 3.0, might not be there
                if (obj.has("timestamp")) {
                    m_timestamp = obj.getLong("timestamp");
                } else {
                    //Pre 3.0/IV2 the timestamp was in the transaction id
                    m_timestamp = TransactionIdManager.getTimestampFromTransactionId(m_txnId);
                }
                m_hostId = obj.getInt("hostId");
                m_hostname = obj.getString("hostname");
                m_clusterName = obj.getString("clusterName");
                m_databaseName = obj.getString("databaseName");
                m_tableName = obj.getString("tableName");
                m_isReplicated = obj.getBoolean("isReplicated");
                m_isCompressed = obj.optBoolean("isCompressed", false);
                m_checksumType = ChecksumType.valueOf(obj.optString("checksumType", "CRC32"));
                if (!m_isReplicated) {
                    JSONArray partitionIds = obj.getJSONArray("partitionIds");
                    m_partitionIds = new int[partitionIds.length()];
                    for (int ii = 0; ii < m_partitionIds.length; ii++) {
                        m_partitionIds[ii] = partitionIds.getInt(ii);
                    }
                    if (!m_completed) {
                        for (Integer partitionId : m_partitionIds) {
                            m_corruptedPartitions.add(partitionId);
                        }
                    }
                    m_totalPartitions = obj.getInt("numPartitions");
                } else {
                    m_partitionIds = new int[] {0};
                    m_totalPartitions = 1;
                    if (!m_completed) {
                        m_corruptedPartitions.add(0);
                    }
                }
            }
            /*
             * Several runtime exceptions can be thrown in valid failure cases where
             * a corrupt save file is being detected.
             */
        } catch (BufferUnderflowException e) {
            throw new IOException(e);
        } catch (BufferOverflowException e) {
            throw new IOException(e);
        } catch (IndexOutOfBoundsException e) {
            throw new IOException(e);
        } catch (JSONException e) {
            throw new IOException(e);
        }
    }

    public int[] getVersionNumber() {
        return m_versionNum;
    }

    public int getHostId() {
        return m_hostId;
    }

    public String getHostname() {
        return m_hostname;
    }

    public String getClusterName() {
        return m_clusterName;
    }

    public String getDatabaseName() {
        return m_databaseName;
    }

    public String getTableName() {
        return m_tableName;
    }

    public int[] getPartitionIds() {
        return m_partitionIds;
    }

    public boolean isReplicated() {
        return m_isReplicated;
    }

    public boolean isCompressed() {
        return m_isCompressed;
    }

    public int getTotalPartitions() {
        return m_totalPartitions;
    }

    public boolean getCompleted() {
        return m_completed;
    }

    public long getTxnId() {
        return m_txnId;
    }

    public long getTimestamp() {
        return m_timestamp;
    }

    public void close() throws IOException {
        Thread chunkReader;
        synchronized (this) {
            m_hasMoreChunks = false;
            chunkReader = m_chunkReaderThread;
        }
        if (chunkReader != null) {
            chunkReader.interrupt();
            try {
                chunkReader.join();
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
        }
        synchronized (this) {
            while (!m_availableChunks.isEmpty()) {
                m_availableChunks.poll().discard();
            }
            notifyAll();
        }
        /*
         * Free buffers used to pull snapshot data in process.
         */
        BBContainer cont;
        while ((cont = m_buffers.poll()) != null) {
            cont.discard();
        }
    }

    public Set<Integer> getCorruptedPartitionIds() {
        return m_corruptedPartitions;
    }

    public ByteBuffer getTableHeader() {
        return m_tableHeader;
    }

    // Will get the next chunk of the table that is just over the chunk size
    public synchronized BBContainer getNextChunk() throws IOException {
        if (m_chunkReaderException != null) {
            throw m_chunkReaderException;
        }
        if (!m_hasMoreChunks) {
            final Container c = m_availableChunks.poll();
            return c;
        }

        if (m_chunkReader == null) {
            m_chunkReader = new ChunkReader();
            m_chunkReaderThread = new Thread(m_chunkReader, "ChunkReader");
            m_chunkReaderThread.start();
        }

        Container c = null;
        while (c == null && (m_hasMoreChunks || !m_availableChunks.isEmpty())) {
            c = m_availableChunks.poll();
            if (c == null) {
                try {
                    wait();
                } catch (InterruptedException e) {
                    throw new IOException(e);
                }
            }
        }
        if (c != null) {
            m_chunkReads.release();
        } else {
            if (m_chunkReaderException != null) {
                throw m_chunkReaderException;
            }
        }
        return c;
    }

    public synchronized boolean hasMoreChunks() throws IOException {
        if (m_chunkReaderException != null) {
            throw m_chunkReaderException;
        }
        return m_hasMoreChunks || !m_availableChunks.isEmpty();
    }
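    /*
     * Minimal usage sketch (illustrative only: the file name, the read-ahead depth of 3,
     * and the restore hand-off are hypothetical, not part of this class):
     *
     *   FileInputStream fis = new FileInputStream("table.vpt");
     *   TableSaveFile saveFile = new TableSaveFile(fis, 3, null);
     *   try {
     *       while (saveFile.hasMoreChunks()) {
     *           BBContainer c = saveFile.getNextChunk();
     *           if (c == null) {
     *               continue;                    // reader finished between the two calls
     *           }
     *           try {
     *               ByteBuffer chunk = c.b();    // VoltTable serialization, position 0
     *               // ... hand the chunk to restore logic ...
     *           } finally {
     *               c.discard();                 // return the backing buffer to the pool
     *           }
     *       }
     *   } finally {
     *       saveFile.close();
     *   }
     */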
    private final FileChannel m_saveFile;
    private final FileDescriptor m_fd;
    private final ByteBuffer m_tableHeader;
    private final boolean m_completed;
    private final int m_versionNum[] = new int[4];
    private final int m_hostId;
    private final String m_hostname;
    private final String m_clusterName;
    private final String m_databaseName;
    private final String m_tableName;
    private final boolean m_isReplicated;
    private final boolean m_isCompressed;
    private final int m_partitionIds[];
    private final int m_totalPartitions;
    private final long m_txnId;
    private final long m_timestamp;
    private boolean m_hasMoreChunks = true;
    private ConcurrentLinkedQueue<BBContainer> m_buffers = new ConcurrentLinkedQueue<BBContainer>();
    private final ArrayDeque<Container> m_availableChunks = new ArrayDeque<Container>();
    private final HashSet<Integer> m_relevantPartitionIds;
    private final ChecksumType m_checksumType;

    /*
     * In version 2 the layout of chunks was rejiggered to do less work
     * in execution sites. The checksum is done after the compression, so the layout
     * of the block is very different.
     */
    private final boolean m_hasVersion2FormatChunks;

    /**
     * Maintain a list of corrupted partitions. It is possible for uncorrupted partitions
     * to be recovered from a save file in the future.
     */
    private final HashSet<Integer> m_corruptedPartitions = new HashSet<Integer>();

    /**
     * Ignore corrupted chunks and continue validation of the rest of the chunks.
     */
    private final boolean m_continueOnCorruptedChunk;

    /**
     * The thread reading chunks will read at most this number of chunks.
     */
    private final Semaphore m_chunkReads;

    private ChunkReader m_chunkReader = null;
    private Thread m_chunkReaderThread = null;
    private IOException m_chunkReaderException = null;

    /**
     * Thread to read chunks from the disk.
     */
    private class ChunkReader implements Runnable {

        /*
         * The old method was out of hand. Going to start a new one with a different format
         * that should be easier to understand and validate.
         */
        private void readChunksV2() {
            //For reading the compressed input.
            final BBContainer fileInputBufferC =
                    DBBPool.allocateDirect(CompressionService.maxCompressedLength(DEFAULT_CHUNKSIZE));
            final ByteBuffer fileInputBuffer = fileInputBufferC.b();
            long sinceLastFAdvise = Long.MAX_VALUE;
            long positionAtLastFAdvise = 0;
            while (m_hasMoreChunks) {
                if (sinceLastFAdvise > 1024 * 1024 * 48) {
                    sinceLastFAdvise = 0;
                    VoltLogger log = new VoltLogger("SNAPSHOT");
                    try {
                        final long position = m_saveFile.position();
                        long retval = PosixAdvise.fadvise(
                                m_fd,
                                position,
                                position + 1024 * 1024 * 64,
                                PosixAdvise.POSIX_FADV_WILLNEED);
                        if (retval != 0) {
                            log.info("Failed to fadvise in TableSaveFile, this is harmless: " + retval);
                        }

                        //Get aligned start and end position
                        final long fadviseStart = positionAtLastFAdvise;
                        //-1 because we don't want to drop the last page; we will be reading it soon
                        positionAtLastFAdvise = ((position / Bits.pageSize()) - 1) * Bits.pageSize();
                        final long length = positionAtLastFAdvise - fadviseStart;
                        if (length > 0) {
                            retval = PosixAdvise.fadvise(
                                    m_fd,
                                    fadviseStart,
                                    length,
                                    PosixAdvise.POSIX_FADV_DONTNEED);
                        }
                        if (retval != 0) {
                            log.info("Failed to fadvise in TableSaveFile, this is harmless: " + retval);
                        }
                        positionAtLastFAdvise = position;
                    } catch (Throwable t) {
                        log.info("Exception attempting fadvise", t);
                    }
                }

                /*
                 * Limit the number of chunks materialized into memory at one time.
                 */
                try {
                    m_chunkReads.acquire();
                } catch (InterruptedException e) {
                    fileInputBufferC.discard();
                    return;
                }
                boolean expectedAnotherChunk = false;
                Container c = null;
                try {
                    /*
                     * Get the length of the next chunk, the partition id, the CRC covering
                     * the length prefix and partition id, and the CRC of the compressed payload.
                     */
                    ByteBuffer chunkLengthB = ByteBuffer.allocate(16);
                    while (chunkLengthB.hasRemaining()) {
                        final int read = m_saveFile.read(chunkLengthB);
                        if (read == -1) { throw new EOFException(); }
                        sinceLastFAdvise += read;
                    }
                    int nextChunkLength = chunkLengthB.getInt(0);
                    expectedAnotherChunk = true;
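                    /*
                     * Sketch of the version-2 chunk layout as decoded here (offsets are into
                     * the 16-byte header just read):
                     *   bytes  0-3   compressed payload length
                     *   bytes  4-7   partition id
                     *   bytes  8-11  CRC32C over bytes 0-7 (length prefix + partition id)
                     *   bytes 12-15  CRC32C over the compressed payload
                     * followed by the compressed payload itself.
                     */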
                    /*
                     * Get the partition id and its CRC (the CRC now covers the length prefix)
                     * and validate it. Validating the partition ID for the chunk separately
                     * makes it possible to continue processing chunks from other partitions
                     * if only one partition has corrupt chunks in the file.
                     */
                    assert(m_checksumType == ChecksumType.CRC32C);
                    final Checksum partitionIdCRC = new PureJavaCrc32C();
                    final int nextChunkPartitionId = chunkLengthB.getInt(4);
                    final int nextChunkPartitionIdCRC = chunkLengthB.getInt(8);
                    partitionIdCRC.update(chunkLengthB.array(), 0, 8);
                    int generatedValue = (int)partitionIdCRC.getValue();
                    if (generatedValue != nextChunkPartitionIdCRC) {
                        chunkLengthB.position(0);
                        for (int partitionId : m_partitionIds) {
                            m_corruptedPartitions.add(partitionId);
                        }
                        throw new IOException("Chunk partition ID CRC check failed. " +
                                "This corrupts all partitions in this file");
                    }

                    /*
                     * CRC for the data portion of the chunk.
                     */
                    final int nextChunkCRC = chunkLengthB.getInt(12);

                    /*
                     * Sanity check the length value to ensure there isn't
                     * a runtime exception or OOM.
                     */
                    if (nextChunkLength < 0) {
                        throw new IOException("Corrupted TableSaveFile chunk has negative chunk length");
                    }
                    if (nextChunkLength > fileInputBuffer.capacity()) {
                        throw new IOException("Corrupted TableSaveFile chunk has unreasonable length " +
                                "> DEFAULT_CHUNKSIZE bytes");
                    }

                    /*
                     * Go fetch the compressed data so that the uncompressed size is known,
                     * and use that to set nextChunkLength to the uncompressed length.
                     * The code ahead that constructs the VoltTable expects
                     * the uncompressed size/data since it is producing an uncompressed table.
                     */
                    fileInputBuffer.clear();
                    fileInputBuffer.limit(nextChunkLength);
                    while (fileInputBuffer.hasRemaining()) {
                        final int read = m_saveFile.read(fileInputBuffer);
                        if (read == -1) { throw new EOFException(); }
                        sinceLastFAdvise += read;
                    }
                    fileInputBuffer.flip();
                    nextChunkLength = CompressionService.uncompressedLength(fileInputBuffer);

                    /*
                     * Validate the rest of the chunk. This can fail if the data is corrupted
                     * or the length value was corrupted.
                     */
                    final int calculatedCRC =
                            DBBPool.getBufferCRC32C(fileInputBuffer, 0, fileInputBuffer.remaining());
                    if (calculatedCRC != nextChunkCRC) {
                        m_corruptedPartitions.add(nextChunkPartitionId);
                        if (m_continueOnCorruptedChunk) {
                            m_chunkReads.release();
                            continue;
                        } else {
                            throw new IOException("CRC mismatch in saved table chunk");
                        }
                    }

                    /*
                     * Now allocate space to store the chunk using the VoltTable serialization
                     * representation. The chunk will contain an integer row count preceding it
                     * so it can be sucked straight in. There is a little funny business to
                     * overwrite the partition id that is not part of the serialization format.
                     */
                    c = getOutputBuffer(nextChunkPartitionId);
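                    /*
                     * In this V2 path the decompressed payload already begins with the 4-byte
                     * row count (per the comment above), so the output buffer is simply
                     * [cached table header | decompressed payload] and, unlike readChunks()
                     * below, no row-count relocation is needed afterwards.
                     */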
                    /*
                     * If the length value is wrong or not all data made it to disk, this read will
                     * not complete correctly. There could be overflow, underflow, etc.,
                     * so use a try-finally block to indicate that all partitions are now corrupt.
                     * The enclosing exception handlers will do the right thing WRT
                     * propagating the error and closing the file.
                     */
                    boolean completedRead = false;
                    try {
                        final ByteBuffer buf = c.b();
                        /*
                         * Assemble a VoltTable out of the chunk of tuples.
                         * Put in the header that was cached in the constructor,
                         * then copy the tuple data.
                         */
                        buf.clear();
                        buf.limit(nextChunkLength + m_tableHeader.capacity());
                        m_tableHeader.position(0);
                        buf.put(m_tableHeader);
                        //Doesn't move the buffer position, does change the limit
                        CompressionService.decompressBuffer(fileInputBuffer, buf);
                        completedRead = true;
                    } finally {
                        if (!completedRead) {
                            for (int partitionId : m_partitionIds) {
                                m_corruptedPartitions.add(partitionId);
                            }
                            if (m_continueOnCorruptedChunk) {
                                m_chunkReads.release();
                                continue;
                            } else {
                                throw new IOException("Failed decompression of saved table chunk");
                            }
                        }
                    }

                    /*
                     * Skip irrelevant chunks after the CRC is calculated. Always calculate the CRC
                     * in case it is the length value that is corrupted.
                     */
                    if (m_relevantPartitionIds != null) {
                        if (!m_relevantPartitionIds.contains(nextChunkPartitionId)) {
                            m_chunkReads.release();
                            continue;
                        }
                    }

                    /*
                     * VoltTable wants the buffer at the home position 0.
                     */
                    c.b().position(0);

                    synchronized (TableSaveFile.this) {
                        m_availableChunks.offer(c);
                        c = null;
                        TableSaveFile.this.notifyAll();
                    }
                } catch (EOFException eof) {
                    synchronized (TableSaveFile.this) {
                        m_hasMoreChunks = false;
                        if (expectedAnotherChunk) {
                            m_chunkReaderException = new IOException(
                                    "Expected to find another chunk but reached end of file instead");
                        }
                        TableSaveFile.this.notifyAll();
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                    synchronized (TableSaveFile.this) {
                        m_hasMoreChunks = false;
                        m_chunkReaderException = e;
                        TableSaveFile.this.notifyAll();
                    }
                } catch (BufferUnderflowException e) {
                    synchronized (TableSaveFile.this) {
                        m_hasMoreChunks = false;
                        m_chunkReaderException = new IOException(e);
                        TableSaveFile.this.notifyAll();
                    }
                } catch (BufferOverflowException e) {
                    synchronized (TableSaveFile.this) {
                        m_hasMoreChunks = false;
                        m_chunkReaderException = new IOException(e);
                        TableSaveFile.this.notifyAll();
                    }
                } catch (IndexOutOfBoundsException e) {
                    synchronized (TableSaveFile.this) {
                        m_hasMoreChunks = false;
                        m_chunkReaderException = new IOException(e);
                        TableSaveFile.this.notifyAll();
                    }
                } finally {
                    if (c != null) c.discard();
                }
            }
            fileInputBufferC.discard();
        }

        private void readChunks() {
            //For reading the compressed input.
            BBContainer fileInputBufferC =
                    DBBPool.allocateDirect(CompressionService.maxCompressedLength(DEFAULT_CHUNKSIZE));
            ByteBuffer fileInputBuffer = fileInputBufferC.b();
            while (m_hasMoreChunks) {
                /*
                 * Limit the number of chunks materialized into memory at one time.
                 */
                try {
                    m_chunkReads.acquire();
                } catch (InterruptedException e) {
                    return;
                }
                boolean expectedAnotherChunk = false;
                Container c = null;
                try {
                    /*
                     * Get the length of the next chunk, the partition id, the CRC for the
                     * partition id, and the CRC for the chunk data.
                     */
                    ByteBuffer chunkLengthB = ByteBuffer.allocate(16);
                    while (chunkLengthB.hasRemaining()) {
                        final int read = m_saveFile.read(chunkLengthB);
                        if (read == -1) { throw new EOFException(); }
                    }
                    chunkLengthB.flip();
                    int nextChunkLength = chunkLengthB.getInt();
                    expectedAnotherChunk = true;
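                    /*
                     * Sketch of the pre-version-2 chunk layout as decoded here:
                     *   bytes  0-3   chunk length (interpreted below; the compressed payload
                     *                length when the file is compressed)
                     *   bytes  4-7   partition id
                     *   bytes  8-11  CRC over the partition id (bytes 4-7 only)
                     *   bytes 12-15  CRC over the uncompressed tuple data, whose row count sits
                     *                at the end; this is why the data CRC here is validated only
                     *                after the chunk has been decompressed/read into the output
                     *                buffer, unlike readChunksV2().
                     */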
                    /*
                     * Get the partition id and its CRC and validate it. Validating the
                     * partition ID for the chunk separately makes it possible to
                     * continue processing chunks from other partitions if only one partition
                     * has corrupt chunks in the file.
                     */
                    final Checksum partitionIdCRC = m_checksumType == ChecksumType.CRC32C ?
                            new PureJavaCrc32C() : new PureJavaCrc32();
                    chunkLengthB.mark();
                    final int nextChunkPartitionId = chunkLengthB.getInt();
                    final int nextChunkPartitionIdCRC = chunkLengthB.getInt();
                    chunkLengthB.reset();
                    byte partitionIdBytes[] = new byte[4];
                    chunkLengthB.get(partitionIdBytes);
                    partitionIdCRC.update(partitionIdBytes, 0, partitionIdBytes.length);
                    int generatedValue = (int)partitionIdCRC.getValue();
                    if (generatedValue != nextChunkPartitionIdCRC) {
                        chunkLengthB.position(0);
                        for (int partitionId : m_partitionIds) {
                            m_corruptedPartitions.add(partitionId);
                        }
                        throw new IOException("Chunk partition ID CRC check failed. " +
                                "This corrupts all partitions in this file");
                    }

                    /*
                     * CRC for the data portion of the chunk.
                     */
                    chunkLengthB.position(chunkLengthB.position() + 4);
                    final int nextChunkCRC = chunkLengthB.getInt();

                    /*
                     * Sanity check the length value to ensure there isn't
                     * a runtime exception or OOM.
                     */
                    if (nextChunkLength < 0) {
                        throw new IOException("Corrupted TableSaveFile chunk has negative chunk length");
                    }
                    if (isCompressed()) {
                        if (nextChunkLength > fileInputBuffer.capacity()) {
                            throw new IOException("Corrupted TableSaveFile chunk has unreasonable length " +
                                    "> DEFAULT_CHUNKSIZE bytes");
                        }
                    } else {
                        if (nextChunkLength > DEFAULT_CHUNKSIZE) {
                            throw new IOException("Corrupted TableSaveFile chunk has unreasonable length " +
                                    "> DEFAULT_CHUNKSIZE bytes");
                        }
                    }

                    /*
                     * Go fetch the compressed data so that the uncompressed size is known,
                     * and use that to set nextChunkLength to the uncompressed length.
                     * The code ahead that constructs the VoltTable expects
                     * the uncompressed size/data since it is producing an uncompressed table.
                     */
                    if (isCompressed()) {
                        fileInputBuffer.clear();
                        fileInputBuffer.limit(nextChunkLength);
                        while (fileInputBuffer.hasRemaining()) {
                            final int read = m_saveFile.read(fileInputBuffer);
                            if (read == -1) { throw new EOFException(); }
                        }
                        fileInputBuffer.flip();
                        nextChunkLength = CompressionService.uncompressedLength(fileInputBuffer);
                    }

                    /*
                     * Now allocate space to store the chunk using the VoltTable serialization
                     * representation. The chunk will contain an integer row count preceding it
                     * so it can be sucked straight in. There is a little funny business to
                     * overwrite the partition id that is not part of the serialization format.
                     */
                    c = getOutputBuffer(nextChunkPartitionId);

                    /*
                     * If the length value is wrong or not all data made it to disk, this read will
                     * not complete correctly. There could be overflow, underflow, etc.,
                     * so use a try-finally block to indicate that all partitions are now corrupt.
                     * The enclosing exception handlers will do the right thing WRT
                     * propagating the error and closing the file.
                     */
                    boolean completedRead = false;
                    int checksumStartPosition = 0;
                    int rowCount = 0;
                    try {
                        /*
                         * Assemble a VoltTable out of the chunk of tuples.
                         * Put in the header that was cached in the constructor,
                         * then copy the tuple data. The row count is at the end
                         * because it isn't known until serialization is complete.
                         * It will have to be moved back to the beginning of the tuple data
                         * after the header once the CRC has been calculated.
                         */
                        c.b().clear();
                        //The length of the chunk already includes space for the 4-byte row count
                        //even though it is at the end, but we also need to leave room at the end
                        //for the CRC calc.
                        if (isCompressed()) {
                            c.b().limit(nextChunkLength + m_tableHeader.capacity() + 4);
                        } else {
                            //Before compression the chunk length included the stuff added in the EE
                            //like the 2 CRCs and partition id. It is only -8 because we still need
                            //the 4 bytes of padding to move the row count into when constructing
                            //the VoltTable format.
                            c.b().limit((nextChunkLength - 8) + m_tableHeader.capacity());
                        }
                        m_tableHeader.position(0);
                        c.b().put(m_tableHeader);
                        c.b().position(c.b().position() + 4);//Leave space for the row count to be moved into
                        checksumStartPosition = c.b().position();
                        if (isCompressed()) {
                            CompressionService.decompressBuffer(fileInputBuffer, c.b());
                            c.b().position(c.b().limit());
                        } else {
                            while (c.b().hasRemaining()) {
                                final int read = m_saveFile.read(c.b());
                                if (read == -1) { throw new EOFException(); }
                            }
                        }
                        c.b().position(c.b().position() - 4);
                        rowCount = c.b().getInt();
                        c.b().position(checksumStartPosition);
                        completedRead = true;
                    } finally {
                        if (!completedRead) {
                            for (int partitionId : m_partitionIds) {
                                m_corruptedPartitions.add(partitionId);
                            }
                        }
                    }

                    /*
                     * Validate the rest of the chunk. This can fail if the data is corrupted
                     * or the length value was corrupted.
                     */
                    final int calculatedCRC = m_checksumType == ChecksumType.CRC32C ?
                            DBBPool.getCRC32C(c.address(), c.b().position(), c.b().remaining()) :
                            DBBPool.getCRC32(c.address(), c.b().position(), c.b().remaining());
                    if (calculatedCRC != nextChunkCRC) {
                        m_corruptedPartitions.add(nextChunkPartitionId);
                        if (m_continueOnCorruptedChunk) {
                            m_chunkReads.release();
                            continue;
                        } else {
                            throw new IOException("CRC mismatch in saved table chunk");
                        }
                    }

                    /*
                     * Skip irrelevant chunks after the CRC is calculated. Always calculate the CRC
                     * in case it is the length value that is corrupted.
                     */
                    if (m_relevantPartitionIds != null) {
                        if (!m_relevantPartitionIds.contains(nextChunkPartitionId)) {
                            m_chunkReads.release();
                            continue;
                        }
                    }

                    /*
                     * The row count, which was stored on disk at the end (and for the CRC calc),
                     * is now moved to the appropriate place for the table serialization format.
                     * Update the limit to reflect that.
                     *
                     * Surrounded in a try-finally just in case there is overflow/underflow.
                     * Shouldn't happen, but I could be wrong.
                     */
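                    /*
                     * Layout sketch of the output buffer at this point vs. after the fix-up below
                     * (positions derived from the assembly code above, not from a format spec):
                     *
                     *   before:  [ table header | 4 unused bytes | tuple data | 4-byte row count ]
                     *   after:   [ table header | 4-byte row count | tuple data ]
                     *
                     * The "after" form is the standard VoltTable serialization this reader hands out.
                     */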
                    boolean success = false;
                    try {
                        c.b().limit(c.b().limit() - 4);
                        c.b().position(checksumStartPosition - 4);
                        c.b().putInt(rowCount);
                        c.b().position(0);
                        success = true;
                    } finally {
                        if (!success) {
                            for (int partitionId : m_partitionIds) {
                                m_corruptedPartitions.add(partitionId);
                            }
                        }
                    }

                    synchronized (TableSaveFile.this) {
                        m_availableChunks.offer(c);
                        c = null;
                        TableSaveFile.this.notifyAll();
                    }
                } catch (EOFException eof) {
                    synchronized (TableSaveFile.this) {
                        m_hasMoreChunks = false;
                        if (expectedAnotherChunk) {
                            m_chunkReaderException = new IOException(
                                    "Expected to find another chunk but reached end of file instead");
                        }
                        TableSaveFile.this.notifyAll();
                    }
                } catch (IOException e) {
                    synchronized (TableSaveFile.this) {
                        m_hasMoreChunks = false;
                        m_chunkReaderException = e;
                        TableSaveFile.this.notifyAll();
                    }
                } catch (BufferUnderflowException e) {
                    synchronized (TableSaveFile.this) {
                        m_hasMoreChunks = false;
                        m_chunkReaderException = new IOException(e);
                        TableSaveFile.this.notifyAll();
                    }
                } catch (BufferOverflowException e) {
                    synchronized (TableSaveFile.this) {
                        m_hasMoreChunks = false;
                        m_chunkReaderException = new IOException(e);
                        TableSaveFile.this.notifyAll();
                    }
                } catch (IndexOutOfBoundsException e) {
                    synchronized (TableSaveFile.this) {
                        m_hasMoreChunks = false;
                        m_chunkReaderException = new IOException(e);
                        TableSaveFile.this.notifyAll();
                    }
                } finally {
                    if (c != null) c.discard();
                }
            }
            fileInputBufferC.discard();
        }

        private Container getOutputBuffer(final int nextChunkPartitionId) {
            BBContainer c = m_buffers.poll();
            if (c == null) {
                final BBContainer originContainer = DBBPool.allocateDirect(DEFAULT_CHUNKSIZE);
                final ByteBuffer b = originContainer.b();
                final Container retcont = new Container(b, originContainer, nextChunkPartitionId);
                return retcont;
            }
            /*
             * Need to reconstruct the container with the partition id of the next
             * chunk so it can be a final public field. The buffer, address, and origin
             * container remain the same.
             */
            final Container retcont = new Container(c.b(), c, nextChunkPartitionId);
            return retcont;
        }

        @Override
        public void run() {
            try {
                if (m_hasVersion2FormatChunks) {
                    readChunksV2();
                } else {
                    readChunks();
                }
            } finally {
                synchronized (TableSaveFile.this) {
                    m_hasMoreChunks = false;
                    TableSaveFile.this.notifyAll();
                    try {
                        m_saveFile.close();
                    } catch (IOException e) {
                    }
                }
            }
        }
    }
}