TableSaveFile.java example

Explorer
h-store-master
/* This file is part of VoltDB.
 * Copyright (C) 2008-2010 VoltDB Inc.
 *
 * VoltDB is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * VoltDB is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with VoltDB.  If not, see <http://www.gnu.org/licenses/>.
 */

package org.voltdb.sysprocs.saverestore;

import java.io.EOFException;
import java.io.IOException;
import java.nio.BufferUnderflowException;
import java.nio.BufferOverflowException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.Semaphore;
import java.util.zip.CRC32;

import org.voltdb.messaging.FastDeserializer;
import org.voltdb.utils.DBBPool;
import org.voltdb.utils.DBBPool.BBContainer;
import org.voltdb.EELibraryLoader;

import edu.brown.hstore.HStoreConstants;
import edu.brown.hstore.PartitionExecutor.SystemProcedureExecutionContext;
import edu.brown.catalog.CatalogUtil;
import edu.brown.utils.CollectionUtil;

/**
 * An abstraction around a table's save file for restore. Deserializes the
 * meta-data that was stored when the table was saved and makes it available to
 * clients. This follows the structure in src/ee/storage/TableDiskHeader.{h,cpp}
 * and looks like: Header length - 4 octet integer version - 4 octet integer
 * Host ID - 4 octet integer (this is the name, *not* the GUID) Cluster name -
 * VoltDB serialized string (2 octet length followed by chars) Database name -
 * VoltDB serialized string Table name - VoltDB serialized string isReplicated -
 * 1 octet, indicates whether the table was replicated The following fields are
 * conditional on isReplicated == false Partition Ids - Array of 4 octet integer
 * ids for partitions in this file Total Hosts - The number of hosts for this
 * table when it was saved
 */
public class TableSaveFile {

    private static class Container extends BBContainer {
        @SuppressWarnings("unused")
        private final BBContainer m_origin;

        Container(ByteBuffer b, long pointer, BBContainer origin) {
            super(b, pointer);
            m_origin = origin;
        }

        @Override
        public void discard() {
            m_buffers.add(this);
        }
    }

    /**
     * It is actually possible to make a bigger chunk then this if the table
     * header is big enough...
     */
    private static final int DEFAULT_CHUNKSIZE = org.voltdb.SnapshotSiteProcessor.m_snapshotBufferLength + (1024 * 256);

    public TableSaveFile(FileChannel dataIn, int readAheadChunks, int relevantPartitionIds[]) throws IOException {
        this(dataIn, readAheadChunks, relevantPartitionIds, false);
    }

    // XXX maybe consider an IOException subclass at some point
    public TableSaveFile(FileChannel dataIn, int readAheadChunks, int relevantPartitionIds[], boolean continueOnCorruptedChunk) throws IOException {
        try {
            EELibraryLoader.loadExecutionEngineLibrary(true);
            if (relevantPartitionIds == null) {
                m_relevantPartitionIds = null;
            } else {
                m_relevantPartitionIds = new HashSet<Integer>();
                for (Integer i : relevantPartitionIds) {
                    m_relevantPartitionIds.add(i);
                }
            }
            m_chunkReads = new Semaphore(readAheadChunks);
            m_saveFile = dataIn;
            m_continueOnCorruptedChunk = continueOnCorruptedChunk;

            final CRC32 crc = new CRC32();
            /*
             * If the CRC check fails because the file wasn't completed
             */
            final CRC32 secondCRC = new CRC32();

            /*
             * Get the header with the save restore specific information
             */
            final ByteBuffer lengthBuffer = ByteBuffer.allocate(8);
            while (lengthBuffer.hasRemaining()) {
                final int read = m_saveFile.read(lengthBuffer);
                if (read == -1) {
                    throw new EOFException();
                }
            }
            lengthBuffer.flip();
            final int originalCRC = lengthBuffer.getInt();
            int length = lengthBuffer.getInt();
            crc.update(lengthBuffer.array(), 4, 4);
            secondCRC.update(lengthBuffer.array(), 4, 4);

            if (length < 0) {
                throw new IOException("Corrupted save file has negative header length");
            }

            if (length > 2097152) {
                throw new IOException("Corrupted save file has unreasonable header length > 2 megs");
            }

            final ByteBuffer saveRestoreHeader = ByteBuffer.allocate(length);
            while (saveRestoreHeader.hasRemaining()) {
                final int read = m_saveFile.read(saveRestoreHeader);
                if (read == -1 || read < length) {
                    throw new EOFException();
                }
            }
            saveRestoreHeader.flip();
            crc.update(saveRestoreHeader.array());
            secondCRC.update(new byte[] { 1 });
            secondCRC.update(saveRestoreHeader.array(), 1, saveRestoreHeader.array().length - 1);

            /*
             * Get the template for the VoltTable serialization header. It will
             * have an extra length value preceded to it so that it can be
             * sucked straight into a buffer. This will not contain a row count
             * since that varies from chunk to chunk and is supplied by the
             * chunk
             */
            lengthBuffer.clear();
            lengthBuffer.limit(4);
            /*
             * Why this stupidity and no while loop? Because java is broken and
             * complains about a random final elsewhere if you do.
             */
            {
                final int read = m_saveFile.read(lengthBuffer);
                if (read == -1) {
                    throw new EOFException();
                }
            }
            crc.update(lengthBuffer.array(), 0, 4);
            secondCRC.update(lengthBuffer.array(), 0, 4);
            lengthBuffer.flip();
            length = lengthBuffer.getInt();

            if (length < 4) {
                throw new IOException("Corrupted save file has negative length or too small length for VoltTable header");
            }

            if (length > 2097152) {
                throw new IOException("Corrupted save file has unreasonable VoltTable header length > 2 megs");
            }

            m_tableHeader = ByteBuffer.allocate(length + 4);
            m_tableHeader.putInt(length);
            while (m_tableHeader.hasRemaining()) {
                final int read = m_saveFile.read(m_tableHeader);
                if (read == -1) {
                    throw new EOFException();
                }
            }
            crc.update(m_tableHeader.array(), 4, length);
            secondCRC.update(m_tableHeader.array(), 4, length);

            boolean failedCRCDueToNotCompleted = false;

            final int actualCRC = (int) crc.getValue();
            if (originalCRC != actualCRC) {
                /*
                 * Check if the CRC mismatch is due to the snapshot not being
                 * completed
                 */
                final int secondCRCValue = (int) secondCRC.getValue();
                if (secondCRCValue == originalCRC) {
                    failedCRCDueToNotCompleted = true;
                } else {
                    throw new IOException("Checksum mismatch");
                }
            }

            FastDeserializer fd = new FastDeserializer(saveRestoreHeader);
            byte completedByte = fd.readByte();
            m_completed = failedCRCDueToNotCompleted ? false : (completedByte == 1 ? true : false);

            for (int ii = 0; ii < 4; ii++) {
                m_versionNum[ii] = fd.readInt();
            }
            m_createTime = fd.readLong();
            m_hostId = fd.readInt();
            m_hostname = fd.readString();
            m_clusterName = fd.readString();
            m_databaseName = fd.readString();
            m_tableName = fd.readString();
            m_isReplicated = fd.readBoolean();
            if (!m_isReplicated) {
                m_partitionIds = (int[]) fd.readArray(int.class);
                if (!m_completed) {
                    for (Integer partitionId : m_partitionIds) {
                        m_corruptedPartitions.add(partitionId);
                    }
                }
                m_totalPartitions = fd.readInt();
            } else {
                m_partitionIds = new int[] { 0 };
                m_totalPartitions = 1;
                if (!m_completed) {
                    m_corruptedPartitions.add(0);
                }
            }

            //System.err.println("Tablename :" + m_tableName);
            //System.err.println("Replicated :" + m_isReplicated);
            //System.err.println("# Partitions :" + m_totalPartitions);
            // System.err.println("Completed :"+m_completed);
            //System.err.println("File Channel Size :" + m_saveFile.size());
            //System.err.println("File Channel Position :" + m_saveFile.position());
            //System.err.println("-----");
            /*
             * Several runtime exceptions can be thrown in valid failure cases
             * where a corrupt save file is being detected.
             */
        } catch (BufferUnderflowException e) {
            throw new IOException(e);
        } catch (BufferOverflowException e) {
            throw new IOException(e);
        } catch (IndexOutOfBoundsException e) {
            throw new IOException(e);
        }
    }

    public int[] getVersionNumber() {
        return m_versionNum;
    }

    public int getHostId() {
        return m_hostId;
    }

    public String getHostname() {
        return m_hostname;
    }

    public String getClusterName() {
        return m_clusterName;
    }

    public String getDatabaseName() {
        return m_databaseName;
    }

    public String getTableName() {
        return m_tableName;
    }

    public int[] getPartitionIds() {
        return m_partitionIds;
    }

    public boolean isReplicated() {
        return m_isReplicated;
    }

    public int getTotalPartitions() {
        return m_totalPartitions;
    }

    public boolean getCompleted() {
        return m_completed;
    }

    public long getCreateTime() {
        return m_createTime;
    }

    public FileChannel getFileChannel() {
        return m_saveFile;
    }

    public void setFilePath(String path) {
        m_filePath = path;
    }

    public String getFilePath() {
        return m_filePath;
    }

    public void close() throws IOException {
        if (m_chunkReaderThread != null) {
            m_chunkReaderThread.interrupt();
            try {
                m_chunkReaderThread.join();
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
        }
        synchronized (this) {
            while (!m_availableChunks.isEmpty()) {
                m_availableChunks.poll().discard();
            }
            notifyAll();
        }
    }

    public Set<Integer> getCorruptedPartitionIds() {
        return m_corruptedPartitions;
    }

    public ByteBuffer getTableHeader() {
        return m_tableHeader;
    }

    // Will get the next chunk of the table that is just over the chunk size
    public synchronized BBContainer getNextChunk() throws IOException {
        if (m_chunkReaderException != null) {
            throw m_chunkReaderException;
        }

        if (!m_hasMoreChunks) {
            return m_availableChunks.poll();
        }

        if (m_chunkReader == null) {
            m_chunkReader = new ChunkReader();
            m_chunkReaderThread = new Thread(m_chunkReader, "ChunkReader");
            m_chunkReaderThread.start();
        }

        Container c = null;
        while (c == null && (m_hasMoreChunks || !m_availableChunks.isEmpty())) {
            c = m_availableChunks.poll();
            if (c == null) {
                try {
                    wait();
                } catch (InterruptedException e) {
                    e.printStackTrace();
                    throw new IOException(e);
                }
            }
        }

        if (c != null) {
            m_chunkReads.release();
        }
        return c;
    }

    public synchronized boolean hasMoreChunks() throws IOException {
        if (m_chunkReaderException != null) {
            throw m_chunkReaderException;
        }

        return m_hasMoreChunks || !m_availableChunks.isEmpty();
    }

    //
    // /**
    // * A wrapper for the in memory storage for a table chunk
    // * that counts the number of times the chunk is discarded
    // * and only returns the memory back to the pool when the
    // * chunk has been read by enough times. This is necessary
    // * for replicated tables so that they only have to
    // *
    // */
    // private class ChunkCounter {
    //
    // private ChunkCounter(BBContainer c, int chunkIndex) {
    // m_container = c;
    // m_chunkIndex = chunkIndex;
    // }
    //
    // private BBContainer fetch() {
    // m_fetches++;
    // if (m_fetches == m_fetchCount) {
    // return m_container;
    // }
    // }
    //
    // private final BBContainer m_container;
    // private int m_chunkIndex;
    // private int m_fetches = 0;
    // }

    // /**
    // * Number of times a chunk must be fetched before its buffer can
    // * be returned to the pool
    // */
    // private final int m_fetchCount;

    private final FileChannel m_saveFile;
    private final ByteBuffer m_tableHeader;
    private final boolean m_completed;
    private final int m_versionNum[] = new int[4];
    private final int m_hostId;
    private final String m_hostname;
    private final String m_clusterName;
    private final String m_databaseName;
    private final String m_tableName;
    private final boolean m_isReplicated;
    private final int m_partitionIds[];
    private final int m_totalPartitions;
    private final long m_createTime;
    private boolean m_hasMoreChunks = true;
    private static ConcurrentLinkedQueue<Container> m_buffers = new ConcurrentLinkedQueue<Container>();
    private final ArrayDeque<Container> m_availableChunks = new ArrayDeque<Container>();
    private final HashSet<Integer> m_relevantPartitionIds;
    private String m_filePath;

    /**
     * Maintain a list of corrupted partitions. It is possible for uncorrupted
     * partitions to be recovered from a save file in the future
     */
    private final HashSet<Integer> m_corruptedPartitions = new HashSet<Integer>();

    /**
     * Ignore corrupted chunks and continue validation of the rest of the
     * chunks.
     */
    private final boolean m_continueOnCorruptedChunk;

    /**
     * The thread reading chunks will read at most this number of chunks
     */
    private final Semaphore m_chunkReads;

    private ChunkReader m_chunkReader = null;
    private Thread m_chunkReaderThread = null;
    private IOException m_chunkReaderException = null;

    /**
     * Thread to read chunks from the disk
     */
    private class ChunkReader implements Runnable {

        private void readChunks() {
            int chunksRead = 0;
            while (m_hasMoreChunks) {
                /*
                 * Limit the number of chunk reads at any one time.
                 */
                try {
                    m_chunkReads.acquire();
                } catch (InterruptedException e) {
                    return;
                }
                boolean expectedAnotherChunk = false;
                try {
                    /*
                     * Get the length of the next chunk, partition id, crc for
                     * partition id,
                     */
                    ByteBuffer chunkLengthB = ByteBuffer.allocate(16);
                    while (chunkLengthB.hasRemaining()) {
                        final int read = m_saveFile.read(chunkLengthB);
                        if (read == -1) {
                            throw new EOFException();
                        }
                    }
                    chunkLengthB.flip();
                    final int nextChunkLength = chunkLengthB.getInt();
                    expectedAnotherChunk = true;

                    /*
                     * Get the partition id and its CRC and validate it.
                     * Validating the partition ID for the chunk separately
                     * makes it possible to continue processing chunks from
                     * other partitions if only one partition has corrupt chunks
                     * in the file.
                     */
                    final CRC32 partitionIdCRC = new CRC32();
                    chunkLengthB.mark();
                    final int nextChunkPartitionId = chunkLengthB.getInt();
                    final int nextChunkPartitionIdCRC = chunkLengthB.getInt();
                    chunkLengthB.reset();
                    byte partitionIdBytes[] = new byte[4];
                    chunkLengthB.get(partitionIdBytes);
                    partitionIdCRC.update(partitionIdBytes);
                    int generatedValue = (int) partitionIdCRC.getValue();

                    if (generatedValue != nextChunkPartitionIdCRC) {
                        chunkLengthB.position(0);
                        for (int partitionId : m_partitionIds) {
                            m_corruptedPartitions.add(partitionId);
                        }
                        throw new IOException("Chunk partition ID CRC check failed. " + "This corrupts all partitions in this file");
                    }

                    //System.err.println("nextChunkPartitionId :"+nextChunkPartitionId);
                    //System.err.println("nextChunkLength :"+nextChunkLength);
                    
                    /*
                     * CRC for the data portion of the chunk
                     */
                    chunkLengthB.position(chunkLengthB.position() + 4);
                    final int nextChunkCRC = chunkLengthB.getInt();

                    /*
                     * Sanity check the length value to ensure there isn't a
                     * runtime exception or OOM.
                     */
                    if (nextChunkLength < 0) {
                        throw new IOException("Corrupted TableSaveFile chunk has negative chunk length");
                    }

                    if (nextChunkLength > DEFAULT_CHUNKSIZE) {
                        throw new IOException("Corrupted TableSaveFile chunk has unreasonable length " + "> DEFAULT_CHUNKSIZE bytes");
                    }

                    /*
                     * Now allocate space to store the chunk using the VoltTable
                     * serialization representation. The chunk will contain an
                     * integer row count preceding it so it can be sucked
                     * straight in. There is a little funny business to
                     * overwrite the partition id that is not part of the
                     * serialization format
                     */
                    Container c = m_buffers.poll();
                    if (c == null) {
                        final BBContainer originContainer = DBBPool.allocateDirect(DEFAULT_CHUNKSIZE);
                        final ByteBuffer b = originContainer.b;
                        final long pointer = org.voltdb.utils.DBBPool.getBufferAddress(b);
                        c = new Container(b, pointer, originContainer);
                    }

                    /*
                     * If the length value is wrong or not all data made it to
                     * disk this read will not complete correctly. There could
                     * be overflow, underflow etc. so use a try finally block to
                     * indicate that all partitions are now corrupt. The
                     * enclosing exception handlers will do the right thing WRT
                     * to propagating the error and closing the file.
                     */
                    boolean completedRead = false;
                    int checksumStartPosition = 0;
                    int rowCount = 0;
                    try {
                        /*
                         * Assemble a VoltTable out of the chunk of tuples. Put
                         * in the header that was cached in the constructor,
                         * then copy the tuple data. The row count is at the end
                         * because it isn't known until serialization is
                         * complete. It will have to be moved back to the
                         * beginning of the tuple data after the header once the
                         * CRC has been calculated.
                         */
                        c.b.clear();
                        c.b.limit((nextChunkLength - 8) + m_tableHeader.capacity());
                        m_tableHeader.position(0);
                        c.b.put(m_tableHeader);
                        c.b.position(c.b.position() + 4);// Leave space for row
                                                         // count to be moved
                                                         // into
                        checksumStartPosition = c.b.position();
                        while (c.b.hasRemaining()) {
                            final int read = m_saveFile.read(c.b);
                            if (read == -1) {
                                throw new EOFException();
                            }
                        }
                        c.b.position(c.b.position() - 4);
                        rowCount = c.b.getInt();
                        c.b.position(checksumStartPosition);
                        completedRead = true;
                    } finally {
                        if (!completedRead) {
                            for (int partitionId : m_partitionIds) {
                                m_corruptedPartitions.add(partitionId);
                            }
                        }
                    }

                    /*
                     * Validate the rest of the chunk. This can fail if the data
                     * is corrupted or the length value was corrupted.
                     */
                    final int calculatedCRC = DBBPool.getBufferCRC32(c.b, c.b.position(), c.b.remaining());
                    if (calculatedCRC != nextChunkCRC) {
                        m_corruptedPartitions.add(nextChunkPartitionId);
                        if (m_continueOnCorruptedChunk) {
                            c.discard();
                            m_chunkReads.release();
                            continue;
                        } else {
                            throw new IOException("CRC mismatch in saved table chunk");
                        }
                    }

                    /*
                     * Skip irrelevant chunks after CRC is calculated. Always
                     * calulate the CRC in case it is the length value that is
                     * corrupted
                     */
                    if (m_relevantPartitionIds != null) {
                        if (!m_relevantPartitionIds.contains(nextChunkPartitionId)) {
                            c.discard();
                            m_chunkReads.release();
                            continue;
                        }
                    }

                    /*
                     * The row count which was stored on disk at the end (and
                     * for the CRC calc) is now moved to the appropriate place
                     * for the table serialization format. Update the limit to
                     * reflect that. Surrounded in a try finally just in case
                     * there is overflow/underflow. Shouldn't happen but I could
                     * be wrong.
                     */
                    boolean success = false;
                    try {
                        c.b.limit(c.b.limit() - 4);
                        c.b.position(checksumStartPosition - 4);
                        c.b.putInt(rowCount);
                        c.b.position(0);
                        success = true;
                    } finally {
                        if (!success) {
                            for (int partitionId : m_partitionIds) {
                                m_corruptedPartitions.add(partitionId);
                            }
                        }
                    }
                    ++chunksRead;

                    synchronized (TableSaveFile.this) {
                        m_availableChunks.offer(c);
                        TableSaveFile.this.notifyAll();
                    }
                } catch (EOFException eof) {
                    synchronized (TableSaveFile.this) {
                        m_hasMoreChunks = false;
                        if (expectedAnotherChunk) {
                            m_chunkReaderException = new IOException("Expected to find another chunk but reached end of file instead");
                        }
                        TableSaveFile.this.notifyAll();
                    }
                } catch (IOException e) {
                    synchronized (TableSaveFile.this) {
                        m_hasMoreChunks = false;
                        m_chunkReaderException = e;
                        TableSaveFile.this.notifyAll();
                    }
                } catch (BufferUnderflowException e) {
                    synchronized (TableSaveFile.this) {
                        m_hasMoreChunks = false;
                        m_chunkReaderException = new IOException(e);
                        TableSaveFile.this.notifyAll();
                    }
                } catch (BufferOverflowException e) {
                    synchronized (TableSaveFile.this) {
                        m_hasMoreChunks = false;
                        m_chunkReaderException = new IOException(e);
                        TableSaveFile.this.notifyAll();
                    }
                } catch (IndexOutOfBoundsException e) {
                    synchronized (TableSaveFile.this) {
                        m_hasMoreChunks = false;
                        m_chunkReaderException = new IOException(e);
                        TableSaveFile.this.notifyAll();
                    }
                }
            }
        }

        @Override
        public void run() {
            try {
                readChunks();
            } finally {
                synchronized (TableSaveFile.this) {
                    m_hasMoreChunks = false;
                    TableSaveFile.this.notifyAll();
                    try {
                        m_saveFile.close();
                    } catch (IOException e) {
                    }
                }
            }
        }

    }
}