/* This file is part of VoltDB.
 * Copyright (C) 2008-2010 VoltDB Inc.
 *
 * VoltDB is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * VoltDB is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with VoltDB. If not, see <http://www.gnu.org/licenses/>.
 */

package org.voltdb;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.Semaphore;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.CRC32;

import org.apache.log4j.Logger;
import org.voltdb.client.ConnectionUtil;
import org.voltdb.messaging.FastSerializer;
import org.voltdb.utils.DBBPool;
import org.voltdb.utils.DBBPool.BBContainer;
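/**
 * {@link SnapshotDataTarget} that streams serialized table data to a single
 * snapshot file on the local filesystem. The constructor writes a file header
 * (CRC, header length, completion byte, version, create time, host id,
 * hostname, cluster/database/table names, and partition info for partitioned
 * tables) followed by the table schema. Subsequent writes are queued to a
 * shared single-threaded executor and fsynced periodically; {@link #close()}
 * waits for outstanding writes, flips the completion byte to 1 on success,
 * and forces the file to disk.
 *
 * A minimal usage sketch. The file path, table name, schema table, and tuple
 * data container are illustrative only; in practice the snapshot machinery
 * constructs and drives this target. Each buffer passed to write() must
 * reserve getHeaderSize() (4) bytes at the front for the length that is
 * prepended, and write() returns null once a previous write has failed:
 * <pre>{@code
 * DefaultSnapshotDataTarget target = new DefaultSnapshotDataTarget(
 *         new File("/tmp/WAREHOUSE-host_0.vpt"), 0, "cluster", "database",
 *         "WAREHOUSE", 1, true, new int[] { 0 }, schemaTable,
 *         System.currentTimeMillis());
 * try {
 *     Future<?> result = target.write(tupleDataContainer);
 *     if (result != null) {
 *         result.get();
 *     }
 * } finally {
 *     target.close();
 * }
 * }</pre>
 */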
public class DefaultSnapshotDataTarget implements SnapshotDataTarget {
    private static final Logger LOG = Logger.getLogger(DefaultSnapshotDataTarget.class);

    public static volatile boolean m_simulateFullDiskWritingHeader = false;
    public static volatile boolean m_simulateFullDiskWritingChunk = false;

    private final File m_file;
    private final FileChannel m_channel;
    private final FileOutputStream m_fos;
    private Runnable m_onCloseHandler = null;

    /*
     * If a write fails then this snapshot is hosed.
     * Set the flag so all writes return immediately. The system still
     * needs to scan all the tables to clear the dirty bits
     * so the process continues as if the writes are succeeding.
     * A more efficient failure mode would do the scan but not the
     * extra serialization work.
     */
    private volatile boolean m_writeFailed = false;
    private volatile IOException m_writeException = null;

    private volatile long m_bytesWritten = 0;

    private static final Semaphore m_bytesAllowedBeforeSync = new Semaphore((1024 * 1024) * 256);
    private final AtomicInteger m_bytesWrittenSinceLastSync = new AtomicInteger(0);
    private final ScheduledFuture<?> m_syncTask;

    /*
     * Accept a single write even when full-disk simulation is enabled.
     */
    private volatile boolean m_acceptOneWrite = false;

    @SuppressWarnings("unused")
    private final String m_tableName;

    private final AtomicInteger m_outstandingWriteTasks = new AtomicInteger(0);

    private static final ExecutorService m_es = Executors.newSingleThreadExecutor(new ThreadFactory() {
        @Override
        public Thread newThread(Runnable r) {
            return new Thread(
                    Thread.currentThread().getThreadGroup(),
                    r,
                    "Snapshot write service ",
                    131072);
        }
    });

    private static final ScheduledExecutorService m_syncService = Executors.newSingleThreadScheduledExecutor(
            new ThreadFactory() {
                @Override
                public Thread newThread(Runnable r) {
                    return new Thread(
                            Thread.currentThread().getThreadGroup(),
                            r,
                            "Snapshot sync service ",
                            131072);
                }
            });

    public DefaultSnapshotDataTarget(
            final File file,
            final int hostId,
            final String clusterName,
            final String databaseName,
            final String tableName,
            final int numPartitions,
            final boolean isReplicated,
            final int partitionIds[],
            final VoltTable schemaTable,
            final long createTime) throws IOException {
        this(
                file,
                hostId,
                clusterName,
                databaseName,
                tableName,
                numPartitions,
                isReplicated,
                partitionIds,
                schemaTable,
                createTime,
                new int[] { 0, 0, 0, 0 });
    }

    public DefaultSnapshotDataTarget(
            final File file,
            final int hostId,
            final String clusterName,
            final String databaseName,
            final String tableName,
            final int numPartitions,
            final boolean isReplicated,
            final int partitionIds[],
            final VoltTable schemaTable,
            final long createTime,
            int version[]) throws IOException {
        String hostname = ConnectionUtil.getHostnameOrAddress();
        m_file = file;
        m_tableName = tableName;
        m_fos = new FileOutputStream(file);
        m_channel = m_fos.getChannel();

        final FastSerializer fs = new FastSerializer();
        fs.writeInt(0);  // CRC
        fs.writeInt(0);  // Header length placeholder
        fs.writeByte(1); // Indicate the snapshot was not completed; set to true for the CRC calculation, false later
        for (int ii = 0; ii < 4; ii++) {
            fs.writeInt(version[ii]); // version
        }
        fs.writeLong(createTime);
        fs.writeInt(hostId);
        fs.writeString(hostname);
        fs.writeString(clusterName);
        fs.writeString(databaseName);
        fs.writeString(tableName.toUpperCase());
        fs.writeBoolean(isReplicated);
        if (!isReplicated) {
            fs.writeArray(partitionIds);
            fs.writeInt(numPartitions);
        }
        final BBContainer container = fs.getBBContainer();
        container.b.position(4);
        container.b.putInt(container.b.remaining() - 4);
        container.b.position(0);

        FastSerializer schemaSerializer = new FastSerializer();
        schemaTable.writeExternal(schemaSerializer);
        final BBContainer schemaContainer = schemaSerializer.getBBContainer();
        schemaContainer.b.limit(schemaContainer.b.limit() - 4);       // Don't want the row count
        schemaContainer.b.position(schemaContainer.b.position() + 4); // Don't want the total table length

        final CRC32 crc = new CRC32();
        ByteBuffer aggregateBuffer = ByteBuffer.allocate(container.b.remaining() + schemaContainer.b.remaining());
        aggregateBuffer.put(container.b);
        aggregateBuffer.put(schemaContainer.b);
        aggregateBuffer.flip();
        crc.update(aggregateBuffer.array(), 4, aggregateBuffer.capacity() - 4);

        final int crcValue = (int) crc.getValue();
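        /*
         * Patch the computed CRC into the first four bytes of the header and
         * clear the completion byte at offset 8 (it was set to 1 only so it
         * would be included in the CRC). close() rewrites that byte to 1 once
         * all table data has been written and forced to disk.
         */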
        aggregateBuffer.putInt(crcValue).position(8);
        aggregateBuffer.put((byte) 0).position(0); // Haven't actually finished writing the file

        if (m_simulateFullDiskWritingHeader) {
            m_writeException = new IOException("Disk full");
            m_writeFailed = true;
            m_fos.close();
            throw m_writeException;
        }

        /*
         * Be completely sure the write succeeded. If it didn't,
         * the disk is probably full or the path is bunk etc.
         */
        m_acceptOneWrite = true;
        Future<?> writeFuture = write(DBBPool.wrapBB(aggregateBuffer), false);
        try {
            writeFuture.get();
        } catch (InterruptedException e) {
            m_fos.close();
            throw new java.io.InterruptedIOException();
        } catch (ExecutionException e) {
            m_fos.close();
            // Rethrow the IOException recorded by the write task; fall back to
            // wrapping the cause so the failure is never masked by throwing null.
            throw m_writeException != null ? m_writeException : new IOException(e.getCause());
        }
        if (m_writeFailed) {
            m_fos.close();
            throw m_writeException;
        }

        ScheduledFuture<?> syncTask = null;
        syncTask = m_syncService.scheduleAtFixedRate(new Runnable() {
            @Override
            public void run() {
                int bytesSinceLastSync = 0;
                while ((bytesSinceLastSync = m_bytesWrittenSinceLastSync.getAndSet(0)) > 0) {
                    try {
                        m_channel.force(false);
                    } catch (IOException e) {
                        LOG.error("Error syncing snapshot", e);
                    }
                    m_bytesAllowedBeforeSync.release(bytesSinceLastSync);
                }
            }
        }, 1, 1, TimeUnit.SECONDS);
        m_syncTask = syncTask;
    }

    @Override
    public void close() throws IOException, InterruptedException {
        try {
            synchronized (m_outstandingWriteTasks) {
                while (m_outstandingWriteTasks.get() > 0) {
                    m_outstandingWriteTasks.wait();
                }
            }
            m_syncTask.cancel(false);
            m_channel.force(false);
        } finally {
            m_bytesAllowedBeforeSync.release(m_bytesWrittenSinceLastSync.getAndSet(0));
        }

        m_channel.position(8);
        ByteBuffer completed = ByteBuffer.allocate(1);
        if (m_writeFailed) {
            completed.put((byte) 0).flip();
        } else {
            completed.put((byte) 1).flip();
        }
        m_channel.write(completed);
        m_channel.force(false);
        m_channel.close();
        if (m_onCloseHandler != null) {
            m_onCloseHandler.run();
        }
    }

    @Override
    public int getHeaderSize() {
        return 4;
    }

    private Future<?> write(final BBContainer tupleData, final boolean prependLength) {
        if (m_writeFailed) {
            tupleData.discard();
            return null;
        }

        if (prependLength) {
            tupleData.b.putInt(tupleData.b.remaining() - 4);
            tupleData.b.position(0);
        }

        m_outstandingWriteTasks.incrementAndGet();
        Future<?> writeTask = m_es.submit(new Callable<Object>() {
            @Override
            public Object call() throws Exception {
                try {
                    if (m_acceptOneWrite) {
                        m_acceptOneWrite = false;
                    } else {
                        if (m_simulateFullDiskWritingChunk) {
                            throw new IOException("Disk full");
                        }
                    }

                    m_bytesAllowedBeforeSync.acquire(tupleData.b.remaining());
                    int totalWritten = 0;
                    while (tupleData.b.hasRemaining()) {
                        totalWritten += m_channel.write(tupleData.b);
                    }
                    m_bytesWritten += totalWritten;
                    m_bytesWrittenSinceLastSync.addAndGet(totalWritten);
                } catch (IOException e) {
                    m_writeException = e;
                    LOG.error("Error while attempting to write snapshot data to file " + m_file, e);
                    m_writeFailed = true;
                    throw e;
                } finally {
                    tupleData.discard();
                    synchronized (m_outstandingWriteTasks) {
                        if (m_outstandingWriteTasks.decrementAndGet() == 0) {
                            m_outstandingWriteTasks.notify();
                        }
                    }
                }
                return null;
            }
        });
        return writeTask;
    }

    @Override
    public Future<?> write(final BBContainer tupleData) {
        return write(tupleData, true);
    }

    @Override
    public long getBytesWritten() {
        return m_bytesWritten;
    }

    @Override
    public void setOnCloseHandler(Runnable onClose) {
        m_onCloseHandler = onClose;
    }

    @Override
    public IOException getLastWriteException() {
        return m_writeException;
    }
}