/* This file is part of VoltDB. * Copyright (C) 2008-2010 VoltDB Inc. * * VoltDB is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * VoltDB is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with VoltDB. If not, see <http://www.gnu.org/licenses/>. */ package org.voltdb; import java.io.*; import java.util.*; import java.util.concurrent.atomic.AtomicInteger; import org.apache.log4j.Logger; import org.voltdb.SnapshotSiteProcessor.SnapshotTableTask; import org.voltdb.catalog.CatalogMap; import org.voltdb.catalog.Cluster; import org.voltdb.catalog.Host; import org.voltdb.catalog.Site; import org.voltdb.catalog.Table; import org.voltdb.catalog.Partition; import org.voltdb.sysprocs.SnapshotRegistry; import org.voltdb.sysprocs.SnapshotSave; import org.voltdb.sysprocs.saverestore.SnapshotUtil; import edu.brown.catalog.CatalogUtil; import edu.brown.hstore.PartitionExecutor.SystemProcedureExecutionContext; import edu.brown.utils.CollectionUtil; /** * SnapshotSaveAPI extracts reusuable snapshot production code * that can be called from the SnapshotSave stored procedure or * directly from an ExecutionSite thread, perhaps has a message * or failure action. */ public class SnapshotSaveAPI { private static final Logger LOG = Logger.getLogger(SnapshotSaveAPI.class); /** * The only public method: do all the work to start a snapshot. * Assumes that a snapshot is feasible, that the caller has validated it can * be accomplished, that the caller knows this is a consistent or useful * transaction point at which to snapshot. * * @param file_path * @param file_nonce * @param block * @param startTime * @param context * @param hostname * @return VoltTable describing the results of the snapshot attempt */ public VoltTable startSnapshotting(String file_path, String file_nonce, byte block, long startTime, SystemProcedureExecutionContext context, String hostname) { //LOG.trace("Creating snapshot target and handing to EEs"); final VoltTable result = SnapshotSave.constructNodeResultsTable(); // Each partition does this to accumulate tasks for it createSetup(file_path, file_nonce, startTime, context, hostname, result); //LOG.trace("Stage 0 : at partition : "+context.getPartitionExecutor().getPartitionId()); // All sites wait for a permit to start their individual snapshot tasks VoltTable error = acquireSnapshotPermit(context, hostname, result); if (error != null) { return error; } Site site = context.getSite(); CatalogMap<Partition> partition_map = site.getPartitions(); Integer lowest_partition_id = Integer.MAX_VALUE, p_id; for (Partition pt : partition_map) { p_id = pt.getId(); lowest_partition_id = Math.min(p_id, lowest_partition_id); } assert (lowest_partition_id != Integer.MAX_VALUE); int partition_id = context.getPartitionExecutor().getPartitionId(); LOG.trace("Stage 1 : at partition : "+partition_id); synchronized (SnapshotSiteProcessor.m_taskListsForSites) { // Fetch work for this partition int index = partition_id - lowest_partition_id; final Deque<SnapshotTableTask> m_taskList = SnapshotSiteProcessor.m_taskListsForSites.get(index); if (m_taskList == null) { LOG.trace("tasklist null"); return result; } else { if (SnapshotSiteProcessor.m_taskListsForSites.isEmpty()) { //assert(SnapshotSiteProcessor.m_snapshotCreateSetupPermit.availablePermits() == 1); assert(SnapshotSiteProcessor.m_snapshotPermits.availablePermits() == 0); } LOG.trace("ExecutionSitesCurrentlySnapshotting :"+SnapshotSiteProcessor.ExecutionSitesCurrentlySnapshotting.get()); assert(SnapshotSiteProcessor.ExecutionSitesCurrentlySnapshotting.get() > 0); context.getPartitionExecutor().initiateSnapshots(m_taskList); } } //LOG.trace("Stage 2 : at partition : "+context.getPartitionExecutor().getPartitionId()); if (block != 0) { Collection<Exception> failures = null; String status = "SUCCESS"; String err = ""; try { failures = context.getPartitionExecutor().completeSnapshotWork(); } catch (InterruptedException e) { status = "FAILURE"; err = e.toString(); } final VoltTable blockingResult = SnapshotSave.constructPartitionResultsTable(); if (failures.isEmpty()) { blockingResult.addRow( Integer.parseInt(context.getSite().getHost().getTypeName().replaceAll("[\\D]", "")), hostname, context.getHStoreSite().getSiteId(), context.getPartitionExecutor().getPartitionId(), "", status, err); } else { status = "FAILURE"; for (Exception e : failures) { err = e.toString(); } blockingResult.addRow( Integer.parseInt(context.getSite().getHost().getTypeName().replaceAll("[\\D]", "")), hostname, context.getHStoreSite().getSiteId(), context.getPartitionExecutor().getPartitionId(), "", status, err); } return blockingResult; } return result; } private void createSetup(String file_path, String file_nonce, long startTime, SystemProcedureExecutionContext context, String hostname, final VoltTable result) { { Site site = context.getSite(); int numLocalPartitions = site.getPartitions().size(); LOG.trace("createSetup at : partition "+context.getPartitionExecutor().getPartitionId()); /* * Used to close targets on failure */ final ArrayList<SnapshotDataTarget> targets = new ArrayList<SnapshotDataTarget>(); try { final ArrayDeque<SnapshotTableTask> partitionedSnapshotTasks = new ArrayDeque<SnapshotTableTask>(); final ArrayList<SnapshotTableTask> replicatedSnapshotTasks = new ArrayList<SnapshotTableTask>(); LOG.trace("ExecutionSitesCurrentlySnapshotting initial check : " + SnapshotSiteProcessor.ExecutionSitesCurrentlySnapshotting.get()); assert(SnapshotSiteProcessor.ExecutionSitesCurrentlySnapshotting.get() == -1); final List<Table> tables = SnapshotUtil.getTablesToSave(context.getDatabase()); synchronized (SnapshotSiteProcessor.m_digestWritten) { if (SnapshotSiteProcessor.m_digestWritten.get() == false) { SnapshotSiteProcessor.m_digestWritten.set(true); SnapshotUtil.recordSnapshotTableList(startTime, file_path, file_nonce, tables); LOG.trace("Digest written at partition " + context.getPartitionExecutor().getPartitionId()); } } final AtomicInteger numTables = new AtomicInteger(tables.size()); //LOG.info("NumTables Initial : "+numTables); final SnapshotRegistry.Snapshot snapshotRecord = SnapshotRegistry.startSnapshot( startTime, context.getHStoreSite().getHostId(), context.getHStoreSite().getSiteId(), context.getPartitionExecutor().getPartitionId(), file_path, file_nonce, tables.toArray(new Table[0])); for (final Table table : SnapshotUtil.getTablesToSave(context.getDatabase())) { String canSnapshot = "SUCCESS"; String err_msg = ""; final File saveFilePath = SnapshotUtil.constructFileForTable(table, file_path, file_nonce, String.valueOf(context.getHost().getId()), String.valueOf(context.getHStoreSite().getSiteId()), String.valueOf(context.getPartitionExecutor().getPartitionId()) ); SnapshotDataTarget sdt = null; try { sdt = constructSnapshotDataTargetForTable( context, saveFilePath, table, context.getSite().getHost(), numLocalPartitions, startTime); targets.add(sdt); final SnapshotDataTarget sdtFinal = sdt; final Runnable onClose = new Runnable() { @Override public void run() { snapshotRecord.updateTable(table.getTypeName(), new SnapshotRegistry.Snapshot.TableUpdater() { @Override public SnapshotRegistry.Snapshot.Table update( SnapshotRegistry.Snapshot.Table registryTable) { return snapshotRecord.new Table( registryTable, sdtFinal.getBytesWritten(), sdtFinal.getLastWriteException()); } }); int tablesLeft = numTables.decrementAndGet(); if (tablesLeft == 0) { final SnapshotRegistry.Snapshot completed = SnapshotRegistry.finishSnapshot(snapshotRecord); final double duration = (completed.timeFinished - completed.timeStarted) / 1000.0; LOG.info( "Snapshot " + snapshotRecord.nonce + " finished at " + completed.timeFinished + " and took " + duration + " seconds "); } } }; sdt.setOnCloseHandler(onClose); final SnapshotTableTask task = new SnapshotTableTask( table.getRelativeIndex(), sdt, table.getIsreplicated(), table.getTypeName()); if (table.getIsreplicated()) { replicatedSnapshotTasks.add(task); } else { partitionedSnapshotTasks.offer(task); } } catch (IOException ex) { /* * Creation of this specific target failed. Close it if it was created. * Continue attempting the snapshot anyways so that at least some of the data * can be retrieved. */ try { if (sdt != null) { targets.remove(sdt); sdt.close(); } } catch (Exception e) { LOG.error(e); } StringWriter sw = new StringWriter(); PrintWriter pw = new PrintWriter(sw); ex.printStackTrace(pw); pw.flush(); canSnapshot = "FAILURE"; err_msg = "SNAPSHOT INITIATION OF " + saveFilePath + "RESULTED IN IOException: \n" + sw.toString(); } result.addRow(Integer.parseInt(context.getSite().getHost().getTypeName().replaceAll("[\\D]", "")), hostname, context.getHStoreSite().getSiteId(), context.getPartitionExecutor().getPartitionId(), table.getTypeName(), canSnapshot, err_msg); } synchronized (SnapshotSiteProcessor.m_taskListsForSites) { if (!partitionedSnapshotTasks.isEmpty() || !replicatedSnapshotTasks.isEmpty()) { // Used to sync across all partitions on all sites - set only once if(SnapshotSiteProcessor.ExecutionSitesCurrentlySnapshotting.get() == -1){ SnapshotSiteProcessor.ExecutionSitesCurrentlySnapshotting.set(numLocalPartitions); LOG.trace("ExecutionSitesCurrentlySnapshotting set :" + SnapshotSiteProcessor.ExecutionSitesCurrentlySnapshotting.get()); } for (int ii = 0; ii < numLocalPartitions; ii++) { SnapshotSiteProcessor.m_taskListsForSites.add(new ArrayDeque<SnapshotTableTask>()); } } else { SnapshotRegistry.discardSnapshot(snapshotRecord); } /** * Distribute the writing of replicated tables to exactly one partition. */ CatalogMap<Partition> partition_map = site.getPartitions(); Integer lowest_partition_id = Integer.MAX_VALUE, p_id; for (Partition pt : partition_map) { p_id = pt.getId(); lowest_partition_id = Math.min(p_id, lowest_partition_id); } assert (lowest_partition_id != Integer.MAX_VALUE); int partition_id = context.getPartitionExecutor().getPartitionId(); int index = partition_id - lowest_partition_id; // Each partition gets a partitioned task for (SnapshotTableTask t : partitionedSnapshotTasks) { SnapshotSiteProcessor.m_taskListsForSites.get(index).offer(t); } //for (int ii = 0; ii < numLocalSites && !partitionedSnapshotTasks.isEmpty(); ii++) { // SnapshotSiteProcessor.m_taskListsForSites.get(ii).addAll(partitionedSnapshotTasks); //} // Each partition gets a replicated task //int siteIndex = 0; for (SnapshotTableTask t : replicatedSnapshotTasks) { //SnapshotSiteProcessor.m_taskListsForSites.get(siteIndex++ % numLocalSites).offer(t); SnapshotSiteProcessor.m_taskListsForSites.get(index).offer(t); } } } catch (Exception ex) { /* * Close all the targets to release the threads. Don't let sites get any tasks. */ SnapshotSiteProcessor.m_taskListsForSites.clear(); for (SnapshotDataTarget sdt : targets) { try { sdt.close(); } catch (Exception e) { LOG.error(ex); } } StringWriter sw = new StringWriter(); PrintWriter pw = new PrintWriter(sw); ex.printStackTrace(pw); pw.flush(); result.addRow( context.getSite().getHost().getId(), hostname, context.getHStoreSite().getSiteId(), context.getPartitionExecutor().getPartitionId(), "", "FAILURE", "SNAPSHOT INITIATION OF " + file_path + file_nonce + "RESULTED IN Exception: \n" + sw.toString()); LOG.error(result); } finally { SnapshotSiteProcessor.m_snapshotPermits.release(numLocalPartitions); LOG.trace("Released "+ numLocalPartitions + " snapshot permits at partition "+ context.getPartitionExecutor().getPartitionId()); } } } private VoltTable acquireSnapshotPermit(SystemProcedureExecutionContext context, String hostname, final VoltTable result) { try { SnapshotSiteProcessor.m_snapshotPermits.acquire(); } catch (Exception e) { result.addRow(Integer.parseInt(context.getSite().getHost().getTypeName().replaceAll("[\\D]", "")), hostname, context.getHStoreSite().getSiteId(), context.getPartitionExecutor().getPartitionId(), "", "FAILURE", e.toString()); return result; } finally { /* * The last thread to acquire a snapshot permit has to be the one * to release the setup permit to ensure that a thread * doesn't come late and think it is supposed to do the setup work */ /* synchronized (SnapshotSiteProcessor.m_snapshotPermits) { if (SnapshotSiteProcessor.m_snapshotPermits.availablePermits() == 0 && SnapshotSiteProcessor.m_snapshotCreateSetupPermit.availablePermits() == 0) { SnapshotSiteProcessor.m_snapshotCreateSetupPermit.release(); } } */ } return null; } private final SnapshotDataTarget constructSnapshotDataTargetForTable( SystemProcedureExecutionContext context, File f, Table table, Host h, int numPartitions, long createTime) throws IOException { return new DefaultSnapshotDataTarget(f, Integer.parseInt(h.getTypeName().replaceAll("[\\D]", "")), context.getCluster().getTypeName(), context.getDatabase().getTypeName(), table.getTypeName(), numPartitions, table.getIsreplicated(), SnapshotUtil.getPartitionsOnHost(context, h), CatalogUtil.getVoltTable(table), createTime); } }