/* This file is part of VoltDB.
 * Copyright (C) 2008-2017 VoltDB Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with VoltDB. If not, see <http://www.gnu.org/licenses/>.
 */

package org.voltdb.sysprocs.saverestore;

import java.io.File;
import java.io.IOException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.atomic.AtomicInteger;

import org.json_voltpatches.JSONObject;
import org.voltcore.logging.VoltLogger;
import org.voltcore.utils.CoreUtils;
import org.voltdb.CSVSnapshotFilter;
import org.voltdb.ExtensibleSnapshotDigestData;
import org.voltdb.SimpleFileSnapshotDataTarget;
import org.voltdb.SnapshotDataFilter;
import org.voltdb.SnapshotDataTarget;
import org.voltdb.SnapshotFormat;
import org.voltdb.SnapshotSiteProcessor;
import org.voltdb.SnapshotTableTask;
import org.voltdb.SystemProcedureExecutionContext;
import org.voltdb.VoltTable;
import org.voltdb.catalog.Table;
import org.voltdb.dtxn.SiteTracker;
import org.voltdb.sysprocs.SnapshotRegistry;
import org.voltdb.utils.CatalogUtil;

import com.google_voltpatches.common.primitives.Ints;
import com.google_voltpatches.common.primitives.Longs;

/**
 * Create a snapshot write plan for a CSV snapshot. This will attempt to
 * write every table only once across the entire cluster. Replicated tables
 * are written only at the 'first host', which is the lowest host ID
 * currently in the cluster; at that host the responsibility for writing
 * them is round-robined across all the sites on that node. Partitioned
 * tables are written by only one of the replicas of each partition, chosen
 * by a random selection that is seeded such that each node in the cluster
 * will reach the same conclusion about whether or not it is writing a given
 * partition. On each node, every selected site writes a given partitioned
 * table to the same per-table target.
 */
public class CSVSnapshotWritePlan extends SnapshotWritePlan {
    static final VoltLogger SNAP_LOG = new VoltLogger("SNAPSHOT");

    @Override
    public Callable<Boolean> createSetup(
            String file_path, String pathType, String file_nonce, long txnId,
            Map<Integer, Long> partitionTransactionIds,
            JSONObject jsData, SystemProcedureExecutionContext context,
            final VoltTable result,
            ExtensibleSnapshotDigestData extraSnapshotData,
            SiteTracker tracker,
            HashinatorSnapshotData hashinatorData,
            long timestamp) {
        assert(SnapshotSiteProcessor.ExecutionSitesCurrentlySnapshotting.isEmpty());

        /*
         * List of partitions to include if this snapshot is going to be
         * deduped. Attempts to break up the work by seeding an RNG and
         * selecting a random replica to do the work. Will not work in
         * failure cases, but we don't use dedupe when we want durability.
         */
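        // Note: because every host derives the RNG seed from the same
        // (txnId, partition id) hash and sorts each partition's replica list
        // identically, computeDedupedLocalSites() reaches the same writer
        // choice on every host with no cross-host coordination.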
        List<Long> sitesToInclude = CSVSnapshotWritePlan.computeDedupedLocalSites(txnId, tracker);

        // If there's no work to do on this host, just claim success and get out
        if (sitesToInclude.isEmpty() && !tracker.isFirstHost()) {
            return null;
        }

        final SnapshotRequestConfig config = new SnapshotRequestConfig(jsData, context.getDatabase());
        final AtomicInteger numTables = new AtomicInteger(config.tables.length);
        final SnapshotRegistry.Snapshot snapshotRecord =
            SnapshotRegistry.startSnapshot(
                    txnId,
                    context.getHostId(),
                    file_path,
                    file_nonce,
                    SnapshotFormat.CSV,
                    config.tables);

        boolean noTargetsCreated = true;
        final ArrayList<SnapshotTableTask> partitionedSnapshotTasks =
            new ArrayList<SnapshotTableTask>();
        final ArrayList<SnapshotTableTask> replicatedSnapshotTasks =
            new ArrayList<SnapshotTableTask>();
        for (final Table table : config.tables) {
            /*
             * For a deduped csv snapshot, only produce the replicated tables on
             * the "leader" host.
             */
            if (table.getIsreplicated() && !tracker.isFirstHost()) {
                snapshotRecord.removeTable(table.getTypeName());
                // We'll expect one less table in the global table count
                // in order to be done, too (ENG-4802)
                numTables.decrementAndGet();
                continue;
            }

            List<SnapshotDataFilter> filters = new ArrayList<SnapshotDataFilter>();
            filters.add(new CSVSnapshotFilter(CatalogUtil.getVoltTable(table), ',', null));

            final SnapshotTableTask task =
                new SnapshotTableTask(
                        table,
                        filters.toArray(new SnapshotDataFilter[filters.size()]),
                        null,
                        false);

            if (table.getIsreplicated()) {
                replicatedSnapshotTasks.add(task);
            } else {
                partitionedSnapshotTasks.add(task);
            }

            noTargetsCreated = false;
            result.addRow(context.getHostId(),
                    CoreUtils.getHostnameOrAddress(),
                    table.getTypeName(),
                    "SUCCESS",
                    "");
        }

        if (noTargetsCreated) {
            SnapshotRegistry.discardSnapshot(snapshotRecord);
        }

        // CSV snapshots do the partitioned work only on the specified sites for
        // de-duping, but since we've pre-filtered the replicated task list to
        // only contain entries on one node, we can go ahead and distribute them
        // across all of the sites on that node.
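        // Note: per the class doc, placeReplicatedTasks round-robins the
        // replicated tables across all of this host's sites, while the
        // partitioned tasks go only to the deduped writer sites computed above.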
        placePartitionedTasks(partitionedSnapshotTasks, sitesToInclude);
        placeReplicatedTasks(replicatedSnapshotTasks, tracker.getSitesForHost(context.getHostId()));

        // All IO work will be deferred and run on the dedicated snapshot IO thread
        return createDeferredSetup(file_path, pathType, file_nonce, config.tables, txnId,
                partitionTransactionIds, context, extraSnapshotData,
                timestamp, numTables, snapshotRecord,
                partitionedSnapshotTasks, replicatedSnapshotTasks);
    }

    private Callable<Boolean> createDeferredSetup(final String file_path,
                                                  final String pathType,
                                                  final String file_nonce,
                                                  final Table[] tables,
                                                  final long txnId,
                                                  final Map<Integer, Long> partitionTransactionIds,
                                                  final SystemProcedureExecutionContext context,
                                                  final ExtensibleSnapshotDigestData extraSnapshotData,
                                                  final long timestamp,
                                                  final AtomicInteger numTables,
                                                  final SnapshotRegistry.Snapshot snapshotRecord,
                                                  final ArrayList<SnapshotTableTask> partitionedSnapshotTasks,
                                                  final ArrayList<SnapshotTableTask> replicatedSnapshotTasks) {
        return new Callable<Boolean>() {
            @Override
            public Boolean call() throws Exception {
                NativeSnapshotWritePlan.createFileBasedCompletionTasks(file_path, pathType,
                        file_nonce, txnId, partitionTransactionIds, context, extraSnapshotData,
                        null, timestamp, context.getNumberOfPartitions(), tables);

                for (SnapshotTableTask task : replicatedSnapshotTasks) {
                    final SnapshotDataTarget target = createDataTargetForTable(file_path, file_nonce,
                            context.getHostId(), numTables, snapshotRecord, task.m_table);
                    task.setTarget(target);
                }

                for (SnapshotTableTask task : partitionedSnapshotTasks) {
                    final SnapshotDataTarget target = createDataTargetForTable(file_path, file_nonce,
                            context.getHostId(), numTables, snapshotRecord, task.m_table);
                    task.setTarget(target);
                }

                return true;
            }
        };
    }

    private SnapshotDataTarget createDataTargetForTable(String file_path,
                                                        String file_nonce,
                                                        int hostId,
                                                        AtomicInteger numTables,
                                                        SnapshotRegistry.Snapshot snapshotRecord,
                                                        Table table)
        throws IOException {
        SnapshotDataTarget sdt;
        File saveFilePath = SnapshotUtil.constructFileForTable(
                table,
                file_path,
                file_nonce,
                SnapshotFormat.CSV,
                hostId);

        sdt = new SimpleFileSnapshotDataTarget(saveFilePath, !table.getIsreplicated());

        m_targets.add(sdt);
        final Runnable onClose = new TargetStatsClosure(sdt, table.getTypeName(),
                numTables, snapshotRecord);
        sdt.setOnCloseHandler(onClose);

        return sdt;
    }

    private static List<Long> computeDedupedLocalSites(long txnId, SiteTracker tracker) {
        MessageDigest digest;
        try {
            digest = MessageDigest.getInstance("SHA-1");
        } catch (NoSuchAlgorithmException e) {
            throw new AssertionError(e);
        }

        /*
         * List of partitions to include if this snapshot is going to be
         * deduped. Attempts to break up the work by seeding an RNG and
         * selecting a random replica to do the work. Will not work in
         * failure cases, but we don't use dedupe when we want durability.
         *
         * Originally used the partition id as the seed, but it turns out
         * that nextInt(2) returns a 1 for seeds 0-4095. Now use SHA-1
         * on the txnId + partition id.
         */
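        // Note: each loop iteration below hashes txnId followed by the
        // partition id (digest() resets the digest state afterwards), takes
        // the first 8 bytes of the SHA-1 output as a Random seed, and uses it
        // to pick one writer index into the sorted replica list.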
        List<Long> sitesToInclude = new ArrayList<Long>();
        for (long localSite : tracker.getLocalSites()) {
            final int partitionId = tracker.getPartitionForSite(localSite);
            List<Long> sites = new ArrayList<Long>(tracker.getSitesForPartition(partitionId));
            Collections.sort(sites);

            digest.update(Longs.toByteArray(txnId));
            final long seed = Longs.fromByteArray(Arrays.copyOf(
                        digest.digest(Ints.toByteArray(partitionId)), 8));

            int siteIndex = new java.util.Random(seed).nextInt(sites.size());
            if (localSite == sites.get(siteIndex)) {
                sitesToInclude.add(localSite);
            }
        }

        if (sitesToInclude.isEmpty()) {
            SNAP_LOG.info("This host was not selected to write CSV data for any partition");
        }

        return sitesToInclude;
    }
}
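/*
 * Minimal standalone sketch of the writer-election step above, for reference
 * (illustrative only; "replicaSites" and "mySiteId" are hypothetical
 * stand-ins, not VoltDB APIs). Every host evaluates this with identical
 * inputs, so all hosts agree on the single writer for each partition:
 *
 *     List<Long> replicaSites = ...; // same sorted replica list on every host
 *     MessageDigest d = MessageDigest.getInstance("SHA-1");
 *     d.update(Longs.toByteArray(txnId));
 *     long seed = Longs.fromByteArray(
 *             Arrays.copyOf(d.digest(Ints.toByteArray(partitionId)), 8));
 *     int writerIndex = new java.util.Random(seed).nextInt(replicaSites.size());
 *     boolean isWriter = (mySiteId == replicaSites.get(writerIndex));
 */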