/* This file is part of VoltDB.
 * Copyright (C) 2008-2017 VoltDB Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with VoltDB.  If not, see <http://www.gnu.org/licenses/>.
 */

package org.voltdb;

import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.apache.zookeeper_voltpatches.CreateMode;
import org.apache.zookeeper_voltpatches.KeeperException.NoNodeException;
import org.apache.zookeeper_voltpatches.WatchedEvent;
import org.apache.zookeeper_voltpatches.Watcher;
import org.apache.zookeeper_voltpatches.ZooDefs.Ids;
import org.apache.zookeeper_voltpatches.ZooKeeper;
import org.json_voltpatches.JSONObject;
import org.voltcore.logging.VoltLogger;
import org.voltcore.utils.CoreUtils;
import org.voltcore.utils.Pair;
import org.voltdb.SnapshotCompletionInterest.SnapshotCompletionEvent;
import com.google_voltpatches.common.collect.ImmutableMap;
import org.voltdb.sysprocs.saverestore.SnapshotUtil;
import org.voltdb.sysprocs.saverestore.SnapshotPathType;

/**
 * Watches the {@code VoltZK.completed_snapshots} ZooKeeper node and notifies
 * registered {@link SnapshotCompletionInterest}s when a snapshot node reports
 * a {@code hostCount} of zero.
 *
 * <p>All ZooKeeper reads and all listener callbacks are executed on a single
 * dedicated executor thread ({@link #m_es}); ZooKeeper watcher callbacks only
 * hand work over to that executor.
 */
public class SnapshotCompletionMonitor {
    private static final VoltLogger SNAP_LOG = new VoltLogger("SNAPSHOT");

    /** Listeners to notify on snapshot completion. Copy-on-write so it can be iterated while being modified. */
    final CopyOnWriteArrayList<SnapshotCompletionInterest> m_interests =
            new CopyOnWriteArrayList<SnapshotCompletionInterest>();

    /** Assigned by the task submitted from {@link #init(ZooKeeper)}; only read from tasks running on {@link #m_es}. */
    private ZooKeeper m_zk;

    /**
     * Single-threaded executor (core = max = 1) with an unbounded queue.
     * Rejected tasks (e.g. those submitted after {@link #shutdown()}) are
     * silently dropped by the {@code DiscardPolicy}.
     */
    private final ExecutorService m_es = new ThreadPoolExecutor(1, 1,
            0L, TimeUnit.MILLISECONDS,
            new LinkedBlockingQueue<Runnable>(),
            CoreUtils.getThreadFactory(null, "SnapshotCompletionMonitor", CoreUtils.SMALL_STACK_SIZE, false, null),
            new java.util.concurrent.ThreadPoolExecutor.DiscardPolicy());

    /**
     * Watcher on the children of {@code VoltZK.completed_snapshots}. Child-list
     * changes are forwarded to the executor; every other event type is ignored.
     */
    private final Watcher m_newSnapshotWatcher = new Watcher() {
        @Override
        public void process(final WatchedEvent event) {
            switch (event.getType()) {
            case NodeChildrenChanged:
                m_es.execute(new Runnable() {
                    @Override
                    public void run() {
                        processSnapshotChildrenChanged(event);
                    }
                });
                // Explicit break: the original fell through into default, which
                // was harmless only because default does nothing.
                break;
            default:
                break;
            }
        }
    };

    /*
     * For every snapshot, the local sites will log their partition specific txnids here
     * and when the snapshot completes the completion monitor will grab the list of partition specific
     * txnids to pass to those who are interested.
     *
     * NOTE(review): entries are never removed from this map in this class, so it
     * grows by one entry per registered snapshot -- confirm whether that is intended.
     */
    private final HashMap<Long, Map<Integer, Long>> m_snapshotTxnIdsToPartitionTxnIds =
            new HashMap<Long, Map<Integer, Long>>();

    /**
     * Records the per-partition transaction ids associated with a snapshot so
     * they can be handed to listeners once that snapshot completes.
     *
     * @param snapshotTxnId   transaction id identifying the snapshot; asserted
     *                        not to have been registered before
     * @param partitionTxnIds map of partition id to partition transaction id;
     *                        stored by reference (copied when the completion
     *                        event is built)
     */
    public void registerPartitionTxnIdsForSnapshot(long snapshotTxnId, Map<Integer, Long> partitionTxnIds) {
        SNAP_LOG.debug("Registering per partition txnids " + partitionTxnIds);
        synchronized (m_snapshotTxnIdsToPartitionTxnIds) {
            assert(!m_snapshotTxnIdsToPartitionTxnIds.containsKey(snapshotTxnId));
            m_snapshotTxnIdsToPartitionTxnIds.put(snapshotTxnId, partitionTxnIds);
        }
    }

    /** Child names of {@code VoltZK.completed_snapshots} seen at the last listing; only touched on {@link #m_es}. */
    private TreeSet<String> m_lastKnownSnapshots = new TreeSet<String>();

    /**
     * Builds a watcher for a single snapshot node that re-reads the node on
     * the executor thread whenever its data changes. A fresh instance is
     * returned on every call.
     */
    private Watcher newSnapshotDataWatcher() {
        return new Watcher() {
            @Override
            public void process(final WatchedEvent event) {
                switch (event.getType()) {
                case NodeDataChanged:
                    m_es.execute(new Runnable() {
                        @Override
                        public void run() {
                            processSnapshotDataChangedEvent(event);
                        }
                    });
                    break;
                default:
                    break;
                }
            }
        };
    }

    /**
     * Reads the snapshot node at {@code path}, leaving a data watcher on it,
     * and processes its contents. A node that no longer exists is skipped.
     *
     * @param path absolute ZooKeeper path of the snapshot node
     * @throws Exception on any failure other than the node being absent
     */
    private void fetchAndProcessSnapshot(String path) throws Exception {
        try {
            byte data[] = m_zk.getData(path, newSnapshotDataWatcher(), null);
            processSnapshotData(data);
        } catch (NoNodeException ignored) {
            // The node disappeared between being listed/notified and being
            // read; there is nothing left to process for it.
        }
    }

    /**
     * Re-lists the completed-snapshots node (re-registering the child watcher
     * in the same call) and processes every child that was not present in the
     * previous listing. Any unexpected failure crashes the local node.
     *
     * @param event the triggering event; not inspected
     */
    private void processSnapshotChildrenChanged(final WatchedEvent event) {
        try {
            TreeSet<String> children = new TreeSet<String>(
                    m_zk.getChildren(VoltZK.completed_snapshots, m_newSnapshotWatcher));
            TreeSet<String> newChildren = new TreeSet<String>(children);
            newChildren.removeAll(m_lastKnownSnapshots);
            m_lastKnownSnapshots = children;
            for (String newSnapshot : newChildren) {
                fetchAndProcessSnapshot(VoltZK.completed_snapshots + "/" + newSnapshot);
            }
        } catch (Exception e) {
            VoltDB.crashLocalVoltDB("Exception in snapshot completion monitor", true, e);
        }
    }

    /**
     * Re-reads (and re-watches) the snapshot node named by the event and
     * processes its current contents. Any unexpected failure crashes the
     * local node.
     *
     * @param event data-changed event whose path identifies the snapshot node
     */
    private void processSnapshotDataChangedEvent(final WatchedEvent event) {
        try {
            fetchAndProcessSnapshot(event.getPath());
        } catch (Exception e) {
            VoltDB.crashLocalVoltDB("Exception in snapshot completion monitor", true, e);
        }
    }

    /**
     * Convert the JSON object containing the export sequence numbers for each
     * table and partition to a regular (immutable) map of
     * table name -> partition id -> (ackOffset, sequenceNumber).
     */
    private static Map<String, Map<Integer, Pair<Long, Long>>> parseExportSequenceNumbers(
            JSONObject exportSequenceJSON) throws Exception {
        final ImmutableMap.Builder<String, Map<Integer, Pair<Long, Long>>> builder =
                ImmutableMap.builder();
        @SuppressWarnings("unchecked")
        final Iterator<String> tableKeys = exportSequenceJSON.keys();
        while (tableKeys.hasNext()) {
            final String tableName = tableKeys.next();
            final JSONObject tableSequenceNumbers = exportSequenceJSON.getJSONObject(tableName);
            final ImmutableMap.Builder<Integer, Pair<Long, Long>> tableBuilder =
                    ImmutableMap.builder();
            @SuppressWarnings("unchecked")
            final Iterator<String> partitionKeys = tableSequenceNumbers.keys();
            while (partitionKeys.hasNext()) {
                final String partitionString = partitionKeys.next();
                final Integer partitionId = Integer.valueOf(partitionString);
                final JSONObject sequenceNumbers = tableSequenceNumbers.getJSONObject(partitionString);
                final Long ackOffset = sequenceNumbers.getLong("ackOffset");
                final Long sequenceNumber = sequenceNumbers.getLong("sequenceNumber");
                tableBuilder.put(partitionId, Pair.of(ackOffset, sequenceNumber));
            }
            builder.put(tableName, tableBuilder.build());
        }
        return builder.build();
    }

    /**
     * Collect all the last seen ids from the remote data centers so they can
     * be used by live rejoin to initialize a starting state for applying DR
     * data. Keys of the returned map are consumer partition ids.
     */
    private static Map<Integer, Map<Integer, Map<Integer, DRConsumerDrIdTracker>>> parseDrConsumerState(
            JSONObject consumerPartitions) throws Exception {
        Map<Integer, Map<Integer, Map<Integer, DRConsumerDrIdTracker>>> consumerState = new HashMap<>();
        @SuppressWarnings("unchecked")
        Iterator<String> cpKeys = consumerPartitions.keys();
        while (cpKeys.hasNext()) {
            final String consumerPartitionIdStr = cpKeys.next();
            final Integer consumerPartitionId = Integer.valueOf(consumerPartitionIdStr);
            JSONObject siteInfo = consumerPartitions.getJSONObject(consumerPartitionIdStr);
            consumerState.put(consumerPartitionId,
                    ExtensibleSnapshotDigestData.buildConsumerSiteDrIdTrackersFromJSON(siteInfo));
        }
        return consumerState;
    }

    /**
     * Parses the JSON payload of a snapshot node and, if its
     * {@code hostCount} is zero, notifies every registered interest with a
     * {@link SnapshotCompletionEvent}.
     *
     * <p>The mandatory header fields are read before the {@code hostCount}
     * check, so a payload missing any of them fails even when the snapshot is
     * not yet complete. Exceptions thrown by an individual interest are
     * logged and do not prevent the remaining interests from being notified.
     *
     * @param data raw node contents (UTF-8 encoded JSON); {@code null} is ignored
     * @throws Exception if the payload cannot be parsed
     */
    private void processSnapshotData(byte data[]) throws Exception {
        if (data == null) {
            return;
        }
        JSONObject jsonObj = new JSONObject(new String(data, java.nio.charset.StandardCharsets.UTF_8));
        long txnId = jsonObj.getLong("txnId");
        int hostCount = jsonObj.getInt("hostCount");
        String path = jsonObj.getString(SnapshotUtil.JSON_PATH);
        SnapshotPathType stype = SnapshotPathType.valueOf(jsonObj.getString(SnapshotUtil.JSON_PATH_TYPE));
        String nonce = jsonObj.getString(SnapshotUtil.JSON_NONCE);
        boolean truncation = jsonObj.getBoolean("isTruncation");
        boolean didSucceed = jsonObj.getBoolean("didSucceed");
        // A truncation request ID is not always provided. It's used for
        // snapshots triggered indirectly via ZooKeeper so that the
        // triggerer can recognize the snapshot when it finishes.
        String truncReqId = jsonObj.optString("truncReqId");

        if (hostCount != 0) {
            // Nothing to announce until the node reports a host count of zero.
            return;
        }

        final Map<String, Map<Integer, Pair<Long, Long>>> exportSequenceNumbers =
                parseExportSequenceNumbers(jsonObj.getJSONObject("exportSequenceNumbers"));

        long clusterCreateTime = jsonObj.optLong("clusterCreateTime", -1);

        // Per-partition DR sequence numbers. drVersion is overwritten for each
        // partition, so the value from the last key iterated is the one reported.
        Map<Integer, Long> drSequenceNumbers = new HashMap<>();
        JSONObject drTupleStreamJSON = jsonObj.getJSONObject("drTupleStreamStateInfo");
        @SuppressWarnings("unchecked")
        Iterator<String> drPartitionKeys = drTupleStreamJSON.keys();
        int drVersion = 0;
        while (drPartitionKeys.hasNext()) {
            String partitionIdString = drPartitionKeys.next();
            JSONObject stateInfo = drTupleStreamJSON.getJSONObject(partitionIdString);
            drVersion = (int)stateInfo.getLong("drVersion");
            drSequenceNumbers.put(Integer.valueOf(partitionIdString), stateInfo.getLong("sequenceNumber"));
        }

        // Snapshot the locally registered per-partition txnids (empty if none
        // were registered for this snapshot).
        Map<Integer, Long> partitionTxnIdsMap = ImmutableMap.of();
        synchronized (m_snapshotTxnIdsToPartitionTxnIds) {
            Map<Integer, Long> partitionTxnIdsList = m_snapshotTxnIdsToPartitionTxnIds.get(txnId);
            if (partitionTxnIdsList != null) {
                partitionTxnIdsMap = ImmutableMap.copyOf(partitionTxnIdsList);
            }
        }

        final Map<Integer, Map<Integer, Map<Integer, DRConsumerDrIdTracker>>> drMixedClusterSizeConsumerState =
                parseDrConsumerState(jsonObj.getJSONObject("drMixedClusterSizeConsumerState"));

        final Map<Integer, Long> readOnlyDrSequenceNumbers =
                Collections.unmodifiableMap(drSequenceNumbers);
        final Map<Integer, Map<Integer, Map<Integer, DRConsumerDrIdTracker>>> readOnlyConsumerState =
                Collections.unmodifiableMap(drMixedClusterSizeConsumerState);

        for (SnapshotCompletionInterest interest : m_interests) {
            try {
                // A separate event instance is built for every interest, and
                // construction stays inside the try so a failure here is
                // logged rather than propagated.
                interest.snapshotCompleted(
                        new SnapshotCompletionEvent(
                                path,
                                stype,
                                nonce,
                                txnId,
                                partitionTxnIdsMap,
                                truncation,
                                didSucceed,
                                truncReqId,
                                exportSequenceNumbers,
                                readOnlyDrSequenceNumbers,
                                readOnlyConsumerState,
                                drVersion,
                                clusterCreateTime));
            } catch (Exception e) {
                SNAP_LOG.warn("Exception while executing snapshot completion interest", e);
            }
        }
    }

    /**
     * Registers a listener to be notified of completed snapshots.
     *
     * @param interest listener to add
     */
    public void addInterest(final SnapshotCompletionInterest interest) {
        m_interests.add(interest);
    }

    /**
     * Unregisters a previously added listener.
     *
     * @param interest listener to remove
     */
    public void removeInterest(final SnapshotCompletionInterest interest) {
        m_interests.remove(interest);
    }

    /**
     * Stops accepting new work and blocks until all queued tasks have finished.
     *
     * @throws InterruptedException if interrupted while waiting for termination
     */
    public void shutdown() throws InterruptedException {
        m_es.shutdown();
        m_es.awaitTermination(Long.MAX_VALUE, TimeUnit.DAYS);
    }

    /**
     * Asynchronously (on the monitor's executor thread) stores the ZooKeeper
     * handle, makes sure the completed-snapshots node exists, records its
     * current children as already known, and installs the child watcher.
     *
     * @param zk ZooKeeper client to use for all subsequent reads
     */
    public void init(final ZooKeeper zk) {
        m_es.execute(new Runnable() {
            @Override
            public void run() {
                m_zk = zk;
                try {
                    m_zk.create(VoltZK.completed_snapshots, null, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
                } catch (Exception ignored) {
                    // Best effort: creation failures (presumably most often
                    // because the node already exists) are deliberately
                    // ignored. If the node is truly unavailable, the
                    // getChildren call below fails and crashes the local node.
                }
                try {
                    m_lastKnownSnapshots =
                            new TreeSet<String>(m_zk.getChildren(VoltZK.completed_snapshots, m_newSnapshotWatcher));
                } catch (Exception e) {
                    VoltDB.crashLocalVoltDB("Error initializing snapshot completion monitor", true, e);
                }
            }
        });
    }
}