/* This file is part of VoltDB.
* Copyright (C) 2008-2017 VoltDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with VoltDB. If not, see <http://www.gnu.org/licenses/>.
*/
package org.voltdb;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import org.json_voltpatches.JSONException;
import org.json_voltpatches.JSONObject;
import org.json_voltpatches.JSONStringer;
import org.voltcore.logging.VoltLogger;
import org.voltcore.utils.Pair;
import org.voltdb.iv2.MpInitiator;
import org.voltdb.sysprocs.saverestore.SnapshotUtil;

public class ExtensibleSnapshotDigestData {
    /**
     * This field holds the same values as m_exportSequenceNumbers once they have been extracted
     * in SnapshotSaveAPI.createSetup and then passed back in to SSS.initiateSnapshots. The only
     * odd thing is that setting up a snapshot can fail, in which case values will already have
     * been populated into m_exportSequenceNumbers and are kept until the next snapshot is
     * started, at which point they are repopulated.
     * Decoupling the two seems like a good idea in case the snapshot code is ever reorganized.
     */
private final Map<String, Map<Integer, Pair<Long, Long>>> m_exportSequenceNumbers;
    /**
     * Same story as m_exportSequenceNumbers above, but for the DR tuple stream state.
     */
private final Map<Integer, TupleStreamStateInfo> m_drTupleStreamInfo;
    /**
     * Used to pass the last seen unique ids from remote datacenters into the snapshot
     * termination path so that it can publish them to ZK, where they are extracted by
     * rejoining nodes.
     */
private final Map<Integer, JSONObject> m_drMixedClusterSizeConsumerState;
    /**
     * Denotes whether this snapshot was created by "shutdown --save":
     * 0 means no; any other value means yes.
     */
private long m_terminus;
public ExtensibleSnapshotDigestData(
Map<String, Map<Integer, Pair<Long, Long>>> exportSequenceNumbers,
Map<Integer, TupleStreamStateInfo> drTupleStreamInfo,
Map<Integer, JSONObject> drMixedClusterSizeConsumerState,
final JSONObject jsData) {
m_exportSequenceNumbers = exportSequenceNumbers;
m_drTupleStreamInfo = drTupleStreamInfo;
m_drMixedClusterSizeConsumerState = drMixedClusterSizeConsumerState;
m_terminus = jsData != null ? jsData.optLong(SnapshotUtil.JSON_TERMINUS, 0L) : 0L;
}
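
    /*
     * Illustrative usage sketch (hypothetical variable names; the real caller lives in
     * the snapshot save path, see SnapshotSaveAPI.createSetup): construct this object
     * with the export positions and DR stream state gathered at snapshot setup, then
     * hand it to the digest writer and the ZooKeeper completion handler.
     *
     *   ExtensibleSnapshotDigestData digestData = new ExtensibleSnapshotDigestData(
     *           exportSequenceNumbers,           // table name -> partition -> (ackOffset, seqNo)
     *           drTupleStreamInfo,               // partition -> DR tuple stream state
     *           drMixedClusterSizeConsumerState, // consumer partition -> tracker JSON
     *           jsData);                         // may be null; carries the terminus
     *   digestData.writeToSnapshotDigest(stringer); // into the snapshot digest
     *   digestData.mergeToZooKeeper(zkJson, log);   // into the shared ZK completion node
     */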
private void writeExportSequenceNumbersToSnapshot(JSONStringer stringer) throws IOException {
try {
stringer.key("exportSequenceNumbers").array();
for (Map.Entry<String, Map<Integer, Pair<Long, Long>>> entry : m_exportSequenceNumbers.entrySet()) {
stringer.object();
stringer.keySymbolValuePair("exportTableName", entry.getKey());
stringer.key("sequenceNumberPerPartition").array();
for (Map.Entry<Integer, Pair<Long,Long>> sequenceNumber : entry.getValue().entrySet()) {
stringer.object();
stringer.keySymbolValuePair("partition", sequenceNumber.getKey());
//First value is the ack offset which matters for pauseless rejoin, but not persistence
stringer.keySymbolValuePair("exportSequenceNumber", sequenceNumber.getValue().getSecond());
stringer.endObject();
}
stringer.endArray();
stringer.endObject();
}
stringer.endArray();
} catch (JSONException e) {
throw new IOException(e);
}
}
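
    /*
     * For reference, the digest fragment emitted above has this shape (values are
     * illustrative only):
     *
     *   "exportSequenceNumbers": [
     *     { "exportTableName": "ORDERS_STREAM",
     *       "sequenceNumberPerPartition": [
     *         { "partition": 0, "exportSequenceNumber": 1024 },
     *         { "partition": 1, "exportSequenceNumber": 987 }
     *       ]
     *     }
     *   ]
     */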
    /*
     * When recording snapshot completion in ZooKeeper we also record export
     * sequence numbers as JSON. We need to merge our sequence numbers with any
     * existing ones, since multiple replicas will submit sequence numbers for
     * the same partitions.
     */
private void mergeExportSequenceNumbersToZK(JSONObject jsonObj, VoltLogger log) throws JSONException {
JSONObject tableSequenceMap;
if (jsonObj.has("exportSequenceNumbers")) {
tableSequenceMap = jsonObj.getJSONObject("exportSequenceNumbers");
} else {
tableSequenceMap = new JSONObject();
jsonObj.put("exportSequenceNumbers", tableSequenceMap);
}
for (Map.Entry<String, Map<Integer, Pair<Long, Long>>> tableEntry : m_exportSequenceNumbers.entrySet()) {
JSONObject sequenceNumbers;
final String tableName = tableEntry.getKey();
if (tableSequenceMap.has(tableName)) {
sequenceNumbers = tableSequenceMap.getJSONObject(tableName);
} else {
sequenceNumbers = new JSONObject();
tableSequenceMap.put(tableName, sequenceNumbers);
}
for (Map.Entry<Integer, Pair<Long, Long>> partitionEntry : tableEntry.getValue().entrySet()) {
final Integer partitionId = partitionEntry.getKey();
final String partitionIdString = partitionId.toString();
final Long ackOffset = partitionEntry.getValue().getFirst();
final Long partitionSequenceNumber = partitionEntry.getValue().getSecond();
                /*
                 * Check that the sequence number is the same everywhere and log if it isn't.
                 * We don't crash on a mismatch because of the risk of poison pill transactions.
                 */
if (sequenceNumbers.has(partitionIdString)) {
JSONObject existingEntry = sequenceNumbers.getJSONObject(partitionIdString);
Long existingSequenceNumber = existingEntry.getLong("sequenceNumber");
if (!existingSequenceNumber.equals(partitionSequenceNumber)) {
log.debug("Found a mismatch in export sequence numbers of export table " + tableName +
" while recording snapshot metadata for partition " + partitionId +
". This is expected only on replicated, write-to-file export streams (remote node reported " +
existingSequenceNumber + " and the local node reported " + partitionSequenceNumber + ")");
}
                    // Keep the larger of the two reported values for both fields
                    existingEntry.put("sequenceNumber",
                            Math.max(existingSequenceNumber, partitionSequenceNumber));
                    Long existingAckOffset = existingEntry.getLong("ackOffset");
                    existingEntry.put("ackOffset", Math.max(ackOffset, existingAckOffset));
} else {
JSONObject newObj = new JSONObject();
newObj.put("sequenceNumber", partitionSequenceNumber);
newObj.put("ackOffset", ackOffset);
sequenceNumbers.put(partitionIdString, newObj);
}
}
}
}
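
    /*
     * After the merge, the ZK completion node holds one object per table, keyed by
     * partition id (illustrative values):
     *
     *   "exportSequenceNumbers": {
     *     "ORDERS_STREAM": {
     *       "0": { "sequenceNumber": 1024, "ackOffset": 512 }
     *     }
     *   }
     */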
private void mergeTerminusToZK(JSONObject jsonObj) throws JSONException {
long jsTerminus = jsonObj.optLong(SnapshotUtil.JSON_TERMINUS, 0L);
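        // e.g. a replica that did not run "shutdown --save" carries m_terminus == 0, so a
        // nonzero terminus already published by a sibling replica wins the max below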
m_terminus = Math.max(jsTerminus, m_terminus);
jsonObj.put(SnapshotUtil.JSON_TERMINUS, m_terminus);
}
private void writeDRTupleStreamInfoToSnapshot(JSONStringer stringer) throws IOException {
try {
stringer.key("drTupleStreamStateInfo");
stringer.object();
for (Map.Entry<Integer, TupleStreamStateInfo> e : m_drTupleStreamInfo.entrySet()) {
stringer.key(e.getKey().toString());
stringer.object();
if (e.getKey() != MpInitiator.MP_INIT_PID) {
stringer.keySymbolValuePair("sequenceNumber", e.getValue().partitionInfo.drId);
stringer.keySymbolValuePair("spUniqueId", e.getValue().partitionInfo.spUniqueId);
stringer.keySymbolValuePair("mpUniqueId", e.getValue().partitionInfo.mpUniqueId);
} else {
stringer.keySymbolValuePair("sequenceNumber", e.getValue().replicatedInfo.drId);
stringer.keySymbolValuePair("spUniqueId", e.getValue().replicatedInfo.spUniqueId);
stringer.keySymbolValuePair("mpUniqueId", e.getValue().replicatedInfo.mpUniqueId);
}
stringer.endObject();
}
stringer.endObject();
} catch (JSONException e) {
throw new IOException(e);
}
}
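
    /*
     * The digest fragment emitted above is keyed by partition id, with the entry for
     * MpInitiator.MP_INIT_PID carrying the replicated stream's state:
     *
     *   "drTupleStreamStateInfo": {
     *     "0":             { "sequenceNumber": ..., "spUniqueId": ..., "mpUniqueId": ... },
     *     "<MP_INIT_PID>": { "sequenceNumber": ..., "spUniqueId": ..., "mpUniqueId": ... }
     *   }
     */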
private void mergeDRTupleStreamInfoToZK(JSONObject jsonObj, VoltLogger log) throws JSONException {
JSONObject stateInfoMap;
// clusterCreateTime should be same across the cluster
long clusterCreateTime = VoltDB.instance().getClusterCreateTime();
assert (!jsonObj.has("clusterCreateTime") || (clusterCreateTime == jsonObj.getLong("clusterCreateTime")));
jsonObj.put("clusterCreateTime", clusterCreateTime);
if (jsonObj.has("drTupleStreamStateInfo")) {
stateInfoMap = jsonObj.getJSONObject("drTupleStreamStateInfo");
} else {
stateInfoMap = new JSONObject();
jsonObj.put("drTupleStreamStateInfo", stateInfoMap);
}
for (Map.Entry<Integer, TupleStreamStateInfo> e : m_drTupleStreamInfo.entrySet()) {
final String partitionId = e.getKey().toString();
DRLogSegmentId partitionStateInfo;
if (e.getKey() != MpInitiator.MP_INIT_PID) {
partitionStateInfo = e.getValue().partitionInfo;
} else {
partitionStateInfo = e.getValue().replicatedInfo;
}
JSONObject existingStateInfo = stateInfoMap.optJSONObject(partitionId);
boolean addEntry = false;
if (existingStateInfo == null) {
addEntry = true;
}
else if (partitionStateInfo.drId != existingStateInfo.getLong("sequenceNumber")) {
if (partitionStateInfo.drId > existingStateInfo.getLong("sequenceNumber")) {
addEntry = true;
}
log.debug("Found a mismatch in dr sequence numbers for partition " + partitionId +
" the DRId should be the same at all replicas, but one node had " +
DRLogSegmentId.getDebugStringFromDRId(existingStateInfo.getLong("sequenceNumber")) +
" and the local node reported " + DRLogSegmentId.getDebugStringFromDRId(partitionStateInfo.drId));
}
if (addEntry) {
JSONObject stateInfo = new JSONObject();
stateInfo.put("sequenceNumber", partitionStateInfo.drId);
stateInfo.put("spUniqueId", partitionStateInfo.spUniqueId);
stateInfo.put("mpUniqueId", partitionStateInfo.mpUniqueId);
stateInfo.put("drVersion", e.getValue().drVersion);
stateInfoMap.put(partitionId, stateInfo);
}
}
}
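
    /*
     * Unlike the digest fragment above, each per-partition entry merged into ZK also
     * records "drVersion":
     *
     *   "0": { "sequenceNumber": ..., "spUniqueId": ..., "mpUniqueId": ..., "drVersion": ... }
     */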
    public static JSONObject serializeSiteConsumerDrIdTrackersToJSON(Map<Integer, Map<Integer, DRConsumerDrIdTracker>> drMixedClusterSizeConsumerState)
            throws JSONException {
JSONObject clusters = new JSONObject();
if (drMixedClusterSizeConsumerState == null) {
return clusters;
}
for (Map.Entry<Integer, Map<Integer, DRConsumerDrIdTracker>> e : drMixedClusterSizeConsumerState.entrySet()) {
            // The key is the remote datacenter's partitionId. A heterogeneous topology implies
            // a partition count different from the local cluster's (which is not tracked here).
JSONObject partitions = new JSONObject();
for (Map.Entry<Integer, DRConsumerDrIdTracker> e2 : e.getValue().entrySet()) {
partitions.put(e2.getKey().toString(), e2.getValue().toJSON());
}
clusters.put(e.getKey().toString(), partitions);
}
return clusters;
}
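
    /*
     * The JSON produced above is a two-level map of remote cluster id to producer
     * partition id to tracker state (ids illustrative):
     *
     *   { "<remoteClusterId>": { "<producerPartitionId>": <tracker JSON>, ... }, ... }
     */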
    public static Map<Integer, Map<Integer, DRConsumerDrIdTracker>> buildConsumerSiteDrIdTrackersFromJSON(JSONObject siteTrackers) throws JSONException {
        Map<Integer, Map<Integer, DRConsumerDrIdTracker>> perSiteTrackers = new HashMap<>();
Iterator<String> clusterKeys = siteTrackers.keys();
while (clusterKeys.hasNext()) {
            Map<Integer, DRConsumerDrIdTracker> perProducerPartitionTrackers = new HashMap<>();
            String clusterIdStr = clusterKeys.next();
            int clusterId = Integer.parseInt(clusterIdStr);
JSONObject producerPartitionInfo = siteTrackers.getJSONObject(clusterIdStr);
Iterator<String> producerPartitionKeys = producerPartitionInfo.keys();
while (producerPartitionKeys.hasNext()) {
String producerPartitionIdStr = producerPartitionKeys.next();
                int producerPartitionId = Integer.parseInt(producerPartitionIdStr);
DRConsumerDrIdTracker producerPartitionTracker = new DRConsumerDrIdTracker(producerPartitionInfo.getJSONObject(producerPartitionIdStr));
perProducerPartitionTrackers.put(producerPartitionId, producerPartitionTracker);
}
perSiteTrackers.put(clusterId, perProducerPartitionTrackers);
}
return perSiteTrackers;
}
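
    /*
     * The two static helpers above are inverses, so consumer trackers can round-trip
     * through JSON. A minimal sketch, assuming siteTrackers was built on the consumer
     * side:
     *
     *   JSONObject json = serializeSiteConsumerDrIdTrackersToJSON(siteTrackers);
     *   Map<Integer, Map<Integer, DRConsumerDrIdTracker>> restored =
     *           buildConsumerSiteDrIdTrackersFromJSON(json);
     */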
    /*
     * When recording snapshot completion we also record the DR unique ids seen
     * from remote datacenters as JSON. We need to merge our unique ids with any
     * existing ones, since multiple replicas will submit them.
     */
private void mergeConsumerDrIdTrackerToZK(JSONObject jsonObj) throws JSONException {
//DR ids/unique ids for remote partitions indexed by remote datacenter id,
//each DC has a full partition set
JSONObject dcIdMap;
if (jsonObj.has("drMixedClusterSizeConsumerState")) {
dcIdMap = jsonObj.getJSONObject("drMixedClusterSizeConsumerState");
} else {
dcIdMap = new JSONObject();
jsonObj.put("drMixedClusterSizeConsumerState", dcIdMap);
}
for (Map.Entry<Integer, JSONObject> dcEntry : m_drMixedClusterSizeConsumerState.entrySet()) {
//Last seen ids for a specific data center
final String consumerPartitionString = dcEntry.getKey().toString();
if (!dcIdMap.has(consumerPartitionString)) {
dcIdMap.put(consumerPartitionString, dcEntry.getValue());
}
}
}
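
    /*
     * Note that unlike the export and DR stream merges above, this merge is
     * first-writer-wins: a consumer partition's tracker is published only if no other
     * replica has recorded one for it yet. The resulting node has the shape:
     *
     *   "drMixedClusterSizeConsumerState": { "<consumerPartitionId>": <tracker JSON>, ... }
     */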
private void writeDRStateToSnapshot(JSONStringer stringer) throws IOException {
try {
long clusterCreateTime = VoltDB.instance().getClusterCreateTime();
stringer.keySymbolValuePair("clusterCreateTime", clusterCreateTime);
Iterator<Entry<Integer, TupleStreamStateInfo>> iter = m_drTupleStreamInfo.entrySet().iterator();
if (iter.hasNext()) {
stringer.keySymbolValuePair("drVersion", iter.next().getValue().drVersion);
}
writeDRTupleStreamInfoToSnapshot(stringer);
stringer.key("drMixedClusterSizeConsumerState");
stringer.object();
for (Entry<Integer, JSONObject> e : m_drMixedClusterSizeConsumerState.entrySet()) {
stringer.key(e.getKey().toString()); // Consumer partitionId
stringer.value(e.getValue()); // Trackers from that site
}
stringer.endObject();
} catch (JSONException e) {
throw new IOException(e);
}
}
public void writeToSnapshotDigest(JSONStringer stringer) throws IOException {
writeExportSequenceNumbersToSnapshot(stringer);
writeDRStateToSnapshot(stringer);
}
public void mergeToZooKeeper(JSONObject jsonObj, VoltLogger log) throws JSONException {
mergeExportSequenceNumbersToZK(jsonObj, log);
mergeDRTupleStreamInfoToZK(jsonObj, log);
mergeConsumerDrIdTrackerToZK(jsonObj);
mergeTerminusToZK(jsonObj);
}
public long getTerminus() {
return m_terminus;
}
}