package com.splout.db.qnode;

/*
 * #%L
 * Splout SQL Server
 * %%
 * Copyright (C) 2012 - 2015 Datasalt Systems S.L.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 * #L%
 */

import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.splout.db.common.PartitionEntry;
import com.splout.db.common.PartitionMap;
import com.splout.db.common.ReplicationEntry;
import com.splout.db.common.ReplicationMap;
import com.splout.db.common.Tablespace;
import com.splout.db.hazelcast.DNodeInfo;
import com.splout.db.hazelcast.TablespaceVersion;
import com.splout.db.qnode.QNodeHandlerContext.DNodeEvent;
import com.splout.db.qnode.QNodeHandlerContext.TablespaceVersionInfoException;
import com.splout.db.thrift.PartitionMetadata;

/**
 * Abstraction that allows any Hazelcast listener to build an incremental state of the tablespaces
 * and the versions every time that a DNode enters or leaves.
 */
public class TablespaceMemoryState {

  protected final static Log log = LogFactory.getLog(TablespaceMemoryState.class);

  // Local map with all the versions of each tablespace, together with the
  // PartitionMap and ReplicationMap for each of them
  private final Map<TablespaceVersion, Tablespace> tablespaceVersionsMap = new ConcurrentHashMap<TablespaceVersion, Tablespace>();

  public Object tVLock = new Object();

  /**
   * Updates the in-memory <TablespaceVersion, Tablespace> map when a DNode
   * joins, leaves or updates its DNodeInfo.
   */
  public synchronized void updateTablespaceVersions(DNodeInfo dNodeInfo, DNodeEvent event)
      throws TablespaceVersionInfoException {

    Map<TablespaceVersion, Tablespace> tablespaceVersionMap = getTablespaceVersionsMap();

    // First check whether this DNode stopped serving a version that it used
    // to serve (IMPLICIT leaving).
    // This can happen, for instance, if a DNode removes an old version.
    // In that case the version will eventually become empty here.
    Iterator<Map.Entry<TablespaceVersion, Tablespace>> iterator = tablespaceVersionMap.entrySet().iterator();
    while (iterator.hasNext()) {
      Map.Entry<TablespaceVersion, Tablespace> tablespaceVersion = iterator.next();
      String tablespaceName = tablespaceVersion.getKey().getTablespace();
      Long version = tablespaceVersion.getKey().getVersion();
      // Is this DNode present in this version?
      Tablespace tablespace = tablespaceVersion.getValue();
      // We will rebuild the replication map to check whether it became empty
      // after these checks or not
      int nonEmptyReplicas = 0;

      Iterator<ReplicationEntry> repIter = tablespace.getReplicationMap().getReplicationEntries().iterator();
      while (repIter.hasNext()) {
        ReplicationEntry entry = repIter.next();
        int partition = entry.getShard();
        if (entry.getNodes().contains(dNodeInfo.getAddress())) {
          // Yes! So we have to check whether this DNode is still serving this
          // version/partition or not
          if ((dNodeInfo.getServingInfo().get(tablespaceName) == null)
              || (dNodeInfo.getServingInfo().get(tablespaceName).get(version) == null)
              || (dNodeInfo.getServingInfo().get(tablespaceName).get(version).get(partition) == null)) {
            // NO! So we have to remove the DNode
            entry.getNodes().remove(dNodeInfo.getAddress());
            if (entry.getNodes().isEmpty()) {
              repIter.remove();
              // Remove it also from the PartitionMap
              PartitionEntry pEntry = new PartitionEntry();
              pEntry.setShard(entry.getShard());
              tablespace.getPartitionMap().getPartitionEntries().remove(pEntry);
            }
          }
        }
        if (!entry.getNodes().isEmpty()) {
          nonEmptyReplicas++;
        }
      }
      if (nonEmptyReplicas == 0) {
        // Delete the TablespaceVersion
        log.info("Removing empty tablespace version (implicit leaving from " + dNodeInfo.getAddress() + "): "
            + tablespaceName + ", " + version);
        iterator.remove();
      }
    }

    // Now iterate over all the tablespaces of this DNode to see new additions
    // or EXPLICIT leavings
    for (Map.Entry<String, Map<Long, Map<Integer, PartitionMetadata>>> tablespaceEntry : dNodeInfo.getServingInfo()
        .entrySet()) {
      String tablespaceName = tablespaceEntry.getKey();
      // Iterate over all versions of this tablespace
      for (Map.Entry<Long, Map<Integer, PartitionMetadata>> versionEntry : tablespaceEntry.getValue().entrySet()) {
        Long versionName = versionEntry.getKey();
        TablespaceVersion tablespaceVersion = new TablespaceVersion(tablespaceName, versionName);
        Tablespace currentTablespace = tablespaceVersionMap.get(tablespaceVersion);
        List<PartitionEntry> partitionMap = new ArrayList<PartitionEntry>();
        List<ReplicationEntry> replicationMap = new ArrayList<ReplicationEntry>();
        long deployDate = -1;
        if (currentTablespace != null) {
          // Not the first time we see this tablespace: we copy the partition
          // map to be able to modify it without contention.
          partitionMap.addAll(currentTablespace.getPartitionMap().getPartitionEntries());
          replicationMap.addAll(currentTablespace.getReplicationMap().getReplicationEntries());
          deployDate = currentTablespace.getCreationDate();
        }
        // Iterate over all partitions of this tablespace
        for (Map.Entry<Integer, PartitionMetadata> partition : versionEntry.getValue().entrySet()) {
          deployDate = deployDate == -1 ? partition.getValue().getDeploymentDate() : deployDate;
          if (deployDate != -1 && (deployDate != partition.getValue().getDeploymentDate())) {
            throw new TablespaceVersionInfoException(
                "Inconsistent partition metadata within same node, deploy date was " + deployDate + " versus "
                    + partition.getValue().getDeploymentDate());
          }
          PartitionMetadata metadata = partition.getValue();
          Integer shard = partition.getKey();
          // Create a PartitionEntry according to this PartitionMetadata
          PartitionEntry myEntry = new PartitionEntry();
          myEntry.setMax(metadata.getMaxKey());
          myEntry.setMin(metadata.getMinKey());
          myEntry.setShard(shard);
          PartitionEntry existingPartitionEntry = null;
          // Look for an existing PartitionEntry for the same shard in the
          // PartitionMap
          if (!partitionMap.contains(myEntry)) {
            if (!event.equals(DNodeEvent.LEAVE)) {
              // In this case all conditions are met for adding a new entry to
              // the PartitionMap
              partitionMap.add(myEntry);
              // Note that the PartitionMap is not necessarily sorted now, so
              // sort it
              Collections.sort(partitionMap);
            }
          } else {
            // Check the consistency of this PartitionMetadata
            existingPartitionEntry = partitionMap.get(partitionMap.indexOf(myEntry));
            if (existingPartitionEntry.getMax() == null || myEntry.getMax() == null) {
              if (!(existingPartitionEntry.getMax() == null && myEntry.getMax() == null)) {
                throw new TablespaceVersionInfoException("Inconsistent partition metadata between nodes: "
                    + existingPartitionEntry + " versus " + myEntry);
              }
            } else {
              if (!existingPartitionEntry.getMax().equals(myEntry.getMax())) {
                throw new TablespaceVersionInfoException("Inconsistent partition metadata between nodes: "
                    + existingPartitionEntry + " versus " + myEntry);
              }
            }
            if (existingPartitionEntry.getMin() == null || myEntry.getMin() == null) {
              if (!(existingPartitionEntry.getMin() == null && myEntry.getMin() == null)) {
                throw new TablespaceVersionInfoException("Inconsistent partition metadata between nodes: "
                    + existingPartitionEntry + " versus " + myEntry);
              }
            } else {
              if (!existingPartitionEntry.getMin().equals(myEntry.getMin())) {
                throw new TablespaceVersionInfoException("Inconsistent partition metadata between nodes: "
                    + existingPartitionEntry + " versus " + myEntry);
              }
            }
          }
          // Create a ReplicationEntry according to this PartitionMetadata.
          // It will only contain this DNode as we don't know about the others yet
          ReplicationEntry reEntry = new ReplicationEntry();
          reEntry.setShard(shard);
          reEntry.setExpectedReplicationFactor(metadata.getNReplicas());
          reEntry.setNodes(new ArrayList<String>());
          // Look for an existing ReplicationEntry for the same shard in the
          // ReplicationMap
          if (replicationMap.contains(reEntry)) {
            ReplicationEntry existingEntry = replicationMap.get(replicationMap.indexOf(reEntry));
            if (event.equals(DNodeEvent.LEAVE)) {
              // Remove it from the replication map and the partition map
              existingEntry.getNodes().remove(dNodeInfo.getAddress());
              if (existingEntry.getNodes().isEmpty()) {
                replicationMap.remove(existingEntry);
                if (existingPartitionEntry != null) {
                  partitionMap.remove(existingPartitionEntry);
                } else {
                  throw new RuntimeException(
                      "ReplicationEntry for one shard with no associated PartitionEntry. This is very likely to be a software bug.");
                }
              }
            } else {
              if (!existingEntry.getNodes().contains(dNodeInfo.getAddress())) {
                // Add it to the replication map
                existingEntry.getNodes().add(dNodeInfo.getAddress());
              } else {
                // We are adding / updating but the node already exists in the
                // replication map: nothing to do.
              }
            }
          } else if (!event.equals(DNodeEvent.LEAVE)) {
            // Otherwise just add it and sort. We check the DNodeEvent here,
            // although it would be very weird for this DNode to leave when its
            // ReplicationEntry wasn't present
            reEntry.getNodes().add(dNodeInfo.getAddress());
            replicationMap.add(reEntry);
            Collections.sort(reEntry.getNodes());
            Collections.sort(replicationMap);
          }
        }
        // Delete the tablespaceVersion if it is empty now
        if (currentTablespace != null && replicationMap.size() == 0) {
          log.info("Removing empty tablespaceVersion: " + tablespaceVersion + " due to explicit leaving from node "
              + dNodeInfo.getAddress());
          tablespaceVersionMap.remove(tablespaceVersion);
        } else {
          // Update the info in memory
          currentTablespace = new Tablespace(new PartitionMap(partitionMap), new ReplicationMap(replicationMap),
              versionName, deployDate);
          tablespaceVersionMap.put(tablespaceVersion, currentTablespace);
        }
      }
    }
  }

  public Map<TablespaceVersion, Tablespace> getTablespaceVersionsMap() {
    return tablespaceVersionsMap;
  }
}
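
/*
 * Usage sketch (illustrative only, not part of the original class): it shows how a
 * Hazelcast listener on the DNode registry could feed a shared TablespaceMemoryState
 * and how the resulting state can be queried. The class and method names below are
 * hypothetical; only DNodeEvent.LEAVE appears in the code above, and how the
 * DNodeInfo is obtained from the Hazelcast callbacks is deliberately left out.
 */
class TablespaceMemoryStateUsageSketch {

  private final static Log log = LogFactory.getLog(TablespaceMemoryStateUsageSketch.class);

  private final TablespaceMemoryState state = new TablespaceMemoryState();

  /** To be called from the Hazelcast entry listener callbacks with the affected DNodeInfo. */
  void onDNodeChange(DNodeInfo dNodeInfo, DNodeEvent event) {
    try {
      // Incrementally rebuilds the per-TablespaceVersion partition/replication maps
      state.updateTablespaceVersions(dNodeInfo, event);
    } catch (TablespaceVersionInfoException e) {
      // A DNode published inconsistent metadata: log it and keep the previous state
      log.error("Could not update tablespace versions for " + dNodeInfo.getAddress(), e);
    }
  }

  /** Read side: look up the Tablespace currently known for a given tablespace/version. */
  Tablespace tablespaceFor(String tablespace, long version) {
    return state.getTablespaceVersionsMap().get(new TablespaceVersion(tablespace, version));
  }
}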