/*
* Copyright (c) 2008-2017, Hazelcast, Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.internal.partition.impl;
import com.hazelcast.instance.Node;
import com.hazelcast.internal.cluster.Versions;
import com.hazelcast.internal.metrics.Probe;
import com.hazelcast.internal.partition.InternalPartition;
import com.hazelcast.internal.partition.NonFragmentedServiceNamespace;
import com.hazelcast.internal.partition.PartitionReplicaVersionManager;
import com.hazelcast.internal.partition.operation.ReplicaSyncRequest;
import com.hazelcast.internal.util.counters.MwCounter;
import com.hazelcast.logging.ILogger;
import com.hazelcast.nio.Address;
import com.hazelcast.spi.ExecutionService;
import com.hazelcast.spi.Operation;
import com.hazelcast.spi.ServiceNamespaceAware;
import com.hazelcast.spi.ServiceNamespace;
import com.hazelcast.spi.TaskScheduler;
import com.hazelcast.spi.impl.NodeEngineImpl;
import com.hazelcast.spi.properties.GroupProperty;
import com.hazelcast.spi.properties.HazelcastProperties;
import com.hazelcast.util.scheduler.EntryTaskScheduler;
import com.hazelcast.util.scheduler.EntryTaskSchedulerFactory;
import com.hazelcast.util.scheduler.ScheduleType;
import com.hazelcast.util.scheduler.ScheduledEntry;
import com.hazelcast.util.scheduler.ScheduledEntryProcessor;
import com.hazelcast.version.Version;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import static com.hazelcast.internal.util.counters.MwCounter.newMwCounter;
import static java.util.Collections.newSetFromMap;
/**
*
 * Maintains the replica version values per partition and service namespace, and manages the replica
 * synchronization operations between partition owners and their backup replicas.
*
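 * <p>
 * A rough sketch of the owner/backup versioning flow this class supports (simplified and illustrative only;
 * these calls are normally made by the partition and backup operation machinery, not by user code):
 * <pre>{@code
 * // on the partition owner, before a backup operation is sent:
 * long[] versions = replicaManager.incrementPartitionReplicaVersions(partitionId, namespace, backupCount);
 * // on the backup replica, after the backup operation has been applied:
 * replicaManager.updatePartitionReplicaVersions(partitionId, namespace, versions, replicaIndex);
 * // if the update detects that this backup is behind, a replica sync towards the owner is triggered
 * }</pre>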
*/
public class PartitionReplicaManager implements PartitionReplicaVersionManager {
private final Node node;
private final NodeEngineImpl nodeEngine;
private final ILogger logger;
private final InternalPartitionServiceImpl partitionService;
private final PartitionStateManager partitionStateManager;
private final PartitionReplicaVersions[] replicaVersions;
    /** Replica sync requests that have been sent to the target and are awaiting a response */
private final Set<ReplicaFragmentSyncInfo> replicaSyncRequests;
private final EntryTaskScheduler<ReplicaFragmentSyncInfo, Void> replicaSyncTimeoutScheduler;
@Probe
private final Semaphore replicaSyncProcessLock;
@Probe
private final MwCounter replicaSyncRequestsCounter = newMwCounter();
private final long partitionMigrationTimeout;
private final int maxParallelReplications;
private volatile Version clusterVersion;
PartitionReplicaManager(Node node, InternalPartitionServiceImpl partitionService) {
this.node = node;
this.nodeEngine = node.nodeEngine;
this.logger = node.getLogger(getClass());
this.partitionService = partitionService;
int partitionCount = partitionService.getPartitionCount();
partitionStateManager = partitionService.getPartitionStateManager();
HazelcastProperties properties = node.getProperties();
partitionMigrationTimeout = properties.getMillis(GroupProperty.PARTITION_MIGRATION_TIMEOUT);
maxParallelReplications = properties.getInteger(GroupProperty.PARTITION_MAX_PARALLEL_REPLICATIONS);
replicaSyncProcessLock = new Semaphore(maxParallelReplications);
replicaVersions = new PartitionReplicaVersions[partitionCount];
for (int i = 0; i < replicaVersions.length; i++) {
replicaVersions[i] = new PartitionReplicaVersions(i);
}
ExecutionService executionService = nodeEngine.getExecutionService();
TaskScheduler globalScheduler = executionService.getGlobalTaskScheduler();
        // The reason this scheduler uses the POSTPONE type is as follows:
        // When a node shifts up in the replica table upon a node failure, it sends a sync request to the partition owner
        // and registers it in replicaSyncRequests. If another node fails before the already running sync process completes,
        // the new sync request is simply scheduled for a later time. Again, if a third node fails before the already
        // running sync process completes, the already scheduled sync request should be overwritten by the new one,
        // because this node shifts up in the replica table once more and the new sync request inherently covers
        // the backup data requested by the previously scheduled one.
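        // A rough sketch of the POSTPONE behaviour relied upon here (illustrative only): scheduling the same key
        // again before its deadline expires replaces the previously scheduled entry instead of adding a second one, e.g.
        //   replicaSyncTimeoutScheduler.schedule(partitionMigrationTimeout, syncInfo, null); // entry scheduled
        //   replicaSyncTimeoutScheduler.schedule(partitionMigrationTimeout, syncInfo, null); // same key: postponed, not duplicated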
replicaSyncTimeoutScheduler = EntryTaskSchedulerFactory.newScheduler(globalScheduler,
new ReplicaSyncTimeoutProcessor(), ScheduleType.POSTPONE);
replicaSyncRequests = newSetFromMap(new ConcurrentHashMap<ReplicaFragmentSyncInfo, Boolean>(partitionCount));
}
    /**
     * This method is called on a backup node (replica). If all preconditions are satisfied, it initiates a replica sync
     * operation and registers it in {@code replicaSyncRequests}. The sync request is not sent if:
     * <ul>
     * <li>the partition owner is not known yet, this node is the owner, or this node is no longer a backup replica</li>
     * <li>replica sync is not allowed (e.g. during repartitioning or while a node is joining the cluster)</li>
     * <li>the partition is currently migrating</li>
     * <li>another sync request for the same namespace is already in progress</li>
     * <li>the maximum number of parallel synchronizations has already been reached</li>
     * </ul>
*
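     * A hypothetical invocation, mirroring the way a backup triggers a sync when it detects stale replica versions
     * (illustrative only; {@code replicaManager} stands for this instance):
     * <pre>{@code
     * replicaManager.triggerPartitionReplicaSync(partitionId,
     *         Collections.singleton(NonFragmentedServiceNamespace.INSTANCE), replicaIndex);
     * }</pre>
     *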
* @param partitionId the partition which is being synchronized
* @param namespaces namespaces of partition replica fragments
* @param replicaIndex the index of the replica which is being synchronized
* @throws IllegalArgumentException if the replica index is not between 0 and {@link InternalPartition#MAX_REPLICA_COUNT}
*/
public void triggerPartitionReplicaSync(int partitionId, Collection<ServiceNamespace> namespaces, int replicaIndex) {
assert replicaIndex >= 0 && replicaIndex < InternalPartition.MAX_REPLICA_COUNT
: "Invalid replica index! partitionId=" + partitionId + ", replicaIndex=" + replicaIndex;
Address target = checkAndGetPrimaryReplicaOwner(partitionId, replicaIndex);
if (target == null) {
return;
}
if (!partitionService.isReplicaSyncAllowed()) {
logger.finest("Cannot send sync replica request for partitionId=" + partitionId + ", replicaIndex=" + replicaIndex
+ ", namespaces=" + namespaces + ". Sync is not allowed.");
return;
}
InternalPartitionImpl partition = partitionStateManager.getPartitionImpl(partitionId);
if (partition.isMigrating()) {
logger.finest("Cannot send sync replica request for partitionId=" + partitionId + ", replicaIndex=" + replicaIndex
+ ", namespaces=" + namespaces + ". Partition is already migrating.");
return;
}
sendSyncReplicaRequest(partitionId, namespaces, replicaIndex, target);
}
    /**
     * Checks the preconditions for replica sync. Returns {@code null} if the partition owner is not known yet,
     * if this node is the owner itself, or if this node is not a backup replica of the partition anymore;
     * otherwise returns the owner's address.
     */
Address checkAndGetPrimaryReplicaOwner(int partitionId, int replicaIndex) {
final InternalPartitionImpl partition = partitionStateManager.getPartitionImpl(partitionId);
final Address target = partition.getOwnerOrNull();
if (target == null) {
logger.info("Sync replica target is null, no need to sync -> partitionId=" + partitionId + ", replicaIndex="
+ replicaIndex);
return null;
}
Address thisAddress = nodeEngine.getThisAddress();
if (target.equals(thisAddress)) {
if (logger.isFinestEnabled()) {
logger.finest("This node is now owner of partition, cannot sync replica -> partitionId=" + partitionId
+ ", replicaIndex=" + replicaIndex + ", partition-info="
+ partitionStateManager.getPartitionImpl(partitionId));
}
return null;
}
if (!partition.isOwnerOrBackup(thisAddress)) {
if (logger.isFinestEnabled()) {
logger.finest("This node is not backup replica of partitionId=" + partitionId
+ ", replicaIndex=" + replicaIndex + " anymore.");
}
return null;
}
return target;
}
/**
     * Sends the sync request to {@code target} if a replica sync permit can be acquired (i.e. the maximum number of
     * parallel sync requests has not been reached) and the target was not removed while the cluster was not active.
     * Namespaces with an already in-flight sync request are skipped; for the remaining ones a sync-timeout is scheduled.
*/
private void sendSyncReplicaRequest(int partitionId, Collection<ServiceNamespace> syncNamespaces,
int replicaIndex, Address target) {
if (node.clusterService.isMemberRemovedInNotJoinableState(target)) {
return;
}
if (!tryToAcquireReplicaSyncPermit()) {
if (logger.isFinestEnabled()) {
logger.finest("Cannot send sync replica request for partitionId=" + partitionId + ", replicaIndex=" + replicaIndex
+ ", namespaces=" + syncNamespaces + ". No permits available!");
}
return;
}
Collection<ServiceNamespace> namespaces = registerSyncInfoFor(partitionId, syncNamespaces, replicaIndex, target);
if (namespaces.isEmpty()) {
releaseReplicaSyncPermit();
return;
}
        // sanity check: before cluster version 3.9 replica sync is not fragmented, so only a single namespace is expected
if (nodeEngine.getClusterService().getClusterVersion().isLessThan(Versions.V3_9)) {
assert namespaces.size() == 1 : "Only single namespace is allowed before V3.9: " + namespaces;
}
if (logger.isFinestEnabled()) {
logger.finest("Sending sync replica request for partitionId=" + partitionId + ", replicaIndex=" + replicaIndex
+ ", namespaces=" + namespaces);
}
replicaSyncRequestsCounter.inc();
ReplicaSyncRequest syncRequest = new ReplicaSyncRequest(partitionId, namespaces, replicaIndex);
nodeEngine.getOperationService().send(syncRequest, target);
}
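    /**
     * Registers a {@link ReplicaFragmentSyncInfo} for each requested namespace and schedules a sync-timeout for it.
     * Namespaces for which a sync request is already registered (i.e. still in flight) are filtered out.
     *
     * @return the namespaces for which a new sync request was registered; may be empty
     */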
private Collection<ServiceNamespace> registerSyncInfoFor(int partitionId,
Collection<ServiceNamespace> requestedNamespaces, int replicaIndex, Address target) {
        // copy the requested namespaces, since the passed collection may not support removal
Collection<ServiceNamespace> namespaces = new ArrayList<ServiceNamespace>(requestedNamespaces);
Iterator<ServiceNamespace> iter = namespaces.iterator();
while (iter.hasNext()) {
ServiceNamespace namespace = iter.next();
ReplicaFragmentSyncInfo syncInfo = new ReplicaFragmentSyncInfo(partitionId, namespace, replicaIndex, target);
if (!replicaSyncRequests.add(syncInfo)) {
logger.finest("Cannot send sync replica request for " + syncInfo + ". Sync is already in progress!");
iter.remove();
continue;
}
replicaSyncTimeoutScheduler.schedule(partitionMigrationTimeout, syncInfo, null);
}
return namespaces;
}
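    // Namespace-aware operations are mapped to their specific namespace only from cluster version 3.9 on;
    // everything else falls back to the non-fragmented namespace.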
@Override
public ServiceNamespace getServiceNamespace(Operation operation) {
if (operation instanceof ServiceNamespaceAware && clusterVersion.isGreaterOrEqual(Versions.V3_9)) {
return ((ServiceNamespaceAware) operation).getServiceNamespace();
}
return NonFragmentedServiceNamespace.INSTANCE;
}
@Override
// Caution: Returning version array without copying for performance reasons. Callers must not modify this array!
public long[] incrementPartitionReplicaVersions(int partitionId, ServiceNamespace namespace, int backupCount) {
PartitionReplicaVersions replicaVersion = replicaVersions[partitionId];
return replicaVersion.incrementAndGet(namespace, backupCount);
}
@Override
public void updatePartitionReplicaVersions(int partitionId, ServiceNamespace namespace,
long[] versions, int replicaIndex) {
PartitionReplicaVersions partitionVersion = replicaVersions[partitionId];
if (!partitionVersion.update(namespace, versions, replicaIndex)) {
// this partition backup is behind the owner or dirty.
triggerPartitionReplicaSync(partitionId, Collections.singleton(namespace), replicaIndex);
}
}
@Override
public boolean isPartitionReplicaVersionStale(int partitionId, ServiceNamespace namespace,
long[] versions, int replicaIndex) {
return replicaVersions[partitionId].isStale(namespace, versions, replicaIndex);
}
// called in operation threads
public boolean isPartitionReplicaVersionDirty(int partitionId, ServiceNamespace namespace) {
return replicaVersions[partitionId].isDirty(namespace);
}
@Override
// Caution: Returning version array without copying for performance reasons. Callers must not modify this array!
public long[] getPartitionReplicaVersions(int partitionId, ServiceNamespace namespace) {
return replicaVersions[partitionId].get(namespace);
}
// called in operation threads
public void setPartitionReplicaVersions(int partitionId, ServiceNamespace namespace,
long[] versions, int replicaOffset) {
replicaVersions[partitionId].set(namespace, versions, replicaOffset);
}
// called in operation threads
public void clearPartitionReplicaVersions(int partitionId, ServiceNamespace namespace) {
replicaVersions[partitionId].clear(namespace);
}
/**
     * Sets the new replica versions for the given namespace of the partition with {@code partitionId} and resets any
     * ongoing replica synchronization request for this partition, namespace and replica index.
     *
     * @param partitionId the partition ID
     * @param replicaIndex the index of the replica
     * @param namespace the service namespace of the synchronized fragment
     * @param versions the new replica versions for the partition
*/
// called in operation threads
public void finalizeReplicaSync(int partitionId, int replicaIndex, ServiceNamespace namespace, long[] versions) {
PartitionReplicaVersions replicaVersion = replicaVersions[partitionId];
replicaVersion.clear(namespace);
replicaVersion.set(namespace, versions, replicaIndex);
clearReplicaSyncRequest(partitionId, namespace, replicaIndex);
}
/**
* Resets the state of the replica synchronization request for the given partition and replica. This will cancel the
* scheduled synchronization, clear the ongoing sync flag and release a synchronization permit.
*
* @param partitionId the partition being synchronized
     * @param namespace the service namespace of the fragment being synchronized
* @param replicaIndex the index of the replica being synchronized
*/
// called in operation threads
public void clearReplicaSyncRequest(int partitionId, ServiceNamespace namespace, int replicaIndex) {
ReplicaFragmentSyncInfo syncInfo = new ReplicaFragmentSyncInfo(partitionId, namespace, replicaIndex, null);
if (!replicaSyncRequests.remove(syncInfo)) {
return;
}
if (logger.isFinestEnabled()) {
logger.finest("Clearing sync replica request for partitionId=" + partitionId + ", replicaIndex="
+ replicaIndex + ", namespace=" + namespace);
}
releaseReplicaSyncPermit();
replicaSyncTimeoutScheduler.cancelIfExists(syncInfo, null);
}
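    /**
     * Cancels all in-flight replica sync requests that target {@code deadAddress}, cancels their scheduled
     * sync-timeouts and releases the corresponding sync permits, e.g. when that member has left the cluster.
     */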
void cancelReplicaSyncRequestsTo(Address deadAddress) {
Iterator<ReplicaFragmentSyncInfo> iter = replicaSyncRequests.iterator();
while (iter.hasNext()) {
ReplicaFragmentSyncInfo syncInfo = iter.next();
if (deadAddress.equals(syncInfo.target)) {
iter.remove();
replicaSyncTimeoutScheduler.cancel(syncInfo);
releaseReplicaSyncPermit();
}
}
}
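    /**
     * Cancels all in-flight replica sync requests for the given partition, cancels their scheduled sync-timeouts
     * and releases the corresponding sync permits.
     */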
void cancelReplicaSync(int partitionId) {
Iterator<ReplicaFragmentSyncInfo> iter = replicaSyncRequests.iterator();
while (iter.hasNext()) {
ReplicaFragmentSyncInfo syncInfo = iter.next();
if (syncInfo.partitionId == partitionId) {
iter.remove();
replicaSyncTimeoutScheduler.cancel(syncInfo);
releaseReplicaSyncPermit();
}
}
}
public boolean tryToAcquireReplicaSyncPermit() {
return replicaSyncProcessLock.tryAcquire();
}
public void releaseReplicaSyncPermit() {
replicaSyncProcessLock.release();
}
/**
     * @return a copy of the ongoing (in-flight) replica sync requests
*/
List<ReplicaFragmentSyncInfo> getOngoingReplicaSyncRequests() {
return new ArrayList<ReplicaFragmentSyncInfo>(replicaSyncRequests);
}
/**
     * @return the scheduled sync-timeout entries of the ongoing replica sync requests
*/
List<ScheduledEntry<ReplicaFragmentSyncInfo, Void>> getScheduledReplicaSyncRequests() {
final List<ScheduledEntry<ReplicaFragmentSyncInfo, Void>>
entries = new ArrayList<ScheduledEntry<ReplicaFragmentSyncInfo, Void>>();
for (ReplicaFragmentSyncInfo syncInfo : replicaSyncRequests) {
ScheduledEntry<ReplicaFragmentSyncInfo, Void> entry = replicaSyncTimeoutScheduler.get(syncInfo);
if (entry != null) {
entries.add(entry);
}
}
return entries;
}
void reset() {
replicaSyncRequests.clear();
replicaSyncTimeoutScheduler.cancelAll();
        // This is not synchronized with a possibly still running sync process,
        // so the permit count may temporarily exceed the allowed parallelization count.
replicaSyncProcessLock.drainPermits();
replicaSyncProcessLock.release(maxParallelReplications);
}
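    /**
     * Schedules the periodic {@link SyncReplicaVersionTask} using the configured
     * {@link GroupProperty#PARTITION_BACKUP_SYNC_INTERVAL}, falling back to a 1 second period
     * when the configured value is not positive.
     */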
void scheduleReplicaVersionSync(ExecutionService executionService) {
long definedBackupSyncCheckInterval = node.getProperties().getSeconds(GroupProperty.PARTITION_BACKUP_SYNC_INTERVAL);
long backupSyncCheckInterval = definedBackupSyncCheckInterval > 0 ? definedBackupSyncCheckInterval : 1;
executionService.scheduleWithRepetition(new SyncReplicaVersionTask(),
backupSyncCheckInterval, backupSyncCheckInterval, TimeUnit.SECONDS);
}
@Override
public Collection<ServiceNamespace> getNamespaces(int partitionId) {
return replicaVersions[partitionId].getNamespaces();
}
public void retainNamespaces(int partitionId, Set<ServiceNamespace> namespaces) {
PartitionReplicaVersions versions = replicaVersions[partitionId];
versions.retainNamespaces(namespaces);
}
void setClusterVersion(Version newVersion) {
this.clusterVersion = newVersion;
}
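    /**
     * Invoked by {@link #replicaSyncTimeoutScheduler} when sync requests time out: removes the expired entries
     * from {@link #replicaSyncRequests} and releases their sync permits, so that new sync requests can be sent.
     */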
private class ReplicaSyncTimeoutProcessor implements ScheduledEntryProcessor<ReplicaFragmentSyncInfo, Void> {
@Override
public void process(EntryTaskScheduler<ReplicaFragmentSyncInfo, Void> scheduler,
Collection<ScheduledEntry<ReplicaFragmentSyncInfo, Void>> entries) {
for (ScheduledEntry<ReplicaFragmentSyncInfo, Void> entry : entries) {
ReplicaFragmentSyncInfo syncInfo = entry.getKey();
if (replicaSyncRequests.remove(syncInfo)) {
releaseReplicaSyncPermit();
}
}
}
}
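    /**
     * Periodic task run on the partition owner. For every assigned backup replica of each locally owned partition
     * it triggers a {@link CheckReplicaVersionTask}, as long as replica sync is currently allowed.
     */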
private class SyncReplicaVersionTask implements Runnable {
@Override
public void run() {
if (!node.nodeEngine.isRunning() || !partitionService.isReplicaSyncAllowed()) {
return;
}
for (InternalPartition partition : partitionStateManager.getPartitions()) {
if (!partition.isLocal()) {
continue;
}
for (int index = 1; index < InternalPartition.MAX_REPLICA_COUNT; index++) {
if (partition.getReplicaAddress(index) != null) {
CheckReplicaVersionTask task = new CheckReplicaVersionTask(nodeEngine, partitionService,
partition.getPartitionId(), index, null);
nodeEngine.getOperationService().execute(task);
}
}
}
}
}
}