/*
* Copyright (c) 2008-2017, Hazelcast, Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.internal.partition.impl;
import com.hazelcast.cluster.ClusterState;
import com.hazelcast.core.HazelcastInstanceNotActiveException;
import com.hazelcast.core.Member;
import com.hazelcast.core.MemberLeftException;
import com.hazelcast.core.MigrationListener;
import com.hazelcast.instance.MemberImpl;
import com.hazelcast.instance.Node;
import com.hazelcast.internal.cluster.ClusterStateListener;
import com.hazelcast.internal.cluster.ClusterVersionListener;
import com.hazelcast.internal.cluster.impl.ClusterServiceImpl;
import com.hazelcast.internal.cluster.impl.operations.TriggerMemberListPublishOp;
import com.hazelcast.internal.metrics.MetricsRegistry;
import com.hazelcast.internal.metrics.Probe;
import com.hazelcast.internal.partition.InternalPartition;
import com.hazelcast.internal.partition.InternalPartitionService;
import com.hazelcast.internal.partition.MigrationInfo;
import com.hazelcast.internal.partition.MigrationInfo.MigrationStatus;
import com.hazelcast.internal.partition.PartitionListener;
import com.hazelcast.internal.partition.PartitionReplicaVersionManager;
import com.hazelcast.internal.partition.PartitionRuntimeState;
import com.hazelcast.internal.partition.PartitionServiceProxy;
import com.hazelcast.internal.partition.PartitionTableView;
import com.hazelcast.internal.partition.operation.AssignPartitions;
import com.hazelcast.internal.partition.operation.FetchPartitionStateOperation;
import com.hazelcast.internal.partition.operation.PartitionStateOperation;
import com.hazelcast.internal.partition.operation.ShutdownRequestOperation;
import com.hazelcast.logging.ILogger;
import com.hazelcast.nio.Address;
import com.hazelcast.nio.serialization.Data;
import com.hazelcast.partition.NoDataMemberInClusterException;
import com.hazelcast.partition.PartitionEvent;
import com.hazelcast.partition.PartitionEventListener;
import com.hazelcast.partition.PartitionLostListener;
import com.hazelcast.spi.EventPublishingService;
import com.hazelcast.spi.ExecutionService;
import com.hazelcast.spi.ManagedService;
import com.hazelcast.spi.NodeEngine;
import com.hazelcast.spi.OperationService;
import com.hazelcast.spi.PartitionAwareService;
import com.hazelcast.spi.exception.TargetNotMemberException;
import com.hazelcast.spi.impl.NodeEngineImpl;
import com.hazelcast.spi.impl.operationservice.InternalOperationService;
import com.hazelcast.spi.partition.IPartition;
import com.hazelcast.spi.partition.IPartitionLostEvent;
import com.hazelcast.spi.properties.GroupProperty;
import com.hazelcast.spi.properties.HazelcastProperties;
import com.hazelcast.util.EmptyStatement;
import com.hazelcast.util.ExceptionUtil;
import com.hazelcast.util.FutureUtil.ExceptionHandler;
import com.hazelcast.util.HashUtil;
import com.hazelcast.util.scheduler.ScheduledEntry;
import com.hazelcast.version.Version;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.logging.Level;
import static com.hazelcast.cluster.memberselector.MemberSelectors.DATA_MEMBER_SELECTOR;
import static com.hazelcast.util.FutureUtil.logAllExceptions;
import static com.hazelcast.util.FutureUtil.returnWithDeadline;
import static java.lang.Math.ceil;
import static java.lang.Math.max;
import static java.lang.Math.min;
/**
* The {@link InternalPartitionService} implementation.
*/
@SuppressWarnings({"checkstyle:methodcount", "checkstyle:classfanoutcomplexity", "checkstyle:classdataabstractioncoupling"})
public class InternalPartitionServiceImpl implements InternalPartitionService, ManagedService,
EventPublishingService<PartitionEvent, PartitionEventListener<PartitionEvent>>,
PartitionAwareService, ClusterStateListener, ClusterVersionListener {
/** Sleep interval while polling for a partition owner in {@code getPartitionOwnerOrWait}. */
private static final int PARTITION_OWNERSHIP_WAIT_MILLIS = 10;
private static final String EXCEPTION_MSG_PARTITION_STATE_SYNC_TIMEOUT = "Partition state sync invocation timed out";
/** Timeout for acquiring the service lock in applyNewState and for partition-table sync responses. */
private static final int PTABLE_SYNC_TIMEOUT_SECONDS = 10;
/** Upper bound of a single latch-await step during graceful shutdown. */
private static final int SAFE_SHUTDOWN_MAX_AWAIT_STEP_MILLIS = 1000;
private final Node node;
private final NodeEngineImpl nodeEngine;
private final ILogger logger;
// number of partitions, fixed at construction from GroupProperty.PARTITION_COUNT
private final int partitionCount;
private final long partitionMigrationTimeout;
private final PartitionServiceProxy proxy;
// guards all partition-table mutations (state manager, migration bookkeeping)
private final Lock lock = new ReentrantLock();
private final InternalPartitionListener partitionListener;
private final PartitionStateManager partitionStateManager;
private final MigrationManager migrationManager;
private final PartitionReplicaManager replicaManager;
private final PartitionReplicaStateChecker partitionReplicaStateChecker;
private final PartitionEventManager partitionEventManager;
// handler used to log timed-out partition-state sync invocations at FINEST
private final ExceptionHandler partitionStateSyncTimeoutHandler;
/** Determines if a {@link AssignPartitions} is being sent to the master, used to limit partition assignment requests. */
private final AtomicBoolean triggerMasterFlag = new AtomicBoolean(false);
// lazily created latch counted down when the master acknowledges this member's shutdown request
private final AtomicReference<CountDownLatch> shutdownLatchRef = new AtomicReference<CountDownLatch>();
// last known master address; used to detect that this node became the new master
private volatile Address lastMaster;
/** Whether the master should fetch the partition tables from other nodes, can happen when node becomes new master. */
private volatile boolean shouldFetchPartitionTables;
/**
 * Creates the partition service: builds the state/migration/replica sub-managers
 * and registers them (plus this service) with the metrics registry under "partitions".
 *
 * @param node the local node this service belongs to
 */
public InternalPartitionServiceImpl(Node node) {
HazelcastProperties properties = node.getProperties();
this.partitionCount = properties.getInteger(GroupProperty.PARTITION_COUNT);
this.node = node;
this.nodeEngine = node.nodeEngine;
this.logger = node.getLogger(InternalPartitionService.class);
// listener must exist before the state manager, which receives it in its constructor
partitionListener = new InternalPartitionListener(node, this);
partitionStateManager = new PartitionStateManager(node, this, partitionListener);
// migration manager shares this service's lock for partition-table mutations
migrationManager = new MigrationManager(node, this, lock);
replicaManager = new PartitionReplicaManager(node, this);
partitionReplicaStateChecker = new PartitionReplicaStateChecker(node, this);
partitionEventManager = new PartitionEventManager(node);
partitionStateSyncTimeoutHandler =
logAllExceptions(logger, EXCEPTION_MSG_PARTITION_STATE_SYNC_TIMEOUT, Level.FINEST);
partitionMigrationTimeout = properties.getMillis(GroupProperty.PARTITION_MIGRATION_TIMEOUT);
proxy = new PartitionServiceProxy(nodeEngine, this);
MetricsRegistry metricsRegistry = nodeEngine.getMetricsRegistry();
metricsRegistry.scanAndRegister(this, "partitions");
metricsRegistry.scanAndRegister(partitionStateManager, "partitions");
metricsRegistry.scanAndRegister(migrationManager, "partitions");
metricsRegistry.scanAndRegister(replicaManager, "partitions");
}
@Override
public void init(NodeEngine nodeEngine, Properties properties) {
// periodic publish interval for the partition table; clamp to at least 1 second
int partitionTableSendInterval = node.getProperties().getSeconds(GroupProperty.PARTITION_TABLE_SEND_INTERVAL);
if (partitionTableSendInterval <= 0) {
partitionTableSendInterval = 1;
}
ExecutionService executionService = nodeEngine.getExecutionService();
executionService.scheduleWithRepetition(new PublishPartitionRuntimeStateTask(node, this),
partitionTableSendInterval, partitionTableSendInterval, TimeUnit.SECONDS);
// start processing migration tasks, then bring the replica manager up to date
migrationManager.start();
replicaManager.setClusterVersion(node.getClusterService().getClusterVersion());
replicaManager.scheduleReplicaVersionSync(executionService);
}
@Override
public Address getPartitionOwner(int partitionId) {
    if (!partitionStateManager.isInitialized()) {
        firstArrangement();
    }
    final InternalPartition partition = partitionStateManager.getPartitionImpl(partitionId);
    // Read the owner exactly once: previously the owner was read twice (once for the
    // null-check, once for the return value), so a concurrent assignment could make the
    // check and the returned value disagree.
    final Address owner = partition.getOwnerOrNull();
    if (owner == null && !node.isMaster() && !isClusterFormedByOnlyLiteMembers()) {
        // not assigned yet and we are not the master: ask the master to assign partitions
        triggerMasterToAssignPartitions();
    }
    return owner;
}
/**
 * Returns the owner of {@code partitionId}, polling until one is assigned.
 * Fails fast when the node stops, the cluster state forbids migration,
 * or only lite members remain (no partition can ever be assigned).
 */
@Override
public Address getPartitionOwnerOrWait(int partitionId) {
    for (;;) {
        Address owner = getPartitionOwner(partitionId);
        if (owner != null) {
            return owner;
        }
        if (!nodeEngine.isRunning()) {
            throw new HazelcastInstanceNotActiveException();
        }
        ClusterState clusterState = node.getClusterService().getClusterState();
        if (!clusterState.isMigrationAllowed()) {
            throw new IllegalStateException("Partitions can't be assigned since cluster-state: " + clusterState);
        }
        if (isClusterFormedByOnlyLiteMembers()) {
            throw new NoDataMemberInClusterException(
                    "Partitions can't be assigned since all nodes in the cluster are lite members");
        }
        try {
            Thread.sleep(PARTITION_OWNERSHIP_WAIT_MILLIS);
        } catch (InterruptedException e) {
            throw ExceptionUtil.rethrow(e);
        }
    }
}
@Override
public void firstArrangement() {
// fast path: already assigned, nothing to do
if (partitionStateManager.isInitialized()) {
return;
}
// only the master assigns partitions; others ask the master to do it
if (!node.isMaster()) {
triggerMasterToAssignPartitions();
return;
}
lock.lock();
try {
// re-check under the lock: another thread may have initialized meanwhile
if (partitionStateManager.isInitialized()) {
return;
}
// members that already requested shutdown must not receive partitions
Set<Address> excludedAddresses = migrationManager.getShutdownRequestedAddresses();
if (!partitionStateManager.initializePartitionAssignments(excludedAddresses)) {
return;
}
publishPartitionRuntimeState();
} finally {
lock.unlock();
}
}
/**
 * Sends a {@link AssignPartitions} to the master to assign partitions.
 * No-op when partitions are already assigned, the node has not joined, or the
 * cluster state forbids migration. The {@code triggerMasterFlag} CAS ensures at
 * most one in-flight request at a time.
 */
private void triggerMasterToAssignPartitions() {
if (partitionStateManager.isInitialized()) {
return;
}
ClusterServiceImpl clusterService = node.getClusterService();
if (!clusterService.isJoined()) {
return;
}
ClusterState clusterState = clusterService.getClusterState();
if (!clusterState.isMigrationAllowed()) {
logger.warning("Partitions can't be assigned since cluster-state= " + clusterState);
return;
}
// only one concurrent trigger; losers of the CAS simply return
if (!triggerMasterFlag.compareAndSet(false, true)) {
return;
}
try {
final Address masterAddress = clusterService.getMasterAddress();
if (masterAddress != null && !masterAddress.equals(node.getThisAddress())) {
Future f = nodeEngine.getOperationService().createInvocationBuilder(SERVICE_NAME, new AssignPartitions(),
masterAddress).setTryCount(1).invoke();
// best-effort: wait briefly for the master's response, failures are only logged
f.get(1, TimeUnit.SECONDS);
}
} catch (Exception e) {
logger.finest(e);
} finally {
triggerMasterFlag.set(false);
}
}
/** Returns {@code true} when the cluster contains no data members (lite members only). */
private boolean isClusterFormedByOnlyLiteMembers() {
    Collection<Member> dataMembers = node.getClusterService().getMembers(DATA_MEMBER_SELECTOR);
    return dataMembers.isEmpty();
}
/**
 * Sets the initial partition table and state version. If any partition has a replica, the partition state manager is
 * set to initialized, otherwise {@link #partitionStateManager#isInitialized()} stays uninitialized but the current state
 * will be updated nevertheless.
 * This method acquires the partition service lock.
 *
 * @param partitionTable the initial partition table
 * @throws IllegalStateException if the partition manager has already been initialized
 */
public void setInitialState(PartitionTableView partitionTable) {
lock.lock();
try {
partitionStateManager.setInitialState(partitionTable);
} finally {
lock.unlock();
}
}
// Delegates to the state manager: number of member groups partitions are spread over.
@Override
public int getMemberGroupsSize() {
return partitionStateManager.getMemberGroupsSize();
}
/**
 * Returns the maximum backup count that can actually be honored: bounded by the
 * number of other member groups and by {@link InternalPartition#MAX_BACKUP_COUNT},
 * and never negative.
 */
@Probe(name = "maxBackupCount")
@Override
public int getMaxAllowedBackupCount() {
    int otherGroups = getMemberGroupsSize() - 1;
    int bounded = min(otherGroups, InternalPartition.MAX_BACKUP_COUNT);
    return max(bounded, 0);
}
/**
 * Decides whether {@code address} may join the cluster from the partition service's
 * point of view. Holds the partition service lock while inspecting the partition
 * table and the active migration.
 */
@Override
public boolean isMemberAllowedToJoin(Address address) {
lock.lock();
try {
ClusterState clusterState = node.getClusterService().getClusterState();
// NOTE(review): when migration is not allowed (and state is not IN_TRANSITION) the
// member is admitted unconditionally — presumably because the partition table is
// frozen in those states; confirm against ClusterState semantics.
if (!clusterState.isMigrationAllowed() && clusterState != ClusterState.IN_TRANSITION) {
logger.fine(address + " can join since cluster state is " + clusterState);
return true;
}
// a re-joining address still present in the table must wait until it is cleaned out
if (partitionStateManager.isPresentInPartitionTable(address)) {
logger.fine(address + " is in partition table");
return false;
}
// reject while the address participates in the currently running migration
final MigrationRunnable activeTask = migrationManager.getActiveTask();
if (activeTask instanceof MigrationManager.MigrateTask) {
final MigrationManager.MigrateTask migrateTask = (MigrationManager.MigrateTask) activeTask;
final MigrationInfo migrationInfo = migrateTask.migrationInfo;
if (address.equals(migrationInfo.getSource()) || address.equals(migrationInfo.getDestination())) {
logger.fine(address + " cannot join since " + migrationInfo);
return false;
}
}
return true;
} finally {
lock.unlock();
}
}
/**
 * Called when a member joins: refreshes the member-group size, records the current
 * master, and — on the master with an initialized table and migration allowed —
 * triggers repartitioning.
 */
@Override
public void memberAdded(MemberImpl member) {
logger.fine("Adding " + member);
lock.lock();
try {
partitionStateManager.updateMemberGroupsSize();
// remember the master so memberRemoved can detect a master change
lastMaster = node.getClusterService().getMasterAddress();
if (node.isMaster()) {
if (partitionStateManager.isInitialized()) {
final ClusterState clusterState = nodeEngine.getClusterService().getClusterState();
if (clusterState.isMigrationAllowed()) {
migrationManager.triggerControlTask();
}
}
}
} finally {
lock.unlock();
}
}
/**
 * Called when a member leaves: cancels its migrations/replica syncs and, if this
 * node just became master, marks that the authoritative partition tables must be
 * fetched from the remaining members before repartitioning.
 */
@Override
public void memberRemoved(final MemberImpl member) {
logger.fine("Removing " + member);
final Address deadAddress = member.getAddress();
final Address thisAddress = node.getThisAddress();
lock.lock();
try {
partitionStateManager.updateMemberGroupsSize();
migrationManager.onMemberRemove(member);
// this node is the new master iff it is master now but was not the last known one
boolean isThisNodeNewMaster = node.isMaster() && !thisAddress.equals(lastMaster);
if (isThisNodeNewMaster) {
// the flag must not already be set; a violation indicates a missed reset
assert !shouldFetchPartitionTables : "SOMETHING IS WRONG! Removed member: " + member;
shouldFetchPartitionTables = true;
}
lastMaster = node.getClusterService().getMasterAddress();
// pause migrations while cancelling syncs to the dead member, then resume
migrationManager.pauseMigration();
replicaManager.cancelReplicaSyncRequestsTo(deadAddress);
if (node.isMaster()) {
migrationManager.triggerControlTask();
}
migrationManager.resumeMigration();
} finally {
lock.unlock();
}
}
/**
 * Reacts to a cluster-state change: when the new state permits migrations and this
 * node is the master with an initialized partition table, triggers repartitioning.
 */
@Override
public void onClusterStateChange(ClusterState newState) {
    // all three guards must hold before any work is done
    if (!newState.isMigrationAllowed() || !partitionStateManager.isInitialized() || !node.isMaster()) {
        return;
    }
    lock.lock();
    try {
        // re-check initialization under the lock (reset may have run meanwhile)
        if (partitionStateManager.isInitialized()) {
            migrationManager.triggerControlTask();
        }
    } finally {
        lock.unlock();
    }
}
/** Propagates the new cluster version to the replica manager. */
public void onClusterVersionChange(Version newVersion) {
// required for 3.8 -> 3.9 upgrade
replicaManager.setClusterVersion(newVersion);
}
/**
 * Cancels all pending replica sync requests targeting {@code deadAddress}.
 * Acquires the partition service lock.
 */
public void cancelReplicaSyncRequestsTo(Address deadAddress) {
lock.lock();
try {
replicaManager.cancelReplicaSyncRequestsTo(deadAddress);
} finally {
lock.unlock();
}
}
/**
 * Returns the current partition runtime state, or {@code null} while this (new)
 * master still needs to fetch the most recent partition tables from other members.
 */
@Override
public PartitionRuntimeState createPartitionState() {
    if (isFetchMostRecentPartitionTableTaskRequired()) {
        return null;
    }
    return createPartitionStateInternal();
}
/**
 * Returns a copy of the partition table or {@code null} if not initialized. This method will acquire the partition service
 * lock.
 *
 * @return the current {@link PartitionRuntimeState} including completed migrations and the
 *         active migration, or {@code null} when partitions have not been assigned yet
 */
public PartitionRuntimeState createPartitionStateInternal() {
lock.lock();
try {
if (!partitionStateManager.isInitialized()) {
return null;
}
List<MigrationInfo> completedMigrations = migrationManager.getCompletedMigrationsCopy();
InternalPartition[] partitions = partitionStateManager.getPartitions();
PartitionRuntimeState state = new PartitionRuntimeState(partitions, completedMigrations, getPartitionStateVersion());
// include the in-flight migration so receivers can track it
state.setActiveMigration(migrationManager.getActiveMigration());
return state;
} finally {
lock.unlock();
}
}
/**
 * Creates a transient PartitionRuntimeState to commit given migration.
 * Result migration is applied to partition table and migration is added to completed-migrations set.
 * Version of created partition table is incremented by 1.
 * This method acquires the partition service lock.
 *
 * @param migrationInfo the migration to apply to the copied table
 * @return the resulting state, or {@code null} when partitions are not initialized
 */
PartitionRuntimeState createMigrationCommitPartitionState(MigrationInfo migrationInfo) {
lock.lock();
try {
if (!partitionStateManager.isInitialized()) {
return null;
}
List<MigrationInfo> completedMigrations = migrationManager.getCompletedMigrationsCopy();
// work on a copy so the live table is untouched until the commit succeeds
InternalPartition[] partitions = partitionStateManager.getPartitionsCopy();
int partitionId = migrationInfo.getPartitionId();
InternalPartitionImpl partition = (InternalPartitionImpl) partitions[partitionId];
migrationManager.applyMigration(partition, migrationInfo);
migrationInfo.setStatus(MigrationStatus.SUCCESS);
completedMigrations.add(migrationInfo);
int committedVersion = getPartitionStateVersion() + 1;
return new PartitionRuntimeState(partitions, completedMigrations, committedVersion);
} finally {
lock.unlock();
}
}
/**
 * Creates a transient {@link PartitionRuntimeState} to commit promotions by applying the {@code migrationInfos}.
 * The partition table version is incremented by number of promotions.
 * This method will acquire the partition service lock.
 *
 * @param migrationInfos the promotions to be executed on the destination
 * @return the partition table with the executed migrations or {@code null} if the partitions are not initialized (assigned)
 */
PartitionRuntimeState createPromotionCommitPartitionState(Collection<MigrationInfo> migrationInfos) {
lock.lock();
try {
if (!partitionStateManager.isInitialized()) {
return null;
}
List<MigrationInfo> completedMigrations = migrationManager.getCompletedMigrationsCopy();
// apply every promotion to a copy of the table; the live table stays untouched
InternalPartition[] partitions = partitionStateManager.getPartitionsCopy();
for (MigrationInfo migrationInfo : migrationInfos) {
int partitionId = migrationInfo.getPartitionId();
InternalPartitionImpl partition = (InternalPartitionImpl) partitions[partitionId];
migrationManager.applyMigration(partition, migrationInfo);
migrationInfo.setStatus(MigrationStatus.SUCCESS);
}
int committedVersion = getPartitionStateVersion() + migrationInfos.size();
return new PartitionRuntimeState(partitions, completedMigrations, committedVersion);
} finally {
lock.unlock();
}
}
/**
 * Called on the master node to publish the current partition state to all cluster nodes. It will not publish the partition
 * state if the partitions have not yet been initialized, there is ongoing repartitioning or a node is joining the cluster.
 * Sending is fire-and-forget; per-member send failures are only logged at FINEST.
 */
@SuppressWarnings("checkstyle:npathcomplexity")
void publishPartitionRuntimeState() {
if (!partitionStateManager.isInitialized()) {
// do not send partition state until initialized!
return;
}
if (!node.isMaster()) {
return;
}
if (!isReplicaSyncAllowed()) {
// migration is disabled because of a member leave, wait till enabled!
return;
}
PartitionRuntimeState partitionState = createPartitionStateInternal();
if (partitionState == null) {
return;
}
if (logger.isFineEnabled()) {
logger.fine("Publishing partition state, version: " + partitionState.getVersion());
}
PartitionStateOperation op = new PartitionStateOperation(partitionState);
OperationService operationService = nodeEngine.getOperationService();
Collection<MemberImpl> members = node.clusterService.getMemberImpls();
for (MemberImpl member : members) {
if (!member.localMember()) {
try {
operationService.send(op, member.getAddress());
} catch (Exception e) {
logger.finest(e);
}
}
}
}
/**
 * Called on the master node to send the partition tables to other cluster members. It will not publish the partition
 * state if the partitions have not yet been initialized.
 * Waits for {@value PTABLE_SYNC_TIMEOUT_SECONDS} for the members to respond to the partition state operation.
 *
 * @return {@code true} if all cluster members have synced their partition tables, {@code false} otherwise.
 */
@SuppressWarnings("checkstyle:npathcomplexity")
boolean syncPartitionRuntimeState() {
if (!partitionStateManager.isInitialized()) {
// do not send partition state until initialized!
return false;
}
if (!node.isMaster()) {
return false;
}
PartitionRuntimeState partitionState = createPartitionStateInternal();
if (partitionState == null) {
return false;
}
if (logger.isFineEnabled()) {
logger.fine("Sync'ing partition state, version: " + partitionState.getVersion());
}
OperationService operationService = nodeEngine.getOperationService();
Collection<MemberImpl> members = node.clusterService.getMemberImpls();
List<Future<Boolean>> calls = firePartitionStateOperation(members, partitionState, operationService);
// timed-out invocations are dropped from the result set and logged by the handler
Collection<Boolean> results = returnWithDeadline(calls, PTABLE_SYNC_TIMEOUT_SECONDS,
TimeUnit.SECONDS, partitionStateSyncTimeoutHandler);
// fewer results than calls means at least one member did not answer in time
if (calls.size() != results.size()) {
return false;
}
for (Boolean result : results) {
if (!result) {
if (logger.isFineEnabled()) {
logger.fine("Partition state, version: " + partitionState.getVersion()
+ " sync failed to one of the members!");
}
return false;
}
}
return true;
}
/**
 * Invokes a sync {@link PartitionStateOperation} on every remote member that has not
 * been removed while the cluster was not joinable, and returns the invocation futures.
 * Per-member invocation failures are logged at FINEST and skipped.
 */
private List<Future<Boolean>> firePartitionStateOperation(Collection<MemberImpl> members,
                                                          PartitionRuntimeState partitionState,
                                                          OperationService operationService) {
    final ClusterServiceImpl clusterService = node.clusterService;
    List<Future<Boolean>> futures = new ArrayList<Future<Boolean>>(members.size());
    for (MemberImpl member : members) {
        if (member.localMember() || clusterService.isMemberRemovedInNotJoinableState(member.getAddress())) {
            continue;
        }
        try {
            PartitionStateOperation operation = new PartitionStateOperation(partitionState, true);
            Future<Boolean> future = operationService.invokeOnTarget(SERVICE_NAME, operation, member.getAddress());
            futures.add(future);
        } catch (Exception e) {
            logger.finest(e);
        }
    }
    return futures;
}
/**
 * Sets the {@code partitionState} if the node is started and the state is sent by the master known by this node.
 *
 * @param partitionState the new partition state
 * @return {@code true} if the partition state was applied
 */
public boolean processPartitionRuntimeState(final PartitionRuntimeState partitionState) {
    final Address sender = partitionState.getEndpoint();
    if (!node.getNodeExtension().isStartCompleted()) {
        logger.warning("Ignoring received partition table, startup is not completed yet. Sender: " + sender);
        return false;
    }
    final Address master = node.getClusterService().getMasterAddress();
    if (node.isMaster() && !node.getThisAddress().equals(sender)) {
        logger.warning("This is the master node and received a PartitionRuntimeState from "
                + sender + ". Ignoring incoming state! ");
        return false;
    }
    if (sender == null || !sender.equals(master)) {
        // BUG FIX: guard the null sender explicitly — previously getMember(sender) was
        // invoked with a null argument, risking a NullPointerException.
        if (sender == null || node.clusterService.getMember(sender) == null) {
            logger.severe("Received a ClusterRuntimeState from an unknown member!"
                    + " => Sender: " + sender + ", Master: " + master + "! ");
            return false;
        }
        logger.warning("Received a ClusterRuntimeState, but its sender doesn't seem to be master!"
                + " => Sender: " + sender + ", Master: " + master + "! "
                + "(Ignore if master node has changed recently.)");
        return false;
    }
    return applyNewState(partitionState, sender);
}
/**
 * Applies the {@code partitionState} sent by the {@code sender} if the new state is newer than the current one
 * and finalizes the migrations.
 * This method does not validate the sender. It is caller method's responsibility.
 * This method will acquire the partition service lock (bounded wait of
 * {@value PTABLE_SYNC_TIMEOUT_SECONDS} seconds; failure to acquire returns {@code false}).
 *
 * @param partitionState the new partition state
 * @param sender the sender of the new partition state
 * @return {@code true} if the partition state version is higher than the current one and was applied or
 * if the partition state version is same as the current one
 */
private boolean applyNewState(PartitionRuntimeState partitionState, Address sender) {
    try {
        if (!lock.tryLock(PTABLE_SYNC_TIMEOUT_SECONDS, TimeUnit.SECONDS)) {
            return false;
        }
    } catch (InterruptedException e) {
        // restore the interrupt status and report failure
        Thread.currentThread().interrupt();
        return false;
    }
    try {
        final int newVersion = partitionState.getVersion();
        final int currentVersion = partitionStateManager.getVersion();
        if (newVersion < currentVersion) {
            logger.warning("Master version should be greater than ours! Local version: " + currentVersion
                    + ", Master version: " + newVersion + " Master: " + sender);
            return false;
        } else if (newVersion == currentVersion) {
            if (logger.isFineEnabled()) {
                // BUG FIX: this branch previously reused the "should be greater" warning text,
                // which is misleading for the equal-version (already applied, no-op) case.
                logger.fine("Partition state version is same as the current version. Local version: "
                        + currentVersion + ", Master: " + sender);
            }
            return true;
        }
        filterAndLogUnknownAddressesInPartitionTable(sender, partitionState.getPartitionTable());
        updatePartitionsAndFinalizeMigrations(partitionState);
        return true;
    } finally {
        lock.unlock();
    }
}
/**
 * Updates all partitions and version, updates (adds and retains) the completed migrations and finalizes the active
 * migration if it is equal to any completed.
 * Must be called while holding the partition service lock.
 *
 * @see MigrationManager#scheduleActiveMigrationFinalization(MigrationInfo)
 */
private void updatePartitionsAndFinalizeMigrations(PartitionRuntimeState partitionState) {
final Address[][] partitionTable = partitionState.getPartitionTable();
updateAllPartitions(partitionTable);
partitionStateManager.setVersion(partitionState.getVersion());
Collection<MigrationInfo> completedMigrations = partitionState.getCompletedMigrations();
for (MigrationInfo completedMigration : completedMigrations) {
// the master only ships terminal-status migrations
assert completedMigration.getStatus() == MigrationStatus.SUCCESS
|| completedMigration.getStatus() == MigrationStatus.FAILED
: "Invalid migration: " + completedMigration;
// schedule finalization only for migrations we have not seen before
if (migrationManager.addCompletedMigration(completedMigration)) {
migrationManager.scheduleActiveMigrationFinalization(completedMigration);
}
}
// setInitialized() returns false when it flipped the flag just now; notify the extension then
if (!partitionStateManager.setInitialized()) {
node.getNodeExtension().onPartitionStateChange();
}
// drop local completed migrations the master no longer tracks
migrationManager.retainCompletedMigrations(completedMigrations);
}
/** Applies the replica addresses from {@code partitionTable} to every local partition. */
private void updateAllPartitions(Address[][] partitionTable) {
    for (int id = 0; id < partitionCount; id++) {
        partitionStateManager.updateReplicaAddresses(id, partitionTable[id]);
    }
}
/**
 * Collects addresses in {@code partitionTable} that are not known cluster members,
 * logs them, and asks the master to re-publish the member list when any were found.
 */
private void filterAndLogUnknownAddressesInPartitionTable(Address sender, Address[][] partitionTable) {
    Set<Address> unknownAddresses = new HashSet<Address>();
    for (int partitionId = 0; partitionId < partitionTable.length; partitionId++) {
        searchUnknownAddressesInPartitionTable(sender, unknownAddresses, partitionId, partitionTable[partitionId]);
    }
    if (unknownAddresses.isEmpty()) {
        return;
    }
    logUnknownAddressesInPartitionTable(sender, unknownAddresses);
    Address masterAddress = node.getClusterService().getMasterAddress();
    // If node is shutting down, master can be null.
    if (masterAddress != null && !masterAddress.equals(node.getThisAddress())) {
        // unknown addresses found in partition table, request a new member-list from master
        nodeEngine.getOperationService().send(new TriggerMemberListPublishOp(), masterAddress);
    }
}
/** Logs the collected unknown addresses at WARNING level; no-op when the set is empty. */
private void logUnknownAddressesInPartitionTable(Address sender, Set<Address> unknownAddresses) {
    if (unknownAddresses.isEmpty() || !logger.isWarningEnabled()) {
        return;
    }
    StringBuilder message = new StringBuilder("Following unknown addresses are found in partition table")
            .append(" sent from master[").append(sender).append("].")
            .append(" (Probably they have recently joined or left the cluster.)")
            .append(" {");
    for (Address address : unknownAddresses) {
        message.append("\n\t").append(address);
    }
    message.append("\n}");
    logger.warning(message.toString());
}
/**
 * Adds to {@code unknownAddresses} every replica address of one partition that is not a
 * current cluster member and was not removed while the cluster was not joinable.
 */
private void searchUnknownAddressesInPartitionTable(Address sender, Set<Address> unknownAddresses, int partitionId,
                                                    Address[] addresses) {
    final ClusterServiceImpl clusterService = node.clusterService;
    final ClusterState clusterState = clusterService.getClusterState();
    for (int replicaIndex = 0; replicaIndex < InternalPartition.MAX_REPLICA_COUNT; replicaIndex++) {
        Address address = addresses[replicaIndex];
        if (address == null || clusterService.getMember(address) != null) {
            continue;
        }
        // skip members removed while the cluster was not joinable (they may come back)
        if (!clusterState.isJoinAllowed() && clusterService.isMemberRemovedInNotJoinableState(address)) {
            continue;
        }
        if (logger.isFinestEnabled()) {
            logger.finest(
                    "Unknown " + address + " found in partition table sent from master "
                            + sender + ". It has probably already left the cluster. partitionId="
                            + partitionId);
        }
        unknownAddresses.add(address);
    }
}
/** Returns a defensive copy of the partition array so callers cannot mutate the live table. */
@Override
public IPartition[] getPartitions() {
    InternalPartition[] current = partitionStateManager.getPartitions();
    IPartition[] snapshot = new IPartition[partitionCount];
    System.arraycopy(current, 0, snapshot, 0, partitionCount);
    return snapshot;
}
// Returns the live internal partition array (no copy) — callers must not mutate it.
@Override
public InternalPartition[] getInternalPartitions() {
return partitionStateManager.getPartitions();
}
// Convenience overload: triggers owner assignment when the partition has no owner yet.
@Override
public InternalPartition getPartition(int partitionId) {
return getPartition(partitionId, true);
}
/**
 * Returns the partition with the given id. When {@code triggerOwnerAssignment} is set and
 * the partition is ownerless, asks for (or triggers) ownership assignment first.
 */
@Override
public InternalPartition getPartition(int partitionId, boolean triggerOwnerAssignment) {
    InternalPartitionImpl partition = partitionStateManager.getPartitionImpl(partitionId);
    if (triggerOwnerAssignment && partition.getOwnerOrNull() == null) {
        // ownerships are probably not assigned yet; force the assignment path
        getPartitionOwner(partitionId);
    }
    return partition;
}
/**
 * Requests a graceful shutdown from the master and waits (up to {@code timeout}) until the
 * master confirms that this member's replicas have been handed over. Lite members and
 * not-yet-joined nodes are trivially safe.
 *
 * @return {@code true} when the shutdown was confirmed within the timeout
 */
@Override
public boolean prepareToSafeShutdown(long timeout, TimeUnit unit) {
    if (!node.getClusterService().isJoined()) {
        return true;
    }
    if (node.isLiteMember()) {
        return true;
    }
    CountDownLatch latch = getShutdownLatch();
    InternalOperationService operationService = nodeEngine.getOperationService();
    long timeoutMillis = unit.toMillis(timeout);
    try {
        do {
            Address masterAddress = nodeEngine.getMasterAddress();
            if (masterAddress == null) {
                logger.warning("Safe shutdown failed, master member is not known!");
                return false;
            }
            if (node.getThisAddress().equals(masterAddress)) {
                onShutdownRequest(node.getThisAddress());
            } else {
                // re-send each round in case the master changed or the request was lost
                operationService.send(new ShutdownRequestOperation(), masterAddress);
            }
            // BUG FIX: recompute the step each round so the last await never exceeds the
            // remaining budget (previously a fixed step could overshoot the timeout).
            long awaitStep = Math.min(SAFE_SHUTDOWN_MAX_AWAIT_STEP_MILLIS, timeoutMillis);
            if (latch.await(awaitStep, TimeUnit.MILLISECONDS)) {
                return true;
            }
            timeoutMillis -= awaitStep;
        } while (timeoutMillis > 0);
    } catch (InterruptedException e) {
        // BUG FIX: restore the interrupt status instead of swallowing it,
        // consistent with the handling in applyNewState().
        Thread.currentThread().interrupt();
        logger.info("Safe shutdown is interrupted!");
    }
    return false;
}
/** Lazily creates the shutdown latch; a lost CAS race yields the winner's latch. */
private CountDownLatch getShutdownLatch() {
    CountDownLatch existing = shutdownLatchRef.get();
    if (existing != null) {
        return existing;
    }
    CountDownLatch created = new CountDownLatch(1);
    return shutdownLatchRef.compareAndSet(null, created) ? created : shutdownLatchRef.get();
}
/**
 * Handles a member's shutdown request on the master. Best-effort: when the partition
 * service lock is contended the request is silently dropped (the requester re-sends).
 */
public void onShutdownRequest(Address address) {
    if (!lock.tryLock()) {
        return;
    }
    try {
        migrationManager.onShutdownRequest(address);
    } finally {
        lock.unlock();
    }
}
/** Called when the master confirms this member's shutdown request; releases the waiting latch. */
public void onShutdownResponse() {
CountDownLatch latch = shutdownLatchRef.get();
// a response can only arrive after prepareToSafeShutdown created the latch
assert latch != null;
latch.countDown();
}
// Safe means all first/backup replicas are in sync and no migration is in progress.
@Override
public boolean isMemberStateSafe() {
return partitionReplicaStateChecker.getPartitionServiceState() == PartitionServiceState.SAFE;
}
/**
 * Returns {@code true} when a migration is running locally or — on non-master
 * members — when the master reports an ongoing migration.
 */
@Override
public boolean hasOnGoingMigration() {
    if (hasOnGoingMigrationLocal()) {
        return true;
    }
    return !node.isMaster() && partitionReplicaStateChecker.hasOnGoingMigrationMaster(Level.FINEST);
}
// Delegates to the migration manager: true while any local migration task is active/queued.
@Override
public boolean hasOnGoingMigrationLocal() {
return migrationManager.hasOnGoingMigration();
}
// Maps the key's partition hash onto [0, partitionCount) via HashUtil.hashToIndex.
@Override
public final int getPartitionId(Data key) {
return HashUtil.hashToIndex(key.getPartitionHash(), partitionCount);
}
// Serializes the key first, then maps its partition hash to a partition id.
@Override
public final int getPartitionId(Object key) {
return getPartitionId(nodeEngine.toData(key));
}
// Fixed at construction time from GroupProperty.PARTITION_COUNT.
@Override
public final int getPartitionCount() {
return partitionCount;
}
/** Returns the configured migration timeout in milliseconds. */
public long getPartitionMigrationTimeout() {
return partitionMigrationTimeout;
}
// The replica manager doubles as the replica-version manager.
@Override
public PartitionReplicaVersionManager getPartitionReplicaVersionManager() {
return replicaManager;
}
/**
 * Builds an owner-address → owned-partition-ids map, blocking per partition until an
 * owner is assigned. Lists are pre-sized with the expected partitions-per-member count.
 */
@Override
public Map<Address, List<Integer>> getMemberPartitionsMap() {
    Collection<Member> dataMembers = node.getClusterService().getMembers(DATA_MEMBER_SELECTOR);
    int dataMemberCount = dataMembers.size();
    // expected list capacity; float division is fine for a sizing hint
    int expectedPerMember = (dataMemberCount > 0 ? (int) ceil((float) partitionCount / dataMemberCount) : 0);
    Map<Address, List<Integer>> partitionsByOwner = new HashMap<Address, List<Integer>>(dataMemberCount);
    for (int partitionId = 0; partitionId < partitionCount; partitionId++) {
        Address owner = getPartitionOwnerOrWait(partitionId);
        List<Integer> owned = partitionsByOwner.get(owner);
        if (owned == null) {
            owned = new ArrayList<Integer>(expectedPerMember);
            partitionsByOwner.put(owner, owned);
        }
        owned.add(partitionId);
    }
    return partitionsByOwner;
}
/**
 * Returns the ids of all partitions currently owned by the given address.
 *
 * @param target the address whose owned partitions are collected
 * @return the list of owned partition ids (empty if the address owns none)
 */
@Override
public List<Integer> getMemberPartitions(Address target) {
    // ArrayList instead of LinkedList: the list is append-only while building
    // and is typically iterated/indexed by callers — LinkedList buys nothing here.
    List<Integer> ownedPartitions = new ArrayList<Integer>();
    for (int partitionId = 0; partitionId < partitionCount; partitionId++) {
        if (target.equals(getPartitionOwner(partitionId))) {
            ownedPartitions.add(partitionId);
        }
    }
    return ownedPartitions;
}
/**
 * Like {@link #getMemberPartitions(Address)}, but returns an empty list when
 * the partition table has not been initialized yet.
 */
@Override
public List<Integer> getMemberPartitionsIfAssigned(Address target) {
    return partitionStateManager.isInitialized()
            ? getMemberPartitions(target)
            : Collections.<Integer>emptyList();
}
/**
 * Resets the partition service to its initial state under the service lock:
 * cancels any pending partition-table fetch, then resets replica sync state,
 * the partition table and the migration state, in that order.
 */
@Override
public void reset() {
    lock.lock();
    try {
        // Clear the pending-fetch flag first so no new fetch task is scheduled mid-reset.
        shouldFetchPartitionTables = false;
        replicaManager.reset();
        partitionStateManager.reset();
        migrationManager.reset();
    } finally {
        lock.unlock();
    }
}
/** Pauses migrations; delegates to the migration manager. */
@Override
public void pauseMigration() {
    migrationManager.pauseMigration();
}

/** Resumes previously paused migrations; delegates to the migration manager. */
@Override
public void resumeMigration() {
    migrationManager.resumeMigration();
}

/** Returns whether replica synchronization is currently allowed, i.e. migration is allowed. */
public boolean isReplicaSyncAllowed() {
    return migrationManager.isMigrationAllowed();
}
/**
 * Shuts down the partition service: stops the migration manager and resets all
 * partition state.
 *
 * @param terminate ignored — graceful shutdown and terminate are handled identically here
 */
@Override
public void shutdown(boolean terminate) {
    logger.finest("Shutting down the partition service");
    migrationManager.stop();
    reset();
}
/** Returns the number of queued migration tasks; exposed as a metrics probe. */
@Override
@Probe
public long getMigrationQueueSize() {
    return migrationManager.getMigrationQueueSize();
}

/** Returns the user-facing partition service proxy. */
public PartitionServiceProxy getPartitionServiceProxy() {
    return proxy;
}
/** Registers a migration listener and returns its registration id. */
@Override
public String addMigrationListener(MigrationListener listener) {
    return partitionEventManager.addMigrationListener(listener);
}

/** Removes a previously registered migration listener by its registration id. */
@Override
public boolean removeMigrationListener(String registrationId) {
    return partitionEventManager.removeMigrationListener(registrationId);
}

/** Registers a partition-lost listener and returns its registration id. */
@Override
public String addPartitionLostListener(PartitionLostListener listener) {
    return partitionEventManager.addPartitionLostListener(listener);
}

/** Registers a local-only partition-lost listener and returns its registration id. */
@Override
public String addLocalPartitionLostListener(PartitionLostListener listener) {
    return partitionEventManager.addLocalPartitionLostListener(listener);
}

/** Removes a previously registered partition-lost listener by its registration id. */
@Override
public boolean removePartitionLostListener(String registrationId) {
    return partitionEventManager.removePartitionLostListener(registrationId);
}
/** Dispatches the partition event to the listener synchronously, on the calling thread. */
@Override
public void dispatchEvent(PartitionEvent partitionEvent, PartitionEventListener partitionEventListener) {
    partitionEventListener.onEvent(partitionEvent);
}
/** Adds an internal partition listener as a child listener, under the partition service lock. */
public void addPartitionListener(PartitionListener listener) {
    lock.lock();
    try {
        partitionListener.addChildListener(listener);
    } finally {
        lock.unlock();
    }
}
/** Returns whether the given partition's primary replica lives on this member. */
@Override
public boolean isPartitionOwner(int partitionId) {
    return partitionStateManager.getPartitionImpl(partitionId).isLocal();
}
/** Returns the current version of the partition table. */
@Override
public int getPartitionStateVersion() {
    return partitionStateManager.getVersion();
}

/** Forwards a partition-lost event to the partition event manager. */
@Override
public void onPartitionLost(IPartitionLostEvent event) {
    partitionEventManager.onPartitionLost(event);
}
/** Installs the given internal migration listener on the migration manager. */
public void setInternalMigrationListener(InternalMigrationListener listener) {
    migrationManager.setInternalMigrationListener(listener);
}

/** Returns the currently installed internal migration listener. */
public InternalMigrationListener getInternalMigrationListener() {
    return migrationManager.getInternalMigrationListener();
}

/** Resets the migration manager's internal migration listener. */
public void resetInternalMigrationListener() {
    migrationManager.resetInternalMigrationListener();
}
/**
 * Returns the replica-sync operations currently in flight.
 *
 * @return copy of ongoing replica-sync operations
 */
public List<ReplicaFragmentSyncInfo> getOngoingReplicaSyncRequests() {
    return replicaManager.getOngoingReplicaSyncRequests();
}

/**
 * Returns the replica-sync requests that are scheduled but not yet started.
 *
 * @return copy of scheduled replica-sync requests
 */
public List<ScheduledEntry<ReplicaFragmentSyncInfo, Void>> getScheduledReplicaSyncRequests() {
    return replicaManager.getScheduledReplicaSyncRequests();
}
/** Returns the component holding the partition table and its version. */
public PartitionStateManager getPartitionStateManager() {
    return partitionStateManager;
}

/** Returns the component coordinating migrations. */
public MigrationManager getMigrationManager() {
    return migrationManager;
}

/** Returns the component managing replica synchronization and versions. */
public PartitionReplicaManager getReplicaManager() {
    return replicaManager;
}

/** Returns the component that evaluates cluster-wide partition replica safety. */
public PartitionReplicaStateChecker getPartitionReplicaStateChecker() {
    return partitionReplicaStateChecker;
}

/** Returns the component managing partition/migration event listeners and dispatch. */
public PartitionEventManager getPartitionEventManager() {
    return partitionEventManager;
}
/** Returns whether a "fetch most recent partition table" task still needs to run. */
boolean isFetchMostRecentPartitionTableTaskRequired() {
    return shouldFetchPartitionTables;
}
/**
 * Schedules a {@link FetchMostRecentPartitionTableTask} if a fetch is still
 * pending, under the partition service lock.
 *
 * @return {@code true} if the task was scheduled, {@code false} otherwise
 */
boolean scheduleFetchMostRecentPartitionTableTaskIfRequired() {
    lock.lock();
    try {
        if (!shouldFetchPartitionTables) {
            return false;
        }
        migrationManager.schedule(new FetchMostRecentPartitionTableTask());
        return true;
    } finally {
        lock.unlock();
    }
}
/** Replaces {@code oldAddress} with {@code newAddress} in the partition table, under the service lock. */
public void replaceAddress(Address oldAddress, Address newAddress) {
    lock.lock();
    try {
        partitionStateManager.replaceAddress(oldAddress, newAddress);
    } finally {
        lock.unlock();
    }
}
/** Returns a view of the current partition table, taken under the service lock for consistency. */
@Override
public PartitionTableView createPartitionTableView() {
    lock.lock();
    try {
        return partitionStateManager.getPartitionTable();
    } finally {
        lock.unlock();
    }
}
/**
* Invoked on a node when it becomes master. It will receive partition states from all members and consolidate them into one.
* It guarantees the monotonicity of the partition table.
* <ul>
* <li>Fetch partition tables from all cluster members</li>
* <li>Pick the most up-to-date partition table and apply it locally</li>
* <li>Complete the pending migration, if present</li>
* <li>Send the new partition table to all cluster members</li>
* </ul>
*/
private class FetchMostRecentPartitionTableTask implements MigrationRunnable {
private final Address thisAddress = node.getThisAddress();
private int maxVersion;
private PartitionRuntimeState newState;
public void run() {
maxVersion = partitionStateManager.getVersion();
Collection<Future<PartitionRuntimeState>> futures = invokeFetchPartitionStateOps();
logger.info("Fetching most recent partition table! my version: " + maxVersion);
Collection<MigrationInfo> allCompletedMigrations = new HashSet<MigrationInfo>();
Collection<MigrationInfo> allActiveMigrations = new HashSet<MigrationInfo>();
processResults(futures, allCompletedMigrations, allActiveMigrations);
logger.info("Most recent partition table version: " + maxVersion);
processNewState(allCompletedMigrations, allActiveMigrations);
syncPartitionRuntimeState();
}
/** Sends {@link FetchPartitionStateOperation} to all cluster members. */
private Collection<Future<PartitionRuntimeState>> invokeFetchPartitionStateOps() {
Collection<MemberImpl> members = node.clusterService.getMemberImpls();
Collection<Future<PartitionRuntimeState>> futures = new ArrayList<Future<PartitionRuntimeState>>(
members.size());
for (MemberImpl m : members) {
if (m.localMember()) {
continue;
}
Future<PartitionRuntimeState> future = nodeEngine.getOperationService()
.createInvocationBuilder(SERVICE_NAME, new FetchPartitionStateOperation(),
m.getAddress()).setTryCount(Integer.MAX_VALUE)
.setCallTimeout(Long.MAX_VALUE).invoke();
futures.add(future);
}
return futures;
}
/** Collects all completed and active migrations and sets the partition state to the latest version. */
private void processResults(Collection<Future<PartitionRuntimeState>> futures,
Collection<MigrationInfo> allCompletedMigrations, Collection<MigrationInfo> allActiveMigrations) {
for (Future<PartitionRuntimeState> future : futures) {
try {
PartitionRuntimeState state = future.get();
if (state == null) {
// state can be null, if not initialized
continue;
}
if (maxVersion < state.getVersion()) {
newState = state;
maxVersion = state.getVersion();
}
allCompletedMigrations.addAll(state.getCompletedMigrations());
if (state.getActiveMigration() != null) {
allActiveMigrations.add(state.getActiveMigration());
}
} catch (TargetNotMemberException e) {
EmptyStatement.ignore(e);
} catch (MemberLeftException e) {
EmptyStatement.ignore(e);
} catch (InterruptedException e) {
logger.fine("FetchMostRecentPartitionTableTask is interrupted.");
} catch (ExecutionException e) {
Throwable cause = e.getCause();
if (!(cause instanceof TargetNotMemberException) && !(cause instanceof MemberLeftException)) {
logger.warning("Failed to fetch partition table!", e);
}
}
}
}
/**
* Applies a partition state and marks all migrations (including local) as complete, when a newer state is received.
* The method will acquire the partition state lock.
*
* @param allCompletedMigrations received completed migrations from other nodes
* @param allActiveMigrations received active migrations from other nodes
*/
private void processNewState(Collection<MigrationInfo> allCompletedMigrations,
Collection<MigrationInfo> allActiveMigrations) {
lock.lock();
try {
processMigrations(allCompletedMigrations, allActiveMigrations);
if (newState != null) {
newState.setCompletedMigrations(allCompletedMigrations);
maxVersion = Math.max(maxVersion, getPartitionStateVersion()) + 1;
newState.setVersion(maxVersion);
logger.info("Applying the most recent of partition state...");
applyNewState(newState, thisAddress);
} else if (partitionStateManager.isInitialized()) {
partitionStateManager.incrementVersion();
node.getNodeExtension().onPartitionStateChange();
for (MigrationInfo migrationInfo : allCompletedMigrations) {
if (migrationManager.addCompletedMigration(migrationInfo)) {
if (logger.isFinestEnabled()) {
logger.finest("Scheduling migration finalization after finding most recent partition table: "
+ migrationInfo);
}
migrationManager.scheduleActiveMigrationFinalization(migrationInfo);
}
}
}
shouldFetchPartitionTables = false;
} finally {
lock.unlock();
}
}
/** Moves all migrations to completed (including local) and marks active migrations as {@link MigrationStatus#FAILED}. */
private void processMigrations(Collection<MigrationInfo> allCompletedMigrations,
Collection<MigrationInfo> allActiveMigrations) {
allCompletedMigrations.addAll(migrationManager.getCompletedMigrationsCopy());
if (migrationManager.getActiveMigration() != null) {
allActiveMigrations.add(migrationManager.getActiveMigration());
}
for (MigrationInfo activeMigration : allActiveMigrations) {
activeMigration.setStatus(MigrationStatus.FAILED);
if (allCompletedMigrations.add(activeMigration)) {
logger.info("Marked active migration " + activeMigration + " as " + MigrationStatus.FAILED);
}
}
}
}
/** Short diagnostic summary: current partition table version and migration queue size. */
@Override
public String toString() {
    StringBuilder sb = new StringBuilder("InternalPartitionService {");
    sb.append("version: ").append(getPartitionStateVersion());
    sb.append(", migrationQ: ").append(getMigrationQueueSize());
    sb.append('}');
    return sb.toString();
}
}