/* * Copyright (c) 2008-2012, Hazel Bilisim Ltd. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.hazelcast.impl; import com.hazelcast.cluster.AbstractRemotelyCallable; import com.hazelcast.cluster.AbstractRemotelyProcessable; import com.hazelcast.cluster.ClusterManager.AsyncRemotelyBooleanCallable; import com.hazelcast.cluster.MemberInfo; import com.hazelcast.core.DistributedTask; import com.hazelcast.core.Member; import com.hazelcast.impl.base.DataRecordEntry; import com.hazelcast.impl.base.RecordSet; import com.hazelcast.impl.base.SystemLogService; import com.hazelcast.impl.concurrentmap.CostAwareRecordList; import com.hazelcast.impl.concurrentmap.ValueHolder; import com.hazelcast.impl.partition.*; import com.hazelcast.logging.ILogger; import com.hazelcast.nio.Address; import com.hazelcast.nio.Connection; import com.hazelcast.partition.MigrationEvent; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.util.*; import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import java.util.logging.Level; public class PartitionManager { private static final long MIGRATING_PARTITION_CHECK_INTERVAL = TimeUnit.SECONDS.toMillis(300); // 5 MINUTES private static final long REPARTITIONING_CHECK_INTERVAL = TimeUnit.SECONDS.toMillis(300); // 5 MINUTES private static final int 
REPARTITIONING_TASK_COUNT_THRESHOLD = 20; // task-count limit used by CheckRepartitioningTask
    // CheckRepartitioningTask only counts tasks whose replica index is <= this value.
    private static final int REPARTITIONING_TASK_REPLICA_THRESHOLD = 2;

    private final ConcurrentMapManager concurrentMapManager;
    private final ILogger logger;
    // Number of partitions; read once from ConcurrentMapManager in the constructor.
    private final int PARTITION_COUNT;
    // Partition table, indexed by partition id (one PartitionInfo per partition).
    private final PartitionInfo[] partitions;
    // updates will come from ServiceThread (one exception is PartitionManager.reset())
    // but reads will be multithreaded.
    private volatile MigratingPartition migratingPartition;
    // Set to true by firstArrangement() / setClusterRuntimeState(), back to false by reset().
    private volatile boolean initialized = false;
    // Partition table version: incremented on replica changes while this node is master,
    // overwritten by setClusterRuntimeState() and zeroed by reset().
    private final AtomicInteger version = new AtomicInteger();
    private final List<PartitionListener> lsPartitionListeners = new CopyOnWriteArrayList<PartitionListener>();
    // Both intervals are the configured property value multiplied by 1000; they are later
    // compared against / passed to millisecond APIs (currentTimeMillis deltas, Thread.sleep).
    private final int partitionMigrationInterval;
    private final int immediateBackupInterval;
    private final MigrationService migrationService;
    private boolean running = true; // accessed only by MigrationService thread
    // Tasks the MigrationService thread drains first.
    private final BlockingQueue<Runnable> immediateTasksQueue = new LinkedBlockingQueue<Runnable>();
    // Tasks the MigrationService thread runs one at a time after its wait period.
    private final Queue<Runnable> scheduledTasksQueue = new LinkedBlockingQueue<Runnable>();
    // True from the start of fixReplicasAndPartitionsForDead() until its queued completion
    // task runs; syncForAdd() does not start a repartitioning while it is set.
    private final AtomicBoolean sendingDiffs = new AtomicBoolean(false);
    private final AtomicBoolean migrationActive = new AtomicBoolean(true); // for testing purposes only
    // Timestamp (System.currentTimeMillis) written by fillMigrationQueues().
    private final AtomicLong lastRepartitionTime = new AtomicLong();
    private final SystemLogService systemLogService;

    /**
     * Builds an empty partition table, starts the {@link MigrationService} thread and
     * schedules the periodic tasks on the node's scheduled executor:
     * {@link SendClusterStateTask} and {@link CheckMigratingPartitionTask} every
     * {@code PARTITION_TABLE_SEND_INTERVAL} seconds (at least 1), and a repartitioning
     * check every 180 seconds.
     *
     * @param concurrentMapManager owning manager; supplies the node, partition count and logger
     */
    public PartitionManager(final ConcurrentMapManager concurrentMapManager) {
        this.PARTITION_COUNT = concurrentMapManager.getPartitionCount();
        this.concurrentMapManager = concurrentMapManager;
        this.logger = concurrentMapManager.node.getLogger(PartitionManager.class.getName());
        this.partitions = new PartitionInfo[PARTITION_COUNT];
        final Node node = concurrentMapManager.node;
        systemLogService = node.getSystemLogService();
        for (int i = 0; i < PARTITION_COUNT; i++) {
            // Every PartitionInfo gets its own listener that fans replica changes out to the
            // registered listeners and keeps the table version up to date.
            this.partitions[i] = new PartitionInfo(i, new PartitionListener() {
                public void replicaChanged(PartitionReplicaChangeEvent event) {
                    for (PartitionListener partitionListener : lsPartitionListeners) {
                        partitionListener.replicaChanged(event);
                    }
                    // Replica index 0 is the owner; losing it on an active, joined node is
                    // reported as possible data loss.
                    if (event.getReplicaIndex() == 0 && event.getNewAddress() == null
                            && node.isActive() && node.joined()) {
                        final String warning = "Owner of partition is being removed! "
                                + "Possible data loss for partition[" + event.getPartitionId() + "]. "
                                + event;
                        logger.log(Level.WARNING, warning);
                        systemLogService.logPartition(warning);
                    }
                    // Only the master bumps the version; other members take it from the
                    // state received in setClusterRuntimeState().
                    if (concurrentMapManager.isMaster()) {
                        version.incrementAndGet();
                    }
                }
            });
        }
        partitionMigrationInterval = node.groupProperties.PARTITION_MIGRATION_INTERVAL.getInteger() * 1000;
        immediateBackupInterval = node.groupProperties.IMMEDIATE_BACKUP_INTERVAL.getInteger() * 1000;
        migrationService = new MigrationService(node);
        migrationService.start();
        int partitionTableSendInterval = node.groupProperties.PARTITION_TABLE_SEND_INTERVAL.getInteger();
        // A non-positive interval falls back to one second.
        if (partitionTableSendInterval <= 0) {
            partitionTableSendInterval = 1;
        }
        node.executorManager.getScheduledExecutorService().scheduleAtFixedRate(new SendClusterStateTask(),
                partitionTableSendInterval, partitionTableSendInterval, TimeUnit.SECONDS);
        node.executorManager.getScheduledExecutorService().scheduleAtFixedRate(new CheckMigratingPartitionTask(),
                partitionTableSendInterval, partitionTableSendInterval, TimeUnit.SECONDS);
        // Every 180 seconds the master queues a CheckRepartitioningTask, but only when the
        // table is initialized and shouldCheckRepartitioning() holds.
        node.executorManager.getScheduledExecutorService().scheduleAtFixedRate(new Runnable() {
            public void run() {
                if (concurrentMapManager.isMaster() && node.isActive()
                        && initialized && shouldCheckRepartitioning()) {
                    logger.log(Level.FINEST, "Checking partition table for repartitioning...");
                    immediateTasksQueue.add(new CheckRepartitioningTask());
                }
            }
        }, 180, 180, TimeUnit.SECONDS);
    }

    // for testing purposes only
    /**
     * Re-enables migration task processing.
     *
     * @return the previous value of the migration-active flag
     */
    public boolean activateMigration() {
        return migrationActive.getAndSet(true);
    }

    // for testing purposes only
    /**
     * Disables migration task processing and blocks, polling every 250 ms, until no
     * migrating partition is registered. Returns early if the waiting thread is interrupted.
     *
     * @return always {@code true}
     */
    public boolean inactivateMigration() {
        migrationActive.getAndSet(false);
        while (migratingPartition != null) {
            try {
                Thread.sleep(250);
            } catch (InterruptedException e) {
                return true;
            }
        }
        return true;
    }

    private void
sendClusterRuntimeState() { if (!concurrentMapManager.isMaster() || !concurrentMapManager.isActive() || !concurrentMapManager.node.joined()) { return; } // do not send partition state until initialized! // sending partition state makes nodes believe initialization completed. if (!initialized) return; long clusterTime = concurrentMapManager.node.getClusterImpl().getClusterTime(); List<MemberImpl> lsMembers = concurrentMapManager.lsMembers; ArrayList<MemberInfo> memberInfos = new ArrayList<MemberInfo>(lsMembers.size()); for (MemberImpl member : lsMembers) { memberInfos.add(new MemberInfo(member.getAddress(), member.getNodeType(), member.getUuid())); } ClusterRuntimeState crs = new ClusterRuntimeState(memberInfos, partitions, clusterTime, version.get()); concurrentMapManager.sendProcessableToAll(crs, false); } // for testing purposes only private void printPartitionOwnerDuplicates() { for (PartitionInfo partition : partitions) { L: for (int index = 0; index < PartitionInfo.MAX_REPLICA_COUNT; index++) { Address address = partition.getReplicaAddress(index); if (address != null) { for (int k = 0; k < PartitionInfo.MAX_REPLICA_COUNT; k++) { if (k != index && address.equals(partition.getReplicaAddress(k))) { logger.log(Level.WARNING, "DUPLICATE ==> " + partition); break L; } } } } } } public MigratingPartition getMigratingPartition() { return migratingPartition; } public void addPartitionListener(PartitionListener partitionListener) { lsPartitionListeners.add(partitionListener); } public PartitionInfo[] getPartitions() { return partitions; } public Address getOwner(int partitionId) { concurrentMapManager.checkServiceThread(); if (!initialized) { firstArrangement(); } Address owner = partitions[partitionId].getOwner(); if (owner == null && !concurrentMapManager.isMaster()) { concurrentMapManager.sendProcessableTo(new AssignPartitions(), concurrentMapManager.getMasterAddress()); } return owner; } public void firstArrangement() { concurrentMapManager.checkServiceThread(); if 
(!concurrentMapManager.isMaster() || !concurrentMapManager.isActive()) return; if (!hasStorageMember()) return; if (!initialized) { PartitionStateGenerator psg = getPartitionStateGenerator(); logger.log(Level.INFO, "Initializing cluster partition table first arrangement..."); PartitionInfo[] newState = psg.initialize(concurrentMapManager.lsMembers, PARTITION_COUNT); if (newState != null) { for (PartitionInfo partitionInfo : newState) { partitions[partitionInfo.getPartitionId()].setPartitionInfo(partitionInfo); } } sendClusterRuntimeState(); initialized = true; } } private PartitionStateGenerator getPartitionStateGenerator() { return PartitionStateGeneratorFactory.newConfigPartitionStateGenerator( concurrentMapManager.node.getConfig().getPartitionGroupConfig()); } public CostAwareRecordList getActivePartitionRecords(final int partitionId, final int replicaIndex, final Address newAddress, boolean diffOnly) { final Address thisAddress = concurrentMapManager.node.getThisAddress(); concurrentMapManager.enqueueAndWait(new Processable() { public void process() { addActiveMigration(partitionId, replicaIndex, thisAddress, newAddress); } }); long now = System.currentTimeMillis(); final Collection<CMap> cmaps = concurrentMapManager.maps.values(); CostAwareRecordList lsResultSet = new CostAwareRecordList(1000); for (final CMap cmap : cmaps) { boolean includeCMap = diffOnly ? 
cmap.getBackupCount() == replicaIndex : cmap.getBackupCount() >= replicaIndex;
            if (includeCMap) {
                for (Record rec : cmap.mapRecords.values()) {
                    if (rec.isActive() && rec.isValid(now)) {
                        // The key sanity check applies to every active, valid record of the
                        // map, not only to records of the requested partition.
                        if (rec.getKeyData() == null || rec.getKeyData().size() == 0) {
                            throw new RuntimeException("Record.key is null or empty " + rec.getKeyData());
                        }
                        if (rec.getBlockId() == partitionId) {
                            cmap.onMigrate(rec);
                            if (cmap.isMultiMap()) {
                                // one copy of the record per multi-map value
                                Collection<ValueHolder> colValues = rec.getMultiValues();
                                for (ValueHolder valueHolder : colValues) {
                                    Record record = rec.copy();
                                    record.setValueData(valueHolder.getData());
                                    lsResultSet.add(record);
                                }
                            } else {
                                lsResultSet.add(rec);
                            }
                            // cost is added once per source record, also for multi-maps
                            lsResultSet.addCost(rec.getCost());
                        }
                    }
                }
            }
        }
        return lsResultSet;
    }

    /**
     * Registers the given migration as the active one; delegates to the four-argument overload.
     *
     * @param migrationRequestTask migration to register
     */
    private void addActiveMigration(final MigratingPartition migrationRequestTask) {
        addActiveMigration(migrationRequestTask.getPartitionId(), migrationRequestTask.getReplicaIndex(),
                migrationRequestTask.getFromAddress(), migrationRequestTask.getToAddress());
    }

    /**
     * Makes the described migration the active one unless an equal migration is already
     * registered. A different registered migration is replaced (logged at FINEST).
     *
     * @param partitionId    migrating partition
     * @param replicaIndex   migrating replica index
     * @param currentAddress source address of the migration
     * @param newAddress     destination address of the migration
     */
    private void addActiveMigration(final int partitionId, final int replicaIndex,
                                    final Address currentAddress, final Address newAddress) {
        concurrentMapManager.checkServiceThread();
        final MigratingPartition currentMigratingPartition = migratingPartition;
        final MigratingPartition newMigratingPartition = new MigratingPartition(partitionId,
                replicaIndex, currentAddress, newAddress);
        if (!newMigratingPartition.equals(currentMigratingPartition)) {
            if (currentMigratingPartition != null) {
                logger.log(Level.FINEST, "Replacing current " + currentMigratingPartition
                        + " with " + newMigratingPartition);
            }
            migratingPartition = newMigratingPartition;
        }
    }

    /**
     * Replaces the active migration with {@code newMigratingPartition} only if the currently
     * registered one matches {@code expectedMigratingPartition} (both null, or equal).
     *
     * @param expectedMigratingPartition migration expected to be registered, may be null
     * @param newMigratingPartition      value to store on a match, may be null to clear
     */
    private void compareAndSetActiveMigratingPartition(final MigratingPartition expectedMigratingPartition,
                                                       final MigratingPartition newMigratingPartition) {
        concurrentMapManager.checkServiceThread();
        if (expectedMigratingPartition == null) {
            if (migratingPartition == null) {
                migratingPartition = newMigratingPartition;
            }
        } else if (expectedMigratingPartition.equals(migratingPartition)) {
            migratingPartition = newMigratingPartition;
        }
    }

    /**
     * Applies migrated records on this member: registers the migration as active, then
     * stores every record as owned data (replica index 0) or as backup data (any other index).
     *
     * @param partitionId  migrated partition
     * @param replicaIndex migrated replica index
     * @param recordSet    records received from the source member
     * @param from         address of the source member
     */
    public void doMigrate(final int partitionId, final int replicaIndex, final RecordSet recordSet, final Address from) {
        concurrentMapManager.enqueueAndWait(new Processable() {
            public void process() {
                addActiveMigration(partitionId, replicaIndex, from, concurrentMapManager.thisAddress);
                for (DataRecordEntry dataRecordEntry : recordSet.getRecords()) {
                    CMap cmap = concurrentMapManager.getOrCreateMap(dataRecordEntry.getName());
                    if (replicaIndex == 0) {
                        // owner
                        cmap.own(dataRecordEntry);
                    } else {
                        // backup
                        cmap.storeAsBackup(dataRecordEntry);
                    }
                }
            }
        });
    }

    /**
     * Looks up a cluster member by address.
     *
     * @param address address to look for, may be null
     * @return the member with that address, or {@code null} if the address is null or unknown
     */
    public MemberImpl getMember(Address address) {
        if (address != null) {
            for (Member member : concurrentMapManager.node.getClusterImpl().getMembers()) {
                MemberImpl memberImpl = (MemberImpl) member;
                if (memberImpl.getAddress().equals(address)) return memberImpl;
            }
        }
        return null;
    }

    /**
     * Returns the manager to its uninitialized state: clears both task queues, drops the
     * active migration, nulls every replica address of every partition and zeroes the version.
     */
    public void reset() {
        initialized = false;
        clearTaskQueues();
        migratingPartition = null;
        for (PartitionInfo partition : partitions) {
            for (int i = 0; i < PartitionInfo.MAX_REPLICA_COUNT; i++) {
                partition.setReplicaAddress(i, null);
            }
        }
        version.set(0);
    }

    /** Discards all pending immediate and scheduled tasks. */
    private void clearTaskQueues() {
        immediateTasksQueue.clear();
        scheduledTasksQueue.clear();
    }

    /**
     * Stops the migration thread: discards pending tasks, then queues a task that clears the
     * {@code running} flag and waits up to one second for it to be executed. An interrupt
     * during the wait is ignored.
     */
    public void shutdown() {
        logger.log(Level.FINEST, "Shutting down the partition manager");
        try {
            clearTaskQueues();
            final CountDownLatch stopLatch = new CountDownLatch(1);
            // The flag is flipped from inside a queued task rather than directly from the
            // calling thread.
            immediateTasksQueue.offer(new Runnable() {
                public void run() {
                    running = false;
                    stopLatch.countDown();
                }
            });
            stopLatch.await(1, TimeUnit.SECONDS);
        } catch (InterruptedException ignored) {
        }
    }

    /**
     * @return true if at least one known member is not a lite member
     */
    private boolean hasStorageMember() {
        for (MemberImpl member : concurrentMapManager.lsMembers) {
            if (!member.isLiteMember()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Repairs local state after a member has left: drops a migration involving the dead
     * member, removes the dead address from the partition table, fixes map records and,
     * on the master, queues replica diff tasks followed by a repartitioning.
     * Does nothing when the dead address is null or is this member's own address.
     *
     * @param deadMember member that left the cluster
     */
    public void syncForDead(MemberImpl deadMember) {
        Address deadAddress = deadMember.getAddress();
        Address thisAddress = concurrentMapManager.getThisAddress();
        if (deadAddress == null ||
deadAddress.equals(thisAddress)) {
            return;
        }
        // without any storage member there is nothing to keep; start over from scratch
        if (!hasStorageMember()) {
            reset();
        }
        concurrentMapManager.partitionServiceImpl.reset();
        checkMigratingPartitionForDead(deadAddress);
        // list of partitions those have dead member in their replicas
        // !! this should be calculated before dead member is removed from partition table !!
        int[] indexesOfDead = new int[partitions.length];
        for (PartitionInfo partition : partitions) {
            indexesOfDead[partition.getPartitionId()] = partition.getReplicaIndexOf(deadAddress);
        }
        if (!deadMember.isLiteMember()) {
            clearTaskQueues();
            // shift partition table up.
            for (PartitionInfo partition : partitions) {
                // safe removal of dead address from partition table.
                // there might be duplicate dead address in partition table
                // during migration tasks' execution (when there are multiple backups and
                // copy backup tasks; see MigrationRequestTask selfCopyReplica.)
                // or because of a bug.
                while (partition.onDeadAddress(deadAddress)) ;
            }
        }
        fixCMapsForDead(deadAddress, indexesOfDead);
        fixReplicasAndPartitionsForDead(deadMember, indexesOfDead);
    }

    /**
     * Master-only follow-up to a storage member's death: for every partition that held the
     * dead member, queues replica diff tasks from the partition owner to the replica holders
     * at the affected indexes, clears replica slots from {@code maxBackupCount} upwards,
     * publishes the table and finally queues a repartitioning.
     *
     * @param deadMember    member that left the cluster
     * @param indexesOfDead per partition id, the replica index the dead member held before
     *                      removal; -1 is treated as "not a replica of this partition"
     */
    private void fixReplicasAndPartitionsForDead(final MemberImpl deadMember, final int[] indexesOfDead) {
        if (!deadMember.isLiteMember() && concurrentMapManager.isMaster() && concurrentMapManager.isActive()) {
            // stays true until the completion task queued below has run
            sendingDiffs.set(true);
            logger.log(Level.INFO, "Starting to send partition replica diffs..."
                    + sendingDiffs.get());
            int diffCount = 0;
            final int maxBackupCount = getMaxBackupCount();
            for (int partitionId = 0; partitionId < indexesOfDead.length; partitionId++) {
                int indexOfDead = indexesOfDead[partitionId];
                if (indexOfDead != -1) {
                    PartitionInfo partition = partitions[partitionId];
                    Address owner = partition.getOwner();
                    if (owner == null) {
                        logger.log(Level.FINEST, "Owner of one of the replicas of Partition["
                                + partitionId + "] is dead, but partition owner "
                                + "could not be found either!");
                        logger.log(Level.FINEST, partition.toString());
                        continue;
                    }
                    // send replica diffs to new replica owners after partition table shift.
                    for (int replicaIndex = indexOfDead; replicaIndex < maxBackupCount; replicaIndex++) {
                        Address target = partition.getReplicaAddress(replicaIndex);
                        // the owner itself never needs a diff
                        if (target != null && !target.equals(owner)) {
                            if (getMember(target) != null) {
                                MigrationRequestTask mrt = new MigrationRequestTask(partitionId, owner, target,
                                        replicaIndex, false, true);
                                immediateTasksQueue.offer(new Migrator(mrt));
                                diffCount++;
                            } else {
                                logger.log(Level.WARNING, "Target member of replica diff task couldn't found! "
                                        + "Replica: " + replicaIndex + ", Dead: " + deadMember + "\n" + partition);
                            }
                        }
                    }
                    // if index of dead member is equal to or less than maxBackupCount
                    // clear indexes of equal to and greater than maxBackupCount of partition.
                    if (indexOfDead <= maxBackupCount) {
                        for (int index = maxBackupCount; index < PartitionInfo.MAX_REPLICA_COUNT; index++) {
                            partition.setReplicaAddress(index, null);
                        }
                    }
                }
            }
            sendClusterRuntimeState();
            final int totalDiffCount = diffCount;
            // Queued behind the diff tasks: it logs the total and clears sendingDiffs once
            // the migration thread reaches it.
            immediateTasksQueue.offer(new Runnable() {
                public void run() {
                    logger.log(Level.INFO, "Total " + totalDiffCount + " partition replica diffs have been processed.");
                    sendingDiffs.set(false);
                }
            });
            immediateTasksQueue.offer(new PrepareRepartitioningTask());
        }
    }

    /**
     * Drops the active migration if the dead address is its source or its destination.
     *
     * @param deadAddress address of the member that left
     */
    private void checkMigratingPartitionForDead(final Address deadAddress) {
        if (migratingPartition != null) {
            if (deadAddress.equals(migratingPartition.getFromAddress())
                    || deadAddress.equals(migratingPartition.getToAddress())) {
                migratingPartition = null;
            }
        }
    }

    /**
     * Notifies every map and every record of the disconnect, forwards keys of queue-map
     * records locked by the dead member to the master, and marks records dirty when this
     * member has become the owner of a partition whose owner died.
     *
     * @param deadAddress   address of the member that left
     * @param indexesOfDead per partition id, the replica index the dead member held
     */
    private void fixCMapsForDead(final Address deadAddress, final int[] indexesOfDead) {
        Address thisAddress = concurrentMapManager.getThisAddress();
        for (CMap cmap : concurrentMapManager.maps.values()) {
            cmap.onDisconnect(deadAddress);
            // iterate over an array snapshot of the record collection
            Object[] records = cmap.mapRecords.values().toArray();
            for (Object recordObject : records) {
                if (recordObject != null) {
                    Record record = (Record) recordObject;
                    if (record.isLocked() && cmap.isMapForQueue()) {
                        if (deadAddress.equals(record.getLock().getLockAddress())) {
                            cmap.sendKeyToMaster(record.getKeyData());
                        }
                    }
                    cmap.onDisconnect(record, deadAddress);
                    final int partitionId = record.getBlockId();
                    // owner of the partition is dead
                    // and record is active
                    // and new owner of partition is this member.
if (indexesOfDead[partitionId] == 0 && record.isActive()
                            && thisAddress.equals(partitions[partitionId].getOwner())) {
                        cmap.markAsDirty(record);
                        // update the indexes
                        cmap.updateIndexes(record);
                    }
                }
            }
        }
    }

    /**
     * @return the largest backup count among all maps, or 1 when no map exists
     */
    private int getMaxBackupCount() {
        final Collection<CMap> cmaps = concurrentMapManager.maps.values();
        if (!cmaps.isEmpty()) {
            int maxBackupCount = 0;
            for (final CMap cmap : cmaps) {
                maxBackupCount = Math.max(maxBackupCount, cmap.getBackupCount());
            }
            return maxBackupCount;
        }
        return 1; // if there is no map, avoid extra processing.
    }

    /**
     * Called when a member has been added. On the active master this discards pending
     * tasks and queues a repartitioning, unless replica diffs for a dead member are still
     * being sent.
     */
    public void syncForAdd() {
        if (concurrentMapManager.isMaster() && concurrentMapManager.node.isActive()) {
            if (sendingDiffs.get()) {
                logger.log(Level.INFO, "MigrationService is already sending diffs for dead member, "
                        + "no need to initiate task!");
            } else {
                // to avoid repartitioning during a migration process.
                clearTaskQueues();
                immediateTasksQueue.offer(new PrepareRepartitioningTask());
            }
        }
    }

    /**
     * @return current partition table version
     */
    public int getVersion() {
        return version.get();
    }

    // for testing purposes only
    /**
     * Queues a migration of the given replica from {@code from} to {@code to} as an
     * immediate task.
     */
    void forcePartitionOwnerMigration(int partitionId, int replicaIndex, Address from, Address to) {
        MigrationRequestTask mrt = new MigrationRequestTask(partitionId, from, to, replicaIndex, true);
        immediateTasksQueue.offer(new Migrator(mrt));
    }

    /**
     * Applies a partition table received from another member. The master ignores incoming
     * state. A state whose sender does not match the known master is only warned about and
     * is still applied. Afterwards the table is marked initialized and takes over the
     * received version.
     *
     * @param clusterRuntimeState received state
     */
    public void setClusterRuntimeState(ClusterRuntimeState clusterRuntimeState) {
        concurrentMapManager.checkServiceThread();
        final Connection conn = clusterRuntimeState.getConnection();
        final Address sender = conn != null ? conn.getEndPoint() : null;
        if (concurrentMapManager.isMaster()) {
            logger.log(Level.WARNING, "This is the master node and received a ClusterRuntimeState from "
                    + (sender != null ? sender : conn) + ". Ignoring incoming state! ");
            return;
        } else {
            final Address master = concurrentMapManager.getMasterAddress();
            if (sender == null || master == null || !master.equals(sender)) {
                logger.log(Level.WARNING, "Received a ClusterRuntimeState, but its sender doesn't seem master!"
                        + " => Sender: " + sender + ", Master: " + master + "! "
                        + "(Ignore if master node has changed recently.)");
            }
        }
        PartitionInfo[] newPartitions = clusterRuntimeState.getPartitions();
        int size = newPartitions.length;
        for (int i = 0; i < size; i++) {
            PartitionInfo newPartition = newPartitions[i];
            PartitionInfo currentPartition = partitions[newPartition.getPartitionId()];
            // Addresses that are not known members are reported but still applied.
            for (int index = 0; index < PartitionInfo.MAX_REPLICA_COUNT; index++) {
                Address address = newPartition.getReplicaAddress(index);
                if (address != null && concurrentMapManager.getMember(address) == null) {
                    logger.log(Level.WARNING, "Unknown " + address + " is found in received partition table from master "
                            + sender + ". Probably it is dead. Partition: " + newPartition);
                }
            }
            currentPartition.setPartitionInfo(newPartition);
            checkMigratingPartitionFor(currentPartition);
        }
        initialized = true;
        version.set(clusterRuntimeState.getVersion());
    }

    /**
     * Clears the active migration when the given partition is the migrating one and the
     * migration's destination already holds the migrating replica index in the table.
     *
     * @param partition partition whose table entry has just been updated
     */
    private void checkMigratingPartitionFor(PartitionInfo partition) {
        concurrentMapManager.checkServiceThread();
        final MigratingPartition mPartition = migratingPartition;
        if (mPartition != null && partition.getPartitionId() == mPartition.getPartitionId()) {
            final Address targetAddress = mPartition.getToAddress();
            if (targetAddress != null
                    && targetAddress.equals(partition.getReplicaAddress(mPartition.getReplicaIndex()))) {
                migratingPartition = null;
            }
        }
    }

    /**
     * Tells whether this member may drop its data of a partition.
     *
     * @param partitionId    partition to check
     * @param maxBackupCount backup count passed on to {@code isOwnerOrBackup}
     * @return false while the partition is migrating; otherwise true if this member is
     *         neither owner nor backup of the partition
     */
    public boolean shouldPurge(int partitionId, int maxBackupCount) {
        if (isPartitionMigrating(partitionId)) return false;
        Address thisAddress = concurrentMapManager.getThisAddress();
        PartitionInfo partitionInfo = getPartition(partitionId);
        return !partitionInfo.isOwnerOrBackup(thisAddress, maxBackupCount);
    }

    /**
     * Checks the active migration against the given partition, regardless of replica index.
     *
     * @param partitionId id of the partition to check
     * @return true if any replica of partition is migrating, false otherwise
     */
    public boolean isPartitionMigrating(int partitionId) {
        // volatile read
        final MigratingPartition currentMigratingPartition = migratingPartition;
        return currentMigratingPartition != null &&
                currentMigratingPartition.getPartitionId() == partitionId;
    }

    /**
     * Checks whether the active migration moves the owner replica of the given partition.
     *
     * @param partitionId id of the partition to check
     * @return true if owned replica (0) of partition is migrating, false otherwise
     */
    public boolean isOwnedPartitionMigrating(int partitionId) {
        // volatile read
        final MigratingPartition currentMigratingPartition = migratingPartition;
        return currentMigratingPartition != null
                && currentMigratingPartition.getPartitionId() == partitionId
                && currentMigratingPartition.getReplicaIndex() == 0;
    }

    /**
     * @param partitionId id of the partition
     * @return the table entry of the partition
     */
    public PartitionInfo getPartition(int partitionId) {
        return partitions[partitionId];
    }

    /**
     * Tells whether backup work is still outstanding.
     *
     * @return false if the table is not initialized, this member is a lite member, no map
     *         keeps backups, or fewer than two member groups exist; otherwise true if
     *         immediate tasks are pending or some partition has no replica at index 1
     */
    public boolean hasActiveBackupTask() {
        if (!initialized) return false;
        if (concurrentMapManager.isLiteMember()) return false;
        int maxBackupCount = getMaxBackupCount();
        if (maxBackupCount == 0) return false;
        Set<MemberImpl> members = new HashSet<MemberImpl>();
        for (Member member : concurrentMapManager.node.getClusterImpl().getMembers()) {
            members.add((MemberImpl) member);
        }
        MemberGroupFactory mgf = PartitionStateGeneratorFactory.newMemberGroupFactory(
                concurrentMapManager.node.config.getPartitionGroupConfig());
        if (mgf.createMemberGroups(members).size() < 2) return false;
        boolean needBackup = false;
        // the table is only scanned when no immediate task is waiting
        if (immediateTasksQueue.isEmpty()) {
            for (PartitionInfo partition : partitions) {
                if (partition.getReplicaAddress(1) == null) {
                    needBackup = true;
                    logger.log(Level.WARNING, concurrentMapManager.thisAddress
                            + " still has no replica for partitionId:" + partition.getPartitionId());
                    break;
                }
            }
        }
        return needBackup || !immediateTasksQueue.isEmpty();
    }

    /**
     * Builds a {@link MigrationEvent} for the given partition, writes it to the system log
     * and hands it to the partition service.
     *
     * @param started     passed through to the log line and to {@code doFireMigrationEvent}
     * @param partitionId migrated partition
     * @param from        address of the previous holder
     * @param to          address of the new holder
     */
    public void fireMigrationEvent(final boolean started, int partitionId, Address from, Address to) {
        final MemberImpl current = concurrentMapManager.getMember(from);
        final MemberImpl newOwner = concurrentMapManager.getMember(to);
        final MigrationEvent migrationEvent = new MigrationEvent(concurrentMapManager.node,
                partitionId, current, newOwner);
        systemLogService.logPartition("MigrationEvent [" + started + "] " + migrationEvent);
concurrentMapManager.partitionServiceImpl.doFireMigrationEvent(started, migrationEvent);
    }

    /**
     * @return true when both task queues are empty, no migration is active and the last
     *         repartitioning is older than {@code REPARTITIONING_CHECK_INTERVAL}
     */
    private boolean shouldCheckRepartitioning() {
        return immediateTasksQueue.isEmpty() && scheduledTasksQueue.isEmpty()
                && lastRepartitionTime.get() < (System.currentTimeMillis() - REPARTITIONING_CHECK_INTERVAL)
                && migratingPartition == null;
    }

    /**
     * Request sent to the master by {@link #getOwner(int)} when a partition owner is unknown.
     * Processing it calls {@code getOwner(0)} on the receiver, which runs the first
     * arrangement if the table is not initialized there.
     */
    public static class AssignPartitions extends AbstractRemotelyProcessable {
        public void process() {
            node.concurrentMapManager.getPartitionManager().getOwner(0);
        }
    }

    /**
     * Remote call that compares the carried migration with the one registered on the
     * receiving node. Sent to the master by {@link CheckMigratingPartitionTask}.
     */
    public static class RemotelyCheckMigratingPartition extends AbstractRemotelyCallable<Boolean> {
        MigratingPartition migratingPartition;

        // no-arg constructor; presumably needed for deserialization via readData — confirm
        public RemotelyCheckMigratingPartition() {
        }

        public RemotelyCheckMigratingPartition(final MigratingPartition migratingPartition) {
            this.migratingPartition = migratingPartition;
        }

        /**
         * @return true if the carried migration equals the receiver's active migration;
         *         false if no migration was carried or they differ
         */
        public Boolean call() throws Exception {
            if (migratingPartition != null) {
                final MigratingPartition masterMigratingPartition = node
                        .concurrentMapManager.getPartitionManager().migratingPartition;
                return migratingPartition.equals(masterMigratingPartition);
            }
            return Boolean.FALSE;
        }

        // Wire format: a presence flag, followed by the migration data when the flag is true.
        public void readData(final DataInput in) throws IOException {
            if (in.readBoolean()) {
                migratingPartition = new MigratingPartition();
                migratingPartition.readData(in);
            }
        }

        public void writeData(final DataOutput out) throws IOException {
            boolean b = migratingPartition != null;
            out.writeBoolean(b);
            if (b) {
                migratingPartition.writeData(out);
            }
        }
    }

    /**
     * Periodic task: on the active master, logs the queue sizes when tasks are pending and
     * enqueues a broadcast of the cluster runtime state.
     */
    private class SendClusterStateTask implements Runnable {
        public void run() {
            if (concurrentMapManager.isMaster() && concurrentMapManager.node.isActive()) {
                if (!scheduledTasksQueue.isEmpty() || !immediateTasksQueue.isEmpty()) {
                    logger.log(Level.INFO, "Remaining migration tasks in queue => Immediate-Tasks: "
                            + immediateTasksQueue.size()
                            + ", Scheduled-Tasks: " + scheduledTasksQueue.size());
                }
                final Node node = concurrentMapManager.node;
                concurrentMapManager.enqueueAndReturn(new Processable() {
                    public void process() {
                        // master/active state is checked again when the processable runs
                        if (!node.isActive() || !node.isMaster()) return;
                        sendClusterRuntimeState();
                    }
                });
            }
        }
    }

    /**
     * Periodic task for non-master members: when the active migration is older than
     * {@code MIGRATING_PARTITION_CHECK_INTERVAL}, asks the master whether it still knows
     * that migration and enqueues its removal if the master does not confirm it.
     */
    private class CheckMigratingPartitionTask implements Runnable {
        public void run() {
            if (!concurrentMapManager.isMaster()) {
                final MigratingPartition currentMigratingPartition = migratingPartition;
                if (currentMigratingPartition != null
                        && (System.currentTimeMillis() - currentMigratingPartition.getCreationTime())
                        > MIGRATING_PARTITION_CHECK_INTERVAL) {
                    try {
                        final Node node = concurrentMapManager.node;
                        AsyncRemotelyBooleanCallable rrp = node.clusterManager.new AsyncRemotelyBooleanCallable();
                        rrp.executeProcess(node.getMasterAddress(),
                                new RemotelyCheckMigratingPartition(currentMigratingPartition));
                        // NOTE(review): the argument 1 looks like a timeout; unit not visible here — confirm
                        boolean valid = rrp.getResultAsBoolean(1);
                        if (valid) {
                            logger.log(Level.FINEST, "Master has confirmed current " + currentMigratingPartition);
                        } else {
                            logger.log(Level.INFO, currentMigratingPartition
                                    + " could not be validated with master! "
                                    + "Removing current MigratingPartition...");
                            // NOTE(review): clears whatever migration is registered when this
                            // runs, not necessarily the one that was checked above
                            concurrentMapManager.enqueueAndReturn(new Processable() {
                                public void process() {
                                    migratingPartition = null;
                                }
                            });
                        }
                    } catch (Throwable t) {
                        logger.log(Level.WARNING, t.getMessage(), t);
                    }
                }
            }
        }
    }

    /**
     * Migration-thread task that asks the partition state generator for a new arrangement
     * and moves the resulting tasks into the immediate and scheduled queues. Runs only on
     * the active master with an initialized table.
     */
    private class PrepareRepartitioningTask implements Runnable {
        // Output lists filled by PartitionStateGenerator.reArrange().
        final List<MigrationRequestTask> lostQ = new ArrayList<MigrationRequestTask>();
        final List<MigrationRequestTask> scheduledQ = new ArrayList<MigrationRequestTask>(PARTITION_COUNT);
        final List<MigrationRequestTask> immediateQ = new ArrayList<MigrationRequestTask>(PARTITION_COUNT * 2);

        private PrepareRepartitioningTask() {
        }

        public final void run() {
            if (concurrentMapManager.isMaster()
                    && concurrentMapManager.node.isActive() && initialized) {
                doRun();
            }
        }

        // Overridden by CheckRepartitioningTask.
        void doRun() {
            prepareMigrationTasks();
            logger.log(Level.INFO, "Re-partitioning cluster data... Immediate-Tasks: "
                    + immediateQ.size() + ", Scheduled-Tasks: " + scheduledQ.size());
            fillMigrationQueues();
        }

        /** Computes the new arrangement for the current members into the three task lists. */
        void prepareMigrationTasks() {
            final Collection<MemberImpl> members = new LinkedList<MemberImpl>();
            Collection<Member> memberSet = concurrentMapManager.node.getClusterImpl().getMembers();
            for (Member member : memberSet) {
                members.add((MemberImpl) member);
            }
            PartitionStateGenerator psg = getPartitionStateGenerator();
            psg.reArrange(partitions, members, PARTITION_COUNT, lostQ, immediateQ, scheduledQ);
        }

        /**
         * Records the repartition time, enqueues the owner assignment for lost partitions
         * and wraps every prepared task in a {@link Migrator} on the matching queue.
         */
        void fillMigrationQueues() {
            lastRepartitionTime.set(System.currentTimeMillis());
            if (!lostQ.isEmpty()) {
                concurrentMapManager.enqueueAndReturn(new LostPartitionsAssignmentProcess(lostQ));
                logger.log(Level.WARNING, "Assigning new owners for " + lostQ.size() +
                        " LOST partitions!");
            }
            for (MigrationRequestTask migrationRequestTask : immediateQ) {
                immediateTasksQueue.offer(new Migrator(migrationRequestTask));
            }
            immediateQ.clear();
            for (MigrationRequestTask migrationRequestTask : scheduledQ) {
                scheduledTasksQueue.offer(new Migrator(migrationRequestTask));
            }
            scheduledQ.clear();
        }
    }

    /**
     * Enqueued by {@code fillMigrationQueues()}: on the active master, assigns the task's
     * destination as new owner of each lost partition and then broadcasts the table.
     */
    private class LostPartitionsAssignmentProcess implements Processable {
        final List<MigrationRequestTask> lostQ;

        private LostPartitionsAssignmentProcess(final List<MigrationRequestTask> lostQ) {
            this.lostQ = lostQ;
        }

        public void process() {
            if (!concurrentMapManager.isMaster() || !concurrentMapManager.node.isActive()) return;
            for (MigrationRequestTask migrationRequestTask : lostQ) {
                int partitionId = migrationRequestTask.getPartitionId();
                int replicaIndex = migrationRequestTask.getReplicaIndex();
                // only owner assignments (replica 0) of existing partitions are accepted
                if (replicaIndex != 0 || partitionId >= PARTITION_COUNT) {
                    logger.log(Level.WARNING, "Wrong task for lost partitions assignment process" +
                            " => " + migrationRequestTask);
                    continue;
                }
                PartitionInfo partition = partitions[partitionId];
                Address newOwner = migrationRequestTask.getToAddress();
                MemberImpl ownerMember = concurrentMapManager.getMember(newOwner);
                // tasks whose destination is no longer a member are skipped silently
                if (ownerMember != null) {
partition.setReplicaAddress(replicaIndex, newOwner); concurrentMapManager.sendMigrationEvent(false, migrationRequestTask); } } sendClusterRuntimeState(); } } private class CheckRepartitioningTask extends PrepareRepartitioningTask implements Runnable { void doRun() { if (shouldCheckRepartitioning()) { final int v = version.get(); prepareMigrationTasks(); int totalTasks = 0; for (MigrationRequestTask task : immediateQ) { if (task.getReplicaIndex() <= REPARTITIONING_TASK_REPLICA_THRESHOLD) { totalTasks++; } } for (MigrationRequestTask task : scheduledQ) { if (task.getReplicaIndex() <= REPARTITIONING_TASK_REPLICA_THRESHOLD) { totalTasks++; } } if (!lostQ.isEmpty() || totalTasks > REPARTITIONING_TASK_COUNT_THRESHOLD) { logger.log(Level.WARNING, "Something weird! Migration task queues are empty," + " last repartitioning executed on " + lastRepartitionTime.get() + " but repartitioning check resulted " + totalTasks + " tasks" + " and " + lostQ.size() + " lost partitions!"); if (version.get() == v && shouldCheckRepartitioning()) { fillMigrationQueues(); } } } } } private class Migrator implements Runnable { final MigrationRequestTask migrationRequestTask; Migrator(MigrationRequestTask migrationRequestTask) { this.migrationRequestTask = migrationRequestTask; } public void run() { try { if (!concurrentMapManager.node.isActive() || !concurrentMapManager.node.isMaster()) { return; } if (migrationRequestTask.isMigration() && migrationRequestTask.getReplicaIndex() == 0) { concurrentMapManager.enqueueAndWait(new Processable() { public void process() { concurrentMapManager.sendMigrationEvent(true, migrationRequestTask); } }, 100); } if (migrationRequestTask.getToAddress() == null) { // A member is dead, this replica should not have an owner! logger.log(Level.INFO, "Fixing partition, " + migrationRequestTask.getReplicaIndex() + ". 
replica of partition[" + migrationRequestTask.getPartitionId() + "] should be removed."); concurrentMapManager.enqueueAndWait(new Processable() { public void process() { int partitionId = migrationRequestTask.getPartitionId(); int replicaIndex = migrationRequestTask.getReplicaIndex(); PartitionInfo partition = partitions[partitionId]; partition.setReplicaAddress(replicaIndex, null); migratingPartition = null; } }); } else { MemberImpl fromMember = null; Object result = Boolean.FALSE; if (migrationRequestTask.isMigration()) { fromMember = getMember(migrationRequestTask.getFromAddress()); } else { // ignore fromAddress of task and get actual owner from partition table final int partitionId = migrationRequestTask.getPartitionId(); fromMember = getMember(partitions[partitionId].getOwner()); } logger.log(Level.FINEST, "Started Migration : " + migrationRequestTask); systemLogService.logPartition("Started Migration : " + migrationRequestTask); if (fromMember != null) { migrationRequestTask.setFromAddress(fromMember.getAddress()); DistributedTask task = new DistributedTask(migrationRequestTask, fromMember); concurrentMapManager.enqueueAndWait(new Processable() { public void process() { addActiveMigration(migrationRequestTask); } }); Future future = concurrentMapManager.node.factory.getExecutorService().submit(task); try { result = future.get(600, TimeUnit.SECONDS); } catch (Throwable e) { logger.log(Level.WARNING, "Failed migrating from " + fromMember); } } else { // Partition is lost! Assign new owner and exit. 
result = Boolean.TRUE; } logger.log(Level.FINEST, "Finished Migration : " + migrationRequestTask); systemLogService.logPartition("Finished Migration : " + migrationRequestTask); if (Boolean.TRUE.equals(result)) { concurrentMapManager.enqueueAndWait(new ProcessMigrationResult(migrationRequestTask), 10000); } else { // remove active partition migration logger.log(Level.WARNING, "Migration task has failed => " + migrationRequestTask); systemLogService.logPartition("Migration task has failed => " + migrationRequestTask); concurrentMapManager.enqueueAndWait(new Processable() { public void process() { compareAndSetActiveMigratingPartition(migrationRequestTask, null); } }); } } } catch (Throwable t) { logger.log(Level.WARNING, "Error [" + t.getClass() + ": " + t.getMessage() + "] " + "while executing " + migrationRequestTask, t); systemLogService.logPartition("Failed! " + migrationRequestTask); } } } private class ProcessMigrationResult implements Processable { final MigrationRequestTask migrationRequestTask; private ProcessMigrationResult(final MigrationRequestTask migrationRequestTask) { this.migrationRequestTask = migrationRequestTask; } public void process() { int partitionId = migrationRequestTask.getPartitionId(); int replicaIndex = migrationRequestTask.getReplicaIndex(); PartitionInfo partition = partitions[partitionId]; if (PartitionInfo.MAX_REPLICA_COUNT < replicaIndex) { String msg = "Migrated [" + partitionId + ":" + replicaIndex + "] but cannot assign. Length:" + PartitionInfo.MAX_REPLICA_COUNT; logger.log(Level.WARNING, msg); } else { Address newOwner = migrationRequestTask.getToAddress(); MemberImpl ownerMember = concurrentMapManager.getMember(newOwner); if (ownerMember == null) return; partition.setReplicaAddress(replicaIndex, newOwner); if (replicaIndex == 0) { concurrentMapManager.sendMigrationEvent(false, migrationRequestTask); } // if this partition should be copied back, // just set partition's replica address // before data is cleaned up. 
if (migrationRequestTask.getSelfCopyReplicaIndex() > -1) { partition.setReplicaAddress(migrationRequestTask.getSelfCopyReplicaIndex(), migrationRequestTask.getFromAddress()); } sendClusterRuntimeState(); compareAndSetActiveMigratingPartition(migrationRequestTask, null); } } } private class MigrationService extends Thread implements Runnable { MigrationService(Node node) { super(node.threadGroup, node.getThreadNamePrefix("MigrationThread")); } public void run() { ThreadContext.get().setCurrentFactory(concurrentMapManager.node.factory); try { while (running) { Runnable r = null; while (isActive() && (r = immediateTasksQueue.poll()) != null) { safeRunImmediate(r); } if (!running) { break; } // wait for partitionMigrationInterval before executing scheduled tasks // and poll immediate tasks occasionally during wait time. long totalWait = 0L; while (isActive() && (r != null || totalWait < partitionMigrationInterval)) { long start = System.currentTimeMillis(); r = immediateTasksQueue.poll(1, TimeUnit.SECONDS); safeRunImmediate(r); totalWait += (System.currentTimeMillis() - start); } if (isActive()) { r = scheduledTasksQueue.poll(); safeRun(r); } if (!migrationActive.get() || hasNoTasks()) { Thread.sleep(250); continue; } } } catch (InterruptedException e) { logger.log(Level.FINEST, "MigrationService is interrupted: " + e.getMessage(), e); running = false; } finally { clearTaskQueues(); } } private boolean hasNoTasks() { return (immediateTasksQueue.isEmpty() && scheduledTasksQueue.isEmpty()); } private boolean isActive() { return migrationActive.get() && running; } boolean safeRun(final Runnable r) { if (r == null || !running) return false; try { r.run(); } catch (Throwable t) { logger.log(Level.WARNING, t.getMessage(), t); } return true; } void safeRunImmediate(final Runnable r) throws InterruptedException { if (safeRun(r) && immediateBackupInterval > 0) { Thread.sleep(immediateBackupInterval); } } } @Override public String toString() { StringBuilder sb = new 
StringBuilder("PartitionManager[" + version + "] {\n"); sb.append("migratingPartition: " + migratingPartition); sb.append("\n"); sb.append("immediateQ:" + immediateTasksQueue.size()); sb.append(", scheduledQ:" + scheduledTasksQueue.size()); sb.append("\n}"); return sb.toString(); } }