/* * Copyright (c) 2008-2017, Hazelcast, Inc. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.hazelcast.map.impl; import com.hazelcast.core.ExecutionCallback; import com.hazelcast.core.IFunction; import com.hazelcast.core.MapLoader; import com.hazelcast.core.Member; import com.hazelcast.internal.cluster.ClusterService; import com.hazelcast.logging.ILogger; import com.hazelcast.map.impl.mapstore.MapStoreContext; import com.hazelcast.map.impl.operation.KeyLoadStatusOperation; import com.hazelcast.map.impl.operation.KeyLoadStatusOperationFactory; import com.hazelcast.map.impl.operation.MapOperation; import com.hazelcast.map.impl.operation.MapOperationProvider; import com.hazelcast.map.impl.operation.TriggerLoadIfNeededOperation; import com.hazelcast.nio.Address; import com.hazelcast.nio.serialization.Data; import com.hazelcast.spi.ExecutionService; import com.hazelcast.spi.InternalCompletableFuture; import com.hazelcast.spi.Operation; import com.hazelcast.spi.OperationService; import com.hazelcast.spi.impl.AbstractCompletableFuture; import com.hazelcast.spi.partition.IPartition; import com.hazelcast.spi.partition.IPartitionService; import com.hazelcast.util.FutureUtil; import com.hazelcast.util.StateMachine; import com.hazelcast.util.scheduler.CoalescingDelayedTrigger; import java.io.Closeable; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.Executor; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import static com.hazelcast.logging.Logger.getLogger; import static com.hazelcast.map.impl.MapKeyLoaderUtil.assignRole; import static com.hazelcast.map.impl.MapKeyLoaderUtil.toBatches; import static com.hazelcast.map.impl.MapKeyLoaderUtil.toPartition; import static com.hazelcast.map.impl.MapService.SERVICE_NAME; import static com.hazelcast.nio.IOUtil.closeResource; import static com.hazelcast.spi.ExecutionService.MAP_LOAD_ALL_KEYS_EXECUTOR; import static com.hazelcast.util.IterableUtil.limit; import static com.hazelcast.util.IterableUtil.map; import static java.util.concurrent.TimeUnit.SECONDS; /** * Loads keys from a {@link MapLoader} and sends them to all partitions for loading */ public class MapKeyLoader { private static final long LOADING_TRIGGER_DELAY = SECONDS.toMillis(5); private ILogger logger; private String mapName; private OperationService opService; private IPartitionService partitionService; private final ClusterService clusterService; private IFunction<Object, Data> toData; private ExecutionService execService; private CoalescingDelayedTrigger delayedTrigger; private int maxSizePerNode; private int maxBatch; private int mapNamePartition; private int partitionId; private boolean hasBackup; private LoadFinishedFuture keyLoadFinished = new LoadFinishedFuture(true); private MapOperationProvider operationProvider; /** * Role of this MapKeyLoader **/ enum Role { NONE, /** * Sends out keys to all other partitions **/ SENDER, /** * Receives keys from sender **/ RECEIVER, /** * Restarts sending if SENDER fails **/ SENDER_BACKUP } enum State { NOT_LOADED, LOADING, LOADED } private final StateMachine<Role> role = StateMachine.of(Role.NONE) .withTransition(Role.NONE, Role.SENDER, Role.RECEIVER, Role.SENDER_BACKUP) .withTransition(Role.SENDER_BACKUP, Role.SENDER); private final StateMachine<State> state = StateMachine.of(State.NOT_LOADED) .withTransition(State.NOT_LOADED, State.LOADING) .withTransition(State.LOADING, State.LOADED, State.NOT_LOADED) .withTransition(State.LOADED, State.LOADING); public MapKeyLoader(String mapName, OperationService opService, IPartitionService ps, ClusterService clusterService, ExecutionService execService, IFunction<Object, Data> serialize) { this.mapName = mapName; this.opService = opService; this.partitionService = ps; this.clusterService = clusterService; this.toData = serialize; this.execService = execService; this.logger = getLogger(MapKeyLoader.class); } public Future startInitialLoad(MapStoreContext mapStoreContext, int partitionId) { this.partitionId = partitionId; this.mapNamePartition = partitionService.getPartitionId(toData.apply(mapName)); Role newRole = calculateRole(); role.nextOrStay(newRole); state.next(State.LOADING); if (logger.isFinestEnabled()) { logger.finest("startInitialLoad invoked " + getStateMessage()); } switch (newRole) { case SENDER: return sendKeys(mapStoreContext, false); case SENDER_BACKUP: case RECEIVER: return triggerLoading(); default: return keyLoadFinished; } } private Role calculateRole() { boolean isPartitionOwner = partitionService.isPartitionOwner(partitionId); boolean isMapNamePartition = partitionId == mapNamePartition; boolean isMapNamePartitionFirstReplica = false; if (hasBackup && isMapNamePartition) { IPartition partition = partitionService.getPartition(partitionId); Address firstReplicaAddress = partition.getReplicaAddress(1); Member member = clusterService.getMember(firstReplicaAddress); if (member != null) { isMapNamePartitionFirstReplica = member.localMember(); } } return assignRole(isPartitionOwner, isMapNamePartition, isMapNamePartitionFirstReplica); } /** * Sends keys to all partitions in batches. */ public Future<?> sendKeys(final MapStoreContext mapStoreContext, final boolean replaceExistingValues) { if (keyLoadFinished.isDone()) { keyLoadFinished = new LoadFinishedFuture(); Future<Boolean> sent = execService.submit(MAP_LOAD_ALL_KEYS_EXECUTOR, new Callable<Boolean>() { @Override public Boolean call() throws Exception { sendKeysInBatches(mapStoreContext, replaceExistingValues); return false; } }); execService.asCompletableFuture(sent).andThen(keyLoadFinished); } return keyLoadFinished; } /** * Check if loaded on SENDER partition. Triggers key loading if it hadn't started */ public Future triggerLoading() { if (keyLoadFinished.isDone()) { keyLoadFinished = new LoadFinishedFuture(); // side effect -> just trigger load on SENDER_BACKUP id SENDER died execService.execute(MAP_LOAD_ALL_KEYS_EXECUTOR, new Runnable() { @Override public void run() { // checks if loading has finished and triggers loading in case SENDER died and SENDER_BACKUP took over. Operation op = new TriggerLoadIfNeededOperation(mapName); opService.<Boolean>invokeOnPartition(SERVICE_NAME, op, mapNamePartition) // required since loading may be triggerd after migration // and in this case the callback is the only way to get to know if the key load finished or not. .andThen(loadingFinishedCallback()); } }); } return keyLoadFinished; } private ExecutionCallback<Boolean> loadingFinishedCallback() { return new ExecutionCallback<Boolean>() { @Override public void onResponse(Boolean loadingFinished) { if (loadingFinished) { updateLocalKeyLoadStatus(null); } } @Override public void onFailure(Throwable t) { updateLocalKeyLoadStatus(t); } }; } private void updateLocalKeyLoadStatus(Throwable t) { Operation op = new KeyLoadStatusOperation(mapName, t); // This updates the local record store on the partition thread. // If invoked by the SENDER_BACKUP however it's the replica index has to be set to 1, otherwise // it will be a remote call to the SENDER who is the owner of the given partitionId. if (hasBackup && role.is(Role.SENDER_BACKUP)) { opService.createInvocationBuilder(SERVICE_NAME, op, partitionId).setReplicaIndex(1).invoke(); } else { opService.createInvocationBuilder(SERVICE_NAME, op, partitionId).invoke(); } } public Future<?> startLoading(MapStoreContext mapStoreContext, boolean replaceExistingValues) { role.nextOrStay(Role.SENDER); if (state.is(State.LOADING)) { return keyLoadFinished; } state.next(State.LOADING); return sendKeys(mapStoreContext, replaceExistingValues); } public void trackLoading(boolean lastBatch, Throwable exception) { if (lastBatch) { state.nextOrStay(State.LOADED); if (exception != null) { keyLoadFinished.setResult(exception); } else { keyLoadFinished.setResult(true); } } else if (state.is(State.LOADED)) { state.next(State.LOADING); } } /** * Triggers key loading on SENDER if it hadn't started. Delays triggering if invoked multiple times. **/ public void triggerLoadingWithDelay() { if (delayedTrigger == null) { Runnable runnable = new Runnable() { @Override public void run() { Operation op = new TriggerLoadIfNeededOperation(mapName); opService.invokeOnPartition(SERVICE_NAME, op, mapNamePartition); } }; delayedTrigger = new CoalescingDelayedTrigger(execService, LOADING_TRIGGER_DELAY, LOADING_TRIGGER_DELAY, runnable); } delayedTrigger.executeWithDelay(); } // If this gets invoked on SENDER BACKUP it means the SENDER died and SENDER BACKUP takes over. public boolean shouldDoInitialLoad() { if (role.is(Role.SENDER_BACKUP)) { // was backup. become primary sender role.next(Role.SENDER); if (state.is(State.LOADING)) { // previous loading was in progress. cancel and start from scratch state.next(State.NOT_LOADED); keyLoadFinished.setResult(false); } } return state.is(State.NOT_LOADED); } private void sendKeysInBatches(MapStoreContext mapStoreContext, boolean replaceExistingValues) throws Exception { if (logger.isFinestEnabled()) { logger.finest("sendKeysInBatches invoked " + getStateMessage()); } int clusterSize = partitionService.getMemberPartitionsMap().size(); Iterator<Object> keys = null; Throwable loadError = null; try { Iterable<Object> allKeys = mapStoreContext.loadAllKeys(); keys = allKeys.iterator(); Iterator<Data> dataKeys = map(keys, toData); int mapMaxSize = clusterSize * maxSizePerNode; if (mapMaxSize > 0) { dataKeys = limit(dataKeys, mapMaxSize); } Iterator<Entry<Integer, Data>> partitionsAndKeys = map(dataKeys, toPartition(partitionService)); Iterator<Map<Integer, List<Data>>> batches = toBatches(partitionsAndKeys, maxBatch); List<Future> futures = new ArrayList<Future>(); while (batches.hasNext()) { Map<Integer, List<Data>> batch = batches.next(); futures.addAll(sendBatch(batch, replaceExistingValues)); } // This acts as a barrier to prevent re-ordering of key distribution operations (LoadAllOperation) // and LoadStatusOperation(s) which indicates all keys were already loaded. // Re-ordering of in-flight operations can happen during a partition migration. We are waiting here // for all LoadAllOperation(s) to be ACKed by receivers and only then we send them the LoadStatusOperation // See https://github.com/hazelcast/hazelcast/issues/4024 for additional details FutureUtil.waitForever(futures); } catch (Exception caught) { loadError = caught; } finally { sendKeyLoadCompleted(clusterSize, loadError); if (keys instanceof Closeable) { closeResource((Closeable) keys); } } } private List<Future> sendBatch(Map<Integer, List<Data>> batch, boolean replaceExistingValues) { Set<Entry<Integer, List<Data>>> entries = batch.entrySet(); List<Future> futures = new ArrayList<Future>(entries.size()); for (Entry<Integer, List<Data>> e : entries) { int partitionId = e.getKey(); List<Data> keys = e.getValue(); MapOperation op = operationProvider.createLoadAllOperation(mapName, keys, replaceExistingValues); InternalCompletableFuture<Object> future = opService.invokeOnPartition(SERVICE_NAME, op, partitionId); futures.add(future); } return futures; } private void sendKeyLoadCompleted(int clusterSize, Throwable exception) throws Exception { // Notify SENDER first - reason why this is so important: // Someone may do map.get(other_nodes_key) and when it finishes do map.loadAll // The problem is that map.get may finish earlier than then overall loading on the SENDER due to the fact // that the LoadStatusOperation may first reach the node that did map.get and not the SENDER. // The SENDER will be then in the LOADING status, thus the loadAll call will be ignored. // it happens only if all LoadAllOperation finish before the sendKeyLoadCompleted is started (test case, little data) // Fixes https://github.com/hazelcast/hazelcast/issues/5453 List<Future> futures = new ArrayList<Future>(); Operation senderStatus = new KeyLoadStatusOperation(mapName, exception); Future senderFuture = opService.createInvocationBuilder(SERVICE_NAME, senderStatus, mapNamePartition) .setReplicaIndex(0).invoke(); futures.add(senderFuture); // notify SENDER_BACKUP if (hasBackup && clusterSize > 1) { Operation senderBackupStatus = new KeyLoadStatusOperation(mapName, exception); Future senderBackupFuture = opService.createInvocationBuilder(SERVICE_NAME, senderBackupStatus, mapNamePartition) .setReplicaIndex(1).invoke(); futures.add(senderBackupFuture); } // Blocks until finished on SENDER & SENDER_BACKUP PARTITIONS // We need to wait for these operation to finished before the map-key-loader returns from the call // otherwise the loading won't be finished on SENDER and SENDER_BACKUP but the user may be able to call loadAll which // will be ignored since the SENDER and SENDER_BACKUP are still loading. FutureUtil.waitForever(futures); // INVOKES AND BLOCKS UNTIL FINISHED on ALL PARTITIONS (SENDER AND SENDER BACKUP WILL BE REPEATED) // notify all partitions about loading status: finished or exception encountered opService.invokeOnAllPartitions(SERVICE_NAME, new KeyLoadStatusOperationFactory(mapName, exception)); } public void setMaxBatch(int maxBatch) { this.maxBatch = maxBatch; } public void setMaxSize(int maxSize) { this.maxSizePerNode = maxSize; } public void setHasBackup(boolean hasBackup) { this.hasBackup = hasBackup; } public void setMapOperationProvider(MapOperationProvider operationProvider) { this.operationProvider = operationProvider; } public boolean isKeyLoadFinished() { return keyLoadFinished.isDone(); } public void promoteToLoadedOnMigration() { // The state machine cannot skip states so we need to promote to loaded step by step state.next(State.LOADING); state.next(State.LOADED); } private String getStateMessage() { return "on partitionId=" + partitionId + " on " + clusterService.getThisAddress() + " role=" + role + " state=" + state; } private static final class LoadFinishedFuture extends AbstractCompletableFuture<Boolean> implements ExecutionCallback<Boolean> { private LoadFinishedFuture(Boolean result) { this(); setResult(result); } private LoadFinishedFuture() { super((Executor) null, getLogger(LoadFinishedFuture.class)); } @Override public Boolean get(long timeout, TimeUnit timeUnit) throws InterruptedException, ExecutionException, TimeoutException { if (isDone()) { return getResult(); } throw new UnsupportedOperationException("Future is not done yet"); } @Override public void onResponse(Boolean loaded) { if (loaded) { setResult(true); } // if not loaded yet we wait for the last batch to arrive } @Override public void onFailure(Throwable t) { setResult(t); } @Override protected boolean shouldCancel(boolean mayInterruptIfRunning) { return false; } @Override protected void setResult(Object result) { super.setResult(result); } @Override public String toString() { return getClass().getSimpleName() + "{done=" + isDone() + "}"; } } }