/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.ignite.internal.processors.cache.distributed.dht;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicReferenceArray;
import org.apache.ignite.IgniteCheckedException;
import org.apache.ignite.IgniteException;
import org.apache.ignite.IgniteLogger;
import org.apache.ignite.IgniteSystemProperties;
import org.apache.ignite.cache.PartitionLossPolicy;
import org.apache.ignite.cluster.ClusterNode;
import org.apache.ignite.events.DiscoveryEvent;
import org.apache.ignite.events.EventType;
import org.apache.ignite.internal.IgniteFutureTimeoutCheckedException;
import org.apache.ignite.internal.IgniteInterruptedCheckedException;
import org.apache.ignite.internal.managers.discovery.DiscoCache;
import org.apache.ignite.internal.processors.affinity.AffinityAssignment;
import org.apache.ignite.internal.processors.affinity.AffinityTopologyVersion;
import org.apache.ignite.internal.processors.cache.ClusterState;
import org.apache.ignite.internal.processors.cache.GridCacheContext;
import org.apache.ignite.internal.processors.cache.GridCacheMapEntryFactory;
import org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionExchangeId;
import org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionFullMap;
import org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionMap;
import org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture;
import org.apache.ignite.internal.util.F0;
import org.apache.ignite.internal.util.GridAtomicLong;
import org.apache.ignite.internal.util.StripedCompositeReadWriteLock;
import org.apache.ignite.internal.util.tostring.GridToStringExclude;
import org.apache.ignite.internal.util.typedef.F;
import org.apache.ignite.internal.util.typedef.T2;
import org.apache.ignite.internal.util.typedef.X;
import org.apache.ignite.internal.util.typedef.internal.CU;
import org.apache.ignite.internal.util.typedef.internal.U;
import org.jetbrains.annotations.Nullable;
import static org.apache.ignite.IgniteSystemProperties.IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT;
import static org.apache.ignite.events.EventType.EVT_CACHE_REBALANCE_PART_DATA_LOST;
import static org.apache.ignite.internal.processors.cache.distributed.dht.GridDhtPartitionState.EVICTED;
import static org.apache.ignite.internal.processors.cache.distributed.dht.GridDhtPartitionState.LOST;
import static org.apache.ignite.internal.processors.cache.distributed.dht.GridDhtPartitionState.MOVING;
import static org.apache.ignite.internal.processors.cache.distributed.dht.GridDhtPartitionState.OWNING;
import static org.apache.ignite.internal.processors.cache.distributed.dht.GridDhtPartitionState.RENTING;
/**
* Partition topology.
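*
* <p>
* External access is guarded by the topology read-write lock. A minimal usage sketch
* (assuming a {@code top} reference to this topology and a partition id {@code p}):
* <pre>{@code
* top.readLock();
* try {
*     List<ClusterNode> owners = top.owners(p);
* }
* finally {
*     top.readUnlock();
* }
* }</pre>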
*/
@GridToStringExclude
class GridDhtPartitionTopologyImpl implements GridDhtPartitionTopology {
/** If true, then check consistency. */
private static final boolean CONSISTENCY_CHECK = false;
/** Flag to control amount of output for full map. */
private static final boolean FULL_MAP_DEBUG = false;
/** Zero constant for update counter comparisons. */
private static final Long ZERO = 0L;
/** Context. */
private final GridCacheContext<?, ?> cctx;
/** Logger. */
private final IgniteLogger log;
/** Local partitions indexed by partition ID. */
private final AtomicReferenceArray<GridDhtLocalPartition> locParts;
/** Node to partition map. */
private GridDhtPartitionFullMap node2part;
/** Partition to node map. */
private Map<Integer, Set<UUID>> part2node = new HashMap<>();
/** Last applied exchange ID. */
private GridDhtPartitionExchangeId lastExchangeId;
/** Current topology version. */
private volatile AffinityTopologyVersion topVer = AffinityTopologyVersion.NONE;
/** Discovery cache. */
private volatile DiscoCache discoCache;
/** Stopping flag. */
private volatile boolean stopping;
/** A future that will be completed when the topology with version {@code topVer} is ready to use. */
private volatile GridDhtTopologyFuture topReadyFut;
/** Topology update sequence. */
private final GridAtomicLong updateSeq = new GridAtomicLong(1);
/** Lock. */
private final StripedCompositeReadWriteLock lock = new StripedCompositeReadWriteLock(16);
/** Cache map entry factory. */
private final GridCacheMapEntryFactory entryFactory;
/** Partition update counters. */
private Map<Integer, T2<Long, Long>> cntrMap = new HashMap<>();
/** Topology version of the last completed rebalance. */
private volatile AffinityTopologyVersion rebalancedTopVer = AffinityTopologyVersion.NONE;
/** If {@code true}, all partitions are treated as local (e.g. during cluster activation). */
private volatile boolean treatAllPartAsLoc;
/**
* @param cctx Context.
* @param entryFactory Entry factory.
*/
GridDhtPartitionTopologyImpl(GridCacheContext<?, ?> cctx, GridCacheMapEntryFactory entryFactory) {
assert cctx != null;
this.cctx = cctx;
this.entryFactory = entryFactory;
log = cctx.logger(getClass());
locParts = new AtomicReferenceArray<>(cctx.config().getAffinity().partitions());
}
/** {@inheritDoc} */
@Override public int cacheId() {
return cctx.cacheId();
}
/**
 * Resets topology state on client reconnect.
 */
public void onReconnected() {
lock.writeLock().lock();
try {
node2part = null;
part2node = new HashMap<>();
lastExchangeId = null;
updateSeq.set(1);
topReadyFut = null;
rebalancedTopVer = AffinityTopologyVersion.NONE;
topVer = AffinityTopologyVersion.NONE;
discoCache = cctx.discovery().discoCache();
}
finally {
lock.writeLock().unlock();
}
}
/**
* @return Full map string representation.
*/
@SuppressWarnings({"ConstantConditions"})
private String fullMapString() {
return node2part == null ? "null" : FULL_MAP_DEBUG ? node2part.toFullString() : node2part.toString();
}
/**
* @param map Map to get string for.
* @return Full map string representation.
*/
@SuppressWarnings({"ConstantConditions"})
private String mapString(GridDhtPartitionMap map) {
return map == null ? "null" : FULL_MAP_DEBUG ? map.toFullString() : map.toString();
}
/**
* Waits for renting partitions to be evicted and removes evicted partitions from the local map.
*
* @return {@code True} if mapping was changed.
* @throws IgniteCheckedException If failed.
*/
private boolean waitForRent() throws IgniteCheckedException {
final long longOpDumpTimeout =
IgniteSystemProperties.getLong(IgniteSystemProperties.IGNITE_LONG_OPERATIONS_DUMP_TIMEOUT, 60_000);
int dumpCnt = 0;
GridDhtLocalPartition part;
for (int i = 0; i < locParts.length(); i++) {
part = locParts.get(i);
if (part == null)
continue;
GridDhtPartitionState state = part.state();
if (state == RENTING || state == EVICTED) {
if (log.isDebugEnabled())
log.debug("Waiting for renting partition: " + part);
part.tryEvictAsync(false);
// Wait for partition to empty out.
if (longOpDumpTimeout > 0) {
while (true) {
try {
part.rent(true).get(longOpDumpTimeout);
break;
}
catch (IgniteFutureTimeoutCheckedException ignored) {
if (dumpCnt++ < GridDhtPartitionsExchangeFuture.DUMP_PENDING_OBJECTS_THRESHOLD) {
U.warn(log, "Failed to wait for partition eviction [" +
"topVer=" + topVer +
", cache=" + cctx.name() +
", part=" + part.id() +
", partState=" + part.state() +
", size=" + part.size() +
", reservations=" + part.reservations() +
", grpReservations=" + part.groupReserved() +
", node=" + cctx.localNodeId() + "]");
if (IgniteSystemProperties.getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, false))
U.dumpThreads(log);
}
}
}
}
else
part.rent(true).get();
if (log.isDebugEnabled())
log.debug("Finished waiting for renting partition: " + part);
}
}
// Remove evicted partitions.
lock.writeLock().lock();
try {
boolean changed = false;
for (int i = 0; i < locParts.length(); i++) {
part = locParts.get(i);
if (part == null)
continue;
if (part.state() == EVICTED) {
locParts.set(i, null);
changed = true;
}
}
return changed;
}
finally {
lock.writeLock().unlock();
}
}
/** {@inheritDoc} */
@SuppressWarnings({"LockAcquiredButNotSafelyReleased"})
@Override public void readLock() {
lock.readLock().lock();
}
/** {@inheritDoc} */
@Override public void readUnlock() {
lock.readLock().unlock();
}
/** {@inheritDoc} */
@Override public void updateTopologyVersion(
GridDhtPartitionExchangeId exchId,
GridDhtPartitionsExchangeFuture exchFut,
long updSeq,
boolean stopping
) throws IgniteInterruptedCheckedException {
U.writeLock(lock);
try {
assert exchId.topologyVersion().compareTo(topVer) > 0 : "Invalid topology version [topVer=" + topVer +
", exchId=" + exchId +
", fut=" + exchFut + ']';
this.stopping = stopping;
updateSeq.setIfGreater(updSeq);
topReadyFut = exchFut;
rebalancedTopVer = AffinityTopologyVersion.NONE;
topVer = exchId.topologyVersion();
discoCache = exchFut.discoCache();
}
finally {
lock.writeLock().unlock();
}
}
/** {@inheritDoc} */
@Override public AffinityTopologyVersion topologyVersion() {
AffinityTopologyVersion topVer = this.topVer;
assert topVer.topologyVersion() > 0 : "Invalid topology version [topVer=" + topVer +
", cacheName=" + cctx.name() + ']';
return topVer;
}
/** {@inheritDoc} */
@Override public GridDhtTopologyFuture topologyVersionFuture() {
assert topReadyFut != null;
return topReadyFut;
}
/** {@inheritDoc} */
@Override public boolean stopping() {
return stopping;
}
/** {@inheritDoc} */
@Override public void initPartitions(
GridDhtPartitionsExchangeFuture exchFut) throws IgniteInterruptedCheckedException {
U.writeLock(lock);
try {
if (stopping)
return;
long updateSeq = this.updateSeq.incrementAndGet();
initPartitions0(exchFut, updateSeq);
consistencyCheck();
}
finally {
lock.writeLock().unlock();
}
}
/**
* @param exchFut Exchange future.
* @param updateSeq Update sequence.
*/
private void initPartitions0(GridDhtPartitionsExchangeFuture exchFut, long updateSeq) {
ClusterNode loc = cctx.localNode();
ClusterNode oldest = discoCache.oldestAliveServerNodeWithCache();
GridDhtPartitionExchangeId exchId = exchFut.exchangeId();
assert topVer.equals(exchFut.topologyVersion()) :
"Invalid topology [topVer=" + topVer +
", cache=" + cctx.name() +
", futVer=" + exchFut.topologyVersion() +
", fut=" + exchFut + ']';
assert cctx.affinity().affinityTopologyVersion().equals(exchFut.topologyVersion()) :
"Invalid affinity [topVer=" + cctx.affinity().affinityTopologyVersion() +
", cache=" + cctx.name() +
", futVer=" + exchFut.topologyVersion() +
", fut=" + exchFut + ']';
List<List<ClusterNode>> aff = cctx.affinity().assignments(exchFut.topologyVersion());
int num = cctx.affinity().partitions();
if (cctx.rebalanceEnabled()) {
boolean added = exchFut.isCacheAdded(cctx.cacheId(), exchId.topologyVersion());
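// The local node is the first to start this cache (oldest node joining, or the
// cache was just added), so it can own its affinity partitions immediately:
// there is nobody to rebalance from.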
boolean first = (loc.equals(oldest) && loc.id().equals(exchId.nodeId()) && exchId.isJoined()) || added;
if (first) {
assert exchId.isJoined() || added;
for (int p = 0; p < num; p++) {
if (localNode(p, aff)) {
GridDhtLocalPartition locPart = createPartition(p);
boolean owned = locPart.own();
assert owned : "Failed to own partition for oldest node [cacheName" + cctx.name() +
", part=" + locPart + ']';
if (log.isDebugEnabled())
log.debug("Owned partition for oldest node: " + locPart);
updateSeq = updateLocal(p, locPart.state(), updateSeq);
}
}
}
else
createPartitions(aff, updateSeq);
}
else {
// If preloader is disabled, then we simply clear out
// the partitions this node is not responsible for.
for (int p = 0; p < num; p++) {
GridDhtLocalPartition locPart = localPartition(p, topVer, false, false);
boolean belongs = localNode(p, aff);
if (locPart != null) {
if (!belongs) {
GridDhtPartitionState state = locPart.state();
if (state.active()) {
locPart.rent(false);
updateSeq = updateLocal(p, locPart.state(), updateSeq);
if (log.isDebugEnabled())
log.debug("Evicting partition with rebalancing disabled " +
"(it does not belong to affinity): " + locPart);
}
}
else
locPart.own();
}
else if (belongs) {
locPart = createPartition(p);
locPart.own();
updateLocal(p, locPart.state(), updateSeq);
}
}
}
if (node2part != null && node2part.valid())
checkEvictions(updateSeq, aff);
updateRebalanceVersion(aff);
}
/**
* @param aff Affinity assignments.
* @param updateSeq Update sequence.
*/
private void createPartitions(List<List<ClusterNode>> aff, long updateSeq) {
int num = cctx.affinity().partitions();
for (int p = 0; p < num; p++) {
if (node2part != null && node2part.valid()) {
if (localNode(p, aff)) {
// This will make sure that all non-existing partitions
// will be created in MOVING state.
GridDhtLocalPartition locPart = createPartition(p);
updateSeq = updateLocal(p, locPart.state(), updateSeq);
}
}
// If this node's map is empty, we pre-create local partitions,
// so the local map will be sent correctly during exchange.
else if (localNode(p, aff))
createPartition(p);
}
}
/** {@inheritDoc} */
@Override public void beforeExchange(GridDhtPartitionsExchangeFuture exchFut, boolean affReady)
throws IgniteCheckedException {
DiscoveryEvent discoEvt = exchFut.discoveryEvent();
ClusterState newState = exchFut.newClusterState();
treatAllPartAsLoc = (newState != null && newState == ClusterState.ACTIVE)
|| (cctx.kernalContext().state().active()
&& discoEvt.type() == EventType.EVT_NODE_JOINED
&& discoEvt.eventNode().isLocal()
&& !cctx.kernalContext().clientNode()
);
// Wait for rent outside of checkpoint lock.
waitForRent();
ClusterNode loc = cctx.localNode();
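// Acquire checkpoint read lock so that partition state changes do not overlap
// with an ongoing checkpoint.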
cctx.shared().database().checkpointReadLock();
synchronized (cctx.shared().exchange().interruptLock()) {
if (Thread.currentThread().isInterrupted())
throw new IgniteInterruptedCheckedException("Thread is interrupted: " + Thread.currentThread());
try {
U.writeLock(lock);
}
catch (IgniteInterruptedCheckedException e) {
cctx.shared().database().checkpointReadUnlock();
throw e;
}
try {
GridDhtPartitionExchangeId exchId = exchFut.exchangeId();
if (stopping)
return;
assert topVer.equals(exchId.topologyVersion()) : "Invalid topology version [topVer=" +
topVer + ", exchId=" + exchId + ']';
if (exchId.isLeft())
removeNode(exchId.nodeId());
ClusterNode oldest = discoCache.oldestAliveServerNodeWithCache();
if (log.isDebugEnabled())
log.debug("Partition map beforeExchange [exchId=" + exchId + ", fullMap=" + fullMapString() + ']');
long updateSeq = this.updateSeq.incrementAndGet();
cntrMap.clear();
// If this is the oldest node.
if (oldest != null && (loc.equals(oldest) || exchFut.isCacheAdded(cctx.cacheId(), exchId.topologyVersion()))) {
if (node2part == null) {
node2part = new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq);
if (log.isDebugEnabled())
log.debug("Created brand new full topology map on oldest node [exchId=" +
exchId + ", fullMap=" + fullMapString() + ']');
}
else if (!node2part.valid()) {
node2part = new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq, node2part, false);
if (log.isDebugEnabled())
log.debug("Created new full topology map on oldest node [exchId=" + exchId + ", fullMap=" +
node2part + ']');
}
else if (!node2part.nodeId().equals(loc.id())) {
node2part = new GridDhtPartitionFullMap(oldest.id(), oldest.order(), updateSeq, node2part, false);
if (log.isDebugEnabled())
log.debug("Copied old map into new map on oldest node (previous oldest node left) [exchId=" +
exchId + ", fullMap=" + fullMapString() + ']');
}
}
if (affReady)
initPartitions0(exchFut, updateSeq);
else {
List<List<ClusterNode>> aff = cctx.affinity().idealAssignment();
createPartitions(aff, updateSeq);
}
consistencyCheck();
if (log.isDebugEnabled())
log.debug("Partition map after beforeExchange [exchId=" + exchId + ", fullMap=" +
fullMapString() + ']');
}
finally {
lock.writeLock().unlock();
cctx.shared().database().checkpointReadUnlock();
}
}
// Wait for evictions.
waitForRent();
}
/** {@inheritDoc} */
@Override public boolean afterExchange(GridDhtPartitionsExchangeFuture exchFut) throws IgniteCheckedException {
treatAllPartAsLoc = false;
boolean changed = waitForRent();
int num = cctx.affinity().partitions();
AffinityTopologyVersion topVer = exchFut.topologyVersion();
assert cctx.affinity().affinityTopologyVersion().equals(topVer) : "Affinity is not initialized " +
"[topVer=" + topVer +
", affVer=" + cctx.affinity().affinityTopologyVersion() +
", fut=" + exchFut + ']';
lock.writeLock().lock();
try {
if (stopping)
return false;
assert topVer.equals(exchFut.topologyVersion()) : "Invalid topology version [topVer=" +
topVer + ", exchId=" + exchFut.exchangeId() + ']';
if (log.isDebugEnabled())
log.debug("Partition map before afterExchange [exchId=" + exchFut.exchangeId() + ", fullMap=" +
fullMapString() + ']');
long updateSeq = this.updateSeq.incrementAndGet();
for (int p = 0; p < num; p++) {
GridDhtLocalPartition locPart = localPartition(p, topVer, false, false);
if (cctx.affinity().partitionLocalNode(p, topVer)) {
// This partition will be created during the next topology event,
// which has not happened yet at this point.
if (locPart == null) {
if (log.isDebugEnabled())
log.debug("Skipping local partition afterExchange (will not create): " + p);
continue;
}
GridDhtPartitionState state = locPart.state();
if (state == MOVING) {
if (cctx.rebalanceEnabled()) {
Collection<ClusterNode> owners = owners(p);
// If there are no other owners, then become an owner.
if (F.isEmpty(owners)) {
boolean owned = locPart.own();
assert owned : "Failed to own partition [cacheName" + cctx.name() + ", locPart=" +
locPart + ']';
updateSeq = updateLocal(p, locPart.state(), updateSeq);
changed = true;
if (cctx.events().isRecordable(EVT_CACHE_REBALANCE_PART_DATA_LOST)) {
DiscoveryEvent discoEvt = exchFut.discoveryEvent();
cctx.events().addPreloadEvent(p,
EVT_CACHE_REBALANCE_PART_DATA_LOST, discoEvt.eventNode(),
discoEvt.type(), discoEvt.timestamp());
}
if (log.isDebugEnabled())
log.debug("Owned partition: " + locPart);
}
else if (log.isDebugEnabled())
log.debug("Will not own partition (there are owners to rebalance from) [locPart=" +
locPart + ", owners = " + owners + ']');
}
else
updateSeq = updateLocal(p, locPart.state(), updateSeq);
}
}
else {
if (locPart != null) {
GridDhtPartitionState state = locPart.state();
if (state == MOVING && cctx.kernalContext().state().active()) {
locPart.rent(false);
updateSeq = updateLocal(p, locPart.state(), updateSeq);
changed = true;
if (log.isDebugEnabled())
log.debug("Evicting moving partition (it does not belong to affinity): " + locPart);
}
}
}
}
updateRebalanceVersion(cctx.affinity().assignments(topVer));
consistencyCheck();
}
finally {
lock.writeLock().unlock();
}
return changed;
}
/** {@inheritDoc} */
@Nullable @Override public GridDhtLocalPartition localPartition(int p, AffinityTopologyVersion topVer,
boolean create)
throws GridDhtInvalidPartitionException {
return localPartition(p, topVer, create, true);
}
/**
* @param p Partition number.
* @return Partition.
*/
private GridDhtLocalPartition createPartition(int p) {
assert lock.isWriteLockedByCurrentThread();
GridDhtLocalPartition loc = locParts.get(p);
if (loc == null || loc.state() == EVICTED) {
locParts.set(p, loc = new GridDhtLocalPartition(cctx, p, entryFactory));
if (cctx.shared().pageStore() != null) {
try {
cctx.shared().pageStore().onPartitionCreated(cctx.cacheId(), p);
}
catch (IgniteCheckedException e) {
// TODO ignite-db
throw new IgniteException(e);
}
}
}
return loc;
}
/**
* @param p Partition number.
* @param topVer Topology version.
* @param create Create flag.
* @param updateSeq Update sequence.
* @return Local partition.
*/
@SuppressWarnings("TooBroadScope")
private GridDhtLocalPartition localPartition(int p,
AffinityTopologyVersion topVer,
boolean create,
boolean updateSeq) {
GridDhtLocalPartition loc;
loc = locParts.get(p);
GridDhtPartitionState state = loc != null ? loc.state() : null;
if (loc != null && state != EVICTED && (state != RENTING || !cctx.allowFastEviction()))
return loc;
if (!create)
return null;
boolean created = false;
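// Re-check the partition under write lock: it may have been created or
// evicted concurrently.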
lock.writeLock().lock();
try {
loc = locParts.get(p);
state = loc != null ? loc.state() : null;
boolean belongs = cctx.affinity().partitionLocalNode(p, topVer);
if (loc != null && state == EVICTED) {
locParts.set(p, loc = null);
if (!treatAllPartAsLoc && !belongs)
throw new GridDhtInvalidPartitionException(p, "Adding entry to evicted partition " +
"(often may be caused by inconsistent 'key.hashCode()' implementation) " +
"[part=" + p + ", topVer=" + topVer + ", this.topVer=" + this.topVer + ']');
}
else if (loc != null && state == RENTING && cctx.allowFastEviction())
throw new GridDhtInvalidPartitionException(p, "Adding entry to partition that is concurrently evicted.");
if (loc == null) {
if (!treatAllPartAsLoc && !belongs)
throw new GridDhtInvalidPartitionException(p, "Creating partition which does not belong to " +
"local node (often may be caused by inconsistent 'key.hashCode()' implementation) " +
"[part=" + p + ", topVer=" + topVer + ", this.topVer=" + this.topVer + ']');
locParts.set(p, loc = new GridDhtLocalPartition(cctx, p, entryFactory));
if (updateSeq)
this.updateSeq.incrementAndGet();
created = true;
if (log.isDebugEnabled())
log.debug("Created local partition: " + loc);
}
}
finally {
lock.writeLock().unlock();
}
if (created && cctx.shared().pageStore() != null) {
try {
cctx.shared().pageStore().onPartitionCreated(cctx.cacheId(), p);
}
catch (IgniteCheckedException e) {
// TODO ignite-db
throw new IgniteException(e);
}
}
return loc;
}
/** {@inheritDoc} */
@Override public void releasePartitions(int... parts) {
assert parts != null;
assert parts.length > 0;
for (int i = 0; i < parts.length; i++) {
GridDhtLocalPartition part = locParts.get(parts[i]);
if (part != null)
part.release();
}
}
/** {@inheritDoc} */
@Override public GridDhtLocalPartition localPartition(Object key, boolean create) {
return localPartition(cctx.affinity().partition(key), AffinityTopologyVersion.NONE, create);
}
/** {@inheritDoc} */
@Override public List<GridDhtLocalPartition> localPartitions() {
List<GridDhtLocalPartition> list = new ArrayList<>(locParts.length());
for (int i = 0; i < locParts.length(); i++) {
GridDhtLocalPartition part = locParts.get(i);
if (part != null && part.state().active())
list.add(part);
}
return list;
}
/** {@inheritDoc} */
@Override public Iterable<GridDhtLocalPartition> currentLocalPartitions() {
return new Iterable<GridDhtLocalPartition>() {
@Override public Iterator<GridDhtLocalPartition> iterator() {
return new CurrentPartitionsIterator();
}
};
}
/** {@inheritDoc} */
@Override public void onRemoved(GridDhtCacheEntry e) {
/*
* Make sure not to acquire any locks here as this method
* may be called from sensitive synchronization blocks.
* ===================================================
*/
GridDhtLocalPartition loc = localPartition(e.partition(), topologyVersion(), false);
if (loc != null)
loc.onRemoved(e);
}
/** {@inheritDoc} */
@Override public GridDhtPartitionMap localPartitionMap() {
Map<Integer, GridDhtPartitionState> map = new HashMap<>();
lock.readLock().lock();
try {
for (int i = 0; i < locParts.length(); i++) {
GridDhtLocalPartition part = locParts.get(i);
if (part == null)
continue;
map.put(i, part.state());
}
return new GridDhtPartitionMap(cctx.nodeId(),
updateSeq.get(),
topVer,
Collections.unmodifiableMap(map),
true);
}
finally {
lock.readLock().unlock();
}
}
/** {@inheritDoc} */
@Override public GridDhtPartitionState partitionState(UUID nodeId, int part) {
lock.readLock().lock();
try {
GridDhtPartitionMap partMap = node2part.get(nodeId);
if (partMap != null) {
GridDhtPartitionState state = partMap.get(part);
return state == null ? EVICTED : state;
}
return EVICTED;
}
finally {
lock.readLock().unlock();
}
}
/** {@inheritDoc} */
@Nullable @Override public List<ClusterNode> nodes(int p,
AffinityAssignment affAssignment,
List<ClusterNode> affNodes) {
return nodes0(p, affAssignment, affNodes);
}
/** {@inheritDoc} */
@Override public List<ClusterNode> nodes(int p, AffinityTopologyVersion topVer) {
AffinityAssignment affAssignment = cctx.affinity().assignment(topVer);
List<ClusterNode> affNodes = affAssignment.get(p);
List<ClusterNode> nodes = nodes0(p, affAssignment, affNodes);
return nodes != null ? nodes : affNodes;
}
/**
* @param p Partition.
* @param affAssignment Assignments.
* @param affNodes Nodes assigned to the given partition by affinity.
* @return Nodes responsible for the given partition (primary is first), or {@code null}
* if the affinity nodes are sufficient.
*/
@Nullable private List<ClusterNode> nodes0(int p, AffinityAssignment affAssignment, List<ClusterNode> affNodes) {
AffinityTopologyVersion topVer = affAssignment.topologyVersion();
lock.readLock().lock();
try {
assert node2part != null && node2part.valid() : "Invalid node-to-partitions map [topVer1=" + topVer +
", topVer2=" + this.topVer +
", node=" + cctx.igniteInstanceName() +
", cache=" + cctx.name() +
", node2part=" + node2part + ']';
List<ClusterNode> nodes = null;
Collection<UUID> nodeIds = part2node.get(p);
if (!F.isEmpty(nodeIds)) {
for (UUID nodeId : nodeIds) {
HashSet<UUID> affIds = affAssignment.getIds(p);
if (!affIds.contains(nodeId) && hasState(p, nodeId, OWNING, MOVING, RENTING)) {
ClusterNode n = cctx.discovery().node(nodeId);
if (n != null && (topVer.topologyVersion() < 0 || n.order() <= topVer.topologyVersion())) {
if (nodes == null) {
nodes = new ArrayList<>(affNodes.size() + 2);
nodes.addAll(affNodes);
}
nodes.add(n);
}
}
}
}
return nodes;
}
finally {
lock.readLock().unlock();
}
}
/**
* @param p Partition.
* @param topVer Topology version ({@code -1} for all nodes).
* @param state Partition state.
* @param states Additional partition states.
* @return List of nodes for the partition.
*/
private List<ClusterNode> nodes(int p,
AffinityTopologyVersion topVer,
GridDhtPartitionState state,
GridDhtPartitionState... states) {
Collection<UUID> allIds = topVer.topologyVersion() > 0 ? F.nodeIds(discoCache.cacheAffinityNodes(cctx.cacheId())) : null;
lock.readLock().lock();
try {
assert node2part != null && node2part.valid() : "Invalid node-to-partitions map [topVer=" + topVer +
", allIds=" + allIds +
", node2part=" + node2part +
", cache=" + cctx.name() + ']';
Collection<UUID> nodeIds = part2node.get(p);
// Node IDs can be null if both primary and backup nodes disappear.
int size = nodeIds == null ? 0 : nodeIds.size();
if (size == 0)
return Collections.emptyList();
List<ClusterNode> nodes = new ArrayList<>(size);
for (UUID id : nodeIds) {
if (topVer.topologyVersion() > 0 && !F.contains(allIds, id))
continue;
if (hasState(p, id, state, states)) {
ClusterNode n = cctx.discovery().node(id);
if (n != null && (topVer.topologyVersion() < 0 || n.order() <= topVer.topologyVersion()))
nodes.add(n);
}
}
return nodes;
}
finally {
lock.readLock().unlock();
}
}
/** {@inheritDoc} */
@Override public List<ClusterNode> owners(int p, AffinityTopologyVersion topVer) {
if (!cctx.rebalanceEnabled())
return ownersAndMoving(p, topVer);
return nodes(p, topVer, OWNING);
}
/** {@inheritDoc} */
@Override public List<ClusterNode> owners(int p) {
return owners(p, AffinityTopologyVersion.NONE);
}
/** {@inheritDoc} */
@Override public List<ClusterNode> moving(int p) {
if (!cctx.rebalanceEnabled())
return ownersAndMoving(p, AffinityTopologyVersion.NONE);
return nodes(p, AffinityTopologyVersion.NONE, MOVING);
}
/**
* @param p Partition.
* @param topVer Topology version.
* @return List of nodes in state OWNING or MOVING.
*/
private List<ClusterNode> ownersAndMoving(int p, AffinityTopologyVersion topVer) {
return nodes(p, topVer, OWNING, MOVING);
}
/** {@inheritDoc} */
@Override public long updateSequence() {
return updateSeq.get();
}
/** {@inheritDoc} */
@Override public GridDhtPartitionFullMap partitionMap(boolean onlyActive) {
lock.readLock().lock();
try {
assert node2part != null && node2part.valid() : "Invalid node2part [node2part: " + node2part +
", cache=" + cctx.name() +
", started=" + cctx.started() +
", stopping=" + stopping +
", locNodeId=" + cctx.localNode().id() +
", locName=" + cctx.igniteInstanceName() + ']';
GridDhtPartitionFullMap m = node2part;
return new GridDhtPartitionFullMap(m.nodeId(), m.nodeOrder(), m.updateSequence(), m, onlyActive);
}
finally {
lock.readLock().unlock();
}
}
/** {@inheritDoc} */
@SuppressWarnings({"MismatchedQueryAndUpdateOfCollection"})
@Nullable @Override public GridDhtPartitionMap update(
@Nullable GridDhtPartitionExchangeId exchId,
GridDhtPartitionFullMap partMap,
@Nullable Map<Integer, T2<Long, Long>> cntrMap
) {
if (log.isDebugEnabled())
log.debug("Updating full partition map [exchId=" + exchId + ", parts=" + fullMapString() + ']');
assert partMap != null;
lock.writeLock().lock();
try {
if (stopping)
return null;
if (cntrMap != null) {
// Update local map partition counters.
for (Map.Entry<Integer, T2<Long, Long>> e : cntrMap.entrySet()) {
T2<Long, Long> cntr = this.cntrMap.get(e.getKey());
if (cntr == null || cntr.get2() < e.getValue().get2())
this.cntrMap.put(e.getKey(), e.getValue());
}
// Update counters in local partitions.
for (int i = 0; i < locParts.length(); i++) {
GridDhtLocalPartition part = locParts.get(i);
if (part == null)
continue;
T2<Long, Long> cntr = cntrMap.get(part.id());
if (cntr != null)
part.updateCounter(cntr.get2());
}
}
// Skip if the exchange ID is stale.
if (exchId != null && lastExchangeId != null && lastExchangeId.compareTo(exchId) >= 0) {
if (log.isDebugEnabled())
log.debug("Stale exchange id for full partition map update (will ignore) [lastExchId=" +
lastExchangeId + ", exchId=" + exchId + ']');
return null;
}
if (node2part != null && node2part.compareTo(partMap) >= 0) {
if (log.isDebugEnabled())
log.debug("Stale partition map for full partition map update (will ignore) [lastExchId=" +
lastExchangeId + ", exchId=" + exchId + ", curMap=" + node2part + ", newMap=" + partMap + ']');
return null;
}
long updateSeq = this.updateSeq.incrementAndGet();
if (exchId != null)
lastExchangeId = exchId;
if (node2part != null) {
for (GridDhtPartitionMap part : node2part.values()) {
GridDhtPartitionMap newPart = partMap.get(part.nodeId());
// If the current partition map for some node is newer than the received one,
// then we keep the newer value.
if (newPart != null &&
(newPart.updateSequence() < part.updateSequence() || (
cctx.startTopologyVersion() != null &&
newPart.topologyVersion() != null && // Backward compatibility.
cctx.startTopologyVersion().compareTo(newPart.topologyVersion()) > 0))
) {
if (log.isDebugEnabled())
log.debug("Overriding partition map in full update map [exchId=" + exchId + ", curPart=" +
mapString(part) + ", newPart=" + mapString(newPart) + ']');
partMap.put(part.nodeId(), part);
}
}
// Remove entry if the node has left.
for (Iterator<UUID> it = partMap.keySet().iterator(); it.hasNext(); ) {
UUID nodeId = it.next();
if (!cctx.discovery().alive(nodeId)) {
if (log.isDebugEnabled())
log.debug("Removing left node from full map update [nodeId=" + nodeId + ", partMap=" +
partMap + ']');
it.remove();
}
}
}
node2part = partMap;
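// Rebuild the reverse partition-to-node mapping from the new full map.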
Map<Integer, Set<UUID>> p2n = new HashMap<>(cctx.affinity().partitions(), 1.0f);
for (Map.Entry<UUID, GridDhtPartitionMap> e : partMap.entrySet()) {
for (Integer p : e.getValue().keySet()) {
Set<UUID> ids = p2n.get(p);
if (ids == null)
// Initialize HashSet to size 3 in anticipation that there won't be
// more than 3 nodes per partition.
p2n.put(p, ids = U.newHashSet(3));
ids.add(e.getKey());
}
}
part2node = p2n;
boolean changed = false;
AffinityTopologyVersion affVer = cctx.affinity().affinityTopologyVersion();
GridDhtPartitionMap nodeMap = partMap.get(cctx.localNodeId());
if (nodeMap != null && cctx.shared().database().persistenceEnabled()) {
for (Map.Entry<Integer, GridDhtPartitionState> e : nodeMap.entrySet()) {
int p = e.getKey();
GridDhtPartitionState state = e.getValue();
if (state == MOVING) {
GridDhtLocalPartition locPart = locParts.get(p);
assert locPart != null;
if (locPart.state() == OWNING) {
locPart.moving();
changed = true;
}
if (cntrMap != null) {
T2<Long, Long> cntr = cntrMap.get(p);
if (cntr != null && cntr.get2() > locPart.updateCounter())
locPart.updateCounter(cntr.get2());
}
}
}
}
if (!affVer.equals(AffinityTopologyVersion.NONE) && affVer.compareTo(topVer) >= 0) {
List<List<ClusterNode>> aff = cctx.affinity().assignments(topVer);
changed |= checkEvictions(updateSeq, aff);
updateRebalanceVersion(aff);
}
consistencyCheck();
if (log.isDebugEnabled())
log.debug("Partition map after full update: " + fullMapString());
if (changed)
cctx.shared().exchange().scheduleResendPartitions();
return changed ? localPartitionMap() : null;
}
finally {
lock.writeLock().unlock();
}
}
/** {@inheritDoc} */
@SuppressWarnings({"MismatchedQueryAndUpdateOfCollection"})
@Nullable @Override public GridDhtPartitionMap update(
@Nullable GridDhtPartitionExchangeId exchId,
GridDhtPartitionMap parts,
@Nullable Map<Integer, T2<Long, Long>> cntrMap
) {
if (log.isDebugEnabled())
log.debug("Updating single partition map [exchId=" + exchId + ", parts=" + mapString(parts) + ']');
if (!cctx.discovery().alive(parts.nodeId())) {
if (log.isDebugEnabled())
log.debug("Received partition update for non-existing node (will ignore) [exchId=" + exchId +
", parts=" + parts + ']');
return null;
}
lock.writeLock().lock();
try {
if (stopping)
return null;
if (cntrMap != null) {
for (Map.Entry<Integer, T2<Long, Long>> e : cntrMap.entrySet()) {
T2<Long, Long> cntr = this.cntrMap.get(e.getKey());
if (cntr == null || cntr.get2() < e.getValue().get2())
this.cntrMap.put(e.getKey(), e.getValue());
}
for (int i = 0; i < locParts.length(); i++) {
GridDhtLocalPartition part = locParts.get(i);
if (part == null)
continue;
T2<Long, Long> cntr = cntrMap.get(part.id());
if (cntr != null && cntr.get2() > part.updateCounter())
part.updateCounter(cntr.get2());
}
}
if (lastExchangeId != null && exchId != null && lastExchangeId.compareTo(exchId) > 0) {
if (log.isDebugEnabled())
log.debug("Stale exchange id for single partition map update (will ignore) [lastExchId=" +
lastExchangeId + ", exchId=" + exchId + ']');
return null;
}
if (exchId != null)
lastExchangeId = exchId;
if (node2part == null)
// Create invalid partition map.
node2part = new GridDhtPartitionFullMap();
GridDhtPartitionMap cur = node2part.get(parts.nodeId());
if (cur != null && cur.updateSequence() >= parts.updateSequence()) {
if (log.isDebugEnabled())
log.debug("Stale update sequence for single partition map update (will ignore) [exchId=" + exchId +
", curSeq=" + cur.updateSequence() + ", newSeq=" + parts.updateSequence() + ']');
return null;
}
long updateSeq = this.updateSeq.incrementAndGet();
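// Replace the full map with a copy carrying the new update sequence.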
node2part = new GridDhtPartitionFullMap(node2part, updateSeq);
boolean changed = false;
if (cur == null || !cur.equals(parts))
changed = true;
node2part.put(parts.nodeId(), parts);
part2node = new HashMap<>(part2node);
// Add new mappings.
for (Integer p : parts.keySet()) {
Set<UUID> ids = part2node.get(p);
if (ids == null)
// Initialize HashSet to size 3 in anticipation that there won't be
// more than 3 nodes per partition.
part2node.put(p, ids = U.newHashSet(3));
changed |= ids.add(parts.nodeId());
}
// Remove obsolete mappings.
if (cur != null) {
for (Integer p : F.view(cur.keySet(), F0.notIn(parts.keySet()))) {
Set<UUID> ids = part2node.get(p);
if (ids != null)
changed |= ids.remove(parts.nodeId());
}
}
AffinityTopologyVersion affVer = cctx.affinity().affinityTopologyVersion();
if (!affVer.equals(AffinityTopologyVersion.NONE) && affVer.compareTo(topVer) >= 0) {
List<List<ClusterNode>> aff = cctx.affinity().assignments(topVer);
changed |= checkEvictions(updateSeq, aff);
updateRebalanceVersion(aff);
}
consistencyCheck();
if (log.isDebugEnabled())
log.debug("Partition map after single update: " + fullMapString());
if (changed)
cctx.shared().exchange().scheduleResendPartitions();
return changed ? localPartitionMap() : null;
}
finally {
lock.writeLock().unlock();
}
}
/** {@inheritDoc} */
@Override public boolean detectLostPartitions(DiscoveryEvent discoEvt) {
lock.writeLock().lock();
try {
int parts = cctx.affinity().partitions();
Collection<Integer> lost = null;
for (int p = 0; p < parts; p++) {
boolean foundOwner = false;
Set<UUID> nodeIds = part2node.get(p);
if (nodeIds != null) {
for (UUID nodeId : nodeIds) {
GridDhtPartitionMap partMap = node2part.get(nodeId);
GridDhtPartitionState state = partMap.get(p);
if (state == OWNING) {
foundOwner = true;
break;
}
}
}
if (!foundOwner) {
if (lost == null)
lost = new HashSet<>(parts - p, 1.0f);
lost.add(p);
}
}
boolean changed = false;
if (lost != null) {
PartitionLossPolicy plc = cctx.config().getPartitionLossPolicy();
assert plc != null;
// Update partition state on all nodes.
for (Integer part : lost) {
long updSeq = updateSeq.incrementAndGet();
GridDhtLocalPartition locPart = localPartition(part, topVer, false);
if (locPart != null) {
boolean marked = plc == PartitionLossPolicy.IGNORE ? locPart.own() : locPart.markLost();
if (marked)
updateLocal(locPart.id(), locPart.state(), updSeq);
changed |= marked;
}
// Update map for remote node.
else if (plc != PartitionLossPolicy.IGNORE) {
Set<UUID> nodeIds = part2node.get(part);
if (nodeIds != null) {
for (UUID nodeId : nodeIds) {
GridDhtPartitionMap nodeMap = node2part.get(nodeId);
if (nodeMap.get(part) != EVICTED)
nodeMap.put(part, LOST);
}
}
}
if (cctx.events().isRecordable(EventType.EVT_CACHE_REBALANCE_PART_DATA_LOST))
cctx.events().addPreloadEvent(part, EVT_CACHE_REBALANCE_PART_DATA_LOST,
discoEvt.eventNode(), discoEvt.type(), discoEvt.timestamp());
}
if (plc != PartitionLossPolicy.IGNORE)
cctx.needsRecovery(true);
}
return changed;
}
finally {
lock.writeLock().unlock();
}
}
/** {@inheritDoc} */
@Override public void resetLostPartitions() {
lock.writeLock().lock();
try {
int parts = cctx.affinity().partitions();
long updSeq = updateSeq.incrementAndGet();
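// Return partitions marked as LOST back to OWNING, both locally and in the
// maps of all remote nodes that still hold them.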
for (int part = 0; part < parts; part++) {
Set<UUID> nodeIds = part2node.get(part);
if (nodeIds != null) {
boolean lost = false;
for (UUID node : nodeIds) {
GridDhtPartitionMap map = node2part.get(node);
if (map.get(part) == LOST) {
lost = true;
break;
}
}
if (lost) {
GridDhtLocalPartition locPart = localPartition(part, topVer, false);
if (locPart != null) {
boolean marked = locPart.own();
if (marked)
updateLocal(locPart.id(), locPart.state(), updSeq);
}
for (UUID nodeId : nodeIds) {
GridDhtPartitionMap nodeMap = node2part.get(nodeId);
if (nodeMap.get(part) == LOST)
nodeMap.put(part, OWNING);
}
}
}
}
checkEvictions(updSeq, cctx.affinity().assignments(topVer));
cctx.needsRecovery(false);
}
finally {
lock.writeLock().unlock();
}
}
/** {@inheritDoc} */
@Override public Collection<Integer> lostPartitions() {
lock.readLock().lock();
try {
Collection<Integer> res = null;
int parts = cctx.affinity().partitions();
for (int part = 0; part < parts; part++) {
Set<UUID> nodeIds = part2node.get(part);
if (nodeIds != null) {
for (UUID node : nodeIds) {
GridDhtPartitionMap map = node2part.get(node);
if (map.get(part) == LOST) {
if (res == null)
res = new ArrayList<>(parts - part);
res.add(part);
break;
}
}
}
}
return res == null ? Collections.<Integer>emptyList() : res;
}
finally {
lock.readLock().unlock();
}
}
/** {@inheritDoc} */
@Override public void setOwners(int p, Set<UUID> owners, boolean updateSeq) {
lock.writeLock().lock();
try {
GridDhtLocalPartition locPart = locParts.get(p);
if (locPart != null) {
if (locPart.state() == OWNING && !owners.contains(cctx.localNodeId()))
locPart.moving();
}
for (Map.Entry<UUID, GridDhtPartitionMap> e : node2part.entrySet()) {
if (!e.getValue().containsKey(p))
continue;
if (e.getValue().get(p) == OWNING && !owners.contains(e.getKey()))
e.getValue().put(p, MOVING);
}
if (updateSeq)
node2part = new GridDhtPartitionFullMap(node2part, this.updateSeq.incrementAndGet());
}
finally {
lock.writeLock().unlock();
}
}
/**
* @param updateSeq Update sequence.
* @return {@code True} if state changed.
*/
private boolean checkEvictions(long updateSeq) {
AffinityTopologyVersion affVer = cctx.affinity().affinityTopologyVersion();
boolean changed = false;
if (!affVer.equals(AffinityTopologyVersion.NONE) && affVer.compareTo(topVer) >= 0) {
List<List<ClusterNode>> aff = cctx.affinity().assignments(topVer);
changed = checkEvictions(updateSeq, aff);
updateRebalanceVersion(aff);
}
return changed;
}
/** {@inheritDoc} */
@Override public void checkEvictions() {
lock.writeLock().lock();
try {
long updateSeq = this.updateSeq.incrementAndGet();
node2part.newUpdateSequence(updateSeq);
checkEvictions(updateSeq);
}
finally {
lock.writeLock().unlock();
}
}
/**
 * Checks if any of the local partitions need to be evicted.
 *
 * @param updateSeq Update sequence.
 * @param aff Affinity assignments.
 * @return {@code True} if partition state changed.
 */
private boolean checkEvictions(long updateSeq, List<List<ClusterNode>> aff) {
if (!cctx.kernalContext().state().active())
return false;
boolean changed = false;
UUID locId = cctx.nodeId();
for (int p = 0; p < locParts.length(); p++) {
GridDhtLocalPartition part = locParts.get(p);
if (part == null)
continue;
GridDhtPartitionState state = part.state();
if (state.active()) {
List<ClusterNode> affNodes = aff.get(p);
if (!affNodes.contains(cctx.localNode())) {
List<ClusterNode> nodes = nodes(p, topVer, OWNING);
Collection<UUID> nodeIds = F.nodeIds(nodes);
// If all affinity nodes are owners, then evict partition from local node.
if (nodeIds.containsAll(F.nodeIds(affNodes))) {
part.rent(false);
updateSeq = updateLocal(part.id(), part.state(), updateSeq);
changed = true;
if (log.isDebugEnabled())
log.debug("Evicted local partition (all affinity nodes are owners): " + part);
}
else {
int ownerCnt = nodeIds.size();
int affCnt = affNodes.size();
if (ownerCnt > affCnt) {
// Sort by node order in ascending order.
Collections.sort(nodes, CU.nodeComparator(true));
int diff = nodes.size() - affCnt;
for (int i = 0; i < diff; i++) {
ClusterNode n = nodes.get(i);
if (locId.equals(n.id())) {
part.rent(false);
updateSeq = updateLocal(part.id(), part.state(), updateSeq);
changed = true;
if (log.isDebugEnabled())
log.debug("Evicted local partition (this node is oldest non-affinity node): " +
part);
break;
}
}
}
}
}
}
}
return changed;
}
/**
* Updates value for single partition.
*
* @param p Partition.
* @param state State.
* @param updateSeq Update sequence.
* @return Update sequence.
*/
@SuppressWarnings({"MismatchedQueryAndUpdateOfCollection"})
private long updateLocal(int p, GridDhtPartitionState state, long updateSeq) {
ClusterNode oldest = discoCache.oldestAliveServerNodeWithCache();
assert oldest != null || cctx.kernalContext().clientNode();
// If this node became the oldest node.
if (cctx.localNode().equals(oldest)) {
long seq = node2part.updateSequence();
if (seq != updateSeq) {
if (seq > updateSeq) {
long seq0 = this.updateSeq.get();
if (seq0 < seq) {
// Update global counter if necessary.
boolean b = this.updateSeq.compareAndSet(seq0, seq + 1);
assert b : "Invalid update sequence [updateSeq=" + updateSeq +
", seq=" + seq +
", curUpdateSeq=" + this.updateSeq.get() +
", node2part=" + node2part.toFullString() + ']';
updateSeq = seq + 1;
}
else
updateSeq = seq;
}
node2part.updateSequence(updateSeq);
}
}
if (node2part != null) {
UUID locNodeId = cctx.localNodeId();
GridDhtPartitionMap map = node2part.get(locNodeId);
if (map == null) {
map = new GridDhtPartitionMap(locNodeId,
updateSeq,
topVer,
Collections.<Integer, GridDhtPartitionState>emptyMap(),
false);
node2part.put(locNodeId, map);
}
map.updateSequence(updateSeq, topVer);
map.put(p, state);
Set<UUID> ids = part2node.get(p);
if (ids == null)
part2node.put(p, ids = U.newHashSet(3));
ids.add(locNodeId);
}
return updateSeq;
}
/**
* @param nodeId Node to remove.
*/
private void removeNode(UUID nodeId) {
assert nodeId != null;
ClusterNode oldest = discoCache.oldestAliveServerNode();
assert oldest != null || cctx.kernalContext().clientNode();
ClusterNode loc = cctx.localNode();
if (node2part != null) {
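// If the local node has become the oldest and does not yet own the full map,
// take ownership of it.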
if (loc.equals(oldest) && !node2part.nodeId().equals(loc.id())) {
updateSeq.setIfGreater(node2part.updateSequence());
node2part = new GridDhtPartitionFullMap(loc.id(), loc.order(), updateSeq.incrementAndGet(),
node2part, false);
}
else
node2part = new GridDhtPartitionFullMap(node2part, node2part.updateSequence());
GridDhtPartitionMap parts = node2part.remove(nodeId);
if (parts != null) {
for (Integer p : parts.keySet()) {
Set<UUID> nodeIds = part2node.get(p);
if (nodeIds != null) {
nodeIds.remove(nodeId);
if (nodeIds.isEmpty())
part2node.remove(p);
}
}
}
consistencyCheck();
}
}
/** {@inheritDoc} */
@Override public boolean own(GridDhtLocalPartition part) {
lock.writeLock().lock();
try {
if (part.own()) {
updateLocal(part.id(), part.state(), updateSeq.incrementAndGet());
consistencyCheck();
return true;
}
consistencyCheck();
return false;
}
finally {
lock.writeLock().unlock();
}
}
/** {@inheritDoc} */
@Override public void onEvicted(GridDhtLocalPartition part, boolean updateSeq) {
lock.writeLock().lock();
try {
if (stopping)
return;
assert part.state() == EVICTED;
long seq = updateSeq ? this.updateSeq.incrementAndGet() : this.updateSeq.get();
updateLocal(part.id(), part.state(), seq);
consistencyCheck();
}
finally {
lock.writeLock().unlock();
}
}
/** {@inheritDoc} */
@Nullable @Override public GridDhtPartitionMap partitions(UUID nodeId) {
lock.readLock().lock();
try {
return node2part.get(nodeId);
}
finally {
lock.readLock().unlock();
}
}
/** {@inheritDoc} */
@Override public Map<Integer, T2<Long, Long>> updateCounters(boolean skipZeros) {
lock.readLock().lock();
try {
Map<Integer, T2<Long, Long>> res;
if (skipZeros) {
res = U.newHashMap(cntrMap.size());
for (Map.Entry<Integer, T2<Long, Long>> e : cntrMap.entrySet()) {
Long cntr = e.getValue().get2();
if (ZERO.equals(cntr))
continue;
res.put(e.getKey(), e.getValue());
}
}
else
res = new HashMap<>(cntrMap);
for (int i = 0; i < locParts.length(); i++) {
GridDhtLocalPartition part = locParts.get(i);
if (part == null)
continue;
T2<Long, Long> cntr0 = res.get(part.id());
Long initCntr = part.initialUpdateCounter();
if (cntr0 == null || initCntr >= cntr0.get1()) {
if (skipZeros && initCntr == 0L && part.updateCounter() == 0L)
continue;
res.put(part.id(), new T2<>(initCntr, part.updateCounter()));
}
}
return res;
}
finally {
lock.readLock().unlock();
}
}
/** {@inheritDoc} */
@Override public boolean rebalanceFinished(AffinityTopologyVersion topVer) {
AffinityTopologyVersion curTopVer = this.topVer;
return curTopVer.equals(topVer) && curTopVer.equals(rebalancedTopVer);
}
/** {@inheritDoc} */
@Override public boolean hasMovingPartitions() {
lock.readLock().lock();
try {
assert node2part != null && node2part.valid() : "Invalid node2part [node2part: " + node2part +
", cache=" + cctx.name() +
", started=" + cctx.started() +
", stopping=" + stopping +
", locNodeId=" + cctx.localNode().id() +
", locName=" + cctx.igniteInstanceName() + ']';
for (GridDhtPartitionMap map : node2part.values()) {
if (map.hasMovingPartitions())
return true;
}
return false;
}
finally {
lock.readLock().unlock();
}
}
/** {@inheritDoc} */
@Override public void printMemoryStats(int threshold) {
X.println(">>> Cache partition topology stats [igniteInstanceName=" + cctx.igniteInstanceName() +
", cache=" + cctx.name() + ']');
lock.readLock().lock();
try {
for (int i = 0; i < locParts.length(); i++) {
GridDhtLocalPartition part = locParts.get(i);
if (part == null)
continue;
int size = part.dataStore().size();
if (size >= threshold)
X.println(">>> Local partition [part=" + part.id() + ", size=" + size + ']');
}
}
finally {
lock.readLock().unlock();
}
}
/**
* @param part Partition.
* @param aff Affinity assignments.
* @return {@code True} if given partition belongs to local node.
*/
private boolean localNode(int part, List<List<ClusterNode>> aff) {
return aff.get(part).contains(cctx.localNode());
}
/**
* @param aff Affinity assignments.
*/
private void updateRebalanceVersion(List<List<ClusterNode>> aff) {
if (!rebalancedTopVer.equals(topVer)) {
if (node2part == null || !node2part.valid())
return;
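// Rebalancing is complete only when, for every partition, the set of owners
// exactly matches the affinity assignment.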
for (int i = 0; i < cctx.affinity().partitions(); i++) {
List<ClusterNode> affNodes = aff.get(i);
// Topology doesn't contain server nodes (just clients).
if (affNodes.isEmpty())
continue;
List<ClusterNode> owners = owners(i);
if (affNodes.size() != owners.size() || !owners.containsAll(affNodes))
return;
}
rebalancedTopVer = topVer;
if (log.isDebugEnabled())
log.debug("Updated rebalanced version [cache=" + cctx.name() + ", ver=" + rebalancedTopVer + ']');
}
}
/**
* @param p Partition.
* @param nodeId Node ID.
* @param match State to match.
* @param matches Additional states.
* @return {@code True} if the node has one of the given states for the partition.
*/
private boolean hasState(final int p, @Nullable UUID nodeId, final GridDhtPartitionState match,
final GridDhtPartitionState... matches) {
if (nodeId == null)
return false;
GridDhtPartitionMap parts = node2part.get(nodeId);
// Map can be null if the node has been removed.
if (parts != null) {
GridDhtPartitionState state = parts.get(p);
if (state == match)
return true;
if (matches != null && matches.length > 0) {
for (GridDhtPartitionState s : matches) {
if (state == s)
return true;
}
}
}
return false;
}
/**
* Checks consistency after all operations.
*/
private void consistencyCheck() {
if (CONSISTENCY_CHECK) {
if (node2part == null)
return;
for (Map.Entry<UUID, GridDhtPartitionMap> e : node2part.entrySet()) {
for (Integer p : e.getValue().keySet()) {
Set<UUID> nodeIds = part2node.get(p);
assert nodeIds != null : "Failed consistency check [part=" + p + ", nodeId=" + e.getKey() + ']';
assert nodeIds.contains(e.getKey()) : "Failed consistency check [part=" + p + ", nodeId=" +
e.getKey() + ", nodeIds=" + nodeIds + ']';
}
}
for (Map.Entry<Integer, Set<UUID>> e : part2node.entrySet()) {
for (UUID nodeId : e.getValue()) {
GridDhtPartitionMap map = node2part.get(nodeId);
assert map != null : "Failed consistency check [part=" + e.getKey() + ", nodeId=" + nodeId + ']';
assert map.containsKey(e.getKey()) : "Failed consistency check [part=" + e.getKey() +
", nodeId=" + nodeId + ']';
}
}
}
}
/**
* Iterator over current local partitions.
*/
private class CurrentPartitionsIterator implements Iterator<GridDhtLocalPartition> {
/** Next index. */
private int nextIdx;
/** Next partition. */
private GridDhtLocalPartition nextPart;
/**
 * Constructor.
 */
private CurrentPartitionsIterator() {
advance();
}
/**
* Try to advance to next partition.
*/
private void advance() {
while (nextIdx < locParts.length()) {
GridDhtLocalPartition part = locParts.get(nextIdx);
if (part != null && part.state().active()) {
nextPart = part;
return;
}
nextIdx++;
}
}
/** {@inheritDoc} */
@Override public boolean hasNext() {
return nextPart != null;
}
/** {@inheritDoc} */
@Override public GridDhtLocalPartition next() {
if (nextPart == null)
throw new NoSuchElementException();
GridDhtLocalPartition retVal = nextPart;
nextPart = null;
nextIdx++;
advance();
return retVal;
}
/** {@inheritDoc} */
@Override public void remove() {
throw new UnsupportedOperationException("remove");
}
}
}