/*
* Licensed to the Apache Software Foundation (ASF) under one or more contributor license
* agreements. See the NOTICE file distributed with this work for additional information regarding
* copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License. You may obtain a
* copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package org.apache.geode.internal.cache;
import org.apache.geode.CancelException;
import org.apache.geode.SystemFailure;
import org.apache.geode.cache.CacheClosedException;
import org.apache.geode.cache.PartitionedRegionStorageException;
import org.apache.geode.cache.Region;
import org.apache.geode.cache.RegionDestroyedException;
import org.apache.geode.cache.persistence.PartitionOfflineException;
import org.apache.geode.cache.persistence.PersistentID;
import org.apache.geode.distributed.DistributedMember;
import org.apache.geode.distributed.internal.DM;
import org.apache.geode.distributed.internal.DistributionConfig;
import org.apache.geode.distributed.internal.LonerDistributionManager;
import org.apache.geode.distributed.internal.MembershipListener;
import org.apache.geode.distributed.internal.membership.InternalDistributedMember;
import org.apache.geode.i18n.StringId;
import org.apache.geode.internal.Assert;
import org.apache.geode.internal.NanoTimer;
import org.apache.geode.internal.OneTaskOnlyExecutor;
import org.apache.geode.internal.cache.PartitionedRegion.RetryTimeKeeper;
import org.apache.geode.internal.cache.PartitionedRegionDataStore.CreateBucketResult;
import org.apache.geode.internal.cache.control.InternalResourceManager;
import org.apache.geode.internal.cache.partitioned.*;
import org.apache.geode.internal.cache.partitioned.FetchPartitionDetailsMessage.FetchPartitionDetailsResponse;
import org.apache.geode.internal.cache.partitioned.ManageBucketMessage.NodeResponse;
import org.apache.geode.internal.cache.partitioned.RegionAdvisor.PartitionProfile;
import org.apache.geode.internal.cache.partitioned.rebalance.CompositeDirector;
import org.apache.geode.internal.cache.partitioned.rebalance.FPRDirector;
import org.apache.geode.internal.cache.partitioned.rebalance.RebalanceDirector;
import org.apache.geode.internal.cache.persistence.MembershipFlushRequest;
import org.apache.geode.internal.cache.persistence.PersistentMemberID;
import org.apache.geode.internal.cache.persistence.PersistentStateListener;
import org.apache.geode.internal.i18n.LocalizedStrings;
import org.apache.geode.internal.logging.LogService;
import org.apache.geode.internal.logging.log4j.LocalizedMessage;
import org.apache.logging.log4j.Logger;
import java.util.*;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
/**
 * This class provides the redundancy management for a partitioned region. It provides the
 * following to the PartitionedRegion:
 * <br>
 * (1) Redundancy management at the time of bucket creation.
 * <br>
 * (2) Redundancy management at new node arrival.
 * <br>
 * (3) Redundancy management when a node leaves the partitioned region distributed system
 * gracefully, i.e. Cache.close().
 * <br>
 * (4) Redundancy management at random node failure.
 */
public class PRHARedundancyProvider {
// Class-wide logger obtained through Geode's LogService facade.
private static final Logger logger = LogService.getLogger();
// System property (gemfire.DISABLE_CREATE_BUCKET_RANDOMNESS) — read once at class load.
// Presumably disables random selection among equally-preferred data stores during bucket
// creation; the use site is not visible in this chunk — TODO confirm.
private static final boolean DISABLE_CREATE_BUCKET_RANDOMNESS =
    Boolean.getBoolean(DistributionConfig.GEMFIRE_PREFIX + "DISABLE_CREATE_BUCKET_RANDOMNESS");
/**
 * An {@link ArrayList} that remembers whether {@link #clear()} has ever been invoked on it.
 * Callers use {@link #wasCleared()} to tell a list that is empty because it was reset apart
 * from one that was never populated.
 */
public static class ArrayListWithClearState<T> extends ArrayList<T> {
  private static final long serialVersionUID = 1L;

  /** Set to true the first time clear() runs; never reset. */
  private boolean wasCleared = false;

  /** @return true once clear() has been called at least once on this list */
  public boolean wasCleared() {
    return wasCleared;
  }

  @Override
  public void clear() {
    super.clear();
    wasCleared = true;
  }
}
/**
 * Name of the system property controlling the datastore-discovery timeout, in milliseconds.
 */
public static final String DATASTORE_DISCOVERY_TIMEOUT_PROPERTY_NAME =
    DistributionConfig.GEMFIRE_PREFIX + "partitionedRegionDatastoreDiscoveryTimeout";
// Read once at class load; volatile (and package-visible) so it can be overridden at runtime,
// e.g. by tests.
static volatile Long DATASTORE_DISCOVERY_TIMEOUT_MILLISECONDS =
    Long.getLong(DATASTORE_DISCOVERY_TIMEOUT_PROPERTY_NAME);
/** The partitioned region whose HA redundancy this provider manages. */
public final PartitionedRegion prRegion;
// Timestamp of the last "insufficient stores" log; presumably compared against
// INSUFFICIENT_LOGGING_THROTTLE_TIME to throttle warnings — use site not visible here.
private static AtomicLong insufficientLogTimeStamp = new AtomicLong(0);
// Whether this provider has logged its first "insufficient stores" warning.
private final AtomicBoolean firstInsufficentStoresLogged = new AtomicBoolean(false);
/**
 * An executor to submit tasks for redundancy recovery to. It makes sure that there will only be
 * one redundancy recovery task in the queue at a time.
 */
protected final OneTaskOnlyExecutor recoveryExecutor;
// Handle to the currently scheduled recovery task, if any; volatile because it is set and
// read by different threads.
private volatile ScheduledFuture<?> recoveryFuture;
// Lock guarding the shutdown flag below.
private final Object shutdownLock = new Object();
// True once this provider has been shut down; guarded by shutdownLock.
private boolean shutdown = false;
// Counted down as persistent buckets finish recovery from disk.
volatile CountDownLatch allBucketsRecoveredFromDisk;
/**
 * Used to consolidate logging for bucket regions waiting on other members to come online.
 */
private RedundancyLogger redundancyLogger = null;
/**
 * Constructor for PRHARedundancyProvider.
 *
 * @param region The PartitionedRegion for which the HA redundancy is required to be managed.
 */
public PRHARedundancyProvider(final PartitionedRegion region) {
  prRegion = region;
  final InternalResourceManager manager = region.getGemFireCache().getResourceManager();
  // Notify the resource observer whenever a queued recovery task is conflated away, so
  // interested parties (e.g. tests) can see that recovery work was collapsed.
  final OneTaskOnlyExecutor.ConflatedTaskListener droppedTaskListener =
      new OneTaskOnlyExecutor.ConflatedTaskListener() {
        public void taskDropped() {
          InternalResourceManager.getResourceObserver().recoveryConflated(region);
        }
      };
  recoveryExecutor = new OneTaskOnlyExecutor(manager.getExecutor(), droppedTaskListener);
}
/**
 * Display bucket allocation status for a partitioned region.
 *
 * @param prRegion the given region
 * @param allStores the list of available stores. If null, unknown.
 * @param alreadyUsed stores allocated; only used if allStores != null
 * @param forLog true if the generated string is for a log message (single line, no indent)
 * @return the description string
 */
public static String regionStatus(PartitionedRegion prRegion, Set allStores,
    Collection alreadyUsed, boolean forLog) {
  // StringBuilder rather than StringBuffer: the buffer is confined to this call, so the
  // synchronized StringBuffer buys nothing.
  StringBuilder sb = new StringBuilder();
  sb.append("Partitioned Region name = " + prRegion.getFullPath());
  final char newLine;
  final String spaces;
  if (forLog) {
    newLine = ' ';
    spaces = "";
  } else {
    newLine = '\n';
    spaces = " ";
  }
  if (allStores != null) {
    // NOTE(review): `spaces` is only applied on two of the six lines below — looks like a
    // historical inconsistency, but the text is preserved as-is since logs/tests may match it.
    sb.append(newLine + spaces + "Redundancy level set to " + prRegion.getRedundantCopies());
    sb.append(newLine + ". Number of available data stores: " + allStores.size());
    sb.append(newLine + spaces + ". Number successfully allocated = " + alreadyUsed.size());
    sb.append(newLine + ". Data stores: " + PartitionedRegionHelper.printCollection(allStores));
    sb.append(newLine + ". Data stores successfully allocated: "
        + PartitionedRegionHelper.printCollection(alreadyUsed));
    sb.append(newLine + ". Equivalent members: " + PartitionedRegionHelper
        .printCollection(prRegion.getDistributionManager().getMembersInThisZone()));
  }
  return sb.toString();
}
/**
 * Suffix appended to timeout messages suggesting membership or region-creation stress as a
 * likely cause when the system otherwise has sufficient space.
 */
static public final StringId TIMEOUT_MSG =
    LocalizedStrings.PRHARedundancyProvider_IF_YOUR_SYSTEM_HAS_SUFFICIENT_SPACE_PERHAPS_IT_IS_UNDER_MEMBERSHIP_OR_REGION_CREATION_STRESS;
/**
 * Indicate a timeout due to excessive retries among available peers by throwing a
 * {@link PartitionedRegionStorageException} describing the region's allocation state.
 *
 * @param prRegion the region whose operation timed out
 * @param allStores all feasible stores. If null, we don't know.
 * @param alreadyUsed those that have already accepted, only used if allStores != null
 * @param opString description of the operation which timed out
 * @param timeOut how long the operation waited, in milliseconds
 */
public static void timedOut(PartitionedRegion prRegion, Set allStores, Collection alreadyUsed,
    String opString, long timeOut) {
  final Object[] messageArgs = new Object[] {opString,
      regionStatus(prRegion, allStores, alreadyUsed, true), Long.valueOf(timeOut)};
  final String message =
      LocalizedStrings.PRHARedundancyProvider_TIMED_OUT_ATTEMPTING_TO_0_IN_THE_PARTITIONED_REGION__1_WAITED_FOR_2_MS
          .toLocalizedString(messageArgs) + TIMEOUT_MSG;
  throw new PartitionedRegionStorageException(message);
}
/**
 * Compute the full set of members that can host buckets for this region. For a fixed
 * partitioned region (partitionName != null) this is the set of stores on which the named
 * partition is defined; otherwise it is every advised data store plus this member when it
 * hosts a data store itself.
 */
private Set<InternalDistributedMember> getAllStores(String partitionName) {
  if (partitionName != null) {
    // FPR: only members that define the named partition qualify.
    return getFixedPartitionStores(partitionName);
  }
  final Set<InternalDistributedMember> stores =
      prRegion.getRegionAdvisor().adviseDataStore(true);
  // The advisor reports remote members; add ourselves when we host data locally.
  if (prRegion.getDataStore() != null) {
    stores.add(prRegion.getDistributionManager().getId());
  }
  return stores;
}
/**
 * This is for FPR: for the given partition, return the set of datastores on which the given
 * partition is defined, including this member when its local fixed-partition attributes define
 * the partition.
 *
 * @param partitionName name of the partition for which datastores need to be found out
 */
private Set<InternalDistributedMember> getFixedPartitionStores(String partitionName) {
  final Set<InternalDistributedMember> members =
      prRegion.getRegionAdvisor().adviseFixedPartitionDataStores(partitionName);
  final List<FixedPartitionAttributesImpl> localAttributes =
      prRegion.getFixedPartitionAttributesImpl();
  if (localAttributes != null) {
    // Include ourselves when any local fixed-partition attribute matches the requested name.
    for (FixedPartitionAttributesImpl attribute : localAttributes) {
      if (attribute.getPartitionName().equals(partitionName)) {
        members.add((InternalDistributedMember) prRegion.getMyId());
      }
    }
  }
  return members;
}
/**
 * Signature string indicating that not enough stores are available.
 */
static public final StringId INSUFFICIENT_STORES_MSG =
    LocalizedStrings.PRHARedundancyProvider_CONSIDER_STARTING_ANOTHER_MEMBER;
/**
 * Signature string indicating that there are enough stores available.
 */
static public final StringId SUFFICIENT_STORES_MSG =
    LocalizedStrings.PRHARRedundancyProvider_FOUND_A_MEMBER_TO_HOST_A_BUCKET;
/**
 * String describing the attempt to allocate enough members to host a bucket; used in the
 * timeout message thrown from createBucketAtomically.
 */
private static final StringId ALLOCATE_ENOUGH_MEMBERS_TO_HOST_BUCKET =
    LocalizedStrings.PRHARRedundancyProvider_ALLOCATE_ENOUGH_MEMBERS_TO_HOST_BUCKET;
/**
 * Indicate that we are unable to allocate sufficient stores and the timeout period has passed.
 * Either logs a warning (onlyLog == true) or throws a PartitionedRegionStorageException.
 *
 * @param allStores stores we know about
 * @param alreadyUsed ones already committed
 * @param onlyLog true if only a warning log message should be generated.
 */
private void insufficientStores(Set allStores, Collection alreadyUsed, boolean onlyLog) {
  final String regionDescription = regionStatus(this.prRegion, allStores, alreadyUsed, onlyLog);
  // Log output stays on one line; exception text may span several.
  final char separator = onlyLog ? ' ' : '\n';
  // No members at all is a different failure than partial redundancy.
  final StringId message = alreadyUsed.isEmpty()
      ? LocalizedStrings.PRHARRedundancyProvider_UNABLE_TO_FIND_ANY_MEMBERS_TO_HOST_A_BUCKET_IN_THE_PARTITIONED_REGION_0
      : LocalizedStrings.PRHARRedundancyProvider_CONFIGURED_REDUNDANCY_LEVEL_COULD_NOT_BE_SATISFIED_0;
  final Object[] messageArgs = new Object[] {
      PRHARedundancyProvider.INSUFFICIENT_STORES_MSG,
      separator + regionDescription + separator};
  if (onlyLog) {
    logger.warn(LocalizedMessage.create(message, messageArgs));
  } else {
    throw new PartitionedRegionStorageException(message.toLocalizedString(messageArgs));
  }
}
/**
 * Create a single copy of this bucket on one node. The bucket must already be locked.
 *
 * @param bucketId The bucket we are working on
 * @param newBucketSize size to create it
 * @param excludedMembers members never to consider again (e.g. those whose cache reported
 *        CLOSED, or members excluded by the caller for zone uniqueness)
 * @param alreadyUsed members who already seem to have the bucket
 * @param failedMembers members that declined or failed a previous attempt; cleared and
 *        re-tried here when all other candidates are exhausted before the timeout
 * @param timeOut point at which to fail
 * @param allStores the set of data stores to choose from
 * @return the new member, null if it fails.
 * @throws PartitionedRegionStorageException if there are not enough data stores
 */
private InternalDistributedMember createBucketInstance(int bucketId, final int newBucketSize,
    final Set<InternalDistributedMember> excludedMembers,
    Collection<InternalDistributedMember> alreadyUsed,
    ArrayListWithClearState<InternalDistributedMember> failedMembers, final long timeOut,
    final Set<InternalDistributedMember> allStores) {
  final boolean isDebugEnabled = logger.isDebugEnabled();
  // Recalculate list of candidates
  HashSet<InternalDistributedMember> candidateMembers =
      new HashSet<InternalDistributedMember>(allStores);
  candidateMembers.removeAll(alreadyUsed);
  candidateMembers.removeAll(excludedMembers);
  candidateMembers.removeAll(failedMembers);
  if (isDebugEnabled) {
    logger.debug("AllStores={} AlreadyUsed={} excluded={} failed={}", allStores, alreadyUsed,
        excludedMembers, failedMembers);
  }
  if (candidateMembers.size() == 0) {
    this.prRegion.checkReadiness(); // fix for bug #37207
    // Run out of candidates. Refetch?
    if (System.currentTimeMillis() > timeOut) {
      if (isDebugEnabled) {
        logger.debug("createBucketInstance: ran out of candidates and timed out");
      }
      return null; // fail, let caller signal error
    }
    // Recalculate, this time forgiving previously failed members: they may have been
    // only transiently imbalanced, so give them another chance before the timeout.
    candidateMembers = new HashSet<InternalDistributedMember>(allStores);
    candidateMembers.removeAll(alreadyUsed);
    candidateMembers.removeAll(excludedMembers);
    failedMembers.clear();
  }
  if (isDebugEnabled) {
    logger.debug("createBucketInstance: candidateMembers = {}", candidateMembers);
  }
  InternalDistributedMember candidate = null;
  // If there are no candidates, early out.
  if (candidateMembers.size() == 0) { // no options
    if (isDebugEnabled) {
      logger.debug("createBucketInstance: no valid candidates");
    }
    return null; // failure
  } // no options
  else {
    // In case of FPR, candidateMembers is the set of members on which
    // required fixed partition is defined.
    if (this.prRegion.isFixedPartitionedRegion()) {
      candidate = candidateMembers.iterator().next();
    } else {
      String prName = this.prRegion.getAttributes().getPartitionAttributes().getColocatedWith();
      if (prName != null) {
        // Colocated region: the bucket must land on the member hosting the matching
        // bucket of the region we are colocated with.
        candidate = getColocatedDataStore(candidateMembers, alreadyUsed, bucketId, prName);
      } else {
        final ArrayList<InternalDistributedMember> orderedCandidates =
            new ArrayList<InternalDistributedMember>(candidateMembers);
        candidate = getPreferredDataStore(orderedCandidates, alreadyUsed);
      }
    }
  }
  if (candidate == null) {
    // No acceptable member among the candidates; mark them all failed for this round.
    failedMembers.addAll(candidateMembers);
    return null;
  }
  if (!this.prRegion.isShadowPR()
      && !ColocationHelper.checkMembersColocation(this.prRegion, candidate)) {
    if (isDebugEnabled) {
      logger.debug(
          "createBucketInstances - Member does not have all of the regions colocated with prRegion {}",
          candidate);
    }
    failedMembers.add(candidate);
    return null;
  }
  if (!(candidate.equals(this.prRegion.getMyId()))) { // myself
    // A remote candidate without a partition profile cannot be messaged yet.
    PartitionProfile pp = this.prRegion.getRegionAdvisor().getPartitionProfile(candidate);
    if (pp == null) {
      if (isDebugEnabled) {
        logger.debug("createBucketInstance: {}: no partition profile for {}",
            this.prRegion.getFullPath(), candidate);
      }
      failedMembers.add(candidate);
      return null;
    }
  } // myself
  // Coordinate with any remote close occurring, causing it to wait until
  // this create bucket attempt has been made.
  final ManageBucketRsp response =
      createBucketOnMember(bucketId, candidate, newBucketSize, failedMembers.wasCleared());
  // Add targetNode to bucketNodes if successful, else to failedNodeList
  if (response.isAcceptance()) {
    return candidate; // success!
  }
  if (isDebugEnabled) {
    logger.debug("createBucketInstance: {}: candidate {} declined to manage bucketId={}: {}",
        this.prRegion.getFullPath(), candidate, this.prRegion.bucketStringForLogs(bucketId),
        response);
  }
  // CLOSED means the candidate's cache is shutting down; never retry it. Any other
  // refusal is treated as transient.
  if (response.equals(ManageBucketRsp.CLOSED)) {
    excludedMembers.add(candidate);
  } else {
    failedMembers.add(candidate);
  }
  candidate = null; // failure
  return null;
}
/**
 * Minimum interval, in nanoseconds, between "insufficient stores" warnings. Configurable via
 * the gemfire.InsufficientLoggingThrottleTime system property (in seconds, default 2).
 */
public static final long INSUFFICIENT_LOGGING_THROTTLE_TIME = TimeUnit.SECONDS.toNanos(
    Integer.getInteger(DistributionConfig.GEMFIRE_PREFIX + "InsufficientLoggingThrottleTime", 2)
        .intValue());
// Test hook; flips behavior somewhere outside this chunk — TODO confirm use sites.
public volatile static boolean TEST_MODE = false;
// since 6.6, please use the distributed system property enforce-unique-host instead.
// public static final boolean ENFORCE_UNIQUE_HOST_STORAGE_ALLOCATION =
// DistributionConfig.DEFAULT_ENFORCE_UNIQUE_HOST;
/**
 * Ask an initialized data store to create the bucket on our behalf, trying one store per
 * iteration until a member reports success or a node becomes writable for the bucket.
 * For a fixed partitioned region the advised primary-partition store is preferred while it
 * remains available.
 *
 * @param bucketId the bucket to create
 * @param size size hint for the first entry, forwarded in the CreateBucketMessage
 * @param startTime a time stamp prior to calling; presumably for stats — not read here
 * @param snoozer retry-time tracker used when re-checking for a bucket-write node
 * @return the member that can accept writes for the bucket
 */
public InternalDistributedMember createBucketOnDataStore(int bucketId, int size, long startTime,
    RetryTimeKeeper snoozer) {
  Set<InternalDistributedMember> attempted = new HashSet<InternalDistributedMember>();
  InternalDistributedMember ret;
  InternalDistributedMember primaryForFixedPartition = null;
  if (this.prRegion.isFixedPartitionedRegion()) {
    primaryForFixedPartition =
        this.prRegion.getRegionAdvisor().adviseFixedPrimaryPartitionDataStore(bucketId);
  }
  final boolean isDebugEnabled = logger.isDebugEnabled();
  do {
    this.prRegion.checkReadiness();
    Set<InternalDistributedMember> available =
        this.prRegion.getRegionAdvisor().adviseInitializedDataStore();
    InternalDistributedMember target = null;
    available.removeAll(attempted);
    // Pick a single target: prefer the fixed-partition primary when it is still in the
    // available set, otherwise the first untried member. The unconditional break means
    // at most one iteration runs.
    for (InternalDistributedMember member : available) {
      if (available.contains(primaryForFixedPartition)) {
        target = primaryForFixedPartition;
      } else {
        target = member;
      }
      break;
    }
    if (target == null) {
      // Nothing left to try: optionally log a throttled warning, then throw.
      if (shouldLogInsufficientStores()) {
        insufficientStores(available, Collections.emptySet(), true);
      }
      // this will always throw an exception
      insufficientStores(available, Collections.emptySet(), false);
    }
    try {
      if (isDebugEnabled) {
        logger.debug("Attempting to get data store {} to create the bucket {} for us", target,
            this.prRegion.bucketStringForLogs(bucketId));
      }
      CreateBucketMessage.NodeResponse response =
          CreateBucketMessage.send(target, this.prRegion, bucketId, size);
      ret = response.waitForResponse();
      if (ret != null) {
        return ret;
      }
    } catch (ForceReattemptException e) {
      // do nothing, we will already check again for a primary.
    }
    attempted.add(target);
  } while ((ret = this.prRegion.getNodeForBucketWrite(bucketId, snoozer)) == null);
  return ret;
}
/**
 * Creates bucket atomically by creating all the copies to satisfy redundancy. In case all copies
 * can not be created, a PartitionedRegionStorageException is thrown to the user and
 * BucketBackupMessage is sent to the nodes to make copies of a bucket that was only partially
 * created. Other VMs are informed of bucket creation through updates through their
 * {@link BucketAdvisor.BucketProfile}s.
 *
 * <p>
 * This method is synchronized to enforce a single threaded ordering, allowing for a more accurate
 * picture of bucket distribution in the face of concurrency. See bug 37275.
 * </p>
 *
 * This method is now slightly misnamed. Another member could be in the process of creating this
 * same bucket at the same time.
 *
 * @param bucketId Id of the bucket to be created.
 * @param newBucketSize size of the first entry.
 * @param startTime a time stamp prior to calling the method, used to update bucket creation stats
 * @param finishIncompleteCreation true when completing a previously partial creation; an
 *        existing primary is then not treated as "already done" and no cleanup message is sent
 *        on failure
 * @param partitionName for a fixed partitioned region, the partition being created; else null
 * @return the primary member for the newly created bucket
 * @throws PartitionedRegionStorageException if required # of buckets can not be created to
 *         satisfy redundancy.
 * @throws PartitionedRegionException if d-lock can not be acquired to create bucket.
 * @throws PartitionOfflineException if persistent data recovery is not complete for a partitioned
 *         region referred to in the query.
 */
public InternalDistributedMember createBucketAtomically(final int bucketId,
    final int newBucketSize, final long startTime, final boolean finishIncompleteCreation,
    String partitionName) throws PartitionedRegionStorageException, PartitionedRegionException,
    PartitionOfflineException {
  final boolean isDebugEnabled = logger.isDebugEnabled();
  prRegion.checkPROffline();
  // If there are insufficient stores throw *before* we try acquiring the
  // (very expensive) bucket lock or the (somewhat expensive) monitor on this
  earlySufficientStoresCheck(partitionName);
  synchronized (this) {
    if (this.prRegion.getCache().isCacheAtShutdownAll()) {
      throw new CacheClosedException("Cache is shutting down");
    }
    if (isDebugEnabled) {
      logger.debug("Starting atomic creation of bucketId={}",
          this.prRegion.bucketStringForLogs(bucketId));
    }
    Collection<InternalDistributedMember> acceptedMembers =
        new ArrayList<InternalDistributedMember>(); // ArrayList<DataBucketStores>
    Set<InternalDistributedMember> excludedMembers = new HashSet<InternalDistributedMember>();
    ArrayListWithClearState<InternalDistributedMember> failedMembers =
        new ArrayListWithClearState<InternalDistributedMember>();
    final long timeOut = System.currentTimeMillis() + computeTimeout();
    BucketMembershipObserver observer = null;
    boolean needToElectPrimary = true;
    InternalDistributedMember bucketPrimary = null;
    try {
      this.prRegion.checkReadiness();
      Bucket toCreate = this.prRegion.getRegionAdvisor().getBucket(bucketId);
      if (!finishIncompleteCreation) {
        // If a primary already exists, the bucket is already fully created elsewhere.
        bucketPrimary = this.prRegion.getBucketPrimary(bucketId);
        if (bucketPrimary != null) {
          if (isDebugEnabled) {
            logger.debug(
                "during atomic creation, discovered that the primary already exists {} returning early",
                bucketPrimary);
          }
          needToElectPrimary = false;
          return bucketPrimary;
        }
      }
      observer = new BucketMembershipObserver(toCreate).beginMonitoring();
      boolean loggedInsufficentStores = false; // track if insufficient data stores have been
      // detected
      // Retry loop: each pass tries to place one more copy until redundancy is satisfied,
      // candidates are exhausted, or the timeout elapses.
      for (;;) {
        this.prRegion.checkReadiness();
        if (this.prRegion.getCache().isCacheAtShutdownAll()) {
          if (isDebugEnabled) {
            logger.debug("Aborted createBucketAtomically due to ShutdownAll");
          }
          throw new CacheClosedException("Cache is shutting down");
        }
        // this.prRegion.getCache().getLogger().config(
        // "DEBUG createBucketAtomically: "
        // + " bucketId=" + this.prRegion.getBucketName(bucketId) +
        // " accepted: " + acceptedMembers +
        // " failed: " + failedMembers);
        long timeLeft = timeOut - System.currentTimeMillis();
        if (timeLeft < 0) {
          // It took too long.
          timedOut(this.prRegion, getAllStores(partitionName), acceptedMembers,
              ALLOCATE_ENOUGH_MEMBERS_TO_HOST_BUCKET.toLocalizedString(), computeTimeout());
          // NOTREACHED
        }
        if (isDebugEnabled) {
          logger.debug("createBucketAtomically: have {} ms left to finish this", timeLeft);
        }
        // Always go back to the advisor, see if any fresh data stores are
        // present.
        Set<InternalDistributedMember> allStores = getAllStores(partitionName);
        loggedInsufficentStores = checkSufficientStores(allStores, loggedInsufficentStores);
        InternalDistributedMember candidate = createBucketInstance(bucketId, newBucketSize,
            excludedMembers, acceptedMembers, failedMembers, timeOut, allStores);
        if (candidate != null) {
          if (this.prRegion.getDistributionManager().enforceUniqueZone()) {
            // enforceUniqueZone property has no effect for a loner. Fix for defect #47181
            if (!(this.prRegion.getDistributionManager() instanceof LonerDistributionManager)) {
              // Exclude the candidate's zone-mates so redundant copies land in other zones.
              Set<InternalDistributedMember> exm = getBuddyMembersInZone(candidate, allStores);
              exm.remove(candidate);
              exm.removeAll(acceptedMembers);
              excludedMembers.addAll(exm);
            } else {
              // log a warning if Loner
              logger.warn(LocalizedMessage.create(
                  LocalizedStrings.GemFireCache_ENFORCE_UNIQUE_HOST_NOT_APPLICABLE_FOR_LONER));
            }
          }
        }
        // Get an updated list of bucket owners, which should include
        // buckets created concurrently with this createBucketAtomically call
        acceptedMembers = prRegion.getRegionAdvisor().getBucketOwners(bucketId);
        if (isDebugEnabled) {
          logger.debug("Accepted members: {}", acceptedMembers);
        }
        // [sumedh] set the primary as the candidate in the first iteration if
        // the candidate has accepted
        if (bucketPrimary == null && acceptedMembers.contains(candidate)) {
          bucketPrimary = candidate;
        }
        // prune out the stores that have left
        verifyBucketNodes(excludedMembers, partitionName);
        // Note - we used to wait for the created bucket to become primary here
        // if this is a colocated region. We no longer need to do that, because
        // the EndBucketMessage is sent out after bucket creation completes to
        // select the primary.
        // Have we exhausted all candidates?
        final int potentialCandidateCount = (allStores.size()
            - (excludedMembers.size() + acceptedMembers.size() + failedMembers.size()));
        // Determining exhausted members competes with bucket balancing; it's
        // important to re-visit all failed members since "failed" set may
        // contain datastores which at the moment are imbalanced, but yet could
        // be candidates. If the failed members list is empty, its expected
        // that the next iteration clears the (already empty) list.
        final boolean exhaustedPotentialCandidates =
            failedMembers.wasCleared() && potentialCandidateCount <= 0;
        final boolean redundancySatisfied =
            acceptedMembers.size() > this.prRegion.getRedundantCopies();
        final boolean bucketNotCreated = acceptedMembers.size() == 0;
        if (isDebugEnabled) {
          logger.debug(
              "potentialCandidateCount={}, exhaustedPotentialCandidates={}, redundancySatisfied={}, bucketNotCreated={}",
              potentialCandidateCount, exhaustedPotentialCandidates, redundancySatisfied,
              bucketNotCreated);
        }
        if (bucketNotCreated) {
          // if we haven't managed to create the bucket on any nodes, retry.
          continue;
        }
        if (exhaustedPotentialCandidates && !redundancySatisfied) {
          insufficientStores(allStores, acceptedMembers, true);
        }
        // Allow the thread to potentially finish bucket creation even if redundancy was not met.
        // Fix for bug 39283
        if (redundancySatisfied || exhaustedPotentialCandidates) {
          // Tell one of the members to become primary.
          // The rest of the members will be allowed to
          // volunteer for primary.
          endBucketCreation(bucketId, acceptedMembers, bucketPrimary, partitionName);
          final int expectedRemoteHosts = acceptedMembers.size()
              - (acceptedMembers.contains(this.prRegion.getMyId()) ? 1 : 0);
          boolean interrupted = Thread.interrupted();
          try {
            BucketMembershipObserverResults results = observer
                .waitForOwnersGetPrimary(expectedRemoteHosts, acceptedMembers, partitionName);
            if (results.problematicDeparture) {
              // BZZZT! Member left. Start over.
              continue;
            }
            bucketPrimary = results.primary;
          } catch (InterruptedException e) {
            interrupted = true;
            this.prRegion.getCancelCriterion().checkCancelInProgress(e);
          } finally {
            if (interrupted) {
              // Restore the interrupt status we consumed above.
              Thread.currentThread().interrupt();
            }
          }
          needToElectPrimary = false;
          return bucketPrimary;
        } // almost done
      } // for
    } catch (CancelException e) {
      // Fix for 43544 - We don't need to elect a primary
      // if the cache was closed. The other members will
      // take care of it. This ensures we don't compromise
      // redundancy.
      needToElectPrimary = false;
      throw e;
    } catch (RegionDestroyedException e) {
      // Fix for 43544 - We don't need to elect a primary
      // if the region was destroyed. The other members will
      // take care of it. This ensures we don't compromise
      // redundancy.
      needToElectPrimary = false;
      throw e;
    } catch (PartitionOfflineException e) {
      throw e;
    } catch (RuntimeException e) {
      if (isDebugEnabled) {
        logger.debug("Unable to create new bucket {}: {}", bucketId, e.getMessage(), e);
      }
      // If we're finishing an incomplete bucket creation, don't blast out
      // another message to peers to do so.
      // TODO - should we ignore a PartitionRegionStorageException, rather
      // than reattempting on other nodes?
      if (!finishIncompleteCreation) {
        cleanUpBucket(bucketId);
      }
      throw e;
    } finally {
      if (observer != null) {
        observer.stopMonitoring();
      }
      // Try to make sure everyone that created the bucket can volunteer for primary
      if (needToElectPrimary) {
        try {
          endBucketCreation(bucketId, prRegion.getRegionAdvisor().getBucketOwners(bucketId),
              bucketPrimary, partitionName);
        } catch (Exception e) {
          // if region is going down, then no warning level logs
          if (e instanceof CancelException || e instanceof CacheClosedException
              || (prRegion.getCancelCriterion().isCancelInProgress())) {
            logger.debug("Exception trying choose a primary after bucket creation failure", e);
          } else {
            logger.warn("Exception trying choose a primary after bucket creation failure", e);
          }
        }
      }
    }
  } // synchronized(this)
}
/**
 * Figure out which member should be primary for a bucket among the members that have created the
 * bucket, and tell that member to become the primary.
 *
 * @param bucketId the bucket whose creation is being completed
 * @param acceptedMembers The members that now host the bucket
 * @param targetPrimary the member that should become primary; may be null, in which case one
 *        is chosen here (or, for a fixed partitioned region, taken from the partition config)
 * @param partitionName for a fixed partitioned region, the partition name; else null
 */
private void endBucketCreation(int bucketId,
    Collection<InternalDistributedMember> acceptedMembers,
    InternalDistributedMember targetPrimary, String partitionName) {
  if (acceptedMembers.isEmpty()) {
    return;
  }
  // Defensive copy: we remove our own id from this collection below.
  acceptedMembers = new HashSet<InternalDistributedMember>(acceptedMembers);
  // TODO prpersist - we need to factor out a method that just chooses
  // the primary. But this will do the trick for the moment.
  // This is for FPR, for a given bucket id , make sure that for given bucket
  // id , only the datastore on which primary partition is defined for this
  // bucket becomes the primary. If primary partition is not available then
  // secondary partition will become primary
  if (partitionName != null) {
    if (isLocalPrimary(partitionName)) {
      targetPrimary = this.prRegion.getMyId();
    } else {
      targetPrimary =
          this.prRegion.getRegionAdvisor().adviseFixedPrimaryPartitionDataStore(bucketId);
      if (targetPrimary == null) {
        // No primary-partition store available; fall back to any store defining the
        // partition (a secondary becomes primary).
        Set<InternalDistributedMember> fpDataStores = getFixedPartitionStores(partitionName);
        targetPrimary = fpDataStores.iterator().next();
      }
    }
  }
  if (targetPrimary == null) {
    // [sumedh] we need to select the same primary as chosen earlier (e.g.
    // the parent's in case of colocation) so it is now passed
    // InternalDistributedMember targetPrimary = getPreferredDataStore(
    // acceptedMembers, Collections.<InternalDistributedMember> emptySet());
    targetPrimary =
        getPreferredDataStore(acceptedMembers, Collections.<InternalDistributedMember>emptySet());
  }
  // If we host the bucket ourselves, finish locally instead of messaging ourselves.
  boolean isHosting = acceptedMembers.remove(prRegion.getDistributionManager().getId());
  EndBucketCreationMessage.send(acceptedMembers, targetPrimary, this.prRegion, bucketId);
  // Observer for testing purpose
  final EndBucketCreationObserver observer = testEndObserverInstance;
  if (observer != null) {
    observer.afterEndBucketCreationMessageSend(this.prRegion, bucketId);
  }
  if (isHosting) {
    endBucketCreationLocally(bucketId, targetPrimary);
  }
  if (observer != null) {
    observer.afterEndBucketCreation(this.prRegion, bucketId);
  }
}
/**
 * Returns true when this member's fixed-partition configuration declares the named partition
 * as primary on this member.
 */
private boolean isLocalPrimary(String partitionName) {
  final List<FixedPartitionAttributesImpl> localAttributes =
      prRegion.getFixedPartitionAttributesImpl();
  if (localAttributes == null) {
    return false;
  }
  for (FixedPartitionAttributesImpl attribute : localAttributes) {
    if (attribute.getPartitionName().equals(partitionName) && attribute.isPrimary()) {
      return true;
    }
  }
  return false;
}
// Test hook: observer notified around the EndBucketCreationMessage send in endBucketCreation.
// Volatile so tests on other threads see updates promptly.
private static volatile EndBucketCreationObserver testEndObserverInstance;
// Observer for testing purpose
public static void setTestEndBucketCreationObserver(EndBucketCreationObserver observer) {
  testEndObserverInstance = observer;
}
/**
 * Test observer to help reproduce #42429. Callbacks fire around the point where the
 * EndBucketCreationMessage is sent and after end-of-creation processing completes.
 */
public interface EndBucketCreationObserver {
  /**
   * Invoked just after the EndBucketCreationMessage has been sent to the accepting members,
   * before any local end-of-creation processing on this member.
   */
  void afterEndBucketCreationMessageSend(PartitionedRegion pr, int bucketId);

  /** Invoked after bucket-creation completion processing has finished on this member. */
  void afterEndBucketCreation(PartitionedRegion pr, int bucketId);
}
/**
 * Complete bucket creation on this member: mark a persistent bucket as initialized (if any),
 * record the chosen primary elector, and volunteer for primary when appropriate. Recurses into
 * colocated child regions that host the same bucket locally.
 *
 * @param bucketId the bucket being finalized
 * @param newPrimary the member chosen to become primary for this bucket
 */
public void endBucketCreationLocally(int bucketId, InternalDistributedMember newPrimary) {
  // Don't elect ourselves as primary or tell others to persist our ID if this member
  // has been destroyed.
  if (prRegion.getCancelCriterion().isCancelInProgress() || prRegion.isDestroyed()) {
    return;
  }
  if (logger.isDebugEnabled()) {
    logger.debug("endBucketCreationLocally: for region {} bucketId={} new primary: {}",
        this.prRegion.getFullPath(), bucketId, newPrimary);
  }
  BucketAdvisor bucketAdvisor = prRegion.getRegionAdvisor().getBucketAdvisor(bucketId);
  final ProxyBucketRegion proxyBucketRegion = bucketAdvisor.getProxyBucketRegion();
  BucketPersistenceAdvisor persistentAdvisor = proxyBucketRegion.getPersistenceAdvisor();
  // prevent multiple threads from ending bucket creation at the same time.
  // This fixes an issue with 41336, where multiple threads were calling endBucketCreation
  // on the persistent advisor and marking a bucket as initialized twice.
  synchronized (proxyBucketRegion) {
    if (persistentAdvisor != null) {
      BucketRegion realBucket = proxyBucketRegion.getCreatedBucketRegion();
      if (realBucket != null) {
        PersistentMemberID persistentID = realBucket.getPersistentID();
        persistentAdvisor.endBucketCreation(persistentID);
      }
    }
    // We've received an endBucketCreationMessage, but the primary
    // may not have. So now we wait for the chosen member to become
    // primary.
    bucketAdvisor.setPrimaryElector(newPrimary);
    if (prRegion.getGemFireCache().getMyId().equals(newPrimary)) {
      // If we're the choosen primary, volunteer for primary now
      if (bucketAdvisor.isHosting()) {
        bucketAdvisor.clearPrimaryElector();
        bucketAdvisor.volunteerForPrimary();
      }
    } else {
      // It's possible the chosen primary has already left. In
      // that case, volunteer for primary now.
      if (!bucketAdvisor.adviseInitialized().contains(newPrimary)) {
        bucketAdvisor.clearPrimaryElector();
        bucketAdvisor.volunteerForPrimary();
      }
      // If the bucket has had a primary, that means the
      // chosen bucket was primary for a while. Go ahead and
      // clear the primary elector field.
      if (bucketAdvisor.getHadPrimary()) {
        bucketAdvisor.clearPrimaryElector();
        bucketAdvisor.volunteerForPrimary();
      }
    }
  }
  // send out a profile update to indicate the persistence is initialized, if needed.
  if (persistentAdvisor != null) {
    bucketAdvisor.endBucketCreation();
  }
  // Finish creation in colocated child regions that host this bucket locally too.
  List<PartitionedRegion> colocatedWithList = ColocationHelper.getColocatedChildRegions(prRegion);
  for (PartitionedRegion child : colocatedWithList) {
    if (child.getRegionAdvisor().isBucketLocal(bucketId)) {
      child.getRedundancyProvider().endBucketCreationLocally(bucketId, newPrimary);
    }
  }
}
/**
 * Get buddy data stores in the same zone (host) as the accepted member.
 *
 * @param acceptedMember the member whose zone is used for the lookup
 * @param allStores all known data stores; the result is restricted to these
 * @return set of members in the same zone, not including the accepted member
 * @since GemFire 5.9
 */
private Set<InternalDistributedMember> getBuddyMembersInZone(
    final InternalDistributedMember acceptedMember,
    final Set<InternalDistributedMember> allStores) {
  // NOTE: the previous version allocated an unused HashSet here; removed as dead code.
  DM dm = this.prRegion.getDistributionManager();
  Set<InternalDistributedMember> buddies = dm.getMembersInSameZone(acceptedMember);
  // Restrict zone members to the known data stores; same-zone members that are not
  // data stores cannot host buckets for this region.
  buddies.retainAll(allStores);
  return buddies;
}
/**
 * Early check for resources. This code may be executed for every put operation if there are no
 * datastores present, so excessive logging is throttled via shouldLogInsufficientStores().
 *
 * @param partitionName the fixed partition name, or null for a non-fixed region
 * @since GemFire 5.8
 */
private void earlySufficientStoresCheck(String partitionName) {
  assert Assert.assertHoldsLock(this, false);
  final Set storesFound = getAllStores(partitionName);
  if (!storesFound.isEmpty()) {
    return;
  }
  // No stores at all: optionally log a (throttled) warning, then raise the condition.
  if (shouldLogInsufficientStores()) {
    insufficientStores(storesFound, Collections.EMPTY_LIST, true);
  }
  insufficientStores(storesFound, Collections.EMPTY_LIST, false);
}
/**
 * Limit the frequency for logging the {@link #INSUFFICIENT_STORES_MSG} message to once per PR,
 * after which once every {@link #INSUFFICIENT_LOGGING_THROTTLE_TIME}.
 *
 * @return true if it's time to log
 * @since GemFire 5.8
 */
private boolean shouldLogInsufficientStores() {
  final long currentTime = NanoTimer.getTime();
  final long elapsed = currentTime - insufficientLogTimeStamp.get();
  // The CAS succeeds exactly once per provider: the very first detection always logs.
  final boolean firstDetection = this.firstInsufficentStoresLogged.compareAndSet(false, true);
  if (!firstDetection && elapsed < INSUFFICIENT_LOGGING_THROTTLE_TIME) {
    return false;
  }
  insufficientLogTimeStamp.set(currentTime);
  return true;
}
/**
 * Compute timeout for waiting for a bucket. Prefer
 * {@link #DATASTORE_DISCOVERY_TIMEOUT_MILLISECONDS} over
 * {@link PartitionedRegion#getRetryTimeout()}
 *
 * @return the milliseconds to wait for a bucket creation operation
 */
private long computeTimeout() {
  // Read the override once; only positive overrides are honored.
  final Long discoveryTimeout = DATASTORE_DISCOVERY_TIMEOUT_MILLISECONDS;
  if (discoveryTimeout != null && discoveryTimeout.longValue() > 0) {
    return discoveryTimeout.longValue();
  }
  return this.prRegion.getRetryTimeout();
}
/**
 * Check to determine that there are enough datastore VMs to start the bucket creation
 * processes. Log a warning or throw an exception indicating when there are not enough
 * datastore VMs.
 *
 * The returned value is the caller's new "warning outstanding" flag: it flips to true when the
 * insufficient condition is first detected, back to false when stores reappear, and is
 * otherwise returned unchanged.
 *
 * @param allStores All known data store instances (including local)
 * @param loggedInsufficentStores indicates whether a warning has been logged
 * @return true when a warning has been logged, false if a warning should be logged.
 */
private boolean checkSufficientStores(final Set allStores,
    final boolean loggedInsufficentStores) {
  // Report (only once) if insufficient data store have been detected.
  if (!loggedInsufficentStores) {
    if (allStores.size() == 0) {
      // First detection: log the warning and tell the caller one is now outstanding.
      insufficientStores(allStores, Collections.EMPTY_LIST, true);
      return true;
    }
  } else {
    if (allStores.size() > 0) {
      // Excellent, sufficient resources were found!
      final StringId logStr =
          LocalizedStrings.PRHARRedundancyProvider_0_IN_THE_PARTITIONED_REGION_REGION_NAME_1;
      final Object[] logArgs =
          new Object[] {SUFFICIENT_STORES_MSG.toLocalizedString(), prRegion.getFullPath()};
      if (TEST_MODE) {
        // TEST_MODE escalates to fatal so tests can assert on the log level.
        logger.fatal(LocalizedMessage.create(logStr, logArgs));
      } else {
        logger.info(LocalizedMessage.create(logStr, logArgs));
      }
      return false;
    } else {
      // Already logged warning, there are no datastores.
      // NOTE(review): the UNREACHABLE marker below implies this call does not return —
      // presumably it throws (e.g. PartitionedRegionStorageException); confirm in its
      // definition.
      insufficientStores(allStores, Collections.EMPTY_LIST, false);
      // UNREACHABLE
    }
  }
  // No state change: condition is unchanged since the last call.
  return loggedInsufficentStores;
}
/**
 * Clean up locally created bucket and tell other VMs to attempt recovering redundancy.
 *
 * @param buck the bucket identifier
 */
private void cleanUpBucket(int buck) {
  // Ask every data store member to attempt redundancy recovery for this bucket.
  Set<InternalDistributedMember> dataStores = this.prRegion.getRegionAdvisor().adviseDataStore();
  BucketBackupMessage.send(dataStores, this.prRegion, buck);
}
/**
 * Re-drives atomic bucket creation for a bucket whose creation did not complete, resolving
 * the fixed-partition name first when this is a fixed partitioned region.
 *
 * @param bucketId the bucket whose creation should be finished
 */
public void finishIncompleteBucketCreation(int bucketId) {
  final long startTime = PartitionedRegionStats.startTime();
  String partitionName = null;
  if (this.prRegion.isFixedPartitionedRegion()) {
    // Fixed partitioned regions need the partition name to target the right partition.
    final FixedPartitionAttributesImpl fixedAttributes =
        PartitionedRegionHelper.getFixedPartitionAttributesForBucket(this.prRegion, bucketId);
    partitionName = fixedAttributes.getPartitionName();
  }
  createBucketAtomically(bucketId, 0, startTime, true, partitionName);
}
/**
 * Creates a backup (redundant) copy of the bucket with ID bucketId on targetNMember. This
 * method will also create the bucket for all of the child colocated PRs (handled by the
 * receiving member / local data store).
 *
 * @param bucketId the bucket to back up
 * @param targetNMember the member that should host the backup copy (may be the local member)
 * @param isRebalance true if bucket creation is directed by rebalancing
 * @param replaceOfflineData whether offline persistent data may be replaced
 * @param moveSource when non-null, the member the bucket is being moved from
 * @param forceCreation tell the target to ignore its capacity limits
 * @return true if the bucket was sucessfully created
 */
public boolean createBackupBucketOnMember(final int bucketId,
    final InternalDistributedMember targetNMember, final boolean isRebalance,
    boolean replaceOfflineData, InternalDistributedMember moveSource, boolean forceCreation) {
  if (logger.isDebugEnabled()) {
    logger.debug("createBackupBucketOnMember for bucketId={} member: {}",
        this.prRegion.bucketStringForLogs(bucketId), targetNMember);
  }
  // Remote target: send a ManageBackupBucketMessage. Local target: grab the bucket directly.
  if (!(targetNMember.equals(this.prRegion.getMyId()))) {
    // final StoppableReentrantReadWriteLock.StoppableReadLock isClosingReadLock;
    PartitionProfile pp = this.prRegion.getRegionAdvisor().getPartitionProfile(targetNMember);
    if (pp != null) {
      // isClosingReadLock = pp.getIsClosingReadLock(
      // this.prRegion.getCancelCriterion());
    } else {
      // No partition profile: the target is not a usable data store for this region.
      return false;
    }
    try {
      ManageBackupBucketMessage.NodeResponse response =
          ManageBackupBucketMessage.send(targetNMember, this.prRegion, bucketId, isRebalance,
              replaceOfflineData, moveSource, forceCreation);
      if (response.waitForAcceptance()) {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "createBackupBucketOnMember: Bucket creation succeed for bucketId={} on member = {}",
              this.prRegion.bucketStringForLogs(bucketId), targetNMember);
        }
        return true;
      } else {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "createBackupBucketOnMember: Bucket creation failed for bucketId={} on member = {}",
              this.prRegion.bucketStringForLogs(bucketId), targetNMember);
        }
        return false;
      }
    } catch (VirtualMachineError err) {
      SystemFailure.initiateFailure(err);
      // If this ever returns, rethrow the error. We're poisoned
      // now, so don't let this thread continue.
      throw err;
    } catch (Throwable e) {
      // Whenever you catch Error or Throwable, you must also
      // catch VirtualMachineError (see above). However, there is
      // _still_ a possibility that you are dealing with a cascading
      // error condition, so you also need to check to see if the JVM
      // is still usable:
      SystemFailure.checkFailure();
      if (e instanceof ForceReattemptException) {
        // no log needed see bug 37569
      } else if (e instanceof CancelException
          || (e.getCause() != null && (e.getCause() instanceof CancelException))) {
        // no need to log exceptions caused by cache closure
      } else {
        logger.warn(LocalizedMessage.create(
            LocalizedStrings.PRHARedundancyProvider_EXCEPTION_CREATING_PARTITION_ON__0,
            targetNMember), e);
      }
      // Any failure to create the remote backup is reported as "not created".
      return false;
    }
  } else {
    // Local target: ask the local data store to grab the bucket.
    final PartitionedRegionDataStore prDS = this.prRegion.getDataStore();
    boolean bucketManaged = prDS != null && prDS.grabBucket(bucketId, moveSource, forceCreation,
        replaceOfflineData, isRebalance, null, false).equals(CreateBucketResult.CREATED);
    if (!bucketManaged) {
      if (logger.isDebugEnabled()) {
        logger.debug(
            "createBackupBucketOnMember: Local data store refused to accommodate the data for bucketId={} prDS={}",
            this.prRegion.bucketStringForLogs(bucketId), prDS);
      }
    }
    return bucketManaged;
  }
}
// Test/operational hook: when set to true on a thread, getPreferredDataStore will choose the
// local member as primary (if it hosts a data store). Parameterized to avoid the unchecked
// cast the raw ThreadLocal previously required.
private static final ThreadLocal<Boolean> forceLocalPrimaries = new ThreadLocal<Boolean>();

/**
 * Sets the force-local-primaries flag for the calling thread.
 *
 * @param v true to force primaries to be created on the local member
 */
public static void setForceLocalPrimaries(boolean v) {
  forceLocalPrimaries.set(Boolean.valueOf(v));
}

/**
 * @return the calling thread's force-local-primaries flag; false when never set
 */
private boolean getForceLocalPrimaries() {
  Boolean v = forceLocalPrimaries.get();
  return v != null && v.booleanValue();
}
/**
 * Creates bucket with ID bucketId on targetNode.
 *
 * @param bucketId the bucket to create
 * @param targetNMember the member that should host the bucket (may be the local member)
 * @param newBucketSize size hint passed to the target for the new bucket
 * @param forceCreation inform the targetMember it must attempt host the bucket, appropriately
 *        ignoring it's maximums
 * @return a response object indicating acceptance, refusal (with reason), or cache closure
 */
public ManageBucketRsp createBucketOnMember(final int bucketId,
    final InternalDistributedMember targetNMember, final int newBucketSize,
    boolean forceCreation) {
  if (logger.isDebugEnabled()) {
    logger.debug("createBucketOnMember for bucketId={} member: {}{}",
        this.prRegion.bucketStringForLogs(bucketId), targetNMember,
        (forceCreation ? " forced" : ""));
  }
  // Remote target: send a ManageBucketMessage. Local target: ask the local data store.
  if (!(targetNMember.equals(this.prRegion.getMyId()))) {
    // final StoppableReentrantReadWriteLock.StoppableReadLock isClosingReadLock;
    PartitionProfile pp = this.prRegion.getRegionAdvisor().getPartitionProfile(targetNMember);
    if (pp != null) {
      // isClosingReadLock = pp.getIsClosingReadLock(
      // this.prRegion.getCancelCriterion());
    } else {
      // No partition profile: the target is not a usable data store for this region.
      return ManageBucketRsp.NO;
    }
    try {
      // isClosingReadLock.lock(); // Grab the read lock, preventing any region closures
      // on this remote Node until this bucket is fully published, forcing the closing
      // Node to recognize any pre-natal buckets.
      NodeResponse response = ManageBucketMessage.send(targetNMember, this.prRegion, bucketId,
          newBucketSize, forceCreation);
      if (response.waitForAcceptance()) {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "createBucketOnMember: Bucket creation succeed for bucketId={} on member = {}",
              this.prRegion.bucketStringForLogs(bucketId), targetNMember);
        }
        // lockList.add(isClosingReadLock);
        return ManageBucketRsp.YES;
      } else {
        if (logger.isDebugEnabled()) {
          logger.debug(
              "createBucketOnMember: Bucket creation failed for bucketId={} on member = {}",
              this.prRegion.bucketStringForLogs(bucketId), targetNMember);
        }
        // isClosingReadLock.unlock();
        // Distinguish a refusal due to the target still initializing from a plain refusal,
        // so the caller can retry that member later.
        return response.rejectedDueToInitialization() ? ManageBucketRsp.NO_INITIALIZING
            : ManageBucketRsp.NO;
      }
    } catch (PartitionOfflineException e) {
      // Offline persistent data must propagate to the caller; it is not a refusal.
      throw e;
    } catch (VirtualMachineError err) {
      SystemFailure.initiateFailure(err);
      // If this ever returns, rethrow the error. We're poisoned
      // now, so don't let this thread continue.
      throw err;
    } catch (Throwable e) {
      // Whenever you catch Error or Throwable, you must also
      // catch VirtualMachineError (see above). However, there is
      // _still_ a possibility that you are dealing with a cascading
      // error condition, so you also need to check to see if the JVM
      // is still usable:
      SystemFailure.checkFailure();
      if (e instanceof CancelException
          || (e.getCause() != null && (e.getCause() instanceof CancelException))) {
        // no need to log exceptions caused by cache closure
        return ManageBucketRsp.CLOSED;
      } else if (e instanceof ForceReattemptException) {
        // no log needed see bug 37569
      } else {
        logger.warn(LocalizedMessage.create(
            LocalizedStrings.PRHARedundancyProvider_EXCEPTION_CREATING_PARTITION_ON__0,
            targetNMember), e);
      }
      // isClosingReadLock.unlock();
      return ManageBucketRsp.NO;
    }
  } else {
    final PartitionedRegionDataStore prDS = this.prRegion.getDataStore();
    boolean bucketManaged = prDS != null && prDS.handleManageBucketRequest(bucketId,
        newBucketSize, this.prRegion.getMyId(), forceCreation);
    if (!bucketManaged) {
      if (logger.isDebugEnabled()) {
        logger.debug(
            "createBucketOnMember: Local data store not able to accommodate the data for bucketId={}",
            this.prRegion.bucketStringForLogs(bucketId));
      }
    }
    return ManageBucketRsp.valueOf(bucketManaged);
  }
}
/**
 * Select the member which is hosting the same bucketId for the PR this region is colocated
 * with. In case of primary (no members used yet) it returns the colocated region's primary for
 * the bucket, whereas in case of secondary it returns the least loaded candidate datastore
 * which is hosting the bucketId.
 *
 * @param candidates members eligible to host the bucket
 * @param alreadyUsed data stores already chosen for this bucket; empty when choosing primary
 * @param bucketId the bucket to place
 * @param prName name of the partitioned region (must not be null)
 * @return InternalDistributedMember colocated data store, or null if none qualifies
 * @since GemFire 5.8Beta
 */
private InternalDistributedMember getColocatedDataStore(
    Collection<InternalDistributedMember> candidates,
    Collection<InternalDistributedMember> alreadyUsed, int bucketId, String prName) {
  Assert.assertTrue(prName != null); // precondition1
  PartitionedRegion colocatedRegion = ColocationHelper.getColocatedRegion(this.prRegion);
  Region prRoot = PartitionedRegionHelper.getPRRoot(prRegion.getCache());
  PartitionRegionConfig config =
      (PartitionRegionConfig) prRoot.get(prRegion.getRegionIdentifier());
  if (!config.isColocationComplete()) {
    throw new IllegalStateException("Cannot create buckets, as colocated regions are not "
        + "configured to be at the same nodes.");
  }
  RegionAdvisor advisor = colocatedRegion.getRegionAdvisor();
  if (alreadyUsed.isEmpty()) {
    // Choosing the primary: it must be the colocated region's primary for this bucket.
    InternalDistributedMember primary = advisor.getPrimaryMemberForBucket(bucketId);
    if (!candidates.contains(primary)) {
      return null;
    }
    return primary;
  }
  // Choosing a secondary: restrict the colocated bucket's hosts to the candidates and
  // pick the least loaded of them.
  Set<InternalDistributedMember> bucketOwnersSet = advisor.getBucketOwners(bucketId);
  bucketOwnersSet.retainAll(candidates);
  ArrayList<InternalDistributedMember> members =
      new ArrayList<InternalDistributedMember>(bucketOwnersSet);
  if (members.isEmpty()) {
    return null;
  }
  return getPreferredDataStore(members, alreadyUsed);
}
/**
 * Select the member with the fewest buckets, among those with the fewest randomly select one.
 *
 * Under concurrent access, the data that this method uses, may be somewhat volatile, note that
 * createBucketAtomically synchronizes to enhance the consistency of the data used in this
 * method.
 *
 * @param candidates ArrayList of InternalDistributedMember, potential datastores
 * @param alreadyUsed data stores already in use; empty when choosing the primary
 * @return a member with the fewest buckets or null if no datastores
 */
private InternalDistributedMember getPreferredDataStore(
    Collection<InternalDistributedMember> candidates,
    final Collection<InternalDistributedMember> alreadyUsed) {
  /* has a primary already been chosen? */
  final boolean forPrimary = alreadyUsed.size() == 0;
  // Test hook: force the primary onto the local member when it hosts a data store.
  if (forPrimary && getForceLocalPrimaries()) {
    PartitionedRegionDataStore myDS = this.prRegion.getDataStore();
    if (myDS != null) {
      return this.prRegion.getMyId();
    }
  }
  // With a single candidate there is nothing to compare.
  if (candidates.size() == 1) {
    return candidates.iterator().next();
  }
  Assert.assertTrue(candidates.size() > 1);
  // Convert peers to DataStoreBuckets
  ArrayList<DataStoreBuckets> stores = this.prRegion.getRegionAdvisor()
      .adviseFilteredDataStores(new HashSet<InternalDistributedMember>(candidates));
  final DM dm = this.prRegion.getDistributionManager();
  // Add ourself as a candidate, if appropriate
  InternalDistributedMember moi = dm.getId();
  PartitionedRegionDataStore myDS = this.prRegion.getDataStore();
  if (myDS != null && candidates.contains(moi)) {
    int bucketCount = myDS.getBucketsManaged();
    int priCount = myDS.getNumberOfPrimaryBucketsManaged();
    int localMaxMemory = this.prRegion.getLocalMaxMemory();
    stores.add(new DataStoreBuckets(moi, bucketCount, priCount, localMaxMemory));
  }
  if (stores.isEmpty()) {
    return null;
  }
  // ---------------------------------------------
  // Calculate all hosts who already have this bucket
  final HashSet<InternalDistributedMember> existingHosts =
      new HashSet<InternalDistributedMember>();
  Iterator<InternalDistributedMember> it = alreadyUsed.iterator();
  while (it.hasNext()) {
    InternalDistributedMember mem = it.next();
    // Anyone in the same zone as a current host counts as "already hosting".
    existingHosts.addAll(dm.getMembersInSameZone(mem));
  }
  // Orders stores: first those whose host doesn't already hold the bucket, then by load
  // (primaries or total buckets per MB depending on forPrimary), then by larger
  // local-max-memory as the tie breaker.
  Comparator<DataStoreBuckets> comparator = new Comparator<DataStoreBuckets>() {
    public int compare(DataStoreBuckets d1, DataStoreBuckets d2) {
      boolean host1Used = existingHosts.contains(d1.memberId);
      boolean host2Used = existingHosts.contains(d2.memberId);
      if (!host1Used && host2Used) {
        return -1; // host1 preferred
      }
      if (host1Used && !host2Used) {
        return 1; // host2 preferred
      }
      // Six eggs, half a dozen. Look for least loaded.
      float metric1, metric2;
      if (forPrimary) {
        metric1 = d1.numPrimaries / (float) d1.localMaxMemoryMB;
        metric2 = d2.numPrimaries / (float) d2.localMaxMemoryMB;
      } else {
        metric1 = d1.numBuckets / (float) d1.localMaxMemoryMB;
        metric2 = d2.numBuckets / (float) d2.localMaxMemoryMB;
      }
      int result = Float.compare(metric1, metric2);
      if (result == 0) {
        // if they have the same load, choose the member with the
        // higher localMaxMemory
        result = d2.localMaxMemoryMB - d1.localMaxMemoryMB;
      }
      return result;
    }
  };
  // ---------------------------------------------
  // First step is to sort datastores first by those whose hosts don't
  // hold this bucket, and then secondarily by loading.
  Collections.sort(stores, comparator);
  if (logger.isDebugEnabled()) {
    logger.debug(fancyFormatBucketAllocation("Sorted ", stores, existingHosts));
  }
  // ---------------------------------------------
  // Always add the first datastore and note just how good it is.
  DataStoreBuckets bestDataStore = stores.get(0);
  ArrayList<DataStoreBuckets> bestStores = new ArrayList<DataStoreBuckets>();
  bestStores.add(bestDataStore);
  // If even the best store is already used, every candidate is; the "skip used" rule below
  // is then disabled so we can still pick among equals.
  final boolean allStoresInUse = alreadyUsed.contains(bestDataStore.memberId);
  // ---------------------------------------------
  // Collect all of the other hosts in this sorted list that are as good
  // as the very first one.
  for (int i = 1; i < stores.size(); i++) {
    DataStoreBuckets aDataStore = stores.get(i);
    if (!allStoresInUse && alreadyUsed.contains(aDataStore.memberId)) {
      // Only choose between the ones not in use.
      break;
    }
    if (comparator.compare(bestDataStore, aDataStore) != 0) {
      break;
    }
    bestStores.add(aDataStore);
  }
  if (logger.isDebugEnabled()) {
    logger.debug(fancyFormatBucketAllocation("Best Stores ", bestStores, existingHosts));
  }
  // ---------------------------------------------
  // Among equally good stores, pick randomly (unless randomness is disabled for tests).
  int chosen;
  if (DISABLE_CREATE_BUCKET_RANDOMNESS) {
    chosen = 0;
  } else {
    // Pick one (at random)
    chosen = PartitionedRegion.rand.nextInt(bestStores.size());
  }
  DataStoreBuckets aDataStore = bestStores.get(chosen);
  return aDataStore.memberId;
}
/**
 * Adds a membership listener to watch for member departures, and schedules a task to recover
 * redundancy of existing buckets
 */
public void startRedundancyRecovery() {
  final PRMembershipListener departureListener = new PRMembershipListener();
  prRegion.getRegionAdvisor().addMembershipListener(departureListener);
  // A null member id indicates startup-time recovery rather than recovery after a departure.
  scheduleRedundancyRecovery(null);
}
/**
 * Log bucket allocation in the log files in this format:
 *
 * <pre>
 * member1: +5/20
 * member2: -10/5
 * </pre>
 *
 * After the member name, the +/- indicates whether or not this bucket is already hosted on the
 * given member. This is followed by the number of hosted primaries followed by the number of
 * hosted non-primary buckets.
 *
 * @param prefix first part of message to print
 * @param dataStores list of stores
 * @param existingStores to mark those already in use
 * @return the formatted allocation report
 */
private String fancyFormatBucketAllocation(String prefix, List<DataStoreBuckets> dataStores,
    Set<InternalDistributedMember> existingStores) {
  // StringBuilder instead of StringBuffer: this is thread-local, no synchronization needed.
  StringBuilder logStr = new StringBuilder();
  if (prefix != null) {
    logStr.append(prefix);
  }
  logStr.append("Bucket Allocation for prId=").append(this.prRegion.getPRId()).append(":\n");
  for (DataStoreBuckets dsb : dataStores) {
    logStr.append(dsb.memberId).append(": ");
    // '+' marks a member that already hosts this bucket, '-' one that does not.
    if (existingStores.contains(dsb.memberId)) {
      logStr.append("+");
    } else {
      logStr.append("-");
    }
    logStr.append(dsb.numPrimaries);
    logStr.append("/");
    logStr.append(dsb.numBuckets - dsb.numPrimaries);
    logStr.append('\n');
  }
  return logStr.toString();
}
/**
 * Immutable snapshot of the bucket load on a single data store member, used when choosing the
 * preferred member for a new bucket.
 */
public static class DataStoreBuckets {
  public final InternalDistributedMember memberId;
  // Total buckets hosted (primaries plus non-primaries).
  public final int numBuckets;
  // Primary buckets hosted.
  public final int numPrimaries;
  // Configured local-max-memory, used as the load denominator when comparing stores.
  private final int localMaxMemoryMB;

  public DataStoreBuckets(InternalDistributedMember mem, int buckets, int primaryBuckets,
      int localMaxMemory) {
    this.memberId = mem;
    this.numBuckets = buckets;
    this.numPrimaries = primaryBuckets;
    this.localMaxMemoryMB = localMaxMemory;
  }

  @Override
  public boolean equals(Object obj) {
    // instanceof is false for null, so a separate null check is redundant.
    if (!(obj instanceof DataStoreBuckets)) {
      return false;
    }
    DataStoreBuckets other = (DataStoreBuckets) obj;
    return this.numBuckets == other.numBuckets && this.memberId.equals(other.memberId);
  }

  @Override
  public int hashCode() {
    // Consistent with equals: equal instances always share the same memberId.
    return this.memberId.hashCode();
  }

  @Override
  public String toString() {
    return "DataStoreBuckets memberId=" + this.memberId + "; numBuckets=" + this.numBuckets
        + "; numPrimaries=" + this.numPrimaries;
  }
}
/**
 * Verifies the members and removes the members that are either not present in the
 * DistributedSystem or are no longer part of the PartitionedRegion (close/localDestroy has
 * been performed).
 *
 * @param members collection of members to scan and modify
 * @param partitionName the fixed partition name, or null for a non-fixed region
 */
void verifyBucketNodes(Collection<InternalDistributedMember> members, String partitionName) {
  if (members == null || members.isEmpty()) {
    return;
  }
  // Revisit region advisor, get current bucket stores.
  final Set<InternalDistributedMember> availableMembers = getAllStores(partitionName);
  final Iterator<InternalDistributedMember> memberIterator = members.iterator();
  while (memberIterator.hasNext()) {
    final InternalDistributedMember node = memberIterator.next();
    if (availableMembers.contains(node)) {
      continue;
    }
    if (logger.isDebugEnabled()) {
      logger.debug("verifyBucketNodes: removing member {}", node);
    }
    memberIterator.remove();
    Assert.assertTrue(!members.contains(node), "return value does not contain " + node);
  }
}
/**
 * Schedule a task to perform redundancy recovery for a new node or for the node departed.
 *
 * @param failedMemId the departed member, or {@code null} when called at startup
 */
public void scheduleRedundancyRecovery(Object failedMemId) {
  final boolean isStartup = failedMemId == null;
  final int redundantCopies = PRHARedundancyProvider.this.prRegion.getRedundantCopies();
  final long delay;
  final boolean movePrimaries;
  if (isStartup) {
    delay = this.prRegion.getPartitionAttributes().getStartupRecoveryDelay();
    movePrimaries = !Boolean
        .getBoolean(DistributionConfig.GEMFIRE_PREFIX + "DISABLE_MOVE_PRIMARIES_ON_STARTUP");
  } else {
    delay = this.prRegion.getPartitionAttributes().getRecoveryDelay();
    movePrimaries = false;
  }
  // A negative delay disables recovery; with no redundant copies there is nothing to recover.
  final boolean requiresRedundancyRecovery = delay >= 0 && redundantCopies > 0;
  if (!requiresRedundancyRecovery) {
    return;
  }
  if (!PRHARedundancyProvider.this.prRegion.isDataStore()) {
    // Accessors host no buckets and never participate in recovery.
    return;
  }
  Runnable task = new RecoveryRunnable(this) {
    @Override
    public void run2() {
      try {
        final boolean isFixedPartitionedRegion =
            PRHARedundancyProvider.this.prRegion.isFixedPartitionedRegion();
        // Fix for 43582 - always replace offline data for fixed partitioned
        // regions - this guarantees we create the buckets we are supposed to
        // create on this node.
        boolean replaceOfflineData = isFixedPartitionedRegion || !isStartup;
        RebalanceDirector director;
        if (isFixedPartitionedRegion) {
          director = new FPRDirector(true, movePrimaries);
        } else {
          director = new CompositeDirector(true, true, false, movePrimaries);
        }
        final PartitionedRegionRebalanceOp rebalance = new PartitionedRegionRebalanceOp(
            PRHARedundancyProvider.this.prRegion, false, director, replaceOfflineData, false);
        long start = PRHARedundancyProvider.this.prRegion.getPrStats().startRecovery();
        if (isFixedPartitionedRegion) {
          rebalance.executeFPA();
        } else {
          rebalance.execute();
        }
        PRHARedundancyProvider.this.prRegion.getPrStats().endRecovery(start);
        PRHARedundancyProvider.this.recoveryFuture = null;
      } catch (CancelException e) {
        logger.debug("Cache closed while recovery in progress");
      } catch (RegionDestroyedException e) {
        logger.debug("Region destroyed while recovery in progress");
      } catch (Exception e) {
        logger.error(
            LocalizedMessage.create(
                LocalizedStrings.PRHARedundancyProvider_UNEXPECTED_EXCEPTION_DURING_BUCKET_RECOVERY),
            e);
      }
    }
  };
  synchronized (this.shutdownLock) { // possible fix for bug 41094
    if (!this.shutdown) {
      try {
        if (logger.isDebugEnabled()) {
          // Use parameterized logging for the region (the old code concatenated it in one
          // branch and logged the literal text "prRegion" in the other).
          if (isStartup) {
            logger.debug("{} scheduling redundancy recovery in {} ms", this.prRegion, delay);
          } else {
            logger.debug(
                "{} scheduling redundancy recovery after departure/crash/error in {} in {} ms",
                this.prRegion, failedMemId, delay);
          }
        }
        recoveryFuture = this.recoveryExecutor.schedule(task, delay, TimeUnit.MILLISECONDS);
      } catch (RejectedExecutionException e) {
        // ok, the executor is shutting down; recovery will simply not run.
      }
    }
  }
}
/**
 * Reports whether any existing bucket's redundancy differs from the configured redundancy.
 *
 * @return true when some bucket is under- or over-redundant; false otherwise
 */
public boolean isRedundancyImpaired() {
  final int bucketCount = this.prRegion.getPartitionAttributes().getTotalNumBuckets();
  final int targetRedundancy = this.prRegion.getPartitionAttributes().getRedundantCopies();
  for (int bucketId = 0; bucketId < bucketCount; bucketId++) {
    final int redundancy = this.prRegion.getRegionAdvisor().getBucketRedundancy(bucketId);
    // A redundancy of -1 is excluded from the under-redundant check.
    final boolean underRedundant = redundancy != -1 && redundancy < targetRedundancy;
    final boolean overRedundant = redundancy > targetRedundancy;
    if (underRedundant || overRedundant) {
      return true;
    }
  }
  return false;
}
/**
 * Recovers persistent buckets for this region from disk, if any region in its colocation chain
 * is persistent. Spawns one recovery thread per previously-hosted bucket, then recovers
 * non-local buckets inline; {@code allBucketsRecoveredFromDisk} is counted down once per
 * bucket either way.
 *
 * @return true when recovery completed (or was unnecessary); false when colocation of the
 *         leader region is not yet complete and recovery was skipped
 */
public boolean recoverPersistentBuckets() {
  /**
   * To handle a case where ParallelGatewaySender is persistent but userPR is not First recover
   * the GatewaySender buckets for ParallelGatewaySender irrespective of whether colocation is
   * complete or not.
   */
  PartitionedRegion leaderRegion = ColocationHelper.getLeaderRegion(this.prRegion);
  // Check if the leader region or some child shadow PR region is persistent
  // and return the first persistent region found
  PartitionedRegion persistentLeader = getPersistentLeader();
  // If there is no persistent region in the colocation chain, no need to recover.
  if (persistentLeader == null) {
    return true;
  }
  if (!ColocationHelper.checkMembersColocation(leaderRegion,
      leaderRegion.getDistributionManager().getDistributionManagerId())) {
    if (logger.isDebugEnabled()) {
      logger.debug("Skipping persistent recovery of {} because colocation is not complete for {}",
          prRegion, leaderRegion);
    }
    return false;
  }
  // TODO prpersist - It would make sense to hold the lock here in some cases
  // to prevent confusing members that are trying to rebalance. BUT, these persistent regions
  // need to wait for other members to recover during initialization.
  // RecoveryLock lock = leaderRegion.getRecoveryLock();
  // lock.lock();
  // try {
  final ProxyBucketRegion[] proxyBucketArray =
      persistentLeader.getRegionAdvisor().getProxyBucketArray();
  for (ProxyBucketRegion proxyBucket : proxyBucketArray) {
    proxyBucket.initializePersistenceAdvisor();
  }
  Set<InternalDistributedMember> peers = this.prRegion.getRegionAdvisor().adviseGeneric();
  // TODO prpersist - Ok, this is super lame. We need to make sure here that we don't run into
  // this race condition
  // 1) We get a membership view from member A
  // 2) Member B removes itself, and distributes to us and A. We don't remove B
  // 3) We apply the membership view from A, which includes B.
  // That will add B back into the set.
  // This state flush will make sure that any membership changes
  // That are in progress on the peers are finished.
  MembershipFlushRequest.send(peers, this.prRegion.getDistributionManager(),
      this.prRegion.getFullPath());
  ArrayList<ProxyBucketRegion> bucketsNotHostedLocally =
      new ArrayList<ProxyBucketRegion>(proxyBucketArray.length);
  ArrayList<ProxyBucketRegion> bucketsHostedLocally =
      new ArrayList<ProxyBucketRegion>(proxyBucketArray.length);
  /*
   * Start the redundancy logger before recovering any proxy buckets.
   */
  allBucketsRecoveredFromDisk = new CountDownLatch(proxyBucketArray.length);
  try {
    if (proxyBucketArray.length > 0) {
      this.redundancyLogger = new RedundancyLogger(this);
      Thread loggingThread = new Thread(this.redundancyLogger,
          "RedundancyLogger for region " + this.prRegion.getName());
      loggingThread.start();
    }
  } catch (RuntimeException e) {
    // Don't leave a latch behind that nothing will ever count down.
    allBucketsRecoveredFromDisk = null;
    throw e;
  }
  /*
   * Spawn a separate thread for bucket that we previously hosted to recover that bucket.
   *
   * That thread will get to the point at which it has determined that at least one member
   * (possibly the local member) has fully initialized the bucket, at which it will count down
   * the someMemberRecoveredLatch latch on the bucket.
   *
   * Once at least one copy of each bucket has been created in the distributed system, the
   * initPRInternals method will exit. Some of the threads spawned here will still be doing
   * GII's in the background. This allows the system to become usable as fast as possible.
   *
   * If we used a bounded thread pool here, we end up waiting for some buckets to finish there
   * GII before returning from initPRInternals. In the future maybe we could let the create
   * bucket return and pass the GII task to a separate thread pool.
   *
   */
  for (final ProxyBucketRegion proxyBucket : proxyBucketArray) {
    if (proxyBucket.getPersistenceAdvisor().wasHosting()) {
      final RecoveryRunnable recoveryRunnable = new RecoveryRunnable(this) {
        @Override
        public void run() {
          // Fix for 44551 - make sure that we always count down
          // this latch, even if the region was destroyed.
          try {
            super.run();
          } finally {
            allBucketsRecoveredFromDisk.countDown();
          }
        }

        @Override
        public void run2() {
          proxyBucket.recoverFromDiskRecursively();
        }
      };
      Thread recoveryThread =
          new Thread(recoveryRunnable, "Recovery thread for bucket " + proxyBucket.getName());
      recoveryThread.start();
      bucketsHostedLocally.add(proxyBucket);
    } else {
      bucketsNotHostedLocally.add(proxyBucket);
    }
  }
  try {
    // Partial fix for 44045, try to recover the local
    // buckets before the proxy buckets. This will allow us
    // to detect any ConflictingDataException before the proxy
    // buckets update their membership view.
    for (final ProxyBucketRegion proxyBucket : bucketsHostedLocally) {
      proxyBucket.waitForPrimaryPersistentRecovery();
    }
    for (final ProxyBucketRegion proxyBucket : bucketsNotHostedLocally) {
      proxyBucket.recoverFromDiskRecursively();
    }
  } finally {
    // Count down once per non-local bucket; the local buckets' threads count down their own
    // shares in the runnable's finally block above.
    for (final ProxyBucketRegion proxyBucket : bucketsNotHostedLocally) {
      allBucketsRecoveredFromDisk.countDown();
    }
  }
  return true;
  // } finally {
  // lock.unlock();
  // }
}
/**
 * Check to see if any colocated region of the current region is persistent. It's not enough to
 * check just the leader region, because a child region might be a persistent parallel WAN
 * queue, which is allowed.
 *
 * @return the most senior region in the colocation chain (closest to the leader) that is
 *         persistent, or null when the whole chain is non-persistent
 */
protected PartitionedRegion getPersistentLeader() {
  final PartitionedRegion colocationLeader = ColocationHelper.getLeaderRegion(this.prRegion);
  return findPersistentRegionRecursively(colocationLeader);
}
/**
 * Depth-first search of the colocation tree rooted at {@code pr} for the first region whose
 * data policy includes persistence.
 *
 * @param pr the root of the colocation subtree to search
 * @return the first persistent region found, or null when none exists
 */
private PartitionedRegion findPersistentRegionRecursively(PartitionedRegion pr) {
  if (pr.getDataPolicy().withPersistence()) {
    return pr;
  }
  PartitionedRegion persistentDescendant = null;
  for (PartitionedRegion child : ColocationHelper.getColocatedChildRegions(pr)) {
    persistentDescendant = findPersistentRegionRecursively(child);
    if (persistentDescendant != null) {
      break;
    }
  }
  return persistentDescendant;
}
/**
 * For a colocated child region whose colocation is complete, submits a task to create any
 * buckets the parent has that this region is missing.
 */
public void scheduleCreateMissingBuckets() {
  final boolean isColocatedChild = this.prRegion.getColocatedWith() != null;
  if (!isColocatedChild || !ColocationHelper.isColocationComplete(this.prRegion)) {
    return;
  }
  final InternalResourceManager resourceManager =
      this.prRegion.getGemFireCache().getResourceManager();
  resourceManager.getExecutor().execute(new CreateMissingBucketsTask(this));
}
/**
 * Marks this provider as shut down and cancels any pending (not yet running) redundancy
 * recovery task.
 */
public void shutdown() {
  synchronized (this.shutdownLock) { // possible fix for bug 41094
    this.shutdown = true;
    final ScheduledFuture<?> pendingRecovery = this.recoveryFuture;
    if (pendingRecovery != null) {
      // false: do not interrupt a recovery that is already running
      pendingRecovery.cancel(false);
      this.recoveryExecutor.purge();
    }
  }
}
/**
 * Creates and fills in a PartitionMemberDetails for the partitioned region.
 *
 * @param internal true if internal-only details should be included
 * @param loadProbe the LoadProbe to use
 * @return PartitionRegionInfo for the partitioned region, or null when the region is gone
 */
public InternalPRInfo buildPartitionedRegionInfo(final boolean internal,
    final LoadProbe loadProbe) {
  final PartitionedRegion pr = this.prRegion;
  if (pr == null) {
    return null;
  }
  PartitionedRegionStats prStats = pr.getPrStats();
  int configuredBucketCount = pr.getTotalNumberOfBuckets();
  int createdBucketCount = pr.getRegionAdvisor().getCreatedBucketsCount();
  int lowRedundancyBucketCount = prStats.getLowRedundancyBucketCount();
  int configuredRedundantCopies = pr.getRedundantCopies();
  int actualRedundantCopies = prStats.getActualRedundantCopies();
  final PartitionedRegionDataStore ds = pr.getDataStore();
  Set<InternalDistributedMember> datastores = pr.getRegionAdvisor().adviseDataStore();
  // int size = datastores.size() + (ds == null ? 0 : 1);
  Set<InternalPartitionDetails> memberDetails = new TreeSet<InternalPartitionDetails>();
  OfflineMemberDetails offlineMembers = null;
  boolean fetchOfflineMembers = false;
  if (ds != null) {
    // This member is a data store: include its own details and offline-member info locally.
    memberDetails.add(buildPartitionMemberDetails(internal, loadProbe));
    offlineMembers = fetchOfflineMembers();
  } else {
    // Accessor: ask a remote data store for the offline members instead.
    fetchOfflineMembers = true;
  }
  // Get remote results
  if (!datastores.isEmpty()) {
    FetchPartitionDetailsResponse response = FetchPartitionDetailsMessage.send(datastores, pr,
        internal, fetchOfflineMembers, loadProbe);
    memberDetails.addAll(response.waitForResponse());
    if (fetchOfflineMembers) {
      offlineMembers = response.getOfflineMembers();
    }
  }
  String colocatedWithPath = pr.getColocatedWith();
  InternalPRInfo details = new PartitionRegionInfoImpl(pr.getFullPath(), configuredBucketCount,
      createdBucketCount, lowRedundancyBucketCount, configuredRedundantCopies,
      actualRedundantCopies, memberDetails, colocatedWithPath, offlineMembers);
  return details;
}
/**
 * Retrieve the set of members which are currently offline for all buckets.
 *
 * @return per-bucket sets of missing persistent members; every set is empty when this region's
 *         data policy is not persistent
 */
public OfflineMemberDetailsImpl fetchOfflineMembers() {
  final ProxyBucketRegion[] proxyBuckets = prRegion.getRegionAdvisor().getProxyBucketArray();
  // Generic array creation is not expressible in Java; the raw Set[] is only filled with
  // Set<PersistentMemberID> below, so the unchecked assignment is safe.
  @SuppressWarnings("unchecked")
  final Set<PersistentMemberID>[] offlineMembers = new Set[proxyBuckets.length];
  // The data policy is a region-wide property, so evaluate it once instead of per bucket.
  final boolean persistent = this.prRegion.getDataPolicy().withPersistence();
  for (int i = 0; i < proxyBuckets.length; i++) {
    Set<PersistentMemberID> missing = null;
    if (persistent) {
      missing = proxyBuckets[i].getPersistenceAdvisor().getMissingMembers();
    }
    if (missing == null) {
      // Non-persistent region, or the advisor reported nothing missing for this bucket.
      missing = Collections.emptySet();
    }
    offlineMembers[i] = missing;
  }
  return new OfflineMemberDetailsImpl(offlineMembers);
}
/**
 * Creates and fills in a PartitionMemberDetails for the local member.
 *
 * @param internal true if internal-only details should be included
 * @param loadProbe the LoadProbe to use
 * @return PartitionMemberDetails for the local member, or null when there is no local data store
 */
public InternalPartitionDetails buildPartitionMemberDetails(final boolean internal,
    final LoadProbe loadProbe) {
  final PartitionedRegion pr = this.prRegion;
  final PartitionedRegionDataStore ds = pr.getDataStore();
  if (ds == null) {
    return null;
  }

  final InternalDistributedMember localMember = (InternalDistributedMember) pr.getMyId();
  final long[] bucketSizes = new long[pr.getTotalNumberOfBuckets()];
  long totalSize = 0;
  // getSizeLocally() is keyed by bucket id; use each id to look up the bucket's size.
  for (Map.Entry<Integer, Integer> entry : ds.getSizeLocally().entrySet()) {
    final int bucketId = entry.getKey().intValue();
    final long bucketSize = ds.getBucketSize(bucketId);
    bucketSizes[bucketId] = bucketSize;
    totalSize += bucketSize;
  }

  final long localMaxMemoryBytes = pr.getLocalMaxMemory() * (1024L * 1024L);
  if (internal) {
    // Internal callers also get load details, which require bucket recovery to have finished.
    waitForPersistentBucketRecoveryOrClose();
    final PRLoad prLoad = loadProbe.getLoad(pr);
    return new PartitionMemberInfoImpl(localMember, localMaxMemoryBytes, totalSize,
        ds.getBucketsManaged(), ds.getNumberOfPrimaryBucketsManaged(), prLoad, bucketSizes);
  }
  return new PartitionMemberInfoImpl(localMember, localMaxMemoryBytes, totalSize,
      ds.getBucketsManaged(), ds.getNumberOfPrimaryBucketsManaged());
}
/**
 * Wait for all persistent buckets to be recovered from disk, or for the region to be closed,
 * whichever happens first.
 *
 * <p>Polls the region's cancel criterion between timed latch waits, so a cache/region close
 * aborts the wait via a cancellation exception. An interrupt received while waiting is
 * remembered and re-asserted on the current thread afterwards. Colocated child regions are
 * then waited on recursively.
 */
protected void waitForPersistentBucketRecoveryOrClose() {
  CountDownLatch recoveryLatch = allBucketsRecoveredFromDisk;
  if (recoveryLatch != null) {
    boolean interrupted = false;
    while (true) {
      try {
        // Throws if the region or cache is closing, which ends the wait.
        this.prRegion.getCancelCriterion().checkCancelInProgress(null);
        boolean done = recoveryLatch.await(
            PartitionedRegionHelper.DEFAULT_WAIT_PER_RETRY_ITERATION, TimeUnit.MILLISECONDS);
        if (done) {
          break;
        }
      } catch (InterruptedException e) {
        // Keep waiting; the interrupt status is restored below.
        interrupted = true;
      }
    }
    if (interrupted) {
      Thread.currentThread().interrupt();
    }
  }
  // Colocated child regions recover independently; wait for each of them as well.
  List<PartitionedRegion> colocatedRegions =
      ColocationHelper.getColocatedChildRegions(this.prRegion);
  for (PartitionedRegion child : colocatedRegions) {
    child.getRedundancyProvider().waitForPersistentBucketRecoveryOrClose();
  }
}
/**
 * Wait for all persistent buckets to be recovered from disk, regardless of whether the region is
 * currently being closed. Unlike waitForPersistentBucketRecoveryOrClose, no cancel criterion is
 * polled; an interrupt is remembered and re-asserted on the current thread before returning.
 */
protected void waitForPersistentBucketRecovery() {
  final CountDownLatch recoveryLatch = allBucketsRecoveredFromDisk;
  if (recoveryLatch == null) {
    return;
  }
  boolean wasInterrupted = false;
  for (;;) {
    try {
      recoveryLatch.await();
      break;
    } catch (InterruptedException ignored) {
      // Keep waiting; the interrupt status is restored once recovery completes.
      wasInterrupted = true;
    }
  }
  if (wasInterrupted) {
    Thread.currentThread().interrupt();
  }
}
/**
 * Check whether persistent recovery from disk has finished for this region and for every region
 * colocated with it.
 *
 * @return true when member colocation is satisfied and no recovery latch (local or of any
 *         colocated region) still has a positive count
 */
public boolean isPersistentRecoveryComplete() {
  if (!ColocationHelper.checkMembersColocation(this.prRegion, this.prRegion.getMyId())) {
    return false;
  }
  CountDownLatch latch = allBucketsRecoveredFromDisk;
  if (latch != null && latch.getCount() > 0) {
    return false;
  }
  final Map<String, PartitionedRegion> colocatedRegions =
      ColocationHelper.getAllColocationRegions(this.prRegion);
  for (PartitionedRegion colocated : colocatedRegions.values()) {
    latch = colocated.getRedundancyProvider().allBucketsRecoveredFromDisk;
    if (latch != null && latch.getCount() > 0) {
      return false;
    }
  }
  return true;
}
private static class ManageBucketRsp {
final static ManageBucketRsp NO = new ManageBucketRsp("NO");
final static ManageBucketRsp YES = new ManageBucketRsp("YES");
final static ManageBucketRsp NO_INITIALIZING = new ManageBucketRsp("NO_INITIALIZING");
public static final ManageBucketRsp CLOSED = new ManageBucketRsp("CLOSED");
private final String name;
private ManageBucketRsp(String name) {
this.name = name;
}
boolean isRejection() {
return this == NO || this == NO_INITIALIZING || this == CLOSED;
}
boolean isAcceptance() {
return this == YES;
}
boolean isInitializing() {
return this == NO_INITIALIZING;
}
@Override
public String toString() {
return "ManageBucketRsp(" + this.name + ")";
}
/** return YES if the argument is true, NO if not */
static ManageBucketRsp valueOf(boolean managed) {
return managed ? YES : NO;
}
}
/**
 * Immutable result of observing bucket membership: whether a problematic departure occurred,
 * and (when one did not) the primary member for the bucket.
 */
static private class BucketMembershipObserverResults {
  final boolean problematicDeparture;
  final InternalDistributedMember primary;

  BucketMembershipObserverResults(boolean problematicDeparture,
      InternalDistributedMember primary) {
    this.problematicDeparture = problematicDeparture;
    this.primary = primary;
  }

  @Override
  public String toString() {
    return "pDepart:" + problematicDeparture + " primary:" + primary;
  }
}
/**
 * Monitors distributed membership for a given bucket while it is being created.
 *
 * <p>Arrival and departure events from the bucket advisor update counters and notify any thread
 * blocked in {@link #waitForOwnersGetPrimary}, which waits until the expected number of bucket
 * owners is visible or a departure forces the caller to retry.
 */
private class BucketMembershipObserver implements MembershipListener {
  /** The bucket whose membership is being observed. */
  final Bucket bucketToMonitor;
  /** Count of member profiles seen: those present at registration plus subsequent joins. */
  final AtomicInteger arrivals = new AtomicInteger(0);
  /** Set to true when any member hosting the bucket departs. */
  final AtomicBoolean departures = new AtomicBoolean(false);

  public BucketMembershipObserver(Bucket b) {
    this.bucketToMonitor = b;
  }

  /**
   * Register this observer with the bucket's advisor and count the profiles already present as
   * arrivals.
   *
   * @return this observer, for call chaining
   */
  public BucketMembershipObserver beginMonitoring() {
    int profilesPresent = this.bucketToMonitor.getBucketAdvisor()
        .addMembershipListenerAndAdviseGeneric(this).size();
    arrivals.addAndGet(profilesPresent);
    return this;
  }

  /** Deregister this observer from the bucket's advisor. */
  public void stopMonitoring() {
    this.bucketToMonitor.getBucketAdvisor().removeMembershipListener(this);
  }

  public void memberJoined(InternalDistributedMember id) {
    if (logger.isDebugEnabled()) {
      logger.debug("Observer for bucket {} member joined {}", this.bucketToMonitor, id);
    }
    synchronized (this) {
      // TODO manipulate failedNodes and verifiedNodeList directly
      arrivals.addAndGet(1);
      // Wake any thread blocked in waitForOwnersGetPrimary.
      notify();
    }
  }

  public void memberSuspect(InternalDistributedMember id, InternalDistributedMember whoSuspected,
      String reason) {}

  public void memberDeparted(InternalDistributedMember id, boolean crashed) {
    if (logger.isDebugEnabled()) {
      logger.debug("Observer for bucket {} member departed {}", this.bucketToMonitor, id);
    }
    synchronized (this) {
      // TODO manipulate failedNodes and verifiedNodeList directly
      departures.getAndSet(true);
      // Wake any thread blocked in waitForOwnersGetPrimary.
      notify();
    }
  }

  /**
   * Wait for expected number of owners to be recognized. When the expected number have been seen,
   * then fetch the primary and report it. If while waiting for the owners to be recognized there
   * is a departure which compromises redundancy, the wait ends and the result is flagged as a
   * problematic departure so the caller can retry with new targets.
   *
   * @param expectedCount the number of bucket owners to wait for
   * @param expectedOwners the list of owners used when a departure is detected; re-verified (and
   *        apparently pruned — confirm against verifyBucketNodes) when a departure occurs
   * @param partitionName the partition name passed through to verifyBucketNodes
   * @return if no problematic departures are detected, the primary; otherwise a result with a
   *         null primary and the problematic-departure flag set
   * @throws InterruptedException if the waiting thread is interrupted
   */
  public BucketMembershipObserverResults waitForOwnersGetPrimary(final int expectedCount,
      final Collection<InternalDistributedMember> expectedOwners, String partitionName)
      throws InterruptedException {
    boolean problematicDeparture = false;
    synchronized (this) {
      for (;;) {
        this.bucketToMonitor.getCancelCriterion().checkCancelInProgress(null);
        // If any departures, need to rethink much...
        boolean oldDepartures = departures.get();
        if (oldDepartures) {
          verifyBucketNodes(expectedOwners, partitionName);
          if (expectedOwners.isEmpty()) {
            problematicDeparture = true; // need to pick new victims
          }
          // reselect = true; // need to pick new victims
          // Reset the counters to reflect the surviving owners before returning.
          arrivals.set(expectedOwners.size());
          departures.set(false);
          if (problematicDeparture) {
            if (logger.isDebugEnabled()) {
              logger.debug("Bucket observer found departed members - retrying");
            }
          }
          break;
        }
        // Look for success...
        int oldArrivals = arrivals.get();
        if (oldArrivals >= expectedCount) {
          // success!
          break;
        }
        if (logger.isDebugEnabled()) {
          logger.debug("Waiting for bucket {} to finish being created",
              prRegion.bucketStringForLogs(this.bucketToMonitor.getId()));
        }
        prRegion.checkReadiness();
        final int creationWaitMillis = 5 * 1000;
        wait(creationWaitMillis);
        // If nothing changed during the timed wait, warn about the timeout and keep waiting.
        if (oldArrivals == arrivals.get() && oldDepartures == departures.get()) {
          logger.warn(LocalizedMessage.create(
              LocalizedStrings.PRHARedundancyProvider_TIME_OUT_WAITING_0_MS_FOR_CREATION_OF_BUCKET_FOR_PARTITIONED_REGION_1_MEMBERS_REQUESTED_TO_CREATE_THE_BUCKET_ARE_2,
              new Object[] {Integer.valueOf(creationWaitMillis), prRegion.getFullPath(),
                  expectedOwners}));
        }
      } // for (;;)
    } // synchronized
    if (problematicDeparture) {
      return new BucketMembershipObserverResults(true, null);
    }
    InternalDistributedMember primmy = bucketToMonitor.getBucketAdvisor().getPrimary();
    if (primmy == null) {
      /*
       * Handle a race where nobody has the bucket. We can't return a null member here because we
       * haven't created the bucket, need to let the higher level code loop.
       */
      return new BucketMembershipObserverResults(true, null);
    } else {
      return new BucketMembershipObserverResults(false, primmy);
    }
  }

  @Override
  public void quorumLost(Set<InternalDistributedMember> failures,
      List<InternalDistributedMember> remaining) {}
}
/**
 * MembershipListener that cleans up partitioned region metadata when a node leaves the
 * DistributedSystem, and (for non-fixed partitioned regions) schedules redundancy recovery
 * once the cleanup completes.
 */
protected class PRMembershipListener implements MembershipListener {
  public void memberDeparted(final InternalDistributedMember id, final boolean crashed) {
    try {
      final DistributedMember self = prRegion.getSystem().getDistributedMember();
      if (logger.isDebugEnabled()) {
        logger.debug(
            "MembershipListener invoked on DistributedMember = {} for failed memberId = {}",
            self, id);
      }
      if (prRegion.isCacheClosing() || prRegion.isDestroyed() || self.equals(id)) {
        return;
      }
      Runnable postRecoveryTask = null;
      // Only schedule redundancy recovery if this not a fixed PR.
      if (!PRHARedundancyProvider.this.prRegion.isFixedPartitionedRegion()) {
        postRecoveryTask = new Runnable() {
          public void run() {
            // After the metadata has been cleaned, recover redundancy.
            scheduleRedundancyRecovery(id);
          }
        };
      }
      // Schedule clean up the metadata for the failed member.
      PartitionedRegionHelper.cleanUpMetaDataForRegion(prRegion.getCache(),
          prRegion.getRegionIdentifier(), id, postRecoveryTask);
    } catch (CancelException e) {
      // The cache is shutting down; nothing to clean up.
    }
  }

  public void memberJoined(InternalDistributedMember id) {
    // no action required
  }

  public void memberSuspect(InternalDistributedMember id, InternalDistributedMember whoSuspected,
      String reason) {}

  public void quorumLost(Set<InternalDistributedMember> failures,
      List<InternalDistributedMember> remaining) {}
}
/**
 * PersistentStateListener that starts redundancy recovery when a persistent member is revoked.
 */
protected class PRPersistenceListener extends PersistentStateListener.PersistentStateAdapter {
  // TODO prpersist It seems like this might trigger recovery too often. For example, a rebalance
  // can end up removing a bucket, which would trigger recovery here. We really need to only
  // trigger this thing when a PR region is destroyed. And isn't that code already in there?
  @Override
  public void memberRemoved(PersistentMemberID persistentID, boolean revoked) {
    if (!revoked) {
      // Ordinary removals (not revocations) do not trigger recovery.
      return;
    }
    final DistributedMember localMember = prRegion.getSystem().getDistributedMember();
    if (logger.isDebugEnabled()) {
      logger.debug(
          "Persistent Membership Listener invoked on DistributedMember = {} for removed memberId = {}",
          localMember, persistentID);
    }
    final boolean regionUsable = !prRegion.isCacheClosing() && !prRegion.isDestroyed();
    if (regionUsable && !prRegion.isFixedPartitionedRegion()) {
      scheduleRedundancyRecovery(persistentID);
    }
  }
}
/**
 * Returns the latch that is counted down as persistent buckets finish recovering from disk.
 * May be null (other callers in this class null-check it before waiting).
 */
public CountDownLatch getAllBucketsRecoveredFromDiskLatch() {
  return allBucketsRecoveredFromDisk;
}
}