package com.jivesoftware.os.amza.service.take;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.jivesoftware.os.amza.api.partition.PartitionName;
import com.jivesoftware.os.amza.api.partition.VersionedAquarium;
import com.jivesoftware.os.amza.api.partition.VersionedPartitionName;
import com.jivesoftware.os.amza.api.ring.RingMember;
import com.jivesoftware.os.amza.service.partition.VersionedPartitionProvider;
import com.jivesoftware.os.amza.service.replication.PartitionStripeProvider;
import com.jivesoftware.os.amza.service.replication.StripeTx.TxPartitionStripe;
import com.jivesoftware.os.amza.service.storage.SystemWALStorage;
import com.jivesoftware.os.amza.service.take.AvailableRowsTaker.AvailableStream;
import com.jivesoftware.os.amza.service.take.TakeCoordinator.TookLatencyStream;
import com.jivesoftware.os.amza.service.take.TakeRingCoordinator.VersionedRing;
import com.jivesoftware.os.aquarium.LivelyEndState;
import com.jivesoftware.os.aquarium.State;
import com.jivesoftware.os.jive.utils.ordered.id.TimestampedOrderIdProvider;
import com.jivesoftware.os.mlogger.core.MetricLogger;
import com.jivesoftware.os.mlogger.core.MetricLoggerFactory;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
/**
* @author jonathan.colt
*/
public class TakeVersionedPartitionCoordinator {
// Logger shared by all instances of this class.
private static final MetricLogger LOG = MetricLoggerFactory.getLogger();
// Consulted for the highest tx id when the partition is a system partition.
final SystemWALStorage systemWALStorage;
// Stored at construction; not read by any method in this class.
final RingMember rootMember;
// The partition (name + version) this coordinator is responsible for.
final VersionedPartitionName versionedPartitionName;
// Used to turn the current wall-clock time into an approximate tx id when measuring take latency.
final TimestampedOrderIdProvider timestampedOrderIdProvider;
// Multiplied by a member's category to produce the value returned when nothing is offered to that member.
final long slowTakeMillis;
// Multiplied by a member's category to produce the "too slow" latency threshold, expressed in tx-id units.
final long slowTakeId;
// Re-offer delay applied when this is a system partition.
final long systemReofferDeltaMillis;
// Re-offer delay applied for all other partitions.
final long reofferDeltaMillis;
// Members whose category is <= this value are considered "sufficient" and eligible for normal offers. Starts at 1.
final AtomicInteger currentCategory;
// Not read or written inside this class; presumably maintained by collaborators in this package (TODO confirm).
volatile VersionedPartitionProvider.VersionedPartitionProperties versionedPartitionProperties;
// Once true, every entry point in this class becomes a no-op.
volatile boolean expunged = false;
// Number of non-short-circuited availableRowsStream calls. Incremented with ++, which is not atomic, so the value is approximate.
volatile long callCount;
// Per-taker bookkeeping. Each Session's fields are guarded by synchronizing on the Session instance itself.
private final ConcurrentMap<RingMember, Session> sessions = Maps.newConcurrentMap();
// Starts true; cleared by streamHighestTxId once the lively end state is observed to be something other than bootstrap.
private final AtomicBoolean isInBootstrap = new AtomicBoolean(true);
private long lastOfferedMillis = -1; // approximate is good enough
private long lastTakenMillis = -1; // approximate is good enough
private long lastCategoryCheckMillis = -1; // approximate is good enough
/**
 * Creates a coordinator for a single versioned partition. The current category starts at 1.
 *
 * @param systemWALStorage source of the highest tx id for system partitions
 * @param rootMember stored on the instance
 * @param versionedPartitionName the partition name and version being coordinated
 * @param timestampedOrderIdProvider converts wall-clock time into an approximate tx id
 * @param slowTakeMillis per-category multiplier for the value returned when a member is not offered anything
 * @param slowTakeId per-category multiplier for the latency threshold, in tx-id units
 * @param systemReofferDeltaMillis re-offer delay used for system partitions
 * @param reofferDeltaMillis re-offer delay used for all other partitions
 */
public TakeVersionedPartitionCoordinator(SystemWALStorage systemWALStorage,
    RingMember rootMember,
    VersionedPartitionName versionedPartitionName,
    TimestampedOrderIdProvider timestampedOrderIdProvider,
    long slowTakeMillis,
    long slowTakeId,
    long systemReofferDeltaMillis,
    long reofferDeltaMillis) {

    // Collaborators and identity.
    this.systemWALStorage = systemWALStorage;
    this.rootMember = rootMember;
    this.versionedPartitionName = versionedPartitionName;
    this.timestampedOrderIdProvider = timestampedOrderIdProvider;

    // Tuning knobs, in parameter order.
    this.slowTakeMillis = slowTakeMillis;
    this.slowTakeId = slowTakeId;
    this.systemReofferDeltaMillis = systemReofferDeltaMillis;
    this.reofferDeltaMillis = reofferDeltaMillis;

    // Mutable state.
    this.currentCategory = new AtomicInteger(1);
}
/**
 * Decides whether this partition should be announced to the given taker and, if so, announces it
 * through {@code availableStream}.
 *
 * @param partitionStripeProvider used to run the offer inside a partition transaction
 * @param takeSessionId the taker's current session id
 * @param versionedRing ring view used to look up the taker's category
 * @param ringMember the taker
 * @param electionCounter shared budget that is decremented when an offer is made on the election path
 * @param availableStream callback that receives the offer
 * @return {@code Long.MAX_VALUE} when nothing was offered because the coordinator is expunged, the taker is
 *     dormant, or the partition version no longer matches; otherwise whatever {@code streamHighestTxId}
 *     returns (presumably a delay in millis before the caller should come back; confirm with caller)
 * @throws Exception if the partition transaction or any collaborator fails
 */
long availableRowsStream(PartitionStripeProvider partitionStripeProvider,
    long takeSessionId,
    VersionedRing versionedRing,
    RingMember ringMember,
    AtomicLong electionCounter,
    AvailableStream availableStream) throws Exception {

    if (expunged) {
        return Long.MAX_VALUE;
    }
    // Cheap pre-check without an aquarium: avoids opening a partition transaction for a dormant taker.
    if (stableTaker(ringMember, takeSessionId, null).isDormant()) {
        return Long.MAX_VALUE;
    }

    lastOfferedMillis = System.currentTimeMillis();
    callCount++;

    PartitionName partitionName = versionedPartitionName.getPartitionName();
    return partitionStripeProvider.txPartition(partitionName, (txPartitionStripe, highwaterStorage, versionedAquarium) -> {
        VersionedPartitionName liveName = versionedAquarium.getVersionedPartitionName();

        // Re-evaluate with the aquarium so the session's online / nominated status can be refreshed.
        Stable takerState = stableTaker(ringMember, takeSessionId, versionedAquarium);
        if (takerState.isDormant()) {
            return Long.MAX_VALUE;
        }

        if (liveName.getPartitionVersion() != versionedPartitionName.getPartitionVersion()) {
            LOG.warn("Ignored available rows stream for invalid version {}", versionedPartitionName);
            return Long.MAX_VALUE;
        }

        return streamHighestTxId(versionedAquarium,
            txPartitionStripe,
            takeSessionId,
            versionedRing,
            ringMember,
            takerState.isOnline(),
            takerState.isNominated(),
            electionCounter,
            availableStream);
    });
}
/**
 * Looks up the highest transaction id currently known for the aquarium's partition.
 *
 * System partitions are answered by {@code systemWALStorage}; all others are answered by the partition
 * stripe, with {@code HighwaterStorage.LOCAL_NONE} translated to {@code -1}.
 *
 * @param txPartitionStripe stripe transaction handle used for non-system partitions
 * @param versionedAquarium supplies the versioned partition name to query
 * @return the highest tx id, or {@code -1} when the stripe reports {@code LOCAL_NONE}
 * @throws Exception if the underlying storage lookup fails
 */
private long highestPartitionTx(TxPartitionStripe txPartitionStripe,
    VersionedAquarium versionedAquarium) throws Exception {

    VersionedPartitionName aquariumPartition = versionedAquarium.getVersionedPartitionName();
    if (aquariumPartition.getPartitionName().isSystemPartition()) {
        return systemWALStorage.highestPartitionTxId(aquariumPartition);
    }

    return txPartitionStripe.tx((deltaIndex, stripeIndex, partitionStripe) -> {
        long stripeTxId = partitionStripe.highestTxId(versionedAquarium.getVersionedPartitionName());
        return stripeTxId == HighwaterStorage.LOCAL_NONE ? -1 : stripeTxId;
    });
}
/**
 * Summary of a taker's standing as computed by {@code stableTaker}.
 *
 * Each constant carries three flags: dormant, online and nominated. Note that {@code active_nominated}
 * reports {@code isOnline() == true} even though it is only returned when the session is not flagged online.
 */
private enum Stable {
    /** Online and in steady state: nothing needs to be offered. */
    dormant_online(true, true, false),
    /** Online but not in steady state. */
    active_online(false, true, false),
    /** Not yet online but in the nominated state; reported as online to callers. */
    active_nominated(false, true, true),
    /** Unknown session, stale session id, or neither online nor nominated. */
    active_offline(false, false, false);

    private final boolean takerDormant;
    private final boolean takerOnline;
    private final boolean takerNominated;

    Stable(boolean takerDormant, boolean takerOnline, boolean takerNominated) {
        this.takerDormant = takerDormant;
        this.takerOnline = takerOnline;
        this.takerNominated = takerNominated;
    }

    /** @return the dormant flag of this constant */
    public boolean isDormant() {
        return takerDormant;
    }

    /** @return the online flag of this constant */
    public boolean isOnline() {
        return takerOnline;
    }

    /** @return the nominated flag of this constant */
    public boolean isNominated() {
        return takerNominated;
    }
}
/**
 * Classifies a taker from its session state, optionally refreshing that state from the aquarium.
 *
 * @param ringMember the taker to classify
 * @param takeSessionId the session id the taker is currently using
 * @param versionedAquarium when non-null and the session is not yet flagged online, used to refresh the
 *     session's online flag and, failing that, to check whether the member is nominated; may be null
 * @return {@code active_offline} when there is no session or its id differs from {@code takeSessionId};
 *     otherwise the state derived from the session's online / steadyState flags and the nominated check
 * @throws Exception if an aquarium lookup fails
 */
private Stable stableTaker(RingMember ringMember, long takeSessionId, VersionedAquarium versionedAquarium) throws Exception {
    Session takerSession = sessions.get(ringMember);
    if (takerSession == null) {
        return Stable.active_offline;
    }

    synchronized (takerSession) {
        if (takerSession.sessionId != takeSessionId) {
            return Stable.active_offline;
        }

        boolean memberNominated = false;
        boolean canRefresh = versionedAquarium != null;
        if (!takerSession.online && canRefresh) {
            // The refreshed online flag is remembered on the session; nomination is only asked about if still offline.
            takerSession.online = versionedAquarium.isLivelyEndState(ringMember);
            if (!takerSession.online) {
                memberNominated = versionedAquarium.isMemberInState(ringMember, State.nominated);
            }
        }

        if (takerSession.online) {
            return takerSession.steadyState ? Stable.dormant_online : Stable.active_online;
        }
        return memberNominated ? Stable.active_nominated : Stable.active_offline;
    }
}
/**
 * Updates the taker's session and, when warranted, announces the partition's highest tx id to it.
 *
 * The offer path is taken when the taker is not online, this coordinator still believes it is in bootstrap,
 * the partition is a system partition, or the taker's category is within {@code currentCategory}. Inside that
 * path the session is updated under its monitor, and the highest tx id is re-read afterwards; if it moved, the
 * update is repeated against the new value.
 *
 * The election counter is decremented at most once per invocation. Once this call has claimed a slot, later
 * retries of the loop reuse that claim instead of consuming further slots from the shared budget.
 *
 * @param versionedAquarium supplies lively end state and the partition to query
 * @param txPartitionStripe stripe transaction handle used to read the highest tx id
 * @param takeSessionId the taker's current session id
 * @param versionedRing ring view used to look up the taker's category
 * @param ringMember the taker
 * @param takerIsOnline online flag as reported by {@code Stable}
 * @param takerIsNominated nominated flag as reported by {@code Stable}
 * @param electionCounter shared budget; an offer on the election path is only made if decrementing it yields {@code >= 0}
 * @param availableStream callback that receives the offer
 * @return the applicable re-offer delta when an offer was made; {@code Long.MAX_VALUE} when the offer path was
 *     taken but nothing was offered, or when the taker has no category; otherwise {@code category * slowTakeMillis}
 * @throws Exception if any storage or aquarium lookup, or the callback, fails
 */
private long streamHighestTxId(VersionedAquarium versionedAquarium,
    TxPartitionStripe txPartitionStripe,
    long takeSessionId,
    VersionedRing versionedRing,
    RingMember ringMember,
    boolean takerIsOnline,
    boolean takerIsNominated,
    AtomicLong electionCounter,
    AvailableStream availableStream) throws Exception {

    // Leave bootstrap mode permanently the first time a non-bootstrap state is observed.
    if (isInBootstrap.get()) {
        LivelyEndState livelyEndState = versionedAquarium.getLivelyEndState();
        if (livelyEndState.getCurrentState() != State.bootstrap) {
            isInBootstrap.set(false);
        }
    }

    Integer category = versionedRing.getCategory(ringMember);
    boolean isSystemPartition = versionedPartitionName.getPartitionName().isSystemPartition();
    boolean isSufficientCategory = category != null && category <= currentCategory.get();

    if (!takerIsOnline
        || isInBootstrap.get()
        || isSystemPartition
        || isSufficientCategory) {

        boolean available = false;
        long reofferDelta = (isSystemPartition ? systemReofferDeltaMillis : reofferDeltaMillis);
        long reofferAfterTimeInMillis = System.currentTimeMillis() + reofferDelta;

        Session session = sessions.computeIfAbsent(ringMember, key -> new Session());
        long highestTxId = highestPartitionTx(txPartitionStripe, versionedAquarium);
        boolean electable = false;
        while (true) {
            synchronized (session) {
                if (session.sessionId != takeSessionId) {
                    // New take session: reset everything and offer unconditionally.
                    available = true;
                    session.sessionId = takeSessionId;
                    session.offeredTxId = highestTxId;
                    session.reofferAtTimeInMillis = reofferAfterTimeInMillis;
                    session.tookTxId = -1;
                    session.tookFully = false;
                    session.steadyState = false;
                    session.online = false;
                } else if (isSystemPartition && !session.tookFully) {
                    // System partitions keep being offered until the last offer has been fully taken.
                    available = true;
                    session.sessionId = takeSessionId;
                    session.offeredTxId = highestTxId;
                    session.reofferAtTimeInMillis = reofferAfterTimeInMillis;
                    session.tookFully = false;
                    session.steadyState = false;
                } else if (takerIsNominated
                    || (isSufficientCategory && (!takerIsOnline || shouldOffer(session, highestTxId)))) {
                    // Claim an election slot only once per call. A compound "|=" would evaluate
                    // decrementAndGet() on every retry of this loop and drain the shared budget.
                    if (!electable) {
                        electable = electionCounter.decrementAndGet() >= 0;
                    }
                    if (electable) {
                        available = true;
                        session.sessionId = takeSessionId;
                        session.offeredTxId = highestTxId;
                        session.reofferAtTimeInMillis = reofferAfterTimeInMillis;
                        session.steadyState = false;
                    } else {
                        break;
                    }
                } else {
                    // Nothing to offer: steady if the taker is outside the current category or already caught up.
                    session.steadyState = !isSufficientCategory || (session.tookTxId >= highestTxId);
                }
            }

            // Re-read after updating the session; if a write slipped in, redo the update against the newer tx id.
            long checkTxId = highestPartitionTx(txPartitionStripe, versionedAquarium);
            if (checkTxId == highestTxId) {
                break;
            } else {
                highestTxId = checkTxId;
            }
        }

        if (available) {
            availableStream.available(versionedPartitionName, highestTxId);
            return reofferDelta;
        } else {
            return Long.MAX_VALUE;
        }
    }

    if (category == null) {
        return Long.MAX_VALUE;
    }
    return category * slowTakeMillis;
}
/**
 * Decides whether {@code highestTxId} warrants a (re-)offer to the session's taker.
 *
 * @param session the taker's session; its fields are read directly, so the caller is expected to hold its monitor
 * @param highestTxId the partition's current highest tx id
 * @return {@code false} when {@code highestTxId} is not above {@code -1}; {@code true} when it exceeds the last
 *     offered tx id; otherwise {@code true} only if it exceeds what was taken and the re-offer time has passed
 */
private boolean shouldOffer(Session session, long highestTxId) {
    if (!(highestTxId > -1)) {
        return false;
    }
    if (highestTxId > session.offeredTxId) {
        return true;
    }
    boolean notYetTaken = highestTxId > session.tookTxId;
    return notYetTaken && System.currentTimeMillis() > session.reofferAtTimeInMillis;
}
/**
 * Reacts to a new tx id on this partition: re-evaluates the current category and knocks every session out of
 * steady state so takers get re-examined. Does nothing once expunged.
 *
 * @param versionedRing ring view passed through to the category update
 * @param replicated passed through to the category update
 * @param updateTxId the new tx id, passed through to the category update
 * @param invalidateOnline when true, every session's online flag is also cleared
 * @throws Exception if the category update fails
 */
void updateTxId(VersionedRing versionedRing, boolean replicated, long updateTxId, boolean invalidateOnline) throws Exception {
    if (!expunged) {
        updateCategory(versionedRing, replicated, updateTxId);
        sessions.values().forEach(takerSession -> {
            synchronized (takerSession) {
                takerSession.steadyState = false;
                if (invalidateOnline) {
                    takerSession.online = false;
                }
            }
        });
    }
}
/**
 * Records that a remote member has taken rows up to {@code localTxId}.
 *
 * The last-taken timestamp is refreshed even when the coordinator is expunged. If the member has a session
 * with a matching id, its took / tookFully / steadyState fields are updated; the highest tx id is re-read
 * afterwards and the update repeated until two consecutive reads agree. Finally the category is re-evaluated,
 * and if it changed every session is taken out of steady state.
 *
 * @param takeSessionId session id the remote member is reporting under
 * @param txPartitionStripe stripe transaction handle used to read the highest tx id
 * @param versionedAquarium supplies the partition to query
 * @param versionedRing ring view passed through to the category update
 * @param remoteRingMember the member that took rows
 * @param localTxId the tx id the member reports having taken up to
 * @param replicated passed through to the category update
 * @throws Exception if a storage lookup or the category update fails
 */
void rowsTaken(long takeSessionId,
    TxPartitionStripe txPartitionStripe,
    VersionedAquarium versionedAquarium,
    VersionedRing versionedRing,
    RingMember remoteRingMember,
    long localTxId,
    boolean replicated) throws Exception {

    lastTakenMillis = System.currentTimeMillis();
    if (expunged) {
        return;
    }

    Session takerSession = sessions.get(remoteRingMember);
    if (takerSession != null) {
        long observedTxId = highestPartitionTx(txPartitionStripe, versionedAquarium);
        boolean settled;
        do {
            synchronized (takerSession) {
                if (takerSession.sessionId == takeSessionId) {
                    // tookTxId never moves backwards.
                    long furthestTaken = Math.max(localTxId, takerSession.tookTxId);
                    takerSession.tookTxId = furthestTaken;
                    takerSession.tookFully = (furthestTaken >= takerSession.offeredTxId);
                    takerSession.steadyState = (localTxId >= observedTxId);
                }
            }
            // Re-read after the update; if the partition advanced, redo the update against the newer value.
            long recheckedTxId = highestPartitionTx(txPartitionStripe, versionedAquarium);
            settled = (recheckedTxId == observedTxId);
            observedTxId = recheckedTxId;
        } while (!settled);
    }

    boolean categoryChanged = updateCategory(versionedRing, replicated, localTxId);
    if (categoryChanged) {
        sessions.values().forEach(other -> {
            synchronized (other) {
                other.steadyState = false;
            }
        });
    }
}
//TODO call this?
/**
 * Drops the session of every ring member that is not in {@code retain}. Does nothing once expunged.
 *
 * Uses {@code retainAll} on the map's key-set view, which removes the corresponding map entries. This avoids
 * removing from the key set while iterating a live view that is itself derived from that key set.
 *
 * @param retain the members whose sessions should be kept; must not be null
 */
void cleanup(Set<RingMember> retain) {
    if (expunged) {
        return;
    }
    sessions.keySet().retainAll(retain);
}
/**
 * Recomputes the current category from how far each ring member's session has taken.
 *
 * Members are visited in the ring's iteration order until {@code takeFromFactor} of them have been judged
 * fast enough. A member counts as fast enough when its session has taken exactly {@code latestTxId}, or when
 * the gap between "now" (as an approximate tx id) and what it took is below {@code slowTakeId * category}.
 * The largest category seen among the visited members (at least 1) becomes the new current category.
 *
 * The last-category-check timestamp is refreshed on every call, including when {@code replicated} is false.
 *
 * @param versionedRing supplies the members, their categories and the take-from factor
 * @param replicated when false, nothing is recomputed
 * @param latestTxId the tx id a fully caught-up member would have taken
 * @return true if the current category was changed
 * @throws Exception declared for callers; nothing in this body is visibly checked
 */
private boolean updateCategory(VersionedRing versionedRing, boolean replicated, long latestTxId) throws Exception {
    lastCategoryCheckMillis = System.currentTimeMillis();
    if (!replicated) {
        return false;
    }

    long currentTimeTxId = timestampedOrderIdProvider.getApproximateId(System.currentTimeMillis());
    int fastEnoughCount = 0;
    int worstCategory = 1;
    for (Entry<RingMember, Integer> candidate : versionedRing.members.entrySet()) {
        boolean needMore = fastEnoughCount < versionedRing.takeFromFactor;
        if (!needMore) {
            continue;
        }

        int candidateCategory = candidate.getValue();
        worstCategory = Math.max(worstCategory, candidateCategory);

        Session takerSession = sessions.get(candidate.getKey());
        if (takerSession == null) {
            continue;
        }
        synchronized (takerSession) {
            boolean caughtUp = takerSession.tookTxId == latestTxId;
            if (caughtUp || (currentTimeTxId - takerSession.tookTxId) < slowTakeId * candidateCategory) {
                fastEnoughCount++;
            }
        }
    }

    if (currentCategory.get() == worstCategory) {
        return false;
    }
    currentCategory.set(worstCategory);
    return true;
}
/**
 * Marks this coordinator as expunged and discards every session.
 *
 * The flag is raised before the sessions are cleared; the other entry points in this class check the flag
 * first and return early once it is set.
 */
public void expunged() {
expunged = true;
sessions.clear();
}
/**
 * Reports per-member take status to {@code stream}, one call per ring member in the ring's iteration order.
 *
 * For a member with a session, the session's fields are copied while holding the session's monitor, matching
 * how every other method in this class accesses them, so the reported values are a consistent snapshot. The
 * callback is invoked after the monitor has been released. Members without a session are reported with
 * {@code -1} ids and {@code false} flags.
 *
 * @param versionedRing supplies the members and their categories
 * @param stream receives one status row per member; returning false stops the iteration
 * @return false if {@code stream} asked to stop, true if every member was reported
 * @throws Exception if the callback fails
 */
boolean streamTookLatencies(VersionedRing versionedRing, TookLatencyStream stream) throws Exception {
    for (Entry<RingMember, Integer> candidate : versionedRing.members.entrySet()) {
        RingMember ringMember = candidate.getKey();
        int category = candidate.getValue();
        long tooSlowLatencyTxId = slowTakeId * category;

        // Defaults describe a member that has no session yet.
        long offeredTxId = -1L;
        long sessionId = -1;
        boolean online = false;
        boolean steadyState = false;

        Session session = sessions.get(ringMember);
        if (session != null) {
            synchronized (session) {
                offeredTxId = session.offeredTxId;
                sessionId = session.sessionId;
                online = session.online;
                steadyState = session.steadyState;
            }
        }

        // Invoke the callback outside the monitor so foreign code never runs while a session is locked.
        if (!stream.stream(ringMember,
            offeredTxId,
            category,
            tooSlowLatencyTxId,
            sessionId,
            online,
            steadyState,
            lastOfferedMillis,
            lastTakenMillis,
            lastCategoryCheckMillis)) {
            return false;
        }
    }
    return true;
}
/**
 * @return the number of availableRowsStream calls that got past the expunged / dormant pre-checks; the counter
 *     is incremented with a non-atomic {@code ++}, so the value is approximate under concurrent calls
 */
public long getCallCount() {
return callCount;
}
/**
 * Mutable per-taker bookkeeping. Instances are used as their own lock: the coordinator's methods that update
 * these fields do so inside {@code synchronized (session)}.
 */
static class Session {
// Take session id this state belongs to; -1 until the first offer is made.
long sessionId = -1;
// Highest tx id most recently offered to the taker.
long offeredTxId = -1;
// Wall-clock millis after which an offer that has not been taken may be offered again.
long reofferAtTimeInMillis = -1;
// Highest tx id the taker has reported taking; reset to -1 when a new session id is adopted.
long tookTxId = -1;
// True when tookTxId has reached offeredTxId.
boolean tookFully = false;
// True when nothing further needs to be offered to this taker for now.
boolean steadyState = false;
// Cached result of the aquarium's lively-end-state check for this taker.
boolean online = false;
}
}