package com.jivesoftware.os.amza.service;

import com.google.common.base.Optional;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.jivesoftware.os.amza.api.FailedToAchieveQuorumException;
import com.jivesoftware.os.amza.api.partition.VersionedPartitionName;
import com.jivesoftware.os.amza.api.ring.RingMember;
import com.jivesoftware.os.amza.service.stats.AmzaStats;
import com.jivesoftware.os.amza.service.take.TakeCoordinator;
import com.jivesoftware.os.mlogger.core.MetricLogger;
import com.jivesoftware.os.mlogger.core.MetricLoggerFactory;
import com.jivesoftware.os.routing.bird.health.api.HealthTimer;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.TimeoutException;

/**
 * Keeps, for every ring member and versioned partition, the highest leadership token and txId
 * that have been reported through {@link #set}, and lets a caller wait in {@link #await} until
 * enough ring members have reported a txId at or beyond a desired one.
 *
 * @author jonathan.colt
 */
public class AckWaters {

    private static final MetricLogger LOG = MetricLoggerFactory.getLogger();

    /** Stats sink used when the awaited partition is a system partition. */
    private final AmzaStats amzaSystemStats;
    /** Stats sink used for every other partition. */
    private final AmzaStats amzaStats;
    /** Timer started and stopped around every {@link #await} call. */
    private final HealthTimer quorumLatency;
    /** Couples {@link #set} (notifyChange) with {@link #await} (awaitChange), keyed by partition. */
    private final AwaitNotify<VersionedPartitionName> awaitNotify;
    /** When true, a timed-out {@link #await} logs a per-member breakdown before rethrowing. */
    private final boolean verboseLogTimeouts;
    /** ring member -> partition -> highest leadership token / txId reported so far. */
    private final Map<RingMember, Map<VersionedPartitionName, LeadershipTokenAndTxId>> ackWaters = Maps.newConcurrentMap();

    /**
     * @param amzaSystemStats stats used for system partitions
     * @param amzaStats stats used for non-system partitions
     * @param quorumLatency timer wrapped around each quorum wait
     * @param stripingLevel passed straight to the {@code AwaitNotify} constructor
     * @param verboseLogTimeouts whether quorum timeouts produce a detailed warning
     */
    public AckWaters(AmzaStats amzaSystemStats,
        AmzaStats amzaStats,
        HealthTimer quorumLatency,
        int stripingLevel,
        boolean verboseLogTimeouts) {
        this.amzaSystemStats = amzaSystemStats;
        this.amzaStats = amzaStats;
        this.quorumLatency = quorumLatency;
        this.awaitNotify = new AwaitNotify<>(stripingLevel);
        this.verboseLogTimeouts = verboseLogTimeouts;
    }

    /**
     * Records what {@code ringMember} has reported for {@code partitionName}.
     * <p>
     * The stored pair never moves backwards: when an entry already exists, each component becomes
     * the maximum of the stored and the supplied value. The update runs inside
     * {@code awaitNotify.notifyChange}, whose callback yields {@code true} when the stored txId or
     * the stored leadership token equals the supplied one.
     * NOTE(review): presumably that flag decides whether waiters are woken - confirm against AwaitNotify.
     *
     * @param ringMember the member reporting
     * @param partitionName the partition being reported on
     * @param txId the reported txId
     * @param leadershipToken the reported leadership token
     * @throws Exception whatever {@code awaitNotify.notifyChange} propagates
     */
    public void set(RingMember ringMember, VersionedPartitionName partitionName, long txId, long leadershipToken) throws Exception {
        Map<VersionedPartitionName, LeadershipTokenAndTxId> memberWaters =
            ackWaters.computeIfAbsent(ringMember, (member) -> Maps.newConcurrentMap());

        awaitNotify.notifyChange(partitionName, () -> {
            LeadershipTokenAndTxId stored = memberWaters.compute(partitionName, (key, existing) -> {
                if (existing == null) {
                    return new LeadershipTokenAndTxId(leadershipToken, txId);
                }
                boolean advances = txId > existing.txId || leadershipToken > existing.leadershipToken;
                if (!advances) {
                    // Neither component moved forward; keep the existing instance.
                    return existing;
                }
                return new LeadershipTokenAndTxId(
                    Math.max(leadershipToken, existing.leadershipToken),
                    Math.max(txId, existing.txId));
            });
            return stored.txId == txId || stored.leadershipToken == leadershipToken;
        });
    }

    /**
     * Looks up the recorded pair for a member and partition.
     *
     * @return the stored pair, or {@code null} when nothing has been recorded for the member or
     *         for that partition under the member
     */
    LeadershipTokenAndTxId get(RingMember ringMember, VersionedPartitionName partitionName) {
        Map<VersionedPartitionName, LeadershipTokenAndTxId> memberWaters = ackWaters.get(ringMember);
        return memberWaters == null ? null : memberWaters.get(partitionName);
    }

    /** Immutable pair of a leadership token and a txId. */
    static class LeadershipTokenAndTxId {

        final long leadershipToken;
        final long txId;

        LeadershipTokenAndTxId(long leadershipToken, long txId) {
            this.leadershipToken = leadershipToken;
            this.txId = txId;
        }
    }

    /**
     * Waits until at least {@code desiredTakeQuorum} of {@code takeRingMembers} have a recorded
     * txId that is {@code >= desiredTxId} for the given partition.
     * <p>
     * The check is handed to {@code awaitNotify.awaitChange} together with {@code toMillis}.
     * NOTE(review): presumably awaitChange re-runs the check on notifications until it yields a
     * non-null result or the timeout elapses - confirm against AwaitNotify.
     * The {@code quorumLatency} timer is started before the wait and stopped in all cases.
     *
     * @param versionedPartitionName partition whose acknowledgements are awaited
     * @param desiredTxId txId each counted member must have reached
     * @param takeRingMembers members eligible to count towards the quorum
     * @param desiredTakeQuorum number of members required
     * @param toMillis timeout value forwarded to {@code awaitChange}
     * @param leadershipToken caller's leadership token; when greater than -1, seeing a member with
     *        a larger recorded token aborts the wait
     * @param takeCoordinator queried for per-member details when a timeout is logged verbosely
     * @return the value produced by the quorum check, i.e. the number of members counted
     * @throws FailedToAchieveQuorumException when a larger leadership token is observed
     * @throws TimeoutException rethrown after the timeout has been logged (if verbose) and counted
     * @throws Exception anything else propagated by the collaborators
     */
    public int await(VersionedPartitionName versionedPartitionName,
        long desiredTxId,
        Collection<RingMember> takeRingMembers,
        int desiredTakeQuorum,
        long toMillis,
        long leadershipToken,
        TakeCoordinator takeCoordinator) throws Exception {

        final AmzaStats stats;
        if (versionedPartitionName.getPartitionName().isSystemPartition()) {
            stats = amzaSystemStats;
        } else {
            stats = amzaStats;
        }

        // Slots are nulled out once a member has been counted so it is never counted twice.
        RingMember[] pending = takeRingMembers.toArray(new RingMember[takeRingMembers.size()]);
        int[] ackCount = new int[1];
        long[] observedTxIds = new long[pending.length];
        Arrays.fill(observedTxIds, -1);
        List<RingMember> ackedMembers = Lists.newArrayList();

        quorumLatency.startTimer();
        try {
            long startMillis = System.currentTimeMillis();
            Integer quorum = awaitNotify.awaitChange(versionedPartitionName, () -> {
                for (int i = 0; i < pending.length; i++) {
                    RingMember member = pending[i];
                    if (member == null) {
                        continue;
                    }
                    LeadershipTokenAndTxId water = get(member, versionedPartitionName);
                    if (water == null) {
                        observedTxIds[i] = -1;
                    } else {
                        if (leadershipToken > -1 && water.leadershipToken > leadershipToken) {
                            throw new FailedToAchieveQuorumException(
                                "Leader transitioning from " + leadershipToken + " to " + water.leadershipToken);
                        }
                        if (water.txId >= desiredTxId) {
                            ackCount[0]++;
                            pending[i] = null;
                            ackedMembers.add(member);
                        }
                        observedTxIds[i] = water.txId;
                    }
                    if (ackCount[0] >= desiredTakeQuorum) {
                        return Optional.of(ackCount[0]);
                    }
                }
                // Quorum not reached on this pass.
                return null;
            }, toMillis);

            long elapsedMillis = System.currentTimeMillis() - startMillis;
            stats.quorums(versionedPartitionName.getPartitionName(), 1, elapsedMillis, ackedMembers);
            return quorum;
        } catch (TimeoutException e) {
            if (verboseLogTimeouts) {
                StringBuilder details = describeTookLatencies(versionedPartitionName, takeCoordinator);
                LOG.warn("Failed to achieve quorum for partition:{} desiredTxId:{} desiredTakeQuorum:{} passed:{} leadershipToken:{} tookToTxId:{} details:{}",
                    versionedPartitionName, desiredTxId, desiredTakeQuorum, ackCount[0], leadershipToken, Arrays.toString(observedTxIds), details);
            }
            stats.quorumTimeouts(versionedPartitionName.getPartitionName(), 1);
            throw e;
        } finally {
            quorumLatency.stopTimer("Commit Quorum Latency", "Check network connectivity and neighbor health.");
        }
    }

    /**
     * Builds the per-member detail text for a verbose timeout warning: one line for each entry
     * that {@code takeCoordinator.streamTookLatencies} streams for the partition.
     */
    private StringBuilder describeTookLatencies(VersionedPartitionName versionedPartitionName,
        TakeCoordinator takeCoordinator) throws Exception {

        StringBuilder details = new StringBuilder();
        takeCoordinator.streamTookLatencies(versionedPartitionName,
            (ringMember, lastOfferedTxId, category, tooSlowTxId, takeSessionId, online, steadyState, lastOfferedMillis, lastTakenMillis,
                lastCategoryCheckMillis) -> {
                String line = String.format(
                    "- member:%s lastOfferedTxId:%s category:%s tooSlowTxId:%s takeSessionId:%s online:%s "
                        + "steadyState:%s lastOfferedMillis:%s lastTakenMillis:%s, lastCategoryCheckMillis:%s",
                    ringMember, lastOfferedTxId, category, tooSlowTxId, takeSessionId, online,
                    steadyState, lastOfferedMillis, lastTakenMillis, lastCategoryCheckMillis);
                details.append('\n');
                details.append(line);
                return true;
            });
        return details;
    }

    /** Callback receiving one (member, txId) pair at a time. */
    public interface MemberTxIdStream {

        /**
         * @return {@code false} to stop the enclosing iteration, {@code true} to continue
         */
        boolean stream(RingMember member, long txId) throws Exception;
    }

    /**
     * Streams the recorded txId of every ring member that has an entry for the given partition.
     *
     * @param versionedPartitionName partition to report on
     * @param stream receiver of the (member, txId) pairs
     * @return {@code false} as soon as {@code stream} returns {@code false}; {@code true} when all
     *         members were visited
     * @throws Exception whatever {@code stream} throws
     */
    public boolean streamPartitionTxIds(VersionedPartitionName versionedPartitionName, MemberTxIdStream stream) throws Exception {
        for (Entry<RingMember, Map<VersionedPartitionName, LeadershipTokenAndTxId>> entry : ackWaters.entrySet()) {
            LeadershipTokenAndTxId water = entry.getValue().get(versionedPartitionName);
            if (water == null) {
                continue;
            }
            if (!stream.stream(entry.getKey(), water.txId)) {
                return false;
            }
        }
        return true;
    }
}