package com.jivesoftware.os.amza.client.http; import com.google.common.collect.Lists; import com.jivesoftware.os.amza.api.CompareTimestampVersions; import com.jivesoftware.os.amza.api.PartitionClient; import com.jivesoftware.os.amza.api.filer.FilerInputStream; import com.jivesoftware.os.amza.api.filer.UIO; import com.jivesoftware.os.amza.api.partition.Consistency; import com.jivesoftware.os.amza.api.partition.PartitionName; import com.jivesoftware.os.amza.api.ring.RingMember; import com.jivesoftware.os.amza.api.stream.ClientUpdates; import com.jivesoftware.os.amza.api.stream.KeyValueStream; import com.jivesoftware.os.amza.api.stream.KeyValueTimestampStream; import com.jivesoftware.os.amza.api.stream.OffsetUnprefixedWALKeys; import com.jivesoftware.os.amza.api.stream.PrefixedKeyRanges; import com.jivesoftware.os.amza.api.stream.RowType; import com.jivesoftware.os.amza.api.stream.TxKeyValueStream; import com.jivesoftware.os.amza.api.stream.TxKeyValueStream.TxResult; import com.jivesoftware.os.amza.api.stream.UnprefixedWALKeys; import com.jivesoftware.os.amza.api.take.Highwaters; import com.jivesoftware.os.amza.api.take.TakeResult; import com.jivesoftware.os.amza.api.wal.WALHighwater; import com.jivesoftware.os.amza.api.wal.WALHighwater.RingMemberHighwater; import com.jivesoftware.os.mlogger.core.MetricLogger; import com.jivesoftware.os.mlogger.core.MetricLoggerFactory; import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Optional; import org.xerial.snappy.SnappyInputStream; /** * @author jonathan.colt */ public class AmzaPartitionClient<C, E extends Throwable> implements PartitionClient { private static final MetricLogger LOG = MetricLoggerFactory.getLogger(); private final PartitionName partitionName; private final AmzaClientCallRouter<C, E> partitionCallRouter; private final RemotePartitionCaller<C, E> remotePartitionCaller; private final long awaitLeaderElectionForNMillis; private final long debugClientCount; private final long debugClientCountInterval; private volatile long lastDebugClientTime = 0; public AmzaPartitionClient(PartitionName partitionName, AmzaClientCallRouter<C, E> partitionCallRouter, RemotePartitionCaller<C, E> remotePartitionCaller, long awaitLeaderElectionForNMillis, long debugClientCount, long debugClientCountInterval) throws IOException { this.partitionName = partitionName; this.partitionCallRouter = partitionCallRouter; this.remotePartitionCaller = remotePartitionCaller; this.awaitLeaderElectionForNMillis = awaitLeaderElectionForNMillis; this.debugClientCount = debugClientCount; this.debugClientCountInterval = debugClientCountInterval; } @Override public void commit(Consistency consistency, byte[] prefix, ClientUpdates updates, long additionalSolverAfterNMillis, long abandonSolutionAfterNMillis, Optional<List<String>> solutionLog) throws Exception { partitionCallRouter.write(solutionLog.orElse(null), partitionName, consistency, "commit", (leader, ringMember, client) -> { return remotePartitionCaller.commit(leader, ringMember, client, consistency, prefix, updates, abandonSolutionAfterNMillis); }, answer -> true, awaitLeaderElectionForNMillis, additionalSolverAfterNMillis, abandonSolutionAfterNMillis); } @Override public long getApproximateCount(Consistency consistency, long additionalSolverAfterNMillis, long abandonLeaderSolutionAfterNMillis, long abandonSolutionAfterNMillis, Optional<List<String>> solutionLog) throws Exception { return partitionCallRouter.read(solutionLog.orElse(null), partitionName, consistency, "approximateCount", remotePartitionCaller::getApproximateCount, (answers) -> { long maxApproximateCount = -1; for (RingMemberAndHostAnswer<CloseableLong> answer : answers) { CloseableLong a = answer.getAnswer(); maxApproximateCount = Math.max(maxApproximateCount, a.getLong()); } return maxApproximateCount; }, awaitLeaderElectionForNMillis, additionalSolverAfterNMillis, abandonLeaderSolutionAfterNMillis, abandonSolutionAfterNMillis); } @Override public boolean get(Consistency consistency, byte[] prefix, UnprefixedWALKeys keys, KeyValueTimestampStream valuesStream, long additionalSolverAfterNMillis, long abandonLeaderSolutionAfterNMillis, long abandonSolutionAfterNMillis, Optional<List<String>> solutionLog) throws Exception { return getInternal(consistency, prefix, keys, (prefix1, key, value, valueTimestamp, valueTombstoned, valueVersion) -> { return valueTombstoned || valuesStream.stream(prefix1, key, value, valueTimestamp, valueVersion); }, additionalSolverAfterNMillis, abandonLeaderSolutionAfterNMillis, abandonSolutionAfterNMillis, solutionLog); } @Override public boolean getOffset(Consistency consistency, byte[] prefix, OffsetUnprefixedWALKeys keys, KeyValueTimestampStream valuesStream, long additionalSolverAfterNMillis, long abandonLeaderSolutionAfterNMillis, long abandonSolutionAfterNMillis, Optional<List<String>> solutionLog) throws Exception { return getInternal(consistency, prefix, keys, (prefix1, key, value, valueTimestamp, valueTombstoned, valueVersion) -> { return valueTombstoned || valuesStream.stream(prefix1, key, value, valueTimestamp, valueVersion); }, additionalSolverAfterNMillis, abandonLeaderSolutionAfterNMillis, abandonSolutionAfterNMillis, solutionLog); } @Override public boolean getRaw(Consistency consistency, byte[] prefix, UnprefixedWALKeys keys, KeyValueStream valuesStream, long additionalSolverAfterNMillis, long abandonLeaderSolutionAfterNMillis, long abandonSolutionAfterNMillis, Optional<List<String>> solutionLog) throws Exception { return getInternal(consistency, prefix, keys, valuesStream, additionalSolverAfterNMillis, abandonLeaderSolutionAfterNMillis, abandonSolutionAfterNMillis, solutionLog); } private boolean getInternal(Consistency consistency, byte[] prefix, UnprefixedWALKeys keys, KeyValueStream stream, long additionalSolverAfterNMillis, long abandonLeaderSolutionAfterNMillis, long abandonSolutionAfterNMillis, Optional<List<String>> solutionLog) throws Exception { return getInternalCall(consistency, stream, additionalSolverAfterNMillis, abandonLeaderSolutionAfterNMillis, abandonSolutionAfterNMillis, solutionLog, (leader, ringMember, client) -> { return remotePartitionCaller.get(leader, ringMember, client, consistency, prefix, keys); }); } private boolean getInternal(Consistency consistency, byte[] prefix, OffsetUnprefixedWALKeys keys, KeyValueStream stream, long additionalSolverAfterNMillis, long abandonLeaderSolutionAfterNMillis, long abandonSolutionAfterNMillis, Optional<List<String>> solutionLog) throws Exception { return getInternalCall(consistency, stream, additionalSolverAfterNMillis, abandonLeaderSolutionAfterNMillis, abandonSolutionAfterNMillis, solutionLog, (leader, ringMember, client) -> { return remotePartitionCaller.getOffset(leader, ringMember, client, consistency, prefix, keys); }); } private boolean getInternalCall(Consistency consistency, KeyValueStream stream, long additionalSolverAfterNMillis, long abandonLeaderSolutionAfterNMillis, long abandonSolutionAfterNMillis, Optional<List<String>> solutionLog, PartitionCall<C, CloseableStreamResponse, E> partitionCall) throws Exception { byte[] intLongBuffer = new byte[8]; partitionCallRouter.read(solutionLog.orElse(null), partitionName, consistency, "get", partitionCall, (answers) -> { List<FilerInputStream> streams = Lists.newArrayList( Lists.transform(answers, input -> { CloseableStreamResponse streamResponse = input.getAnswer(); debugStreamResponse(streamResponse); return new FilerInputStream(streamResponse.getInputStream()); })); int eosed = 0; while (streams.size() > 0 && eosed == 0) { byte[] latestPrefix = null; byte[] latestKey = null; byte[] latestValue = null; long latestTimestamp = Long.MIN_VALUE; boolean latestTombstoned = false; long latestVersion = Long.MIN_VALUE; for (FilerInputStream fis : streams) { if (!UIO.readBoolean(fis, "eos")) { byte[] p = UIO.readByteArray(fis, "prefix", intLongBuffer); byte[] k = UIO.readByteArray(fis, "key", intLongBuffer); byte[] v = UIO.readByteArray(fis, "value", intLongBuffer); long t = UIO.readLong(fis, "timestamp", intLongBuffer); boolean d = UIO.readBoolean(fis, "tombstone"); long z = UIO.readLong(fis, "version", intLongBuffer); int c = CompareTimestampVersions.compare(t, z, latestTimestamp, latestVersion); if (c > 0) { latestPrefix = p; latestKey = k; latestValue = v; latestTimestamp = t; latestTombstoned = d; latestVersion = z; } } else { eosed++; } } if (eosed > 0 && eosed < answers.size()) { throw new RuntimeException("Mismatched response lengths"); } if (eosed == 0 && !stream.stream(latestPrefix, latestKey, latestValue, latestTimestamp, latestTombstoned, latestVersion)) { break; } } return null; }, awaitLeaderElectionForNMillis, additionalSolverAfterNMillis, abandonLeaderSolutionAfterNMillis, abandonSolutionAfterNMillis); return true; } @Override public boolean scan(Consistency consistency, boolean compressed, PrefixedKeyRanges ranges, KeyValueTimestampStream scan, long additionalSolverAfterNMillis, long abandonLeaderSolutionAfterNMillis, long abandonSolutionAfterNMillis, Optional<List<String>> solutionLog) throws Exception { return scanInternal(consistency, compressed, ranges, null, scan, true, additionalSolverAfterNMillis, abandonLeaderSolutionAfterNMillis, abandonSolutionAfterNMillis, solutionLog); } @Override public boolean scanFiltered(Consistency consistency, boolean compressed, PrefixedKeyRanges ranges, KeyValueFilter filter, KeyValueTimestampStream scan, long additionalSolverAfterNMillis, long abandonLeaderSolutionAfterNMillis, long abandonSolutionAfterNMillis, Optional<List<String>> solutionLog) throws Exception { return scanInternal(consistency, compressed, ranges, filter, scan, true, additionalSolverAfterNMillis, abandonLeaderSolutionAfterNMillis, abandonSolutionAfterNMillis, solutionLog); } @Override public boolean scanKeys(Consistency consistency, boolean compressed, PrefixedKeyRanges ranges, KeyValueTimestampStream scan, long additionalSolverAfterNMillis, long abandonLeaderSolutionAfterNMillis, long abandonSolutionAfterNMillis, Optional<List<String>> solutionLog) throws Exception { return scanInternal(consistency, compressed, ranges, null, scan, false, additionalSolverAfterNMillis, abandonLeaderSolutionAfterNMillis, abandonSolutionAfterNMillis, solutionLog); } private boolean scanInternal(Consistency consistency, boolean compressed, PrefixedKeyRanges ranges, KeyValueFilter filter, KeyValueTimestampStream stream, boolean hydrateValues, long additionalSolverAfterNMillis, long abandonLeaderSolutionAfterNMillis, long abandonSolutionAfterNMillis, Optional<List<String>> solutionLog) throws Exception { boolean merge; if (consistency == Consistency.leader_plus_one || consistency == Consistency.leader_quorum || consistency == Consistency.quorum || consistency == Consistency.write_one_read_all) { merge = true; } else { merge = false; } byte[] intLongBuffer = new byte[8]; PartitionCall<C, CloseableStreamResponse, E> partitionCall = (leader, ringMember, client) -> { return remotePartitionCaller.scan(leader, ringMember, client, consistency, compressed, ranges, filter, hydrateValues); }; KeyValueStream keyValueStream = (prefix, key, value, valueTimestamp, valueTombstoned, valueVersion) -> { return valueTombstoned || stream.stream(prefix, key, value, valueTimestamp, valueVersion); }; String family = filter != null ? "scanFiltered" : hydrateValues ? "scan" : "scanKeys"; return partitionCallRouter.read(solutionLog.orElse(null), partitionName, consistency, family, partitionCall, (answers) -> { List<FilerInputStream> streams = Lists.newArrayList(Lists.transform(answers, input -> { try { CloseableStreamResponse streamResponse = input.getAnswer(); debugStreamResponse(streamResponse); InputStream answerInputStream = streamResponse.getInputStream(); InputStream inputStream = compressed ? new SnappyInputStream(new BufferedInputStream(answerInputStream, 8192)) : answerInputStream; return new FilerInputStream(inputStream); } catch (IOException e) { throw new RuntimeException(e); } })); int size = streams.size(); if (merge && size > 1) { while (true) { int eosRange = 0; for (int i = 0; i < size; i++) { FilerInputStream fis = streams.get(i); if (UIO.readBoolean(fis, "eosRange")) { eosRange++; } } if (eosRange == size) { break; } else if (eosRange > 0) { throw new IllegalStateException("Answers returned mismatched ranges"); } boolean[] eos = new boolean[size]; QuorumScan quorumScan = new QuorumScan(size); int eosed = 0; while (eosed < size) { for (int i = 0; i < size; i++) { if (quorumScan.used(i) && !eos[i]) { FilerInputStream fis = streams.get(i); eos[i] = UIO.readBoolean(fis, "eos"); if (!eos[i]) { quorumScan.fill(i, UIO.readByteArray(fis, "prefix", intLongBuffer), UIO.readByteArray(fis, "key", intLongBuffer), hydrateValues ? UIO.readByteArray(fis, "value", intLongBuffer) : null, UIO.readLong(fis, "timestamp", intLongBuffer), UIO.readBoolean(fis, "tombstone"), UIO.readLong(fis, "version", intLongBuffer)); } else { eosed++; } } } int wi = quorumScan.findWinningIndex(); if (wi > -1 && !quorumScan.stream(wi, keyValueStream)) { return false; } } int wi; while ((wi = quorumScan.findWinningIndex()) > -1) { if (!quorumScan.stream(wi, keyValueStream)) { return false; } } } LOG.debug("Merged {}", answers.size()); return true; } else if (size == 1) { FilerInputStream fis = streams.get(0); while (!UIO.readBoolean(fis, "eosRange")) { while (!UIO.readBoolean(fis, "eos")) { byte[] prefix = UIO.readByteArray(fis, "prefix", intLongBuffer); byte[] key = UIO.readByteArray(fis, "key", intLongBuffer); byte[] value = hydrateValues ? UIO.readByteArray(fis, "value", intLongBuffer) : null; long timestamp = UIO.readLong(fis, "timestamp", intLongBuffer); boolean tombstoned = UIO.readBoolean(fis, "tombstone"); long version = UIO.readLong(fis, "version", intLongBuffer); if (!tombstoned && !stream.stream(prefix, key, value, timestamp, version)) { return false; } } } return true; } throw new RuntimeException("Failed to scan."); }, awaitLeaderElectionForNMillis, additionalSolverAfterNMillis, abandonLeaderSolutionAfterNMillis, abandonSolutionAfterNMillis); } @Override public TakeResult takeFromTransactionId(List<RingMember> membersInOrder, Map<RingMember, Long> membersTxId, int limit, Highwaters highwaters, TxKeyValueStream stream, long additionalSolverAfterNMillis, long abandonSolutionAfterNMillis, Optional<List<String>> solutionLog) throws Exception { byte[] intLongBuffer = new byte[8]; return partitionCallRouter.take(solutionLog.orElse(null), partitionName, membersInOrder, "takeFromTransactionId", (leader, ringMember, client) -> { return remotePartitionCaller.takeFromTransactionId(leader, ringMember, client, membersTxId, limit); }, (answers) -> { List<FilerInputStream> streams = Lists.newArrayList( Lists.transform(answers, input -> { CloseableStreamResponse streamResponse = input.getAnswer(); debugStreamResponse(streamResponse); return new FilerInputStream(streamResponse.getInputStream()); })); if (streams.isEmpty()) { throw new RuntimeException("Failed to takeFromTransactionId."); } return take(streams.get(0), highwaters, stream, intLongBuffer); }, awaitLeaderElectionForNMillis, additionalSolverAfterNMillis, abandonSolutionAfterNMillis); } @Override public TakeResult takePrefixFromTransactionId(List<RingMember> membersInOrder, byte[] prefix, Map<RingMember, Long> membersTxId, int limit, Highwaters highwaters, TxKeyValueStream stream, long additionalSolverAfterNMillis, long abandonSolutionAfterNMillis, Optional<List<String>> solutionLog) throws Exception { byte[] intLongBuffer = new byte[8]; return partitionCallRouter.take(solutionLog.orElse(null), partitionName, membersInOrder, "takePrefixFromTransactionId", (leader, ringMember, client) -> { return remotePartitionCaller.takePrefixFromTransactionId(leader, ringMember, client, prefix, membersTxId, limit); }, (answers) -> { List<FilerInputStream> streams = Lists.newArrayList( Lists.transform(answers, input -> { CloseableStreamResponse streamResponse = input.getAnswer(); debugStreamResponse(streamResponse); return new FilerInputStream(streamResponse.getInputStream()); })); if (streams.isEmpty()) { throw new RuntimeException("Failed to takePrefixFromTransactionId."); } return take(streams.get(0), highwaters, stream, intLongBuffer); }, awaitLeaderElectionForNMillis, additionalSolverAfterNMillis, abandonSolutionAfterNMillis); } private TakeResult take(FilerInputStream fis, Highwaters highwaters, TxKeyValueStream stream, byte[] intLongBuffer) throws Exception { long maxTxId = -1; byte[] ringMemberBytes = UIO.readByteArray(fis, "ringMember", intLongBuffer); RingMember ringMember = new RingMember(ringMemberBytes); TxResult done = null; while (!UIO.readBoolean(fis, "eos")) { RowType rowType = RowType.fromByte(UIO.readByte(fis, "type")); if (rowType == RowType.highwater) { highwaters.highwater(readHighwaters(fis, intLongBuffer)); } else if (rowType.isPrimary()) { long rowTxId = UIO.readLong(fis, "rowTxId", intLongBuffer); if (done != null && rowTxId > maxTxId) { // streamed to end of txId return new TakeResult(ringMember, maxTxId, null); } if (done != null) { if (done.isAccepted()) { // ignore result; lastTxId is unchanged stream.stream(rowTxId, UIO.readByteArray(fis, "prefix", intLongBuffer), UIO.readByteArray(fis, "key", intLongBuffer), UIO.readByteArray(fis, "value", intLongBuffer), UIO.readLong(fis, "timestampId", intLongBuffer), UIO.readBoolean(fis, "tombstoned"), UIO.readLong(fis, "version", intLongBuffer)); } } else { TxResult result = stream.stream(rowTxId, UIO.readByteArray(fis, "prefix", intLongBuffer), UIO.readByteArray(fis, "key", intLongBuffer), UIO.readByteArray(fis, "value", intLongBuffer), UIO.readLong(fis, "timestampId", intLongBuffer), UIO.readBoolean(fis, "tombstoned"), UIO.readLong(fis, "version", intLongBuffer)); if (result.isAccepted()) { maxTxId = Math.max(maxTxId, rowTxId); } if (!result.wantsMore()) { if (result.isAccepted()) { // stream to end of txId done = result; } else { // reject entire txId return new TakeResult(ringMember, maxTxId, null); } } } } } ringMemberBytes = UIO.readByteArray(fis, "ringMember", intLongBuffer); ringMember = new RingMember(ringMemberBytes); return new TakeResult(ringMember, UIO.readLong(fis, "lastTxId", intLongBuffer), readHighwaters(fis, intLongBuffer)); } private WALHighwater readHighwaters(FilerInputStream inputStream, byte[] intLongBuffer) throws Exception { int length = UIO.readInt(inputStream, "length", intLongBuffer); if (length == 0) { // did not take to end return null; } List<RingMemberHighwater> walHighwaters = new ArrayList<>(); for (int i = 0; i < length; i++) { byte[] ringMemberBytes = UIO.readByteArray(inputStream, "ringMember", intLongBuffer); RingMember ringMember = new RingMember(ringMemberBytes); long txId = UIO.readLong(inputStream, "txId", intLongBuffer); walHighwaters.add(new RingMemberHighwater(ringMember, txId)); } return new WALHighwater(walHighwaters); } private void debugStreamResponse(CloseableStreamResponse streamResponse) { long ctm = System.currentTimeMillis(); if (debugClientCountInterval >= 0) { long activeCount = streamResponse.getActiveCount(); if (activeCount >= debugClientCount) { if (ctm - lastDebugClientTime >= debugClientCountInterval) { LOG.info("Active client count: {}", activeCount); lastDebugClientTime = ctm; } } } } }