package com.jivesoftware.os.amza.service.replication.http;
import com.jivesoftware.os.amza.api.PartitionClient.KeyValueFilter;
import com.jivesoftware.os.amza.api.RingPartitionProperties;
import com.jivesoftware.os.amza.api.filer.IReadable;
import com.jivesoftware.os.amza.api.filer.IWriteable;
import com.jivesoftware.os.amza.api.filer.UIO;
import com.jivesoftware.os.amza.api.partition.Consistency;
import com.jivesoftware.os.amza.api.partition.PartitionName;
import com.jivesoftware.os.amza.api.partition.PartitionProperties;
import com.jivesoftware.os.amza.api.ring.RingMember;
import com.jivesoftware.os.amza.api.ring.RingMemberAndHost;
import com.jivesoftware.os.amza.api.stream.KeyValueStream;
import com.jivesoftware.os.amza.api.stream.PrefixedKeyRanges;
import com.jivesoftware.os.amza.api.stream.RowType;
import com.jivesoftware.os.amza.api.stream.TxKeyValueStream;
import com.jivesoftware.os.amza.api.stream.TxKeyValueStream.TxResult;
import com.jivesoftware.os.amza.api.take.Highwaters;
import com.jivesoftware.os.amza.api.take.TakeResult;
import com.jivesoftware.os.amza.api.wal.WALHighwater;
import com.jivesoftware.os.amza.service.NotARingMemberException;
import com.jivesoftware.os.amza.service.Partition;
import com.jivesoftware.os.amza.service.Partition.ScanRange;
import com.jivesoftware.os.amza.service.PartitionProvider;
import com.jivesoftware.os.amza.service.PropertiesNotPresentException;
import com.jivesoftware.os.amza.service.ring.AmzaRingReader;
import com.jivesoftware.os.amza.service.ring.AmzaRingWriter;
import com.jivesoftware.os.amza.service.ring.RingTopology;
import com.jivesoftware.os.mlogger.core.MetricLogger;
import com.jivesoftware.os.mlogger.core.MetricLoggerFactory;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;
import java.util.List;
/**
* @author jonathan.colt
*/
public class AmzaClientService implements AmzaRestClient {
private static final MetricLogger LOG = MetricLoggerFactory.getLogger();
private final AmzaRingReader ringReader;
private final AmzaRingWriter ringWriter;
private final PartitionProvider partitionProvider;
/**
 * Creates the service from its three collaborators, which are stored as given.
 *
 * @param ringReader used by this class to look up ring topologies and this node's ring member
 * @param ringWriter used by this class to ensure a sub-ring exists when configuring a partition
 * @param partitionProvider used by this class to create, look up and await partitions
 */
public AmzaClientService(AmzaRingReader ringReader,
    AmzaRingWriter ringWriter,
    PartitionProvider partitionProvider) {
    this.partitionProvider = partitionProvider;
    this.ringWriter = ringWriter;
    this.ringReader = ringReader;
}
/**
 * Pairs the number of entries in the partition's ring with the partition's properties.
 *
 * @param partitionName partition whose ring and properties are looked up
 * @return the ring entry count together with whatever {@code partitionProvider.getProperties} returns
 * @throws Exception propagated from the ring reader or the partition provider
 */
@Override
public RingPartitionProperties getProperties(PartitionName partitionName) throws Exception {
    byte[] ringName = partitionName.getRingName();
    int ringSize = ringReader.getRing(ringName, 0).entries.size();
    return new RingPartitionProperties(ringSize, partitionProvider.getProperties(partitionName));
}
/**
 * Ensures the partition's sub-ring exists, then creates the partition or, when
 * {@code createPartitionIfAbsent} reports that nothing was created, updates its properties.
 *
 * @param partitionName partition to configure
 * @param partitionProperties properties to create the partition with, or to apply to the existing one
 * @param ringSize size handed to {@code ensureSubRing}
 * @return the ring for the partition's ring name, read after the configuration calls
 * @throws Exception propagated from the ring writer, ring reader or partition provider
 */
@Override
public RingTopology configPartition(PartitionName partitionName,
    PartitionProperties partitionProperties,
    int ringSize) throws Exception {

    byte[] subRingName = partitionName.getRingName();
    ringWriter.ensureSubRing(subRingName, ringSize, 0);

    boolean created = partitionProvider.createPartitionIfAbsent(partitionName, partitionProperties);
    if (!created) {
        partitionProvider.updateProperties(partitionName, partitionProperties);
    }

    RingTopology ring = ringReader.getRing(partitionName.getRingName(), 0);
    return ring;
}
/**
 * Serializes a ring to {@code writeable}: the entry count, then for every entry its ring member bytes,
 * its ring host bytes and a "leader" byte that is always 0 in this method.
 *
 * @param ring ring whose entries are written
 * @param writeable destination of the serialized ring
 * @throws Exception propagated from the underlying writes
 */
@Override
public void configPartition(RingTopology ring, IWriteable writeable) throws Exception {
    byte[] scratch = new byte[4];
    UIO.writeInt(writeable, ring.entries.size(), "ringSize", scratch);
    for (RingMemberAndHost memberAndHost : ring.entries) {
        byte[] memberBytes = memberAndHost.ringMember.toBytes();
        UIO.writeByteArray(writeable, memberBytes, "ringMember", scratch);
        byte[] hostBytes = memberAndHost.ringHost.toBytes();
        UIO.writeByteArray(writeable, hostBytes, "ringHost", scratch);
        // No leader information is available here, so every entry is flagged as non-leader.
        UIO.writeByte(writeable, (byte) 0, "leader");
    }
}
/**
 * Calls {@code partitionProvider.awaitOnline} and then {@code partitionProvider.awaitLeader},
 * sharing a single time budget between the two calls.
 *
 * <p>The time spent in {@code awaitOnline} is subtracted from the budget before it is handed to
 * {@code awaitLeader}; the remainder is clamped at zero.
 *
 * @param partitionName partition to wait for
 * @param waitForLeaderElection total budget, in milliseconds, passed to {@code awaitOnline} and, minus
 * the elapsed time, to {@code awaitLeader}
 * @throws Exception propagated from either await call
 */
@Override
public void ensurePartition(PartitionName partitionName, long waitForLeaderElection) throws Exception {
    // System.nanoTime() is monotonic; System.currentTimeMillis() can jump when the wall clock is adjusted,
    // which could make the elapsed time negative and hand awaitLeader more than the caller's budget.
    long startNanos = System.nanoTime();
    partitionProvider.awaitOnline(partitionName, waitForLeaderElection);
    long elapsedMillis = (System.nanoTime() - startNanos) / 1_000_000L;
    partitionProvider.awaitLeader(partitionName, Math.max(0, waitForLeaderElection - elapsedMillis));
}
/**
 * Returns the ring for the partition's ring name with no leader attached.
 *
 * @param partitionName partition whose ring is looked up
 * @return a {@code RingLeader} whose leader is {@code null}
 * @throws Exception propagated from the ring reader
 */
@Override
public RingLeader ring(PartitionName partitionName) throws Exception {
    RingTopology topology = ringReader.getRing(partitionName.getRingName(), 0);
    return new RingLeader(topology, null);
}
/**
 * Returns the ring for the partition's ring name together with its leader.
 *
 * <p>For a system partition no leader is awaited and the leader is {@code null}; otherwise the leader is
 * whatever {@code partitionProvider.awaitLeader} returns.
 *
 * @param partitionName partition whose ring and leader are looked up
 * @param waitForLeaderElection timeout handed to {@code awaitLeader}
 * @return the ring and the (possibly {@code null}) leader
 * @throws Exception propagated from the partition provider or the ring reader
 */
@Override
public RingLeader ringLeader(PartitionName partitionName, long waitForLeaderElection) throws Exception {
    RingMember leader = null;
    if (!partitionName.isSystemPartition()) {
        leader = partitionProvider.awaitLeader(partitionName, waitForLeaderElection);
    }
    RingTopology topology = ringReader.getRing(partitionName.getRingName(), 0);
    return new RingLeader(topology, leader);
}
/**
 * Serializes a ring and its leader to {@code writeable}: the entry count, then for every entry its ring
 * member bytes, its ring host bytes and a "leader" byte that is 1 when the entry's ring member bytes equal
 * the leader's bytes and 0 otherwise (always 0 when there is no leader).
 *
 * @param ringLeader ring topology plus optional leader to serialize
 * @param writeable destination of the serialized ring
 * @throws IOException propagated from the underlying writes
 */
@Override
public void ring(RingLeader ringLeader, IWriteable writeable) throws IOException {
    byte[] lengthBuffer = new byte[4];
    UIO.writeInt(writeable, ringLeader.ringTopology.entries.size(), "ringSize", lengthBuffer);

    // The leader's bytes do not change between entries, so serialize them once instead of once per entry.
    byte[] leaderBytes = ringLeader.leader == null ? null : ringLeader.leader.toBytes();

    for (RingMemberAndHost entry : ringLeader.ringTopology.entries) {
        // Reused for both the write and the leader comparison.
        byte[] memberBytes = entry.ringMember.toBytes();
        UIO.writeByteArray(writeable, memberBytes, "ringMember", lengthBuffer);
        UIO.writeByteArray(writeable, entry.ringHost.toBytes(), "ringHost", lengthBuffer);
        boolean isLeader = leaderBytes != null && Arrays.equals(memberBytes, leaderBytes);
        UIO.writeByte(writeable, isLeader ? (byte) 1 : (byte) 0, "leader");
    }
}
/**
 * Commits a stream of key/value updates read from {@code read} into the partition.
 *
 * <p>The readiness check runs first; if it reports a problem that result is returned and nothing is read.
 * Otherwise the input is consumed as: prefix, timeoutInMillis, then repeated records of
 * (eos flag, key, value, valueTimestamp, valueTombstoned) until an eos flag of {@code true} is read or
 * the commit stream declines a record.
 *
 * @param partitionName partition to commit into
 * @param consistency consistency passed to the readiness check and to {@code partition.commit}
 * @param checkLeader passed through to the readiness check
 * @param partitionAwaitOnlineTimeoutMillis passed through to the readiness check
 * @param read source of the serialized updates
 * @return the readiness problem, or {@code null} once {@code partition.commit} has returned
 * @throws Exception propagated from reading or committing
 */
@Override
public StateMessageCause commit(PartitionName partitionName,
    Consistency consistency,
    boolean checkLeader,
    long partitionAwaitOnlineTimeoutMillis,
    IReadable read) throws Exception {

    StateMessageCause notReady = checkForReadyState(partitionName, consistency, checkLeader, partitionAwaitOnlineTimeoutMillis);
    if (notReady != null) {
        return notReady;
    }

    Partition partition = partitionProvider.getPartition(partitionName);
    byte[] scratch = new byte[8];
    byte[] prefix = UIO.readByteArray(read, "prefix", scratch);
    long timeoutInMillis = UIO.readLong(read, "timeoutInMillis", scratch);

    partition.commit(consistency, prefix, updates -> {
        while (!UIO.readBoolean(read, "eos")) {
            // Fields are read in wire order before being handed to the commit stream.
            byte[] key = UIO.readByteArray(read, "key", scratch);
            byte[] value = UIO.readByteArray(read, "value", scratch);
            long valueTimestamp = UIO.readLong(read, "valueTimestamp", scratch);
            boolean valueTombstoned = UIO.readBoolean(read, "valueTombstoned");
            if (!updates.commit(key, value, valueTimestamp, valueTombstoned)) {
                return false;
            }
        }
        return true;
    }, timeoutInMillis);

    return null;
}
/**
 * Reports the readiness of the partition by delegating to {@code checkForReadyState}.
 *
 * @param partitionName partition to check
 * @param consistency consistency passed to the readiness check
 * @param checkLeader passed through to the readiness check
 * @param partitionAwaitOnlineTimeoutMillis passed through to the readiness check
 * @return the readiness problem, or {@code null} when none was detected
 */
@Override
public StateMessageCause status(PartitionName partitionName,
    Consistency consistency,
    boolean checkLeader,
    long partitionAwaitOnlineTimeoutMillis) {

    StateMessageCause readiness = checkForReadyState(partitionName, consistency, checkLeader, partitionAwaitOnlineTimeoutMillis);
    return readiness;
}
/**
 * Looks up a set of keys and streams the results to {@code out}.
 *
 * <p>Input: a prefix followed by (eos flag, key) records, ending with an eos flag of {@code true}.
 * Output: for every result delivered by {@code partition.get} an eos byte of 0 followed by prefix, key,
 * value, timestamp, tombstoned byte and version; a final eos byte of 1 is written after the lookup returns.
 *
 * @param partitionName partition to read from
 * @param consistency consistency passed to {@code partition.get}
 * @param in source of the prefix and keys
 * @param out destination of the results
 * @throws Exception propagated from reading, the lookup or writing
 */
@Override
public void get(PartitionName partitionName, Consistency consistency, IReadable in, IWriteable out) throws Exception {
    Partition partition = partitionProvider.getPartition(partitionName);
    byte[] scratch = new byte[8];
    byte[] prefix = UIO.readByteArray(in, "prefix", scratch);

    partition.get(consistency,
        prefix,
        true,
        keys -> {
            while (!UIO.readBoolean(in, "eos")) {
                byte[] requestedKey = UIO.readByteArray(in, "key", scratch);
                if (!keys.stream(requestedKey)) {
                    return false;
                }
            }
            return true;
        },
        (resultPrefix, key, value, timestamp, tombstoned, version) -> {
            UIO.writeByte(out, (byte) 0, "eos");
            UIO.writeByteArray(out, resultPrefix, "prefix", scratch);
            UIO.writeByteArray(out, key, "key", scratch);
            UIO.writeByteArray(out, value, "value", scratch);
            UIO.writeLong(out, timestamp, "timestamp", scratch);
            byte tombstonedFlag = (byte) (tombstoned ? 1 : 0);
            UIO.writeByte(out, tombstonedFlag, "tombstoned");
            UIO.writeLong(out, version, "version", scratch);
            return true;
        });

    // Terminates the result stream.
    UIO.writeByte(out, (byte) 1, "eos");
}
/**
 * Looks up a set of keys and streams back, for each result, a requested slice of the value.
 *
 * <p>Input: a prefix followed by (eos flag, key, offset, length) records, ending with an eos flag of
 * {@code true}. Each (offset, length) pair is queued as its key is handed to the lookup and dequeued, in
 * FIFO order, as each result arrives.
 *
 * <p>Output per result: eos byte of 0, prefix, key, the value slice, timestamp, tombstoned byte, version.
 * The slice is the whole value when the value is {@code null} or the request covers it from offset 0;
 * {@code null} when the offset is at or past the end of the value; otherwise the bytes from offset, capped
 * at the end of the value. A final eos byte of 1 is written after the lookup returns.
 *
 * <p>NOTE(review): negative offsets or lengths read from {@code in} are not validated here — confirm how
 * {@code UIO.writeByteArray} treats them.
 *
 * @param partitionName partition to read from
 * @param consistency consistency passed to {@code partition.get}
 * @param in source of the prefix, keys, offsets and lengths
 * @param out destination of the results
 * @throws Exception propagated from reading, the lookup or writing
 */
@Override
public void getOffset(PartitionName partitionName, Consistency consistency, IReadable in, IWriteable out) throws Exception {
    Partition partition = partitionProvider.getPartition(partitionName);
    byte[] scratch = new byte[8];
    byte[] prefix = UIO.readByteArray(in, "prefix", scratch);
    Deque<int[]> pendingSlices = new ArrayDeque<>();

    partition.get(consistency,
        prefix,
        true,
        keys -> {
            while (!UIO.readBoolean(in, "eos")) {
                byte[] requestedKey = UIO.readByteArray(in, "key", scratch);
                int requestedOffset = UIO.readInt(in, "offset", scratch);
                int requestedLength = UIO.readInt(in, "length", scratch);
                // Queue the slice before streaming the key so it is available when the result arrives.
                pendingSlices.addLast(new int[] { requestedOffset, requestedLength });
                if (!keys.stream(requestedKey)) {
                    return false;
                }
            }
            return true;
        },
        (resultPrefix, key, value, timestamp, tombstoned, version) -> {
            int[] slice = pendingSlices.removeFirst();
            int offset = slice[0];
            int length = slice[1];

            UIO.writeByte(out, (byte) 0, "eos");
            UIO.writeByteArray(out, resultPrefix, "prefix", scratch);
            UIO.writeByteArray(out, key, "key", scratch);

            boolean wholeValue = value == null || (offset == 0 && length >= value.length);
            if (wholeValue) {
                UIO.writeByteArray(out, value, "value", scratch);
            } else if (offset >= value.length) {
                UIO.writeByteArray(out, null, "value", scratch);
            } else {
                int remaining = value.length - offset;
                int available = Math.min(length, remaining);
                UIO.writeByteArray(out, value, offset, available, "value", scratch);
            }

            UIO.writeLong(out, timestamp, "timestamp", scratch);
            byte tombstonedFlag = (byte) (tombstoned ? 1 : 0);
            UIO.writeByte(out, tombstonedFlag, "tombstoned");
            UIO.writeLong(out, version, "version", scratch);
            return true;
        });

    // Terminates the result stream.
    UIO.writeByte(out, (byte) 1, "eos");
}
/**
 * Scans the given ranges of a partition and streams the entries to {@code out}, optionally passing each
 * entry through {@code filter} first.
 *
 * <p>Framing when {@code rangeBoundaries} is true: each range is opened with an "eosRange" byte of 0 and
 * closed with an "eos" byte of 1; an "eosRange" byte of 1 follows the last range (or the range at which
 * streaming was stopped). When it is false, a single "eos" byte of 1 is written after the last range (or
 * when streaming is stopped).
 *
 * <p>Each emitted entry is: "eos" byte of 0, prefix, key, value (only when {@code hydrateValues} is true),
 * timestamp, tombstoned byte, version.
 *
 * @param partitionName partition to scan
 * @param ranges ranges handed, in order, to the scan
 * @param rangeBoundaries whether per-range framing bytes are written
 * @param filter optional filter; when {@code null} entries go straight to the output
 * @param out destination of the serialized entries
 * @param hydrateValues passed to {@code partition.scan}; also controls whether values are written
 * @throws Exception propagated from the scan, the filter or writing
 */
@Override
public void scan(PartitionName partitionName,
List<ScanRange> ranges,
boolean rangeBoundaries,
KeyValueFilter filter,
IWriteable out,
boolean hydrateValues) throws Exception {
byte[] intLongBuffer = new byte[8];
Partition partition = partitionProvider.getPartition(partitionName);
// [0] accumulates value bytes of entries that reach the output stream,
// [1] accumulates value bytes of entries seen before filtering (filtered path only).
long[] scannedValuesCostInBytes = new long[2];
PrefixedKeyRanges prefixedKeyRanges = stream -> {
if (rangeBoundaries) {
for (ScanRange range : ranges) {
// Opens a range: "another range follows".
UIO.writeByte(out, (byte) 0, "eosRange");
boolean result = stream.stream(range.fromPrefix, range.fromKey, range.toPrefix, range.toKey);
// Closes the entries of this range.
UIO.writeByte(out, (byte) 1, "eos");
if (!result) {
// Streaming was stopped: terminate the range list before bailing out.
UIO.writeByte(out, (byte) 1, "eosRange");
return false;
}
}
// No more ranges.
UIO.writeByte(out, (byte) 1, "eosRange");
} else {
for (ScanRange range : ranges) {
if (!stream.stream(range.fromPrefix, range.fromKey, range.toPrefix, range.toKey)) {
// Streaming was stopped: terminate the entry list before bailing out.
UIO.writeByte(out, (byte) 1, "eos");
return false;
}
}
// All ranges share one entry list, terminated once here.
UIO.writeByte(out, (byte) 1, "eos");
}
return true;
};
// Serializes one entry to the output; always asks for more.
KeyValueStream keyValueStream = (prefix, key, value, timestamp, tombstoned, version) -> {
// Counted whenever a non-null value is delivered, whether or not it is written below.
scannedValuesCostInBytes[0] += value != null ? value.length : 0;
UIO.writeByte(out, (byte) 0, "eos");
UIO.writeByteArray(out, prefix, "prefix", intLongBuffer);
UIO.writeByteArray(out, key, "key", intLongBuffer);
if (hydrateValues) {
UIO.writeByteArray(out, value, "value", intLongBuffer);
}
UIO.writeLong(out, timestamp, "timestamp", intLongBuffer);
UIO.writeByte(out, tombstoned ? (byte) 1 : (byte) 0, "tombstoned");
UIO.writeLong(out, version, "version", intLongBuffer);
return true;
};
if (filter != null) {
partition.scan(
prefixedKeyRanges,
true,
hydrateValues,
(prefix, key, value, valueTimestamp, valueTombstoned, valueVersion) -> {
// Pre-filter accounting; the filter decides what, if anything, reaches keyValueStream.
scannedValuesCostInBytes[1] += value != null ? value.length : 0;
return filter.filter(prefix, key, value, valueTimestamp, valueTombstoned, valueVersion, keyValueStream);
});
LOG.inc("scan>filtered>calls");
// saved = bytes seen before filtering minus bytes that reached the output stream.
LOG.inc("scan>filtered>bytes>saved", scannedValuesCostInBytes[1] - scannedValuesCostInBytes[0]);
LOG.inc("scan>filtered>bytes>total", scannedValuesCostInBytes[0]);
LOG.inc("scan>filtered>bytes>original", scannedValuesCostInBytes[1]);
LOG.inc("scan>filtered>bytes>pow>" + UIO.chunkPower(scannedValuesCostInBytes[0], 0));
} else {
partition.scan(prefixedKeyRanges, true, hydrateValues, keyValueStream);
LOG.inc("scan>unfiltered>bytes>pow>" + UIO.chunkPower(scannedValuesCostInBytes[0], 0));
}
}
/**
 * Reads a transaction id from {@code in} and streams the partition's rows from that transaction id to
 * {@code out} via the non-prefixed take.
 *
 * @param partitionName partition to take from
 * @param limit row limit handed to the take; values of 0 or less do not stop the stream early
 * @param in source of the transaction id
 * @param out destination of the serialized rows
 * @throws Exception propagated from reading, the take or writing
 */
@Override
public void takeFromTransactionId(PartitionName partitionName, int limit, IReadable in, IWriteable out) throws Exception {
    byte[] scratch = new byte[8];
    long fromTxId = UIO.readLong(in, "transactionId", scratch);
    Partition source = partitionProvider.getPartition(partitionName);
    take(out, source, false, null, fromTxId, limit, scratch);
}
/**
 * Reads a prefix and a transaction id from {@code in} and streams the partition's rows for that prefix
 * from that transaction id to {@code out} via the prefixed take.
 *
 * @param partitionName partition to take from
 * @param limit row limit handed to the take; values of 0 or less do not stop the stream early
 * @param in source of the prefix and the transaction id, in that order
 * @param out destination of the serialized rows
 * @throws Exception propagated from reading, the take or writing
 */
@Override
public void takePrefixFromTransactionId(PartitionName partitionName, int limit, IReadable in, IWriteable out) throws Exception {
    byte[] scratch = new byte[8];
    Partition source = partitionProvider.getPartition(partitionName);
    byte[] prefix = UIO.readByteArray(in, "prefix", scratch);
    long fromTxId = UIO.readLong(in, "txId", scratch);
    take(out, source, true, prefix, fromTxId, limit, scratch);
}
/**
 * Returns the partition's own approximate count.
 *
 * @param partitionName partition to query
 * @return whatever {@code Partition.approximateCount()} reports
 * @throws Exception propagated from the partition provider or the partition
 */
@Override
public long approximateCount(PartitionName partitionName) throws Exception {
    return partitionProvider.getPartition(partitionName).approximateCount();
}
/**
 * Streams rows taken from a partition to {@code out}.
 *
 * <p>Output layout: this node's ring member bytes; then any number of records, each opened by an "eos"
 * byte of 0 and a row-type byte — highwater records carry the serialized highwaters, primary records carry
 * rowTxId, prefix, key, value, timestamp, tombstoned byte and version; then a trailer of "eos" byte 1,
 * the ring member taken from, the last transaction id, the took-to-end highwaters and a final "eos" byte 1.
 *
 * @param out destination of the serialized rows
 * @param partition partition to take from
 * @param usePrefix selects the prefixed take when true, the plain take otherwise
 * @param prefix prefix handed to the prefixed take; unused when {@code usePrefix} is false
 * @param txId transaction id handed to the take
 * @param limit when greater than 0, the stream is told to stop once this many primary rows were written
 * @param lengthBuffer scratch buffer handed to the {@code UIO} read/write helpers
 * @throws Exception propagated from the take or from writing
 */
private void take(IWriteable out,
    Partition partition,
    boolean usePrefix,
    byte[] prefix,
    long txId,
    int limit,
    byte[] lengthBuffer) throws Exception {

    RingMember self = ringReader.getRingMember();
    UIO.writeByteArray(out, self.toBytes(), "ringMember", lengthBuffer);

    Highwaters highwaterWriter = (highwater) -> {
        UIO.writeByte(out, (byte) 0, "eos");
        UIO.writeByte(out, RowType.highwater.toByte(), "type");
        writeHighwaters(out, highwater, lengthBuffer);
    };

    // Single-element array so the lambda below can mutate the counter.
    int[] rowsWritten = new int[1];
    TxKeyValueStream rowWriter = (rowTxId, rowPrefix, key, value, timestamp, tombstoned, version) -> {
        UIO.writeByte(out, (byte) 0, "eos");
        UIO.writeByte(out, RowType.primary.toByte(), "type");
        UIO.writeLong(out, rowTxId, "rowTxId", lengthBuffer);
        UIO.writeByteArray(out, rowPrefix, "prefix", lengthBuffer);
        UIO.writeByteArray(out, key, "key", lengthBuffer);
        UIO.writeByteArray(out, value, "value", lengthBuffer);
        UIO.writeLong(out, timestamp, "timestamp", lengthBuffer);
        UIO.writeByte(out, tombstoned ? (byte) 1 : (byte) 0, "tombstoned");
        UIO.writeLong(out, version, "version", lengthBuffer);
        rowsWritten[0]++;
        if (limit > 0 && rowsWritten[0] >= limit) {
            return TxResult.ACCEPT_AND_STOP;
        }
        return TxResult.MORE;
    };

    TakeResult takeResult = usePrefix
        ? partition.takePrefixFromTransactionId(prefix, txId, true, highwaterWriter, rowWriter)
        : partition.takeFromTransactionId(txId, true, highwaterWriter, rowWriter);

    // Trailer.
    UIO.writeByte(out, (byte) 1, "eos");
    UIO.writeByteArray(out, takeResult.tookFrom.toBytes(), "ringMember", lengthBuffer);
    UIO.writeLong(out, takeResult.lastTxId, "lastTxId", lengthBuffer);
    writeHighwaters(out, takeResult.tookToEnd, lengthBuffer);
    UIO.writeByte(out, (byte) 1, "eos");
}
/**
 * Serializes a highwater set: the number of entries followed by each entry's ring member bytes and
 * transaction id. A {@code null} highwater is written as a count of 0.
 *
 * @param out destination of the serialized highwaters
 * @param highwater highwaters to write, may be {@code null}
 * @param lengthBuffer scratch buffer handed to the {@code UIO} write helpers
 * @throws IOException propagated from the underlying writes
 */
private void writeHighwaters(IWriteable out, WALHighwater highwater, byte[] lengthBuffer) throws IOException {
    if (highwater == null) {
        UIO.writeInt(out, 0, "length", lengthBuffer);
        return;
    }

    UIO.writeInt(out, highwater.ringMemberHighwater.size(), "length", lengthBuffer);
    for (WALHighwater.RingMemberHighwater memberHighwater : highwater.ringMemberHighwater) {
        UIO.writeByteArray(out, memberHighwater.ringMember.toBytes(), "ringMember", lengthBuffer);
        UIO.writeLong(out, memberHighwater.transactionId, "txId", lengthBuffer);
    }
}
/**
 * Checks whether this node can serve a request for the partition.
 *
 * <p>First awaits the partition coming online; any exception is translated into a
 * {@code StateMessageCause} ({@code properties_not_present}, {@code not_a_ring_member} or
 * {@code failed_to_come_online}). Then, only when {@code checkLeader} is set and the consistency requires a
 * leader, the current leader is looked up with a zero timeout: a missing leader yields
 * {@code lacks_leader}, a leader other than this node yields {@code not_the_leader}, and a failure during
 * the lookup is logged and yields {@code error}.
 *
 * <p>If either wait fails with an {@link InterruptedException}, the thread's interrupt flag is restored
 * before the failure is reported, so callers further up the stack can still observe the interruption.
 *
 * @param partitionName partition to check
 * @param consistency consistency of the request; decides whether a leader is required
 * @param checkLeader whether the leader check should be performed at all
 * @param partitionAwaitOnlineTimeoutMillis timeout handed to {@code awaitOnline}
 * @return {@code null} when no problem was detected, otherwise a description of the problem
 */
private StateMessageCause checkForReadyState(PartitionName partitionName,
    Consistency consistency,
    boolean checkLeader,
    long partitionAwaitOnlineTimeoutMillis) {

    try {
        partitionProvider.awaitOnline(partitionName, partitionAwaitOnlineTimeoutMillis);
    } catch (PropertiesNotPresentException e) {
        return new StateMessageCause(partitionName, consistency, checkLeader, partitionAwaitOnlineTimeoutMillis,
            State.properties_not_present,
            "Properties for partition are not present.", e);
    } catch (NotARingMemberException e) {
        return new StateMessageCause(partitionName, consistency, checkLeader, partitionAwaitOnlineTimeoutMillis,
            State.not_a_ring_member,
            "This node is not a member of the requested ring.", e);
    } catch (Exception e) {
        if (e instanceof InterruptedException) {
            // The exception is converted into a return value, so keep the interruption visible.
            Thread.currentThread().interrupt();
        }
        return new StateMessageCause(partitionName, consistency, checkLeader, partitionAwaitOnlineTimeoutMillis,
            State.failed_to_come_online,
            "Partition didn't come online within the allotted time of " + partitionAwaitOnlineTimeoutMillis + "millis", e);
    }

    if (checkLeader && consistency.requiresLeader()) {
        try {
            RingMember leader = partitionProvider.awaitLeader(partitionName, 0);
            if (leader == null) {
                return new StateMessageCause(partitionName, consistency, checkLeader, partitionAwaitOnlineTimeoutMillis,
                    State.lacks_leader, "Lacks required leader.", null);
            }
            if (!leader.equals(ringReader.getRingMember())) {
                return new StateMessageCause(partitionName, consistency, checkLeader, partitionAwaitOnlineTimeoutMillis,
                    State.not_the_leader, "Leader has changed.", null);
            }
        } catch (Exception x) {
            if (x instanceof InterruptedException) {
                // Same reasoning as above: do not swallow the interruption.
                Thread.currentThread().interrupt();
            }
            Object[] vals = new Object[] { partitionName, consistency };
            LOG.warn("Failed while determining leader {} at {}. ", vals, x);
            return new StateMessageCause(partitionName, consistency, checkLeader, partitionAwaitOnlineTimeoutMillis,
                State.error, "Failed while determining leader: " + Arrays.toString(vals), x);
        }
    }
    return null;
}
}