package com.jivesoftware.os.amza.service.replication;
import com.google.common.collect.Maps;
import com.jivesoftware.os.amza.api.AmzaInterner;
import com.jivesoftware.os.amza.api.TimestampedValue;
import com.jivesoftware.os.amza.api.filer.UIO;
import com.jivesoftware.os.amza.api.partition.Durability;
import com.jivesoftware.os.amza.api.partition.PartitionName;
import com.jivesoftware.os.amza.api.partition.PartitionProperties;
import com.jivesoftware.os.amza.api.partition.VersionedPartitionName;
import com.jivesoftware.os.amza.api.ring.RingMember;
import com.jivesoftware.os.amza.api.wal.WALHighwater;
import com.jivesoftware.os.amza.api.wal.WALHighwater.RingMemberHighwater;
import com.jivesoftware.os.amza.api.wal.WALKey;
import com.jivesoftware.os.amza.api.wal.WALUpdated;
import com.jivesoftware.os.amza.service.stats.AmzaStats;
import com.jivesoftware.os.amza.service.storage.PartitionCreator;
import com.jivesoftware.os.amza.service.storage.SystemWALStorage;
import com.jivesoftware.os.amza.service.take.HighwaterStorage;
import com.jivesoftware.os.jive.utils.ordered.id.OrderIdProvider;
import com.jivesoftware.os.mlogger.core.MetricLogger;
import com.jivesoftware.os.mlogger.core.MetricLoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.Callable;
import java.util.concurrent.Semaphore;
import java.util.concurrent.atomic.AtomicLong;
public class PartitionBackedHighwaterStorage implements HighwaterStorage {
// Logger for this class; none of the code shown in this file references it.
private static final MetricLogger LOG = MetricLoggerFactory.getLogger();
// Stats sink notified when updates are counted or flushed with deltaIndex == -1.
private final AmzaStats amzaSystemStats;
// Stats sink notified when updates are counted or flushed for a stripe (deltaIndex != -1).
private final AmzaStats amzaStats;
// Interns the RingMember decoded from a stored key in getMember.
private final AmzaInterner memberInterner;
// Source of the ids written as both the timestamp and the version of every highwater row.
private final OrderIdProvider orderIdProvider;
// Embedded in every key built by walKey; setIfLarger ignores it, and local highwaters are stored under it.
private final RingMember rootRingMember;
// Consulted for a partition's durability; get, flush and flushLocal skip storage for ephemeral partitions.
private final PartitionCreator partitionCreator;
// Storage that holds the HIGHWATER_MARK_INDEX rows read and written by this class.
private final SystemWALStorage systemWALStorage;
// Callback handed to every systemWALStorage.update call made by this class.
private final WALUpdated walUpdated;
// Pending-update threshold below which a non-forced flush returns without writing.
private final long flushHighwatersAfterNUpdates;
// Total permits in bigBird: flush acquires all of them, delete/setIfLarger/clear/clearRing acquire one.
private final int numPermits = 1024;
private final Semaphore bigBird = new Semaphore(numPermits, true); // TODO expose to config
// In-memory highwater state, keyed by ring member and then by partition.
private final Map<RingMember, Map<VersionedPartitionName, HighwaterUpdates>> hostToPartitionToHighwaterUpdates = Maps.newConcurrentMap();
// In-memory local highwater state, keyed by partition (see setLocal/getLocal/flushLocal).
private final Map<VersionedPartitionName, LocalHighwater> localHighwaterUpdates = Maps.newConcurrentMap();
// One pending-update counter per stripe, indexed by deltaIndex; incremented by setIfLarger, decremented by flush.
private final AtomicLong[] stripeUpdatesSinceLastFlush;
// The equivalent pending-update counter used when deltaIndex == -1.
private final AtomicLong systemUpdatesSinceLastFlush = new AtomicLong();
public PartitionBackedHighwaterStorage(AmzaStats amzaSystemStats,
AmzaStats amzaStats,
AmzaInterner memberInterner,
OrderIdProvider orderIdProvider,
RingMember rootRingMember,
PartitionCreator partitionCreator,
SystemWALStorage systemWALStorage,
WALUpdated walUpdated,
long flushHighwatersAfterNUpdates,
int deltaStripeCount) {
this.amzaSystemStats = amzaSystemStats;
this.amzaStats = amzaStats;
this.memberInterner = memberInterner;
this.orderIdProvider = orderIdProvider;
this.rootRingMember = rootRingMember;
this.partitionCreator = partitionCreator;
this.systemWALStorage = systemWALStorage;
this.walUpdated = walUpdated;
this.flushHighwatersAfterNUpdates = flushHighwatersAfterNUpdates;
this.stripeUpdatesSinceLastFlush = new AtomicLong[deltaStripeCount];
for (int i = 0; i < deltaStripeCount; i++) {
stripeUpdatesSinceLastFlush[i] = new AtomicLong();
}
}
/**
 * Ad-hoc entry point: decodes one hard-coded base64 partition name and prints it to stdout.
 *
 * @param args ignored
 */
public static void main(String[] args) throws Exception {
    PartitionName decoded = new AmzaInterner().internPartitionNameBase64("AAAAAAABYQAAAARhNzIw");
    System.out.println(decoded);
}
/**
 * Drops the in-memory highwaters tracked for the partition under every member, then re-writes
 * each stored row whose key starts with the partition's key prefix as a tombstone.
 * Holds one {@code bigBird} permit for the duration.
 *
 * @param versionedPartitionName partition whose highwaters are removed
 */
@Override
public void delete(VersionedPartitionName versionedPartitionName) throws Exception {
    bigBird.acquire();
    try {
        hostToPartitionToHighwaterUpdates.values().forEach(perPartition -> perPartition.remove(versionedPartitionName));

        byte[] scanFrom = walKey(versionedPartitionName, null);
        byte[] scanTo = WALKey.prefixUpperExclusive(scanFrom);
        long tombstoneId = orderIdProvider.nextId();

        systemWALStorage.rangeScan(PartitionCreator.HIGHWATER_MARK_INDEX, null, scanFrom, null, scanTo,
            (rowPrefix, rowKey, rowValue, rowTimestamp, rowTombstoned, rowVersion) -> {
                // Already-tombstoned rows are rewritten too: a tombstone with a newer timestamp gives better consistency.
                systemWALStorage.update(PartitionCreator.HIGHWATER_MARK_INDEX, rowPrefix,
                    (ignored, stream) -> stream.row(-1, rowKey, rowValue, tombstoneId, true, tombstoneId),
                    walUpdated);
                return true;
            }, true);
    } finally {
        bigBird.release();
    }
}
/**
 * Builds the storage key for a highwater row.
 * Layout: 1 version byte (0), then length-prefixed partition bytes, length-prefixed root member
 * bytes and, only when {@code member} is non-null, length-prefixed member bytes.
 * Each length is written by {@code UIO.intBytes} into a 4-byte slot.
 *
 * @param versionedPartitionName partition the key belongs to
 * @param member ring member to append, or {@code null} to build just the partition/root prefix
 * @return the encoded key
 */
byte[] walKey(VersionedPartitionName versionedPartitionName, RingMember member) throws IOException {
    byte[] partitionBytes = versionedPartitionName.toBytes();
    byte[] rootBytes = rootRingMember.toBytes();
    boolean hasMember = member != null;
    byte[] memberBytes = hasMember ? member.toBytes() : null;

    int keyLength = 1 + 4 + partitionBytes.length + 4 + rootBytes.length;
    if (hasMember) {
        keyLength += 4 + memberBytes.length;
    }
    byte[] key = new byte[keyLength];

    int cursor = 0;
    key[cursor] = 0; // version
    cursor += 1;

    UIO.intBytes(partitionBytes.length, key, cursor);
    cursor += 4;
    System.arraycopy(partitionBytes, 0, key, cursor, partitionBytes.length);
    cursor += partitionBytes.length;

    UIO.intBytes(rootBytes.length, key, cursor);
    cursor += 4;
    System.arraycopy(rootBytes, 0, key, cursor, rootBytes.length);
    cursor += rootBytes.length;

    if (hasMember) {
        UIO.intBytes(memberBytes.length, key, cursor);
        cursor += 4;
        System.arraycopy(memberBytes, 0, key, cursor, memberBytes.length);
    }
    return key;
}
/**
 * Extracts the trailing ring member from a key in the layout written by {@code walKey}:
 * skips the version byte and the two length-prefixed sections that precede the member,
 * then interns the member bytes.
 *
 * @param rawMember a full key that includes the member section
 * @return the interned ring member
 */
RingMember getMember(byte[] rawMember) throws Exception {
    int partitionLengthAt = 1; // just past the version byte
    int rootLengthAt = partitionLengthAt + UIO.bytesInt(rawMember, partitionLengthAt) + 4;
    int memberLengthAt = rootLengthAt + UIO.bytesInt(rawMember, rootLengthAt) + 4;
    int memberLength = UIO.bytesInt(rawMember, memberLengthAt);
    return memberInterner.internRingMember(rawMember, memberLengthAt + 4, memberLength);
}
/**
 * Raises the in-memory highwater txId tracked for {@code (member, versionedPartitionName)} to
 * {@code highwaterTxId} if that is larger, and adds {@code updates} to the pending-update
 * counters that drive {@code flush}. Calls for {@code rootRingMember} are ignored.
 * Holds one {@code bigBird} permit for the duration.
 *
 * @param member ring member the highwater belongs to
 * @param versionedPartitionName partition the highwater belongs to
 * @param highwaterTxId candidate txId; only applied when larger than the tracked one
 * @param deltaIndex stripe index, or {@code -1} to use the system counter and stats
 * @param updates number of updates to add to the pending counters
 */
@Override
public void setIfLarger(RingMember member,
    VersionedPartitionName versionedPartitionName,
    long highwaterTxId,
    int deltaIndex,
    int updates) throws Exception {
    if (member.equals(rootRingMember)) {
        return;
    }
    bigBird.acquire();
    try {
        // The counter and stats bookkeeping stays inside the try: if it throws (for example an
        // out-of-range deltaIndex) the permit must still be released, otherwise flush(), which
        // needs every permit, would block forever.
        if (deltaIndex == -1) {
            long pending = systemUpdatesSinceLastFlush.addAndGet(updates);
            amzaSystemStats.highwater(0, -1, pending, pending / (double) flushHighwatersAfterNUpdates);
        } else {
            long pending = stripeUpdatesSinceLastFlush[deltaIndex].addAndGet(updates);
            amzaStats.highwater(deltaIndex, -1, pending, pending / (double) flushHighwatersAfterNUpdates);
        }

        Map<VersionedPartitionName, HighwaterUpdates> partitionHighwaterUpdates = hostToPartitionToHighwaterUpdates.computeIfAbsent(member,
            (t) -> Maps.newConcurrentMap());
        HighwaterUpdates highwaterUpdates = partitionHighwaterUpdates.computeIfAbsent(versionedPartitionName, (t) -> new HighwaterUpdates());
        highwaterUpdates.updateTxId(highwaterTxId);
        if (updates > 0) {
            // Per-partition, per-stripe count that flush() uses to decide which rows to write.
            highwaterUpdates.addDeltaUpdates(deltaIndex, updates);
        }
    } finally {
        bigBird.release();
    }
}
/**
 * If any highwaters are tracked in memory for {@code member}, writes a tombstone row for
 * {@code (versionedPartitionName, member)} and drops that partition's in-memory entry.
 * Does nothing when no map is tracked for the member. Holds one {@code bigBird} permit.
 *
 * @param member ring member whose highwater is cleared
 * @param versionedPartitionName partition whose highwater is cleared
 */
@Override
public void clear(RingMember member, VersionedPartitionName versionedPartitionName) throws Exception {
    bigBird.acquire();
    try {
        Map<VersionedPartitionName, HighwaterUpdates> tracked = hostToPartitionToHighwaterUpdates.get(member);
        if (tracked == null) {
            return;
        }
        long tombstoneId = orderIdProvider.nextId();
        systemWALStorage.update(PartitionCreator.HIGHWATER_MARK_INDEX, null,
            (highwater, stream) -> stream.row(-1, walKey(versionedPartitionName, member), null, tombstoneId, true, tombstoneId),
            walUpdated);
        tracked.remove(versionedPartitionName);
    } finally {
        bigBird.release();
    }
}
/**
 * Returns the highwater txId tracked for {@code (member, versionedPartitionName)}.
 * When nothing is tracked in memory yet, the value is seeded from storage (non-ephemeral
 * partitions only) or from {@code -1} before being returned.
 *
 * @param member ring member to look up
 * @param versionedPartitionName partition to look up
 * @return the tracked txId
 */
@Override
public long get(RingMember member, VersionedPartitionName versionedPartitionName) throws Exception {
    Map<VersionedPartitionName, HighwaterUpdates> tracked = hostToPartitionToHighwaterUpdates.computeIfAbsent(member,
        (unused) -> Maps.newConcurrentMap());

    HighwaterUpdates known = tracked.get(versionedPartitionName);
    if (known != null) {
        return known.getTxId();
    }

    long seedTxId = -1L;
    PartitionProperties properties = partitionCreator.getProperties(versionedPartitionName.getPartitionName());
    if (properties.durability != Durability.ephemeral) {
        TimestampedValue stored = systemWALStorage.getTimestampedValue(PartitionCreator.HIGHWATER_MARK_INDEX, null,
            walKey(versionedPartitionName, member));
        if (stored != null) {
            seedTxId = UIO.bytesLong(stored.getValue());
        }
    }

    // Another caller may have created the entry meanwhile; updateTxId only ever raises the value.
    HighwaterUpdates seeded = tracked.computeIfAbsent(versionedPartitionName, (unused) -> new HighwaterUpdates());
    seeded.updateTxId(seedTxId);
    return seeded.getTxId();
}
/**
 * Collects the stored highwaters for a partition by scanning every row under the partition's key prefix.
 * Rows that are tombstoned or carry a timestamp of {@code -1} are skipped.
 *
 * @param versionedPartitionName partition to scan
 * @param includeLocal when {@code false}, the row belonging to {@code rootRingMember} is left out
 * @return the collected per-member highwaters
 */
@Override
public WALHighwater getPartitionHighwater(VersionedPartitionName versionedPartitionName, boolean includeLocal) throws Exception {
    List<RingMemberHighwater> collected = new ArrayList<>();
    byte[] scanFrom = walKey(versionedPartitionName, null);
    byte[] scanTo = WALKey.prefixUpperExclusive(scanFrom);

    systemWALStorage.rangeScan(PartitionCreator.HIGHWATER_MARK_INDEX, null, scanFrom, null, scanTo,
        (rowPrefix, rowKey, rowValue, rowTimestamp, rowTombstoned, rowVersion) -> {
            if (rowTimestamp == -1 || rowTombstoned) {
                return true;
            }
            RingMember owner = getMember(rowKey);
            boolean wanted = includeLocal || !owner.equals(rootRingMember);
            if (wanted) {
                collected.add(new RingMemberHighwater(owner, UIO.bytesLong(rowValue)));
            }
            return true;
        }, true);

    return new WALHighwater(collected);
}
/**
 * Writes a tombstone row for every partition tracked in memory for {@code member}, then
 * forgets the member's in-memory state altogether. Holds one {@code bigBird} permit.
 *
 * @param member ring member whose highwaters are cleared
 */
@Override
public void clearRing(final RingMember member) throws Exception {
    bigBird.acquire();
    try {
        final Map<VersionedPartitionName, HighwaterUpdates> tracked = hostToPartitionToHighwaterUpdates.get(member);
        boolean hasTracked = tracked != null && !tracked.isEmpty();
        if (hasTracked) {
            systemWALStorage.update(PartitionCreator.HIGHWATER_MARK_INDEX, null,
                (highwater, stream) -> {
                    long tombstoneId = orderIdProvider.nextId();
                    for (VersionedPartitionName partition : tracked.keySet()) {
                        boolean keepGoing = stream.row(-1, walKey(partition, member), null, tombstoneId, true, tombstoneId);
                        if (!keepGoing) {
                            return false;
                        }
                    }
                    return true;
                }, walUpdated);
        }
        // The member's entry is removed whether or not any tombstones were written.
        hostToPartitionToHighwaterUpdates.remove(member);
    } finally {
        bigBird.release();
    }
}
/**
 * Writes to HIGHWATER_MARK_INDEX the current txId of every tracked (member, partition) pair
 * that belongs to a non-ephemeral partition and has a positive update count for
 * {@code deltaIndex}, then subtracts the flushed amounts from the pending counters.
 *
 * @param deltaIndex stripe index, or {@code -1} to use the system counter and stats
 * @param force when {@code true}, flush even if the pending count is below flushHighwatersAfterNUpdates
 * @param preFlush optional callback invoked inside the storage update before any row is written; may be {@code null}
 * @return {@code false} when the flush was skipped because it was not forced and the pending count
 *         was below the threshold; {@code true} otherwise
 */
@Override
public boolean flush(int deltaIndex, boolean force, Callable<Void> preFlush) throws Exception {
AtomicLong updatesSinceLastFlush;
AmzaStats stats;
if (deltaIndex == -1) {
updatesSinceLastFlush = systemUpdatesSinceLastFlush;
stats = amzaSystemStats;
} else {
updatesSinceLastFlush = stripeUpdatesSinceLastFlush[deltaIndex];
stats = amzaStats;
}
// Cheap check before taking any permits.
if (!force && updatesSinceLastFlush.get() < flushHighwatersAfterNUpdates) {
return false;
}
// Takes every permit, so no single-permit holder (delete/setIfLarger/clear/clearRing) runs concurrently.
bigBird.acquire(numPermits);
try {
// Re-read under the permits: the count may have changed while waiting to acquire.
long flushedUpdates = updatesSinceLastFlush.get();
if (!force && flushedUpdates < flushHighwatersAfterNUpdates) {
return false;
} else {
systemWALStorage.update(PartitionCreator.HIGHWATER_MARK_INDEX, null,
(highwater, scan) -> {
if (preFlush != null) {
preFlush.call();
}
// One id is shared as timestamp and version by every row written in this flush.
long timestampAndVersion = orderIdProvider.nextId();
for (Entry<RingMember, Map<VersionedPartitionName, HighwaterUpdates>> ringEntry : hostToPartitionToHighwaterUpdates.entrySet()) {
RingMember ringMember = ringEntry.getKey();
for (Map.Entry<VersionedPartitionName, HighwaterUpdates> partitionEntry : ringEntry.getValue().entrySet()) {
VersionedPartitionName versionedPartitionName = partitionEntry.getKey();
PartitionProperties properties = partitionCreator.getProperties(versionedPartitionName.getPartitionName());
if (properties.durability != Durability.ephemeral) {
HighwaterUpdates highwaterUpdates = partitionEntry.getValue();
if (highwaterUpdates != null) {
// Only pairs with a positive update count for this deltaIndex are written.
AtomicLong updates = highwaterUpdates.updates.get(deltaIndex);
if (updates != null && updates.get() > 0) {
long txId = highwaterUpdates.getTxId();
long total = updates.get();
if (!scan.row(-1, walKey(versionedPartitionName, ringMember),
UIO.longBytes(txId), timestampAndVersion, false, timestampAndVersion)) {
// Stops early; pairs not yet visited keep their per-stripe counts.
return false;
}
// NOTE(review): txId was just read from this same object and updateTxId only raises the value,
// so this call appears to have no effect - confirm before removing.
highwaterUpdates.updateTxId(txId);
highwaterUpdates.addDeltaUpdates(deltaIndex, -total);
}
}
}
}
}
return true;
}, walUpdated);
// Subtract only what was observed before the flush, and report it to the stats sink.
long pending = updatesSinceLastFlush.addAndGet(-flushedUpdates);
stats.highwater(deltaIndex == -1 ? 0 : deltaIndex, flushedUpdates, pending, pending / (double) flushHighwatersAfterNUpdates);
return true;
}
} finally {
bigBird.release(numPermits);
}
}
/**
 * In-memory highwater state for one (member, partition) pair: the largest txId seen so far
 * plus a per-deltaIndex count of updates.
 */
private static class HighwaterUpdates {

    private final AtomicLong txId;
    private final Map<Integer, AtomicLong> updates = Maps.newConcurrentMap();

    public HighwaterUpdates() {
        this.txId = new AtomicLong(-1L);
    }

    /**
     * Raises the stored txId to {@code txId} when it is larger; otherwise leaves it untouched.
     * Retries the compare-and-set until it wins or the stored value is no longer smaller.
     */
    public void updateTxId(long txId) {
        for (long observed = this.txId.longValue(); txId > observed; observed = this.txId.get()) {
            if (this.txId.compareAndSet(observed, txId)) {
                return;
            }
        }
    }

    /**
     * Adds {@code updates} (which may be negative) to the counter for {@code deltaIndex},
     * creating the counter on first use.
     *
     * @return the counter value after the addition
     */
    public long addDeltaUpdates(int deltaIndex, long updates) {
        AtomicLong counter = this.updates.computeIfAbsent(deltaIndex, index -> new AtomicLong());
        return counter.addAndGet(updates);
    }

    /** Returns the largest txId recorded so far, or {@code -1} if none has been recorded. */
    public long getTxId() {
        return txId.get();
    }
}
/**
 * Raises the in-memory local highwater for the partition to {@code highwaterTxId} when that is
 * larger than the current value, creating the entry on first use.
 *
 * @param versionedPartitionName partition whose local highwater is updated
 * @param highwaterTxId candidate txId
 */
@Override
public void setLocal(VersionedPartitionName versionedPartitionName, long highwaterTxId) {
    localHighwaterUpdates
        .computeIfAbsent(versionedPartitionName, partition -> new LocalHighwater())
        .highwaterTxId
        .accumulateAndGet(highwaterTxId, Math::max);
}
/**
 * Returns the local highwater txId for the partition. When the in-memory value is still
 * {@code LOCAL_NONE}, the stored row for {@code rootRingMember} is read and, unless another
 * thread set a value meanwhile, used to seed both the in-memory and the flushed txId.
 *
 * @param versionedPartitionName partition to look up
 * @return the local highwater txId, or {@code LOCAL_NONE} when none is known or stored
 */
@Override
public long getLocal(VersionedPartitionName versionedPartitionName) throws Exception {
    LocalHighwater local = localHighwaterUpdates.computeIfAbsent(versionedPartitionName, partition -> new LocalHighwater());

    long inMemory = local.highwaterTxId.get();
    if (inMemory != LOCAL_NONE) {
        return inMemory;
    }

    // Storage is read before taking the monitor: flushLocal takes the monitor from inside a
    // storage update, so reading storage while holding it would invert the lock order.
    TimestampedValue stored = systemWALStorage.getTimestampedValue(PartitionCreator.HIGHWATER_MARK_INDEX, null,
        walKey(versionedPartitionName, rootRingMember));

    synchronized (local) {
        long raced = local.highwaterTxId.get();
        if (raced != LOCAL_NONE) {
            // Someone else populated the value while storage was being read.
            return raced;
        }
        long seeded = inMemory;
        if (stored != null) {
            seeded = UIO.bytesLong(stored.getValue());
        }
        local.highwaterTxId.set(seeded);
        local.flushedTxId.set(seeded);
        return seeded;
    }
}
/**
 * Writes, for every non-ephemeral partition whose in-memory local highwater is ahead of the
 * last flushed value, a row keyed by {@code rootRingMember} carrying the current txId, then
 * flushes the highwater index.
 */
@Override
public void flushLocal() throws Exception {
    systemWALStorage.update(PartitionCreator.HIGHWATER_MARK_INDEX, null,
        (highwaters, stream) -> {
            long rowId = orderIdProvider.nextId();
            for (Entry<VersionedPartitionName, LocalHighwater> entry : localHighwaterUpdates.entrySet()) {
                VersionedPartitionName partition = entry.getKey();
                PartitionProperties properties = partitionCreator.getProperties(partition.getPartitionName());
                if (properties.durability == Durability.ephemeral) {
                    continue;
                }
                LocalHighwater local = entry.getValue();
                synchronized (local) {
                    long alreadyFlushed = local.flushedTxId.get();
                    long current = local.highwaterTxId.get();
                    if (alreadyFlushed >= current) {
                        continue;
                    }
                    boolean keepGoing = stream.row(-1, walKey(partition, rootRingMember),
                        UIO.longBytes(current), rowId, false, rowId);
                    // Recorded as flushed even when the stream asks to stop.
                    local.flushedTxId.set(current);
                    if (!keepGoing) {
                        return false;
                    }
                }
            }
            return true;
        }, walUpdated);
    systemWALStorage.flush(PartitionCreator.HIGHWATER_MARK_INDEX);
}
// In-memory local highwater state for one partition; instances are also used as the monitor
// in getLocal and flushLocal.
private static class LocalHighwater {
// Latest known local txId; starts at LOCAL_NONE, raised by setLocal, seeded by getLocal.
private final AtomicLong highwaterTxId = new AtomicLong(LOCAL_NONE);
// The txId most recently handed to storage by flushLocal (or seeded by getLocal); starts at LOCAL_NONE.
private final AtomicLong flushedTxId = new AtomicLong(LOCAL_NONE);
}
}