/*
* Copyright (C) 2012, 2016 higherfrequencytrading.com
* Copyright (C) 2016 Roman Leventov
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package net.openhft.chronicle.map;
import net.openhft.chronicle.algo.bitset.BitSetFrame;
import net.openhft.chronicle.algo.bitset.SingleThreadedFlatBitSetFrame;
import net.openhft.chronicle.bytes.Bytes;
import net.openhft.chronicle.hash.Data;
import net.openhft.chronicle.hash.VanillaGlobalMutableState;
import net.openhft.chronicle.hash.impl.TierCountersArea;
import net.openhft.chronicle.hash.impl.stage.hash.ChainingInterface;
import net.openhft.chronicle.hash.replication.ReplicableEntry;
import net.openhft.chronicle.map.impl.CompiledReplicatedMapIterationContext;
import net.openhft.chronicle.map.impl.CompiledReplicatedMapQueryContext;
import net.openhft.chronicle.map.replication.MapRemoteOperations;
import net.openhft.chronicle.values.Values;
import net.openhft.chronicle.wire.WireIn;
import net.openhft.chronicle.wire.WireOut;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.concurrent.CopyOnWriteArraySet;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReferenceArray;
import java.util.stream.Stream;
import static net.openhft.chronicle.algo.MemoryUnit.*;
import static net.openhft.chronicle.algo.bitset.BitSetFrame.NOT_FOUND;
import static net.openhft.chronicle.algo.bytes.Access.nativeAccess;
import static net.openhft.chronicle.hash.replication.TimeProvider.currentTime;
/**
* <h2>A Replicating Multi Master HashMap</h2> <p>Each remote hash map, mirrors its changes over to
* another remote hash map, neither hash map is considered the master store of data, each hash map
* uses timestamps to reconcile changes. We refer to an instance of a remote hash-map as a node. A
* node will be connected to any number of other nodes, for the first implementation the maximum
* number of nodes will be fixed. The data that is stored locally in each node will become
* eventually consistent. So changes made to one node, for example by calling put() will be
* replicated over to the other node. To achieve a high level of performance and throughput, the
* call to put() won’t block, with concurrentHashMap, It is typical to check the return code of some
* methods to obtain the old value for example remove(). Due to the loose coupling and lock free
* nature of this multi master implementation, this return value will only be the old value on the
* node's local data store. In other words the nodes are only concurrent locally. It's worth realising
* that another node performing exactly the same operation may return a different value. However
* reconciliation will ensure the maps themselves become eventually consistent. </p>
* <h2>Reconciliation </h2> <p>If two ( or more nodes ) were to receive a change to their maps for
* the same key but different values, say by a user of the maps, calling the put(key, value). Then,
* initially each node will update its local store and each local store will hold a different value,
* but the aim of multi master replication is to provide eventual consistency across the nodes. So,
* with multi master when ever a node is changed it will notify the other nodes of its change. We
* will refer to this notification as an event. The event will hold a timestamp indicating the time
* the change occurred, it will also hold the state transition, in this case it was a put with a key
* and value. Eventual consistency is achieved by looking at the timestamp from the remote node, if
* for a given key, the remote nodes timestamp is newer than the local nodes timestamp, then the
* event from the remote node will be applied to the local node, otherwise the event will be
* ignored. </p> <p>However there is an edge case that we have to concern ourselves with, If two
* nodes update their map at the same time with different values, we have to deterministically
* resolve which update wins, because of eventual consistency both nodes should end up locally
* holding the same data. Although it is rare two remote nodes could receive an update to their maps
* at exactly the same time for the same key, we have to handle this edge case, it's therefore
* important not to rely on timestamps alone to reconcile the updates. Typically the update with the
* newest timestamp should win, but in this example both timestamps are the same, and the decision
* made to one node should be identical to the decision made to the other. We resolve this simple
* dilemma by using a node identifier, each node will have a unique identifier, the update from the
* node with the smallest identifier wins. </p>
*
* @param <K> the entries key type
* @param <V> the entries value type
*/
public class ReplicatedChronicleMap<K, V, R> extends VanillaChronicleMap<K, V, R>
implements Replica, Replica.EntryExternalizable {
private static final Logger LOG = LoggerFactory.getLogger(ReplicatedChronicleMap.class);
// Per-entry space overhead of replication metadata. Presumably covers timestamp +
// identifier + flags — TODO confirm against the entry layout code (not visible here).
public static final int ADDITIONAL_ENTRY_BYTES = 10;
// Tag bytes prefixing hunks of a replicated message: an entry (writeExternalEntry0())
// or a bootstrap timestamp (writePayload()); dispatched on in readExternalEntry().
static final byte ENTRY_HUNK = 1;
static final byte BOOTSTRAP_TIME_HUNK = 2;
// Sizes of modification-iterator bit sets, precomputed in the constructor and
// round-tripped through writeMarshallable()/readMarshallableFields():
// size in bits of one tier's bit set (one bit per chunk, rounded up to whole longs)
private long tierModIterBitSetSizeInBits;
// padded ("outer") size in bytes of one tier's bit set
private long tierModIterBitSetOuterSize;
// outer size of all segment (first-tier) bit sets for a single remote identifier
private long segmentModIterBitSetsForIdentifierOuterSize;
// outer size of all bit sets for a single remote identifier within one tier bulk
private long tierBulkModIterBitSetsForIdentifierOuterSize;
/**
 * Default value is 0, that corresponds to "unset" identifier value (valid ids are positive)
 */
private transient byte localIdentifier;
/**
 * Idiomatically {@code assignedModificationIterators} should be a {@link CopyOnWriteArraySet},
 * but we should frequently iterate over this array without creating any garbage,
 * that is impossible with {@code CopyOnWriteArraySet}.
 */
private transient ModificationIterator[] assignedModificationIterators;
// lazily-created iterators, indexed by remote identifier; see acquireModificationIterator()
private transient AtomicReferenceArray<ModificationIterator> modificationIterators;
// offset of the modification iterator bit sets within the header; set in onHeaderCreated()
private transient long startOfModificationIterators;
// removed-entry cleanup configuration, copied from the builder in initTransientsFromBuilder()
public transient boolean cleanupRemovedEntries;
public transient long cleanupTimeout;
public transient TimeUnit cleanupTimeoutUnit;
// hook for custom handling of remote replication events, from the builder
public transient MapRemoteOperations<K, V, R> remoteOperations;
// stateless bit-set "frame" used to access all modification-iterator bit sets
transient BitSetFrame tierModIterFrame;
// per-remote-identifier bootstrap timestamps; see setRemoteNodeCouldBootstrapFrom()
private transient long[] remoteNodeCouldBootstrapFrom;
/**
 * Creates a replicated map, precomputing the sizes of the modification iterator
 * bit sets laid out in the map header and in tier bulks.
 *
 * @param builder the builder this map is constructed from
 * @throws IOException if thrown by the superclass constructor
 */
public ReplicatedChronicleMap(@NotNull ChronicleMapBuilder<K, V> builder) throws IOException {
    super(builder);
    // the compute*() methods read fields initialized by super(builder), so they must run after it
    tierModIterBitSetSizeInBits = computeTierModIterBitSetSizeInBits();
    tierModIterBitSetOuterSize = computeTierModIterBitSetOuterSize();
    segmentModIterBitSetsForIdentifierOuterSize =
            computeSegmentModIterBitSetsForIdentifierOuterSize();
    tierBulkModIterBitSetsForIdentifierOuterSize =
            computeTierBulkModIterBitSetsForIdentifierOuterSize(tiersInBulk);
}
/**
 * Reads the precomputed bit-set sizes back from the wire. Keys and order must
 * mirror {@link #writeMarshallable(WireOut)} exactly.
 */
@Override
protected void readMarshallableFields(@NotNull WireIn wireIn) {
    super.readMarshallableFields(wireIn);
    tierModIterBitSetSizeInBits = wireIn.read(() -> "tierModIterBitSetSizeInBits").int64();
    tierModIterBitSetOuterSize = wireIn.read(() -> "tierModIterBitSetOuterSize").int64();
    segmentModIterBitSetsForIdentifierOuterSize =
            wireIn.read(() -> "segmentModIterBitSetsForIdentifierOuterSize").int64();
    tierBulkModIterBitSetsForIdentifierOuterSize =
            wireIn.read(() -> "tierBulkModIterBitSetsForIdentifierOuterSize").int64();
}
/**
 * Writes the precomputed bit-set sizes to the wire. Keys and order must mirror
 * {@link #readMarshallableFields(WireIn)} exactly.
 */
@Override
public void writeMarshallable(@NotNull WireOut wireOut) {
    super.writeMarshallable(wireOut);
    wireOut.write(() -> "tierModIterBitSetSizeInBits").int64(tierModIterBitSetSizeInBits);
    wireOut.write(() -> "tierModIterBitSetOuterSize").int64(tierModIterBitSetOuterSize);
    wireOut.write(() -> "segmentModIterBitSetsForIdentifierOuterSize")
            .int64(segmentModIterBitSetsForIdentifierOuterSize);
    wireOut.write(() -> "tierBulkModIterBitSetsForIdentifierOuterSize")
            .int64(tierBulkModIterBitSetsForIdentifierOuterSize);
}
/**
 * A replicated map's global mutable state is the extended
 * {@code ReplicatedGlobalMutableState} flyweight rather than the vanilla one.
 */
@Override
protected VanillaGlobalMutableState createGlobalMutableState() {
    ReplicatedGlobalMutableState state =
            Values.newNativeReference(ReplicatedGlobalMutableState.class);
    return state;
}

/** Narrows the inherited global mutable state to the replicated variant. */
@Override
public ReplicatedGlobalMutableState globalMutableState() {
    VanillaGlobalMutableState state = super.globalMutableState();
    return (ReplicatedGlobalMutableState) state;
}
/**
 * Initializes transient fields of this class and its superclasses, e.g. after
 * deserialization or when the map is mapped from a persisted file.
 */
@Override
public void initTransients() {
    super.initTransients();
    initOwnTransients();
}
/**
 * (Re-)creates this class's transient replication state. The per-identifier
 * structures are sized 128 — one slot per possible byte-valued identifier.
 */
private void initOwnTransients() {
    remoteNodeCouldBootstrapFrom = new long[128];
    modificationIterators = new AtomicReferenceArray<>(128);
    // no iterators have been assigned yet; the array grows in acquireModificationIterator()
    //noinspection unchecked
    assignedModificationIterators = new ReplicatedChronicleMap.ModificationIterator[0];
    // stateless view over all modification bit sets; only the addresses passed in vary
    tierModIterFrame = new SingleThreadedFlatBitSetFrame(computeTierModIterBitSetSizeInBits());
}
/**
 * Copies replication-related configuration from the builder into transient fields.
 * Note: identifier 0 denotes "unset" (see {@link #identifier()}); -1 is rejected here.
 */
@Override
void initTransientsFromBuilder(ChronicleMapBuilder<K, V> builder) {
    super.initTransientsFromBuilder(builder);
    this.localIdentifier = builder.replicationIdentifier;
    if (localIdentifier == -1)
        throw new IllegalStateException("localIdentifier should not be -1");
    //noinspection unchecked
    this.remoteOperations = (MapRemoteOperations<K, V, R>) builder.remoteOperations;
    cleanupRemovedEntries = builder.cleanupRemovedEntries;
    cleanupTimeout = builder.cleanupTimeout;
    cleanupTimeoutUnit = builder.cleanupTimeoutUnit;
}
/** Size in bits of one tier's modification bit set: one bit per chunk, rounded up to whole longs. */
private long computeTierModIterBitSetSizeInBits() {
    return LONGS.align(actualChunksPerSegmentTier, BITS);
}
/**
 * Padded ("outer") size in bytes of a single tier's modification bit set. Two cache
 * lines of padding protect against false sharing between the bit sets of adjacent
 * segments; when the padded size comes out cache-line aligned, it is additionally
 * adjusted to break L1 cache associativity contention.
 */
private long computeTierModIterBitSetOuterSize() {
    long bitSetBytes = BYTES.convert(computeTierModIterBitSetSizeInBits(), BITS);
    // protect from false sharing between bit sets of adjacent segments
    long outerSize = bitSetBytes + BYTES.convert(2, CACHE_LINES);
    boolean cacheLineAligned = CACHE_LINES.align(outerSize, BYTES) == outerSize;
    return cacheLineAligned ? breakL1CacheAssociativityContention(outerSize) : outerSize;
}
/** Total outer size of the segment (first-tier) bit sets for a single remote identifier. */
private long computeSegmentModIterBitSetsForIdentifierOuterSize() {
    return computeTierModIterBitSetOuterSize() * actualSegments;
}
/** Total outer size of the bit sets for a single remote identifier within one tier bulk. */
private long computeTierBulkModIterBitSetsForIdentifierOuterSize(long tiersInBulk) {
    return computeTierModIterBitSetOuterSize() * tiersInBulk;
}
/**
 * Extends the superclass's tier bulk layout with space for modification iterator
 * bit sets: one set per possible remote identifier (128 = number of distinct
 * byte-valued identifiers), cache-line aligned.
 */
@Override
protected long computeTierBulkInnerOffsetToTiers(long tiersInBulk) {
    long tierBulkBitSetsInnerSize =
            computeTierBulkModIterBitSetsForIdentifierOuterSize(tiersInBulk) * 128;
    return super.computeTierBulkInnerOffsetToTiers(tiersInBulk) +
            CACHE_LINES.align(tierBulkBitSetsInnerSize, BYTES);
}
/** The map header additionally holds segment bit sets for all 128 possible identifiers. */
@Override
public long mapHeaderInnerSize() {
    return super.mapHeaderInnerSize() + (segmentModIterBitSetsForIdentifierOuterSize * 128);
}
/** Records the timestamp the given remote node could bootstrap (resume replication) from. */
@Override
public void setRemoteNodeCouldBootstrapFrom(byte remoteIdentifier, long bootstrapTimestamp) {
    remoteNodeCouldBootstrapFrom[remoteIdentifier] = bootstrapTimestamp;
}
/** Returns the timestamp the given remote node could bootstrap from, or 0 if never recorded. */
@Override
public long remoteNodeCouldBootstrapFrom(byte remoteIdentifier) {
    return remoteNodeCouldBootstrapFrom[remoteIdentifier];
}
/** Computes the offset of the modification iterator bit sets area within the map header. */
@Override
public void onHeaderCreated() {
    // Pad modification iterators at 3 cache lines from the end of the map header,
    // to avoid false sharing with the header of the first segment
    startOfModificationIterators = super.mapHeaderInnerSize() +
            RESERVED_GLOBAL_MUTABLE_STATE_BYTES - BYTES.convert(3, CACHE_LINES);
}
/** Additionally zeroes out this class's extension of the map header (the bit sets area). */
@Override
protected void zeroOutNewlyMappedChronicleMapBytes() {
    super.zeroOutNewlyMappedChronicleMapBytes();
    bs.zeroOut(super.mapHeaderInnerSize(), this.mapHeaderInnerSize());
}
/**
 * Returns this node's replication identifier.
 *
 * @return the local identifier, always positive
 * @throws IllegalStateException if the identifier is 0, the "unset" value (e.g. a
 *         persisted replicated map was reopened without configuring replication)
 */
@Override
public byte identifier() {
    byte id = localIdentifier;
    if (id == 0) {
        throw new IllegalStateException("Replication identifier is not set for this\n" +
                "replicated Chronicle Map. This should only be possible if persisted\n" +
                "replicated Chronicle Map access from another process/JVM run/after\n" +
                "a transfer from another machine, and replication identifier is not\n" +
                "specified when access is configured, e. g. ChronicleMap.of(...)" +
                ".createPersistedTo(existingFile).\n" +
                "In this case, replicated Chronicle Map \"doesn't know\" it's identifier,\n" +
                "and is able to perform simple _read_ operations like map.get(), which\n" +
                "doesn't access the identifier. To perform updates, insertions, replication\n" +
                "tasks, you should configure the current node identifier,\n" +
                "by `replication(identifier)` method call in ChronicleMapBuilder\n" +
                "configuration chain.");
    }
    // negative identifiers are never valid; -1 is rejected in initTransientsFromBuilder()
    assert id > 0;
    return id;
}
/**
 * Returns the modification iterator for the given remote identifier, lazily creating
 * it under the global mutable state lock. Double-checked: the fast path reads the
 * {@code AtomicReferenceArray} without taking the lock; the slow path re-checks under
 * the lock before constructing and publishing the iterator.
 */
@Override
public ModificationIterator acquireModificationIterator(byte remoteIdentifier) {
    ModificationIterator modificationIterator = modificationIterators.get(remoteIdentifier);
    if (modificationIterator != null)
        return modificationIterator;
    globalMutableStateLock();
    try {
        // re-check under the lock: another thread may have created it in the meantime
        modificationIterator = modificationIterators.get(remoteIdentifier);
        if (modificationIterator != null)
            return modificationIterator;
        ReplicatedGlobalMutableState globalMutableState = globalMutableState();
        // if the iterator was ever initialized in shared memory, its bit sets are preserved
        boolean modificationIteratorInit =
                globalMutableState.getModificationIteratorInitAt(remoteIdentifier);
        final ModificationIterator modIter =
                new ModificationIterator(remoteIdentifier, modificationIteratorInit);
        if (!modificationIteratorInit) {
            globalMutableState.setModificationIteratorInitAt(remoteIdentifier, true);
            // This doesn't need to be volatile update, because modification iterators count
            // is checked (also non-volatile) in the beginning of raiseChange()/dropChange()
            // methods, the risk is that a new modification iterator is added, dirtyEntries()
            // already completed, and then we call raiseChange() and miss the new iterator.
            // raiseChange() is called, when the segment lock is held on update level, as well
            // as dirtyEntries() (that is segment iteration, which is always performed on
            // update-level lock), so the two actions (dirtyEntries() and raiseChange())
            // are serialized between each other, => change to ModificationIteratorsCount is
            // visible.
            globalMutableState.addModificationIteratorsCount(1);
        }
        // keep the garbage-free iteration array sorted by remote identifier
        //noinspection unchecked
        assignedModificationIterators = Stream
                .concat(Arrays.stream(assignedModificationIterators), Stream.of(modIter))
                .sorted(Comparator.comparing(it -> it.remoteIdentifier))
                .toArray(ReplicatedChronicleMap.ModificationIterator[]::new);
        modificationIterators.set(remoteIdentifier, modIter);
        return modIter;
    } finally {
        globalMutableStateUnlock();
    }
}
/**
 * Ensures a {@code ModificationIterator} object exists for every remote identifier
 * ever initialized in shared memory, and returns the array of assigned iterators.
 */
public ModificationIterator[] acquireAllModificationIterators() {
    // 128 is the number of distinct byte-valued remote identifiers
    for (int remoteId = 0; remoteId < 128; remoteId++) {
        if (!globalMutableState().getModificationIteratorInitAt(remoteId))
            continue;
        acquireModificationIterator((byte) remoteId);
    }
    return assignedModificationIterators;
}
/**
 * Brings the locally-assigned iterator array up to date with the iterator count
 * recorded in shared memory (other processes may have registered iterators).
 */
private void updateModificationIteratorsArray() {
    int sharedCount = globalMutableState().getModificationIteratorsCount();
    if (assignedModificationIterators.length != sharedCount)
        acquireAllModificationIterators();
}
/** Raises the change bit at (tierIndex, pos) for all assigned modification iterators. */
public void raiseChange(long tierIndex, long pos) {
    // -1 is invalid remoteIdentifier => raise change for all
    raiseChangeForAllExcept(tierIndex, pos, (byte) -1);
}
/** Raises the change bit at (tierIndex, pos) for the given remote identifier only. */
public void raiseChangeFor(long tierIndex, long pos, byte remoteIdentifier) {
    acquireModificationIterator(remoteIdentifier).raiseChange0(tierIndex, pos);
}
/**
 * Raises the change bit at (tierIndex, pos) for every assigned modification iterator
 * except the one with the given remote identifier. Tier indexes are 1-based; indexes
 * beyond {@code actualSegments} denote extra tiers allocated in tier bulks.
 */
public void raiseChangeForAllExcept(long tierIndex, long pos, byte remoteIdentifier) {
    updateModificationIteratorsArray();
    if (tierIndex <= actualSegments) {
        // first tier of a segment: bit sets live in the map header area
        long segmentIndex = tierIndex - 1;
        long offsetToTierBitSet = segmentIndex * tierModIterBitSetOuterSize;
        for (ModificationIterator it : assignedModificationIterators) {
            if (it.remoteIdentifier != remoteIdentifier)
                it.raiseChangeInSegment(offsetToTierBitSet, pos);
        }
    } else {
        // extra tier: locate its bulk and the tier's bit-set offset within that bulk
        long extraTierIndex = tierIndex - 1 - actualSegments;
        int bulkIndex = (int) (extraTierIndex >> log2TiersInBulk);
        long offsetToTierBitSet =
                (extraTierIndex & (tiersInBulk - 1)) * tierModIterBitSetOuterSize;
        for (ModificationIterator it : assignedModificationIterators) {
            if (it.remoteIdentifier != remoteIdentifier)
                it.raiseChangeInTierBulk(bulkIndex, offsetToTierBitSet, pos);
        }
    }
}
/**
 * Clears the change bit at (tierIndex, pos) for all assigned modification iterators.
 * Tier addressing mirrors {@code raiseChangeForAllExcept()}.
 */
public void dropChange(long tierIndex, long pos) {
    updateModificationIteratorsArray();
    if (tierIndex <= actualSegments) {
        // first tier of a segment: bit sets live in the map header area
        long segmentIndex = tierIndex - 1;
        long offsetToTierBitSet = segmentIndex * tierModIterBitSetOuterSize;
        for (ModificationIterator modificationIterator : assignedModificationIterators) {
            modificationIterator.dropChangeInSegment(offsetToTierBitSet, pos);
        }
    } else {
        // extra tier: locate its bulk and the tier's bit-set offset within that bulk
        long extraTierIndex = tierIndex - 1 - actualSegments;
        int bulkIndex = (int) (extraTierIndex >> log2TiersInBulk);
        long offsetToTierBitSet =
                (extraTierIndex & (tiersInBulk - 1)) * tierModIterBitSetOuterSize;
        for (ModificationIterator modificationIterator : assignedModificationIterators) {
            modificationIterator.dropChangeInTierBulk(bulkIndex, offsetToTierBitSet, pos);
        }
    }
}
/** Clears the change bit at (tierIndex, pos) for the given remote identifier only. */
public void dropChangeFor(long tierIndex, long pos, byte remoteIdentifier) {
    acquireModificationIterator(remoteIdentifier).dropChange0(tierIndex, pos);
}
/**
 * Moves a change bit from (oldTierIndex, oldPos) to (newTierIndex, newPos), e.g. when
 * an entry is relocated between tiers. For each assigned modification iterator the
 * bit is raised at the new location only if it was set (and is dropped) at the old
 * location. The four branches cover segment-&gt;segment, segment-&gt;bulk,
 * bulk-&gt;segment and bulk-&gt;bulk moves.
 */
public void moveChange(long oldTierIndex, long oldPos, long newTierIndex, long newPos) {
    updateModificationIteratorsArray();
    if (oldTierIndex <= actualSegments) {
        long oldSegmentIndex = oldTierIndex - 1;
        long oldOffsetToTierBitSet = oldSegmentIndex * tierModIterBitSetOuterSize;
        if (newTierIndex <= actualSegments) {
            // segment -> segment
            long newSegmentIndex = newTierIndex - 1;
            long newOffsetToTierBitSet = newSegmentIndex * tierModIterBitSetOuterSize;
            for (ModificationIterator modificationIterator : assignedModificationIterators) {
                if (modificationIterator.dropChangeInSegment(oldOffsetToTierBitSet, oldPos))
                    modificationIterator.raiseChangeInSegment(newOffsetToTierBitSet, newPos);
            }
        } else {
            // segment -> extra tier in a bulk
            long newExtraTierIndex = newTierIndex - 1 - actualSegments;
            int newBulkIndex = (int) (newExtraTierIndex >> log2TiersInBulk);
            long newOffsetToTierBitSet =
                    (newExtraTierIndex & (tiersInBulk - 1)) * tierModIterBitSetOuterSize;
            for (ModificationIterator modificationIterator : assignedModificationIterators) {
                if (modificationIterator.dropChangeInSegment(oldOffsetToTierBitSet, oldPos)) {
                    modificationIterator
                            .raiseChangeInTierBulk(newBulkIndex, newOffsetToTierBitSet, newPos);
                }
            }
        }
    } else {
        long oldExtraTierIndex = oldTierIndex - 1 - actualSegments;
        int oldBulkIndex = (int) (oldExtraTierIndex >> log2TiersInBulk);
        long oldOffsetToTierBitSet =
                (oldExtraTierIndex & (tiersInBulk - 1)) * tierModIterBitSetOuterSize;
        if (newTierIndex <= actualSegments) {
            // extra tier in a bulk -> segment
            long newSegmentIndex = newTierIndex - 1;
            long newOffsetToTierBitSet = newSegmentIndex * tierModIterBitSetOuterSize;
            for (ModificationIterator modificationIterator : assignedModificationIterators) {
                if (modificationIterator
                        .dropChangeInTierBulk(oldBulkIndex, oldOffsetToTierBitSet, oldPos)) {
                    modificationIterator.raiseChangeInSegment(newOffsetToTierBitSet, newPos);
                }
            }
        } else {
            // extra tier -> extra tier (possibly different bulks)
            long newExtraTierIndex = newTierIndex - 1 - actualSegments;
            int newBulkIndex = (int) (newExtraTierIndex >> log2TiersInBulk);
            long newOffsetToTierBitSet =
                    (newExtraTierIndex & (tiersInBulk - 1)) * tierModIterBitSetOuterSize;
            for (ModificationIterator modificationIterator : assignedModificationIterators) {
                if (modificationIterator
                        .dropChangeInTierBulk(oldBulkIndex, oldOffsetToTierBitSet, oldPos)) {
                    modificationIterator
                            .raiseChangeInTierBulk(newBulkIndex, newOffsetToTierBitSet, newPos);
                }
            }
        }
    }
}
/**
 * Tells whether the entry at the given position in the given tier is currently
 * marked as changed (dirty) for at least one assigned modification iterator.
 */
public boolean isChanged(long tierIndex, long pos) {
    updateModificationIteratorsArray();
    if (tierIndex <= actualSegments) {
        // first tier of a segment: bit sets live in the map header area
        long bitSetOffset = (tierIndex - 1) * tierModIterBitSetOuterSize;
        for (ModificationIterator it : assignedModificationIterators) {
            if (it.isChangedSegment(bitSetOffset, pos))
                return true;
        }
        return false;
    }
    // extra tier: locate its bulk and the tier's bit-set offset within that bulk
    long extraTierIndex = tierIndex - 1 - actualSegments;
    int bulkIndex = (int) (extraTierIndex >> log2TiersInBulk);
    long bitSetOffset = (extraTierIndex & (tiersInBulk - 1)) * tierModIterBitSetOuterSize;
    for (ModificationIterator it : assignedModificationIterators) {
        if (it.isChangedTierBulk(bulkIndex, bitSetOffset, pos))
            return true;
    }
    return false;
}
/**
 * Returns whether the given entry originates from this (local) node.
 *
 * @throws IllegalStateException if the local identifier is not configured
 */
@Override
public boolean identifierCheck(@NotNull ReplicableEntry entry, int chronicleId) {
    byte origin = entry.originIdentifier();
    return origin == identifier();
}
/**
 * Serializes a replication message into {@code destination}: an optional
 * bootstrap-time payload hunk, followed by an optional entry hunk. Either argument
 * may be null; each hunk is written only when its argument is present.
 */
@Override
public void writeExternalEntry(
        ReplicableEntry entry, Bytes payload, @NotNull Bytes destination, int chronicleId) {
    if (payload != null)
        writePayload(payload, destination);
    if (entry != null)
        writeExternalEntry0(entry, destination);
}
/** Writes a BOOTSTRAP_TIME_HUNK tag followed by a copy of the payload bytes (payload is not consumed). */
private void writePayload(Bytes payload, Bytes destination) {
    destination.writeByte(BOOTSTRAP_TIME_HUNK);
    destination.write(payload, payload.readPosition(), payload.readRemaining());
}
/**
 * Writes one replicated entry to {@code destination}, in the format consumed on the
 * receiving side (see {@code readExternalEntry()}): ENTRY_HUNK tag, stop-bit origin
 * timestamp, origin identifier, deleted flag, key size + key bytes, and — unless the
 * entry is deleted — value size + value bytes.
 * <p>
 * This method does not set a segment lock, A segment lock should be obtained before calling
 * this method, especially when being used in a multi threaded context.
 */
private void writeExternalEntry0(ReplicableEntry entry, Bytes destination) {
    destination.writeByte(ENTRY_HUNK);
    destination.writeStopBit(entry.originTimestamp());
    if (entry.originIdentifier() == 0)
        throw new IllegalStateException("Identifier can't be 0");
    destination.writeByte(entry.originIdentifier());
    Data key;
    boolean isDeleted;
    // a MapAbsentEntry represents a removed (tombstone) entry: only the key is replicated
    if (entry instanceof MapEntry) {
        isDeleted = false;
        key = ((MapEntry) entry).key();
    } else {
        isDeleted = true;
        key = ((MapAbsentEntry) entry).absentKey();
    }
    destination.writeBoolean(isDeleted);
    keySizeMarshaller.writeSize(destination, key.size());
    // writeTo() writes at the given absolute position; advance the write position manually
    key.writeTo(destination, destination.writePosition());
    destination.writeSkip(key.size());
    boolean debugEnabled = LOG.isDebugEnabled();
    String message = null;
    if (debugEnabled) {
        if (isDeleted) {
            LOG.debug("WRITING ENTRY TO DEST - into local-id={}, remove(key={})",
                    identifier(), key);
        } else {
            // put() message is completed below, after the value is known
            message = String.format(
                    "WRITING ENTRY TO DEST - into local-id=%d, put(key=%s,",
                    identifier(), key);
        }
    }
    if (isDeleted)
        return;
    Data value = ((MapEntry) entry).value();
    valueSizeMarshaller.writeSize(destination, value.size());
    value.writeTo(destination, destination.writePosition());
    destination.writeSkip(value.size());
    if (debugEnabled) {
        LOG.debug(message + "value=" + value + ")");
    }
}
/** Creates a fresh query context chain head for the replicated map implementation. */
@Override
ChainingInterface newQueryContext() {
    return new CompiledReplicatedMapQueryContext<>(this);
}
/** Returns a (possibly pooled) query context bound to this map. */
@Override
public CompiledReplicatedMapQueryContext<K, V, R> mapContext() {
    //noinspection unchecked
    return q().getContext(CompiledReplicatedMapQueryContext.class,
            // lambda is used instead of constructor reference because currently stage-compiler
            // has issues with parsing method/constructor refs.
            // TODO replace with constructor ref when stage-compiler is improved
            (c, m) -> new CompiledReplicatedMapQueryContext<K, V, R>(c, m), this);
}
/**
 * Consumes one hunk of a replicated message from {@code source}, dispatching on the
 * leading tag byte: a bootstrap timestamp is recorded directly, an entry hunk is
 * processed through the query context and {@code remoteOperations}.
 * <p>
 * This method does not set a segment lock, A segment lock should be obtained before calling
 * this method, especially when being used in a multi threaded context.
 */
@Override
public void readExternalEntry(@NotNull Bytes source, byte remoteNodeIdentifier) {
    byte hunk = source.readByte();
    if (hunk == BOOTSTRAP_TIME_HUNK) {
        setRemoteNodeCouldBootstrapFrom(remoteNodeIdentifier, source.readLong());
    } else {
        assert hunk == ENTRY_HUNK;
        try (CompiledReplicatedMapQueryContext<K, V, R> remoteOpContext = mapContext()) {
            remoteOpContext.processReplicatedEvent(remoteNodeIdentifier, source);
        }
    }
}
/** Creates a fresh iteration context chain head for the replicated map implementation. */
@Override
ChainingInterface newIterationContext() {
    return new CompiledReplicatedMapIterationContext<>(this);
}
/** Returns a (possibly pooled) iteration context bound to this map. */
public CompiledReplicatedMapIterationContext<K, V, R> iterationContext() {
    //noinspection unchecked
    return i().getContext(CompiledReplicatedMapIterationContext.class,
            // lambda is used instead of constructor reference because currently stage-compiler
            // has issues with parsing method/constructor refs.
            // TODO replace with constructor ref when stage-compiler is improved
            (c, m) -> new CompiledReplicatedMapIterationContext<K, V, R>(c, m), this);
}
/**
* <p>Once a change occurs to a map, map replication requires that these changes are picked up
* by another thread, this class provides an iterator like interface to poll for such changes.
* </p> <p>In most cases the thread that adds data to the node is unlikely to be the same thread
* that replicates the data over to the other nodes, so data will have to be marshaled between
* the main thread storing data to the map, and the thread running the replication. </p> <p>One
* way to perform this marshalling, would be to pipe the data into a queue. However, This class
* takes another approach. It uses a bit set, and marks bits which correspond to the indexes of
* the entries that have changed. It then provides an iterator like interface to poll for such
* changes. </p>
*
* @author Rob Austin.
*/
public class ModificationIterator implements Replica.ModificationIterator {
// the remote node this iterator tracks changes for
private final byte remoteIdentifier;
// base address of this identifier's segment (first-tier) bit sets in the map header
private final long segmentBitSetsAddr;
// offset of this identifier's bit sets within each tier bulk
private final long offsetToBitSetsWithinATierBulk;
// optional listener, poked whenever a change bit is raised
private ModificationNotifier modificationNotifier;
// timestamp taken at the start of a full iteration pass; reported via
// Callback.onBootstrapTime() when the pass completes and something was sent
private long bootstrapTimeAfterNextIterationComplete = 0L;
private boolean somethingSentOnThisIteration = false;
// The iteration "cursor" consists of 4 fields:
// 1) if segmentIndex >= 0, bulkIndex = -1, tierIndexOffsetWithinBulk = -1:
// => we are in "first tiers" aka "segments"
// 2) segmentIndex = -1, bulkIndex >= 0, tierIndexOffsetWithinBulk >= 0:
// => we are in extra tiers
private int segmentIndex;
private int bulkIndex;
private int tierIndexOffsetWithinBulk;
/**
 * Corresponds to {@link net.openhft.chronicle.hash.impl.stage.entry.HashEntryStages#pos}
 */
private long entryPos;
/**
 * Cached addr of the changes bit set of the current tier (which "cursor" points to),
 * to avoid re-computation of this addr in {@link #entryIsStillDirty(long)} and
 * {@link #clearEntry(long)} methods
 */
private long tierBitSetAddr;
/**
 * @param remoteIdentifier the remote node this iterator tracks changes for
 * @param sharedMemoryInit true if this identifier's bit sets were already initialized
 *        in shared memory by a previous map instance (then they are not zeroed out,
 *        preserving pending changes across restarts)
 */
public ModificationIterator(byte remoteIdentifier, boolean sharedMemoryInit) {
    this.remoteIdentifier = remoteIdentifier;
    segmentBitSetsAddr = bsAddress() + startOfModificationIterators +
            remoteIdentifier * segmentModIterBitSetsForIdentifierOuterSize;
    if (!sharedMemoryInit) {
        nativeAccess().zeroOut(null, segmentBitSetsAddr,
                segmentModIterBitSetsForIdentifierOuterSize);
    }
    offsetToBitSetsWithinATierBulk =
            remoteIdentifier * tierBulkModIterBitSetsForIdentifierOuterSize;
    resetCursor();
}
/** Rewinds the iteration cursor to the start of the first segment's bit set. */
private void resetCursor() {
    segmentIndex = 0;
    bulkIndex = -1;
    tierIndexOffsetWithinBulk = -1;
    // -1 so the next scan starts from bit 0 (nextSetBit is called with entryPos + 1)
    entryPos = -1;
    tierBitSetAddr = segmentBitSetsAddr; // + tierModIterBitSetOuterSize * segmentIndex = 0
}
/** Sets the listener poked whenever a change bit is raised for this iterator. */
public void setModificationNotifier(@NotNull ModificationNotifier modificationNotifier) {
    this.modificationNotifier = modificationNotifier;
}
/** Raises the change bit at pos in a segment (first-tier) bit set, notifying the listener if set. */
void raiseChangeInSegment(long offsetToTierBitSet, long pos) {
    tierModIterFrame.set(nativeAccess(), null,
            segmentBitSetsAddr + offsetToTierBitSet, pos);
    if (modificationNotifier != null)
        modificationNotifier.onChange();
}
/** Raises the change bit at pos in an extra-tier bit set within the given bulk, notifying the listener. */
void raiseChangeInTierBulk(int bulkIndex, long offsetToTierBitSet, long pos) {
    TierBulkData tierBulkData = tierBulkOffsets.get(bulkIndex);
    long bitSetAddr = bitSetsAddr(tierBulkData) + offsetToTierBitSet;
    tierModIterFrame.set(nativeAccess(), null, bitSetAddr, pos);
    if (modificationNotifier != null)
        modificationNotifier.onChange();
}
/** Clears the change bit at pos in a segment bit set; returns true if it was set. */
boolean dropChangeInSegment(long offsetToTierBitSet, long pos) {
    return tierModIterFrame.clearIfSet(nativeAccess(), null,
            segmentBitSetsAddr + offsetToTierBitSet, pos);
}
/** Clears the change bit at pos in an extra-tier bit set; returns true if it was set. */
boolean dropChangeInTierBulk(int bulkIndex, long offsetToTierBitSet, long pos) {
    TierBulkData tierBulkData = tierBulkOffsets.get(bulkIndex);
    long bitSetAddr = bitSetsAddr(tierBulkData) + offsetToTierBitSet;
    return tierModIterFrame.clearIfSet(nativeAccess(), null, bitSetAddr, pos);
}
/** Whether the change bit at pos is set in a segment bit set. */
boolean isChangedSegment(long offsetToTierBitSet, long pos) {
    return tierModIterFrame.isSet(nativeAccess(), null,
            segmentBitSetsAddr + offsetToTierBitSet, pos);
}
/** Whether the change bit at pos is set in an extra-tier bit set within the given bulk. */
boolean isChangedTierBulk(int bulkIndex, long offsetToTierBitSet, long pos) {
    TierBulkData tierBulkData = tierBulkOffsets.get(bulkIndex);
    long bitSetAddr = bitSetsAddr(tierBulkData) + offsetToTierBitSet;
    return tierModIterFrame.isSet(nativeAccess(), null, bitSetAddr, pos);
}
/** Base address of this iterator's bit sets within the given tier bulk. */
private long bitSetsAddr(TierBulkData tierBulkData) {
    return tierBulkData.bytesStore.address(tierBulkData.offset) +
            offsetToBitSetsWithinATierBulk;
}
/**
 * You can continue to poll hasNext() until data becomes available. If you are in the
 * middle of processing an entry via {@code nextEntry}, hasNext will return true until
 * the bit is cleared.
 *
 * @return true if there is an entry
 */
@Override
public boolean hasNext() {
    // null callback: just probe for a raised bit without reporting iteration completion
    return nextEntryPos(null, 0) != NOT_FOUND;
}
/**
 * Advances the cursor to the next raised change bit, scanning first the segment
 * (first-tier) bit sets and then the bit sets of all allocated extra tier bulks,
 * wrapping around to the start at most once. Returns the entry pos of the next dirty
 * entry, or NOT_FOUND when no bit is set anywhere. When a full pass over the whole
 * map completes and something was sent, the bootstrap timestamp captured at the pass
 * start is reported through the callback.
 */
private long nextEntryPos(Callback callback, int chronicleId) {
    long nextEntryPos;
    boolean allBitSetsScannedFromTheStart = false;
    // at most 2 iterations
    while (!allBitSetsScannedFromTheStart) {
        if (segmentIndex >= 0) {
            // cursor is positioned at the very start of a pass only in this state
            allBitSetsScannedFromTheStart = segmentIndex == 0 && entryPos == -1;
            if (allBitSetsScannedFromTheStart) {
                bootstrapTimeAfterNextIterationComplete = currentTime();
                somethingSentOnThisIteration = false;
            }
            while (segmentIndex < actualSegments) {
                // This is needed to ensure, that any entry update with the timestamp,
                // smaller than assigned for bootstrapTimeAfterNextIterationComplete, is
                // visible during the iteration of the current segment. Bits are raised
                // during the update via non-volatile bit set (performance concerns), hence
                // to guarantee visibility during the iteration, we build a happens-before
                // between bit raise and bit reading:
                // bit raised ->
                // lock released (end of update operation) ->
                // lock acquired (the following acquireAndReleaseUpdateLock() call) ->
                // bits are iterated (raised bits should be visible here)
                if (entryPos == -1) {
                    acquireAndReleaseUpdateLock(segmentIndex);
                }
                if ((nextEntryPos = tierModIterFrame.nextSetBit(nativeAccess(),
                        null, tierBitSetAddr, entryPos + 1)) != NOT_FOUND) {
                    return nextEntryPos;
                } else {
                    // this segment's bit set is exhausted; move the cursor to the next one
                    segmentIndex++;
                    tierBitSetAddr += tierModIterBitSetOuterSize;
                    entryPos = -1;
                }
            }
            // go to extra bulks
            segmentIndex = -1;
            bulkIndex = 0;
            tierIndexOffsetWithinBulk = 0;
            if (bulkIndex < globalMutableState().getAllocatedExtraTierBulks())
                tierBitSetAddr = bitSetsAddr(tierBulkOffsets.get(bulkIndex));
        }
        // for each allocated tier bulk
        while (bulkIndex < globalMutableState().getAllocatedExtraTierBulks()) {
            while (tierIndexOffsetWithinBulk < tiersInBulk) {
                if ((nextEntryPos = tierModIterFrame.nextSetBit(nativeAccess(), null,
                        tierBitSetAddr, entryPos + 1)) != NOT_FOUND) {
                    return nextEntryPos;
                } else {
                    tierIndexOffsetWithinBulk++;
                    tierBitSetAddr += tierModIterBitSetOuterSize;
                    entryPos = -1;
                }
            }
            // go to the next bulk
            bulkIndex++;
            tierIndexOffsetWithinBulk = 0;
            if (bulkIndex < globalMutableState().getAllocatedExtraTierBulks())
                tierBitSetAddr = bitSetsAddr(tierBulkOffsets.get(bulkIndex));
        }
        resetCursor();
        // we walked through the whole chronicle map instance, "iteration"
        if (callback != null && somethingSentOnThisIteration) {
            callback.onBootstrapTime(bootstrapTimeAfterNextIterationComplete, chronicleId);
        }
    }
    return NOT_FOUND;
}
/**
 * Acquires and immediately releases the update lock of the given segment, purely to
 * establish a happens-before edge with updaters that raised change bits under that
 * lock (see the visibility comment in {@code nextEntryPos()}).
 */
private void acquireAndReleaseUpdateLock(int segmentIndex) {
    try (CompiledReplicatedMapIterationContext<K, V, R> c = iterationContext()) {
        c.initSegmentIndex(segmentIndex);
        c.updateLock().lock();
    }
}
/**
 * Finds the next dirty entry, re-checks its bit under the segment's update lock
 * (another thread may have cleared it meanwhile), passes the entry to the callback,
 * and clears the bit on success.
 *
 * @param callback call this to get an entry, this class will take care of the locking
 * @return true if an entry was processed
 */
@Override
public boolean nextEntry(@NotNull Callback callback, int chronicleId) {
    while (true) {
        long nextEntryPos = nextEntryPos(callback, chronicleId);
        if (nextEntryPos == NOT_FOUND)
            return false;
        entryPos = nextEntryPos;
        try (CompiledReplicatedMapIterationContext<K, V, R> context = iterationContext()) {
            if (segmentIndex >= 0) {
                // we are in first tiers (aka "segments")
                context.initSegmentIndex(segmentIndex);
            } else {
                // we are in extra tiers
                TierBulkData tierBulkData = tierBulkOffsets.get(bulkIndex);
                long tierBaseAddr = tierAddr(tierBulkData, tierIndexOffsetWithinBulk);
                long tierCountersAreaAddr = tierBaseAddr + tierHashLookupOuterSize;
                // the segment this extra tier belongs to is recorded in its counters area
                context.initSegmentIndex(
                        TierCountersArea.segmentIndex(tierCountersAreaAddr));
                int tier = TierCountersArea.tier(tierCountersAreaAddr);
                long tierIndex = actualSegments +
                        (bulkIndex << log2TiersInBulk) + tierIndexOffsetWithinBulk + 1;
                context.initSegmentTier(tier, tierIndex, tierBaseAddr);
            }
            context.updateLock().lock();
            if (entryIsStillDirty(entryPos)) {
                context.readExistingEntry(entryPos);
                ReplicableEntry entry = (ReplicableEntry) context.entryForIteration();
                callback.onEntry(entry, chronicleId);
                somethingSentOnThisIteration = true;
                clearEntry(entryPos);
                return true;
            }
            // if the entryPos was already cleared by another thread
            // while we were trying to obtain segment lock (for example, in relocation()),
            // go to pick up next (next iteration in the `while (true)` loop)
        }
    }
}
/**
 * Tells whether the modification-iterator bit for the given entry position in the
 * current tier's bit set is still raised.
 */
private boolean entryIsStillDirty(long entryPos) {
    boolean dirty = tierModIterFrame.get(nativeAccess(), null, tierBitSetAddr, entryPos);
    return dirty;
}
/**
 * Lowers the modification-iterator bit for the given entry position in the
 * current tier's bit set.
 */
private void clearEntry(long entryPos) {
    long bitSetAddr = tierBitSetAddr;
    tierModIterFrame.clear(nativeAccess(), null, bitSetAddr, entryPos);
}
/**
 * Raises modification-iterator bits for all entries that should be bootstrapped:
 * every entry originating from another node, and every locally-originated entry
 * whose origin timestamp is at or after {@code fromTimeStamp}.
 *
 * @param fromTimeStamp locally-originated entries with origin timestamp
 *                      {@code >= fromTimeStamp} are marked dirty
 */
@Override
public void dirtyEntries(long fromTimeStamp) {
try (CompiledReplicatedMapIterationContext<K, V, R> c = iterationContext()) {
// iterate over all the segments and mark bit in the modification iterator
// that correspond to entries with an older timestamp
boolean debugEnabled = LOG.isDebugEnabled();
for (int segmentIndex = 0; segmentIndex < actualSegments; segmentIndex++) {
c.initSegmentIndex(segmentIndex);
c.forEachSegmentReplicableEntry(e -> {
if (debugEnabled) {
LOG.debug("Bootstrap entry: id {}, key {}, value {}", localIdentifier,
c.key(), c.value());
}
// Bizarrely the next line causes an NPE in the JDT compiler, so it
// stays commented out:
//assert re.originTimestamp() > 0L;
if (debugEnabled) {
LOG.debug("Bootstrap decision: bs ts: {}, entry ts: {}, " +
"entry id: {}, local id: {}",
fromTimeStamp, e.originTimestamp(),
e.originIdentifier(), localIdentifier);
}
// TODO currently, all entries, originating not from the current node,
// are bootstrapped. This could be optimized, but requires generating a
// unique connection id, to distinguish two ChronicleMap instances
// reconnecting vs. a different Map start-up
if (e.originIdentifier() != localIdentifier ||
e.originTimestamp() >= fromTimeStamp) {
raiseChange0(c.tierIndex(), c.pos());
}
});
}
}
}
/**
 * Raises the modification-iterator bit for the entry at {@code pos} in the tier
 * with the given 1-based {@code tierIndex}. Tier indices up to
 * {@code actualSegments} address first tiers; higher indices address extra tiers
 * allocated in bulks.
 */
void raiseChange0(long tierIndex, long pos) {
    if (tierIndex > actualSegments) {
        // extra tier: locate its bulk and the bit-set offset within the bulk
        long extraTierIndex = tierIndex - 1 - actualSegments;
        int bulkIndex = (int) (extraTierIndex >> log2TiersInBulk);
        long offsetToTierBitSet =
                (extraTierIndex & (tiersInBulk - 1)) * tierModIterBitSetOuterSize;
        raiseChangeInTierBulk(bulkIndex, offsetToTierBitSet, pos);
    } else {
        // first tier: bit sets are laid out sequentially by segment index
        raiseChangeInSegment((tierIndex - 1) * tierModIterBitSetOuterSize, pos);
    }
}
/**
 * Lowers the modification-iterator bit for the entry at {@code pos} in the tier
 * with the given 1-based {@code tierIndex}. Mirrors {@code raiseChange0}.
 */
void dropChange0(long tierIndex, long pos) {
    if (tierIndex <= actualSegments) {
        // first tier: bit sets are laid out sequentially by segment index
        dropChangeInSegment((tierIndex - 1) * tierModIterBitSetOuterSize, pos);
    } else {
        // extra tier: locate its bulk and the bit-set offset within the bulk
        long extraTier = tierIndex - 1 - actualSegments;
        long offsetWithinBulk = (extraTier & (tiersInBulk - 1)) * tierModIterBitSetOuterSize;
        dropChangeInTierBulk((int) (extraTier >> log2TiersInBulk), offsetWithinBulk, pos);
    }
}
/**
 * Clears the modification-iterator bits in the range
 * {@code [pos, endPosExclusive)} of the tier with the given 1-based
 * {@code tierIndex}. Resolves the tier's bit-set address (in the segment area or
 * in an extra-tier bulk) and then performs a single range clear.
 */
public void clearRange0(long tierIndex, long pos, long endPosExclusive) {
    long bitSetAddr;
    if (tierIndex <= actualSegments) {
        // first tier: bit sets are laid out sequentially by segment index
        bitSetAddr = segmentBitSetsAddr + (tierIndex - 1) * tierModIterBitSetOuterSize;
    } else {
        // extra tier: locate its bulk and the bit-set offset within the bulk
        long extraTierIndex = tierIndex - 1 - actualSegments;
        int bulkIndex = (int) (extraTierIndex >> log2TiersInBulk);
        long offsetToTierBitSet =
                (extraTierIndex & (tiersInBulk - 1)) * tierModIterBitSetOuterSize;
        bitSetAddr = bitSetsAddr(tierBulkOffsets.get(bulkIndex)) + offsetToTierBitSet;
    }
    tierModIterFrame.clearRange(nativeAccess(), null, bitSetAddr, pos, endPosExclusive);
}
}
/**
 * Returns the value mapped to the given key, delegating to the default
 * implementation.
 */
@Override
public final V get(Object key) {
    V value = defaultGet(key);
    return value;
}
/**
 * Returns the value mapped to the given key, reusing {@code usingValue} where
 * possible; delegates to the default implementation.
 */
@Override
public final V getUsing(K key, V usingValue) {
    V result = defaultGetUsing(key, usingValue);
    return result;
}
}