package com.jivesoftware.os.amza.service.storage.delta; import com.google.common.base.Preconditions; import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.ListMultimap; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.google.common.collect.Sets; import com.jivesoftware.os.amza.api.AmzaInterner; import com.jivesoftware.os.amza.api.CompareTimestampVersions; import com.jivesoftware.os.amza.api.DeltaOverCapacityException; import com.jivesoftware.os.amza.api.IoStats; import com.jivesoftware.os.amza.api.filer.UIO; import com.jivesoftware.os.amza.api.partition.PartitionName; import com.jivesoftware.os.amza.api.partition.PartitionProperties; import com.jivesoftware.os.amza.api.partition.VersionedPartitionName; import com.jivesoftware.os.amza.api.scan.RangeScannable; import com.jivesoftware.os.amza.api.scan.RowStream; import com.jivesoftware.os.amza.api.scan.RowsChanged; import com.jivesoftware.os.amza.api.scan.Scannable; import com.jivesoftware.os.amza.api.stream.Commitable; import com.jivesoftware.os.amza.api.stream.KeyContainedStream; import com.jivesoftware.os.amza.api.stream.KeyValuePointerStream; import com.jivesoftware.os.amza.api.stream.KeyValueStream; import com.jivesoftware.os.amza.api.stream.KeyValues; import com.jivesoftware.os.amza.api.stream.RowType; import com.jivesoftware.os.amza.api.stream.UnprefixedWALKeys; import com.jivesoftware.os.amza.api.wal.KeyUtil; import com.jivesoftware.os.amza.api.wal.KeyedTimestampId; import com.jivesoftware.os.amza.api.wal.PrimaryRowMarshaller; import com.jivesoftware.os.amza.api.wal.WALCompactionStats; import com.jivesoftware.os.amza.api.wal.WALHighwater; import com.jivesoftware.os.amza.api.wal.WALIndex; import com.jivesoftware.os.amza.api.wal.WALIndexProvider; import com.jivesoftware.os.amza.api.wal.WALKey; import com.jivesoftware.os.amza.api.wal.WALPointer; import com.jivesoftware.os.amza.api.wal.WALTimestampId; import com.jivesoftware.os.amza.api.wal.WALUpdated; import com.jivesoftware.os.amza.api.wal.WALValue; import com.jivesoftware.os.amza.service.AckWaters; import com.jivesoftware.os.amza.service.NotARingMemberException; import com.jivesoftware.os.amza.service.PropertiesNotPresentException; import com.jivesoftware.os.amza.service.WALIndexProviderRegistry; import com.jivesoftware.os.amza.service.partition.VersionedPartitionProvider; import com.jivesoftware.os.amza.service.replication.CurrentVersionProvider; import com.jivesoftware.os.amza.service.ring.AmzaRingReader; import com.jivesoftware.os.amza.service.stats.AmzaStats; import com.jivesoftware.os.amza.service.stats.AmzaStats.CompactionFamily; import com.jivesoftware.os.amza.service.stats.AmzaStats.CompactionStats; import com.jivesoftware.os.amza.service.storage.PartitionIndex; import com.jivesoftware.os.amza.service.storage.PartitionStore; import com.jivesoftware.os.amza.service.storage.WALStorage; import com.jivesoftware.os.amza.service.storage.delta.DeltaWAL.KeyValueHighwater; import com.jivesoftware.os.amza.service.storage.delta.PartitionDelta.MergeResult; import com.jivesoftware.os.amza.service.take.HighwaterStorage; import com.jivesoftware.os.mlogger.core.MetricLogger; import com.jivesoftware.os.mlogger.core.MetricLoggerFactory; import com.jivesoftware.os.routing.bird.health.checkers.SickThreads; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; import java.util.concurrent.Semaphore; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.locks.LockSupport; /** * @author jonathan.colt */ public class DeltaStripeWALStorage { private static final MetricLogger LOG = MetricLoggerFactory.getLogger(); private static final int numTickleMeElmaphore = 1024; // TODO config private final AmzaInterner amzaInterner; private final int index; private final AmzaStats amzaStats; private final AckWaters ackWaters; private final SickThreads sickThreads; private final AmzaRingReader ringReader; private final HighwaterStorage highwaterStorage; private final DeltaWALFactory deltaWALFactory; private final int maxValueSizeInIndex; private final boolean useHighwaterTxId; private final WALIndexProviderRegistry walIndexProviderRegistry; private final long mergeAfterNUpdates; private final ExecutorService mergeDeltaThreads; private final Object awakeCompactionsLock = new Object(); private final AtomicReference<DeltaWAL> deltaWAL = new AtomicReference<>(); private final Map<VersionedPartitionName, PartitionDelta> partitionDeltas = Maps.newConcurrentMap(); private final Object oneWriterAtATimeLock = new Object(); private final Semaphore tickleMeElmophore = new Semaphore(numTickleMeElmaphore, true); private final AtomicLong updateSinceLastMerge = new AtomicLong(); private final AtomicLong merging = new AtomicLong(0); private final Reentrant reentrant = new Reentrant(); public void hackTruncation(int numBytes) { deltaWAL.get().hackTruncation(numBytes); } static class Reentrant extends ThreadLocal<Integer> { @Override protected Integer initialValue() { return 0; } } public DeltaStripeWALStorage(AmzaInterner amzaInterner, int index, AmzaStats amzaStats, AckWaters ackWaters, SickThreads sickThreads, AmzaRingReader ringReader, HighwaterStorage highwaterStorage, DeltaWALFactory deltaWALFactory, int maxValueSizeInIndex, boolean useHighwaterTxId, WALIndexProviderRegistry walIndexProviderRegistry, long mergeAfterNUpdates, ExecutorService mergeDeltaThreads) { this.amzaInterner = amzaInterner; this.index = index; this.amzaStats = amzaStats; this.ackWaters = ackWaters; this.sickThreads = sickThreads; this.ringReader = ringReader; this.highwaterStorage = highwaterStorage; this.deltaWALFactory = deltaWALFactory; this.maxValueSizeInIndex = maxValueSizeInIndex; this.useHighwaterTxId = useHighwaterTxId; this.walIndexProviderRegistry = walIndexProviderRegistry; this.mergeAfterNUpdates = mergeAfterNUpdates; this.mergeDeltaThreads = mergeDeltaThreads; } public int getId() { return index; } private void acquireOne() throws InterruptedException { } private void releaseOne() { } private void writeAcquireOne() throws InterruptedException { int enters = reentrant.get(); if (enters == 0) { tickleMeElmophore.acquire(); } reentrant.set(enters + 1); } private void writeReleaseOne() { int enters = reentrant.get(); if (enters - 1 == 0) { tickleMeElmophore.release(); reentrant.remove(); } else { reentrant.set(enters - 1); } } private void writeAcquireAll() throws InterruptedException { tickleMeElmophore.acquire(numTickleMeElmaphore); } private void writeReleaseAll() { tickleMeElmophore.release(numTickleMeElmaphore); } public Object getAwakeCompactionLock() { return awakeCompactionsLock; } public void delete(VersionedPartitionName versionedPartitionName) throws Exception { writeAcquireAll(); try { synchronized (partitionDeltas) { partitionDeltas.remove(versionedPartitionName); } } finally { writeReleaseAll(); } } public void load(IoStats ioStats, PartitionIndex partitionIndex, VersionedPartitionProvider versionedPartitionProvider, CurrentVersionProvider currentVersionProvider, PrimaryRowMarshaller primaryRowMarshaller) throws Exception { LOG.info("Reloading deltas..."); long start = System.currentTimeMillis(); CompactionStats compactionStats = amzaStats.beginCompaction(CompactionFamily.load, "load-delta-stripe-" + getId()); try { synchronized (oneWriterAtATimeLock) { List<DeltaWAL> deltaWALs = deltaWALFactory.list(ioStats); if (deltaWALs.isEmpty()) { deltaWAL.set(deltaWALFactory.create(ioStats, -1)); } else { for (int i = 0; i < deltaWALs.size(); i++) { DeltaWAL prevWAL = deltaWAL.get(); DeltaWAL currentWAL = deltaWALs.get(i); if (prevWAL != null) { Preconditions.checkState(currentWAL.getPrevId() == prevWAL.getId(), "Delta WALs were not contiguous, %s->%s", currentWAL.getPrevId(), prevWAL.getId()); mergeDelta(ioStats, compactionStats, partitionIndex, versionedPartitionProvider, currentVersionProvider, prevWAL, true, () -> currentWAL); } deltaWAL.set(currentWAL); Set<VersionedPartitionName> accepted = Sets.newHashSet(); Set<VersionedPartitionName> rejected = Sets.newHashSet(); WALKey.decompose( (WALKey.TxFpRawKeyValueEntries<VersionedPartitionName>) txRawKeyEntryStream -> primaryRowMarshaller.fromRows( txFpRowStream -> { currentWAL.load(ioStats, (rowFP, rowTxId, rowType, rawRow) -> { if (rowType.isPrimary()) { if (!txFpRowStream.stream(rowTxId, rowFP, rowType, rawRow)) { return false; } } return true; }); return true; }, (rowTxId, rowFP, rowType, prefix, key, hasValue, value, valueTimestamp, valueTombstoned, valueVersion, row) -> { VersionedPartitionName versionedPartitionName = amzaInterner.internVersionedPartitionName(prefix, 0, prefix.length); try { boolean acceptable; if (accepted.contains(versionedPartitionName)) { acceptable = true; } else if (rejected.contains(versionedPartitionName)) { acceptable = false; } else { acceptable = currentVersionProvider.isCurrentVersion(versionedPartitionName); if (acceptable) { accepted.add(versionedPartitionName); } else { rejected.add(versionedPartitionName); } } return !acceptable || txRawKeyEntryStream.stream(rowTxId, rowFP, rowType, key, hasValue, value, valueTimestamp, valueTombstoned, valueVersion, versionedPartitionName); } catch (PropertiesNotPresentException e) { LOG.warn("Properties not available on load for {}", versionedPartitionName); return true; } catch (NotARingMemberException e) { LOG.warn("Not a ring member for {}", versionedPartitionName); return true; } }), (txId, fp, rowType, prefix, key, hasValue, value, valueTimestamp, valueTombstoned, valueVersion, versionedPartitionName) -> { acquireOne(); try { return txPartitionDelta(versionedPartitionName, delta -> { // delta is pristine, no need to check timestamps and versions byte[] deltaValue = UIO.readByteArray(value, 0, "value"); delta.put(fp, prefix, key, deltaValue, valueTimestamp, valueTombstoned, valueVersion); delta.onLoadAppendTxFp(prefix, txId, fp); updateSinceLastMerge.incrementAndGet(); return true; }); } finally { releaseOne(); } }); } } } amzaStats.deltaStripeLoad(index, updateSinceLastMerge.get(), updateSinceLastMerge.get() / (double) mergeAfterNUpdates); if (updateSinceLastMerge.get() > mergeAfterNUpdates) { synchronized (awakeCompactionsLock) { awakeCompactionsLock.notifyAll(); } } LOG.info("Reloaded deltas stripe:{} in {} ms", index, (System.currentTimeMillis() - start)); } finally { compactionStats.finished(); } } private interface PartitionDeltaTx { boolean tx(PartitionDelta delta) throws Exception; } public void flush(boolean fsync) throws Exception { DeltaWAL wal = deltaWAL.get(); if (wal != null) { wal.flush(fsync); } } public boolean hasChangesFor(VersionedPartitionName versionedPartitionName) { return partitionDeltas.containsKey(versionedPartitionName); } public interface StorageTxIdProvider { long get() throws Exception; } public long getHighestTxId(VersionedPartitionName versionedPartitionName, StorageTxIdProvider txIdProvider) throws Exception { PartitionDelta partitionDelta = partitionDeltas.get(versionedPartitionName); if (partitionDelta != null) { long highestTxId = partitionDelta.highestTxId(); if (highestTxId > -1) { return highestTxId; } } if (useHighwaterTxId) { long highwaterTxId = highwaterStorage.getLocal(versionedPartitionName); if (highwaterTxId == HighwaterStorage.LOCAL_NONE) { highwaterTxId = txIdProvider.get(); if (highwaterTxId != HighwaterStorage.LOCAL_NONE) { LOG.info("Repaired missing highwater for:{} txId:{}", versionedPartitionName, highwaterTxId); highwaterStorage.setLocal(versionedPartitionName, highwaterTxId); } } return highwaterTxId; } else { long highwaterTxId = highwaterStorage.getLocal(versionedPartitionName); long storageTxId = txIdProvider.get(); if (highwaterTxId == HighwaterStorage.LOCAL_NONE && storageTxId != HighwaterStorage.LOCAL_NONE) { LOG.info("Repaired missing highwater for:{} txId:{}", versionedPartitionName, storageTxId); highwaterStorage.setLocal(versionedPartitionName, storageTxId); } else if (highwaterTxId != HighwaterStorage.LOCAL_NONE && storageTxId > highwaterTxId) { LOG.error("Lagging txId for:{} storage:{} highwater:{}", versionedPartitionName, storageTxId, highwaterTxId); } return storageTxId; } } private boolean txPartitionDelta(VersionedPartitionName versionedPartitionName, PartitionDeltaTx tx) throws Exception { PartitionDelta partitionDelta; synchronized (partitionDeltas) { partitionDelta = partitionDeltas.get(versionedPartitionName); if (partitionDelta == null) { DeltaWAL wal = deltaWAL.get(); if (wal == null) { throw new IllegalStateException("Delta WAL is currently unavailable."); } partitionDelta = partitionDeltas.computeIfAbsent(versionedPartitionName, vpn -> new PartitionDelta(versionedPartitionName, wal, maxValueSizeInIndex, null)); } partitionDelta.acquire(); } try { return tx.tx(partitionDelta); } finally { partitionDelta.release(); } } public boolean mergeable() { return updateSinceLastMerge.get() > mergeAfterNUpdates; } public void merge(IoStats ioStats, PartitionIndex partitionIndex, VersionedPartitionProvider versionedPartitionProvider, CurrentVersionProvider currentVersionProvider, boolean force) throws Exception { if (!force && !mergeable()) { return; } long had = updateSinceLastMerge.get(); if (!merging.compareAndSet(0, had)) { LOG.warn("Trying to merge DeltaStripe:" + partitionIndex + " while another merge is already in progress."); return; } CompactionStats compactionStats = amzaStats.beginCompaction(CompactionFamily.merge, "merge-delta-stripe" + getId()); try { DeltaWAL wal = deltaWAL.get(); updateSinceLastMerge.set(0); boolean mergeDelta = mergeDelta(ioStats, compactionStats, partitionIndex, versionedPartitionProvider, currentVersionProvider, wal, false, () -> deltaWALFactory.create(ioStats, wal.getId()) ); if (!mergeDelta) { updateSinceLastMerge.addAndGet(had); } merging.set(0); } finally { compactionStats.finished(); } } private boolean mergeDelta( IoStats ioStats, WALCompactionStats walCompactionStats, PartitionIndex partitionIndex, VersionedPartitionProvider versionedPartitionProvider, CurrentVersionProvider currentVersionProvider, DeltaWAL wal, boolean validate, Callable<DeltaWAL> newWAL) throws Exception { List<Future<MergeResult>> futures = new ArrayList<>(); writeAcquireAll(); try { synchronized (partitionDeltas) { for (Map.Entry<VersionedPartitionName, PartitionDelta> e : partitionDeltas.entrySet()) { if (e.getValue().isMerging()) { LOG.warn("Ingress is faster than we can merge!"); return false; } } LOG.info("Merging delta partitions..."); DeltaWAL newDeltaWAL = newWAL.call(); deltaWAL.set(newDeltaWAL); amzaStats.deltaStripeLoad(index, 0, 0); AtomicLong mergeable = new AtomicLong(); AtomicLong merged = new AtomicLong(); AtomicLong unmerged = new AtomicLong(); Iterator<Entry<VersionedPartitionName, PartitionDelta>> iter = partitionDeltas.entrySet().iterator(); while (iter.hasNext()) { Entry<VersionedPartitionName, PartitionDelta> entry = iter.next(); VersionedPartitionName versionedPartitionName = entry.getKey(); if (currentVersionProvider.isCurrentVersion(versionedPartitionName)) { PartitionDelta mergeableDelta = entry.getValue(); if (mergeableDelta.needsToMerge()) { long mergeableCount = mergeableDelta.size(); unmerged.addAndGet(mergeableCount); PartitionDelta currentDelta = new PartitionDelta(versionedPartitionName, newDeltaWAL, maxValueSizeInIndex, mergeableDelta); entry.setValue(currentDelta); mergeable.incrementAndGet(); futures.add(mergeDeltaThreads.submit(() -> { return getMergeResult(ioStats, walCompactionStats, partitionIndex, versionedPartitionProvider, currentVersionProvider, validate, mergeable, merged, unmerged, versionedPartitionName, mergeableCount, currentDelta); })); } else { LOG.warn("Ignored merge for empty partition {}", versionedPartitionName); iter.remove(); } } else { LOG.warn("Ignored merge for obsolete partition {}", versionedPartitionName); iter.remove(); } } amzaStats.deltaStripeMerge(index, 0, 0); } } catch (Exception x) { parkSick("This is catastrophic." + " We have permanently parked this thread." + " This delta {} can no longer accept writes." + " You likely need to restart this instance", x); } finally { writeReleaseAll(); } List<MergeResult> results = Lists.newArrayListWithCapacity(futures.size()); try { for (Future<MergeResult> f : futures) { MergeResult result = f.get(); if (result != null) { results.add(result); } } ListMultimap<String, WALIndex> providerIndexes = ArrayListMultimap.create(); for (MergeResult result : results) { if (result.partitionStore != null) { result.partitionStore.flush(true); } if (result.walIndex != null) { providerIndexes.put(result.walIndex.getProviderName(), result.walIndex); } } for (Entry<String, Collection<WALIndex>> entry : providerIndexes.asMap().entrySet()) { WALIndexProvider<?> walIndexProvider = walIndexProviderRegistry.getWALIndexProvider(entry.getKey()); if (walIndexProvider != null) { walIndexProvider.flush((Iterable) entry.getValue(), true); } } } catch (Exception x) { parkSick("This is catastrophic. Failure finalizing merge.", x); } try { wal.awaitDerefenced(); LOG.info("Awaited clear references for delta partitions."); } catch (Exception x) { parkSick("This is catastrophic. Failure awaiting clear references.", x); } try { highwaterStorage.flushLocal(); highwaterStorage.flush(index, true, () -> { flush(true); return null; }); } catch (Exception x) { parkSick("This is catastrophic. Failure flushing highwaters.", x); } try { deltaWALFactory.destroy(wal); LOG.info("Compacted delta partitions."); } catch (Exception x) { parkSick("This is catastrophic. Failure destroying WAL.", x); } try { for (MergeResult result : results) { currentVersionProvider.invalidateDeltaIndexCache(result.versionedPartitionName); } } catch (Exception x) { parkSick("This is catastrophic. Failure invalidating delta index cache.", x); } return true; } private void parkSick(String message, Exception x) { sickThreads.sick(x); LOG.error(message + " We have permanently parked this thread." + " This delta {} can no longer accept writes." + " You likely need to restart this instance", new Object[] { index }, x); LockSupport.park(); } private MergeResult getMergeResult(IoStats ioStats, WALCompactionStats walCompactionStats, PartitionIndex partitionIndex, VersionedPartitionProvider versionedPartitionProvider, CurrentVersionProvider currentVersionProvider, boolean validate, AtomicLong mergeable, AtomicLong merged, AtomicLong unmerged, VersionedPartitionName versionedPartitionName, long mergeableCount, PartitionDelta currentDelta) throws Exception { MergeResult result = null; try { PartitionName partitionName = versionedPartitionName.getPartitionName(); walCompactionStats.add("partitions", 1); walCompactionStats.start(partitionName.toBase64()); try { while (true) { try { result = currentVersionProvider.tx(partitionName, null, (deltaIndex, stripeIndex, storageVersion) -> { MergeResult r; if (stripeIndex == -1) { LOG.warn("Ignored merge for partition {} with nonexistent storage", versionedPartitionName); r = null; } else { PartitionProperties properties = versionedPartitionProvider.getProperties(partitionName); if (properties == null) { LOG.warn("Ignored merge for partition {} with missing properties", versionedPartitionName); r = null; } else { r = currentDelta.merge(ioStats, highwaterStorage, partitionIndex, properties, stripeIndex, validate); } } sickThreads.recovered(); return r; }); break; } catch (Throwable x) { sickThreads.sick(x); if (validate) { LOG.error("Validation merge failed for partition:{} WAL storage must be purged and re-taken!", new Object[] { versionedPartitionName }, x); currentVersionProvider.abandonVersion(versionedPartitionName); break; } else { LOG.error("Background merge failed for partition:{} We will retry in case the issue can be resolved.", new Object[] { versionedPartitionName }, x); Thread.sleep(30_000L); } } } } finally { walCompactionStats.stop(partitionName.toBase64()); } return result; } finally { amzaStats.deltaStripeMerge(index, mergeable.decrementAndGet(), (unmerged.get() - merged.addAndGet(mergeableCount)) / (double) unmerged.get()); sickThreads.recovered(); } } public RowsChanged update(IoStats ioStats, boolean directApply, RowType rowType, HighwaterStorage highwaterStorage, VersionedPartitionName versionedPartitionName, PartitionStore partitionStore, byte[] prefix, Commitable updates, WALUpdated updated) throws Exception { long mergeDebt = merging.get(); if ((mergeDebt > 0 && mergeDebt + updateSinceLastMerge.get() > (2 * mergeAfterNUpdates)) || updateSinceLastMerge.get() > (2 * mergeAfterNUpdates)) { throw new DeltaOverCapacityException("Delta is full"); } if (directApply && mergeDebt > 0) { long highestTxId = partitionStore.mergedTxId(); int takeFromFactor = ringReader.getTakeFromFactor(versionedPartitionName.getPartitionName().getRingName(), 0); int[] taken = { 0 }; ackWaters.streamPartitionTxIds(versionedPartitionName, (member, txId) -> { if (txId >= highestTxId) { taken[0]++; if (taken[0] >= takeFromFactor) { return false; } } return true; }); if (taken[0] < takeFromFactor) { throw new DeltaOverCapacityException("Delta requires replication"); } } Map<WALKey, WALValue> apply = new LinkedHashMap<>(); List<KeyedTimestampId> removes = new ArrayList<>(); List<KeyedTimestampId> clobbers = new ArrayList<>(); List<byte[]> keys = new ArrayList<>(); List<WALValue> values = new ArrayList<>(); updates.commitable(null, (transactionId, key, value, valueTimestamp, valueTombstone, valueVersion) -> { Preconditions.checkArgument(valueTimestamp > 0, "Timestamp must be greater than zero"); Preconditions.checkArgument(valueVersion > 0, "Timestamp must be greater than zero"); keys.add(key); values.add(new WALValue(rowType, value, valueTimestamp, valueTombstone, valueVersion)); return true; }); writeAcquireOne(); try { DeltaWAL wal = deltaWAL.get(); RowsChanged[] rowsChanged = { null }; getPointers(ioStats, versionedPartitionName, partitionStore.getWalStorage(), (stream) -> { for (int i = 0; i < keys.size(); i++) { byte[] key = keys.get(i); WALValue update = values.get(i); if (!stream.stream(prefix, key, update.getValue(), update.getTimestampId(), update.getTombstoned(), update.getVersion())) { return false; } } return true; }, (_prefix, key, value, valueTimestamp, valueTombstone, valueVersion, ptrTimestamp, ptrTombstoned, ptrVersion, ptrFp, ptrHasValue, ptrValue) -> { WALKey walKey = new WALKey(prefix, key); WALValue walValue = new WALValue(rowType, value, valueTimestamp, valueTombstone, valueVersion); if (ptrFp == -1 && !ptrHasValue) { apply.put(walKey, walValue); } else if (CompareTimestampVersions.compare(ptrTimestamp, ptrVersion, valueTimestamp, valueVersion) < 0) { apply.put(walKey, walValue); WALTimestampId walTimestampId = new WALTimestampId(ptrTimestamp, ptrTombstoned); KeyedTimestampId keyedTimestampId = new KeyedTimestampId(prefix, key, walTimestampId.getTimestampId(), walTimestampId.getTombstoned()); clobbers.add(keyedTimestampId); if (valueTombstone && !ptrTombstoned) { removes.add(keyedTimestampId); } } else { amzaStats.deltaFirstCheckRemoves.increment(); } return true; }); long[] appliedCount = { 0 }; if (apply.isEmpty()) { rowsChanged[0] = new RowsChanged(versionedPartitionName, Collections.emptyMap(), removes, clobbers, -1, -1, index); } else { txPartitionDelta(versionedPartitionName, delta -> { WALHighwater partitionHighwater = null; if (delta.shouldWriteHighwater()) { partitionHighwater = highwaterStorage.getPartitionHighwater(versionedPartitionName, false); } DeltaWAL.DeltaWALApplied updateApplied; synchronized (oneWriterAtATimeLock) { Iterator<Entry<WALKey, WALValue>> iter = apply.entrySet().iterator(); while (iter.hasNext()) { Entry<WALKey, WALValue> entry = iter.next(); WALKey key = entry.getKey(); WALValue value = entry.getValue(); WALPointer got = delta.getPointer(key.prefix, key.key); if (got != null && CompareTimestampVersions.compare(got.getTimestampId(), got.getVersion(), value.getTimestampId(), value.getVersion()) >= 0) { iter.remove(); amzaStats.deltaSecondCheckRemoves.increment(); } } updateApplied = wal.update(ioStats, rowType, versionedPartitionName, apply, partitionHighwater); appliedCount[0] = apply.size(); for (int i = 0; i < updateApplied.fps.length; i++) { KeyValueHighwater keyValueHighwater = updateApplied.keyValueHighwaters[i]; long fp = updateApplied.fps[i]; delta.put(fp, keyValueHighwater.prefix, keyValueHighwater.key, keyValueHighwater.value, keyValueHighwater.valueTimestamp, keyValueHighwater.valueTombstone, keyValueHighwater.valueVersion); } delta.appendTxFps(prefix, updateApplied.txId, updateApplied.fps); rowsChanged[0] = new RowsChanged(versionedPartitionName, apply, removes, clobbers, updateApplied.txId, updateApplied.txId, index); } updated.updated(versionedPartitionName, updateApplied.txId); return true; }); } long unmergedUpdates = updateSinceLastMerge.addAndGet(appliedCount[0]); amzaStats.deltaStripeLoad(index, unmergedUpdates, unmergedUpdates / (double) mergeAfterNUpdates); if (unmergedUpdates > mergeAfterNUpdates) { synchronized (awakeCompactionsLock) { awakeCompactionsLock.notifyAll(); } } return rowsChanged[0]; } finally { writeReleaseOne(); } } public boolean takeRowsFromTransactionId(IoStats ioStats, VersionedPartitionName versionedPartitionName, WALStorage storage, long transactionId, RowStream rowStream) throws Exception { long[] lowestTxId = { -1 }; acquireOne(); try { txPartitionDelta(versionedPartitionName, delta -> { lowestTxId[0] = delta.lowestTxId(); return true; }); } finally { releaseOne(); } if ((lowestTxId[0] == -1 || lowestTxId[0] > transactionId) && !storage.takeRowUpdatesSince(ioStats, transactionId, rowStream)) { return false; } acquireOne(); try { return txPartitionDelta(versionedPartitionName, delta -> { return delta.takeRowsFromTransactionId(ioStats, transactionId, rowStream); }); } finally { releaseOne(); } } public boolean takeRowsFromTransactionId(IoStats ioStats, VersionedPartitionName versionedPartitionName, WALStorage storage, byte[] prefix, long transactionId, RowStream rowStream) throws Exception { long[] lowestTxId = { -1 }; acquireOne(); try { txPartitionDelta(versionedPartitionName, delta -> { lowestTxId[0] = delta.lowestTxId(prefix); return true; }); } finally { releaseOne(); } if ((lowestTxId[0] == -1 || lowestTxId[0] > transactionId) && !storage.takeRowUpdatesSince(prefix, transactionId, rowStream)) { return false; } acquireOne(); try { return txPartitionDelta(versionedPartitionName, delta -> { return delta.takeRowsFromTransactionId(ioStats, prefix, transactionId, rowStream); }); } finally { releaseOne(); } } public boolean takeAllRows(IoStats ioStats, VersionedPartitionName versionedPartitionName, WALStorage storage, RowStream rowStream) throws Exception { if (!storage.takeAllRows(ioStats, rowStream)) { return false; } acquireOne(); try { return txPartitionDelta(versionedPartitionName, delta -> { return delta.takeRowsFromTransactionId(ioStats, 0, rowStream); }); } finally { releaseOne(); } } // for testing WALValue get(IoStats ioStats, VersionedPartitionName versionedPartitionName, WALStorage storage, byte[] prefix, byte[] key) throws Exception { WALValue[] walValue = new WALValue[1]; get(ioStats, versionedPartitionName, storage, prefix, stream -> stream.stream(key), (_prefix, _key, value, timestamp, tombstoned, version) -> { if (timestamp != -1) { walValue[0] = new WALValue(null, value, timestamp, tombstoned, version); } return true; }); return walValue[0]; } public boolean get(IoStats ioStats, VersionedPartitionName versionedPartitionName, WALStorage storage, byte[] prefix, UnprefixedWALKeys keys, KeyValueStream stream) throws Exception { acquireOne(); try { return txPartitionDelta(versionedPartitionName, partitionDelta -> { return storage.streamValues(prefix, storageStream -> partitionDelta.get(ioStats, prefix, keys, (fp, rowType, _prefix, key, value, valueTimestamp, valueTombstoned, valueVersion) -> { if (valueTimestamp != -1) { return stream.stream(prefix, key, value, valueTimestamp, valueTombstoned, valueVersion); } else { return storageStream.stream(key); } }), (_prefix, key, value, valueTimestamp, valueTombstoned, valueVersion) -> { return stream.stream(prefix, key, value, valueTimestamp, valueTombstoned, valueVersion); }); }); } finally { releaseOne(); } } public boolean containsKeys(VersionedPartitionName versionedPartitionName, WALStorage storage, byte[] prefix, UnprefixedWALKeys keys, KeyContainedStream stream) throws Exception { acquireOne(); try { return storage.containsKeys(prefix, storageKeyStream -> txPartitionDelta(versionedPartitionName, delta -> delta.containsKeys(prefix, keys, (_prefix, key, timestamp, tombstoned, version, exists) -> { if (exists) { return stream.stream(prefix, key, !tombstoned, timestamp, version); } else { return storageKeyStream.stream(key); } })), stream); } finally { releaseOne(); } } private boolean getPointers(IoStats ioStats, VersionedPartitionName versionedPartitionName, WALStorage storage, KeyValues keyValues, KeyValuePointerStream stream) throws Exception { return storage.streamPointers( ioStats, (storageStream) -> { return txPartitionDelta(versionedPartitionName, partitionDelta -> partitionDelta.getPointers(keyValues, (prefix, key, value, valueTimestamp, valueTombstoned, valueVersion, pTimestamp, pTombstoned, pVersion, pFp, pHasValue, pValue) -> { if (pFp == -1 && !pHasValue) { return storageStream.stream(prefix, key, value, valueTimestamp, valueTombstoned, valueVersion); } else { return stream.stream(prefix, key, value, valueTimestamp, valueTombstoned, valueVersion, pTimestamp, pTombstoned, pVersion, pFp, pHasValue, pValue); } })); }, (prefix, key, value, valueTimestamp, valueTombstoned, valueVersion, ptrTimestamp, ptrTombstoned, ptrVersion, ptrFp, ptrHasValue, ptrValue) -> { if (ptrFp == -1 && !ptrHasValue) { return stream.stream(prefix, key, value, valueTimestamp, valueTombstoned, valueVersion, -1, false, -1, -1, false, null); } else { return stream.stream(prefix, key, value, valueTimestamp, valueTombstoned, valueVersion, ptrTimestamp, ptrTombstoned, ptrVersion, ptrFp, ptrHasValue, ptrValue); } }); } public boolean rangeScan(VersionedPartitionName versionedPartitionName, RangeScannable rangeScannable, byte[] fromPrefix, byte[] fromKey, byte[] toPrefix, byte[] toKey, KeyValueStream keyValueStream, boolean hydrateValues) throws Exception { acquireOne(); try { return txPartitionDelta(versionedPartitionName, delta -> { final DeltaPeekableElmoIterator iterator = delta.rangeScanIterator(fromPrefix, fromKey, toPrefix, toKey, hydrateValues); try { rangeScannable.rangeScan(fromPrefix, fromKey, toPrefix, toKey, new LatestKeyValueStream(iterator, keyValueStream), hydrateValues); return WALKey.decompose( fpRawKeyValueStream -> { Map.Entry<byte[], WALValue> d = iterator.last(); if (d != null || iterator.hasNext()) { if (d != null) { WALValue got = d.getValue(); if (!fpRawKeyValueStream.stream(-1, -1, got.getRowType(), d.getKey(), true, got.getValue(), got.getTimestampId(), got.getTombstoned(), got.getVersion(), null)) { return false; } } while (iterator.hasNext()) { d = iterator.next(); WALValue got = d.getValue(); if (!fpRawKeyValueStream.stream(-1, -1, got.getRowType(), d.getKey(), true, got.getValue(), got.getTimestampId(), got.getTombstoned(), got.getVersion(), null)) { return false; } } } return true; }, (txId, fp, rowType, prefix, key, hasValue, value, valueTimestamp, valueTombstoned, valueVersion, row) -> keyValueStream.stream(prefix, key, value, valueTimestamp, valueTombstoned, valueVersion)); } finally { iterator.close(); } }); } finally { releaseOne(); } } public boolean rowScan(VersionedPartitionName versionedPartitionName, Scannable scannable, KeyValueStream keyValueStream, boolean hydrateValues) throws Exception { acquireOne(); try { return txPartitionDelta(versionedPartitionName, delta -> { DeltaPeekableElmoIterator iterator = delta.rowScanIterator(hydrateValues); try { if (!scannable.rowScan(new LatestKeyValueStream(iterator, keyValueStream), hydrateValues)) { return false; } Map.Entry<byte[], WALValue> d = iterator.last(); if (d != null || iterator.hasNext()) { return WALKey.decompose( fpRawKeyValueStream -> { Map.Entry<byte[], WALValue> last = d; if (last != null) { WALValue got = last.getValue(); if (!fpRawKeyValueStream.stream(-1, -1, got.getRowType(), last.getKey(), true, got.getValue(), got.getTimestampId(), got.getTombstoned(), got.getVersion(), null)) { return false; } } while (iterator.hasNext()) { last = iterator.next(); WALValue got = last.getValue(); if (!fpRawKeyValueStream.stream(-1, -1, got.getRowType(), last.getKey(), true, got.getValue(), got.getTimestampId(), got.getTombstoned(), got.getVersion(), null)) { return false; } } return true; }, (txId, fp, rowType, prefix, key, hasValue, value, valueTimestamp, valueTombstoned, valueVersion, row) -> keyValueStream.stream(prefix, key, value, valueTimestamp, valueTombstoned, valueVersion)); } } finally { iterator.close(); } return true; }); } finally { releaseOne(); } } /** * Stupid expensive!!!! */ public long count(VersionedPartitionName versionedPartitionName, WALStorage storage) throws Exception { acquireOne(); try { return storage.count(stream -> txPartitionDelta(versionedPartitionName, delta -> delta.keys(stream))); } finally { releaseOne(); } } public long approximateCount(VersionedPartitionName versionedPartitionName, WALStorage storage) throws Exception { acquireOne(); try { long[] deltaSize = { -1 }; txPartitionDelta(versionedPartitionName, delta -> { deltaSize[0] = delta.size(); return true; }); return storage.approximateCount() + deltaSize[0]; } finally { releaseOne(); } } static class LatestKeyValueStream implements KeyValueStream { private final DeltaPeekableElmoIterator iterator; private final KeyValueStream keyValueStream; private Map.Entry<byte[], WALValue> d; public LatestKeyValueStream(DeltaPeekableElmoIterator iterator, KeyValueStream keyValueStream) { this.iterator = iterator; this.keyValueStream = keyValueStream; } @Override public boolean stream(byte[] prefix, byte[] key, byte[] value, long valueTimestamp, boolean valueTombstoned, long valueVersion) throws Exception { if (d == null && iterator.hasNext()) { d = iterator.next(); } boolean[] needsKey = { true }; byte[] pk = WALKey.compose(prefix, key); boolean complete = WALKey.decompose( txFpKeyValueStream -> { while (d != null && KeyUtil.compare(d.getKey(), pk) <= 0) { WALValue got = d.getValue(); if (Arrays.equals(d.getKey(), pk)) { needsKey[0] = false; } if (!txFpKeyValueStream.stream(-1, -1, got.getRowType(), d.getKey(), true, got.getValue(), got.getTimestampId(), got.getTombstoned(), got.getVersion(), null)) { return false; } if (iterator.hasNext()) { d = iterator.next(); } else { iterator.eos(); d = null; break; } } return true; }, (txId, fp, rowType, sPrefix, sKey, sHasValue, sValue, sValueTimestamp, sValueTombstoned, sValueVersion, row) -> keyValueStream.stream(sPrefix, sKey, sValue, sValueTimestamp, sValueTombstoned, sValueVersion)); if (!complete) { return false; } else if (needsKey[0]) { return keyValueStream.stream(prefix, key, value, valueTimestamp, valueTombstoned, valueVersion); } else { return true; } } } @Override public String toString() { return "DeltaStripeWALStorage{" + "index=" + index + '}'; } }