/*
* Copyright (C) 2012, 2016 higherfrequencytrading.com
* Copyright (C) 2016 Roman Leventov
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package net.openhft.chronicle.hash.impl.stage.iter;
import net.openhft.chronicle.bytes.BytesUtil;
import net.openhft.chronicle.hash.ChronicleHashCorruption;
import net.openhft.chronicle.hash.ChronicleHashRecoveryFailedException;
import net.openhft.chronicle.hash.Data;
import net.openhft.chronicle.hash.impl.CompactOffHeapLinearHashTable;
import net.openhft.chronicle.hash.impl.VanillaChronicleHash;
import net.openhft.chronicle.hash.impl.stage.entry.SegmentStages;
import net.openhft.chronicle.map.ChronicleHashCorruptionImpl;
import net.openhft.chronicle.map.ExternalMapQueryContext;
import net.openhft.chronicle.map.MapEntry;
import net.openhft.chronicle.map.VanillaChronicleMap;
import net.openhft.chronicle.map.impl.VanillaChronicleMapHolder;
import net.openhft.chronicle.map.impl.stage.entry.MapEntryStages;
import net.openhft.sg.StageRef;
import net.openhft.sg.Staged;
import static net.openhft.chronicle.map.ChronicleHashCorruptionImpl.*;
/**
 * Recovery routines that operate on the tier currently selected in {@link SegmentStages}:
 * rebuilding the tier's hash lookup and free list from the entries that pass validation
 * ({@link #recoverTier}), and removing entries whose keys are duplicated within a segment
 * ({@link #removeDuplicatesInSegment}).
 *
 * <p>Every detected inconsistency is passed to the {@code report}/{@code reportException}
 * helpers statically imported from {@link ChronicleHashCorruptionImpl}.
 */
@Staged
public class TierRecovery {

    @StageRef VanillaChronicleMapHolder<?, ?, ?> mh;
    @StageRef SegmentStages s;
    @StageRef MapEntryStages<?, ?> e;
    @StageRef IterationKeyHashCode khc;

    /**
     * Rebuilds the hash lookup and the free list of the current tier.
     *
     * <p>The free list is cleared first. Then every non-empty hash lookup slot is cleared,
     * the entry it points to is validated with {@link #checkEntry}, and, if valid, the chunks
     * of the entry are marked as occupied in the free list and the slot is re-inserted, probing
     * from {@code hl.hlPos(searchKey)} with {@code hl.step()}. Slots pointing to invalid or
     * duplicate entries stay cleared. Finally {@link #shiftHashLookupEntries()} is run.
     *
     * @param segmentIndex the expected segment index of the entries in this tier, or a negative
     *        value if it is not known yet; in the latter case it is taken from the first entry
     *        that passes {@link #checkEntry}
     * @param corruptionListener the listener passed through to the report helpers
     * @param corruption the reusable corruption event object passed through to the report
     *        helpers
     * @return the segment index: the given {@code segmentIndex}, or the value established from
     *         the entries' keys if the given one was negative and at least one entry was valid
     * @throws ChronicleHashRecoveryFailedException if no slot could be found to re-insert
     *         a valid entry
     */
    public int recoverTier(
            int segmentIndex, ChronicleHashCorruption.Listener corruptionListener,
            ChronicleHashCorruptionImpl corruption) {
        s.freeList.clearAll();

        VanillaChronicleHash<?, ?, ?, ?> h = mh.h();
        CompactOffHeapLinearHashTable hl = h.hashLookup;
        long hlAddr = s.tierBaseAddr;

        long validEntries = 0;
        long hlPos = 0;
        do {
            long hlEntry = hl.readEntry(hlAddr, hlPos);
            // `break nextHlPos` leaves this `if` and proceeds to the next hash lookup slot.
            nextHlPos:
            if (!hl.empty(hlEntry)) {
                // (*)
                hl.clearEntry(hlAddr, hlPos);
                if (validEntries >= h.maxEntriesPerHashLookup) {
                    report(corruptionListener, corruption, segmentIndex, () ->
                            format("Too many entries in tier with index {}, max is {}",
                                    s.tierIndex, h.maxEntriesPerHashLookup)
                    );
                    break nextHlPos;
                }

                long searchKey = hl.key(hlEntry);
                long entryPos = hl.value(hlEntry);
                int si = checkEntry(searchKey, entryPos, segmentIndex,
                        corruptionListener, corruption);
                if (si < 0) {
                    break nextHlPos;
                } else {
                    s.freeList.setRange(entryPos, entryPos + e.entrySizeInChunks);
                    segmentIndex = si;
                }

                // Snapshot the key location and the size of the entry being re-inserted while
                // `e` still points to it: the duplicate check in the loop below repositions `e`
                // onto other entries, so reading these fields later could describe a different
                // entry.
                long currentKeyOffset = e.keyOffset;
                long currentKeySize = e.keySize;
                int currentEntrySizeInChunks = e.entrySizeInChunks;

                // The entry has passed all checks, re-insert:
                long startInsertPos = hl.hlPos(searchKey);
                long insertPos = startInsertPos;
                do {
                    long hlInsertEntry = hl.readEntry(hlAddr, insertPos);
                    if (hl.empty(hlInsertEntry)) {
                        hl.writeEntry(hlAddr, insertPos, hl.entry(searchKey, entryPos));
                        validEntries++;
                        break nextHlPos;
                    }
                    if (insertPos == hlPos) {
                        // We have probed up to the slot this entry was taken from without
                        // finding a hole. Even if the hash lookup was corrupted and all other
                        // slots are dirty, at least the slot cleared at (*) should be empty.
                        // If it is dirty, only a concurrently modifying thread could have
                        // occupied it.
                        throw new ChronicleHashRecoveryFailedException(
                                "Concurrent modification of " + h.toIdentityString() +
                                        " while recovery procedure is in progress");
                    }
                    // `break checkDuplicateKeys` skips the key comparison and continues probing.
                    checkDuplicateKeys:
                    if (hl.key(hlInsertEntry) == searchKey) {
                        long anotherEntryPos = hl.value(hlInsertEntry);
                        if (anotherEntryPos == entryPos) {
                            // The very same entry is already referenced from this slot.
                            validEntries++;
                            break nextHlPos;
                        }
                        if (insertPos >= 0 && insertPos < hlPos) {
                            // Slots before hlPos were already processed by the outer loop, so
                            // the entry was validated then; only re-read it. (Its chunks are
                            // already set in the free list, so checkEntry()'s overlap check
                            // would reject it.)
                            e.readExistingEntry(anotherEntryPos);
                        } else if (checkEntry(searchKey, anotherEntryPos, segmentIndex,
                                corruptionListener, corruption) < 0) {
                            break checkDuplicateKeys;
                        }
                        if (e.keySize == currentKeySize &&
                                BytesUtil.bytesEqual(s.segmentBS, currentKeyOffset,
                                        s.segmentBS, e.keyOffset, currentKeySize)) {
                            report(corruptionListener, corruption, segmentIndex, () ->
                                    format("Entries with duplicate keys within a tier: " +
                                                    "at pos {} and {} with key {}, first value is {}",
                                            entryPos, anotherEntryPos, e.key(), e.value())
                            );
                            // Drop the entry being re-inserted: release the chunks that were
                            // marked as occupied above.
                            s.freeList.clearRange(
                                    entryPos, entryPos + currentEntrySizeInChunks);
                            break nextHlPos;
                        }
                    }
                    insertPos = hl.step(insertPos);
                } while (insertPos != startInsertPos);
                throw new ChronicleHashRecoveryFailedException(
                        "HashLookup overflow should never occur. " +
                                "It might also be concurrent access to " + h.toIdentityString() +
                                " while recovery procedure is in progress");
            }
            hlPos = hl.step(hlPos);
        } while (hlPos != 0);
        shiftHashLookupEntries();
        return segmentIndex;
    }

    /**
     * Moves hash lookup entries of the current tier closer to their home slots.
     *
     * <p>For every non-empty slot, the probe sequence from {@code hl.hlPos(searchKey)} up to
     * the slot itself is scanned; if an empty slot is found on the way, the entry is written
     * there and removed from its old position with {@code hl.remove()}.
     */
    private void shiftHashLookupEntries() {
        VanillaChronicleHash<?, ?, ?, ?> h = mh.h();
        CompactOffHeapLinearHashTable hl = h.hashLookup;
        long hlAddr = s.tierBaseAddr;
        long hlPos = 0;
        long steps = 0;
        do {
            long hlEntry = hl.readEntry(hlAddr, hlPos);
            if (!hl.empty(hlEntry)) {
                long searchKey = hl.key(hlEntry);
                long hlHolePos = hl.hlPos(searchKey);
                while (hlHolePos != hlPos) {
                    long hlHoleEntry = hl.readEntry(hlAddr, hlHolePos);
                    if (hl.empty(hlHoleEntry)) {
                        hl.writeEntry(hlAddr, hlHolePos, hlEntry);
                        // NOTE(review): remove() returning a position other than hlPos
                        // presumably means another entry was shifted into hlPos; step back so
                        // that the slot is visited again after the step() below.
                        if (hl.remove(hlAddr, hlPos) != hlPos) {
                            hlPos = hl.stepBack(hlPos);
                            steps--;
                        }
                        break;
                    }
                    hlHolePos = hl.step(hlHolePos);
                }
            }
            hlPos = hl.step(hlPos);
            steps++;
            // `steps == 0` keeps the loop going when a step back from position 0 was immediately
            // followed by the step forward, so hlPos is 0 again without a full pass being made.
        } while (hlPos != 0 || steps == 0);
    }

    /**
     * Removes hash lookup slots of the current tier that point to entries whose key is
     * resolved by a regular map query to an entry at a different address, then fixes the
     * tier's entries counter and lowest possibly free chunk.
     *
     * @param corruptionListener the listener passed through to the report helpers
     * @param corruption the reusable corruption event object passed through to the report
     *        helpers
     */
    public void removeDuplicatesInSegment(
            ChronicleHashCorruption.Listener corruptionListener,
            ChronicleHashCorruptionImpl corruption) {
        long startHlPos = 0L;
        VanillaChronicleMap<?, ?, ?> m = mh.m();
        CompactOffHeapLinearHashTable hashLookup = m.hashLookup;
        long currentTierBaseAddr = s.tierBaseAddr;
        // Start the iteration from the first empty slot.
        while (!hashLookup.empty(hashLookup.readEntry(currentTierBaseAddr, startHlPos))) {
            startHlPos = hashLookup.step(startHlPos);
        }
        long hlPos = startHlPos;
        int steps = 0;
        long entries = 0;
        tierIteration:
        do {
            hlPos = hashLookup.step(hlPos);
            steps++;
            long entry = hashLookup.readEntry(currentTierBaseAddr, hlPos);
            if (!hashLookup.empty(entry)) {
                e.readExistingEntry(hashLookup.value(entry));
                Data key = e.key();
                try (ExternalMapQueryContext<?, ?, ?> c = m.queryContext(key)) {
                    MapEntry<?, ?> entry2 = c.entry();
                    Data<?> key2 = ((MapEntry) c).key();
                    long keyAddress = key.bytes().address(key.offset());
                    long key2Address = key2.bytes().address(key2.offset());
                    // The query found the key at another address than the entry under
                    // iteration: the latter is a duplicate.
                    if (key2Address != keyAddress) {
                        report(corruptionListener, corruption, s.segmentIndex, () ->
                                format("entries with duplicate key {} in segment {}: " +
                                                "with values {} and {}, removing the latter",
                                        key, c.segmentIndex(),
                                        entry2 != null ? ((MapEntry) c).value() : "<deleted>",
                                        !e.entryDeleted() ? e.value() : "<deleted>")
                        );
                        if (hashLookup.remove(currentTierBaseAddr, hlPos) != hlPos) {
                            hlPos = hashLookup.stepBack(hlPos);
                            steps--;
                        }
                        continue tierIteration;
                    }
                }
                entries++;
            }
            // The `steps == 0` condition, together with the updates of `steps` in the loop,
            // fixes the bug that occurs when shift deletion happens on the first entry of
            // the tier: hlPos becomes equal to the start position after visiting only
            // a single entry, without the whole loop having been made.
        } while (hlPos != startHlPos || steps == 0);

        recoverTierEntriesCounter(entries, corruptionListener, corruption);
        recoverLowestPossibleFreeChunkTiered(corruptionListener, corruption);
    }

    /**
     * Reports and overwrites the stored entries counter of the current tier if it differs
     * from the given number of entries.
     *
     * @param entries the number of entries counted in the tier
     * @param corruptionListener the listener passed through to the report helper
     * @param corruption the reusable corruption event object passed through to the report helper
     */
    private void recoverTierEntriesCounter(
            long entries, ChronicleHashCorruption.Listener corruptionListener,
            ChronicleHashCorruptionImpl corruption) {
        if (s.tierEntries() != entries) {
            report(corruptionListener, corruption, s.segmentIndex, () ->
                    format("Wrong number of entries counter for tier with index {}, " +
                            "stored: {}, should be: {}", s.tierIndex, s.tierEntries(), entries)
            );
            s.tierEntries(entries);
        }
    }

    /**
     * Reports and overwrites the stored "lowest possibly free chunk" of the current tier if it
     * differs from the first clear bit of the free list. If {@code nextClearBit(0)} returns
     * {@code -1}, {@code actualChunksPerSegmentTier} is used as the expected value.
     *
     * @param corruptionListener the listener passed through to the report helper
     * @param corruption the reusable corruption event object passed through to the report helper
     */
    private void recoverLowestPossibleFreeChunkTiered(
            ChronicleHashCorruption.Listener corruptionListener,
            ChronicleHashCorruptionImpl corruption) {
        long lowestFreeChunk = s.freeList.nextClearBit(0);
        if (lowestFreeChunk == -1)
            lowestFreeChunk = mh.m().actualChunksPerSegmentTier;
        if (s.lowestPossiblyFreeChunk() != lowestFreeChunk) {
            // A lambda can only capture an effectively final local.
            long finalLowestFreeChunk = lowestFreeChunk;
            report(corruptionListener, corruption, s.segmentIndex, () ->
                    format("wrong lowest free chunk for tier with index {}, " +
                                    "stored: {}, should be: {}",
                            s.tierIndex, s.lowestPossiblyFreeChunk(), finalLowestFreeChunk)
            );
            s.lowestPossiblyFreeChunk(lowestFreeChunk);
        }
    }

    /**
     * Validates the entry at the given position of the current tier and leaves {@link #e}
     * positioned on it. The checks are performed in this order, the first failure is reported
     * and ends the validation:
     * <ol>
     *     <li>{@code entryPos} is within {@code [0, actualChunksPerSegmentTier)};</li>
     *     <li>the entry can be read without an exception;</li>
     *     <li>the key end doesn't exceed the segment bytes capacity;</li>
     *     <li>the segment index computed from the key hash code is within
     *     {@code [0, actualSegments)};</li>
     *     <li>the search key computed from the key hash code equals {@code searchKey};</li>
     *     <li>the entry end plus extra checksum bytes doesn't exceed the segment bytes
     *     capacity;</li>
     *     <li>the stored checksum equals the checksum computed from the entry;</li>
     *     <li>the chunks of the entry are clear in the free list, i. e. the entry doesn't
     *     overlap with an entry already accepted;</li>
     *     <li>if {@code segmentIndex} is non-negative, it equals the segment index computed
     *     from the key hash code.</li>
     * </ol>
     *
     * @param searchKey the search key stored in the hash lookup slot
     * @param entryPos the entry position (in chunks) stored in the hash lookup slot
     * @param segmentIndex the expected segment index, or a negative value if unknown
     * @param corruptionListener the listener passed through to the report helpers
     * @param corruption the reusable corruption event object passed through to the report
     *        helpers
     * @return {@code -1} if any check fails; otherwise {@code segmentIndex} if it is
     *         non-negative, or the segment index computed from the entry key hash code
     */
    private int checkEntry(
            long searchKey, long entryPos, int segmentIndex,
            ChronicleHashCorruption.Listener corruptionListener,
            ChronicleHashCorruptionImpl corruption) {
        VanillaChronicleHash<?, ?, ?, ?> h = mh.h();
        if (entryPos < 0 || entryPos >= h.actualChunksPerSegmentTier) {
            report(corruptionListener, corruption, segmentIndex, () ->
                    format("Entry pos is out of range: {}, should be 0-{}",
                            entryPos, h.actualChunksPerSegmentTier - 1)
            );
            return -1;
        }
        try {
            e.readExistingEntry(entryPos);
        } catch (Exception ex) {
            reportException(corruptionListener, corruption, segmentIndex,
                    () -> "Exception while reading entry key size", ex);
            return -1;
        }
        if (e.keyEnd() > s.segmentBytes.capacity()) {
            report(corruptionListener, corruption, segmentIndex, () ->
                    format("Wrong key size: {}", e.keySize)
            );
            return -1;
        }

        long keyHashCode = khc.keyHashCode();
        int segmentIndexFromKey = h.hashSplitting.segmentIndex(keyHashCode);
        if (segmentIndexFromKey < 0 || segmentIndexFromKey >= h.actualSegments) {
            report(corruptionListener, corruption, segmentIndex, () ->
                    format("Segment index from the entry key hash code is out of range: {}, " +
                                    "should be 0-{}, entry key: {}",
                            segmentIndexFromKey, h.actualSegments - 1, e.key())
            );
            return -1;
        }

        long segmentHashFromKey = h.hashSplitting.segmentHash(keyHashCode);
        long searchKeyFromKey = h.hashLookup.maskUnsetKey(segmentHashFromKey);
        if (searchKey != searchKeyFromKey) {
            report(corruptionListener, corruption, segmentIndex, () ->
                    format("HashLookup searchKey: {}, HashLookup searchKey " +
                                    "from the entry key hash code: {}, entry key: {}, entry pos: {}",
                            searchKey, searchKeyFromKey, e.key(), entryPos)
            );
            return -1;
        }

        try {
            // e.entryEnd() implicitly reads the value size, to be computed
            long entryAndChecksumEnd = e.entryEnd() + e.checksumStrategy.extraEntryBytes();
            if (entryAndChecksumEnd > s.segmentBytes.capacity()) {
                report(corruptionListener, corruption, segmentIndex, () ->
                        format("Wrong value size: {}, key: {}", e.valueSize, e.key())
                );
                return -1;
            }
        } catch (Exception ex) {
            reportException(corruptionListener, corruption, segmentIndex, () ->
                    "Exception while reading entry value size, key: " + e.key(), ex);
            return -1;
        }

        int storedChecksum = e.checksumStrategy.storedChecksum();
        int checksumFromEntry = e.checksumStrategy.computeChecksum();
        if (storedChecksum != checksumFromEntry) {
            report(corruptionListener, corruption, segmentIndex, () ->
                    format("Checksum doesn't match, stored: {}, should be from " +
                                    "the entry bytes: {}, key: {}, value: {}",
                            storedChecksum, checksumFromEntry, e.key(), e.value())
            );
            return -1;
        }

        if (!s.freeList.isRangeClear(entryPos, entryPos + e.entrySizeInChunks)) {
            report(corruptionListener, corruption, segmentIndex, () ->
                    format("Overlapping entry: positions {}-{}, key: {}, value: {}",
                            entryPos, entryPos + e.entrySizeInChunks - 1, e.key(), e.value())
            );
            return -1;
        }

        if (segmentIndex < 0) {
            // The segment index is not established yet, take it from this entry.
            return segmentIndexFromKey;
        } else {
            if (segmentIndex != segmentIndexFromKey) {
                report(corruptionListener, corruption, segmentIndex, () ->
                        format("Expected segment index: {}, segment index from the entry key: {}, " +
                                        "key: {}, value: {}",
                                segmentIndex, segmentIndexFromKey, e.key(), e.value())
                );
                return -1;
            } else {
                return segmentIndex;
            }
        }
    }
}