package org.infinispan.persistence.sifs; import java.io.IOException; import java.lang.ref.SoftReference; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.Stack; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; import org.infinispan.persistence.spi.PersistenceException; import org.infinispan.util.TimeService; import org.infinispan.util.logging.Log; import org.infinispan.util.logging.LogFactory; /** * The recursive index structure. References to children are held in soft references, * which allows JVM-handled caching and reduces the amount of reads required while * evading OOMs if the index gets too big. * * @author Radim Vansa <rvansa@redhat.com> */ class IndexNode { private static final Log log = LogFactory.getLog(IndexNode.class); private static final boolean trace = log.isTraceEnabled(); private static final byte HAS_LEAVES = 1; private static final byte HAS_NODES = 2; private static final int INNER_NODE_HEADER_SIZE = 5; private static final int INNER_NODE_REFERENCE_SIZE = 10; private static final int LEAF_NODE_REFERENCE_SIZE = 10; public static final int RESERVED_SPACE = INNER_NODE_HEADER_SIZE + 2 * Math.max(INNER_NODE_REFERENCE_SIZE, LEAF_NODE_REFERENCE_SIZE); private Index.Segment segment; private byte[] prefix; private byte[][] keyParts; InnerNode[] innerNodes; private LeafNode[] leafNodes; private ReadWriteLock lock = new ReentrantReadWriteLock(); private long offset = -1; private int contentLength = -1; private int totalLength = -1; private int occupiedSpace; public enum RecordChange { INCREASE, INCREASE_FOR_OLD, MOVE, DECREASE, } public IndexNode(Index.Segment segment, long offset, int occupiedSpace) throws IOException { this.segment = segment; this.offset = offset; this.occupiedSpace = occupiedSpace; ByteBuffer buffer = loadBuffer(segment.getIndexFile(), offset, occupiedSpace); prefix = new byte[buffer.getShort()]; buffer.get(prefix); byte flags = buffer.get(); int numKeyParts = buffer.getShort(); keyParts = new byte[numKeyParts][]; for (int i = 0; i < numKeyParts; ++i) { keyParts[i] = new byte[buffer.getShort()]; buffer.get(keyParts[i]); } if ((flags & HAS_LEAVES) != 0) { leafNodes = new LeafNode[numKeyParts + 1]; for (int i = 0; i < numKeyParts + 1; ++i) { leafNodes[i] = new LeafNode(buffer.getInt(), buffer.getInt(), buffer.getShort()); } } else if ((flags & HAS_NODES) != 0){ innerNodes = new InnerNode[numKeyParts + 1]; for (int i = 0; i < numKeyParts + 1; ++i) { innerNodes[i] = new InnerNode(buffer.getLong(), buffer.getShort()); } } else { // the default leafNodes = LeafNode.EMPTY_ARRAY; } if (trace) { log.tracef("Loaded %08x from %d:%d (length %d)", System.identityHashCode(this), offset, occupiedSpace, length()); } } private static ByteBuffer loadBuffer(FileChannel indexFile, long offset, int occupiedSpace) throws IOException { ByteBuffer buffer = ByteBuffer.allocate(occupiedSpace); int read = 0; do { int nowRead = indexFile.read(buffer, offset + read); if (nowRead < 0) { throw new IOException("Cannot read record [" + offset + ":" + occupiedSpace + "] (already read " + read + "), file size is " + indexFile.size()); } read += nowRead; } while (read < occupiedSpace); buffer.rewind(); return buffer; } IndexNode(Index.Segment segment, byte[] newPrefix, byte[][] newKeyParts, LeafNode[] newLeafNodes) { this.segment = segment; this.prefix = newPrefix; this.keyParts = newKeyParts; this.leafNodes = newLeafNodes; } IndexNode(Index.Segment segment, byte[] newPrefix, byte[][] newKeyParts, InnerNode[] newInnerNodes) { this.segment = segment; this.prefix = newPrefix; this.keyParts = newKeyParts; this.innerNodes = newInnerNodes; } @Override public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; IndexNode indexNode = (IndexNode) o; if (!Arrays.equals(innerNodes, indexNode.innerNodes)) return false; if (!Arrays.equals(leafNodes, indexNode.leafNodes)) return false; if (!Arrays.equals(prefix, indexNode.prefix)) return false; if (!Arrays.deepEquals(keyParts, indexNode.keyParts)) return false; return true; } /** * Can be called only from single writer thread (therefore the write lock guards only other readers) * @param other */ public void replaceContent(IndexNode other) throws IOException { try { lock.writeLock().lock(); this.prefix = other.prefix; this.keyParts = other.keyParts; this.innerNodes = other.innerNodes; this.leafNodes = other.leafNodes; this.contentLength = -1; this.totalLength = -1; } finally { lock.writeLock().unlock(); } // don't have to acquire any lock here // the only node with offset < 0 is the root - we can't lose reference to it if (offset >= 0) { store(new Index.IndexSpace(offset, occupiedSpace)); } } // called only internally or for root void store(Index.IndexSpace indexSpace) throws IOException { this.offset = indexSpace.offset; this.occupiedSpace = indexSpace.length; ByteBuffer buffer = ByteBuffer.allocate(length()); buffer.putShort((short) prefix.length); buffer.put(prefix); byte flags = 0; if (innerNodes != null && innerNodes.length != 0) { flags |= HAS_NODES; } else if (leafNodes != null && leafNodes.length != 0) { flags |= HAS_LEAVES; } buffer.put(flags); buffer.putShort((short) keyParts.length); for (int i = 0; i < keyParts.length; ++i) { buffer.putShort((short) keyParts[i].length); buffer.put(keyParts[i]); } if (innerNodes != null) { for (int i = 0; i < innerNodes.length; ++i) { buffer.putLong(innerNodes[i].offset); buffer.putShort((short) innerNodes[i].length); } } else { for (int i = 0; i < leafNodes.length; ++i) { buffer.putInt(leafNodes[i].file); buffer.putInt(leafNodes[i].offset); buffer.putShort(leafNodes[i].numRecords); } } buffer.flip(); segment.getIndexFile().write(buffer, offset); if (trace) { log.tracef("Persisted %08x (length %d, %d %s) to %d:%d", System.identityHashCode(this), length(), innerNodes != null ? innerNodes.length : leafNodes.length, innerNodes != null ? "children" : "leaves", offset, occupiedSpace); } } private static class Path { public IndexNode node; public int index; private Path(IndexNode node, int index) { this.node = node; this.index = index; } } public enum ReadOperation { GET_RECORD { @Override protected EntryRecord apply(LeafNode leafNode, byte[] key, FileProvider fileProvider, TimeService timeService) throws IOException, IndexNodeOutdatedException { return leafNode.loadRecord(fileProvider, key, timeService); } }, GET_POSITION { @Override protected EntryPosition apply(LeafNode leafNode, byte[] key, FileProvider fileProvider, TimeService timeService) throws IOException, IndexNodeOutdatedException { EntryRecord hak = leafNode.loadHeaderAndKey(fileProvider); if (Arrays.equals(hak.getKey(), key)) { if (hak.getHeader().expiryTime() > 0 && hak.getHeader().expiryTime() <= timeService.wallClockTime()) { if (trace) { log.tracef("Found node on %d:%d but it is expired", leafNode.file, leafNode.offset); } return null; } return leafNode; } else { if (trace) { log.tracef("Found node on %d:%d but key does not match", leafNode.file, leafNode.offset); } } return null; } }, GET_INFO { @Override protected EntryInfo apply(LeafNode leafNode, byte[] key, FileProvider fileProvider, TimeService timeService) throws IOException, IndexNodeOutdatedException { EntryRecord hak = leafNode.loadHeaderAndKey(fileProvider); if (Arrays.equals(hak.getKey(), key)) { return leafNode; } else { if (trace) { log.tracef("Found node on %d:%d but key does not match", leafNode.file, leafNode.offset); } return null; } } }; protected abstract <T> T apply(LeafNode leafNode, byte[] key, FileProvider fileProvider, TimeService timeService) throws IOException, IndexNodeOutdatedException; } public static <T> T applyOnLeaf(Index.Segment segment, byte[] key, Lock rootLock, ReadOperation operation) throws IOException { int attempts = 0; ArrayList<IndexNode> path = new ArrayList<IndexNode>(); for (;;) { rootLock.lock(); IndexNode node = segment.getRoot(); Lock parentLock = rootLock, currentLock = null; try { while (node.innerNodes != null) { path.add(node); currentLock = node.lock.readLock(); currentLock.lock(); if (parentLock != null) { parentLock.unlock(); } parentLock = currentLock; int insertionPoint = node.getInsertionPoint(key); node = node.innerNodes[insertionPoint].getIndexNode(segment); if (node == null) { return null; } } currentLock = node.lock.readLock(); currentLock.lock(); if (node.leafNodes.length == 0) { return null; } int insertionPoint = node.getInsertionPoint(key); return operation.apply(node.leafNodes[insertionPoint], key, segment.getFileProvider(), segment.getTimeService()); } catch (IndexNodeOutdatedException e) { try { if (attempts > 10) { throw new PersistenceException("Index looks corrupt", e); } Thread.sleep(1000); attempts++; path.clear(); } catch (InterruptedException e1) { } // noop, we'll simply retry } finally { if (parentLock != currentLock && parentLock != null) parentLock.unlock(); if (currentLock != null) currentLock.unlock(); } } } public static void setPosition(IndexNode root, byte[] key, int file, int offset, int size, OverwriteHook overwriteHook, RecordChange recordChange) throws IOException { IndexNode node = root; Stack<Path> stack = new Stack<Path>(); while (node.innerNodes != null) { int insertionPoint = node.getInsertionPoint(key); stack.push(new Path(node, insertionPoint)); if (trace) { log.tracef("Pushed %08x (length %d, %d children) to stack (insertion point %d)", System.identityHashCode(node), node.length(), node.innerNodes.length, insertionPoint); } node = node.innerNodes[insertionPoint].getIndexNode(root.segment); } IndexNode copy = node.copyWith(key, file, offset, size, overwriteHook, recordChange); if (copy == node) { // no change was executed return; } if (trace) { log.tracef("Created %08x (length %d) from %08x (length %d), stack size %d", System.identityHashCode(copy), copy.length(), System.identityHashCode(node), node.length(), stack.size()); } Stack<IndexNode> garbage = new Stack<IndexNode>(); try { JoinSplitResult result = manageLength(root.segment, stack, node, copy, garbage); if (result == null) { return; } if (trace) { log.tracef("Created (1) %d new nodes, GC %08x", result.newNodes.size(), System.identityHashCode(node)); } garbage.push(node); for (;;) { if (stack.isEmpty()) { IndexNode newRoot; if (result.newNodes.size() == 1) { newRoot = result.newNodes.get(0); if (trace) { log.tracef("Setting new root %08x (index has shrunk)", System.identityHashCode(newRoot)); } } else { newRoot = IndexNode.emptyWithInnerNodes(root.segment).copyWith(0, 0, result.newNodes); if (trace) { log.tracef("Setting new root %08x (index has grown)", System.identityHashCode(newRoot)); } } newRoot.segment.setRoot(newRoot); return; } Path path = stack.pop(); copy = path.node.copyWith(result.from, result.to, result.newNodes); if (trace) { log.tracef("Created %08x (length %d) from %08x with the %d new nodes (%d - %d)", System.identityHashCode(copy), copy.length(), System.identityHashCode(path.node), result.newNodes.size(), result.from, result.to); } result = manageLength(path.node.segment, stack, path.node, copy, garbage); if (result == null) { if (trace) { log.tracef("No more index updates required"); } return; } if (trace) { log.tracef("Created (2) %d new nodes, GC %08x", result.newNodes.size(), System.identityHashCode(path.node)); } garbage.push(path.node); } } finally { while (!garbage.isEmpty()) { IndexNode oldNode = garbage.pop(); // this will be never unlocked, if the algorithm is correct, this node should be GC'ed soon. oldNode.lock.writeLock().lock(); if (oldNode.offset >= 0) { oldNode.segment.freeIndexSpace(oldNode.offset, oldNode.occupiedSpace); } } } } private static class JoinSplitResult { public final int from; public final int to; public final List<IndexNode> newNodes; private JoinSplitResult(int from, int to, List<IndexNode> newNodes) { this.from = from; this.to = to; this.newNodes = newNodes; } } private static JoinSplitResult manageLength(Index.Segment segment, Stack<Path> stack, IndexNode node, IndexNode copy, Stack<IndexNode> garbage) throws IOException { int from, to; if (copy.length() < segment.getMinNodeSize() && !stack.isEmpty()) { Path parent = stack.peek(); if (parent.node.innerNodes.length == 1) { // we have no siblings - we can't merge with them even when we're really short if (copy.length() <= node.occupiedSpace) { node.replaceContent(copy); return null; } else { return new JoinSplitResult(parent.index, parent.index, Collections.singletonList(copy)); } } int sizeWithLeft = Integer.MAX_VALUE; int sizeWithRight = Integer.MAX_VALUE; if (parent.index > 0) { sizeWithLeft = copy.length() + parent.node.innerNodes[parent.index - 1].length - INNER_NODE_HEADER_SIZE; } if (parent.index < parent.node.innerNodes.length - 1) { sizeWithRight = copy.length() + parent.node.innerNodes[parent.index + 1].length - INNER_NODE_HEADER_SIZE; } int joinWith; // this is just some kind of heuristic, may be changed later if (sizeWithLeft == Integer.MAX_VALUE) { joinWith = parent.index + 1; } else if (sizeWithRight == Integer.MAX_VALUE) { joinWith = parent.index - 1; } else if (sizeWithLeft > segment.getMaxNodeSize() && sizeWithRight > segment.getMaxNodeSize()) { joinWith = sizeWithLeft >= sizeWithRight ? parent.index - 1 : parent.index + 1; } else { joinWith = sizeWithLeft <= sizeWithRight ? parent.index - 1 : parent.index + 1; } if (joinWith < 0 || joinWith >= parent.node.innerNodes.length) { throw new IllegalStateException(String.format("parent %08x, %08x -> %08x: cannot join to %d, with left %d, with right %d, max %d", System.identityHashCode(parent.node), System.identityHashCode(node), System.identityHashCode(copy), joinWith, sizeWithLeft, sizeWithRight, segment.getMaxNodeSize())); } IndexNode joiner = parent.node.innerNodes[joinWith].getIndexNode(segment); byte[] middleKey = concat(parent.node.prefix, parent.node.keyParts[joinWith < parent.index ? parent.index - 1 : parent.index]); if (joinWith < parent.index) { copy = join(joiner, middleKey, copy); from = joinWith; to = parent.index; } else { copy = join(copy, middleKey, joiner); from = parent.index; to = joinWith; } garbage.push(joiner); } else if (copy.length() <= node.occupiedSpace) { if (copy.innerNodes != null && copy.innerNodes.length == 1 && stack.isEmpty()) { IndexNode child = copy.innerNodes[0].getIndexNode(copy.segment); return new JoinSplitResult(0, 0, Collections.singletonList(child)); } else { // special case where we only overwrite the key node.replaceContent(copy); return null; } } else if (stack.isEmpty()) { from = to = 0; } else { from = to = stack.peek().index; } if (copy.length() <= segment.getMaxNodeSize()) { return new JoinSplitResult(from, to, Collections.singletonList(copy)); } else { return new JoinSplitResult(from, to, copy.split()); } } private static IndexNode join(IndexNode left, byte[] middleKey, IndexNode right) throws IOException { byte[] newPrefix = commonPrefix(left.prefix, right.prefix); byte[][] newKeyParts = new byte[left.keyParts.length + right.keyParts.length + 1][]; newPrefix = commonPrefix(newPrefix == null ? left.prefix : newPrefix, middleKey); copyKeyParts(left.keyParts, 0, newKeyParts, 0, left.keyParts.length, left.prefix, newPrefix); byte[] rightmostKey; try { rightmostKey = left.rightmostKey(); } catch (IndexNodeOutdatedException e) { throw new IllegalStateException(e); } int commonLength = Math.abs(compare(middleKey, rightmostKey)); newKeyParts[left.keyParts.length] = substring(middleKey, newPrefix.length, commonLength); copyKeyParts(right.keyParts, 0, newKeyParts, left.keyParts.length + 1, right.keyParts.length, right.prefix, newPrefix); if (left.innerNodes != null && right.innerNodes != null) { InnerNode[] newInnerNodes = new InnerNode[left.innerNodes.length + right.innerNodes.length]; System.arraycopy(left.innerNodes, 0, newInnerNodes, 0, left.innerNodes.length); System.arraycopy(right.innerNodes, 0, newInnerNodes, left.innerNodes.length, right.innerNodes.length); return new IndexNode(left.segment, newPrefix, newKeyParts, newInnerNodes); } else if (left.leafNodes != null && right.leafNodes != null) { LeafNode[] newLeafNodes = new LeafNode[left.leafNodes.length + right.leafNodes.length]; System.arraycopy(left.leafNodes, 0, newLeafNodes, 0, left.leafNodes.length); System.arraycopy(right.leafNodes, 0, newLeafNodes, left.leafNodes.length, right.leafNodes.length); return new IndexNode(left.segment, newPrefix, newKeyParts, newLeafNodes); } else { throw new IllegalArgumentException("Cannot join " + left + " and " + right); } } public IndexNode copyWith(int oldNodesFrom, int oldNodesTo, List<IndexNode> newNodes) throws IOException { InnerNode[] newInnerNodes = new InnerNode[innerNodes.length + newNodes.size() - 1 - oldNodesTo + oldNodesFrom]; System.arraycopy(innerNodes, 0, newInnerNodes, 0, oldNodesFrom); System.arraycopy(innerNodes, oldNodesTo + 1, newInnerNodes, oldNodesFrom + newNodes.size(), innerNodes.length - oldNodesTo - 1); for (int i = 0; i < newNodes.size(); ++i) { IndexNode node = newNodes.get(i); Index.IndexSpace space = segment.allocateIndexSpace(node.length()); node.store(space); newInnerNodes[i + oldNodesFrom] = new InnerNode(node); } byte[][] newKeys = new byte[newNodes.size() - 1][]; byte[] newPrefix = prefix; for (int i = 0; i < newKeys.length; ++i) { try { // TODO: if all keys within the subtree are null (deleted), the new key will be null // will be fixed with proper index reduction newKeys[i] = newNodes.get(i + 1).leftmostKey(); if (newKeys[i] == null) { throw new IllegalStateException(); } } catch (IndexNodeOutdatedException e) { throw new IllegalStateException("Index cannot be outdated for segment updater thread", e); } newPrefix = commonPrefix(newPrefix, newKeys[i]); } byte[][] newKeyParts = new byte[keyParts.length + newNodes.size() - 1 - oldNodesTo + oldNodesFrom][]; copyKeyParts(keyParts, 0, newKeyParts, 0, oldNodesFrom, prefix, newPrefix); copyKeyParts(keyParts, oldNodesTo, newKeyParts, oldNodesFrom + newKeys.length, keyParts.length - oldNodesTo, prefix, newPrefix); for (int i = 0; i < newKeys.length; ++i) { newKeyParts[i + oldNodesFrom] = substring(newKeys[i], newPrefix.length, newKeys[i].length); } return new IndexNode(segment, newPrefix, newKeyParts, newInnerNodes); } private byte[] leftmostKey() throws IOException, IndexNodeOutdatedException { if (innerNodes != null) { for (int i = 0; i < innerNodes.length; ++i) { byte[] key = innerNodes[i].getIndexNode(segment).leftmostKey(); if (key != null) return key; } } else { for (int i = 0; i < leafNodes.length; ++i) { EntryRecord hak = leafNodes[i].loadHeaderAndKey(segment.getFileProvider()); if (hak != null && hak.getKey() != null) return hak.getKey(); } } return null; } private byte[] rightmostKey() throws IOException, IndexNodeOutdatedException { if (innerNodes != null) { for (int i = innerNodes.length - 1; i >= 0; --i) { byte[] key = innerNodes[i].getIndexNode(segment).rightmostKey(); if (key != null) return key; } } else { for (int i = leafNodes.length - 1; i >= 0; --i) { EntryRecord hak = leafNodes[i].loadHeaderAndKey(segment.getFileProvider()); if (hak != null && hak.getKey() != null) return hak.getKey(); } } return null; } /** * Called on the most bottom node * @param key * @param file * @param offset * @param recordChange * @return */ public IndexNode copyWith(byte[] key, int file, int offset, int size, OverwriteHook overwriteHook, RecordChange recordChange) throws IOException { if (leafNodes == null) throw new IllegalArgumentException(); byte[] newPrefix; byte[][] newKeyParts; LeafNode[] newLeafNodes; if (leafNodes.length == 0) { overwriteHook.setOverwritten(false, -1, -1); if (overwriteHook.check(-1, -1)) { return new IndexNode(segment, prefix, keyParts, new LeafNode[]{ new LeafNode(file, offset, (short) 1)}); } else { segment.getCompactor().free(file, size); return this; } } int insertPart = getInsertionPoint(key); LeafNode oldLeafNode = leafNodes[insertPart]; short numRecords = oldLeafNode.numRecords; switch (recordChange) { case INCREASE: case INCREASE_FOR_OLD: if (numRecords == Short.MAX_VALUE) { throw new IllegalStateException("Too many records for this key (short overflow)"); } numRecords++; break; case MOVE: break; case DECREASE: numRecords--; break; } EntryRecord hak; try { hak = oldLeafNode.loadHeaderAndKey(segment.getFileProvider()); } catch (IndexNodeOutdatedException e) { throw new IllegalStateException("Index cannot be outdated for segment updater thread", e); } int keyComp = compare(hak.getKey(), key); if (keyComp == 0) { if (numRecords > 0) { if (overwriteHook.check(oldLeafNode.file, oldLeafNode.offset)) { newPrefix = prefix; newKeyParts = keyParts; newLeafNodes = new LeafNode[leafNodes.length]; System.arraycopy(leafNodes, 0, newLeafNodes, 0, leafNodes.length); // Do not update the file and offset for DROPPED IndexRequests if (recordChange == RecordChange.INCREASE || recordChange == RecordChange.MOVE) { if (trace) { log.trace(String.format("Overwriting %d:%d with %d:%d (%d)", oldLeafNode.file, oldLeafNode.offset, file, offset, numRecords)); } newLeafNodes[insertPart] = new LeafNode(file, offset, numRecords); segment.getCompactor().free(oldLeafNode.file, hak.getHeader().totalLength()); } else { if (trace) { log.trace(String.format("Updating num records for %d:%d to %d", oldLeafNode.file, oldLeafNode.offset, numRecords)); } newLeafNodes[insertPart] = new LeafNode(oldLeafNode.file, oldLeafNode.offset, numRecords); } overwriteHook.setOverwritten(true, oldLeafNode.file, oldLeafNode.offset); } else { overwriteHook.setOverwritten(false, -1, -1); segment.getCompactor().free(file, size); return this; } } else { overwriteHook.setOverwritten(true, oldLeafNode.file, oldLeafNode.offset); if (keyParts.length <= 1) { newPrefix = new byte[0]; newKeyParts = new byte[0][]; } else { newPrefix = prefix; newKeyParts = new byte[keyParts.length - 1][]; if (insertPart == keyParts.length) { System.arraycopy(keyParts, 0, newKeyParts, 0, newKeyParts.length); } else { System.arraycopy(keyParts, 0, newKeyParts, 0, insertPart); System.arraycopy(keyParts, insertPart + 1, newKeyParts, insertPart, newKeyParts.length - insertPart); } } if (leafNodes.length > 0) { newLeafNodes = new LeafNode[leafNodes.length - 1]; System.arraycopy(leafNodes, 0, newLeafNodes, 0, insertPart); System.arraycopy(leafNodes, insertPart + 1, newLeafNodes, insertPart, newLeafNodes.length - insertPart); } else { newLeafNodes = leafNodes; } if (hak != null) { segment.getCompactor().free(oldLeafNode.file, hak.getHeader().totalLength()); } } } else { // IndexRequest cannot be MOVED or DROPPED when the key is not in the index assert recordChange == RecordChange.INCREASE; overwriteHook.setOverwritten(false, -1, -1); // We have to insert the record even if this is a delete request and the key was not found // because otherwise we would have incorrect numRecord count. Eventually, Compactor will // drop the tombstone and update index, removing this node if (keyParts.length == 0) { // TODO: we may use unnecessarily long keys here and the key is never shortened newPrefix = keyComp > 0 ? key : hak.getKey(); } else { newPrefix = commonPrefix(prefix, key); } newKeyParts = new byte[keyParts.length + 1][]; newLeafNodes = new LeafNode[leafNodes.length + 1]; copyKeyParts(keyParts, 0, newKeyParts, 0, insertPart, prefix, newPrefix); copyKeyParts(keyParts, insertPart, newKeyParts, insertPart + 1, keyParts.length - insertPart, prefix, newPrefix); if (keyComp > 0) { newKeyParts[insertPart] = substring(key, newPrefix.length, keyComp); System.arraycopy(leafNodes, 0, newLeafNodes, 0, insertPart + 1); System.arraycopy(leafNodes, insertPart + 1, newLeafNodes, insertPart + 2, leafNodes.length - insertPart - 1); newLeafNodes[insertPart + 1] = new LeafNode(file, offset, (short) 1); } else { newKeyParts[insertPart] = substring(hak.getKey(), newPrefix.length, -keyComp); System.arraycopy(leafNodes, 0, newLeafNodes, 0, insertPart); System.arraycopy(leafNodes, insertPart, newLeafNodes, insertPart + 1, leafNodes.length - insertPart); newLeafNodes[insertPart] = new LeafNode(file, offset, (short) 1); } } return new IndexNode(segment, newPrefix, newKeyParts, newLeafNodes); } private int getInsertionPoint(byte[] key) { int comp = compare(prefix, key, prefix.length); int insertionPoint; if (comp < 0) { insertionPoint = 0; } else if (comp > 0) { insertionPoint = keyParts.length; } else { byte[] keyPostfix = substring(key, prefix.length, key.length); insertionPoint = Arrays.binarySearch(keyParts, keyPostfix, new Comparator<byte[]>() { @Override public int compare(byte[] o1, byte[] o2) { return IndexNode.this.compare(o2, o1); } }); if (insertionPoint < 0) { insertionPoint = -insertionPoint - 1; } else { insertionPoint++; // identical elements must go to the right } } return insertionPoint; } private List<IndexNode> split() { int headerLength = headerLength(); int contentLength = contentLength(); int maxLength = segment.getMaxNodeSize(); int targetParts = contentLength / Math.max(maxLength - headerLength, 1) + 1; int targetLength = contentLength / targetParts + headerLength; List<IndexNode> list = new ArrayList<IndexNode>(); int childLength = innerNodes != null ? INNER_NODE_REFERENCE_SIZE : LEAF_NODE_REFERENCE_SIZE; byte[] prefixExtension = keyParts[0]; // the prefix can be only extended int currentLength = INNER_NODE_HEADER_SIZE + prefix.length + prefixExtension.length + 2 * childLength + 2; int nodeFrom = 0; // TODO: under certain circumstances this algorithm can end up by splitting node into very uneven parts // such as having one part with only 1 child, therefore only 15 bytes long for (int i = 1; i < keyParts.length; ++i) { int newLength; byte[] newPrefixExtension = commonPrefix(prefixExtension, keyParts[i]); if (newPrefixExtension.length != prefixExtension.length) { newLength = currentLength + (prefixExtension.length - newPrefixExtension.length) * (i - nodeFrom - 1); } else { newLength = currentLength; } newLength += keyParts[i].length - newPrefixExtension.length + childLength + 2; if (newLength < targetLength) { currentLength = newLength; } else { IndexNode subNode; if (newLength > maxLength) { subNode = subNode(prefixExtension, nodeFrom, i); ++i; } else { subNode = subNode(newPrefixExtension, nodeFrom, i + 1); i += 2; } list.add(subNode); if (i < keyParts.length) { newPrefixExtension = keyParts[i]; } currentLength = INNER_NODE_HEADER_SIZE + prefix.length + newPrefixExtension.length + 2 * childLength + 2; nodeFrom = i; } prefixExtension = newPrefixExtension; } if (nodeFrom <= keyParts.length) { list.add(subNode(prefixExtension, nodeFrom, keyParts.length)); } return list; } private IndexNode subNode(byte[] newPrefixExtension, int childFrom, int childTo) { // first node takes up to child[to + 1], other do not take the child[from] == child[previousTo + 1] // If the new node has > 1 keyParts, it ignores the first keyPart, otherwise it just sets the first child to be // deleted (empty entry) byte[][] newKeyParts = new byte[childTo - childFrom][]; if (newPrefixExtension.length > 0) { for (int i = childFrom; i < childTo; ++i) { newKeyParts[i - childFrom] = substring(keyParts[i], newPrefixExtension.length, keyParts[i].length); } } else { System.arraycopy(keyParts, childFrom, newKeyParts, 0, childTo - childFrom); } byte[] newPrefix = childFrom == childTo ? new byte[0] : concat(prefix, newPrefixExtension); if (innerNodes != null) { InnerNode[] newInnerNodes = new InnerNode[childTo - childFrom + 1]; System.arraycopy(innerNodes, childFrom, newInnerNodes, 0, childTo - childFrom + 1); return new IndexNode(segment, newPrefix, newKeyParts, newInnerNodes); } else if (leafNodes != null) { LeafNode[] newLeafNodes = new LeafNode[childTo - childFrom + 1]; System.arraycopy(leafNodes, childFrom, newLeafNodes, 0, childTo - childFrom + 1); return new IndexNode(segment, newPrefix, newKeyParts, newLeafNodes); } throw new IllegalStateException(); } private static byte[] concat(byte[] first, byte[] second) { if (first == null || first.length == 0) return second; if (second == null || second.length == 0) return first; byte[] result = new byte[first.length + second.length]; System.arraycopy(first, 0, result, 0, first.length); System.arraycopy(second, 0, result, first.length, second.length); return result; } private static void copyKeyParts(byte[][] src, int srcIndex, byte[][] dest, int destIndex, int length, byte[] oldPrefix, byte[] common) { if (oldPrefix.length == common.length) { System.arraycopy(src, srcIndex, dest, destIndex, length); } else { for (int i = 0; i < length; ++i) { dest[destIndex + i] = findNewKeyPart(oldPrefix, src[srcIndex + i], common); } } } private static byte[] findNewKeyPart(byte[] oldPrefix, byte[] oldKeyPart, byte[] common) { byte[] newPart = new byte[oldKeyPart.length + oldPrefix.length - common.length]; System.arraycopy(oldPrefix, common.length, newPart, 0, oldPrefix.length - common.length); System.arraycopy(oldKeyPart, 0, newPart, oldPrefix.length - common.length, oldKeyPart.length); return newPart; } private static byte[] substring(byte[] key, int begin, int end) { if (end <= begin) return new byte[0]; byte[] sub = new byte[end - begin]; System.arraycopy(key, begin, sub, 0, end - begin); return sub; } private static byte[] commonPrefix(byte[] oldPrefix, byte[] newKey) { int i; for (i = 0; i < oldPrefix.length && i < newKey.length; ++i) { if (newKey[i] != oldPrefix[i]) break; } if (i == oldPrefix.length) { return oldPrefix; } if (i == newKey.length) { return newKey; } byte[] prefix = new byte[i]; for (--i; i >= 0; --i) { prefix[i] = oldPrefix[i]; } return prefix; } private static int compare(byte[] first, byte[] second, int length) { for (int i = 0; i < length; ++i) { if (i >= second.length) { return -1; } if (second[i] == first[i]) continue; return second[i] > first[i] ? 1 : -1; } return 0; } private static int compare(byte[] first, byte[] second) { for (int i = 0; i < first.length && i < second.length; ++i) { if (second[i] == first[i]) continue; return second[i] > first[i] ? i + 1 : -i - 1; } return second.length > first.length ? first.length + 1 : (second.length < first.length ? -second.length - 1 : 0); } private int headerLength() { return INNER_NODE_HEADER_SIZE + prefix.length; } private int contentLength() { if (contentLength >= 0) { return contentLength; } int sum = 0; for (byte[] keyPart : keyParts) { sum += 2 + keyPart.length; } if (innerNodes != null) { sum += INNER_NODE_REFERENCE_SIZE * innerNodes.length; } else if (leafNodes != null) { sum += LEAF_NODE_REFERENCE_SIZE * leafNodes.length; } else { throw new IllegalStateException(); } return contentLength = sum; } public int length() { if (totalLength >= 0) return totalLength; return totalLength = headerLength() + contentLength(); } public static IndexNode emptyWithLeaves(Index.Segment segment) { return new IndexNode(segment, new byte[0], new byte[0][], LeafNode.EMPTY_ARRAY); } public static IndexNode emptyWithInnerNodes(Index.Segment segment) { return new IndexNode(segment, new byte[0], new byte[0][], new InnerNode[]{ new InnerNode(-1l, (short) -1) }); } public static class OverwriteHook { public static final OverwriteHook NOOP = new OverwriteHook(); public boolean check(int oldFile, int oldOffset) { return true; } public void setOverwritten(boolean overwritten, int prevFile, int prevOffset) { } } static class InnerNode extends Index.IndexSpace { private volatile SoftReference<IndexNode> reference; public InnerNode(long offset, short length) { super(offset, length); } public InnerNode(IndexNode node) { super(node.offset, node.occupiedSpace); reference = new SoftReference<IndexNode>(node); } public IndexNode getIndexNode(Index.Segment segment) throws IOException { IndexNode node; if (reference == null || (node = reference.get()) == null) { synchronized (this) { if (reference == null || (node = reference.get()) == null) { if (offset < 0) return null; node = new IndexNode(segment, offset, length); reference = new SoftReference<IndexNode>(node); if (trace) { log.trace("Loaded inner node from " + offset + " - " + length); } } } } return node; } } private static class LeafNode extends EntryInfo { private static LeafNode[] EMPTY_ARRAY = new LeafNode[0]; private volatile SoftReference<EntryRecord> keyReference; public LeafNode(int file, int offset, short numRecords) { super(file, offset, numRecords); } public EntryRecord loadHeaderAndKey(FileProvider fileProvider) throws IOException, IndexNodeOutdatedException { return getHeaderAndKey(fileProvider, null); } private EntryRecord getHeaderAndKey(FileProvider fileProvider, FileProvider.Handle handle) throws IOException, IndexNodeOutdatedException { EntryRecord headerAndKey; if (keyReference == null || (headerAndKey = keyReference.get()) == null) { synchronized (this) { if (keyReference == null || (headerAndKey = keyReference.get()) == null) { boolean ownHandle = false; if (handle == null) { ownHandle = true; handle = fileProvider.getFile(file); if (handle == null) { throw new IndexNodeOutdatedException(file + ":" + offset + " (" + numRecords + ")"); } } try { int readOffset = offset < 0 ? ~offset : offset; EntryHeader header = EntryRecord.readEntryHeader(handle, readOffset); if (header == null) { throw new IllegalStateException("Error reading header from " + file + ":" + readOffset + " | " + handle.getFileSize()); } headerAndKey = new EntryRecord(header, EntryRecord.readKey(handle, header, readOffset), null, null); keyReference = new SoftReference<>(headerAndKey); } finally { if (ownHandle) { handle.close(); } } } } } assert headerAndKey != null; assert headerAndKey.getKey() != null; return headerAndKey; } public EntryRecord loadRecord(FileProvider fileProvider, byte[] key, TimeService timeService) throws IOException, IndexNodeOutdatedException { FileProvider.Handle handle = fileProvider.getFile(file); int readOffset = offset < 0 ? ~offset : offset; if (handle == null) { throw new IndexNodeOutdatedException(file + ":" + readOffset); } try { EntryRecord headerAndKey = getHeaderAndKey(fileProvider, handle); if (!Arrays.equals(key, headerAndKey.getKey())) { if (trace) { log.trace("Key on " + file + ":" + readOffset + " not matched."); } return null; } if (headerAndKey.getHeader().valueLength() <= 0) { if (trace) { log.trace("Entry " + file + ":" + readOffset + " matched, it is a tombstone."); } return null; } if (headerAndKey.getHeader().expiryTime() > 0 && headerAndKey.getHeader().expiryTime() <= timeService.wallClockTime()) { if (trace) { log.trace("Key on " + file + ":" + readOffset + " matched but expired."); } return null; } if (trace) { log.trace("Loaded from " + file + ":" + readOffset); } return headerAndKey.loadMetadataAndValue(handle, readOffset); } finally { handle.close(); } } } private static class IndexNodeOutdatedException extends Exception { IndexNodeOutdatedException(String message) { super(message); } } @Override public String toString() { StringBuilder sb = new StringBuilder(); for (int i = 0; i <= keyParts.length; ++i) { sb.append('\n'); if (leafNodes != null) { sb.append(" [").append(leafNodes[i].file).append(':').append(leafNodes[i].offset).append("] "); } else { sb.append(" [").append(innerNodes[i].offset).append(':').append(innerNodes[i].length).append("] "); } if (i < keyParts.length) { sb.append(new String(concat(prefix, keyParts[i]))); } } sb.append('\n'); return sb.toString(); } }