/* * Copyright (c) 1998-2011 Caucho Technology -- all rights reserved * * This file is part of Resin(R) Open Source * * Each copy or derived work must preserve the copyright notice and this * notice unmodified. * * Resin Open Source is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * Resin Open Source is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty * of NON-INFRINGEMENT. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with Resin Open Source; if not, write to the * * Free Software Foundation, Inc. * 59 Temple Place, Suite 330 * Boston, MA 02111-1307 USA * * @author Scott Ferguson */ package com.caucho.db.index; import com.caucho.db.Database; import com.caucho.db.block.Block; import com.caucho.db.block.BlockManager; import com.caucho.db.block.BlockStore; // import com.caucho.db.lock.Lock; import com.caucho.db.xa.DbTransaction; import com.caucho.sql.SQLExceptionWrapper; import com.caucho.util.Hex; import com.caucho.util.L10N; import com.caucho.vfs.Path; import java.io.IOException; import java.sql.SQLException; import java.util.ArrayList; import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.Lock; import java.util.logging.Level; import java.util.logging.Logger; /** * Structure of the table: * * <pre> * b4 - flags * b4 - length * b8 - parent * b8 - next * tuples* * </pre> * * Structure of a tuple: * * <pre> * b8 - ptr to the actual data * key - the tuple's key * </pre> * * For a non-leaf node, the key is the last matching entry in the subtree. 
*/ public final class BTree { private final static L10N L = new L10N(BTree.class); private final static Logger log = Logger.getLogger(BTree.class.getName()); public final static long FAIL = 0; private final static int BLOCK_SIZE = BlockStore.BLOCK_SIZE; private final static int PTR_SIZE = 8; private final static int FLAGS_OFFSET = 0; private final static int LENGTH_OFFSET = FLAGS_OFFSET + 4; private final static int PARENT_OFFSET = LENGTH_OFFSET + 4; private final static int NEXT_OFFSET = PARENT_OFFSET + PTR_SIZE; private final static int HEADER_SIZE = NEXT_OFFSET + PTR_SIZE; private final static int LEAF_MASK = 0x03; private final static int IS_LEAF = 0x01; private final static int IS_NODE = 0x02; private BlockStore _store; private long _rootBlockId; private Block _rootBlock; private int _keySize; private int _tupleSize; private int _n; private int _minN; private KeyCompare _keyCompare; private long _timeout = 120000L; private volatile boolean _isStarted; /** * Creates a new BTree with the given backing. * * @param store the underlying store containing the btree. */ public BTree(BlockStore store, long rootBlockId, int keySize, KeyCompare keyCompare) throws IOException { if (keyCompare == null) throw new NullPointerException(); _store = store; _store.getBlockManager(); _rootBlockId = rootBlockId; _rootBlock = store.readBlock(rootBlockId); // new Lock("index:" + store.getName()); if (BLOCK_SIZE < keySize + HEADER_SIZE) throw new IOException(L.l("BTree key size '{0}' is too large.", keySize)); _keySize = keySize; _tupleSize = keySize + PTR_SIZE; _n = (BLOCK_SIZE - HEADER_SIZE) / _tupleSize; _minN = (_n + 1) / 2; if (_minN < 0) _minN = 1; _keyCompare = keyCompare; byte []rootBuffer = _rootBlock.getBuffer(); if (getInt(rootBuffer, FLAGS_OFFSET) == 0) setLeaf(rootBuffer, true); } /** * Returns the index root. */ public long getIndexRoot() { return _rootBlockId; } /** * Creates and initializes the btree. 
*/
  public void create()
    throws IOException
  {
  }

  /**
   * Looks up the value stored for a key, starting at the root block.
   *
   * @param keyBuffer buffer containing the key
   * @param keyOffset offset of the key in keyBuffer
   * @param keyLength length of the key
   *
   * @return the stored value, or FAIL (0) when a leaf has no matching key
   *
   * @throws SQLException if a block lock cannot be acquired in time
   * @throws IllegalStateException if the thread is interrupted while
   * waiting for a lock; the interrupt flag is restored first
   */
  public long lookup(byte []keyBuffer,
                     int keyOffset,
                     int keyLength)
    throws IOException, SQLException
  {
    try {
      return lookup(keyBuffer, keyOffset, keyLength, _rootBlockId);
    } catch (InterruptedException e) {
      // keep the interrupt visible to callers further up the stack
      Thread.currentThread().interrupt();

      throw new IllegalStateException(e);
    }
  }

  /**
   * Looks up the key in the given block, recursing into the matching
   * child while the current block's read lock is still held.
   */
  private long lookup(byte []keyBuffer,
                      int keyOffset,
                      int keyLength,
                      long blockId)
    throws IOException, SQLException, InterruptedException
  {
    Block block;

    // the root block is cached; allocate() is paired with free() below
    if (blockId == _rootBlockId) {
      block = _rootBlock;
      block.allocate();
    }
    else
      block = _store.loadBlock(blockId);

    try {
      Lock blockLock = block.getReadLock();

      // tryLock returns false on timeout; continuing would read the block
      // unlocked and then unlock a lock that was never acquired.
      if (! blockLock.tryLock(_timeout, TimeUnit.MILLISECONDS))
        throw new SQLException(L.l("'{0}' btree lookup timed out after {1}ms waiting for the read lock on block {2}",
                                   _store, _timeout, debugId(blockId)));

      try {
        validateIndex(block);

        block.read();

        byte []buffer = block.getBuffer();

        boolean isLeaf = isLeaf(buffer, block);

        long value = lookupTuple(blockId, buffer,
                                 keyBuffer, keyOffset, keyLength,
                                 isLeaf);

        // in a leaf the tuple value is the answer; in a node it is the
        // child block to descend into
        if (isLeaf || value == FAIL)
          return value;
        else
          return lookup(keyBuffer, keyOffset, keyLength, value);
      } finally {
        blockLock.unlock();
      }
    } finally {
      block.free();
    }
  }

  /**
   * Inserts the new value for the given key, splitting the root and
   * retrying for as long as the insert reports that a split is needed.
   *
   * @param keyBuffer buffer containing the key
   * @param keyOffset offset of the key in keyBuffer
   * @param keyLength length of the key
   * @param value the value to store for the key
   * @param isOverride if true an existing entry for the key is replaced
   *
   * @throws SQLException on failure; other checked exceptions are wrapped
   */
  public void insert(byte []keyBuffer,
                     int keyOffset,
                     int keyLength,
                     long value,
                     boolean isOverride)
    throws SQLException
  {
    try {
      while (! insert(keyBuffer, keyOffset, keyLength,
                      value, isOverride, true,
                      _rootBlockId)) {
        splitRoot(_rootBlockId);
      }
    } catch (RuntimeException e) {
      throw e;
    } catch (SQLException e) {
      throw e;
    } catch (InterruptedException e) {
      // restore the flag before wrapping so the interrupt is not lost
      Thread.currentThread().interrupt();

      log.log(Level.FINE, e.toString(), e);

      throw new SQLExceptionWrapper(e.toString(), e);
    } catch (Exception e) {
      log.log(Level.FINE, e.toString(), e);

      throw new SQLExceptionWrapper(e.toString(), e);
    }
  }

  /**
   * Inserts the new value for the given key.
* * @return false if the block needs to be split * @throws InterruptedException */ private boolean insert(byte []keyBuffer, int keyOffset, int keyLength, long value, boolean isOverride, boolean isRead, long blockId) throws IOException, SQLException, InterruptedException { Block block; if (blockId == _rootBlockId) { block = _rootBlock; block.allocate(); } else block = _store.loadBlock(blockId); try { validateIndex(block); if (isRead && insertReadChild(keyBuffer, keyOffset, keyLength, value, isOverride, block)) return true; else return insertWriteChild(keyBuffer, keyOffset, keyLength, value, isOverride, block); } finally { block.free(); } } private boolean insertReadChild(byte []keyBuffer, int keyOffset, int keyLength, long value, boolean isOverride, Block block) throws IOException, SQLException, InterruptedException { Lock blockLock = block.getReadLock(); blockLock.tryLock(_timeout, TimeUnit.MILLISECONDS); try { validateIndex(block); block.read(); long blockId = block.getBlockId(); byte []buffer = block.getBuffer(); int length = getLength(buffer); if (length == _n) { // return false if the block needs to be split return false; } if (isLeaf(buffer, block)) { return false; } long childBlockId = lookupTuple(blockId, buffer, keyBuffer, keyOffset, keyLength, false); return insert(keyBuffer, keyOffset, keyLength, value, isOverride, true, childBlockId); } finally { blockLock.unlock(); } } private boolean insertWriteChild(byte []keyBuffer, int keyOffset, int keyLength, long value, boolean isOverride, Block block) throws IOException, SQLException, InterruptedException { Lock blockLock = block.getWriteLock(); blockLock.tryLock(_timeout, TimeUnit.MILLISECONDS); try { block.read(); validate(block); long blockId = block.getBlockId(); byte []buffer = block.getBuffer(); int length = getLength(buffer); if (length == _n) { // return false if the block needs to be split return false; } if (isLeaf(buffer, block)) { insertValue(keyBuffer, keyOffset, keyLength, value, isOverride, 
block); validate(block); return true; } long childBlockId = lookupTuple(blockId, buffer, keyBuffer, keyOffset, keyLength, false); while (! insert(keyBuffer, keyOffset, keyLength, value, isOverride, true, childBlockId)) { split(block, childBlockId); childBlockId = lookupTuple(blockId, buffer, keyBuffer, keyOffset, keyLength, false); } validate(block); return true; } finally { blockLock.unlock(); } } /** * Inserts into the next block given the current block and the given key. */ private void insertValue(byte []keyBuffer, int keyOffset, int keyLength, long value, boolean isOverride, Block block) throws IOException, SQLException { byte []buffer = block.getBuffer(); insertLeafBlock(block.getBlockId(), buffer, keyBuffer, keyOffset, keyLength, value, isOverride); block.setFlushDirtyOnCommit(false); block.setDirty(0, BlockStore.BLOCK_SIZE); } /** * Inserts into the next block given the current block and the given key. */ private long insertLeafBlock(long blockId, byte []buffer, byte []keyBuffer, int keyOffset, int keyLength, long value, boolean isOverride) throws IOException, SQLException { int tupleSize = _tupleSize; int length = getLength(buffer); int sublen = length; int min = 0; int max = length; int offset = HEADER_SIZE; while (min < max) { int i = (min + max) / 2; offset = HEADER_SIZE + i * tupleSize; int cmp = _keyCompare.compare(keyBuffer, keyOffset, buffer, offset + PTR_SIZE, keyLength); if (cmp == 0) { if (! 
isOverride) { long oldValue = getPointer(buffer, offset); if (value != oldValue) throw new SqlIndexAlreadyExistsException(L.l("'{0}' insert of key '{1}' fails index uniqueness.", _store, _keyCompare.toString(keyBuffer, keyOffset, keyLength))); } setPointer(buffer, offset, value); //writeBlock(blockIndex, block); return 0; } else if (0 < cmp) { min = i + 1; } else if (cmp < 0) { max = i; } } if (length < _n) { offset = HEADER_SIZE + min * tupleSize; return addKey(blockId, buffer, offset, min, length, keyBuffer, keyOffset, keyLength, value); } else { throw new IllegalStateException("ran out of key space"); } // return split(blockIndex, block); } private long addKey(long blockId, byte []buffer, int offset, int index, int length, byte []keyBuffer, int keyOffset, int keyLength, long value) throws IOException { int tupleSize = _tupleSize; if (index < length) { if (offset + tupleSize < HEADER_SIZE) throw new IllegalStateException(); System.arraycopy(buffer, offset, buffer, offset + tupleSize, (length - index) * tupleSize); } setPointer(buffer, offset, value); setLength(buffer, length + 1); if (log.isLoggable(Level.ALL)) log.log(Level.ALL, "btree insert at " + debugId(blockId) + ":" + offset + " value:" + debugId(value)); if (offset + PTR_SIZE < HEADER_SIZE) throw new IllegalStateException(); System.arraycopy(keyBuffer, keyOffset, buffer, offset + PTR_SIZE, keyLength); for (int j = PTR_SIZE + keyLength; j < tupleSize; j++) buffer[offset + j] = 0; return -value; } /** * The length in lBuf is assumed to be the length of the buffer. 
* * parent must already be locked * @throws InterruptedException */ private void split(Block parent, long blockId) throws IOException, SQLException, InterruptedException { Block block = _store.readBlock(blockId); try { validate(block); Lock blockLock = block.getWriteLock(); blockLock.tryLock(_timeout, TimeUnit.MILLISECONDS); try { split(parent, block); validate(block); } finally { blockLock.unlock(); } } finally { block.free(); } } /** * The length in lBuf is assumed to be the length of the buffer. */ private void split(Block parentBlock, Block block) throws IOException, SQLException { long parentId = parentBlock.getBlockId(); long blockId = block.getBlockId(); log.finest("btree splitting " + debugId(blockId)); block.setFlushDirtyOnCommit(false); byte []buffer = block.getBuffer(); int length = getLength(buffer); // Check length to avoid possible timing issue, since we release the // read lock for the block between the initial check in insert() and // getting it back in split() if (length < _n / 2) return; if (length < 2) throw new IllegalStateException(L.l("illegal length '{0}' for block {1}", length, debugId(blockId))); Block leftBlock = null; try { parentBlock.setFlushDirtyOnCommit(false); byte []parentBuffer = parentBlock.getBuffer(); int parentLength = getLength(parentBuffer); validate(parentId, parentBuffer); validate(blockId, buffer); leftBlock = _store.allocateIndexBlock(); // System.out.println("TREE-alloc1:" + Long.toHexString(leftBlock.getBlockId())); leftBlock.setFlushDirtyOnCommit(false); // System.out.println("ALLOC: " + leftBlock); byte []leftBuffer = leftBlock.getBuffer(); long leftBlockId = leftBlock.getBlockId(); int pivot = length / 2; int pivotSize = pivot * _tupleSize; int pivotEnd = HEADER_SIZE + pivotSize; int blockEnd = HEADER_SIZE + length * _tupleSize; System.arraycopy(buffer, HEADER_SIZE, leftBuffer, HEADER_SIZE, pivotSize); setInt(leftBuffer, FLAGS_OFFSET, getInt(buffer, FLAGS_OFFSET)); setLength(leftBuffer, pivot); // XXX: NEXT_OFFSET 
needs to work with getRightIndex setPointer(leftBuffer, NEXT_OFFSET, 0); setPointer(leftBuffer, PARENT_OFFSET, parentId); System.arraycopy(buffer, pivotEnd, buffer, HEADER_SIZE, blockEnd - pivotEnd); setLength(buffer, length - pivot); insertLeafBlock(parentId, parentBuffer, leftBuffer, pivotEnd - _tupleSize + PTR_SIZE, _keySize, leftBlockId, true); validate(parentId, parentBuffer); validate(leftBlockId, leftBuffer); validate(blockId, buffer); validate(block); validate(parentBlock); validate(leftBlock); leftBlock.setDirty(0, BlockStore.BLOCK_SIZE); parentBlock.setDirty(0, BlockStore.BLOCK_SIZE); } finally { if (leftBlock != null) leftBlock.free(); block.setDirty(0, BlockStore.BLOCK_SIZE); } } /** * The length in lBuf is assumed to be the length of the buffer. * @throws InterruptedException */ private void splitRoot(long rootBlockId) throws IOException, SQLException, InterruptedException { Block rootBlock = _rootBlock; // store.readBlock(rootBlockId); rootBlock.allocate(); try { Lock rootLock = rootBlock.getWriteLock(); rootLock.tryLock(_timeout, TimeUnit.MILLISECONDS); try { splitRoot(rootBlock); validate(rootBlock); } finally { rootLock.unlock(); } } finally { rootBlock.free(); } } /** * Splits the current leaf into two. Half of the entries go to the * left leaf and half go to the right leaf. 
*/ private void splitRoot(Block parentBlock) throws IOException { long parentId = parentBlock.getBlockId(); log.finest("btree splitting root " + (parentId / BLOCK_SIZE)); Block leftBlock = null; Block rightBlock = null; try { byte []parentBuffer = parentBlock.getBuffer(); int length = getLength(parentBuffer); if (length == 1) return; parentBlock.setFlushDirtyOnCommit(false); int parentFlags = getInt(parentBuffer, FLAGS_OFFSET); leftBlock = _store.allocateIndexBlock(); // System.out.println("TREE-alloc2:" + Long.toHexString(leftBlock.getBlockId())); leftBlock.setFlushDirtyOnCommit(false); long leftBlockId = leftBlock.getBlockId(); rightBlock = _store.allocateIndexBlock(); // System.out.println("TREE-alloc3:" + Long.toHexString(rightBlock.getBlockId())); rightBlock.setFlushDirtyOnCommit(false); long rightBlockId = rightBlock.getBlockId(); int pivot = (length - 1) / 2; //System.out.println("INDEX SPLIT ROOT: " + (parentId / BLOCK_SIZE) // + " PIVOT=" + pivot); if (length <= 2 || _n < length || pivot < 1 || length <= pivot) throw new IllegalStateException(Long.toHexString(parentBlock.getBlockId()) + ": " + length + " is an illegal length, or pivot " + pivot + " is bad, with n=" + _n); int pivotOffset = HEADER_SIZE + pivot * _tupleSize; long pivotValue = getPointer(parentBuffer, pivotOffset); byte []leftBuffer = leftBlock.getBuffer(); System.arraycopy(parentBuffer, HEADER_SIZE, leftBuffer, HEADER_SIZE, pivotOffset + _tupleSize - HEADER_SIZE); setInt(leftBuffer, FLAGS_OFFSET, parentFlags); setLength(leftBuffer, pivot + 1); setPointer(leftBuffer, PARENT_OFFSET, parentId); setPointer(leftBuffer, NEXT_OFFSET, 0); // rightBlockId); byte []rightBuffer = rightBlock.getBuffer(); if (length - pivot - 1 < 0) throw new IllegalStateException("illegal length " + pivot + " " + length); System.arraycopy(parentBuffer, pivotOffset + _tupleSize, rightBuffer, HEADER_SIZE, (length - pivot - 1) * _tupleSize); setInt(rightBuffer, FLAGS_OFFSET, parentFlags); setLength(rightBuffer, length - 
pivot - 1); setPointer(rightBuffer, PARENT_OFFSET, parentId); setPointer(rightBuffer, NEXT_OFFSET, getPointer(parentBuffer, NEXT_OFFSET)); System.arraycopy(parentBuffer, pivotOffset, parentBuffer, HEADER_SIZE, _tupleSize); setPointer(parentBuffer, HEADER_SIZE, leftBlockId); setLeaf(parentBuffer, false); setLength(parentBuffer, 1); setPointer(parentBuffer, NEXT_OFFSET, rightBlockId); parentBlock.setDirty(0, BlockStore.BLOCK_SIZE); leftBlock.setDirty(0, BlockStore.BLOCK_SIZE); rightBlock.setDirty(0, BlockStore.BLOCK_SIZE); validate(parentBlock); validate(leftBlock); validate(rightBlock); } finally { if (leftBlock != null) leftBlock.free(); if (rightBlock != null) rightBlock.free(); } } public void remove(byte []keyBuffer, int keyOffset, int keyLength) throws SQLException { try { Block rootBlock = _rootBlock; // _store.readBlock(_rootBlockId); rootBlock.allocate(); try { if (! removeRead(rootBlock, keyBuffer, keyOffset, keyLength)) { removeWrite(rootBlock, keyBuffer, keyOffset, keyLength); } } finally { rootBlock.free(); } } catch (RuntimeException e) { throw e; } catch (SQLException e) { throw e; } catch (Exception e) { throw new SQLExceptionWrapper(e.toString(), e); } } /** * Recursively remove a key from the index. * * block is read-locked by the parent. 
* @throws InterruptedException if interrupted while waiting for a block lock
   */
  private boolean removeRead(Block block,
                             byte []keyBuffer,
                             int keyOffset,
                             int keyLength)
    throws IOException, SQLException, InterruptedException
  {
    Lock blockLock = block.getReadLock();
    acquireForRemove(blockLock, block);

    try {
      validateIndex(block);

      byte []buffer = block.getBuffer();

      long blockId = block.getBlockId();

      // leaves are only modified under the write lock
      if (isLeaf(buffer, block))
        return false;

      long childId;

      childId = lookupTuple(blockId, buffer,
                            keyBuffer, keyOffset, keyLength,
                            false);

      if (childId == FAIL)
        return true;

      Block childBlock = _store.readBlock(childId);

      try {
        validateIndex(childBlock);

        if (removeRead(childBlock, keyBuffer, keyOffset, keyLength))
          return true;
        else
          return removeWrite(childBlock, keyBuffer, keyOffset, keyLength);
      } finally {
        childBlock.free();
      }
    } finally {
      blockLock.unlock();
    }
  }

  /**
   * Recursively remove a key from the index under the block's write lock.
   *
   * @return true if the block still holds at least the minimum number of
   * tuples (or no child matched); false if the caller should rebalance it
   * @throws InterruptedException if interrupted while waiting for a block lock
   */
  private boolean removeWrite(Block block,
                              byte []keyBuffer,
                              int keyOffset,
                              int keyLength)
    throws IOException, SQLException, InterruptedException
  {
    byte []buffer = block.getBuffer();
    long blockId = block.getBlockId();

    Lock blockLock = block.getWriteLock();
    acquireForRemove(blockLock, block);

    try {
      boolean isLeaf = isLeaf(buffer, block);

      if (isLeaf) {
        block.setFlushDirtyOnCommit(false);

        removeLeafEntry(blockId, buffer,
                        keyBuffer, keyOffset, keyLength);

        block.setDirty(0, BlockStore.BLOCK_SIZE);
      }
      else {
        long childId;

        childId = lookupTuple(blockId, buffer,
                              keyBuffer, keyOffset, keyLength,
                              isLeaf);

        if (childId == FAIL)
          return true;

        Block childBlock = _store.readBlock(childId);

        try {
          validateIndex(childBlock);

          boolean isJoin;

          isJoin = ! removeWrite(childBlock, keyBuffer, keyOffset, keyLength);

          // joinBlocks returns true when the child was merged away
          if (isJoin && joinBlocks(block, childBlock)) {
            if (childBlock.getUseCount() > 2) {
              log.warning("USE: " + childBlock.getUseCount() + " " + block);
            }
            childBlock.deallocate();
          }

          validate(block);
        } finally {
          childBlock.free();
        }
      }

      return _minN <= getLength(buffer);
    } finally {
      blockLock.unlock();
    }
  }

  /**
   * Acquires a block lock for the removal path, failing if the timeout
   * expires.  A timed-out tryLock returns false; continuing would touch
   * the block unlocked and later unlock a lock that was never acquired.
   */
  private void acquireForRemove(Lock lock, Block block)
    throws SQLException, InterruptedException
  {
    if (! lock.tryLock(_timeout, TimeUnit.MILLISECONDS))
      throw new SQLException(L.l("'{0}' btree remove timed out after {1}ms waiting for a lock on block {2}",
                                 _store, _timeout,
                                 debugId(block.getBlockId())));
  }

  /**
   * Balances the block size so it's always 1/2 full.  joinBlocks is called
   * when the block has one too few items, i.e. less than half full.
   *
   * If the left block has enough items, copy one from the left.
   * If the right block has enough items, copy one from the right.
   *
   * Otherwise, merge the block with either the left or the right block.
   *
   * parent is write-locked by the caller.
   * block is not locked.
   *
   * <pre>
   * ... | leftBlock | block | rightBlock | ...
   * </pre>
   *
   * @return true if the block should be deleted/freed
   * @throws InterruptedException if interrupted while waiting for a block lock
   */
  private boolean joinBlocks(Block parent,
                             Block block)
    throws IOException, SQLException, InterruptedException
  {
    long parentBlockId = parent.getBlockId();
    byte []parentBuffer = parent.getBuffer();
    int parentLength = getLength(parentBuffer);

    long blockId = block.getBlockId();
    byte []buffer = block.getBuffer();

    long leftBlockId = getLeftBlockId(parent, blockId);
    long rightBlockId = getRightBlockId(parent, blockId);

    // Sibling locks are always taken left-to-right: left before block,
    // block before right.

    // If the left block has extra data, shift the last left item
    // to the block
    if (leftBlockId > 0) {
      Block leftBlock = _store.readBlock(leftBlockId);

      try {
        byte []leftBuffer = leftBlock.getBuffer();

        Lock leftLock = leftBlock.getWriteLock();
        acquireForRemove(leftLock, leftBlock);

        try {
          int leftLength = getLength(leftBuffer);

          Lock blockLock = block.getWriteLock();
          acquireForRemove(blockLock, block);

          try {
            if (_minN < leftLength) {
              validateEqualLeaf(buffer, leftBuffer, block, leftBlock);

              parent.setFlushDirtyOnCommit(false);
              leftBlock.setFlushDirtyOnCommit(false);

              validate(parentBlockId, parentBuffer);
              validate(leftBlockId, leftBuffer);
              validate(blockId, buffer);

              moveFromLeft(parentBuffer, leftBuffer, buffer, blockId);

              validate(parentBlockId, parentBuffer);
              validate(leftBlockId, leftBuffer);
              validate(blockId, buffer);

              parent.setDirty(0, BlockStore.BLOCK_SIZE);
              leftBlock.setDirty(0, BlockStore.BLOCK_SIZE);
              // the borrow also rewrites the block's own buffer
              block.setDirty(0, BlockStore.BLOCK_SIZE);

              return false;
            }
          } finally {
            blockLock.unlock();
          }
        } finally {
          leftLock.unlock();
        }
      } finally {
        leftBlock.free();
      }
    }

    // If the right block has extra data, shift the first right item
    // to the block
    if (rightBlockId > 0) {
      Block rightBlock = _store.readBlock(rightBlockId);

      try {
        byte []rightBuffer = rightBlock.getBuffer();

        Lock blockLock = block.getWriteLock();
        acquireForRemove(blockLock, block);

        try {
          Lock rightLock = rightBlock.getWriteLock();
          acquireForRemove(rightLock, rightBlock);

          try {
            int rightLength = getLength(rightBuffer);

            if (_minN < rightLength) {
              validateEqualLeaf(buffer, rightBuffer, block, rightBlock);

              parent.setFlushDirtyOnCommit(false);
              rightBlock.setFlushDirtyOnCommit(false);

              moveFromRight(parentBuffer, buffer, rightBuffer, blockId);

              validate(parentBlockId, parentBuffer);
              validate(blockId, buffer);
              validate(rightBlockId, rightBuffer);

              parent.setDirty(0, BlockStore.BLOCK_SIZE);
              rightBlock.setDirty(0, BlockStore.BLOCK_SIZE);
              // the borrow also rewrites the block's own buffer
              block.setDirty(0, BlockStore.BLOCK_SIZE);

              return false;
            }
          } finally {
            rightLock.unlock();
          }
        } finally {
          blockLock.unlock();
        }
      } finally {
        rightBlock.free();
      }
    }

    // a merge removes a tuple from the parent, so it needs at least two
    if (parentLength < 2)
      return false;

    // If the left block has space, merge with it
    if (leftBlockId > 0) {
      Block leftBlock = _store.readBlock(leftBlockId);

      try {
        byte []leftBuffer = leftBlock.getBuffer();

        Lock leftLock = leftBlock.getWriteLock();
        acquireForRemove(leftLock, leftBlock);

        try {
          int leftLength = getLength(leftBuffer);

          Lock blockLock = block.getWriteLock();
          acquireForRemove(blockLock, block);

          try {
            int length = getLength(buffer);

            if (length + leftLength <= _n) {
              validateEqualLeaf(leftBuffer, buffer, leftBlock, block);

              parent.setFlushDirtyOnCommit(false);
              leftBlock.setFlushDirtyOnCommit(false);

              mergeLeft(parentBuffer, leftBuffer, leftBlockId, buffer, blockId);

              validate(parentBlockId, parentBuffer);
              validate(leftBlockId, leftBuffer);

              parent.setDirty(0, BlockStore.BLOCK_SIZE);
              leftBlock.setDirty(0, BlockStore.BLOCK_SIZE);

              return true;
            }
          } finally {
            blockLock.unlock();
          }
        } finally {
          leftLock.unlock();
        }
      } finally {
        leftBlock.free();
      }
    }

    // If the right block has space, merge with it
    if (rightBlockId > 0) {
      Block rightBlock = _store.readBlock(rightBlockId);

      try {
        byte []rightBuffer = rightBlock.getBuffer();

        Lock blockLock = block.getWriteLock();
        acquireForRemove(blockLock, block);

        try {
          Lock rightLock = rightBlock.getWriteLock();
          acquireForRemove(rightLock, rightBlock);

          try {
            int length = getLength(buffer);
            int rightLength = getLength(rightBuffer);

            if (length + rightLength <= _n) {
              validateEqualLeaf(rightBuffer, buffer, rightBlock, block);

              rightBlock.setFlushDirtyOnCommit(false);
              parent.setFlushDirtyOnCommit(false);

              validate(blockId, buffer);
              validate(parentBlockId, parentBuffer);
              validate(rightBlockId, rightBuffer);

              mergeRight(parentBuffer, buffer, rightBuffer, blockId);

              validate(parentBlockId, parentBuffer);
              validate(rightBlockId, rightBuffer);

              rightBlock.setDirty(0, BlockStore.BLOCK_SIZE);
              parent.setDirty(0, BlockStore.BLOCK_SIZE);

              return true;
            }
          } finally {
            rightLock.unlock();
          }
        } finally {
          blockLock.unlock();
        }
      } finally {
        rightBlock.free();
      }
    }

    // XXX: error

    return false;
  }

  /**
   * Checks that two sibling blocks are both leaves or both nodes.
   *
   * @throws IllegalStateException if exactly one of them is a leaf
   */
  private void validateEqualLeaf(byte []leftBuffer,
                                 byte []rightBuffer,
                                 Block left,
                                 Block right)
  {
    if (isLeaf(leftBuffer, left) != isLeaf(rightBuffer, right)) {
      throw new IllegalStateException(L.l("leaf mismatch {0} {1} and {2} {3}",
                                          isLeaf(leftBuffer, left),
                                          isLeaf(rightBuffer, right),
                                          left,
                                          right));
    }
  }

  /**
   * Returns the block index to the left of blockId
   *
   * <pre>
   * ... | leftBlockId | blockId | ...
   * </pre>
   *
   * @return the left sibling's block id, or -1 if blockId is the first
   * tuple in the parent
   * @throws IllegalStateException if the parent is empty or does not
   * reference blockId
   */
  private long getLeftBlockId(Block parent, long blockId)
  {
    byte []buffer = parent.getBuffer();

    int length = getLength(buffer);

    if (length < 1)
      throw new IllegalStateException("zero length for " + debugId(parent.getBlockId()));

    int offset = HEADER_SIZE;
    int tupleSize = _tupleSize;
    int end = offset + length * tupleSize;

    for (; offset < end; offset += tupleSize) {
      long pointer = getPointer(buffer, offset);

      if (pointer == blockId) {
        if (HEADER_SIZE < offset) {
          return getPointer(buffer, offset - tupleSize);
        }
        else
          return -1;
      }
    }

    // a block referenced by the next pointer has the last tuple as its
    // left sibling
    long pointer = getPointer(buffer, NEXT_OFFSET);

    if (pointer == blockId)
      return getPointer(buffer, HEADER_SIZE + (length - 1) * tupleSize);
    else
      throw new IllegalStateException("Can't find " + debugId(blockId) + " in parent " + debugId(parent.getBlockId()));
  }

  /**
   * Takes the last entry from the left block and moves it to the
   * first entry in the current block.
* * @param parentBuffer the parent block buffer * @param leftBuffer the left block buffer * @param buffer the block's buffer * @param index the index of the block */ private void moveFromLeft(byte []parentBuffer, byte []leftBuffer, byte []buffer, long blockId) { int parentLength = getLength(parentBuffer); int tupleSize = _tupleSize; int parentEnd = HEADER_SIZE + parentLength * tupleSize; int parentOffset = HEADER_SIZE; int leftLength = getLength(leftBuffer); int length = getLength(buffer); // pointer in the parent to the left defaults to the tail - 1 int parentLeftOffset = -1; if (blockId == getPointer(parentBuffer, NEXT_OFFSET)) { // db/0040 // parentLeftOffset = parentOffset - tupleSize; parentLeftOffset = parentEnd - tupleSize; } else { for (parentOffset = HEADER_SIZE + tupleSize; parentOffset < parentEnd; parentOffset += tupleSize) { long pointer = getPointer(parentBuffer, parentOffset); if (pointer == blockId) { parentLeftOffset = parentOffset - tupleSize; break; } } } if (parentLeftOffset < 0) { log.warning("Can't find parent left in deletion borrow left "); return; } // shift the data in the buffer System.arraycopy(buffer, HEADER_SIZE, buffer, HEADER_SIZE + tupleSize, length * tupleSize); int leftEnd = HEADER_SIZE + leftLength * tupleSize; // copy the last item in the left to the buffer System.arraycopy(leftBuffer, leftEnd - tupleSize, buffer, HEADER_SIZE, tupleSize); // add the buffer length setLength(buffer, length + 1); // subtract from the left length leftLength -= 1; setLength(leftBuffer, leftLength); leftEnd = HEADER_SIZE + leftLength * tupleSize; // copy the key from the new left tail to the left item System.arraycopy(leftBuffer, leftEnd - tupleSize + PTR_SIZE, parentBuffer, parentLeftOffset + PTR_SIZE, tupleSize - PTR_SIZE); } /** * Merge the buffer together with the leftBuffer * * <pre> * ... | leftBlock | block | rightBlock | ... * </pre> * * <pre> * ... | leftBlock + block | rightBlock | ... 
* </pre> */ private void mergeLeft(byte []parentBuffer, byte []leftBuffer, long leftBlockId, byte []buffer, long blockId) { if (isLeaf(leftBuffer) != isLeaf(buffer)) { throw new IllegalStateException("leaf does not match " + isLeaf(leftBuffer) + " " + isLeaf(buffer) + debugId(blockId)); } int tupleSize = _tupleSize; int parentLength = getLength(parentBuffer); int parentEnd = HEADER_SIZE + parentLength * tupleSize; int parentOffset = HEADER_SIZE; int leftLength = getLength(leftBuffer); int leftEnd = HEADER_SIZE + leftLength * tupleSize; int blockLength = getLength(buffer); int blockSize = blockLength * tupleSize; for (parentOffset += tupleSize; parentOffset < parentEnd; parentOffset += tupleSize) { long pointer = getPointer(parentBuffer, parentOffset); if (pointer == blockId) { // shift the parent buffer to replace the left item with the // current item (replacing the key) System.arraycopy(parentBuffer, parentOffset, parentBuffer, parentOffset - tupleSize, parentEnd - parentOffset); // set the parent's pointer to the left block id setPointer(parentBuffer, parentOffset - tupleSize, leftBlockId); setLength(parentBuffer, parentLength - 1); // the new left.next value is the buffer's next value setPointer(leftBuffer, NEXT_OFFSET, getPointer(buffer, NEXT_OFFSET)); // append the buffer to the left buffer System.arraycopy(buffer, HEADER_SIZE, leftBuffer, leftEnd, blockSize); setLength(leftBuffer, leftLength + blockLength); return; } } // Here block is the last item in the parent long pointer = getPointer(parentBuffer, NEXT_OFFSET); if (pointer != blockId) { throw new IllegalStateException("BTree remove can't find matching block: " + debugId(blockId)); } setPointer(parentBuffer, NEXT_OFFSET, leftBlockId); setLength(parentBuffer, parentLength - 1); // the new left.next value is the buffer's next value setPointer(leftBuffer, NEXT_OFFSET, getPointer(buffer, NEXT_OFFSET)); // append the buffer to the left buffer System.arraycopy(buffer, HEADER_SIZE, leftBuffer, leftEnd, 
blockSize); setLength(leftBuffer, leftLength + blockLength); } /** * Returns the index to the right of the current one */ private long getRightBlockId(Block parent, long blockId) { byte []buffer = parent.getBuffer(); int length = getLength(buffer); int offset = HEADER_SIZE; int tupleSize = _tupleSize; int end = offset + length * tupleSize; for (; offset < end; offset += tupleSize) { long pointer = getPointer(buffer, offset); if (pointer == blockId) { if (offset + tupleSize < end) { return getPointer(buffer, offset + tupleSize); } else return getPointer(buffer, NEXT_OFFSET); } } return -1; } /** * Takes the first entry from the right block and moves it to the * last entry in the current block. * * @param parentBuffer the parent block buffer * @param rightBuffer the right block buffer * @param buffer the block's buffer * @param index the index of the block */ private void moveFromRight(byte []parentBuffer, byte []buffer, byte []rightBuffer, long blockId) { int parentLength = getLength(parentBuffer); int tupleSize = _tupleSize; int parentEnd = HEADER_SIZE + parentLength * tupleSize; int parentOffset; int rightLength = getLength(rightBuffer); int rightSize = rightLength * tupleSize; int blockLength = getLength(buffer); int blockEnd = HEADER_SIZE + blockLength * tupleSize; for (parentOffset = HEADER_SIZE; parentOffset < parentEnd; parentOffset += tupleSize) { long pointer = getPointer(parentBuffer, parentOffset); if (pointer == blockId) break; } if (parentEnd <= parentOffset) { log.warning("Can't find buffer in deletion borrow right "); return; } // copy the first item in the right to the buffer System.arraycopy(rightBuffer, HEADER_SIZE, buffer, blockEnd, tupleSize); // add the buffer length setLength(buffer, blockLength + 1); // shift the data in the right buffer System.arraycopy(rightBuffer, HEADER_SIZE + tupleSize, rightBuffer, HEADER_SIZE, rightSize - tupleSize); // subtract from the right length setLength(rightBuffer, rightLength - 1); // copy the entry from the 
new buffer tail to the buffer's parent entry System.arraycopy(buffer, blockEnd + PTR_SIZE, parentBuffer, parentOffset + PTR_SIZE, tupleSize - PTR_SIZE); } /** * Merges the buffer with the right-most one. * * <pre> * ... | leftBlock | block | rightBlock | ... * </pre> * * <pre> * ... | leftBlock | block + rightBlock | ... * </pre> */ private void mergeRight(byte []parentBuffer, byte []buffer, byte []rightBuffer, long blockId) { if (isLeaf(buffer) != isLeaf(rightBuffer)) { throw new IllegalStateException("leaf does not match " + isLeaf(buffer) + " " + isLeaf(rightBuffer) + debugId(blockId)); } int tupleSize = _tupleSize; int parentLength = getLength(parentBuffer); int parentEnd = HEADER_SIZE + parentLength * tupleSize; int parentOffset; int rightLength = getLength(rightBuffer); int rightSize = rightLength * tupleSize; int blockLength = getLength(buffer); int blockSize = blockLength * tupleSize; for (parentOffset = HEADER_SIZE; parentOffset < parentEnd; parentOffset += tupleSize) { long pointer = getPointer(parentBuffer, parentOffset); if (pointer == blockId) { // remove the buffer's pointer from the parent System.arraycopy(parentBuffer, parentOffset + tupleSize, parentBuffer, parentOffset, parentEnd - parentOffset - tupleSize); setLength(parentBuffer, parentLength - 1); // add space in the right buffer System.arraycopy(rightBuffer, HEADER_SIZE, rightBuffer, HEADER_SIZE + blockSize, rightSize); // add the buffer to the right buffer System.arraycopy(buffer, HEADER_SIZE, rightBuffer, HEADER_SIZE, blockSize); setLength(rightBuffer, blockLength + rightLength); return; } } throw new IllegalStateException("BTree merge right can't find matching index: " + debugId(blockId)); } /** * Looks up the next block given the current block and the given key. 
*/ private long lookupTuple(long blockId, byte []buffer, byte []keyBuffer, int keyOffset, int keyLength, boolean isLeaf) throws IOException { int length = getLength(buffer); int offset = HEADER_SIZE; int tupleSize = _tupleSize; int end = HEADER_SIZE + length * tupleSize; long value; while (length > 0) { int tail = offset + tupleSize * length; int delta = tupleSize * (length / 2); int newOffset = offset + delta; if (newOffset < 0) { System.out.println("UNDERFLOW: " + debugId(blockId) + " LENGTH:" + length + " STU:" + getLength(buffer) + " DELTA:" + delta); throw new IllegalStateException("lookupTuple underflow newOffset:" + newOffset); } else if (newOffset > 65536) { System.out.println("OVERFLOW: " + debugId(blockId) + " LENGTH:" + length + " STU:" + getLength(buffer) + " DELTA:" + delta); throw new IllegalStateException("lookupTuple overflow newOffset:" + newOffset); } int cmp = _keyCompare.compare(keyBuffer, keyOffset, buffer, PTR_SIZE + newOffset, keyLength); if (cmp == 0) { value = getPointer(buffer, newOffset); if (value == 0 && ! isLeaf) throw new IllegalStateException("illegal 0 value at " + newOffset + " for block " + debugId(blockId)); return value; } else if (cmp > 0) { offset = newOffset + tupleSize; length = (tail - offset) / tupleSize; } else if (cmp < 0) { length = length / 2; } if (length > 0) { } else if (isLeaf) return 0; else if (cmp < 0) { value = getPointer(buffer, newOffset); if (value == 0 && ! isLeaf) throw new IllegalStateException("illegal 0 value at " + newOffset + " for block " + debugId(blockId)); return value; } else if (offset == end) { value = getPointer(buffer, NEXT_OFFSET); if (value != 0 || isLeaf) return value; else return getPointer(buffer, end - tupleSize); /* if (value == 0 && ! isLeaf) throw new IllegalStateException("illegal 0 value at end=" + newOffset + " for block " + debugId(blockId) + " tuple=" + _tupleSize); return value; */ } else { value = getPointer(buffer, offset); if (value == 0 && ! 
isLeaf) throw new IllegalStateException("illegal 0 value at " + newOffset + " for block " + debugId(blockId)); return value; } } if (isLeaf) return 0; else { value = getPointer(buffer, NEXT_OFFSET); if (value == 0 && ! isLeaf) throw new IllegalStateException("illegal 0 value at NEXT_OFFSET for block " + debugId(blockId)); return value; } } /** * Removes from the next block given the current block and the given key. */ private long removeLeafEntry(long blockIndex, byte []buffer, byte []keyBuffer, int keyOffset, int keyLength) throws IOException { int offset = HEADER_SIZE; int tupleSize = _tupleSize; int length = getLength(buffer); for (int i = 0; i < length; i++) { int cmp = _keyCompare.compare(keyBuffer, keyOffset, buffer, offset + PTR_SIZE, keyLength); if (0 < cmp) { offset += tupleSize; continue; } else if (cmp == 0) { int blockEnd = HEADER_SIZE + length * tupleSize; if (offset + tupleSize < blockEnd) { if (offset < HEADER_SIZE) throw new IllegalStateException(); System.arraycopy(buffer, offset + tupleSize, buffer, offset, blockEnd - offset - tupleSize); } setLength(buffer, length - 1); return i; } else { return 0; } } return 0; } private void validate(long blockId, byte []buffer) { boolean isLeaf = isLeaf(buffer); if (isLeaf) return; int tupleSize = _tupleSize; int length = getLength(buffer); int end = HEADER_SIZE + tupleSize * length; if (length < 0 || BlockStore.BLOCK_SIZE < end) { throw new IllegalStateException("illegal length " + length + " for " + debugId(blockId)); } int offset; if (false && getPointer(buffer, NEXT_OFFSET) == 0) throw new IllegalStateException("Null next pointer for " + debugId(blockId)); for (offset = HEADER_SIZE; offset < end; offset += tupleSize) { if (getPointer(buffer, offset) == 0) throw new IllegalStateException("Null pointer at " + offset + " for " + debugId(blockId) + " tupleSize=" + tupleSize); } } private boolean isLeaf(byte []buffer, Block block) { int flags = getInt(buffer, FLAGS_OFFSET) & LEAF_MASK; if (flags == IS_LEAF) 
return true; else if (flags == IS_NODE) return false; else { if (! block.isIndex()) throw new IllegalStateException(L.l("block {0} is not an index block", block)); if (! block.isValid()) throw new IllegalStateException(L.l("block {0} is not valid", block)); throw new IllegalStateException(L.l("leaf value is invalid: {0} for {1}", flags, block)); } } private void validate(Block block) { isLeaf(block.getBuffer(), block); } private void validateIndex(Block block) { if (block == _rootBlock) return; block.validateIsIndex(); } private boolean isLeaf(byte []buffer) { int flags = getInt(buffer, FLAGS_OFFSET) & LEAF_MASK; if (flags == IS_LEAF) return true; else if (flags == IS_NODE) return false; else throw new IllegalStateException(L.l("leaf value is invalid: {0}", flags)); } private void setLeaf(byte []buffer, boolean isLeaf) { int flags = getInt(buffer, FLAGS_OFFSET) & ~LEAF_MASK; if (isLeaf) setInt(buffer, FLAGS_OFFSET, flags + IS_LEAF); else setInt(buffer, FLAGS_OFFSET, flags + IS_NODE); } /** * Reads an int */ private int getInt(byte []buffer, int offset) { return (((buffer[offset + 0] & 0xff) << 24) + ((buffer[offset + 1] & 0xff) << 16) + ((buffer[offset + 2] & 0xff) << 8) + ((buffer[offset + 3] & 0xff))); } /** * Reads a pointer. 
*/ private long getPointer(byte []buffer, int offset) { return (((buffer[offset + 0] & 0xffL) << 56) + ((buffer[offset + 1] & 0xffL) << 48) + ((buffer[offset + 2] & 0xffL) << 40) + ((buffer[offset + 3] & 0xffL) << 32) + ((buffer[offset + 4] & 0xffL) << 24) + ((buffer[offset + 5] & 0xffL) << 16) + ((buffer[offset + 6] & 0xffL) << 8) + ((buffer[offset + 7] & 0xffL))); } /** * Sets an int */ private void setInt(byte []buffer, int offset, int value) { buffer[offset + 0] = (byte) (value >> 24); buffer[offset + 1] = (byte) (value >> 16); buffer[offset + 2] = (byte) (value >> 8); buffer[offset + 3] = (byte) (value); } /** * Sets the length */ private void setLength(byte []buffer, int value) { if (value < 0 || BLOCK_SIZE / _tupleSize < value) { System.out.println("BAD-LENGTH: " + value); throw new IllegalArgumentException("BTree: bad length " + value); } setInt(buffer, LENGTH_OFFSET, value); } /** * Sets the length */ private int getLength(byte []buffer) { int value = getInt(buffer, LENGTH_OFFSET); if (value < 0 || value > 65536) { System.out.println("BAD-LENGTH: " + value); throw new IllegalArgumentException("BTree: bad length " + value); } return value; } /** * Sets a pointer. */ private void setPointer(byte []buffer, int offset, long value) { if (offset <= LENGTH_OFFSET) System.out.println("BAD_POINTER: " + offset); buffer[offset + 0] = (byte) (value >> 56); buffer[offset + 1] = (byte) (value >> 48); buffer[offset + 2] = (byte) (value >> 40); buffer[offset + 3] = (byte) (value >> 32); buffer[offset + 4] = (byte) (value >> 24); buffer[offset + 5] = (byte) (value >> 16); buffer[offset + 6] = (byte) (value >> 8); buffer[offset + 7] = (byte) (value); } /** * Opens the BTree. 
*
   * <p>Marks the tree as started; calls after the first return without
   * doing anything.
   */
  private void start()
    throws IOException
  {
    synchronized (this) {
      if (_isStarted)
        return;

      _isStarted = true;
    }
  }

  /**
   * Testing: returns the keys for a block.
   *
   * @param blockIndex the block's index, converted to an address by
   * multiplying by BLOCK_SIZE
   *
   * @return the keys formatted by the key comparator, or null if the block
   * is not an index block
   */
  public ArrayList<String> getBlockKeys(long blockIndex)
    throws IOException
  {
    long blockId = _store.addressToBlockId(blockIndex * BLOCK_SIZE);

    if (! _store.isIndexBlock(blockId)) {
      return null;
    }

    Block block = _store.readBlock(blockId);

    try {
      block.read();
      byte []buffer = block.getBuffer();

      int length = getInt(buffer, LENGTH_OFFSET);
      int offset = HEADER_SIZE;
      int tupleSize = _tupleSize;

      ArrayList<String> keys = new ArrayList<String>();
      for (int i = 0; i < length; i++) {
        keys.add(_keyCompare.toString(buffer,
                                      offset + i * tupleSize + PTR_SIZE,
                                      tupleSize - PTR_SIZE));
      }

      return keys;
    } finally {
      // release the block even if reading or formatting a key fails
      block.free();
    }
  }

  /**
   * Testing: returns the next pointer for a block, expressed as a block
   * index (the stored pointer divided by BLOCK_SIZE).
   *
   * @param blockIndex the block's index, converted to an address by
   * multiplying by BLOCK_SIZE
   *
   * @return the next block's index, or -1 if the block is not an index block
   */
  public long getBlockNext(long blockIndex)
    throws IOException
  {
    long blockId = _store.addressToBlockId(blockIndex * BLOCK_SIZE);

    if (! _store.isIndexBlock(blockId)) {
      return -1;
    }

    Block block = _store.readBlock(blockId);

    try {
      block.read();
      byte []buffer = block.getBuffer();

      long next = getPointer(buffer, NEXT_OFFSET);

      return next / BlockStore.BLOCK_SIZE;
    } finally {
      // release the block even if the read fails
      block.free();
    }
  }

  /**
   * Testing: creates a BTree with the default KeyCompare, rooted at a
   * newly allocated index block of a new "test" store in a database
   * initialized at the given path.
   *
   * @param path the database path
   * @param keySize the size of the tree's keys
   */
  public static BTree createTest(Path path, int keySize)
    throws IOException, java.sql.SQLException
  {
    Database db = new Database();
    db.setPath(path);
    db.init();

    BlockStore store = new BlockStore(db, "test", null);
    store.create();

    Block block = store.allocateIndexBlock();
    long blockId = block.getBlockId();
    block.free();

    return new BTree(store, blockId, keySize, new KeyCompare());
  }

  /**
   * Testing: creates a BTree with a StringKeyCompare, rooted at a newly
   * allocated index block of a store created at the given path.
   *
   * @param path the store path
   * @param keySize the size of the tree's keys
   */
  public static BTree createStringTest(Path path, int keySize)
    throws IOException, java.sql.SQLException
  {
    BlockStore store = BlockStore.create(path);

    Block block = store.allocateIndexBlock();
    long blockId = block.getBlockId();
    block.free();

    return new BTree(store, blockId, keySize, new StringKeyCompare());
  }

  /**
   * Formats a block id as hex for diagnostics.
   */
  private String debugId(long blockId)
  {
    return Long.toHexString(blockId);
  }

  /**
   * Releases the root block. The field is cleared first, so later calls
   * do nothing.
   */
  public void close()
  {
    Block rootBlock = _rootBlock;
    _rootBlock = null;

    if (rootBlock != null)
      rootBlock.free();
  }

  /**
   * Returns the class name, the store, and the root block's index.
   */
  @Override
  public String toString()
  {
    return (getClass().getSimpleName()
            + "[" + _store + "," + (_rootBlockId / BLOCK_SIZE) + "]");
  }
}