/* -*- tab-width: 4 -*-
*
* Electric(tm) VLSI Design System
*
* File: BTree.java
*
* Copyright (c) 2009 Sun Microsystems and Static Free Software
*
* Electric(tm) is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* Electric(tm) is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Electric(tm); see the file COPYING. If not, write to
* the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, Mass 02111-1307, USA.
*/
package com.sun.electric.database.geometry.btree;
import java.io.*;
import java.util.*;
import com.sun.electric.database.geometry.btree.unboxed.*;
import com.sun.electric.database.geometry.btree.CachingPageStorage.CachedPage;
/**
* A <a href="http://www.youtube.com/watch?v=coRJrcIYbF4">B+Tree</a>
* implemented using {@link PageStorage}.<p>
*
* This is a B-Plus-Tree; values are stored only in leaf nodes.<p>
*
* <h3>Usage Notes</h3>
*
* Each element in a BTree is conceptually a triple
* &lt;ordinal,key,value&gt; where "key" is a user-supplied key
* (belonging to a type that is {@link Comparable}), "value" is a
* user-supplied value (no restrictions) and "ordinal" is an integer
* indicating the number of keys in the tree less than this one.
* Note that the ordinal is not actually stored in the tree, and
* inserting a new value can potentially modify the ordinals of all
* preexisting elements! Each of the getXXX() methods takes one of
* these three coordinates (<tt>Ord</tt>, <tt>Key</tt>, or
* <tt>Val</tt>) and returns one of the others, or else a count
* (<tt>Num</tt>). Additionally, the getXXXFromKey() methods include
* floor/ceiling versions that take an upper/lower bound and search
* for the largest/smallest key which is less/greater than the one
* supplied.<p>
*
* The BTree supports appending a new element (that is, inserting a value
* with a key greater than any key in the table) in near-constant time
* (Actually log<sup>*</sup>(n)) and all other queries in log(n) time.
* All operations are done in a <i>single pass</i> down the BTree from
* the root to the leaves; this brings two benefits: the data structure
* can be made concurrent with very little lock contention and it can
* support copy-on-write shadow versions.<p>
*
* You must distinguish between insert() and replace() ahead of time;
* you can't call insert() on a key that is already in the tree or
* replace() on one that isn't. In order to replace() on a BTree
* with a summary the summary product operation must be commutative
* and invertible, and you must know the old value which you are
* replacing. This lets us update the interior node invariants as we
* walk down the tree and avoid having to walk back up afterwards.
* If you're not sure if a key is in the tree, just do a get() -- the
* net result is two passes over the tree, which is what we'd have to
* do anyways if the user didn't distinguish insert() from replace().
* We're just offering the option to double the performance in the
* case where the user already knows if the key is in the tree or
* not.<p>
*
* You can associate a <i>summary</i> with each leaf node of the BTree.
* In order to do this, you must provide an instance of {@link
* com.sun.electric.database.geometry.btree.unboxed.AssociativeOperation}
* to the BTree when you construct it. The AssociativeOperation knows
* the key and value type of the BTree, and must know two things:
*
* <ul>
* <li> How to calculate the summary of a single (key,value) pair.
* <li> How to merge two summaries.
* </ul>
*
* The process of merging two summaries must be associative; if we want
* to merge three summaries (ABC) it must not matter if we merge them as
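* ((AB)C) or (A(BC)). Technically this makes the summaries, together
* with the merge operation, a
* <i><a href="http://en.wikipedia.org/wiki/Semigroup">semigroup</a></i>.
* Note that the current implementation additionally requires the merge
* operation to be commutative: the constructor rejects any operation
* that is not an AssociativeCommutativeOperation.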
* In exchange for providing all of this information
* you can ask the BTree to calculate the summary of any contiguous
* region of the keyspace in log(n) time. For example, this can be used
* to answer "min/max over this range" queries very efficiently.<p>
*
* <h3>Implementation Notes</h3>
*
* We proactively split nodes as soon as they become full rather than
* waiting for them to become overfull. This has a space overhead of
* 1/NUM_KEYS_PER_PAGE, but puts an O(1) bound on the number of pages
* written per operation (number of pages read is still O(log n)).
* It also makes the walk routine tail-recursive.<p>
*
* Each node of the BTree uses one page of the PageStorage; we don't
* yet support situations where a single page is too small for one
* key and one value.<p>
*
* The coding style in this file is pretty unusual; it looks a lot
* like "Java as a better C". This is mainly because Hotspot is
* remarkably good at inlining methods, but remarkably bad (still,
* even in Java 1.7) at figuring out when it's safe to unbox values.
* So I expend a lot of effort trying not to create boxed values, but
* don't worry at all about the overhead of method calls,
* particularly when made via "final" references (nearly always
* inlined). I don't code this way very often -- I reserve this for
* the 2% of my code that bears 98% of the performance burden. There
* is anecdotal evidence that using the server JVM rather than the
* client JVM yields a dramatic increase in performance; this coding
* style is especially friendly to the server JVM's optimization
* techniques.<p>
*
* Keys, values, and summary elements are stored in <i>unboxed</i>
* form. This means that each of these types must know how to
* serialize itself to and deserialize itself from a sequence of
* bytes, and <i>it must be possible to perform the important
* operations (comparison for keys, product for summary values)
* on these values in unboxed form.</i> The reasons for this are set
* out in the previous paragraph. See the package btree.unboxed for
* further details.
*
* @author Adam Megacz &lt;adam.megacz@sun.com&gt;
*/
public class BTree
    <K extends Serializable & Comparable,
     V extends Serializable,
     S extends Serializable> {

    // The following fields are package-private (not private) exactly as in the
    // original; presumably the node cursors, which are constructed with a
    // reference to this tree, read them -- verify against the cursor classes.

    /** page storage holding the bytes of every node */
    final CachingPageStorage                  ps;
    /** unboxed key type; supplies serialization and comparison of keys */
    final UnboxedComparable<K>                uk;
    /** summary-combining operation, or null if this tree keeps no summaries */
    final AssociativeOperation<S>             ao;
    /** function producing the summary of a single (key,value) pair */
    final UnboxedFunction<Pair<K,V>,S>        summarize;
    /** unboxed value type */
    final Unboxed<V>                          uv;
    final UnboxedInt                          ui = UnboxedInt.instance;

    // One leaf cursor and two interior cursors are reused by every walk();
    // the two interior cursors alternate between the "current" and "parent" roles.
    private LeafNodeCursor<K,V,S>             leafNodeCursor;
    private InteriorNodeCursor<K,V,S>         interiorNodeCursor1;
    private InteriorNodeCursor<K,V,S>         interiorNodeCursor2;

    /** page id of the root node */
    int                  rootpage;

    /** scratch buffer holding the serialized form of the key argument */
    private final byte[] keybuf;
    /** scratch buffer holding the serialized form of a second key (range queries) */
    private final byte[] keybuf2;
    /** scratch buffer for a serialized summary; null when {@link #ao} is null */
    private final byte[] sbuf;

    /** serialized form of the largest key inserted so far; meaningful only if largestKeyPage!=-1 */
    private final byte[] largestKey;
    /** page id of the leaf holding {@link #largestKey}, or -1 if unknown */
    private       int    largestKeyPage = -1;

    /** number of entries in the tree; incremented by each successful insert() */
    private       int    size = 0;

    /**
     *  Create a BTree.  A fresh page is allocated from <tt>ps</tt> and
     *  initialized as an (empty) leaf which becomes the root.
     *
     *  @param ps the PageStorage to hold the underlying bytes
     *  @param uk the unboxed type for keys (must be comparable)
     *  @param uv the unboxed type for values
     *  @param summarize the function which summarizes a single (key,value) pair
     *  @param combine the function which associatively combines two summaries;
     *         may be null (no summaries), otherwise it must be an
     *         AssociativeCommutativeOperation
     *  @throws RuntimeException if <tt>combine</tt> is non-null but not commutative
     */
    public BTree(CachingPageStorage ps,
                 UnboxedComparable<K> uk,
                 Unboxed<V> uv,
                 UnboxedFunction<Pair<K,V>,S> summarize,
                 AssociativeOperation<S> combine) {
        AssociativeOperation<S> ao = combine;
        this.summarize = summarize;
        if (ao!=null) {
            if (!(ao instanceof AssociativeCommutativeOperation))
                throw new RuntimeException("Only commutative summary operations are supported (allows one-pass insertion)");
            // FIXME: if the summary is not invertible (ie a group) and commutative, we cannot do DELETE in a single pass
            // I don't think we can ever do REPLACE in one pass unless we knew the previous value
        }
        this.ps = ps;
        this.uk = uk;
        this.ao = ao;
        this.uv = uv;
        this.leafNodeCursor = new LeafNodeCursor<K,V,S>(this);
        this.interiorNodeCursor1 = new InteriorNodeCursor<K,V,S>(this);
        this.interiorNodeCursor2 = new InteriorNodeCursor<K,V,S>(this);
        this.rootpage = ps.createPage();
        this.keybuf = new byte[uk.getSize()];
        this.keybuf2 = new byte[uk.getSize()];
        this.sbuf = ao==null ? null : new byte[ao.getSize()];
        this.largestKey = new byte[uk.getSize()];
        // the tree starts out as a single leaf page
        leafNodeCursor.initBuf(ps.getPage(rootpage, false), true);
        leafNodeCursor.writeBack();
    }

    /**
     *  Returns the number of entries in the tree with a key between
     *  min and max inclusive; if either min or max is null it is treated
     *  as negative or positive infinity (respectively).
     *
     *  @throws RuntimeException ("not implemented") unless both bounds are null
     */
    public int getNumFromKeys(K min, K max) {
        if (min==null && max==null) return size;
        throw new RuntimeException("not implemented");
    }

    /** same as getNumFromKeys(null,null) */
    public int size() { return getNumFromKeys(null,null); }

    /** returns the value in the tree, or null if not found */
    @SuppressWarnings("unchecked") // walk() returns a V for this operation
    public V getValFromKey(K key) {
        uk.serialize(key, keybuf, 0);
        return (V)walk(keybuf, 0, null, Op.GET_VAL_FROM_KEY, 0);
    }

    /** returns the value of the largest key less than or equal to the one supplied */
    @SuppressWarnings("unchecked") // walk() returns a V for this operation
    public V getValFromKeyFloor(K key) {
        uk.serialize(key, keybuf, 0);
        return (V)walk(keybuf, 0, null, Op.GET_VAL_FROM_KEY_FLOOR, 0);
    }

    /**
     *  returns the value of the smallest key greater than or equal to the one supplied
     *  (currently throws "not implemented" once the walk reaches a leaf)
     */
    @SuppressWarnings("unchecked") // walk() returns a V for this operation
    public V getValFromKeyCeiling(K key) {
        uk.serialize(key, keybuf, 0);
        return (V)walk(keybuf, 0, null, Op.GET_VAL_FROM_KEY_CEIL, 0);
    }

    /** returns the ordinal of the given key, or -1 if not found */
    public int getOrdFromKey(K key) {
        uk.serialize(key, keybuf, 0);
        return ((Integer)walk(keybuf, 0, null, Op.GET_ORD_FROM_KEY, 0)).intValue();
    }

    /** returns the ordinal of the largest key less than or equal to the one supplied */
    public int getOrdFromKeyFloor(K key) {
        uk.serialize(key, keybuf, 0);
        return ((Integer)walk(keybuf, 0, null, Op.GET_ORD_FROM_KEY_FLOOR, 0)).intValue();
    }

    /** returns the ordinal of the smallest key greater than or equal to the one supplied */
    public int getOrdFromKeyCeiling(K key) {
        uk.serialize(key, keybuf, 0);
        return ((Integer)walk(keybuf, 0, null, Op.GET_ORD_FROM_KEY_CEIL, 0)).intValue();
    }

    /**
     *  returns the least key <i>strictly</i> greater than the argument (not implemented).
     *  NOTE(review): the declared return type is V although the description
     *  talks about a key; left unchanged to keep the public signature stable.
     */
    public V getKeyFromKeyNext(K key) {
        throw new RuntimeException("not implemented");
    }

    /**
     *  returns the greatest key <i>strictly</i> less than the argument (not implemented).
     *  NOTE(review): the declared return type is V although the description
     *  talks about a key; left unchanged to keep the public signature stable.
     */
    public V getKeyFromKeyPrev(K key) {
        throw new RuntimeException("not implemented");
    }

    /** returns the i^th value in the tree, or null if the ordinal is out of range */
    @SuppressWarnings("unchecked") // walk() returns a V for this operation
    public V getValFromOrd(int ord) {
        return (V)walk(null, 0, null, Op.GET_VAL_FROM_ORD, ord);
    }

    /** returns the i^th key in the tree, or null if the ordinal is out of range */
    @SuppressWarnings("unchecked") // walk() returns a K for this operation
    public K getKeyFromOrd(int ord) {
        return (K)walk(null, 0, null, Op.GET_KEY_FROM_ORD, ord);
    }

    /** will throw an exception if the key is already in the tree */
    public void insert(K key, V val) {
        uk.serialize(key, keybuf, 0);
        walk(keybuf, 0, val, Op.INSERT, 0);
        // only reached if walk() did not throw
        size++;
    }

    /** returns value previously in the tree; will throw an exception if the key is not already in the tree */
    @SuppressWarnings("unchecked") // walk() returns the previous V for this operation
    public V replace(K key, V val) {
        uk.serialize(key, keybuf, 0);
        return (V)walk(keybuf, 0, val, Op.REPLACE, 0);
    }

    /** returns value previously in the tree; will throw an exception if the key is not already in the tree (not implemented) */
    public V remove(K key) {
        throw new RuntimeException("not implemented");
        // size--;
    }

    /** remove all entries (not implemented) */
    public void clear() {
        throw new RuntimeException("not implemented");
    }

    /**
     *  compute the summary of all (key,value) pairs between min and max, inclusive.
     *
     *  The SUMMARIZE_* operations are not implemented by walk() yet, so this
     *  currently always throws; previously the walk fell through into the
     *  insert/replace code and could modify the tree.
     *
     *  @throws RuntimeException if the tree was built without a summary
     *          operation, or because summarization is not implemented
     */
    @SuppressWarnings("unchecked")
    public S getSummaryFromKeys(K min, K max) {
        if (ao==null)
            throw new RuntimeException("this BTree was constructed without a summary operation");
        uk.serialize(min, keybuf, 0);
        uk.serialize(max, keybuf2, 0);
        walk(keybuf, 0, null, Op.SUMMARIZE_LEFT,  0, keybuf2, 0, sbuf, 0);
        walk(keybuf, 0, null, Op.SUMMARIZE_MID,   0, keybuf2, 0, sbuf, 0);
        walk(keybuf, 0, null, Op.SUMMARIZE_RIGHT, 0, keybuf2, 0, sbuf, 0);
        return (S)ao.deserialize(sbuf, 0);
    }

    /** The operation a single walk() down the tree is performing. */
    private static enum Op {
        GET_VAL_FROM_KEY,
        GET_VAL_FROM_KEY_FLOOR,
        GET_VAL_FROM_KEY_CEIL,
        GET_ORD_FROM_KEY,
        GET_ORD_FROM_KEY_FLOOR,
        GET_ORD_FROM_KEY_CEIL,
        GET_VAL_FROM_ORD,
        GET_KEY_FROM_ORD,
        GET_NEXT,
        GET_PREV,
        REMOVE,
        INSERT,
        REPLACE,
        SUMMARIZE_LEFT,
        SUMMARIZE_MID,
        SUMMARIZE_RIGHT,
        ;

        /** true for operations which are addressed by ordinal rather than by key */
        public boolean isGetFromOrd() {
            return this==GET_VAL_FROM_ORD || this==GET_KEY_FROM_ORD;
        }

        /** true for operations whose result is an ordinal */
        public boolean isGetOrd() {
            return this==GET_ORD_FROM_KEY
                || this==GET_ORD_FROM_KEY_FLOOR
                || this==GET_ORD_FROM_KEY_CEIL;
        }

        /** true for read operations which are addressed by key */
        public boolean isGetFromKey() {
            return this==GET_VAL_FROM_KEY
                || this==GET_VAL_FROM_KEY_FLOOR
                || this==GET_VAL_FROM_KEY_CEIL
                || isGetOrd();
        }

        /** true for the "floor" variants of the by-key read operations */
        public boolean isGetFromKeyFloor() {
            return this==GET_VAL_FROM_KEY_FLOOR || this==GET_ORD_FROM_KEY_FLOOR;
        }

        /** true for the "ceiling" variants of the by-key read operations */
        public boolean isGetFromKeyCeil() {
            return this==GET_VAL_FROM_KEY_CEIL || this==GET_ORD_FROM_KEY_CEIL;
        }
    }

    /** Convenience overload of walk() for operations that need neither a second key nor a result buffer. */
    private Object walk(byte[] key, int key_ofs, V val, Op op, int ord) {
        return walk(key, key_ofs, val, op, ord, null, 0, null, 0);
    }

    /**
     *  B+Tree walking routine.
     *
     *  This is the hairiest part, so I arranged things to share a single
     *  codepath across all four operations (insert/replace/delete/find).
     *
     *  The routine is implemented using a loop rather than recursive
     *  calls because the JVM does not support tail recursion (and
     *  probably never will, because its lame security model is based
     *  on stack inspection).
     *
     *  On writes/deletes, this returns the previous value.
     *
     *  Full nodes met on the way down during INSERT/REPLACE are split,
     *  after which the walk restarts from the root.  INSERT additionally
     *  has a fast path: a key strictly greater than the recorded largest
     *  key is appended directly to the leaf holding that key, provided
     *  the leaf is not full.
     *
     *  @param key      serialized key (null for the by-ordinal operations)
     *  @param key_ofs  offset of the key within <tt>key</tt>
     *  @param val      value to store (INSERT/REPLACE), otherwise null
     *  @param op       the operation to perform
     *  @param ord      the ordinal sought (by-ordinal operations only)
     *  @param key2     second serialized key; currently unused
     *  @param key2_ofs offset within <tt>key2</tt>; currently unused
     *  @param ret      result buffer; currently unused
     *  @param ret_ofs  offset within <tt>ret</tt>; currently unused
     *  @return a value, key or boxed ordinal depending on <tt>op</tt>
     *  @throws RuntimeException for operations that are not implemented,
     *          for INSERT of an existing key and for REPLACE of a missing key
     */
    private Object walk(byte[] key, int key_ofs, V val, Op op, int ord, byte[] key2, int key2_ofs, byte[] ret, int ret_ofs) {
        int pageid = rootpage;
        int idx = -1;
        int global_ord = 0;

        LeafNodeCursor<K,V,S>       leafNodeCursor = this.leafNodeCursor;
        InteriorNodeCursor<K,V,S>   interiorNodeCursor = this.interiorNodeCursor1;
        InteriorNodeCursor<K,V,S>   parentNodeCursor = this.interiorNodeCursor2;
        NodeCursor cur = null;

        boolean rightEdge = true;   // true as long as we have only followed last buckets
        boolean cheat = false;      // true while on the append fast path
        int comp = 0;

        if (largestKeyPage != -1 && op==Op.INSERT) {
            leafNodeCursor.setBuf(ps.getPage(largestKeyPage, true));
            comp = uk.compare(key, key_ofs, largestKey, 0);
            // Strictly greater only: a key equal to the largest key is a
            // duplicate and must take the normal path so that it is rejected
            // (the fast path below forces comp=1, skipping the duplicate check).
            if (comp > 0 && !leafNodeCursor.isFull()) {
                pageid = largestKeyPage;
                parentNodeCursor.forgetCachedPage();
                cheat = true;
                cur = leafNodeCursor;
            }
        }

        while(true) {
            // (re)load the node for pageid unless a cursor already points at it
            if (cur==null || cur.getCachedPage()==null || cur.getPageId() != pageid) {
                CachedPage cp = ps.getPage(pageid, true);
                cur = LeafNodeCursor.isLeafNode(cp) ? leafNodeCursor : interiorNodeCursor;
                cur.setBuf(cp);
            }

            // proactively split full nodes on the way down
            if ((op==Op.INSERT || op==Op.REPLACE) && cur.isFull()) {
                assert cur!=parentNodeCursor;
                int old;
                // is the node we're splitting the last child of its parent or the root node?
                boolean splitting_last_or_root = false;
                if (pageid == rootpage) {
                    parentNodeCursor.initRoot();
                    parentNodeCursor.setBucketPageId(0, pageid);
                    idx = 0;
                    old = size;
                    splitting_last_or_root = true;
                } else {
                    assert !parentNodeCursor.isFull();
                    splitting_last_or_root = idx>=parentNodeCursor.getNumBuckets()-1;
                    old = splitting_last_or_root ? -1 : parentNodeCursor.getNumValsBelowBucket(idx);
                }
                // on INSERT the parent's count was already bumped on the way down
                if (op==Op.INSERT && old!=-1) old -= 1;
                int ofs = parentNodeCursor.insertNewBucketAt(idx+1);
                int oldpage = cur.getPageId();

                // optimization: if we're splitting a node on the
                // "right edge" of the tree, make the split uneven --
                // put everything on the left side.
                int splitPoint = rightEdge ? cur.getNumBuckets()-1 : cur.getMaxBuckets()/2;
                if (rightEdge) splitUnEven++; else splitEven++;

                if (ao!=null) {
                    // NOTE(review): the summary of bucket 0 is fetched and then
                    // overwritten before being used; left exactly as found
                    // because the summary path is not otherwise functional yet.
                    byte[] monbuf = new byte[ao.getSize()];
                    cur.getSummary(0, monbuf, 0);
                    for(int i=1; i<splitPoint; i++) {
                        cur.getSummary(i, monbuf, 0);
                        parentNodeCursor.multiplySummaryCommutative(idx, monbuf, 0);
                    }
                }

                int num = cur.split(parentNodeCursor.getBuf(), ofs, splitPoint);
                parentNodeCursor.setNumValsBelowBucket(idx, num);
                // after split() the cursor reports a different page id, which
                // becomes the new right-hand sibling (bucket idx+1)
                int newpage = cur.getPageId();
                if (largestKeyPage==oldpage) largestKeyPage = newpage;
                parentNodeCursor.setBucketPageId(idx+1, newpage);
                if (!splitting_last_or_root)
                    parentNodeCursor.setNumValsBelowBucket(idx+1, old-num);
                if (ao!=null && (!parentNodeCursor.isRightMost() || idx+1<parentNodeCursor.getNumBuckets()-1)) {
                    // NOTE(review): same pattern as above -- bucket 0's summary is overwritten unused.
                    byte[] monbuf = new byte[ao.getSize()];
                    cur.getSummary(0, monbuf, 0);
                    for(int i=1; i<cur.getNumBuckets() - (cur.isRightMost() ? 1 : 0); i++) {
                        cur.getSummary(i, monbuf, 0);
                        parentNodeCursor.multiplySummaryCommutative(idx+1, monbuf, 0);
                    }
                }
                cur.writeBack();
                parentNodeCursor.writeBack();
                // start over from the root; the fast path is no longer valid
                pageid = rootpage;
                cheat = false;
                continue;
            }

            if (cheat) {
                // fast path: append after the last bucket of the leaf
                idx = leafNodeCursor.getNumBuckets()-1;
                comp = 1;
            } else if (!op.isGetFromOrd()) {
                idx = cur.search(key, key_ofs);
                comp = cur.compare(key, key_ofs, idx);
            } else if (!cur.isLeafNode()) {
                // FIXME: linear scan => bad
                for(idx = 0; idx < interiorNodeCursor.getNumBuckets()-1; idx++) {
                    int k = interiorNodeCursor.getNumValsBelowBucket(idx);
                    if (ord < k) break;
                    ord -= k;
                }
            }

            if (cur.isLeafNode()) {
                switch(op) {
                    case GET_VAL_FROM_ORD:       return (ord < 0 || ord >= leafNodeCursor.getNumBuckets()) ? null : leafNodeCursor.getVal(ord);
                    case GET_KEY_FROM_ORD:       return (ord < 0 || ord >= leafNodeCursor.getNumBuckets()) ? null : leafNodeCursor.getKey(ord);
                    case GET_VAL_FROM_KEY:       return comp==0 ? leafNodeCursor.getVal(idx) : null;
                    case GET_VAL_FROM_KEY_FLOOR: return leafNodeCursor.getVal(idx);
                    case GET_VAL_FROM_KEY_CEIL:  /* FIXME: might need to backtrack one step */ throw new RuntimeException("not implemented");
                    case GET_ORD_FROM_KEY:       return comp==0 ? Integer.valueOf(idx+global_ord) : Integer.valueOf(-1);
                    case GET_ORD_FROM_KEY_FLOOR: return Integer.valueOf(idx+global_ord /*FIXME: off the end?*/);
                    case GET_ORD_FROM_KEY_CEIL:  return comp==0 ? Integer.valueOf(idx+global_ord) : Integer.valueOf(idx+global_ord+1 /*FIXME: off the end?*/);
                    case INSERT:
                    case REPLACE:
                        break;
                    default:
                        // GET_NEXT, GET_PREV, REMOVE and SUMMARIZE_* have no leaf
                        // handling; refuse them rather than falling into the
                        // mutating code below.
                        throw new RuntimeException("not implemented: " + op);
                }

                if (op==Op.INSERT && comp==0) throw new RuntimeException("attempt to re-insert a value at key " + leafNodeCursor.getKey(idx));
                if (op==Op.REPLACE && comp!=0) throw new RuntimeException("attempt to replace a value that did not exist");
                if (op==Op.INSERT) { if (cheat) insertionFastPath++; else insertionSlowPath++; }

                // Keep the record of the largest key exact.  It must also be
                // refreshed when a new maximum arrives via the slow path (e.g.
                // because the rightmost leaf was full and had to be split);
                // otherwise a later key lying between the stale record and the
                // true maximum would be appended out of order by the fast path.
                boolean newLargest =
                    largestKeyPage==-1
                    || cheat
                    || (op==Op.INSERT && uk.compare(key, key_ofs, largestKey, 0) > 0);
                if (newLargest) {
                    System.arraycopy(key, key_ofs, largestKey, 0, largestKey.length);
                    largestKeyPage = pageid;
                }

                if (comp==0) {
                    if (val==null) throw new RuntimeException("deletion is not yet implemented");
                    return leafNodeCursor.setVal(idx, val);
                }
                leafNodeCursor.insertVal(idx+1, key, key_ofs, val);
                return null;

            } else {
                if (op==Op.REMOVE)
                    throw new RuntimeException("need to adjust 'least value under X' on the way down for deletions");
                if (op==Op.INSERT) {
                    boolean wb = false;
                    // the last bucket of a node carries no count
                    if (idx < interiorNodeCursor.getNumBuckets()-1) {
                        interiorNodeCursor.setNumValsBelowBucket(idx, interiorNodeCursor.getNumValsBelowBucket(idx)+1);
                        wb = true;
                    }
                    if (ao != null && (idx < interiorNodeCursor.getNumBuckets()-1 || !interiorNodeCursor.isRightMost())) {
                        throw new RuntimeException("not implemented");
                        /*
                        // FIXME
                        byte[] monbuf = new byte[ao.getSize()];
                        byte[] vbuf = new byte[uv.getSize()];
                        uv.serialize(val, vbuf, 0);
                        summarize.call(key, key_ofs, vbuf, 0, monbuf, 0);
                        interiorNodeCursor.multiplySummaryCommutative(idx, monbuf, 0);
                        */
                    }
                    if (wb) interiorNodeCursor.writeBack();
                }
                // accumulate the number of entries to the left of the path taken
                if (op.isGetOrd())
                    for(int i = 0; i < idx; i++)
                        global_ord += interiorNodeCursor.getNumValsBelowBucket(i);
                rightEdge &= idx==interiorNodeCursor.getNumBuckets()-1;
                pageid = interiorNodeCursor.getBucketPageId(idx);
                // descend: the current interior cursor becomes the parent cursor
                InteriorNodeCursor<K,V,S> ic = interiorNodeCursor; interiorNodeCursor = parentNodeCursor; parentNodeCursor = ic;
                assert interiorNodeCursor!=parentNodeCursor;
                continue;
            }
        }
    }

    // Process-wide debugging counters (shared by all BTree instances, unsynchronized).
    static long insertionFastPath = 0;
    static long insertionSlowPath = 0;
    static long splitEven = 0;
    static long splitUnEven = 0;

    /** debugging method; may go away in future releases */
    public static void clearStats() {
        BTree.splitUnEven = 0;
        BTree.splitEven = 0;
        BTree.insertionFastPath = 0;
        BTree.insertionSlowPath = 0;
    }

    /** debugging method; may go away in future releases */
    public static void dumpStats(PrintStream pw) {
        long insertions = BTree.insertionFastPath+BTree.insertionSlowPath;
        long splits = BTree.splitUnEven+BTree.splitEven;
        pw.println("BTree stats: insertion fastpath = " +
                   BTree.insertionFastPath + "/" + insertions + " = " +
                   percent(BTree.insertionFastPath, insertions) + "%");
        pw.println(" intelligent splits = " +
                   BTree.splitUnEven + "/" + splits + " = " +
                   percent(BTree.splitUnEven, splits) + "%");
    }

    /**
     *  part as a truncated percentage of total.  The division is done in
     *  floating point, so 0 out of 0 yields NaN which the cast turns into 0
     *  instead of raising an ArithmeticException.
     */
    private static int percent(long part, long total) {
        return (int)(( part * 100 )/(float)total);
    }
}