/** Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. Contact: SYSTAP, LLC DBA Blazegraph 2501 Calvert ST NW #106 Washington, DC 20008 licenses@blazegraph.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* * Created on Nov 15, 2006 */ package com.bigdata.btree; import java.io.PrintStream; import java.lang.ref.Reference; import java.util.HashSet; import java.util.Iterator; import java.util.Set; import java.util.concurrent.Executor; import java.util.concurrent.FutureTask; import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.atomic.AtomicReferenceArray; import org.apache.log4j.Level; import com.bigdata.BigdataStatics; import com.bigdata.btree.AbstractBTree.ChildMemoizer; import com.bigdata.btree.AbstractBTree.LoadChildRequest; import com.bigdata.btree.data.DefaultNodeCoder; import com.bigdata.btree.data.INodeData; import com.bigdata.btree.raba.IRaba; import com.bigdata.btree.raba.MutableKeyBuffer; import com.bigdata.io.AbstractFixedByteArrayBuffer; import com.bigdata.journal.Journal; import com.bigdata.rawstore.IRawStore; import com.bigdata.util.BytesUtil; import com.bigdata.util.concurrent.LatchedExecutor; import com.bigdata.util.concurrent.Memoizer; import cutthecrap.utils.striterators.EmptyIterator; import cutthecrap.utils.striterators.Expander; import cutthecrap.utils.striterators.SingleValueIterator; import cutthecrap.utils.striterators.Striterator; /** * <p> * A non-leaf node. * </p> * <h2>Per-child min/max revision timestamps and timestamp revision filtering</h2> * * In order to track the min/max timestamp on the {@link Node} we must also * track the min/max timestamp for each direct child of that {@link Node}. While * this inflates the size of the {@link INodeData} data record considerably, we * are required to track those per-child data in order to avoid a scan of the * children when we need to recompute the min/max timestamp for the {@link Node} * . The IO latency costs of that scan are simply not acceptable, especially for * large branching factors. The min/max timestamp on the {@link Node} is ONLY * used for filtering iterators based on a desired tuple revision range. This is * why the choice to support tuple revision filters is its own configuration * option. * * FIXME An alternative to per-child min/max tuple revision timestamps would be * the concurrent materialization of the direct children. These data are only * mutable for B+Tree instances with relatively small branching factors. They * are immutable for the {@link IndexSegment}. However, the per-{@link Node} * min/max timestamp also makes the tuple revision filtering more efficient since * we can prune the search before we materialize the child. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> */ public class Node extends AbstractNode<Node> implements INodeData { /** * The data record. {@link MutableNodeData} is used for all mutation * operations.
{@link ReadOnlyNodeData} is used when the {@link Node} is * made persistent. A read-only data record is automatically converted into * a {@link MutableNodeData} record when a mutation operation is requested. * <p> * Note: This is package private in order to expose it to {@link Leaf}. * * @todo consider volatile and private for {@link Node#data} and * {@link Leaf#data} with accessors and setters at package private * where necessary. */ INodeData data; /** * <p> * Weak references to child nodes (may be nodes or leaves). The capacity of * this array is m, where m is the {@link #branchingFactor}. Valid indices * are in [0:nkeys+1] since nchildren := nkeys+1 for a {@link Node}. * </p> * <p> * This array is dimensioned to one more than the maximum capacity so that * the child reference corresponding to the key that causes overflow and * forces the split may be inserted. This greatly simplifies the logic for * computing the split point and performing the split. * </p> * <p> * Note: This should not be marked as volatile. Volatile does not make the * elements of the array volatile, only the array reference itself. The * field would be final except that we clear the reference when stealing the * array or deleting the node. * </p> * * @todo document why package private (AbstractBTree.loadChild uses this but * maybe that method could be moved to Node). */ transient/* volatile */Reference<AbstractNode<?>>[] childRefs; // /** // * An array of objects used to provide a per-child lock in order to allow // * maximum concurrency in {@link #getChild(int)}. // * <p> // * Note: this array is not allocated for a mutable btree since the caller // * will be single threaded and locking is therefore not required in // * {@link #getChild(int)}. We only need locking for read-only btrees since // * they allow concurrent readers. // * // * @todo There is a LOT of overhead to creating all these objects. They // are // * only really useful in high concurrent read scenarios such as highly // * concurrent query against an index. However, those situations // * typically have a lot of buffered B+Tree nodes so the in-memory // * footprint for this array is not really worth it. // * <p> // * This might be viable if we had an AtomicBitVector class since we // * could typically get by with 1-2 longs of data in that case. // */ // transient private Object[] childLocks; /** * Return <code>((branchingFactor + 1) >> 1) - 1</code>, which is the * minimum #of keys for a {@link Node}. */ protected final int minKeys() { // /* // * Compute the minimum #of children/values. This is the same whether // * this is a Node or a Leaf. // */ // final int minChildren = (btree.branchingFactor + 1) >> 1; // // // this.minKeys = isLeaf() ? minChildren : minChildren - 1; // // return minChildren - 1; return btree.minChildren - 1; } /** * Return <code>branchingFactor - 1</code>, which is the maximum #of keys * for a {@link Node}. */ protected final int maxKeys() { // // The maximum #of keys is easy to compute. // this.maxKeys = isLeaf() ? branchingFactor : branchingFactor - 1; return btree.branchingFactor - 1; } /** * Range check a child index. * * @param index * The index of a child in [0:nkeys+1]. * @return <code>true</code> * * @throws IndexOutOfBoundsException * if the index is not in the legal range. */ final protected boolean rangeCheckChildIndex(final int index) { if (index < 0 || index > data.getKeyCount() + 1) throw new IndexOutOfBoundsException(); return true; } /** * Return the {@link Reference} for the child. This is part of the internal * API. * * @param index * The index of the child.
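* Valid indices are in [0:nkeys+1], as for {@link #childRefs}.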
* * @return The {@link Reference} for that child. This will be * <code>null</code> if the child is not buffered. Note that a non- * <code>null</code> return MAY still return <code>null</code> from * {@link Reference#get()}. */ public final Reference<AbstractNode<?>> getChildRef(final int index) { assert rangeCheckChildIndex(index); return childRefs[index]; } final public INodeData getDelegate() { return data; } /* * INodeData. */ /** * Always returns <code>false</code>. */ final public boolean isLeaf() { return false; } /** * The result depends on the backing {@link INodeData} implementation. The * {@link Node} will be mutable when it is first created and is made * immutable when it is persisted. If there is a mutation operation, the * backing {@link INodeData} is automatically converted into a mutable * instance. */ final public boolean isReadOnly() { return data.isReadOnly(); } final public boolean isCoded() { return data.isCoded(); } final public AbstractFixedByteArrayBuffer data() { return data.data(); } final public int getKeyCount() { return data.getKeyCount(); } final public IRaba getKeys() { return data.getKeys(); } final public int getChildCount() { return data.getChildCount(); } public final long getSpannedTupleCount() { return data.getSpannedTupleCount(); } public final long getChildAddr(final int index) { return data.getChildAddr(index); } final public long getChildEntryCount(final int index) { return data.getChildEntryCount(index); } final public long getMaximumVersionTimestamp() { return data.getMaximumVersionTimestamp(); } public long getMinimumVersionTimestamp() { return data.getMinimumVersionTimestamp(); } final public boolean hasVersionTimestamps() { return data.hasVersionTimestamps(); } /** * Apply the delta to the per-child count for this node and then recursively * ascend up the tree applying the delta to all ancestors of this node. This * is invoked solely by the methods that add and remove entries from a leaf * as those are the only methods that change the #of entries spanned by a * parent node. Methods that split, merge, or redistribute keys have a net * zero effect on the #of entries spanned by the parent. * <p> * This also updates {@link #getMinimumVersionTimestamp()} and * {@link #getMaximumVersionTimestamp()} iff version timestamps are enabled * and the child has a minimum (maximum) version timestamp LT (GT) the * minimum (maximum) version timestamp of this node. * * @param child * The direct child. * @param delta * The change in the #of spanned tuples. */ final protected void updateEntryCount(final AbstractNode<?> child, final long delta) { final int index = getIndexOf(child); assert !isReadOnly(); final MutableNodeData data = (MutableNodeData) this.data; data.childEntryCounts[index] += delta; data.nentries += delta; if (data.childEntryCounts[index] <= 0) { // There must be at least one tuple spanned by the child. throw new RuntimeException(); } if (data.nentries <= 0) { // There must be at least one tuple spanned by this node. throw new RuntimeException(); } if (child.hasVersionTimestamps()) { final long cmin = child.getMinimumVersionTimestamp(); final long cmax = child.getMaximumVersionTimestamp(); if (cmin < data.minimumVersionTimestamp) data.minimumVersionTimestamp = cmin; if (cmax > data.maximumVersionTimestamp) data.maximumVersionTimestamp = cmax; } if (parent != null) { parent.get().updateEntryCount(this, delta); } } /** * Update the {@link #getMinimumVersionTimestamp()} and * {@link #getMaximumVersionTimestamp()}.
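* These updates can only widen the range on this node: the minimum * (maximum) is replaced iff the child reports a smaller (larger) timestamp.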
This is invoked when the min/max * in the child has changed without a corresponding change to the #of * spanned tuples. E.g., when an insert() causes a tuple to be updated * rather than added. * * @param child * The direct child. */ final protected void updateMinMaxVersionTimestamp( final AbstractNode<?> child) { assert !isReadOnly(); final MutableNodeData data = (MutableNodeData) this.data; final long cmin = child.getMinimumVersionTimestamp(); final long cmax = child.getMaximumVersionTimestamp(); if (cmin < data.minimumVersionTimestamp) data.minimumVersionTimestamp = cmin; if (cmax > data.maximumVersionTimestamp) data.maximumVersionTimestamp = cmax; if (parent != null) { parent.get().updateMinMaxVersionTimestamp(child); } } /** * De-serialization constructor. * <p> * Note: The de-serialization constructor (and ONLY the de-serialization * constructor) ALWAYS creates a clean node. Therefore the {@link PO#dirty} * flag passed up from this constructor has the value <code>false</code>. * * @param btree * The tree to which the node belongs. * @param addr * The persistent identity of the node. * @param data * The data record. */ @SuppressWarnings("unchecked") protected Node(final AbstractBTree btree, final long addr, final INodeData data) { super(btree, false /* The node is NOT dirty */); final int branchingFactor = btree.branchingFactor; // assert branchingFactor >= Options.MIN_BRANCHING_FACTOR; // // assert nkeys < branchingFactor; assert data != null; // assert childAddr.length == branchingFactor + 1; // // assert childEntryCounts.length == branchingFactor + 1; setIdentity(addr); this.data = data; // this.nentries = nentries; // // // this.nkeys = keys.size(); // // this.keys = keys; // steal reference. // // this.childAddr = childAddr; // // this.childEntryCounts = childEntryCounts; childRefs = new Reference[branchingFactor + 1]; // childLocks = newChildLocks(btree, data.getKeys().size()); // // must clear the dirty flag since we just de-serialized this node. // setDirty(false); } /** * Used to create a new node when a node is split. */ @SuppressWarnings("unchecked") protected Node(final BTree btree) { super(btree, true /* dirty */); final int branchingFactor = btree.branchingFactor; data = new MutableNodeData(branchingFactor, btree.getIndexMetadata() .getVersionTimestamps()); childRefs = new Reference[branchingFactor + 1]; // childLocks = newChildLocks(btree, 0/* nkeys */); // nentries = 0; // // keys = new MutableKeyBuffer(branchingFactor); // // childAddr = new long[branchingFactor + 1]; // // childEntryCounts = new int[branchingFactor + 1]; } /** * This constructor is used when splitting a root {@link Leaf} or a root * {@link Node}. The resulting node has a single child reference and NO * keys. The #of entries allocated to the child is the #of entries remaining * in that child <em>after</em> the split. * * @param btree * A mutable btree. * @param oldRoot * The node that was previously the root of the tree (either a * node or a leaf). * @param nentries * The #of entries spanned by the oldRoot <em>before</em> the * split. */ @SuppressWarnings("unchecked") protected Node(final BTree btree, final AbstractNode oldRoot, final long nentries) { super(btree, true /* dirty */); // Verify that this is the root. assert oldRoot == btree.root; // The old root must be dirty when it is being split.
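// (Copy-on-write guarantees this: mutation always descends through mutable // nodes, so a root that is about to be split has already been cloned.)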
assert oldRoot.isDirty(); final int branchingFactor = btree.branchingFactor; childRefs = new Reference[branchingFactor + 1]; // Note: child locks are only for read-only btrees and this ctor is only // for a split of the root leaf so we never use child locks for this // case. assert !btree.isReadOnly(); // childLocks = null; // newChildLocks(btree); final MutableNodeData data; this.data = data = new MutableNodeData(branchingFactor, btree .getIndexMetadata().getVersionTimestamps()); // #of entries spanned by the old root _before_ this split. data.nentries = nentries; // keys = new MutableKeyBuffer(branchingFactor); // // childAddr = new long[branchingFactor + 1]; // // childEntryCounts = new int[branchingFactor + 1]; /* * Replace the root node on the tree. */ final boolean wasDirty = btree.root.dirty; btree.root = this; if (!wasDirty) { btree.fireDirtyEvent(); } /* * Attach the old root to this node. */ childRefs[0] = oldRoot.self; // childRefs[0] = btree.newRef(oldRoot); // #of entries from the old root _after_ the split. data.childEntryCounts[0] = (oldRoot.isLeaf() ? ((Leaf) oldRoot) .getKeyCount() : ((Node) oldRoot).getSpannedTupleCount()); // dirtyChildren.add(oldRoot); oldRoot.parent = this.self; // oldRoot.parent = btree.newRef(this); /* * The tree is deeper since we just split the root node. */ btree.height++; final int requiredQueueCapacity = 2 * (btree.height + 2); if (requiredQueueCapacity > btree.writeRetentionQueue.capacity()) { /* * FIXME Automatically extend the hard reference queue capacity such * that (a) the constraint described below is never violated; and * (b) the percentage of distinct nodes (or nodes and leaves) on the * queue compared to the nodes (or nodes and leaves) in the tree * either remains constant or does not degrade overly quickly. * * Constraint: The capacity of the hard reference queue needs to be * at least the height of the tree + 2 (or twice that if we touch * nodes on the way down and then on the way up again) so that a * split can not cause any dirty node in the path to the leaf or its * sibling to be evicted while we are processing that split. Note * that there is no chance of the nodes being swept by the JVM since * there are hard references to them on the stack, but if they are * evicted then the copy-on-write mechanism could be defeated since * a dirty parent could be evicted, forcing the child to become * immutable right when we are trying to operate on it. * * Performance: Given a constant capacity for the hard reference * queue the percentage of distinct nodes on the queue out of the * total #of nodes in the tree will drop as the tree grows. This * translates directly into more (de-)serialization of nodes and * more disk seeks (if the store is not fully buffered). * * However, simply increasing the queue capacity will cause more * data to be buffered pending serialization and therefore will * increase the commit latency. Instead, we should probably * introduce a secondary hard reference retention mechanism based * more directly on the #of nodes that we want to retain in memory. * One approach is to say N-1 or N-2 levels of the tree, and that * might be a good heuristic. However, the selection of the specific * nodes that are retained should probably be somewhat dynamic so * that we do not force the JVM to hold onto references for nodes * that we are never or only rarely visiting given the actual access * patterns in the application.
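* * To make the current failure mode concrete: requiredQueueCapacity is * computed above as 2 * (height + 2), so with a write retention queue * capacity of, say, 500 the exception below is first thrown when the * height reaches 249, since 2 * (249 + 2) = 502 > 500.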
*/ throw new UnsupportedOperationException( "writeRetentionQueue: capacity=" + btree.writeRetentionQueue.capacity() + ", but height=" + btree.height); } btree.nnodes++; btree.getBtreeCounters().rootsSplit++; if (BTree.INFO || BigdataStatics.debug) { // Note: nnodes and nleaves might not reflect rightSibling yet. final String msg = "BTree: increasing height: name=" + btree.metadata.getName() + ", height=" + btree.height + ", m=" + btree.getBranchingFactor() + ", nentries=" + btree.nentries; if (BTree.INFO) BTree.log.info(msg); if (BigdataStatics.debug) System.err.println(msg); } } /** * Copy constructor. * * @param src * The source node (must be immutable). * * @param triggeredByChildId * The persistent identity of the child that triggered the copy * constructor. This should be the immutable child NOT the one * that was already cloned. This information is used to avoid * stealing the original child since we already made a copy of * it. It is {@link #NULL} when this information is not * available, e.g., when the copyOnWrite action is triggered by a * join() and we are cloning the sibling before we redistribute a * key to the node/leaf on which the join was invoked. * * @todo We could perhaps replace this with the conversion of the * INodeData:data field to a mutable field since the code which * invokes copyOnWrite() no longer needs to operate on a new Node * reference. However, I need to verify that nothing else depends on * the new Node, e.g., the dirty flag, addr, etc. * * @todo Can't we just test to see if the child already has this node as its * parent reference and then skip it? If so, then that would remove a * troublesome parameter from the API. */ protected Node(final Node src, final long triggeredByChildId) { super(src); assert !src.isDirty(); assert src.isReadOnly(); // assert src.isPersistent(); /* * Steal/clone the data record. * * Note: The copy constructor is invoked when we need to begin mutation * operations on an immutable node or leaf, so make sure that the data * record is mutable. */ assert src.data != null; this.data = src.isReadOnly() ? new MutableNodeData(src .getBranchingFactor(), src.data) : src.data; assert this.data != null; // clear reference on source. src.data = null; /* * Steal strongly reachable unmodified children by setting their parent * fields to the new node. Stealing the child means that it MUST NOT be * used by its previous ancestor (our source node for this copy). */ childRefs = src.childRefs; src.childRefs = null; // childLocks = src.childLocks; src.childLocks = null; final int nkeys = data.getKeyCount(); for (int i = 0; i <= nkeys; i++) { final AbstractNode child = childRefs[i] == null ? null : childRefs[i].get(); /* * Note: Both child.identity and triggeredByChildId will always be * 0L for a transient B+Tree since we never assign persistent * identity to the nodes and leaves. Therefore [child.identity != * triggeredByChildId] will fail for ALL children, including the * trigger, and therefore fail to set the parent on any of them. The * [btree.store==null] test handles this condition and always steals * the child, setting its parent to this new node. * * FIXME It is clear that testing on child.identity is broken in * some other places for the transient store. */ if (child != null && (btree.store == null || child.identity != triggeredByChildId)) { /* * Copy on write should never trigger for a dirty node and only * a dirty node can have dirty children. */ assert !child.isDirty(); // Steal the child. 
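// Re-pointing the child's parent reference at this clone is all that is // required here: the childRefs slot itself was stolen from the source // node above (childRefs = src.childRefs).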
child.parent = this.self; // child.parent = btree.newRef(this); // // Keep a reference to the clean child. // childRefs[i] = new WeakReference<AbstractNode>(child); } } } @Override public void delete() { super.delete(); // clear state. childRefs = null; // childLocks = null; data = null; } /** * This method must be invoked on a parent to notify the parent that the * child has become persistent. The method scans the weak references for the * children, finds the index for the specified child, and then sets the * corresponding index in the array of child keys. The child is then removed * from the dirty list for this node. * * @param child * The child. * * @exception IllegalStateException * if the child is not persistent. * @exception IllegalArgumentException * if the child is not a child of this node. */ void setChildAddr(final AbstractNode<?> child) { if (!child.isPersistent()) { // The child does not have persistent identity. throw new IllegalStateException(); } final int i = getIndexOf(child); assert !isReadOnly(); ((MutableNodeData) data).childAddr[i] = child.getIdentity(); // if (!dirtyChildren.remove(child)) { // // throw new AssertionError("Child was not on dirty list."); // // } } /** * Invoked by {@link #copyOnWrite()} to clear the persistent address for a * child on a cloned parent and set the reference to the cloned child. * * @param oldChildAddr * The persistent address of the old child. The entries to be * updated are located based on this argument. It is an error if * this address is not found in the list of child addresses for * this {@link Node}. * @param newChild * The reference to the new child. */ void replaceChildRef(final long oldChildAddr, final AbstractNode newChild) { assert oldChildAddr != NULL || btree.store == null; assert newChild != null; // This node MUST have been cloned as a pre-condition, so it can not // be persistent. assert !isPersistent(); assert !isReadOnly(); // The newChild MUST have been cloned and therefore MUST NOT be // persistent. assert !newChild.isPersistent(); assert !isReadOnly(); final MutableNodeData data = (MutableNodeData) this.data; final int nkeys = getKeyCount(); // Scan for location in weak references. for (int i = 0; i <= nkeys; i++) { if (data.childAddr[i] == oldChildAddr) { /* * Note: We can not check anything which depends on * oldChild.data since that field was cleared when we cloned the * oldChild to obtain a mutable node/leaf. Since * oldChild.isPersistent() does not capture the correct * semantics for a transient B+Tree (we are interested if the * node was read-only), this entire section has been commented * out. */ // if (true) { // // /* // * Do some paranoia checks. // */ // // final AbstractNode oldChild = childRefs[i] != null ? // childRefs[i] // .get() : null; // // if (oldChild != null) { // // // assert oldChild.isPersistent(); // // // assert !dirtyChildren.contains(oldChild); // // } // // } // Clear the old key. data.childAddr[i] = NULL; // remove from cache and free the oldChildAddr if the Strategy // supports it if (btree.storeCache!=null) { // remove from cache. btree.storeCache.remove(oldChildAddr); } // free the oldChildAddr if the Strategy supports it btree.deleteNodeOrLeaf(oldChildAddr); // System.out.println("Deleting " + oldChildAddr); // Stash reference to the new child. // childRefs[i] = btree.newRef(newChild); childRefs[i] = newChild.self; // // Add the new child to the dirty list. // dirtyChildren.add(newChild); // Set the parent on the new child. 
// newChild.parent = btree.newRef(this); newChild.parent = this.self; return; } } // System.err.println("this: "); dump(Level.DEBUG,System.err); // System.err.println("newChild: "); // newChild.dump(Level.DEBUG,System.err); throw new IllegalArgumentException("Not our child : oldChildAddr=" + oldChildAddr); } @Override public Tuple insert(final byte[] key, final byte[] value, final boolean delete, final boolean putIfAbsent, final long timestamp, final Tuple tuple) { assert !deleted; if (btree.debug) assertInvariants(); btree.touch(this); final int childIndex = findChild(key); final AbstractNode<?> child = getChild(childIndex); return child.insert(key, value, delete, putIfAbsent, timestamp, tuple); } @Override public Tuple lookup(final byte[] key, final Tuple tuple) { assert !deleted; if (btree.debug) assertInvariants(); btree.touch(this); final int childIndex = findChild(key); final AbstractNode<?> child = getChild(childIndex); return child.lookup(key, tuple); } @Override public Tuple remove(final byte[] key, final Tuple tuple) { assert !deleted; if (btree.debug) assertInvariants(); btree.touch(this); final int childIndex = findChild(key); final AbstractNode<?> child = getChild(childIndex); return child.remove(key, tuple); } @Override public long indexOf(final byte[] key) { assert !deleted; btree.touch(this); final int childIndex = findChild(key); final AbstractNode<?> child = getChild(childIndex); /* * Compute running total to this child index plus [n], possible iff * successful search at the key level in which case we do not need to * pass n down. */ long offset = 0; for (int i = 0; i < childIndex; i++) { offset += getChildEntryCount(i); } // recursive invocation, eventually grounds out on a leaf. long ret = child.indexOf(key); if (ret < 0) { // obtain "insert position". ret = -ret - 1; // add in the offset. ret += offset; // convert back to the "not found" form. return (-(ret) - 1); } // add in the offset. ret += offset; // return the index position of the key relative to this node. return ret; } /** * Range check an index into the keys of the node. * * @param entryIndex * The key index. * * @return <code>true</code> * * @throws IndexOutOfBoundsException * if the index is LT ZERO (0) -or- GTE the * {@link #getSpannedTupleCount()} */ protected boolean rangeCheckSpannedTupleIndex(final long entryIndex) { final long nentries = data.getSpannedTupleCount(); if (entryIndex < 0) throw new IndexOutOfBoundsException("negative: " + entryIndex); if (entryIndex >= nentries) { throw new IndexOutOfBoundsException("too large: entryIndex=" + entryIndex + ", but nentries=" + nentries); } return true; } /** * Recursive search for the key at the specified entry index. * * @param entryIndex * The index of the entry (relative to the first entry spanned by * this node). * * @return The key at that entry index. */ @Override final public byte[] keyAt(final long entryIndex) { /* assert */rangeCheckSpannedTupleIndex(entryIndex); // index of the child that spans the desired entry. int childIndex = 0; // corrects the #of spanned entries by #skipped over. long remaining = entryIndex; final int nkeys = getKeyCount(); // search for child spanning the desired entry index. for (; childIndex <= nkeys; childIndex++) { final long nspanned = getChildEntryCount(childIndex); if (remaining < nspanned) { // found the child index spanning the desired entry. 
break; } remaining -= nspanned; assert remaining >= 0; } final AbstractNode<?> child = getChild(childIndex); return child.keyAt(remaining); } /** * Recursive search for the value at the specified entry index. * * @param entryIndex * The index of the entry (relative to the first entry spanned by * this node). * @param tuple * The tuple into which the value at that entry index is copied. */ @Override final public void valueAt(final long entryIndex, final Tuple tuple) { // Note: Made non-conditional since unit tests verify this. /* assert */rangeCheckSpannedTupleIndex(entryIndex); // index of the child that spans the desired entry. int childIndex = 0; // corrects the #of spanned entries by #skipped over. long remaining = entryIndex; final int nkeys = getKeyCount(); // search for child spanning the desired entry index. for (; childIndex <= nkeys; childIndex++) { final long nspanned = getChildEntryCount(childIndex); if (remaining < nspanned) { // found the child index spanning the desired entry. break; } remaining -= nspanned; assert remaining >= 0; } final AbstractNode<?> child = getChild(childIndex); child.valueAt(remaining, tuple); } /** * Return the index of the child to be searched. * <p> * The interpretation of the key index for a node is as follows. When * searching nodes of the tree, we search for the index in keys[] of the * first key value greater than or equal (GTE) to the probe key. If the * match is equal, then we choose the child at index + 1. Otherwise we * choose the child having the same index as the GTE key match. For example, * * <pre> * keys[] : [ 5 9 12 ] * child[] : [ a b c d ] * </pre> * * A probe with keys up to <code>4</code> matches at index zero (0) and we * choose the 1st child, a, which is at index zero (0). * <p> * A probe whose key is <code>5</code> matches at index zero (0) exactly and * we choose the child at <code>index + 1</code>, b, which is at index one * (1). * <p> * A probe with keys in [6:8] matches at index one (1) and we choose the 2nd * child, b, which is at index one (1). A probe with <code>9</code> also * matches at index one (1), but we choose <code>index+1</code> equals two * (2) since this is an exact key match. * <p> * A probe with keys in [10:11] matches at index two (2) and we choose the * 3rd child, c, which is at index two (2). A probe with <code>12</code> * also matches at index two (2), but we choose <code>index+1</code> equals * three (3) since this is an exact key match. * <p> * A probe with keys greater than 12 exceeds all keys in the node and always * matches the last child in that node. In this case, d, which is at index * three (3). * <p> * Note that we never stop a search on a node, even when there is an exact * match on a key. All values are stored in the leaves and we always descend * until we reach the leaf in which a value for the key would be stored. A * test on the keys of that leaf is then conclusive - either a value is * stored in the leaf for that key or it is not stored in the tree. * * @param searchKey * The probe key. * * @return The child to be searched next for that key. */ final protected int findChild(final byte[] searchKey) { int childIndex = this.getKeys().search(searchKey); if (childIndex >= 0) { /* * exact match - use the next child. */ return childIndex + 1; } else { /* * There is no exact match on the key, so we convert the search * result to find the insertion point. The insertion point is always * an index whose current key (iff it is defined) is greater than * the probe key. * * keys[] : [ 5 9 12 ] * * The insertion point for key == 4 is zero.
* * The insertion point for key == 6 is one. * * etc. * * When the probe key is greater than any existing key, then the * insertion point is nkeys. E.g., the insertion point for key == 20 * is 3. */ /* * Convert the return by search to obtain the index of the child * that covers this key (a non-negative integer). */ childIndex = -childIndex - 1; return childIndex; } } /** * <p> * Split an over-capacity node (a node with <code>maxKeys+1</code> keys), * creating a new rightSibling. The splitIndex is <code>(maxKeys+1)/2</code> * . The key at the splitIndex is the separatorKey. Unlike when we split a * {@link Leaf}, the separatorKey is lifted into the parent and does not * appear in either this node or the rightSibling after the split. All keys * and child references from <code>splitIndex+1</code> (inclusive) are moved * to the new rightSibling. The child reference at <code>splitIndex</code> * remains in this node. * </p> * <p> * If this node is the root of the tree (no parent), then a new root * {@link Node} is created without any keys and is made the parent of this * node. * </p> * <p> * In any case, we then insert( separatorKey, rightSibling ) into the parent * node, which may cause the parent node itself to split. * </p> * <p> * Note: splitting a node causes entry counts for the relocated children to * be reassigned to the new rightSibling but it does NOT change the #of * entries on the parent. * </p> */ @Override protected IAbstractNode split() { assert isDirty(); // MUST be mutable. assert getKeyCount() == maxKeys() + 1; // MUST be over capacity by one. // cast to mutable implementation class. final BTree btree = (BTree) this.btree; btree.getBtreeCounters().nodesSplit++; /* * The #of entries spanned by this node _before_ the split. */ final long nentriesBeforeSplit = getSpannedTupleCount(); /* * The #of child references. (this is +1 since the node is over capacity * by one). */ final int nchildren = btree.branchingFactor + 1; /* * Index at which to split the node. This is (maxKeys+1)/2, but that can * be simplified to branchingFactor/2 for a Node. */ // final int splitIndex = (maxKeys() + 1) / 2; final int splitIndex = btree.branchingFactor >>> 1; /* * The key at that index, which becomes the separator key in the parent. * * Note: Unlike a leaf, we are not free to choose the shortest separator * key in a node. This is because separator keys describe separations * among the leaves of the tree. This issue is covered by Bayer's * article on prefix trees. */ final byte[] separatorKey = getKeys().get(splitIndex); // Create the new rightSibling node. It will be mutable. final Node rightSibling = new Node(btree); // Tunnel through to the mutable objects. final MutableNodeData data = (MutableNodeData) this.data; final MutableNodeData sdata = (MutableNodeData) rightSibling.data; final MutableKeyBuffer keys = data.keys; final MutableKeyBuffer skeys = sdata.keys; if (DEBUG) { log.debug("this=" + this + ", nkeys=" + getKeyCount() + ", splitIndex=" + splitIndex + ", separatorKey=" + keyAsString(separatorKey)); // if(DEBUG) dump(Level.DEBUG,System.err); } /* * copy keys and values to the new rightSibling. */ int j = 0; // // #of spanned entries being moved to the new rightSibling. // int nentriesMoved = 0; for (int i = splitIndex + 1; i < nchildren; i++, j++) { if (i + 1 < nchildren) { /* * Note: keys[nchildren-1] is undefined.
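* (A node with nchildren children has only nchildren - 1 keys, so the * last pass of this loop moves a child reference without a * corresponding key.)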
*/ // rightSibling.setKey(j, getKey(i)); rightSibling.copyKey(j, this.getKeys(), i); } rightSibling.childRefs[j] = childRefs[i]; sdata.childAddr[j] = data.childAddr[i]; final long childEntryCount = data.childEntryCounts[i]; sdata.childEntryCounts[j] = childEntryCount; sdata.nentries += childEntryCount; data.nentries -= childEntryCount; // nentriesMoved += childEntryCounts[i]; final AbstractNode tmp = (childRefs[i] == null ? null : childRefs[i].get()); if (tmp != null) { /* * The child node is in memory. * * Update its parent reference. */ // tmp.parent = btree.newRef(rightSibling); tmp.parent = rightSibling.self; } /* * Clear out the old keys and values, including keys[splitIndex] * which is being moved to the parent. */ if (i + 1 < nchildren) { keys.keys[i] = null; /* nkeys--; */keys.nkeys--; // one less key here. /* rightSibling.nkeys++; */skeys.nkeys++; // one more key // there. } childRefs[i] = null; data.childAddr[i] = NULL; data.childEntryCounts[i] = 0; } /* * Clear the key that is being moved into the parent. */ keys.keys[splitIndex] = null; /* nkeys--; */keys.nkeys--; Node p = getParent(); if (p == null) { /* * Use a special constructor to split the root. The result is a new * node with zero keys and one child (this node). */ p = new Node(btree, this, nentriesBeforeSplit); } else { assert !p.isReadOnly(); // this node now has fewer entries ((MutableNodeData) p.data).childEntryCounts[p.getIndexOf(this)] -= sdata.nentries; } /* * insert(separatorKey,rightSibling) into the parent node. This may * cause the parent node itself to split. */ p.insertChild(separatorKey, rightSibling); btree.nnodes++; // Return the high node. return rightSibling; } /** * Redistributes a key from the specified sibling into this node in order to * bring this node up to the minimum #of keys. This also updates a separator * key in the parent for the rightmost of (this, sibling). * * When a key is redistributed from a sibling, the key in the sibling is * rotated into the parent where it replaces the current separatorKey and * that separatorKey is brought down into this node. The child corresponding * to the key is simply moved from the sibling into this node (rather than * rotating it through the parent). * <p> * While redistribution changes the #of entries spanned by the node and the * sibling and therefore must update {@link #childEntryCounts} on the shared * parent, it does not change the #of entries spanned by the parent. * * @param sibling * A direct sibling of this node (either the left or right * sibling). The sibling MUST be mutable. * * @todo change to redistribute keys until the node and the sibling have an * equal number of keys. if the other sibling exists and is also * materialized then redistribute keys with it as well? This is the * B*-Tree variation - it has strengths and weaknesses. I do not think * that it will be a big win here since the expected scenario is heavy * writes, good retention of nodes, and de-serialization of nodes from * a fully buffered journal (hence, no IOs even when we need to * materialize a sibling). */ @Override protected void redistributeKeys(final AbstractNode sibling, final boolean isRightSibling) { // the sibling of a Node must be a Node. final Node s = (Node) sibling; assert s != null; final int nkeys = getKeyCount(); final int snkeys = s.getKeyCount(); assert dirty; assert !deleted; assert !isPersistent(); // verify that this node is deficient. assert nkeys < minKeys(); // verify that this node is under minimum capacity by one key.
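// (Keys leave a node one at a time, via removeChild(), so the deficit on // entry can never exceed one key.)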
assert nkeys == minKeys() - 1; // the sibling MUST be _OVER_ the minimum #of keys/values. assert snkeys > minKeys(); assert s.dirty; assert !s.deleted; assert !s.isPersistent(); final Node p = getParent(); // children of the same node. assert s.getParent() == p; if (DEBUG) { log.debug("this=" + this + ", sibling=" + sibling + ", rightSibling=" + isRightSibling); // if(DEBUG) { // System.err.println("this"); dump(Level.DEBUG,System.err); // System.err.println("sibling"); // sibling.dump(Level.DEBUG,System.err); // System.err.println("parent"); p.dump(Level.DEBUG,System.err); // } } /* * The index of this node in its parent. We note this before we start * mucking with the keys. */ final int index = p.getIndexOf(this); // Tunnel through to the mutable keys object. final MutableKeyBuffer keys = (MutableKeyBuffer) this.getKeys(); final MutableKeyBuffer skeys = (MutableKeyBuffer) s.getKeys(); // Tunnel through to the mutable data records. final MutableNodeData data = (MutableNodeData) this.data; final MutableNodeData sdata = (MutableNodeData) s.data; final MutableNodeData pdata = (MutableNodeData) p.data; /* * determine which node is earlier in the key ordering and get the index * of the sibling. */ if (isRightSibling/* keys[nkeys-1] < s.keys[0] */) { /* * redistributeKeys(this,rightSibling). all we have to do is replace * the separatorKey in the parent with the first key from the * rightSibling and copy the old separatorKey from the parent to the * end of the keys in this node. we then close up the hole that this * left at index 0 in the rightSibling. */ // Copy the first key/child from the rightSibling. // setKey(nkeys, p.getKey(index)); // copy the separatorKey from the // parent. copyKey(nkeys, p.getKeys(), index); // copy the separatorKey from // the parent. // p.setKey(index, s.getKey(0)); // update the separatorKey from the // rightSibling. p.copyKey(index, s.getKeys(), 0); // update the separatorKey from // the rightSibling. childRefs[nkeys + 1] = s.childRefs[0]; // copy the child from the // rightSibling. data.childAddr[nkeys + 1] = sdata.childAddr[0]; final long siblingChildCount = sdata.childEntryCounts[0]; // #of // spanned // entries // being // moved. data.childEntryCounts[nkeys + 1] = siblingChildCount; final AbstractNode child = childRefs[nkeys + 1] == null ? null : childRefs[nkeys + 1].get(); if (child != null) { child.parent = this.self; // child.parent = btree.newRef(this); // if( child.isDirty() ) { // if(!s.dirtyChildren.remove(child)) throw new // AssertionError(); // if(!dirtyChildren.add(child)) throw new AssertionError(); // } } // copy down the keys on the right sibling to cover up the hole. System.arraycopy(skeys.keys, 1, skeys.keys, 0, snkeys - 1); System.arraycopy(s.childRefs, 1, s.childRefs, 0, snkeys); System.arraycopy(sdata.childAddr, 1, sdata.childAddr, 0, snkeys); System.arraycopy(sdata.childEntryCounts, 1, sdata.childEntryCounts, 0, snkeys); // erase exposed key/value on rightSibling that is no longer // defined. skeys.keys[snkeys - 1] = null; s.childRefs[snkeys] = null; sdata.childAddr[snkeys] = NULL; sdata.childEntryCounts[snkeys] = 0; // update parent : N more entries spanned by this child. pdata.childEntryCounts[index] += siblingChildCount; // update parent : N fewer entries spanned by our right sibling. pdata.childEntryCounts[index + 1] -= siblingChildCount; // update #of entries spanned by this node. data.nentries += siblingChildCount; // update #of entries spanned by our rightSibling.
sdata.nentries -= siblingChildCount; /* s.nkeys--; */skeys.nkeys--; /* this.nkeys++; */keys.nkeys++; if (btree.debug) { assertInvariants(); s.assertInvariants(); } } else { /* * redistributeKeys(leftSibling,this). all we have to do is copy * down the keys in this node by one position, copy the separatorKey * from the parent into the hole that we just opened up, copy the * child data across, and update the separatorKey in the parent with * the last key from the leftSibling. */ // copy down by one. System.arraycopy(keys.keys, 0, keys.keys, 1, nkeys); System.arraycopy(childRefs, 0, childRefs, 1, nkeys + 1); System.arraycopy(data.childAddr, 0, data.childAddr, 1, nkeys + 1); System.arraycopy(data.childEntryCounts, 0, data.childEntryCounts, 1, nkeys + 1); // move the last key/child from the leftSibling to this node. // setKey(0, p.getKey(index-1)); // copy the separatorKey from the // parent. copyKey(0, p.getKeys(), index - 1); // copy the separatorKey from // the parent. // p.setKey(index-1, s.getKey(s.nkeys-1)); // update the // separatorKey p.copyKey(index - 1, s.getKeys(), snkeys - 1); // update the // separatorKey childRefs[0] = s.childRefs[snkeys]; data.childAddr[0] = sdata.childAddr[snkeys]; final long siblingChildCount = sdata.childEntryCounts[snkeys]; data.childEntryCounts[0] = siblingChildCount; final AbstractNode child = childRefs[0] == null ? null : childRefs[0].get(); if (child != null) { child.parent = this.self; // child.parent = btree.newRef(this); // if(child.isDirty()) { // if(!s.dirtyChildren.remove(child)) throw new // AssertionError(); // if(!dirtyChildren.add(child)) throw new AssertionError(); // } } skeys.keys[snkeys - 1] = null; s.childRefs[snkeys] = null; sdata.childAddr[snkeys] = NULL; sdata.childEntryCounts[snkeys] = 0; /* s.nkeys--; */skeys.nkeys--; /* this.nkeys++; */keys.nkeys++; // update parent : N more entries spanned by this child. pdata.childEntryCounts[index] += siblingChildCount; // update parent : N fewer entries spanned by our leftSibling. pdata.childEntryCounts[index - 1] -= siblingChildCount; // update #of entries spanned by this node. data.nentries += siblingChildCount; // update #of entries spanned by our leftSibling. sdata.nentries -= siblingChildCount; if (btree.debug) { assertInvariants(); s.assertInvariants(); } } } /** * Merge the keys and values from the sibling into this node, delete the * sibling from the store and remove the sibling from the parent. This will * trigger recursive {@link AbstractNode#join()} if the parent node is now * deficient. While this changes the #of entries spanned by the current node * it does NOT affect the #of entries spanned by the parent. * * @param sibling * A direct sibling of this node (does NOT need to be mutable). * The sibling MUST have exactly the minimum #of keys. */ @Override protected void merge(final AbstractNode sibling, final boolean isRightSibling) { // The sibling of a Node must be a Node. final Node s = (Node) sibling; assert s != null; assert !s.deleted; // Note: local var is updated within this method! int nkeys = getKeyCount(); final int snkeys = s.getKeyCount(); // verify that this node is deficient. assert nkeys < minKeys(); // verify that this node is under minimum capacity by one key. assert nkeys == minKeys() - 1; // the sibling MUST be at the minimum #of keys/values. assert snkeys == s.minKeys(); final Node p = getParent(); // children of the same node.
assert s.getParent() == p; if (DEBUG) { log.debug("this=" + this + ", sibling=" + sibling + ", rightSibling=" + isRightSibling); // if(DEBUG) { // System.err.println("this"); dump(Level.DEBUG,System.err); // System.err.println("sibling"); // sibling.dump(Level.DEBUG,System.err); // System.err.println("parent"); p.dump(Level.DEBUG,System.err); // } } final long siblingEntryCount = s.getSpannedTupleCount(); /* * The index of this node in its parent. We note this before we start * mucking with the keys. */ final int index = p.getIndexOf(this); /* * Tunnel through to the mutable data records. * * Note: We do not require the sibling to be mutable. If it is not, then * we create a mutable copy of the sibling for use during this method. */ final MutableNodeData data = (MutableNodeData) this.data; final MutableNodeData sdata = s.isReadOnly() ? new MutableNodeData( getBranchingFactor(), s.data) : (MutableNodeData) s.data; final MutableNodeData pdata = (MutableNodeData) p.data; // Tunnel through to the mutable keys objects. final MutableKeyBuffer keys = data.keys; final MutableKeyBuffer skeys = sdata.keys; // /* // * Tunnel through to the mutable keys object. // * // * Note: since we do not require the sibling to be mutable we have to // * test and convert the key buffer for the sibling to a mutable key // * buffer if the sibling is immutable. Also note that the sibling MUST // * have the minimum #of keys for a merge so we set the capacity of the // * mutable key buffer to that when we have to convert the siblings // keys // * into mutable form in order to perform the merge operation. // */ // final MutableKeyBuffer keys = (MutableKeyBuffer) this.getKeys(); // final MutableKeyBuffer skeys = (s.getKeys() instanceof // MutableKeyBuffer ? (MutableKeyBuffer) s.getKeys() // : new MutableKeyBuffer(getBranchingFactor(), s.getKeys())); /* * determine which node is earlier in the key ordering so that we know * whether the sibling's keys will be inserted at the front of this * node's keys or appended to this node's keys. */ if (isRightSibling/* keys[nkeys-1] < s.keys[0] */) { /* * merge( this, rightSibling ). the keys and values from this node * will appear in their current position, the separatorKey from the * parent is appended after the last key in this node, and the keys * and children from the rightSibling are then appended as well. */ /* * Get the separator key in the parent and append it to the keys in * this node. */ // this.setKey(nkeys++, p.getKey(index)); this.copyKey(nkeys, p.getKeys(), index); nkeys++; // update local var! keys.nkeys++; /* * Copy in the keys and children from the sibling. Note that the * children are copied to the position nkeys NOT nkeys+1 since the * first child needs to appear at the same position as the * separatorKey that we copied from the parent. */ System.arraycopy(skeys.keys, 0, keys.keys, nkeys, snkeys); System.arraycopy(s.childRefs, 0, this.childRefs, nkeys, snkeys + 1); System.arraycopy(sdata.childAddr, 0, data.childAddr, nkeys, snkeys + 1); System.arraycopy(sdata.childEntryCounts, 0, data.childEntryCounts, nkeys, snkeys + 1); // update parent on children final Reference<Node> ref = (Reference<Node>) this.self; // final Reference<Node> weakRef = btree.newRef(this); for (int i = 0; i < snkeys + 1; i++) { final AbstractNode child = s.childRefs[i] == null ? null : s.childRefs[i].get(); if (child != null) { child.parent = ref; // if( child.isDirty() ) { // // record hard references for dirty children.
// dirtyChildren.add(child); // } } } /* * Adjust the #of keys in this node. */ /* this.nkeys += s.nkeys; */keys.nkeys += snkeys; /* * Note: in this case we have to replace the separator key for this * node with the separator key for its right sibling. * * Note: This temporarily causes the duplication of a separator key * in the parent. However, the separator key for the right sibling * will be deleted when the sibling is removed from the parent * below. */ // p.setKey(index, p.getKey(index+1)); p.copyKey(index, p.getKeys(), index + 1); // reallocate spanned entries from the sibling to this node. pdata.childEntryCounts[index] += siblingEntryCount; data.nentries += siblingEntryCount; if (btree.debug) assertInvariants(); } else { /* * merge( leftSibling, this ). the keys and values from this node * will be moved down by sibling.nkeys+1 positions, the keys and * values from the sibling will be copied into this node starting at * index zero(0), and finally the separatorKey from the parent will * be copied into the position after the last sibling key and before * the position of the first key copied down in this node to avoid * overwrite (that is, we copy the separatorKey from the parent to * this.keys[s.nkeys]). * * Note: we do not update the separator key in the parent because * the separatorKey will be removed when we remove the leftSibling * from the parent at the end of this method. */ // move keys and children down by sibling.nkeys+1 positions. System.arraycopy(keys.keys, 0, keys.keys, snkeys + 1, nkeys); System.arraycopy(this.childRefs, 0, this.childRefs, snkeys + 1, nkeys + 1); System.arraycopy(data.childAddr, 0, data.childAddr, snkeys + 1, nkeys + 1); System.arraycopy(data.childEntryCounts, 0, data.childEntryCounts, snkeys + 1, nkeys + 1); // copy keys and values from the sibling to index 0 of this node. System.arraycopy(skeys.keys, 0, keys.keys, 0, snkeys); System.arraycopy(s.childRefs, 0, this.childRefs, 0, snkeys + 1); System.arraycopy(sdata.childAddr, 0, data.childAddr, 0, snkeys + 1); System.arraycopy(sdata.childEntryCounts, 0, data.childEntryCounts, 0, snkeys + 1); // copy the separatorKey from the parent. // this.setKey(s.nkeys, p.getKey(index - 1)); this.copyKey(snkeys, p.getKeys(), index - 1); // update parent on children. final Reference<Node> ref = (Reference<Node>) this.self; // final Reference<Node> weakRef = btree.newRef(this); for (int i = 0; i < snkeys + 1; i++) { final AbstractNode child = s.childRefs[i] == null ? null : s.childRefs[i].get(); if (child != null) { child.parent = ref; // if( child.isDirty() ) { // // record hard references for dirty children. // dirtyChildren.add(child); // } } } // we gain nkeys from the sibling and one key from the parent. /* this.nkeys += s.nkeys + 1; */keys.nkeys += snkeys + 1; // reallocate spanned entries from the sibling to this node. pdata.childEntryCounts[index] += s.getSpannedTupleCount(); data.nentries += siblingEntryCount; if (btree.debug) assertInvariants(); } /* * The sibling is now empty. We need to detach the sibling from its * parent node and then delete the sibling from the store. */ p.removeChild(s); } /** * Invoked by {@link AbstractNode#split()} to insert a key and reference for * a child created when another child of this node is split. This method has * no effect on the #of entries spanned by the parent. * * <p> * Note: This operation is invoked only when a node or leaf is split.
As * such, it can not cause the min/max tuple revision timestamp on this * {@link Node} to change since no tuples have been added or removed. * However, this method does need to record the min/max for the new * rightSibling. * * @param key * The key on which the old node was split. * @param child * The new node. * * FIXME set min/max for the new child. */ protected void insertChild(final byte[] key, final AbstractNode child) { if (btree.debug) assertInvariants(); // assert key > IIndex.NEGINF && key < IIndex.POSINF; assert child != null; assert child.isDirty() : "child not dirty"; // always dirty since it was // just created. assert isDirty() : "not dirty"; // must be dirty to permit mutation. /* * Find the location where this key belongs. When a new node is created, * the constructor sticks the child that demanded the split into * childRef[0]. So, when nkeys == 1, we have nchildren==1 and the key * goes into keys[0] but we have to copyDown by one anyway to avoid * stepping on the existing child. */ int childIndex = this.getKeys().search(key); if (childIndex >= 0) { /* * The key is already present. This is an error. */ // btree.dump(Level.DEBUG,System.err); throw new AssertionError("Split on existing key: childIndex=" + childIndex + ", key=" + keyAsString(key) + "\nthis=" + this + "\nchild=" + child); } final int nkeys = getKeyCount(); // Convert the position to obtain the insertion point. childIndex = -childIndex - 1; assert childIndex >= 0 && childIndex <= nkeys; /* * copy down per-key data. */ assert !isReadOnly(); final MutableNodeData data = (MutableNodeData) this.data; final MutableKeyBuffer keys = (MutableKeyBuffer) this.getKeys(); final int length = nkeys - childIndex; if (length > 0) { System.arraycopy(keys.keys, childIndex, keys.keys, (childIndex + 1), length); } /* * copy down per-child data. #children == nkeys+1. child[0] is always * defined. */ System.arraycopy(childRefs, childIndex + 1, childRefs, childIndex + 2, length); System.arraycopy(data.childAddr, childIndex + 1, data.childAddr, childIndex + 2, length); System.arraycopy(data.childEntryCounts, childIndex + 1, data.childEntryCounts, childIndex + 2, length); /* * Insert key at index. */ // setKey(childIndex, key); keys.keys[childIndex] = key; // System.arraycopy(key, 0, keys.keys, childIndex, 1); /* * Insert child at index+1. */ childRefs[childIndex + 1] = child.self; // childRefs[childIndex + 1] = btree.newRef(child); data.childAddr[childIndex + 1] = NULL; final long childEntryCount = // child.getSpannedTupleCount(); (child.isLeaf() ? ((Leaf) child).getKeyCount() : ((Node) child) .getSpannedTupleCount()); data.childEntryCounts[childIndex + 1] = childEntryCount; // if( parent != null ) { // // parent.get().updateEntryCount(this, childEntryCount); // // } // nentries += childEntryCount; // dirtyChildren.add(child); child.parent = this.self; // child.parent = btree.newRef(this); /* nkeys++; */keys.nkeys++; // Note: this tests the post-condition of the split. if (keys.nkeys == maxKeys() + 1) { /* * The node is over capacity so we split the node, creating a new * rightSibling and insert( separatorKey, rightSibling ) into the * parent. */ final Node rightSibling = (Node) split(); // assert additional post-split invariants. if (btree.debug) { getParent().assertInvariants(); rightSibling.assertInvariants(); } return; } if (btree.debug) assertInvariants(); } /** * Return the left sibling. This is used by implementations of * {@link AbstractNode#join()} to explore their left sibling. * * @param child * The child (must be dirty). 
* @param materialize * When true, the left sibling will be materialized if it exists * but is not resident. * * @return The left sibling or <code>null</code> if it does not exist -or- * if it is not materialized and <code>materialize == false</code>. * If the sibling is returned, then it is NOT guaranteed to be mutable * and the caller MUST invoke copy-on-write before attempting to * modify the returned sibling. */ protected AbstractNode getLeftSibling(final AbstractNode child, final boolean materialize) { final int i = getIndexOf(child); if (i == 0) { /* * There is no left sibling for this child that is a child of the * same parent. */ return null; } else { final int index = i - 1; AbstractNode sibling = childRefs[index] == null ? null : childRefs[index].get(); if (sibling == null) { if (materialize) { sibling = getChild(index); } } else { btree.touch(sibling); } return sibling; } } /** * Return the right sibling of the specified child of a common parent. This * method is invoked on the parent, passing in one child and returning its * right sibling under that common parent (if any). This is used by * implementations of {@link AbstractNode#join()} to explore their right * sibling. * * @param child * The child (must be dirty). * @param materialize * When true, the right sibling will be materialized if it exists * but is not resident. * * @return The right sibling or <code>null</code> if it does not exist -or- * if it is not materialized and <code>materialize == false</code>. * If the sibling is returned, then it is NOT guaranteed to be mutable * and the caller MUST invoke copy-on-write before attempting to * modify the returned sibling. */ protected AbstractNode getRightSibling(final AbstractNode child, final boolean materialize) { final int i = getIndexOf(child); if (i == getKeyCount()) { /* * There is no right sibling for this child that is a child of the * same parent. */ return null; } else { final int index = i + 1; AbstractNode sibling = childRefs[index] == null ? null : childRefs[index].get(); if (sibling == null) { if (materialize) { sibling = getChild(index); } } else { btree.touch(sibling); } return sibling; } } /** * Return the index of the child among the direct children of this node. * * @param child * The child. * * @return The index in {@link #childRefs} where that child is found. This * may also be used as an index into {@link #childAddr} and * {@link #childEntryCounts}. * * @exception IllegalArgumentException * iff child is not a child of this node. */ protected int getIndexOf(final AbstractNode child) { assert child != null; assert child.parent.get() == this; /* * Scan for location in weak references. * * Note: during reads, this method is used for range counts. During * writes it is used to update the entry counts on which the range * counts are based. * * @todo Can this be made more efficient by considering the last key on * the child and searching the parent for the index that must correspond * to that child? Note that when merging two children the keys in the * parent might not be coherent depending on exactly when this method is * called - for things to be coherent you would have to discover the * index of the children before modifying their keys. * * @todo for writes, 85% of the use of this method is * updateEntryCount(). Since that method is only called on update, we * would do well to buffer hard references during descent and test the * buffer in this method before performing a full search.
    /**
     * Return the index of the child among the direct children of this node.
     * 
     * @param child
     *            The child.
     * 
     * @return The index in {@link #childRefs} where that child is found.
     *         This may also be used as an index into {@link #childAddr} and
     *         {@link #childEntryCounts}.
     * 
     * @exception IllegalArgumentException
     *                iff child is not a child of this node.
     */
    protected int getIndexOf(final AbstractNode child) {

        assert child != null;
        assert child.parent.get() == this;

        /*
         * Scan for location in weak references.
         * 
         * Note: during reads, this method is used for range counts. During
         * writes it is used to update the entry counts on which the range
         * counts are based.
         * 
         * @todo Can this be made more efficient by considering the last key
         * on the child and searching the parent for the index that must
         * correspond to that child? Note that when merging two children the
         * keys in the parent might not be coherent depending on exactly when
         * this method is called - for things to be coherent you would have
         * to discover the index of the children before modifying their keys.
         * 
         * @todo for writes, 85% of the use of this method is
         * updateEntryCount(). Since that method is only called on update, we
         * would do well to buffer hard references during descent and test
         * the buffer in this method before performing a full search. Since
         * concurrent writers are not allowed, we only need a single buffer
         * whose height is the height of the tree. This should prove
         * especially beneficial for larger branching factors. For smaller
         * branching factors the cost might be so small as to be ignorable.
         * 
         * @see Leaf#merge(Leaf sibling,boolean isRightSibling)
         */
        final int nkeys = getKeyCount();

        for (int i = 0; i <= nkeys; i++) {

            if (childRefs[i] != null && childRefs[i].get() == child) {

                return i;

            }

        }

        throw new IllegalArgumentException("Not our child : child=" + child);

    }

    /**
     * Invoked when a non-root node or leaf has no more keys to detach the
     * child from its parent. If the node becomes deficient, then the node is
     * joined with one of its immediate siblings. If the node is the root of
     * the tree, then the root of the tree is also updated. The child is
     * deleted as a post-condition.
     * 
     * @param child
     *            The child (does NOT need to be mutable).
     * 
     * @todo I am a bit suspicious of this method. It appears to be removing
     *       the key and child at the same index rather than the key at
     *       index-1 and the child at index. This interacts with how the
     *       separator key gets updated (or appears to get updated) when a
     *       child is removed. That logic occurs in
     *       {@link Leaf#merge(AbstractNode, boolean)} and in
     *       {@link Node#merge(AbstractNode, boolean)}. It may be that I can
     *       simplify things a bit further by making this adjustment here and
     *       in those merge() methods.
     * 
     *       FIXME This should clear the min/max for the child in this node's
     *       data record. This is invoked by merge() on a leaf, then
     *       recursively if we need to join nodes. The caller should have
     *       already updated the min/max for the leaf's own data record and
     *       on this node's data record for that leaf. This method only needs
     *       to clear the min/max entry associated with the child that is
     *       being removed.
     */
    protected void removeChild(final AbstractNode child) {

        assert child != null;
        assert !child.deleted;
        assert child.parent.get() == this;

        assert dirty;
        assert !deleted;
        assert !isPersistent();
        assert !isReadOnly();

        // cast to mutable implementation class.
        final BTree btree = (BTree) this.btree;

        if (btree.debug)
            assertInvariants();

        if (DEBUG) {

            log.debug("this=" + this + ", child=" + child);

            /*
             * Note: dumping [this] or the [child] will throw false
             * exceptions at this point - they are in an intermediate state.
             */
            // if(DEBUG) {
            // System.err.println("this"); dump(Level.DEBUG,System.err);
            // System.err.println("child"); child.dump(Level.DEBUG,System.err);
            // }

        }

        final int i = getIndexOf(child);

        /*
         * Note: these comments may be dated and need review.
         * 
         * Copy over the hole created when the child is removed from the
         * node.
         * 
         * Given:
         *                 v-- remove @ index = 0
         *           index: 0  1  2
         * root  keys : [ 21 24  0 ]
         *                                   index
         * leaf1 keys : [  1  2  7  - ]          0  <-- remove @ index = 0
         * leaf2 keys : [ 21 22 23  - ]          1
         * leaf4 keys : [ 24 31  -  - ]          2
         * 
         * This can also be represented as
         * 
         * ( leaf1, 21, leaf2, 24, leaf4 )
         * 
         * and we remove the sequence ( leaf1, 21 ) leaving a well-formed
         * node.
         * 
         * Remove(leaf1):
         * index := 0
         * nkeys = 2
         * nchildren := nkeys(2) + 1 = 3
         * lenChildCopy := #children(3) - index(0) - 1 = 2
         * lenKeyCopy := lengthChildCopy - 1 = 1
         * copyChildren from index+1(1) to index(0) lengthChildCopy(2)
         * copyKeys from index+1(1) to index(0) lengthKeyCopy(1)
         * erase keys[ nkeys - 1 = 1 ]
         * erase children[ nkeys = 2 ]
         * 
         * post-condition:
         *           index: 0  1  2
         * root  keys : [ 24  0  0 ]
         *                                   index
         * leaf2 keys : [ 21 22 23  - ]          0
         * leaf4 keys : [ 24 31  -  - ]          1
         */

        {

            /*
             * Copy down to cover up the hole.
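             * 
             * (Added note, hedged: the child reference, the persistent child
             * address, and the per-child entry count are parallel arrays and
             * are copied down in lock step below so that they stay aligned
             * with the keys.)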
             */
            final int index = i;

            final int nkeys = getKeyCount();

            // #of children to copy (equivalent to nchildren - index - 1).
            final int lengthChildCopy = nkeys - index;

            // #of keys to copy.
            final int lengthKeyCopy = lengthChildCopy - 1;

            // Tunnel through to the mutable keys object.
            final MutableKeyBuffer keys = (MutableKeyBuffer) this.getKeys();

            final MutableNodeData data = (MutableNodeData) this.data;

            // Check for persistent storage to be recycled for the removed
            // child.
            if (data.childAddr[index] != 0) {

                btree.recycle(data.childAddr[index]);

            }

            if (lengthKeyCopy > 0) {

                System.arraycopy(keys.keys, index + 1, keys.keys, index,
                        lengthKeyCopy);

            }

            if (lengthChildCopy > 0) {

                System.arraycopy(childRefs, index + 1, childRefs, index,
                        lengthChildCopy);
                System.arraycopy(data.childAddr, index + 1, data.childAddr,
                        index, lengthChildCopy);
                System.arraycopy(data.childEntryCounts, index + 1,
                        data.childEntryCounts, index, lengthChildCopy);

            }

            /*
             * Erase the data that were exposed by this operation. Note that
             * there is one fewer key than there are children, so the last
             * key is erased at [nkeys - 1] while the last child is erased at
             * [nkeys].
             */
            if (nkeys > 0) {

                // erase the last key position.
                keys.keys[nkeys - 1] = null;

            }

            // erase the last child position.
            childRefs[nkeys] = null;
            data.childAddr[nkeys] = NULL;
            data.childEntryCounts[nkeys] = 0;

            // Clear the parent on the old child.
            child.parent = null;

            // One fewer key in this node.
            /* nkeys--; */keys.nkeys--;

            if (child.isLeaf()) {

                btree.nleaves--;

            } else {

                btree.nnodes--;

            }

            // Deallocate the child.
            child.delete();

        }

        if (btree.root == this) {

            /*
             * The root node is allowed to become deficient, but once we are
             * reduced to having no more keys in the root node it is replaced
             * by the last remaining child.
             */
            if (getKeyCount() == 0 && !isLeaf()) {

                final AbstractNode<?> lastChild = getChild(0);

                if (btree.debug)
                    lastChild.assertInvariants();

                if (DEBUG) {

                    log.debug("replacing root: root=" + btree.root
                            + ", node=" + this + ", lastChild=" + lastChild);

                    // System.err.println("root");
                    // btree.root.dump(Level.DEBUG,System.err);
                    // System.err.println("this");
                    // this.dump(Level.DEBUG,System.err);
                    // System.err.println("lastChild");
                    // lastChild.dump(Level.DEBUG,System.err);

                }

                final boolean wasDirty = btree.root.dirty;

                assert lastChild != null;

                // Replace the root node with a root leaf.
                btree.root = lastChild;

                if (!wasDirty) {

                    btree.fireDirtyEvent();

                }

                // Clear the parent reference since this is now the root.
                lastChild.parent = null;

                // One less level in the btree.
                btree.height--;

                // Deallocate this node.
                this.delete();

                // One less node in the tree.
                btree.nnodes--;

                if (BTree.INFO) {

                    BTree.log.info("reduced tree height: height="
                            + btree.height + ", newRoot=" + btree.root);

                }

                btree.getBtreeCounters().rootsJoined++;

            }

        } else {

            /*
             * If a non-root node becomes deficient then it is joined with a
             * direct sibling. If this forces a merge with a sibling, then
             * the merged sibling will be removed from the parent which may
             * force the parent to become deficient in turn, and thereby
             * trigger a join() of the parent.
             */
            if (data.getKeyCount() < minKeys()) {

                join();

            }

        }

    }
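    /*
     * A worked instance of the copy-down arithmetic in removeChild() above
     * (hedged: the concrete numbers are hypothetical). With nkeys = 3 and a
     * removal at index = 1:
     * 
     *   lengthChildCopy = nkeys - index = 2
     *   lengthKeyCopy   = lengthChildCopy - 1 = 1
     * 
     * so the copies are
     * 
     *   System.arraycopy(keys.keys, 2, keys.keys, 1, 1);
     *   System.arraycopy(childRefs, 2, childRefs, 1, 2);
     *   // likewise for childAddr[] and childEntryCounts[]
     * 
     * after which keys[2] is nulled and childRefs[3], childAddr[3], and
     * childEntryCounts[3] are cleared.
     */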
    // /**
    // * This is invoked by {@link #removeChild(AbstractNode)} when the node
    // * is reduced to a single child in order to replace the reference to
    // * the node on its parent with the reference to the node's sole
    // * remaining child.
    // *
    // * @param oldChild
    // *            The node.
    // * @param newChild
    // *            The node's sole remaining child. This MAY be persistent
    // *            since this operation does NOT change the persistent state
    // *            of the newChild but only updates its transient state
    // *            (e.g., its parent reference).
    // */
    // protected void replaceChild(AbstractNode oldChild, AbstractNode newChild)
    // {
    //
    // assert oldChild != null;
    // assert !oldChild.isDeleted();
    // assert !oldChild.isPersistent();
    // assert oldChild.parent.get() == this;
    // assert oldChild.nkeys == 0;
    // assertInvariants();
    // oldChild.assertInvariants();
    // newChild.assertInvariants();
    //
    // assert newChild != null;
    // assert !newChild.isDeleted();
    // // assert !newChild.isPersistent(); // MAY be persistent - does not
    // // matter.
    // assert newChild.parent.get() == oldChild;
    //
    // assert oldChild != newChild;
    //
    // assert !isDeleted();
    // assert !isPersistent();
    //
    // int i = getIndexOf( oldChild );
    //
    // // dirtyChildren.remove(oldChild);
    // //
    // // if (newChild.isDirty()) {
    // //
    // // dirtyChildren.add(newChild);
    // //
    // // }
    //
    // // set the persistent key for the new child.
    // childAddr[i] = (newChild.isPersistent() ? newChild.getIdentity() : NULL);
    //
    // // set the reference to the new child.
    // childRefs[i] = new WeakReference<AbstractNode>(newChild);
    //
    // // Reuse the weak reference from the oldChild.
    // newChild.parent = oldChild.parent;
    //
    // }

    /**
     * Return the child node or leaf at the specified index in this node. If
     * the node is not in memory then it is read from the store.
     * <p>
     * Note: This implementation DOES NOT cause concurrent threads to block
     * unless they are performing IO for the same child. A {@link Memoizer}
     * pattern is used to assign each concurrent thread a {@link FutureTask}
     * on which it waits for the result. Once the result is available, there
     * is a small <code>synchronized</code> block during which the concurrent
     * requests for a child will contend to update the appropriate element in
     * {@link #childRefs}.
     * <p>
     * I believe the contention to update {@link #childRefs} is unavoidable.
     * If this object was made into an {@link AtomicReferenceArray} then we
     * would have difficulty when inserting and removing tuples since the
     * backing array is not visible. An array of {@link AtomicReference}
     * objects would not help since it would not ensure "publication" when
     * the element was changed from a <code>null</code> to an
     * {@link AtomicReference}, only when
     * {@link AtomicReference#compareAndSet(Object, Object)} was used. Thus
     * it could only help if we pre-populated the array with
     * {@link AtomicReference} objects, which seems wasteful.
     * <p>
     * As always, the mutable B+Tree is single threaded so there are no added
     * synchronization costs. Concurrent readers can only arise for read-only
     * {@link BTree}s and for {@link IndexSegment}s.
     * 
     * @param index
     *            The index of the child to be read from the store (in
     *            [0:nkeys]).
     * 
     * @return The child node or leaf and never null.
     * 
     * @throws IndexOutOfBoundsException
     *             if index is negative.
     * @throws IndexOutOfBoundsException
     *             if index is GT the #of keys in the node (there is one more
     *             child than keys in a node).
     */
    final public AbstractNode getChild(final int index) {

        // See BLZG-1657 (Add BTreeCounters for cache hit and cache miss)
        btree.getBtreeCounters().cacheTests.increment();

        /*
         * I've taken out this test since it turns out to be relatively
         * expensive!?! The interrupt status of the thread is now checked
         * exclusively when reading on the store.
         */
        // if(Thread.interrupted()) {
        // /*
        // * This method is called relatively often - it is used each time we
        // * descend the tree. We check whether or not the thread has been
        // * interrupted so that we can abort running tasks quickly.
        // */
        // throw new RuntimeException(new InterruptedException());
        // }

        if (index < 0 || index > data.getKeyCount()) {

            throw new IndexOutOfBoundsException("index=" + index + ", nkeys="
                    + data.getKeyCount());

        }

        // if (!btree.isReadOnly()) {
        if (btree.memo == null) {

            /*
             * Optimization for the mutable B+Tree.
             * 
             * Note: This optimization depends on the assumption that
             * concurrent operations are never submitted to the mutable
             * B+Tree. In fact, the UnisolatedReadWriteIndex *DOES* allow
             * concurrent readers (it uses a ReentrantReadWriteLock).
             * Therefore this code path is now expressed conditionally on
             * whether or not the Memoizer object is initialized by
             * AbstractBTree.
             * 
             * Note: Since the caller is single-threaded for the mutable
             * B+Tree we do not need to use the Memoizer, which just
             * delegates to _getChild(index). This saves us some object
             * creation and overhead for this case.
             */

            // See BLZG-1657 (Add BTreeCounters for cache hit and cache miss).
            btree.getBtreeCounters().cacheMisses.increment();

            return _getChild(index, null/* req */);

        }

        /*
         * If we can resolve a hard reference to the child then we do not
         * need to look any further.
         */
        // synchronized (childRefs)
        {

            /*
             * Note: we need to synchronize here to ensure visibility for
             * childRefs[index] (in case it was updated in another thread).
             * This is true even for the mutable B+Tree since the caller
             * could use different threads for different operations. However,
             * this synchronization will never be contended for the mutable
             * B+Tree.
             */
            final Reference<AbstractNode<?>> childRef = childRefs[index];

            final AbstractNode child = childRef == null ? null : childRef
                    .get();

            if (child != null) {

                // Already materialized.
                return child;

            }

        }

        /*
         * Otherwise we need to go through the Memoizer pattern to achieve
         * non-blocking access. It will wind up delegating to _getChild(int),
         * which is immediately below. However, it will ensure that one and
         * only one thread executes _getChild(int) for a given parent and
         * child index. That thread will update childRefs[index]. Any
         * concurrent requests for the same child will wait for the
         * FutureTask inside of the Memoizer and then return the new value of
         * childRefs[index].
         */

        /*
         * See BLZG-1657 (Add BTreeCounters for cache hit and cache miss)
         * 
         * Note: This is done in the caller rather than _getChild() since the
         * latter may be called from the memoizer, in which case only one
         * thread will actually invoke _getChild() while the others will just
         * obtain the child through the memoized Future.
         */
        btree.getBtreeCounters().cacheMisses.increment();

        return btree.loadChild(this, index);

    }
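    /*
     * The shape of the non-blocking access pattern above, reduced to a
     * sketch (hedged: the names are schematic; the real flow is getChild()
     * -> AbstractBTree#loadChild() -> Memoizer -> _getChild()):
     * 
     *   child = childRefs[index];       // fast path: already resident
     *   if (child != null) return child;
     *   ft = memo.compute(request);     // one FutureTask per (node, index)
     *   child = ft.get();               // concurrent callers block here
     *   // exactly one thread runs _getChild() and publishes childRefs[index]
     */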
    /**
     * Method conditionally reads the child at the specified index from the
     * backing store and sets its reference on the appropriate element of
     * {@link #childRefs}. This method assumes that external mechanisms
     * guarantee that no other thread is requesting the same child via this
     * method at the same time. For the mutable B+Tree, that guarantee is
     * trivially given by its single-threaded constraint. For the read-only
     * B+Tree, {@link AbstractBTree#loadChild(Node, int)} provides this
     * guarantee using a {@link Memoizer} pattern. This method explicitly
     * handshakes with the {@link ChildMemoizer} to clear the
     * {@link FutureTask} from the memoizer's internal cache as soon as the
     * reference to the child has been set on the appropriate element of
     * {@link #childRefs}.
     * 
     * @param index
     *            The index of the child.
     * @param req
     *            The request object, which is the key under which the
     *            request is removed from the {@link ChildMemoizer} cache
     *            (and <code>null</code> iff this method is not invoked by
     *            the memoizer pattern).
     * 
     * @return The child and never <code>null</code>.
     */
    AbstractNode _getChild(final int index, final LoadChildRequest req) {

        /*
         * Check whether the child is already strongly reachable. It could
         * have been concurrently set even if the caller had tested this, and
         * we do not want to read through to the backing store unless we need
         * to.
         * 
         * Note: synchronizing on childRefs[] should not be necessary. For a
         * read-only B+Tree, the synchronization is provided by the Memoizer
         * pattern. For a mutable B+Tree, the synchronization is provided by
         * the single-threaded contract for mutation and by the requirement
         * to use a construct, such as a Queue or the
         * UnisolatedReadWriteIndex, which imposes a memory barrier when
         * passing a B+Tree instance between threads.
         * 
         * See http://www.cs.umd.edu/~pugh/java/memoryModel/archive/1096.html
         */
        AbstractNode child;
        synchronized (childRefs) {

            /*
             * Note: we need to synchronize here to ensure visibility for
             * childRefs[index] (in case it was updated in another thread).
             */
            final Reference<AbstractNode<?>> childRef = childRefs[index];

            child = childRef == null ? null : childRef.get();

            if (child != null) {

                // Already materialized.
                return child;

            }

        }

        /*
         * The child needs to be read from the backing store.
         */

        // See BLZG-1657 (Add BTreeCounters for cache hit and cache miss)
        btree.getBtreeCounters().cacheMisses.increment();

        final long addr = data.getChildAddr(index);

        if (addr == IRawStore.NULL) {

            // dump(Level.DEBUG, System.err);

            /*
             * Note: It appears that this can be triggered by a full disk,
             * but I am not quite certain how a full disk leads to this
             * condition. Presumably the full disk would cause a write of the
             * child to fail. In turn, that should cause the thread writing
             * on the B+Tree to fail. If group commit is being used, the
             * B+Tree should then be discarded and reloaded from its last
             * commit point.
             */
            throw new AssertionError(
                    "Child does not have persistent identity: this=" + this
                            + ", index=" + index);

        }

        /*
         * Read the child from the backing store (potentially reads through
         * to the disk).
         * 
         * Note: This is guaranteed to not do duplicate reads. There are two
         * cases. (A) The mutable B+Tree. Since the mutable B+Tree is single
         * threaded, this case is trivial. (B) The read-only B+Tree. Here our
         * guarantee is that the caller is in ft.run() inside of the
         * Memoizer, and that ensures that only one thread is executing for a
         * given LoadChildRequest object (the input to the Computable). Note
         * that LoadChildRequest MUST meet the criteria for a hash map for
         * this guarantee to obtain.
         */
        child = btree.readNodeOrLeaf(addr);

        /*
         * Update of the childRefs[index] element.
         * 
         * Note: This code block is synchronized in order to facilitate the
         * safe publication of the change in childRefs[index] to other
         * threads.
         */
        synchronized (childRefs) {

            /*
             * Since the childRefs[index] element has not been updated we do
             * so now while we are synchronized.
             * 
             * Note: This paranoia test could be tripped if the caller
             * allowed concurrent requests to enter this method for the same
             * child. In that case childRefs[index] could have an uncleared
             * reference to the child. This would indicate a breakdown in the
             * guarantee we require of the caller.
             */
            assert childRefs[index] == null
                    || childRefs[index].get() == null : "Child is already set: this="
                    + this + ", index=" + index;

            // patch parent reference since loaded from store.
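            // (Added note: the deserialized child carries no transient
            // parent/child linkage, so both directions are restored here
            // while still inside the synchronized block; see the safe
            // publication comment above.)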
            child.parent = this.self;

            // patch the child reference.
            childRefs[index] = child.self;

        }

        /*
         * Clear the future task from the memoizer cache.
         * 
         * Note: This is necessary in order to prevent the cache from
         * retaining a hard reference to each child materialized for the
         * B+Tree.
         * 
         * Note: This does not depend on any additional synchronization. The
         * Memoizer pattern guarantees that only one thread actually calls
         * ft.run() and hence runs this code.
         */
        if (req != null) {

            btree.memo.removeFromCache(req);

        }

        return child;

    }

    // /**
    // * Static helper method allocates the per-child lock objects.
    // * <p>
    // * Note that the mutable {@link BTree} imposes a single-threaded
    // * constraint on its API so we do not need to do any locking for that
    // * case and this method will therefore return <code>null</code> if the
    // * owning B+Tree is mutable.
    // *
    // * @param btree
    // *            The owning B+Tree.
    // *
    // * @param nkeys
    // *            The #of keys, which is used to dimension the array.
    // *
    // * @return The array of lock objects -or- <code>null</code> if the
    // *         btree is mutable.
    // *
    // * @see #childLocks
    // *
    // * @todo Per-child locks will only be useful on nodes with a relatively
    // *       high probability of concurrent access. Therefore they should
    // *       be conditionally enabled only to a depth of 0 (for the root's
    // *       direct children) or 1 (for the children of the root's direct
    // *       children). There is just not going to be any utility to this
    // *       beyond that point, especially not on an {@link IndexSegment}
    // *       with a relatively high branching factor. We could directly
    // *       compute the probability of access to any given child based on
    // *       the branching factor and the depth of the node in the B+Tree
    // *       and the assumption of a uniform distribution of reads by
    // *       concurrent threads [in fact, in many benchmark situations we
    // *       are more likely to contend for the same child unless the
    // *       queries are parameterized].
    // */
    // static private final Object[] newChildLocks(final AbstractBTree btree,
    // final int nkeys) {
    //
    // /*
    // * Note: Uncommenting this has the effect of disabling per-child
    // * locking.
    // */
    // // if(true) return null;
    //
    // if (!btree.isReadOnly() || !btree.getIndexMetadata().getChildLocks()) {
    //
    // /*
    // * Either the mutable B+Tree has a single threaded constraint so we
    // * do not need to do any locking for that case and therefore we do
    // * not allocate the per-child locks here -or- child locks were
    // * disabled as a configuration option.
    // */
    //
    // return null;
    //
    // }
    //
    // /*
    // * Note: The array is dimensioned to [branchingFactor] and not
    // * [branchingFactor+1]. The "overflow" slot of the various arrays are
    // * only used during node overflow/underflow operations. Those
    // * operations do not occur for a read-only B+Tree.
    // */
    // // final int n = btree.branchingFactor + 1;
    //
    // // Note: We only need locks for the child entries that exist!
    // final int n = nkeys + 1;
    //
    // final Object[] a = new Object[n];
    //
    // for (int i = 0; i < n; i++) {
    //
    // a[i] = new Object();
    //
    // }
    //
    // return a;
    //
    // }

    /**
     * Return the right-most child of this node.
     * 
     * @param nodesOnly
     *            when <code>true</code> the search will halt at the
     *            right-most non-leaf. Otherwise it will return the
     *            right-most leaf.
     * 
     * @return The right-most child of this node.
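     *         <p>
     *         A hedged usage sketch (illustrative only; <code>node</code> is
     *         hypothetical):
     * 
     *         <pre>
     *         // descends via getChild(getKeyCount()) at each level
     *         final AbstractNode rightMostLeaf = node.getRightMostChild(false);
     *         // halts at the right-most non-leaf instead
     *         final AbstractNode rightMostNode = node.getRightMostChild(true);
     *         </pre>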
     */
    protected AbstractNode getRightMostChild(final boolean nodesOnly) {

        final AbstractNode<?> child = getChild(getKeyCount());

        assert child != null;

        if (child.isLeaf()) {

            if (nodesOnly) {

                return this;

            } else {

                return child;

            }

        }

        return ((Node) child).getRightMostChild(nodesOnly);

    }

    /**
     * Iterator visits children, recursively expanding each child with a
     * post-order traversal of its children and finally visits this node
     * itself.
     */
    @Override
    @SuppressWarnings("unchecked")
    public Iterator<AbstractNode> postOrderNodeIterator(
            final boolean dirtyNodesOnly, final boolean nodesOnly) {

        if (dirtyNodesOnly && !dirty) {

            return EmptyIterator.DEFAULT;

        }

        /*
         * Append this node to the iterator in the post-order position.
         */
        return new Striterator(postOrderIterator1(dirtyNodesOnly, nodesOnly))
                .append(new SingleValueIterator(this));

    }

    /**
     * Iterator visits children in the specified half-open key range,
     * recursively expanding each child with a post-order traversal of its
     * children and finally visits this node itself.
     */
    @Override
    @SuppressWarnings("unchecked")
    public Iterator<AbstractNode> postOrderIterator(final byte[] fromKey,
            final byte[] toKey) {

        /*
         * Append this node to the iterator in the post-order position.
         */
        return new Striterator(postOrderIterator2(fromKey, toKey))
                .append(new SingleValueIterator(this));

    }

    /**
     * Visits the children (recursively) using post-order traversal, but does
     * NOT visit this node.
     */
    @SuppressWarnings("unchecked")
    private Iterator<AbstractNode> postOrderIterator1(
            final boolean dirtyNodesOnly, final boolean nodesOnly) {

        /*
         * Iterator visits the direct children, expanding them in turn with a
         * recursive application of the post-order iterator.
         * 
         * When dirtyNodesOnly is true we use a child iterator that makes a
         * best effort to only visit dirty nodes. Especially, the iterator
         * MUST NOT force children to be loaded from disk if they are not
         * resident since dirty nodes are always resident.
         * 
         * The iterator must touch the node in order to guarantee that a node
         * will still be dirty by the time that the caller visits it. This
         * places the node onto the hard reference queue and increments its
         * reference counter. Evictions do NOT cause IO when the reference is
         * non-zero, so the node will not be made persistent as a result of
         * other node touches. However, the node can still be made persistent
         * if the caller explicitly writes the node onto the store.
         */

        // BTree.log.debug("node: " + this);

        return new Striterator(childIterator(dirtyNodesOnly))
                .addFilter(new Expander() {

                    private static final long serialVersionUID = 1L;

                    /*
                     * Expand each child in turn.
                     */
                    protected Iterator expand(final Object childObj) {

                        /*
                         * A child of this node.
                         */
                        final AbstractNode child = (AbstractNode) childObj;

                        if (dirtyNodesOnly && !child.dirty) {

                            return EmptyIterator.DEFAULT;

                        }

                        if (child instanceof Node) {

                            /*
                             * The child is a Node (has children).
                             */
                            // BTree.log.debug("child is node: " + child);

                            // visit the children (recursive post-order
                            // traversal).
                            final Striterator itr = new Striterator(
                                    ((Node) child).postOrderIterator1(
                                            dirtyNodesOnly, nodesOnly));

                            // append this node in post-order position.
                            itr.append(new SingleValueIterator(child));

                            return itr;

                        } else {

                            /*
                             * The child is a leaf.
                             */
                            // BTree.log.debug("child is leaf: " + child);

                            // Visit the leaf itself.
                            if (nodesOnly)
                                return EmptyIterator.DEFAULT;

                            return new SingleValueIterator(child);

                        }

                    }

                });

    }

    /**
     * Visits the children (recursively) using post-order traversal, but does
     * NOT visit this node.
     */
    @SuppressWarnings("unchecked")
    private Iterator<AbstractNode> postOrderIterator2(final byte[] fromKey,
            final byte[] toKey) {

        /*
         * Iterator visits the direct children in the key range, expanding
         * them in turn with a recursive application of the post-order
         * iterator.
         * 
         * The iterator must touch the node in order to guarantee that a node
         * will still be dirty by the time that the caller visits it. This
         * places the node onto the hard reference queue and increments its
         * reference counter. Evictions do NOT cause IO when the reference is
         * non-zero, so the node will not be made persistent as a result of
         * other node touches. However, the node can still be made persistent
         * if the caller explicitly writes the node onto the store.
         */

        // BTree.log.debug("node: " + this);

        return new Striterator(childIterator(fromKey, toKey))
                .addFilter(new Expander() {

                    private static final long serialVersionUID = 1L;

                    /*
                     * Expand each child in turn.
                     */
                    protected Iterator expand(final Object childObj) {

                        /*
                         * A child of this node.
                         */
                        final AbstractNode child = (AbstractNode) childObj;

                        if (child instanceof Node) {

                            /*
                             * The child is a Node (has children).
                             * 
                             * Visit the children (recursive post-order
                             * traversal).
                             */
                            // BTree.log.debug("child is node: " + child);

                            final Striterator itr = new Striterator(
                                    ((Node) child).postOrderIterator2(
                                            fromKey, toKey));

                            // append this node in post-order position.
                            itr.append(new SingleValueIterator(child));

                            /*
                             * Note: getReadExecutor() is not defined for
                             * IJournal. If we want to support the read
                             * executor pre-fetch pattern then the code needs
                             * to be updated to use IJournal and IJournal
                             * needs to expose getReadExecutor.
                             */
                            if ((btree.store instanceof Journal)
                                    && (((Journal) btree.store)
                                            .getReadExecutor() != null)) {

                                /*
                                 * Prefetch any child leaves we need to visit
                                 * and prefetch the right sibling of the node
                                 * we are about to visit if the iterator will
                                 * span that node as well.
                                 */
                                prefetchChildLeaves((Node) child, fromKey,
                                        toKey);

                            }

                            return itr;

                        } else {

                            /*
                             * The child is a leaf.
                             */
                            // BTree.log.debug("child is leaf: " + child);

                            // Visit the leaf itself.
                            return new SingleValueIterator(child);

                        }

                    }

                });

    }

    /**
     * When we visit a node whose children are leaves, schedule memoization
     * of those leaves whose separator key in the node is LT the toKey
     * (non-blocking). If the rightSibling of the node would be visited by
     * the iterator, then prefetch of the rightSibling is also scheduled.
     * 
     * @param node
     *            A node whose children are leaves.
     * @param toKey
     *            The exclusive upper bound of some iterator.
     * 
     * @todo All memoization should go through the same thread pool in order
     *       to bound the #of threads actually reading on the disk. Right
     *       now, the memoizer runs in the caller's thread. If we modified it
     *       to submit a task to the readService to handle the memoization
     *       then we would bound the #of threads involved.
     * 
     * @todo PREFETCH : JSR 166 Fork/join would also be a good choice here.
     * 
     * @todo PREFETCH : Prefetch will not break if the iterator is closed.
     *       This is not a bug per se, but it might be something we want to
     *       support in the future.
     * 
     * @todo PREFETCH : Prefetch should be cancelled if the btree is closed.
     *       We could do this with an Executor wrapping the
     *       {@link LatchedExecutor} that kept track of the work queue for
     *       that executor and allowed us to cancel just the tasks submitted
     *       against it. This would also give us a means to report on the
     *       prefetch work queue length.
     * 
     * @todo PREFETCH : Only journal is supported right now.
     * 
     * @todo PREFETCH : The {@link IRangeQuery#CURSOR} mode is not supported
     *       yet.
     */
    protected void prefetchChildLeaves(final Node node, final byte[] fromKey,
            final byte[] toKey) {

        final int nkeys = node.getKeyCount();

        // figure out the first index to visit.
        final int fromIndex;
        {

            int index;

            if (fromKey != null) {

                index = node.getKeys().search(fromKey);

                if (index < 0) {

                    index = -index - 1;

                }

            } else {

                index = 0;

            }

            fromIndex = index;

        }

        // figure out the first index to NOT visit.
        final int toIndex;
        {

            int index;

            if (toKey != null) {

                index = node.getKeys().search(toKey);

                if (index < 0) {

                    index = -index - 1;

                }

            } else {

                index = nkeys;

            }

            toIndex = index;

        }

        /*
         * Submit tasks which will materialize the children in
         * [fromIndex,toIndex).
         * 
         * Note: We do not track the futures of these tasks. The tasks have a
         * side effect on the parent/child weak references among the nodes in
         * the B+Tree, on the backing hard reference ring buffer, and on the
         * cache of materialized disk records. That side effect is all that
         * we are seeking.
         * 
         * Note: If the B+Tree is concurrently closed, then these tasks will
         * error out. That is fine.
         */
        final Executor s = ((Journal) btree.store).getReadExecutor();

        for (int i = fromIndex; i < toIndex; i++) {

            final int index = i;

            s.execute(new Runnable() {

                public void run() {

                    if (!node.btree.isOpen()) {

                        // No longer open.
                        return;

                    }

                    // Materialize the child.
                    node.getChild(index);

                }

            });

        }

        if (toIndex == nkeys - 1) {

            /*
             * Prefetch the right sibling.
             */
            prefetchRightSibling(node, toKey);

        }

    }
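    /*
     * A reduced sketch of the scheduling idiom used by the prefetch methods
     * (hedged: illustrative only). The side effect of getChild() is the
     * whole point - the task's Future is deliberately discarded:
     * 
     *   final Executor s = ((Journal) btree.store).getReadExecutor();
     *   s.execute(new Runnable() {
     *       public void run() {
     *           if (node.btree.isOpen())
     *               node.getChild(index); // warms childRefs[] and the cache
     *       }
     *   });
     */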
    /**
     * If the caller's <i>toKey</i> is GT the separator keys for the children
     * of this node then the iterator will need to visit the rightSibling of
     * the node and this method will schedule the memoization of the node's
     * rightSibling in order to reduce the IO latency when the iterator
     * visits that rightSibling (non-blocking).
     * 
     * @param node
     *            A node.
     * @param toKey
     *            The exclusive upper bound of some iterator.
     */
    protected void prefetchRightSibling(final Node node, final byte[] toKey) {

        final int nkeys = node.getKeyCount();

        final byte[] lastSeparatorKey = node.getKeys().get(nkeys - 1);

        if (BytesUtil.compareBytes(toKey, lastSeparatorKey) <= 0) {

            /*
             * Since the toKey is LTE to the lastSeparatorKey on this node
             * the last tuple to be visited by the iterator is spanned by
             * this node and we will not visit the node's rightSibling.
             * 
             * @todo This test could be optimized if IRaba exposed a method
             * for unsigned byte[] comparisons against the coded
             * representation of the keys.
             */
            return;

        }

        // The parent of this node.
        final Node p = node.parent.get();

        // /*
        // * Test to see if the rightSibling is already materialized.
        // */
        // // Note: Don't bother testing as we will test in the task below
        // // anyway.
        // Node rightSibling = (Node) p.getRightSibling(node,
        // false/* materialize */);
        //
        // if (rightSibling != null) {
        //
        // /*
        // * The rightSibling is already materialized and getRightSibling()
        // * touches the rightSibling as a side-effect so it will be retained
        // * longer. Return now as there is nothing to do.
        // */
        //
        // return;
        //
        // }

        /*
         * Submit a task which will materialize that right sibling.
         * 
         * Note: This task will only materialize a rightSibling of a common
         * parent. If [node] is the last child of the parent [p] then you
         * would need to ascend to the parent of [p] and then descend again,
         * which is not the specified behavior for getRightSibling(). Since
         * this is just an optimization for IO scheduling, I think that it is
         * fine as it is.
         * 
         * Note: We do not track the future of this task. The task will have
         * a side effect on the parent/child weak references among the nodes
         * in the B+Tree, on the backing hard reference ring buffer, and on
         * the cache of materialized disk records. That side effect is all
         * that we are seeking.
         * 
         * Note: If the B+Tree is concurrently closed, then this task will
         * error out. That is fine.
         */
        final Executor s = ((Journal) btree.store).getReadExecutor();

        s.execute(new Runnable() {

            public void run() {

                if (!p.btree.isOpen()) {

                    // No longer open.
                    return;

                }

                // Materialize the right sibling.
                p.getRightSibling(node, true/* materialize */);

            }

        });

    }

    /**
     * Iterator visits the direct child nodes in the external key ordering.
     * 
     * @param dirtyNodesOnly
     *            When true, only the direct dirty child nodes will be
     *            visited.
     */
    public Iterator<AbstractNode> childIterator(final boolean dirtyNodesOnly) {

        if (dirtyNodesOnly) {

            return new DirtyChildIterator(this);

        } else {

            return new ChildIterator(this);

        }

    }

    /**
     * Iterator visits the direct child nodes in the external key ordering.
     */
    public Iterator<AbstractNode> childIterator(final byte[] fromKey,
            final byte[] toKey) {

        return new ChildIterator(this, fromKey, toKey);

    }

    @Override
    public boolean dump(final Level level, final PrintStream out,
            final int height, final boolean recursive) {

        // True iff we will write out the node structure.
        final boolean debug = level.toInt() <= Level.DEBUG.toInt();

        // Set true iff an inconsistency is detected.
        boolean ok = true;

        final int branchingFactor = this.getBranchingFactor();

        final int nkeys = getKeyCount();

        final int minKeys = this.minKeys();

        final int maxKeys = this.maxKeys();

        if (parent != null && nkeys < minKeys) {

            // min keys failure.
            out.println(indent(height) + "ERROR: too few keys: m="
                    + branchingFactor + ", minKeys=" + minKeys + ", nkeys="
                    + nkeys + ", isLeaf=" + isLeaf());

            ok = false;

        }

        if (nkeys > maxKeys) {

            // max keys failure.
            out.println(indent(height) + "ERROR: too many keys: m="
                    + branchingFactor + ", maxKeys=" + maxKeys + ", nkeys="
                    + nkeys + ", isLeaf=" + isLeaf());

            ok = false;

        }

        { // nentries

            if (this == btree.root) {

                if (getSpannedTupleCount() != btree.getEntryCount()) {

                    out.println(indent(height)
                            + "ERROR: root node has nentries="
                            + getSpannedTupleCount()
                            + ", but btree has nentries="
                            + btree.getEntryCount());

                    ok = false;

                }

            }

            {

                int nentries = 0;

                for (int i = 0; i <= nkeys; i++) {

                    nentries += getChildEntryCount(i);

                    if (nentries <= 0) {

                        out.println(indent(height)
                                + "ERROR: childEntryCount[" + i
                                + "] is non-positive");

                        ok = false;

                    }

                }

                if (nentries != getSpannedTupleCount()) {

                    out.println(indent(height) + "ERROR: nentries("
                            + getSpannedTupleCount()
                            + ") does not agree with sum of per-child counts("
                            + nentries + ")");

                    ok = false;

                }

            }

        }

        if (this == btree.root) {

            if (parent != null) {

                out.println(indent(height)
                        + "ERROR: this is the root, but the parent is not null.");

                ok = false;

            }

        } else {

            /*
             * Note: there is a difference between having a parent reference
             * and having the parent be strongly reachable. However, we
             * actually want to maintain both -- a parent MUST always be
             * strongly reachable ... UNLESS you are doing a fast forward or
             * reverse leaf scan since the node hierarchy is not being
             * traversed in that case.
             * 
             * @todo Should we keep leaves visited by a fast forward or
             * reverse scan out of the hard reference cache since their
             * parents are not strongly reachable?
             */
            if (parent == null) {

                out.println(indent(height)
                        + "ERROR: the parent reference MUST be defined for a non-root node.");

                ok = false;

            } else if (parent.get() == null) {

                out.println(indent(height)
                        + "ERROR: the parent is not strongly reachable.");

                ok = false;

            }

        }

        // verify keys are monotonically increasing.
        try {

            assertKeysMonotonic();

        } catch (AssertionError ex) {

            out.println(indent(height) + "  ERROR: " + ex);

            ok = false;

        }

        if (debug) {

            out.println(indent(height) + toString());

            // out.println(indent(height) + "  parent="
            // + (parent == null ? null : parent.get()));

            // out.println(indent(height) + "  dirty=" + isDirty() + ", nkeys="
            // + nkeys + ", nchildren=" + (nkeys + 1) + ", minKeys="
            // + minKeys + ", maxKeys=" + maxKeys + ", branchingFactor="
            // + branchingFactor + ", #entries=" + getSpannedTupleCount());

            // out.println(indent(height) + "  keys=" + getKeys());

            // // out.println(indent(height) + "  childAddr="
            // // + Arrays.toString(childAddr));

            // out.print(indent(height) + "  childAddr/Refs=[");
            // for (int i = 0; i <= nkeys + 1; i++) {
            // if (i > 0)
            // out.print(", ");
            // out.print(getChildAddr(i));
            // out.print('(');
            // if (childRefs[i] == null) {
            // out.print("null");
            // } else {
            // // Non-recursive print of the reference.
            // final AbstractNode<?> child = childRefs[i].get();
            // out.print(child.getClass().getName() + "@"
            // + Integer.toHexString(child.hashCode()));
            // }
            // out.print(')');
            // }
            // out.println("]");

            // out.print(indent(height) + "  childEntryCounts=[");
            // for (int i = 0; i <= nkeys; i++) {
            // if (i > 0)
            // out.print(", ");
            // out.print(getChildEntryCount(i));
            // }
            // out.println("]");
            // // + Arrays.toString(childEntryCounts));

        }

        /*
         * Look for inconsistencies for children. A dirty child MUST NOT have
         * an entry in childAddr[] since it is not persistent and MUST show
         * up in dirtyChildren. Likewise if a child is NOT dirty, then it
         * MUST have an entry in childAddr and MUST NOT show up in
         * dirtyChildren.
         * 
         * This also verifies that all entries beyond nchildren (nkeys+1) are
         * unused.
         */
        for (int i = 0; i < branchingFactor + 1; i++) {

            if (i > nkeys) {

                /*
                 * Scanning past the last valid child index.
                 */
                if (!isReadOnly()
                        && ((MutableNodeData) data).childAddr[i] != NULL) {

                    out.println(indent(height) + "  ERROR childAddr[" + i
                            + "] should be " + NULL + ", not "
                            + ((MutableNodeData) data).childAddr[i]);

                    ok = false;

                }

                if (childRefs[i] != null) {

                    out.println(indent(height) + "  ERROR childRefs[" + i
                            + "] should be null, not " + childRefs[i]);

                    ok = false;

                }

            } else {

                /*
                 * Scanning a valid child index.
                 * 
                 * Note: This is not fetching the child if it is not in
                 * memory -- perhaps it should using its persistent id?
                 */
                final AbstractNode<?> child = (childRefs[i] == null ? null
                        : childRefs[i].get());

                if (child != null) {

                    if (child.parent == null || child.parent.get() == null) {

                        /*
                         * The reference to the parent MUST exist since we
                         * are the parent and therefore the parent is
                         * strongly reachable.
                         */
                        out.println(indent(height) + "  ERROR child[" + i
                                + "] does not have parent reference.");

                        ok = false;

                    }

                    if (child.parent.get() != this) {

                        out.println(indent(height) + "  ERROR child[" + i
                                + "] has wrong parent.");

                        ok = false;

                        // // some extra stuff used to track down a bug.
                        if (!ok) {

                            if (level == Level.DEBUG) {

                                // dump the child also and exit.
                                System.err.println("child");
                                child.dump(Level.DEBUG, System.err);

                                throw new AssertionError();

                            } else {

                                // recursive call to get debug level dump.
                                System.err.println("this");
                                this.dump(Level.DEBUG, System.err);

                            }

                        }

                    }

                    final long childSpannedEntryCount = (child.isLeaf() ? ((Leaf) child)
                            .getKeyCount() : ((Node) child)
                            .getSpannedTupleCount());

                    if (getChildEntryCount(i) != childSpannedEntryCount) {

                        out.println(indent(height) + "  ERROR child[" + i
                                + "] spans " + childSpannedEntryCount
                                + " entries, but childEntryCount[" + i + "]="
                                + getChildEntryCount(i));

                        ok = false;

                    }

                    if (child.isDirty()) {

                        /*
                         * Dirty child. The parent of a dirty child MUST also
                         * be dirty.
                         */
                        if (!isDirty()) {

                            out.println(indent(height) + "  ERROR child[" + i
                                    + "] is dirty, but its parent is clean");

                            ok = false;

                        }

                        if (childRefs[i] == null) {

                            out.println(indent(height) + "  ERROR childRefs["
                                    + i + "] is null, but the child is dirty");

                            ok = false;

                        }

                        if (getChildAddr(i) != NULL) {

                            out.println(indent(height) + "  ERROR childAddr["
                                    + i + "]=" + getChildAddr(i)
                                    + ", but MUST be " + NULL
                                    + " since the child is dirty");

                            ok = false;

                        }

                        // if (!dirtyChildren.contains(child)) {
                        // out.println(indent(height + 1)
                        // + "  ERROR child at index=" + i
                        // + " is dirty, but not on the dirty list: child="
                        // + child);
                        // ok = false;
                        // }

                    } else {

                        /*
                         * Clean child (ie, persistent). The parent of a
                         * clean child may be either clean or dirty.
                         */
                        if (getChildAddr(i) == NULL) {

                            out.println(indent(height) + "  ERROR childAddr["
                                    + i + "] is " + NULL
                                    + ", but child is not dirty");

                            ok = false;

                        }

                        // if (dirtyChildren.contains(child)) {
                        // out.println(indent(height)
                        // + "  ERROR child at index=" + i
                        // + " is not dirty, but is on the dirty list: child="
                        // + child);
                        // ok = false;
                        // }

                    }

                }

            }

        }

        if (!ok && !debug) {

            // @todo show the node structure with the errors since we would
            // not have seen it otherwise.

        }

        if (recursive) {

            /*
             * Dump children using pre-order traversal.
             */
            final Set<AbstractNode<?>> dirty = new HashSet<AbstractNode<?>>();

            for (int i = 0; i <= /* nkeys */branchingFactor; i++) {

                if (childRefs[i] == null && !isReadOnly()
                        && ((MutableNodeData) data).childAddr[i] == 0) {

                    if (i <= nkeys) {

                        /*
                         * This lets us dump a tree with some kinds of
                         * structural problems (missing child reference or
                         * key).
                         */
                        out.println(indent(height + 1)
                                + "ERROR can not find child at index=" + i
                                + ", skipping this index.");

                        ok = false;

                    } else {

                        /*
                         * We expect null child entries beyond nkeys+1.
                         */

                    }

                    continue;

                }

                /*
                 * Note: this works around the assert test for the index in
                 * getChild(index) but is not able/willing to follow a
                 * childKey to a child that is not memory resident.
                 */
                // AbstractNode child = getChild(i);
                final AbstractNode<?> child = childRefs[i] == null ? null
                        : childRefs[i].get();
                if (child != null) {

                    if (child.parent == null) {

                        out.println(indent(height + 1)
                                + "ERROR child does not have parent reference at index="
                                + i);

                        ok = false;

                    }

                    if (child.parent.get() != this) {

                        out.println(indent(height + 1)
                                + "ERROR child has incorrect parent reference at index="
                                + i);

                        ok = false;

                    }

                    // if (child.isDirty() && !dirtyChildren.contains(child)) {
                    //
                    // out.println(indent(height + 1)
                    // + "ERROR dirty child not in node's dirty list at index="
                    // + i);
                    //
                    // ok = false;
                    //
                    // }

                    // if (!child.isDirty() && dirtyChildren.contains(child)) {
                    //
                    // out.println(indent(height + 1)
                    // + "ERROR clean child found in node's dirty list at index="
                    // + i);
                    //
                    // ok = false;
                    //
                    // }

                    if (child.isDirty()) {

                        dirty.add(child);

                    }

                    if (i == 0) {

                        if (nkeys == 0) {

                            /*
                             * Note: a node with zero keys is valid. It MUST
                             * have a single child. Such nodes arise when
                             * splitting a node in a btree of order m := 3
                             * when the splitIndex is computed as m/2-1 = 0.
                             * This is perfectly normal.
                             */

                        } else {

                            /*
                             * Note: All keys on the first child MUST be LT
                             * the first key on this node.
                             */
                            final byte[] k0 = getKeys().get(0);
                            final byte[] ck0 = child.getKeys().get(0);

                            if (BytesUtil.compareBytes(ck0, k0) >= 0) {
                            // if( child.compare(0,keys,0) >= 0 ) {

                                out.println(indent(height + 1)
                                        + "ERROR first key on first child must be LT "
                                        + keyAsString(k0) + ", but found "
                                        + keyAsString(ck0));

                                ok = false;

                            }

                            if (child.getKeyCount() >= 1) {

                                final byte[] ckn = child.getKeys().get(
                                        child.getKeyCount() - 1);

                                if (BytesUtil.compareBytes(ckn, k0) >= 0) {
                                // if (child.compare(child.nkeys-1, keys, 0)
                                // >= 0) {

                                    out.println(indent(height + 1)
                                            + "ERROR last key on first child must be LT "
                                            + keyAsString(k0) + ", but found "
                                            + keyAsString(ckn));

                                    ok = false;

                                }

                            }

                        }

                    } else if (i < nkeys) {

                        // Note: The delete rule does not preserve this
                        // characteristic since we do not update the
                        // separatorKey for a leaf when removing its left
                        // most key.
                        //
                        // if (child.isLeaf() && keys[i - 1] != child.keys[0]) {
                        //
                        // /*
                        // * While each key in a node always is the first key
                        // * of some leaf, we are only testing the direct
                        // * children here. Therefore if the children are not
                        // * leaves then we can not cross check their first
                        // * key with the keys on this node.
                        // */
                        // out.println(indent(height + 1)
                        // + "ERROR first key on child leaf must be "
                        // + keys[i - 1] + ", not " + child.keys[0]
                        // + " at index=" + i);
                        //
                        // ok = false;
                        //
                        // }

                    } else {

                        /*
                         * While there is a child for the last index of a
                         * node, there is no key for that index.
                         */

                    }

                    if (!child.dump(level, out, height + 1, true)) {

                        ok = false;

                    }

                }

            }

            // if (dirty.size() != dirtyChildren.size()) {
            //
            // out.println(indent(height + 1) + "ERROR found " + dirty.size()
            // + " dirty children, but " + dirtyChildren.size()
            // + " in node's dirty list");
            //
            // ok = false;
            //
            // }

        }

        return ok;

    }

    /**
     * Human readable representation of the {@link Node}.
     */
    @Override
    public String toString() {

        final StringBuilder sb = new StringBuilder();

        // sb.append(getClass().getName());
        sb.append(super.toString());

        sb.append("{ isDirty=" + isDirty());

        sb.append(", isDeleted=" + isDeleted());

        sb.append(", addr=" + identity);

        final Node p = (parent == null ? null : parent.get());

        sb.append(", parent=" + (p == null ? "N/A" : p.toShortString()));

        if (data == null) {

            // No data record? (Generally, this means it was stolen by copy
            // on write).
            sb.append(", data=NA}");

            return sb.toString();

        }

        sb.append(", nkeys=" + getKeyCount());

        sb.append(", minKeys=" + minKeys());

        sb.append(", maxKeys=" + maxKeys());

        DefaultNodeCoder.toString(this, sb);

        // indicate if each child is loaded or unloaded.
        {

            final int nchildren = getChildCount();

            sb.append(", children=[");

            for (int i = 0; i < nchildren; i++) {

                if (i > 0)
                    sb.append(", ");

                final AbstractNode<?> child = childRefs[i] == null ? null
                        : childRefs[i].get();

                sb.append(child == null ? "U" : "L");

            }

            sb.append("]");

        }

        sb.append("}");

        return sb.toString();

    }

}