/** Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. Contact: SYSTAP, LLC DBA Blazegraph 2501 Calvert ST NW #106 Washington, DC 20008 licenses@blazegraph.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* * Created on Dec 5, 2006 */ package com.bigdata.btree; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; import java.text.NumberFormat; import java.util.LinkedList; import java.util.List; import java.util.NoSuchElementException; import java.util.Properties; import java.util.UUID; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.atomic.AtomicLong; import org.apache.log4j.Logger; import com.bigdata.btree.data.IAbstractNodeData; import com.bigdata.btree.data.ILeafData; import com.bigdata.btree.data.INodeData; import com.bigdata.btree.raba.IRaba; import com.bigdata.btree.raba.MutableKeyBuffer; import com.bigdata.btree.raba.MutableValueBuffer; import com.bigdata.btree.view.FusedView; import com.bigdata.io.AbstractFixedByteArrayBuffer; import com.bigdata.io.ByteArrayBuffer; import com.bigdata.io.ChecksumUtility; import com.bigdata.io.DataInputBuffer; import com.bigdata.io.FileChannelUtility; import com.bigdata.io.NOPReopener; import com.bigdata.io.SerializerUtil; import com.bigdata.io.writecache.WriteCache; import com.bigdata.journal.Journal; import com.bigdata.journal.Name2Addr; import com.bigdata.journal.TemporaryRawStore; import com.bigdata.mdi.IResourceMetadata; import com.bigdata.mdi.LocalPartitionMetadata; import com.bigdata.mdi.SegmentMetadata; import com.bigdata.rawstore.IAddressManager; import com.bigdata.rawstore.IBlock; import com.bigdata.rawstore.IRawStore; import com.bigdata.rawstore.WormAddressManager; import com.bigdata.util.Bytes; import com.bigdata.util.BytesUtil; /** * Builds an {@link IndexSegment} given a source btree and a target branching * factor. There are two main use cases: * <ol> * * <li>Evicting a key range of an index into an optimized on-disk index. In this * case, the input is a {@link BTree} that is ideally backed by a fully buffered * {@link IRawStore} so that no random reads are required.</li> * * <li>Merging index segments. In this case, the input is typically records * emerging from a merge-sort. There are two distinct cases here. In one, we * simply have raw records that are being merged into an index. This might occur * when merging two key ranges or when external data are being loaded. In the * other case we are processing two time-stamped versions of an overlapping key * range. In this case, the more recent version may have "delete" markers * indicating that a key present in an older version has been deleted in the * newer version. Also, key-value entries in the newer version replace (rather * than merge with) key-value entries in the older version.
If an entry * history policy is defined, then it must be applied here to cause key-value * entries whose retention is no longer required by that policy to be dropped.</li> * * </ol> * * <h3>One Pass vs. Two Pass Design Alternatives</h3> * * There are at least three design alternatives for index segment builds: (A) do * an exact range count and generate a perfect plan; (B) fully buffer * the source iterator into byte[][] keys, byte[][] vals, boolean[] * deleteMarkers, and long[] versionTimestamps and generate an exact plan, * consuming the buffered byte[]s directly from RAM; and (C) use the fast range * count to generate a plan based on an overestimate of the tuple count and then * apply a variety of hacks when the source iterator is exhausted to make the * output B+Tree usable, but not well formed. * <p> * The disadvantage of (A) is that it requires two passes over the source view, * which substantially increases the run time of the algorithm. In addition, the * passes can drive evictions in the global LRU and could defeat caching for a * view approaching the nominal size for a split. However, with (A) we can do * builds for very large source B+Trees. Therefore, (A) is implemented for such * use cases. * <p> * The disadvantage of (B) is that it requires more memory. However, it is much * faster than (A). To compensate for the increased memory demand, we can single * thread builds, merges, and splits and fall back to (A) if memory is very * tight or the source view is very large. * <p> * The disadvantage of (C) is that the "hacks" break encapsulation and leak into * the API where operations such as retrieving the right sibling of a node could * return an empty leaf (since we ran out of tuples for the plan). Since these * "hacks" would break encapsulation, it would be difficult to have confidence * that the B+Tree API was fully insulated against the effects of ill-formed * {@link IndexSegment}s. Therefore, I have discarded this approach and backed * out changes designed to support it from the code base. * * <h3>Design alternatives for totally ordered nodes and leaves</h3> * * In order for the nodes to be written in a contiguous block we either have to * buffer them in memory or have to write them onto a temporary file and then * copy them into place after the last leaf has been processed. This concern was * not present in West's algorithm because it did not attempt to place the nodes * and/or leaves contiguously onto the generated B+Tree file. * <p> * For the two pass design described above as option (A), the code buffers the * nodes and leaves onto {@link TemporaryRawStore}s. This approach is scalable, * which is the concern of (A), but requires at least twice the IO when compared * to directly writing the nodes and leaves onto the output file. * <p> * When sufficient memory is available, as in cases where (B) would apply, we can * write the leaves directly on the backing file (using double-buffering to * update the prior/next addrs). Since there are far fewer nodes than leaves, we * can buffer the nodes in memory, writing them once the leaves are finished. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * @version $Id: IndexSegmentBuilder.java 2265 2009-10-26 12:51:06Z thompsonbry * $ * * @see "Post-order B-Tree Construction" by Lawrence West, ACM 1992. Note that * West's algorithm is for a b-tree (values are stored in internal nodes as * well as leaves), not a b+-tree (values are stored only on the leaves). * Our implementation is therefore an adaptation.
* * @see "Batch-Construction of B+-Trees" by Kim and Won, ACM 2001. The approach * outlined by Kim and Won is designed for B+-Trees, but it appears to be * less efficient on first glance. * * @see IndexSegment * @see IndexSegmentFile * @see IndexSegmentCheckpoint * * @todo Put profiler on (B) for build stress tests. The source view should be * pre-generated such that we only measure the build behavior in the * profiler. See {@link TestIndexSegmentBuilderWithLargeTrees}. [We need * to first remove the synchronization for disk reads on the journal so we * have the maximum possible IO rate, and possibly materialize the leaves * of the source view in parallel using pre-fetch.] * <p> * Make sure that {@link #elapsed} reports the total build time, including * the range count or pre-materialization costs as those are a significant * part of the total cost. * <p> * Much of the cost of the build is the range iterator, including decoding * the nodes and leaves and materializing tuples. Most of the remaining * cost is the coding of the new nodes and leaves and their IO. Some of * that cost can be trimmed by faster coders, but we can also trim the IO * using parallel materialization of leaves (for {@link IndexSegment}) and * pre-fetch of nodes and leaves (for {@link BTree}s). * <p> * The B+Tree iterator on the fused view of the journal and index segments * is effectively an incremental merge of the source iterators. However, * the iterator has a [capacity] parameter which is a hint for * materialization. We could do this as a parallel merge sort of the * leaves spanned of the key range with a restriction for the capacity. * That could also be done on a GPU. The same iterator is used as the * source for a compacting merge. If we tunnel the representation to * something like the byte[][] keys, byte[][] vals, boolean[] * deleteMarkers, long[] revisionTimestamps, then we could do a merge sort * of the leaves from the (ordered) view and then do a parallel build * segment from the backing representation. We could do that build step in * java or on a GPU. * * @todo GPU parallel builds. Once we have the data in a bunch of byte[][]s we * could conceivable get this stuff organized to the point where each * tuple and child in the output B+Tree could be assigned to a thread and * each leaf and node to a core on a GPU. At that point, we run the * "build" in parallel. For this approach, the uncoded leaves will all * wind up in memory using the same byte[]s for keys and values as the * source view. * <p> * This approach could be carried further to code the nodes and leaves in * parallel, perhaps as a 2nd GPU program. If the source view was fully * buffered, then it should be released once the nodes and leaves have * been coded as no more use will be made of those data. * <p> * While it may not improve the IO substantially, it is possible to use * gathered writes if the leaves and nodes are fully buffered in memory. * For the leaves, we write them in the index order. For the nodes, we * write them in their pre-order traversal. The order in which we would * write out the nodes and leaves should be part of the state during a GPU * build regardless of whether the IO is done sequentially or as a * gathered write. * * @todo Make sure it is possible to grab the {@link IndexMetadata} and the * bloom filter from the generated file in a single IO. This could be * useful when the index segment files are stored on a parallel file * system. 
[It is possible to do this since these data are contiguous and * in the same region of the generated file (they both use addresses * relative to the BASE of the file).] */ public class IndexSegmentBuilder implements Callable<IndexSegmentCheckpoint> { /** * Logger. */ private static final Logger log = Logger .getLogger(IndexSegmentBuilder.class); /** * Error message when the #of tuples in the {@link IndexSegment} would * exceed {@link Integer#MAX_VALUE}. * <p> * Note: This is not an inherent limit in the {@link IndexSegment} but * rather a limit in the {@link IndexSegmentPlan} (and perhaps the * {@link IndexSegmentBuilder}) which presumes that the entry count is an * <code>int</code> rather than a <code>long</code>. */ protected static final String ERR_TOO_MANY_TUPLES = "Too many tuples"; /** * Message when the index segment will be empty. */ protected static final String ERR_NO_TUPLES = "No tuples"; /** * The file mode used to open the file on which the {@link IndexSegment} is * written. */ final String mode = "rw"; // also rws or rwd /** * The file specified by the caller on which the {@link IndexSegment} is * written. */ public final File outFile; /** * The value specified to the ctor. */ final public long entryCount; /** * The iterator specified to the ctor. This is the source for the keys and * values that will be written onto the generated {@link IndexSegment}. */ final private ITupleIterator<?> entryIterator; /** * The commit time associated with the view from which the * {@link IndexSegment} is being generated (from the ctor). This value is * written into {@link IndexSegmentCheckpoint#commitTime}. */ final public long commitTime; /** * <code>true</code> iff the generated {@link IndexSegment} will * incorporate all state for the source index (partition) as of the * specified <i>commitTime</i>. * <p> * Note: This flag is written into the {@link IndexSegmentCheckpoint} but it * has no other effect on the build process. */ final public boolean compactingMerge; /** * The name of the index or index partition for which the build is being * performed. */ final String name; /** * A copy of the metadata object provided to the ctor. This object is * further modified before being written on the * {@link IndexSegmentStore}. */ final public IndexMetadata metadata; /** * <code>true</code> iff the source index is isolatable (supports both * deletion markers and version timestamps). */ final boolean isolatable; /** * <code>true</code> iff the source index has delete markers enabled. * <p> * Note: delete markers are ONLY copied for an incremental build (when * {@link #compactingMerge} is <code>false</code>). */ final boolean deleteMarkers; /** * <code>true</code> iff the source index has tuple revision timestamps * enabled. */ final boolean versionTimestamps; /** * <code>true</code> iff the source index has raw records enabled. * <p> * Note: raw records will be copied into the BLOBS region of the index * segment and the address of the raw record in the output tuple will be * updated to reflect the relative address of the record within the index * segment. */ final boolean rawRecords; /** * A buffer used to encode a raw record address for a mutable {@link BTree} * and otherwise <code>null</code>. */ private final ByteArrayBuffer recordAddrBuf; /** * The unique identifier for the generated {@link IndexSegment} resource. */ final public UUID segmentUUID; /** * The cache for the generated {@link IndexSegmentStore}. 
When non- * <code>null</code> the generated {@link INodeData} objects will be placed * into the cache, which is backed by a shared LRU. This helps to reduce * latency when an index partition build or merge operation finishes and the * index partition view is updated since the data will already be present in * the cache. Generating the index segment will drive evictions from the * shared LRU, but those will be the least recently used records and the new * {@link IndexSegmentStore} is often hot as soon as it is generated. * <p> * Note: If the build fails, then the cache will be cleared. * * @todo The {@link IndexMetadata} and the {@link BloomFilter} should be in * the {@link #storeCache} as well. Make sure that we do this for read * and write for both the {@link BTree} and the {@link IndexSegment}. */ /* * The storeCache field is marked as "Deprecated" but it should stick around * for a while since we might wind up reusing this feature on an index local * basis at some point. * * @see BLZG-1501 (remove LRUNexus) */ @Deprecated final private ConcurrentMap<Long, Object> storeCache; /** * Used to serialize the nodes and leaves of the output tree. */ final private NodeSerializer nodeSer; /** * Note: The offset bits on the {@link IndexSegmentFileStore} do NOT * have to agree with the offset bits on the source store. However, it * must be large enough to handle the large branching factors typically * associated with an {@link IndexSegment} vs a {@link BTree}. Further, * if blobs are to be copied into the index segment then it generally * must be large enough for those blobs (up to 64M per record). * <p> * Note: The same #of offset bits MUST be used by the temporary stores * that we use to buffer nodes, leaves, and blobs as are used by the * generated index segment! */ final int offsetBits = WormAddressManager.SCALE_OUT_OFFSET_BITS; /** * The {@link IAddressManager} used to form addresses for the generated * file. Addresses are formed from a byteCount and an <em>encoded</em> * offset comprised of a relative offset into a known region and the region * identifier. * * @see IndexSegmentRegion * @see IndexSegmentAddressManager */ final private WormAddressManager addressManager; /** * The bloom filter iff we build one (errorRate != 0.0). */ final IBloomFilter bloomFilter; /** * When <code>true</code> record level checksums will be used in the * generated file. * * FIXME This cannot be enabled until we factor out the direct use of the * {@link WriteCache} since special handling is otherwise required to ensure * that the checksum makes it into the output record when we write directly * on the disk. * * FIXME When enabling this, make sure that the bloom filter, * {@link IndexMetadata}, and the blobs are all checksummed and make sure * that the {@link IndexSegmentStore} verifies the checksums when it reads * through to the disk and only returns the raw record w/o the trailing * checksum. * * FIXME The right time to reconcile these things may be when this branch * (HAJournal) is merged with the dynamic shard refactor branch. */ final private boolean useChecksums = false; /** * Used to compute record level checksums when {@link #useChecksums} is * <code>true</code>. */ final private ChecksumUtility checker = new ChecksumUtility(); /** * The file on which the {@link IndexSegment} is written. The file is closed * regardless of the outcome of the operation. */ protected RandomAccessFile out = null; /** * The {@link IndexSegmentCheckpoint} record written on the * {@link IndexSegmentStore}.
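 * <p>
 * A hedged usage sketch (illustrative only; the builder instance and the
 * free variables below would come from one of the
 * <code>newInstance(...)</code> factories defined later in this class): the
 * checkpoint is the value returned by {@link #call()} and remains available
 * afterwards from {@link #getCheckpoint()}.
 * <pre>{@code
 * final IndexSegmentBuilder builder = IndexSegmentBuilder.newInstance(
 *         outFile, tmpDir, entryCount, entryIterator, m, metadata,
 *         commitTime, true, // compactingMerge
 *         true);            // bufferNodes
 * final IndexSegmentCheckpoint cp = builder.call();
 * assert cp == builder.getCheckpoint();
 * }</pre>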
*/ private IndexSegmentCheckpoint checkpoint; /** * The {@link IndexSegmentCheckpoint} record written on the * {@link IndexSegmentStore}. */ public IndexSegmentCheckpoint getCheckpoint() { return checkpoint; } // /** // * The buffer used to hold leaves so that they can be evicted en masse onto a // * region of the {@link #outFile}. // * // * @deprecated This forces us to do IO twice for the leaves. They should be // * explicitly double-buffered in memory (the last leaf and the // * current leaf) and evicted directly onto {@link #out}. This // * will remove the requirement for the {@link IUpdateStore} API // * on the {@link TemporaryRawStore} and on the // * {@link DiskOnlyStrategy}. A r/w store version of the // * {@link TemporaryRawStore} could be deployed which supports // * update if that becomes important. // */ // private TemporaryRawStore leafBuffer; /** * This is used to buffer the leaves written onto the output file for * greater efficiency. * * FIXME Use a WriteCacheService which will hide this complexity and give * better throughput. */ private WriteCache.FileChannelWriteCache leafWriteCache; /** * Class combines the address at which a node is written onto the output * file (relative to the start of the nodes region) with the coded data * record for the node. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan * Thompson</a> * @version $Id$ */ private static class NodeMetadata { public final long addr; public final INodeData data; public NodeMetadata(final long addr, final INodeData data) { this.addr = addr; this.data = data; } } /** * The buffer used to hold nodes so that they can be evicted en masse onto a * region of the {@link #outFile}. This is conditionally enabled depending * on whether {@link #bufferNodes} is true. */ private TemporaryRawStore nodeBuffer; /** * When the nodes are to be fully buffered they are added into this list in * the order in which they are generated. */ private List<NodeMetadata> nodeList; /** * When <code>true</code> the generated nodes will be fully buffered in RAM. * Otherwise they will be buffered on the {@link #nodeBuffer} and then * transferred to the output file en masse. */ final protected boolean bufferNodes; /** * The optional buffer used to hold records referenced by index entries. * This is opened if the index uses raw records -or- if the index specifies * an {@link IOverflowHandler}. */ private TemporaryRawStore blobBuffer; private final IOverflowHandler overflowHandler; /** * The encoded address of the first leaf written on the * {@link IndexSegmentStore} (there is always at least one, even if it is * the root leaf). * <p> * Note: A copy of this value is preserved by * {@link IndexSegmentCheckpoint#addrFirstLeaf}. */ private long addrFirstLeaf = 0L; /** * The encoded address of the last leaf written on the * {@link IndexSegmentStore} (there is always at least one, even if it is * the root leaf). * <p> * Note: A copy of this value is preserved by * {@link IndexSegmentCheckpoint#addrLastLeaf}. */ private long addrLastLeaf = 0L; // /** // * The offset in the output file of the last leaf written onto that file. // * Together with {@link #lastLeafSize} this is used to compute the // * address of the prior leaf. // */ // long lastLeafOffset = -1L; // // /** // * The size in bytes of the last leaf written onto the output file (the size // * of the compressed record that is actually written onto the output file // * NOT the size of the serialized leaf before it is compressed).
Together // * with {@link #lastLeafOffset} this is used to compute the address of the // * prior leaf. // */ // int lastLeafSize = -1; /** * Tracks the maximum length of any serialized node or leaf. This is used * to fill in one of the {@link IndexSegmentCheckpoint} fields. */ int maxNodeOrLeafLength = 0; /** * The #of tuples written for the output tree. */ long ntuplesWritten; /** * The #of nodes written for the output tree. This will be zero if all * entries fit into a root leaf. */ int nnodesWritten = 0; /** * The #of leaves written for the output tree. */ int nleavesWritten = 0; /** * The #of nodes or leaves that have been written out in each level of the * tree. * * @see IndexSegmentPlan#numInLevel */ final int writtenInLevel[]; /** * The stack of nodes that are currently being populated. The first N-1 * elements in this array are always nodes while the last element is always * a leaf ({@link #leaf} is the same reference as the last element in this * array). The nodes and the leaf in this array are reused rather than being * reallocated. */ final AbstractSimpleNodeData[] stack; /** * The current leaf that is being populated from the source btree. This leaf * is reused for each output leaf rather than being reallocated. In the * degenerate case when the output btree is a single root leaf then this * will be that leaf. This reference is always the same as the last * reference in {@link #stack}. */ final SimpleLeafData leaf; /** * The plan for building the B+-Tree. */ final public IndexSegmentPlan plan; /** * The timestamp in milliseconds when {@link #call()} was invoked. */ private long begin_build; /** * The timestamp in milliseconds when {@link #call()} was invoked -or- * ZERO (0L) if {@link #call()} has not been invoked. */ public long getStartTime() { return begin_build; } /** * The time to set up the index build, including the generation of the index * plan and the initialization of some helper objects. */ public final long elapsed_setup; /** * The time to write the nodes and leaves into their respective buffers, not * including the time to transfer those buffers onto the output file. */ public long elapsed_build; /** * The time to write the nodes and leaves from their respective buffers * onto the output file and sync and close that output file. */ public long elapsed_write; /** * The process runtime in milliseconds. */ public long elapsed; /** * The data throughput rate in megabytes per second. */ public float mbPerSec; /** * Builder factory will build an {@link IndexSegment} from an index * (partition). Delete markers are propagated to the {@link IndexSegment} * unless <i>compactingMerge</i> is <code>true</code>. * * @param src * A view of the index partition as of the <i>createTime</i>. * When <i>compactingMerge</i> is <code>false</code> then this * MUST be a single {@link BTree} since incremental builds are * only supported for a {@link BTree} source while compacting * merges are defined for any {@link IIndex}. * @param outFile * The file on which the {@link IndexSegment} will be written. * The file MAY exist, but if it exists then it MUST be empty. * @param compactingMerge * When <code>true</code> the caller asserts that <i>src</i> is a * {@link FusedView} and deleted index entries WILL NOT be * included in the generated {@link IndexSegment}.
Otherwise, it * is assumed that only select component(s) of the index * partition view are being exported onto an {@link IndexSegment} * and deleted index entries will therefore be propagated to the * new {@link IndexSegment} (aka an incremental build). * @param createTime * The commit time associated with the view from which the * {@link IndexSegment} is being generated. This value is written * into {@link IndexSegmentCheckpoint#commitTime}. * @param fromKey * The lowest key that will be included (inclusive). When * <code>null</code> there is no lower bound. * @param toKey * The first key that will be included (exclusive). When * <code>null</code> there is no upper bound. * * @return An object which can be used to construct the {@link IndexSegment}. * * @throws IOException */ public static IndexSegmentBuilder newInstance( final ILocalBTreeView src, final File outFile, final File tmpDir, final boolean compactingMerge, final long createTime, final byte[] fromKey, final byte[] toKey) throws IOException { if (src == null) throw new IllegalArgumentException(); if (outFile == null) throw new IllegalArgumentException(); if (tmpDir == null) throw new IllegalArgumentException(); if (createTime <= 0L) throw new IllegalArgumentException(); // The output branching factor. final int m = src.getIndexMetadata().getIndexSegmentBranchingFactor(); // a fast range count, which can overestimate the #of tuples in the view. final long fastRangeCount = src.rangeCount(fromKey, toKey); // a fast summary of the view. final ViewStatistics stats = new ViewStatistics(src); // 2x the nominal size of an index shard (200M). final long MAX_SIZE_ON_DISK = Bytes.megabyte * 200 * 2; // ~2x the nominal size of a 200M index shard in tuples at 50 bytes/tuple. final long MAX_TUPLES_IN_VIEW = Bytes.megabyte * 8; /* * FIXME I have temporarily disabled this as it appears to be slower to * fully buffer the data on the current test cluster.... I will look * into this further as soon as I get a good baseline on that cluster. * * Ah. The problem is likely to be Java heap pressure. The one pass * approach might have to use the MemoryManager in order for us to * realize the efficiency obtained from a single IO pass, especially since * the two pass approach is already benefiting from the file system * cache. */ if (false && stats.sumSegBytes < MAX_SIZE_ON_DISK && fastRangeCount < MAX_TUPLES_IN_VIEW) { /* * This fully buffers the tuples in RAM, computing the exact range * count as it goes. This is therefore more efficient since it * avoids a 2nd pass over the source view to read the tuples. */ return newInstanceFullyBuffered(src, outFile, tmpDir, m, compactingMerge, createTime, fromKey, toKey, true/* bufferNodes */); } else { /* * There is so much data that we cannot materialize it into RAM. */ return newInstanceTwoPass(src, outFile, tmpDir, m, compactingMerge, createTime, fromKey, toKey, false/* bufferNodes */); } } /** * A two pass build algorithm. The first pass is used to obtain an exact * entry count for the view. Based on that exact range count we can compute * a plan for a balanced B+Tree. A second pass over the view is required to * populate the output B+Tree. This flavor also buffers the leaves and nodes * on temporary stores, which means that it does more IO. However, this * version is capable of processing very large source views.
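 * <p>
 * In outline (a hedged sketch; the method body below is authoritative and
 * the variable names here are illustrative):
 * <pre>{@code
 * // Pass 1: exact range count (deleted tuples are included for an
 * // incremental build and excluded for a compacting merge).
 * final long n = compactingMerge
 *         ? src.rangeCountExact(fromKey, toKey)
 *         : src.rangeCountExactWithDeleted(fromKey, toKey);
 * // Pass 2: re-visit the tuples; the builder derives an exact
 * // IndexSegmentPlan for a balanced B+Tree from that count.
 * final ITupleIterator<?> itr = src.rangeIterator(fromKey, toKey,
 *         0, // capacity
 *         compactingMerge ? IRangeQuery.DEFAULT
 *                 : IRangeQuery.DEFAULT | IRangeQuery.DELETED,
 *         null); // filter
 * }</pre>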
*/ protected static IndexSegmentBuilder newInstanceTwoPass( final ILocalBTreeView src, final File outFile, final File tmpDir, final int m, final boolean compactingMerge, final long createTime, final byte[] fromKey, final byte[] toKey, final boolean bufferNodes) throws IOException { if (src == null) throw new IllegalArgumentException(); if (outFile == null) throw new IllegalArgumentException(); if (tmpDir == null) throw new IllegalArgumentException(); if (createTime <= 0L) throw new IllegalArgumentException(); // the exact range count. final int nentries; // the flags that will be used to obtain the desired tuples. final int flags; if (compactingMerge) { /* * For a compacting merge the delete markers are ignored so they * will NOT be transferred to the new index segment. */ flags = IRangeQuery.DEFAULT; final long n = src.rangeCountExact(fromKey, toKey); if (n > Integer.MAX_VALUE) { throw new UnsupportedOperationException(ERR_TOO_MANY_TUPLES); } nentries = (int) n; } else { /* * For an incremental build the deleted tuples are propagated to * the new index segment. This is required in order for the fact * that those tuples were deleted as of the commitTime to be * retained by the generated index segment. */ flags = IRangeQuery.DEFAULT | IRangeQuery.DELETED; final long n = src.rangeCountExactWithDeleted(fromKey, toKey); if (n > Integer.MAX_VALUE) { throw new UnsupportedOperationException(ERR_TOO_MANY_TUPLES); } nentries = (int) n; } /* * Iterator reading the source tuples to be copied to the index * segment. * * Note: The DELETED flag was set above unless this is a compacting * merge. That is necessary to ensure that deleted tuples are * preserved when the index segment does not reflect the total * history of a view. */ // source iterator. final ITupleIterator<?> itr = src.rangeIterator(fromKey, toKey, 0/* capacity */, flags, null/* filter */); // metadata for that index / index partition. final IndexMetadata indexMetadata = src.getIndexMetadata(); // Setup the index segment build operation. return IndexSegmentBuilder.newInstance(// outFile, // tmpDir, // nentries, // exact range count itr, // source iterator m, // the output branching factor. indexMetadata,// createTime,// compactingMerge,// bufferNodes// ); } /** * A one pass algorithm which materializes the tuples in RAM, computing the * exact tuple count as it goes. This is faster than the two-pass algorithm * and is a better choice when the source view and the output index segment * are within the normal ranges for an index partition, e.g., an output * index segment file of ~200M on the disk. * * FIXME The unit tests need to run against both builds based on the * materialized tuples and builds based on two passes in order to obtain the * exact range count. They already do for * {@link TestIndexSegmentBuilderWithLargeTrees} but not yet for the other * test suite variants. */ protected static IndexSegmentBuilder newInstanceFullyBuffered( final ILocalBTreeView src, final File outFile, final File tmpDir, final int m, final boolean compactingMerge, final long createTime, final byte[] fromKey, final byte[] toKey, final boolean bufferNodes) throws IOException { if (src == null) throw new IllegalArgumentException(); if (outFile == null) throw new IllegalArgumentException(); if (tmpDir == null) throw new IllegalArgumentException(); if (createTime <= 0L) throw new IllegalArgumentException(); // metadata for that index / index partition. 
final IndexMetadata indexMetadata = src.getIndexMetadata(); final long fastRangeCount = src.rangeCount(fromKey, toKey); /* * If the fast range count and the size on the disk of the segments in * the view are reasonable, then eagerly materialize the tuples into an * IRaba[] dimensioned to the fast range count and then wrap the data * with an iterator and run the normal build. */ final boolean hasVersionTimestamps = indexMetadata .getVersionTimestamps(); final boolean hasDeleteMarkers = indexMetadata.getDeleteMarkers(); final boolean hasRawRecords = indexMetadata.getRawRecords(); // A temporary leaf used to buffer the data in RAM. final MutableLeafData tleaf = new MutableLeafData((int) fastRangeCount, hasVersionTimestamps, hasDeleteMarkers, hasRawRecords); final int flags; if (compactingMerge) { /* * For a compacting merge the delete markers are ignored so they * will NOT be transferred to the new index segment. */ flags = IRangeQuery.DEFAULT; } else { /* * For an incremental build the deleted tuples are propagated to the * new index segment. This is required in order for the fact that * those tuples were deleted as of the commitTime to be retained by * the generated index segment. */ flags = IRangeQuery.DEFAULT | IRangeQuery.DELETED; } /* * Iterator reading the source tuples to be copied to the index segment. * * Note: The DELETED flag was set above unless this is a compacting * merge. That is necessary to ensure that deleted tuples are preserved * when the index segment does not reflect the total history of a view. * * The tuples are materialized and buffered in a single, and potentially * very large, leaf. That is Ok since the MutableLeaf is using very * simple data structures. * * @todo The fastRangeCount is a hint that we want to eagerly * materialize all of the data. This hint should be turned into * pre-fetch and into a single IO for the index segment leaves if they * are not in memory. [In fact, the hint is completely ignored at this * point. If hints get more weight, then review code for their use.] */ final ITupleIterator<?> titr = src.rangeIterator(fromKey, toKey, (int) fastRangeCount/* capacity */, flags, null/* filter */); int i = 0; // init per API specification. long minimumVersionTimestamp = Long.MAX_VALUE; long maximumVersionTimestamp = Long.MIN_VALUE; while (titr.hasNext()) { final ITuple<?> tuple = titr.next(); tleaf.keys.keys[i] = tuple.getKey(); if (hasVersionTimestamps) { final long t = tuple.getVersionTimestamp(); tleaf.versionTimestamps[i] = t; if (t < minimumVersionTimestamp) { minimumVersionTimestamp = t; } if (t > maximumVersionTimestamp) { maximumVersionTimestamp = t; } } if (hasDeleteMarkers && tuple.isDeletedVersion()) { /* * Note: When delete markers are used, the array will be * pre-populated with [false] so we only have to set the flag on * the tuples that are actually deleted. */ tleaf.deleteMarkers[i] = true; } else { /* * Note: If the source has raw records for some values, then * this will cause those records to be materialized within the * single massive root leaf. From there, the data will be * written onto the index segment file. */ tleaf.vals.values[i] = tuple.getValue(); } i++; } tleaf.keys.nkeys = i; // note final #of tuples. tleaf.vals.nvalues = i; // note final #of tuples. tleaf.maximumVersionTimestamp = maximumVersionTimestamp; tleaf.minimumVersionTimestamp = minimumVersionTimestamp; // The exact range count. final int nentries = i; // The source iterator (reading on the fully buffered tuples). 
final ITupleIterator<?> itr = new MyTupleIterator(tleaf, flags); // Setup the index segment build operation. return IndexSegmentBuilder.newInstance(// outFile, // tmpDir, // nentries, // exact range count itr, // source iterator m, // the output branching factor. indexMetadata,// createTime,// compactingMerge,// bufferNodes// ); } /** * Variant using an array of objects in the desired order. A single root * leaf is generated from those objects. The root leaf is then fed into the * algorithm to efficiently construct the corresponding read-only * {@link IndexSegment}. * * @param a * The array of objects to be written onto the index. The index * must know how to generate tuples from these objects. The * objects must already be in the natural order of the keys that * will be generated for those tuples. * @param alen * The #of elements in that array. * @param indexMetadata * The {@link IndexMetadata} that will serve as the template for * the generated {@link IndexSegment}. * @param outFile * The file on which the {@link IndexSegment} will be written. * The file MAY exist, but if it exists then it MUST be empty. * @param tmpDir * The temporary directory in which data are buffered during the build * (optional - the default temporary directory is used if this is * <code>null</code>). * @param m * The branching factor for the generated {@link IndexSegment}. * @param compactingMerge * When <code>true</code> the caller asserts that <i>src</i> is a * {@link FusedView} and deleted index entries WILL NOT be * included in the generated {@link IndexSegment}. Otherwise, it * is assumed that only select component(s) of the index * partition view are being exported onto an {@link IndexSegment} * and deleted index entries will therefore be propagated to the * new {@link IndexSegment} (aka an incremental build). * @param createTime * The commit time associated with the view from which the * {@link IndexSegment} is being generated. This value is written * into {@link IndexSegmentCheckpoint#commitTime}. * @param bufferNodes * When <code>true</code> the generated nodes will be fully * buffered in RAM (faster, but imposes a memory constraint). * Otherwise they will be written onto a temporary file and then * transferred to the output file en masse. * @return * @throws IOException * * TODO We could pass a flag indicating whether the leaf needs * to be sorted after it is generated, but the caller would * still be responsible for ensuring that there are no * duplicates in the array. */ // * @param fromKey // * The lowest key that will be included (inclusive). When // * <code>null</code> there is no lower bound. // * @param toKey // * The first key that will be included (exclusive). When // * <code>null</code> there is no upper bound.
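// A hedged usage sketch for this Object[] variant (the names below are
// illustrative, not part of this class). The objects MUST already be in
// key order and contain no duplicates:
//
//     final Object[] rows = ...; // application objects, sorted by key
//     final IndexSegmentBuilder b = IndexSegmentBuilder.newInstance(rows,
//             rows.length, indexMetadata, outFile, tmpDir,
//             indexMetadata.getIndexSegmentBranchingFactor(),
//             true/* compactingMerge */, commitTime/* createTime */,
//             true/* bufferNodes */);
//     b.call(); // writes the IndexSegment onto outFile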
@SuppressWarnings("unchecked") public static IndexSegmentBuilder newInstance( final Object[] a, final int alen, final IndexMetadata indexMetadata, final File outFile, final File tmpDir, final int m, final boolean compactingMerge, final long createTime, //final byte[] fromKey, final byte[] toKey, final boolean bufferNodes) throws IOException { if (a == null) throw new IllegalArgumentException(); if (alen < 0) throw new IllegalArgumentException(); if (alen > a.length) throw new IllegalArgumentException(); if (indexMetadata == null) throw new IllegalArgumentException(); if (outFile == null) throw new IllegalArgumentException(); if (tmpDir == null) throw new IllegalArgumentException(); if (createTime <= 0L) throw new IllegalArgumentException(); final boolean hasVersionTimestamps = indexMetadata .getVersionTimestamps(); if (hasVersionTimestamps) throw new IllegalArgumentException( "versionTimestamps not available in source []."); final boolean hasDeleteMarkers = indexMetadata.getDeleteMarkers(); if (hasDeleteMarkers && !compactingMerge) throw new IllegalArgumentException( "deleteMarkers not available in source []."); final boolean hasRawRecords = indexMetadata.getRawRecords(); // A temporary leaf used to buffer the data in RAM. final MutableLeafData tleaf = new MutableLeafData(alen, hasVersionTimestamps, hasDeleteMarkers, hasRawRecords); final int flags; if (compactingMerge) { /* * For a compacting merge the delete markers are ignored so they * will NOT be transferred to the new index segment. */ flags = IRangeQuery.DEFAULT; } else { /* * For an incremental build the deleted tuples are propagated to the * new index segment. This is required in order for the fact that * those tuples were deleted as of the commitTime to be retained by * the generated index segment. */ flags = IRangeQuery.DEFAULT | IRangeQuery.DELETED; } /* * Iterator reading the source tuples to be copied to the index segment. * * Note: The DELETED flag was set above unless this is a compacting * merge. That is necessary to ensure that deleted tuples are preserved * when the index segment does not reflect the total history of a view. * * The tuples are materialized and buffered in a single, and potentially * very large, leaf. That is Ok since the MutableLeaf is using very * simple data structures. * * @todo The fastRangeCount is a hint that we want to eagerly * materialize all of the data. This hint should be turned into * pre-fetch and into a single IO for the index segment leaves if they * are not in memory. [In fact, the hint is completely ignored at this * point. If hints get more weight, then review code for their use.] */ // final ITupleIterator<?> titr = src.rangeIterator(fromKey, toKey, // (int) fastRangeCount/* capacity */, flags, null/* filter */); // init per API specification. long minimumVersionTimestamp = Long.MAX_VALUE; long maximumVersionTimestamp = Long.MIN_VALUE; @SuppressWarnings("rawtypes") final ITupleSerializer tupSer = indexMetadata.getTupleSerializer(); for (int i = 0; i < alen; i++) { // final ITuple<?> tuple = titr.next(); tleaf.keys.keys[i] = tupSer.serializeKey(a[i]); // Note: Version timestamps are not available from a[]. // if (hasVersionTimestamps) { // // final long t = tuple.getVersionTimestamp(); // // tleaf.versionTimestamps[i] = t; // // if (t < minimumVersionTimestamp) { // // minimumVersionTimestamp = t; // // } // // if (t > maximumVersionTimestamp) { // // maximumVersionTimestamp = t; // // } // // } // Note: delete markers are not available from a[]. 
// if (hasDeleteMarkers && tuple.isDeletedVersion()) { // // /* // * Note: When delete markers are used, the array will be // * pre-populated with [false] so we only have to set the flag on // * the tuples that are actually deleted. // */ // tleaf.deleteMarkers[i] = true; // // } else { /* * Note: If the source has raw records for some values, then * this will cause those records to be materialized within the * single massive root leaf. From there, the data will be * written onto the index segment file. */ tleaf.vals.values[i] = tupSer.serializeVal(a[i]); // } // i++; } tleaf.keys.nkeys = alen; // note final #of tuples. tleaf.vals.nvalues = alen; // note final #of tuples. tleaf.maximumVersionTimestamp = maximumVersionTimestamp; tleaf.minimumVersionTimestamp = minimumVersionTimestamp; // The exact range count. final int nentries = alen; // The source iterator (reading on the fully buffered tuples). @SuppressWarnings("rawtypes") final ITupleIterator<?> itr = new MyTupleIterator(tleaf, flags); // Setup the index segment build operation. return IndexSegmentBuilder.newInstance(// outFile, // tmpDir, // nentries, // exact range count itr, // source iterator m, // the output branching factor. indexMetadata,// createTime,// compactingMerge,// bufferNodes// ); } /** * <p> * A more flexible factory for an {@link IndexSegment} build which permits * override of the index segment branching factor, replacement of the * {@link IndexMetadata}, and the use of the caller's iterator. * </p> * <p> * Note: The caller must determine whether or not deleted index entries are * present in the view. The <i>entryCount</i> MUST be the exact #of index * entries that are visited by the given iterator. In general, this is not * difficult. However, if a compacting merge is desired (that is, if you are * trying to generate a view containing only the non-deleted entries) then * you MUST explicitly count the #of entries that will be visited by the * iterator, i.e., it will require two passes over the iterator to set up the * index build operation. * </p> * <p> * Note: With a branching factor of 4096 a tree of height 2 (three levels) * could address 68,719,476,736 entries - well beyond what we want in a * given index segment! Well before that the index segment should be split * into multiple files. The split point should be determined by the size of * the serialized leaves and nodes, i.e., the amount of data on disk * required by the index segment and the amount of memory required to fully * buffer the index nodes. While the size of a serialized node can be * estimated easily, the size of a serialized leaf depends on the kinds of * values stored in that index. The actual sizes are recorded in the * {@link IndexSegmentCheckpoint} record in the header of the * {@link IndexSegment}. * </p> * * @param outFile * The file on which the index segment is written. The file MAY * exist but MUST have zero length if it does exist (this permits * you to use the temporary file facility to create the output * file). * @param tmpDir * The temporary directory in which data are buffered during the build * (optional - the default temporary directory is used if this is * <code>null</code>). * @param entryCount * The #of entries that will be visited by the iterator. This * MUST be an exact range count. * @param entryIterator * Visits the index entries in key order that will be written * onto the {@link IndexSegment}. * @param m * The branching factor for the generated tree.
This can be * chosen with an eye to minimizing the height of the generated * tree. (Small branching factors are permitted for testing, but * generally you want something relatively large.) * @param metadata * The metadata record for the source index. A copy will be made * of this object. The branching factor in the generated tree * will be overridden to <i>m</i>. * @param commitTime * The commit time associated with the view from which the * {@link IndexSegment} is being generated. This value is written * into {@link IndexSegmentCheckpoint#commitTime}. * @param compactingMerge * <code>true</code> iff the generated {@link IndexSegment} will * incorporate all state for the source index (partition) as of * the specified <i>commitTime</i>. This flag is written into the * {@link IndexSegmentCheckpoint} but does not otherwise affect * the build process. * @param bufferNodes * When <code>true</code> the generated nodes will be fully * buffered in RAM (faster, but imposes a memory constraint). * Otherwise they will be written onto a temporary file and then * transferred to the output file en masse. * * @throws IOException */ public static IndexSegmentBuilder newInstance(// final File outFile,// final File tmpDir,// final long entryCount,// final ITupleIterator<?> entryIterator, // final int m,// final IndexMetadata metadata,// final long commitTime,// final boolean compactingMerge,// final boolean bufferNodes// ) throws IOException { return new IndexSegmentBuilder(outFile, tmpDir, entryCount, entryIterator, m, metadata, commitTime, compactingMerge, bufferNodes); } /** * <p> * Designated constructor sets up a build of an {@link IndexSegment} for * some caller defined read-only view. * </p> * <p> * Note: The caller must determine whether or not deleted index entries are * present in the view. The <i>entryCount</i> MUST be the exact #of index * entries that are visited by the given iterator. In general, this is not * difficult. However, if a compacting merge is desired (that is, if you are * trying to generate a view containing only the non-deleted entries) then * you MUST explicitly count the #of entries that will be visited by the * iterator, i.e., it will require two passes over the iterator to set up the * index build operation. * </p> * <p> * Note: With a branching factor of 4096 a tree of height 2 (three levels) * could address 68,719,476,736 entries - well beyond what we want in a * given index segment! Well before that the index segment should be split * into multiple files. The split point should be determined by the size of * the serialized leaves and nodes, i.e., the amount of data on disk * required by the index segment and the amount of memory required to fully * buffer the index nodes. While the size of a serialized node can be * estimated easily, the size of a serialized leaf depends on the kinds of * values stored in that index. The actual sizes are recorded in the * {@link IndexSegmentCheckpoint} record in the header of the * {@link IndexSegment}. * </p> * * @param outFile * The file on which the index segment is written. The file MAY * exist but MUST have zero length if it does exist (this permits * you to use the temporary file facility to create the output * file). * @param tmpDir * The temporary directory in which data are buffered during the build * (optional - the default temporary directory is used if this is * <code>null</code>). * @param entryCount * The #of entries that will be visited by the iterator. This * MUST be an exact range count.
* @param entryIterator * Visits the index entries in key order that will be written * onto the {@link IndexSegment}. * @param m * The branching factor for the generated tree. This can be * chosen with an eye to minimizing the height of the generated * tree. (Small branching factors are permitted for testing, but * generally you want something relatively large.) * @param metadata * The metadata record for the source index. A copy will be made * of this object. The branching factor in the generated tree * will be overridden to <i>m</i>. * @param commitTime * The commit time associated with the view from which the * {@link IndexSegment} is being generated. This value is written * into {@link IndexSegmentCheckpoint#commitTime}. * @param compactingMerge * <code>true</code> iff the generated {@link IndexSegment} will * incorporate all state for the source index (partition) as of * the specified <i>commitTime</i>. This flag is written into the * {@link IndexSegmentCheckpoint} but does not otherwise affect * the build process. * @param bufferNodes * When <code>true</code> the generated nodes will be fully * buffered in RAM (faster, but imposes a memory constraint). * Otherwise they will be written onto a temporary file and then * transferred to the output file en masse. * * @throws IOException */ protected IndexSegmentBuilder(// final File outFile,// final File tmpDir,// final long entryCount,// final ITupleIterator<?> entryIterator, // final int m,// IndexMetadata metadata,// final long commitTime,// final boolean compactingMerge,// final boolean bufferNodes// ) throws IOException { if (outFile == null) throw new IllegalArgumentException(); if (tmpDir == null) throw new IllegalArgumentException(); if (entryCount < 0) throw new IllegalArgumentException(); // if (entryCount == 0 && !compactingMerge) { // // /* // * Note: A zero entry count is allowed for a compacting merge. This // * can arise when all tuples in an index (partition) have been // * deleted. It is impossible to detect this condition before we // * explicitly range count the tuples (including any delete markers). // * Rather than forcing the caller to handle this via a thrown // * exception it is significantly easier to generate an empty // * IndexSegment. // */ // // throw new IllegalArgumentException(); // // } if (entryCount == 0 && log.isInfoEnabled()) log.info(ERR_NO_TUPLES); if (entryIterator == null) throw new IllegalArgumentException(); if (commitTime <= 0L) throw new IllegalArgumentException(); final long begin_setup = System.currentTimeMillis(); // @todo New files SHOULD use record level checksums. // this.useChecksums = false; // the UUID assigned to this index segment file. this.segmentUUID = UUID.randomUUID(); this.entryCount = entryCount; this.entryIterator = entryIterator; // the name of the index or the index partition. name = (metadata.getPartitionMetadata() == null)// // local index name (if any). ? metadata.getName() == null ? "N/A" : metadata.getName() // index partition name : metadata.getName() + "#" + metadata.getPartitionMetadata().getPartitionId(); /* * Make a copy of the caller's metadata. * * Note: The caller's reference is replaced by a reference to the clone * in order to avoid accidental modifications to the caller's metadata * object.
*/ this.metadata = metadata = metadata.clone(); { final LocalPartitionMetadata pmd = this.metadata.getPartitionMetadata(); if (pmd != null) { /* * Copy the local partition metadata, but do not include the * resource metadata identifying the resources that comprise the * index partition view. That information is only stored on the * BTree, not on the IndexSegment. */ this.metadata.setPartitionMetadata( new LocalPartitionMetadata(// pmd.getPartitionId(),// pmd.getSourcePartitionId(),// pmd.getLeftSeparatorKey(),// pmd.getRightSeparatorKey(),// null, // No resource metadata for indexSegment. pmd.getIndexPartitionCause() // ,pmd.getHistory()+ // "build("+pmd.getPartitionId()+",compactingMerge="+compactingMerge+") " ) ); } } // true iff the source index is isolatable. this.isolatable = metadata.isIsolatable(); /* * true iff the source index maintains tuple revision timestamps. */ this.versionTimestamps = metadata.getVersionTimestamps(); /* * true iff the source index supports delete markers (but they will be * copied IFF this is an incremental build). */ this.deleteMarkers = metadata.getDeleteMarkers(); /* * true iff the source index supports raw records. Raw records will be * copied into the BLOBS region of the index segment and the address of * the raw record in the output tuple will be updated to reflect the * relative address of the record within the index segment. */ this.rawRecords = metadata.getRawRecords(); /* * Buffer used to encode addresses into the tuple value for a mutable * B+Tree. */ this.recordAddrBuf = rawRecords ? new ByteArrayBuffer(Bytes.SIZEOF_LONG) : null; this.commitTime = commitTime; this.compactingMerge = compactingMerge; this.bufferNodes = bufferNodes; /* * Override the branching factor on the index segment. * * Note: This override is a bit dangerous since it might propagate back * to the mutable btree, which could hurt performance through the use of * a too large branching factor on the journal. However, the metadata * index stores the template metadata for the scale-out index and if you * use either that or the metadata record from an existing BTree then * this should never be a problem. */ this.metadata.setBranchingFactor(m); /* * @todo The override of the BTree class name does not make much sense * here. Either we should strongly discourage further subclassing of * BTree and IndexSegment or we should allow the subclass to be named * for both the mutable btree and the read-only index segment. */ this.metadata.setBTreeClassName(IndexSegment.class.getName()); this.addressManager = new WormAddressManager(offsetBits); /* * The INodeData cache for the generated index segment store. * * @todo LIRS: The index segment builder should perhaps only drive into * the shared LRU those records which were already hot. Figuring this * out will break encapsulation. Since the branching factor is not the * same, and since the source is a view, "hot" has to be interpreted in * terms of key ranges which are hot. As a workaround in a memory * limited system you can configure the LRUNexus so that the build will * not drive the records into the cache. [LIRS would partly address this * by not evicting records from the cache which are hot.] */ // TODO BLZG-1501 (remove LRUNexus) storeCache = null; // storeCache = (LRUNexus.INSTANCE != null && LRUNexus // .getIndexSegmentBuildPopulatesCache()) // // ? LRUNexus.INSTANCE.getCache(segmentUUID, addressManager)// // : null// // ; /* * Create the index plan and do misc setup. */ { // Create a plan for generating the output tree.
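// The plan computes, for each level of the output tree, the #of
// nodes/leaves and the #of children/keys assigned to each, so that a
// node or leaf can be written out as soon as it has been filled.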
plan = new IndexSegmentPlan(m, entryCount); /* * Setup a stack of nodes (one per non-leaf level) and one leaf. * These are filled in based on the plan and the entries visited in * the source btree. Nodes and leaves are written out to their * respective channel each time they are complete as defined by the * plan given the #of children assigned to a node or the #of keys * assigned to a leaf. */ stack = new AbstractSimpleNodeData[plan.height + 1]; // Note: assumes defaults to all zeros. writtenInLevel = new int[plan.height + 1]; for (int h = 0; h < plan.height; h++) { final SimpleNodeData node = new SimpleNodeData(h, plan.m, versionTimestamps); node.max = plan.numInNode[h][0]; stack[h] = node; } // the output leaf (reused for each leaf we populate). leaf = new SimpleLeafData(plan.height, plan.m, metadata); leaf.max = entryCount == 0 ? 0 : plan.numInNode[plan.height][0]; stack[plan.height] = leaf; /* * Setup optional bloom filter. * * Note: For read-only {@link IndexSegment} we always know the #of * keys exactly at the time that we provision the bloom filter. This * makes it easy for us to tune the filter for a desired false * positive rate. * * Note: The bloom filter cannot be used with very large indices * due to the space requirements of the filter. However, very large * in this case is MAX_INT tuples! */ if (metadata.getBloomFilterFactory() != null && plan.nentries > 0 && plan.nentries < Integer.MAX_VALUE) { // the desired error rate for the bloom filter. final double p = metadata.getBloomFilterFactory().p; // create the bloom filter. bloomFilter = new BloomFilter((int) plan.nentries, p); } else { bloomFilter = null; } /* * Used to serialize the nodes and leaves for the output tree. */ nodeSer = new NodeSerializer(// /* * Note: It does not seem like there should be any * interaction between various IAddressSerializer strategies * and the manner in which we encode the region (BASE, NODE, * or BLOB) into the offset of addresses for the index * segment store. The offset is effectively left-shifted by * two bits to encode the region, thereby reducing the * maximum possible byte offset within any region (including * BASE). However, that should not pose problems for any * IAddressSerializer strategy as long as it accepts any * legal [byteCount] and [offset] - it is just that our * offsets are essentially 4x larger than they would be * otherwise. */ addressManager,// NOPNodeFactory.INSTANCE,// plan.m,// the output branching factor. 0, // initialBufferCapacity - will be estimated. metadata, // false, // NOT read-only (we are using it for writing). metadata.getIndexSegmentRecordCompressorFactory() ); } this.overflowHandler = metadata.getOverflowHandler(); this.outFile = outFile; elapsed_setup = System.currentTimeMillis() - begin_setup; if (log.isInfoEnabled()) { log.info("name=" + name + ", nentries=" + entryCount + ", compactingMerge=" + compactingMerge); } } /** * Build the {@link IndexSegment} given the parameters specified to the * constructor. */ public IndexSegmentCheckpoint call() throws Exception { /* * Setup for IO. */ begin_build = System.currentTimeMillis(); if (outFile.exists() && outFile.length() != 0L) { throw new IllegalArgumentException("File exists and is not empty: " + outFile.getAbsoluteFile()); } final FileChannel outChannel; try { /* * Open the output channel. * * @todo get an exclusive lock (FileLock).
             */

            // out = FileLockUtility.openFile(outFile, mode, true/*useFileLock*/);
            out = new RandomAccessFile(outFile, mode);

            // the channel is used below to write the checkpoint record and
            // to force the file to disk.
            outChannel = out.getChannel();

            // if (outChannel.tryLock() == null) {
            //
            // throw new IOException("Could not lock file: "
            // + outFile.getAbsoluteFile());
            //
            // }

            // /*
            // * Open the leaf buffer. We only do this if there is at least a
            // * single root leaf, i.e., if the output tree is not empty.
            // */
            // leafBuffer = plan.nleaves > 0 ? new TemporaryRawStore(offsetBits)
            // : null;
            leafWriteCache = plan.nleaves == 0 ? null
                    : new WriteCache.FileChannelWriteCache(
                            IndexSegmentCheckpoint.SIZE, null/* buf */,
                            useChecksums, false/* isHighlyAvailable */,
                            false/* bufferHasData */, new NOPReopener(out),
                            0L/* fileExtent */);

            /*
             * Open the node buffer. We only do this if there will be at least
             * one node written, i.e., the output tree will consist of more
             * than just a root leaf.
             */
            if (plan.nnodes == 0) {
                // No nodes, so no buffering.
                nodeBuffer = null;
                nodeList = null;
            } else if (bufferNodes) {
                // Buffer the nodes in memory.
                nodeBuffer = null;
                nodeList = new LinkedList<NodeMetadata>();
            } else {
                // Buffer the nodes on a temporary file.
                nodeBuffer = new TemporaryRawStore(offsetBits);
                nodeList = null;
            }

            /*
             * Open buffer for blobs if an overflow handler was specified -or-
             * if the index is using raw records.
             */
            blobBuffer = (rawRecords || overflowHandler != null) //
                    ? new TemporaryRawStore(offsetBits) : null;

            /*
             * Generate the output B+Tree.
             */
            buildBTree();

            // Verify that all leaves were written out.
            assert plan.nleaves == nleavesWritten;

            // Verify that all nodes were written out.
            assert plan.nnodes == nnodesWritten;

            elapsed_build = System.currentTimeMillis() - begin_build;

            final long begin_write = System.currentTimeMillis();

            // write everything out on the outFile.
            checkpoint = writeIndexSegment(outChannel, commitTime);

            /*
             * Flush this channel to disk and close the channel. This also
             * releases our lock. We are done and the index segment is ready
             * for use.
             */
            outChannel.force(true);

            // FileLockUtility.closeFile(outFile, out);
            out.close(); // also releases the lock.
            //// out = null;

            elapsed_write = System.currentTimeMillis() - begin_write;

            /*
             * log run time.
             */
            elapsed = (System.currentTimeMillis() - begin_build)
                    + elapsed_setup;

            // data rate in MB/sec.
            mbPerSec = (elapsed == 0 ? 0 : checkpoint.length
                    / Bytes.megabyte32 / (elapsed / 1000f));

            if (log.isInfoEnabled()) {

                final NumberFormat cf = NumberFormat.getNumberInstance();

                cf.setGroupingUsed(true);

                final NumberFormat fpf = NumberFormat.getNumberInstance();

                fpf.setGroupingUsed(false);

                fpf.setMaximumFractionDigits(2);

                log.info("finished" + ": total(ms)=" + elapsed//
                        + "= setup(" + elapsed_setup + ")"//
                        + "+ build(" + elapsed_build + ")"//
                        + "+ write(" + elapsed_write + ")"//
                        + "; branchingFactor=" + plan.m//
                        + ", nentries=(" + ntuplesWritten + " actual, "
                        + plan.nentries + " plan)"//
                        + ", nnodes=(" + nnodesWritten + " actual, "
                        + plan.nnodes + " plan)"//
                        + ", nleaves=(" + nleavesWritten + " actual, "
                        + plan.nleaves + " plan)"//
                        + ", length="
                        + fpf.format(((double) checkpoint.length / Bytes.megabyte32))
                        + "MB" //
                        + ", rate=" + fpf.format(mbPerSec) + "MB/sec"//
                );

            }

            return checkpoint;

        } catch (Exception ex) {

            /*
             * Note: The output file is deleted if the build fails.
             */
            deleteOutputFile();

            // Re-throw exception
            throw ex;

        } catch (Throwable ex) {

            /*
             * Note: The output file is deleted if the build fails.
             */
            deleteOutputFile();

            // Masquerade exception.
            throw new RuntimeException(ex);

        } finally {

            // /*
            // * make sure that the temporary file gets deleted regardless.
// */ // if (leafBuffer != null && leafBuffer.isOpen()) { // try { // leafBuffer.close(); // also deletes the file if any. // } catch (Throwable t) { // log.warn(t,t); // } // } /* * make sure that the leaf write cache is closed regardless. */ if (leafWriteCache != null) { try { leafWriteCache.close(); } catch (Throwable t) { log.warn(t,t); } } /* * make sure that the temporary file gets deleted regardless. */ if (nodeBuffer != null && nodeBuffer.isOpen()) { try { nodeBuffer.close(); // also deletes the file if any. } catch (Throwable t) { log.warn(t,t); } } } } /** * Scan the source tuple iterator in key order writing output leaves onto * the index segment file with the new branching factor. We also track a * stack of nodes that are being written out concurrently on a temporary * channel. * <p> * The plan tells us the #of values to insert into each leaf and the #of * children to insert into each node. Each time a leaf becomes full * (according to the plan), we "close" the leaf, writing it out onto the * store and obtaining its "address". The "close" logic also takes care of * setting the address on the leaf's parent node (if any). If the parent * node becomes filled (according to the plan) then it is also "closed". * <p> * Each time (except the first) that we start a new leaf we record its first * key as a separatorKey in the appropriate parent node. * <p> * Note: The root may be a leaf as a degenerate case. * * @todo Verify correct rejection if the source iterator visits too many or * too few tuples. */ protected void buildBTree() { // // Flag used to flush the last leaf iff it is dirty. // boolean needsFlush = false; if (plan.nentries == 0) { /* * A single empty root leaf. */ leaf.reset(plan.numInNode[leaf.level][0]); flushNodeOrLeaf(leaf); return; } // For each leaf in the plan while tuples remain. for (int i = 0; i < plan.nleaves && entryIterator.hasNext(); i++) { /* * Fill in defined keys and values for this leaf. * * Note: Since the shortage (if any) is distributed by the plan from * the last leaf backward a shortage will cause [leaf] to have * key/val data that is not overwritten. This does not cause a * problem as long as [leaf.nkeys] is set correctly since only that * many key/val entries will actually be serialized. */ leaf.reset(plan.numInNode[leaf.level][i]); final int limit = leaf.max; // #of keys to fill in this leaf. // For each tuple allowed by the plan into the current leaf. for (int j = 0; j < limit && entryIterator.hasNext(); j++) { // Copy the tuple into the leaf. copyTuple(j, entryIterator.next()); // needsFlush = true; if (i > 0 && j == 0) { /* * Every time (after the first) that we enter a new leaf we * need to record its first key as a separatorKey in the * appropriate parent. * * Note: In the case where the parent of the previous leaf * is full, this actually ascends through the parent of the * previous leaf since the parent slot in the stack has not * yet been reset. This can be a different node than the * parent of this leaf, but only in the case when the parent * of the previous leaf was full. In that case, the * separatorKey is lifted into the parent's parent until an * open slot is found. While confusing, the separatorKey * always winds up in the correct node. */ addSeparatorKey(leaf); } } /* * Close the current leaf. This will write the address of the leaf * on the parent (if any). If the parent becomes full then the * parent will be closed as well. 
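             *
             * For example (illustrative): in a plan with three leaves under a
             * single root node, closing the third and final leaf records its
             * address as the root's third childAddr; the root then has all of
             * its planned children, so it is closed (and written out) in turn
             * by the same recursion.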
*/ flushNodeOrLeaf(leaf);//, !entryIterator.hasNext()); // needsFlush = false; } // if (needsFlush) { // // /* // * This flushes the last leaf when the plan was based on an over // * estimate of the range count of the source iterator. // */ // // flush(leaf, true/* exhausted */); // // } } /** * Copy a tuple into the current leaf at the given index. * * @param j * The index in the leaf to which the tuple will be copied. * @param tuple * The tuple. */ private void copyTuple(final int j, final ITuple<?> tuple) { if (ntuplesWritten == 0) { // Verify iterator is reporting necessary data. assertIteratorOk(tuple); } ntuplesWritten++; final MutableKeyBuffer keys = leaf.keys; assert keys.nkeys == j; keys.keys[j] = tuple.getKey(); if (deleteMarkers) leaf.deleteMarkers[j] = tuple.isDeletedVersion(); if (versionTimestamps) { final long t = tuple.getVersionTimestamp(); leaf.versionTimestamps[j] = t; if (t < leaf.minimumVersionTimestamp) leaf.minimumVersionTimestamp = t; if (t > leaf.maximumVersionTimestamp) leaf.maximumVersionTimestamp = t; } final byte[] val; if(deleteMarkers && tuple.isDeletedVersion()) { val = null; } else { if (overflowHandler != null) { /* * Provide the handler with the opportunity to copy * the blob's data onto the buffer and re-write the * value, which is presumably the blob reference. */ val = overflowHandler.handle(tuple, blobBuffer); } else { /* * Note: If the source index uses raw records then this will * return the materialized value from the raw record. */ val = tuple.getValue(); } } if (rawRecords) { final long maxRecLen = metadata.getMaxRecLen(); if (val != null && val.length > maxRecLen) { // write the value on the backing store. final long addr1 = blobBuffer.write(ByteBuffer.wrap(val)); // decode the offset and byte length of the record. final int nbytes = blobBuffer.getByteCount(addr1); final long offset = blobBuffer.getOffset(addr1); // recode as a relative address against the BLOBs region. final long addr = addressManager.toAddr(nbytes, IndexSegmentRegion.BLOB.encodeOffset(offset)); // save its address in the values raba. leaf.vals.values[j] = AbstractBTree.encodeRecordAddr( recordAddrBuf, addr); // flag as a raw record. leaf.rawRecords[j] = true; } else { leaf.vals.values[j] = val; leaf.rawRecords[j] = false; } } else { leaf.vals.values[j] = val; } if (bloomFilter != null) { /* * Note: We record the keys for deleted tuples in the * bloom filter. This is important since we need a * search of an ordered set of AbstractBTree sources for * a FusedView to halt as soon as it finds a delete * marker for a key. If we do not add the key for * deleted tuples to the bloom filter then the bloom * filter will report (incorrectly) that the key is not * in this IndexSegment. It is - with a delete marker. */ bloomFilter.add(keys.keys[j]); } keys.nkeys++; leaf.vals.nvalues++; } /** * This is invoked for the first tuple visited to make sure that the * iterator is reporting the data we need. */ private void assertIteratorOk(final ITuple<?> tuple) { if (!tuple.getKeysRequested()) throw new RuntimeException("keys not reported by itr."); if (!tuple.getValuesRequested()) throw new RuntimeException("vals not reported by itr."); if (!compactingMerge && deleteMarkers && ((tuple.flags() & IRangeQuery.DELETED) == 0)) { /* * This is an incremental build and the source index supports delete * markers but the iterator is not visiting deleted tuples. */ throw new RuntimeException("delete markers not reported by itr."); } /* * @todo I am not sure about this test. 
iterators should always report * the revision timestamp metadata. The real question is whether or not * they are reporting deleted tuples and that is tested above. [The * other question is whether we always need to report deleted tuples for * an isolatable index and that is what I am not sure about.] */ assert !isolatable || (isolatable && ((tuple.flags() & IRangeQuery.DELETED) == 0)) : "version metadata not reported by itr for isolatable index"; } /** * Used to make sure that the output file is deleted unless it was * successfully processed. */ private void deleteOutputFile() { if (out != null && out.getChannel().isOpen()) { try { // FileLockUtility.closeFile(outFile, out); out.close(); } catch (Throwable t) { log.error("Ignoring: " + t, t); } } if (!outFile.delete()) { log.warn("Could not delete: file=" + outFile.getAbsolutePath()); } if (storeCache != null) { /* * Clear the cache since the index segment store was not generated * successfully and the cache records will never be read. */ storeCache.clear(); } } /** * <p> * Flush a node or leaf that has been closed (no more data will be added). * </p> * <p> * Note: When a node or leaf is flushed we write it out to obtain its * address and set that address on its direct parent using * {@link #addChild(SimpleNodeData, long, AbstractSimpleNodeData, boolean)}. * This also updates the per-child counters of the #of entries spanned by a * node. * </p> * * @param node * The node to be flushed. */ protected void flushNodeOrLeaf(final AbstractSimpleNodeData node) { // final boolean exhausted) { final int h = node.level; // The index into the level for this node or leaf. final int col = writtenInLevel[h]; assert col < plan.numInLevel[h]; if (log.isDebugEnabled()) log.debug("closing " + (node.isLeaf() ? "leaf" : "node") + "; h=" + h + ", col=" + col + ", max=" + node.max + ", nkeys=" + node.keys.size()); /* * Note: Nodes are written out immediately. For a leaf, this allocates a * data record for the leaf and updates the last leaf's representation * to set the priorAddr and nextAddr fields. If the build is done then * the nextAddr field will remain 0L. * * Note: This will recursively invoke flush() if the parent Node is * full. * * Note: The node is not reset in the stack by this method so it will * remain available to getParent(), which we invoke next. */ final long addr = writeNodeOrLeaf(node);//, exhausted); // Lookup the parent of this leaf/node in the stack. final SimpleNodeData parent = getParent(node); if(parent != null) { addChild(parent, addr, node);//, exhausted); } // if (col + 1 < plan.numInLevel[h]) { // // int max = plan.numInNode[h][col + 1]; // // parent.reset(max); // // } writtenInLevel[h]++; } /** * Record the persistent address of a child on its parent and the #of * entries spanned by that child. If all children on the parent become * assigned then the parent is closed. * * @param parent * The parent. * @param childAddr * The address of the child (node or leaf). * @param child * The child reference. */ protected void addChild(final SimpleNodeData parent, final long childAddr, final AbstractSimpleNodeData child) { // #of entries spanned by this node. final long nentries = (child.isLeaf() ? child.getKeyCount() : ((INodeData) child).getSpannedTupleCount()); if (parent.nchildren == parent.max) { /* * If there are more nodes to be filled at this level then prepare * this node to receive its next values/children. 
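             *
             * E.g. (an illustrative trace, not taken from a real run): if
             * level h plans numInLevel[h] = 2 nodes with numInNode[h] =
             * {4, 3}, then after the 4th child is assigned the node is
             * flushed; the next addChild() at this level finds nchildren ==
             * max and resets the node here with max = 3 before accepting the
             * child.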
*/ resetNode(parent); } // assert parent.nchildren < parent.max; if(log.isDebugEnabled()) log.debug("setting " + (child.isLeaf() ? "leaf" : "node") + " as child(" + parent.nchildren + ")" + " at h=" + parent.level + ", col=" + writtenInLevel[parent.level] + ", addr=" + addressManager.toString(childAddr)); final int nchildren = parent.nchildren; parent.childAddr[nchildren] = childAddr; parent.childEntryCount[nchildren] = nentries; parent.nentries += nentries; if(versionTimestamps) { parent.minimumVersionTimestamp = Math.min( parent.minimumVersionTimestamp, child.minimumVersionTimestamp); parent.maximumVersionTimestamp = Math.max( parent.maximumVersionTimestamp, child.maximumVersionTimestamp); } parent.nchildren++; // final int h = parent.level; // if (exhausted // && child.isLeaf() //// && parent != null // // #of separator keys LT planned childCount for parent. // && (parent.keys.nkeys + 1) < plan.numInNode[h][writtenInLevel[h]]) { // // /* // * When the source iterator is exhausted before the expected #of // * tuples have been processed then the last leaf will be // * non-empty (we do not start a leaf unless there is at least // * one tuple on hand to copy into that leaf). Unless this is the // * root leaf, then its parent may lack a separator key since the // * separator key is chosen based on the first key to enter the // * next leaf and we will never generate that next leaf since // * there are no more tuples in the source iterator. This edge // * case is detected when the #of children in the parent of the // * last leaf is less than the #of planned children. Since we // * never saw the next planned leaf, we need to hack in a // * separator key for that leaf now so that queries LT the // * separator key are directed to the last leaf which we did see. // * This edge case is handled by adding a separatorKey based on // * successor(lastKey) to the parent of the last leaf. // */ // // final byte[] lastKey = leaf.keys.keys[leaf.keys.nkeys - 1]; // // final byte[] separatorKey = BytesUtil.successor(lastKey); // // parent.keys.keys[parent.keys.nkeys++] = separatorKey; //// addSeparatorKey(parent, separatorKey); // // /* // * @todo Note that the childAddr of the next leaf was already // * assigned since we allocate the leaf's record before it is // * populated, so we zero out that childAddr now. [The non-0L // * childAddr for this last leaf is not really a problem since it // * will never be visited by top-down navigation (the B+Tree will not // * have any data for keys GTE the successor key directing probes to // * that leaf). What is more important is that the // * IndexSegmentCheckpoint should not direct us to the empty last // * leaf and that the current leaf [node] should have nextAddr=0L so // * we never navigate to that last leaf. // * // * @todo Write more detailed unit tests for these points. // */ //// parent.childAddr[parent.keys.nkeys] = 0L; // // } if ( parent.nchildren == parent.max ) { /* * Flush the parent if the leaf/node is full. */ flushNodeOrLeaf(parent); } } /** * The {@link #stack} contains nodes which are reused for each node or leaf * at a given level in the generated B+Tree. This method prepares a node in * the stack for reuse. */ protected void resetNode(final SimpleNodeData parent) { final int h = parent.level; /* * The index into the level for this node. Note that we subtract one * since the node is full and was already "closed". 
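         * (Illustrative: once the first node at this level has been flushed,
         * writtenInLevel[h] == 1, so col == 0 refers back to that just-closed
         * node.)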
What we are * trying to figure out here is whether the node may be reset so as * to allow more children into what is effectively a new node or * whether there are no more nodes allowed at this level of the * output tree. */ final int col = writtenInLevel[h] - 1; if (col + 1 < plan.numInLevel[h]) { /* * Reset the Node in the stack. It will be reused for the next * Node at the same level in the B+Tree. */ parent.reset(plan.numInNode[h][col + 1]/*max*/); } else { /* * The data is driving us to populate more nodes in this level * than the plan allows for the output tree. This is either an * error in the control logic or an error in the plan. */ throw new AssertionError(); } } /** * Copies the first key of a new leaf as a separatorKey for the appropriate * parent (if any) of that leaf. This must be invoked when the first key is * set on that leaf. However, it must not be invoked on the first leaf. * * @param leaf * The current leaf. The first key on that leaf must be defined. */ protected void addSeparatorKey(final SimpleLeafData leaf) { final SimpleNodeData parent = getParent(leaf); if (parent == null) { /* * This is the root leaf, so there is no parent and the separator * key will not be assigned. */ return; } /* * @todo Use the shortest separator key (this provides space savings on * the nodes, but prefix compression of the keys has much the same * effect). */ final byte[] separatorKey = leaf.keys.get(0); if (separatorKey == null) { throw new AssertionError(); } addSeparatorKey(parent, separatorKey); } /** * Copies the separatorKey into the appropriate parent (if any). This method * is self-recursive. * * @param parent * A node which is a parent of the current leaf or an ancestor of * the node which is the parent of the current leaf (non-null). * @param separatorKey * The separator key to be assigned to the parent (non-null). */ private void addSeparatorKey(final SimpleNodeData parent, final byte[] separatorKey) { if (parent == null) throw new AssertionError(); if (separatorKey == null) throw new AssertionError(); /* * The maximum #of keys for a node is one less key than the maximum #of * children for that node. */ final int maxKeys = parent.max - 1; final MutableKeyBuffer parentKeys = parent.keys; if (parentKeys.nkeys < maxKeys) { /* * Copy the separator key into the next free position on the parent, * incrementing the #of keys in the parent. */ if (log.isDebugEnabled()) log.debug("h=" + parent.level + ", col=" + writtenInLevel[parent.level] + ", separatorKey=" + BytesUtil.toString(separatorKey)); parentKeys.keys[parentKeys.nkeys++] = separatorKey; // parentKeys.keys[parentKeys.nkeys++] = leaf.keys.get(0); // parent.copyKey(parentKeys.nkeys++, leaf, 0 ); } else { /* * Delegate to the parent recursively until we find the first parent * into which the separatorKey can be inserted. */ addSeparatorKey(getParent(parent), separatorKey); } } /** * Return the parent of a node or leaf in the {@link #stack}. * * @param node * The node or leaf. * * @return The parent or <code>null</code> iff <i>node</i> is the root node * or leaf. */ protected SimpleNodeData getParent(final AbstractSimpleNodeData node) { if (node.level == 0) { return null; } return (SimpleNodeData) stack[node.level - 1]; } /** * Write the node or leaf onto the appropriate output channel. * * @return The address that may be used to read the node or leaf from the * file. 
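     *         (A hedged, concrete illustration with made-up numbers: if a
     *         node's address decodes to a relative offset of 100 and the NODE
     *         region ultimately begins at byte offset N in the generated
     *         file, the node's record starts at byte N + 100.)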
Note that the address of a node is relative to the start of * the node channel and therefore must be adjusted before reading * the node from the final index segment file. */ protected long writeNodeOrLeaf(final AbstractSimpleNodeData node) { return node.isLeaf() ? writeLeaf((SimpleLeafData) node) : writeNode((SimpleNodeData) node); } /** * Code the leaf, obtaining its address, update the prior/next addr of the * previous leaf, and write that previous leaf onto the output file. * <p> * Note: For leaf addresses we know the absolute offset into the * {@link IndexSegmentStore} where the leaf will wind up so we encode the * address of the leaf using the {@link IndexSegmentRegion#BASE} region. * <p> * Note: In order to write out the leaves using a double-linked list with * prior-/next-leaf addresses we have to use a "write behind" strategy. * Instead of writing out the leaf as soon as it is serialized, we save the * uncoded address and a copy of the coded data record on private member * fields. When we code the next leaf (or if we learn that we have no more * leaves to code because {@link IndexSegmentPlan#nleaves} EQ * {@link #nleavesWritten}) then we patch the coded representation of the * prior leaf and write it on the store at the previously obtained address, * thereby linking the leaves together in both directions. It is definitely * confusing. * * @return The address that may be used to read the leaf from the file * backing the {@link IndexSegmentStore}. */ protected long writeLeaf(final SimpleLeafData leaf) { /* * The encoded address of the leaf that we allocated here. The encoded * address will be relative to the BASE region. */ final long addr; { // code the leaf, obtaining a view onto an internal (shared) buffer. // final ByteBuffer buf = nodeSer.encode(leaf).asByteBuffer(); // code the leaf. final ILeafData thisLeafData = nodeSer.encodeLive(leaf); // Obtain address to be assigned to this leaf. // // Allocate a record for the leaf on the temporary store. // final long addr1 = leafBuffer.allocate(buf.remaining()); // final long addr1 = leafBuffer.allocate(thisLeafData.data().len()); final long addr1 = allocateLeafAddr(thisLeafData.data().len()); // encode the address assigned to the serialized leaf. addr = encodeLeafAddr(addr1); if (log.isDebugEnabled()) log.debug("allocated storage for leaf data record"// + ": addr=" + addressManager.toString(addr)); if (nleavesWritten > 0) { /* * Update the previous leaf, but only for the 2nd+ leaf. */ if (log.isDebugEnabled()) log.debug("updating previous leaf"// + ": addr="+addressManager.toString(encodeLeafAddr(bufLastLeafAddr))// + ", priorAddr="+ addressManager.toString(addrPriorLeaf)// + ", nextAddr=" + addressManager.toString(addr)// // + ", exhausted=" + exhausted ); else if (log.isInfoEnabled()) { System.err.print("."); // wrote a leaf. if (nleavesWritten % 80 == 0) { // break lines. System.err.print("\n"); } } // view onto the coded record for the prior leaf. final ByteBuffer bufLastLeaf = lastLeafData.data().asByteBuffer(); /* * Patch representation of the previous leaf. * * Note: This patches the coded record using the ByteBuffer view * of that record. However, the change is made to the backing * byte[] so the change is visible on the coded record as well. */ nodeSer.updateLeaf(bufLastLeaf, addrPriorLeaf, addr/*addrNextLeaf*/); assert lastLeafData.getPriorAddr() == addrPriorLeaf; assert lastLeafData.getNextAddr() == addr; // write the previous leaf onto the store. 
                // leafBuffer.update(bufLastLeafAddr, 0/*offset*/, bufLastLeaf);
                writeLeafForReal(bufLastLeafAddr, bufLastLeaf);

                // the encoded address of the leaf that we just wrote out.
                addrPriorLeaf = encodeLeafAddr(bufLastLeafAddr);

                if (storeCache != null) {

                    /*
                     * Insert the coded, patched record for the prior leaf
                     * into the cache.
                     */
                    storeCache.putIfAbsent(addrPriorLeaf, lastLeafData);

                }

            }

            // update reference to the leaf we just coded.
            lastLeafData = thisLeafData;

            // the address allocated for the leaf in the temp store.
            bufLastLeafAddr = addr1;

        }

        if (nleavesWritten == 0) {

            /*
             * Encoded addr of the 1st leaf - update only for the first leaf
             * that we allocate.
             */
            addrFirstLeaf = addr;

        }

        // encoded addr of the last leaf - update for each leaf that we allocate.
        addrLastLeaf = addr;

        // the #of leaves written so far.
        nleavesWritten++;

        if (plan.nleaves == nleavesWritten) { // || exhausted

            /*
             * Update the last leaf.
             *
             * Note: The last leaf is the one for which we allocated storage
             * immediately above.
             *
             * Note: We only invoke flush() if a leaf has data so we should
             * never be in a position of writing out an empty leaf (with the
             * exception of a B+Tree which has no tuples).
             */
            assert plan.nentries == 0 || lastLeafData.getKeyCount() > 0 : "Last leaf is empty?";

            if (log.isDebugEnabled())
                log.debug("updating last leaf"//
                        + ": addr=" + addressManager.toString(encodeLeafAddr(bufLastLeafAddr))//
                        + ", priorAddr=" + addressManager.toString(addrPriorLeaf)//
                        + ", nextAddr=0L"//
                //      + ", exhausted=" + exhausted
                );
            // log.debug("Writing leaf: priorLeaf=" + addrPriorLeaf
            // + ", nextLeaf=" + 0L + ", exhausted=" + exhausted);
            else if (log.isInfoEnabled())
                System.err.print("."); // wrote a leaf.

            // View onto the coded record for the prior leaf.
            final ByteBuffer bufLastLeaf = lastLeafData.data().asByteBuffer();

            /*
             * Patch representation of the last leaf.
             *
             * Note: This patches the coded record using the ByteBuffer view
             * of that record. However, the change is made to the backing
             * byte[] so the change is visible on the coded record as well.
             */
            nodeSer.updateLeaf(bufLastLeaf, addrPriorLeaf, 0L/*addrNextLeaf*/);

            assert lastLeafData.getPriorAddr() == addrPriorLeaf;
            assert lastLeafData.getNextAddr() == 0L;

            // write the last leaf onto the store.
            // leafBuffer.update(bufLastLeafAddr, 0/*offset*/, bufLastLeaf);
            writeLeafForReal(bufLastLeafAddr, bufLastLeaf);

            if (storeCache != null) {

                /*
                 * Insert the coded, patched record for the last leaf into
                 * the cache.
                 */
                storeCache.putIfAbsent(addrLastLeaf, lastLeafData);

            }

        }

        return addr;

    }

    private long allocateLeafAddr(final int nbytes) {

        // final long addr1 = leafBuffer.allocate(nbytes);
        final long offset = leafAddrFactory.get();

        leafAddrFactory.addAndGet(nbytes);

        final long addr1 = addressManager.toAddr((int) nbytes, offset);

        return addr1;

    }

    /**
     * The address factory for the leaves. Note that addresses are relative to
     * the start of the leaf region, not the start of the output file.
     */
    private final AtomicLong leafAddrFactory = new AtomicLong(0L);
    // IndexSegmentCheckpoint.SIZE);

    private void writeLeafForReal(final long addr, final ByteBuffer data) {

        // leafBuffer.update(addr, 0/*offset*/, data);

        final long offset = addressManager.getOffset(addr);

        try {

            final int chk = useChecksums ? checker.checksum(data) : 0;

            // write leaf on the cache.
            if (!leafWriteCache.write(offset, data, chk)) {

                // leaf does not fit in the cache, so evict cache to the file.
                leafWriteCache.flush(false/*force*/);

                // reset the cache!
                leafWriteCache.reset();

                // write leaf on the cache.
if(!leafWriteCache.write(offset, data, chk)) { /* * The leaf is larger than the write cache, so we will write * it directly onto the output file. * * @todo This is tested by the larger random builds, but we * really should have an explicit test for this case. */ // Write the record onto the file at that offset. FileChannelUtility.writeAll(leafWriteCache.opener, data, offset); } } } catch (Throwable e) { throw new RuntimeException(e); } } /** * "Allocates" a node address when we will buffer the nodes in RAM. * * @throws UnsupportedOperationException * if we are not buffering nodes in RAM. */ private long allocateNodeAddr(final int nbytes) { if (!bufferNodes) throw new UnsupportedOperationException(); final long offset = nodeAddrFactory.get(); nodeAddrFactory.addAndGet(nbytes); final long addr1 = addressManager.toAddr((int) nbytes, offset); return addr1; } /** * The address factory for the nodes used when we will buffer the nodes in * RAM. Note that addresses are relative to the start of the node region, * not the start of the output file. */ private final AtomicLong nodeAddrFactory = new AtomicLong(0L); /** * Encode the address of a leaf. * <p> * Note: This updates {@link #maxNodeOrLeafLength} as a side-effect. * * @param addr1 * The address of a leaf as allocated by * {@link #allocateLeafAddr(int)} * * @return The encoded address of the leaf relative to the * {@link IndexSegmentRegion#BASE} region where it will appear once * the leaves have been copied onto the output file. */ private long encodeLeafAddr(final long addr1) { final int nbytes = addressManager.getByteCount(addr1); if (nbytes > maxNodeOrLeafLength) { // track the largest node or leaf written. maxNodeOrLeafLength = nbytes; } /* * Note: The offset is adjusted by the size of the checkpoint record * such that the offset is correct for the generated file NOT the buffer * into which the leaves are being written. */ final long offset = addressManager.getOffset(addr1) + IndexSegmentCheckpoint.SIZE; // Encode the address of the leaf. final long addr = addressManager.toAddr(nbytes, IndexSegmentRegion.BASE .encodeOffset(offset)); return addr; } /* * Data used to chain the leaves together in a prior/next double-linked * list. */ /** * The address of the previous leaf, but encoded for the generated * {@link IndexSegmentStore}. */ private long addrPriorLeaf = 0L; /** * The address of the last leaf allocated (but not yet written out). * <p> * Note: This address is NOT encoded for the {@link IndexSegmentStore}. * Instead, it is encoded for the output file using the * {@link #addressManager} and is relative to the start of leaves region in * the output file. * * @see #writeLeaf(SimpleLeafData) */ private long bufLastLeafAddr = 0L; // /** // * Buffer holds a copy of the serialized representation of the last leaf. // * This buffer is reset and written by {@link #writeLeaf(SimpleLeafData)}. // * The contents of this buffer are used by {@link #writePriorLeaf(long)} to // * write out the serialized representation of the previous leaf in key order // * after it has been patched to reflect the prior and next leaf addresses. // * The buffer is automatically reallocated if it is too small for a leaf. // */ // private ByteBuffer bufLastLeaf = ByteBuffer.allocate(10 * Bytes.kilobyte32); /** * Buffer holds a copy of the coded representation of the last leaf. This * buffer is written by {@link #writeLeaf(SimpleLeafData)}. 
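     * (A sketch of the write-behind protocol, with illustrative step numbers:
     * (1) code leaf[k] and allocate its address; (2) patch the retained
     * record for leaf[k-1] so that its nextAddr is the address of leaf[k];
     * (3) write leaf[k-1] at its previously allocated address; (4) retain
     * leaf[k] as the new last leaf.)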
The contents of * this buffer are used to write out the serialized representation of the * previous leaf in key order after it has been patched to reflect the prior * and next leaf addresses. The coded {@link ILeafData} record is modified * before the previous leaf is written out to reflect the address assigned * to the next leaf in key order. */ private ILeafData lastLeafData; /** * Code and write the node onto the {@link #nodeBuffer}. * * @return An <em>relative</em> address that must be correctly decoded * before you can read the compressed node from the file. This value * is also set on {@link SimpleNodeData#addr}. * * @see SimpleNodeData * @see IndexSegmentRegion * @see IndexSegmentAddressManager */ protected long writeNode(final SimpleNodeData node) { // code node, obtaining slice onto shared buffer and wrap that // shared buffer. final INodeData codedNodeData = nodeSer.encodeLive(node); // final ByteBuffer buf = nodeSer.encode(node).asByteBuffer(); final long tempAddr; if (nodeBuffer != null) { // write the node on the buffer (a temporary store). tempAddr = nodeBuffer.write(codedNodeData.data().asByteBuffer()); } else { // allocate address relative to the start of the nodes region. tempAddr = allocateNodeAddr(codedNodeData.data().len()); // buffer the node (it will be written out later). nodeList.add(new NodeMetadata(tempAddr, codedNodeData)); } final long offset = addressManager.getOffset(tempAddr); final int nbytes = addressManager.getByteCount(tempAddr); if (nbytes > maxNodeOrLeafLength) { // track the largest node or leaf written. maxNodeOrLeafLength = nbytes; } // the #of nodes written so far. nnodesWritten++; if (log.isInfoEnabled()) System.err.print("x"); // wrote a node. /* * Encode the node address. Since we do not know the offset of the NODE * region in advance this address gets encoded as relative to the start * of the NODE region in the file. */ final long addr = addressManager.toAddr(nbytes, IndexSegmentRegion.NODE .encodeOffset(offset)); node.addr = addr; if (storeCache != null) { /* * Insert the coded record into cache as [addr2 : nodeData], where * nodeData is encodeLive() wrapped version of the slice. */ storeCache.putIfAbsent(addr, codedNodeData); } return addr; } /** * <p> * Writes the complete file format for the index segment. The file is * divided up as follows: * <ol> * * <li>fixed length {@link IndexSegmentCheckpoint} record (required)</li> * <li>leaves (required)</li> * <li>nodes (may be empty)</li> * <li>the bloom filter (optional)</li> * <li>the {@link IndexMetadata} record (required, but extensible)</li> * </ol> * </p> * <p> * The index segment metadata is divided into a base * {@link IndexSegmentCheckpoint} record with a fixed format containing only * essential data and additional metadata records written at the end of the * file including the optional bloom filter and the required * {@link IndexMetadata} record. The latter is where we write variable * length metadata including the _name_ of the index, or additional metadata * defined by a specific class of index. * </p> * <p> * Once all nodes and leaves have been buffered we are ready to start * writing the data. We skip over a fixed size metadata record since * otherwise we are unable to pre-compute the offset to the leaves and hence * the addresses of the leaves. The node addresses are written in an * encoding that requires active translation by the receiver who must be * aware of the offset to the start of the node region. 
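     * (Illustrative arithmetic, using only the offsets defined below:
     * offsetLeaves == IndexSegmentCheckpoint.SIZE, and offsetNodes ==
     * offsetLeaves + extentLeaves when nodes exist, so a NODE-encoded address
     * with relative offset x resolves to the absolute file offset
     * offsetNodes + x.)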
We can not write the * metadata record until we know the size and length of each of these * regions (leaves, nodes, and the bloom filter, or other metadata records) * since that information is required in order to be able to form their * addresses for insertion in the metadata record. * </p> * * @param outChannel * * @param commitTime * * @throws IOException * @throws InterruptedException * * FIXME There is no sense of an atomic commit when building a * new index segment. We should write ZEROs into the checkpoint * record initially and then seek back to the head of the file * once we are done and write out the correct checkpoint record. * <p> * Note: There are similar issues involved when we replicate * index segment or journal files to verify that they are good. */ protected IndexSegmentCheckpoint writeIndexSegment( final FileChannel outChannel, final long commitTime) throws IOException, InterruptedException { /* * All nodes and leaves have been written. If we wrote any nodes * onto the temporary channel then we also have to bulk copy them * into the output channel. */ final long offsetLeaves; final long extentLeaves; final long offsetNodes; final long extentNodes; final long offsetBlobs; final long extentBlobs; final long addrRoot; /* * Skip over the checkpoint record at the start of the file. * * Note: We fill this areas with zeros. When the index segment is empty * (has no entries) then this causes the file length to be extended * beyond the checkpoint record and the index metadata record gets * written onto the file at that point. If we merely position the file * to beyond the checkpoint record then nothing has been written on the * file and the index metadata record gets written at offset 0L! */ outChannel.write(ByteBuffer.allocate(IndexSegmentCheckpoint.SIZE)); /* * Direct copy the leaves from their buffer into the output file. If the * buffer was backed by a file then that file will be deleted as a * post-condition on the index build operation. */ if (plan.nleaves == 0) { /* * The tree is empty (no root leaf). */ // No leaves. offsetLeaves = 0L; extentLeaves = 0L; // No nodes. offsetNodes = 0L; extentNodes = 0L; // No root. addrRoot = 0L; } else { offsetLeaves = IndexSegmentCheckpoint.SIZE; // output the leaf buffer. { // /* // * Transfer the leaf buffer en mass onto the output channel. // * // * Note: If a planned leaf is not emitted then this can cause an // * exception to be thrown indicating that the IO transfer is not // * progressing. This occurs when the record for that leaf was // * allocated on the leafBuffer but never written onto the // * leafBuffer. This allocate-then-write policy allows us to // * double-link the leaves during the build. The build SHOULD // * automatically correct for cases when there are not enough // * tuples to fill out the leaves in the plan. However, if it // * does not correct the problem, and hence does not write the // * last allocated leaf data record, then you might see this // * exception. // */ // extentLeaves = leafBuffer.getBufferStrategy().transferTo(out); // if (nodeBuffer != null) { // // // The offset to the start of the node region. // offsetNodes = IndexSegmentCheckpoint.SIZE + extentLeaves; // // assert outChannel.position() == offsetNodes; // // } else { // // // zero iff there are no nodes. // offsetNodes = 0L; // // } // The extent of the leaves region on the file. extentLeaves = leafAddrFactory.get(); if (plan.nnodes != 0) { // The offset to the start of the node region. 
offsetNodes = IndexSegmentCheckpoint.SIZE + extentLeaves; } else { // zero iff there are no nodes. offsetNodes = 0L; } // Close the buffer. // leafBuffer.close(); try { // flush the last writes. leafWriteCache.flush(false/* force */); // close cache (discards buffer). leafWriteCache.close(); } catch (InterruptedException e) { throw new RuntimeException(e); } } /* * Direct copy the node index from the buffer into the output file. * If the buffer was backed by a file then that file will be deleted * as a post-condition on the index build operation. */ if (nodeBuffer != null) { /* * Seek to the start of the nodes region (the write cache does * not change the file position when it writes onto the file so * we need to explicitly seek to the desired location). */ outChannel.position(offsetNodes); // Verify we are at the start of the nodes region. assert outChannel.position() == offsetNodes : "position=" + outChannel.position() + ", but offsetNodes=" + offsetNodes; // transfer the nodes en mass onto the output channel. extentNodes = nodeBuffer.getBufferStrategy().transferTo(out); // Close the buffer. nodeBuffer.close(); // Note: already encoded relative to NODE region. addrRoot = (((SimpleNodeData) stack[0]).addr); } else if (nodeList != null) { /* * Write the nodes onto the output file. * * Note: The addresses are relative to the start of the nodes * region, so we adjust the write cache using the offset to the * nodes region. * * FIXME Use a WriteCacheService which will hide this complexity * and give better throughput. */ // Setup a write cache. final WriteCache.FileChannelWriteCache writeCache = new WriteCache.FileChannelWriteCache( offsetNodes, null/* buf */, useChecksums, false/* isHighlyAvailable */, false/* bufferHasData */, new NOPReopener(out), 0L/* fileExtent */); try { // Count the #of bytes in the nodes. int nbytes = 0; // For each node. for (NodeMetadata md : nodeList) { final long addr = md.addr; // the offset relative to the start of the nodes region. final long offset = addressManager.getOffset(addr); final AbstractFixedByteArrayBuffer slice = md.data .data(); // track #of bytes across all nodes. nbytes += slice.len(); final ByteBuffer data = slice.asByteBuffer(); final int chk = useChecksums?checker.checksum(data):0; // write onto cache. if (!writeCache.write(offset, data,chk)) { // cache is full, evict to file. writeCache.flush(false/* force */); // reset the cache! writeCache.reset(); // and write on the cache again. if (!writeCache.write(offset, data,chk)) { // directly write onto the output file. FileChannelUtility.writeAll(writeCache.opener, data, offset); } } } // force the last writes to the output file. writeCache.flush(false/* force */); // reset the cache! writeCache.reset(); // #of bytes across all nodes. extentNodes = nbytes; } finally { // releases the buffer. writeCache.close(); } // Note: already encoded relative to NODE region. addrRoot = (((SimpleNodeData) stack[0]).addr); } else { /* * The tree consists of just a root leaf. */ // This MUST be 0L if there are no leaves. extentNodes = 0L; // Address of the root leaf. addrRoot = addrLastLeaf; } } if (log.isInfoEnabled()) log.info("addrRoot: " + addrRoot + ", " + addressManager.toString(addrRoot)); /* * Direct copy the optional blobBuffer onto the output file. * * Note: The backing BufferStrategy for the blobBuffer is allocated * eagerly if there is an indication that blobs *might* be in use by the * index. 
         * However, if nothing ever gets written onto the temporary store
         * then the backing file is never created and there will be nothing
         * to transfer. We look for this case and opt out of the transfer
         * when nothing has been written onto the blobBuffer.
         */
        if (blobBuffer == null || blobBuffer.getBufferStrategy().size() == 0L) {

            // No blobs region.
            offsetBlobs = extentBlobs = 0L;

        } else {

            // #of bytes written so far on the output file.
            offsetBlobs = out.length();

            // seek to the end of the file.
            out.seek(offsetBlobs);

            // transfer the blobs en masse onto the output channel.
            extentBlobs = blobBuffer.getBufferStrategy().transferTo(out);

            // Close the buffer.
            blobBuffer.close();

        }

        /*
         * If the bloom filter was constructed then serialize it on the end
         * of the file.
         */
        final long addrBloom;

        if (bloomFilter == null) {

            addrBloom = 0L;

        } else {

            // serialize the bloom filter.
            final byte[] bloomBytes = SerializerUtil.serialize(bloomFilter);

            // #of bytes written so far on the output file.
            final long offset = out.length();

            // seek to the end of the file.
            out.seek(offset);

            // write the serialized bloom filter.
            out.write(bloomBytes, 0, bloomBytes.length);

            // Address of the region containing the bloom filter (one record).
            addrBloom = addressManager.toAddr(bloomBytes.length,
                    IndexSegmentRegion.BASE.encodeOffset(offset));

            if (storeCache != null) {

                /*
                 * Insert the record into the cache.
                 */
                storeCache.putIfAbsent(addrBloom, bloomFilter);

            }

        }

        /*
         * Write out the metadata record.
         */
        final long addrMetadata;
        {

            /*
             * Serialize the metadata record.
             */
            final byte[] metadataBytes = SerializerUtil.serialize(metadata);

            // #of bytes written so far on the output file.
            final long offset = out.length();

            // seek to the end of the file.
            out.seek(offset);

            // write the serialized extension metadata.
            out.write(metadataBytes, 0, metadataBytes.length);

            // Address of the region containing the metadata record (one record).
            addrMetadata = addressManager.toAddr(metadataBytes.length,
                    IndexSegmentRegion.BASE.encodeOffset(offset));

            if (storeCache != null) {

                /*
                 * Insert the record into the cache.
                 */
                storeCache.putIfAbsent(addrMetadata, metadata);

            }

        }

        /*
         * Write out the ICUVersionRecord at the end of the file.
         *
         * FIXME Enable when merging in the ICUVersionRecord change set.
         */
        // {
        //
        // /*
        // * Serialize the record.
        // */
        // final byte[] icuVersionBytes = SerializerUtil.serialize(ICUVersionRecord.newInstance());
        //
        // // #of bytes written so far on the output file.
        // final long offset = out.length();
        //
        // // seek to the end of the file.
        // out.seek(offset);
        //
        // // write the serialized extension metadata.
        // out.write(icuVersionBytes, 0, icuVersionBytes.length);
        //
        // // Address of the region containing the metadata record (one record)
        // long addrICUVersion = addressManager.toAddr(icuVersionBytes.length,
        // IndexSegmentRegion.BASE.encodeOffset(offset));
        //
        // if (storeCache != null) {
        //
        // /*
        // * Insert the record into the cache.
        // */
        //
        // storeCache.putIfAbsent(addrMetadata, metadata);
        //
        // }
        //
        // }

        /*
         * Seek to the start of the file and write out the checkpoint record.
         */
        {

            // // timestamp for the index segment.
            // final long now = System.currentTimeMillis();

            outChannel.position(0);

            /*
             * Note: The build plan is restricted to MAX_INT leaves and there
             * are always more leaves than nodes in a B+Tree, so both nnodes
             * and nleaves are int32 values.
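             *
             * (E.g.: for branching factor m and height h, a full B+Tree has
             * m^h leaves but only (m^h - 1)/(m - 1) nodes; with m = 32 and
             * h = 2 that is 1024 leaves over 33 nodes, so the leaf count
             * always bounds the node count.)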
             */
            if (nnodesWritten > Integer.MAX_VALUE)
                throw new AssertionError();
            if (nleavesWritten > Integer.MAX_VALUE)
                throw new AssertionError();

            final IndexSegmentCheckpoint md = new IndexSegmentCheckpoint(
                    addressManager.getOffsetBits(), //
                    plan.height, // will always be correct.
                    (int) nleavesWritten, // actual #of leaves written.
                    (int) nnodesWritten, // actual #of nodes written.
                    ntuplesWritten, // actual #of tuples written.
                    maxNodeOrLeafLength,//
                    offsetLeaves, extentLeaves, offsetNodes, extentNodes,
                    offsetBlobs, extentBlobs, addrRoot, addrMetadata,
                    addrBloom, addrFirstLeaf, addrLastLeaf, out.length(),
                    compactingMerge, useChecksums, segmentUUID, commitTime);

            md.write(out);

            if (log.isInfoEnabled())
                log.info(md.toString());

            // save the index segment resource description for the caller.
            this.segmentMetadata = new SegmentMetadata(outFile, //out.length(),
                    segmentUUID, commitTime);

            return md;

        }

    }

    /**
     * The description of the constructed {@link IndexSegment} resource.
     *
     * @throws IllegalStateException
     *             if requested before the build operation is complete.
     */
    public IResourceMetadata getSegmentMetadata() {

        if (segmentMetadata == null) {

            throw new IllegalStateException();

        }

        return segmentMetadata;

    }

    private SegmentMetadata segmentMetadata = null;

    /**
     * Abstract base class for classes used to construct and serialize nodes
     * and leaves written onto the index segment.
     *
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
     *         Thompson</a>
     */
    abstract protected static class AbstractSimpleNodeData implements
            IAbstractNodeData {

        /**
         * The level in the output tree for this node or leaf (origin zero).
         * The root is always at level zero (0).
         */
        final int level;

        final int m;

        /**
         * Mutable keys (directly managed by the {@link IndexSegmentBuilder}).
         */
        final MutableKeyBuffer keys;

        /**
         * The min/max version timestamp for the node/leaf. These data are
         * only used when the B+Tree is maintaining per tuple revision
         * timestamps.
         */
        long minimumVersionTimestamp;
        long maximumVersionTimestamp;

        /**
         * We precompute the #of children to be assigned to each node and the
         * #of values to be assigned to each leaf and store that value in this
         * field. While the field name is "max", this is the exact count that
         * must be assigned to the node.
         */
        int max = -1;

        protected AbstractSimpleNodeData(final int level, final int m) {

            this.level = level;

            this.m = m;

            /*
             * @todo This should probably be dimensioned to m-1 for a node and
             * m for a leaf. The mutable B+Tree would have dimensions to m for
             * a node and m+1 for a leaf to allow for overflow during
             * split/join, but we only need the exact number of slots.
             */
            this.keys = new MutableKeyBuffer(m);

            this.minimumVersionTimestamp = Long.MAX_VALUE;

            this.maximumVersionTimestamp = Long.MIN_VALUE;

        }

        /**
         *
         * @param max
         *            The #of children to be assigned to this node -or- the
         *            #of tuples to be assigned to a leaf.
         */
        protected void reset(final int max) {

            this.max = max;

            this.keys.nkeys = 0;

            this.minimumVersionTimestamp = Long.MAX_VALUE;

            this.maximumVersionTimestamp = Long.MIN_VALUE;

        }

        final public int getKeyCount() {

            return keys.size();

        }

        final public IRaba getKeys() {

            return keys;

        }

        /**
         * Yes (however, note that the {@link IndexSegmentBuilder} directly
         * accesses and modifies the internal data structures).
         */
        final public boolean isReadOnly() {

            return true;

        }

        /**
         * No.
         */
        final public boolean isCoded() {

            return false;

        }

        final public AbstractFixedByteArrayBuffer data() {

            throw new UnsupportedOperationException();

        }

        final public long getMaximumVersionTimestamp() {

            if (!hasVersionTimestamps())
                throw new UnsupportedOperationException();

            return maximumVersionTimestamp;

        }

        final public long getMinimumVersionTimestamp() {

            if (!hasVersionTimestamps())
                throw new UnsupportedOperationException();

            return minimumVersionTimestamp;

        }

    }

    /**
     * A class that can be used to (de-)serialize the data for a leaf without
     * any of the logic for operations on the leaf.
     *
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
     */
    protected static class SimpleLeafData extends AbstractSimpleNodeData
            implements ILeafData {

        /**
         * The values stored in the leaf (directly accessed by the
         * {@link IndexSegmentBuilder}).
         */
        final MutableValueBuffer vals;

        final public IRaba getValues() {

            return vals;

        }

        /**
         * Allocated iff delete markers are maintained.
         */
        final boolean[] deleteMarkers;

        /**
         * Allocated iff version timestamps are maintained.
         */
        final long[] versionTimestamps;

        /**
         * Allocated iff raw record markers are maintained.
         */
        final boolean[] rawRecords;

        public SimpleLeafData(final int level, final int m,
                final IndexMetadata metadata) {

            super(level, m);

            this.vals = new MutableValueBuffer(m);

            this.deleteMarkers = metadata.getDeleteMarkers() ? new boolean[m]
                    : null;

            this.versionTimestamps = metadata.getVersionTimestamps() ? new long[m]
                    : null;

            this.rawRecords = metadata.getRawRecords() ? new boolean[m] : null;

        }

        protected void reset(final int max) {

            super.reset(max);

            vals.nvalues = 0;

        }

        // final public int getSpannedTupleCount() {
        //
        // return keys.size();
        //
        // }

        final public int getValueCount() {

            return keys.size();

        }

        final public boolean isLeaf() {

            return true;

        }

        final public boolean getDeleteMarker(final int index) {

            if (deleteMarkers == null)
                throw new UnsupportedOperationException();

            return deleteMarkers[index];

        }

        final public long getVersionTimestamp(final int index) {

            if (versionTimestamps == null)
                throw new UnsupportedOperationException();

            return versionTimestamps[index];

        }

        final public long getRawRecord(final int index) {

            if (rawRecords == null)
                throw new UnsupportedOperationException();

            if (!rawRecords[index])
                return IRawStore.NULL;

            return AbstractBTree.decodeRecordAddr(vals.get(index));

        }

        final public boolean hasDeleteMarkers() {

            return deleteMarkers != null;

        }

        final public boolean hasVersionTimestamps() {

            return versionTimestamps != null;

        }

        final public boolean hasRawRecords() {

            return rawRecords != null;

        }

        /**
         * Yes - the caller maintains the necessary information and then
         * updates the coded {@link ReadOnlyLeafData} record once we have the
         * address of the next record.
         */
        final public boolean isDoubleLinked() {

            return true;

        }

        /**
         * @throws UnsupportedOperationException
         *             since the data are maintained externally and patched on
         *             the coded records by the {@link IndexSegmentBuilder}.
         */
        final public long getNextAddr() {

            throw new UnsupportedOperationException();

        }

        /**
         * @throws UnsupportedOperationException
         *             since the data are maintained externally and patched on
         *             the coded records by the {@link IndexSegmentBuilder}.
         */
        final public long getPriorAddr() {

            throw new UnsupportedOperationException();

        }

    }

    /**
     * A class that can be used to (de-)serialize the data for a node without
     * any of the logic for operations on the node.
* <p> * Note: All node addresses that are internal to a node and reference a * child node (vs a leaf) are correct relative to the start of the * {@link IndexSegmentRegion#NODE} region. This is an unavoidable * consequence of serializing the nodes before we have the total offset to * the start of the {@link IndexSegmentRegion#NODE} region. * * @see IndexSegmentRegion * @see IndexSegmentAddressManager * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> */ protected static class SimpleNodeData extends AbstractSimpleNodeData implements INodeData { // mutable. /** * The relative address at which this node was written on the temporary * channel. This is a negative integer. If you flip the sign then it * encodes a relative offset to the start of the index node block and * the correct size for the compressed node. */ long addr = 0L; /** * The address at which the child nodes were written. This is a negative * integer iff the child is a node and a positive integer iff the child * is a leaf. When it is a negative integer, you must flip the sign to * obtain a relative offset to the start of the index node block and the * correct size for the compressed node. The actual offset of the index * node block must be added to the relative offset before you can use * this to address into the output file. */ final long[] childAddr; /** * This tracks the #of defined values in {@link #childAddr} separately * from the #of defined keys. The reason that we do this is that the * logic for handling a leaf eviction and recording the address of the * child and the separator key for the _next_ child requires an * awareness of the intermediate state - when we have filled in the * childAddr for the last leaf but not yet filled in the separatorKey * for the next leaf. */ int nchildren = 0; /** * The #of entries spanned by this node. */ long nentries; /** * The #of entries spanned by each child of this node. */ final long [] childEntryCount; /** * <code>true</code> iff the node is tracking the min/max tuple revision * timestamps. */ final boolean hasVersionTimestamps; final public long getSpannedTupleCount() { return nentries; } final public long getChildAddr(final int index) { if (index < 0 || index > keys.size() + 1) throw new IllegalArgumentException(); return childAddr[index]; } final public long getChildEntryCount(final int index) { if (index < 0 || index > keys.size() + 1) throw new IllegalArgumentException(); return childEntryCount[index]; } public SimpleNodeData(final int level, final int m, final boolean hasVersionTimestamps) { super(level, m); this.childAddr = new long[m]; this.childEntryCount = new long[m]; this.hasVersionTimestamps = hasVersionTimestamps; } /** * Reset counters and flags so that the node may be reused. * * @param max * The new limit on the #of children to fill on this node. */ protected void reset(final int max) { // /* // * Note: We have to clear these arrays for the edge case when source // * iterator is prematurely exhausted. If we do not clear them then // * the last entry in each array can be non-zero when it should be 0L // * when the planned right child under a separatorKey in a Node was // * not emitted. 
// */ // for (int i = 0; i < nchildren; i++) { // // childAddr[i] = 0L; // // childEntryCount[i] = 0; // // } super.reset(max); addr = 0; nchildren = 0; nentries = 0; } final public int getChildCount() { return keys.size() + 1; } final public boolean isLeaf() { return false; } final public boolean hasVersionTimestamps() { return hasVersionTimestamps; } } /** * Factory does not support node or leaf creation. */ protected static class NOPNodeFactory implements INodeFactory { public static final INodeFactory INSTANCE = new NOPNodeFactory(); private NOPNodeFactory() { } public Leaf allocLeaf(final AbstractBTree btree, final long addr, final ILeafData data) { throw new UnsupportedOperationException(); } public Node allocNode(final AbstractBTree btree, final long addr, final INodeData data) { throw new UnsupportedOperationException(); } } /** * A tuple iterator backed by a mutable leaf. This implementation is used * when we materialize the view in RAM in a single leaf and then do the * build over that. The implementation always returns the pre-allocated * byte[] for the key or value in order to avoid redundant allocations. This * is safe since the data in that {@link MutableLeafData} instance were * allocated when the view was materialized and their references can be * safely reused when we build the output B+Tree. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan * Thompson</a> * @param <E> The generic type of the objects which would be materialized * from the tuples. * * @todo in fact, we could clear the references from the * {@link MutableLeafData} as we go. */ static private class MyTupleIterator<E> implements ITupleIterator<E> { private final boolean hasVersionTimestamp, hasDeleteMarkers, visitDeleted; /** * Directly exposes the data from the {@link MutableLeafData}. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * @version $Id$ * @param <E> */ private class MyTuple implements ITuple<E> { /** The index in the leaf of the state revealed by this tuple. */ private int leafIndex; private final int flags; private final boolean needsKeys, needsVals; public MyTuple(final int flags) { this.flags = flags; this.needsKeys = (flags & IRangeQuery.KEYS) != 0; this.needsVals = (flags & IRangeQuery.VALS) != 0; if (!needsKeys) throw new UnsupportedOperationException(); if (!needsVals) throw new UnsupportedOperationException(); } public int flags() { return flags; } public boolean getKeysRequested() { return needsKeys; } public boolean getValuesRequested() { return needsVals; } public long getVisitCount() { return leafIndex; } public byte[] getKey() { return leaf.keys.keys[leafIndex]; } public byte[] getValue() { return leaf.vals.values[leafIndex]; } public boolean isDeletedVersion() { if(!hasDeleteMarkers) return false; return leaf.deleteMarkers[leafIndex]; } public boolean isNull() { return leaf.vals.values[leafIndex] == null; } public long getVersionTimestamp() { if (!hasVersionTimestamp) return 0L; return leaf.versionTimestamps[leafIndex]; } /* * We do not actually use these methods in the IndexSegmentBuilder * and they would not be as efficient if we did since we are relying * on directly access to the MutableLeafData's internal data * structures. 
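         *
         * (E.g.: getKey() above hands back leaf.keys.keys[leafIndex] by
         * reference, with no per-tuple allocation or copy; a streaming
         * accessor such as getKeyStream() would have to wrap, and likely
         * copy, the data on each visit.)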
    /**
     * A tuple iterator backed by a mutable leaf. This implementation is used
     * when we materialize the view in RAM in a single leaf and then do the
     * build over that. The implementation always returns the pre-allocated
     * byte[] for the key or value in order to avoid redundant allocations.
     * This is safe since the data in that {@link MutableLeafData} instance
     * were allocated when the view was materialized and their references can
     * be safely reused when we build the output B+Tree.
     * 
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
     *         Thompson</a>
     * @param <E>
     *            The generic type of the objects which would be materialized
     *            from the tuples.
     * 
     * @todo in fact, we could clear the references from the
     *       {@link MutableLeafData} as we go.
     */
    static private class MyTupleIterator<E> implements ITupleIterator<E> {

        private final boolean hasVersionTimestamp, hasDeleteMarkers,
                visitDeleted;

        /**
         * Directly exposes the data from the {@link MutableLeafData}.
         * 
         * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
         *         Thompson</a>
         * @version $Id$
         * @param <E>
         */
        private class MyTuple implements ITuple<E> {

            /** The index in the leaf of the state revealed by this tuple. */
            private int leafIndex;

            private final int flags;

            private final boolean needsKeys, needsVals;

            public MyTuple(final int flags) {
                this.flags = flags;
                this.needsKeys = (flags & IRangeQuery.KEYS) != 0;
                this.needsVals = (flags & IRangeQuery.VALS) != 0;
                if (!needsKeys)
                    throw new UnsupportedOperationException();
                if (!needsVals)
                    throw new UnsupportedOperationException();
            }

            public int flags() {
                return flags;
            }

            public boolean getKeysRequested() {
                return needsKeys;
            }

            public boolean getValuesRequested() {
                return needsVals;
            }

            public long getVisitCount() {
                return leafIndex;
            }

            public byte[] getKey() {
                return leaf.keys.keys[leafIndex];
            }

            public byte[] getValue() {
                return leaf.vals.values[leafIndex];
            }

            public boolean isDeletedVersion() {
                if (!hasDeleteMarkers)
                    return false;
                return leaf.deleteMarkers[leafIndex];
            }

            public boolean isNull() {
                return leaf.vals.values[leafIndex] == null;
            }

            public long getVersionTimestamp() {
                if (!hasVersionTimestamp)
                    return 0L;
                return leaf.versionTimestamps[leafIndex];
            }

            /*
             * We do not actually use these methods in the IndexSegmentBuilder
             * and they would not be as efficient if we did since we are
             * relying on direct access to the MutableLeafData's internal data
             * structures.
             */

            public ByteArrayBuffer getKeyBuffer() {
                throw new UnsupportedOperationException();
            }

            public DataInputBuffer getKeyStream() {
                throw new UnsupportedOperationException();
            }

            public ByteArrayBuffer getValueBuffer() {
                throw new UnsupportedOperationException();
            }

            public DataInputBuffer getValueStream() {
                throw new UnsupportedOperationException();
            }

            public E getObject() {
                throw new UnsupportedOperationException();
            }

            public int getSourceIndex() {
                throw new UnsupportedOperationException();
            }

            public ITupleSerializer getTupleSerializer() {
                throw new UnsupportedOperationException();
            }

            public IBlock readBlock(long addr) {
                throw new UnsupportedOperationException();
            }

        }

        /** The source data. */
        private final MutableLeafData leaf;

        /** A view onto the current tuple in that leaf. */
        private final MyTuple tuple;

        /** The index of the next tuple to be considered in the leaf. */
        private int i;

        /** The first index to visit. */
        private final int fromIndex;

        /** The first index to NOT visit. */
        private final int toIndex;

        /**
         * @param leaf
         *            The leaf whose entries will be traversed (required).
         * @param flags
         *            Flags specifying whether the keys and/or values will be
         *            materialized.
         */
        public MyTupleIterator(final MutableLeafData leaf, final int flags) {
            this.leaf = leaf;
            this.tuple = new MyTuple(flags);
            this.hasVersionTimestamp = leaf.hasVersionTimestamps();
            this.hasDeleteMarkers = leaf.hasDeleteMarkers();
            this.visitDeleted = (flags & IRangeQuery.DELETED) != 0;
            fromIndex = 0;
            toIndex = leaf.getKeyCount();
        }

        /**
         * Examines the entry at {@link #i}. If it passes the criteria for an
         * entry to visit then return true. Otherwise increment {@link #i}
         * until either all entries in this leaf have been exhausted -or- an
         * entry is identified that passes the various criteria.
         */
        public boolean hasNext() {

            for (; i >= fromIndex && i < toIndex; i++) {

                /*
                 * Skip deleted entries unless specifically requested.
                 */
                if (hasDeleteMarkers && !visitDeleted
                        && leaf.getDeleteMarker(i)) {
                    // skipping a deleted version.
                    continue;
                }

                // entry @ index is next to visit.
                return true;

            }

            // nothing left to visit in this leaf.
            return false;

        }

        public ITuple<E> next() {

            if (!hasNext()) {
                throw new NoSuchElementException();
            }

            // next tuple.
            tuple.leafIndex = i++;

            return tuple;

        }

        public void remove() {
            throw new UnsupportedOperationException();
        }

    }

    /**
     * Identifies which build method to use.
     * 
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
     *         Thompson</a>
     * @version $Id$
     * 
     * @todo add other methods here as they are defined.
     */
    enum BuildEnum {

        /**
         * Do two passes over the source iterator. The first pass will compute
         * the exact range count. The second pass will build the
         * {@link IndexSegment}. The two pass algorithm uses less memory and
         * can be highly efficient when using
         * {@link IndexSegmentMultiBlockIterator} since it will read the data
         * from any source {@link IndexSegment}(s) at the disk transfer rate.
         */
        TwoPass,

        /**
         * Fully buffer the tuples from the source index into memory in a
         * single pass over that index. This approach does less IO, but
         * requires more memory in the Java heap.
         */
        FullyBuffered;

    }
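    /**
     * Illustration only: a minimal, hypothetical policy (not part of the
     * build algorithm) for choosing a {@link BuildEnum} given the fast range
     * count of the source view and a heap budget. It reflects the trade-off
     * documented on the enum constants above: {@link BuildEnum#FullyBuffered}
     * does less IO but must hold all tuples on the Java heap, while
     * {@link BuildEnum#TwoPass} is memory-lean at the cost of a second pass.
     * The average bytes-per-tuple estimate is an assumption supplied by the
     * caller.
     */
    static BuildEnum chooseBuildMethodExample(final long fastRangeCount,
            final long avgBytesPerTuple, final long heapBudgetBytes) {
        // Rough estimate of the memory needed to fully buffer the tuples.
        final long estimatedBytes = fastRangeCount * avgBytesPerTuple;
        return estimatedBytes <= heapBudgetBytes ? BuildEnum.FullyBuffered
                : BuildEnum.TwoPass;
    }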
    /**
     * Prints the usage and then exits.
     * 
     * @param args
     *            The command line args.
     * @param msg
     *            An optional message printed before the usage (may be
     *            <code>null</code>).
     * @param exitCode
     *            The exit code for the process.
     */
    protected static void usage(final String[] args, final String msg,
            final int exitCode) {

        if (msg != null)
            System.err.println(msg);

        System.err.println("usage: [opts] journal [name]*");
        System.err.println(" journal is the name of the journal file.");
        System.err.println(" [name]* is the name of one or more indices (defaults to all).");
        System.err.println(" [opts] is any of:");
        System.err.println(" -m #\tThe branching factor for the output index segments.");
        System.err.println(" -alg (FullyBuffered|TwoPass)\tThe algorithm to use.");
        System.err.println(" -merge|-build\tCompacting merge vs incremental build (default is -merge).");
        System.err.println(" -O outDir\tThe output directory.");
        System.err.println(" -bufferNodes (true|false)\tWhen true, the nodes are fully buffered in memory (default true).");

        System.exit(exitCode);

    }

    /**
     * Driver for index segment build against a named index on a local
     * journal.
     * 
     * @param args
     *            <code>[opts] journal [name]*</code>, where <i>journal</i> is
     *            the name of the journal file, where <i>name</i> is the name
     *            of a B+Tree registered on that journal, and where <i>opts</i>
     *            are any of:
     *            <dl>
     *            <dt>-m #</dt>
     *            <dd>Override the default branching factor for the index
     *            segment.</dd>
     *            <dt>-alg <i>algorithm</i></dt>
     *            <dd>Specify which build algorithm to use. See
     *            {@link BuildEnum}.</dd>
     *            <dt>-merge or -build</dt>
     *            <dd>Specifies whether to do a compacting merge (deleted
     *            tuples are purged from the generated index segment) or an
     *            incremental build (deleted tuples are preserved). The
     *            default is <i>merge</i>.</dd>
     *            <dt>-O outDir</dt>
     *            <dd>Specify the name of the directory into which the
     *            generated index segment file(s) will be written. This
     *            defaults to the current working directory. Each index
     *            segment file will be named based on the name of the source
     *            index with the <code>.seg</code> extension.</dd>
     *            </dl>
     *            If no <i>name</i>s are specified, then an index segment will
     *            be generated for each named B+Tree registered on the source
     *            journal.
     * 
     * @throws Exception
     */
//   * <dt>-verify</dt>
//   * <dd>Verify the generated index segment against the source
//   * B+Tree.</dd>
    public static void main(final String[] args) throws Exception {

        // The output branching factor (optional override).
        Integer branchingFactorOverride = null;

        // When true, performs a correctness check against the source BTree.
        boolean verify = false;

        // The journal file (must already exist).
        File journalFile = null;

        // The name(s) of the indices to be processed.
        final List<String> names = new LinkedList<String>();

        // The directory into which the generated index segments will be
        // written. Each index segment will be named based on the source index
        // name. The default is the current directory.
        File outDir = new File(".");

        /*
         * When true, a compacting merge will be performed (deleted tuples
         * will be purged). Otherwise this will be an incremental build
         * (deleted tuples will be preserved in the generated index segment).
         */
        boolean compactingMerge = true;

        /*
         * When true, the generated nodes will be fully buffered in memory
         * rather than being written onto a temporary file.
         */
        boolean bufferNodes = true;
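        /*
         * Illustration only: a typical invocation (classpath setup omitted;
         * the file and index names here are hypothetical) performs a
         * compacting merge of the index "myIndex" from the journal
         * "bigdata.jnl" using the two-pass algorithm and an output branching
         * factor of 512, writing "segs/myIndex.seg":
         * 
         *   java com.bigdata.btree.IndexSegmentBuilder \
         *       -m 512 -alg TwoPass -merge -O segs bigdata.jnl myIndex
         */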
        // Which build algorithm to use.
        BuildEnum buildEnum = BuildEnum.TwoPass; // or BuildEnum.FullyBuffered

        final File tmpDir = new File(System.getProperty("java.io.tmpdir"));

        if (!tmpDir.exists() && !tmpDir.mkdir()) {
            throw new IOException(
                    "Temporary directory does not exist / cannot be created: "
                            + tmpDir);
        }

        /*
         * Parse the command line, overriding various properties.
         */
        {

            int i = 0;

            for (; i < args.length && args[i].startsWith("-"); i++) {

                final String arg = args[i];

                if (arg.equals("-m")) {

                    branchingFactorOverride = Integer.valueOf(args[++i]);

                } else if (arg.equals("-O")) {

                    outDir = new File(args[++i]);

                } else if (arg.equals("-verify")) {

                    verify = true;

                } else if (arg.equals("-merge")) {

                    compactingMerge = true;

                } else if (arg.equals("-build")) {

                    compactingMerge = false;

                } else if (arg.equals("-bufferNodes")) {

                    bufferNodes = Boolean.valueOf(args[++i]);

                } else if (arg.equals("-alg")) {

                    buildEnum = BuildEnum.valueOf(args[++i]);

                } else if (arg.equals("-help") || arg.equals("--?")) {

                    usage(args, null/* msg */, 1/* exitCode */);

                } else {

                    throw new UnsupportedOperationException("Unknown option: "
                            + arg);

                }

            } // next arg.

            // The next argument is the journal file name, which is required.
            if (i == args.length) {

                usage(args, "journal name is required.", 1/* exitCode */);

            }

            journalFile = new File(args[i++]);

            if (!journalFile.exists()) {

                throw new FileNotFoundException(journalFile.toString());

            }

            // The remaining argument(s) are the source B+Tree names.
            while (i < args.length) {

                names.add(args[i++]);

            }

            if (!outDir.exists() && !outDir.mkdirs()) {

                throw new IOException(
                        "Output directory does not exist and could not be created: "
                                + outDir);

            }

        } // parse command line.

        // Open the journal: must already exist.
        final Journal journal;
        {

            final Properties properties = new Properties();

            properties.setProperty(Journal.Options.FILE,
                    journalFile.toString());

            properties.setProperty(Journal.Options.READ_ONLY,
                    Boolean.TRUE.toString());

            journal = new Journal(properties);

        }

        try {

            // @todo allow caller to specify the commitTime of interest.

            if (names.isEmpty()) {

                final ITupleIterator<Name2Addr.Entry> itr = journal
                        .getName2Addr().rangeIterator();

                while (itr.hasNext()) {

                    names.add(itr.next().getObject().name);

                }

            } else {

                // Some validation up front.
                for (String name : names) {

                    // Verify named indices exist.
                    if (journal.getIndex(name) == null) {

                        // Index not found.
                        throw new RuntimeException("Index not found: " + name);

                    }

                    // Verify output file does not exist or is empty.
                    final File outFile = new File(outDir, name
                            + Journal.Options.SEG);

                    if (outFile.exists() && outFile.length() != 0) {

                        throw new RuntimeException(
                                "Output file exists and is non-empty: "
                                        + outFile);

                    }

                }

            }

            System.out.println("Will process " + names.size() + " indices.");

            final long beginAll = System.currentTimeMillis();

            // For each named index.
            for (String name : names) {

                // Do the build for this B+Tree.
                final BTree btree = journal.getIndex(name);

                final File outFile = new File(outDir, name
                        + Journal.Options.SEG);
                final int m = branchingFactorOverride == null ? btree
                        .getIndexMetadata().getIndexSegmentBranchingFactor()
                        : branchingFactorOverride.intValue();

                final long begin = System.currentTimeMillis();

                final long commitTime = btree.getLastCommitTime();

                System.out.println("Building index segment: in(m="
                        + btree.getBranchingFactor() + ", rangeCount="
                        + btree.rangeCount() + "), out(m=" + m + "), alg="
                        + buildEnum);

                final IndexSegmentBuilder builder;
                switch (buildEnum) {
                case TwoPass:
                    builder = IndexSegmentBuilder.newInstanceTwoPass(btree,
                            outFile, tmpDir, m, compactingMerge, commitTime,
                            null/* fromKey */, null/* toKey */, bufferNodes);
                    break;
                case FullyBuffered:
                    builder = IndexSegmentBuilder.newInstanceFullyBuffered(
                            btree, outFile, tmpDir, m, compactingMerge,
                            commitTime, null/* fromKey */, null/* toKey */,
                            bufferNodes);
                    break;
                default:
                    throw new AssertionError(buildEnum.toString());
                }

                // Do the build.
                final IndexSegmentCheckpoint checkpoint = builder.call();

                // The total elapsed build time, including range count or
                // pre-materialization of tuples.
                final long elapsed = System.currentTimeMillis() - begin;

                final String results = "name=" + name + " : elapsed="
                        + elapsed + "ms, setup=" + builder.elapsed_setup
                        + "ms, write=" + builder.elapsed_write + "ms, m="
                        + builder.plan.m + ", size="
                        + (builder.outFile.length() / Bytes.megabyte)
                        + "mb, mb/sec=" + builder.mbPerSec;

                System.out.println(results);

                if (verify) {

                    /*
                     * Verify the generated index segment against the source
                     * B+Tree.
                     */

                    // @see BLZG-1501 (remove LRUNexus)
//                    if (LRUNexus.INSTANCE != null) {
//
//                        /*
//                         * Clear the records for the index segment from the
//                         * cache so we will read directly from the file. This
//                         * is necessary to ensure that the data on the file is
//                         * good rather than just the data in the cache.
//                         */
//
//                        System.out.println("Flushing index segment cache: "
//                                + builder.outFile);
//
//                        LRUNexus.INSTANCE.deleteCache(checkpoint.segmentUUID);
//
//                    }

                    final IndexSegmentStore segStore = new IndexSegmentStore(
                            outFile);

                    try {

                        final IndexSegment seg = segStore.loadIndexSegment();

                        try {

                            System.out.println("Verifying index segment: "
                                    + builder.outFile);

                            assertSameEntryIterator(name,
                                    btree.rangeIterator(),
                                    seg.rangeIterator());

                        } finally {

                            seg.close();

                        }

                    } finally {

                        segStore.close();

                    }

                }

            } // next source B+Tree.

            final long elapsedAll = System.currentTimeMillis() - beginAll;

            System.out.println("Processed " + names.size() + " indices in "
                    + elapsedAll + "ms");

        } finally {

            journal.close();

        }

    }

    /**
     * Verifies that the iterators visit tuples having the same data in the
     * same order.
     * 
     * @param name
     *            The name of the index (used in error messages).
     * @param expectedItr
     *            The expected tuples.
     * @param actualItr
     *            The actual tuples.
     */
    private static void assertSameEntryIterator(final String name,
            final ITupleIterator<?> expectedItr,
            final ITupleIterator<?> actualItr) {

        long nvisited = 0L;

        while (expectedItr.hasNext()) {

            if (!actualItr.hasNext())
                throw new RuntimeException(name
                        + ":: Expecting another index entry: nvisited="
                        + nvisited);

            final ITuple<?> expectedTuple = expectedItr.next();

            final ITuple<?> actualTuple = actualItr.next();

            nvisited++;

            if (!BytesUtil.bytesEqual(expectedTuple.getKey(),
                    actualTuple.getKey())) {

                throw new RuntimeException(name + ":: Wrong key: nvisited="
                        + nvisited + ", expected=" + expectedTuple
                        + ", actual=" + actualTuple);

            }

            if (!BytesUtil.bytesEqual(expectedTuple.getValue(),
                    actualTuple.getValue())) {

                throw new RuntimeException(name + ":: Wrong value: nvisited="
                        + nvisited + ", expected=" + expectedTuple
                        + ", actual=" + actualTuple);

            }

        }

        if (actualItr.hasNext())
            throw new RuntimeException(name + ":: Not expecting more tuples");

    }

}