package com.bigdata.resources;

import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.UUID;

import com.bigdata.btree.BTree;
import com.bigdata.btree.ILocalBTreeView;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.IndexSegment;
import com.bigdata.btree.IndexSegmentStore;
import com.bigdata.journal.IConcurrencyManager;
import com.bigdata.journal.ITx;
import com.bigdata.journal.TimestampUtility;
import com.bigdata.mdi.IResourceMetadata;
import com.bigdata.mdi.LocalPartitionMetadata;
import com.bigdata.mdi.SegmentMetadata;
import com.bigdata.service.Event;
import com.bigdata.service.EventResource;

/**
 * Task builds an {@link IndexSegment} from the mutable {@link BTree} and zero
 * or more additional sources in the index partition view and then atomically
 * updates the view (aka an incremental build).
 * <p>
 * The build uses the mutable {@link BTree} as of the lastCommitTime for the
 * old journal PLUS ZERO OR MORE additional source(s) taken in view order up to
 * but not including the first source in the view with significant content.
 * This lets us keep the #of {@link IndexSegment}s in the view down without
 * incurring the cost of a compacting merge. (The cost of the compacting merge
 * itself comes from having a large index segment in the view, generally in the
 * last position of the view.) In turn, this keeps the cost of overflow down
 * and can be a significant win if there are a number of large index partitions
 * that receive a few writes in each overflow.
 * <p>
 * For example, assuming a large index segment exists from a previous
 * compacting merge, then once the #of writes exceeds the "copy" threshold
 * there will be an index build. The view will then have [live, smallSeg1,
 * largeSeg1]. The next time the copy threshold is exceeded we would get [live,
 * smallSeg2, smallSeg1, largeSeg1]. However, if we include smallSeg1 in the
 * build, then we get [live, smallSeg2, largeSeg1]. This can continue until we
 * have enough data to warrant a split, or until we have another "large"
 * segment but not yet enough data to split, at which point we get [live,
 * largeSeg2, largeSeg1] and then [live, smallSeg3, largeSeg2, largeSeg1].
 * <p>
 * Note: As its last action, this task submits an
 * {@link AtomicUpdateIncrementalBuildTask} which replaces the view with one
 * defined by the current {@link BTree} on the journal and the newly built
 * {@link IndexSegment}.
 * <p>
 * Note: If the task fails, then the output {@link IndexSegment} will be
 * deleted.
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
public class IncrementalBuildTask extends AbstractPrepareTask<BuildResult> {

    final private ViewMetadata vmd;

    /**
     * @param vmd
     *            Metadata about the index partition view.
     */
    public IncrementalBuildTask(final ViewMetadata vmd) {

        super(vmd.resourceManager, TimestampUtility
                .asHistoricalRead(vmd.commitTime), vmd.name);

        this.vmd = vmd;

    }

    @Override
    protected void clearRefs() {

        // release soft references.
        vmd.clearRef();

    }
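    /*
     * Illustrative sketch only (this task is normally scheduled by the
     * overflow machinery rather than invoked directly, and [someExecutor] is a
     * hypothetical stand-in for whatever service runs overflow tasks): given a
     * ViewMetadata [vmd] captured during synchronous overflow, the
     * prepare/atomic-update protocol amounts to
     *
     *     final IncrementalBuildTask task = new IncrementalBuildTask(vmd);
     *
     *     final BuildResult result = someExecutor.submit(task).get();
     *
     * By the time get() returns, doTask() below has already submitted the
     * AtomicUpdateIncrementalBuildTask, so the live view has been replaced by
     * {live BTree, new IndexSegment, any sources not absorbed by the build}.
     */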
    /**
     * Build an {@link IndexSegment} from one or more sources for an index
     * partition view. The sources are chosen in view order. New sources are
     * incorporated until too much work would be performed for the lightweight
     * semantics of a "build". If all sources are incorporated by the build,
     * then the result is identical to a compacting merge.
     * 
     * @return The {@link BuildResult}.
     */
    protected BuildResult doTask() throws Exception {

        final Event e = new Event(resourceManager.getFederation(),
                new EventResource(vmd.indexMetadata), OverflowActionEnum.Build,
                vmd.getParams()).start();

        BuildResult buildResult = null;
        try {

            if (resourceManager.isOverflowAllowed())
                throw new IllegalStateException();

            try {

                /*
                 * Figure out which sources will be used in the build
                 * operation. The sources are chosen in order. The first source
                 * is always a BTree on a journal and is always in the accepted
                 * view.
                 * 
                 * Note: The order of the sources MUST be maintained. This
                 * ensures that the generated index segment will preserve only
                 * the most recently written tuple (or delete marker) for each
                 * tuple in the accepted view. We are only permitted to purge
                 * deleted tuples when all sources are accepted in the build
                 * view since that is the only time we have a guarantee that
                 * there is not a deleted version of that tuple further back in
                 * history which would reemerge if we dropped the delete
                 * marker.
                 */
                final BuildViewMetadata buildViewMetadata = new BuildViewMetadata(
                        vmd.getView(), resourceManager.maximumBuildSegmentBytes,
                        e);

                e.addDetails(buildViewMetadata.getParams());

                if (INFO)
                    log.info("acceptedView: " + buildViewMetadata);

                /*
                 * Build the index segment from a view comprised of just the
                 * accepted sources.
                 */
                buildResult = resourceManager.buildIndexSegment(vmd.name,
                        buildViewMetadata.acceptedView,
                        buildViewMetadata.compactingMerge, vmd.commitTime,
                        null/* fromKey */, null/* toKey */, e);

                e.addDetails(buildResult.getParams());

                if (buildResult.sourceCount != buildViewMetadata.naccepted) {

                    throw new AssertionError("Build result has "
                            + buildResult.sourceCount + ", but expected "
                            + buildViewMetadata.naccepted + " : acceptedView="
                            + buildViewMetadata + ", buildResult="
                            + buildResult);

                }

                if (INFO)
                    log.info("buildResult=" + buildResult);

                {

                    /*
                     * Verify that the resource manager can open the new index
                     * segment. This provides verification both that the index
                     * segment is registered with the store manager and that
                     * the index segment can be read. However, we do not
                     * actually read the leaves of the index segment here so
                     * there still could be errors on the disk.
                     */
                    final IndexSegmentStore segStore = (IndexSegmentStore) resourceManager
                            .openStore(buildResult.segmentMetadata.getUUID());

                    assert segStore != null;

                    if (INFO)
                        log.info("indexSegmentStore="
                                + segStore.loadIndexSegment());

                }

            } finally {

                /*
                 * Release our hold on the source index partition view. We only
                 * needed it during the index partition build.
                 */
                clearRefs();

            }

            if (buildResult.compactingMerge
                    && buildResult.builder.getCheckpoint().length >= resourceManager.nominalShardSize) {

                /*
                 * If a compacting merge was performed and sumSegBytes exceeds
                 * the threshold, then do a split here just as if a
                 * CompactingMerge task had been run instead.
                 * 
                 * Note: This is unlikely since the build does not accept
                 * sources if they would cause a lot of work. The most likely
                 * reasons why this would happen are a single index partition
                 * on the journal which receives all writes, or a journal size
                 * which is a healthy multiple of the target shard size.
                 */

                // FIXME reconcile return type and enable post-merge split.
                // return new SplitCompactViewTask(vmd.name, buildResult);

            }

            try {

                /*
                 * Submit task that will update the definition of the index
                 * partition view and wait for it to complete.
                 */
                concurrencyManager.submit(
                        new AtomicUpdateIncrementalBuildTask(resourceManager,
                                concurrencyManager, vmd.name, vmd.indexMetadata
                                        .getIndexUUID(), buildResult, e)).get();

            } catch (Throwable t) {

                // make it releasable.
                resourceManager.retentionSetRemove(buildResult.segmentMetadata
                        .getUUID());

                // delete the generated index segment.
                resourceManager.deleteResource(buildResult.segmentMetadata
                        .getUUID(), false/* isJournal */);

                // re-throw the exception
                throw new Exception(t);

            }

            return buildResult;

        } finally {

            if (buildResult != null) {

                /*
                 * At this point the index segment was either incorporated into
                 * the new view in a restart safe manner or there was an error.
                 * Either way, we now remove the index segment store's UUID
                 * from the retentionSet so it will be subject to the release
                 * policy of the StoreManager.
                 */
                resourceManager.retentionSetRemove(buildResult.segmentMetadata
                        .getUUID());

            }

            e.end();

        }

    }
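    /*
     * Worked example of the "accepted view" logic used by doTask() above
     * (segment names and sizes are hypothetical): suppose the historical view
     * as of the old journal's lastCommitTime is
     *
     *     [oldBTree, seg2 (20MB), seg1 (2GB)]
     *
     * and maximumBuildSegmentBytes is well below 2GB. The build accepts
     * [oldBTree, seg2] and stops before seg1, so the atomic update yields the
     * live view
     *
     *     [liveBTree, newSeg (oldBTree + seg2), seg1]
     *
     * Because seg1 was not read, delete markers can NOT be purged from newSeg;
     * only when every source is accepted (an effective compacting merge) is it
     * safe to drop them.
     */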
    /**
     * <p>
     * The source is an {@link IndexSegment} that was built from the mutable
     * {@link BTree} associated with the lastCommitTime on the old journal of
     * some index partition. What we are doing is replacing the role of that
     * {@link BTree} on the closed out journal with the {@link IndexSegment}.
     * Note that the {@link IndexSegment} contains the same data as the
     * {@link BTree} as of the lastCommitTime. The new view (as defined by this
     * task) will be selected when the desired view is GTE the lastCommitTime.
     * The old view will be used whenever the desired view is LT the
     * lastCommitTime.
     * </p>
     * 
     * <pre>
     * journal A
     * view={A,...}
     * ---- sync overflow begins ----
     * create journal B
     * view={B,A,...}
     * Begin incremental build of segment from A (just the BTree state as identified by the lastCommitTime)
     * ---- sync overflow ends ----
     * ... build continues ...
     * ... writes against view={B,A,...} are written on B.
     * ... index segment S0 complete (based on A).
     * ...
     * atomic update task runs: view={B,S0,...}
     * ... writes continue.
     * </pre>
     * 
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
     * @version $Id$
     */
    static protected class AtomicUpdateIncrementalBuildTask extends
            AbstractAtomicUpdateTask<IResourceMetadata[]> {

        /**
         * The expected UUID of the scale-out index.
         */
        final protected UUID indexUUID;

        final protected BuildResult buildResult;

        final private Event parentEvent;

        /**
         * @param resourceManager
         * @param concurrencyManager
         * @param resource
         * @param indexUUID
         * @param buildResult
         * @param parentEvent
         */
        public AtomicUpdateIncrementalBuildTask(ResourceManager resourceManager,
                IConcurrencyManager concurrencyManager, String resource,
                UUID indexUUID, BuildResult buildResult, Event parentEvent) {

            super(resourceManager, ITx.UNISOLATED, resource);

            if (indexUUID == null)
                throw new IllegalArgumentException();

            if (buildResult == null)
                throw new IllegalArgumentException();

            if (!resource.equals(buildResult.name))
                throw new IllegalArgumentException();

            if (parentEvent == null)
                throw new IllegalArgumentException();

            this.indexUUID = indexUUID;

            this.buildResult = buildResult;

            this.parentEvent = parentEvent;

        }
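        /*
         * Worked example of the view-selection rule from the class javadoc
         * (timestamps are illustrative only): if the old journal's
         * lastCommitTime is 1000, then once this task commits, a historical
         * read at timestamp 1200 (GTE 1000) resolves against the new view
         * {B, S0, ...}, while a read at timestamp 900 (LT 1000) still resolves
         * against the old view {A, ...}.
         */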
        /**
         * <p>
         * Atomic update.
         * </p>
         * 
         * @return The ordered array of resources that define the
         *         post-condition view.
         */
        @Override
        protected IResourceMetadata[] doTask() throws Exception {

            // populated with the description of the ordered sources of the new view.
            final List<IResourceMetadata> newView = new LinkedList<IResourceMetadata>();

            /*
             * Note: The event is labeled a "build" even if all sources
             * participate in the build. This makes it easier to identify the
             * compacting merges in the events log. The compacting merges are
             * of interest since they are only triggered when the #of sources
             * in the view grows too large and they require more effort. By
             * contrast, some "builds" will in fact be compacting merges, but
             * they were selected as builds and they are compacting merges by
             * virtue of having so little work to do that it is cheaper to use
             * all sources in the view and thereby postpone a more intensive
             * compacting merge somewhat longer.
             */
            final Map<String, Object> v = buildResult.getParams();

            v.put("summary", OverflowActionEnum.Build + "(" + buildResult.name
                    + ")");

            final Event updateEvent = parentEvent.newSubEvent(
                    OverflowSubtaskEnum.AtomicUpdate).start();

            try {

                if (resourceManager.isOverflowAllowed())
                    throw new IllegalStateException();

                final SegmentMetadata segmentMetadata = buildResult.segmentMetadata;

                if (INFO)
                    log.info(buildResult.toString());

                /*
                 * Open the unisolated B+Tree on the live journal that is
                 * absorbing writes. We are going to update its index metadata.
                 * 
                 * Note: I am using AbstractTask#getIndex(String name) so that
                 * the concurrency control logic will notice the changes to the
                 * BTree and cause it to be checkpointed if this task succeeds
                 * normally.
                 */
                final ILocalBTreeView view = getIndex(getOnlyResource());

                // The live B+Tree.
                final BTree btree = view.getMutableBTree();

                // make sure that we are working with the same index.
                assertSameIndex(indexUUID, btree);

                if (view instanceof BTree) {

                    /*
                     * Note: there is an expectation that this is not a simple
                     * BTree because the build task is supposed to be invoked
                     * after an overflow event (or a view checkpoint), and that
                     * event should have re-defined the view to include the
                     * BTree on the new journal plus the historical view.
                     * 
                     * One explanation for finding a simple view here is that
                     * the old index was deleted and a new one created in its
                     * place. We check that above.
                     */
                    throw new RuntimeException("View is only a B+Tree: name="
                            + buildResult.name + ", pmd="
                            + view.getIndexMetadata().getPartitionMetadata());

                }

                if (INFO)
                    log.info("src=" + getOnlyResource() + ", counter="
                            + view.getCounter().get() + ", checkpoint="
                            + btree.getCheckpoint());

                // clone the current metadata record for the live index.
                final IndexMetadata indexMetadata = btree.getIndexMetadata()
                        .clone();

                /*
                 * This is the index partition definition on the live index -
                 * the one that will be replaced with a new view as the result
                 * of this atomic update.
                 */
                final LocalPartitionMetadata currentpmd = indexMetadata
                        .getPartitionMetadata();

                if (currentpmd == null) {

                    throw new IllegalStateException(
                            "Not an index partition: " + getOnlyResource());

                }

                // Check pre-conditions.
                final IResourceMetadata[] currentResources = currentpmd
                        .getResources();

                {

                    /*
                     * Verify that there are at least two resources in the
                     * current view:
                     * 
                     * 1. currentResources[0] is the mutable BTree on the live
                     * journal.
                     * 
                     * 2. currentResources[1] is either the BTree on the old
                     * journal (since closed out for writes so it is no longer
                     * mutable) or a previous snapshot of the mutable BTree
                     * decoupled from the mutable BTree by a view checkpoint
                     * operation.
                     */

                    if (currentResources.length < 2) {

                        throw new IllegalStateException(
                                "Expecting at least 2 resources in the view: "
                                        + Arrays.toString(currentResources));

                    }

                    if (!currentResources[0].getUUID().equals(
                            getJournal().getRootBlockView().getUUID())) {

                        throw new IllegalStateException(
                                "Expecting live journal to be the first resource: "
                                        + Arrays.toString(currentResources));

                    }

                    /*
                     * Verify that the 2nd resource in the view is also a BTree
                     * on a journal.
                     */
                    if (!currentResources[1].isJournal()) {

                        throw new IllegalStateException(
                                "Expecting a journal as the second resource in the view: "
                                        + Arrays.toString(currentResources));

                    }

                    // Note: This constraint does not apply when a view checkpoint was used.
//                    /*
//                     * Verify that the new index segment was built from a view
//                     * that did not include data from the live journal.
//                     */
//                    if (segmentMetadata.getCreateTime() >= getJournal()
//                            .getRootBlockView().getFirstCommitTime()) {
//
//                        throw new AssertionError(
//                                "IndexSegment includes data from the live journal?");
//
//                    }

                }

                // new view definition.
                final IResourceMetadata[] newResources;
                {

                    // the live journal.
                    newView.add(getJournal().getResourceMetadata());

                    /*
                     * The newly built index segment. This was built from at
                     * least one source, but it MAY have been built from more
                     * than one source.
                     */
                    newView.add(segmentMetadata);

                    /*
                     * The rest of the components of the old view.
                     * 
                     * Note: We start copying resources into the view AFTER the
                     * last source which was included in the view used to
                     * generate the index segment.
                     * 
                     * For example, if the index segment was built from a
                     * single journal (the old journal), then [startIndex := 1
                     * + 1 == 2]. So we retain resources in the current view
                     * starting at currentResources[2].
                     * 
                     * If there are 3 sources in the current view (new journal,
                     * old journal, and an index segment) and the sourceCount
                     * was 2, then the build was actually a compacting merge
                     * and [startIndex := 1 + 2 == 3]. Since 3 EQ
                     * currentResources.length we will not include ANY sources
                     * from the old view. This is the semantics of a compacting
                     * merge. All data in the view is captured by the data on
                     * the live journal and the newly built index segment
                     * [live, newSeg].
                     */
                    final int startIndex = 1 + buildResult.sourceCount;

                    for (int i = startIndex; i < currentResources.length; i++) {

                        newView.add(currentResources[i]);

                    }

                    newResources = (IResourceMetadata[]) newView
                            .toArray(new IResourceMetadata[] {});

                }
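                /*
                 * Concrete example of the arithmetic above (resource names are
                 * hypothetical): with currentResources = [liveJournal,
                 * oldJournal, segA, segB] and buildResult.sourceCount == 2
                 * (the build read oldJournal and segA), startIndex == 3, so
                 * only segB is carried forward and the new view becomes
                 * [liveJournal, newSeg, segB].
                 */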
                // describe the index partition.
                indexMetadata.setPartitionMetadata(new LocalPartitionMetadata(//
                        currentpmd.getPartitionId(),//
                        currentpmd.getSourcePartitionId(),//
                        currentpmd.getLeftSeparatorKey(),//
                        currentpmd.getRightSeparatorKey(),//
                        newResources, //
                        currentpmd.getIndexPartitionCause()
//                        , currentpmd.getHistory()
//                                + OverflowActionEnum.Build//
//                                + "(lastCommitTime="
//                                + segmentMetadata.getCreateTime()//
//                                + ",segment="
//                                + segmentMetadata.getUUID()//
//                                + ",#buildSources="
//                                + buildResult.sourceCount//
//                                + ",merge="
//                                + buildResult.compactingMerge//
//                                + ",counter="
//                                + btree.getCounter().get()//
//                                + ",oldResources="
//                                + Arrays.toString(currentResources) + ") "
                        ));

                // update the metadata associated with the btree
                btree.setIndexMetadata(indexMetadata);

                if (INFO)
                    log.info("Updated view: name=" + getOnlyResource()
                            + ", pmd=" + indexMetadata.getPartitionMetadata()
                            + toString("oldResources", currentResources)
                            + toString("newResources", newResources));

                /*
                 * Verify that the btree recognizes that it needs to be
                 * checkpointed.
                 * 
                 * Note: The atomic commit point is when this task commits.
                 */
                assert btree.needsCheckpoint();

                /*
                 * Update counter to reflect successful index partition build.
                 * 
                 * Note: All build tasks are reported as builds so that we can
                 * readily distinguish the tasks which were selected as
                 * compacting merges from those which were selected as builds.
                 * If you want to see how many tasks were "effective"
                 * compacting merges (because all sources were used) then you
                 * need to look at the events log for the indexSegmentBuild
                 * operation.
                 */
                resourceManager.overflowCounters.indexPartitionBuildCounter
                        .incrementAndGet();

                updateEvent.addDetail("newView", newView.toString());

                return newResources;

            } finally {

                updateEvent.end();

            }

        } // doTask()

    } // AtomicUpdate

}