package com.bigdata.resources;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.TimeoutException;

import com.bigdata.btree.BTree;
import com.bigdata.btree.ILocalBTreeView;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.IndexSegment;
import com.bigdata.btree.ScatterSplitConfiguration;
import com.bigdata.btree.proc.BatchLookup;
import com.bigdata.btree.proc.AbstractKeyArrayIndexProcedure.ResultBuffer;
import com.bigdata.btree.proc.BatchLookup.BatchLookupConstructor;
import com.bigdata.io.SerializerUtil;
import com.bigdata.journal.AbstractTask;
import com.bigdata.journal.IConcurrencyManager;
import com.bigdata.journal.IResourceManager;
import com.bigdata.journal.ITx;
import com.bigdata.journal.TimestampUtility;
import com.bigdata.mdi.IResourceMetadata;
import com.bigdata.mdi.LocalPartitionMetadata;
import com.bigdata.mdi.PartitionLocator;
import com.bigdata.mdi.SegmentMetadata;
import com.bigdata.resources.OverflowManager.ResourceScores;
import com.bigdata.service.DataService;
import com.bigdata.service.Event;
import com.bigdata.service.EventResource;
import com.bigdata.service.ILoadBalancerService;
import com.bigdata.service.MetadataService;
import com.bigdata.service.ndx.ClientIndexView;
import com.bigdata.util.Bytes;

/**
 * Task builds an {@link IndexSegment} from the fused view of an index partition
 * as of some historical timestamp and then atomically updates the view (aka a
 * compacting merge).
 * <p>
 * Note: This task may be used after {@link IResourceManager#overflow()} in
 * order to produce a compact view of the index as of the <i>lastCommitTime</i>
 * on the old journal.
 * <p>
 * Note: As its last action, this task submits an
 * {@link AtomicUpdateCompactingMergeTask} which replaces the view with one
 * defined by the current {@link BTree} on the journal and the newly built
 * {@link IndexSegment}.
 * <p>
 * Note: If the task fails, then the generated {@link IndexSegment} will be
 * deleted.
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
public class CompactingMergeTask extends AbstractPrepareTask<BuildResult> {

    final protected ViewMetadata vmd;

    /**
     * @param vmd
     *            The {@link ViewMetadata} for the index partition.
     */
    public CompactingMergeTask(final ViewMetadata vmd) {

        super(vmd.resourceManager, TimestampUtility
                .asHistoricalRead(vmd.commitTime), vmd.name);

        this.vmd = vmd;

    }

    @Override
    protected void clearRefs() {

        vmd.clearRef();

    }

    /**
     * Build an {@link IndexSegment} from the compacting merge of an index
     * partition.
     * 
     * @return The {@link BuildResult}.
     */
    protected BuildResult doTask() throws Exception {

        final Event e = new Event(resourceManager.getFederation(),
                new EventResource(vmd.indexMetadata), OverflowActionEnum.Merge,
                vmd.getParams()).start();

        BuildResult buildResult = null;
        try {

            try {

                if (resourceManager.isOverflowAllowed())
                    throw new IllegalStateException();

                /*
                 * Build the index segment.
                 * 
                 * Note: Since this is a compacting merge the view on the old
                 * journal as of the last commit time will be fully captured by
                 * the generated index segment. However, writes buffered by the
                 * live journal WILL NOT be present in that index segment and
                 * the post-condition view will include those writes.
                 */

                // build the index segment.
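                /*
                 * Note: The key range is null/null, so the build captures the
                 * entire key range of the historical fused view as of
                 * vmd.commitTime in a single IndexSegment.
                 */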
                buildResult = resourceManager.buildIndexSegment(vmd.name,
                        vmd.getView(), true/* compactingMerge */,
                        vmd.commitTime, null/* fromKey */, null/* toKey */, e);

            } finally {

                /*
                 * Release our hold on the source view - we only needed it when
                 * we did the index segment build.
                 */

                clearRefs();

            }

            if (buildResult.builder.getCheckpoint().length >= resourceManager.nominalShardSize) {

                /*
                 * If sumSegBytes exceeds the threshold, then do a split here.
                 */

                // FIXME reconcile return type and enable post-merge split.
//                return new SplitCompactViewTask(vmd.name, buildResult);

            }

            /*
             * @todo error handling should be inside of the atomic update task
             * since it has more visibility into the state changes and when we
             * can no longer delete the new index segment.
             */
            try {

                // scale-out index UUID.
                final UUID indexUUID = vmd.indexMetadata.getIndexUUID();

                // submit task and wait for it to complete
                concurrencyManager.submit(
                        new AtomicUpdateCompactingMergeTask(resourceManager,
                                concurrencyManager, vmd.name, indexUUID,
                                buildResult, e.newSubEvent(
                                        OverflowSubtaskEnum.AtomicUpdate, vmd
                                                .getParams()))).get();

//                /*
//                 * Verify that the view was updated. If the atomic update task
//                 * runs correctly then it will replace the IndexMetadata object
//                 * on the mutable BTree with a new view containing only the
//                 * live journal and the new index segment (for a compacting
//                 * merge). We verify that right now to make sure that the state
//                 * change to the BTree was noticed and resulted in a commit
//                 * before returning control to us here.
//                 * 
//                 * @todo comment this out or replicate for the index build task
//                 * also?
//                 */
//                concurrencyManager
//                        .submit(
//                                new VerifyAtomicUpdateTask(resourceManager,
//                                        concurrencyManager, vmd.name,
//                                        indexUUID, result)).get();

            } catch (Throwable t) {

                // make it releasable.
                resourceManager.retentionSetRemove(buildResult.segmentMetadata
                        .getUUID());

                // delete the generated index segment.
                resourceManager
                        .deleteResource(buildResult.segmentMetadata.getUUID(),
                                false/* isJournal */);

                // re-throw the exception
                throw new Exception(t);

            }

            if (resourceManager.compactingMergeWithAfterAction) {

                /*
                 * Consider possible after-actions now that the view is
                 * compact. If any is selected, then it will be executed in the
                 * current thread.
                 */

                final AbstractTask<?> afterActionTask = chooseAfterActionTask();

                if (afterActionTask != null) {

                    afterActionTask.call();

                }

            }

            return buildResult;

        } finally {

            if (buildResult != null) {

                /*
                 * At this point the index segment was either incorporated into
                 * the new view in a restart safe manner or there was an error.
                 * Either way, we now remove the index segment store's UUID
                 * from the retentionSet so it will be subject to the release
                 * policy of the StoreManager.
                 */

                resourceManager.retentionSetRemove(buildResult.segmentMetadata
                        .getUUID());

            }

            e.end();

        }

    }

    /**
     * Now that the index partition is compact, decide if we will take any
     * after action, such as (move, join, split, tailSplit, scatterSplit,
     * etc.). All of these operations are much cheaper while the index is
     * compact, which is why we do them here.
     * <p>
     * Note: asynchronous overflow processing WILL NOT complete until the
     * CompactingMergeTask is done. This means that we will still be reading
     * from the same journal. As long as we are reading from the same ordered
     * set of resources the lastCommitTime chosen here is somewhat arbitrary.
     * <p>
     * The updated view metadata as of the last commit time on the live
     * journal.
     * 
     * FIXME Concurrent operations can replace the view definition.
     * However, what would not be good is if they changed the set of resources
     * in the view. The AtomicUpdate of the after action task MUST check for
     * this precondition (same set of resources in the view) and abort (and
     * clean up any intermediate files) if the precondition has been violated
     * (no harm is done if we abort, just some lost work).
     * 
     * @todo split + move and friends seem unnecessarily complicated. We can
     *       just move anything that is compact. [Clean up the tasks to remove
     *       this stuff.]
     * 
     * @todo We might be better off running {@link #chooseAfterActionTask()}
     *       from inside of the atomic update and then doing any work there
     *       while we have the lock on the shard. This will prevent any new
     *       data from building up and can help ensure that the preconditions
     *       for the operation remain valid. This might also help simplify the
     *       HA design.
     * 
     * @todo Once we have flow control on writes we can save the DS a lot of
     *       work by not accepting new writes for an index partition when we
     *       are going to compact it, move it, split it, etc.
     */
    private AbstractTask<?> chooseAfterActionTask() {

        final ViewMetadata vmd = new ViewMetadata(resourceManager,
                resourceManager.getLiveJournal().getLastCommitTime(),
                this.vmd.name, resourceManager.getIndexCounters(this.vmd.name));

        /*
         * Scatter split?
         * 
         * Note: Scatter splits are considered before tail splits and normal
         * splits since they can only be taken when there is a single index
         * partition for a scale-out index. The other kinds of splits are used
         * once the index has already been distributed onto the cluster by a
         * scatter split.
         */
        {

            final ScatterSplitConfiguration ssc = vmd.indexMetadata
                    .getScatterSplitConfiguration();

            if (// only a single index partition?
                (vmd.getIndexPartitionCount() == 1L)//
                // scatter splits enabled for service
                && resourceManager.scatterSplitEnabled//
                // scatter splits enabled for index
                && ssc.isEnabled()//
                // the view is compact (only one segment).
                && vmd.compactView//
                // trigger scatter split before too much data builds up in one place.
                && vmd.getPercentOfSplit() >= ssc.getPercentOfSplitThreshold()) {

                // Target data services for the new index partitions.
                final UUID[] moveTargets = getScatterSplitTargets(ssc);

                if (moveTargets != null) {

                    // #of splits.
                    final int nsplits = ssc.getIndexPartitionCount() == 0//
                            ? (2 * moveTargets.length) // two per data service.
                            : ssc.getIndexPartitionCount()//
                            ;

                    if (log.isInfoEnabled())
                        log.info("will scatter: " + vmd);

                    // scatter split task.
                    return new ScatterSplitTask(vmd, nsplits, moveTargets);

                }

            }

        }

        /*
         * Tail split?
         * 
         * Note: We can do a tail split as long as we are "close" to a full
         * index partition. We have an expectation that the head of the split
         * will be over the minimum capacity. While the tail of the split
         * MIGHT be under the minimum capacity, if there are continued heavy
         * writes on the tail then it should reach the minimum capacity for an
         * index partition by the time the live journal overflows again.
         */
        if (vmd.isTailSplit() && false) {

            /*
             * FIXME The current tailSplit implementation operates against the
             * BTree, NOT the FusedView and NOT the IndexSegment. It needs to
             * be refactored before it can be an after action for a compacting
             * merge.
             * 
             * It is written to identify the separator key based on an
             * examination of the mutable BTree. Once it has the separator key
             * it then does a normal build for each key-range. [@todo It
             * probably should use a compacting merge in order to avoid sharing
             * index segments across shards.]
             */

            if (log.isInfoEnabled())
                log.info("will tailSplit: " + vmd.name);

            return new SplitTailTask(vmd, null/* moveTarget */);

        }

        /*
         * Should split?
         * 
         * Note: Split is NOT allowed if the index is currently being moved
         * onto this data service. Split, join, and move are all disallowed
         * until the index partition move is complete since each of them would
         * cause the index partition to become invalidated.
         */
        if (vmd.getPercentOfSplit() > 1.0) {

            if (log.isInfoEnabled())
                log.info("will split : " + vmd);

            return new SplitIndexPartitionTask(vmd, (UUID) null/* moveTarget */);

        }

        /*
         * Join undercapacity shard (either with local rightSibling or move to
         * join with remote rightSibling).
         * 
         * If the rightSibling of an undercapacity index partition is also
         * local, then a {@link JoinIndexPartitionTask} is used to join those
         * index partitions.
         * 
         * If the rightSibling of an undercapacity index partition is remote,
         * then a {@link MoveTask} is created to move the undercapacity index
         * partition to the remote data service.
         * 
         * Note: joins are only considered when the rightSibling of an index
         * partition exists. The last index partition has [rightSeparatorKey
         * == null] and there is no rightSibling for that index partition.
         * 
         * @todo What kinds of guarantees do we have that a local rightSibling
         * will be around by the time the JoinIndexPartitionTask runs?
         * 
         * @todo This has even more assumptions about [lastCommitTime] than
         * the other tasks. All these tasks need to be reviewed to make sure
         * that there are no gaps created by this refactor. Running these
         * after action tasks while we hold the write lock on the source shard
         * could probably help us to reduce the possibility of any such
         * problems but might require a revisit / refactor / simplification of
         * the tasks.
         * 
         * FIXME Make sure that we are not running compacting merges as part
         * of the split, scatter split and other tasks. Some tasks used to do
         * this in order to have a compact view.
         */
        if (resourceManager.joinsEnabled
                && vmd.pmd.getRightSeparatorKey() != null
                && vmd.getPercentOfSplit() < resourceManager.percentOfJoinThreshold) {

            final String scaleOutIndexName = vmd.indexMetadata.getName();

            final PartitionLocator rightSiblingLocator = getRightSiblingLocator(
                    scaleOutIndexName, vmd.commitTime);

            if (rightSiblingLocator != null) {

                final UUID targetDataServiceUUID = rightSiblingLocator
                        .getDataServiceUUID();

                final String[] resources = new String[2];

                // the underutilized index partition.
                resources[0] = DataService.getIndexPartitionName(
                        scaleOutIndexName, vmd.pmd.getPartitionId());

                // its right sibling (may be local or remote).
                resources[1] = DataService.getIndexPartitionName(
                        scaleOutIndexName, rightSiblingLocator
                                .getPartitionId());

                if (resourceManager.getDataServiceUUID().equals(
                        targetDataServiceUUID)) {

                    /*
                     * JOIN underutilized index partition with its local
                     * rightSibling.
                     * 
                     * Note: This is only joining two index partitions at a
                     * time. It's possible to do more than that if it happens
                     * that N > 2 underutilized sibling index partitions are
                     * on the same data service, but that is a relatively
                     * unlikely combination of events.
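                     * 
                     * For example (hypothetical partition names): if shards
                     * P5 and P6 of the same scale-out index are both on this
                     * data service and P5 is undercapacity with P6 as its
                     * rightSibling, the join produces a single shard spanning
                     * [P5.leftSeparator, P6.rightSeparator).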
                     */

                    if (log.isInfoEnabled())
                        log.info("Will JOIN: " + Arrays.toString(resources));

                    final String rightSiblingName = DataService
                            .getIndexPartitionName(scaleOutIndexName,
                                    rightSiblingLocator.getPartitionId());

                    final ViewMetadata vmd2 = new ViewMetadata(
                            resourceManager, vmd.commitTime, rightSiblingName,
                            resourceManager.getIndexCounters(rightSiblingName));

                    return new JoinIndexPartitionTask(resourceManager,
                            vmd.commitTime, resources, new ViewMetadata[] {
                                    vmd, vmd2 });

                } else {

                    /*
                     * MOVE underutilized index partition to the data service
                     * hosting the right sibling.
                     * 
                     * @todo The decision to join shards is asymmetric (an
                     * undercapacity shard is moved to its rightSibling).
                     * However, it is possible that its rightSibling was also
                     * undercapacity and was either moved to or locally joined
                     * with its rightSibling (in which case its partition
                     * identifier would have been changed). To avoid these
                     * edge cases there could be a global synchronous
                     * agreement for move/join decisions.
                     */

                    if (log.isInfoEnabled()) {

                        // get the target service name.
                        String targetDataServiceName;
                        try {
                            targetDataServiceName = resourceManager
                                    .getFederation().getDataService(
                                            targetDataServiceUUID)
                                    .getServiceName();
                        } catch (Throwable t) {
                            targetDataServiceName = targetDataServiceUUID
                                    .toString();
                        }

                        log.info("willMoveToJoinWithRightSibling" + "( "
                                + vmd.name + " -> " + targetDataServiceName //
                                + ", leftSibling=" + resources[0] //
                                + ", rightSibling=" + resources[1] //
                                + ")");

                    }

                    return new MoveTask(vmd, targetDataServiceUUID);

                }

            } // rightSibling != null

        } // if(join)

        /*
         * Move (to shed or redistribute load).
         * 
         * @todo We should prefer to move smaller shards (faster to move) or
         * "hotter" shards (sheds more workload). There should be a way to
         * estimate how much workload will be transferred so we know when we
         * are done.
         * 
         * FIXME We should limit the #of shards that we move in a given period
         * of time to allow both this host and the target host an opportunity
         * to adapt to their new load. [An exception would be if this host was
         * critically overloaded, but that should probably be handled by
         * different logic.]
         */
        ILoadBalancerService loadBalancerService = null;
        if (vmd.getPercentOfSplit() < resourceManager.maximumMovePercentOfSplit
                && resourceManager.maximumMovesPerTarget != 0
                && resourceManager.getLiveJournal().getName2Addr()
                        .rangeCount() > resourceManager.minimumActiveIndexPartitions
                && (loadBalancerService = getLoadBalancerService()) != null
                && shouldMove(loadBalancerService)) {

            // the UUID of this data service.
            final UUID sourceServiceUUID = resourceManager.getDataServiceUUID();

            // Obtain UUID of a relatively underutilized data service.
            final UUID targetDataServiceUUID = getMoveTarget(
                    sourceServiceUUID, loadBalancerService);

            if (targetDataServiceUUID != null) {

                if (log.isInfoEnabled()) {

                    // get the target service name.
                    String targetDataServiceName;
                    try {
                        targetDataServiceName = resourceManager
                                .getFederation().getDataService(
                                        targetDataServiceUUID)
                                .getServiceName();
                    } catch (Throwable t) {
                        targetDataServiceName = targetDataServiceUUID
                                .toString();
                    }

                    log.info("willMove" + "( " + vmd.name + " -> "
                            + targetDataServiceName + ")");

                }

                // Move the shard to the target host.
                return new MoveTask(vmd, targetDataServiceUUID);

            }

        }

        // No after action was chosen.
        return null;

    }

    /**
     * Return the {@link ILoadBalancerService} if it can be discovered.
     * 
     * @return the {@link ILoadBalancerService} if it can be discovered and
     *         otherwise <code>null</code>.
     */
    private ILoadBalancerService getLoadBalancerService() {

        // lookup the load balancer service.
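        // Note: Discovery goes out over the network and may fail or return
        // null. Callers treat a null return as "no load balancer available"
        // and simply skip the move decision.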
        final ILoadBalancerService loadBalancerService;
        try {

            loadBalancerService = resourceManager.getFederation()
                    .getLoadBalancerService();

        } catch (Exception ex) {

            log.warn("Could not discover the load balancer service", ex);

            return null;

        }

        if (loadBalancerService == null) {

            log.warn("Could not discover the load balancer service");

            return null;

        }

        return loadBalancerService;

    }

    /**
     * Figure out if this data service is considered to be highly utilized, in
     * which case the DS should shed some index partitions.
     * <p>
     * Note: We consult the load balancer service on this since it is able to
     * put the load of this service into perspective by also considering the
     * load on the other services in the federation.
     * 
     * @param loadBalancerService
     *            The load balancer.
     */
    protected boolean shouldMove(final ILoadBalancerService loadBalancerService) {

        if (loadBalancerService == null)
            throw new IllegalArgumentException();

        // inquire if this service is highly utilized.
        final boolean highlyUtilizedService;
        try {

            final UUID serviceUUID = resourceManager.getDataServiceUUID();

            highlyUtilizedService = loadBalancerService
                    .isHighlyUtilizedDataService(serviceUUID);

        } catch (Exception ex) {

            log.warn("Could not determine if this data service is highly utilized");

            return false;

        }

        if (!highlyUtilizedService) {

            if (log.isInfoEnabled())
                log.info("Service is not highly utilized.");

            return false;

        }

        /*
         * At this point we know that the LBS considers this host and service
         * to be highly utilized (relative to the other hosts and services).
         * If there is evidence of resource exhaustion for critical resources
         * (CPU, RAM, or DISK) then we will MOVE index partitions in order to
         * shed some load. Otherwise, we will SPLIT hot index partitions in
         * order to increase the potential concurrency of the workload for
         * this service.
         * 
         * Note: CPU is the only fungible resource since things will just slow
         * down if a host has 100% CPU while it can die if it runs out of DISK
         * or RAM (including if it begins to swap heavily).
         * 
         * @todo config options for these triggers.
         */

        final ResourceScores resourceScores = resourceManager
                .getResourceScores();

        final boolean shouldMove = //
        // heavy CPU utilization.
        (resourceScores.percentCPUTime >= resourceManager.movePercentCpuTimeThreshold)
                // swapping heavily.
                || (resourceScores.majorPageFaultsPerSec > 20)
                // running out of disk (data dir).
                || (resourceScores.dataDirBytesFree < Bytes.gigabyte * 5)
                // running out of disk (tmp dir).
                || (resourceScores.tmpDirBytesFree < Bytes.gigabyte * .5);

        return shouldMove;

//        if (shouldMove) {
//
//            return chooseMoves(loadBalancerService);
//
//        }
//
//        return chooseHotSplits();

    }

    /**
     * Obtain the UUID of some relatively underutilized data service.
     * 
     * FIXME The LBS should interpret the excludedServiceUUID as the source
     * service UUID and then provide a list of those services having an LBS
     * computed service score which is significantly lower than the score for
     * this service. Changing this will break some unit tests (for the LBS
     * behavior).
     */
    private UUID getMoveTarget(final UUID sourceServiceUUID,
            final ILoadBalancerService loadBalancerService) {

        try {

            // request under-utilized data service UUIDs (RMI).
            final UUID[] uuids = loadBalancerService
                    .getUnderUtilizedDataServices(//
                            0, // minCount - no lower bound.
                            1, // maxCount - at most one service.
                            sourceServiceUUID // exclude this data service.
                    );

            if (uuids != null && uuids.length > 0) {

                // Found a move target.
                return uuids[0];

            }

            // No move target.
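            // (The LBS did not report a less utilized service: either all
            // services are comparably loaded or only this excluded service
            // qualified.)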
            return null;

        } catch (TimeoutException t) {

            log.warn(t.getMessage());

            return null;

        } catch (InterruptedException t) {

            log.warn(t.getMessage());

            return null;

        } catch (Throwable t) {

            log.error("Could not obtain target service UUIDs: ", t);

            return null;

        }

    }

    /**
     * Locate the right sibling for this index partition.
     * <p>
     * Note: default key/val serializers are used.
     * 
     * @return The locator for the right sibling -or- <code>null</code> if no
     *         right sibling could be found (which is an error).
     * 
     * @todo This does not have to be a batch lookup any more. It could use
     *       the {@link ClientIndexView} class.
     */
    private PartitionLocator getRightSiblingLocator(
            final String scaleOutIndexName, final long lastCommitTime) {

        final BatchLookup op = BatchLookupConstructor.INSTANCE.newInstance(
                0/* fromIndex */, 1/* toIndex */, new byte[][] { vmd.pmd
                        .getRightSeparatorKey() }, null/* vals */);

        final ResultBuffer resultBuffer;
        try {

            resultBuffer = (ResultBuffer) resourceManager.getFederation()
                    .getMetadataService().submit(
                            TimestampUtility.asHistoricalRead(lastCommitTime),
                            MetadataService
                                    .getMetadataIndexName(scaleOutIndexName),
                            op).get();

        } catch (Exception e) {

            log.error("Could not locate rightSiblings: index="
                    + scaleOutIndexName, e);

            return null;

        }

        // the locator for the rightSibling.
        return (PartitionLocator) SerializerUtil.deserialize(resultBuffer
                .getValues().get(0));

    }

    /**
     * Identify the target data services for the new index partitions.
     * <p>
     * Note that when maxCount is ZERO (0) ALL joined data services will be
     * reported.
     * <p>
     * Note: This makes sure that _this_ data service is included in the array
     * so that we will leave at least one of the post-split index partitions
     * on this data service.
     * 
     * @todo For a system which has been up and running for a while we would
     *       be better off using the LBS reported move targets rather than all
     *       discovered data services. However, for a new federation we are
     *       better off with all discovered data services since there is less
     *       uncertainty about which services will be reported.
     * 
     * @todo move to OverflowManager?
     */
    private UUID[] getScatterSplitTargets(final ScatterSplitConfiguration ssc) {

        final UUID[] a = resourceManager.getFederation().getDataServiceUUIDs(
                ssc.getDataServiceCount()/* maxCount */);

        if (a == null || a.length == 1) {

            if (log.isInfoEnabled())
                log.info("Will not scatter split - insufficient data services discovered.");

            // abort scatter split logic.
            return null;

        }

        final Set<UUID> tmp = new HashSet<UUID>(Arrays.asList(a));

        tmp.add(resourceManager.getDataServiceUUID());

        return tmp.toArray(new UUID[tmp.size()]);

    }

//    /**
//     * A paranoia test that verifies that the definition of the view was in
//     * fact updated.
//     * 
//     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
//     * @version $Id$
//     */
//    static private class VerifyAtomicUpdateTask extends AbstractTask<Void> {
//
//        protected final ResourceManager resourceManager;
//
//        final protected BuildResult buildResult;
//
//        final private Event updateEvent;
//
//        /**
//         * @param resourceManager
//         * @param concurrencyManager
//         * @param resource
//         * @param buildResult
//         */
//        public VerifyAtomicUpdateTask(ResourceManager resourceManager,
//                IConcurrencyManager concurrencyManager, String resource,
//                UUID indexUUID, BuildResult buildResult, Event updateEvent) {
//
//            super(concurrencyManager, ITx.UNISOLATED, resource);
//
//            if (resourceManager == null)
//                throw new IllegalArgumentException();
//
//            if (buildResult == null)
//                throw new IllegalArgumentException();
//
//            if (!buildResult.compactingMerge)
//                throw new IllegalArgumentException();
//
//            if (!resource.equals(buildResult.name))
//                throw new IllegalArgumentException();
//
//            if (updateEvent == null)
//                throw new IllegalArgumentException();
//
//            this.resourceManager = resourceManager;
//
//            this.buildResult = buildResult;
//
//            this.updateEvent = updateEvent;
//
//        }
//
//        /**
//         * Verify that the update was correctly registered on the mutable
//         * {@link BTree}.
//         * 
//         * @return <code>null</code>
//         */
//        @Override
//        protected Void doTask() throws Exception {
//
//            updateEvent.start();
//
//            try {
//
//                if (resourceManager.isOverflowAllowed())
//                    throw new IllegalStateException();
//
//                final SegmentMetadata segmentMetadata = buildResult.segmentMetadata;
//
//                // the correct view definition.
//                final IResourceMetadata[] expected = new IResourceMetadata[] {
//                        // the live journal.
//                        getJournal().getResourceMetadata(),
//                        // the newly built index segment.
//                        segmentMetadata
//                };
//
//                /*
//                 * Open the unisolated B+Tree on the live journal that is
//                 * absorbing writes and verify the definition of the view.
//                 */
//                final ILocalBTreeView view = (ILocalBTreeView) getIndex(getOnlyResource());
//
//                // The live B+Tree.
//                final BTree btree = view.getMutableBTree();
//
//                final LocalPartitionMetadata pmd = btree.getIndexMetadata()
//                        .getPartitionMetadata();
//
//                final IResourceMetadata[] actual = pmd.getResources();
//
//                if (expected.length != actual.length) {
//
//                    throw new RuntimeException("expected=" + expected
//                            + ", but actual=" + actual);
//
//                }
//
//                for (int i = 0; i < expected.length; i++) {
//
//                    if (!expected[i].equals(actual[i])) {
//
//                        throw new RuntimeException("Differs at index=" + i
//                                + ", expected=" + expected + ", but actual="
//                                + actual);
//
//                    }
//
//                }
//
//                return null;
//
//            } finally {
//
//                updateEvent.end();
//
//            }
//
//        }
//
//    }

    /**
     * <p>
     * The source view is pre-overflow (the last writes are on the old
     * journal) while the current view is post-overflow (reflects writes made
     * since overflow). What we are doing is replacing the pre-overflow
     * history with an {@link IndexSegment}.
     * </p>
     * 
     * <pre>
     * journal A
     * view={A}
     * ---- sync overflow begins ----
     * create journal B
     * view={B,A}
     * Begin build segment from view={A} (identified by the lastCommitTime)
     * ---- sync overflow ends ----
     * ... build continues ...
     * ... writes against view={B,A}
     * ... index segment S0 complete (based on view={A}).
     * ...
     * atomic build update task runs: view={B,S0}
     * ... writes continue.
     * </pre>
     * 
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
     * @version $Id$
     */
    static protected class AtomicUpdateCompactingMergeTask extends
            AbstractAtomicUpdateTask<Void> {

        private final Event updateEvent;

        /**
         * The expected UUID of the scale-out index.
         */
        final protected UUID indexUUID;

        final protected BuildResult buildResult;

        /**
         * @param resourceManager
         * @param concurrencyManager
         * @param resource
         * @param buildResult
         */
        public AtomicUpdateCompactingMergeTask(ResourceManager resourceManager,
                IConcurrencyManager concurrencyManager, String resource,
                UUID indexUUID, BuildResult buildResult, Event updateEvent) {

            super(resourceManager, ITx.UNISOLATED, resource);

            if (indexUUID == null)
                throw new IllegalArgumentException();

            if (buildResult == null)
                throw new IllegalArgumentException();

            if (!buildResult.compactingMerge)
                throw new IllegalArgumentException();

            if (!resource.equals(buildResult.name))
                throw new IllegalArgumentException();

            if (updateEvent == null)
                throw new IllegalArgumentException();

            this.indexUUID = indexUUID;

            this.buildResult = buildResult;

            this.updateEvent = updateEvent;

        }

        /**
         * <p>
         * Atomic update.
         * </p>
         * 
         * @return <code>null</code>
         */
        @Override
        protected Void doTask() throws Exception {

            updateEvent.start();

            try {

                if (resourceManager.isOverflowAllowed())
                    throw new IllegalStateException();

                final SegmentMetadata segmentMetadata = buildResult.segmentMetadata;

                if (INFO)
                    log.info("Begin: name=" + getOnlyResource()
                            + ", newSegment=" + segmentMetadata);

                /*
                 * Open the unisolated B+Tree on the live journal that is
                 * absorbing writes. We are going to update its index
                 * metadata.
                 * 
                 * Note: I am using AbstractTask#getIndex(String name) so that
                 * the concurrency control logic will notice the changes to
                 * the BTree and cause it to be checkpointed if this task
                 * succeeds normally.
                 */
                final ILocalBTreeView view = (ILocalBTreeView) getIndex(getOnlyResource());

                // make sure that this is the same scale-out index.
                assertSameIndex(indexUUID, view.getMutableBTree());

                if (view instanceof BTree) {

                    /*
                     * Note: there is an expectation that this is not a simple
                     * BTree because the build task is supposed to be invoked
                     * after an overflow event, and that event should have
                     * re-defined the view to include the BTree on the new
                     * journal plus the historical view.
                     * 
                     * One explanation for finding a simple view here is that
                     * the view was a simple BTree on the old journal and the
                     * data was copied from the old journal into the new
                     * journal and then someone decided to do a build even
                     * though a copy had already been done. However, this is
                     * not a very good explanation since we try to avoid doing
                     * a build if we have already done a copy!
                     */

                    throw new RuntimeException("View is only a B+Tree: name="
                            + buildResult.name + ", pmd="
                            + view.getIndexMetadata().getPartitionMetadata());

                }

                // The live B+Tree.
                final BTree btree = view.getMutableBTree();

                if (INFO)
                    log.info("src=" + getOnlyResource() + ",counter="
                            + view.getCounter().get() + ",checkpoint="
                            + btree.getCheckpoint());

                assert btree != null : "Expecting index: " + getOnlyResource();

                // clone the current metadata record for the live index.
                final IndexMetadata indexMetadata = btree.getIndexMetadata()
                        .clone();

                /*
                 * This is the index partition definition on the live index -
                 * the one that will be replaced with a new view as the result
                 * of this atomic update.
                 */
                final LocalPartitionMetadata currentpmd = indexMetadata
                        .getPartitionMetadata();

                // Check pre-conditions.
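                /*
                 * Expected shape of this update (see the <pre> diagram in the
                 * class level javadoc): the pre-condition view is
                 * {liveJournal, ...older sources} and the post-condition view
                 * is {liveJournal, newSegment}.
                 */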
                if (currentpmd == null) {

                    throw new IllegalStateException("Not an index partition: "
                            + getOnlyResource());

                }

                final IResourceMetadata[] currentResources = currentpmd
                        .getResources();
                {

                    if (!currentResources[0].getUUID().equals(
                            getJournal().getRootBlockView().getUUID())) {

                        throw new IllegalStateException(
                                "Expecting live journal to be the first resource: "
                                        + Arrays.toString(currentResources));

                    }

                    /*
                     * Note: I have commented out a bunch of pre-condition
                     * tests that are not valid for histories such as:
                     * 
                     * history=create() register(0) split(0)
                     * copy(entryCount=314)
                     * 
                     * This case arises when there are not enough index
                     * entries written on the journal after a split to warrant
                     * a build so the buffered writes are just copied to the
                     * new journal. The resources in the view are:
                     * 
                     * 1. journal 2. segment
                     * 
                     * And this update will replace the segment.
                     */

//                    // the old journal's resource metadata.
//                    final IResourceMetadata oldJournalMetadata = oldResources[1];
//                    assert oldJournalMetadata != null;
//                    assert oldJournalMetadata instanceof JournalMetadata : "name="
//                            + getOnlyResource() + ", old pmd=" + oldpmd
//                            + ", segmentMetadata=" + buildResult.segmentMetadata;
//
//                    // live journal must be newer.
//                    assert journal.getRootBlockView().getCreateTime() > oldJournalMetadata
//                            .getCreateTime();

                    // new index segment built from a view that did not
                    // include data from the live journal.
                    assert segmentMetadata.getCreateTime() < getJournal()
                            .getRootBlockView().getFirstCommitTime() : "segment createTime LT journal 1st commit time"
                            + ": segmentMetadata="
                            + segmentMetadata
                            + ", journal: " + getJournal().getRootBlockView();

//                    if (oldResources.length == 3) {
//
//                        // the old index segment's resource metadata.
//                        final IResourceMetadata oldSegmentMetadata = oldResources[2];
//                        assert oldSegmentMetadata != null;
//                        assert oldSegmentMetadata instanceof SegmentMetadata;
//
//                        assert oldSegmentMetadata.getCreateTime() <= oldJournalMetadata
//                                .getCreateTime();
//
//                    }

                }

                // new view definition.
                final IResourceMetadata[] newResources = new IResourceMetadata[] {
                        // the live journal.
                        getJournal().getResourceMetadata(),
                        // the newly built index segment.
                        segmentMetadata };

                // describe the index partition.
                indexMetadata.setPartitionMetadata(new LocalPartitionMetadata(//
                        currentpmd.getPartitionId(),//
                        currentpmd.getSourcePartitionId(),//
                        currentpmd.getLeftSeparatorKey(),//
                        currentpmd.getRightSeparatorKey(),//
                        newResources, //
                        currentpmd.getIndexPartitionCause()
//                        currentpmd.getHistory()
//                                + OverflowActionEnum.Merge//
//                                + "(lastCommitTime="
//                                + segmentMetadata.getCreateTime()//
//                                + ",btreeEntryCount="
//                                + btree.getEntryCount()//
//                                + ",segmentEntryCount="
//                                + buildResult.builder.getCheckpoint().nentries//
//                                + ",segment="
//                                + segmentMetadata.getUUID()//
//                                + ",counter="
//                                + btree.getCounter().get()//
//                                + ",oldResources="
//                                + Arrays.toString(currentResources) + ") "
                        ));

                // update the metadata associated with the btree
                btree.setIndexMetadata(indexMetadata);

                if (INFO)
                    log.info("Updated view: name=" + getOnlyResource()
                            + ", pmd=" + indexMetadata.getPartitionMetadata());

                /*
                 * Verify that the btree recognizes that it needs to be
                 * checkpointed.
                 * 
                 * Note: The atomic commit point is when this task commits.
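                 * 
                 * The checkpoint itself is taken by the concurrency control
                 * layer when this task completes normally (see the note above
                 * where the view was opened via getIndex(String)).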
                 */
                assert btree.needsCheckpoint();
//                btree.writeCheckpoint();
//                {
//                    final long id0 = btree.getCounter().get();
//                    final long pid = id0 >> 32;
//                    final long mask = 0xffffffffL;
//                    final int ctr = (int) (id0 & mask);
//                    log.warn("name=" + getOnlyResource() + ", counter=" + id0
//                            + ", pid=" + pid + ", ctr=" + ctr);
//                }

                // notify successful index partition merge.
                resourceManager.overflowCounters.indexPartitionMergeCounter
                        .incrementAndGet();

                return null;

            } finally {

                updateEvent.end();

            }

        } // doTask()

    } // class AtomicUpdate

}