package com.bigdata.resources;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import com.bigdata.btree.ILocalBTreeView;
import com.bigdata.btree.ISimpleSplitHandler;
import com.bigdata.btree.IndexSegment;
import com.bigdata.journal.TimestampUtility;
import com.bigdata.mdi.MetadataIndex;
import com.bigdata.resources.SplitIndexPartitionTask.AtomicUpdateSplitIndexPartitionTask;
import com.bigdata.service.DataService;
import com.bigdata.service.Event;
import com.bigdata.service.EventResource;
import com.bigdata.service.Split;
import com.bigdata.sparse.SparseRowStore;
/**
 * Task splits an index partition into N equal-sized index partitions and
 * scatters those index partitions across the data services in the federation.
 * Unlike a normal split, this MAY result in index partitions which are under
 * the nominal minimum size requirements. The purpose of a scatter split is to
 * rapidly redistribute an index partition across the federation in order to
 * increase both the potential concurrency of operations on that index
 * partition and the resources which may be brought to bear on it.
* The "equal" splits are achieved by an "adjustment" to the split handler.
* <p>
* The task reads from the lastCommitTime of the old journal after an overflow.
* It uses a key range scan to sample the index partition, building an ordered
* set of {key,offset} tuples. Based on the actual #of index entries and the
* target #of index entries per index partition, it chooses the #of output index
* partitions, N, and selects N-1 {key,offset} tuples to split the index
* partition. If the index defines a constraint on the split rule, then that
* constraint will be applied to refine the actual split points, e.g., so as to
* avoid splitting a logical row of a {@link SparseRowStore}.
* <p>
* Once the N-1 split points have been selected, N index segments are built -
* one from each of the N key ranges which those N-1 split points define. Once
* the index segment for each split has been built, an
* {@link AtomicUpdateSplitIndexPartitionTask} will atomically re-define the
 * source index partition as N new index partitions. During the atomic update
 * the original index partition becomes un-defined and new index partitions are
* defined in its place which span the same total key range and have the same
* data.
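 * <p>
 * A minimal sketch of how this task might be created and submitted. It is
 * normally instantiated by the asynchronous overflow processing, so the
 * references below (<code>vmd</code>, <code>resourceManager</code>,
 * <code>concurrencyManager</code>) stand in for state held by that caller,
 * and the choice of two splits per data service is only an assumption:
 * <pre>
 * // Scatter across every discovered data service (zero means no limit;
 * // assumed discovery call on the federation).
 * final UUID[] moveTargets = resourceManager.getFederation()
 *         .getDataServiceUUIDs(0);
 *
 * // Assumed policy: two new index partitions per data service.
 * final int nsplits = 2 * moveTargets.length;
 *
 * final ScatterSplitTask task = new ScatterSplitTask(vmd, nsplits,
 *         moveTargets);
 *
 * // Submit for execution and await the outcome (SplitResult or BuildResult).
 * final AbstractResult result = (AbstractResult) concurrencyManager
 *         .submit(task).get();
 * </pre>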
*
* @see AtomicUpdateSplitIndexPartitionTask, which MUST be invoked in order to
* update the index partition definitions on the live journal and the
* {@link MetadataIndex} as an atomic operation.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*/
public class ScatterSplitTask extends
AbstractPrepareTask<AbstractResult> {
protected final ViewMetadata vmd;
/**
* The #of index partitions that will be generated when we split the source
* index partition.
*/
protected final int nsplits;
/**
* An array of move targets for the new index partitions. The index
* partitions will be assigned to the move targets using a round robin
* process. If one of the move targets is this data service, then the
* corresponding index partition will not be moved.
*/
protected final UUID[] moveTargets;
/**
     * The target size of a shard for the scatter split. This is computed by
     * dividing the size of the compact segment on the disk by one-half of the
     * #of desired splits.
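     * For example, with sumSegBytes of 200M and nsplits of 10 this is 200M /
     * (10 / 2) = 40M (integer division).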
*/
protected final long adjustedNominalShardSize;
/**
*
* @param vmd
* The metadata for the index partition to be split.
* @param nsplits
* The index will be split into this many index partitions
* without regard to the #of tuples in each split.
* @param moveTargets
* An array of move targets for the new index partitions. The
* index partitions will be assigned to the move targets using a
* round robin process. If one of the move targets is this data
* service, then the corresponding index partition will not be
* moved.
*/
protected ScatterSplitTask(final ViewMetadata vmd, final int nsplits,
final UUID[] moveTargets) {
super(vmd.resourceManager, TimestampUtility
.asHistoricalRead(vmd.commitTime), vmd.name);
if (vmd == null)
throw new IllegalArgumentException();
this.vmd = vmd;
if (vmd.pmd == null) {
throw new IllegalStateException("Not an index partition.");
}
if(!vmd.compactView) {
throw new IllegalStateException("Not a compact view.");
}
if (vmd.pmd.getSourcePartitionId() != -1) {
throw new IllegalStateException(
"Split not allowed during move: sourcePartitionId="
+ vmd.pmd.getSourcePartitionId());
}
if (nsplits <= 1)
throw new IllegalArgumentException();
        // moveTargets is dereferenced unconditionally in doTask(), so fail fast here.
        if (moveTargets == null || moveTargets.length == 0)
            throw new IllegalArgumentException();
        for (UUID t : moveTargets) {
            if (t == null)
                throw new IllegalArgumentException();
        }
this.nsplits = nsplits;
this.moveTargets = moveTargets;
this.adjustedNominalShardSize = vmd.sumSegBytes / (nsplits / 2);
}
@Override
protected void clearRefs() {
vmd.clearRef();
}
/**
* Breaks the index partition into N splits, where N was specified to the
* ctor, and redistributes those splits onto the move targets using a round
* robin.
*
* @return A {@link SplitResult} if the index partition was split into 2 or
* more index partitions -or- a {@link BuildResult} iff the index
* partition was not split.
*/
@Override
protected AbstractResult doTask() throws Exception {
final Event e = new Event(resourceManager.getFederation(),
new EventResource(vmd.indexMetadata), OverflowActionEnum.ScatterSplit,
vmd.getParams()).addDetail(
"summary",
OverflowActionEnum.ScatterSplit + "+" + OverflowActionEnum.Move + "("
+ vmd.name + ", nsplits=" + nsplits + ")").addDetail(
"moveTargets", Arrays.toString(moveTargets)).start();
SplitResult splitResult = null;
try {
if (resourceManager.isOverflowAllowed())
throw new IllegalStateException();
try {
final String name = vmd.name;
// Note: fused view for the source index partition.
final ILocalBTreeView src = vmd.getView();
/*
* Get the split points for the index. Each split point
* describes a new index partition. Together the split points
                 * MUST exactly span the source index partition's key range.
* There MUST NOT be any overlap in the key ranges for the
* splits.
*/
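                /*
                 * For example (purely illustrative separator keys): splitting
                 * at {B, D} defines the half-open ranges [fromKey, B), [B, D)
                 * and [D, toKey), which together cover the source key range
                 * exactly and without overlap.
                 */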
// The application split handler (if any).
final ISimpleSplitHandler splitHandler = vmd.indexMetadata
.getSplitHandler();
final Split[] splits = SplitUtility.getSplits(resourceManager,
vmd.pmd, (IndexSegment) src.getSources()[1],
adjustedNominalShardSize, splitHandler);
if (splits == null) {
final double overextension = ((double) vmd.sumSegBytes)
/ resourceManager.nominalShardSize;
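                    // E.g., a 400M segment with a 200M nominalShardSize gives
                    // overextension = 2.0.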
if (overextension > resourceManager.shardOverextensionLimit
&& !resourceManager.isDisabledWrites(vmd.name)) {
/*
* The shard is overextended (it is at least two times
* its nominal maximum size) and is refusing a split.
* Continuing to do incremental builds here will mask
* the problem and cause the cost of a merge on the
* shard to increase over time and will drag down
* performance for this DS. In order to prevent this we
* MUST disallow further writes on the shard. The shard
* can be re-enabled for writes by an administrative
* action once the problem has been fixed.
*
* Note: The default split behavior should always find a
                         * separator key to split the shard. The most likely
* cause for a problem is an application defined split
* handler. Rather than allowing a poorly written split
* handler to foul up the works, we disallow further
* writes onto this shard until the application has
                         * fixed its split handler.
*/
log.error("Shard will not split - writes are disabled"
+ ": name="
+ vmd.name
+ ", size="
+ vmd.sumSegBytes
+ ", overextended="
+ (int) overextension
+ "x"
+ ", splitHandler="
+ (splitHandler == null ? "N/A" : splitHandler
.getClass().getName()));
// Disable writes on the index partition.
resourceManager.disableWrites(vmd.name);
}
/*
* Do an incremental build.
*/
log.warn("No splits identified: will build: " + vmd);
// Incremental build.
return concurrencyManager.submit(
new IncrementalBuildTask(vmd)).get();
}
// The #of splits.
final int nsplits = splits.length;
if (INFO)
log.info("Will build index segments for " + nsplits
+ " splits for " + name + " : "
+ Arrays.toString(splits));
// validate the splits before processing them.
SplitUtility.validateSplits(src, splits);
splitResult = SplitUtility.buildSplits(vmd, splits, e);
} finally {
/*
* We are done building index segments from the source index
* partition view so we clear our references for that view.
*/
clearRefs();
}
/*
* Do the atomic update.
*/
SplitIndexPartitionTask
.doSplitAtomicUpdate(
resourceManager,
vmd,
splitResult,
OverflowActionEnum.ScatterSplit,
resourceManager.overflowCounters.indexPartitionSplitCounter,
e);
/*
* Note: Unlike a normal move where there are writes on the old
             * journal, all the historical data for each of the index
* partitions is in an index segment that we just built (new
* writes MAY be buffered on the live journal, so we still have
* to deal with that). Therefore we use a different entry point
* into the MOVE operation.
*
* Note: It is allowable for one of the move targets to be this
* data service, in which case we simply leave the corresponding
* index partition in place.
*/
final int nsplits = splitResult.buildResults.length;
final List<MoveTask.AtomicUpdate> moveTasks = new ArrayList<MoveTask.AtomicUpdate>(
nsplits);
// create the move tasks.
{
for (int i = 0; i < nsplits; i++) {
// choose the move target using a round robin.
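                    // E.g., with two targets {A, B}: splits 0, 2, ... go to A
                    // and splits 1, 3, ... go to B.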
final UUID moveTarget = moveTargets[i % moveTargets.length];
if (resourceManager.getDataServiceUUID().equals(moveTarget)) {
// ignore move to self.
if (INFO)
log.info("Ignoring move to self.");
continue;
}
/*
* Obtain a new partition identifier for the partition that
* will be created when we move the index partition to the
* target data service.
*/
final int newPartitionId = resourceManager
.nextPartitionId(vmd.indexMetadata.getName());
/*
* The name of the post-split index partition that is the
* source for the move operation.
*/
final String nameOfPartitionToMove = DataService
.getIndexPartitionName(vmd.indexMetadata.getName(),
splitResult.splits[i].pmd.getPartitionId());
/*
* Create a move task.
*
* Note: We do not explicitly delete the source index
* segment for the source index partition after the move. It
                     * will be required for historical views of that index
* partition in case any client gained access to the index
* partition after the split and before the move. It will
* eventually be released once the view of the source index
* partition becomes sufficiently aged that it falls off the
* head of the database history.
*/
moveTasks.add(new MoveTask.AtomicUpdate(resourceManager,
nameOfPartitionToMove, splitResult.buildResults[i],
moveTarget, newPartitionId, e));
}
}
/*
             * Submit the move tasks to be executed in parallel and await their
* outcomes.
*/
final List<Future<MoveResult>> futures = resourceManager
.getConcurrencyManager().invokeAll(moveTasks);
/*
* Log error if any move task failed (other than being canceled).
*/
for (Future<?> f : futures) {
if (!f.isCancelled()) {
try {
f.get();
} catch (ExecutionException ex) {
// log and continue.
log.error(ex, ex);
}
}
}
// Done.
return splitResult;
} finally {
if (splitResult != null) {
for (BuildResult buildResult : splitResult.buildResults) {
if (buildResult != null) {
/*
* At this point the index segment was either incorporated into
* the new view in a restart safe manner or there was an error.
* Either way, we now remove the index segment store's UUID from
* the retentionSet so it will be subject to the release policy
* of the StoreManager.
*/
resourceManager
.retentionSetRemove(buildResult.segmentMetadata
.getUUID());
}
}
}
e.end();
}
}
}