package com.bigdata.resources;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;

import com.bigdata.btree.ILocalBTreeView;
import com.bigdata.btree.ISimpleSplitHandler;
import com.bigdata.btree.IndexSegment;
import com.bigdata.journal.TimestampUtility;
import com.bigdata.mdi.MetadataIndex;
import com.bigdata.resources.SplitIndexPartitionTask.AtomicUpdateSplitIndexPartitionTask;
import com.bigdata.service.DataService;
import com.bigdata.service.Event;
import com.bigdata.service.EventResource;
import com.bigdata.service.Split;
import com.bigdata.sparse.SparseRowStore;

/**
 * Task splits an index partition into N equal sized index partitions and
 * scatters those index partitions across the data services in the federation.
 * Unlike a normal split, this MAY result in index partitions which are under
 * the nominal minimum size requirements. The purpose of a scatter split is to
 * rapidly redistribute an index partition across the federation, both to
 * increase the potential concurrency of operations on that index partition
 * and to permit more resources to be brought to bear on it. The "equal"
 * splits are achieved by an "adjustment" to the split handler.
 * <p>
 * The task reads from the lastCommitTime of the old journal after an overflow.
 * It uses a key range scan to sample the index partition, building an ordered
 * set of {key,offset} tuples. Based on the actual #of index entries and the
 * target #of index entries per index partition, it chooses the #of output
 * index partitions, N, and selects N-1 {key,offset} tuples to split the index
 * partition. If the index defines a constraint on the split rule, then that
 * constraint will be applied to refine the actual split points, e.g., so as
 * to avoid splitting a logical row of a {@link SparseRowStore}.
 * <p>
 * Once the N-1 split points have been selected, N index segments are built -
 * one from each of the N key ranges which those N-1 split points define. Once
 * the index segment for each split has been built, an
 * {@link AtomicUpdateSplitIndexPartitionTask} will atomically re-define the
 * source index partition as N new index partitions. During the atomic update
 * the original index partition becomes un-defined and new index partitions
 * are defined in its place which span the same total key range and have the
 * same data.
 * 
 * @see AtomicUpdateSplitIndexPartitionTask, which MUST be invoked in order to
 *      update the index partition definitions on the live journal and the
 *      {@link MetadataIndex} as an atomic operation.
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
public class ScatterSplitTask extends AbstractPrepareTask<AbstractResult> {

    /**
     * The metadata for the index partition view to be split.
     */
    protected final ViewMetadata vmd;

    /**
     * The #of index partitions that will be generated when we split the
     * source index partition.
     */
    protected final int nsplits;

    /**
     * An array of move targets for the new index partitions. The index
     * partitions will be assigned to the move targets using a round robin
     * process. If one of the move targets is this data service, then the
     * corresponding index partition will not be moved.
     */
    protected final UUID[] moveTargets;

    /**
     * The target size of a shard for the scatter split. This is computed by
     * dividing the size of the compact segment on the disk by the #of desired
     * splits.
     */
    protected final long adjustedNominalShardSize;
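    /*
     * Illustrative sketch only (not used by the task): the round robin in
     * doTask() assigns split i to moveTargets[i % moveTargets.length], and a
     * move whose target is this data service is skipped. For example, with
     * nsplits = 4 and hypothetical move targets { A, B, thisDataService },
     * split 0 goes to A, split 1 goes to B, split 2 stays on this data
     * service, and split 3 goes to A again.
     */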
    /**
     * 
     * @param vmd
     *            The metadata for the index partition to be split.
     * @param nsplits
     *            The index will be split into this many index partitions
     *            without regard to the #of tuples in each split.
     * @param moveTargets
     *            An array of move targets for the new index partitions. The
     *            index partitions will be assigned to the move targets using
     *            a round robin process. If one of the move targets is this
     *            data service, then the corresponding index partition will
     *            not be moved.
     */
    protected ScatterSplitTask(final ViewMetadata vmd, final int nsplits,
            final UUID[] moveTargets) {

        super(vmd.resourceManager, TimestampUtility
                .asHistoricalRead(vmd.commitTime), vmd.name);

        if (vmd == null)
            throw new IllegalArgumentException();

        this.vmd = vmd;

        if (vmd.pmd == null) {

            throw new IllegalStateException("Not an index partition.");

        }

        if (!vmd.compactView) {

            throw new IllegalStateException("Not a compact view.");

        }

        if (vmd.pmd.getSourcePartitionId() != -1) {

            throw new IllegalStateException(
                    "Split not allowed during move: sourcePartitionId="
                            + vmd.pmd.getSourcePartitionId());

        }

        if (nsplits <= 1)
            throw new IllegalArgumentException();

        if (moveTargets != null) {

            if (moveTargets.length == 0)
                throw new IllegalArgumentException();

            for (UUID t : moveTargets) {

                if (t == null)
                    throw new IllegalArgumentException();

            }

        }

        this.nsplits = nsplits;

        this.moveTargets = moveTargets;

        /*
         * Note: The divisor is (nsplits / 2) rather than nsplits, which
         * presumably reflects the fact that a shard is considered
         * overextended, and hence eligible for a split, once it reaches
         * roughly twice its nominal size (see the overextension handling in
         * doTask()).
         */
        this.adjustedNominalShardSize = vmd.sumSegBytes / (nsplits / 2);

    }

    @Override
    protected void clearRefs() {

        vmd.clearRef();

    }

    /**
     * Breaks the index partition into N splits, where N was specified to the
     * ctor, and redistributes those splits onto the move targets using a
     * round robin.
     * 
     * @return A {@link SplitResult} if the index partition was split into 2
     *         or more index partitions -or- a {@link BuildResult} iff the
     *         index partition was not split.
     */
    @Override
    protected AbstractResult doTask() throws Exception {

        final Event e = new Event(resourceManager.getFederation(),
                new EventResource(vmd.indexMetadata),
                OverflowActionEnum.ScatterSplit, vmd.getParams()).addDetail(
                "summary",
                OverflowActionEnum.ScatterSplit + "+" + OverflowActionEnum.Move
                        + "(" + vmd.name + ", nsplits=" + nsplits + ")")
                .addDetail("moveTargets", Arrays.toString(moveTargets))
                .start();

        SplitResult splitResult = null;

        try {

            if (resourceManager.isOverflowAllowed())
                throw new IllegalStateException();

            try {

                final String name = vmd.name;

                // Note: fused view for the source index partition.
                final ILocalBTreeView src = vmd.getView();

                /*
                 * Get the split points for the index. Each split point
                 * describes a new index partition. Together the split points
                 * MUST exactly span the source index partition's key range.
                 * There MUST NOT be any overlap in the key ranges for the
                 * splits.
                 */

                // The application split handler (if any).
                final ISimpleSplitHandler splitHandler = vmd.indexMetadata
                        .getSplitHandler();

                final Split[] splits = SplitUtility.getSplits(resourceManager,
                        vmd.pmd, (IndexSegment) src.getSources()[1],
                        adjustedNominalShardSize, splitHandler);

                if (splits == null) {
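                    /*
                     * Worked example (illustration only, the values are made
                     * up): if the compact segment is 500MB on disk and the
                     * nominal shard size is 200MB, then overextension =
                     * 500/200 = 2.5. Assuming an overextension limit of 2
                     * (per the comment below), writes on the shard would be
                     * disabled.
                     */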
                    final double overextension = ((double) vmd.sumSegBytes)
                            / resourceManager.nominalShardSize;

                    if (overextension > resourceManager.shardOverextensionLimit
                            && !resourceManager.isDisabledWrites(vmd.name)) {

                        /*
                         * The shard is overextended (it is at least two times
                         * its nominal maximum size) and is refusing a split.
                         * Continuing to do incremental builds here will mask
                         * the problem and cause the cost of a merge on the
                         * shard to increase over time and will drag down
                         * performance for this DS. In order to prevent this
                         * we MUST disallow further writes on the shard. The
                         * shard can be re-enabled for writes by an
                         * administrative action once the problem has been
                         * fixed.
                         * 
                         * Note: The default split behavior should always find
                         * a separator key to split the shard. The most likely
                         * cause for a problem is an application defined split
                         * handler. Rather than allowing a poorly written
                         * split handler to foul up the works, we disallow
                         * further writes onto this shard until the
                         * application has fixed their split handler.
                         */

                        log.error("Shard will not split - writes are disabled"
                                + ": name=" + vmd.name
                                + ", size=" + vmd.sumSegBytes
                                + ", overextended=" + (int) overextension + "x"
                                + ", splitHandler="
                                + (splitHandler == null ? "N/A" : splitHandler
                                        .getClass().getName()));

                        // Disable writes on the index partition.
                        resourceManager.disableWrites(vmd.name);

                    }

                    /*
                     * Do an incremental build.
                     */

                    log.warn("No splits identified: will build: " + vmd);

                    // Incremental build.
                    return concurrencyManager.submit(
                            new IncrementalBuildTask(vmd)).get();

                }

                // The #of splits.
                final int nsplits = splits.length;

                if (INFO)
                    log.info("Will build index segments for " + nsplits
                            + " splits for " + name + " : "
                            + Arrays.toString(splits));

                // validate the splits before processing them.
                SplitUtility.validateSplits(src, splits);

                splitResult = SplitUtility.buildSplits(vmd, splits, e);

            } finally {

                /*
                 * We are done building index segments from the source index
                 * partition view so we clear our references for that view.
                 */

                clearRefs();

            }

            /*
             * Do the atomic update.
             */
            SplitIndexPartitionTask
                    .doSplitAtomicUpdate(
                            resourceManager,
                            vmd,
                            splitResult,
                            OverflowActionEnum.ScatterSplit,
                            resourceManager.overflowCounters.indexPartitionSplitCounter,
                            e);

            /*
             * Note: Unlike a normal move where there are writes on the old
             * journal, all the historical data for each of the index
             * partitions is in an index segment that we just built (new
             * writes MAY be buffered on the live journal, so we still have to
             * deal with that). Therefore we use a different entry point into
             * the MOVE operation.
             * 
             * Note: It is allowable for one of the move targets to be this
             * data service, in which case we simply leave the corresponding
             * index partition in place.
             */

            final int nsplits = splitResult.buildResults.length;

            final List<MoveTask.AtomicUpdate> moveTasks = new ArrayList<MoveTask.AtomicUpdate>(
                    nsplits);

            // create the move tasks.
            {

                for (int i = 0; i < nsplits; i++) {

                    // choose the move target using a round robin.
                    final UUID moveTarget = moveTargets[i % moveTargets.length];

                    if (resourceManager.getDataServiceUUID().equals(moveTarget)) {

                        // ignore move to self.
                        if (INFO)
                            log.info("Ignoring move to self.");

                        continue;

                    }

                    /*
                     * Obtain a new partition identifier for the partition
                     * that will be created when we move the index partition
                     * to the target data service.
                     */
                    final int newPartitionId = resourceManager
                            .nextPartitionId(vmd.indexMetadata.getName());

                    /*
                     * The name of the post-split index partition that is the
                     * source for the move operation.
                     */
                    final String nameOfPartitionToMove = DataService
                            .getIndexPartitionName(vmd.indexMetadata.getName(),
                                    splitResult.splits[i].pmd.getPartitionId());
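                    /*
                     * For illustration (the exact format is determined by
                     * DataService.getIndexPartitionName()): the generated
                     * name combines the scale-out index name and the
                     * partition identifier, e.g. something like "myIndex#12"
                     * for a hypothetical index "myIndex" and partition 12.
                     */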
                    /*
                     * Create a move task.
                     * 
                     * Note: We do not explicitly delete the source index
                     * segment for the source index partition after the move.
                     * It will be required for historical views of that index
                     * partition in case any client gained access to the index
                     * partition after the split and before the move. It will
                     * eventually be released once the view of the source
                     * index partition becomes sufficiently aged that it falls
                     * off the head of the database history.
                     */
                    moveTasks.add(new MoveTask.AtomicUpdate(resourceManager,
                            nameOfPartitionToMove,
                            splitResult.buildResults[i], moveTarget,
                            newPartitionId, e));

                }

            }

            /*
             * Submit the move tasks to be executed in parallel and await
             * their outcomes.
             */
            final List<Future<MoveResult>> futures = resourceManager
                    .getConcurrencyManager().invokeAll(moveTasks);

            /*
             * Log an error if any move task failed (other than being
             * canceled).
             */
            for (Future<?> f : futures) {

                if (!f.isCancelled()) {

                    try {
                        f.get();
                    } catch (ExecutionException ex) {
                        // log and continue.
                        log.error(ex, ex);
                    }

                }

            }

            // Done.
            return splitResult;

        } finally {

            if (splitResult != null) {

                for (BuildResult buildResult : splitResult.buildResults) {

                    if (buildResult != null) {

                        /*
                         * At this point the index segment was either
                         * incorporated into the new view in a restart safe
                         * manner or there was an error. Either way, we now
                         * remove the index segment store's UUID from the
                         * retentionSet so it will be subject to the release
                         * policy of the StoreManager.
                         */
                        resourceManager
                                .retentionSetRemove(buildResult.segmentMetadata
                                        .getUUID());

                    }

                }

            }

            e.end();

        }

    }

}