/** * (c) Copyright 2014 WibiData, Inc. * * See the NOTICE file distributed with this work for additional * information regarding copyright ownership. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.kiji.mapreduce.framework; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.Set; import com.google.common.collect.Lists; import com.google.common.collect.Sets; /** * Combines subsplits into InputSplits. * * This class attempts to combine subplits such that: * <ul> * <li>The final set of InputSplits matches the user's requested number of InputSplits</li> * <li>As many subsplits as possible share a common replica node</li> * </ul> */ class CassandraSubSplitCombiner { /** * Combine subsplits into InputSplits, attempting to group together subsplits that share replica * nodes. * @param subsplits A collection of subsplits to combine. * @param targetNumSplits Target number of input splits to have after combining subsplits. * @return A list of InputSplits. */ public List<CassandraInputSplit> combineSubsplits( Collection<CassandraSubSplit> subsplits, int targetNumSplits) { // Estimate the number of subsplits per input split. final int numSubsplits = subsplits.size(); final int numSubsplitsPerSplit = numSubsplits / targetNumSplits; // Group subsplits by host and try to combine subsplits that share a host. List<CassandraSubSplit> subsplitsSortedByHost = getSubsplitsSortedByHost(subsplits); List<CassandraInputSplit> inputSplits = Lists.newArrayList(); int subsplitIndex = 0; while (subsplitIndex < numSubsplits) { // Start a new InputSplit. Set<CassandraSubSplit> subsplitsToCombine = Sets.newHashSet(); // Go until we get to our target number of subsplits / input split. while (true) { // No more data => can't add to this InputSplit anymore. if (subsplitIndex >= numSubsplits) { break; } // Add this subsplit to the current working input split. CassandraSubSplit subsplitToAdd = subsplitsSortedByHost.get(subsplitIndex); subsplitsToCombine.add(subsplitToAdd); subsplitIndex++; // If we have reached our size goal, then finish this input split. if (subsplitsToCombine.size() == numSubsplitsPerSplit) { break; } } assert(subsplitsToCombine.size() > 0); // Now create the input split. CassandraInputSplit inputSplit = CassandraInputSplit.createFromSubplits(subsplitsToCombine); inputSplits.add(inputSplit); } return inputSplits; } /** * Combine subsplits into InputSplits, attempting to group together subsplits that share replica * nodes. * * Will combine into four subsplits. * * @param subsplits A collection of subsplits to combine. * @return A list of InputSplits. */ public List<CassandraInputSplit> combineSubsplits(Collection<CassandraSubSplit> subsplits) { return combineSubsplits(subsplits, 4); } /** * Sort subsplits by host. * * @param unsortedSubsplits An unsorted collection of subsplits. * @return A list of the subsplits, sorted by host. */ private List<CassandraSubSplit> getSubsplitsSortedByHost( Collection<CassandraSubSplit> unsortedSubsplits) { List<CassandraSubSplit> subsplitsSortedByHost = Lists.newArrayList(unsortedSubsplits); Collections.sort( subsplitsSortedByHost, new Comparator<CassandraSubSplit>() { public int compare(CassandraSubSplit firstSubsplit, CassandraSubSplit secondSubsplit) { String firstHostList = firstSubsplit.getSortedHostListAsString(); String secondHostList = secondSubsplit.getSortedHostListAsString(); return firstHostList.compareTo(secondHostList); } } ); return subsplitsSortedByHost; } }