package org.apache.cassandra.hadoop2.multiquery; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.Set; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import org.apache.hadoop.conf.Configuration; /** * Combines subsplits into InputSplits. * * This class attempts to combine subplits such that: * <ul> * <li>The final set of InputSplits matches the user's requested number of InputSplits</li> * <li>As many subsplits as possible share a common replica node</li> * </ul> */ class SubsplitCombiner { private final Configuration conf; /** * Constructor. * @param conf The Hadoop configuration with information about the Cassandra cluster. */ public SubsplitCombiner(Configuration conf) { this.conf = conf; } /** * Combine subsplits into InputSplits, attempting to group together subsplits that share replica * nodes. * @param subsplits A collection of subsplits to combine. * @return A list of InputSplits. */ public List<MultiQueryInputSplit> combineSubsplits(Collection<Subsplit> subsplits) { // Estimate the number of subsplits per input split. final int numSubsplits = subsplits.size(); final int numSubsplitsPerSplit = numSubsplits / ConfigHelper.getDefaultInputTargetNumSplits(conf); // Group subsplits by host and try to combine subsplits that share a host. List<Subsplit> subsplitsSortedByHost = getSubsplitsSortedByHost(subsplits); List<MultiQueryInputSplit> inputSplits = Lists.newArrayList(); int subsplitIndex = 0; while (subsplitIndex < numSubsplits) { // Start a new InputSplit. Set<Subsplit> subsplitsToCombine = Sets.newHashSet(); // Go until we get to our target number of subsplits / input split. while (true) { // No more data => can't add to this InputSplit anymore. if (subsplitIndex >= numSubsplits) { break; } // Add this subsplit to the current working input split. Subsplit subsplitToAdd = subsplitsSortedByHost.get(subsplitIndex); subsplitsToCombine.add(subsplitToAdd); subsplitIndex++; // If we have reached our size goal, then finish this input split. if (subsplitsToCombine.size() == numSubsplitsPerSplit) { break; } } assert(subsplitsToCombine.size() > 0); // Now create the input split. MultiQueryInputSplit inputSplit = MultiQueryInputSplit.createFromSubplits(subsplitsToCombine); inputSplits.add(inputSplit); } return inputSplits; } /** * Sort subsplits by host. * * @param unsortedSubsplits An unsorted collection of subsplits. * @return A list of the subsplits, sorted by host. */ private List<Subsplit> getSubsplitsSortedByHost(Collection<Subsplit> unsortedSubsplits) { List<Subsplit> subsplitsSortedByHost = Lists.newArrayList(unsortedSubsplits); Collections.sort( subsplitsSortedByHost, new Comparator<Subsplit>() { public int compare(Subsplit firstSubsplit, Subsplit secondSubsplit) { String firstHostList = firstSubsplit.getSortedHostListAsString(); String secondHostList = secondSubsplit.getSortedHostListAsString(); return firstHostList.compareTo(secondHostList); } } ); return subsplitsSortedByHost; } }