package picard.util; import htsjdk.samtools.util.Interval; import htsjdk.samtools.util.IntervalList; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.List; /** * @author mccowan */ public class IntervalListScatterer { public enum Mode { /** * A simple scatter approach in which all output intervals have size equal to the total base count of the source list divide by the * scatter count (except for possible variance in the final interval list). */ INTERVAL_SUBDIVISION, /** * A scatter approach that differs from {@link Mode#INTERVAL_SUBDIVISION} in a few ways. * <ol> * <li>No interval will be subdivided, and consequently, the requested scatter count is an upper bound of scatter count, not a * guarantee as to how many {@link IntervalList}s will be produced (e.g., if scatterCount = 10 but there is only one input interval, * only 1 interval list will be emitted).</li> * <li>When an interval would otherwise be split, it is instead deferred to the next scatter list.</li> * <li>The "target width" of each scatter list may be wider than what is computed for {@link Mode#INTERVAL_SUBDIVISION}. * Specifically, if the widest interval in the source interval list is larger than what would otherwise be the target width, that * interval's width is used.<br/><br/>The reasoning for this is that this approach produces more consistently-sized interval lists, * which is one of the objectives of scattering.</li> * </ol> */ BALANCING_WITHOUT_INTERVAL_SUBDIVISION } private final Mode mode; public IntervalListScatterer(final Mode mode) {this.mode = mode;} private int deduceIdealSplitLength(final IntervalList uniquedList, final int scatterCount) { final int splitWidth = Math.max((int) Math.floor(uniquedList.getBaseCount() / (1.0 * scatterCount)), 1); switch (mode) { case INTERVAL_SUBDIVISION: return splitWidth; case BALANCING_WITHOUT_INTERVAL_SUBDIVISION: final int widestIntervalLength = Collections.max(uniquedList.getIntervals(), new Comparator<Interval>() { @Override public int compare(final Interval o1, final Interval o2) { return Integer.valueOf(o1.length()).compareTo(o2.length()); } }).length(); // There is no purpose to splitting more granularly than the widest interval, so do not. return Math.max(widestIntervalLength, splitWidth); default: throw new IllegalStateException(); } } public List<IntervalList> scatter(final IntervalList sourceIntervalList, final int scatterCount) { if (scatterCount < 1) throw new IllegalArgumentException("scatterCount < 1"); final IntervalList uniquedList = sourceIntervalList.uniqued(); final long idealSplitLength = deduceIdealSplitLength(uniquedList, scatterCount); final List<IntervalList> accumulatedIntervalLists = new ArrayList<IntervalList>(); IntervalList runningIntervalList = new IntervalList(uniquedList.getHeader()); final ArrayDeque<Interval> intervalQueue = new ArrayDeque<Interval>(uniquedList.getIntervals()); while (!intervalQueue.isEmpty() && accumulatedIntervalLists.size() < scatterCount - 1) { final Interval interval = intervalQueue.pollFirst(); final long projectedSize = runningIntervalList.getBaseCount() + interval.length(); if (projectedSize <= idealSplitLength) { runningIntervalList.add(interval); } else { final Interval intervalToAdd; switch (mode) { case INTERVAL_SUBDIVISION: final int amountToConsume = (int) (idealSplitLength - runningIntervalList.getBaseCount()); final Interval left = new Interval( interval.getSequence(), interval.getStart(), interval.getStart() + amountToConsume - 1, interval.isNegativeStrand(), interval.getName() ); final Interval right = new Interval( interval.getSequence(), interval.getStart() + amountToConsume, interval.getEnd(), interval.isNegativeStrand(), interval.getName() ); runningIntervalList.add(left); // Push back the excess back onto our queue for reconsideration. intervalQueue.addFirst(right); break; case BALANCING_WITHOUT_INTERVAL_SUBDIVISION: if (runningIntervalList.getIntervals().isEmpty()) { runningIntervalList.add(interval); } else { // Push this interval into the next scatter; re-inject it into the queue, then advance the scatter. intervalQueue.addFirst(interval); accumulatedIntervalLists.add(runningIntervalList); runningIntervalList = new IntervalList(uniquedList.getHeader()); } break; } } if (runningIntervalList.getBaseCount() >= idealSplitLength) { accumulatedIntervalLists.add(runningIntervalList); runningIntervalList = new IntervalList(uniquedList.getHeader()); } } // Flush the remaining intervals into the last split. while (!intervalQueue.isEmpty()) { runningIntervalList.add(intervalQueue.pollFirst()); } if (!runningIntervalList.getIntervals().isEmpty()) { accumulatedIntervalLists.add(runningIntervalList); } return accumulatedIntervalLists; } }