package mil.nga.giat.geowave.mapreduce.splits;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import mil.nga.giat.geowave.core.index.ByteArrayId;
import mil.nga.giat.geowave.core.store.adapter.statistics.RowRangeHistogramStatistics;
import mil.nga.giat.geowave.core.store.adapter.statistics.histogram.ByteUtils;
import mil.nga.giat.geowave.core.store.index.PrimaryIndex;

/**
 * A mutable, in-progress input split: a set of {@link RangeLocationPair}s
 * grouped by the {@link PrimaryIndex} they belong to.
 *
 * Instances can be merged together ({@link #merge}), broken in two by
 * cardinality ({@link #split}) and finally converted into a
 * {@link GeoWaveInputSplit} ({@link #toFinalSplit}). Instances order
 * themselves primarily by the total cardinality of the ranges they hold (see
 * {@link #compareTo}).
 */
public class IntermediateSplitInfo implements
        Comparable<IntermediateSplitInfo>
{
    private static final Logger LOGGER = LoggerFactory.getLogger(IntermediateSplitInfo.class);

    /**
     * A single range/location pair together with the index it was created
     * for. The pair is replaced in place when the range is split.
     */
    protected class IndexRangeLocation
    {
        private RangeLocationPair rangeLocationPair;
        private final PrimaryIndex index;

        public IndexRangeLocation(
                final RangeLocationPair rangeLocationPair,
                final PrimaryIndex index ) {
            this.rangeLocationPair = rangeLocationPair;
            this.index = index;
        }

        /**
         * Splits this range in two at a key estimated from the histogram
         * statistics, so that the lower part carries roughly the cardinality
         * still needed to reach {@code targetCardinality}.
         *
         * Side effect: on success this object is narrowed to the upper part
         * (split key inclusive, up to the original end key) and the lower
         * part (original start key up to the split key, exclusive) is
         * returned. When the only usable split key equals a non-inclusive
         * end key, this object's pair is rebuilt with a recomputed
         * cardinality and no new part is produced.
         *
         * @param stats
         *            the row range histogram used to estimate the split key
         *            and the cardinality of each part; may be null
         * @param currentCardinality
         *            the cardinality accumulated before this range
         * @param targetCardinality
         *            the cardinality the accumulated ranges should reach
         * @return the lower part of the range, or null if {@code stats} is
         *         null, no midpoint could be determined, the range was only
         *         clamped, or a range could not be constructed
         */
        public IndexRangeLocation split(
                final RowRangeHistogramStatistics<?> stats,
                final double currentCardinality,
                final double targetCardinality ) {

            if (stats == null) {
                return null;
            }

            // the portion of this range's cardinality that is still needed to
            // reach the target
            final double thisCardinality = rangeLocationPair.getCardinality();
            final double fraction = (targetCardinality - currentCardinality) / thisCardinality;

            final byte[] start = rangeLocationPair.getRange().getStartKey();
            final byte[] end = rangeLocationPair.getRange().getEndKey();

            // walk 'fraction' of the way from cdf(start) to cdf(end) and map
            // that back to a value through the quantile function
            final double cdfStart = stats.cdf(start);
            final double cdfEnd = stats.cdf(end);
            final double expectedEndValue = stats.quantile(cdfStart + ((cdfEnd - cdfStart) * fraction));

            final int maxKeyLength = Math.max(
                    start.length,
                    end.length);

            byte[] bytes = ByteUtils.toBytes(expectedEndValue);
            byte[] splitKey;

            if ((bytes.length < 8) && (bytes.length < maxKeyLength)) {
                // prepend with 0
                bytes = expandBytes(
                        bytes,
                        Math.min(
                                8,
                                maxKeyLength));
            }
            if (bytes.length < maxKeyLength) {
                // pad with trailing zero bytes up to the longer key length
                splitKey = new byte[maxKeyLength];
                System.arraycopy(
                        bytes,
                        0,
                        splitKey,
                        0,
                        bytes.length);
            }
            else {
                splitKey = bytes;
            }

            final String location = rangeLocationPair.getLocation();
            final boolean startKeyInclusive = true;
            final boolean endKeyInclusive = false;

            // the estimated key must fall strictly between start and end;
            // otherwise fall back to the midpoint of the range
            if (new ByteArrayId(
                    start).compareTo(new ByteArrayId(
                    splitKey)) >= 0 || new ByteArrayId(
                    end).compareTo(new ByteArrayId(
                    splitKey)) <= 0) {
                splitKey = SplitsProvider.getMidpoint(rangeLocationPair.getRange());
                if (splitKey == null) {
                    return null;
                }

                // if you can split the range only by setting the split to the
                // end, but its not inclusive on the end, just clamp this to
                // the start and don't split producing a new pair
                if (Arrays.equals(
                        end,
                        splitKey) && !rangeLocationPair.getRange().isEndKeyInclusive()) {
                    rangeLocationPair = splitsProvider.constructRangeLocationPair(
                            splitsProvider.constructRange(
                                    rangeLocationPair.getRange().getStartKey(),
                                    rangeLocationPair.getRange().isStartKeyInclusive(),
                                    splitKey,
                                    endKeyInclusive),
                            location,
                            stats.cardinality(
                                    rangeLocationPair.getRange().getStartKey(),
                                    splitKey));
                    return null;
                }
            }

            try {
                // lower part: original start up to the split key (exclusive)
                final RangeLocationPair newPair = splitsProvider.constructRangeLocationPair(
                        splitsProvider.constructRange(
                                rangeLocationPair.getRange().getStartKey(),
                                rangeLocationPair.getRange().isStartKeyInclusive(),
                                splitKey,
                                endKeyInclusive),
                        location,
                        stats.cardinality(
                                rangeLocationPair.getRange().getStartKey(),
                                splitKey));

                // upper part: split key (inclusive) up to the original end;
                // this replaces the pair held by this object
                rangeLocationPair = splitsProvider.constructRangeLocationPair(
                        splitsProvider.constructRange(
                                splitKey,
                                startKeyInclusive,
                                rangeLocationPair.getRange().getEndKey(),
                                rangeLocationPair.getRange().isEndKeyInclusive()),
                        location,
                        stats.cardinality(
                                splitKey,
                                rangeLocationPair.getRange().getEndKey()));

                return new IndexRangeLocation(
                        newPair,
                        index);
            }
            catch (final java.lang.IllegalArgumentException ex) {
                LOGGER.info(
                        "Unable to split range",
                        ex);
                return null;
            }
        }

        /**
         * Returns a copy of {@code valueBytes} that is exactly
         * {@code numBytes} long: left-padded with zero bytes when the input
         * is shorter, otherwise truncated to its first {@code numBytes}
         * bytes.
         *
         * @param valueBytes
         *            the bytes to resize
         * @param numBytes
         *            the length of the returned array
         * @return a new array of length {@code numBytes}
         */
        private byte[] expandBytes(
                final byte valueBytes[],
                final int numBytes ) {
            // a new array is already zero-filled, so only the value bytes
            // need to be copied
            final byte[] bytes = new byte[numBytes];
            if (numBytes > valueBytes.length) {
                System.arraycopy(
                        valueBytes,
                        0,
                        bytes,
                        numBytes - valueBytes.length,
                        valueBytes.length);
            }
            else {
                System.arraycopy(
                        valueBytes,
                        0,
                        bytes,
                        0,
                        numBytes);
            }
            return bytes;
        }
    }

    private final Map<PrimaryIndex, List<RangeLocationPair>> splitInfo;
    private final SplitsProvider splitsProvider;

    /**
     * @param splitInfo
     *            the ranges of this split keyed by index; the map is used
     *            directly (not copied) and is modified by {@link #merge} and
     *            {@link #split}
     * @param splitsProvider
     *            used to construct ranges, range/location pairs and the
     *            final input split
     */
    public IntermediateSplitInfo(
            final Map<PrimaryIndex, List<RangeLocationPair>> splitInfo,
            final SplitsProvider splitsProvider ) {
        this.splitInfo = splitInfo;
        this.splitsProvider = splitsProvider;
    }

    /**
     * Adds every range of {@code split} to this split, index by index.
     *
     * @param split
     *            the split whose ranges are appended to this one; it is not
     *            modified
     */
    synchronized void merge(
            final IntermediateSplitInfo split ) {
        for (final Entry<PrimaryIndex, List<RangeLocationPair>> e : split.splitInfo.entrySet()) {
            List<RangeLocationPair> thisList = splitInfo.get(e.getKey());
            if (thisList == null) {
                thisList = new ArrayList<RangeLocationPair>();
                splitInfo.put(
                        e.getKey(),
                        thisList);
            }
            thisList.addAll(e.getValue());
        }
    }

    /**
     * Side effect: Break up this split.
     *
     * Split the ranges into two, aiming for each half to hold half of the
     * total cardinality. Ranges are visited in ascending order of
     * cardinality and moved to the new split until the target is reached;
     * the range that crosses the target is itself split when statistics
     * allow it. Everything not moved stays in this split.
     *
     * @param statsCache
     *            histogram statistics per index, used to split the range
     *            that crosses the target cardinality
     * @return the new split, or null if this split holds no ranges or
     *         nothing could be moved out of it
     */
    synchronized IntermediateSplitInfo split(
            final Map<PrimaryIndex, RowRangeHistogramStatistics<?>> statsCache ) {
        // generically you'd want the split to be as limiting to total
        // locations as possible and then as limiting as possible to total
        // indices, but in this case split() is only called when all ranges
        // are in the same location and the same index

        final TreeSet<IndexRangeLocation> orderedSplits = new TreeSet<IndexRangeLocation>(
                new Comparator<IndexRangeLocation>() {

                    // Never returns 0: a TreeSet treats a 0 result as a
                    // duplicate and would drop ranges of equal cardinality.
                    @Override
                    public int compare(
                            final IndexRangeLocation o1,
                            final IndexRangeLocation o2 ) {
                        return (o1.rangeLocationPair.getCardinality() - o2.rangeLocationPair.getCardinality()) < 0 ? -1
                                : 1;
                    }
                });
        for (final Entry<PrimaryIndex, List<RangeLocationPair>> ranges : splitInfo.entrySet()) {
            for (final RangeLocationPair p : ranges.getValue()) {
                orderedSplits.add(new IndexRangeLocation(
                        p,
                        ranges.getKey()));
            }
        }

        // Nothing to split: pollFirst() below would return null and be
        // dereferenced. Leave this split untouched and report no new split.
        if (orderedSplits.isEmpty()) {
            return null;
        }

        final double targetCardinality = getTotalRangeAtCardinality() / 2;
        double currentCardinality = 0.0;
        final Map<PrimaryIndex, List<RangeLocationPair>> otherSplitInfo = new HashMap<PrimaryIndex, List<RangeLocationPair>>();

        // every range is re-added below, either to this split or to the
        // other one
        splitInfo.clear();

        do {
            final IndexRangeLocation next = orderedSplits.pollFirst();
            double nextCardinality = currentCardinality + next.rangeLocationPair.getCardinality();
            if (nextCardinality > targetCardinality) {
                final IndexRangeLocation newSplit = next.split(
                        statsCache.get(next.index),
                        currentCardinality,
                        targetCardinality);
                double splitCardinality = next.rangeLocationPair.getCardinality();
                // Stats can have inaccuracies over narrow ranges
                // thus, a split based on statistics may not be found
                if (newSplit != null) {
                    splitCardinality += newSplit.rangeLocationPair.getCardinality();
                    addPairForIndex(
                            otherSplitInfo,
                            newSplit.rangeLocationPair,
                            newSplit.index);
                    addPairForIndex(
                            splitInfo,
                            next.rangeLocationPair,
                            next.index);
                }
                else {
                    // Still add to the other SPLIT if there is remaining
                    // pairs in this SPLIT
                    addPairForIndex(
                            (!orderedSplits.isEmpty()) ? otherSplitInfo : splitInfo,
                            next.rangeLocationPair,
                            next.index);
                }
                nextCardinality = currentCardinality + splitCardinality;
                if (nextCardinality > targetCardinality) {
                    break;
                }
                currentCardinality = nextCardinality;
            }
            else {
                addPairForIndex(
                        otherSplitInfo,
                        next.rangeLocationPair,
                        next.index);
                currentCardinality = nextCardinality;
            }
        }
        while (!orderedSplits.isEmpty());

        // What is left of the ranges
        // that haven't been placed in the other split info
        for (final IndexRangeLocation split : orderedSplits) {
            addPairForIndex(
                    splitInfo,
                    split.rangeLocationPair,
                    split.index);
        }

        // All ranges consumed by the other split
        if (splitInfo.size() == 0) {
            // First try to move a index set of ranges back.
            if (otherSplitInfo.size() > 1) {
                final Iterator<Entry<PrimaryIndex, List<RangeLocationPair>>> it = otherSplitInfo.entrySet().iterator();
                final Entry<PrimaryIndex, List<RangeLocationPair>> entry = it.next();
                it.remove();
                splitInfo.put(
                        entry.getKey(),
                        entry.getValue());
            }
            else {
                // only one index: keep everything here and produce no new
                // split
                splitInfo.putAll(otherSplitInfo);
                otherSplitInfo.clear();
            }
        }

        return otherSplitInfo.size() == 0 ? null : new IntermediateSplitInfo(
                otherSplitInfo,
                splitsProvider);
    }

    /**
     * Appends {@code pair} to the list stored for {@code index} in the given
     * map, creating the list if the index has none yet.
     *
     * @param otherSplitInfo
     *            the map to add to
     * @param pair
     *            the range/location pair to add
     * @param index
     *            the index the pair belongs to
     */
    private void addPairForIndex(
            final Map<PrimaryIndex, List<RangeLocationPair>> otherSplitInfo,
            final RangeLocationPair pair,
            final PrimaryIndex index ) {
        List<RangeLocationPair> list = otherSplitInfo.get(index);
        if (list == null) {
            list = new ArrayList<RangeLocationPair>();
            otherSplitInfo.put(
                    index,
                    list);
        }
        list.add(pair);
    }

    /**
     * Builds the final input split from the ranges currently held, passing
     * along the distinct locations of all of those ranges.
     *
     * @return the input split constructed by the splits provider
     */
    public synchronized GeoWaveInputSplit toFinalSplit() {
        final Set<String> locations = new HashSet<String>();
        for (final Entry<PrimaryIndex, List<RangeLocationPair>> entry : splitInfo.entrySet()) {
            for (final RangeLocationPair pair : entry.getValue()) {
                locations.add(pair.getLocation());
            }
        }
        return splitsProvider.constructInputSplit(
                splitInfo,
                locations.toArray(new String[locations.size()]));
    }

    /**
     * Orders splits by, in turn: total cardinality, number of indices,
     * number of ranges, summed range length and finally hash code.
     */
    @Override
    public int compareTo(
            final IntermediateSplitInfo o ) {
        final double thisTotal = getTotalRangeAtCardinality();
        final double otherTotal = o.getTotalRangeAtCardinality();
        int result = Double.compare(
                thisTotal,
                otherTotal);
        if (result != 0) {
            return result;
        }

        result = Integer.compare(
                splitInfo.size(),
                o.splitInfo.size());
        if (result != 0) {
            return result;
        }

        final List<RangeLocationPair> pairs = new ArrayList<>();
        final List<RangeLocationPair> otherPairs = new ArrayList<>();
        for (final List<RangeLocationPair> p : splitInfo.values()) {
            pairs.addAll(p);
        }
        for (final List<RangeLocationPair> p : o.splitInfo.values()) {
            otherPairs.addAll(p);
        }

        result = Integer.compare(
                pairs.size(),
                otherPairs.size());
        if (result != 0) {
            return result;
        }

        double rangeSum = 0;
        double otherSum = 0;
        for (final RangeLocationPair p : pairs) {
            rangeSum += SplitsProvider.getRangeLength(p.getRange());
        }
        for (final RangeLocationPair p : otherPairs) {
            otherSum += SplitsProvider.getRangeLength(p.getRange());
        }

        result = Double.compare(
                rangeSum,
                otherSum);
        if (result != 0) {
            return result;
        }

        // last resort tie-breaker
        return Integer.compare(
                hashCode(),
                o.hashCode());
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = (prime * result) + ((splitInfo == null) ? 0 : splitInfo.hashCode());
        result = (prime * result) + ((splitsProvider == null) ? 0 : splitsProvider.hashCode());
        return result;
    }

    @Override
    public boolean equals(
            final Object obj ) {
        if (this == obj) {
            return true;
        }
        if (obj == null) {
            return false;
        }
        if (getClass() != obj.getClass()) {
            return false;
        }
        final IntermediateSplitInfo other = (IntermediateSplitInfo) obj;
        if (splitInfo == null) {
            if (other.splitInfo != null) {
                return false;
            }
        }
        else if (!splitInfo.equals(other.splitInfo)) {
            return false;
        }
        if (splitsProvider == null) {
            if (other.splitsProvider != null) {
                return false;
            }
        }
        else if (!splitsProvider.equals(other.splitsProvider)) {
            return false;
        }
        return true;
    }

    /**
     * @return the sum of the cardinalities of every range in this split
     */
    private synchronized double getTotalRangeAtCardinality() {
        double sum = 0.0;
        for (final List<RangeLocationPair> pairList : splitInfo.values()) {
            for (final RangeLocationPair pair : pairList) {
                sum += pair.getCardinality();
            }
        }
        return sum;
    }
}