package mil.nga.giat.geowave.mapreduce.splits;

import java.io.IOException;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;

import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.mapreduce.InputSplit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import mil.nga.giat.geowave.core.store.DataStoreOperations;
import mil.nga.giat.geowave.core.store.adapter.AdapterIndexMappingStore;
import mil.nga.giat.geowave.core.store.adapter.AdapterStore;
import mil.nga.giat.geowave.core.store.adapter.DataAdapter;
import mil.nga.giat.geowave.core.store.adapter.statistics.DataStatisticsStore;
import mil.nga.giat.geowave.core.store.adapter.statistics.RowRangeDataStatistics;
import mil.nga.giat.geowave.core.store.adapter.statistics.RowRangeHistogramStatistics;
import mil.nga.giat.geowave.core.store.index.Index;
import mil.nga.giat.geowave.core.store.index.IndexStore;
import mil.nga.giat.geowave.core.store.index.PrimaryIndex;
import mil.nga.giat.geowave.core.store.query.DistributableQuery;
import mil.nga.giat.geowave.core.store.query.QueryOptions;

/**
 * Base class for computing MapReduce {@link InputSplit}s over a GeoWave data
 * store. Subclasses supply the datastore-specific pieces (how raw ranges are
 * discovered and represented); this class implements the datastore-agnostic
 * algorithm that grows or shrinks the candidate split set to fit the requested
 * min/max split counts, using row-range histogram statistics to estimate
 * cardinality.
 */
public abstract class SplitsProvider
{
	private final static Logger LOGGER = LoggerFactory.getLogger(SplitsProvider.class);

	private static final BigInteger TWO = BigInteger.valueOf(2);

	public SplitsProvider() {}

	/**
	 * Read the metadata table to get tablets and match up ranges to them.
	 *
	 * Collects intermediate splits per index/adapter pairing (via the abstract
	 * {@link #populateIntermediateSplits}), then incrementally splits the
	 * largest range in half until {@code minSplits} is met, or merges the two
	 * smallest ranges until the count is within {@code maxSplits}.
	 *
	 * @param minSplits
	 *            minimum number of splits desired, or null for no minimum
	 * @param maxSplits
	 *            maximum number of splits allowed, or null/non-positive for no
	 *            maximum
	 * @return the finalized input splits
	 */
	public List<InputSplit> getSplits(
			final DataStoreOperations operations,
			final DistributableQuery query,
			final QueryOptions queryOptions,
			final AdapterStore adapterStore,
			final DataStatisticsStore statsStore,
			final IndexStore indexStore,
			final AdapterIndexMappingStore adapterIndexMappingStore,
			final Integer minSplits,
			final Integer maxSplits )
			throws IOException,
			InterruptedException {
		final Map<PrimaryIndex, RowRangeHistogramStatistics<?>> statsCache = new HashMap<PrimaryIndex, RowRangeHistogramStatistics<?>>();
		final List<InputSplit> retVal = new ArrayList<InputSplit>();
		final TreeSet<IntermediateSplitInfo> splits = new TreeSet<IntermediateSplitInfo>();
		for (final Pair<PrimaryIndex, List<DataAdapter<Object>>> indexAdapterPair : queryOptions
				.getAdaptersWithMinimalSetOfIndices(
						adapterStore,
						adapterIndexMappingStore,
						indexStore)) {
			populateIntermediateSplits(
					splits,
					operations,
					indexAdapterPair.getLeft(),
					indexAdapterPair.getValue(),
					statsCache,
					adapterStore,
					statsStore,
					maxSplits,
					query,
					queryOptions.getAuthorizations());
		}
		// this is an incremental algorithm, it may be better use the target
		// split count to drive it (ie. to get 3 splits this will split 1
		// large
		// range into two down the middle and then split one of those ranges
		// down the middle to get 3, rather than splitting one range into
		// thirds)
		if (!statsCache.isEmpty() && !splits.isEmpty() && (minSplits != null) && (splits.size() < minSplits)) {
			// set the ranges to at least min splits
			do {
				// remove the highest range, split it into 2 and add both
				// back,
				// increasing the size by 1
				final IntermediateSplitInfo highestSplit = splits.pollLast();
				final IntermediateSplitInfo otherSplit = highestSplit.split(statsCache);
				splits.add(highestSplit);
				if (otherSplit == null) {
					// the largest remaining range can no longer be
					// subdivided; give up rather than loop forever
					LOGGER.warn("Cannot meet minimum splits");
					break;
				}
				splits.add(otherSplit);
			}
			while (splits.size() < minSplits);
		}
		else if (((maxSplits != null) && (maxSplits > 0)) && (splits.size() > maxSplits)) {
			// merge splits to fit within max splits
			do {
				// this is the naive approach, remove the lowest two ranges
				// and merge them, decreasing the size by 1

				// TODO Ideally merge takes into account locations (as well
				// as possibly the index as a secondary criteria) to limit
				// the number of locations/indices
				final IntermediateSplitInfo lowestSplit = splits.pollFirst();
				final IntermediateSplitInfo nextLowestSplit = splits.pollFirst();
				lowestSplit.merge(nextLowestSplit);
				splits.add(lowestSplit);
			}
			while (splits.size() > maxSplits);
		}
		for (final IntermediateSplitInfo split : splits) {
			retVal.add(split.toFinalSplit());
		}
		return retVal;
	}

	/**
	 * Datastore-specific discovery of candidate ranges for one index and its
	 * adapters; implementations add {@link IntermediateSplitInfo} entries to
	 * {@code splits} (and may populate {@code statsCache}).
	 */
	protected abstract TreeSet<IntermediateSplitInfo> populateIntermediateSplits(
			TreeSet<IntermediateSplitInfo> splits,
			DataStoreOperations operations,
			PrimaryIndex left,
			List<DataAdapter<Object>> value,
			Map<PrimaryIndex, RowRangeHistogramStatistics<?>> statsCache,
			AdapterStore adapterStore,
			DataStatisticsStore statsStore,
			Integer maxSplits,
			DistributableQuery query,
			String[] authorizations )
			throws IOException;

	/**
	 * Builds the widest row range covering the index's data, derived from its
	 * stored {@link RowRangeDataStatistics} min/max (widened by one on each
	 * side so the bounds are inclusive of the extremes). Falls back to the
	 * implementation's default (unclipped) range when no statistics exist.
	 */
	protected GeoWaveRowRange getRangeMax(
			final Index<?, ?> index,
			final AdapterStore adapterStore,
			final DataStatisticsStore statsStore,
			final String[] authorizations ) {

		final RowRangeDataStatistics<?> stats = (RowRangeDataStatistics<?>) statsStore.getDataStatistics(
				index.getId(),
				RowRangeDataStatistics.composeId(index.getId()),
				authorizations);
		if (stats == null) {
			LOGGER
					.warn("Could not determine range of data from 'RowRangeDataStatistics'. Range will not be clipped. This may result in some splits being empty.");
			return defaultConstructRange();
		}

		final byte[] min = stats.getMin();
		final byte[] max = stats.getMax();

		return constructRange(
				getKeyFromBigInteger(
						new BigInteger(
								min).subtract(BigInteger.ONE),
						min.length),
				true,
				getKeyFromBigInteger(
						new BigInteger(
								max).add(BigInteger.ONE),
						max.length),
				true);
	}

	protected abstract GeoWaveRowRange constructRange(
			byte[] startKey,
			boolean isStartKeyInclusive,
			byte[] endKey,
			boolean isEndKeyInclusive );

	protected abstract GeoWaveRowRange defaultConstructRange();

	protected abstract RangeLocationPair constructRangeLocationPair(
			GeoWaveRowRange range,
			String location,
			double cardinality );

	public abstract GeoWaveInputSplit constructInputSplit(
			Map<PrimaryIndex, List<RangeLocationPair>> splitInfo,
			String[] locations );

	/**
	 * Estimated number of rows in {@code range}: histogram cardinality when
	 * stats are available, otherwise the raw numeric width of the range.
	 */
	protected double getCardinality(
			final RowRangeHistogramStatistics<?> rangeStats,
			final GeoWaveRowRange range ) {
		return rangeStats == null ? getRangeLength(range) : rangeStats.cardinality(
				range.getStartKey(),
				range.getEndKey());
	}

	/**
	 * Returns the (possibly cached) merged row-range histogram for the given
	 * index across all of its adapters.
	 *
	 * @throws IOException
	 *             wrapping any failure while fetching/merging statistics
	 */
	protected RowRangeHistogramStatistics<?> getHistStats(
			final PrimaryIndex index,
			final List<DataAdapter<Object>> adapters,
			final AdapterStore adapterStore,
			final DataStatisticsStore statsStore,
			final Map<PrimaryIndex, RowRangeHistogramStatistics<?>> statsCache,
			final String[] authorizations )
			throws IOException {
		RowRangeHistogramStatistics<?> rangeStats = statsCache.get(index);

		if (rangeStats == null) {
			try {
				rangeStats = getRangeStats(
						index,
						adapters,
						adapterStore,
						statsStore,
						authorizations);
			}
			catch (final Exception e) {
				throw new IOException(
						e);
			}
		}
		if (rangeStats != null) {
			statsCache.put(
					index,
					rangeStats);
		}
		return rangeStats;
	}

	/**
	 * Converts {@code value} to a fixed-width, big-endian byte array of
	 * {@code numBytes} bytes.
	 *
	 * {@link BigInteger#toByteArray()} may yield fewer bytes than requested
	 * (small magnitudes — these are right-aligned and zero-padded) or more
	 * bytes (a leading sign/carry byte, or the two prefix bytes added by
	 * {@link #extractBytes}). In the longer case the most-significant surplus
	 * bytes are dropped and the least-significant {@code numBytes} are kept;
	 * the original code computed a destination offset of
	 * {@code Math.abs(numBytes - valueBytes.length)} here, which overflowed
	 * the destination array whenever the value was longer than requested
	 * (e.g. a carry out of {@code max.add(ONE)} in {@link #getRangeMax}).
	 */
	protected static byte[] getKeyFromBigInteger(
			final BigInteger value,
			final int numBytes ) {
		final byte[] valueBytes = value.toByteArray();
		final byte[] bytes = new byte[numBytes];
		if (valueBytes.length > numBytes) {
			// keep the least-significant numBytes; the surplus high bytes are
			// sign/prefix artifacts of the BigInteger representation
			System.arraycopy(
					valueBytes,
					valueBytes.length - numBytes,
					bytes,
					0,
					numBytes);
		}
		else {
			// right-align, implicitly zero-padding the high bytes
			System.arraycopy(
					valueBytes,
					0,
					bytes,
					numBytes - valueBytes.length,
					valueBytes.length);
		}
		return bytes;
	}

	/**
	 * Fetches the per-adapter row-range histograms for {@code index} and
	 * merges them into a single statistic; null if no adapter has one.
	 */
	private RowRangeHistogramStatistics<?> getRangeStats(
			final PrimaryIndex index,
			final List<DataAdapter<Object>> adapters,
			final AdapterStore adapterStore,
			final DataStatisticsStore store,
			final String[] authorizations ) {
		RowRangeHistogramStatistics<?> singleStats = null;
		for (final DataAdapter<?> adapter : adapters) {
			final RowRangeHistogramStatistics<?> rowStat = (RowRangeHistogramStatistics<?>) store.getDataStatistics(
					adapter.getAdapterId(),
					RowRangeHistogramStatistics.composeId(index.getId()),
					authorizations);
			if (singleStats == null) {
				singleStats = rowStat;
			}
			else {
				// NOTE(review): assumes merge tolerates a null argument when an
				// adapter has no stats — confirm against Mergeable.merge impl
				singleStats.merge(rowStat);
			}
		}

		return singleStats;
	}

	/**
	 * Numeric width of {@code range} at the given key depth (end - start).
	 */
	protected static BigInteger getRange(
			final GeoWaveRowRange range,
			final int cardinality ) {
		return getEnd(
				range,
				cardinality).subtract(
				getStart(
						range,
						cardinality));
	}

	/**
	 * Start of {@code range} as a BigInteger at the given key depth; an
	 * infinite/absent start key maps to the all-zero (minimum) key.
	 */
	protected static BigInteger getStart(
			final GeoWaveRowRange range,
			final int cardinality ) {
		final byte[] start = range.getStartKey();
		byte[] startBytes;
		if (!range.isInfiniteStartKey() && (start != null)) {
			startBytes = extractBytes(
					start,
					cardinality);
		}
		else {
			startBytes = extractBytes(
					new byte[] {},
					cardinality);
		}
		return new BigInteger(
				startBytes);
	}

	/**
	 * End of {@code range} as a BigInteger at the given key depth; an
	 * infinite/absent end key maps to the all-0xFF (maximum) key.
	 */
	protected static BigInteger getEnd(
			final GeoWaveRowRange range,
			final int cardinality ) {
		final byte[] end = range.getEndKey();
		byte[] endBytes;
		if (!range.isInfiniteStopKey() && (end != null)) {
			endBytes = extractBytes(
					end,
					cardinality);
		}
		else {
			endBytes = extractBytes(
					new byte[] {},
					cardinality,
					true);
		}

		return new BigInteger(
				endBytes);
	}

	/**
	 * Numeric width of {@code range} as a double, using the longer of the two
	 * keys as the depth; returns 1 when either bound is unknown.
	 */
	protected static double getRangeLength(
			final GeoWaveRowRange range ) {
		if ((range.getStartKey() == null) || (range.getEndKey() == null)) {
			return 1;
		}
		final byte[] start = range.getStartKey();
		final byte[] end = range.getEndKey();

		final int maxDepth = Math.max(
				end.length,
				start.length);
		final BigInteger startBI = new BigInteger(
				extractBytes(
						start,
						maxDepth));
		final BigInteger endBI = new BigInteger(
				extractBytes(
						end,
						maxDepth));
		return endBI.subtract(
				startBI).doubleValue();
	}

	/**
	 * Key at the numeric midpoint of {@code range}, used to bisect a range
	 * when growing the split count. Returns null when the range is unbounded
	 * or degenerate (equal endpoints), and the end key when the range is too
	 * narrow (width 0 or 1) to split.
	 */
	protected static byte[] getMidpoint(
			final GeoWaveRowRange range ) {
		if ((range.getStartKey() == null) || (range.getEndKey() == null)) {
			return null;
		}

		final byte[] start = range.getStartKey();
		final byte[] end = range.getEndKey();
		if (Arrays.equals(
				start,
				end)) {
			return null;
		}
		final int maxDepth = Math.max(
				end.length,
				start.length);
		final BigInteger startBI = new BigInteger(
				extractBytes(
						start,
						maxDepth));
		final BigInteger endBI = new BigInteger(
				extractBytes(
						end,
						maxDepth));
		final BigInteger rangeBI = endBI.subtract(startBI);
		if (rangeBI.equals(BigInteger.ZERO) || rangeBI.equals(BigInteger.ONE)) {
			return end;
		}
		final byte[] valueBytes = rangeBI.divide(
				TWO).add(
				startBI).toByteArray();
		// strip the two framing bytes prepended by extractBytes(); both
		// operands carry the same 0x01 0x00 prefix, so the midpoint's
		// toByteArray() is exactly maxDepth + 2 bytes long
		final byte[] bytes = new byte[valueBytes.length - 2];
		System.arraycopy(
				valueBytes,
				2,
				bytes,
				0,
				bytes.length);
		return bytes;
	}

	public static byte[] extractBytes(
			final byte[] seq,
			final int numBytes ) {
		return extractBytes(
				seq,
				numBytes,
				false);
	}

	/**
	 * Copies {@code seq} into a buffer of {@code numBytes + 2} bytes whose
	 * first two bytes are 0x01 0x00 — a constant positive-sign prefix so the
	 * resulting BigInteger is always positive and all keys compare at a
	 * uniform magnitude. Missing trailing bytes are padded with 0x00, or with
	 * 0xFF when {@code infiniteEndKey} is set (maximum-key semantics).
	 */
	protected static byte[] extractBytes(
			final byte[] seq,
			final int numBytes,
			final boolean infiniteEndKey ) {
		final byte[] bytes = new byte[numBytes + 2];
		bytes[0] = 1;
		bytes[1] = 0;
		for (int i = 0; i < numBytes; i++) {
			if (i >= seq.length) {
				if (infiniteEndKey) {
					// -1 is 0xff
					bytes[i + 2] = -1;
				}
				else {
					bytes[i + 2] = 0;
				}
			}
			else {
				bytes[i + 2] = seq[i];
			}
		}
		return bytes;
	}
}