/* * Copyright © 2014-2015 Cask Data, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package co.cask.cdap.data2.dataset2.lib.table; import co.cask.cdap.api.common.Bytes; import com.google.common.collect.Lists; import java.util.Collections; import java.util.List; /** * Provides handy methods for simple table splits calculation. * <p> * This is copied from old data-fabric, could be improved. * <p> * NOTE: there's also seem to be a bug: first split should have open start and last one open end... TODO: fix it */ public class SplitsUtil { // Simplest stateless getSplits method implementation (doesn't use the actual stored data) /** * If the number of splits is not given, and we have no hints from the table structure (that can be implemented in * overriding implementations, though), the primitive getSplits methos will return up to this many splits. Note that * we cannot read this number from configuration, because the current OVCTable(Handle) does not pass configuration * down into the tables anywhere. See ENG-2395 for the fix. */ static final int DEFAULT_NUMBER_OF_SPLITS = 8; /** * Simplest possible implementation of getSplits. Takes the given start and end and divides the key space in * between into (almost) even partitions, using a long integer approximation of the keys. */ static List<KeyRange> primitiveGetSplits(int numSplits, byte[] start, byte[] stop) { // if the range is empty, return no splits if (start != null && stop != null && Bytes.compareTo(start, stop) >= 0) { return Collections.emptyList(); } if (numSplits <= 0) { numSplits = DEFAULT_NUMBER_OF_SPLITS; } // for simplicity, we construct a long from the begin and end, divide the resulting long range into approximately // even splits, and convert the boundaries back to nyte array keys. long begin = longForKey(start, false); long end = longForKey(stop, true); double splitSize = ((double) (end - begin)) / ((double) numSplits); // each range will start with the stop key of the previous range. // start key of the first range is either the given start, or the least possible key {0}; List<KeyRange> ranges = Lists.newArrayListWithExpectedSize(numSplits); byte[] current = start == null ? new byte[] { 0x00 } : start; for (int i = 1; i < numSplits; i++) { long bound = begin + (long) (splitSize * i); byte[] next = keyForBound(bound); // due to rounding and truncation, we may get a bound that is the same as the previous (or if the previous is // the start key, less than that). We may also get a bound that exceeds the stop key. In both cases we want to // ignore this bound and continue. if (Bytes.compareTo(current, next) < 0 && (stop == null || Bytes.compareTo(next, stop) < 0)) { ranges.add(new KeyRange(current, next)); current = next; } } // end of the last range is always the given stop of the range to cover ranges.add(new KeyRange(current, stop)); return ranges; } // helper method to approximate a row key as a long value. Takes the first 7 bytes from the key and prepends a 0x0; // if the key is less than 7 bytes, pads it with zeros to the right. static long longForKey(byte[] key, boolean isStop) { if (key == null) { return isStop ? 0xffffffffffffffL : 0L; } else { // leading zero helps avoid negative long values for keys beginning with a byte > 0x80 final byte[] leadingZero = { 0x00 }; byte[] x; if (key.length >= Bytes.SIZEOF_LONG - 1) { x = Bytes.add(leadingZero, Bytes.head(key, Bytes.SIZEOF_LONG - 1)); } else { x = Bytes.padTail(Bytes.add(leadingZero, key), Bytes.SIZEOF_LONG - 1 - key.length); } return Bytes.toLong(x); } } // helper method to convert a long approximation of a long key into a range bound. // the following invariant holds: keyForBound(longForKey(key)) == removeTrailingZeros(key). // removing the trailing zeros is ok in the context that this is used (only for split bounds) // this is called keyForBound on purpose, and not keyForLong. static byte[] keyForBound(long value) { byte[] bytes = Bytes.tail(Bytes.toBytes(value), Bytes.SIZEOF_LONG - 1); int lastNonZero = bytes.length - 1; while (lastNonZero > 0 && bytes[lastNonZero] == 0) { lastNonZero--; } return Bytes.head(bytes, lastNonZero + 1); } }