/*
* Copyright © 2014 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.data2.dataset2.lib.table;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.dataset.table.Filter;
import co.cask.cdap.common.utils.ImmutablePair;
import java.util.Arrays;
import java.util.List;
/**
* This is inspired by HBase's FuzzyRowFilter.
*
* Filters data based on fuzzy row key. Performs fast-forwards during scanning.
* It takes pairs (row key, fuzzy info) to match row keys, where fuzzy info is
* a byte array with 0 or 1 as its values:
* <ul>
* <li>
* 0 - means that this byte in provided row key is fixed, i.e. row key's byte at same position
* must match
* </li>
* <li>
* 1 - means that this byte in provided row key is NOT fixed, i.e. row key's byte at this
* position can be different from the one in provided row key
* </li>
* </ul>
*
*
* Example:
* Let's assume row key format is userId_actionId_year_month. Length of userId is fixed
* and is 4, length of actionId is 2 and year and month are 4 and 2 bytes long respectively.
*
* Let's assume that we need to fetch all users that performed certain action (encoded as "99")
* in Jan of any year. Then the pair (row key, fuzzy info) would be the following:
* row key = "????_99_????_01" (one can use any value instead of "?")
* fuzzy info = "\x01\x01\x01\x01\x00\x00\x00\x00\x01\x01\x01\x01\x00\x00\x00"
*
* I.e. the fuzzy info says that the matching mask is "????_99_????_01", where each ? can be any value.
*
*/
public final class FuzzyRowFilter implements Filter {

  /**
   * Pairs of (row key, fuzzy info). In the fuzzy info, 0 marks a fixed position of the row key
   * and 1 marks a non-fixed one.
   */
  private final List<ImmutablePair<byte[], byte[]>> fuzzyKeysData;

  /**
   * Creates a filter out of the given fuzzy rules. A row is included if it satisfies at least
   * one of them.
   *
   * @param fuzzyKeysData pairs of (row key, fuzzy info); the list is stored as is (not copied)
   */
  public FuzzyRowFilter(List<ImmutablePair<byte[], byte[]>> fuzzyKeysData) {
    this.fuzzyKeysData = fuzzyKeysData;
  }

  /**
   * @return the (row key, fuzzy info) pairs this filter was created with; this is the same list
   *         instance that was passed to the constructor
   */
  public List<ImmutablePair<byte[], byte[]>> getFuzzyKeysData() {
    return fuzzyKeysData;
  }

  /**
   * Return codes for filterRow().
   */
  public enum ReturnCode {
    /**
     * Include this row.
     */
    INCLUDE,
    /**
     * Seek to next row which is given as hint by the filter.
     */
    SEEK_NEXT_USING_HINT,
    /**
     * No greater rows can possibly match.
     */
    DONE
  }

  /**
   * Decides what to do with the given row.
   *
   * @param rowKey row key to check; {@code null} is treated as matching
   * @return {@link ReturnCode#INCLUDE} if the row satisfies at least one fuzzy rule,
   *         {@link ReturnCode#SEEK_NEXT_USING_HINT} if it satisfies none but for at least one
   *         rule a greater matching row may exist, {@link ReturnCode#DONE} otherwise
   */
  public ReturnCode filterRow(byte[] rowKey) {
    // assigning "worst" result first and looking for better options
    SatisfiesCode bestOption = SatisfiesCode.NO_NEXT;
    for (ImmutablePair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
      SatisfiesCode satisfiesCode =
        satisfies(rowKey, fuzzyData.getFirst(), fuzzyData.getSecond());
      if (satisfiesCode == SatisfiesCode.YES) {
        return ReturnCode.INCLUDE;
      }
      if (satisfiesCode == SatisfiesCode.NEXT_EXISTS) {
        bestOption = SatisfiesCode.NEXT_EXISTS;
      }
    }
    if (bestOption == SatisfiesCode.NEXT_EXISTS) {
      return ReturnCode.SEEK_NEXT_USING_HINT;
    }
    // the only unhandled SatisfiesCode is NO_NEXT, i.e. we are done
    return ReturnCode.DONE;
  }

  /**
   * Computes the row key to seek to after {@link #filterRow(byte[])} returned
   * {@link ReturnCode#SEEK_NEXT_USING_HINT} for the given row.
   *
   * @param rowKey the row key that was rejected
   * @return the smallest key, over all fuzzy rules, that is greater than {@code rowKey} and
   *         satisfies a rule
   * @throws IllegalStateException if no rule has a matching key greater than {@code rowKey}
   */
  public byte[] getNextRowHint(byte[] rowKey) {
    byte[] nextRowKey = null;
    // Searching for the "smallest" row key that satisfies at least one fuzzy row key
    for (ImmutablePair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
      byte[] nextRowKeyCandidate = getNextForFuzzyRule(rowKey,
                                                       fuzzyData.getFirst(), fuzzyData.getSecond());
      if (nextRowKeyCandidate == null) {
        continue;
      }
      if (nextRowKey == null || Bytes.compareTo(nextRowKeyCandidate, nextRowKey) < 0) {
        nextRowKey = nextRowKeyCandidate;
      }
    }
    if (nextRowKey == null) {
      // SHOULD NEVER happen
      // TODO: is there a better way than throw exception? (stop the scanner?)
      throw new IllegalStateException("No next row key that satisfies fuzzy exists when" +
                                        " getNextRowHint() is invoked." +
                                        " Filter: " + this.toString() +
                                        " RowKey: " + Bytes.toStringBinary(rowKey));
    }
    return nextRowKey;
  }

  @Override
  public String toString() {
    final StringBuilder sb = new StringBuilder();
    sb.append("FuzzyRowFilter");
    sb.append("{fuzzyKeysData=");
    for (ImmutablePair<byte[], byte[]> fuzzyData : fuzzyKeysData) {
      sb.append('{').append(Bytes.toStringBinary(fuzzyData.getFirst())).append(":");
      sb.append(Bytes.toStringBinary(fuzzyData.getSecond())).append('}');
    }
    sb.append("}, ");
    return sb.toString();
  }

  // Utility methods

  /**
   * Result of checking a single row against a single fuzzy rule.
   */
  enum SatisfiesCode {
    // row satisfies fuzzy rule
    YES,
    // row doesn't satisfy fuzzy rule, but there's possible greater row that does
    NEXT_EXISTS,
    // row doesn't satisfy fuzzy rule and there's no greater row that does
    NO_NEXT
  }

  /**
   * Checks the whole given row against a fuzzy rule.
   *
   * @param row row key to check; {@code null} yields {@link SatisfiesCode#YES}
   * @param fuzzyKeyBytes row key part of the rule
   * @param fuzzyKeyMeta fuzzy info part of the rule (0 - fixed position, 1 - non-fixed position)
   * @return how the row relates to the rule
   */
  static SatisfiesCode satisfies(byte[] row,
                                 byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
    if (row == null) {
      // Must be checked before row.length is read below, otherwise a null row ends in a
      // NullPointerException instead of letting the scan proceed.
      return SatisfiesCode.YES;
    }
    return satisfies(row, 0, row.length, fuzzyKeyBytes, fuzzyKeyMeta);
  }

  /**
   * Checks {@code length} bytes of {@code row}, starting at {@code offset}, against a fuzzy rule.
   * Only the first {@code min(fuzzyKeyMeta.length, length)} positions are compared, so a row that
   * is shorter than the rule satisfies it if all of its fixed positions match.
   *
   * @return {@link SatisfiesCode#YES} if no compared fixed position differs; otherwise
   *         {@link SatisfiesCode#NEXT_EXISTS} if a greater byte array satisfying the rule exists
   *         and {@link SatisfiesCode#NO_NEXT} if it doesn't
   */
  private static SatisfiesCode satisfies(byte[] row, int offset, int length,
                                         byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
    if (row == null) {
      // do nothing, let scan to proceed
      return SatisfiesCode.YES;
    }
    boolean nextRowKeyCandidateExists = false;
    for (int i = 0; i < fuzzyKeyMeta.length && i < length; i++) {
      // First, checking if this position is fixed and not equals the given one
      boolean byteAtPositionFixed = fuzzyKeyMeta[i] == 0;
      boolean fixedByteIncorrect = byteAtPositionFixed && fuzzyKeyBytes[i] != row[i + offset];
      if (fixedByteIncorrect) {
        // in this case there's another row that satisfies fuzzy rule and bigger than this row
        if (nextRowKeyCandidateExists) {
          return SatisfiesCode.NEXT_EXISTS;
        }
        // If this row byte is less than fixed then there's a byte array bigger than
        // this row and which satisfies the fuzzy rule. Otherwise there's no such byte array:
        // this row is simply bigger than any byte array that satisfies the fuzzy rule
        boolean rowByteLessThanFixed = (row[i + offset] & 0xFF) < (fuzzyKeyBytes[i] & 0xFF);
        return rowByteLessThanFixed ? SatisfiesCode.NEXT_EXISTS : SatisfiesCode.NO_NEXT;
      }
      // Second, checking if this position is not fixed and the row's byte value is not the
      // biggest. In this case there's a byte array bigger than this row and which satisfies the
      // fuzzy rule. To get bigger byte array that satisfies the rule we need to just increase
      // this byte (see the code of getNextForFuzzyRule below) by one.
      // Note: if non-fixed byte is already at biggest value, this doesn't allow us to say there's
      // bigger one that satisfies the rule as it can't be increased.
      // It is the ROW's byte that matters here: the rule's byte at a non-fixed position is just
      // a placeholder, and getNextForFuzzyRule makes its decision based on the row's byte too.
      if (fuzzyKeyMeta[i] == 1 && !isMax(row[i + offset])) {
        nextRowKeyCandidateExists = true;
      }
    }
    return SatisfiesCode.YES;
  }

  /**
   * @return true if the given byte, treated as unsigned, has the biggest possible value (0xFF)
   */
  private static boolean isMax(byte fuzzyKeyByte) {
    return (fuzzyKeyByte & 0xFF) == 255;
  }

  /**
   * Same as {@link #getNextForFuzzyRule(byte[], int, int, byte[], byte[])} applied to the whole
   * given row.
   */
  static byte[] getNextForFuzzyRule(byte[] row, byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
    return getNextForFuzzyRule(row, 0, row.length, fuzzyKeyBytes, fuzzyKeyMeta);
  }

  /**
   * Finds the smallest byte array that is greater than the given row and satisfies the fuzzy rule.
   * Positions at or beyond {@code fuzzyKeyMeta.length} are treated as non-fixed.
   *
   * <p>Note: row bytes are read at every position up to
   * {@code max(length, fuzzyKeyBytes.length)} until a differing fixed position is met, so a row
   * shorter than the rule may cause an {@link ArrayIndexOutOfBoundsException}.
   *
   * @param row array holding the row key
   * @param offset index in {@code row} at which the row key starts
   * @param length length of the row key
   * @param fuzzyKeyBytes row key part of the rule
   * @param fuzzyKeyMeta fuzzy info part of the rule (0 - fixed position, 1 - non-fixed position)
   * @return greater byte array than given (row) which satisfies the fuzzy rule if it exists,
   *         null otherwise
   */
  private static byte[] getNextForFuzzyRule(byte[] row, int offset, int length,
                                            byte[] fuzzyKeyBytes, byte[] fuzzyKeyMeta) {
    // To find out the next "smallest" byte array that satisfies fuzzy rule and "greater" than
    // the given one we do the following:
    // 1. setting values on all "fixed" positions to the values from fuzzyKeyBytes
    // 2. if during the first step given row did not increase, then we increase the value at
    //    the last "non-fixed" position seen so far where it is not maximum already
    // 3. setting all "non-fixed" positions to the right of the changed one to zeroes
    // It is easier to perform this by using fuzzyKeyBytes copy and setting "non-fixed" position
    // values than otherwise.
    byte[] result = Arrays.copyOf(fuzzyKeyBytes, Math.max(length, fuzzyKeyBytes.length));
    int toInc = -1;
    // fixed position at which taking the rule's byte made the result greater than the row
    int increasedAt = -1;
    for (int i = 0; i < result.length; i++) {
      if (i >= fuzzyKeyMeta.length || fuzzyKeyMeta[i] == 1) {
        result[i] = row[offset + i];
        if (!isMax(row[offset + i])) {
          // this is "non-fixed" position and is not at max value, hence we can increase it
          toInc = i;
        }
      } else if (fuzzyKeyMeta[i] == 0) {
        int rowByte = row[offset + i] & 0xFF;
        int fixedByte = fuzzyKeyBytes[i] & 0xFF;
        if (rowByte < fixedByte) {
          // if setting value for any fixed position increased the original array,
          // we are OK
          increasedAt = i;
          break;
        }
        if (rowByte > fixedByte) {
          // if setting value for any fixed position makes array "smaller", then just stop:
          // in case we found some non-fixed position to increase we will do it, otherwise
          // there's no "next" row key that satisfies fuzzy rule and "greater" than given row
          break;
        }
      }
    }

    // position that makes the result greater than the row
    int pivot;
    if (increasedAt >= 0) {
      pivot = increasedAt;
    } else {
      if (toInc < 0) {
        return null;
      }
      result[toInc]++;
      pivot = toInc;
    }
    // Setting all "non-fixed" positions to zeroes to the right of the pivot so that found "next"
    // row key is the smallest possible. This is needed in both cases: the result started as a
    // copy of fuzzyKeyBytes, whose bytes at "non-fixed" positions are arbitrary placeholders.
    for (int i = pivot + 1; i < result.length; i++) {
      if (i >= fuzzyKeyMeta.length || fuzzyKeyMeta[i] == 1) {
        result[i] = 0;
      }
    }
    return result;
  }
}