/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec.vector.expressions;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
/**
* An abstract class for LIKE and REGEXP expressions. LIKE and REGEXP expression share similar
* functions, but they have different grammars. AbstractFilterStringColLikeStringScalar class
* provides shared classes and methods. Each subclass handles its grammar.
*/
public abstract class AbstractFilterStringColLikeStringScalar extends VectorExpression {
/** Serialization version identifier. */
private static final long serialVersionUID = 1L;
/** Index into {@code batch.cols} of the string column that is filtered. */
private int colNum;
/** Pattern text passed to {@link #createChecker(String)} when the checker is first needed. */
private String pattern;
/**
 * Cached checker for {@link #pattern}. It is transient (not serialized) and is created by
 * {@link #evaluate(VectorizedRowBatch)} whenever it is found to be null.
 */
transient Checker checker = null;
/**
 * Creates an expression with no column or pattern set; use {@link #setColNum(int)} and
 * {@link #setPattern(String)} to configure it.
 */
public AbstractFilterStringColLikeStringScalar() {
super();
}
/**
 * Creates an expression that filters the given column with the given pattern.
 *
 * @param colNum index of the input string column in the batch
 * @param pattern pattern text; its grammar is defined by the subclass's checker factories
 */
public AbstractFilterStringColLikeStringScalar(int colNum, String pattern) {
this.colNum = colNum;
this.pattern = pattern;
}
/**
 * Supplies the checker factories that {@link #createChecker(String)} tries, in iteration order.
 * The first factory whose {@code tryCreate} returns a non-null checker is the one used.
 *
 * @return the factories to try for a pattern
 */
protected abstract List<CheckerFactory> getCheckerFactories();
/**
 * Picks a checker for the given pattern by asking each factory returned by
 * {@link #getCheckerFactories()} in turn and stopping at the first one that accepts it.
 *
 * @param pattern the pattern text to build a checker for
 * @return the checker produced by the first accepting factory, or <code>null</code> if
 *         every factory declines the pattern
 */
Checker createChecker(String pattern) {
  Checker selected = null;
  for (CheckerFactory factory : getCheckerFactories()) {
    selected = factory.tryCreate(pattern);
    if (selected != null) {
      break;
    }
  }
  return selected;
}
/**
 * Filters the batch in place: a row survives only if its value in column {@code colNum}
 * is non-null and accepted by the checker. Surviving row indices are written to
 * {@code batch.selected}, and {@code batch.size} / {@code batch.selectedInUse} are updated
 * to match.
 *
 * @param batch the batch to filter; modified in place
 */
@Override
public void evaluate(VectorizedRowBatch batch) {
  // The checker is built on first use (the field is transient and starts out null).
  if (checker == null) {
    checker = createChecker(pattern);
  }
  if (childExpressions != null) {
    super.evaluateChildren(batch);
  }

  final BytesColumnVector input = (BytesColumnVector) batch.cols[colNum];
  final int[] selected = batch.selected;
  final boolean[] isNull = input.isNull;
  final int rowCount = batch.size;
  final byte[][] bytes = input.vector;
  final int[] lengths = input.length;
  final int[] offsets = input.start;

  // Nothing to do for an empty batch.
  if (rowCount == 0) {
    return;
  }

  if (input.isRepeating) {
    // Entry 0 stands for every row, so the batch is kept or dropped as a whole.
    // A null repeated value never qualifies.
    final boolean valuePresent = input.noNulls || !isNull[0];
    if (!valuePresent || !checker.check(bytes[0], offsets[0], lengths[0])) {
      batch.size = 0;
    }
    return;
  }

  int kept = 0;
  if (input.noNulls) {
    if (batch.selectedInUse) {
      // Compact the existing selection down to the qualifying rows.
      for (int j = 0; j != rowCount; j++) {
        final int row = selected[j];
        if (checker.check(bytes[row], offsets[row], lengths[row])) {
          selected[kept++] = row;
        }
      }
      batch.size = kept;
    } else {
      for (int row = 0; row != rowCount; row++) {
        if (checker.check(bytes[row], offsets[row], lengths[row])) {
          selected[kept++] = row;
        }
      }
      // If every row qualified, the selection vector stays unused.
      if (kept < rowCount) {
        batch.size = kept;
        batch.selectedInUse = true;
      }
    }
  } else if (batch.selectedInUse) {
    // Same as above, but null entries are skipped without consulting the checker.
    for (int j = 0; j != rowCount; j++) {
      final int row = selected[j];
      if (!isNull[row] && checker.check(bytes[row], offsets[row], lengths[row])) {
        selected[kept++] = row;
      }
    }
    batch.size = kept;
  } else {
    for (int row = 0; row != rowCount; row++) {
      if (!isNull[row] && checker.check(bytes[row], offsets[row], lengths[row])) {
        selected[kept++] = row;
      }
    }
    // If every row qualified, the selection vector stays unused so later operators
    // can keep iterating the batch directly.
    if (kept < rowCount) {
      batch.size = kept;
      batch.selectedInUse = true;
    }
  }
}
/**
 * Always returns -1. NOTE(review): presumably this signals "no output column" because
 * {@link #evaluate(VectorizedRowBatch)} filters the batch in place - confirm against
 * the VectorExpression contract.
 *
 * @return -1
 */
@Override
public int getOutputColumn() {
return -1;
}
/**
 * Reports the type name of this expression's result.
 *
 * @return the literal string {@code "boolean"}
 */
@Override
public String getOutputType() {
return "boolean";
}
/**
 * A Checker contains a pattern and checks whether a given string matches or not.
 * The string is supplied as a slice of a byte array; the implementations in this file
 * compare it against UTF-8 pattern bytes or decode it as UTF-8.
 */
public interface Checker {
/**
 * Checks whether the given string matches with its pattern.
 * @param byteS The byte array that contains the string
 * @param start The start position of the string within {@code byteS}
 * @param len The length of the string in bytes
 * @return Whether it matches or not.
 */
boolean check(byte[] byteS, int start, int len);
}
/**
 * A CheckerFactory creates checkers of its kind. Factories are consulted one after another
 * by {@code createChecker}, which uses the first non-null result.
 */
protected interface CheckerFactory {
/**
 * If the given pattern is acceptable for its checker class, it creates and returns a checker.
 * Otherwise, it returns <code>null</code>.
 * @param pattern The pattern text to inspect
 * @return If the pattern is acceptable, a <code>Checker</code> object. Otherwise
 * <code>null</code>.
 */
Checker tryCreate(String pattern);
}
/**
* Matches the whole string to its pattern.
*/
protected static final class NoneChecker implements Checker {
final byte [] byteSub;
NoneChecker(String pattern) {
try {
byteSub = pattern.getBytes("UTF-8");
} catch (UnsupportedEncodingException e) {
throw new RuntimeException(e);
}
}
public boolean check(byte[] byteS, int start, int len) {
int lenSub = byteSub.length;
if (len != lenSub) {
return false;
}
for (int i = start, j = 0; j < len; i++, j++) {
if (byteS[i] != byteSub[j]) {
return false;
}
}
return true;
}
}
/**
* Matches the beginning of each string to a pattern.
*/
protected static final class BeginChecker implements Checker {
final byte[] byteSub;
BeginChecker(String pattern) {
try {
byteSub = pattern.getBytes("UTF-8");
} catch (UnsupportedEncodingException e) {
throw new RuntimeException(e);
}
}
public boolean check(byte[] byteS, int start, int len) {
int lenSub = byteSub.length;
if (len < byteSub.length) {
return false;
}
return StringExpr.equal(byteSub, 0, lenSub, byteS, start, lenSub);
}
}
/**
* Matches the ending of each string to its pattern.
*/
protected static final class EndChecker implements Checker {
final byte[] byteSub;
EndChecker(String pattern) {
try {
byteSub = pattern.getBytes("UTF-8");
} catch (UnsupportedEncodingException e) {
throw new RuntimeException(e);
}
}
public boolean check(byte[] byteS, int start, int len) {
int lenSub = byteSub.length;
if (len < lenSub) {
return false;
}
return StringExpr.equal(byteSub, 0, lenSub, byteS, start + len - lenSub, lenSub);
}
}
/**
* Matches the middle of each string to its pattern.
*/
protected static final class MiddleChecker implements Checker {
final StringExpr.Finder finder;
MiddleChecker(String pattern) {
finder = StringExpr.compile(pattern.getBytes(StandardCharsets.UTF_8));
}
public boolean check(byte[] byteS, int start, int len) {
return index(byteS, start, len) != -1;
}
/*
* Returns absolute offset of the match
*/
public int index(byte[] byteS, int start, int len) {
return finder.find(byteS, start, len);
}
}
/**
* Matches a chained sequence of checkers.
*
* This has 4 chain scenarios cases in it (has no escaping or single char wildcards)
*
* 1) anchored left "abc%def%"
* 2) anchored right "%abc%def"
* 3) unanchored "%abc%def%"
* 4) anchored on both sides "abc%def"
*/
protected static final class ChainedChecker implements Checker {
final int minLen;
final BeginChecker begin;
final EndChecker end;
final MiddleChecker[] middle;
final int[] midLens;
final int beginLen;
final int endLen;
ChainedChecker(String pattern) {
final StringTokenizer tokens = new StringTokenizer(pattern, "%");
final boolean leftAnchor = pattern.startsWith("%") == false;
final boolean rightAnchor = pattern.endsWith("%") == false;
int len = 0;
// at least 2 checkers always
BeginChecker left = null;
EndChecker right = null;
int leftLen = 0; // not -1
int rightLen = 0; // not -1
final List<MiddleChecker> checkers = new ArrayList<MiddleChecker>(2);
final List<Integer> lengths = new ArrayList<Integer>(2);
for (int i = 0; tokens.hasMoreTokens(); i++) {
String chunk = tokens.nextToken();
if (chunk.length() == 0) {
// %% is folded in the .*?.*? regex usually into .*?
continue;
}
len += utf8Length(chunk);
if (leftAnchor && i == 0) {
// first item
left = new BeginChecker(chunk);
leftLen = utf8Length(chunk);
} else if (rightAnchor && tokens.hasMoreTokens() == false) {
// last item
right = new EndChecker(chunk);
rightLen = utf8Length(chunk);
} else {
// middle items in order
checkers.add(new MiddleChecker(chunk));
lengths.add(utf8Length(chunk));
}
}
midLens = ArrayUtils.toPrimitive(lengths.toArray(ArrayUtils.EMPTY_INTEGER_OBJECT_ARRAY));
middle = checkers.toArray(new MiddleChecker[0]);
minLen = len;
begin = left;
end = right;
beginLen = leftLen;
endLen = rightLen;
}
public boolean check(byte[] byteS, final int start, final int len) {
int pos = start;
int mark = len;
if (len < minLen) {
return false;
}
// prefix, extend start
if (begin != null && false == begin.check(byteS, pos, mark)) {
// no match
return false;
} else {
pos += beginLen;
mark -= beginLen;
}
// suffix, reduce len
if (end != null && false == end.check(byteS, pos, mark)) {
// no match
return false;
} else {
// no pos change - no need since we've shrunk the string with same pos
mark -= endLen;
}
// loop for middles
for (int i = 0; i < middle.length; i++) {
int index = middle[i].index(byteS, pos, mark);
if (index == -1) {
// no match
return false;
} else {
mark -= ((index-pos) + midLens[i]);
pos = index + midLens[i];
}
}
// if all is good
return true;
}
private int utf8Length(String chunk) {
try {
return chunk.getBytes("UTF-8").length;
} catch (UnsupportedEncodingException ue) {
throw new RuntimeException(ue);
}
}
}
/**
* Matches each string to a pattern with Java regular expression package.
*/
protected static class ComplexChecker implements Checker {
Pattern compiledPattern;
Matcher matcher;
FastUTF8Decoder decoder;
ComplexChecker(String pattern) {
compiledPattern = Pattern.compile(pattern);
matcher = compiledPattern.matcher("");
decoder = new FastUTF8Decoder();
}
public boolean check(byte[] byteS, int start, int len) {
// Match the given bytes with the like pattern
matcher.reset(decoder.decodeUnsafely(byteS, start, len));
return matcher.find(0);
}
}
/**
 * A fast UTF-8 decoder that caches necessary objects for decoding. The decoder and both
 * buffers are reused between calls, and the buffers only ever grow.
 */
private static class FastUTF8Decoder {
  CharsetDecoder decoder;
  ByteBuffer byteBuffer;
  CharBuffer charBuffer;

  /** Sets up a replacing UTF-8 decoder and small initial buffers. */
  public FastUTF8Decoder() {
    byteBuffer = ByteBuffer.allocate(4);
    charBuffer = CharBuffer.allocate(4);
    final CharsetDecoder utf8 = StandardCharsets.UTF_8.newDecoder();
    // Malformed or unmappable input is replaced rather than reported.
    utf8.onMalformedInput(CodingErrorAction.REPLACE);
    utf8.onUnmappableCharacter(CodingErrorAction.REPLACE);
    decoder = utf8;
  }

  /**
   * Decodes a slice of UTF-8 bytes into the shared character buffer.
   *
   * @param byteS the byte array that contains the string
   * @param start the start position of the string
   * @param len the length of the string in bytes
   * @return the internal buffer holding the decoded characters; it is overwritten by the
   *         next call, so callers must not keep it
   */
  public CharBuffer decodeUnsafely(byte[] byteS, int start, int len) {
    stageInput(byteS, start, len);
    prepareOutput();

    // Decode UTF-8
    decoder.reset();
    decoder.decode(byteBuffer, charBuffer, true);
    decoder.flush(charBuffer);
    charBuffer.flip();
    return charBuffer;
  }

  /** Copies the slice into the byte buffer, growing it first when it is too small. */
  private void stageInput(byte[] byteS, int start, int len) {
    if (len > byteBuffer.capacity()) {
      byteBuffer = ByteBuffer.allocate(len * 2);
    }
    byteBuffer.clear();
    byteBuffer.put(byteS, start, len);
    byteBuffer.flip();
  }

  /** Empties the char buffer, growing it first so it can hold a full byte buffer's output. */
  private void prepareOutput() {
    final int maxChars = (int) (byteBuffer.capacity() * decoder.maxCharsPerByte());
    if (maxChars > charBuffer.capacity()) {
      charBuffer = CharBuffer.allocate(maxChars);
    }
    charBuffer.clear();
  }
}
/**
 * @return the index of the input column in the batch
 */
public int getColNum() {
return colNum;
}
/**
 * @param colNum the index of the input column in the batch
 */
public void setColNum(int colNum) {
this.colNum = colNum;
}
/**
 * @return the pattern text this expression was configured with
 */
public String getPattern() {
return pattern;
}
/**
 * Replaces the pattern and discards any checker that was built for the previous one, so
 * that the next evaluation matches against the new pattern.
 *
 * @param pattern the new pattern text
 */
public void setPattern(String pattern) {
  this.pattern = pattern;
  // evaluate() only builds a checker when none is cached, so a stale one must be dropped.
  this.checker = null;
}
/**
 * Describes this expression's parameters as text.
 *
 * @return a string of the form {@code "col <colNum>, pattern <pattern>"}
 */
@Override
public String vectorExpressionParameters() {
return "col " + colNum + ", pattern " + pattern;
}
/**
 * Builds the descriptor for this expression: FILTER mode with two string-family
 * arguments, the first a column and the second a scalar.
 *
 * @return the descriptor built from those settings
 */
@Override
public VectorExpressionDescriptor.Descriptor getDescriptor() {
  final VectorExpressionDescriptor.Builder builder = new VectorExpressionDescriptor.Builder();
  return builder
      .setMode(VectorExpressionDescriptor.Mode.FILTER)
      .setNumArguments(2)
      .setArgumentTypes(
          VectorExpressionDescriptor.ArgumentType.STRING_FAMILY,
          VectorExpressionDescriptor.ArgumentType.STRING_FAMILY)
      .setInputExpressionTypes(
          VectorExpressionDescriptor.InputExpressionType.COLUMN,
          VectorExpressionDescriptor.InputExpressionType.SCALAR)
      .build();
}
}