/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.exec.vector.expressions;

import java.util.Arrays;

import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;

/**
 * String expression evaluation helper functions.
 *
 * All helpers operate on raw byte slices (array, start offset, byte length). Character-aware
 * helpers count a character for every byte that is not a UTF-8 continuation byte
 * (i.e. every byte whose two high bits are not {@code 10}).
 */
public class StringExpr {

  /**
   * Compare two strings from two byte arrays, each with its own start position and length.
   * Uses lexicographic unsigned byte value order, which is what is used for UTF-8 sort order.
   *
   * @param arg1   array holding the first string
   * @param start1 offset of the first string in {@code arg1}
   * @param len1   byte length of the first string
   * @param arg2   array holding the second string
   * @param start2 offset of the second string in {@code arg2}
   * @param len2   byte length of the second string
   * @return a negative value if arg1 &lt; arg2, 0 if arg1 = arg2, a positive value if arg1 &gt; arg2
   */
  public static int compare(byte[] arg1, int start1, int len1, byte[] arg2, int start2, int len2) {
    for (int i = 0; i < len1 && i < len2; i++) {
      // "& 0xff" widens the (signed) byte to its unsigned value 0..255 as an int,
      // so the subtraction below can neither overflow nor order bytes >= 0x80 first.
      int b1 = arg1[i + start1] & 0xff;
      int b2 = arg2[i + start2] & 0xff;
      if (b1 != b2) {
        return b1 - b2;
      }
    }
    // All bytes of the common prefix match: the shorter string sorts first.
    return len1 - len2;
  }

  /**
   * Determine whether two strings are equal, given two byte arrays each with its own
   * start position and length.
   *
   * The statement order here is deliberate: the first/last element probe is done up front so
   * that an out-of-bounds slice fails before the main loops, and the loops are written in
   * fixed-size steps of 8.
   *
   * @param arg1   array holding the first string
   * @param start1 offset of the first string in {@code arg1}
   * @param len1   byte length of the first string
   * @param arg2   array holding the second string
   * @param start2 offset of the second string in {@code arg2}
   * @param len2   byte length of the second string
   * @return true if both slices have the same length and the same bytes
   */
  public static boolean equal(byte[] arg1, final int start1, final int len1,
      byte[] arg2, final int start2, final int len2) {
    if (len1 != len2) {
      return false;
    }
    if (len1 == 0) {
      return true;
    }

    // do bounds check for OOB exception
    if (arg1[start1] != arg2[start2]
        || arg1[start1 + len1 - 1] != arg2[start2 + len2 - 1]) {
      return false;
    }

    if (len1 == len2) {
      // prove invariant to the compiler: len1 = len2
      // all array access between (start1, start1+len1)
      // and (start2, start2+len2) are valid
      // no more OOB exceptions are possible
      final int step = 8;
      final int remainder = len1 % step;
      final int wlen = len1 - remainder;
      // suffix first
      for (int i = wlen; i < len1; i++) {
        if (arg1[start1 + i] != arg2[start2 + i]) {
          return false;
        }
      }
      // SIMD loop: compare a whole 8-byte group without branching per byte,
      // and only test the accumulated result once per group.
      for (int i = 0; i < wlen; i += step) {
        final int s1 = start1 + i;
        final int s2 = start2 + i;
        boolean neq = false;
        for (int j = 0; j < step; j++) {
          neq = (arg1[s1 + j] != arg2[s2 + j]) || neq;
        }
        if (neq) {
          return false;
        }
      }
    }

    return true;
  }

  /**
   * Count the characters in a whole byte array.
   *
   * @param bytes the bytes to scan
   * @return the number of bytes that are not UTF-8 continuation bytes
   */
  public static int characterCount(byte[] bytes) {
    return characterCount(bytes, 0, bytes.length);
  }

  /**
   * Count the characters in a slice of a byte array.
   *
   * @param bytes  array holding the string
   * @param start  offset of the string in {@code bytes}
   * @param length byte length of the string
   * @return the number of bytes in the slice that are not UTF-8 continuation bytes
   */
  public static int characterCount(byte[] bytes, int start, int length) {
    final int end = start + length;
    int charCount = 0;
    for (int j = start; j < end; j++) {
      // UTF-8 continuation bytes have 2 high bits equal to 0x80.
      if ((bytes[j] & 0xc0) != 0x80) {
        ++charCount;
      }
    }
    return charCount;
  }

  /**
   * A setVal with the same function signature as rightTrim, leftTrim, truncate, etc, below.
   * Useful for class generation via templates.
   *
   * @param outV   output vector
   * @param i      element of {@code outV} to set
   * @param bytes  array holding the string
   * @param start  offset of the string in {@code bytes}
   * @param length byte length of the string
   */
  public static void assign(BytesColumnVector outV, int i, byte[] bytes, int start, int length) {
    // set output vector
    outV.setVal(i, bytes, start, length);
  }

  /**
   * Right trim a slice of a byte array and return the new byte length.
   * Only the blank byte 0x20 is trimmed.
   *
   * @param bytes  array holding the string
   * @param start  offset of the string in {@code bytes}
   * @param length byte length of the string
   * @return the byte length of the slice without its trailing blanks
   */
  public static int rightTrim(byte[] bytes, int start, int length) {
    // skip trailing blank characters
    int j = start + length - 1;
    while (j >= start && bytes[j] == 0x20) {
      j--;
    }
    return (j - start) + 1;
  }

  /**
   * Right trim a slice of a byte array and place the result into element i of a vector.
   *
   * @param outV   output vector
   * @param i      element of {@code outV} to set
   * @param bytes  array holding the string
   * @param start  offset of the string in {@code bytes}
   * @param length byte length of the string
   */
  public static void rightTrim(BytesColumnVector outV, int i, byte[] bytes, int start,
      int length) {
    // set output vector
    outV.setVal(i, bytes, start, rightTrim(bytes, start, length));
  }

  /**
   * Truncate a slice of a byte array to a maximum number of characters and
   * return the new byte length.
   *
   * @param bytes     array holding the string
   * @param start     offset of the string in {@code bytes}
   * @param length    byte length of the string
   * @param maxLength maximum number of characters to keep
   * @return the byte length of the first {@code maxLength} characters, or {@code length}
   *         if the slice holds no more than {@code maxLength} characters
   */
  public static int truncate(byte[] bytes, int start, int length, int maxLength) {
    final int end = start + length;

    // count characters forward
    int j = start;
    int charCount = 0;
    while (j < end) {
      // UTF-8 continuation bytes have 2 high bits equal to 0x80.
      if ((bytes[j] & 0xc0) != 0x80) {
        // j is the first byte of a new character: stop here if the quota is already used,
        // so that no character is ever cut in the middle.
        if (charCount == maxLength) {
          break;
        }
        ++charCount;
      }
      j++;
    }
    return (j - start);
  }

  /**
   * Truncate a slice of a byte array to a maximum number of characters and
   * place the result into element i of a vector.
   *
   * @param outV      output vector
   * @param i         element of {@code outV} to set
   * @param bytes     array holding the string
   * @param start     offset of the string in {@code bytes}
   * @param length    byte length of the string
   * @param maxLength maximum number of characters to keep
   */
  public static void truncate(BytesColumnVector outV, int i, byte[] bytes, int start, int length,
      int maxLength) {
    // set output vector
    outV.setVal(i, bytes, start, truncate(bytes, start, length, maxLength));
  }

  /**
   * Truncate a byte array to a maximum number of characters and
   * return a byte array with only truncated bytes.
   *
   * @param bytes     the whole string
   * @param maxLength maximum number of characters to keep
   * @return {@code bytes} itself (not a copy) when nothing had to be cut,
   *         otherwise a new, shorter array
   */
  public static byte[] truncateScalar(byte[] bytes, int maxLength) {
    final int newLength = truncate(bytes, 0, bytes.length, maxLength);
    if (newLength == bytes.length) {
      return bytes;
    } else {
      return Arrays.copyOf(bytes, newLength);
    }
  }

  /**
   * Right trim and truncate a slice of a byte array to a maximum number of characters and
   * return the new byte length.
   *
   * The slice is first limited to {@code maxLength} characters; blanks (0x20) that end up
   * trailing after that limit is applied are then dropped.
   *
   * @param bytes     array holding the string
   * @param start     offset of the string in {@code bytes}
   * @param length    byte length of the string
   * @param maxLength maximum number of characters to keep
   * @return the byte length of the truncated and right-trimmed slice
   */
  public static int rightTrimAndTruncate(byte[] bytes, int start, int length, int maxLength) {
    final int end = start + length;

    // count characters forward and watch for final run of pads
    int j = start;
    int charCount = 0;
    int padRunStart = -1;
    while (j < end) {
      // UTF-8 continuation bytes have 2 high bits equal to 0x80.
      if ((bytes[j] & 0xc0) != 0x80) {
        if (charCount == maxLength) {
          break;
        }
        if (bytes[j] == 0x20) {
          // remember where the current run of blanks began
          if (padRunStart == -1) {
            padRunStart = j;
          }
        } else {
          padRunStart = -1;
        }
        ++charCount;
      } else {
        padRunStart = -1;
      }
      j++;
    }
    // A still-open pad run reaches the end of the kept bytes, so cut it off.
    if (padRunStart != -1) {
      return (padRunStart - start);
    } else {
      return (j - start);
    }
  }

  /**
   * Right trim and truncate a slice of a byte array to a maximum number of characters and
   * place the result into element i of a vector.
   *
   * @param outV      output vector
   * @param i         element of {@code outV} to set
   * @param bytes     array holding the string
   * @param start     offset of the string in {@code bytes}
   * @param length    byte length of the string
   * @param maxLength maximum number of characters to keep
   */
  public static void rightTrimAndTruncate(BytesColumnVector outV, int i, byte[] bytes, int start,
      int length, int maxLength) {
    // set output vector
    outV.setVal(i, bytes, start, rightTrimAndTruncate(bytes, start, length, maxLength));
  }

  /**
   * Right trim and truncate a byte array to a maximum number of characters and
   * return a byte array with only the trimmed and truncated bytes.
   *
   * @param bytes     the whole string
   * @param maxLength maximum number of characters to keep
   * @return {@code bytes} itself (not a copy) when nothing had to be removed,
   *         otherwise a new, shorter array
   */
  public static byte[] rightTrimAndTruncateScalar(byte[] bytes, int maxLength) {
    final int newLength = rightTrimAndTruncate(bytes, 0, bytes.length, maxLength);
    if (newLength == bytes.length) {
      return bytes;
    } else {
      return Arrays.copyOf(bytes, newLength);
    }
  }

  /**
   * Compiles the given pattern with a proper algorithm.
   *
   * @param pattern the bytes to search for; the array is kept by reference, not copied
   * @return a finder for {@code pattern}
   */
  public static Finder compile(byte[] pattern) {
    return new BoyerMooreHorspool(pattern);
  }

  /**
   * A finder finds the first index of its pattern in a given byte array.
   * Its thread-safety depends on its implementation.
   */
  public interface Finder {
    /**
     * @param input array to search in
     * @param start offset of the region to search
     * @param len   byte length of the region to search
     * @return the index in {@code input} of the first match, or -1 if there is none
     */
    int find(byte[] input, int start, int len);
  }

  /**
   * StringExpr uses Boyer Moore Horspool algorithm to find faster.
   * It holds final members only and does not modify them after construction.
   * See https://en.wikipedia.org/wiki/Boyer–Moore–Horspool_algorithm .
   */
  private static class BoyerMooreHorspool implements Finder {
    /** Mask turning a (signed) byte into its unsigned value 0..255. */
    private static final int MAX_BYTE = 0xff;

    /**
     * Bad-character shift table, one slot per possible unsigned byte value.
     * It needs MAX_BYTE + 1 slots: the index {@code b & MAX_BYTE} can be 0xff itself.
     */
    private final int[] shift = new int[MAX_BYTE + 1];
    private final byte[] pattern;
    private final int plen;

    public BoyerMooreHorspool(byte[] pattern) {
      this.pattern = pattern;
      this.plen = pattern.length;
      // Bytes that do not occur in the pattern allow skipping a whole pattern length.
      Arrays.fill(shift, plen);
      // For every byte except the last one, record the distance from its
      // rightmost occurrence to the end of the pattern.
      for (int i = 0; i < plen - 1; i++) {
        shift[pattern[i] & MAX_BYTE] = plen - i - 1;
      }
    }

    /**
     * Find the first occurrence of the pattern in {@code input[start, start + len)}.
     *
     * @return the index in {@code input} of the first match, or -1 if there is none.
     *         An empty pattern always yields 0, independent of {@code start}.
     */
    @Override
    public int find(byte[] input, int start, int len) {
      if (pattern.length == 0) {
        return 0;
      }

      // local copies of the fields for the hot loop
      final int plen = this.plen;
      final byte[] pattern = this.pattern;
      final int end = start + len;

      // next is the input index aligned with the last byte of the pattern
      int next = start + plen - 1;
      while (next < end) {
        // compare the window right-to-left
        int s_tmp = next;
        int p_tmp = plen - 1;
        while (input[s_tmp] == pattern[p_tmp]) {
          p_tmp--;
          if (p_tmp < 0) {
            return s_tmp;
          }
          s_tmp--;
        }
        // slide the window according to the byte under its last position
        next += shift[input[next] & MAX_BYTE];
      }
      return -1;
    }
  }
}