StringExpr.java example

Explorer
hive-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.exec.vector.expressions;

import java.util.Arrays;

import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;

/**
 * String expression evaluation helper functions.
 */
public class StringExpr {

  /**
   * Compares two strings held in byte-array slices, each with its own start
   * position and length, using lexicographic order over unsigned byte values.
   * This is the order used for sorting UTF-8 encoded strings.
   *
   * @param arg1 array holding the first string
   * @param start1 offset of the first string in {@code arg1}
   * @param len1 length in bytes of the first string
   * @param arg2 array holding the second string
   * @param start2 offset of the second string in {@code arg2}
   * @param len2 length in bytes of the second string
   * @return a negative value if arg1 &lt; arg2, 0 if they are equal,
   *         a positive value if arg1 &gt; arg2
   */
  public static int compare(byte[] arg1, int start1, int len1, byte[] arg2, int start2, int len2) {
    for (int i = 0; i < len1 && i < len2; i++) {
      // Java bytes are signed; masking with 0xff yields the unsigned value (0..255)
      // so that bytes >= 0x80 sort after ASCII bytes.
      int b1 = arg1[i + start1] & 0xff;
      int b2 = arg2[i + start2] & 0xff;
      if (b1 != b2) {
        return b1 - b2;
      }
    }
    // One slice is a prefix of the other (or they are identical): shorter sorts first.
    return len1 - len2;
  }

  /**
   * Determines whether two strings held in byte-array slices, each with its own
   * start position and length, contain exactly the same bytes.
   *
   * @param arg1 array holding the first string
   * @param start1 offset of the first string in {@code arg1}
   * @param len1 length in bytes of the first string
   * @param arg2 array holding the second string
   * @param start2 offset of the second string in {@code arg2}
   * @param len2 length in bytes of the second string
   * @return true if both slices have the same length and the same bytes
   */
  public static boolean equal(byte[] arg1, final int start1, final int len1,
      byte[] arg2, final int start2, final int len2) {
    if (len1 != len2) {
      return false;
    }
    if (len1 == 0) {
      return true;
    }

    // Touch the first and last byte of both slices up front: an out-of-bounds
    // slice fails here, and a mismatch at either end is detected cheaply.
    if (arg1[start1] != arg2[start2]
        || arg1[start1 + len1 - 1] != arg2[start2 + len2 - 1]) {
      return false;
    }

    // Always true at this point; restated as a hint that len1 == len2, so every
    // access in [start, start + len) of both arrays below is already known valid.
    if (len1 == len2) {
      final int step = 8;
      final int remainder = len1 % step;
      final int wlen = len1 - remainder;
      // Compare the trailing (len1 % step) bytes first.
      for (int i = wlen; i < len1; i++) {
        if (arg1[start1 + i] != arg2[start2 + i]) {
          return false;
        }
      }
      // Compare the rest in fixed strides of 'step' bytes. The inner loop has no
      // early exit; differences are accumulated and tested once per stride.
      for (int i = 0; i < wlen; i += step) {
        final int s1 = start1 + i;
        final int s2 = start2 + i;
        boolean neq = false;
        for (int j = 0; j < step; j++) {
          neq = (arg1[s1 + j] != arg2[s2 + j]) || neq;
        }
        if (neq) {
          return false;
        }
      }
    }

    return true;
  }

  /**
   * Counts the UTF-8 characters in a whole byte array.
   *
   * @param bytes UTF-8 encoded bytes
   * @return the number of bytes that are not UTF-8 continuation bytes
   */
  public static int characterCount(byte[] bytes) {
    return characterCount(bytes, 0, bytes.length);
  }

  /**
   * Counts the UTF-8 characters in a slice of a byte array.
   *
   * @param bytes array holding the UTF-8 encoded string
   * @param start offset of the string in {@code bytes}
   * @param length length in bytes of the string
   * @return the number of bytes in the slice that are not UTF-8 continuation bytes
   */
  public static int characterCount(byte[] bytes, int start, int length) {
    int end = start + length;

    int charCount = 0;
    for (int j = start; j < end; j++) {
      // UTF-8 continuation bytes have their 2 high bits equal to 10 (0x80 after masking);
      // every other byte starts a new character.
      if ((bytes[j] & 0xc0) != 0x80) {
        ++charCount;
      }
    }
    return charCount;
  }

  /**
   * Copies a slice of a byte array into element i of a vector. This is a setVal
   * with the same signature as rightTrim, leftTrim, truncate, etc. below, which is
   * useful for class generation via templates.
   *
   * @param outV output vector
   * @param i index of the element to set
   * @param bytes array holding the string
   * @param start offset of the string in {@code bytes}
   * @param length length in bytes of the string
   */
  public static void assign(BytesColumnVector outV, int i, byte[] bytes, int start, int length) {
    outV.setVal(i, bytes, start, length);
  }

  /**
   * Right trims a slice of a byte array and returns the new byte length.
   * Only the space byte (0x20) is treated as a blank.
   *
   * @param bytes array holding the string
   * @param start offset of the string in {@code bytes}
   * @param length length in bytes of the string
   * @return the length of the slice once trailing spaces are dropped
   */
  public static int rightTrim(byte[] bytes, int start, int length) {
    // skip trailing blank characters
    int j = start + length - 1;
    while (j >= start && bytes[j] == 0x20) {
      j--;
    }

    return (j - start) + 1;
  }

  /**
   * Right trims a slice of a byte array and places the result into element i of a vector.
   *
   * @param outV output vector
   * @param i index of the element to set
   * @param bytes array holding the string
   * @param start offset of the string in {@code bytes}
   * @param length length in bytes of the string
   */
  public static void rightTrim(BytesColumnVector outV, int i, byte[] bytes, int start, int length) {
    outV.setVal(i, bytes, start, rightTrim(bytes, start, length));
  }

  /**
   * Truncates a slice of a byte array to a maximum number of characters and
   * returns the new byte length.
   *
   * @param bytes array holding the UTF-8 encoded string
   * @param start offset of the string in {@code bytes}
   * @param length length in bytes of the string
   * @param maxLength maximum number of characters to keep
   * @return the byte length of the first {@code maxLength} characters, or
   *         {@code length} if the slice has no more characters than that
   */
  public static int truncate(byte[] bytes, int start, int length, int maxLength) {
    int end = start + length;

    // count characters forward
    int j = start;
    int charCount = 0;
    while (j < end) {
      // UTF-8 continuation bytes have 2 high bits equal to 0x80.
      if ((bytes[j] & 0xc0) != 0x80) {
        // Stop on the lead byte of the first character beyond the limit, so a
        // multi-byte character is never cut in the middle.
        if (charCount == maxLength) {
          break;
        }
        ++charCount;
      }
      j++;
    }
    return (j - start);
  }

  /**
   * Truncates a slice of a byte array to a maximum number of characters and
   * places the result into element i of a vector.
   *
   * @param outV output vector
   * @param i index of the element to set
   * @param bytes array holding the UTF-8 encoded string
   * @param start offset of the string in {@code bytes}
   * @param length length in bytes of the string
   * @param maxLength maximum number of characters to keep
   */
  public static void truncate(BytesColumnVector outV, int i, byte[] bytes, int start, int length, int maxLength) {
    outV.setVal(i, bytes, start, truncate(bytes, start, length, maxLength));
  }

  /**
   * Truncates a byte array to a maximum number of characters and
   * returns a byte array with only the kept bytes.
   *
   * @param bytes UTF-8 encoded bytes
   * @param maxLength maximum number of characters to keep
   * @return {@code bytes} itself when nothing has to be removed, otherwise a
   *         shortened copy
   */
  public static byte[] truncateScalar(byte[] bytes, int maxLength) {
    int newLength = truncate(bytes, 0, bytes.length, maxLength);
    if (newLength == bytes.length) {
      return bytes;
    } else {
      return Arrays.copyOf(bytes, newLength);
    }
  }

  /**
   * Right trims and truncates a slice of a byte array to a maximum number of
   * characters and returns the new byte length. Truncation is applied first; any
   * run of spaces (0x20) that ends the truncated string is then dropped.
   *
   * @param bytes array holding the UTF-8 encoded string
   * @param start offset of the string in {@code bytes}
   * @param length length in bytes of the string
   * @param maxLength maximum number of characters to keep
   * @return the byte length of the trimmed and truncated string
   */
  public static int rightTrimAndTruncate(byte[] bytes, int start, int length, int maxLength) {
    int end = start + length;

    // count characters forward and watch for final run of pads
    int j = start;
    int charCount = 0;
    // Index of the first byte of the current run of spaces, or -1 when the last
    // byte seen was not a space.
    int padRunStart = -1;
    while (j < end) {
      // UTF-8 continuation bytes have 2 high bits equal to 0x80.
      if ((bytes[j] & 0xc0) != 0x80) {
        if (charCount == maxLength) {
          break;
        }
        if (bytes[j] == 0x20) {
          if (padRunStart == -1) {
            padRunStart = j;
          }
        } else {
          padRunStart = -1;
        }
        ++charCount;
      } else {
        padRunStart = -1;
      }
      j++;
    }
    if (padRunStart != -1) {
      return (padRunStart - start);
    } else {
      return (j - start);
    }
  }

  /**
   * Right trims and truncates a slice of a byte array to a maximum number of
   * characters and places the result into element i of a vector.
   *
   * @param outV output vector
   * @param i index of the element to set
   * @param bytes array holding the UTF-8 encoded string
   * @param start offset of the string in {@code bytes}
   * @param length length in bytes of the string
   * @param maxLength maximum number of characters to keep
   */
  public static void rightTrimAndTruncate(BytesColumnVector outV, int i, byte[] bytes, int start, int length, int maxLength) {
    outV.setVal(i, bytes, start, rightTrimAndTruncate(bytes, start, length, maxLength));
  }

  /**
   * Right trims and truncates a byte array to a maximum number of characters and
   * returns a byte array with only the trimmed and truncated bytes.
   *
   * @param bytes UTF-8 encoded bytes
   * @param maxLength maximum number of characters to keep
   * @return {@code bytes} itself when nothing has to be removed, otherwise a
   *         shortened copy
   */
  public static byte[] rightTrimAndTruncateScalar(byte[] bytes, int maxLength) {
    int newLength = rightTrimAndTruncate(bytes, 0, bytes.length, maxLength);
    if (newLength == bytes.length) {
      return bytes;
    } else {
      return Arrays.copyOf(bytes, newLength);
    }
  }

  /**
   * Compiles the given pattern into a finder backed by a suitable search algorithm.
   *
   * @param pattern bytes to search for; the array is retained, not copied
   * @return a finder for {@code pattern}
   */
  public static Finder compile(byte[] pattern) {
    return new BoyerMooreHorspool(pattern);
  }

  /**
   * A finder finds the first index of its pattern in a given byte array.
   * Its thread-safety depends on its implementation.
   */
  public interface Finder {
    /**
     * Searches the slice {@code [start, start + len)} of {@code input}.
     *
     * @param input array to search in
     * @param start offset of the slice to search
     * @param len length in bytes of the slice to search
     * @return a non-negative index if the pattern is found, -1 otherwise
     */
    int find(byte[] input, int start, int len);
  }

  /**
   * Finder based on the Boyer-Moore-Horspool algorithm.
   * All fields are final and nothing is written after construction, so one instance
   * can be shared between threads; the pattern array is not copied, so callers
   * must not modify it afterwards.
   * See https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore%E2%80%93Horspool_algorithm .
   */
  private static class BoyerMooreHorspool implements Finder {
    private static final int MAX_BYTE = 0xff;
    // Bad-character shift table with one slot per unsigned byte value, 0x00 through
    // 0xff inclusive. It must hold MAX_BYTE + 1 entries: sizing it MAX_BYTE made any
    // 0xff byte in the pattern or the input index past the end of the array.
    private final int[] shift = new int[MAX_BYTE + 1];
    private final byte[] pattern;
    private final int plen;

    public BoyerMooreHorspool(byte[] pattern) {
      this.pattern = pattern;
      this.plen = pattern.length;
      // Bytes that do not occur in the pattern allow skipping a whole pattern length.
      Arrays.fill(shift, plen);
      // The last pattern byte is left out on purpose: the shift for a byte is the
      // distance from its rightmost occurrence (excluding the final position) to
      // the end of the pattern.
      for (int i = 0; i < plen - 1; i++) {
        shift[pattern[i] & MAX_BYTE] = plen - i - 1;
      }
    }

    /**
     * Returns the index in {@code input} (not relative to {@code start}) of the
     * first occurrence of the pattern within {@code [start, start + len)}, or -1
     * if there is none. An empty pattern always yields 0.
     */
    @Override
    public int find(byte[] input, int start, int len) {
      final int plen = this.plen;
      if (plen == 0) {
        return 0;
      }

      final byte[] pattern = this.pattern;
      final int end = start + len;
      // 'next' is the input index aligned with the last byte of the pattern.
      int next = start + plen - 1;
      while (next < end) {
        // Compare right to left from the current alignment.
        int s_tmp = next;
        int p_tmp = plen - 1;
        while (input[s_tmp] == pattern[p_tmp]) {
          p_tmp--;
          if (p_tmp < 0) {
            return s_tmp;
          }
          s_tmp--;
        }
        // Mismatch: slide the pattern by the shift of the byte under its last position.
        next += shift[input[next] & MAX_BYTE];
      }
      return -1;
    }
  }
}