GenericUDFTranslate.java example

Explorer
hive-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.udf.generic;

import java.nio.ByteBuffer;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Text;

/**
 * TRANSLATE(string input, string from, string to) is an equivalent function to translate in
 * PostgreSQL. See the {@code extended} text of the {@code @Description} annotation below for a
 * full explanation of how this UDF works.
 *
 * <p>
 * The mappings derived from the from/to arguments are cached in instance fields and only rebuilt
 * when either argument differs from the previous {@link #evaluate(DeferredObject[])} call. The
 * {@link Text} returned by evaluate() is the same instance on every call.
 */
@UDFType(deterministic = true)
// NOTE(review): the third paragraph of the extended text below says "input string" where the
// from string appears to be meant, and spells "occurence". The text is user-visible output, so it
// is deliberately left byte-for-byte unchanged here.
//@formatter:off
@Description(
  name = "translate",
  value = "_FUNC_(input, from, to) - translates the input string by" +
          " replacing the characters present in the from string with the" +
          " corresponding characters in the to string",
  extended = "_FUNC_(string input, string from, string to) is an" +
             " equivalent function to translate in PostGreSQL. It works" +
             " on a character by character basis on the input string (first" +
             " parameter). A character in the input is checked for" +
             " presence in the from string (second parameter). If a" +
             " match happens, the character from to string (third " +
             "parameter) which appears at the same index as the character" +
             " in from string is obtained. This character is emitted in" +
             " the output string  instead of the original character from" +
             " the input string. If the to string is shorter than the" +
             " from string, there may not be a character present at" +
             " the same index in the to string. In such a case, nothing is" +
             " emitted for the original character and it's deleted from" +
             " the output string." +
             "\n" +
             "For example," +
             "\n" +
             "\n" +
             "_FUNC_('abcdef', 'adc', '19') returns '1b9ef' replacing" +
             " 'a' with '1', 'd' with '9' and removing 'c' from the input" +
             " string" +
             "\n" +
             "\n" +
             "_FUNC_('a b c d', ' ', '') return 'abcd'" +
             " removing all spaces from the input string" +
             "\n" +
             "\n" +
             "If the same character is present multiple times in the" +
             " input string, the first occurence of the character is the" +
             " one that's considered for matching. However, it is not recommended" +
             " to have the same character more than once in the from" +
             " string since it's not required and adds to confusion." +
             "\n" +
             "\n" +
             "For example," +
             "\n" +
             "\n" +
             "_FUNC_('abcdef', 'ada', '192') returns '1bc9ef' replaces" +
             " 'a' with '1' and 'd' with '9' ignoring the second" +
             " occurence of 'a' in the from string mapping it to '2'"
)
//@formatter:on
public class GenericUDFTranslate extends GenericUDF {

  // For all practical purposes a code point is a fancy name for character. A java char data type
  // can store characters that require 16 bits or less. However, the unicode specification has
  // changed to allow for characters whose representation requires more than 16 bits. Therefore we
  // need to represent each character (called a code point from hereon) as int. More details at
  // http://docs.oracle.com/javase/7/docs/api/java/lang/Character.html

  /**
   * If a code point needs to be replaced with another code point, this map stores the mapping.
   */
  private final Map<Integer, Integer> replacementMap = new HashMap<Integer, Integer>();

  /**
   * This set stores all the code points which need to be deleted from the input string. The
   * objects in deletionSet and keys in replacementMap are mutually exclusive.
   */
  private final Set<Integer> deletionSet = new HashSet<Integer>();

  /**
   * A placeholder for the result; reused across evaluate() calls.
   */
  private final Text result = new Text();

  /**
   * The value of the from parameter that the current mappings were built from, or null when no
   * valid mappings exist.
   */
  private Text lastFrom = null;

  /**
   * The value of the to parameter that the current mappings were built from, or null when no
   * valid mappings exist.
   */
  private Text lastTo = null;

  /**
   * Converters for retrieving the arguments to the UDF.
   */
  private transient ObjectInspectorConverters.Converter[] converters;

  /**
   * Validates the argument list and prepares one string converter per argument.
   *
   * @param arguments
   *          object inspectors of the call-site arguments
   * @return the writable string object inspector, since evaluate() returns a {@link Text}
   * @throws UDFArgumentLengthException
   *           if the number of arguments is not exactly 3
   * @throws UDFArgumentTypeException
   *           if an argument is not primitive, or is a primitive other than string, char, varchar
   *           or void
   */
  @Override
  public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
    if (arguments.length != 3) {
      throw new UDFArgumentLengthException("_FUNC_ expects exactly 3 arguments");
    }

    for (int i = 0; i < arguments.length; i++) {
      if (arguments[i].getCategory() != Category.PRIMITIVE) {
        throw new UDFArgumentTypeException(i,
            "A string argument was expected but an argument of type " + arguments[i].getTypeName()
                + " was given.");
      }

      // Now that we have made sure that the argument is of primitive type, we can get the primitive
      // category
      PrimitiveCategory primitiveCategory = ((PrimitiveObjectInspector) arguments[i])
          .getPrimitiveCategory();

      if (primitiveCategory != PrimitiveCategory.STRING
          && primitiveCategory != PrimitiveCategory.CHAR
          && primitiveCategory != PrimitiveCategory.VARCHAR
          && primitiveCategory != PrimitiveCategory.VOID) {
        throw new UDFArgumentTypeException(i,
            "A string, char, or varchar argument was expected but an argument of type "
                + arguments[i].getTypeName() + " was given.");
      }
    }

    // Every accepted argument type is converted to a writable string (Text) before use
    converters = new ObjectInspectorConverters.Converter[arguments.length];
    for (int i = 0; i < arguments.length; i++) {
      converters[i] = ObjectInspectorConverters.getConverter(arguments[i],
          PrimitiveObjectInspectorFactory.writableStringObjectInspector);
    }

    // We will be returning a Text object
    return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
  }

  /**
   * Translates the first argument using the from/to strings given as second and third argument.
   *
   * @param arguments
   *          the deferred input, from and to values, in that order
   * @return null if any of the three values is null, otherwise the shared result {@link Text}
   *         holding the translated string
   * @throws HiveException
   *           if an argument cannot be obtained, or if the from string cannot be decoded
   */
  @Override
  public Object evaluate(DeferredObject[] arguments) throws HiveException {
    assert (arguments.length == 3);

    // Fetch each deferred value exactly once, left to right, stopping at the first null
    Object inputValue = arguments[0].get();
    if (inputValue == null) {
      return null;
    }
    Object fromValue = arguments[1].get();
    if (fromValue == null) {
      return null;
    }
    Object toValue = arguments[2].get();
    if (toValue == null) {
      return null;
    }

    Text input = (Text) converters[0].convert(inputValue);
    Text from = (Text) converters[1].convert(fromValue);
    Text to = (Text) converters[2].convert(toValue);

    populateMappingsIfNecessary(from, to);
    result.set(processInput(input));
    return result;
  }

  /**
   * Pre-processes the from and to strings by calling {@link #populateMappings(Text, Text)} if
   * necessary.
   *
   * @param from
   *          from string to be used for translation
   * @param to
   *          to string to be used for translation
   * @throws HiveException
   *           if the from string cannot be decoded
   */
  private void populateMappingsIfNecessary(Text from, Text to) throws HiveException {
    // If the from and to strings haven't changed, we don't need to preprocess again to regenerate
    // the mappings of code points that need to be replaced or deleted
    if (lastFrom != null && lastTo != null && from.equals(lastFrom) && to.equals(lastTo)) {
      return;
    }

    // Invalidate the cache key before touching the maps. If populateMappings() fails half way,
    // a later call carrying the previous from/to pair must rebuild rather than reuse the
    // partially populated maps.
    Text fromCopy = lastFrom;
    Text toCopy = lastTo;
    lastFrom = null;
    lastTo = null;

    populateMappings(from, to);

    // These are null when evaluate() is called for the first time
    if (fromCopy == null) {
      fromCopy = new Text();
    }
    if (toCopy == null) {
      toCopy = new Text();
    }
    // Need to deep copy here since keeping a reference to from/to instead would make
    // from.equals(lastFrom) trivially true whenever the caller reuses its Text objects
    fromCopy.set(from);
    toCopy.set(to);
    lastFrom = fromCopy;
    lastTo = toCopy;
  }

  /**
   * Pre-process the from and to strings to populate {@link #replacementMap} and
   * {@link #deletionSet}. The from and to strings are walked in lock step, one code point at a
   * time; only the first occurrence of a code point in the from string is recorded.
   *
   * @param from
   *          from string to be used for translation
   * @param to
   *          to string to be used for translation
   * @throws HiveException
   *           if decoding the from string makes no progress, which would otherwise loop forever
   */
  private void populateMappings(Text from, Text to) throws HiveException {
    replacementMap.clear();
    deletionSet.clear();

    ByteBuffer fromBytes = ByteBuffer.wrap(from.getBytes(), 0, from.getLength());
    ByteBuffer toBytes = ByteBuffer.wrap(to.getBytes(), 0, to.getLength());

    // Traverse through the from string, one code point at a time
    while (fromBytes.hasRemaining()) {
      int positionBefore = fromBytes.position();
      // This normally moves the buffer ahead by one code point
      int fromCodePoint = Text.bytesToCodePoint(fromBytes);
      if (fromBytes.position() == positionBefore) {
        // The decoder consumed nothing (e.g. a byte that cannot start a UTF-8 sequence), so
        // hasRemaining() would stay true forever. Fail instead of hanging.
        throw new HiveException("The from argument of translate contains a byte sequence that"
            + " cannot be decoded as UTF-8 at byte offset " + positionBefore);
      }

      // If the to string has more code points, traverse it too. This must happen even when the
      // from code point turns out to be a duplicate, so that both strings stay index-aligned.
      Integer toCodePoint = null;
      if (toBytes.hasRemaining()) {
        toCodePoint = Text.bytesToCodePoint(toBytes);
      }

      // If the code point from the from string already has a replacement or is to be deleted, we
      // don't need to do anything, just move on to the next code point
      if (replacementMap.containsKey(fromCodePoint) || deletionSet.contains(fromCodePoint)) {
        continue;
      }

      if (toCodePoint != null) {
        replacementMap.put(fromCodePoint, toCodePoint);
      } else {
        // The to string is exhausted: nothing to replace with, so the code point gets deleted
        deletionSet.add(fromCodePoint);
      }
    }
  }

  /**
   * Translates the input string based on {@link #replacementMap} and {@link #deletionSet} and
   * returns the translated string.
   *
   * @param input
   *          input string to perform the translation on
   * @return translated string
   * @throws IllegalArgumentException
   *           if a value to be emitted is not a valid Unicode code point
   */
  private String processInput(Text input) {
    // The UTF-8 byte length is only a capacity hint; the builder still grows when needed
    StringBuilder resultBuilder = new StringBuilder(input.getLength());
    // Obtain the byte buffer from the input string so we can traverse it code point by code point
    ByteBuffer inputBytes = ByteBuffer.wrap(input.getBytes(), 0, input.getLength());
    // Traverse the byte buffer containing the input string one code point at a time
    while (inputBytes.hasRemaining()) {
      int inputCodePoint = Text.bytesToCodePoint(inputBytes);
      // If the code point exists in deletion set, no need to emit out anything for this code point.
      // Continue on to the next code point
      if (deletionSet.contains(inputCodePoint)) {
        continue;
      }

      // If a replacement exists for this code point, emit the replacement. If no such replacement
      // exists, emit the original input code point
      Integer replacementCodePoint = replacementMap.get(inputCodePoint);
      resultBuilder.appendCodePoint((replacementCodePoint != null) ? replacementCodePoint
          : inputCodePoint);
    }
    return resultBuilder.toString();
  }

  /**
   * Returns the display form of a translate call over the given child expressions.
   *
   * @param children
   *          display strings of the three arguments
   * @return the standard display string for "translate"
   */
  @Override
  public String getDisplayString(String[] children) {
    assert (children.length == 3);
    return getStandardDisplayString("translate", children);
  }

}