RegexSerDe.java example

Explorer
hive-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.serde2;

import java.sql.Date;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.type.HiveChar;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.HiveVarchar;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.AbstractPrimitiveJavaObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

import com.google.common.base.Splitter;
import com.google.common.collect.Lists;

/**
 * RegexSerDe uses regular expression (regex) to deserialize data. It doesn't
 * support data serialization.
 *
 * It can deserialize the data using regex and extracts groups as columns.
 *
 * In deserialization stage, if a row does not match the regex, then all columns
 * in the row will be NULL. If a row matches the regex but has less than
 * expected groups, the missing groups will be NULL. If a row matches the regex
 * but has more than expected groups, the additional groups are just ignored.
 *
 * NOTE: Regex SerDe supports primitive column types such as TINYINT, SMALLINT,
 * INT, BIGINT, FLOAT, DOUBLE, STRING, BOOLEAN and DECIMAL
 *
 *
 * NOTE: This implementation uses javaStringObjectInspector for STRING. A
 * more efficient implementation should use UTF-8 encoded Text and
 * writableStringObjectInspector. We should switch to that when we have a UTF-8
 * based Regex library.
 */
@SerDeSpec(schemaProps = {
    serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES,
    RegexSerDe.INPUT_REGEX, RegexSerDe.INPUT_REGEX_CASE_SENSITIVE })
public class RegexSerDe extends AbstractSerDe {

  public static final Logger LOG = LoggerFactory.getLogger(RegexSerDe.class.getName());

  public static final String INPUT_REGEX = "input.regex";
  public static final String INPUT_REGEX_CASE_SENSITIVE = "input.regex.case.insensitive";

  int numColumns;
  String inputRegex;

  Pattern inputPattern;

  StructObjectInspector rowOI;
  List<Object> row;
  List<TypeInfo> columnTypes;
  Object[] outputFields;
  Text outputRowText;

  boolean alreadyLoggedNoMatch = false;
  boolean alreadyLoggedPartialMatch = false;

  @Override
  public void initialize(Configuration conf, Properties tbl)
      throws SerDeException {

    // We can get the table definition from tbl.

    // Read the configuration parameters
    inputRegex = tbl.getProperty(INPUT_REGEX);
    String columnNameProperty = tbl.getProperty(serdeConstants.LIST_COLUMNS);
    String columnTypeProperty = tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES);
    boolean inputRegexIgnoreCase = "true".equalsIgnoreCase(tbl
        .getProperty(INPUT_REGEX_CASE_SENSITIVE));

    // output format string is not supported anymore, warn user of deprecation
    if (null != tbl.getProperty("output.format.string")) {
      LOG.warn("output.format.string has been deprecated");
    }

    // Parse the configuration parameters
    if (inputRegex != null) {
      inputPattern = Pattern.compile(inputRegex, Pattern.DOTALL
          + (inputRegexIgnoreCase ? Pattern.CASE_INSENSITIVE : 0));
    } else {
      inputPattern = null;
      throw new SerDeException(
          "This table does not have serde property \"input.regex\"!");
    }

    final String columnNameDelimiter = tbl.containsKey(serdeConstants.COLUMN_NAME_DELIMITER) ? tbl
        .getProperty(serdeConstants.COLUMN_NAME_DELIMITER) : String.valueOf(SerDeUtils.COMMA);
    List<String> columnNames = Arrays.asList(columnNameProperty.split(columnNameDelimiter));
    columnTypes = TypeInfoUtils
        .getTypeInfosFromTypeString(columnTypeProperty);
    assert columnNames.size() == columnTypes.size();
    numColumns = columnNames.size();

    /* Constructing the row ObjectInspector:
     * The row consists of some set of primitive columns, each column will
     * be a java object of primitive type.
     */
    List<ObjectInspector> columnOIs = new ArrayList<ObjectInspector>(columnNames.size());
    for (int c = 0; c < numColumns; c++) {
      TypeInfo typeInfo = columnTypes.get(c);
      if (typeInfo instanceof PrimitiveTypeInfo) {
        PrimitiveTypeInfo pti = (PrimitiveTypeInfo) columnTypes.get(c);
        AbstractPrimitiveJavaObjectInspector oi =
            PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(pti);
        columnOIs.add(oi);
      } else {
        throw new SerDeException(getClass().getName()
            + " doesn't allow column [" + c + "] named "
            + columnNames.get(c) + " with type " + columnTypes.get(c));
      }
     }

    // StandardStruct uses ArrayList to store the row.
    rowOI = ObjectInspectorFactory.getStandardStructObjectInspector(
        columnNames,columnOIs,Lists.newArrayList(Splitter.on('\0').split(tbl.getProperty("columns.comments"))));

    row = new ArrayList<Object>(numColumns);
    // Constructing the row object, etc, which will be reused for all rows.
    for (int c = 0; c < numColumns; c++) {
      row.add(null);
    }
    outputFields = new Object[numColumns];
    outputRowText = new Text();
  }

  @Override
  public ObjectInspector getObjectInspector() throws SerDeException {
    return rowOI;
  }

  @Override
  public Class<? extends Writable> getSerializedClass() {
    return Text.class;
  }

  // Number of rows not matching the regex
  long unmatchedRowsCount = 0;
  // Number of rows that match the regex but have missing groups.
  long partialMatchedRowsCount = 0;

  @Override
  public Object deserialize(Writable blob) throws SerDeException {

    Text rowText = (Text) blob;
    Matcher m = inputPattern.matcher(rowText.toString());

    if (m.groupCount() != numColumns) {
      throw new SerDeException("Number of matching groups doesn't match the number of columns");
    }

    // If do not match, ignore the line, return a row with all nulls.
    if (!m.matches()) {
      unmatchedRowsCount++;
        if (!alreadyLoggedNoMatch) {
         // Report the row if its the first time
         LOG.warn("" + unmatchedRowsCount + " unmatched rows are found: " + rowText);
         alreadyLoggedNoMatch = true;
      }
      return null;
    }

    // Otherwise, return the row.
    for (int c = 0; c < numColumns; c++) {
      try {
        String t = m.group(c+1);
        TypeInfo typeInfo = columnTypes.get(c);

        // Convert the column to the correct type when needed and set in row obj
        PrimitiveTypeInfo pti = (PrimitiveTypeInfo) typeInfo;
        switch (pti.getPrimitiveCategory()) {
        case STRING:
          row.set(c, t);
          break;
        case BYTE:
          Byte b;
          b = Byte.valueOf(t);
          row.set(c,b);
          break;
        case SHORT:
          Short s;
          s = Short.valueOf(t);
          row.set(c,s);
          break;
        case INT:
          Integer i;
          i = Integer.valueOf(t);
          row.set(c, i);
          break;
        case LONG:
          Long l;
          l = Long.valueOf(t);
          row.set(c, l);
          break;
        case FLOAT:
          Float f;
          f = Float.valueOf(t);
          row.set(c,f);
          break;
        case DOUBLE:
          Double d;
          d = Double.valueOf(t);
          row.set(c,d);
          break;
        case BOOLEAN:
          Boolean bool;
          bool = Boolean.valueOf(t);
          row.set(c, bool);
          break;
        case TIMESTAMP:
          Timestamp ts;
          ts = Timestamp.valueOf(t);
          row.set(c, ts);
          break;
        case DATE:
          Date date;
          date = Date.valueOf(t);
          row.set(c, date);
          break;
        case DECIMAL:
          HiveDecimal bd = HiveDecimal.create(t);
          row.set(c, bd);
          break;
        case CHAR:
          HiveChar hc = new HiveChar(t, ((CharTypeInfo) typeInfo).getLength());
          row.set(c, hc);
          break;
        case VARCHAR:
          HiveVarchar hv = new HiveVarchar(t, ((VarcharTypeInfo)typeInfo).getLength());
          row.set(c, hv);
          break;
        default:
          throw new SerDeException("Unsupported type " + typeInfo);
        }
      } catch (RuntimeException e) {
         partialMatchedRowsCount++;
         if (!alreadyLoggedPartialMatch) {
         // Report the row if its the first row
         LOG.warn("" + partialMatchedRowsCount
            + " partially unmatched rows are found, " + " cannot find group "
            + c + ": " + rowText);
           alreadyLoggedPartialMatch = true;
        }
        row.set(c, null);
       }
     }
    return row;
  }

  @Override
  public Writable serialize(Object obj, ObjectInspector objInspector)
      throws SerDeException {
        throw new UnsupportedOperationException(
          "Regex SerDe doesn't support the serialize() method");
  }

  @Override
  public SerDeStats getSerDeStats() {
    // no support for statistics
    return null;
  }
}