/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.serde2;
import java.sql.Date;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.type.HiveChar;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.HiveVarchar;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.AbstractPrimitiveJavaObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
/**
 * RegexSerDe uses a regular expression (regex) to deserialize data. It does not
 * support data serialization.
 *
 * <p>Each input row is matched as a whole against the regex configured through
 * the {@code input.regex} table property, and every capturing group is
 * extracted as one column. Matching is case-insensitive when the
 * {@code input.regex.case.insensitive} table property is {@code true}.
 *
 * <p>In the deserialization stage:
 * <ul>
 * <li>if the number of capturing groups in the regex differs from the number of
 * table columns, a {@link SerDeException} is thrown;</li>
 * <li>if a row does not match the regex, {@code null} is returned for that
 * row;</li>
 * <li>if a row matches the regex but a group did not take part in the match, or
 * its text cannot be converted to the column type, that column is NULL.</li>
 * </ul>
 *
 * <p>NOTE: Only primitive column types are accepted. The conversions implemented
 * here cover TINYINT, SMALLINT, INT, BIGINT, FLOAT, DOUBLE, BOOLEAN, STRING,
 * CHAR, VARCHAR, DECIMAL, DATE and TIMESTAMP.
 *
 * <p>NOTE: This implementation uses javaStringObjectInspector for STRING. A
 * more efficient implementation should use UTF-8 encoded Text and
 * writableStringObjectInspector. We should switch to that when we have a UTF-8
 * based Regex library.
 */
@SerDeSpec(schemaProps = {
    serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES,
    RegexSerDe.INPUT_REGEX, RegexSerDe.INPUT_REGEX_CASE_SENSITIVE })
public class RegexSerDe extends AbstractSerDe {

  public static final Logger LOG = LoggerFactory.getLogger(RegexSerDe.class.getName());

  /** Table property holding the regular expression applied to every row. */
  public static final String INPUT_REGEX = "input.regex";

  /**
   * Table property that, when set to "true", makes the regex match
   * case-insensitively. Note that the property value is spelled
   * "case.insensitive" even though the constant is named CASE_SENSITIVE.
   */
  public static final String INPUT_REGEX_CASE_SENSITIVE = "input.regex.case.insensitive";

  /** Table property holding the '\0'-separated column comments. */
  private static final String COLUMN_COMMENTS = "columns.comments";

  int numColumns;
  String inputRegex;
  Pattern inputPattern;
  StructObjectInspector rowOI;
  // Row object reused for every deserialized row.
  List<Object> row;
  List<TypeInfo> columnTypes;
  Object[] outputFields;
  Text outputRowText;

  // Each kind of problem is logged only for the first offending row.
  boolean alreadyLoggedNoMatch = false;
  boolean alreadyLoggedPartialMatch = false;

  // Number of rows not matching the regex
  long unmatchedRowsCount = 0;
  // Number of rows that match the regex but have at least one column that had
  // to be set to NULL (missing group or unconvertible text).
  long partialMatchedRowsCount = 0;

  /**
   * Reads the regex and the column names/types from the table properties and
   * builds the row ObjectInspector plus the reusable row object.
   *
   * @param conf not used by this method
   * @param tbl table properties; must contain {@code input.regex}, the column
   *        names and the column types
   * @throws SerDeException if a required property is missing, if the number of
   *         column names and column types differ, or if a column has a
   *         non-primitive type
   */
  @Override
  public void initialize(Configuration conf, Properties tbl)
      throws SerDeException {
    // We can get the table definition from tbl.
    inputRegex = tbl.getProperty(INPUT_REGEX);
    String columnNameProperty = tbl.getProperty(serdeConstants.LIST_COLUMNS);
    String columnTypeProperty = tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES);
    boolean inputRegexIgnoreCase = "true".equalsIgnoreCase(
        tbl.getProperty(INPUT_REGEX_CASE_SENSITIVE));

    // output format string is not supported anymore, warn user of deprecation
    if (null != tbl.getProperty("output.format.string")) {
      LOG.warn("output.format.string has been deprecated");
    }

    if (inputRegex == null) {
      inputPattern = null;
      throw new SerDeException(
          "This table does not have serde property \"input.regex\"!");
    }
    // DOTALL lets '.' match line terminators inside a row.
    inputPattern = Pattern.compile(inputRegex,
        Pattern.DOTALL | (inputRegexIgnoreCase ? Pattern.CASE_INSENSITIVE : 0));

    // Fail with a descriptive error instead of a NullPointerException.
    if (columnNameProperty == null || columnTypeProperty == null) {
      throw new SerDeException("This table does not have serde property \""
          + serdeConstants.LIST_COLUMNS + "\" or \""
          + serdeConstants.LIST_COLUMN_TYPES + "\"!");
    }

    final String columnNameDelimiter = tbl.containsKey(serdeConstants.COLUMN_NAME_DELIMITER)
        ? tbl.getProperty(serdeConstants.COLUMN_NAME_DELIMITER)
        : String.valueOf(SerDeUtils.COMMA);
    List<String> columnNames = Arrays.asList(columnNameProperty.split(columnNameDelimiter));
    columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);

    // A plain assert is disabled in production, so check explicitly.
    if (columnNames.size() != columnTypes.size()) {
      throw new SerDeException(getClass().getName() + ": number of column names ("
          + columnNames.size() + ") does not match number of column types ("
          + columnTypes.size() + ")");
    }
    numColumns = columnNames.size();

    /* Constructing the row ObjectInspector:
     * The row consists of some set of primitive columns, each column will
     * be a java object of primitive type.
     */
    List<ObjectInspector> columnOIs = new ArrayList<ObjectInspector>(numColumns);
    for (int c = 0; c < numColumns; c++) {
      TypeInfo typeInfo = columnTypes.get(c);
      if (!(typeInfo instanceof PrimitiveTypeInfo)) {
        throw new SerDeException(getClass().getName()
            + " doesn't allow column [" + c + "] named "
            + columnNames.get(c) + " with type " + typeInfo);
      }
      AbstractPrimitiveJavaObjectInspector oi = PrimitiveObjectInspectorFactory
          .getPrimitiveJavaObjectInspector((PrimitiveTypeInfo) typeInfo);
      columnOIs.add(oi);
    }

    // StandardStruct uses ArrayList to store the row.
    // The comments property may be absent; in that case build the inspector
    // without comments rather than failing on a null string.
    String columnComments = tbl.getProperty(COLUMN_COMMENTS);
    if (columnComments == null) {
      rowOI = ObjectInspectorFactory.getStandardStructObjectInspector(
          columnNames, columnOIs);
    } else {
      rowOI = ObjectInspectorFactory.getStandardStructObjectInspector(
          columnNames, columnOIs,
          Lists.newArrayList(Splitter.on('\0').split(columnComments)));
    }

    // Constructing the row object, etc, which will be reused for all rows.
    row = new ArrayList<Object>(numColumns);
    for (int c = 0; c < numColumns; c++) {
      row.add(null);
    }
    outputFields = new Object[numColumns];
    outputRowText = new Text();
  }

  @Override
  public ObjectInspector getObjectInspector() throws SerDeException {
    return rowOI;
  }

  @Override
  public Class<? extends Writable> getSerializedClass() {
    return Text.class;
  }

  /**
   * Matches one row against the regex and converts each capturing group to the
   * type of the corresponding column.
   *
   * @param blob the row; it is cast to {@link Text}
   * @return the shared row list (overwritten by the next call), or
   *         {@code null} if the row does not match the regex
   * @throws SerDeException if the regex does not have exactly one capturing
   *         group per column, or a column has a type this class cannot convert
   */
  @Override
  public Object deserialize(Writable blob) throws SerDeException {
    Text rowText = (Text) blob;
    Matcher m = inputPattern.matcher(rowText.toString());

    if (m.groupCount() != numColumns) {
      throw new SerDeException("Number of matching groups doesn't match the number of columns");
    }

    // If the line does not match, ignore it and return null.
    if (!m.matches()) {
      unmatchedRowsCount++;
      if (!alreadyLoggedNoMatch) {
        // Report the row if it is the first time
        LOG.warn("{} unmatched rows are found: {}", unmatchedRowsCount, rowText);
        alreadyLoggedNoMatch = true;
      }
      return null;
    }

    // Otherwise, fill in and return the row.
    boolean partialRow = false;
    for (int c = 0; c < numColumns; c++) {
      String groupText = m.group(c + 1);
      Object value = null;
      RuntimeException failure = null;
      boolean columnFailed = false;

      if (groupText == null) {
        // The group did not take part in the match: the column is NULL for
        // every type (Boolean.valueOf(null) would otherwise yield FALSE).
        columnFailed = true;
      } else {
        try {
          value = convert(groupText, (PrimitiveTypeInfo) columnTypes.get(c));
        } catch (RuntimeException e) {
          // e.g. NumberFormatException / IllegalArgumentException from valueOf
          columnFailed = true;
          failure = e;
        }
      }

      if (columnFailed) {
        if (!partialRow) {
          // Count the row once, no matter how many of its columns failed.
          partialRow = true;
          partialMatchedRowsCount++;
        }
        logPartialMatchOnce(c, rowText, failure);
      }
      row.set(c, value);
    }
    return row;
  }

  /**
   * Converts the text of one capturing group to the Java object expected for
   * the given primitive type.
   *
   * @param text non-null text captured by the group
   * @param pti type of the target column
   * @return the converted value
   * @throws SerDeException if the primitive category is not handled here
   */
  private Object convert(String text, PrimitiveTypeInfo pti) throws SerDeException {
    switch (pti.getPrimitiveCategory()) {
    case STRING:
      return text;
    case BYTE:
      return Byte.valueOf(text);
    case SHORT:
      return Short.valueOf(text);
    case INT:
      return Integer.valueOf(text);
    case LONG:
      return Long.valueOf(text);
    case FLOAT:
      return Float.valueOf(text);
    case DOUBLE:
      return Double.valueOf(text);
    case BOOLEAN:
      return Boolean.valueOf(text);
    case TIMESTAMP:
      return Timestamp.valueOf(text);
    case DATE:
      return Date.valueOf(text);
    case DECIMAL:
      return HiveDecimal.create(text);
    case CHAR:
      return new HiveChar(text, ((CharTypeInfo) pti).getLength());
    case VARCHAR:
      return new HiveVarchar(text, ((VarcharTypeInfo) pti).getLength());
    default:
      throw new SerDeException("Unsupported type " + pti);
    }
  }

  /**
   * Logs the first partially matched row; later calls are no-ops.
   *
   * @param column zero-based index of the column that was set to NULL
   * @param rowText the offending row
   * @param cause the conversion failure, or {@code null} if the group was
   *        missing from the match
   */
  private void logPartialMatchOnce(int column, Text rowText, RuntimeException cause) {
    if (alreadyLoggedPartialMatch) {
      return;
    }
    alreadyLoggedPartialMatch = true;
    if (cause == null) {
      LOG.warn("{} partially unmatched rows are found, cannot find group {}: {}",
          partialMatchedRowsCount, column, rowText);
    } else {
      // The trailing Throwable is logged as the exception, not as a parameter.
      LOG.warn("{} partially unmatched rows are found, cannot convert group {} to {}: {}",
          partialMatchedRowsCount, column, columnTypes.get(column), rowText, cause);
    }
  }

  /**
   * Serialization is not supported by this SerDe.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public Writable serialize(Object obj, ObjectInspector objInspector)
      throws SerDeException {
    throw new UnsupportedOperationException(
        "Regex SerDe doesn't support the serialize() method");
  }

  @Override
  public SerDeStats getSerDeStats() {
    // no support for statistics
    return null;
  }
}