/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.contrib.serde2;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.contrib.util.typedbytes.Type;
import org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesWritableInput;
import org.apache.hadoop.hive.contrib.util.typedbytes.TypedBytesWritableOutput;
import org.apache.hadoop.hive.ql.io.NonSyncDataInputBuffer;
import org.apache.hadoop.hive.ql.io.NonSyncDataOutputBuffer;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeSpec;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.io.ByteWritable;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.io.ShortWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
/**
 * TypedBytesSerDe uses typed bytes to serialize/deserialize.
 *
 * More info on the typedbytes stuff that Dumbo uses:
 * http://issues.apache.org/jira/browse/HADOOP-1722
 *
 * A fast Python decoder for this format, which is apparently 25% faster than
 * the Python version, is available at
 * http://github.com/klbostee/ctypedbytes/tree/master
 *
 * Only primitive column types are accepted by {@link #initialize}. The row
 * object, the buffers and the returned {@link BytesWritable} are created once
 * and reused for every call, so results of {@link #deserialize} and
 * {@link #serialize} are overwritten by the next call on the same instance.
 */
@SerDeSpec(schemaProps = {serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES})
public class TypedBytesSerDe extends AbstractSerDe {

  public static final Logger LOG = LoggerFactory.getLogger(TypedBytesSerDe.class
      .getName());

  /** Number of table columns, taken from the column name list. */
  int numColumns;
  /** Struct inspector describing the row returned by {@link #deserialize}. */
  StructObjectInspector rowOI;
  /** Row object reused for all rows; slot i holds the writable of column i (or null). */
  ArrayList<Object> row;

  /** Writable returned by {@link #serialize}; reused for every call. */
  BytesWritable serializeBytesWritable;

  /** Output buffer that {@link #tbOut} writes into. */
  NonSyncDataOutputBuffer barrStr;
  TypedBytesWritableOutput tbOut;

  /** Input buffer that {@link #tbIn} reads from; reset for every deserialized record. */
  NonSyncDataInputBuffer inBarrStr;
  TypedBytesWritableInput tbIn;

  List<String> columnNames;
  List<TypeInfo> columnTypes;

  /**
   * Reads the column names and types from the table properties and builds the
   * row ObjectInspector and the reusable row object and buffers.
   *
   * @param conf the configuration (not used by this SerDe)
   * @param tbl table properties; must contain {@code serdeConstants.LIST_COLUMNS}
   *          and {@code serdeConstants.LIST_COLUMN_TYPES}
   * @throws SerDeException if a required property is missing, the number of
   *           column names and column types differ, or a column is not primitive
   */
  @Override
  public void initialize(Configuration conf, Properties tbl)
      throws SerDeException {

    // Buffers and typed-bytes adapters are created once and reused for all rows.
    serializeBytesWritable = new BytesWritable();
    barrStr = new NonSyncDataOutputBuffer();
    tbOut = new TypedBytesWritableOutput(barrStr);
    inBarrStr = new NonSyncDataInputBuffer();
    tbIn = new TypedBytesWritableInput(inBarrStr);

    // Read the configuration parameters
    String columnNameProperty = tbl.getProperty(serdeConstants.LIST_COLUMNS);
    String columnTypeProperty = tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES);
    if (columnNameProperty == null || columnTypeProperty == null) {
      // Fail with a descriptive SerDeException instead of a NullPointerException.
      throw new SerDeException(getClass().getName()
          + " requires the table properties " + serdeConstants.LIST_COLUMNS
          + " and " + serdeConstants.LIST_COLUMN_TYPES);
    }
    final String columnNameDelimiter = tbl.containsKey(serdeConstants.COLUMN_NAME_DELIMITER)
        ? tbl.getProperty(serdeConstants.COLUMN_NAME_DELIMITER)
        : String.valueOf(SerDeUtils.COMMA);

    // "".split(...) yields a single empty string, so an empty name list must be
    // special-cased exactly like the empty type list below.
    if (columnNameProperty.length() == 0) {
      columnNames = new ArrayList<String>();
    } else {
      columnNames = Arrays.asList(columnNameProperty.split(columnNameDelimiter));
    }
    if (columnTypeProperty.length() == 0) {
      columnTypes = new ArrayList<TypeInfo>();
    } else {
      columnTypes = TypeInfoUtils
          .getTypeInfosFromTypeString(columnTypeProperty);
    }

    // Checked explicitly (not with assert) so that a mismatch is reported even
    // when assertions are disabled.
    if (columnNames.size() != columnTypes.size()) {
      throw new SerDeException(getClass().getName() + ": number of column names ("
          + columnNames.size() + ") does not match number of column types ("
          + columnTypes.size() + ")");
    }
    numColumns = columnNames.size();

    // All columns have to be primitive.
    for (int c = 0; c < numColumns; c++) {
      if (columnTypes.get(c).getCategory() != Category.PRIMITIVE) {
        throw new SerDeException(getClass().getName()
            + " only accepts primitive columns, but column[" + c + "] named "
            + columnNames.get(c) + " has category "
            + columnTypes.get(c).getCategory());
      }
    }

    // Constructing the row ObjectInspector:
    // each column gets the standard writable ObjectInspector of its declared type.
    List<ObjectInspector> columnOIs = new ArrayList<ObjectInspector>(
        columnNames.size());
    for (int c = 0; c < numColumns; c++) {
      columnOIs.add(TypeInfoUtils
          .getStandardWritableObjectInspectorFromTypeInfo(columnTypes.get(c)));
    }

    // StandardStruct uses ArrayList to store the row.
    rowOI = ObjectInspectorFactory.getStandardStructObjectInspector(
        columnNames, columnOIs);

    // Constructing the row object, etc, which will be reused for all rows.
    row = new ArrayList<Object>(numColumns);
    for (int c = 0; c < numColumns; c++) {
      row.add(null);
    }
  }

  /**
   * @return the struct ObjectInspector built in {@link #initialize}
   */
  @Override
  public ObjectInspector getObjectInspector() throws SerDeException {
    return rowOI;
  }

  /**
   * @return {@code BytesWritable.class}, the type produced by {@link #serialize}
   */
  @Override
  public Class<? extends Writable> getSerializedClass() {
    return BytesWritable.class;
  }

  /**
   * Decodes one typed-bytes record into the reusable row object.
   *
   * @param blob the record; must be a {@link BytesWritable}
   * @return the shared row list, overwritten by the next call
   * @throws SerDeException if reading from the record fails with an IOException
   */
  @Override
  public Object deserialize(Writable blob) throws SerDeException {

    BytesWritable data = (BytesWritable) blob;
    // getBytes() may be longer than the valid data, hence the explicit length.
    inBarrStr.reset(data.getBytes(), 0, data.getLength());

    try {
      for (int i = 0; i < numColumns; i++) {
        // The previous value of the slot is handed in so its writable can be reused.
        row.set(i, deserializeField(tbIn, columnTypes.get(i), row.get(i)));
      }

      // The next byte should be the marker.
      // This read only happens when assertions are enabled; that is harmless
      // for later records because the input buffer is reset on every call.
      assert tbIn.readTypeCode() == Type.ENDOFRECORD;

    } catch (IOException e) {
      throw new SerDeException(e);
    }

    return row;
  }

  /**
   * Reads one field from the typed-bytes input.
   *
   * The type recorded in the stream is only used to detect a null value; a
   * non-null value is decoded according to the declared column type.
   *
   * @param in the typed-bytes input positioned at the start of a field
   * @param type the declared type of the column
   * @param reuse a writable of the matching type to fill, or null to allocate one
   * @return the decoded writable, or null for a null value or a VOID column
   * @throws IOException if reading from the input fails
   * @throws RuntimeException if the type is not a supported primitive type
   */
  static Object deserializeField(TypedBytesWritableInput in, TypeInfo type,
      Object reuse) throws IOException {

    // read the type
    Class<? extends Writable> writableType = in.readType();
    if (writableType != null
        && writableType.isAssignableFrom(NullWritable.class)) {
      // indicates that the recorded value is null
      return null;
    }

    switch (type.getCategory()) {
    case PRIMITIVE: {
      PrimitiveTypeInfo ptype = (PrimitiveTypeInfo) type;
      switch (ptype.getPrimitiveCategory()) {

      case VOID: {
        return null;
      }

      case BOOLEAN: {
        BooleanWritable r = reuse == null ? new BooleanWritable()
            : (BooleanWritable) reuse;
        return in.readBoolean(r);
      }
      case BYTE: {
        ByteWritable r = reuse == null ? new ByteWritable()
            : (ByteWritable) reuse;
        return in.readByte(r);
      }
      case SHORT: {
        ShortWritable r = reuse == null ? new ShortWritable()
            : (ShortWritable) reuse;
        return in.readShort(r);
      }
      case INT: {
        IntWritable r = reuse == null ? new IntWritable() : (IntWritable) reuse;
        return in.readInt(r);
      }
      case LONG: {
        LongWritable r = reuse == null ? new LongWritable()
            : (LongWritable) reuse;
        return in.readLong(r);
      }
      case FLOAT: {
        FloatWritable r = reuse == null ? new FloatWritable()
            : (FloatWritable) reuse;
        return in.readFloat(r);
      }
      case DOUBLE: {
        DoubleWritable r = reuse == null ? new DoubleWritable()
            : (DoubleWritable) reuse;
        return in.readDouble(r);
      }
      case STRING: {
        Text r = reuse == null ? new Text() : (Text) reuse;
        return in.readText(r);
      }
      default: {
        throw new RuntimeException("Unrecognized type: "
            + ptype.getPrimitiveCategory());
      }
      }
    }
    // Currently, deserialization of complex types is not supported
    case LIST:
    case MAP:
    case STRUCT:
    default: {
      throw new RuntimeException("Unsupported category: " + type.getCategory());
    }
    }
  }

  /**
   * Encodes the first {@code numColumns} fields of a struct as one typed-bytes
   * record, terminated by an end-of-record marker.
   *
   * @param obj the row object
   * @param objInspector a {@link StructObjectInspector} describing {@code obj}
   * @return the shared {@link BytesWritable}, overwritten by the next call
   * @throws SerDeException if the inspector exposes fewer fields than this
   *           SerDe has columns, or if writing fails with an IOException
   */
  @Override
  public Writable serialize(Object obj, ObjectInspector objInspector)
      throws SerDeException {
    try {
      barrStr.reset();
      StructObjectInspector soi = (StructObjectInspector) objInspector;
      List<? extends StructField> fields = soi.getAllStructFieldRefs();
      if (fields.size() < numColumns) {
        // Report the mismatch clearly instead of an IndexOutOfBoundsException.
        throw new SerDeException(getClass().getName() + " expects " + numColumns
            + " fields, but the object inspector only has " + fields.size());
      }

      for (int i = 0; i < numColumns; i++) {
        StructField field = fields.get(i);
        Object o = soi.getStructFieldData(obj, field);
        ObjectInspector oi = field.getFieldObjectInspector();
        // The writable in the row slot is reused as scratch space.
        serializeField(o, oi, row.get(i));
      }

      // End of the record is part of the data
      tbOut.writeEndOfRecord();

      serializeBytesWritable.set(barrStr.getData(), 0, barrStr.getLength());
    } catch (IOException e) {
      // Keep the original message and chain the cause for diagnosis.
      throw new SerDeException(e.getMessage(), e);
    }
    return serializeBytesWritable;
  }

  /**
   * Writes a single field to {@link #tbOut}. Primitive values are written as
   * the matching writable; LIST, MAP and STRUCT values are written as their
   * JSON string.
   *
   * NOTE(review): the field value is passed to the inspector's get(...) without
   * a null check; presumably a null primitive value fails there - confirm
   * whether a null marker should be written instead.
   *
   * @param o the field value
   * @param oi the inspector describing {@code o}
   * @param reuse a writable to use as scratch space, or null to allocate one
   * @throws IOException if writing fails
   * @throws RuntimeException if the category or primitive type is not supported
   */
  private void serializeField(Object o, ObjectInspector oi, Object reuse)
      throws IOException {
    switch (oi.getCategory()) {
    case PRIMITIVE: {
      PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
      switch (poi.getPrimitiveCategory()) {
      case VOID: {
        // NOTE(review): nothing is written here, while deserializeField reads a
        // type code before looking at the column type - confirm this is intended.
        return;
      }
      case BOOLEAN: {
        BooleanObjectInspector boi = (BooleanObjectInspector) poi;
        BooleanWritable r = reuse == null ? new BooleanWritable()
            : (BooleanWritable) reuse;
        r.set(boi.get(o));
        tbOut.write(r);
        return;
      }
      case BYTE: {
        ByteObjectInspector boi = (ByteObjectInspector) poi;
        ByteWritable r = reuse == null ? new ByteWritable()
            : (ByteWritable) reuse;
        r.set(boi.get(o));
        tbOut.write(r);
        return;
      }
      case SHORT: {
        ShortObjectInspector spoi = (ShortObjectInspector) poi;
        ShortWritable r = reuse == null ? new ShortWritable()
            : (ShortWritable) reuse;
        r.set(spoi.get(o));
        tbOut.write(r);
        return;
      }
      case INT: {
        IntObjectInspector ioi = (IntObjectInspector) poi;
        IntWritable r = reuse == null ? new IntWritable() : (IntWritable) reuse;
        r.set(ioi.get(o));
        tbOut.write(r);
        return;
      }
      case LONG: {
        LongObjectInspector loi = (LongObjectInspector) poi;
        LongWritable r = reuse == null ? new LongWritable()
            : (LongWritable) reuse;
        r.set(loi.get(o));
        tbOut.write(r);
        return;
      }
      case FLOAT: {
        FloatObjectInspector foi = (FloatObjectInspector) poi;
        FloatWritable r = reuse == null ? new FloatWritable()
            : (FloatWritable) reuse;
        r.set(foi.get(o));
        tbOut.write(r);
        return;
      }
      case DOUBLE: {
        DoubleObjectInspector doi = (DoubleObjectInspector) poi;
        DoubleWritable r = reuse == null ? new DoubleWritable()
            : (DoubleWritable) reuse;
        r.set(doi.get(o));
        tbOut.write(r);
        return;
      }
      case STRING: {
        StringObjectInspector soi = (StringObjectInspector) poi;
        Text t = soi.getPrimitiveWritableObject(o);
        tbOut.write(t);
        return;
      }
      default: {
        throw new RuntimeException("Unrecognized type: "
            + poi.getPrimitiveCategory());
      }
      }
    }
    case LIST:
    case MAP:
    case STRUCT: {
      // For complex object, serialize to JSON format
      String s = SerDeUtils.getJSONString(o, oi);
      // The reuse slot holds a writable of the declared column type, which need
      // not be Text; only reuse it when it really is one.
      Text t = reuse instanceof Text ? (Text) reuse : new Text();

      // convert to Text and write it
      t.set(s);
      tbOut.write(t);
      // Without this return the case fell through to default and threw after
      // the value had already been written.
      return;
    }
    default: {
      throw new RuntimeException("Unrecognized type: " + oi.getCategory());
    }
    }
  }

  /**
   * @return always null; this SerDe does not collect statistics
   */
  public SerDeStats getSerDeStats() {
    // no support for statistics
    return null;
  }
}