/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.serde2.lazy;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.classification.InterfaceAudience.Public;
import org.apache.hadoop.hive.common.classification.InterfaceStability.Stable;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractEncodingAwareSerDe;
import org.apache.hadoop.hive.serde2.ByteStream;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeSpec;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyObjectInspectorParametersImpl;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.BinaryComparable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
/**
 * LazySimpleSerDe can be used to read the same data format as
 * MetadataTypedColumnsetSerDe and TCTLSeparatedProtocol.
 *
 * However, LazySimpleSerDe creates Objects in a lazy way, to provide better
 * performance.
 *
 * LazySimpleSerDe also outputs typed columns instead of treating all columns
 * as Strings, as MetadataTypedColumnsetSerDe does.
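 *
 * <p>A minimal usage sketch (the column names and types below are
 * hypothetical; real values come from the table definition):
 *
 * <pre>{@code
 * Properties tbl = new Properties();
 * tbl.setProperty(serdeConstants.LIST_COLUMNS, "id,name");
 * tbl.setProperty(serdeConstants.LIST_COLUMN_TYPES, "int,string");
 * LazySimpleSerDe serDe = new LazySimpleSerDe();
 * serDe.initialize(new Configuration(), tbl);
 * // "\001" (Ctrl-A) is the default field delimiter
 * Object row = serDe.deserialize(new Text("1\001alice"));
 * }</pre>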
*/
@Public
@Stable
@SerDeSpec(schemaProps = {
serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES,
serdeConstants.FIELD_DELIM, serdeConstants.COLLECTION_DELIM, serdeConstants.MAPKEY_DELIM,
serdeConstants.SERIALIZATION_FORMAT, serdeConstants.SERIALIZATION_NULL_FORMAT,
serdeConstants.SERIALIZATION_ESCAPE_CRLF,
serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST,
serdeConstants.ESCAPE_CHAR,
serdeConstants.SERIALIZATION_ENCODING,
LazySerDeParameters.SERIALIZATION_EXTEND_NESTING_LEVELS,
LazySerDeParameters.SERIALIZATION_EXTEND_ADDITIONAL_NESTING_LEVELS
})
public class LazySimpleSerDe extends AbstractEncodingAwareSerDe {
private LazySerDeParameters serdeParams = null;
private ObjectInspector cachedObjectInspector;
private long serializedSize;
private SerDeStats stats;
private boolean lastOperationSerialize;
private boolean lastOperationDeserialize;
@Override
public String toString() {
return getClass().toString()
+ "["
+ Arrays.asList(serdeParams.getSeparators())
+ ":"
+ ((StructTypeInfo) serdeParams.getRowTypeInfo()).getAllStructFieldNames()
+ ":"
+ ((StructTypeInfo) serdeParams.getRowTypeInfo())
.getAllStructFieldTypeInfos() + "]";
}
public LazySimpleSerDe() throws SerDeException {
}
/**
   * Initialize the SerDe given the parameters.
   * <ul>
   * <li>serialization.format: separator char or byte code (only supports byte
   * values up to 127)</li>
   * <li>columns: ","-separated column names</li>
   * <li>columns.types: ",", ":", or ";"-separated column types</li>
   * </ul>
*
* @see org.apache.hadoop.hive.serde2.AbstractSerDe#initialize(Configuration, Properties)
*/
@Override
public void initialize(Configuration job, Properties tbl)
throws SerDeException {
super.initialize(job, tbl);
serdeParams = new LazySerDeParameters(job, tbl, getClass().getName());
// Create the ObjectInspectors for the fields
cachedObjectInspector = LazyFactory.createLazyStructInspector(serdeParams
.getColumnNames(), serdeParams.getColumnTypes(),
new LazyObjectInspectorParametersImpl(serdeParams));
cachedLazyStruct = (LazyStruct) LazyFactory
.createLazyObject(cachedObjectInspector);
serializedSize = 0;
stats = new SerDeStats();
lastOperationSerialize = false;
lastOperationDeserialize = false;
}
// The object for storing row data
LazyStruct cachedLazyStruct;
// The wrapper for byte array
ByteArrayRef byteArrayRef;
/**
* Deserialize a row from the Writable to a LazyObject.
*
* @param field
* the Writable that contains the data
* @return The deserialized row Object.
* @see org.apache.hadoop.hive.serde2.AbstractSerDe#deserialize(Writable)
*/
@Override
public Object doDeserialize(Writable field) throws SerDeException {
if (byteArrayRef == null) {
byteArrayRef = new ByteArrayRef();
}
BinaryComparable b = (BinaryComparable) field;
byteArrayRef.setData(b.getBytes());
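    // init only records the byte range of the row; individual fields are
    // parsed lazily, on first access through the ObjectInspector.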
cachedLazyStruct.init(byteArrayRef, 0, b.getLength());
lastOperationSerialize = false;
lastOperationDeserialize = true;
return cachedLazyStruct;
}
/**
* Returns the ObjectInspector for the row.
*/
@Override
public ObjectInspector getObjectInspector() throws SerDeException {
return cachedObjectInspector;
}
/**
* Returns the Writable Class after serialization.
*
* @see org.apache.hadoop.hive.serde2.AbstractSerDe#getSerializedClass()
*/
@Override
public Class<? extends Writable> getSerializedClass() {
return Text.class;
}
Text serializeCache = new Text();
ByteStream.Output serializeStream = new ByteStream.Output();
/**
* Serialize a row of data.
*
* @param obj
* The row object
* @param objInspector
* The ObjectInspector for the row object
* @return The serialized Writable object
* @throws SerDeException
* @see org.apache.hadoop.hive.serde2.AbstractSerDe#serialize(Object, ObjectInspector)
*/
@Override
public Writable doSerialize(Object obj, ObjectInspector objInspector)
throws SerDeException {
if (objInspector.getCategory() != Category.STRUCT) {
throw new SerDeException(getClass().toString()
+ " can only serialize struct types, but we got: "
+ objInspector.getTypeName());
}
// Prepare the field ObjectInspectors
StructObjectInspector soi = (StructObjectInspector) objInspector;
List<? extends StructField> fields = soi.getAllStructFieldRefs();
List<Object> list = soi.getStructFieldsDataAsList(obj);
    List<? extends StructField> declaredFields =
        (serdeParams.getRowTypeInfo() != null
            && ((StructTypeInfo) serdeParams.getRowTypeInfo())
                .getAllStructFieldNames().size() > 0)
        ? ((StructObjectInspector) getObjectInspector()).getAllStructFieldRefs()
        : null;
serializeStream.reset();
serializedSize = 0;
// Serialize each field
for (int i = 0; i < fields.size(); i++) {
// Append the separator if needed.
if (i > 0) {
serializeStream.write(serdeParams.getSeparators()[0]);
}
// Get the field objectInspector and the field object.
ObjectInspector foi = fields.get(i).getFieldObjectInspector();
Object f = (list == null ? null : list.get(i));
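      // Fail fast if the row has more fields than the declared schema.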
if (declaredFields != null && i >= declaredFields.size()) {
throw new SerDeException("Error: expecting " + declaredFields.size()
+ " but asking for field " + i + "\n" + "data=" + obj + "\n"
+ "tableType=" + serdeParams.getRowTypeInfo().toString() + "\n"
+ "dataType="
+ TypeInfoUtils.getTypeInfoFromObjectInspector(objInspector));
}
serializeField(serializeStream, f, foi, serdeParams);
}
// TODO: The copy of data is unnecessary, but there is no work-around
// since we cannot directly set the private byte[] field inside Text.
serializeCache
.set(serializeStream.getData(), 0, serializeStream.getLength());
serializedSize = serializeStream.getLength();
lastOperationSerialize = true;
lastOperationDeserialize = false;
return serializeCache;
}
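  /**
   * Serialize a single top-level field into the output stream. Nested
   * collections inside the field start at separator level 1.
   */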
protected void serializeField(ByteStream.Output out, Object obj, ObjectInspector objInspector,
LazySerDeParameters serdeParams) throws SerDeException {
try {
serialize(out, obj, objInspector, serdeParams.getSeparators(), 1, serdeParams.getNullSequence(),
serdeParams.isEscaped(), serdeParams.getEscapeChar(), serdeParams.getNeedsEscape());
} catch (IOException e) {
throw new SerDeException(e);
}
}
/**
   * Serialize the given object into the output byte stream, recursing into
   * nested types.
*
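   * <p>For illustration, assuming the default delimiters (where \001-\003 are
   * the level-1 to level-3 separators), a {@code map<string,int>} field
   * {"a"=1, "b"=2} serialized at level 1 is written as
   * {@code a\0031\002b\0032}: entries are joined by the level-2 separator,
   * and each key and value by the level-3 separator. A minimal sketch of a
   * direct call:
   *
   * <pre>{@code
   * ByteStream.Output out = new ByteStream.Output();
   * LazySimpleSerDe.serialize(out, "hello",
   *     PrimitiveObjectInspectorFactory.javaStringObjectInspector,
   *     new byte[] {1, 2, 3}, 1, new Text("\\N"),
   *     false, (byte) '\\', new boolean[256]);
   * // out now holds the UTF-8 bytes of "hello"
   * }</pre>
   *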
   * @param out
   *          The ByteStream.Output to store the serialized data.
* @param obj
* The object for the current field.
* @param objInspector
* The ObjectInspector for the current Object.
* @param separators
* The separators array.
* @param level
* The current level of separator.
* @param nullSequence
* The byte sequence representing the NULL value.
* @param escaped
* Whether we need to escape the data when writing out
* @param escapeChar
* Which char to use as the escape char, e.g. '\\'
   * @param needsEscape
   *          Array of size 256: needsEscape[b] is true if the byte value b
   *          must be escaped in the output.
* @throws IOException
* @throws SerDeException
*/
public static void serialize(ByteStream.Output out, Object obj,
ObjectInspector objInspector, byte[] separators, int level,
Text nullSequence, boolean escaped, byte escapeChar, boolean[] needsEscape)
throws IOException, SerDeException {
if (obj == null) {
out.write(nullSequence.getBytes(), 0, nullSequence.getLength());
return;
}
char separator;
List<?> list;
switch (objInspector.getCategory()) {
case PRIMITIVE:
LazyUtils.writePrimitiveUTF8(out, obj,
(PrimitiveObjectInspector) objInspector, escaped, escapeChar,
needsEscape);
return;
case LIST:
separator = (char) LazyUtils.getSeparator(separators, level);
ListObjectInspector loi = (ListObjectInspector) objInspector;
list = loi.getList(obj);
ObjectInspector eoi = loi.getListElementObjectInspector();
if (list == null) {
out.write(nullSequence.getBytes(), 0, nullSequence.getLength());
} else {
for (int i = 0; i < list.size(); i++) {
if (i > 0) {
out.write(separator);
}
serialize(out, list.get(i), eoi, separators, level + 1, nullSequence,
escaped, escapeChar, needsEscape);
}
}
return;
case MAP:
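      // Map entries are joined by the separator at this level; each key and
      // value are joined by the separator one level deeper.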
separator = (char) LazyUtils.getSeparator(separators, level);
char keyValueSeparator =
(char) LazyUtils.getSeparator(separators, level + 1);
MapObjectInspector moi = (MapObjectInspector) objInspector;
ObjectInspector koi = moi.getMapKeyObjectInspector();
ObjectInspector voi = moi.getMapValueObjectInspector();
Map<?, ?> map = moi.getMap(obj);
if (map == null) {
out.write(nullSequence.getBytes(), 0, nullSequence.getLength());
} else {
boolean first = true;
for (Map.Entry<?, ?> entry : map.entrySet()) {
if (first) {
first = false;
} else {
out.write(separator);
}
serialize(out, entry.getKey(), koi, separators, level + 2,
nullSequence, escaped, escapeChar, needsEscape);
out.write(keyValueSeparator);
serialize(out, entry.getValue(), voi, separators, level + 2,
nullSequence, escaped, escapeChar, needsEscape);
}
}
return;
case STRUCT:
separator = (char) LazyUtils.getSeparator(separators, level);
StructObjectInspector soi = (StructObjectInspector) objInspector;
List<? extends StructField> fields = soi.getAllStructFieldRefs();
list = soi.getStructFieldsDataAsList(obj);
if (list == null) {
out.write(nullSequence.getBytes(), 0, nullSequence.getLength());
} else {
for (int i = 0; i < list.size(); i++) {
if (i > 0) {
out.write(separator);
}
serialize(out, list.get(i), fields.get(i).getFieldObjectInspector(),
separators, level + 1, nullSequence, escaped, escapeChar,
needsEscape);
}
}
return;
case UNION:
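      // A union is written as its numeric tag, the separator at this level,
      // and then the currently selected field.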
separator = (char) LazyUtils.getSeparator(separators, level);
UnionObjectInspector uoi = (UnionObjectInspector) objInspector;
List<? extends ObjectInspector> ois = uoi.getObjectInspectors();
if (ois == null) {
out.write(nullSequence.getBytes(), 0, nullSequence.getLength());
} else {
        LazyUtils.writePrimitiveUTF8(out, Byte.valueOf(uoi.getTag(obj)),
PrimitiveObjectInspectorFactory.javaByteObjectInspector,
escaped, escapeChar, needsEscape);
out.write(separator);
serialize(out, uoi.getField(obj), ois.get(uoi.getTag(obj)),
separators, level + 1, nullSequence, escaped, escapeChar,
needsEscape);
}
return;
default:
break;
}
throw new RuntimeException("Unknown category type: "
+ objInspector.getCategory());
}
/**
   * Returns the statistics after the last (de)serialization.
*/
@Override
public SerDeStats getSerDeStats() {
    // Exactly one of the two flags must have been set by the last operation.
assert (lastOperationSerialize != lastOperationDeserialize);
if (lastOperationSerialize) {
stats.setRawDataSize(serializedSize);
} else {
stats.setRawDataSize(cachedLazyStruct.getRawDataSerializedSize());
}
return stats;
}
@Override
protected Writable transformFromUTF8(Writable blob) {
Text text = (Text)blob;
return SerDeUtils.transformTextFromUTF8(text, this.charset);
}
@Override
protected Writable transformToUTF8(Writable blob) {
Text text = (Text)blob;
return SerDeUtils.transformTextToUTF8(text, this.charset);
}
/**
   * This method is deprecated and is only kept for backward compatibility. It
   * has been replaced by
   * {@link org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters#LazySerDeParameters(Configuration, Properties, String)}.
*/
@Deprecated
public static SerDeParameters initSerdeParams(Configuration job,
Properties tbl,
String serdeName) throws SerDeException {
return new SerDeParameters(job, tbl, serdeName);
}
/**
   * This class is deprecated and is only kept for backward compatibility. It
   * has been replaced by
   * {@link org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters}.
*/
@Deprecated
public static class SerDeParameters extends org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters {
public SerDeParameters(Configuration job,
Properties tbl,
String serdeName) throws SerDeException {
super(job, tbl, serdeName);
}
}
}