/* * Copyright © 2014 Cask Data, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package co.cask.cdap.hive.serde; import co.cask.cdap.api.common.Bytes; import co.cask.cdap.api.data.format.StructuredRecord; import co.cask.cdap.api.data.format.UnexpectedFormatException; import co.cask.cdap.api.data.schema.Schema; import co.cask.cdap.hive.objectinspector.ObjectInspectorFactory; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import java.lang.reflect.Array; import java.lang.reflect.Field; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.UUID; /** * Helper class for translating objects that fit a cdap {@link Schema} into objects * that Hive can understand. */ public class ObjectDeserializer { private final List<String> fieldNames; private final List<TypeInfo> fieldTypes; private final ObjectInspector inspector; // we can almost do without the schema. The problem is that everything in Hive is lowercase, // but when we look up record fields we need the case sensitive field name. private final Schema schema; /** * Creates an ObjectTranslator that will be able to deserialize objects that fit a {@link Schema} into objects * that a Hive ObjectInspector can understand. * * @param properties Properties object passed to a SerDe during initialization that contains the table columns */ public ObjectDeserializer(Properties properties, Schema schema) { this(properties, schema, 0); } /** * Creates an ObjectTranslator that will be able to deserialize objects that fit a {@link Schema} into objects * that a Hive ObjectInspector can understand. Will ignore columns that are before the given field offset when * flattening records. The ObjectInspector will still use those columns. * * @param properties Properties object passed to a SerDe during initialization that contains the table columns * @param fieldOffset Ignore columns before the offset when flattening records */ public ObjectDeserializer(Properties properties, Schema schema, int fieldOffset) { this(Lists.newArrayList(properties.getProperty(serdeConstants.LIST_COLUMNS).split(",")), TypeInfoUtils.getTypeInfosFromTypeString(properties.getProperty(serdeConstants.LIST_COLUMN_TYPES)), schema, fieldOffset); } public ObjectDeserializer(List<String> fieldNames, List<TypeInfo> fieldTypes, Schema schema) { this(fieldNames, fieldTypes, schema, 0); } @VisibleForTesting ObjectDeserializer(List<String> fieldNames, List<TypeInfo> fieldTypes, Schema schema, int fieldOffset) { this.fieldNames = fieldNames.subList(fieldOffset, fieldNames.size()); this.fieldTypes = fieldTypes.subList(fieldOffset, fieldTypes.size()); // inspector should still use all names and types passed in. This is in case there are some fields that are // determined outside of this class, such as the stream case where timestamp and headers are read elsewhere this.inspector = createInspector(fieldNames, fieldTypes); this.schema = schema; } /** * Get an ObjectInspector that Hive should use on the result of {@link #deserialize(Object)}. * * @return ObjectInspector that Hive should use on the result of {@link #deserialize(Object)}. */ public ObjectInspector getInspector() { return inspector; } /** * Using reflection, deserialize an object that fits a {@link Schema} into one that can be examined * by an ObjectInspector. * * @param obj object that fits a {@link Schema}. * @return translated object that is understandable by Hive. * @throws NoSuchFieldException if a struct field was expected but not found in the object * @throws IllegalAccessException if a struct field was not accessible */ public Object deserialize(Object obj) throws NoSuchFieldException, IllegalAccessException { if (fieldTypes.size() == 1) { return deserializeField(obj, fieldTypes.get(0), schema); } else { return flattenRecord(obj, fieldNames, fieldTypes, schema); } } /** * Using reflection, flatten an object into a list of fields so it can be examined by an ObjectInspector. * Assumes the field names and types given as input were derived from the schema of the object. * * @param obj object that fits a {@link Schema}. * @return list of fields in the record, translated to be understandable by Hive. * @throws NoSuchFieldException * @throws IllegalAccessException */ public List<Object> translateRecord(Object obj) throws NoSuchFieldException, IllegalAccessException { return flattenRecord(obj, fieldNames, fieldTypes, schema); } private List<Object> flattenRecord(Object obj, List<String> fieldNames, List<TypeInfo> fieldTypes, Schema schema) throws NoSuchFieldException, IllegalAccessException { boolean isNullable = schema.isNullable(); if (obj == null) { if (isNullable) { return null; } else { throw new UnexpectedFormatException("Non-nullable field is null."); } } if (isNullable) { schema = schema.getNonNullable(); } Map<String, Schema.Field> fieldMap = getFieldMap(schema); List<Object> objectFields = Lists.newArrayListWithCapacity(fieldNames.size()); for (int i = 0; i < fieldNames.size(); i++) { String hiveName = fieldNames.get(i); TypeInfo fieldType = fieldTypes.get(i); Schema.Field schemaField = fieldMap.get(hiveName); // use the name from the schema field in case it is not all lowercase Object recordField = getRecordField(obj, schemaField.getName()); objectFields.add(deserializeField(recordField, fieldType, schemaField.getSchema())); } return objectFields; } /** * Translate a field that fits a {@link Schema} field into a type that Hive understands. * For example, a ByteBuffer is allowed by schema but Hive only understands byte arrays, so all ByteBuffers must * be changed into byte arrays. Reflection is used to examine java objects if the expected hive type is a struct. * * @param field value of the field to deserialize. * @param typeInfo type of the field as expected by Hive. * @param schema schema of the field. * @return translated field. * @throws NoSuchFieldException if a struct field was expected but not found in the object. * @throws IllegalAccessException if a struct field was not accessible. */ private Object deserializeField(Object field, TypeInfo typeInfo, Schema schema) throws NoSuchFieldException, IllegalAccessException { boolean isNullable = schema.isNullable(); if (field == null) { if (isNullable) { return null; } else { throw new UnexpectedFormatException("Non-nullable field was null."); } } if (isNullable) { schema = schema.getNonNullable(); } switch (typeInfo.getCategory()) { case PRIMITIVE: return deserializePrimitive(field, (PrimitiveTypeInfo) typeInfo); case LIST: // HIVE!! some versions will turn bytes into array<tinyint> instead of binary... so special case it. // TODO: remove once CDAP-1556 is done ListTypeInfo listTypeInfo = (ListTypeInfo) typeInfo; if (isByteArray(listTypeInfo) && !(field instanceof Collection)) { return deserializeByteArray(field); } return deserializeList(field, (ListTypeInfo) typeInfo, schema.getComponentSchema()); case MAP: return deserializeMap(field, (MapTypeInfo) typeInfo, schema.getMapSchema()); case STRUCT: StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo; ArrayList<String> innerFieldNames = structTypeInfo.getAllStructFieldNames(); ArrayList<TypeInfo> innerFieldTypes = structTypeInfo.getAllStructFieldTypeInfos(); return flattenRecord(field, innerFieldNames, innerFieldTypes, schema); case UNION: // TODO: decide what to do here return field; } return null; } private boolean isByteArray(ListTypeInfo typeInfo) { TypeInfo elementType = typeInfo.getListElementTypeInfo(); return (elementType.getCategory().equals(ObjectInspector.Category.PRIMITIVE) && ((PrimitiveTypeInfo) elementType).getPrimitiveCategory().equals(PrimitiveObjectInspector.PrimitiveCategory.BYTE)); } // Hive's object inspector will try to cast to Object[] so we can't return a byte[]... // TODO: remove once once CDAP-1556 is done private Byte[] deserializeByteArray(Object primitive) { // byte[], ByteBuffer, and UUID get mapped to bytes byte[] raw; if (primitive instanceof ByteBuffer) { ByteBuffer bb = (ByteBuffer) primitive; int length = bb.remaining(); Byte[] output = new Byte[length]; int pos = bb.position(); for (int i = 0; i < length; i++) { output[i] = bb.get(); } bb.position(pos); return output; } else if (primitive instanceof UUID) { raw = Bytes.toBytes((UUID) primitive); } else { raw = (byte[]) primitive; } Byte[] output = new Byte[raw.length]; for (int i = 0; i < output.length; i++) { output[i] = raw[i]; } return output; } /** * Translate a primitive type we understand into the type Hive understands. For example, we understand ByteBuffer * but Hive does not, so all ByteBuffer fields must be changed into byte[] fields. * See {@link co.cask.cdap.internal.io.AbstractSchemaGenerator} for the full mapping. * TODO: refactor so that changes don't have to be made both here and in AbstractSchemaGenerator */ private Object deserializePrimitive(Object primitive, PrimitiveTypeInfo typeInfo) { switch (typeInfo.getPrimitiveCategory()) { case STRING: // URI, URL, and String all get mapped to string // Avro's utf8 also requires .toString() return primitive.toString(); case BINARY: // byte[], ByteBuffer, and UUID get mapped to bytes if (primitive instanceof ByteBuffer) { return Bytes.toBytes((ByteBuffer) primitive); } else if (primitive instanceof UUID) { return Bytes.toBytes((UUID) primitive); } else { return primitive; } case INT: if (primitive instanceof Byte) { return ((Byte) primitive).intValue(); } else if (primitive instanceof Character) { return (int) (Character) primitive; } else if (primitive instanceof Short) { return ((Short) primitive).intValue(); } else { return primitive; } } return primitive; } private Object deserializeList(Object listField, ListTypeInfo typeInfo, Schema elementSchema) throws NoSuchFieldException, IllegalAccessException { TypeInfo listElementType = typeInfo.getListElementTypeInfo(); List<Object> hiveList = Lists.newArrayList(); if (listField instanceof Collection) { for (Object obj : (Collection<?>) listField) { hiveList.add(deserializeField(obj, listElementType, elementSchema)); } } else { for (int i = 0; i < Array.getLength(listField); i++) { hiveList.add(deserializeField(Array.get(listField, i), listElementType, elementSchema)); } } return hiveList; } @SuppressWarnings("unchecked") private Object deserializeMap(Object mapField, MapTypeInfo typeInfo, Map.Entry<Schema, Schema> mapSchema) throws NoSuchFieldException, IllegalAccessException { Map<Object, Object> ourMap = (Map) mapField; TypeInfo keyType = typeInfo.getMapKeyTypeInfo(); TypeInfo valType = typeInfo.getMapValueTypeInfo(); Schema keySchema = mapSchema.getKey(); Schema valSchema = mapSchema.getValue(); Map translatedMap = Maps.newHashMap(); for (Map.Entry entry : ourMap.entrySet()) { translatedMap.put(deserializeField(entry.getKey(), keyType, keySchema), deserializeField(entry.getValue(), valType, valSchema)); } return translatedMap; } // get a field from the object using the get method if the object is a StructuredRecord, // or using reflection if it is not. private Object getRecordField(Object record, String fieldName) throws NoSuchFieldException, IllegalAccessException { if (record instanceof StructuredRecord) { return ((StructuredRecord) record).get(fieldName); } Class recordClass = record.getClass(); Field field = recordClass.getDeclaredField(fieldName); field.setAccessible(true); return field.get(record); } // get a map from the expected hive name of a field in the schema to the field in the schema. private Map<String, Schema.Field> getFieldMap(Schema schema) { Map<String, Schema.Field> fieldMap = Maps.newHashMap(); for (Schema.Field field : schema.getFields()) { fieldMap.put(field.getName().toLowerCase(), field); } return fieldMap; } private ObjectInspector createInspector(List<String> fieldNames, List<TypeInfo> fieldTypes) { List<ObjectInspector> fieldInspectors = Lists.newArrayListWithCapacity(fieldTypes.size()); for (TypeInfo typeInfo : fieldTypes) { fieldInspectors.add(TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(typeInfo)); } return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldInspectors); } }