/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.serde2.avro;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.commons.lang.ClassUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.lazy.ByteArrayRef;
import org.apache.hadoop.hive.serde2.lazy.LazyArray;
import org.apache.hadoop.hive.serde2.lazy.LazyFactory;
import org.apache.hadoop.hive.serde2.lazy.LazyMap;
import org.apache.hadoop.hive.serde2.lazy.LazyObject;
import org.apache.hadoop.hive.serde2.lazy.LazyStruct;
import org.apache.hadoop.hive.serde2.lazy.LazyUnion;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.LazyListObjectInspector;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.LazyMapObjectInspector;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.LazySimpleStructObjectInspector;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.LazyUnionObjectInspector;
import org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyObjectInspectorParameters;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector.StandardUnion;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.io.Text;

/**
 * Lazy objectinspector for avro serialization
 */
public class AvroLazyObjectInspector extends LazySimpleStructObjectInspector {

  /**
   * Reader {@link Schema} for the avro data
   */
  private Schema readerSchema;

  /**
   * {@link AvroSchemaRetriever} to retrieve avro schema
   */
  private AvroSchemaRetriever schemaRetriever;

  /**
   * LOGGER
   */
  public static final Logger LOG = LoggerFactory.getLogger(AvroLazyObjectInspector.class);
  /**
   * Constructor
   *
   * @param structFieldNames fields within the given avro object
   * @param structFieldObjectInspectors object inspectors for the fields
   * @param structFieldComments comments for the given fields
   * @param separator separator between different fields
   * @param nullSequence sequence to represent null value
   * @param lastColumnTakesRest whether the last column of the struct should take the rest of the
   *          row if there are extra fields
   * @param escaped whether the data is escaped or not
   * @param escapeChar if escaped is true, the escape character
   */
  @Deprecated
  public AvroLazyObjectInspector(List<String> structFieldNames,
      List<ObjectInspector> structFieldObjectInspectors, List<String> structFieldComments,
      byte separator, Text nullSequence, boolean lastColumnTakesRest, boolean escaped,
      byte escapeChar) {
    super(structFieldNames, structFieldObjectInspectors, structFieldComments, separator,
        nullSequence, lastColumnTakesRest, escaped, escapeChar);
  }

  public AvroLazyObjectInspector(List<String> structFieldNames,
      List<ObjectInspector> structFieldObjectInspectors, List<String> structFieldComments,
      byte separator, LazyObjectInspectorParameters lazyParams) {
    super(structFieldNames, structFieldObjectInspectors, structFieldComments, separator, lazyParams);
  }

  /**
   * Set the reader schema for the {@link AvroLazyObjectInspector} to the given schema
   *
   * @param readerSchema the reader schema to be set
   */
  public void setReaderSchema(Schema readerSchema) {
    this.readerSchema = readerSchema;
  }

  /**
   * Set the {@link AvroSchemaRetriever} for the {@link AvroLazyObjectInspector} to the given class
   *
   * @param schemaRetriever the schema retriever class to be set
   */
  public void setSchemaRetriever(AvroSchemaRetriever schemaRetriever) {
    this.schemaRetriever = schemaRetriever;
  }
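  /*
   * Illustrative configuration sketch; the variable names below are hypothetical placeholders,
   * not part of this class. At least one of the two setters above must be called before nested
   * avro structs are deserialized; if both are set, the reader schema is used as-is and the
   * retriever only supplies the writer schema.
   *
   *   AvroLazyObjectInspector inspector = ...;       // obtained from the enclosing serde
   *   inspector.setReaderSchema(someReaderSchema);   // fixed reader schema, and/or
   *   inspector.setSchemaRetriever(someRetriever);   // per-record schema resolution
   */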
  @SuppressWarnings("unchecked")
  @Override
  public Object getStructFieldData(Object data, StructField f) {
    if (data == null) {
      return null;
    }

    int fieldID = f.getFieldID();

    if (LOG.isDebugEnabled()) {
      LOG.debug("Getting struct field data for field: [" + f.getFieldName() + "] on data ["
          + data.getClass() + "]");
    }

    if (data instanceof LazyStruct) {
      LazyStruct row = (LazyStruct) data;

      // get the field out of struct
      Object rowField = row.getField(fieldID);

      if (rowField instanceof LazyStruct) {

        if (LOG.isDebugEnabled()) {
          LOG.debug("Deserializing struct [" + rowField.getClass() + "]");
        }

        return deserializeStruct(rowField, f.getFieldName());
      } else if (rowField instanceof LazyMap) {
        // We have found a map. Systematically deserialize the values of the map and return back
        // the map
        LazyMap lazyMap = (LazyMap) rowField;

        for (Entry<Object, Object> entry : lazyMap.getMap().entrySet()) {
          Object _key = entry.getKey();
          Object _value = entry.getValue();

          if (_value instanceof LazyStruct) {
            lazyMap.getMap().put(_key, deserializeStruct(_value, f.getFieldName()));
          }
        }

        if (LOG.isDebugEnabled()) {
          LOG.debug("Returning a lazy map for field [" + f.getFieldName() + "]");
        }

        return lazyMap;
      } else {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Returning [" + rowField + "] for field [" + f.getFieldName() + "]");
        }

        // Just return the object. We need no further operation on it
        return rowField;
      }
    } else {
      // The avro deserializer would deserialize our object and return a list of objects that
      // hive can operate on. Here we should be getting the same object back.
      if (!(data instanceof List)) {
        throw new IllegalArgumentException("data should be an instance of list");
      }

      if (!(fieldID < ((List<Object>) data).size())) {
        return null;
      }

      // lookup the field corresponding to the given field ID and return
      Object field = ((List<Object>) data).get(fieldID);

      if (field == null) {
        return null;
      }

      // convert to a lazy object and return
      return toLazyObject(field, f.getFieldObjectInspector());
    }
  }

  @Override
  public List<Object> getStructFieldsDataAsList(Object data) {
    if (data == null) {
      return null;
    }

    List<Object> result = new ArrayList<Object>(fields.size());

    for (int i = 0; i < fields.size(); i++) {
      result.add(getStructFieldData(data, fields.get(i)));
    }

    return result;
  }
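  /*
   * Byte-layout note for the retriever-driven path in deserializeStruct() below. This describes
   * the layout assumed by typical retriever usage and is only enforced here by the length check:
   * when the retriever reports a non-zero offset, the field bytes are expected to look like
   *
   *   [ 0 .. offset-1 ]   schema-identifying header, available to the retriever
   *   [ offset .. end ]   avro binary payload handed to readFields(data, offset, data.length, ...)
   */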
  /**
   * Deserialize the given struct object
   *
   * @param struct the object to deserialize
   * @param fieldName name of the field we are currently operating on
   * @return a deserialized object that hive can further operate on
   * @throws AvroObjectInspectorException if something goes wrong during deserialization
   */
  private Object deserializeStruct(Object struct, String fieldName) {
    byte[] data = ((LazyStruct) struct).getBytes();
    AvroDeserializer deserializer = new AvroDeserializer();

    if (data == null || data.length == 0) {
      return null;
    }

    if (readerSchema == null && schemaRetriever == null) {
      throw new IllegalArgumentException("reader schema or schemaRetriever must be set for field ["
          + fieldName + "]");
    }

    Schema ws = null;
    Schema rs = null;
    int offset = 0;

    AvroGenericRecordWritable avroWritable = new AvroGenericRecordWritable();

    if (readerSchema == null) {
      offset = schemaRetriever.getOffset();

      if (data.length < offset) {
        throw new IllegalArgumentException("Data size cannot be less than [" + offset
            + "]. Found [" + data.length + "]");
      }

      rs = schemaRetriever.retrieveReaderSchema(data);

      if (rs == null) {
        // still nothing, raise an exception
        throw new IllegalStateException(
            "A valid reader schema could not be retrieved either directly or from the schema retriever for field ["
                + fieldName + "]");
      }

      ws = schemaRetriever.retrieveWriterSchema(data);

      if (ws == null) {
        throw new IllegalStateException(
            "Null writer schema retrieved from schemaRetriever for field [" + fieldName + "]");
      }

      // adjust the data bytes according to any possible offset that was provided
      if (LOG.isDebugEnabled()) {
        LOG.debug("Retrieved writer Schema: " + ws.toString());
        LOG.debug("Retrieved reader Schema: " + rs.toString());
      }

      try {
        avroWritable.readFields(data, offset, data.length, ws, rs);
      } catch (IOException ioe) {
        throw new AvroObjectInspectorException("Error deserializing avro payload", ioe);
      }
    } else {
      // a reader schema was provided
      if (schemaRetriever != null) {
        // a schema retriever has been provided as well. Attempt to read the writer schema from
        // the retriever
        ws = schemaRetriever.retrieveWriterSchema(data);

        if (ws == null) {
          throw new IllegalStateException(
              "Null writer schema retrieved from schemaRetriever for field [" + fieldName + "]");
        }
      } else {
        // attempt retrieving the schema from the data
        ws = retrieveSchemaFromBytes(data);
      }

      rs = readerSchema;

      try {
        avroWritable.readFields(data, ws, rs);
      } catch (IOException ioe) {
        throw new AvroObjectInspectorException("Error deserializing avro payload", ioe);
      }
    }

    AvroObjectInspectorGenerator oiGenerator = null;
    Object deserializedObject = null;

    try {
      oiGenerator = new AvroObjectInspectorGenerator(rs);
      deserializedObject =
          deserializer.deserialize(oiGenerator.getColumnNames(), oiGenerator.getColumnTypes(),
              avroWritable, rs);
    } catch (SerDeException se) {
      throw new AvroObjectInspectorException("Error deserializing avro payload", se);
    }

    return deserializedObject;
  }

  /**
   * Retrieve schema from the given bytes
   *
   * @param data the avro serialized bytes
   * @return the retrieved {@link Schema schema}
   */
  private Schema retrieveSchemaFromBytes(byte[] data) {
    ByteArrayInputStream bais = new ByteArrayInputStream(data);
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();

    Schema schema = null;

    try {
      // dfs is AutoCloseable
      @SuppressWarnings("resource")
      DataFileStream<GenericRecord> dfs = new DataFileStream<GenericRecord>(bais, reader);
      schema = dfs.getSchema();
    } catch (IOException ioe) {
      throw new AvroObjectInspectorException("An error occurred retrieving schema from bytes", ioe);
    }

    return schema;
  }

  /**
   * Converts the given field to a lazy object
   *
   * @param field the field to be converted to a lazy object
   * @param fieldOI {@link ObjectInspector} for the given field
   * @return the converted lazy object
   */
  private Object toLazyObject(Object field, ObjectInspector fieldOI) {
    if (isPrimitive(field.getClass())) {
      return toLazyPrimitiveObject(field, fieldOI);
    } else if (fieldOI instanceof LazyListObjectInspector) {
      return toLazyListObject(field, fieldOI);
    } else if (field instanceof StandardUnion) {
      return toLazyUnionObject(field, fieldOI);
    } else if (fieldOI instanceof LazyMapObjectInspector) {
      return toLazyMapObject(field, fieldOI);
    } else {
      return field;
    }
  }
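  /*
   * The conversion helpers below wrap values that the avro deserializer has already materialized
   * back into lazy objects: primitives are re-encoded through their string form into a
   * ByteArrayRef and re-parsed by the created lazy object, while lists, maps and unions are
   * converted element by element via toLazyObject().
   */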
  /**
   * Convert the given object to a lazy primitive object using the given {@link ObjectInspector}
   *
   * @param obj Object to be converted to a {@link LazyObject}
   * @param oi ObjectInspector used for the conversion
   * @return the created {@link LazyObject lazy object}
   */
  private LazyObject<? extends ObjectInspector> toLazyPrimitiveObject(Object obj,
      ObjectInspector oi) {
    if (obj == null) {
      return null;
    }

    LazyObject<? extends ObjectInspector> lazyObject = LazyFactory.createLazyObject(oi);
    ByteArrayRef ref = new ByteArrayRef();

    String objAsString = obj.toString().trim();

    ref.setData(objAsString.getBytes());

    // initialize the lazy object
    lazyObject.init(ref, 0, ref.getData().length);

    return lazyObject;
  }

  /**
   * Convert the given object to a lazy list object using the given {@link ObjectInspector}
   *
   * @param obj Object to be converted to a {@link LazyObject}
   * @param objectInspector ObjectInspector used for the conversion
   * @return the created {@link LazyObject lazy object}
   */
  private Object toLazyListObject(Object obj, ObjectInspector objectInspector) {
    if (obj == null) {
      return null;
    }

    List<?> listObj = (List<?>) obj;

    LazyArray retList = (LazyArray) LazyFactory.createLazyObject(objectInspector);

    List<Object> lazyList = retList.getList();

    ObjectInspector listElementOI =
        ((ListObjectInspector) objectInspector).getListElementObjectInspector();

    for (int i = 0; i < listObj.size(); i++) {
      lazyList.add(toLazyObject(listObj.get(i), listElementOI));
    }

    return retList;
  }

  /**
   * Convert the given object to a lazy map object using the given {@link ObjectInspector}
   *
   * @param obj Object to be converted to a {@link LazyObject}
   * @param objectInspector ObjectInspector used for the conversion
   * @return the created {@link LazyObject lazy object}
   */
  @SuppressWarnings({ "rawtypes", "unchecked" })
  private Object toLazyMapObject(Object obj, ObjectInspector objectInspector) {
    if (obj == null) {
      return null;
    }

    // avro guarantees that the key will be of type string. So we just need to worry about
    // deserializing the value here
    LazyMap lazyMap = (LazyMap) LazyFactory.createLazyObject(objectInspector);

    Map map = lazyMap.getMap();

    Map<Object, Object> origMap = (Map) obj;

    ObjectInspector keyObjectInspector =
        ((MapObjectInspector) objectInspector).getMapKeyObjectInspector();
    ObjectInspector valueObjectInspector =
        ((MapObjectInspector) objectInspector).getMapValueObjectInspector();

    for (Entry entry : origMap.entrySet()) {
      Object value = entry.getValue();

      map.put(toLazyPrimitiveObject(entry.getKey(), keyObjectInspector),
          toLazyObject(value, valueObjectInspector));
    }

    return lazyMap;
  }

  /**
   * Convert the given object to a lazy union object using the given {@link ObjectInspector}
   *
   * @param obj Object to be converted to a {@link LazyObject}
   * @param objectInspector ObjectInspector used for the conversion
   * @return the created {@link LazyObject lazy object}
   */
  private Object toLazyUnionObject(Object obj, ObjectInspector objectInspector) {
    if (obj == null) {
      return null;
    }

    if (!(objectInspector instanceof LazyUnionObjectInspector)) {
      throw new IllegalArgumentException(
          "Invalid objectinspector found. Expected LazyUnionObjectInspector, Found "
              + objectInspector.getClass());
    }

    StandardUnion standardUnion = (StandardUnion) obj;
    LazyUnionObjectInspector lazyUnionOI = (LazyUnionObjectInspector) objectInspector;

    // Grab the tag and the field
    byte tag = standardUnion.getTag();
    Object field = standardUnion.getObject();

    ObjectInspector fieldOI = lazyUnionOI.getObjectInspectors().get(tag);

    // convert to a lazy object
    Object convertedObj = null;

    if (field != null) {
      convertedObj = toLazyObject(field, fieldOI);
    }

    if (convertedObj == null) {
      return null;
    }

    return new LazyUnion(lazyUnionOI, tag, convertedObj);
  }
  /**
   * Determines if the given object is a primitive or a wrapper to a primitive. Note that even
   * though a <code>String</code> may not be a primitive in the traditional sense, it is treated
   * as one here because it is <i>not</i> a struct.
   *
   * @param clazz input class
   * @return true if the object is a primitive or a wrapper to a primitive, false otherwise
   */
  private boolean isPrimitive(Class<?> clazz) {
    return clazz.isPrimitive() || ClassUtils.wrapperToPrimitive(clazz) != null
        || clazz.getSimpleName().equals("String");
  }
}