/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.serde2.avro;
import org.apache.avro.Schema;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.node.JsonNodeFactory;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* Convert Hive TypeInfo to an Avro Schema
*/
public class TypeInfoToSchema {
private long recordCounter = 0;
/**
* Converts Hive schema to avro schema
*
* @param columnNames Names of the hive columns
* @param columnTypes Hive Column types
* @param namespace Namespace of Avro schema
* @param name Avro schema name
* @param doc Avro schema doc
* @return Avro Schema
*/
public Schema convert(List<String> columnNames, List<TypeInfo> columnTypes,
List<String> columnComments, String namespace, String name, String doc) {
List<Schema.Field> fields = new ArrayList<Schema.Field>();
for (int i = 0; i < columnNames.size(); ++i) {
final String comment = columnComments.size() > i ? columnComments.get(i) : null;
final Schema.Field avroField = createAvroField(columnNames.get(i), columnTypes.get(i),
comment);
fields.addAll(getFields(avroField));
}
if (name == null || name.isEmpty()) {
name = "baseRecord";
}
Schema avroSchema = Schema.createRecord(name, doc, namespace, false);
avroSchema.setFields(fields);
return avroSchema;
}
private Schema.Field createAvroField(String name, TypeInfo typeInfo, String comment) {
return new Schema.Field(name, createAvroSchema(typeInfo), comment, null);
}
private Schema createAvroSchema(TypeInfo typeInfo) {
Schema schema = null;
switch (typeInfo.getCategory()) {
case PRIMITIVE:
schema = createAvroPrimitive(typeInfo);
break;
case LIST:
schema = createAvroArray(typeInfo);
break;
case MAP:
schema = createAvroMap(typeInfo);
break;
case STRUCT:
schema = createAvroRecord(typeInfo);
break;
case UNION:
schema = createAvroUnion(typeInfo);
break;
}
return wrapInUnionWithNull(schema);
}
private Schema createAvroPrimitive(TypeInfo typeInfo) {
PrimitiveTypeInfo primitiveTypeInfo = (PrimitiveTypeInfo) typeInfo;
Schema schema;
switch (primitiveTypeInfo.getPrimitiveCategory()) {
case STRING:
schema = Schema.create(Schema.Type.STRING);
break;
case CHAR:
schema = AvroSerdeUtils.getSchemaFor("{" +
"\"type\":\"" + AvroSerDe.AVRO_STRING_TYPE_NAME + "\"," +
"\"logicalType\":\"" + AvroSerDe.CHAR_TYPE_NAME + "\"," +
"\"maxLength\":" + ((CharTypeInfo) typeInfo).getLength() + "}");
break;
case VARCHAR:
schema = AvroSerdeUtils.getSchemaFor("{" +
"\"type\":\"" + AvroSerDe.AVRO_STRING_TYPE_NAME + "\"," +
"\"logicalType\":\"" + AvroSerDe.VARCHAR_TYPE_NAME + "\"," +
"\"maxLength\":" + ((VarcharTypeInfo) typeInfo).getLength() + "}");
break;
case BINARY:
schema = Schema.create(Schema.Type.BYTES);
break;
case BYTE:
schema = Schema.create(Schema.Type.INT);
break;
case SHORT:
schema = Schema.create(Schema.Type.INT);
break;
case INT:
schema = Schema.create(Schema.Type.INT);
break;
case LONG:
schema = Schema.create(Schema.Type.LONG);
break;
case FLOAT:
schema = Schema.create(Schema.Type.FLOAT);
break;
case DOUBLE:
schema = Schema.create(Schema.Type.DOUBLE);
break;
case BOOLEAN:
schema = Schema.create(Schema.Type.BOOLEAN);
break;
case DECIMAL:
DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) typeInfo;
String precision = String.valueOf(decimalTypeInfo.precision());
String scale = String.valueOf(decimalTypeInfo.scale());
schema = AvroSerdeUtils.getSchemaFor("{" +
"\"type\":\"bytes\"," +
"\"logicalType\":\"decimal\"," +
"\"precision\":" + precision + "," +
"\"scale\":" + scale + "}");
break;
case DATE:
schema = AvroSerdeUtils.getSchemaFor("{" +
"\"type\":\"" + AvroSerDe.AVRO_INT_TYPE_NAME + "\"," +
"\"logicalType\":\"" + AvroSerDe.DATE_TYPE_NAME + "\"}");
break;
case TIMESTAMP:
schema = AvroSerdeUtils.getSchemaFor("{" +
"\"type\":\"" + AvroSerDe.AVRO_LONG_TYPE_NAME + "\"," +
"\"logicalType\":\"" + AvroSerDe.TIMESTAMP_TYPE_NAME + "\"}");
break;
case VOID:
schema = Schema.create(Schema.Type.NULL);
break;
default:
throw new UnsupportedOperationException(typeInfo + " is not supported.");
}
return schema;
}
private Schema createAvroUnion(TypeInfo typeInfo) {
List<Schema> childSchemas = new ArrayList<Schema>();
for (TypeInfo childTypeInfo : ((UnionTypeInfo) typeInfo).getAllUnionObjectTypeInfos()) {
final Schema childSchema = createAvroSchema(childTypeInfo);
if (childSchema.getType() == Schema.Type.UNION) {
childSchemas.addAll(childSchema.getTypes());
} else {
childSchemas.add(childSchema);
}
}
return Schema.createUnion(removeDuplicateNullSchemas(childSchemas));
}
private Schema createAvroRecord(TypeInfo typeInfo) {
List<Schema.Field> childFields = new ArrayList<Schema.Field>();
final List<String> allStructFieldNames =
((StructTypeInfo) typeInfo).getAllStructFieldNames();
final List<TypeInfo> allStructFieldTypeInfos =
((StructTypeInfo) typeInfo).getAllStructFieldTypeInfos();
if (allStructFieldNames.size() != allStructFieldTypeInfos.size()) {
throw new IllegalArgumentException("Failed to generate avro schema from hive schema. " +
"name and column type differs. names = " + allStructFieldNames + ", types = " +
allStructFieldTypeInfos);
}
for (int i = 0; i < allStructFieldNames.size(); ++i) {
final TypeInfo childTypeInfo = allStructFieldTypeInfos.get(i);
final Schema.Field grandChildSchemaField = createAvroField(allStructFieldNames.get(i),
childTypeInfo, childTypeInfo.toString());
final List<Schema.Field> grandChildFields = getFields(grandChildSchemaField);
childFields.addAll(grandChildFields);
}
Schema recordSchema = Schema.createRecord("record_" + recordCounter, typeInfo.toString(),
null, false);
++recordCounter;
recordSchema.setFields(childFields);
return recordSchema;
}
private Schema createAvroMap(TypeInfo typeInfo) {
TypeInfo keyTypeInfo = ((MapTypeInfo) typeInfo).getMapKeyTypeInfo();
if (((PrimitiveTypeInfo) keyTypeInfo).getPrimitiveCategory()
!= PrimitiveObjectInspector.PrimitiveCategory.STRING) {
throw new UnsupportedOperationException("Key of Map can only be a String");
}
TypeInfo valueTypeInfo = ((MapTypeInfo) typeInfo).getMapValueTypeInfo();
Schema valueSchema = createAvroSchema(valueTypeInfo);
return Schema.createMap(valueSchema);
}
private Schema createAvroArray(TypeInfo typeInfo) {
ListTypeInfo listTypeInfo = (ListTypeInfo) typeInfo;
Schema listSchema = createAvroSchema(listTypeInfo.getListElementTypeInfo());
return Schema.createArray(listSchema);
}
private List<Schema.Field> getFields(Schema.Field schemaField) {
List<Schema.Field> fields = new ArrayList<Schema.Field>();
JsonNode nullDefault = JsonNodeFactory.instance.nullNode();
if (schemaField.schema().getType() == Schema.Type.RECORD) {
for (Schema.Field field : schemaField.schema().getFields()) {
fields.add(new Schema.Field(field.name(), field.schema(), field.doc(), nullDefault));
}
} else {
fields.add(new Schema.Field(schemaField.name(), schemaField.schema(), schemaField.doc(),
nullDefault));
}
return fields;
}
private Schema wrapInUnionWithNull(Schema schema) {
Schema wrappedSchema = schema;
switch (schema.getType()) {
case NULL:
break;
case UNION:
List<Schema> existingSchemas = removeDuplicateNullSchemas(schema.getTypes());
wrappedSchema = Schema.createUnion(existingSchemas);
break;
default:
wrappedSchema = Schema.createUnion(Arrays.asList(Schema.create(Schema.Type.NULL), schema));
}
return wrappedSchema;
}
private List<Schema> removeDuplicateNullSchemas(List<Schema> childSchemas) {
List<Schema> prunedSchemas = new ArrayList<Schema>();
boolean isNullPresent = false;
for (Schema schema : childSchemas) {
if (schema.getType() == Schema.Type.NULL) {
isNullPresent = true;
} else {
prunedSchemas.add(schema);
}
}
if (isNullPresent) {
prunedSchemas.add(0, Schema.create(Schema.Type.NULL));
}
return prunedSchemas;
}
}