/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package org.apache.pig.piggybank.storage.avro;

import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.avro.Schema;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.apache.pig.data.DataType;

/**
 * Static helpers that translate an Avro {@link Schema} into the equivalent
 * Pig {@link ResourceSchema}.
 */
public class AvroSchema2Pig {

    /** Not referenced inside this class; presumably used by other classes - verify before removing. */
    public static String RECORD = "RECORD";

    /** Field name given to the top-level schema passed to {@link #convert(Schema)}. */
    public static String FIELD = "FIELD";

    /** Field name given to the element schema of an Avro array. */
    public static String ARRAY_FIELD = "ARRAY_ELEM";

    /** Not referenced inside this class; presumably used by other classes - verify before removing. */
    public static String MAP_VALUE_FIELD = "m_value";

    /**
     * Wraps a Pig type code and a field name into a field schema.
     *
     * @param pigType   Pig type code (one of the {@link DataType} byte constants)
     * @param fieldName name to give the resulting field
     * @return a field schema built from a {@link FieldSchema} with that name and type
     */
    public static ResourceFieldSchema getPigSchema(byte pigType, String fieldName) {
        FieldSchema logicalField = new FieldSchema(fieldName, pigType);
        return new ResourceFieldSchema(logicalField);
    }

    /**
     * Converts an Avro schema into a Pig schema.
     *
     * <p>When the Avro schema converts to a tuple, that tuple's inner schema is
     * returned directly. Any other type is wrapped as a tuple and returned as
     * the single field of a new schema.
     *
     * @param schema the Avro schema to convert
     * @return the corresponding Pig schema
     * @throws IOException if the schema contains a generic union or an
     *                     unsupported Avro type
     */
    public static ResourceSchema convert(Schema schema) throws IOException {
        if (AvroStorageUtils.containsGenericUnion(schema)) {
            throw new IOException("We don't accept schema containing generic unions.");
        }

        Set<Schema> recordsOnPath = new HashSet<Schema>();
        ResourceFieldSchema converted = inconvert(schema, FIELD, recordsOnPath);

        if (converted.getType() == DataType.TUPLE) {
            return converted.getSchema();
        }

        // Any other type: wrap it in a tuple and expose that as the only field.
        ResourceFieldSchema asTuple = AvroStorageUtils.wrapAsTuple(converted);
        ResourceSchema wrapperSchema = new ResourceSchema();
        wrapperSchema.setFields(new ResourceFieldSchema[] { asTuple });
        return wrapperSchema;
    }

    /**
     * Recursively converts an Avro schema into a named Pig field schema.
     *
     * @param in             the Avro schema to convert
     * @param fieldName      name to assign to the resulting field (may be null)
     * @param visitedRecords record schemas currently being converted further up
     *                       the call chain; a record met again while it is in
     *                       this set is mapped to a bytearray instead of being
     *                       expanded again
     * @return the Pig field schema for {@code in}
     * @throws IOException if a non-acceptable union or an unsupported Avro type
     *                     is encountered
     */
    private static ResourceFieldSchema inconvert(Schema in, String fieldName,
            Set<Schema> visitedRecords) throws IOException {

        AvroStorageLog.details("InConvert avro schema with field name " + fieldName);

        Schema.Type avroType = in.getType();
        ResourceFieldSchema result = new ResourceFieldSchema();
        result.setName(fieldName);

        switch (avroType) {
        case RECORD: {
            AvroStorageLog.details("convert to a pig tuple");
            if (visitedRecords.contains(in)) {
                // Record is already being expanded on this path: stop the recursion here.
                result.setType(DataType.BYTEARRAY);
                break;
            }

            visitedRecords.add(in);
            result.setType(DataType.TUPLE);

            ResourceSchema recordSchema = new ResourceSchema();
            List<Schema.Field> avroFields = in.getFields();
            int fieldCount = avroFields.size();
            ResourceFieldSchema[] pigFields = new ResourceFieldSchema[fieldCount];
            for (int i = 0; i < fieldCount; i++) {
                Schema.Field avroField = avroFields.get(i);
                pigFields[i] = inconvert(avroField.schema(), avroField.name(), visitedRecords);
            }
            recordSchema.setFields(pigFields);
            result.setSchema(recordSchema);

            // Only records on the current path count as visited.
            visitedRecords.remove(in);
            break;
        }

        case ARRAY: {
            AvroStorageLog.details("convert array to a pig bag");
            result.setType(DataType.BAG);
            ResourceFieldSchema elementField =
                    inconvert(in.getElementType(), ARRAY_FIELD, visitedRecords);
            add2BagSchema(result, elementField);
            break;
        }

        case MAP:
            AvroStorageLog.details("convert map to a pig map");
            result.setType(DataType.MAP);
            break;

        case UNION: {
            if (!AvroStorageUtils.isAcceptableUnion(in)) {
                throw new IOException("Do not support generic union:" + in);
            }
            // Convert the accepted member type and copy its type and schema over.
            Schema accepted = AvroStorageUtils.getAcceptedType(in);
            ResourceFieldSchema acceptedField = inconvert(accepted, null, visitedRecords);
            result.setType(acceptedField.getType());
            result.setSchema(acceptedField.getSchema());
            break;
        }

        case FIXED:
        case BYTES:
            result.setType(DataType.BYTEARRAY);
            break;

        case BOOLEAN:
            result.setType(DataType.BOOLEAN);
            break;

        case DOUBLE:
            result.setType(DataType.DOUBLE);
            break;

        case ENUM:
        case STRING:
            result.setType(DataType.CHARARRAY);
            break;

        case FLOAT:
            result.setType(DataType.FLOAT);
            break;

        case INT:
            result.setType(DataType.INTEGER);
            break;

        case LONG:
            result.setType(DataType.LONG);
            break;

        case NULL:
            // value of NULL is always NULL
            result.setType(DataType.INTEGER);
            break;

        default:
            throw new IOException("Unsupported avro type:" + avroType);
        }

        return result;
    }

    /**
     * Installs a sub-field as the content schema of a bag field.
     *
     * <p>A sub-field that is not already a tuple is first wrapped as one.
     *
     * @param fieldSchema    the bag field whose schema is set
     * @param subFieldSchema the field describing the bag's content
     * @throws IOException declared for callers; see
     *                     {@code AvroStorageUtils.wrapAsTuple}
     */
    static protected void add2BagSchema(ResourceFieldSchema fieldSchema,
            ResourceFieldSchema subFieldSchema) throws IOException {

        ResourceFieldSchema tupleField;
        if (subFieldSchema.getType() == DataType.TUPLE) {
            tupleField = subFieldSchema;
        } else {
            tupleField = AvroStorageUtils.wrapAsTuple(subFieldSchema);
        }

        ResourceSchema bagContent = new ResourceSchema();
        bagContent.setFields(new ResourceFieldSchema[] { tupleField });
        fieldSchema.setSchema(bagContent);
    }
}