/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.apache.pig.piggybank.storage.avro;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.data.DataType;
import org.codehaus.jackson.JsonNode;
/**
 * Static helpers that derive an Avro {@link Schema} from a Pig schema. The
 * methods fall into two sets:
 *
 * <ol>
 * <li>Convert a Pig schema to an Avro schema;</li>
 * <li>Validate that a Pig schema is compatible with a given (user supplied)
 * Avro schema. The Avro schema does not need to cover every field of the Pig
 * schema; fields it does not cover are converted with the methods of
 * set 1.</li>
 * </ol>
 *
 * Generated record names come from the static counter {@link #tupleIndex},
 * which is read and incremented without any synchronization.
 */
public class PigSchema2Avro {

    /** Prefix of the record names produced by {@code getRecordName()}. */
    public static final String TUPLE_NAME = "TUPLE";

    /** Prefix of the names generated for Pig fields that have no name. */
    public static final String FIELD_NAME = "FIELD";

    /** Suffix counter for generated record names; bumped on every use. */
    public static int tupleIndex = 0;

    // //////////////////////////////////////////////////////////
    // Methods in Set 1: Convert Pig schema to Avro schema
    // //////////////////////////////////////////////////////////

    /**
     * Convert a Pig ResourceSchema to an Avro schema. A schema with exactly
     * one field is treated as a tuple wrapper and only that field is
     * converted; otherwise the fields are converted into an Avro record.
     *
     * @param pigSchema the Pig schema to convert
     * @param nullable flag handed to the field conversions (see
     *        {@link #convert(ResourceFieldSchema, boolean)})
     * @return the resulting Avro schema
     * @throws IOException if a field has a type that cannot be converted
     */
    public static Schema convert(ResourceSchema pigSchema, boolean nullable) throws IOException {
        ResourceFieldSchema[] pigFields = pigSchema.getFields();

        /* remove the pig tuple wrapper */
        if (pigFields.length == 1) {
            AvroStorageLog.details("Ignore the pig tuple wrapper.");
            return convert(pigFields[0], nullable);
        }
        return convertRecord(pigFields, nullable);
    }

    /**
     * Convert a Pig ResourceFieldSchema to an Avro schema. Every successfully
     * converted schema is passed through
     * {@code AvroStorageUtils.wrapAsUnion(schema, nullable)} before it is
     * returned.
     *
     * @param pigSchema the Pig field schema to convert
     * @param nullable flag forwarded to {@code wrapAsUnion} and to nested
     *        conversions
     * @return the resulting Avro schema
     * @throws IOException for MAP and UNKNOWN fields (no schema information
     *         available), for malformed bags or tuple wrappers, and for
     *         unsupported Pig types
     */
    protected static Schema convert(ResourceFieldSchema pigSchema, boolean nullable) throws IOException {
        AvroStorageLog.details("Convert pig field schema:" + pigSchema);

        final byte pigType = pigSchema.getType();

        if (pigType == DataType.TUPLE) {
            AvroStorageLog.details("Convert a pig field tuple: " + pigSchema);
            ResourceFieldSchema[] listSchemas = pigSchema.getSchema().getFields();

            Schema outSchema;
            if (AvroStorageUtils.isTupleWrapper(pigSchema)) {
                /* remove Pig tuple wrapper */
                AvroStorageLog.details("Ignore the pig tuple wrapper.");
                if (listSchemas.length != 1) {
                    throw new IOException("Expect one subfield from " + pigSchema);
                }
                outSchema = convert(listSchemas[0], nullable);
            } else {
                outSchema = convertRecord(listSchemas, nullable);
            }
            return AvroStorageUtils.wrapAsUnion(outSchema, nullable);
        }

        if (pigType == DataType.BAG) {
            AvroStorageLog.details("Convert a pig field bag:" + pigSchema);
            /* Bag elements have to be Tuples */
            ResourceFieldSchema tupleField = getBagTupleField(pigSchema);
            Schema outSchema = Schema.createArray(convert(tupleField, nullable));
            return AvroStorageUtils.wrapAsUnion(outSchema, nullable);
        }

        if (pigType == DataType.MAP) {
            /* Pig doesn't provide schema info of Map value */
            throw new IOException("Please provide schema for Map field!");
        }

        if (pigType == DataType.UNKNOWN) {
            /* Results of Pig UNION operation is of UNKNOWN type */
            throw new IOException("Must specify a schema for UNKNOWN pig type.");
        }

        if (isPrimitivePigType(pigType)) {
            AvroStorageLog.details("Convert a pig field primitive:" + pigSchema);
            Schema outSchema = convertPrimitiveType(pigType);
            return AvroStorageUtils.wrapAsUnion(outSchema, nullable);
        }

        throw new IOException("unsupported pig type:" + DataType.findTypeName(pigType));
    }

    /**
     * Convert an array of Pig fields to an Avro record. The record gets a
     * freshly generated name; fields without a name are called
     * {@code FIELD_<index>}, and the Pig field description becomes the Avro
     * field doc.
     *
     * @param pigFields the Pig fields that become the record's fields
     * @param nullable flag forwarded to the conversion of each field
     * @return a new Avro record schema
     * @throws IOException if any field cannot be converted
     */
    protected static Schema convertRecord(ResourceFieldSchema[] pigFields, boolean nullable) throws IOException {
        AvroStorageLog.funcCall("convertRecord");

        // Type name is required for Avro record
        String typeName = getRecordName();
        Schema outSchema = Schema.createRecord(typeName, null, null, false);

        List<Schema.Field> outFields = new ArrayList<Schema.Field>();
        for (int i = 0; i < pigFields.length; i++) {
            ResourceFieldSchema pigField = pigFields[i];

            /* get schema */
            Schema fieldSchema = convert(pigField, nullable);

            /* get field name of output */
            String outname = pigField.getName();
            if (outname == null) {
                outname = FIELD_NAME + "_" + i; // field name cannot be null
            }

            /* get doc of output */
            String desc = pigField.getDescription();

            outFields.add(new Field(outname, fieldSchema, desc, null));
        }

        outSchema.setFields(outFields);
        return outSchema;
    }

    /**
     * This is a painful hack to make unit tests pass. The static counter holds
     * state between unit tests, so it needs to be reset. Otherwise tests will
     * fail if their order is swapped or a new test is added.
     *
     * @param index the value the record-name counter is reset to
     */
    public static void setTupleIndex(int index) {
        tupleIndex = index;
    }

    /**
     * Build the next generated record name, {@code TUPLE_<tupleIndex>}, and
     * advance the counter.
     */
    private static String getRecordName() {
        String name = TUPLE_NAME + "_" + tupleIndex;
        tupleIndex++;
        return name;
    }

    /**
     * Convert a Pig primitive type to the matching Avro schema constant of
     * {@code AvroStorageUtils}.
     *
     * @param pigType a {@link DataType} type code
     * @return the Avro schema for that type
     * @throws IOException if the type has no Avro counterpart here
     */
    protected static Schema convertPrimitiveType(byte pigType) throws IOException {
        if (pigType == DataType.BOOLEAN) {
            return AvroStorageUtils.BooleanSchema;
        }
        if (pigType == DataType.BYTEARRAY) {
            return AvroStorageUtils.BytesSchema;
        }
        if (pigType == DataType.CHARARRAY || pigType == DataType.BIGCHARARRAY) {
            return AvroStorageUtils.StringSchema;
        }
        if (pigType == DataType.DOUBLE) {
            return AvroStorageUtils.DoubleSchema;
        }
        if (pigType == DataType.FLOAT) {
            return AvroStorageUtils.FloatSchema;
        }
        if (pigType == DataType.INTEGER) {
            return AvroStorageUtils.IntSchema;
        }
        if (pigType == DataType.LONG) {
            return AvroStorageUtils.LongSchema;
        }
        throw new IOException("unsupported pig type:" + DataType.findTypeName(pigType));
    }

    /**
     * Tell whether the type code is one of the scalar Pig types that both
     * conversion and validation route to their "primitive" branch. BYTE is in
     * this set even though {@link #convertPrimitiveType(byte)} rejects it.
     */
    private static boolean isPrimitivePigType(byte pigType) {
        return pigType == DataType.CHARARRAY
                || pigType == DataType.BIGCHARARRAY
                || pigType == DataType.BOOLEAN
                || pigType == DataType.BYTE
                || pigType == DataType.BYTEARRAY
                || pigType == DataType.DOUBLE
                || pigType == DataType.FLOAT
                || pigType == DataType.INTEGER
                || pigType == DataType.LONG;
    }

    /**
     * Return the single tuple field a Pig bag must contain.
     *
     * @throws IOException if the bag has no inner schema, or does not hold
     *         exactly one field of type TUPLE
     */
    private static ResourceFieldSchema getBagTupleField(ResourceFieldSchema bagSchema) throws IOException {
        // Guard the inner schema as well, so a bag without one is reported
        // as a schema error instead of a NullPointerException.
        ResourceSchema inner = bagSchema.getSchema();
        ResourceFieldSchema[] fs = (inner == null) ? null : inner.getFields();
        if (fs == null || fs.length != 1 || fs[0].getType() != DataType.TUPLE) {
            throw new IOException("Expect one tuple field in a bag");
        }
        return fs[0];
    }

    // //////////////////////////////////////////////////////////
    // Methods in Set 2: Validate whether a Pig schema is compatible
    // with a given Avro schema.
    // //////////////////////////////////////////////////////////

    /**
     * Validate whether pigSchema is compatible with avroSchema.
     *
     * @param avroSchema the user supplied Avro schema
     * @param pigSchema the Pig schema to check against it
     * @return the Avro schema to use for the Pig data
     * @throws IOException if the two schemas are not compatible
     */
    public static Schema validateAndConvert(Schema avroSchema, ResourceSchema pigSchema) throws IOException {
        return validateAndConvertRecord(avroSchema, pigSchema.getFields());
    }

    /**
     * Validate whether pigSchema is compatible with avroSchema and convert
     * the Pig field to the corresponding Avro schema.
     *
     * @param avroSchema the Avro schema expected for this field
     * @param pigSchema the Pig field schema to check
     * @return the Avro schema to use for the field
     * @throws IOException if the schemas are not compatible or the Pig type
     *         is not supported
     */
    protected static Schema validateAndConvert(Schema avroSchema, ResourceFieldSchema pigSchema) throws IOException {
        AvroStorageLog.details("Validate pig field schema:" + pigSchema);

        /* compatibility check based on data types */
        if (!isCompatible(avroSchema, pigSchema)) {
            throw new IOException("Schemas are not compatible.\n Avro=" + avroSchema + "\n" + "Pig=" + pigSchema);
        }

        final byte pigType = pigSchema.getType();

        if (avroSchema.getType() == Schema.Type.UNION) {
            AvroStorageLog.details("Validate Pig schema with Avro union:" + avroSchema);
            for (Schema branch : avroSchema.getTypes()) {
                try {
                    // Only the outcome matters: the first branch that
                    // validates makes the whole union acceptable.
                    validateAndConvert(branch, pigSchema);
                    return avroSchema;
                } catch (IOException ignored) {
                    // ignore the unmatched one and try the next branch
                }
            }
            throw new IOException("pig schema " + pigSchema + " is not compatible with avro " + avroSchema);
        }

        if (pigType == DataType.TUPLE) {
            AvroStorageLog.details("Validate a pig tuple: " + pigSchema);
            ResourceFieldSchema[] pigFields = pigSchema.getSchema().getFields();
            return validateAndConvertRecord(avroSchema, pigFields);
        }

        if (pigType == DataType.BAG) {
            AvroStorageLog.details("Validate a pig bag:" + pigSchema);
            /* get fields of containing tuples */
            ResourceFieldSchema tupleField = getBagTupleField(pigSchema);
            Schema inElemSchema = avroSchema.getElementType();
            return Schema.createArray(validateAndConvert(inElemSchema, tupleField));
        }

        if (pigType == DataType.MAP) {
            AvroStorageLog.details("Cannot validate a pig map. Will use user defined Avro schema.");
            return avroSchema;
        }

        if (pigType == DataType.UNKNOWN || isPrimitivePigType(pigType)) {
            AvroStorageLog.details("Validate a pig primitive type:" + pigSchema);
            return avroSchema;
        }

        throw new IOException("Unsupported pig type:" + DataType.findTypeName(pigType));
    }

    /**
     * Validate a Pig tuple is compatible with Avro record. If the Avro schema
     * is not complete (with uncovered fields), then convert those fields using
     * methods in set 1.
     *
     * Notice that users can get rid of Pig tuple wrappers, e.g. an Avro schema
     * "int" is compatible with a Pig schema "T:(int)"
     *
     * @param avroSchema the user supplied Avro schema
     * @param pigFields the fields of the Pig tuple
     * @return the Avro schema to use for the tuple
     * @throws IOException if the field counts differ for a complete schema,
     *         or any field fails validation or conversion
     */
    protected static Schema validateAndConvertRecord(Schema avroSchema, ResourceFieldSchema[] pigFields) throws IOException {

        /* Get rid of Pig tuple wrappers. */
        if (avroSchema.getType() != Schema.Type.RECORD) {
            if (pigFields.length != 1) {
                throw new IOException("Expect only one field in Pig tuple schema. Avro schema is " + avroSchema.getType());
            }
            return validateAndConvert(avroSchema, pigFields[0]);
        }

        /* validate and convert a pig tuple with avro record */
        boolean isPartialSchema = AvroStorageUtils.isUDPartialRecordSchema(avroSchema);
        AvroStorageLog.details("isPartialSchema=" + isPartialSchema);

        String typeName = isPartialSchema ? getRecordName() : avroSchema.getName();
        Schema outSchema = Schema.createRecord(typeName, avroSchema.getDoc(), avroSchema.getNamespace(), false);

        List<Schema.Field> inFields = avroSchema.getFields();
        if (!isPartialSchema && inFields.size() != pigFields.length) {
            throw new IOException("Expect " + inFields.size() + " fields in pig schema." + " But there are " + pigFields.length);
        }

        List<Schema.Field> outFields = new ArrayList<Schema.Field>();
        for (int i = 0; i < pigFields.length; i++) {
            ResourceFieldSchema pigField = pigFields[i];

            /* get user defined avro field schema */
            Field inputField = isPartialSchema ? AvroStorageUtils.getUDField(avroSchema, i) : inFields.get(i);

            /* get schema */
            Schema fieldSchema;
            if (inputField == null) {
                /* convert pig schema (nullable) */
                fieldSchema = convert(pigField, true);
            } else if (inputField.schema() == null) {
                /* convert pig schema (not-null) */
                fieldSchema = convert(pigField, false);
            } else {
                /* validate pigFields[i] with given avro schema */
                fieldSchema = validateAndConvert(inputField.schema(), pigField);
            }

            /* get field name of output */
            // inputField is only dereferenced for complete schemas, where it
            // comes straight from the Avro record's own field list.
            String outname = isPartialSchema ? pigField.getName() : inputField.name();
            if (outname == null) {
                outname = FIELD_NAME + "_" + i; // field name cannot be null
            }

            /* get doc of output */
            String doc = isPartialSchema ? pigField.getDescription() : inputField.doc();

            JsonNode defaultvalue = (inputField != null) ? inputField.defaultValue() : null;

            outFields.add(new Field(outname, fieldSchema, doc, defaultvalue));
        }

        outSchema.setFields(outFields);
        return outSchema;
    }

    /**
     * Check whether an Avro type is compatible with a Pig type.
     *
     * An Avro union is always accepted here (its branches are checked by the
     * caller), and so is a Pig tuple, because users may want to get rid of
     * the tuple wrapper. Otherwise the accepted Pig types per Avro type are:
     *
     * <pre>
     * ARRAY   : BAG
     * MAP     : MAP
     * STRING  : CHARARRAY, BIGCHARARRAY
     * ENUM    : CHARARRAY
     * BOOLEAN : BOOLEAN, INTEGER
     * BYTES   : BYTEARRAY
     * FIXED   : BYTEARRAY
     * DOUBLE  : DOUBLE, FLOAT, INTEGER, LONG
     * FLOAT   : FLOAT, INTEGER, LONG
     * INT     : INTEGER
     * LONG    : LONG, INTEGER
     * </pre>
     *
     * @param avroSchema the Avro schema
     * @param pigSchema the Pig field schema
     * @return true if data of the Pig type may be stored with the Avro type
     */
    protected static boolean isCompatible(Schema avroSchema, ResourceFieldSchema pigSchema) {

        Schema.Type avroType = avroSchema.getType();
        byte pigType = pigSchema.getType();

        if (avroType == Schema.Type.UNION) {
            return true;
        }
        if (pigType == DataType.TUPLE) {
            /* Tuple is compatible with any type; for users may want to
               get rid of the tuple wrapper */
            return true;
        }

        // One case per Avro type keeps every Pig alternative tied to its own
        // Avro type. A flat "a && b || c" chain would let the trailing
        // alternatives match regardless of the Avro type, because && binds
        // tighter than ||.
        switch (avroType) {
        case ARRAY:
            return pigType == DataType.BAG;
        case MAP:
            return pigType == DataType.MAP;
        case STRING:
            return pigType == DataType.CHARARRAY
                    || pigType == DataType.BIGCHARARRAY;
        case ENUM:
            return pigType == DataType.CHARARRAY;
        case BOOLEAN:
            return pigType == DataType.BOOLEAN
                    || pigType == DataType.INTEGER;
        case BYTES:
        case FIXED:
            return pigType == DataType.BYTEARRAY;
        case DOUBLE:
            return pigType == DataType.DOUBLE
                    || pigType == DataType.FLOAT
                    || pigType == DataType.INTEGER
                    || pigType == DataType.LONG;
        case FLOAT:
            return pigType == DataType.FLOAT
                    || pigType == DataType.INTEGER
                    || pigType == DataType.LONG;
        case INT:
            return pigType == DataType.INTEGER;
        case LONG:
            return pigType == DataType.LONG
                    || pigType == DataType.INTEGER;
        default:
            // RECORD, NULL and anything else only match through the union
            // or tuple shortcuts above.
            return false;
        }
    }
}