/* (c) 2014 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */

package com.linkedin.cubert.utils;

import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.block.ColumnType;
import com.linkedin.cubert.block.DataType;
import com.linkedin.cubert.pig.piggybank.storage.avro.AvroSchema2Pig;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.file.SeekableInput;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.mapred.FsInput;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.mapred.JobConf;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.node.NullNode;

/**
 * Various utility methods related to Avro Schema.
 *
 * @author Maneesh Varshney
 */
public class AvroUtils
{
    /** Counter used to keep the record names of nested array elements unique. */
    private static int arrayElemInSchemaCounter = 0;

    /** When true, [null, T] union fields are given an explicit null default value. */
    private static final boolean PadDefaultNullsToSchema = true;

    /**
     * Extracts the schema of an Avro file.
     *
     * @param conf the Hadoop configuration
     * @param path the file or directory to search for Avro files
     * @return the schema of the first Avro file found under the path
     * @throws IOException if there are no Avro files under the path
     */
    public static Schema getSchema(Configuration conf, Path path) throws IOException
    {
        FileSystem fs = path.getFileSystem(conf);
        Path anAvroFile = FileSystemUtils.getFirstMatch(fs, path, "*.avro", true);
        if (anAvroFile == null)
            throw new IOException("there are no files in " + path.toString());

        System.out.println("Obtaining schema of avro file " + anAvroFile.toString());
        return getSchema(new FsInput(anAvroFile, conf));
    }

    public static Schema getSchema(SeekableInput input) throws IOException
    {
        DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
        DataFileReader<GenericRecord> dataFileReader =
                new DataFileReader<GenericRecord>(input, datumReader);
        Schema schema = dataFileReader.getSchema();
        // close the reader (and the underlying input) once the schema has been read
        dataFileReader.close();

        if (PadDefaultNullsToSchema)
        {
            // a list of "cloned" fields, with optional default value set to null
            ArrayList<Field> paddedFields = new ArrayList<Field>();
            for (Field field : schema.getFields())
            {
                // should this field be padded?
                boolean needsNullPadding = (field.schema() != null) // the field has nested schema
                        && (field.schema().getType().equals(Type.UNION)) // the nested schema is UNION
                        && (field.schema().getTypes().get(0).getType().equals(Type.NULL)); // the first element of the union is NULL type

                JsonNode defValue = needsNullPadding ? NullNode.getInstance() : field.defaultValue();
                Field f = new Field(field.name(), field.schema(), field.doc(), defValue);
                paddedFields.add(f);
            }

            schema = Schema.createRecord(schema.getName(),
                                         schema.getDoc(),
                                         schema.getNamespace(),
                                         schema.isError());
            schema.setFields(paddedFields);
        }

        return schema;
    }
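
    /*
     * Illustrative sketch only -- this method is not part of the original class. It
     * shows in miniature what the padding loop in getSchema() does: rebuild a field
     * whose schema is a [null, T] union with an explicit null default, so readers
     * treat the field as optional. The record and field names ("Example", "id")
     * are hypothetical.
     */
    private static Schema padNullDefaultSketch()
    {
        // a [null, long] union, the shape that triggers the padding above
        Schema nullableLong = Schema.createUnion(Arrays.asList(Schema.create(Type.NULL),
                                                               Schema.create(Type.LONG)));
        // the same Field constructor used by the padding loop, with a null default
        Field padded = new Field("id", nullableLong, null, NullNode.getInstance());

        Schema record = Schema.createRecord("Example", null, "com.example", false);
        record.setFields(Arrays.asList(padded));
        // serializes as {"name": "id", "type": ["null", "long"], "default": null}
        return record;
    }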

    /**
     * Converts a Cubert BlockSchema (an array of ColumnTypes) to an Avro Schema.
     *
     * @param recordName the name of the top-level Avro record
     * @param schema the BlockSchema to convert
     * @return the equivalent Avro schema
     */
    public static Schema convertFromBlockSchema(String recordName, BlockSchema schema)
    {
        arrayElemInSchemaCounter = 0;
        return convertFromBlockSchema(recordName, Type.RECORD, schema, true);
    }

    private static Field[] createFields(BlockSchema schema)
    {
        Field[] fields = new Field[schema.getNumColumns()];
        for (int idx = 0; idx < fields.length; idx++)
        {
            final ColumnType col = schema.getColumnType(idx);
            final DataType colType = col.getType();
            final Type subType = convertToAvroType(colType);

            final Schema colSchema;
            if (col.getColumnSchema() != null || subType == Type.ARRAY || subType == Type.MAP)
            {
                colSchema = convertFromBlockSchema(col.getName(), subType, col.getColumnSchema(), false);
            }
            else
            {
                // scalar columns are wrapped in a [null, type] union to make them nullable
                List<Schema> unionSchema = new ArrayList<Schema>();
                unionSchema.add(Schema.create(Type.NULL));
                unionSchema.add(Schema.create(subType));
                colSchema = Schema.createUnion(unionSchema);
            }
            fields[idx] = new Field(col.getName(), colSchema, null, null);
        }
        return fields;
    }

    private static Schema convertFromBlockSchema(final String name,
                                                 final Type type,
                                                 final BlockSchema schema,
                                                 boolean toplevel)
    {
        Schema avroSchema;
        switch (type)
        {
        case RECORD:
            Field[] fields = createFields(schema);
            avroSchema = Schema.createRecord(name, null, null, false);
            avroSchema.setFields(Arrays.asList(fields));
            if (toplevel)
                break;

            // nested records are made nullable by wrapping them in a [null, record] union
            List<Schema> unionSchema = new ArrayList<Schema>();
            unionSchema.add(Schema.create(Type.NULL));
            unionSchema.add(avroSchema);
            avroSchema = Schema.createUnion(unionSchema);
            break;
        case ARRAY:
        {
            if (schema.getNumColumns() != 1)
            {
                throw new RuntimeException("Type ARRAY must have a single element in the subschema");
            }
            ColumnType elemColType = schema.getColumnType(0);
            Schema elemType;
            if (elemColType.getColumnSchema() == null)
            {
                elemType = Schema.create(convertToAvroType(elemColType.getType()));
            }
            else
            {
                // append a running counter so nested element record names stay unique
                elemType = convertFromBlockSchema(elemColType.getName() + (arrayElemInSchemaCounter++),
                                                  convertToAvroType(elemColType.getType()),
                                                  elemColType.getColumnSchema(),
                                                  false);
            }
            avroSchema = Schema.createArray(elemType);

            unionSchema = new ArrayList<Schema>();
            unionSchema.add(Schema.create(Type.NULL));
            unionSchema.add(avroSchema);
            avroSchema = Schema.createUnion(unionSchema);
            break;
        }
        case MAP:
        {
            ColumnType valueColType = schema.getColumnType(0);
            Schema valueType;
            if (valueColType.getColumnSchema() == null)
            {
                valueType = Schema.create(convertToAvroType(valueColType.getType()));
            }
            else
            {
                valueType = convertFromBlockSchema(valueColType.getName(),
                                                   convertToAvroType(valueColType.getType()),
                                                   valueColType.getColumnSchema(),
                                                   false);
            }
            avroSchema = Schema.createMap(valueType);

            unionSchema = new ArrayList<Schema>();
            unionSchema.add(Schema.create(Type.NULL));
            unionSchema.add(avroSchema);
            avroSchema = Schema.createUnion(unionSchema);
            break;
        }
        default:
            throw new IllegalArgumentException("Unsupported composite Type: " + type);
        }

        return avroSchema;
    }
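
    /*
     * Hedged usage sketch only -- this method is not part of the original class. It
     * assumes BlockSchema can be constructed from a column-list string such as
     * "INT id, STRING name" (the form used elsewhere in Cubert); if the constructor
     * differs, build the BlockSchema accordingly. Each scalar column comes back
     * wrapped in a nullable union, e.g. id -> ["null", "int"].
     */
    private static Schema blockSchemaConversionSketch()
    {
        BlockSchema block = new BlockSchema("INT id, STRING name");
        return convertFromBlockSchema("ExampleRecord", block);
    }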

    private static Type convertToAvroType(DataType colType)
    {
        final Type subType;
        if (colType == DataType.TUPLE)
        {
            /* Pig converts RECORD to TUPLE. Converting it back. */
            subType = Type.RECORD;
        }
        else if (colType == DataType.BAG)
        {
            subType = Type.ARRAY;
        }
        else if (colType == DataType.MAP)
        {
            subType = Type.MAP;
        }
        else
        {
            subType = Type.valueOf(colType.toString().toUpperCase());
        }
        return subType;
    }

    // Convert to a Pig schema using the utility functions in Pig first, then convert
    // to a BlockSchema. Thus only the Pig <-> Cubert schema conversion path needs to
    // be maintained in the code.
    public static BlockSchema convertToBlockSchema(Schema avroSchema)
    {
        try
        {
            org.apache.pig.ResourceSchema pigResourceSchema = AvroSchema2Pig.convert(avroSchema);
            org.apache.pig.impl.logicalLayer.schema.Schema pigSchema =
                    org.apache.pig.impl.logicalLayer.schema.Schema.getPigSchema(pigResourceSchema);
            return SchemaUtils.convertToBlockSchema(pigSchema);
        }
        catch (IOException e)
        {
            throw new RuntimeException(e);
        }
    }

    public static void createFileIfNotExists(BlockSchema fileSchema, String path) throws IOException
    {
        Configuration conf = new JobConf();
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(new Path(path)))
            return;

        Schema avroSchema = convertFromBlockSchema("CUBERT_MV_RECORD", fileSchema);
        System.out.println("Creating avro file with schema = " + avroSchema);

        GenericDatumWriter<GenericRecord> datumWriter =
                new GenericDatumWriter<GenericRecord>(avroSchema);
        DataFileWriter<GenericRecord> writer =
                new DataFileWriter<GenericRecord>(datumWriter);

        // create an empty file (schema header only) with rwxr-xr-x permissions
        FSDataOutputStream fout = FileSystem.create(fs,
                                                    new Path(path),
                                                    new FsPermission(FsAction.ALL,
                                                                     FsAction.READ_EXECUTE,
                                                                     FsAction.READ_EXECUTE));
        writer.create(avroSchema, fout);
        writer.flush();
        writer.close();
    }

    public static void main(String[] args) throws IOException
    {
        JobConf conf = new JobConf();
        System.out.println(AvroUtils.getSchema(conf, new Path(args[0])).toString(true));
    }
}