/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with this * work for additional information regarding copyright ownership. The ASF * licenses this file to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package org.apache.pig.piggybank.storage.avro; import java.io.IOException; import java.util.Iterator; import java.util.List; import java.util.Map; import org.apache.avro.Schema; import org.apache.avro.Schema.Field; import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.io.Encoder; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.data.DataBag; import org.apache.pig.data.DataByteArray; import org.apache.pig.data.Tuple; /** * An avro GenericDatumWriter to write pig data as Avro data. * */ public class PigAvroDatumWriter extends GenericDatumWriter<Object> { /** * construct with output schema */ public PigAvroDatumWriter(Schema schema) { setSchema(schema); } @Override protected void write(Schema schema, Object datum, Encoder out) throws IOException { try { /** * In case users want to get rid of Pig tuple wrapper */ if (!schema.getType().equals(Schema.Type.RECORD) && !schema.getType().equals(Schema.Type.UNION) && datum instanceof Tuple && unwrappedInstanceOf(schema, datum)) { Tuple t = (Tuple) datum; if (t.size() > 1) throw new IOException("Incompatible schema:" + schema+ " \n for data " + datum); write(schema, t.get(0), out); return; } switch (schema.getType()) { case FIXED: writeFixed(schema, datum, out); break; case ENUM: writeEnum(schema, datum, out); break; case STRING: writeString(schema, datum, out); break; case BYTES: writeBytes(datum, out); break; case BOOLEAN: writeBoolean(datum, out); break; case UNION: writeUnion(schema, datum, out); break; case LONG: writeLong(datum, out); break; case FLOAT: writeFloat(datum, out); break; case DOUBLE: writeDouble(datum, out); break; case ARRAY: /*falls through*/ case MAP: /*falls through*/ case RECORD: /*falls through*/ case INT: /*falls through*/ case NULL:/*falls through*/ default: super.write(schema, datum, out); } } catch (NullPointerException e) { throw npe(e, " of " + schema.getName()); } } /** * Called to write union. */ protected void writeUnion(Schema schema, Object datum, Encoder out) throws IOException { int index = resolveUnionSchema(schema, datum); out.writeIndex(index); write(schema.getTypes().get(index), datum, out); } /** * Called to resolve union. */ protected int resolveUnionSchema(Schema union, Object datum) throws IOException { int i = 0; for (Schema type : union.getTypes()) { if (type.getType().equals(Schema.Type.UNION)) throw new IOException("A union cannot immediately contain other unions."); if (instanceOf(type, datum)) return i; i++; } throw new RuntimeException("Datum " + datum + " is not in union " + union); } /** * Recursively check whether "datum" is an instance of "schema" and called * by {@link #resolveUnionSchema(Schema,Object)}, * {@link #unwrappedInstanceOf(Schema,Object)}. * */ protected boolean instanceOf(Schema schema, Object datum) throws IOException { try { switch (schema.getType()) { case RECORD: if (datum instanceof Tuple) { Tuple tuple = (Tuple) datum; List<Field> fields = schema.getFields(); if (fields.size() != tuple.size()) { return false; } for (int i = 0; i < fields.size(); i++) { if (!instanceOf(fields.get(i).schema(), tuple.get(i))) return false; } return true; } return false; case UNION: @SuppressWarnings("unused") int index = resolveUnionSchema(schema, datum); return true; case ENUM: return datum instanceof String && schema.hasEnumSymbol(((String) datum)) || unwrappedInstanceOf(schema, datum); case ARRAY: return datum instanceof DataBag || unwrappedInstanceOf(schema, datum); case MAP: return datum instanceof Map || unwrappedInstanceOf(schema, datum); case FIXED: return datum instanceof DataByteArray && ((DataByteArray) datum).size() == schema.getFixedSize() || unwrappedInstanceOf(schema, datum); case STRING: return datum instanceof String || unwrappedInstanceOf(schema, datum); case BYTES: return datum instanceof DataByteArray || unwrappedInstanceOf(schema, datum); case INT: return datum instanceof Integer || unwrappedInstanceOf(schema, datum); case LONG: return datum instanceof Long || datum instanceof Integer || unwrappedInstanceOf(schema, datum); case FLOAT: return datum instanceof Float || datum instanceof Integer || datum instanceof Long || unwrappedInstanceOf(schema, datum); case DOUBLE: return datum instanceof Double || datum instanceof Float || datum instanceof Integer || datum instanceof Long || unwrappedInstanceOf(schema, datum); case BOOLEAN: return datum instanceof Boolean || datum instanceof Integer || unwrappedInstanceOf(schema, datum); case NULL: return datum == null; default: throw new RuntimeException("Unexpected type: " + schema); } } catch (ExecException e) { e.printStackTrace(System.err); throw new RuntimeException(e); } } /** * Check whether "datum" is an instance of "schema" after stripping * the tuple wrapper. */ private boolean unwrappedInstanceOf(Schema schema, Object datum) throws IOException { try { if (!(datum instanceof Tuple)) return false; Tuple tuple = (Tuple) datum; if (tuple.size() != 1) return false; switch (schema.getType()) { case ENUM: case ARRAY: case MAP: case FIXED: case STRING: case BYTES: case INT: case LONG: case FLOAT: case DOUBLE: case BOOLEAN: return instanceOf(schema, tuple.get(0)); default: throw new IOException("Invalid type:" + schema.getType()); } } catch (Exception e) { e.printStackTrace(System.err); throw new IOException(e); } } /** * Write double. Users can cast long, float and integer to double. * */ protected void writeDouble(Object datum, Encoder out) throws IOException { double num; if (datum instanceof Integer) { num = ((Integer) datum).doubleValue(); } else if (datum instanceof Long) { num = ((Long) datum).doubleValue(); } else if (datum instanceof Float) { num = ((Float) datum).doubleValue(); } else if (datum instanceof Double) { num = (Double) datum; } else throw new IOException("Cannot convert to double:" + datum.getClass()); out.writeDouble(num); } /** * Write float. Users can cast long and integer into float. * */ protected void writeFloat(Object datum, Encoder out) throws IOException { float num; if (datum instanceof Integer) { num = ((Integer) datum).floatValue(); } else if (datum instanceof Long) { num = ((Long) datum).floatValue(); } else if (datum instanceof Float) { num = (Float) datum; } else throw new IOException("Cannot convert to float:" + datum.getClass()); out.writeFloat(num); } /** * Write long. Users can cast integer into long. * */ protected void writeLong(Object datum, Encoder out) throws IOException { long num; if (datum instanceof Integer) { num = ((Integer) datum).longValue(); } else if (datum instanceof Long) { num = (Long) datum; } else throw new IOException("Cannot convert to long:" + datum.getClass()); out.writeLong(num); } /** * Write boolean. Users can cast an integer into boolean. * */ protected void writeBoolean(Object datum, Encoder out) throws IOException { if (datum instanceof Boolean) { out.writeBoolean((Boolean) datum); } else if (datum instanceof Integer) { out.writeBoolean(((Integer) datum) != 0); } else throw new RuntimeException("Unsupported type boolean:" + datum.getClass()); } /** * As of Avro 1.5.1 this method is now in the superclass so it's no longer * needed here, but leaving here for backward compatibility with Avro 1.4.1. */ protected NullPointerException npe(NullPointerException e, String s) { NullPointerException result = new NullPointerException(e.getMessage() + s); result.initCause(e.getCause() == null ? e : e.getCause()); return result; } /** * Called to write a bytes. */ @Override protected void writeBytes(Object datum, org.apache.avro.io.Encoder out) throws IOException { if (datum instanceof DataByteArray) { out.writeBytes(((DataByteArray) datum).get()); } else throw new RuntimeException("Unsupported type bytes:" + datum.getClass()); } /** * Called to write a fixed value. */ @Override protected void writeFixed(Schema schema, Object datum, org.apache.avro.io.Encoder out) throws IOException { if (datum instanceof DataByteArray) { final byte[] bytes = ((DataByteArray) datum).get(); out.writeFixed(bytes, 0, bytes.length); } else throw new RuntimeException("Unsupported type fixed:" + datum.getClass()); } /** * Overriding to fetch the field value from the Tuple. */ @Override protected void writeRecord(Schema schema, Object datum, Encoder out) throws IOException { for (Field f : schema.getFields()) { Object value = getField(datum, f.name(), f.pos()); try { write(f.schema(), value, out); } catch (NullPointerException e) { throw npe(e, " in field "+f.name()); } } } /** * Called by the implementation of {@link #writeRecord} to retrieve * a record field value. */ protected Object getField(Object record, String name, int pos) { if (record instanceof Tuple) { try { return ((Tuple) record).get(pos); } catch (ExecException e) { e.printStackTrace(); throw new RuntimeException(e); } } else throw new RuntimeException("Unsupported type in record:" + record.getClass()); } /** * Called by the implementation of {@link #writeArray} to get the * size of an array. */ @Override protected long getArraySize(Object array) { if (array instanceof DataBag) { return ((DataBag) array).size(); } else throw new RuntimeException("Unsupported type in array:" + array.getClass()); } /** * Called by the implementation of {@link #writeArray} to enumerate * array elements. */ @Override protected Iterator<? extends Object> getArrayElements(Object array) { if (array instanceof DataBag) { return ((DataBag) array).iterator(); } else throw new RuntimeException("Unsupported type in array:" + array.getClass()); } }