/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership.  The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package org.apache.pig.piggybank.storage.avro;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.avro.AvroRuntimeException;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.codehaus.jackson.JsonNode;

/**
 * An implementation of record reader that reads Avro data and converts it
 * into <NullWritable, Writable> pairs.
 */
public class PigAvroRecordReader extends RecordReader<NullWritable, Writable> {

    private static final Log LOG = LogFactory.getLog(PigAvroRecordReader.class);

    private AvroStorageInputStream in;
    private DataFileReader<Object> reader;   /* reader of input avro data */
    private long start;
    private long end;
    private Path path;
    private boolean ignoreBadFiles;          /* whether to ignore corrupted files during load */

    private TupleFactory tupleFactory = TupleFactory.getInstance();

    /* if multiple avro record schemas are merged, this list will hold field
     * objects of the merged schema */
    private ArrayList<Object> mProtoTuple;

    /* whether the multiple_schemas option is in use; this flag is passed down
     * to the datum reader */
    private boolean useMultipleSchemas = false;

    /* if multiple avro record schemas are merged, this map associates each input
     * record with a remapping of its fields relative to the merged schema.
     * please see AvroStorageUtils.getSchemaToMergedSchemaMap() for more details.
     */
    private Map<Path, Map<Integer, Integer>> schemaToMergedSchemaMap;
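    /*
     * Illustration (hypothetical values, not part of the runtime logic):
     * suppose two inputs are merged into schema (a, b, c) and file F1 was
     * written with schema (b, c). Then schemaToMergedSchemaMap.get(pathOfF1)
     * would be the map {0 -> 1, 1 -> 2}: field 0 of each F1 record ("b") lands
     * at position 1 of the merged tuple and field 1 ("c") at position 2.
     * remap() below applies exactly this translation on top of mProtoTuple's
     * default values.
     */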
    /**
     * constructor to initialize input and avro data reader
     */
    public PigAvroRecordReader(TaskAttemptContext context, FileSplit split,
            Schema readerSchema, boolean ignoreBadFiles,
            Map<Path, Map<Integer, Integer>> schemaToMergedSchemaMap,
            boolean useMultipleSchemas) throws IOException {
        this.path = split.getPath();
        this.in = new AvroStorageInputStream(path, context);
        this.useMultipleSchemas = useMultipleSchemas;
        if (readerSchema == null) {
            AvroStorageLog.details("No avro schema given; assuming the schema is embedded");
        }
        Schema writerSchema;
        try {
            FileSystem fs = FileSystem.get(path.toUri(), context.getConfiguration());
            writerSchema = AvroStorageUtils.getSchema(path, fs);
        } catch (IOException e) {
            AvroStorageLog.details("No avro writer schema found in '" + path
                    + "'; assuming writer schema matches reader schema");
            writerSchema = null;
        }
        try {
            if (useMultipleSchemas) {
                this.reader = new DataFileReader<Object>(in,
                        new PigAvroDatumReader(writerSchema, null));
            } else {
                this.reader = new DataFileReader<Object>(in,
                        new PigAvroDatumReader(writerSchema, readerSchema));
            }
        } catch (IOException e) {
            throw new IOException("Error initializing data file reader for file ("
                    + split.getPath() + ")", e);
        }
        this.reader.sync(split.getStart()); // sync to the first block boundary at or after the split start
        this.start = in.tell();
        this.end = split.getStart() + split.getLength();
        this.ignoreBadFiles = ignoreBadFiles;
        this.schemaToMergedSchemaMap = schemaToMergedSchemaMap;
        if (schemaToMergedSchemaMap != null) {
            // initialize mProtoTuple with the right default values
            int maxPos = 0;
            for (Map<Integer, Integer> map : schemaToMergedSchemaMap.values()) {
                for (Integer i : map.values()) {
                    maxPos = Math.max(i, maxPos);
                }
            }
            int tupleSize = maxPos + 1;
            AvroStorageLog.details("Creating proto tuple of fixed size: " + tupleSize);
            mProtoTuple = new ArrayList<Object>(tupleSize);
            for (int i = 0; i < tupleSize; i++) {
                // get the list of fields from the passed schema
                List<Schema.Field> subFields = readerSchema.getFields();
                JsonNode defValue = subFields.get(i).defaultValue();
                if (defValue != null) {
                    Schema.Type type = subFields.get(i).schema().getType();
                    switch (type) {
                    case BOOLEAN:
                        mProtoTuple.add(i, defValue.getBooleanValue());
                        break;
                    case ENUM:
                        mProtoTuple.add(i, defValue.getTextValue());
                        break;
                    case FIXED:
                        mProtoTuple.add(i, defValue.getTextValue());
                        break;
                    case INT:
                        mProtoTuple.add(i, defValue.getIntValue());
                        break;
                    case LONG:
                        // use the long accessor; getIntValue() would truncate
                        // defaults outside the int range
                        mProtoTuple.add(i, defValue.getLongValue());
                        break;
                    case FLOAT:
                        mProtoTuple.add(i, defValue.getNumberValue().floatValue());
                        break;
                    case DOUBLE:
                        mProtoTuple.add(i, defValue.getNumberValue().doubleValue());
                        break;
                    case STRING:
                        mProtoTuple.add(i, defValue.getTextValue());
                        break;
                    default:
                        mProtoTuple.add(i, null);
                        break;
                    }
                } else {
                    mProtoTuple.add(i, null);
                }
            }
        }
    }
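    /*
     * Note on split handling (illustrative numbers): for a split with
     * start=1000 and length=500, the constructor calls sync(1000), which
     * advances past the first Avro sync marker at or after byte 1000, and sets
     * end=1500. nextKeyValue() then consumes records until pastSync(1500)
     * reports that the reader has crossed the split end, so each data block
     * should be read by exactly one split.
     */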
    @Override
    public float getProgress() throws IOException {
        return end == start ? 0.0f
                : Math.min(1.0f, (getPos() - start) / (float) (end - start));
    }

    public long getPos() throws IOException {
        return in.tell();
    }

    @Override
    public void close() throws IOException {
        reader.close();
    }

    @Override
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        return NullWritable.get();
    }

    @Override
    public Writable getCurrentValue() throws IOException, InterruptedException {
        Object obj = reader.next();
        Tuple result = null;
        if (obj instanceof Tuple) {
            AvroStorageLog.details("Class =" + obj.getClass());
            result = (Tuple) obj;
        } else {
            if (obj != null) {
                AvroStorageLog.details("Wrap class " + obj.getClass() + " as a tuple.");
            } else {
                AvroStorageLog.details("Wrap null as a tuple.");
            }
            result = wrapAsTuple(obj);
        }
        if (schemaToMergedSchemaMap != null) {
            // remap the positions of fields to the merged schema
            Map<Integer, Integer> map = schemaToMergedSchemaMap.get(path);
            if (map == null) {
                throw new IOException("The schema of '" + path + "' "
                        + "is not merged by AvroStorage.");
            }
            result = remap(result, map);
        }
        return result;
    }

    /**
     * Remap the positions of fields to the merged schema
     */
    private Tuple remap(Tuple tuple, Map<Integer, Integer> map) throws IOException {
        try {
            for (int pos = 0; pos < tuple.size(); pos++) {
                mProtoTuple.set(map.get(pos), tuple.get(pos));
            }
        } catch (Exception e) {
            throw new IOException(e);
        }
        return tupleFactory.newTuple(mProtoTuple);
    }

    /**
     * Wrap a non-tuple value as a tuple
     */
    protected Tuple wrapAsTuple(Object in) {
        Tuple tuple = tupleFactory.newTuple();
        tuple.append(in);
        return tuple;
    }

    @Override
    public void initialize(InputSplit arg0, TaskAttemptContext arg1)
            throws IOException, InterruptedException {
        // nothing to do; all initialization happens in the constructor
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        try {
            // stop when there are no more records or the reader has passed the split end
            return reader.hasNext() && !reader.pastSync(end);
        } catch (AvroRuntimeException e) {
            if (ignoreBadFiles) {
                // AvroRuntimeException can be thrown for corrupted files.
                // We ignore them if the option 'ignore_bad_files' is enabled.
                LOG.warn("Ignoring bad file '" + path + "'.");
                return false;
            } else {
                throw e;
            }
        }
    }
}
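// Usage sketch (illustrative; variable names are hypothetical): a Hadoop
// InputFormat would drive this reader roughly as follows.
//
//   PigAvroRecordReader reader = new PigAvroRecordReader(context, split,
//           readerSchema, true, null, false);   // true = ignore bad files
//   while (reader.nextKeyValue()) {
//       Tuple t = (Tuple) reader.getCurrentValue();
//       // process the tuple ...
//   }
//   reader.close();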