/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.apache.pig.piggybank.storage.avro;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
/**
 * File-based input format for Avro data. Each split is read through a
 * {@link PigAvroRecordReader} configured with the settings held by this
 * instance.
 */
public class PigAvroInputFormat extends FileInputFormat<NullWritable, Writable> {

    /** Avro schema handed to the record reader; {@code null} when none was supplied. */
    private Schema readerSchema;

    /** Whether corrupted files are ignored during load. */
    private boolean ignoreBadFiles;

    /**
     * If multiple avro record schemas are merged, this map associates each
     * input record with a remapping of its fields relative to the merged
     * schema. Please see AvroStorageUtils.getSchemaToMergedSchemaMap() for
     * more details.
     */
    private Map<Path, Map<Integer, Integer>> schemaToMergedSchemaMap;

    /** Multiple-schemas flag; kept here only so it can be forwarded to the record reader. */
    private boolean useMultipleSchemas;

    /**
     * Creates an input format with no reader schema, no schema remapping,
     * and both flags switched off.
     */
    public PigAvroInputFormat() {
        this(null, false, null, false);
    }

    /**
     * Constructor called by AvroStorage to pass in the reader settings.
     *
     * @param readerSchema reader schema
     * @param ignoreBadFiles whether to ignore corrupted files during load
     * @param schemaToMergedSchemaMap map that associates each input record
     *        with a remapping of its fields relative to the merged schema
     * @param useMultipleSchemas multiple-schemas flag forwarded to the
     *        record reader
     */
    public PigAvroInputFormat(Schema readerSchema, boolean ignoreBadFiles,
            Map<Path, Map<Integer, Integer>> schemaToMergedSchemaMap,
            boolean useMultipleSchemas) {
        this.useMultipleSchemas = useMultipleSchemas;
        this.schemaToMergedSchemaMap = schemaToMergedSchemaMap;
        this.ignoreBadFiles = ignoreBadFiles;
        this.readerSchema = readerSchema;
    }

    /**
     * Creates the Avro record reader for the given split, using the schema
     * and options passed in to the constructor. The split's string form is
     * published as the task status before the reader is built.
     *
     * @param split the split to read; it is cast to {@link FileSplit}
     * @param context the task attempt context, also passed to the reader
     * @return a new {@link PigAvroRecordReader} for the split
     */
    @Override
    public RecordReader<NullWritable, Writable> createRecordReader(
            InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        final String splitDescription = split.toString();
        context.setStatus(splitDescription);

        // The cast stays after the status update, matching the original order.
        final FileSplit fileSplit = (FileSplit) split;
        return new PigAvroRecordReader(
                context,
                fileSplit,
                readerSchema,
                ignoreBadFiles,
                schemaToMergedSchemaMap,
                useMultipleSchemas);
    }
}