package com.linkedin.thirdeye.hadoop.transform;

import java.io.IOException;
import java.util.Map;

import org.apache.avro.Schema;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.avro.mapreduce.AvroKeyRecordReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.map.JsonMappingException;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.type.TypeReference;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * An {@link AvroKeyInputFormat} that chooses the Avro reader schema per input split.
 *
 * <p>Two JSON-encoded {@code Map<String, String>} values are read from the job
 * {@link Configuration}:
 * <ul>
 *   <li>{@code schema.path.mapping}: path fragment -&gt; source name</li>
 *   <li>{@code schema.json.mapping}: source name -&gt; Avro schema JSON</li>
 * </ul>
 * The split's file path is matched against the path fragments to find the source name, and the
 * schema registered for that source name is handed to the record reader.
 *
 * @param <T> the Avro datum type produced by the record reader
 */
public class DelegatingAvroKeyInputFormat<T> extends AvroKeyInputFormat<T> {

  private static final Logger LOGGER = LoggerFactory.getLogger(DelegatingAvroKeyInputFormat.class);

  /** Configuration key holding the JSON map of source name to Avro schema JSON. */
  private static final String SCHEMA_JSON_MAPPING_KEY = "schema.json.mapping";

  /** Configuration key holding the JSON map of path fragment to source name. */
  private static final String SCHEMA_PATH_MAPPING_KEY = "schema.path.mapping";

  /** Shared mapper; no per-call configuration is applied, so a single instance is reused. */
  private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

  private static final TypeReference<Map<String, String>> MAP_STRING_STRING_TYPE =
      new TypeReference<Map<String, String>>() {
      };

  /**
   * Creates a record reader whose reader schema is the one registered for the source that the
   * split's file belongs to.
   *
   * @param split the input split; it is cast to {@link FileSplit}
   * @param context the task context supplying the job configuration
   * @return an {@link AvroKeyRecordReader} built with the resolved schema
   * @throws IOException if either mapping is missing or unparsable, if no source name matches
   *         the split path, or if no schema is registered for the matched source name
   * @throws InterruptedException declared by the overridden method
   */
  @Override
  public org.apache.hadoop.mapreduce.RecordReader<org.apache.avro.mapred.AvroKey<T>, NullWritable> createRecordReader(
      InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    LOGGER.info("DelegatingAvroKeyInputFormat.createRecordReader() for split:{}", split);
    FileSplit fileSplit = (FileSplit) split;
    Configuration configuration = context.getConfiguration();

    String sourceName = getSourceNameFromPath(fileSplit, configuration);
    LOGGER.info("Source Name for path {} : {}", fileSplit.getPath(), sourceName);
    if (sourceName == null) {
      // Fail with a descriptive error instead of a NullPointerException from the schema parser.
      throw new IOException("No source name in '" + SCHEMA_PATH_MAPPING_KEY
          + "' matches split path " + fileSplit.getPath());
    }

    Map<String, String> schemaJSONMapping = readStringMap(configuration, SCHEMA_JSON_MAPPING_KEY);
    LOGGER.info("Schema JSON Mapping: {}", schemaJSONMapping);

    String sourceSchemaJSON = schemaJSONMapping.get(sourceName);
    if (sourceSchemaJSON == null) {
      throw new IOException("No schema registered in '" + SCHEMA_JSON_MAPPING_KEY
          + "' for source '" + sourceName + "' (split path " + fileSplit.getPath() + ")");
    }

    Schema schema = new Schema.Parser().parse(sourceSchemaJSON);
    return new AvroKeyRecordReader<T>(schema);
  }

  /**
   * Resolves the source name for a split by looking for a configured path fragment that occurs
   * anywhere inside the split's path string.
   *
   * <p>The first matching entry, in the iteration order of the parsed mapping, wins.
   *
   * @param fileSplit the split whose path is matched
   * @param configuration the configuration holding {@code schema.path.mapping}
   * @return the source name of the first matching entry, or {@code null} if none matches
   * @throws IOException if {@code schema.path.mapping} is not set or cannot be read
   * @throws JsonParseException if the mapping is not well-formed JSON
   * @throws JsonMappingException if the mapping JSON cannot be bound to a string map
   */
  public static String getSourceNameFromPath(FileSplit fileSplit, Configuration configuration)
      throws IOException, JsonParseException, JsonMappingException {
    Map<String, String> schemaPathMapping = readStringMap(configuration, SCHEMA_PATH_MAPPING_KEY);
    LOGGER.info("Schema Path Mapping: {}", schemaPathMapping);

    // The split path does not change while scanning, so stringify it once.
    String splitPath = fileSplit.getPath().toString();
    for (Map.Entry<String, String> entry : schemaPathMapping.entrySet()) {
      if (splitPath.contains(entry.getKey())) {
        return entry.getValue();
      }
    }
    return null;
  }

  /**
   * Reads a configuration value and parses it as a JSON object of string keys to string values.
   *
   * @param configuration the configuration to read from
   * @param key the configuration key
   * @return the parsed map, never {@code null}
   * @throws IOException if the key is not set, the JSON is invalid, or the JSON value is null
   */
  private static Map<String, String> readStringMap(Configuration configuration, String key)
      throws IOException {
    String content = configuration.get(key);
    if (content == null) {
      throw new IOException("Required configuration '" + key + "' is not set");
    }
    Map<String, String> mapping = OBJECT_MAPPER.readValue(content, MAP_STRING_STRING_TYPE);
    if (mapping == null) {
      // A literal JSON null deserializes to a null reference.
      throw new IOException("Configuration '" + key + "' must be a JSON object but was null");
    }
    return mapping;
  }
}