package com.linkedin.thirdeye.hadoop.join;

import static com.linkedin.thirdeye.hadoop.join.JoinPhaseConstants.*;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.StringWriter;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyOutputFormat;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.CounterGroup;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;

/**
 * This is a generic join job that can be used to prepare the data for ThirdEye.
 * Many teams just need a way to join multiple data sets into one. Today this is
 * typically done with a Pig script, which is highly inefficient because it joins
 * the sources pairwise. The idea here is as follows: there are N named sources,
 * and a join key that is common across all of them. <br/>
 * S1: join key s1_key <br/>
 * S2: join key s2_key <br/>
 * ... <br/>
 * SN: join key sn_key <br/>
 * Each record is tagged with its join key by a {@code JoinKeyExtractor}, the records
 * are grouped by that key, and a {@code JoinUDF} combines each group into output
 * records (see the illustrative sketches in the comments below).
 */
public class JoinPhaseJob extends Configured {
  private static final Logger LOGGER = LoggerFactory.getLogger(JoinPhaseJob.class);
  private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

  private String name;
  private Properties props;

  public JoinPhaseJob(String name, Properties props) {
    super(new Configuration());
    this.name = name;
    this.props = props;
  }

  public static class GenericJoinMapper
      extends Mapper<AvroKey<GenericRecord>, NullWritable, BytesWritable, BytesWritable> {

    String sourceName;
    JoinKeyExtractor joinKeyExtractor;

    @Override
    public void setup(Context context) throws IOException, InterruptedException {
      LOGGER.info("GenericAvroJoinJob.GenericJoinMapper.setup()");
      FileSplit fileSplit = (FileSplit) context.getInputSplit();
      LOGGER.info("split name:" + fileSplit.toString());
      Configuration configuration = context.getConfiguration();

      try {
        sourceName = DelegatingAvroKeyInputFormat.getSourceNameFromPath(fileSplit, configuration);
        LOGGER.info("Input: {} belongs to Source:{}", fileSplit, sourceName);
        String joinKeyExtractorClass = configuration.get(JOIN_KEY_EXTRACTOR_CLASS.toString());
        // Collect the optional per-source extractor configs, keyed by source name.
        Map<String, String> params = new HashMap<>();
        List<String> sourceNames =
            Lists.newArrayList(configuration.get(JOIN_SOURCE_NAMES.toString()).split(","));
        for (String sourceName : sourceNames) {
          String joinKeyExtractorConfig =
              configuration.get(sourceName + "." + JOIN_KEY_EXTRACTOR_CONFIG.toString());
          if (StringUtils.isNotBlank(joinKeyExtractorConfig)) {
            params.put(sourceName, joinKeyExtractorConfig);
          }
        }
        LOGGER.info("Initializing JoinKeyExtractorClass:{} with params:{}", joinKeyExtractorClass,
            params);
        // The extractor is instantiated reflectively through a Map<String, String> constructor.
        Constructor<?> constructor = Class.forName(joinKeyExtractorClass).getConstructor(Map.class);
        joinKeyExtractor = (JoinKeyExtractor) constructor.newInstance(params);
      } catch (Exception e) {
        throw new IOException(e);
      }
    }
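    /*
     * For illustration only: a minimal JoinKeyExtractor along the lines of what setup()
     * instantiates above. The constructor shape (a single Map<String, String> of per-source
     * config strings) and the extractJoinKey(String, GenericRecord) signature are inferred
     * from how the extractor is created and called in this class; this sketch additionally
     * assumes the per-source config string names the join key field, which is a hypothetical
     * convention rather than part of the actual contract.
     *
     *   public class FieldBasedJoinKeyExtractor implements JoinKeyExtractor {
     *     private final Map<String, String> sourceToKeyField;
     *
     *     public FieldBasedJoinKeyExtractor(Map<String, String> params) {
     *       this.sourceToKeyField = params;
     *     }
     *
     *     @Override
     *     public String extractJoinKey(String sourceName, GenericRecord record) {
     *       String keyField = sourceToKeyField.get(sourceName);
     *       Object value = (keyField == null) ? null : record.get(keyField);
     *       // map() drops records whose join key is the literal string "INVALID".
     *       return value == null ? "INVALID" : value.toString();
     *     }
     *   }
     */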
    @Override
    public void map(AvroKey<GenericRecord> recordWrapper, NullWritable value, Context context)
        throws IOException, InterruptedException {
      GenericRecord record = recordWrapper.datum();
      MapOutputValue mapOutputValue = new MapOutputValue(record.getSchema().getName(), record);
      String joinKeyValue = joinKeyExtractor.extractJoinKey(sourceName, record);
      LOGGER.info("Join Key:{}", joinKeyValue);
      if (!"INVALID".equals(joinKeyValue)) {
        context.write(new BytesWritable(joinKeyValue.getBytes()),
            new BytesWritable(mapOutputValue.toBytes()));
      }
    }
  }

  public static class GenericJoinReducer
      extends Reducer<BytesWritable, BytesWritable, AvroKey<GenericRecord>, NullWritable> {

    String statOutputDir;
    private FileSystem fileSystem;
    private static TypeReference MAP_STRING_STRING_TYPE = new TypeReference<Map<String, String>>() {
    };
    private Map<String, Schema> schemaMap = new HashMap<String, Schema>();
    private JoinUDF joinUDF;
    private Map<String, AtomicInteger> countersMap = new HashMap<String, AtomicInteger>();
    private List<String> sourceNames;

    @Override
    public void setup(Context context) throws IOException, InterruptedException {
      Configuration configuration = context.getConfiguration();
      fileSystem = FileSystem.get(configuration);

      try {
        Map<String, String> schemaJSONMapping = new ObjectMapper().readValue(
            context.getConfiguration().get("schema.json.mapping"), MAP_STRING_STRING_TYPE);

        LOGGER.info("Schema JSON Mapping: {}", schemaJSONMapping);
        for (String sourceName : schemaJSONMapping.keySet()) {
          Schema schema = new Schema.Parser().parse(schemaJSONMapping.get(sourceName));
          schemaMap.put(sourceName, schema);
        }

        sourceNames =
            Lists.newArrayList(configuration.get(JOIN_SOURCE_NAMES.toString()).split(","));

        String joinUDFClass = configuration.get(JOIN_UDF_CLASS.toString());
        Map<String, String> params = new HashMap<>();
        for (String sourceName : sourceNames) {
          String joinUdfConfig = configuration.get(sourceName + "." + JOIN_UDF_CONFIG.toString());
          if (StringUtils.isNotBlank(joinUdfConfig)) {
            params.put(sourceName, joinUdfConfig);
          }
        }

        Constructor<?> constructor = Class.forName(joinUDFClass).getConstructor(Map.class);
        LOGGER.info("Initializing JoinUDFClass:{} with params:{}", joinUDFClass, params);
        joinUDF = (JoinUDF) constructor.newInstance(params);

        String outputSchemaPath = configuration.get(JOIN_OUTPUT_SCHEMA.toString());
        // Avro schema
        Schema.Parser parser = new Schema.Parser();
        Schema outputSchema = parser.parse(fileSystem.open(new Path(outputSchemaPath)));
        LOGGER.info("Setting outputschema:{}", outputSchema);
        joinUDF.init(outputSchema);
      } catch (Exception e) {
        throw new IOException(e);
      }
    }
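    /*
     * For illustration only: a minimal JoinUDF along the lines of what setup() instantiates
     * above. The Map<String, String> constructor, init(Schema), and
     * performJoin(String, Map<String, List<GenericRecord>>) signatures are inferred from how
     * the UDF is used in this reducer; the pass-through semantics below (one output record per
     * key, copying identically named fields from the first record of each source) are purely
     * an example. Assumes imports of org.apache.avro.generic.GenericData and
     * java.util.Collections.
     *
     *   public class PassThroughJoinUDF implements JoinUDF {
     *     private Schema outputSchema;
     *
     *     public PassThroughJoinUDF(Map<String, String> params) {
     *       // per-source config strings, keyed by source name (unused in this sketch)
     *     }
     *
     *     @Override
     *     public void init(Schema outputSchema) {
     *       this.outputSchema = outputSchema;
     *     }
     *
     *     @Override
     *     public List<GenericRecord> performJoin(String joinKey,
     *         Map<String, List<GenericRecord>> joinInput) {
     *       GenericRecord output = new GenericData.Record(outputSchema);
     *       for (List<GenericRecord> records : joinInput.values()) {
     *         GenericRecord first = records.get(0);
     *         for (Schema.Field field : outputSchema.getFields()) {
     *           if (first.getSchema().getField(field.name()) != null) {
     *             output.put(field.name(), first.get(field.name()));
     *           }
     *         }
     *       }
     *       return Collections.singletonList(output);
     *     }
     *   }
     */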
    @Override
    public void reduce(BytesWritable joinKeyWritable, Iterable<BytesWritable> recordBytesWritable,
        Context context) throws IOException, InterruptedException {
      // Group the records for this join key by the name of the schema (i.e. source) they came from.
      Map<String, List<GenericRecord>> joinInput = new HashMap<String, List<GenericRecord>>();
      for (BytesWritable writable : recordBytesWritable) {
        byte[] bytes = writable.copyBytes();
        MapOutputValue mapOutputValue = MapOutputValue.fromBytes(bytes, schemaMap);
        String schemaName = mapOutputValue.getSchemaName();
        if (!joinInput.containsKey(schemaName)) {
          joinInput.put(schemaName, new ArrayList<GenericRecord>());
        }
        joinInput.get(schemaName).add(mapOutputValue.getRecord());
      }

      // Record which sources contributed records for this key, e.g. "[1, 0, 1]",
      // so that dumpSummary() can print the join input matrix.
      int[] exists = new int[sourceNames.size()];
      for (int i = 0; i < sourceNames.size(); i++) {
        String source = sourceNames.get(i);
        if (joinInput.containsKey(source)) {
          exists[i] = 1;
        } else {
          exists[i] = 0;
        }
      }
      String counterName = Arrays.toString(exists);
      if (!countersMap.containsKey(counterName)) {
        countersMap.put(counterName, new AtomicInteger(0));
      }
      countersMap.get(counterName).incrementAndGet();

      // invoke the udf and pass in the join data
      List<GenericRecord> outputRecords =
          joinUDF.performJoin(new String(joinKeyWritable.copyBytes()), joinInput);
      if (outputRecords != null) {
        for (GenericRecord outputRecord : outputRecords) {
          context.write(new AvroKey<GenericRecord>(outputRecord), NullWritable.get());
        }
      }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
      for (String counterName : countersMap.keySet()) {
        context.getCounter("DynamicCounter", counterName)
            .increment(countersMap.get(counterName).get());
      }
    }
  }
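  /*
   * run() wires the job together from the Properties passed to the constructor. The property
   * keys are the toString() values of the JoinPhaseConstants enum (defined elsewhere in this
   * package); every property is also copied into the job Configuration so the mapper and
   * reducer can read the per-source entries:
   *
   *   JOIN_SOURCE_NAMES                       comma-separated list of source names
   *   JOIN_KEY_EXTRACTOR_CLASS                fully qualified JoinKeyExtractor implementation
   *   JOIN_UDF_CLASS                          fully qualified JoinUDF implementation
   *   JOIN_CONFIG_UDF_CLASS                   fully qualified JoinConfigUDF implementation
   *   JOIN_OUTPUT_SCHEMA                      HDFS path of the output Avro schema
   *   JOIN_OUTPUT_PATH                        HDFS output directory (deleted if it exists)
   *   <sourceName>.JOIN_INPUT_SCHEMA          HDFS path of the source's Avro schema (per source)
   *   <sourceName>.JOIN_INPUT_PATH            comma-separated input paths (per source)
   *   <sourceName>.JOIN_KEY_EXTRACTOR_CONFIG  optional extractor config string (per source)
   *   <sourceName>.JOIN_UDF_CONFIG            optional UDF config string (per source)
   *   num.reducers                            optional reducer count (defaults to 10)
   */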
  public Job run() throws Exception {
    Job job = Job.getInstance(getConf());
    Configuration conf = job.getConfiguration();
    job.setJobName(name);
    job.setJarByClass(JoinPhaseJob.class);

    FileSystem fs = FileSystem.get(conf);

    String outputSchemaPath = getAndSetConfiguration(conf, JOIN_OUTPUT_SCHEMA);
    Schema.Parser parser = new Schema.Parser();
    Schema outputSchema = parser.parse(fs.open(new Path(outputSchemaPath)));
    LOGGER.info("{}", outputSchema);

    // Set custom config like adding distributed caches
    String joinConfigUDFClass =
        getAndSetConfiguration(conf, JoinPhaseConstants.JOIN_CONFIG_UDF_CLASS);
    LOGGER.info("Initializing JoinConfigUDFClass:{}", joinConfigUDFClass);
    Constructor<?> constructor = Class.forName(joinConfigUDFClass).getConstructor();
    JoinConfigUDF joinConfigUDF = (JoinConfigUDF) constructor.newInstance();
    joinConfigUDF.setJoinConfig(job);

    getAndSetConfiguration(conf, JOIN_KEY_EXTRACTOR_CLASS);
    getAndSetConfiguration(conf, JOIN_UDF_CLASS);

    List<String> sourceNames = Lists.newArrayList(
        getAndSetConfiguration(conf, JoinPhaseConstants.JOIN_SOURCE_NAMES).split(","));

    // Map config
    job.setMapperClass(GenericJoinMapper.class);
    // AvroJob.setInputKeySchema(job, unionSchema);
    job.setInputFormatClass(DelegatingAvroKeyInputFormat.class);
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);

    // Reduce config
    job.setReducerClass(GenericJoinReducer.class);
    AvroJob.setOutputKeySchema(job, outputSchema);
    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    job.setOutputKeyClass(AvroKey.class);
    job.setOutputValueClass(NullWritable.class);

    String numReducers = props.getProperty("num.reducers");
    if (numReducers != null) {
      job.setNumReduceTasks(Integer.parseInt(numReducers));
    } else {
      job.setNumReduceTasks(10);
    }
    LOGGER.info("Setting number of reducers : " + job.getNumReduceTasks());

    Map<String, String> schemaMap = new HashMap<String, String>();
    Map<String, String> schemaPathMapping = new HashMap<String, String>();

    for (String sourceName : sourceNames) {
      // load schema for each source
      LOGGER.info("Loading Schema for {}", sourceName);
      FSDataInputStream schemaStream =
          fs.open(new Path(getAndCheck(sourceName + "." + JOIN_INPUT_SCHEMA.toString())));
      Schema schema = new Schema.Parser().parse(schemaStream);
      schemaMap.put(sourceName, schema.toString());
      LOGGER.info("Schema for {}: \n{}", sourceName, schema);

      // configure input data for each source
      String inputPathDir = getAndCheck(sourceName + "." + JOIN_INPUT_PATH.toString());
      LOGGER.info("Input path dir for " + sourceName + ": " + inputPathDir);
      for (String inputPath : inputPathDir.split(",")) {
        Path input = new Path(inputPath);
        FileStatus[] listFiles = fs.listStatus(input);
        boolean isNested = false;
        for (FileStatus fileStatus : listFiles) {
          if (fileStatus.isDirectory()) {
            isNested = true;
            Path path = fileStatus.getPath();
            LOGGER.info("Adding input:" + path);
            FileInputFormat.addInputPath(job, path);
            schemaPathMapping.put(path.toString(), sourceName);
          }
        }
        if (!isNested) {
          LOGGER.info("Adding input:" + inputPath);
          FileInputFormat.addInputPath(job, input);
          schemaPathMapping.put(input.toString(), sourceName);
        }
      }
    }

    // Make the path-to-source and source-to-schema mappings available to the tasks.
    StringWriter temp = new StringWriter();
    OBJECT_MAPPER.writeValue(temp, schemaPathMapping);
    job.getConfiguration().set("schema.path.mapping", temp.toString());

    temp = new StringWriter();
    OBJECT_MAPPER.writeValue(temp, schemaMap);
    job.getConfiguration().set("schema.json.mapping", temp.toString());

    Path outputPath = new Path(getAndCheck(JOIN_OUTPUT_PATH.toString()));
    if (fs.exists(outputPath)) {
      fs.delete(outputPath, true);
    }
    FileOutputFormat.setOutputPath(job, new Path(getAndCheck(JOIN_OUTPUT_PATH.toString())));

    // Copy all job properties into the configuration so mappers and reducers can read them.
    for (Object key : props.keySet()) {
      conf.set(key.toString(), props.getProperty(key.toString()));
    }

    job.waitForCompletion(true);

    dumpSummary(job, sourceNames);

    return job;
  }

  private void dumpSummary(Job job, List<String> sourceNames) throws IOException {
    System.out.println("Join Input Matrix.");
    CounterGroup group = job.getCounters().getGroup("DynamicCounter");
    for (String source : sourceNames) {
      System.out.print(String.format("%25s\t", source));
    }
    if (group != null) {
      Iterator<Counter> iterator = group.iterator();
      while (iterator.hasNext()) {
        Counter counter = iterator.next();
        String displayName = counter.getDisplayName();
        // Counter names look like "[1, 0, 1]"; strip the brackets and print one column per source.
        String[] split = displayName.replace("[", "").replace("]", "").split(",");
        for (String str : split) {
          if (str.trim().equals("1")) {
            System.out.print(String.format("%25s\t", "1"));
          } else {
            System.out.print(String.format("%25s\t", "-"));
          }
        }
      }
    }
  }

  private String getAndSetConfiguration(Configuration configuration, JoinPhaseConstants constant) {
    String value = getAndCheck(constant.toString());
    configuration.set(constant.toString(), value);
    return value;
  }

  private String getAndCheck(String propName) {
    String propValue = props.getProperty(propName);
    if (propValue == null) {
      throw new IllegalArgumentException(propName + " is a required property");
    }
    return propValue;
  }
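  /*
   * Typical invocation (the jar name below is illustrative, not part of this file):
   *
   *   hadoop jar thirdeye-hadoop.jar com.linkedin.thirdeye.hadoop.join.JoinPhaseJob join.properties
   *
   * where join.properties supplies the JoinPhaseConstants properties listed above run().
   */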
  public static void main(String[] args) throws Exception {
    if (args.length != 1) {
      throw new IllegalArgumentException("usage: config.properties");
    }

    Properties props = new Properties();
    props.load(new FileInputStream(args[0]));

    JoinPhaseJob job = new JoinPhaseJob("aggregate_avro_job", props);
    job.run();
  }
}