package com.linkedin.thirdeye.hadoop.join;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.linkedin.thirdeye.hadoop.join.GenericJoinUDFConfig.Field;

public class GenericJoinUDF implements JoinUDF {

  private static final Logger LOGGER = LoggerFactory.getLogger(GenericJoinUDF.class);

  private GenericJoinUDFConfig config;
  private Schema outputSchema;
  private List<Field> fields;

  public GenericJoinUDF(Map<String, String> params) {
    LOGGER.info("Initializing GenericJoinUDF with params: {}", params);
    this.config = new GenericJoinUDFConfig(params);
    this.fields = config.getFields();
  }

  @Override
  public void init(Schema outputSchema) {
    this.outputSchema = outputSchema;
  }

  /**
   * Trivial implementation of a generic join UDF. Assumes each field has the
   * same data type in the source events and in the output schema.
   */
  @Override
  public List<GenericRecord> performJoin(Object joinKeyVal,
      Map<String, List<GenericRecord>> joinInput) {
    List<GenericRecord> outputRecords = new ArrayList<>();
    GenericRecord outputRecord = new GenericData.Record(outputSchema);
    for (Field field : fields) {
      Object value = null;
      // Look for the field in each of its source events, in order, and stop
      // as soon as a non-null value is found.
      for (String source : field.sourceEvents) {
        List<GenericRecord> list = joinInput.get(source);
        if (list != null && !list.isEmpty()) {
          for (GenericRecord record : list) {
            value = record.get(field.name);
            if (value != null) {
              break;
            }
          }
        }
        if (value != null) {
          break;
        }
      }
      if (value != null) {
        outputRecord.put(field.name, value);
      }
    }
    outputRecords.add(outputRecord);
    return outputRecords;
  }
}
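/*
 * Illustrative walkthrough of performJoin (not part of the original class;
 * the source names, field name, and values below are hypothetical):
 *
 * Suppose fields = [ country(sourceEvents: [profile, activity]) ] and the
 * framework passes in
 *
 *   joinInput = {
 *     "profile"  -> [ { country: null } ],
 *     "activity" -> [ { country: "us" } ]
 *   }
 *
 * performJoin scans the "profile" records first and finds only null values,
 * so it falls through to "activity", picks up "us", and stops. The single
 * output record is { country: "us" }: a source listed earlier in a field's
 * sourceEvents takes precedence whenever it holds a non-null value.
 */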