/**
* Copyright (C) 2014-2015 LinkedIn Corp. (pinot-core@linkedin.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.linkedin.thirdeye.hadoop.topk;
import static com.linkedin.thirdeye.hadoop.topk.TopKPhaseConstants.TOPK_PHASE_INPUT_PATH;
import static com.linkedin.thirdeye.hadoop.topk.TopKPhaseConstants.TOPK_PHASE_OUTPUT_PATH;
import static com.linkedin.thirdeye.hadoop.topk.TopKPhaseConstants.TOPK_PHASE_THIRDEYE_CONFIG;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.Set;
import com.linkedin.thirdeye.hadoop.config.MetricType;
import com.linkedin.thirdeye.hadoop.config.ThirdEyeConfigProperties;
import com.linkedin.thirdeye.hadoop.config.ThirdEyeConstants;
import com.linkedin.thirdeye.hadoop.config.TopKDimensionToMetricsSpec;
import com.linkedin.thirdeye.hadoop.config.ThirdEyeConfig;
import com.linkedin.thirdeye.hadoop.util.ThirdeyeAggregateMetricUtils;
import com.linkedin.thirdeye.hadoop.util.ThirdeyeAvroUtils;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.commons.collections.MapUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.MinMaxPriorityQueue;
/**
 * This phase reads avro input and produces a file containing the top k values
 * for each dimension.
 *
 * Map:
 * The map phase reads avro records and, for each record, emits
 * Key=(Dimension Name, Dimension Value) Value=(Metrics)
 * For each record, the map also emits a single
 * Key=(ALL, ALL) Value=(Metrics)
 * which is used for computing the global metric sums in the reduce phase.
 *
 * Combine:
 * The combine phase receives Key=(Dimension Name, Dimension Value) pairs
 * from each map and aggregates their metric values, cutting down the
 * traffic sent to the reducer.
 *
 * Reduce:
 * Exactly one reducer is used.
 * The reduce phase receives Key=(Dimension Name, Dimension Value) pairs
 * and aggregates their metric values.
 * The very first key received is (ALL, ALL), carrying the total metric sums.
 * These sums are used to check the metric thresholds of the other
 * (dimension name, dimension value) pairs; a pair that passes none of the
 * metric thresholds is discarded.
 * In cleanup, the top k dimension values are picked for each dimension
 * based on metric value, and written to a file.
*
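 * Usage (a minimal sketch; the exact property names come from TopKPhaseConstants
 * and ThirdEyeConfigProperties, and the input/output dirs are placeholders):
 *
 *   Properties props = new Properties();
 *   props.setProperty(TOPK_PHASE_INPUT_PATH.toString(), avroInputDir);
 *   props.setProperty(TOPK_PHASE_OUTPUT_PATH.toString(), topkOutputDir);
 *   // ...plus the properties needed by ThirdEyeConfig.fromProperties
 *   Job job = new TopKPhaseJob("topk_phase", props).run();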
*/
public class TopKPhaseJob extends Configured {
private static final Logger LOGGER = LoggerFactory.getLogger(TopKPhaseJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
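  // Sentinel (dimension name, dimension value) pair under which per-record metric
  // totals are emitted. Assuming the serialized keys compare lexicographically, "0"
  // sorts ahead of real dimension names, so the single reducer sees the (ALL, ALL)
  // global sums before any other key.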
private static final String TOPK_ALL_DIMENSION_NAME = "0";
private static final String TOPK_ALL_DIMENSION_VALUE = "0";
private String name;
private Properties props;
  /**
   * @param name job name
   * @param props job properties; must contain the topk phase input and output
   *              paths, plus the properties needed by ThirdEyeConfig.fromProperties
   */
public TopKPhaseJob(String name, Properties props) {
super(new Configuration());
this.name = name;
this.props = props;
}
public static class TopKPhaseMapper
extends Mapper<AvroKey<GenericRecord>, NullWritable, BytesWritable, BytesWritable> {
private TopKPhaseConfig config;
ThirdEyeConfig thirdeyeConfig;
private List<String> dimensionNames;
private List<String> metricNames;
private List<MetricType> metricTypes;
private int numMetrics;
BytesWritable keyWritable;
BytesWritable valWritable;
Map<String, Integer> dimensionNameToIndexMapping;
@Override
public void setup(Context context) throws IOException, InterruptedException {
LOGGER.info("TopKPhaseJob.TopKPhaseMapper.setup()");
Configuration configuration = context.getConfiguration();
try {
thirdeyeConfig = OBJECT_MAPPER.readValue(configuration.get(TOPK_PHASE_THIRDEYE_CONFIG.toString()), ThirdEyeConfig.class);
config = TopKPhaseConfig.fromThirdEyeConfig(thirdeyeConfig);
dimensionNames = config.getDimensionNames();
metricNames = config.getMetricNames();
metricTypes = config.getMetricTypes();
numMetrics = metricNames.size();
valWritable = new BytesWritable();
keyWritable = new BytesWritable();
dimensionNameToIndexMapping = new HashMap<String, Integer>();
for (int i = 0; i < dimensionNames.size(); i++) {
dimensionNameToIndexMapping.put(dimensionNames.get(i), i);
}
} catch (Exception e) {
throw new IOException(e);
}
}
@Override
public void map(AvroKey<GenericRecord> key, NullWritable value, Context context)
throws IOException, InterruptedException {
// input record
GenericRecord inputRecord = key.datum();
// read metrics
Number[] metricValues = new Number[numMetrics];
for (int i = 0; i < numMetrics; i++) {
String metricName = metricNames.get(i);
Number metricValue = ThirdeyeAvroUtils.getMetricFromRecord(inputRecord, metricName);
metricValues[i] = metricValue;
}
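      // Serialize the metric values once; the same value bytes are reused for every
      // key emitted for this record below.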
TopKPhaseMapOutputValue valWrapper = new TopKPhaseMapOutputValue(metricValues, metricTypes);
byte[] valBytes = valWrapper.toBytes();
valWritable.set(valBytes, 0, valBytes.length);
      // read dimensions and emit (dimension name, dimension value) -> metrics
      for (String dimensionName : dimensionNames) {
        String dimensionValue = ThirdeyeAvroUtils.getDimensionFromRecord(inputRecord, dimensionName);
        TopKPhaseMapOutputKey keyWrapper = new TopKPhaseMapOutputKey(dimensionName, dimensionValue);
        byte[] keyBytes = keyWrapper.toBytes();
        keyWritable.set(keyBytes, 0, keyBytes.length);
        context.write(keyWritable, valWritable);
      }
      // emit (ALL, ALL) -> metrics once per record, not once per dimension, so that
      // the reducer's global metric sums match the record totals described in the
      // class javadoc
      TopKPhaseMapOutputKey allKeyWrapper = new TopKPhaseMapOutputKey(TOPK_ALL_DIMENSION_NAME, TOPK_ALL_DIMENSION_VALUE);
      byte[] allKeyBytes = allKeyWrapper.toBytes();
      keyWritable.set(allKeyBytes, 0, allKeyBytes.length);
      context.write(keyWritable, valWritable);
}
}
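  /**
   * Combiner: applies the same aggregation as the reducer. This is safe assuming
   * ThirdeyeAggregateMetricUtils.aggregate is associative and commutative per metric
   * (e.g. a plain sum), so partial aggregates combine to the same final result.
   */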
public static class TopKPhaseCombiner
extends Reducer<BytesWritable, BytesWritable, BytesWritable, BytesWritable> {
private TopKPhaseConfig config;
ThirdEyeConfig thirdeyeConfig;
private List<MetricType> metricTypes;
private int numMetrics;
BytesWritable keyWritable;
BytesWritable valWritable;
@Override
public void setup(Context context) throws IOException, InterruptedException {
LOGGER.info("TopKPhaseJob.TopKPhaseCombiner.setup()");
Configuration configuration = context.getConfiguration();
try {
thirdeyeConfig = OBJECT_MAPPER.readValue(configuration.get(TOPK_PHASE_THIRDEYE_CONFIG.toString()), ThirdEyeConfig.class);
config = TopKPhaseConfig.fromThirdEyeConfig(thirdeyeConfig);
metricTypes = config.getMetricTypes();
numMetrics = metricTypes.size();
valWritable = new BytesWritable();
keyWritable = new BytesWritable();
} catch (Exception e) {
throw new IOException(e);
}
}
@Override
public void reduce(BytesWritable key, Iterable<BytesWritable> values, Context context)
throws IOException, InterruptedException {
Number[] aggMetricValues = new Number[numMetrics];
Arrays.fill(aggMetricValues, 0);
for (BytesWritable value : values) {
TopKPhaseMapOutputValue valWrapper = TopKPhaseMapOutputValue.fromBytes(value.getBytes(), metricTypes);
Number[] metricValues = valWrapper.getMetricValues();
ThirdeyeAggregateMetricUtils.aggregate(metricTypes, aggMetricValues, metricValues);
}
TopKPhaseMapOutputValue valWrapper = new TopKPhaseMapOutputValue(aggMetricValues, metricTypes);
byte[] valBytes = valWrapper.toBytes();
valWritable.set(valBytes, 0, valBytes.length);
context.write(key, valWritable);
}
}
public static class TopKPhaseReducer
extends Reducer<BytesWritable, BytesWritable, NullWritable, NullWritable> {
private FileSystem fileSystem;
private Configuration configuration;
private ThirdEyeConfig thirdeyeConfig;
private TopKPhaseConfig config;
private List<String> dimensionNames;
private List<String> metricNames;
private List<MetricType> metricTypes;
private Map<String, Integer> metricToIndexMapping;
private int numMetrics;
BytesWritable keyWritable;
BytesWritable valWritable;
Number[] metricSums;
private Map<String, Map<String, Number[]>> dimensionNameToValuesMap;
private TopKDimensionValues topkDimensionValues;
private Map<String, Double> metricThresholds;
private Map<String, Integer> thresholdPassCount;
private Map<String, TopKDimensionToMetricsSpec> topKDimensionToMetricsSpecMap;
private Map<String, Set<String>> whitelist;
@Override
public void setup(Context context) throws IOException, InterruptedException {
LOGGER.info("TopKPhaseJob.TopKPhaseReducer.setup()");
configuration = context.getConfiguration();
fileSystem = FileSystem.get(configuration);
try {
thirdeyeConfig = OBJECT_MAPPER.readValue(configuration.get(TOPK_PHASE_THIRDEYE_CONFIG.toString()), ThirdEyeConfig.class);
config = TopKPhaseConfig.fromThirdEyeConfig(thirdeyeConfig);
metricThresholds = config.getMetricThresholds();
topKDimensionToMetricsSpecMap = config.getTopKDimensionToMetricsSpec();
dimensionNames = config.getDimensionNames();
metricNames = config.getMetricNames();
metricTypes = config.getMetricTypes();
whitelist = config.getWhitelist();
numMetrics = metricNames.size();
metricToIndexMapping = new HashMap<>();
for (int i = 0; i < numMetrics; i ++) {
metricToIndexMapping.put(metricNames.get(i), i);
}
dimensionNameToValuesMap = new HashMap<>();
thresholdPassCount = new HashMap<>();
for (String dimension : dimensionNames) {
dimensionNameToValuesMap.put(dimension, new HashMap<String, Number[]>());
thresholdPassCount.put(dimension, 0);
}
topkDimensionValues = new TopKDimensionValues();
keyWritable = new BytesWritable();
valWritable = new BytesWritable();
} catch (Exception e) {
throw new IOException(e);
}
}
@Override
public void reduce(BytesWritable key, Iterable<BytesWritable> values,
Context context) throws IOException, InterruptedException {
TopKPhaseMapOutputKey keyWrapper = TopKPhaseMapOutputKey.fromBytes(key.getBytes());
String dimensionName = keyWrapper.getDimensionName();
String dimensionValue = keyWrapper.getDimensionValue();
// Get aggregate metric values for dimension name value pair
Number[] aggMetricValues = new Number[numMetrics];
Arrays.fill(aggMetricValues, 0);
for (BytesWritable value : values) {
TopKPhaseMapOutputValue valWrapper = TopKPhaseMapOutputValue.fromBytes(value.getBytes(), metricTypes);
Number[] metricValues = valWrapper.getMetricValues();
ThirdeyeAggregateMetricUtils.aggregate(metricTypes, aggMetricValues, metricValues);
}
// Metric sums case
if (dimensionName.equals(TOPK_ALL_DIMENSION_NAME) && dimensionValue.equals(TOPK_ALL_DIMENSION_VALUE)) {
LOGGER.info("Setting metric sums");
        metricSums = Arrays.copyOf(aggMetricValues, numMetrics);
return;
}
      // Check metric percentage thresholds, if any are configured. A pair passes if
      // at least one of its metrics exceeds the configured percentage of that
      // metric's global sum.
      if (MapUtils.isNotEmpty(metricThresholds)) {
        boolean isPassThreshold = false;
        for (int i = 0; i < numMetrics; i++) {
          String metric = metricNames.get(i);
          double metricValue = aggMetricValues[i].doubleValue();
          double metricSum = metricSums[i].doubleValue();
          double metricThresholdPercentage = metricThresholds.get(metric);
          if (metricValue > (metricSum * metricThresholdPercentage / 100)) {
            isPassThreshold = true;
            thresholdPassCount.put(dimensionName, thresholdPassCount.get(dimensionName) + 1);
            break;
          }
        }
        if (!isPassThreshold) {
          return;
        }
      }
      // Retain the pair for top k selection in cleanup. When no thresholds are
      // configured, every pair is retained.
      dimensionNameToValuesMap.get(dimensionName).put(dimensionValue, aggMetricValues);
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
for (String dimension : dimensionNames) {
LOGGER.info("{} records passed metric threshold for dimension {}", thresholdPassCount.get(dimension), dimension);
// Get top k
TopKDimensionToMetricsSpec topkSpec = topKDimensionToMetricsSpecMap.get(dimension);
if (topkSpec != null && topkSpec.getDimensionName() != null && topkSpec.getTopk() != null) {
// Get top k for each metric specified
Map<String, Integer> topkMetricsMap = topkSpec.getTopk();
for (Entry<String, Integer> topKEntry : topkMetricsMap.entrySet()) {
String metric = topKEntry.getKey();
int k = topKEntry.getValue();
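          // MinMaxPriorityQueue capped at k evicts its greatest element once full;
          // DimensionValueMetricPair is assumed to order pairs with larger metric
          // values first, so the queue retains the top k values for this metric.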
MinMaxPriorityQueue<DimensionValueMetricPair> topKQueue = MinMaxPriorityQueue.maximumSize(k).create();
Map<String, Number[]> dimensionToMetricsMap = dimensionNameToValuesMap.get(dimension);
for (Entry<String, Number[]> entry : dimensionToMetricsMap.entrySet()) {
topKQueue.add(new DimensionValueMetricPair(entry.getKey(), entry.getValue()[metricToIndexMapping.get(metric)]));
}
LOGGER.info("Picking Top {} values for {} based on Metric {} : {}", k, dimension, metric, topKQueue);
for (DimensionValueMetricPair pair : topKQueue) {
topkDimensionValues.addValue(dimension, pair.getDimensionValue());
}
}
}
}
if (topkDimensionValues.getTopKDimensions().size() > 0) {
String topkValuesPath = configuration.get(TOPK_PHASE_OUTPUT_PATH.toString());
LOGGER.info("Writing top k values to {}",topkValuesPath);
FSDataOutputStream topKDimensionValuesOutputStream = fileSystem.create(
new Path(topkValuesPath + File.separator + ThirdEyeConstants.TOPK_VALUES_FILE));
OBJECT_MAPPER.writeValue((DataOutput) topKDimensionValuesOutputStream, topkDimensionValues);
topKDimensionValuesOutputStream.close();
}
}
}
public Job run() throws Exception {
Job job = Job.getInstance(getConf());
job.setJobName(name);
job.setJarByClass(TopKPhaseJob.class);
Configuration configuration = job.getConfiguration();
FileSystem fs = FileSystem.get(configuration);
// Properties
LOGGER.info("Properties {}", props);
// Input Path
String inputPathDir = getAndSetConfiguration(configuration, TOPK_PHASE_INPUT_PATH);
LOGGER.info("Input path dir: " + inputPathDir);
for (String inputPath : inputPathDir.split(ThirdEyeConstants.FIELD_SEPARATOR)) {
LOGGER.info("Adding input:" + inputPath);
Path input = new Path(inputPath);
FileInputFormat.addInputPath(job, input);
}
// Output path
Path outputPath = new Path(getAndSetConfiguration(configuration, TOPK_PHASE_OUTPUT_PATH));
LOGGER.info("Output path dir: " + outputPath.toString());
if (fs.exists(outputPath)) {
fs.delete(outputPath, true);
}
FileOutputFormat.setOutputPath(job, outputPath);
// Schema
Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir);
LOGGER.info("Schema : {}", avroSchema.toString(true));
// ThirdEyeConfig
String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty(
props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()),
props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema);
props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty);
ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
LOGGER.info("Thirdeye Config {}", thirdeyeConfig.encode());
job.getConfiguration().set(TOPK_PHASE_THIRDEYE_CONFIG.toString(), OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));
// Map config
job.setMapperClass(TopKPhaseMapper.class);
job.setInputFormatClass(AvroKeyInputFormat.class);
job.setMapOutputKeyClass(BytesWritable.class);
job.setMapOutputValueClass(BytesWritable.class);
// Combiner
job.setCombinerClass(TopKPhaseCombiner.class);
// Reduce config
job.setReducerClass(TopKPhaseReducer.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(NullWritable.class);
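    // Exactly one reducer: the (ALL, ALL) global sums, the threshold checks, and the
    // top k selection must all happen in a single task over the full key space.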
job.setNumReduceTasks(1);
    if (!job.waitForCompletion(true)) {
      throw new RuntimeException("TopK phase job failed : " + name);
    }
return job;
}
private String getAndSetConfiguration(Configuration configuration,
TopKPhaseConstants constant) {
String value = getAndCheck(constant.toString());
configuration.set(constant.toString(), value);
return value;
}
private String getAndCheck(String propName) {
String propValue = props.getProperty(propName);
if (propValue == null) {
      throw new IllegalArgumentException(propName + " is a required property");
}
return propValue;
}
}