package com.linkedin.thirdeye.rootcause.impl;
import com.linkedin.thirdeye.client.DAORegistry;
import com.linkedin.thirdeye.client.MetricExpression;
import com.linkedin.thirdeye.client.ThirdEyeCacheRegistry;
import com.linkedin.thirdeye.client.cache.QueryCache;
import com.linkedin.thirdeye.client.diffsummary.Cube;
import com.linkedin.thirdeye.client.diffsummary.DimNameValueCostEntry;
import com.linkedin.thirdeye.client.diffsummary.Dimensions;
import com.linkedin.thirdeye.client.diffsummary.OLAPDataBaseClient;
import com.linkedin.thirdeye.client.diffsummary.PinotThirdEyeSummaryClient;
import com.linkedin.thirdeye.constant.MetricAggFunction;
import com.linkedin.thirdeye.dashboard.Utils;
import com.linkedin.thirdeye.dataframe.DataFrame;
import com.linkedin.thirdeye.dataframe.DoubleSeries;
import com.linkedin.thirdeye.dataframe.Series;
import com.linkedin.thirdeye.dataframe.StringSeries;
import com.linkedin.thirdeye.datalayer.bao.DatasetConfigManager;
import com.linkedin.thirdeye.datalayer.bao.MetricConfigManager;
import com.linkedin.thirdeye.datalayer.dto.DatasetConfigDTO;
import com.linkedin.thirdeye.datalayer.dto.MetricConfigDTO;
import com.linkedin.thirdeye.rootcause.Pipeline;
import com.linkedin.thirdeye.rootcause.PipelineContext;
import com.linkedin.thirdeye.rootcause.PipelineResult;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Pipeline for identifying relevant dimensions by performing
* contribution analysis. The pipeline first fetches the Current and Baseline entities and
* MetricEntities in the search context. It then maps the metrics to ThirdEye's internal database
* and performs contribution analysis using a {@code DimensionScorer}.
*
* @see DimensionScorer
*/
public class DimensionAnalysisPipeline extends Pipeline {
  private static final Logger LOG = LoggerFactory.getLogger(DimensionAnalysisPipeline.class);

  /** Property key read by the PipelineLoader constructor to size its thread pool. */
  public static final String PROP_PARALLELISM = "parallelism";

  /** Value used for {@code PROP_PARALLELISM} when the property is not provided. */
  public static final String PROP_PARALLELISM_DEFAULT = "1";

  /** Maximum time, in milliseconds, to wait for each individual scoring task. */
  public static final long TIMEOUT = 120000;

  // Column names of the intermediate and result data frames
  private static final String KEY = "key";
  static final String DIMENSION = "dimension";
  static final String COST = "cost";
  static final String VALUE = "value";

  private final QueryCache cache;
  private final MetricConfigManager metricDAO;
  private final DatasetConfigManager datasetDAO;
  private final ExecutorService executor;

  /**
   * Constructor for dependency injection
   *
   * @param outputName pipeline output name
   * @param inputNames input pipeline names
   * @param metricDAO metric config DAO
   * @param datasetDAO dataset config DAO
   * @param cache query cache for running contribution analysis
   * @param executor executor service for parallel task execution
   */
  public DimensionAnalysisPipeline(String outputName, Set<String> inputNames, MetricConfigManager metricDAO,
      DatasetConfigManager datasetDAO, QueryCache cache, ExecutorService executor) {
    super(outputName, inputNames);
    this.metricDAO = metricDAO;
    this.datasetDAO = datasetDAO;
    this.cache = cache;
    this.executor = executor;
  }

  /**
   * Alternate constructor for use by PipelineLoader. DAOs and the query cache are taken from
   * the global registries and a fixed-size thread pool is created from the configured
   * parallelism.
   *
   * <p>NOTE: this class never shuts down the thread pool created here.
   *
   * @param outputName pipeline output name
   * @param inputNames input pipeline names
   * @param properties configuration properties ({@code PROP_PARALLELISM=1})
   * @throws NumberFormatException if the parallelism property is not an integer
   */
  public DimensionAnalysisPipeline(String outputName, Set<String> inputNames, Map<String, String> properties) {
    super(outputName, inputNames);
    this.metricDAO = DAORegistry.getInstance().getMetricConfigDAO();
    this.datasetDAO = DAORegistry.getInstance().getDatasetConfigDAO();
    this.cache = ThirdEyeCacheRegistry.getInstance().getQueryCache();

    String parallelismProp = PROP_PARALLELISM_DEFAULT;
    if (properties.containsKey(PROP_PARALLELISM)) {
      parallelismProp = properties.get(PROP_PARALLELISM);
    }
    this.executor = Executors.newFixedThreadPool(Integer.parseInt(parallelismProp));
  }

  /**
   * Scores every {@code MetricEntity} in the context asynchronously, sums the weighted costs
   * per dimension/value pair across metrics and returns the pairs as {@code DimensionEntity}
   * instances whose score is the pair's share of the total cost.
   *
   * <p>Metrics that cannot be resolved, that fail, or that exceed {@code TIMEOUT} are skipped.
   * If the total cost is zero the scores are left un-normalized (zero) rather than NaN.
   *
   * @param context pipeline execution context
   * @return pipeline result holding the scored dimension entities
   */
  @Override
  public PipelineResult run(PipelineContext context) {
    Set<MetricEntity> metricsEntities = context.filter(MetricEntity.class);

    final TimeRangeEntity current = TimeRangeEntity.getContextCurrent(context);
    final TimeRangeEntity baseline = TimeRangeEntity.getContextBaseline(context);

    // start from an empty, correctly-typed frame so appending works without any metric
    DataFrame dfScore = new DataFrame();
    dfScore.addSeries(DIMENSION, StringSeries.empty());
    dfScore.addSeries(VALUE, StringSeries.empty());
    dfScore.addSeries(COST, DoubleSeries.empty());

    Map<String, Future<DataFrame>> scores = submitScoreTasks(metricsEntities, current, baseline);
    dfScore = combineScores(dfScore, scores);

    // TODO use multi-column grouping when available
    // generate dimension keys
    dfScore.mapInPlace(new Series.StringFunction() {
      @Override
      public String apply(String... values) {
        return values[0] + ":" + values[1];
      }
    }, KEY, DIMENSION, VALUE);

    DataFrame.DataFrameGrouping grouping = dfScore.groupBy(KEY);
    DataFrame sumCost = grouping.aggregate(COST, DoubleSeries.SUM).fillNull(COST);
    DataFrame dimension = grouping.aggregate(DIMENSION, StringSeries.FIRST);
    DataFrame value = grouping.aggregate(VALUE, StringSeries.FIRST);

    // TODO cleanup
    // truncate results to most important dimensions
    // NOTE(review): only a sort is applied here, so presumably no rows are dropped yet
    DataFrame trunc = sumCost.sortedBy(COST);

    final double total = sumCost.getDoubles(COST).sum();
    final double truncTotal = trunc.getDoubles(COST).sum();
    LOG.info("Using {} out of {} scored dimensions, explaining {} of total differences", trunc.size(), sumCost.size(), truncTotal / total);

    DataFrame result = trunc.joinLeft(dimension).joinLeft(value);

    // Normalize to shares of the total. A zero total would turn every score into NaN (0/0),
    // so the costs are left untouched in that case.
    if (total != 0.0) {
      result.mapInPlace(new Series.DoubleFunction() {
        @Override
        public double apply(double... values) {
          return values[0] / total;
        }
      }, COST);
    }

    return new PipelineResult(context, toEntities(result));
  }

  /**
   * Submits one asynchronous scoring task per resolvable metric entity.
   *
   * @param metricsEntities metric entities to score
   * @param current current time range
   * @param baseline baseline time range
   * @return pending scoring results keyed by metric entity URN
   */
  private Map<String, Future<DataFrame>> submitScoreTasks(Set<MetricEntity> metricsEntities,
      final TimeRangeEntity current, final TimeRangeEntity baseline) {
    Map<String, Future<DataFrame>> scores = new HashMap<>();

    for (final MetricEntity me : metricsEntities) {
      final MetricConfigDTO metricDTO = metricDAO.findById(me.getId());
      if (metricDTO == null) {
        LOG.warn("Could not resolve metric id {}. Skipping.", me.getId());
        continue;
      }

      final DatasetConfigDTO datasetDTO = datasetDAO.findByDataset(metricDTO.getDataset());
      if (datasetDTO == null) {
        LOG.warn("Could not resolve dataset '{}'. Skipping metric id {}", metricDTO.getDataset(), me.getId());
        continue;
      }

      // Create asynchronous scoring task
      Callable<DataFrame> scoreTask = new Callable<DataFrame>() {
        @Override
        public DataFrame call() throws Exception {
          LOG.info("Scoring metric '{}' in dataset '{}' with weight {}", metricDTO.getName(), datasetDTO.getDataset(), me.getScore());

          DataFrame dfMetric = DimensionAnalysisPipeline.this.score(datasetDTO, metricDTO, current, baseline);

          // modify cost by metric score
          final double metricScore = me.getScore();
          dfMetric.mapInPlace(new Series.DoubleFunction() {
            @Override
            public double apply(double... values) {
              return values[0] * metricScore;
            }
          }, COST);

          return dfMetric;
        }
      };

      scores.put(me.getUrn(), this.executor.submit(scoreTask));
    }

    return scores;
  }

  /**
   * Waits for the scoring tasks and appends their results to the given frame. Each task is
   * given up to {@code TIMEOUT} milliseconds; failed or timed-out tasks are cancelled and
   * skipped. If the calling thread is interrupted, all tasks are cancelled, the interrupt
   * status is restored and the results combined so far are returned.
   *
   * @param dfScore frame to append the per-metric results to
   * @param scores pending scoring results keyed by metric entity URN
   * @return frame containing the rows of all successfully completed tasks
   */
  private DataFrame combineScores(DataFrame dfScore, Map<String, Future<DataFrame>> scores) {
    DataFrame combined = dfScore;

    for (Map.Entry<String, Future<DataFrame>> e : scores.entrySet()) {
      try {
        combined = combined.append(e.getValue().get(TIMEOUT, TimeUnit.MILLISECONDS));
      } catch (InterruptedException ex) {
        LOG.warn("Interrupted while combining results for '{}'. Cancelling outstanding tasks.", e.getKey(), ex);
        for (Future<DataFrame> pending : scores.values()) {
          pending.cancel(true);
        }
        // preserve the interrupt for callers further up the stack
        Thread.currentThread().interrupt();
        break;
      } catch (Exception ex) {
        // frees the worker thread after a timeout; no-op if the task already finished
        e.getValue().cancel(true);
        LOG.warn("Exception while combining results for '{}'. Skipping.", e.getKey(), ex);
      }
    }

    return combined;
  }

  /**
   * Converts each row of the data frame into a {@code DimensionEntity}.
   * Dimension values are lower-cased independently of the default locale.
   *
   * @param df data frame with {@code DIMENSION}, {@code VALUE} and {@code COST} series
   * @return set of dimension entities, one per row (subject to set de-duplication)
   */
  private static Set<DimensionEntity> toEntities(DataFrame df) {
    Set<DimensionEntity> entities = new HashSet<>();
    for (int i = 0; i < df.size(); i++) {
      String dimension = df.getString(DIMENSION, i);
      // Locale.ROOT keeps the result stable regardless of the JVM's default locale
      String value = df.getString(VALUE, i).toLowerCase(Locale.ROOT);
      double score = df.getDouble(COST, i);
      entities.add(DimensionEntity.fromDimension(score, dimension, value));
    }
    return entities;
  }

  /**
   * Perform contribution analysis on a metric given a time range and baseline.
   *
   * @param dataset thirdeye dataset reference
   * @param metric thirdeye metric reference
   * @param current current time range
   * @param baseline baseline time range
   * @return DataFrame with normalized cost
   * @throws IllegalArgumentException if the metric does not belong to the dataset
   * @throws Exception if data cannot be fetched or data is invalid
   */
  DataFrame score(DatasetConfigDTO dataset, MetricConfigDTO metric, TimeRangeEntity current, TimeRangeEntity baseline) throws Exception {
    if (!metric.getDataset().equals(dataset.getDataset())) {
      throw new IllegalArgumentException("Dataset and metric must match");
    }

    // build data cube
    OLAPDataBaseClient olapClient = getOlapDataBaseClient(current, baseline, metric, dataset);
    Dimensions dimensions = new Dimensions(dataset.getDimensions());

    Cube cube = new Cube();
    cube.buildDimensionCostSet(olapClient, dimensions);

    return toNormalizedDataFrame(cube.getCostSet());
  }

  /**
   * Creates an OLAP client configured for the given metric, dataset and time ranges.
   * Range boundaries are interpreted in UTC and the metric is aggregated with SUM.
   *
   * @param current current time range
   * @param baseline baseline time range
   * @param metric thirdeye metric reference
   * @param dataset thirdeye dataset reference
   * @return configured OLAP client
   * @throws IllegalArgumentException if no metric expression can be derived for the metric
   * @throws Exception if the metric expression cannot be created
   */
  private OLAPDataBaseClient getOlapDataBaseClient(TimeRangeEntity current, TimeRangeEntity baseline, MetricConfigDTO metric, DatasetConfigDTO dataset) throws Exception {
    final DateTimeZone timezone = DateTimeZone.forID("UTC");

    List<MetricExpression> metricExpressions = Utils.convertToMetricExpressions(metric.getName(), MetricAggFunction.SUM, dataset.getDataset());
    if (metricExpressions.isEmpty()) {
      throw new IllegalArgumentException(String.format("No metric expression for metric '%s' in dataset '%s'",
          metric.getName(), dataset.getDataset()));
    }

    OLAPDataBaseClient olapClient = new PinotThirdEyeSummaryClient(cache);
    olapClient.setCollection(dataset.getDataset());
    olapClient.setMetricExpression(metricExpressions.get(0));
    olapClient.setCurrentStartInclusive(new DateTime(current.getStart(), timezone));
    olapClient.setCurrentEndExclusive(new DateTime(current.getEnd(), timezone));
    olapClient.setBaselineStartInclusive(new DateTime(baseline.getStart(), timezone));
    olapClient.setBaselineEndExclusive(new DateTime(baseline.getEnd(), timezone));
    return olapClient;
  }

  /**
   * Converts cost entries into a data frame with {@code DIMENSION}, {@code VALUE} and
   * {@code COST} series. Costs are divided by their sum when that sum is positive;
   * otherwise they are stored as-is (after null-filling).
   *
   * @param costs cost entries per dimension name and value
   * @return data frame with one row per cost entry
   */
  private static DataFrame toNormalizedDataFrame(Collection<DimNameValueCostEntry> costs) {
    String[] dim = new String[costs.size()];
    String[] value = new String[costs.size()];
    double[] cost = new double[costs.size()];

    int i = 0;
    for (DimNameValueCostEntry e : costs) {
      dim[i] = e.getDimName();
      value[i] = e.getDimValue();
      cost[i] = e.getCost();
      i++;
    }

    DoubleSeries sCost = DataFrame.toSeries(cost).fillNull();
    final double sum = sCost.sum();

    DataFrame df = new DataFrame();
    df.addSeries(DIMENSION, dim);
    df.addSeries(VALUE, value);

    if (sum > 0.0) {
      df.addSeries(COST, sCost.divide(sum));
    } else {
      // avoid dividing by zero (or flipping signs) when there is nothing to normalize against
      df.addSeries(COST, sCost);
    }
    return df;
  }
}