package com.linkedin.thirdeye.autoload.pinot.metrics;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import org.apache.commons.collections.CollectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.JsonNode;
import com.google.common.collect.Lists;
import com.linkedin.pinot.client.ResultSet;
import com.linkedin.pinot.client.ResultSetGroup;
import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.data.MetricFieldSpec;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.thirdeye.client.DAORegistry;
import com.linkedin.thirdeye.client.ThirdEyeCacheRegistry;
import com.linkedin.thirdeye.client.pinot.PinotQuery;
import com.linkedin.thirdeye.common.ThirdEyeConfiguration;
import com.linkedin.thirdeye.datalayer.bao.DashboardConfigManager;
import com.linkedin.thirdeye.datalayer.bao.DatasetConfigManager;
import com.linkedin.thirdeye.datalayer.bao.MetricConfigManager;
import com.linkedin.thirdeye.datalayer.dto.DashboardConfigDTO;
import com.linkedin.thirdeye.datalayer.dto.DatasetConfigDTO;
import com.linkedin.thirdeye.datalayer.dto.MetricConfigDTO;
import com.linkedin.thirdeye.util.ThirdEyeUtils;

/**
 * Service that automatically onboards pinot datasets into thirdeye.
 * It runs periodically, checks for new tables in pinot and adds them to thirdeye;
 * tables that look like ingraph tables are skipped by this service.
 * For tables that already exist in thirdeye, it also picks up any dimension or metric
 * changes from the pinot schema.
 */
public class AutoLoadPinotMetricsService implements Runnable {

  private static final Logger LOG = LoggerFactory.getLogger(AutoLoadPinotMetricsService.class);
  private static final DAORegistry DAO_REGISTRY = DAORegistry.getInstance();

  private static final String DEFAULT_INGRAPH_METRIC_NAMES_COLUMN = "metricName";
  private static final String DEFAULT_INGRAPH_METRIC_VALUES_COLUMN = "value";

  private ScheduledExecutorService scheduledExecutorService;
  private AutoLoadPinotMetricsUtils autoLoadPinotMetricsUtils;

  private List<String> allDatasets = new ArrayList<>();
  private Map<String, Schema> allSchemas = new HashMap<>();

  public AutoLoadPinotMetricsService() {
  }

  public AutoLoadPinotMetricsService(ThirdEyeConfiguration config) {
    autoLoadPinotMetricsUtils = new AutoLoadPinotMetricsUtils(config);
    scheduledExecutorService = Executors.newSingleThreadScheduledExecutor();
  }

  public void start() {
    scheduledExecutorService.scheduleAtFixedRate(this, 0, 4, TimeUnit.HOURS);
  }

  public void shutdown() {
    scheduledExecutorService.shutdown();
  }

  @Override
  public void run() {
    try {
      loadDatasets();

      for (String dataset : allDatasets) {
        LOG.info("Checking dataset {}", dataset);
        Schema schema = allSchemas.get(dataset);
        if (!isIngraphDataset(schema)) {
          DatasetConfigDTO datasetConfig = DAO_REGISTRY.getDatasetConfigDAO().findByDataset(dataset);
          addPinotDataset(dataset, schema, datasetConfig);
        }
      }
    } catch (Exception e) {
      LOG.error("Exception in loading datasets", e);
    }
  }

  /**
   * Adds a dataset to the thirdeye database
   * @param dataset
   * @param schema
   * @param datasetConfig
   */
  public void addPinotDataset(String dataset, Schema schema, DatasetConfigDTO datasetConfig) throws Exception {
    if (datasetConfig == null) {
      LOG.info("Dataset {} is new, adding it to thirdeye", dataset);
      addNewDataset(dataset, schema);
    } else {
      LOG.info("Dataset {} already exists, checking for updates", dataset);
      refreshOldDataset(dataset, datasetConfig, schema);
    }
  }
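  /*
   * Usage sketch (illustrative only; the surrounding wiring is assumed, not shown in this file):
   * a ThirdEye server would typically construct and start this service at startup, e.g.
   *
   *   AutoLoadPinotMetricsService autoLoadService = new AutoLoadPinotMetricsService(thirdEyeConfig);
   *   autoLoadService.start();    // runs immediately, then every 4 hours
   *   ...
   *   autoLoadService.shutdown(); // stop the scheduled executor on server shutdown
   *
   * addPinotDataset() can also be invoked directly to onboard a single dataset on demand.
   */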
  /**
   * Adds a new dataset to the thirdeye database
   * @param dataset
   * @param schema
   */
  private void addNewDataset(String dataset, Schema schema) throws Exception {
    List<MetricFieldSpec> metricSpecs = schema.getMetricFieldSpecs();

    // Create DatasetConfig
    DatasetConfigDTO datasetConfigDTO = ConfigGenerator.generateDatasetConfig(dataset, schema);
    LOG.info("Creating dataset for {}", dataset);
    DAO_REGISTRY.getDatasetConfigDAO().save(datasetConfigDTO);

    // Create MetricConfig
    for (MetricFieldSpec metricFieldSpec : metricSpecs) {
      MetricConfigDTO metricConfigDTO = ConfigGenerator.generateMetricConfig(metricFieldSpec, dataset);
      LOG.info("Creating metric {} for {}", metricConfigDTO.getName(), dataset);
      DAO_REGISTRY.getMetricConfigDAO().save(metricConfigDTO);
    }

    // Create default DashboardConfig
    List<Long> metricIds = ConfigGenerator.getMetricIdsFromMetricConfigs(DAO_REGISTRY.getMetricConfigDAO().findByDataset(dataset));
    DashboardConfigDTO dashboardConfigDTO = ConfigGenerator.generateDefaultDashboardConfig(dataset, metricIds);
    LOG.info("Creating default dashboard for dataset {}", dataset);
    DAO_REGISTRY.getDashboardConfigDAO().save(dashboardConfigDTO);
  }

  /**
   * Refreshes an existing dataset in the thirdeye database
   * with any dimension/metric changes from the pinot schema
   * @param dataset
   * @param datasetConfig
   * @param schema
   */
  private void refreshOldDataset(String dataset, DatasetConfigDTO datasetConfig, Schema schema) throws Exception {
    if (datasetConfig.isMetricAsDimension()) {
      LOG.info("Checking refresh for metricAsDimension dataset {}", dataset);
      checkMetricAsDimensionDataset(datasetConfig, schema);
    } else {
      checkDimensionChanges(dataset, datasetConfig, schema);
      checkMetricChanges(dataset, datasetConfig, schema);
    }
  }

  /**
   * Syncs the dimensions of an existing dataset config with the current pinot schema:
   * dimensions present only in the schema are added, dimensions no longer in the schema are removed.
   */
  private void checkDimensionChanges(String dataset, DatasetConfigDTO datasetConfig, Schema schema) {
    LOG.info("Checking for dimension changes in {}", dataset);
    List<String> schemaDimensions = schema.getDimensionNames();
    List<String> datasetDimensions = datasetConfig.getDimensions();
    List<String> dimensionsToAdd = new ArrayList<>();
    List<String> dimensionsToRemove = new ArrayList<>();

    // dimensions which are new in the pinot schema
    for (String dimensionName : schemaDimensions) {
      if (!datasetDimensions.contains(dimensionName)) {
        dimensionsToAdd.add(dimensionName);
      }
    }
    // dimensions which have been removed from the pinot schema
    for (String dimensionName : datasetDimensions) {
      if (!schemaDimensions.contains(dimensionName)) {
        dimensionsToRemove.add(dimensionName);
      }
    }

    if (CollectionUtils.isNotEmpty(dimensionsToAdd) || CollectionUtils.isNotEmpty(dimensionsToRemove)) {
      datasetDimensions.addAll(dimensionsToAdd);
      datasetDimensions.removeAll(dimensionsToRemove);
      datasetConfig.setDimensions(datasetDimensions);

      if (!datasetConfig.isAdditive()
          && CollectionUtils.isNotEmpty(datasetConfig.getDimensionsHaveNoPreAggregation())) {
        List<String> dimensionsHaveNoPreAggregation = datasetConfig.getDimensionsHaveNoPreAggregation();
        dimensionsHaveNoPreAggregation.removeAll(dimensionsToRemove);
        datasetConfig.setDimensionsHaveNoPreAggregation(dimensionsHaveNoPreAggregation);
      }
      LOG.info("Added dimensions {}, removed {}", dimensionsToAdd, dimensionsToRemove);
      DAO_REGISTRY.getDatasetConfigDAO().update(datasetConfig);
    }
  }
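  /*
   * Worked example for checkDimensionChanges above (hypothetical column names): if the pinot
   * schema declares dimensions [country, device, browser] while the stored dataset config has
   * [country, device, os], then dimensionsToAdd = [browser] and dimensionsToRemove = [os], and
   * the dataset config is updated to [country, device, browser]. For non-additive datasets the
   * removed dimensions are also dropped from dimensionsHaveNoPreAggregation.
   */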
  /**
   * Syncs the metrics of an existing dataset config with the current pinot schema:
   * metric columns that do not yet have a MetricConfig are created and added to the default dashboard.
   */
  private void checkMetricChanges(String dataset, DatasetConfigDTO datasetConfig, Schema schema) {
    LOG.info("Checking for metric changes in {}", dataset);
    List<MetricFieldSpec> schemaMetricSpecs = schema.getMetricFieldSpecs();
    List<MetricConfigDTO> datasetMetricConfigs = DAO_REGISTRY.getMetricConfigDAO().findByDataset(dataset);
    List<String> datasetMetricNames = new ArrayList<>();
    for (MetricConfigDTO metricConfig : datasetMetricConfigs) {
      datasetMetricNames.add(metricConfig.getName());
    }

    List<Long> metricsToAdd = new ArrayList<>();
    for (MetricFieldSpec metricSpec : schemaMetricSpecs) {
      // metrics which are new in the pinot schema, create them
      String metricName = metricSpec.getName();
      if (!datasetMetricNames.contains(metricName)) {
        MetricConfigDTO metricConfigDTO = ConfigGenerator.generateMetricConfig(metricSpec, dataset);
        LOG.info("Creating metric {} for {}", metricName, dataset);
        metricsToAdd.add(DAO_REGISTRY.getMetricConfigDAO().save(metricConfigDTO));
      }
    }

    // add new metric ids to the default dashboard
    if (CollectionUtils.isNotEmpty(metricsToAdd)) {
      LOG.info("Metrics to add {}", metricsToAdd);
      String dashboardName = ThirdEyeUtils.getDefaultDashboardName(dataset);
      DashboardConfigDTO dashboardConfig = DAO_REGISTRY.getDashboardConfigDAO().findByName(dashboardName);
      List<Long> metricIds = dashboardConfig.getMetricIds();
      metricIds.addAll(metricsToAdd);
      DAO_REGISTRY.getDashboardConfigDAO().update(dashboardConfig);
    }

    // TODO: write a tool which, given a metric id, erases all traces of that metric from the database.
    // This will include:
    // 1) delete the metric from metricConfigs
    // 2) remove any derived metrics which use the deleted metric
    // 3) remove the metric, and derived metrics, from all dashboards
    // 4) remove any anomaly functions associated with the metric
    // 5) remove any alerts associated with these anomaly functions
  }

  /**
   * Reads all table names in pinot, and loads their schema
   * @throws IOException
   */
  private void loadDatasets() throws IOException {
    // reset state so that repeated scheduled runs do not accumulate duplicate entries
    allDatasets.clear();
    allSchemas.clear();

    JsonNode tables = autoLoadPinotMetricsUtils.getAllTablesFromPinot();
    for (JsonNode table : tables) {
      String dataset = table.asText();
      Schema schema = autoLoadPinotMetricsUtils.getSchemaFromPinot(dataset);
      if (schema != null) {
        if (!autoLoadPinotMetricsUtils.verifySchemaCorrectness(schema)) {
          LOG.info("Skipping {} due to incorrect schema", dataset);
        } else {
          allDatasets.add(dataset);
          allSchemas.put(dataset, schema);
        }
      }
    }
  }

  /**
   * A dataset is treated as an ingraph dataset if its schema contains the default ingraph
   * metric-names dimension column and metric-values metric column.
   */
  private boolean isIngraphDataset(Schema schema) {
    return schema.getDimensionNames().contains(DEFAULT_INGRAPH_METRIC_NAMES_COLUMN)
        && schema.getMetricNames().contains(DEFAULT_INGRAPH_METRIC_VALUES_COLUMN);
  }

  /**
   * Queries pinot for the distinct values of the metric-names column of a metricAsDimension dataset.
   */
  private List<String> fetchMetricAsADimensionMetrics(String dataset, String metricNamesColumn) {
    List<String> distinctMetricNames = new ArrayList<>();
    ThirdEyeCacheRegistry cacheRegistry = ThirdEyeCacheRegistry.getInstance();
    String sql = String.format("select count(*) from %s group by %s top 10", dataset, metricNamesColumn);
    try {
      ResultSetGroup result = cacheRegistry.getResultSetGroupCache().get(new PinotQuery(sql, dataset));
      ResultSet resultSet = result.getResultSet(0);
      int rowCount = resultSet.getRowCount();
      for (int i = 0; i < rowCount; i++) {
        String dimensionName = resultSet.getGroupKeyString(i, 0);
        distinctMetricNames.add(dimensionName);
      }
      LOG.info("Distinct metrics {}", distinctMetricNames);
    } catch (Exception e) {
      LOG.error("Exception in fetching metrics from pinot", e);
    }
    return distinctMetricNames;
  }
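  /*
   * Illustrative query from fetchMetricAsADimensionMetrics above, assuming a hypothetical
   * dataset "ingraphEvents" whose metricNamesColumn is "metricName":
   *
   *   select count(*) from ingraphEvents group by metricName top 10
   *
   * Each group key returned by pinot (e.g. "pageViews", "errorCount") is taken as a distinct
   * metric name. Note that the "top 10" clause caps the number of distinct names discovered per run.
   */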
  /**
   * Refreshes a metricAsDimension dataset: ensures the metric-names column is not listed as a
   * dimension and the metric-values column is not listed as a metric; if auto discovery is
   * enabled, creates a MetricConfig for each distinct metric name found in pinot and registers
   * all metrics of the dataset on the default dashboard.
   */
  private void checkMetricAsDimensionDataset(DatasetConfigDTO datasetConfigDTO, Schema schema) {
    String dataset = datasetConfigDTO.getDataset();
    String metricNamesColumn = datasetConfigDTO.getMetricNamesColumn();
    String metricValuesColumn = datasetConfigDTO.getMetricValuesColumn();
    FieldSpec metricValuesColumnFieldSpec = schema.getFieldSpecFor(metricValuesColumn);
    String dashboardName = ThirdEyeUtils.getDefaultDashboardName(dataset);

    // remove metricNamesColumn from dimensions if it exists
    List<String> dimensions = datasetConfigDTO.getDimensions();
    if (dimensions.contains(metricNamesColumn)) {
      dimensions.removeAll(Lists.newArrayList(metricNamesColumn));
      datasetConfigDTO.setDimensions(dimensions);
      DAO_REGISTRY.getDatasetConfigDAO().update(datasetConfigDTO);
    }

    // remove metricValuesColumn from metrics if it exists
    MetricConfigDTO metricConfigDTO = DAO_REGISTRY.getMetricConfigDAO().findByMetricAndDataset(metricValuesColumn, dataset);
    if (metricConfigDTO != null) {
      Long metricId = metricConfigDTO.getId();
      DAO_REGISTRY.getMetricConfigDAO().delete(metricConfigDTO);

      // remove metricValuesColumn id from the default dashboard
      DashboardConfigDTO dashboardConfig = DAO_REGISTRY.getDashboardConfigDAO().findByName(dashboardName);
      List<Long> dashboardMetricIds = dashboardConfig.getMetricIds();
      dashboardMetricIds.removeAll(Lists.newArrayList(metricId));
      LOG.info("Updating dashboard config for {}", dashboardName);
      DAO_REGISTRY.getDashboardConfigDAO().update(dashboardConfig);
    }

    if (datasetConfigDTO.isAutoDiscoverMetrics()) {
      // query pinot to fetch the distinct values of metricNamesColumn
      List<String> allDistinctMetricNames = fetchMetricAsADimensionMetrics(dataset, metricNamesColumn);

      // create metrics for these metric names, if they don't exist
      List<MetricConfigDTO> existingMetricConfigs = DAO_REGISTRY.getMetricConfigDAO().findByDataset(dataset);
      List<String> existingMetricNames = Lists.newArrayList();
      for (MetricConfigDTO existingMetricConfig : existingMetricConfigs) {
        existingMetricNames.add(existingMetricConfig.getName());
      }
      allDistinctMetricNames.removeAll(existingMetricNames);
      for (String metricName : allDistinctMetricNames) {
        LOG.info("Creating metric config for {}", metricName);
        MetricFieldSpec metricFieldSpec = new MetricFieldSpec(metricName, metricValuesColumnFieldSpec.getDataType());
        MetricConfigDTO metricConfig = ConfigGenerator.generateMetricConfig(metricFieldSpec, dataset);
        DAO_REGISTRY.getMetricConfigDAO().save(metricConfig);
      }

      // add all metrics of this dataset to the default dashboard
      List<Long> allMetricIds = ConfigGenerator.getMetricIdsFromMetricConfigs(DAO_REGISTRY.getMetricConfigDAO().findByDataset(dataset));
      DashboardConfigDTO dashboardConfig = DAO_REGISTRY.getDashboardConfigDAO().findByName(dashboardName);
      dashboardConfig.setMetricIds(allMetricIds);
      LOG.info("Updating dashboard config for {}", dashboardName);
      DAO_REGISTRY.getDashboardConfigDAO().update(dashboardConfig);
    }
  }
}