package com.linkedin.thirdeye.client.diffsummary; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.commons.lang3.builder.ToStringBuilder; import org.apache.commons.lang3.builder.ToStringStyle; import org.apache.commons.lang3.tuple.MutablePair; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.databind.ObjectMapper; @JsonIgnoreProperties(ignoreUnknown = true) public class Cube { // the cube (Ca|Cb) private static final Logger LOG = LoggerFactory.getLogger(Cube.class); private static final int DEFAULT_TOP_DIMENSION = 3; public static final double PERCENTAGE_CONTRIBUTION_THRESHOLD = 3d; private double topBaselineValue; private double topCurrentValue; private double topRatio; private List<DimNameValueCostEntry> costSet; @JsonProperty("dimensions") private Dimensions dimensions; // The data stored in levels @JsonProperty("hierarchicalRows") private List<List<Row>> hierarchicalRows = new ArrayList<>(); // The logical nodes of the hierarchy among rows (i.e., the actual data) @JsonIgnore private List<List<HierarchyNode>> hierarchicalNodes = new ArrayList<>(); public double getTopBaselineValue() { return topBaselineValue; } public double getTopCurrentValue() { return topCurrentValue; } public double getTopRatio() { return topRatio; } public Dimensions getDimensions() { return dimensions; } @JsonIgnore public HierarchyNode getRoot() { if (hierarchicalNodes.size() != 0 && hierarchicalNodes.get(0).size() != 0) { return hierarchicalNodes.get(0).get(0); } else { return null; } } public List<DimNameValueCostEntry> getCostSet() { return costSet; } public void buildWithAutoDimensionOrder(OLAPDataBaseClient olapClient, Dimensions dimensions) throws Exception { buildWithAutoDimensionOrder(olapClient, dimensions, DEFAULT_TOP_DIMENSION, Collections.<List<String>>emptyList()); } public void buildWithAutoDimensionOrder(OLAPDataBaseClient olapClient, Dimensions dimensions, int topDimensions) throws Exception { buildWithAutoDimensionOrder(olapClient, dimensions, topDimensions, Collections.<List<String>>emptyList()); } public void buildWithAutoDimensionOrder(OLAPDataBaseClient olapClient, Dimensions dimensions, int topDimension, List<List<String>> hierarchy) throws Exception { Dimensions sanitizedDimensions = sanitizeDimensions(dimensions); initializeBasicInfo(olapClient); if (dimensions == null || dimensions.size() == 0) { throw new IllegalArgumentException("Dimensions cannot be empty."); } if (hierarchy == null) { hierarchy = Collections.emptyList(); } this.costSet = computeOneDimensionCost(olapClient, topRatio, sanitizedDimensions); this.dimensions = sortDimensionOrder(costSet, sanitizedDimensions, topDimension, hierarchy); LOG.info("Auto decided dimensions: " + this.dimensions); buildWithManualDimensionOrder(olapClient, this.dimensions); } public void buildDimensionCostSet(OLAPDataBaseClient olapClient, Dimensions dimensions) throws Exception { Dimensions sanitizedDimensions = sanitizeDimensions(dimensions); initializeBasicInfo(olapClient); this.costSet = computeOneDimensionCost(olapClient, topRatio, sanitizedDimensions); } private Dimensions sanitizeDimensions(Dimensions dimensions) { List<String> allDimensions = dimensions.allDimensions(); List<String> dimensionsToRemove = new ArrayList<>(); dimensionsToRemove.add("environment"); dimensionsToRemove.add("colo"); dimensionsToRemove.add("fabric"); List<String> validDimensionNames = new ArrayList<>(); for(String dim:allDimensions){ if(dim.indexOf("_topk") > -1) { String rawDimensionName = dim.replaceAll("_topk", ""); dimensionsToRemove.add(rawDimensionName.toLowerCase()); } } for(String dim:allDimensions){ if(!dimensionsToRemove.contains(dim.toLowerCase())){ validDimensionNames.add(dim); } } return new Dimensions(validDimensionNames); } public void buildWithManualDimensionOrder(OLAPDataBaseClient olapClient, Dimensions dimensions) throws Exception { if (dimensions == null || dimensions.size() == 0) { throw new IllegalArgumentException("Dimensions cannot be empty."); } if (this.dimensions == null) { // which means buildWithAutoDimensionOrder is not triggered initializeBasicInfo(olapClient); this.dimensions = dimensions; this.costSet = computeOneDimensionCost(olapClient, topRatio, dimensions); } int size = 0; // Get the rows at each level and sort them in the post-order of their hierarchical relationship, // in which a parent row aggregates the details rows under it. For instance, in the following // hierarchy row b aggregates rows d and e, and row a aggregates rows b and c. // Level 0 a // / \ // Level 1 b c // / \ \ // Level 2 d e f // The Comparator for generating the order is implemented in the class DimensionValues. List<List<Row>> rowOfLevels = olapClient.getAggregatedValuesOfLevels(dimensions); for (int i = 0; i <= dimensions.size(); ++i) { List<Row> rowAtLevelI = rowOfLevels.get(i); Collections.sort(rowAtLevelI, new RowDimensionValuesComparator()); hierarchicalRows.add(rowAtLevelI); size += rowAtLevelI.size(); } LOG.info("Size of the cube for generating summary: " + size); buildHierarchy(); } /** * Calculate the change ratio of the top aggregated values. * @throws Exception An exception is thrown if OLAP database cannot be connected. */ private void initializeBasicInfo(OLAPDataBaseClient olapClient) throws Exception { Row topAggValues = olapClient.getTopAggregatedValues(); topBaselineValue = topAggValues.baselineValue; // aggregated baseline values topCurrentValue = topAggValues.currentValue; // aggregated current values topRatio = topCurrentValue / topBaselineValue; // change ratio } /** * Sort the rows in the post-order of their hierarchical relationship */ static class RowDimensionValuesComparator implements Comparator<Row> { @Override public int compare(Row r1, Row r2) { return r1.dimensionValues.compareTo(r2.dimensionValues); } } /** * Establish the hierarchy between aggregated and detailed rows. */ private void buildHierarchy() { HashMap<String, HierarchyNode> curParent = new HashMap<>(); HashMap<String, HierarchyNode> nextParent = new HashMap<>(); for (int level = 0; level <= this.dimensions.size(); ++level) { hierarchicalNodes.add(new ArrayList<HierarchyNode>(hierarchicalRows.get(level).size())); if (level != 0) { for (int index = 0; index < hierarchicalRows.get(level).size(); ++index) { Row row = hierarchicalRows.get(level).get(index); StringBuilder parentDimValues = new StringBuilder(); for (int i = 0; i < level - 1; ++i) { parentDimValues.append(row.dimensionValues.get(i)); } HierarchyNode parentNode = curParent.get(parentDimValues.toString()); // Sometimes Pinot returns a node without any matching parent; we discard those nodes. if (parentNode == null) { continue; } HierarchyNode node = new HierarchyNode(level, index, row, parentNode); parentNode.children.add(node); hierarchicalNodes.get(level).add(node); // Add current node's dimension values to next parent lookup table for the next level of nodes parentDimValues.append(row.dimensionValues.get(level - 1)); nextParent.put(parentDimValues.toString(), node); } } else { // root Row row = hierarchicalRows.get(0).get(0); HierarchyNode node = new HierarchyNode(0, 0, row, null); hierarchicalNodes.get(0).add(node); nextParent.put("", node); } // The last level of nodes won't be a parent of any other nodes, so we don't need to initialized // the hashmap of parent nodes for it. if (level != this.dimensions.size()) { curParent = nextParent; nextParent = new HashMap<>(); } } } private static List<DimNameValueCostEntry> computeOneDimensionCost(OLAPDataBaseClient olapClient, double topRatio, Dimensions dimensions) throws Exception { List<DimNameValueCostEntry> costSet = new ArrayList<>(); List<List<Row>> wowValuesOfDimensions = olapClient.getAggregatedValuesOfDimension(dimensions); double baselineTotal = 0; double currentTotal = 0; //use one dimension to compute baseline/current total List<Row> wowValuesOfFirstDimension = wowValuesOfDimensions.get(0); for (int j = 0; j < wowValuesOfFirstDimension.size(); ++j) { Row wowValues = wowValuesOfFirstDimension.get(j); baselineTotal += wowValues.baselineValue; currentTotal += wowValues.currentValue; } LOG.info("baselineTotal: {}", baselineTotal); LOG.info("currentTotal: {}", currentTotal); for (int i = 0; i < dimensions.size(); ++i) { String dimension = dimensions.get(i); List<Row> wowValuesOfOneDimension = wowValuesOfDimensions.get(i); for (int j = 0; j < wowValuesOfOneDimension.size(); ++j) { Row wowValues = wowValuesOfOneDimension.get(j); String dimValue = wowValues.getDimensionValues().get(0); double dimValueCost = CostFunction .errWithPercentageRemoval(wowValues.baselineValue, wowValues.currentValue, topRatio, PERCENTAGE_CONTRIBUTION_THRESHOLD, currentTotal + baselineTotal); double contributionFactor = (wowValues.baselineValue + wowValues.currentValue) / (baselineTotal + currentTotal); costSet.add(new DimNameValueCostEntry(dimension, dimValue, dimValueCost, contributionFactor, wowValues.currentValue, wowValues.baselineValue)); } } Collections.sort(costSet, Collections.reverseOrder()); LOG.info("Cost set"); for (DimNameValueCostEntry entry : costSet.subList(0, Math.min(costSet.size(), 20))) { LOG.info("{}", entry); } return costSet; } /** * Sort dimensions according to their cost, which is the sum of the error for aggregating all its children rows. * Dimensions with larger error is sorted in the front of the list. * The order among the dimensions that belong to the same hierarchical group will be maintained. An example of * a hierarchical group {continent, country}. The cost of a group is the average of member costs. * @throws Exception An exception is thrown if OLAP database cannot be connected. */ private static Dimensions sortDimensionOrder(List<DimNameValueCostEntry> costSet, Dimensions dimensions, int topDimension, List<List<String>> hierarchy) throws Exception { List<MutablePair<String, Double>> dimensionCostPairs = new ArrayList<>(); Map<String, Double> dimNameToCost = new HashMap<>(); for (DimNameValueCostEntry dimNameValueCostEntry : costSet) { double cost = dimNameValueCostEntry.getCost(); if (dimNameToCost.containsKey(dimNameValueCostEntry.getDimName())) { cost += dimNameToCost.get(dimNameValueCostEntry.getDimName()); } dimNameToCost.put(dimNameValueCostEntry.getDimName(), cost); } // Given one dimension name D, returns the hierarchical dimension to which D belong. Map<String, HierarchicalDimension> hierarchicalDimensionMap = new HashMap<>(); Set<String> availableDimensionKeySet = new HashSet<>(dimensions.allDimensions()); // Process the suggested hierarchy list and filter out only the hierarchies that can be applied to the available // dimensions of the dataset. for (List<String> suggestedHierarchyList : hierarchy) { if (suggestedHierarchyList == null || suggestedHierarchyList.size() < 2) { continue; } List<String> actualHierarchy = new ArrayList<>(); for (String dimension : suggestedHierarchyList) { if (availableDimensionKeySet.contains(dimension)) { actualHierarchy.add(dimension); } } if (actualHierarchy.size() > 1) { HierarchicalDimension hierarchicalDimension = new HierarchicalDimension(); hierarchicalDimension.hierarchy = actualHierarchy; for (String dimension : actualHierarchy) { hierarchicalDimensionMap.put(dimension, hierarchicalDimension); } hierarchicalDimension.index = dimensionCostPairs.size(); dimensionCostPairs.add(new MutablePair<>(actualHierarchy.get(0), .0)); } } // Calculate cost for each dimension. The costs of the dimensions of the same hierarchical group will be the max // cost among all the children in that hierarchy. for (int i = 0; i < dimensions.size(); ++i) { String dimension = dimensions.get(i); double cost = 0d; if (dimNameToCost.containsKey(dimension)) { cost += dimNameToCost.get(dimension); } if (hierarchicalDimensionMap.containsKey(dimension)) { HierarchicalDimension hierarchicalDimension = hierarchicalDimensionMap.get(dimension); MutablePair<String, Double> costOfDimensionPair = dimensionCostPairs.get(hierarchicalDimension.index); // The max cost of children will be the cost of a group costOfDimensionPair.right = Math.max(cost, costOfDimensionPair.right); } else { // The dimension does not belong to any hierarchy MutablePair<String, Double> costOfDimensionPair = new MutablePair<>(dimension, cost); dimensionCostPairs.add(costOfDimensionPair); } } // Sort dimensions according to their costs in a descending order Collections.sort(dimensionCostPairs, Collections.reverseOrder(new DimensionCostPairSorter())); // If there exists a huge gap (e.g., 1/10 of cost) between two cost pairs, then we chop of the dimensions because // pairs with small costs does not provide useful information // Invariance to keep: cutOffPairIdx <= number of dimensionCostPairs int cutOffPairIdx = 1; if (dimensionCostPairs.size() > 1) { double cutOffCost = dimensionCostPairs.get(0).getRight() / 10d; for (; cutOffPairIdx < dimensionCostPairs.size(); ++cutOffPairIdx) { double curCost = dimensionCostPairs.get(cutOffPairIdx).getRight(); if (Double.compare(cutOffCost, curCost) > 0) { break; } } } else { cutOffPairIdx = 0; } // Create a new Dimension instance whose dimensions follow the calculated order ArrayList<String> newDimensions = new ArrayList<>(); int pairIdx = 0; for (MutablePair<String, Double> dimensionCostPair : dimensionCostPairs) { StringBuilder sb = new StringBuilder(" Dimension: "); if (hierarchicalDimensionMap.containsKey(dimensionCostPair.getLeft())) { HierarchicalDimension hierarchicalDimension = hierarchicalDimensionMap.get(dimensionCostPair.getLeft()); if (pairIdx <= cutOffPairIdx) { newDimensions.addAll(hierarchicalDimension.hierarchy); } sb.append(hierarchicalDimension.hierarchy); } else { // The dimension does not belong to any hierarchy if (pairIdx <= cutOffPairIdx) { newDimensions.add(dimensionCostPair.getLeft()); } sb.append(dimensionCostPair.getLeft()); } sb.append(", Cost: "); sb.append(dimensionCostPair.getRight()); LOG.info(sb.toString()); ++pairIdx; } return new Dimensions(newDimensions.subList(0, Math.min(topDimension, newDimensions.size()))); } static class DimensionCostPairSorter implements Comparator<MutablePair<String, Double>> { @Override public int compare(MutablePair<String, Double> o1, MutablePair<String, Double> o2) { return Double.compare(o1.getRight(), o2.getRight()); } } static class HierarchicalDimension { int index = -1; List<String> hierarchy; } public void toJson(String fileName) throws IOException { new ObjectMapper().writeValue(new File(fileName), this); } public static Cube fromJson(String fileName) throws IOException { Cube cube = new ObjectMapper().readValue(new File(fileName), Cube.class); cube.buildHierarchy(); return cube; } @Override public String toString() { ToStringBuilder tsb = new ToStringBuilder(this, ToStringStyle.MULTI_LINE_STYLE); tsb.append("Baseline Value", topBaselineValue) .append("Current Value", topCurrentValue) .append("Ratio", topRatio) .append("Dimentions", this.dimensions) .append("#Detailed Rows", hierarchicalRows.get(hierarchicalRows.size() - 1).size()); return tsb.toString(); } }