/**
* Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.linkedin.pinot.core.startree;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;
import com.linkedin.pinot.common.data.DimensionFieldSpec;
import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.data.FieldSpec.DataType;
import com.linkedin.pinot.common.data.MetricFieldSpec;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.data.TimeFieldSpec;
import com.linkedin.pinot.common.utils.Pairs.IntPair;
import com.linkedin.pinot.core.data.GenericRow;
import com.linkedin.pinot.core.segment.creator.impl.V1Constants;
import it.unimi.dsi.fastutil.ints.Int2ObjectMap;
import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import com.linkedin.pinot.core.startree.hll.HllUtil;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.joda.time.DateTime;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Uses file to build the star tree. Each row is divided into dimension and metrics. Time is added
* to dimension list.
* We use the split order to build the tree. In most cases, split order will be ranked depending on
* the cardinality (descending order).
* Time column will be excluded or last entry in split order irrespective of its cardinality
* This is a recursive algorithm where we branch on one dimension at every level.
* <b>Pseudo algo</b>
* <code>
*
* build(){
* let table(1,N) consists of N input rows
* table.sort(1,N) //sort the table on all dimensions, according to split order
* constructTree(table, 0, N, 0);
* }
* constructTree(table,start,end, level){
* splitDimensionName = dimensionsSplitOrder[level]
* groupByResult<dimName, length> = table.groupBy(dimensionsSplitOrder[level]); //returns the number of rows for each value in splitDimension
* int rangeStart = 0;
* for each ( entry<dimName,length> groupByResult){
* if(entry.length > minThreshold){
* constructTree(table, rangeStart, rangeStart + entry.length, level +1);
* }
* rangeStart = rangeStart + entry.length;
* updateStarTree() //add new child
* }
*
* //create a star tree node
*
* aggregatedRows = table.uniqueAfterRemovingAttributeAndAggregateMetrics(start,end, splitDimensionName);
* for(each row in aggregatedRows)
* table.add(row);
* if(aggregatedRows.size > minThreshold) {
* table.sort(end, end + aggregatedRows.size);
* constructTree(table, end, end + aggregatedRows.size, level +1);
* }
* }
* </code>
*/
public class OffHeapStarTreeBuilder implements StarTreeBuilder {
private static final Logger LOG = LoggerFactory.getLogger(OffHeapStarTreeBuilder.class);
// File ("star-tree.buf" under outDir) that receives every record written through dataBuffer.
File dataFile;
private Schema schema;
// Buffered stream appending records to dataFile; opened in init() and closed at the end of build().
private DataOutputStream dataBuffer;
// Number of records written through append().
int rawRecordCount = 0;
// Number of aggregated records written through appendToAggBuffer().
int aggRecordCount = 0;
// Dimensions to split on, one per tree level; computed in build() when the config supplies none.
private List<String> dimensionsSplitOrder;
// Dimensions for which constructStarTree() does not create a star child.
private Set<String> skipStarNodeCreationForDimensions;
// Dimensions whose value is replaced by ALL in the aggregated rows produced by uniqueCombinations().
private Set<String> skipMaterializationForDimensions;
// Group size threshold that decides whether constructStarTree() keeps splitting.
private int maxLeafRecords;
// Set by build(); null before that.
private StarTree starTree;
private StarTreeIndexNode starTreeRootIndexNode;
private int numDimensions;
private int numMetrics;
// Schema dimension names in schema order, followed by the time column when the schema has one.
private List<String> dimensionNames;
private List<String> metricNames;
private String timeColumnName;
private List<DataType> dimensionTypes;
// Per dimension, the value substituted when a row has no value for that dimension.
private Map<String, Object> dimensionNameToStarValueMap;
// Dimension name <-> position of the dimension in dimensionNames.
private HashBiMap<String, Integer> dimensionNameToIndexMap;
private Map<String, Integer> metricNameToIndexMap;
// Bytes taken by the dimension part of a record: one int per dimension.
private int dimensionSizeBytes;
// Bytes taken by the metric part of a record: sum of the metric field sizes.
private int metricSizeBytes;
private File outDir;
// Per dimension, value <-> dictionary id; ids are handed out in order of first appearance in append().
private Map<String, HashBiMap<Object, Integer>> dictionaryMap;
// When true, sort() and uniqueCombinations() log the file contents they work on.
boolean debugMode = false;
// Lazily computed by getSortOrder(): split-order dimensions first, then the remaining ones.
private int[] sortOrder;
// Dictionary size above which a dimension is skipped from materialization by default.
private int skipMaterializationCardinalityThreshold;
// Selects the serialization format of the tree in build().
private boolean enableOffHeapFormat;
/**
 * Prepares the builder for a new build: copies the settings from the config, registers every
 * schema dimension (plus the time column, when present) and every metric, creates the output
 * directory, opens the record file and creates the root node of the tree.
 *
 * @param builderConfig configuration to build from; when it carries no output directory a
 *        directory under {@code java.io.tmpdir} is used instead
 * @throws Exception if a dimension has an unsupported data type or the record file cannot be opened
 */
public void init(StarTreeBuilderConfig builderConfig) throws Exception {
  schema = builderConfig.schema;
  timeColumnName = schema.getTimeColumnName();
  this.dimensionsSplitOrder = builderConfig.dimensionsSplitOrder;
  skipStarNodeCreationForDimensions = builderConfig.getSkipStarNodeCreationForDimensions();
  skipMaterializationForDimensions = builderConfig.getSkipMaterializationForDimensions();
  skipMaterializationCardinalityThreshold = builderConfig.getSkipMaterializationCardinalityThreshold();
  enableOffHeapFormat = builderConfig.isEnableOffHealpFormat();
  this.maxLeafRecords = builderConfig.maxLeafRecords;
  this.outDir = builderConfig.getOutDir();
  if (outDir == null) {
    outDir = new File(System.getProperty("java.io.tmpdir"), V1Constants.STAR_TREE_INDEX_DIR + "_" + DateTime.now());
  }
  LOG.info("Index output directory:{}", outDir);

  dimensionTypes = new ArrayList<>();
  dimensionNames = new ArrayList<>();
  dimensionNameToIndexMap = HashBiMap.create();
  dimensionNameToStarValueMap = new HashMap<>();
  dictionaryMap = new HashMap<>();

  // READ DIMENSIONS COLUMNS
  List<DimensionFieldSpec> dimensionFieldSpecs = schema.getDimensionFieldSpecs();
  for (int index = 0; index < dimensionFieldSpecs.size(); index++) {
    DimensionFieldSpec spec = dimensionFieldSpecs.get(index);
    String dimensionName = spec.getName();
    dimensionNames.add(dimensionName);
    dimensionNameToIndexMap.put(dimensionName, index);
    dimensionNameToStarValueMap.put(dimensionName, getAllStarValue(spec));
    dimensionTypes.add(spec.getDataType());
    HashBiMap<Object, Integer> dictionary = HashBiMap.create();
    dictionaryMap.put(dimensionName, dictionary);
  }

  // treat time column as just another dimension, only difference is that we will never split on
  // this dimension unless explicitly specified in split order
  if (timeColumnName != null) {
    dimensionNames.add(timeColumnName);
    TimeFieldSpec timeFieldSpec = schema.getTimeFieldSpec();
    dimensionTypes.add(timeFieldSpec.getDataType());
    int index = dimensionNameToIndexMap.size();
    dimensionNameToIndexMap.put(timeColumnName, index);
    dimensionNameToStarValueMap.put(timeColumnName, getAllStarValue(timeFieldSpec));
    HashBiMap<Object, Integer> dictionary = HashBiMap.create();
    dictionaryMap.put(timeColumnName, dictionary);
  }

  // Every dimension is stored as one int (its dictionary id).
  dimensionSizeBytes = dimensionNames.size() * Integer.SIZE / 8;
  this.numDimensions = dimensionNames.size();

  // READ METRIC COLUMNS
  this.metricNames = new ArrayList<>();
  this.metricNameToIndexMap = new HashMap<>();
  this.metricSizeBytes = 0;
  List<MetricFieldSpec> metricFieldSpecs = schema.getMetricFieldSpecs();
  for (int index = 0; index < metricFieldSpecs.size(); index++) {
    MetricFieldSpec spec = metricFieldSpecs.get(index);
    String metricName = spec.getName();
    metricNames.add(metricName);
    metricNameToIndexMap.put(metricName, index);
    metricSizeBytes += spec.getFieldSize();
  }
  numMetrics = metricNames.size();

  // Create the directory actually in use. Going through builderConfig.getOutDir() here would
  // throw a NullPointerException whenever the temporary-directory fallback above was taken.
  outDir.mkdirs();
  dataFile = new File(outDir, "star-tree.buf");
  LOG.info("StarTree output data file: {}", dataFile.getAbsolutePath());
  dataBuffer = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(dataFile)));

  // INITIALIZE THE ROOT NODE
  this.starTreeRootIndexNode = new StarTreeIndexNode();
  this.starTreeRootIndexNode.setDimensionName(StarTreeIndexNodeInterf.ALL);
  this.starTreeRootIndexNode.setDimensionValue(StarTreeIndexNodeInterf.ALL);
  this.starTreeRootIndexNode.setLevel(0);
  LOG.info("dimensionNames:{}", dimensionNames);
  LOG.info("metricNames:{}", metricNames);
}
/**
 * Picks the value that stands in for a missing dimension value of the given field.
 *
 * @param spec field whose data type decides the placeholder
 * @return {@code "ALL"} for string fields, the field's default null value for the other
 *         supported single-value types
 * @throws Exception for every other data type
 */
private Object getAllStarValue(FieldSpec spec) throws Exception {
  DataType dataType = spec.getDataType();
  switch (dataType) {
    case STRING:
      return "ALL";
    case BOOLEAN:
    case BYTE:
    case CHAR:
    case DOUBLE:
    case FLOAT:
    case INT:
    case LONG:
      return spec.getDefaultNullValue();
    default:
      // OBJECT, SHORT and all array types are not supported as dimensions.
      throw new Exception("Unsupported dimension data type" + spec);
  }
}
/**
 * Converts a stored record back into a {@link GenericRow}.
 *
 * @param dimensionKey dictionary ids of the record, one per dimension
 * @param metricsHolder metric values of the record
 * @return a row keyed by dimension and metric names; a dictionary id with no dictionary entry
 *         is mapped to the dimension's star value
 */
public GenericRow toGenericRow(DimensionBuffer dimensionKey, MetricBuffer metricsHolder) {
  Map<String, Object> fieldValues = new HashMap<>();

  int dimensionIndex = 0;
  for (String dimensionName : dimensionNames) {
    BiMap<Integer, Object> idToValue = dictionaryMap.get(dimensionName).inverse();
    Object value = idToValue.get(dimensionKey.getDimension(dimensionIndex));
    fieldValues.put(dimensionName,
        value != null ? value : dimensionNameToStarValueMap.get(dimensionName));
    dimensionIndex++;
  }

  for (int metricIndex = 0; metricIndex < numMetrics; metricIndex++) {
    fieldValues.put(metricNames.get(metricIndex),
        metricsHolder.getValueConformToDataType(metricIndex));
  }

  GenericRow row = new GenericRow();
  row.init(fieldValues);
  return row;
}
/**
 * Encodes one input row and appends it to the record file as a raw record.
 * Dimension values are replaced by dictionary ids, which are assigned on first sight.
 *
 * @param row input row holding a value for every dimension and metric
 * @throws Exception if the record cannot be written
 */
public void append(GenericRow row) throws Exception {
  DimensionBuffer dimensionIds = new DimensionBuffer(numDimensions);
  for (int dimIndex = 0; dimIndex < dimensionNames.size(); dimIndex++) {
    String dimensionName = dimensionNames.get(dimIndex);
    Map<Object, Integer> dictionary = dictionaryMap.get(dimensionName);

    Object value = row.getValue(dimensionName);
    if (value == null) {
      // TODO: Have another default value to represent STAR. Using default value to represent STAR
      // as of now.
      // It does not matter during query execution, since we know that values is STAR from the
      // star tree
      value = dimensionNameToStarValueMap.get(dimensionName);
    }

    Integer dictionaryId = dictionary.get(value);
    if (dictionaryId == null) {
      // First occurrence of this value: the next free id is the current dictionary size.
      dictionaryId = dictionary.size();
      dictionary.put(value, dictionaryId);
    }
    dimensionIds.setDimension(dimIndex, dictionaryId);
  }

  // initialize raw data row
  List<MetricFieldSpec> metricFieldSpecs = schema.getMetricFieldSpecs();
  Object[] metricValues = new Object[numMetrics];
  for (int metricIndex = 0; metricIndex < numMetrics; metricIndex++) {
    Object rawValue = row.getValue(metricNames.get(metricIndex));
    boolean isHll = metricFieldSpecs.get(metricIndex).getDerivedMetricType()
        == MetricFieldSpec.DerivedMetricType.HLL;
    // hll field is in string format, convert it to hll data type first;
    // no conversion for standard data types
    metricValues[metricIndex] = isHll ? HllUtil.convertStringToHll((String) rawValue) : rawValue;
  }

  appendToRawBuffer(dimensionIds, new MetricBuffer(metricValues, schema.getMetricFieldSpecs()));
}
/**
 * Writes one raw record to the record file and counts it.
 *
 * @param dimension dictionary ids of the record
 * @param metrics metric values of the record
 * @throws IOException if the write fails; the counter is left untouched in that case
 */
private void appendToRawBuffer(DimensionBuffer dimension, MetricBuffer metrics) throws IOException {
  appendToBuffer(dataBuffer, dimension, metrics);
  rawRecordCount += 1;
}
/**
 * Writes one aggregated record to the record file and counts it.
 *
 * @param dimension dictionary ids of the record
 * @param metrics aggregated metric values of the record
 * @throws IOException if the write fails; the counter is left untouched in that case
 */
private void appendToAggBuffer(DimensionBuffer dimension, MetricBuffer metrics) throws IOException {
  appendToBuffer(dataBuffer, dimension, metrics);
  aggRecordCount += 1;
}
/**
 * Serializes one record: every dimension id as an int, followed by the metric bytes.
 *
 * @param dos stream to write to
 * @param dimensions dictionary ids of the record
 * @param metricHolder metric values of the record
 * @throws IOException if the stream rejects the write
 */
private void appendToBuffer(DataOutputStream dos, DimensionBuffer dimensions,
    MetricBuffer metricHolder) throws IOException {
  int dimIndex = 0;
  while (dimIndex < numDimensions) {
    dos.writeInt(dimensions.getDimension(dimIndex));
    dimIndex++;
  }
  byte[] metricBytes = metricHolder.toBytes(metricSizeBytes);
  dos.write(metricBytes);
}
/**
 * Builds the star tree from the records appended so far and serializes it to
 * {@code star-tree.bin} in the output directory.
 * <p>
 * Steps: settle the skip-materialization set and the split order, sort the raw records,
 * construct the tree recursively, optionally split the leaves on the time column, append one
 * aggregated record per node and finally write the tree in the configured format.
 * <p>
 * The record stream is closed when this method returns, whether the build succeeded or not.
 *
 * @throws Exception if any step of the build fails
 */
public void build() throws Exception {
  try {
    if (skipMaterializationForDimensions == null || skipMaterializationForDimensions.isEmpty()) {
      skipMaterializationForDimensions = computeDefaultDimensionsToSkipMaterialization();
    }

    // For default split order, give preference to skipMaterializationForDimensions.
    // For user-defined split order, give preference to split-order.
    if (dimensionsSplitOrder == null || dimensionsSplitOrder.isEmpty()) {
      dimensionsSplitOrder = computeDefaultSplitOrder();
      dimensionsSplitOrder.removeAll(skipMaterializationForDimensions);
    } else {
      skipMaterializationForDimensions.removeAll(dimensionsSplitOrder);
    }
    LOG.info("Split order: {}", dimensionsSplitOrder);
    LOG.info("Skip Materilazitaion For Dimensions: {}", skipMaterializationForDimensions);

    long start = System.currentTimeMillis();
    // Push the buffered raw records to disk before the file-based sort reads them.
    dataBuffer.flush();

    // Sort the data based on default sort order (split order + remaining dimensions)
    sort(dataFile, 0, rawRecordCount);

    // Recursively construct the star tree, continuously sorting the data
    constructStarTree(starTreeRootIndexNode, 0, rawRecordCount, 0, dataFile);

    // Split the leaf nodes on time column. This is only possible if we have not split on time-column name
    // yet, and time column is still preserved (ie not replaced by StarTreeNode.all()).
    if (timeColumnName != null && !skipMaterializationForDimensions.contains(timeColumnName)
        && !dimensionsSplitOrder.contains(timeColumnName)) {
      splitLeafNodesOnTimeColumn();
    }

    // Create aggregate rows for all nodes in the tree
    createAggDocForAllNodes(starTreeRootIndexNode);
    long end = System.currentTimeMillis();
    LOG.info("Took {} ms to build star tree index. Original records:{} Materialized record:{}",
        (end - start), rawRecordCount, aggRecordCount);

    starTree = new StarTree(starTreeRootIndexNode, dimensionNameToIndexMap);
    File treeBinary = new File(outDir, "star-tree.bin");
    if (enableOffHeapFormat) {
      LOG.info("Saving tree in off-heap binary format at: {} ", treeBinary);
      StarTreeSerDe.writeTreeOffHeapFormat(starTree, treeBinary);
    } else {
      LOG.info("Saving tree in on-heap binary at: {} ", treeBinary);
      StarTreeSerDe.writeTreeOnHeapFormat(starTree, treeBinary);
    }

    printTree(starTreeRootIndexNode, 0);
    LOG.info("Finished build tree. out dir: {} ", outDir);
  } finally {
    // Release the file handle even when the build fails part-way through; on success this
    // also flushes the aggregated records appended last.
    dataBuffer.close();
  }
}
/**
 * Appends one aggregated record for the given node and, recursively, for every node below it.
 * The traversal is a recursive depth-first walk in which children are handled before their
 * parent, so a parent's metrics can be folded from its children's results.
 * <ul>
 * <li>Leaf node: the metrics of the records between the node's start and end document ids are
 * read from the data file and aggregated.</li>
 * <li>Non-leaf node: the metrics returned for its children are aggregated, leaving out the
 * star child.</li>
 * </ul>
 * The appended record carries the node's path values in the corresponding dimensions and ALL
 * everywhere else; its position is stored on the node as the aggregated document id.
 *
 * @param node root of the subtree to process
 * @return the aggregated metrics of {@code node}
 * @throws Exception if reading from or appending to the data file fails
 */
private MetricBuffer createAggDocForAllNodes(StarTreeIndexNode node) throws Exception {
MetricBuffer aggMetricBuffer = null;
if (node.isLeaf()) {
StarTreeDataTable leafDataTable =
new StarTreeDataTable(dataFile, dimensionSizeBytes, metricSizeBytes, null);
Iterator<Pair<byte[], byte[]>> iterator =
leafDataTable.iterator(node.getStartDocumentId(), node.getEndDocumentId());
// NOTE(review): next() is called without hasNext(); this assumes a leaf always covers at
// least one record.
Pair<byte[], byte[]> first = iterator.next();
aggMetricBuffer = MetricBuffer.fromBytes(first.getRight(), schema.getMetricFieldSpecs());
while (iterator.hasNext()) {
Pair<byte[], byte[]> next = iterator.next();
MetricBuffer metricBuffer = MetricBuffer.fromBytes(next.getRight(), schema.getMetricFieldSpecs());
aggMetricBuffer.aggregate(metricBuffer);
}
} else {
Iterator<StarTreeIndexNode> childrenIterator = node.getChildrenIterator();
while (childrenIterator.hasNext()) {
StarTreeIndexNode child = childrenIterator.next();
// The recursive call runs for every child, star child included, so each of them gets its
// own aggregated record.
MetricBuffer childMetricBuffer = createAggDocForAllNodes(child);
// don't use the star node value to compute aggregate for the parent
if (child.getDimensionValue() == StarTreeIndexNodeInterf.ALL) {
continue;
}
// Copy the first child's buffer so that aggregating into it does not alter the child's result.
if (aggMetricBuffer == null) {
aggMetricBuffer = new MetricBuffer(childMetricBuffer);
} else {
aggMetricBuffer.aggregate(childMetricBuffer);
}
}
// NOTE(review): aggMetricBuffer stays null here if a non-leaf node has no non-star child.
}
//compute the dimension values for this node using the path, can be optimized by passing the path in the method call.
Map<Integer, Integer> pathValues = node.getPathValues();
DimensionBuffer dimensionBuffer = new DimensionBuffer(numDimensions);
for (int i = 0; i < numDimensions; i++) {
if (pathValues.containsKey(i)) {
dimensionBuffer.setDimension(i, pathValues.get(i));
} else {
dimensionBuffer.setDimension(i, StarTreeIndexNodeInterf.ALL);
}
}
// The record appended next lands right after all raw and aggregated records written so far.
node.setAggregatedDocumentId(rawRecordCount + aggRecordCount);
appendToAggBuffer(dimensionBuffer, aggMetricBuffer);
return aggMetricBuffer;
}
/**
 * Helper method that visits each leaf node (breadth first) and does the following:
 * - Re-orders the records of the leaf so that, below the path to the leaf, they are sorted on
 *   the time column first.
 * - Creates one child node under the leaf for each time value, covering that value's range.
 * Children created here are not visited again. The data sorter is closed when the method
 * returns, whether normally or with an exception.
 *
 * @throws Exception if sorting or grouping the records fails
 */
private void splitLeafNodesOnTimeColumn() throws Exception {
  Queue<StarTreeIndexNode> nodes = new LinkedList<>();
  nodes.add(starTreeRootIndexNode);
  StarTreeDataSorter dataSorter = new StarTreeDataSorter(dataFile, dimensionSizeBytes, metricSizeBytes);
  try {
    while (!nodes.isEmpty()) {
      StarTreeIndexNode node = nodes.remove();

      if (!node.isLeaf()) {
        Iterator<StarTreeIndexNode> childrenIterator = node.getChildrenIterator();
        while (childrenIterator.hasNext()) {
          nodes.add(childrenIterator.next());
        }
        continue;
      }

      // If we have time column, split on time column, helps in time based filtering
      if (timeColumnName == null) {
        continue;
      }

      int level = node.getLevel();
      int[] newSortOrder = moveColumnInSortOrder(timeColumnName, getSortOrder(), level);
      int startDocId = node.getStartDocumentId();
      int endDocId = node.getEndDocumentId();
      dataSorter.sort(startDocId, endDocId, newSortOrder);

      int timeColIndex = dimensionNameToIndexMap.get(timeColumnName);
      Map<Integer, IntPair> timeColumnRangeMap =
          dataSorter.groupByIntColumnCount(startDocId, endDocId, timeColIndex);

      node.setChildDimensionName(timeColIndex);
      node.setChildren(new HashMap<Integer, StarTreeIndexNode>());
      for (Map.Entry<Integer, IntPair> entry : timeColumnRangeMap.entrySet()) {
        int timeValue = entry.getKey();
        IntPair range = entry.getValue();
        StarTreeIndexNode child = new StarTreeIndexNode();
        child.setDimensionName(timeColIndex);
        child.setDimensionValue(timeValue);
        child.setParent(node);
        child.setLevel(level + 1);
        child.setStartDocumentId(range.getLeft());
        child.setEndDocumentId(range.getRight());
        node.addChild(child, timeValue);
      }
    }
  } finally {
    // Previously skipped when sorting or grouping threw, leaving the sorter open.
    dataSorter.close();
  }
}
/**
 * Helper method that returns a copy of the sort order in which the given column sits at the
 * specified position. The entries before that position are kept as they are, and the remaining
 * entries follow the moved column in their original relative order.
 * <p>
 * The column must not already be among the first {@code newPositionForTimeColumn} entries of
 * {@code origSortOrder}; in that case the result would need one slot too many.
 *
 * @param columnToMove name of the dimension to move, must not be null
 * @param origSortOrder dimension indices in their current sort order; not modified
 * @param newPositionForTimeColumn position the column takes in the result, within
 *        {@code [0, origSortOrder.length)}
 * @return a new array holding the re-arranged sort order
 */
private int[] moveColumnInSortOrder(String columnToMove, int[] origSortOrder,
    int newPositionForTimeColumn) {
  Preconditions.checkArgument(columnToMove != null);
  Preconditions.checkArgument(
      newPositionForTimeColumn >= 0 && newPositionForTimeColumn < origSortOrder.length);

  int movedDimensionIndex = dimensionNameToIndexMap.get(columnToMove);
  int[] newSortOrder = new int[origSortOrder.length];
  int index = 0;

  // Retain the sort order based on the path to this leaf node
  for (int i = 0; i < newPositionForTimeColumn; i++) {
    newSortOrder[index++] = origSortOrder[i];
  }

  // Place the moved column right after the retained prefix
  newSortOrder[index++] = movedDimensionIndex;

  // Append remaining columns. The entry to leave out is identified by the dimension index
  // stored at the position, not by the position itself: the two only coincide when the moved
  // column happens to be the last dimension and the last entry of the sort order.
  for (int i = newPositionForTimeColumn; i < origSortOrder.length; i++) {
    if (origSortOrder[i] != movedDimensionIndex) {
      newSortOrder[index++] = origSortOrder[i];
    }
  }
  return newSortOrder;
}
/**
 * Debug method to print the tree: logs one debug line per node, indented by its depth, then
 * recurses into the children. Does nothing when debug logging is disabled, so the tree is not
 * traversed and formatted for output that would be dropped.
 *
 * @param node root of the subtree to print
 * @param level depth of {@code node}, used for the indentation and reported in the line
 */
private void printTree(StarTreeIndexNode node, int level) {
  if (!LOG.isDebugEnabled()) {
    return;
  }

  // Build the indentation as a prefix of the node's own line; logging each blank separately
  // produces extra log lines instead of indentation.
  StringBuilder indent = new StringBuilder();
  for (int i = 0; i < level; i++) {
    indent.append(' ');
  }

  BiMap<Integer, String> inverse = dimensionNameToIndexMap.inverse();
  String dimName = "ALL";
  Object dimValue = "ALL";
  if (node.getDimensionName() != StarTreeIndexNodeInterf.ALL) {
    dimName = inverse.get(node.getDimensionName());
  }
  if (node.getDimensionValue() != StarTreeIndexNodeInterf.ALL) {
    dimValue = dictionaryMap.get(dimName).inverse().get(node.getDimensionValue());
  }

  String formattedOutput = Objects.toStringHelper(node).add("nodeId", node.getNodeId())
      .add("level", level).add("dimensionName", dimName).add("dimensionValue", dimValue)
      .add("childDimensionName", inverse.get(node.getChildDimensionName()))
      .add("childCount", node.getNumChildren())
      .add("startDocumentId", node.getStartDocumentId())
      .add("endDocumentId", node.getEndDocumentId())
      .add("documentCount", (node.getEndDocumentId() - node.getStartDocumentId())).toString();
  // Pass the text as an argument so that braces in values are never read as placeholders.
  LOG.debug("{}{}", indent, formattedOutput);

  if (!node.isLeaf()) {
    Iterator<StarTreeIndexNode> childrenIterator = node.getChildrenIterator();
    while (childrenIterator.hasNext()) {
      printTree(childrenIterator.next(), level + 1);
    }
  }
}
/**
 * Derives the split order used when the configuration supplies none: every dimension that is
 * not skipped from materialization, without the time column, ordered by dictionary size from
 * largest to smallest.
 * Assumes that skipMaterializationForDimensions is built; when it is null no dimension is
 * selected.
 *
 * @return a new, modifiable list of dimension names
 */
private List<String> computeDefaultSplitOrder() {
  ArrayList<String> splitOrder = new ArrayList<>();

  if (skipMaterializationForDimensions != null) {
    for (String candidate : dimensionNames) {
      if (!skipMaterializationForDimensions.contains(candidate)) {
        splitOrder.add(candidate);
      }
    }
  }

  // include only the dimensions not time column
  if (timeColumnName != null) {
    splitOrder.remove(timeColumnName);
  }

  Collections.sort(splitOrder, new Comparator<String>() {
    @Override
    public int compare(String left, String right) {
      int leftCardinality = dictionaryMap.get(left).size();
      int rightCardinality = dictionaryMap.get(right).size();
      return Integer.compare(rightCardinality, leftCardinality); // descending
    }
  });
  return splitOrder;
}
/**
 * Selects the dimensions that are skipped from materialization by default: those whose
 * dictionary holds more entries than the configured cardinality threshold.
 *
 * @return a new, modifiable set of dimension names
 */
private Set<String> computeDefaultDimensionsToSkipMaterialization() {
  Set<String> highCardinalityDimensions = new HashSet<String>();
  for (String candidate : dimensionNames) {
    int cardinality = dictionaryMap.get(candidate).size();
    if (cardinality > skipMaterializationCardinalityThreshold) {
      highCardinalityDimensions.add(candidate);
    }
  }
  return highCardinalityDimensions;
}
/**
 * Sorts the records of the given document range of the file on all dimensions, following the
 * builder's sort order. In debug mode the range is logged before and after sorting.
 *
 * @param file record file to sort in
 * @param startDocId first document id of the range
 * @param endDocId end document id of the range
 * @throws IOException if the file cannot be read or written
 */
private void sort(File file, int startDocId, int endDocId) throws IOException {
  if (debugMode) {
    LOG.info("BEFORE SORTING");
    printFile(file, startDocId, endDocId);
  }

  new StarTreeDataTable(file, dimensionSizeBytes, metricSizeBytes, getSortOrder())
      .sort(startDocId, endDocId);

  if (debugMode) {
    LOG.info("AFTER SORTING");
    printFile(file, startDocId, endDocId);
  }
}
/**
 * Returns the dimension indices in the order records are sorted by: the dimensions of the
 * split order first, then every other dimension in the order of dimensionNames.
 * Computed on first use and cached afterwards.
 *
 * @return the cached sort order array
 */
private int[] getSortOrder() {
  if (sortOrder != null) {
    return sortOrder;
  }

  sortOrder = new int[dimensionNames.size()];
  int position = 0;
  for (String splitDimension : dimensionsSplitOrder) {
    sortOrder[position++] = dimensionNameToIndexMap.get(splitDimension);
  }
  // add remaining dimensions that were not part of dimensionsSplitOrder
  for (String dimensionName : dimensionNames) {
    if (dimensionsSplitOrder.contains(dimensionName)) {
      continue;
    }
    sortOrder[position++] = dimensionNameToIndexMap.get(dimensionName);
  }
  return sortOrder;
}
/**
 * Debug helper that logs the records of the given document range of a file, stopping after
 * the first 100 records.
 *
 * @param file record file to read
 * @param startDocId first document id of the range
 * @param endDocId end document id of the range
 * @throws IOException if the file cannot be read
 */
private void printFile(File file, int startDocId, int endDocId) throws IOException {
  LOG.info("Contents of file:{} from:{} to:{}", file.getName(), startDocId, endDocId);
  StarTreeDataTable dataTable =
      new StarTreeDataTable(file, dimensionSizeBytes, metricSizeBytes, getSortOrder());
  Iterator<Pair<byte[], byte[]>> iterator = dataTable.iterator(startDocId, endDocId);

  final int maxRecordsToPrint = 100;
  // Bound the loop by the number of records already printed; comparing a post-incremented
  // counter for equality with the limit lets one record too many through.
  for (int printed = 0; printed < maxRecordsToPrint && iterator.hasNext(); printed++) {
    Pair<byte[], byte[]> next = iterator.next();
    LOG.info("{}, {}", DimensionBuffer.fromBytes(next.getLeft()),
        MetricBuffer.fromBytes(next.getRight(), schema.getMetricFieldSpecs()));
  }
}
/**
 * Recursively builds the subtree below {@code node} for the records in the given document
 * range, splitting on the dimension found at position {@code level} of the split order.
 * <p>
 * For every value of the split dimension a child node is created; a child whose range is
 * larger than maxLeafRecords is split further on the next level. Unless star node creation is
 * skipped for the split dimension, a star child is added as well: the unique combinations of
 * the range with the split dimension removed are appended to the data file as aggregated
 * records, and the star child is split further on those records when there are enough of them.
 *
 * @param node node to attach the children to
 * @param startDocId start of the document range covered by {@code node}
 * @param endDocId end of the document range covered by {@code node}
 * @param level index into the split order of the dimension to split on
 * @param file file holding the records of the range
 * @return number of aggregated records appended while building this subtree; 0 when the split
 *         order is exhausted
 * @throws Exception if reading, sorting or appending records fails
 */
private int constructStarTree(StarTreeIndexNode node, int startDocId, int endDocId, int level,
File file) throws Exception {
// node.setStartDocumentId(startDocId);
int docsAdded = 0;
// Every dimension of the split order has been used: nothing left to split on.
if (level == dimensionsSplitOrder.size()) {
return 0;
}
String splitDimensionName = dimensionsSplitOrder.get(level);
Integer splitDimensionId = dimensionNameToIndexMap.get(splitDimensionName);
LOG.debug(
"Building tree at level:{} using file:{} from startDoc:{} endDocId:{} splitting on dimension:{}",
level, file.getName(), startDocId, endDocId, splitDimensionName);
// Maps each value of the split dimension to the document range holding it.
Map<Integer, IntPair> sortGroupBy = groupBy(startDocId, endDocId, splitDimensionId, file);
LOG.debug("Group stats:{}", sortGroupBy);
node.setChildDimensionName(splitDimensionId);
node.setChildren(new HashMap<Integer, StarTreeIndexNode>());
for (int childDimensionValue : sortGroupBy.keySet()) {
StarTreeIndexNode child = new StarTreeIndexNode();
child.setDimensionName(splitDimensionId);
child.setDimensionValue(childDimensionValue);
child.setParent(node);
child.setLevel(node.getLevel() + 1);
// n.b. We will number the nodes later using BFS after fully split
// Add child to parent
node.addChild(child, childDimensionValue);
int childDocs = 0;
IntPair range = sortGroupBy.get(childDimensionValue);
if (range.getRight() - range.getLeft() > maxLeafRecords) {
childDocs = constructStarTree(child, range.getLeft(), range.getRight(), level + 1, file);
docsAdded += childDocs;
}
// Either range <= maxLeafRecords, or we did not split further (last level).
// This also applies when the child was split but no aggregated record was added below it.
if (childDocs == 0) {
child.setStartDocumentId(range.getLeft());
child.setEndDocumentId(range.getRight());
}
}
// Return if star node does not need to be created.
if (skipStarNodeCreationForDimensions != null
&& skipStarNodeCreationForDimensions.contains(splitDimensionName)) {
return docsAdded;
}
// create star node
StarTreeIndexNode starChild = new StarTreeIndexNode();
starChild.setDimensionName(splitDimensionId);
starChild.setDimensionValue(StarTreeIndexNodeInterf.ALL);
starChild.setParent(node);
starChild.setLevel(node.getLevel() + 1);
// n.b. We will number the nodes later using BFS after fully split
// Add child to parent
node.addChild(starChild, StarTreeIndexNodeInterf.ALL);
Iterator<Pair<DimensionBuffer, MetricBuffer>> iterator =
uniqueCombinations(startDocId, endDocId, file, splitDimensionId);
int rowsAdded = 0;
// The star child's records start right after everything written to the data file so far.
int startOffset = rawRecordCount + aggRecordCount;
while (iterator.hasNext()) {
Pair<DimensionBuffer, MetricBuffer> next = iterator.next();
DimensionBuffer dimension = next.getLeft();
MetricBuffer metricsHolder = next.getRight();
LOG.debug("Adding row:{}", dimension);
appendToAggBuffer(dimension, metricsHolder);
rowsAdded++;
}
docsAdded += rowsAdded;
LOG.debug("Added {} additional records at level {}", rowsAdded, level);
// flush
// (presumably so that the sort and the recursive call below, which read dataFile, see the
// records just appended through the buffered stream)
dataBuffer.flush();
int childDocs = 0;
// NOTE(review): this threshold is inclusive (>=) while the one used for regular children
// above is exclusive (>) - confirm which one is intended.
if (rowsAdded >= maxLeafRecords) {
sort(dataFile, startOffset, startOffset + rowsAdded);
childDocs =
constructStarTree(starChild, startOffset, startOffset + rowsAdded, level + 1, dataFile);
docsAdded += childDocs;
}
// Either rowsAdded < maxLeafRecords, or we did not split further (last level).
if (childDocs == 0) {
starChild.setStartDocumentId(startOffset);
starChild.setEndDocumentId(startOffset + rowsAdded);
}
// node.setEndDocumentId(endDocId + docsAdded);
return docsAdded;
}
/**
 * Returns the unique dimension combinations of a document range after removing a specified
 * dimension, with the metrics of each combination aggregated through
 * {@code MetricBuffer.aggregate}.
 * <p>
 * The records of the range are copied to a temporary file in the output directory with the
 * split dimension, and every dimension skipped from materialization, replaced by ALL. The
 * temporary file is then sorted, and the returned iterator merges neighbouring records with
 * equal dimensions while reading it.
 *
 * @param startDocId start of the document range to read
 * @param endDocId end of the document range to read
 * @param file file holding the records of the range
 * @param splitDimensionId index of the dimension to remove
 * @return iterator over the unique combinations and their aggregated metrics
 * @throws Exception if the temporary file cannot be written or sorted
 */
private Iterator<Pair<DimensionBuffer, MetricBuffer>> uniqueCombinations(int startDocId,
int endDocId, File file, int splitDimensionId) throws Exception {
StarTreeDataTable dataSorter =
new StarTreeDataTable(file, dimensionSizeBytes, metricSizeBytes, getSortOrder());
Iterator<Pair<byte[], byte[]>> iterator1 = dataSorter.iterator(startDocId, endDocId);
// NOTE(review): the temporary file is not deleted here; it goes away only when cleanup()
// deletes the whole output directory.
File tempFile =
new File(outDir, file.getName() + "_" + startDocId + "_" + endDocId + ".unique.tmp");
DataOutputStream dos =
new DataOutputStream(new BufferedOutputStream(new FileOutputStream(tempFile)));
while (iterator1.hasNext()) {
Pair<byte[], byte[]> next = iterator1.next();
byte[] dimensionBuffer = next.getLeft();
byte[] metricBuffer = next.getRight();
DimensionBuffer dimensions = DimensionBuffer.fromBytes(dimensionBuffer);
for (int i = 0; i < numDimensions; i++) {
String dimensionName = dimensionNameToIndexMap.inverse().get(i);
// Blank out the split dimension and the dimensions that are not materialized.
if (i == splitDimensionId || (skipMaterializationForDimensions != null
&& skipMaterializationForDimensions.contains(dimensionName))) {
dos.writeInt(StarTreeIndexNodeInterf.ALL);
} else {
dos.writeInt(dimensions.getDimension(i));
}
}
// Metrics are copied unchanged; aggregation happens while iterating below.
dos.write(metricBuffer);
}
// NOTE(review): not reached when a read or write above throws, which leaves the stream open.
dos.close();
dataSorter =
new StarTreeDataTable(tempFile, dimensionSizeBytes, metricSizeBytes, getSortOrder());
// Document ids in the temporary file start at 0.
dataSorter.sort(0, endDocId - startDocId);
if (debugMode) {
printFile(tempFile, 0, endDocId - startDocId);
}
final Iterator<Pair<byte[], byte[]>> iterator = dataSorter.iterator(0, endDocId - startDocId);
return new Iterator<Pair<DimensionBuffer, MetricBuffer>>() {
// Combination currently being accumulated; handed out once a different one shows up.
Pair<DimensionBuffer, MetricBuffer> prev = null;
// Set once the last combination has been handed out.
boolean done = false;
@Override
public void remove() {
throw new UnsupportedOperationException();
}
// NOTE(review): reports true until next() has returned the last combination, also for an
// empty range; next() would then fail on the null prev.
@Override
public boolean hasNext() {
return !done;
}
@Override
public Pair<DimensionBuffer, MetricBuffer> next() {
while (iterator.hasNext()) {
Pair<byte[], byte[]> next = iterator.next();
byte[] dimBuffer = next.getLeft();
byte[] metricBuffer = next.getRight();
if (prev == null) {
prev = Pair.of(DimensionBuffer.fromBytes(dimBuffer),
MetricBuffer.fromBytes(metricBuffer, schema.getMetricFieldSpecs()));
} else {
Pair<DimensionBuffer, MetricBuffer> current =
Pair.of(DimensionBuffer.fromBytes(dimBuffer),
MetricBuffer.fromBytes(metricBuffer, schema.getMetricFieldSpecs()));
if (!current.getLeft().equals(prev.getLeft())) {
// A new combination starts: hand out the finished one and keep accumulating the new one.
Pair<DimensionBuffer, MetricBuffer> ret = prev;
prev = current;
LOG.debug("Returning unique {}", prev.getLeft());
return ret;
} else {
prev.getRight().aggregate(current.getRight());
}
}
}
// Input exhausted: the combination still being accumulated is the last one.
done = true;
LOG.debug("Returning unique {}", prev.getLeft());
return prev;
}
};
}
/**
 * Groups the records of a document range by the value of one dimension column. Assumes the
 * data is already sorted on this dimension from start to end doc id.
 *
 * @param startDocId start of the document range
 * @param endDocId end of the document range
 * @param dimension index of the dimension column to group by
 * @param file file holding the records of the range
 * @return for each value of the dimension, the document range holding it
 */
private Int2ObjectMap<IntPair> groupBy(int startDocId, int endDocId, Integer dimension, File file) {
  int[] order = getSortOrder();
  StarTreeDataTable dataTable = new StarTreeDataTable(file, dimensionSizeBytes, metricSizeBytes, order);
  Int2ObjectMap<IntPair> rangesByValue =
      dataTable.groupByIntColumnCount(startDocId, endDocId, dimension);
  return rangesByValue;
}
/**
 * Iterates over the records of the data file from startDocId to endDocId, decoding each one
 * into a {@link GenericRow}. The returned iterator does not support removal.
 */
@Override
public Iterator<GenericRow> iterator(final int startDocId, final int endDocId) throws Exception {
  final Iterator<Pair<byte[], byte[]>> rawRecords =
      new StarTreeDataTable(dataFile, dimensionSizeBytes, metricSizeBytes, getSortOrder())
          .iterator(startDocId, endDocId);

  return new Iterator<GenericRow>() {
    @Override
    public boolean hasNext() {
      return rawRecords.hasNext();
    }

    @Override
    public GenericRow next() {
      Pair<byte[], byte[]> record = rawRecords.next();
      return toGenericRow(DimensionBuffer.fromBytes(record.getLeft()),
          MetricBuffer.fromBytes(record.getRight(), schema.getMetricFieldSpecs()));
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }
  };
}
/**
 * Renders the tree, starting at the root node, as nested JSON objects.
 *
 * @return JSON object describing the root node and, recursively, its children
 * @throws Exception if a JSON value cannot be stored
 */
public JSONObject getStarTreeAsJSON() throws Exception {
  JSONObject rootJson = new JSONObject();
  toJson(rootJson, starTreeRootIndexNode, dictionaryMap);
  return rootJson;
}
/**
 * Fills {@code json} with the description of {@code node}: a "title" of the form
 * {@code dimensionName:dimensionValue} (with "ALL" for star entries) and, when the node
 * provides a children iterator, a "nodes" array holding the descriptions of its children.
 *
 * @param json object to fill
 * @param node node to describe
 * @param dictionaryMap per-dimension dictionaries used to decode the dimension value
 * @throws Exception if a JSON value cannot be stored
 */
private void toJson(JSONObject json, StarTreeIndexNode node,
    Map<String, HashBiMap<Object, Integer>> dictionaryMap) throws Exception {
  String dimName = node.getDimensionName() == StarTreeIndexNodeInterf.ALL
      ? "ALL"
      : dimensionNames.get(node.getDimensionName());
  Object dimValue = node.getDimensionValue() == StarTreeIndexNodeInterf.ALL
      ? "ALL"
      : dictionaryMap.get(dimName).inverse().get(node.getDimensionValue());
  json.put("title", dimName + ":" + dimValue);

  Iterator<StarTreeIndexNode> children = node.getChildrenIterator();
  if (children == null) {
    return;
  }

  JSONObject[] childJsons = new JSONObject[node.getNumChildren()];
  for (int slot = 0; children.hasNext(); slot++) {
    StarTreeIndexNode childNode = children.next();
    JSONObject childJson = new JSONObject();
    toJson(childJson, childNode, dictionaryMap);
    childJsons[slot] = childJson;
  }
  json.put("nodes", childJsons);
}
/**
 * Deletes the output directory together with everything in it; failures are ignored.
 * Does nothing when no output directory has been set.
 */
@Override
public void cleanup() {
  if (outDir == null) {
    return;
  }
  FileUtils.deleteQuietly(outDir);
}
/** Returns the star tree created by build(); null as long as build() has not created it. */
@Override
public StarTree getTree() {
return starTree;
}
/** Returns the number of raw records appended so far. */
@Override
public int getTotalRawDocumentCount() {
return rawRecordCount;
}
/** Returns the number of aggregated records appended so far. */
@Override
public int getTotalAggregateDocumentCount() {
return aggRecordCount;
}
/** Returns the configured maximum number of records per leaf. */
@Override
public int getMaxLeafRecords() {
return maxLeafRecords;
}
/** Returns the split order in use; this is the internal list, not a copy. */
@Override
public List<String> getDimensionsSplitOrder() {
return dimensionsSplitOrder;
}
/** Returns the per-dimension dictionaries (value <-> dictionary id); the internal map, not a copy. */
public Map<String, HashBiMap<Object, Integer>> getDictionaryMap() {
return dictionaryMap;
}
/** Returns the mapping between dimension names and dimension indices; the internal map, not a copy. */
public HashBiMap<String, Integer> getDimensionNameToIndexMap() {
return dimensionNameToIndexMap;
}
/** Returns the dimensions skipped from materialization; this is the internal set, not a copy. */
@Override
public Set<String> getSkipMaterializationForDimensions() {
return skipMaterializationForDimensions;
}
}