StarTreeIndexOperator.java example

Explorer
pinot-master
/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.core.operator.filter;

import com.google.common.collect.HashBiMap;
import com.linkedin.pinot.common.request.BrokerRequest;
import com.linkedin.pinot.common.request.GroupBy;
import com.linkedin.pinot.common.utils.request.FilterQueryTree;
import com.linkedin.pinot.common.utils.request.RequestUtils;
import com.linkedin.pinot.core.common.BlockDocIdIterator;
import com.linkedin.pinot.core.common.BlockId;
import com.linkedin.pinot.core.common.DataSource;
import com.linkedin.pinot.core.common.DataSourceMetadata;
import com.linkedin.pinot.core.common.Operator;
import com.linkedin.pinot.core.common.Predicate;
import com.linkedin.pinot.core.indexsegment.IndexSegment;
import com.linkedin.pinot.core.operator.blocks.BaseFilterBlock;
import com.linkedin.pinot.core.operator.dociditerators.BitmapDocIdIterator;
import com.linkedin.pinot.core.operator.docidsets.FilterBlockDocIdSet;
import com.linkedin.pinot.core.operator.filter.predicate.PredicateEvaluator;
import com.linkedin.pinot.core.operator.filter.predicate.PredicateEvaluatorProvider;
import com.linkedin.pinot.core.segment.index.readers.Dictionary;
import com.linkedin.pinot.core.startree.StarTreeIndexNodeInterf;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import org.roaringbitmap.buffer.MutableRoaringBitmap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


public class StarTreeIndexOperator extends BaseFilterOperator {
  private static final Logger LOGGER = LoggerFactory.getLogger(StarTreeIndexOperator.class);
  private static final String OPERATOR_NAME = "StarTreeIndexOperator";

  private final int numRawDocs;
  private IndexSegment segment;

  // Predicates map
  Map<String, PredicateEntry> predicatesMap;

  // Group by columns
  Set<String> groupByColumns;

  // Columns with predicate on them
  Set<String> predicateColumns;

  boolean emptyResult = false;
  private BrokerRequest brokerRequest;

  public StarTreeIndexOperator(IndexSegment segment, BrokerRequest brokerRequest) {
    this.segment = segment;
    numRawDocs = segment.getSegmentMetadata().getTotalRawDocs();
    this.brokerRequest = brokerRequest;
    predicateColumns = new HashSet<>();
    groupByColumns = new HashSet<>();
    predicatesMap = new HashMap<>();
    initPredicatesToEvaluate();
  }

  private void initPredicatesToEvaluate() {
    FilterQueryTree filterTree = RequestUtils.generateFilterQueryTree(brokerRequest);
    // Find all filter columns
    if (filterTree != null) {
      if (filterTree.getChildren() != null && !filterTree.getChildren().isEmpty()) {
        for (FilterQueryTree childFilter : filterTree.getChildren()) {
          // Nested filters are not supported
          assert childFilter.getChildren() == null || childFilter.getChildren().isEmpty();
          processFilterTree(childFilter);
        }
      } else {
        processFilterTree(filterTree);
      }
    }
    // Group by columns, we cannot lose group by columns during traversal
    GroupBy groupBy = brokerRequest.getGroupBy();
    if (groupBy != null) {
      groupByColumns.addAll(groupBy.getColumns());
    }
  }

  private void processFilterTree(FilterQueryTree childFilter) {
    String column = childFilter.getColumn();
    // Only equality predicates are supported
    Predicate predicate = Predicate.newPredicate(childFilter);
    DataSource dataSource = segment.getDataSource(column);
    Dictionary dictionary = dataSource.getDictionary();
    PredicateEntry predicateEntry = null;

    PredicateEvaluator predicateEvaluator =
        PredicateEvaluatorProvider.getPredicateFunctionFor(predicate, dataSource);

    // If dictionary does not have any values that satisfy the predicate, set emptyResults to
    // true.
    if (predicateEvaluator.alwaysFalse()) {
      emptyResult = true;
    }

    // Store this predicate, we will have to apply it later
    predicateEntry = new PredicateEntry(predicate, predicateEvaluator);
    predicateColumns.add(column);
    predicatesMap.put(column, predicateEntry);
  }

  @Override
  public boolean open() {
    return true;
  }

  @Override
  public boolean close() {
    return true;
  }

  @Override
  public BaseFilterBlock nextFilterBlock(BlockId blockId) {
    MutableRoaringBitmap finalResult = null;
    if (emptyResult) {
      finalResult = new MutableRoaringBitmap();
      final BitmapDocIdIterator bitmapDocIdIterator =
          new BitmapDocIdIterator(finalResult.getIntIterator());
      return createBaseFilterBlock(bitmapDocIdIterator);
    }

    List<BaseFilterOperator> matchingLeafOperators = buildMatchingLeafOperators();
    if (matchingLeafOperators.size() == 1) {
      BaseFilterOperator baseFilterOperator = (BaseFilterOperator) matchingLeafOperators.get(0);
      return baseFilterOperator.nextFilterBlock(blockId);
    } else {
      CompositeOperator compositeOperator = new CompositeOperator(matchingLeafOperators);
      return compositeOperator.nextFilterBlock(blockId);
    }
  }

  @Override
  public boolean isResultEmpty() {
    return false;
  }

  /**
   * Helper method to build a list of operators for matching leaf nodes.
   * - Finds all leaf nodes that match the predicates
   * - Iterates over all the matching leaf nodes, and generate a list of matching ranges
   * @return
   */
  private List<BaseFilterOperator> buildMatchingLeafOperators() {
    int totalDocsToScan = 0;
    int numExactlyMatched = 0;
    long start = System.currentTimeMillis();

    final MutableRoaringBitmap exactlyMatchedDocsBitmap = new MutableRoaringBitmap();
    Queue<SearchEntry> matchedEntries = findMatchingLeafNodes();

    // Iterate over the matching nodes. For each column, generate the list of ranges.
    List<BaseFilterOperator> matchingLeafOperators = new ArrayList<>();
    for (SearchEntry matchedEntry : matchedEntries) {
      BaseFilterOperator matchingLeafOperator = null;
      StarTreeIndexNodeInterf matchedLeafNode = matchedEntry.starTreeIndexnode;

      int startDocId = matchedLeafNode.getStartDocumentId();
      int endDocId = matchedLeafNode.getEndDocumentId();

      if (matchedEntry.remainingPredicateColumns.isEmpty()) {
        // No more filters to apply
        // Use aggregated doc for this leaf node if possible.
        int aggregatedDocumentId = matchedLeafNode.getAggregatedDocumentId();
        if (isValidAggregatedDocId(aggregatedDocumentId) && matchedEntry.remainingGroupByColumns.isEmpty()) {
          exactlyMatchedDocsBitmap.add(aggregatedDocumentId);
          numExactlyMatched = numExactlyMatched + 1;
        } else {
          // Have to scan all the documents under this leaf node
          exactlyMatchedDocsBitmap.add(startDocId, endDocId);
          numExactlyMatched += (endDocId - startDocId);
        }
      } else {
        Map<String, PredicateEntry> remainingPredicatesMap = computeRemainingPredicates(matchedEntry);
        List<BaseFilterOperator> filterOperators =
            createFilterOperatorsForRemainingPredicates(matchedEntry, remainingPredicatesMap);

        if (filterOperators.size() == 0) {
          // The predicates are applied, but we cannot use aggregated doc, as we might have lost
          // the group by dimensions, in the aggregated doc.
          exactlyMatchedDocsBitmap.add(startDocId, endDocId);
          numExactlyMatched += (endDocId - startDocId);
        } else if (filterOperators.size() == 1) {
          matchingLeafOperator = filterOperators.get(0);
        } else {
          matchingLeafOperator = new AndOperator(filterOperators);
        }
        if (matchingLeafOperator != null) {
          matchingLeafOperators.add(matchingLeafOperator);
        }
      }

      totalDocsToScan += (endDocId - startDocId);
      LOGGER.debug("{}", matchedLeafNode);
    }

    // Add an operator for exactlyMatchedDocs
    if (numExactlyMatched > 0) {
      matchingLeafOperators.add(createFilterOperator(exactlyMatchedDocsBitmap));
      totalDocsToScan += numExactlyMatched;
    }

    long end = System.currentTimeMillis();
    LOGGER.debug("Found {} matching leaves, took {} ms to create remaining filter operators. Total docs to scan:{}",
        matchedEntries.size(), (end - start), totalDocsToScan);
    return matchingLeafOperators;
  }

  /**
   * Returns true if aggregated doc id is valid, ie >= numRawDocs, false otherwise.
   *
   * This is a temporary fix to handle aggredated doc id's being de-serialized as 0 instead of -1
   * for older segments that do not have this field. This will be resolved once we implement the
   * on-disk format for star-tree along with its own serializer/deserializer, and versioning.
   *
   * @param aggregatedDocumentId
   * @return
   */
  private boolean isValidAggregatedDocId(int aggregatedDocumentId) {
    return (aggregatedDocumentId >= numRawDocs);
  }

  private BaseFilterOperator createFilterOperator(final MutableRoaringBitmap answer) {
    return new BaseFilterOperator() {
      private static final String OPERATOR_NAME = "Anonymous";

      @Override
      public String getOperatorName() {
        return OPERATOR_NAME;
      }

      @Override
      public boolean open() {
        return true;
      }

      @Override
      public boolean close() {
        return true;
      }

      @Override
      public BaseFilterBlock nextFilterBlock(BlockId blockId) {
        return createBaseFilterBlock(new BitmapDocIdIterator(answer.getIntIterator()));
      }

      @Override
      public boolean isResultEmpty() {
        return false;
      }
    };
  }

  /**
   * Builds a list of filter operators for a given matched leaf node for the given
   * set of predicates.
   * @param matchedEntry
   * @param remainingPredicatesMap
   * @return
   */
  private List<BaseFilterOperator> createFilterOperatorsForRemainingPredicates(SearchEntry matchedEntry,
      Map<String, PredicateEntry> remainingPredicatesMap) {
    int startDocId = matchedEntry.starTreeIndexnode.getStartDocumentId();
    int endDocId = matchedEntry.starTreeIndexnode.getEndDocumentId();

    List<BaseFilterOperator> childOperators = new ArrayList<>();
    for (String column : remainingPredicatesMap.keySet()) {
      PredicateEntry predicateEntry = remainingPredicatesMap.get(column);

      // predicateEntry could be null if column appeared only in groupBy
      if (predicateEntry != null) {
        BaseFilterOperator childOperator =
            createChildOperator(startDocId, endDocId - 1, column, predicateEntry);
        childOperators.add(childOperator);
      }
    }
    return childOperators;
  }

  /**
   * Helper method to compute remaining predicates from remainingPredicateColumns of
   * the given search entry.
   *
   * @param entry Search entry for which to compute the remaining predicates.
   * @return
   */
  private Map<String, PredicateEntry> computeRemainingPredicates(SearchEntry entry) {
    Map<String, PredicateEntry> remainingPredicatesMap = new HashMap<>();
    for (String column : entry.remainingPredicateColumns) {
      PredicateEntry predicateEntry = predicatesMap.get(column);
      remainingPredicatesMap.put(column, predicateEntry);
    }
    return remainingPredicatesMap;
  }

  private BaseFilterOperator createChildOperator(int startDocId, int endDocId, String column,
      PredicateEntry predicateEntry) {
    DataSource dataSource = segment.getDataSource(column);
    DataSourceMetadata dataSourceMetadata = dataSource.getDataSourceMetadata();
    BaseFilterOperator childOperator;
    Predicate predicate = predicateEntry.predicate;
    if (dataSourceMetadata.hasInvertedIndex()) {
      if (dataSourceMetadata.isSorted()) {
        childOperator = new SortedInvertedIndexBasedFilterOperator(predicate, dataSource, startDocId, endDocId);
      } else {
        childOperator = new BitmapBasedFilterOperator(predicate, dataSource, startDocId, endDocId);
      }
    } else {
      childOperator = new ScanBasedFilterOperator(predicate, dataSource, startDocId, endDocId);
    }
    return childOperator;
  }

  private BaseFilterBlock createBaseFilterBlock(final BitmapDocIdIterator bitmapDocIdIterator) {
    return new BaseFilterBlock() {

      @Override
      public FilterBlockDocIdSet getFilteredBlockDocIdSet() {
        return new FilterBlockDocIdSet() {

          @Override
          public BlockDocIdIterator iterator() {
            return bitmapDocIdIterator;
          }

          @Override
          public <T> T getRaw() {
            return null;
          }

          @Override
          public void setStartDocId(int startDocId) {
            // no-op
          }

          @Override
          public void setEndDocId(int endDocId) {
            // no-op
          }

          @Override
          public long getNumEntriesScannedInFilter() {
            return 0L;
          }

          @Override
          public int getMinDocId() {
            return 0;
          }

          @Override
          public int getMaxDocId() {
            return segment.getSegmentMetadata().getTotalDocs() - 1;
          }
        };
      }

      @Override
      public BlockId getId() {
        return new BlockId(0);
      }
    };
  }

  private Queue<SearchEntry> findMatchingLeafNodes() {
    Queue<SearchEntry> matchedEntries = new LinkedList<>();
    Queue<SearchEntry> searchQueue = new LinkedList<>();
    HashBiMap<String, Integer> dimensionIndexToNameMapping =
        segment.getStarTree().getDimensionNameToIndexMap();

    SearchEntry startEntry = new SearchEntry();
    startEntry.starTreeIndexnode = segment.getStarTree().getRoot();
    startEntry.remainingPredicateColumns = new HashSet<>(predicatesMap.keySet());
    startEntry.remainingGroupByColumns = new HashSet<>(groupByColumns);
    searchQueue.add(startEntry);

    while (!searchQueue.isEmpty()) {
      SearchEntry searchEntry = searchQueue.remove();
      StarTreeIndexNodeInterf current = searchEntry.starTreeIndexnode;
      HashSet<String> remainingPredicateColumns = searchEntry.remainingPredicateColumns;
      HashSet<String> remainingGroupByColumns = searchEntry.remainingGroupByColumns;
      // Check if its leaf, or if there are no remaining predicates/groupbycolumns, and node has valid aggregated docId
      if (current.isLeaf() || (remainingPredicateColumns.isEmpty() && remainingGroupByColumns.isEmpty()) &&
          isValidAggregatedDocId(current.getAggregatedDocumentId())) {
        // reached leaf
        matchedEntries.add(searchEntry);
        continue;
      }
      // Find next set of nodes to search
      String nextDimension =
          dimensionIndexToNameMapping.inverse().get(current.getChildDimensionName());

      HashSet<String> newRemainingPredicateColumns = new HashSet<>();
      newRemainingPredicateColumns.addAll(remainingPredicateColumns);
      HashSet<String> newRemainingGroupByColumns = new HashSet<>();
      newRemainingGroupByColumns.addAll(remainingGroupByColumns);

      addMatchingChildrenToQueue(searchQueue, current, nextDimension, newRemainingPredicateColumns,
          newRemainingGroupByColumns);
    }
    return matchedEntries;
  }

  /**
   * Helper method to add matching children into the search queue.
   * - If predicate can be applied (i.e. equality predicate that is eligible), add the child
   * satisfying the predicate into the queue.
   * - If predicate cannot be applied (either inEligible or nonEquality), add all children to the
   * queue.
   * - If no predicate on the column, add the star-child to the queue
   * @param searchQueue
   * @param node
   * @param column
   * @param remainingPredicateColumns
   * @param remainingGroupByColumns
   */
  private void addMatchingChildrenToQueue(Queue<SearchEntry> searchQueue, StarTreeIndexNodeInterf node,
      String column, HashSet<String> remainingPredicateColumns,
      HashSet<String> remainingGroupByColumns) {

    if (predicateColumns.contains(column)) {
      // Check if there is exact match filter on this column
      PredicateEntry predicateEntry = predicatesMap.get(column);

      remainingPredicateColumns.remove(column);
      remainingGroupByColumns.remove(column);

      int[] matchingDictionaryIds = predicateEntry.predicateEvaluator.getMatchingDictionaryIds();
      for (int matchingDictionaryId : matchingDictionaryIds) {
        StarTreeIndexNodeInterf child = node.getChildForDimensionValue(matchingDictionaryId);
        if (child != null) {
          addNodeToSearchQueue(searchQueue, child, remainingPredicateColumns, remainingGroupByColumns);
        }
      }
    } else {
      int nextValueId;
      if (groupByColumns.contains(column) || predicatesMap.containsKey(column)
          || (node.getChildForDimensionValue(StarTreeIndexNodeInterf.ALL) == null)) {
        Iterator<? extends StarTreeIndexNodeInterf> childrenIterator = node.getChildrenIterator();

        while (childrenIterator.hasNext()) {
          StarTreeIndexNodeInterf child = childrenIterator.next();
          if (child.getDimensionValue() != StarTreeIndexNodeInterf.ALL) {
            remainingPredicateColumns.remove(column);
            remainingGroupByColumns.remove(column);
            addNodeToSearchQueue(searchQueue, child, remainingPredicateColumns,
                remainingGroupByColumns);
          }
        }
      } else {
        // Since we have a star node and no group by on this column we can take lose this dimension
        // by taking star node path
        nextValueId = StarTreeIndexNodeInterf.ALL;
        addNodeToSearchQueue(searchQueue, node.getChildForDimensionValue(nextValueId), remainingPredicateColumns,
            remainingGroupByColumns);
      }
    }
  }

  /**
   * Helper method to add the given node the the provided queue.
   * @param searchQueue
   * @param node
   * @param predicateColumns
   * @param groupByColumns
   */
  private void addNodeToSearchQueue(Queue<SearchEntry> searchQueue, StarTreeIndexNodeInterf node,
      HashSet<String> predicateColumns, HashSet<String> groupByColumns) {
    SearchEntry newEntry = new SearchEntry();
    newEntry.starTreeIndexnode = node;
    newEntry.remainingPredicateColumns = predicateColumns;
    newEntry.remainingGroupByColumns = groupByColumns;
    searchQueue.add(newEntry);
  }

  @Override
  public String getOperatorName() {
    return OPERATOR_NAME;
  }

  class SearchEntry {
    StarTreeIndexNodeInterf starTreeIndexnode;
    HashSet<String> remainingPredicateColumns;
    HashSet<String> remainingGroupByColumns;

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append(starTreeIndexnode);
      sb.append("\t").append(remainingPredicateColumns);
      return sb.toString();
    }
  }

  class PredicateEntry {
    Predicate predicate;
    private PredicateEvaluator predicateEvaluator;

    public PredicateEntry(Predicate predicate, PredicateEvaluator predicateEvaluator) {
      this.predicate = predicate;
      this.predicateEvaluator = predicateEvaluator;
    }
  }
}