/** * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.linkedin.pinot.core.operator.filter; import com.linkedin.pinot.core.operator.filter.predicate.PredicateEvaluatorProvider; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.linkedin.pinot.common.utils.Pairs.IntPair; import com.linkedin.pinot.core.common.BlockDocIdValueSet; import com.linkedin.pinot.core.common.BlockId; import com.linkedin.pinot.core.common.BlockMetadata; import com.linkedin.pinot.core.common.BlockValSet; import com.linkedin.pinot.core.common.DataSource; import com.linkedin.pinot.core.common.Predicate; import com.linkedin.pinot.core.operator.blocks.BaseFilterBlock; import com.linkedin.pinot.core.operator.docidsets.FilterBlockDocIdSet; import com.linkedin.pinot.core.operator.docidsets.SortedDocIdSet; import com.linkedin.pinot.core.operator.filter.predicate.PredicateEvaluator; import com.linkedin.pinot.core.segment.index.readers.SortedInvertedIndexReader; public class SortedInvertedIndexBasedFilterOperator extends BaseFilterOperator { private static final Logger LOGGER = LoggerFactory.getLogger(SortedInvertedIndexBasedFilterOperator.class); private static final String OPERATOR_NAME = "SortedInvertedIndexBasedFilterOperator"; private final Predicate predicate; private PredicateEvaluator predicateEvaluator; private DataSource dataSource; private SortedBlock sortedBlock; private int startDocId; private int endDocId; /** * * @param dataSource * @param startDocId inclusive * @param endDocId inclusive */ public SortedInvertedIndexBasedFilterOperator(Predicate predicate, DataSource dataSource, int startDocId, int endDocId) { this.predicate = predicate; this.predicateEvaluator = PredicateEvaluatorProvider.getPredicateFunctionFor(predicate, dataSource); this.dataSource = dataSource; this.startDocId = startDocId; this.endDocId = endDocId; } @Override public boolean open() { return true; } @Override public BaseFilterBlock nextFilterBlock(BlockId BlockId) { final SortedInvertedIndexReader invertedIndex = (SortedInvertedIndexReader) dataSource.getInvertedIndex(); List<IntPair> pairs = new ArrayList<IntPair>(); // At this point, we need to create a list of matching docId ranges. There are two kinds of operators: // // - "Additive" operators, such as EQ, IN and RANGE build up a list of ranges and merge overlapping/adjacent ones, // clipping the ranges to [startDocId; endDocId] // // - "Subtractive" operators, such as NEQ and NOT IN build up a list of ranges that do not match and build a list of // matching intervals by subtracting a list of non-matching intervals from the given range of // [startDocId; endDocId] // // For now, we don't look at the cardinality of the column's dictionary, although we should do that if someone // specifies a very large list of IN/NOT IN predicates relative to the column's cardinality or a very large/small // range predicate relative to the cardinality. However, as adjacent ranges get merged before returning the final // list of ranges, the only drawback is that we use a lot of memory during the filter block evaluation. final int[] dictionaryIds; boolean additiveRanges = true; switch (predicate.getType()) { case EQ: case IN: case RANGE: dictionaryIds = predicateEvaluator.getMatchingDictionaryIds(); break; case NEQ: case NOT_IN: additiveRanges = false; dictionaryIds = predicateEvaluator.getNonMatchingDictionaryIds(); break; case REGEXP_LIKE: throw new RuntimeException("Regex is not supported"); default: throw new RuntimeException("Unimplemented!"); } if (0 < dictionaryIds.length) { // Sort the dictionaryIds in ascending order, so that their respective ranges are adjacent if their // dictionaryIds are adjacent Arrays.sort(dictionaryIds); IntPair lastPair = invertedIndex.getMinMaxRangeFor(dictionaryIds[0]); IntRanges.clip(lastPair, startDocId, endDocId); for (int i = 1; i < dictionaryIds.length; i++) { IntPair currentPair = invertedIndex.getMinMaxRangeFor(dictionaryIds[i]); IntRanges.clip(currentPair, startDocId, endDocId); // If the previous range is degenerate, just keep the current one if (IntRanges.isInvalid(lastPair)) { lastPair = currentPair; continue; } // If the current range is adjacent or overlaps with the previous range, merge it into the previous range, // otherwise add the previous range and keep the current one to be added if (IntRanges.rangesAreMergeable(lastPair, currentPair)) { IntRanges.mergeIntoFirst(lastPair, currentPair); } else { if (!IntRanges.isInvalid(lastPair)) { pairs.add(lastPair); } lastPair = currentPair; } } // Add the last range if it's valid if (!IntRanges.isInvalid(lastPair)) { pairs.add(lastPair); } } if (!additiveRanges) { // If the ranges are not additive ranges, our list of pairs is a list of "holes" in the [startDocId; endDocId] // range. We need to take this list of pairs and invert it. To do so, there are three cases: // // - No holes, in which case the final range is [startDocId; endDocId] // - One or more hole, in which case the final ranges are [startDocId; firstHoleStartDocId - 1] and // [lastHoleEndDocId + 1; endDocId] and ranges in between other holes List<IntPair> newPairs = new ArrayList<>(); if (pairs.isEmpty()) { newPairs.add(new IntPair(startDocId, endDocId)); } else { // Add the first filled area (between startDocId and the first hole) IntPair firstHole = pairs.get(0); IntPair firstRange = new IntPair(startDocId, firstHole.getLeft() - 1); if (!IntRanges.isInvalid(firstRange)) { newPairs.add(firstRange); } // Add the filled areas between contiguous holes int pairCount = pairs.size(); for (int i = 1; i < pairCount; i++) { IntPair previousHole = pairs.get(i - 1); IntPair currentHole = pairs.get(i); IntPair range = new IntPair(previousHole.getRight() + 1, currentHole.getLeft() - 1); if (!IntRanges.isInvalid(range)) { newPairs.add(range); } } // Add the last filled area (between the last hole and endDocId) IntPair lastHole = pairs.get(pairs.size() - 1); IntPair lastRange = new IntPair(lastHole.getRight() + 1, endDocId); if (!IntRanges.isInvalid(lastRange)) { newPairs.add(lastRange); } } pairs = newPairs; } LOGGER.debug("Creating a Sorted Block with pairs: {}", pairs); sortedBlock = new SortedBlock(dataSource.getOperatorName(), pairs); return sortedBlock; } @Override public boolean isResultEmpty() { return predicateEvaluator.alwaysFalse(); } @Override public boolean close() { return true; } @Override public String getOperatorName() { return OPERATOR_NAME; } public static class SortedBlock extends BaseFilterBlock { private List<IntPair> pairs; private SortedDocIdSet sortedDocIdSet; private String datasourceName; public SortedBlock(String datasourceName, List<IntPair> pairs) { this.datasourceName = datasourceName; this.pairs = pairs; } @Override public BlockId getId() { return new BlockId(0); } @Override public boolean applyPredicate(Predicate predicate) { throw new UnsupportedOperationException("applypredicate not supported in " + this.getClass()); } @Override public FilterBlockDocIdSet getFilteredBlockDocIdSet() { sortedDocIdSet = new SortedDocIdSet(datasourceName, pairs); return sortedDocIdSet; } @Override public BlockValSet getBlockValueSet() { throw new UnsupportedOperationException("getBlockValueSet not supported in " + this.getClass()); } @Override public BlockDocIdValueSet getBlockDocIdValueSet() { throw new UnsupportedOperationException("getBlockDocIdValueSet not supported in " + this.getClass()); } @Override public BlockMetadata getMetadata() { throw new UnsupportedOperationException("getMetadata not supported in " + this.getClass()); } } }