/**
* Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.linkedin.pinot.core.operator.filter;
import com.linkedin.pinot.core.operator.filter.predicate.PredicateEvaluatorProvider;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.linkedin.pinot.common.utils.Pairs.IntPair;
import com.linkedin.pinot.core.common.BlockDocIdValueSet;
import com.linkedin.pinot.core.common.BlockId;
import com.linkedin.pinot.core.common.BlockMetadata;
import com.linkedin.pinot.core.common.BlockValSet;
import com.linkedin.pinot.core.common.DataSource;
import com.linkedin.pinot.core.common.Predicate;
import com.linkedin.pinot.core.operator.blocks.BaseFilterBlock;
import com.linkedin.pinot.core.operator.docidsets.FilterBlockDocIdSet;
import com.linkedin.pinot.core.operator.docidsets.SortedDocIdSet;
import com.linkedin.pinot.core.operator.filter.predicate.PredicateEvaluator;
import com.linkedin.pinot.core.segment.index.readers.SortedInvertedIndexReader;
/**
 * Filter operator that evaluates a predicate against a data source whose inverted index is a
 * {@link SortedInvertedIndexReader}. The predicate is turned into a list of docId ranges, restricted to
 * [startDocId; endDocId], which is wrapped in a {@link SortedBlock}.
 */
public class SortedInvertedIndexBasedFilterOperator extends BaseFilterOperator {
  private static final Logger LOGGER = LoggerFactory.getLogger(SortedInvertedIndexBasedFilterOperator.class);
  private static final String OPERATOR_NAME = "SortedInvertedIndexBasedFilterOperator";

  private final Predicate predicate;
  private final PredicateEvaluator predicateEvaluator;
  private final DataSource dataSource;
  private final int startDocId;
  private final int endDocId;
  // Most recent block produced by nextFilterBlock
  private SortedBlock sortedBlock;

  /**
   * Creates the operator and obtains the predicate evaluator for the given predicate and data source.
   *
   * @param predicate predicate to evaluate
   * @param dataSource data source whose inverted index is read when the filter block is built
   * @param startDocId inclusive
   * @param endDocId inclusive
   */
  public SortedInvertedIndexBasedFilterOperator(Predicate predicate, DataSource dataSource, int startDocId, int endDocId) {
    this.predicate = predicate;
    this.predicateEvaluator = PredicateEvaluatorProvider.getPredicateFunctionFor(predicate, dataSource);
    this.dataSource = dataSource;
    this.startDocId = startDocId;
    this.endDocId = endDocId;
  }

  @Override
  public boolean open() {
    return true;
  }

  /**
   * Builds the block holding the docId ranges that match the predicate.
   *
   * <p>Note that the dictionary id array returned by the predicate evaluator is sorted in place.
   *
   * @param blockId not used by this operator
   * @return a {@link SortedBlock} wrapping the matching docId ranges
   * @throws UnsupportedOperationException if the predicate type is not one of EQ, IN, RANGE, NEQ or NOT_IN
   */
  @Override
  public BaseFilterBlock nextFilterBlock(BlockId blockId) {
    final SortedInvertedIndexReader invertedIndex = (SortedInvertedIndexReader) dataSource.getInvertedIndex();

    // At this point, we need to create a list of matching docId ranges. There are two kinds of operators:
    //
    // - "Additive" operators, such as EQ, IN and RANGE build up a list of ranges and merge overlapping/adjacent
    //   ones, clipping the ranges to [startDocId; endDocId]
    //
    // - "Subtractive" operators, such as NEQ and NOT IN build up a list of ranges that do not match and build a
    //   list of matching intervals by subtracting a list of non-matching intervals from the given range of
    //   [startDocId; endDocId]
    //
    // For now, we don't look at the cardinality of the column's dictionary, although we should do that if someone
    // specifies a very large list of IN/NOT IN predicates relative to the column's cardinality or a very
    // large/small range predicate relative to the cardinality. However, as adjacent ranges get merged before
    // returning the final list of ranges, the only drawback is that we use a lot of memory during the filter block
    // evaluation.
    final int[] dictionaryIds;
    final boolean additiveRanges;
    switch (predicate.getType()) {
      case EQ:
      case IN:
      case RANGE:
        additiveRanges = true;
        dictionaryIds = predicateEvaluator.getMatchingDictionaryIds();
        break;
      case NEQ:
      case NOT_IN:
        additiveRanges = false;
        dictionaryIds = predicateEvaluator.getNonMatchingDictionaryIds();
        break;
      case REGEXP_LIKE:
        throw new UnsupportedOperationException("Regex is not supported");
      default:
        throw new UnsupportedOperationException("Unimplemented!");
    }

    List<IntPair> pairs = mergeClippedRanges(invertedIndex, dictionaryIds);
    if (!additiveRanges) {
      // For subtractive operators the merged ranges are "holes" in [startDocId; endDocId]
      pairs = invertRanges(pairs);
    }

    LOGGER.debug("Creating a Sorted Block with pairs: {}", pairs);
    sortedBlock = new SortedBlock(dataSource.getOperatorName(), pairs);
    return sortedBlock;
  }

  /**
   * Looks up the docId range of every dictionary id, clips it to [startDocId; endDocId] and merges ranges that
   * {@code IntRanges.rangesAreMergeable} reports as mergeable. Ranges that are invalid after clipping are dropped.
   *
   * @param invertedIndex index used to look up the docId range of a dictionary id
   * @param dictionaryIds dictionary ids to look up; sorted in place in ascending order
   * @return the list of valid, merged ranges, in the order they were produced
   */
  private List<IntPair> mergeClippedRanges(SortedInvertedIndexReader invertedIndex, int[] dictionaryIds) {
    List<IntPair> merged = new ArrayList<>();
    if (dictionaryIds.length == 0) {
      return merged;
    }

    // Sort the dictionaryIds in ascending order, so that their respective ranges are adjacent if their
    // dictionaryIds are adjacent
    Arrays.sort(dictionaryIds);

    IntPair lastPair = invertedIndex.getMinMaxRangeFor(dictionaryIds[0]);
    IntRanges.clip(lastPair, startDocId, endDocId);

    for (int i = 1; i < dictionaryIds.length; i++) {
      IntPair currentPair = invertedIndex.getMinMaxRangeFor(dictionaryIds[i]);
      IntRanges.clip(currentPair, startDocId, endDocId);

      if (IntRanges.isInvalid(lastPair)) {
        // The previous range is degenerate, just keep the current one
        lastPair = currentPair;
      } else if (IntRanges.rangesAreMergeable(lastPair, currentPair)) {
        // The current range is adjacent to or overlaps with the previous range: merge it into the previous range
        IntRanges.mergeIntoFirst(lastPair, currentPair);
      } else {
        // The previous range was checked to be valid above, so emit it and carry the current one forward
        merged.add(lastPair);
        lastPair = currentPair;
      }
    }

    // Add the last range if it's valid
    addIfValid(merged, lastPair);
    return merged;
  }

  /**
   * Inverts a list of "holes" within [startDocId; endDocId] into the list of filled ranges around them: the area
   * before the first hole, the areas between consecutive holes and the area after the last hole. With no holes
   * the result is [startDocId; endDocId]. Every candidate range, including the no-hole one, is only added when it
   * is valid, so an empty [startDocId; endDocId] produces an empty list.
   *
   * @param holes non-matching ranges, in ascending order
   * @return the valid ranges of [startDocId; endDocId] not covered by any hole
   */
  private List<IntPair> invertRanges(List<IntPair> holes) {
    List<IntPair> filled = new ArrayList<>();
    int nextStart = startDocId;
    for (IntPair hole : holes) {
      addIfValid(filled, new IntPair(nextStart, hole.getLeft() - 1));
      nextStart = hole.getRight() + 1;
    }
    addIfValid(filled, new IntPair(nextStart, endDocId));
    return filled;
  }

  /** Appends {@code range} to {@code ranges} unless {@code IntRanges.isInvalid} rejects it. */
  private static void addIfValid(List<IntPair> ranges, IntPair range) {
    if (!IntRanges.isInvalid(range)) {
      ranges.add(range);
    }
  }

  /** Returns whatever the predicate evaluator reports from {@code alwaysFalse()}. */
  @Override
  public boolean isResultEmpty() {
    return predicateEvaluator.alwaysFalse();
  }

  @Override
  public boolean close() {
    return true;
  }

  @Override
  public String getOperatorName() {
    return OPERATOR_NAME;
  }

  /**
   * Filter block backed by a list of docId ranges. Only {@link #getId()} and {@link #getFilteredBlockDocIdSet()}
   * are supported; the remaining block accessors throw {@link UnsupportedOperationException}.
   */
  public static class SortedBlock extends BaseFilterBlock {
    private final List<IntPair> pairs;
    private final String datasourceName;
    // Most recent doc id set handed out by getFilteredBlockDocIdSet
    private SortedDocIdSet sortedDocIdSet;

    /**
     * @param datasourceName name passed through to the {@link SortedDocIdSet}
     * @param pairs docId ranges held by this block
     */
    public SortedBlock(String datasourceName, List<IntPair> pairs) {
      this.datasourceName = datasourceName;
      this.pairs = pairs;
    }

    @Override
    public BlockId getId() {
      return new BlockId(0);
    }

    @Override
    public boolean applyPredicate(Predicate predicate) {
      throw new UnsupportedOperationException("applypredicate not supported in " + this.getClass());
    }

    /** Creates a new {@link SortedDocIdSet} over this block's ranges on every call. */
    @Override
    public FilterBlockDocIdSet getFilteredBlockDocIdSet() {
      sortedDocIdSet = new SortedDocIdSet(datasourceName, pairs);
      return sortedDocIdSet;
    }

    @Override
    public BlockValSet getBlockValueSet() {
      throw new UnsupportedOperationException("getBlockValueSet not supported in " + this.getClass());
    }

    @Override
    public BlockDocIdValueSet getBlockDocIdValueSet() {
      throw new UnsupportedOperationException("getBlockDocIdValueSet not supported in " + this.getClass());
    }

    @Override
    public BlockMetadata getMetadata() {
      throw new UnsupportedOperationException("getMetadata not supported in " + this.getClass());
    }
  }
}