/** * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.linkedin.pinot.core.realtime.converter.stats; import com.linkedin.pinot.common.config.ColumnPartitionConfig; import com.linkedin.pinot.common.data.FieldSpec; import com.linkedin.pinot.core.common.Block; import com.linkedin.pinot.core.common.BlockMultiValIterator; import com.linkedin.pinot.core.data.partition.PartitionFunction; import com.linkedin.pinot.core.data.partition.PartitionFunctionFactory; import com.linkedin.pinot.core.io.reader.SingleColumnSingleValueReader; import com.linkedin.pinot.core.operator.blocks.RealtimeSingleValueBlock; import com.linkedin.pinot.core.realtime.impl.datasource.RealtimeColumnDataSource; import com.linkedin.pinot.core.realtime.impl.dictionary.BaseOnHeapMutableDictionary; import com.linkedin.pinot.core.segment.creator.ColumnStatistics; import java.util.Arrays; import java.util.List; import org.apache.commons.lang.math.IntRange; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Column statistics for a column coming from an in-memory realtime segment. */ public class RealtimeColumnStatistics implements ColumnStatistics { private static final Logger LOGGER = LoggerFactory.getLogger(RealtimeColumnStatistics.class); private final RealtimeColumnDataSource _dataSource; private final int[] _sortedDocIdIterationOrder; private final BaseOnHeapMutableDictionary _dictionaryReader; private final Block _block; private PartitionFunction partitionFunction; private int numPartitions; private int partitionRangeStart = Integer.MAX_VALUE; private int partitionRangeEnd = Integer.MIN_VALUE; public RealtimeColumnStatistics(RealtimeColumnDataSource dataSource, int[] sortedDocIdIterationOrder, ColumnPartitionConfig columnPartitionConfig) { _dataSource = dataSource; _sortedDocIdIterationOrder = sortedDocIdIterationOrder; _dictionaryReader = dataSource.getDictionary(); _block = dataSource.getNextBlock(); if (columnPartitionConfig != null) { String functionName = columnPartitionConfig.getFunctionName(); numPartitions = columnPartitionConfig.getNumPartitions(); partitionFunction = (functionName != null) ? PartitionFunctionFactory.getPartitionFunction(functionName, numPartitions) : null; if (partitionFunction != null) { updatePartition(); } } } @Override public Object getMinValue() { return _dictionaryReader.getMinVal(); } @Override public Object getMaxValue() { return _dictionaryReader.getMaxVal(); } @Override public Object getUniqueValuesSet() { return _dictionaryReader.getSortedValues(); } @Override public int getCardinality() { return _dictionaryReader.length(); } @Override public int getLengthOfLargestElement() { // Length of longest string int maximumStringLength = 0; // If this column is a string column, iterate over the dictionary to find the maximum length if (_dataSource.getDataSourceMetadata().getDataType() == FieldSpec.DataType.STRING) { final int length = _dictionaryReader.length(); for (int i = 0; i < length; i++) { maximumStringLength = Math.max(_dictionaryReader.getStringValue(i).length(), maximumStringLength); } } return maximumStringLength; } @Override public boolean isSorted() { // Multivalue columns can't be in sorted order if (!_block.getMetadata().isSingleValue()) { return false; } // If this is a single value, then by definition the data is sorted final int blockLength = _block.getMetadata().getLength(); if (blockLength <= 1 || getCardinality() <= 1) { return true; } // Iterate over all data to figure out whether or not it's in sorted order SingleColumnSingleValueReader singleValueReader = ((RealtimeSingleValueBlock) _block).getReader(); int docIdIndex = _sortedDocIdIterationOrder != null ? _sortedDocIdIterationOrder[0] : 0; int dictionaryId = singleValueReader.getInt(docIdIndex); Comparable previousValue = (Comparable) _dictionaryReader.get(dictionaryId); for (int i = 1; i < blockLength; i++) { docIdIndex = _sortedDocIdIterationOrder != null ? _sortedDocIdIterationOrder[i] : i; dictionaryId = singleValueReader.getInt(docIdIndex); Comparable currentValue = (Comparable) _dictionaryReader.get(dictionaryId); // If previousValue is greater than currentValue if (0 < previousValue.compareTo(currentValue)) { return false; } else { previousValue = currentValue; } } return true; } @Override public int getTotalNumberOfEntries() { // Number of multivalue entries int multivalueEntryCount = 0; // If this column is a multivalue column, iterate over all data to find the total number of multivalue entries (this // information doesn't seem to be exposed via an API) if (!_block.getMetadata().isSingleValue()) { int[] dictionaryIds = new int[getMaxNumberOfMultiValues()]; BlockMultiValIterator valIterator = (BlockMultiValIterator) _block.getBlockValueSet().iterator(); while (valIterator.hasNext()) { multivalueEntryCount += valIterator.nextIntVal(dictionaryIds); } } return multivalueEntryCount; } @Override public int getMaxNumberOfMultiValues() { return _block.getMetadata().getMaxNumberOfMultiValues(); } @Override public boolean hasNull() { return false; } @Override public PartitionFunction getPartitionFunction() { return partitionFunction; } @Override public int getNumPartitions() { return numPartitions; } @Override public List<IntRange> getPartitionRanges() { if (partitionRangeStart <= partitionRangeEnd) { return Arrays.asList(new IntRange(partitionRangeStart, partitionRangeEnd)); } else { return null; } } /** * Update partition ranges based on column values. * */ void updatePartition() { // Iterate over the dictionary to check the partitioning final int length = _dictionaryReader.length(); for (int i = 0; i < length; i++) { int partition = partitionFunction.getPartition(_dictionaryReader.get(i)); if (partition < partitionRangeStart) { partitionRangeStart = partition; } if (partition > partitionRangeEnd) { partitionRangeEnd = partition; } } } }