/** * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.linkedin.pinot.core.segment.creator.impl.stats; import com.linkedin.pinot.common.data.FieldSpec; import com.linkedin.pinot.common.data.FieldSpec.DataType; import com.linkedin.pinot.core.data.partition.PartitionFunction; import com.linkedin.pinot.core.segment.creator.ColumnStatistics; import com.linkedin.pinot.core.segment.creator.StatsCollectorConfig; import com.linkedin.pinot.core.segment.creator.impl.V1Constants; import java.util.Arrays; import java.util.List; import org.apache.avro.reflect.Nullable; import org.apache.commons.lang.math.IntRange; /** * This class in initialized per column and all the data is * sent to it before actual indexes are created * the job of this class is to collect * unique elements * record cardinality * compute min * compute max * see if column isSorted */ public abstract class AbstractColumnStatisticsCollector implements ColumnStatistics { protected static final int INITIAL_HASH_SET_SIZE = 1000; private Object previousValue = null; protected final FieldSpec fieldSpec; private boolean isSorted = true; private final String column; protected int totalNumberOfEntries = 0; protected int maxNumberOfMultiValues = 0; private int numInputNullValues = 0; // Number of rows in which this column was null in the input. private PartitionFunction partitionFunction; private final int numPartitions; private int partitionRangeStart = Integer.MAX_VALUE; private int partitionRangeEnd = Integer.MIN_VALUE; void updateTotalNumberOfEntries(Object[] entries) { totalNumberOfEntries += entries.length; } public int getTotalNumberOfEntries() { return totalNumberOfEntries; } public AbstractColumnStatisticsCollector(String column, StatsCollectorConfig statsCollectorConfig) { this.column = column; fieldSpec = statsCollectorConfig.getFieldSpecForColumn(column); partitionFunction = statsCollectorConfig.getPartitionFunction(column); numPartitions = statsCollectorConfig.getNumPartitions(column); addressNull(previousValue, fieldSpec.getDataType()); previousValue = null; } public int getMaxNumberOfMultiValues() { return maxNumberOfMultiValues; } void addressSorted(Object entry) { if (isSorted) { if (previousValue != null) { if (!entry.equals(previousValue) && previousValue != null) { final Comparable prevValue = (Comparable) previousValue; final Comparable origin = (Comparable) entry; if (origin.compareTo(prevValue) < 0) { isSorted = false; } } } previousValue = entry; } } @Override public boolean isSorted() { return fieldSpec.isSingleValueField() && isSorted; } /** * Collect statistics for given the entry. * Entry is expected to be 'raw', and not pre-aggregated (for star-tree). * @param entry Entry to be collected */ public abstract void collect(Object entry); /** * Collected statistics for the given entry. * * @param entry Entry to be collected * @param isAggregated True for aggregated, False for raw. */ public abstract void collect(Object entry, boolean isAggregated); public abstract Object getMinValue(); public abstract Object getMaxValue(); public abstract Object getUniqueValuesSet(); public abstract int getCardinality(); public int getLengthOfLargestElement() { return -1; } public abstract void seal(); Object addressNull(Object entry, DataType e) { if (entry == null) { if (e == DataType.STRING) { entry = V1Constants.Str.NULL_STRING; } else if (e == DataType.BOOLEAN) { entry = V1Constants.Str.NULL_BOOLEAN; } else if (e == DataType.DOUBLE) { entry = V1Constants.Numbers.NULL_DOUBLE; } else if (e == DataType.FLOAT) { entry = V1Constants.Numbers.NULL_FLOAT; } else if (e == DataType.LONG) { entry = V1Constants.Numbers.NULL_LONG; } else if (e == DataType.INT) { entry = V1Constants.Numbers.NULL_INT; } } return entry; } /** * Returns the {@link PartitionFunction} for the column. * @return Partition function for the column. */ public PartitionFunction getPartitionFunction() { return partitionFunction; } /** * Returns the number of partitions for this column. * * @return Number of partitions. */ public int getNumPartitions() { return numPartitions; } /** * Returns the partition range within which the column values exist. * * @return List of ranges for the column values. */ @Nullable public List<IntRange> getPartitionRanges() { if (partitionRangeStart <= partitionRangeEnd) { return Arrays.asList(new IntRange(partitionRangeStart, partitionRangeEnd)); } else { return null; } } /** * Updates the partition range based on the partition of the given value. * * @param value Column value. */ protected void updatePartition(Object value) { if (partitionFunction != null) { int partition = partitionFunction.getPartition(value); if (partition < partitionRangeStart) { partitionRangeStart = partition; } if (partition > partitionRangeEnd) { partitionRangeEnd = partition; } } } }