/** * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.linkedin.pinot.core.segment.creator.impl.stats; import com.linkedin.pinot.core.segment.creator.StatsCollectorConfig; import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; import it.unimi.dsi.fastutil.objects.ObjectSet; import java.nio.charset.Charset; import java.util.Arrays; import com.linkedin.pinot.core.segment.creator.impl.V1Constants; public class StringColumnPreIndexStatsCollector extends AbstractColumnStatisticsCollector { private static final Charset UTF_8 = Charset.forName("UTF-8"); private String min = V1Constants.Str.NULL_STRING; private String max = V1Constants.Str.NULL_STRING; private int longestStringLength = 0; private final ObjectSet<String> rawStringSet; private final ObjectSet<String> aggregatedStringSet; private String[] sortedStringList; private boolean sealed = false; public StringColumnPreIndexStatsCollector(String column, StatsCollectorConfig statsCollectorConfig) { super(column, statsCollectorConfig); rawStringSet = new ObjectOpenHashSet<>(INITIAL_HASH_SET_SIZE); aggregatedStringSet = new ObjectOpenHashSet<>(INITIAL_HASH_SET_SIZE); } /** * Collect statistics for the given entry. * - Add it to the passed in set (which could be raw or aggregated) * - Update maximum number of values for Multi-valued entries * - Update Total number of entries * - Check if entry is sorted. * @param entry * @param set */ private void collectEntry(Object entry, ObjectSet<String> set) { if (entry instanceof Object[]) { for (final Object e : (Object[]) entry) { String value = e.toString(); set.add(value); longestStringLength = Math.max(longestStringLength, value.getBytes(UTF_8).length); } if (maxNumberOfMultiValues < ((Object[]) entry).length) { maxNumberOfMultiValues = ((Object[]) entry).length; } updateTotalNumberOfEntries((Object[]) entry); } else { String value; if (entry != null) { value = entry.toString(); } else { value = fieldSpec.getDefaultNullValue().toString(); } addressSorted(value); updatePartition(value); set.add(value); longestStringLength = Math.max(longestStringLength, value.getBytes(UTF_8).length); totalNumberOfEntries++; } } /** * {@inheritDoc} * @param entry Entry to be collected * @param isAggregated True for aggregated, False for raw. */ @Override public void collect(Object entry, boolean isAggregated) { if (isAggregated) { collectEntry(entry, aggregatedStringSet); } else { collectEntry(entry, rawStringSet); } } /** * {@inheritDoc} * @param entry Entry to be collected */ @Override public void collect(Object entry) { collect(entry, false /* isAggregated */); } @Override public String getMinValue() { if (sealed) { return min; } throw new IllegalStateException("you must seal the collector first before asking for min value"); } @Override public String getMaxValue() { if (sealed) { return max; } throw new IllegalStateException("you must seal the collector first before asking for max value"); } @Override public Object[] getUniqueValuesSet() { if (sealed) { return sortedStringList; } throw new IllegalStateException("you must seal the collector first before asking for unique values set"); } @Override public int getLengthOfLargestElement() { if (sealed) { return longestStringLength; } throw new IllegalStateException("you must seal the collector first before asking for longest value"); } @Override public int getCardinality() { if (sealed) { return sortedStringList.length; } throw new IllegalStateException("you must seal the collector first before asking for cardinality"); } @Override public boolean hasNull() { return false; } @Override public void seal() { sealed = true; sortedStringList = new String[rawStringSet.size()]; rawStringSet.toArray(sortedStringList); Arrays.sort(sortedStringList); if (sortedStringList.length == 0) { min = null; max = null; return; } // Update min/max based on raw docs. min = sortedStringList[0]; max = sortedStringList[sortedStringList.length - 1]; // Merge the raw and aggregated docs, so stats for dictionary creation are collected correctly. int numAggregated = aggregatedStringSet.size(); if (numAggregated > 0) { rawStringSet.addAll(aggregatedStringSet); sortedStringList = new String[rawStringSet.size()]; rawStringSet.toArray(sortedStringList); Arrays.sort(sortedStringList); } } }