/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.core.segment.creator.impl.stats;

import com.linkedin.pinot.core.segment.creator.StatsCollectorConfig;
import java.util.HashMap;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.core.data.GenericRow;
import com.linkedin.pinot.core.segment.creator.ColumnStatistics;
import com.linkedin.pinot.core.segment.creator.SegmentPreIndexStatsCollector;


/**
 * {@link SegmentPreIndexStatsCollector} implementation that keeps one
 * {@link AbstractColumnStatisticsCollector} per schema column and feeds every collected row's values
 * into the matching collector, while counting raw, aggregated and total documents.
 *
 * <p>Usage order implied by the code: {@link #init()} creates the per-column collectors,
 * {@link #collectRow(GenericRow, boolean)} is called once per row, and {@link #build()} seals every
 * collector. The collector map does not exist before {@link #init()} has been called.
 */
public class SegmentPreIndexStatsCollectorImpl implements SegmentPreIndexStatsCollector {
  private static final Logger LOGGER = LoggerFactory.getLogger(SegmentPreIndexStatsCollectorImpl.class);

  private final StatsCollectorConfig _statsCollectorConfig;

  /** Column name to its statistics collector; populated by {@link #init()}. */
  private Map<String, AbstractColumnStatisticsCollector> columnStatsCollectorMap;

  private int rawDocCount;
  private int aggregatedDocCount;
  private int totalDocCount;

  /**
   * @param statsCollectorConfig configuration supplying the schema and passed on to every
   *                             per-column collector
   */
  public SegmentPreIndexStatsCollectorImpl(StatsCollectorConfig statsCollectorConfig) {
    _statsCollectorConfig = statsCollectorConfig;
  }

  /**
   * Creates a fresh collector map with one collector per schema field, chosen by the field's data
   * type. BOOLEAN columns share the STRING collector. Fields of any other data type get no
   * collector and are therefore ignored by {@link #collectRow(GenericRow, boolean)}.
   */
  @Override
  public void init() {
    columnStatsCollectorMap = new HashMap<>();

    Schema dataSchema = _statsCollectorConfig.getSchema();
    for (final FieldSpec spec : dataSchema.getAllFieldSpecs()) {
      final String column = spec.getName();
      switch (spec.getDataType()) {
        case BOOLEAN:
        case STRING:
          columnStatsCollectorMap.put(column, new StringColumnPreIndexStatsCollector(column, _statsCollectorConfig));
          break;
        case INT:
          columnStatsCollectorMap.put(column, new IntColumnPreIndexStatsCollector(column, _statsCollectorConfig));
          break;
        case LONG:
          columnStatsCollectorMap.put(column, new LongColumnPreIndexStatsCollector(column, _statsCollectorConfig));
          break;
        case FLOAT:
          columnStatsCollectorMap.put(column, new FloatColumnPreIndexStatsCollector(column, _statsCollectorConfig));
          break;
        case DOUBLE:
          columnStatsCollectorMap.put(column, new DoubleColumnPreIndexStatsCollector(column, _statsCollectorConfig));
          break;
        default:
          // No collector is registered for other data types.
          break;
      }
    }
  }

  /** Seals every per-column collector. */
  @Override
  public void build() {
    for (final AbstractColumnStatisticsCollector collector : columnStatsCollectorMap.values()) {
      collector.seal();
    }
  }

  /**
   * @param column column name
   * @return the collector registered for {@code column}, or {@code null} when none is registered
   */
  @Override
  public ColumnStatistics getColumnProfileFor(String column) {
    return columnStatsCollectorMap.get(column);
  }

  /**
   * Collects a non-aggregated row; equivalent to {@code collectRow(row, false)}.
   *
   * @param row row whose values are fed to the column collectors
   * @throws Exception if any column collector fails
   */
  @Override
  public void collectRow(GenericRow row) throws Exception {
    collectRow(row, false);
  }

  /**
   * Feeds each value of {@code row} to the collector of its column and then updates the document
   * counters. Row entries whose column has no registered collector are skipped.
   *
   * <p>If a collector throws, the failure is logged and rethrown; the document counters are not
   * incremented for that row.
   *
   * @param row          row whose values are fed to the column collectors
   * @param isAggregated whether the row counts as an aggregated document (otherwise raw)
   * @throws Exception if any column collector fails
   */
  @Override
  public void collectRow(GenericRow row, boolean isAggregated) throws Exception {
    for (Map.Entry<String, Object> columnNameAndValue : row.getEntrySet()) {
      final String columnName = columnNameAndValue.getKey();
      // Collectors are never stored as null, so a null lookup means "no collector for this column".
      final AbstractColumnStatisticsCollector collector = columnStatsCollectorMap.get(columnName);
      if (collector == null) {
        continue;
      }

      try {
        collector.collect(columnNameAndValue.getValue(), isAggregated);
      } catch (Exception e) {
        // Pass the exception as the trailing argument so the stack trace is logged with the context.
        LOGGER.error("Exception while collecting stats for column:{} in row:{}", columnName, row, e);
        throw e;
      }
    }

    ++totalDocCount;
    if (isAggregated) {
      ++aggregatedDocCount;
    } else {
      ++rawDocCount;
    }
  }

  /** @return number of rows collected with {@code isAggregated == false} */
  @Override
  public int getRawDocCount() {
    return rawDocCount;
  }

  /** @return number of rows collected with {@code isAggregated == true} */
  @Override
  public int getAggregatedDocCount() {
    return aggregatedDocCount;
  }

  /** @return number of rows collected in total */
  @Override
  public int getTotalDocCount() {
    return totalDocCount;
  }

  /**
   * Logs the collected statistics of every column at INFO level. Any exception raised while
   * reading the statistics is logged at ERROR level and not propagated; logging of the remaining
   * columns stops at that point.
   */
  @Override
  public void logStats() {
    try {
      for (final Map.Entry<String, AbstractColumnStatisticsCollector> entry : columnStatsCollectorMap.entrySet()) {
        final String column = entry.getKey();
        final AbstractColumnStatisticsCollector statisticsCollector = entry.getValue();

        LOGGER.info("********** logging for column : {} ********************* ", column);
        LOGGER.info("min value : {}", statisticsCollector.getMinValue());
        LOGGER.info("max value : {}", statisticsCollector.getMaxValue());
        LOGGER.info("cardinality : {}", statisticsCollector.getCardinality());
        LOGGER.info("length of largest column : {}", statisticsCollector.getLengthOfLargestElement());
        LOGGER.info("is sorted : {}", statisticsCollector.isSorted());
        LOGGER.info("column type : {}", _statsCollectorConfig.getSchema().getFieldSpecFor(column).getDataType());

        if (statisticsCollector.getPartitionFunction() != null) {
          LOGGER.info("partition ranges: {}", statisticsCollector.getPartitionRanges().toString());
        }
        LOGGER.info("***********************************************");
      }
    } catch (final Exception e) {
      LOGGER.error("Caught exception while logging column stats", e);
    }
  }
}