/** * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.linkedin.pinot.core.common; import com.google.common.base.Preconditions; import com.linkedin.pinot.common.data.FieldSpec; import com.linkedin.pinot.core.plan.DocIdSetPlanNode; import java.util.Arrays; import java.util.HashMap; import java.util.Map; import com.linkedin.pinot.core.operator.BaseOperator; import com.linkedin.pinot.core.segment.index.readers.Dictionary; /** * DataFetcher is a higher level abstraction for data fetching. Given an index segment, DataFetcher can manage the * DataSource, Dictionary, BlockValSet and BlockValIterator for this segment, preventing redundant construction for * these instances. DataFetcher can be used by both selection, aggregation and group-by data fetching process, reducing * duplicate codes and garbage collection. */ public class DataFetcher { private static final BlockId BLOCK_ZERO = new BlockId(0); private final Map<String, Dictionary> _columnToDictionaryMap; private final Map<String, BlockValSet> _columnToBlockValSetMap; private final Map<String, BlockValIterator> _columnToBlockValIteratorMap; private final Map<String, BlockMetadata> _columnToBlockMetadataMap; // Map from MV column name to max number of entries for the column. private final Map<String, Integer> _columnToMaxNumMultiValuesMap; // Thread local (reusable) array for all dictionary ids in the block, of a single valued column private static final ThreadLocal<int[]> THREAD_LOCAL_DICT_IDS = new ThreadLocal<int[]>() { @Override protected int[] initialValue() { return new int[DocIdSetPlanNode.MAX_DOC_PER_CALL]; } }; // Re-usable array to store MV dictionary id's for a given docId private static final ThreadLocal<int[]> THREAD_LOCAL_MV_DICT_IDS = new ThreadLocal<int[]>() { @Override protected int[] initialValue() { // Size is known only at runtime, which is when the array is expanded. return new int[0]; } }; private int _reusableMVDictIdSize; /** * Constructor for DataFetcher. * * @param columnToDataSourceMap Map from column name to data source */ public DataFetcher(Map<String, BaseOperator> columnToDataSourceMap) { _columnToDictionaryMap = new HashMap<>(); _columnToBlockValSetMap = new HashMap<>(); _columnToBlockValIteratorMap = new HashMap<>(); _columnToBlockMetadataMap = new HashMap<>(); _columnToMaxNumMultiValuesMap = new HashMap<>(); _reusableMVDictIdSize = 0; for (String column : columnToDataSourceMap.keySet()) { BaseOperator dataSource = columnToDataSourceMap.get(column); Block dataSourceBlock = dataSource.nextBlock(BLOCK_ZERO); BlockMetadata metadata = dataSourceBlock.getMetadata(); _columnToDictionaryMap.put(column, metadata.getDictionary()); BlockValSet blockValSet = dataSourceBlock.getBlockValueSet(); _columnToBlockValSetMap.put(column, blockValSet); _columnToBlockValIteratorMap.put(column, blockValSet.iterator()); _columnToBlockMetadataMap.put(column, metadata); int maxNumberOfMultiValues = metadata.getMaxNumberOfMultiValues(); _columnToMaxNumMultiValuesMap.put(column, maxNumberOfMultiValues); _reusableMVDictIdSize = Math.max(_reusableMVDictIdSize, maxNumberOfMultiValues); } } /** * Given a column, fetch its dictionary. * * @param column column name. * @return dictionary associated with this column. */ public Dictionary getDictionaryForColumn(String column) { return _columnToDictionaryMap.get(column); } /** * Given a column, fetch its block value set. * * @param column column name. * @return block value set associated with this column. */ public BlockValSet getBlockValSetForColumn(String column) { return _columnToBlockValSetMap.get(column); } /** * Returns the BlockValIterator for the specified column. * * @param column Column for which to return the blockValIterator. * @return BlockValIterator for the column. */ public BlockValIterator getBlockValIteratorForColumn(String column) { return _columnToBlockValIteratorMap.get(column); } public BlockMetadata getBlockMetadataFor(String column) { return _columnToBlockMetadataMap.get(column); } /** * Fetch the dictionary Ids for a single value column. * * @param column column name. * @param inDocIds document Id array. * @param inStartPos input start position. * @param length input length. * @param outDictIds dictionary Id array buffer. * @param outStartPos output start position. */ public void fetchSingleDictIds(String column, int[] inDocIds, int inStartPos, int length, int[] outDictIds, int outStartPos) { BlockValSet blockValSet = getBlockValSetForColumn(column); blockValSet.getDictionaryIds(inDocIds, inStartPos, length, outDictIds, outStartPos); } /** * Fetch the dictionary Ids for a multi value column. * * @param column column name. * @param inDocIds document Id array. * @param inStartPos input start position. * @param length input length. * @param outDictIdsArray dictionary Id array array buffer. * @param outStartPos output start position. * @param tempDictIdArray temporary holding dictIds read from BlockMultiValIterator. * Array size has to be >= max number of entries for this column. */ public void fetchMultiValueDictIds(String column, int[] inDocIds, int inStartPos, int length, int[][] outDictIdsArray, int outStartPos, int[] tempDictIdArray) { BlockMultiValIterator iterator = (BlockMultiValIterator) getBlockValIteratorForColumn(column); for (int i = inStartPos; i < inStartPos + length; i++, outStartPos++) { iterator.skipTo(inDocIds[i]); int dictIdLength = iterator.nextIntVal(tempDictIdArray); outDictIdsArray[outStartPos] = Arrays.copyOfRange(tempDictIdArray, 0, dictIdLength); } } /** * For a given multi-value column, trying to get the max number of * entries per row. * * @param column Column for which to get the max number of multi-values. * @return max number of entries for a given column. */ public int getMaxNumberOfEntriesForColumn(String column) { return _columnToMaxNumMultiValuesMap.get(column); } /** * Fetch the values for a single int value column. * * @param column column name. * @param inDocIds doc Id array. * @param inStartPos input start position. * @param length input length. * @param outValues value array buffer. * @param outStartPos output start position. */ public void fetchIntValues(String column, int[] inDocIds, int inStartPos, int length, int[] outValues, int outStartPos) { Dictionary dictionary = getDictionaryForColumn(column); if (dictionary != null) { int[] dictIds = THREAD_LOCAL_DICT_IDS.get(); fetchSingleDictIds(column, inDocIds, inStartPos, length, dictIds, 0); dictionary.readIntValues(dictIds, 0, length, outValues, outStartPos); } else { BlockValSet blockValSet = _columnToBlockValSetMap.get(column); blockValSet.getIntValues(inDocIds, inStartPos, length, outValues, outStartPos); } } /** * Fetch the int values for a multi-valued column. * * @param column column name. * @param inDocIds dictionary Id array. * @param inStartPos input start position. * @param length input length. * @param outValues value array buffer. * @param outStartPos output start position. */ public void fetchIntValues(String column, int[] inDocIds, int inStartPos, int length, int[][] outValues, int outStartPos) { Dictionary dictionary = getDictionaryForColumn(column); BlockMultiValIterator iterator = (BlockMultiValIterator) getBlockValIteratorForColumn(column); int inEndPos = inStartPos + length; int[] reusableMVDictIds = getReusableMVDictIds(_reusableMVDictIdSize); for (int i = inStartPos; i < inEndPos; i++, outStartPos++) { iterator.skipTo(inDocIds[i]); int numValues = iterator.nextIntVal(reusableMVDictIds); outValues[outStartPos] = new int[numValues]; dictionary.readIntValues(reusableMVDictIds, 0, numValues, outValues[outStartPos], 0); } } /** * Fetch the values for a single long value column. * * @param column column name. * @param inDocIds doc Id array. * @param inStartPos input start position. * @param length input length. * @param outValues value array buffer. * @param outStartPos output start position. */ public void fetchLongValues(String column, int[] inDocIds, int inStartPos, int length, long[] outValues, int outStartPos) { Dictionary dictionary = getDictionaryForColumn(column); if (dictionary != null) { int[] dictIds = THREAD_LOCAL_DICT_IDS.get(); fetchSingleDictIds(column, inDocIds, inStartPos, length, dictIds, 0); dictionary.readLongValues(dictIds, 0, length, outValues, outStartPos); } else { BlockValSet blockValSet = _columnToBlockValSetMap.get(column); blockValSet.getLongValues(inDocIds, inStartPos, length, outValues, outStartPos); } } /** * Fetch the long values for a multi-valued column. * * @param column column name. * @param inDocIds dictionary Id array. * @param inStartPos input start position. * @param length input length. * @param outValues value array buffer. * @param outStartPos output start position. */ public void fetchLongValues(String column, int[] inDocIds, int inStartPos, int length, long[][] outValues, int outStartPos) { Dictionary dictionary = getDictionaryForColumn(column); BlockMultiValIterator iterator = (BlockMultiValIterator) getBlockValIteratorForColumn(column); int inEndPos = inStartPos + length; int[] reusableMVDictIds = getReusableMVDictIds(_reusableMVDictIdSize); for (int i = inStartPos; i < inEndPos; i++, outStartPos++) { iterator.skipTo(inDocIds[i]); int numValues = iterator.nextIntVal(reusableMVDictIds); outValues[outStartPos] = new long[numValues]; dictionary.readLongValues(reusableMVDictIds, 0, numValues, outValues[outStartPos], 0); } } /** * Fetch the values for a single float value column. * * @param column column name. * @param inDocIds doc Id array. * @param inStartPos input start position. * @param length input length. * @param outValues value array buffer. * @param outStartPos output start position. */ public void fetchFloatValues(String column, int[] inDocIds, int inStartPos, int length, float[] outValues, int outStartPos) { Dictionary dictionary = getDictionaryForColumn(column); if (dictionary != null) { int[] dictIds = THREAD_LOCAL_DICT_IDS.get(); fetchSingleDictIds(column, inDocIds, inStartPos, length, dictIds, 0); dictionary.readFloatValues(dictIds, 0, length, outValues, outStartPos); } else { BlockValSet blockValSet = _columnToBlockValSetMap.get(column); blockValSet.getFloatValues(inDocIds, inStartPos, length, outValues, outStartPos); } } /** * Fetch the float values for a multi-valued column. * * @param column column name. * @param inDocIds dictionary Id array. * @param inStartPos input start position. * @param length input length. * @param outValues value array buffer. * @param outStartPos output start position. */ public void fetchFloatValues(String column, int[] inDocIds, int inStartPos, int length, float[][] outValues, int outStartPos) { Dictionary dictionary = getDictionaryForColumn(column); BlockMultiValIterator iterator = (BlockMultiValIterator) getBlockValIteratorForColumn(column); int inEndPos = inStartPos + length; int[] reusableMVDictIds = getReusableMVDictIds(_reusableMVDictIdSize); for (int i = inStartPos; i < inEndPos; i++, outStartPos++) { iterator.skipTo(inDocIds[i]); int numValues = iterator.nextIntVal(reusableMVDictIds); outValues[outStartPos] = new float[numValues]; dictionary.readFloatValues(reusableMVDictIds, 0, numValues, outValues[outStartPos], 0); } } /** * Fetch the values for a single double value column. * * @param column column name. * @param inDocIds dictionary Id array. * @param inStartPos input start position. * @param length input length. * @param outValues value array buffer. * @param outStartPos output start position. */ public void fetchDoubleValues(String column, int[] inDocIds, int inStartPos, int length, double[] outValues, int outStartPos) { Dictionary dictionary = getDictionaryForColumn(column); if (dictionary != null) { int[] dictIds = THREAD_LOCAL_DICT_IDS.get(); fetchSingleDictIds(column, inDocIds, inStartPos, length, dictIds, 0); dictionary.readDoubleValues(dictIds, 0, length, outValues, outStartPos); } else { BlockValSet blockValSet = _columnToBlockValSetMap.get(column); blockValSet.getDoubleValues(inDocIds, inStartPos, length, outValues, outStartPos); } } /** * Fetch the double values for a multi-valued column. * * @param column column name. * @param inDocIds dictionary Id array. * @param inStartPos input start position. * @param length input length. * @param outValues value array buffer. * @param outStartPos output start position. */ public void fetchDoubleValues(String column, int[] inDocIds, int inStartPos, int length, double[][] outValues, int outStartPos) { Dictionary dictionary = getDictionaryForColumn(column); BlockMultiValIterator iterator = (BlockMultiValIterator) getBlockValIteratorForColumn(column); int inEndPos = inStartPos + length; int[] reusableMVDictIds = getReusableMVDictIds(_reusableMVDictIdSize); for (int i = inStartPos; i < inEndPos; i++, outStartPos++) { iterator.skipTo(inDocIds[i]); int numValues = iterator.nextIntVal(reusableMVDictIds); outValues[outStartPos] = new double[numValues]; dictionary.readDoubleValues(reusableMVDictIds, 0, numValues, outValues[outStartPos], 0); } } /** * * @param column Column for which to fetch the values * @param inDocIds Array of docIds for which to fetch the values * @param outValues Array of strings where output will be written * @param length Length of input docIds */ public void fetchStringValues(String column, int[] inDocIds, int inStartPos, int length, String[] outValues, int outStartPos) { Dictionary dictionary = getDictionaryForColumn(column); if (dictionary != null) { int[] dictIds = THREAD_LOCAL_DICT_IDS.get(); fetchSingleDictIds(column, inDocIds, inStartPos, length, dictIds, 0); dictionary.readStringValues(dictIds, 0, length, outValues, outStartPos); } else { BlockValSet blockValSet = _columnToBlockValSetMap.get(column); blockValSet.getStringValues(inDocIds, inStartPos, length, outValues, outStartPos); } } /** * * @param column Column for which to fetch the values * @param inDocIds Array of docIds for which to fetch the values * @param outValues Array of strings where output will be written * @param length Length of input docIds */ public void fetchStringValues(String column, int[] inDocIds, int inStartPos, int length, String[][] outValues, int outStartPos) { Dictionary dictionary = getDictionaryForColumn(column); BlockMultiValIterator iterator = (BlockMultiValIterator) getBlockValIteratorForColumn(column); int inEndPos = inStartPos + length; int[] reusableMVDictIds = getReusableMVDictIds(_reusableMVDictIdSize); for (int i = inStartPos; i < inEndPos; i++, outStartPos++) { iterator.skipTo(inDocIds[i]); int numValues = iterator.nextIntVal(reusableMVDictIds); outValues[outStartPos] = new String[numValues]; dictionary.readStringValues(reusableMVDictIds, 0, numValues, outValues[outStartPos], 0); } } /** * Returns the data type for the specified column. * * @param column Name of column for which to return the data type. * @return Data type of the column. */ public FieldSpec.DataType getDataType(String column) { BlockMetadata blockMetadata = _columnToBlockMetadataMap.get(column); Preconditions.checkNotNull(blockMetadata, "Invalid column " + column + " specified in DataFetcher."); return blockMetadata.getDataType(); } /** * Helper method that returns ThreadLocal reusable int array for MV dictionary ids. * If desired size is larger than existing thread local storage, the latter is expanded. * * @param size Desired size. * @return Thread local int array of at least desired size. */ private int[] getReusableMVDictIds(int size) { // If current size is not large enough, expand to new size. int[] reusableMVDictIds = THREAD_LOCAL_MV_DICT_IDS.get(); if (reusableMVDictIds.length < size) { reusableMVDictIds = new int[size]; THREAD_LOCAL_MV_DICT_IDS.set(reusableMVDictIds); } return reusableMVDictIds; } }