/**
* Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.linkedin.pinot.core.segment.creator.impl;
import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.data.MetricFieldSpec;
import com.linkedin.pinot.core.io.writer.impl.FixedByteSingleValueMultiColWriter;
import it.unimi.dsi.fastutil.doubles.Double2IntOpenHashMap;
import it.unimi.dsi.fastutil.floats.Float2IntOpenHashMap;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.ArrayUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Writes the dictionary file ({@code <column name>.dict}) for a single column and keeps in-memory
 * value-to-index maps so that values can later be translated to their dictionary index.
 *
 * <p>Typical usage: construct, call {@link #build(boolean[])} once, then use {@link #indexOfSV(Object)} /
 * {@link #indexOfMV(Object)}. The lookup maps are only populated by {@link #build(boolean[])}.
 */
public class SegmentDictionaryCreator implements Closeable {
  private static final Logger LOGGER = LoggerFactory.getLogger(SegmentDictionaryCreator.class);
  private static final Charset utf8CharSet = Charset.forName("UTF-8");

  // Array of the column's values (int[], long[], float[], double[] or Object[]); the element at
  // position i is written at dictionary index i for the numeric types.
  private final Object sortedList;
  private final FieldSpec spec;
  private final File dictionaryFile;
  private final int rowCount;
  private final char paddingChar;

  // Exactly one of these maps is populated by build(), depending on the column data type.
  private Int2IntOpenHashMap intValueToIndexMap;
  private Long2IntOpenHashMap longValueToIndexMap;
  private Float2IntOpenHashMap floatValueToIndexMap;
  private Double2IntOpenHashMap doubleValueToIndexMap;
  private Object2IntOpenHashMap<String> stringValueToIndexMap;

  // Length in UTF-8 bytes of the longest string value; stays 0 until a STRING/BOOLEAN dictionary is built.
  private int stringColumnMaxLength = 0;

  /**
   * Creates the (empty) dictionary file for the column and logs a summary of the values.
   *
   * @param hasNulls only used for logging
   * @param sortedList array holding the column's values; its length is used as the dictionary cardinality
   * @param spec field spec of the column (name and data type are read from it)
   * @param indexDir directory in which the {@code .dict} file is created
   * @param paddingChar character appended to string values to bring them all to the same byte length
   * @throws IOException if the dictionary file cannot be created/touched
   */
  public SegmentDictionaryCreator(boolean hasNulls, Object sortedList, FieldSpec spec, File indexDir, char paddingChar)
      throws IOException {
    rowCount = ArrayUtils.getLength(sortedList);
    this.sortedList = sortedList;
    this.spec = spec;
    this.paddingChar = paddingChar;

    // make hll column log info different than other columns, since range makes no sense for hll column
    if (spec instanceof MetricFieldSpec &&
        ((MetricFieldSpec) spec).getDerivedMetricType() == MetricFieldSpec.DerivedMetricType.HLL) {
      LOGGER.info(
          "Creating segment for column {}, hasNulls = {}, cardinality = {}, dataType = {}, single value field = {}, is HLL derived column",
          spec.getName(), hasNulls, rowCount, spec.getDataType(), spec.isSingleValueField());
    } else {
      Object first = null;
      Object last = null;
      if (rowCount > 0) {
        first = elementAt(sortedList, 0);
        last = elementAt(sortedList, rowCount - 1);
      }
      LOGGER.info(
          "Creating segment for column {}, hasNulls = {}, cardinality = {}, dataType = {}, single value field = {}, range = {} to {}",
          spec.getName(), hasNulls, rowCount, spec.getDataType(), spec.isSingleValueField(), first, last);
    }

    dictionaryFile = new File(indexDir, spec.getName() + ".dict");
    FileUtils.touch(dictionaryFile);
  }

  /**
   * Returns the (boxed) element at {@code index} of a supported array type, or {@code null} when
   * {@code array} is not one of int[], long[], float[], double[] or Object[] (String[] is an Object[]).
   */
  private static Object elementAt(Object array, int index) {
    if (array instanceof int[]) {
      return ((int[]) array)[index];
    }
    if (array instanceof long[]) {
      return ((long[]) array)[index];
    }
    if (array instanceof float[]) {
      return ((float[]) array)[index];
    }
    if (array instanceof double[]) {
      return ((double[]) array)[index];
    }
    if (array instanceof Object[]) {
      return ((Object[]) array)[index];
    }
    return null;
  }

  /**
   * No-op: the writers used by {@link #build(boolean[])} are closed inside that method.
   */
  @Override
  public void close() throws IOException {
  }

  /**
   * Writes all values to the dictionary file and populates the value-to-index map matching the
   * column's data type.
   *
   * @param isSorted one-element in/out flag, only touched for STRING/BOOLEAN columns: element 0 is set to
   *        {@code false} when the padded strings are not in ascending order
   * @throws RuntimeException if the data type is not handled, or if two string values collapse to the
   *         same padded string
   * @throws Exception if the underlying writer fails
   */
  public void build(boolean[] isSorted) throws Exception {
    switch (spec.getDataType()) {
      case INT:
        buildIntDictionary();
        break;
      case FLOAT:
        buildFloatDictionary();
        break;
      case LONG:
        buildLongDictionary();
        break;
      case DOUBLE:
        buildDoubleDictionary();
        break;
      case STRING:
      case BOOLEAN:
        buildStringDictionary(isSorted);
        break;
      default:
        throw new RuntimeException("Unhandled type " + spec.getDataType());
    }
  }

  /** Writes the int values in array order and records value -> index. */
  private void buildIntDictionary() throws Exception {
    final int[] sortedInts = (int[]) sortedList;
    intValueToIndexMap = new Int2IntOpenHashMap(rowCount);
    final FixedByteSingleValueMultiColWriter writer =
        new FixedByteSingleValueMultiColWriter(dictionaryFile, rowCount, 1,
            V1Constants.Dict.INT_DICTIONARY_COL_SIZE);
    try {
      for (int i = 0; i < rowCount; i++) {
        final int entry = sortedInts[i];
        writer.setInt(i, 0, entry);
        intValueToIndexMap.put(entry, i);
      }
    } finally {
      // Close even when writing fails so the writer is not leaked.
      writer.close();
    }
  }

  /** Writes the float values in array order and records value -> index. */
  private void buildFloatDictionary() throws Exception {
    final float[] sortedFloats = (float[]) sortedList;
    floatValueToIndexMap = new Float2IntOpenHashMap(rowCount);
    final FixedByteSingleValueMultiColWriter writer =
        new FixedByteSingleValueMultiColWriter(dictionaryFile, rowCount, 1,
            V1Constants.Dict.FLOAT_DICTIONARY_COL_SIZE);
    try {
      for (int i = 0; i < rowCount; i++) {
        final float entry = sortedFloats[i];
        writer.setFloat(i, 0, entry);
        floatValueToIndexMap.put(entry, i);
      }
    } finally {
      writer.close();
    }
  }

  /** Writes the long values in array order and records value -> index. */
  private void buildLongDictionary() throws Exception {
    final long[] sortedLongs = (long[]) sortedList;
    longValueToIndexMap = new Long2IntOpenHashMap(rowCount);
    final FixedByteSingleValueMultiColWriter writer =
        new FixedByteSingleValueMultiColWriter(dictionaryFile, rowCount, 1,
            V1Constants.Dict.LONG_DICTIONARY_COL_SIZE);
    try {
      for (int i = 0; i < rowCount; i++) {
        final long entry = sortedLongs[i];
        writer.setLong(i, 0, entry);
        longValueToIndexMap.put(entry, i);
      }
    } finally {
      writer.close();
    }
  }

  /** Writes the double values in array order and records value -> index. */
  private void buildDoubleDictionary() throws Exception {
    final double[] sortedDoubles = (double[]) sortedList;
    doubleValueToIndexMap = new Double2IntOpenHashMap(rowCount);
    final FixedByteSingleValueMultiColWriter writer =
        new FixedByteSingleValueMultiColWriter(dictionaryFile, rowCount, 1,
            V1Constants.Dict.DOUBLE_DICTIONARY_COL_SIZE);
    try {
      for (int i = 0; i < rowCount; i++) {
        final double entry = sortedDoubles[i];
        writer.setDouble(i, 0, entry);
        doubleValueToIndexMap.put(entry, i);
      }
    } finally {
      writer.close();
    }
  }

  /**
   * Pads every string value to the byte length of the longest one, writes the padded values in their
   * own sorted order, and records raw (unpadded) value -> index.
   */
  private void buildStringDictionary(boolean[] isSorted) throws Exception {
    final Object[] sortedObjects = (Object[]) sortedList;

    // Pass 1: string form of every value and the maximum UTF-8 byte length.
    final String[] rawValues = new String[rowCount];
    int maxLength = 1; // make sure that there is non-zero sized dictionary JIRA:PINOT-2947
    for (int i = 0; i < rowCount; i++) {
      final String val = sortedObjects[i].toString();
      rawValues[i] = val;
      final int length = val.getBytes(utf8CharSet).length;
      if (maxLength < length) {
        maxLength = length;
      }
    }
    stringColumnMaxLength = maxLength;

    // Pass 2: pad, detect whether padding broke the ordering, and remember padded -> raw.
    final String[] revised = new String[rowCount];
    final Map<String, String> revisedMap = new HashMap<>();
    for (int i = 0; i < rowCount; i++) {
      final String toWrite = rawValues[i];
      final String entry = getPaddedString(toWrite, stringColumnMaxLength, paddingChar);
      revised[i] = entry;
      if (isSorted[0] && i > 0 && (revised[i - 1].compareTo(entry) > 0)) {
        isSorted[0] = false;
      }
      assert (revised[i].getBytes(utf8CharSet).length == stringColumnMaxLength);
      revisedMap.put(revised[i], toWrite);
    }

    // Validate before opening the writer so that a failure here cannot leak it.
    if (revisedMap.size() != sortedObjects.length) {
      // Two strings map to the same padded string in the current column
      throw new RuntimeException("Number of entries in dictionary != number of unique values in the data in column "
          + spec.getName());
    }

    Arrays.sort(revised);
    stringValueToIndexMap = new Object2IntOpenHashMap<>(rowCount);

    final FixedByteSingleValueMultiColWriter writer =
        new FixedByteSingleValueMultiColWriter(dictionaryFile, rowCount, 1,
            new int[] { stringColumnMaxLength });
    try {
      for (int i = 0; i < revised.length; i++) {
        writer.setString(i, 0, revised[i]);
        // No need to store padded value, we can store and lookup by raw value. In certain cases, original sorted order
        // may be different from revised sorted order [PINOT-2730], so would need to use the original order in value
        // to index map.
        final String origString = revisedMap.get(revised[i]);
        stringValueToIndexMap.put(origString, i);
      }
    } finally {
      writer.close();
    }
  }

  /**
   * @return the UTF-8 byte length of the longest string value, as computed by {@link #build(boolean[])}
   *         for STRING/BOOLEAN columns (at least 1 after such a build, 0 before)
   */
  public int getStringColumnMaxLength() {
    return stringColumnMaxLength;
  }

  /**
   * Looks up the dictionary index of a single value. {@link #build(boolean[])} must have been called
   * first, otherwise the lookup map is still {@code null}.
   *
   * <p>NOTE(review): numeric lookups use the boxed {@code get(Object)} overloads of the fastutil maps;
   * the behavior for a value that is not in the dictionary depends on fastutil - verify before relying on it.
   *
   * @param e the value; for STRING/BOOLEAN columns its {@code toString()} form is looked up
   * @return the index recorded for the value during build
   * @throws UnsupportedOperationException if the column data type is not supported
   */
  public int indexOfSV(Object e) {
    switch (spec.getDataType()) {
      case INT:
        return intValueToIndexMap.get(e);
      case FLOAT:
        return floatValueToIndexMap.get(e);
      case DOUBLE:
        return doubleValueToIndexMap.get(e);
      case LONG:
        return longValueToIndexMap.get(e);
      case STRING:
      case BOOLEAN:
        String value = e.toString();
        return stringValueToIndexMap.get(value);
      default:
        throw new UnsupportedOperationException("Unsupported data type : " + spec.getDataType() +
            " for column : " + spec.getName());
    }
  }

  /**
   * Looks up the dictionary index of every element of a multi-value entry.
   *
   * @param e the multi-value entry; must be an {@code Object[]}
   * @return an array of the same length holding the index of each element, in order
   * @throws UnsupportedOperationException if the column data type is not supported
   */
  public int[] indexOfMV(Object e) {
    final Object[] multiValues = (Object[]) e;
    final int[] ret = new int[multiValues.length];
    switch (spec.getDataType()) {
      case INT:
        for (int i = 0; i < multiValues.length; i++) {
          ret[i] = intValueToIndexMap.get(multiValues[i]);
        }
        break;
      case FLOAT:
        for (int i = 0; i < multiValues.length; i++) {
          ret[i] = floatValueToIndexMap.get(multiValues[i]);
        }
        break;
      case LONG:
        for (int i = 0; i < multiValues.length; i++) {
          ret[i] = longValueToIndexMap.get(multiValues[i]);
        }
        break;
      case DOUBLE:
        for (int i = 0; i < multiValues.length; i++) {
          ret[i] = doubleValueToIndexMap.get(multiValues[i]);
        }
        break;
      case STRING:
      case BOOLEAN:
        for (int i = 0; i < multiValues.length; i++) {
          String value = multiValues[i].toString();
          ret[i] = stringValueToIndexMap.get(value);
        }
        break;
      default:
        throw new UnsupportedOperationException("Unsupported data type : " + spec.getDataType() +
            " for multivalue column : " + spec.getName());
    }
    return ret;
  }

  /**
   * Appends padding characters to the input string until its UTF-8 encoding is {@code targetLength}
   * bytes long. If the string already occupies at least {@code targetLength} bytes it is returned unchanged.
   *
   * @param inputString string to pad
   * @param targetLength desired length, in UTF-8 bytes
   * @param paddingChar should be in range u0000 to u007F, other chars would occupy more than one byte under utf-8
   * @return the padded string, or the input string itself when no padding is needed
   */
  public static String getPaddedString(String inputString, int targetLength, char paddingChar) {
    // Cheap fast path: a string never encodes to fewer bytes than it has chars.
    if (inputString.length() >= targetLength) {
      return inputString;
    }
    final int padding = targetLength - inputString.getBytes(utf8CharSet).length;
    if (padding <= 0) {
      // Multi-byte characters already fill (or exceed) the target byte length.
      return inputString;
    }
    final StringBuilder stringBuilder = new StringBuilder(inputString.length() + padding);
    stringBuilder.append(inputString);
    for (int i = 0; i < padding; i++) {
      stringBuilder.append(paddingChar);
    }
    return stringBuilder.toString();
  }
}