/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.segments.v1.creator;

import com.linkedin.pinot.common.data.DimensionFieldSpec;
import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.data.FieldSpec.DataType;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.segment.ReadMode;
import com.linkedin.pinot.core.indexsegment.columnar.ColumnarSegmentLoader;
import com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig;
import com.linkedin.pinot.core.indexsegment.utils.AvroUtils;
import com.linkedin.pinot.core.segment.creator.SegmentIndexCreationDriver;
import com.linkedin.pinot.core.segment.creator.StatsCollectorConfig;
import com.linkedin.pinot.core.segment.creator.impl.SegmentCreationDriverFactory;
import com.linkedin.pinot.core.segment.creator.impl.SegmentDictionaryCreator;
import com.linkedin.pinot.core.segment.creator.impl.V1Constants;
import com.linkedin.pinot.core.segment.creator.impl.stats.AbstractColumnStatisticsCollector;
import com.linkedin.pinot.core.segment.creator.impl.stats.DoubleColumnPreIndexStatsCollector;
import com.linkedin.pinot.core.segment.creator.impl.stats.FloatColumnPreIndexStatsCollector;
import com.linkedin.pinot.core.segment.creator.impl.stats.IntColumnPreIndexStatsCollector;
import com.linkedin.pinot.core.segment.creator.impl.stats.LongColumnPreIndexStatsCollector;
import com.linkedin.pinot.core.segment.creator.impl.stats.StringColumnPreIndexStatsCollector;
import com.linkedin.pinot.core.segment.index.ColumnMetadata;
import com.linkedin.pinot.core.segment.index.IndexSegmentImpl;
import com.linkedin.pinot.core.segment.index.SegmentMetadataImpl;
import com.linkedin.pinot.core.segment.index.readers.DoubleDictionary;
import com.linkedin.pinot.core.segment.index.readers.FloatDictionary;
import com.linkedin.pinot.core.segment.index.readers.ImmutableDictionaryReader;
import com.linkedin.pinot.core.segment.index.readers.IntDictionary;
import com.linkedin.pinot.core.segment.index.readers.LongDictionary;
import com.linkedin.pinot.core.segment.index.readers.StringDictionary;
import com.linkedin.pinot.util.TestUtils;
import java.io.File;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.apache.avro.Schema.Field;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.util.Utf8;
import org.apache.commons.io.FileUtils;
import org.testng.Assert;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;


/**
 * Tests for segment dictionaries: dictionary readers loaded from a generated segment, the
 * per-type pre-index stats collectors, and string padding in {@link SegmentDictionaryCreator}.
 */
public class DictionariesTest {
  private static final String AVRO_DATA = "data/test_sample_data.avro";
  private static final File INDEX_DIR = new File(DictionariesTest.class.toString());

  /** Scratch directory shared by the {@link SegmentDictionaryCreator} tests. */
  private static final String DICTIONARY_TEST_DIR = "/tmp/dict.test";

  /** Distinct values seen per column in the Avro input, filled in by {@link #before()}. */
  static Map<String, Set<Object>> uniqueEntries;

  private static File segmentDirectory;

  /** Removes the segment directory generated by {@link #before()}. */
  @AfterClass
  public static void cleanup() {
    FileUtils.deleteQuietly(INDEX_DIR);
  }

  /**
   * Builds a segment from the sample Avro file and records the distinct values of every Avro
   * column in {@link #uniqueEntries}.
   *
   * @throws Exception if the segment cannot be built or the Avro file cannot be read
   */
  @BeforeClass
  public static void before() throws Exception {
    final String filePath =
        TestUtils.getFileFromResourceUrl(DictionariesTest.class.getClassLoader().getResource(AVRO_DATA));
    final File avroFile = new File(filePath);

    if (INDEX_DIR.exists()) {
      FileUtils.deleteQuietly(INDEX_DIR);
    }

    final SegmentGeneratorConfig config =
        SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(avroFile, INDEX_DIR, "time_day",
            TimeUnit.DAYS, "test");
    final SegmentIndexCreationDriver driver = SegmentCreationDriverFactory.get(null);
    driver.init(config);
    driver.build();
    segmentDirectory = new File(INDEX_DIR, driver.getSegmentName());

    final Schema schema = AvroUtils.extractSchemaFromAvro(avroFile);
    final DataFileStream<GenericRecord> avroReader = AvroUtils.getAvroReader(avroFile);
    try {
      final List<Field> avroFields = avroReader.getSchema().getFields();
      final String[] columns = new String[avroFields.size()];
      int i = 0;
      for (final Field field : avroFields) {
        columns[i++] = field.name();
      }

      uniqueEntries = new HashMap<String, Set<Object>>();
      for (final String column : columns) {
        uniqueEntries.put(column, new HashSet<Object>());
      }

      while (avroReader.hasNext()) {
        final GenericRecord record = avroReader.next();
        for (final String column : columns) {
          Object value = record.get(column);
          if (value instanceof Utf8) {
            // Store plain Strings so that lookups against the dictionaries use the same type.
            value = value.toString();
          }
          uniqueEntries.get(column).add(getAppropriateType(schema.getFieldSpecFor(column).getDataType(), value));
        }
      }
    } finally {
      // The reader holds the Avro file open; release it even if reading fails.
      avroReader.close();
    }
  }

  /**
   * Substitutes the null placeholder from {@link V1Constants} matching the column data type when
   * the value is null; returns the value unchanged otherwise.
   *
   * @param spec data type of the column
   * @param val value read from the input, possibly null
   * @return {@code val}, or the null placeholder for {@code spec} when {@code val} is null
   */
  private static Object getAppropriateType(DataType spec, Object val) {
    if (val != null) {
      return val;
    }
    switch (spec) {
      case DOUBLE:
        return V1Constants.Numbers.NULL_DOUBLE;
      case FLOAT:
        return V1Constants.Numbers.NULL_FLOAT;
      case INT:
        return V1Constants.Numbers.NULL_INT;
      case LONG:
        return V1Constants.Numbers.NULL_LONG;
      default:
        return V1Constants.Str.NULL_STRING;
    }
  }

  /**
   * Loads the segment in heap and mmap modes and checks, for every column, that both dictionaries
   * are of the reader type matching the column data type and hold the same entries.
   *
   * @throws Exception if the segment cannot be loaded
   */
  @Test
  public void test1() throws Exception {
    final IndexSegmentImpl heapSegment = (IndexSegmentImpl) ColumnarSegmentLoader.load(segmentDirectory, ReadMode.heap);
    final IndexSegmentImpl mmapSegment = (IndexSegmentImpl) ColumnarSegmentLoader.load(segmentDirectory, ReadMode.mmap);
    final Map<String, ColumnMetadata> metadataMap =
        ((SegmentMetadataImpl) mmapSegment.getSegmentMetadata()).getColumnMetadataMap();

    for (final String column : metadataMap.keySet()) {
      final ImmutableDictionaryReader heapDictionary = heapSegment.getDictionaryFor(column);
      final ImmutableDictionaryReader mmapDictionary = mmapSegment.getDictionaryFor(column);

      switch (metadataMap.get(column).getDataType()) {
        case BOOLEAN:
        case STRING:
          Assert.assertTrue(heapDictionary instanceof StringDictionary);
          Assert.assertTrue(mmapDictionary instanceof StringDictionary);
          break;
        case DOUBLE:
          Assert.assertTrue(heapDictionary instanceof DoubleDictionary);
          Assert.assertTrue(mmapDictionary instanceof DoubleDictionary);
          break;
        case FLOAT:
          Assert.assertTrue(heapDictionary instanceof FloatDictionary);
          Assert.assertTrue(mmapDictionary instanceof FloatDictionary);
          break;
        case LONG:
          Assert.assertTrue(heapDictionary instanceof LongDictionary);
          Assert.assertTrue(mmapDictionary instanceof LongDictionary);
          break;
        case INT:
          Assert.assertTrue(heapDictionary instanceof IntDictionary);
          Assert.assertTrue(mmapDictionary instanceof IntDictionary);
          break;
        default:
          // Other data types have no reader-type expectation in this test.
          break;
      }

      Assert.assertEquals(mmapDictionary.length(), heapDictionary.length());
      for (int i = 0; i < heapDictionary.length(); i++) {
        Assert.assertEquals(mmapDictionary.get(i), heapDictionary.get(i));
      }
    }
  }

  /**
   * Looks up every distinct input value (in shuffled order) in the heap and mmap dictionaries and
   * checks that both return the same index, and that the value is found for every column other
   * than {@code pageKey}.
   *
   * @throws Exception if the segment cannot be loaded
   */
  @Test
  public void test2() throws Exception {
    final IndexSegmentImpl heapSegment = (IndexSegmentImpl) ColumnarSegmentLoader.load(segmentDirectory, ReadMode.heap);
    final IndexSegmentImpl mmapSegment = (IndexSegmentImpl) ColumnarSegmentLoader.load(segmentDirectory, ReadMode.mmap);
    final Map<String, ColumnMetadata> metadataMap =
        ((SegmentMetadataImpl) mmapSegment.getSegmentMetadata()).getColumnMetadataMap();

    for (final String column : metadataMap.keySet()) {
      final ImmutableDictionaryReader heapDictionary = heapSegment.getDictionaryFor(column);
      final ImmutableDictionaryReader mmapDictionary = mmapSegment.getDictionaryFor(column);

      final List<Object> shuffledUniques = Arrays.asList(uniqueEntries.get(column).toArray());
      Collections.shuffle(shuffledUniques);

      // NOTE(review): pageKey is exempt from the "must be found" check; the reason is not
      // visible in this file.
      final boolean mustBeFound = !column.equals("pageKey");
      for (final Object entry : shuffledUniques) {
        Assert.assertEquals(mmapDictionary.indexOf(entry), heapDictionary.indexOf(entry));
        if (mustBeFound) {
          Assert.assertFalse(heapDictionary.indexOf(entry) < 0);
          Assert.assertFalse(mmapDictionary.indexOf(entry) < 0);
        }
      }
    }
  }

  /** Runs the shared numeric stats-collector scenario against an INT column. */
  @Test
  public void testIntColumnPreIndexStatsCollector() throws Exception {
    verifyNumericStatsCollector(DataType.INT);
  }

  /** Runs the shared numeric stats-collector scenario against a FLOAT column. */
  @Test
  public void testFloatColumnPreIndexStatsCollector() throws Exception {
    verifyNumericStatsCollector(DataType.FLOAT);
  }

  /** Runs the shared numeric stats-collector scenario against a LONG column. */
  @Test
  public void testLongColumnPreIndexStatsCollector() throws Exception {
    verifyNumericStatsCollector(DataType.LONG);
  }

  /** Runs the shared numeric stats-collector scenario against a DOUBLE column. */
  @Test
  public void testDoubleColumnPreIndexStatsCollector() throws Exception {
    verifyNumericStatsCollector(DataType.DOUBLE);
  }

  /**
   * Feeds distinct strings to the STRING stats collector and checks sortedness tracking,
   * cardinality, min and max.
   */
  @Test
  public void testStringColumnPreIndexStatsCollectorForRandomString() throws Exception {
    AbstractColumnStatisticsCollector statsCollector = buildStatsCollector("column1", DataType.STRING);

    collectAndAssertSorted(statsCollector, "a", true);
    collectAndAssertSorted(statsCollector, "b", true);
    collectAndAssertSorted(statsCollector, "c", true);
    collectAndAssertSorted(statsCollector, "d", true);
    collectAndAssertSorted(statsCollector, "d", true);
    // First out-of-order value: the collector must report unsorted from here on.
    collectAndAssertSorted(statsCollector, "b", false);
    collectAndAssertSorted(statsCollector, "z", false);
    collectAndAssertSorted(statsCollector, "u", false);
    statsCollector.seal();

    Assert.assertEquals(statsCollector.getCardinality(), 6);
    Assert.assertEquals((statsCollector.getMinValue()).toString(), "a");
    Assert.assertEquals((statsCollector.getMaxValue()).toString(), "z");
    Assert.assertFalse(statsCollector.isSorted());
  }

  /**
   * Feeds "true"/"false" strings to the stats collector built for a BOOLEAN column and checks
   * sortedness tracking, cardinality, min and max.
   */
  @Test
  public void testStringColumnPreIndexStatsCollectorForBoolean() throws Exception {
    AbstractColumnStatisticsCollector statsCollector = buildStatsCollector("column1", DataType.BOOLEAN);

    collectAndAssertSorted(statsCollector, "false", true);
    collectAndAssertSorted(statsCollector, "false", true);
    collectAndAssertSorted(statsCollector, "false", true);
    collectAndAssertSorted(statsCollector, "true", true);
    collectAndAssertSorted(statsCollector, "true", true);
    // First out-of-order value: the collector must report unsorted from here on.
    collectAndAssertSorted(statsCollector, "false", false);
    collectAndAssertSorted(statsCollector, "false", false);
    collectAndAssertSorted(statsCollector, "true", false);
    statsCollector.seal();

    Assert.assertEquals(statsCollector.getCardinality(), 2);
    Assert.assertEquals((statsCollector.getMinValue()).toString(), "false");
    Assert.assertEquals((statsCollector.getMaxValue()).toString(), "true");
    Assert.assertFalse(statsCollector.isSorted());
  }

  /**
   * Tests DictionaryCreator for case when one value is a substring of another.
   * For example, in case of sorted values {"abc", "abc def"} after padding,
   * the sorted order would change to {"abc def%%%%", "abc%%%%%%%"}
   *
   * This test asserts that DictionaryCreator.indexOfSV("abc") returns 1 (ie index of "abc%%%%%%%"
   * in actual padded dictionary), and not 0.
   *
   * @throws Exception
   */
  @Test
  public void testStringsValuesWithPadding() throws Exception {
    File indexDir = new File(DICTIONARY_TEST_DIR);
    indexDir.deleteOnExit();
    FieldSpec fieldSpec = new DimensionFieldSpec("test", DataType.STRING, true);
    char paddingChar = '%';

    String[] inputStrings = {"abc def", "abc"};
    Arrays.sort(inputStrings); // Sorted order: {"abc", "abc def"}

    boolean[] isSorted = {true};
    try {
      SegmentDictionaryCreator dictionaryCreator =
          new SegmentDictionaryCreator(false, inputStrings, fieldSpec, indexDir, paddingChar);
      try {
        dictionaryCreator.build(isSorted);
        // Padding changes the relative order of the two values, so build() must clear the flag.
        Assert.assertFalse(isSorted[0]);

        assertIndexOfMatchesPaddedOrder(dictionaryCreator, inputStrings, paddingChar);
      } finally {
        dictionaryCreator.close();
      }
    } finally {
      FileUtils.deleteQuietly(indexDir);
    }
  }

  /**
   * Test for ensuring that Strings with special characters can be handled
   * correctly.
   *
   * @throws Exception
   */
  @Test
  public void testUTF8Characters() throws Exception {
    File indexDir = new File(DICTIONARY_TEST_DIR);
    indexDir.deleteOnExit();
    FieldSpec fieldSpec = new DimensionFieldSpec("test", DataType.STRING, true);
    char paddingChar = '%';

    // The byte sequences are UTF-8 encodings; decode them explicitly as UTF-8 so the test does
    // not depend on the platform default charset.
    String[] inputStrings = {
        new String(new byte[]{67, 97, 102, -61, -87}, StandardCharsets.UTF_8), // "Café"
        new String(new byte[]{70, 114, 97, 110, -61, -89, 111, 105, 115}, StandardCharsets.UTF_8), // "François"
        new String(new byte[]{67, -61, -76, 116, 101, 32, 100, 39, 73, 118, 111, 105, 114, 101},
            StandardCharsets.UTF_8) // "Côte d'Ivoire"
    };
    Arrays.sort(inputStrings);

    try {
      SegmentDictionaryCreator dictionaryCreator =
          new SegmentDictionaryCreator(false, inputStrings, fieldSpec, indexDir, paddingChar);
      try {
        dictionaryCreator.build(new boolean[]{false});
        for (String inputString : inputStrings) {
          Assert.assertTrue(dictionaryCreator.indexOfSV(inputString) >= 0,
              "Value not found in dictionary " + inputString);
        }
      } finally {
        dictionaryCreator.close();
      }
    } finally {
      FileUtils.deleteQuietly(indexDir);
    }
  }

  /**
   * Tests SegmentDictionaryCreator for case when there is only one string
   * and it is empty.
   *
   * This test asserts that the padded length of the empty string is 1
   * in the actual padded dictionary, and not 0.
   *
   * @throws Exception
   */
  @Test
  public void testSingleEmptyString() throws Exception {
    File indexDir = new File(DICTIONARY_TEST_DIR);
    indexDir.deleteOnExit();
    FieldSpec fieldSpec = new DimensionFieldSpec("test", DataType.STRING, true);

    String[] inputStrings = {""};
    Arrays.sort(inputStrings); // Sorted order: {""}

    try {
      SegmentDictionaryCreator dictionaryCreator =
          new SegmentDictionaryCreator(false, inputStrings, fieldSpec, indexDir,
              V1Constants.Str.DEFAULT_STRING_PAD_CHAR);
      try {
        dictionaryCreator.build(new boolean[]{true});

        int targetPaddedLength = dictionaryCreator.getStringColumnMaxLength();
        Assert.assertEquals(targetPaddedLength, 1);

        String[] paddedStrings =
            assertIndexOfMatchesPaddedOrder(dictionaryCreator, inputStrings,
                V1Constants.Str.DEFAULT_STRING_PAD_CHAR);

        // Verify that empty string got padded
        Assert.assertEquals(paddedStrings[0].length(), targetPaddedLength);
      } finally {
        dictionaryCreator.close();
      }
    } finally {
      FileUtils.deleteQuietly(indexDir);
    }
  }

  /**
   * Tests SegmentDictionaryCreator for case when there is one empty string
   * and a string consisting of a single padding character ('%').
   *
   * This test asserts that building the dictionary fails with the
   * "Number of entries in dictionary != number of unique values" error.
   *
   * @throws Exception
   */
  @Test
  public void testPaddedConflict() throws Exception {
    File indexDir = new File(DICTIONARY_TEST_DIR);
    indexDir.deleteOnExit();
    FieldSpec fieldSpec = new DimensionFieldSpec("test", DataType.STRING, true);
    char paddingChar = '%';

    String[] inputStrings = {"", "%"};
    Arrays.sort(inputStrings); // Sorted order: {"", "%"}

    try {
      // NOTE(review): the creator is not closed here, as in the original test; the behaviour
      // of close() after a failed build() is not visible from this file.
      SegmentDictionaryCreator dictionaryCreator =
          new SegmentDictionaryCreator(false, inputStrings, fieldSpec, indexDir, paddingChar);
      dictionaryCreator.build(new boolean[]{true});
      // Assert.fail throws AssertionError, which the catch block below does not intercept.
      Assert.fail("Expected dictionary build to fail when two values collide after padding");
    } catch (Exception e) {
      Assert.assertEquals(e.getMessage(),
          "Number of entries in dictionary != number of unique values in the data in column test");
    } finally {
      FileUtils.deleteQuietly(indexDir);
    }
  }

  /**
   * Tests SegmentDictionaryCreator for case when there is one empty string
   * and a string with a single '%' character, while the padding character is '\0'.
   *
   * This test asserts that the dictionary builds and that indexOfSV for each un-padded value
   * returns the index of the corresponding padded value.
   *
   * @throws Exception
   */
  @Test
  public void testPaddedNoConflict() throws Exception {
    File indexDir = new File(DICTIONARY_TEST_DIR);
    FieldSpec fieldSpec = new DimensionFieldSpec("test", DataType.STRING, true);
    char paddingChar = '\0';

    String[] inputStrings = {"", "%"};
    Arrays.sort(inputStrings); // Sorted order: {"", "%"}

    try {
      SegmentDictionaryCreator dictionaryCreator =
          new SegmentDictionaryCreator(false, inputStrings, fieldSpec, indexDir, paddingChar);
      try {
        dictionaryCreator.build(new boolean[]{true});
        assertIndexOfMatchesPaddedOrder(dictionaryCreator, inputStrings, paddingChar);
      } finally {
        dictionaryCreator.close();
      }
    } finally {
      FileUtils.deleteQuietly(indexDir);
    }
  }

  /**
   * Tests SegmentDictionaryCreator for case when there is only one string
   * and it is "null".
   *
   * This test asserts that the padded length of the null string is 4.
   *
   * @throws Exception
   */
  @Test
  public void testSingleNullString() throws Exception {
    File indexDir = new File(DICTIONARY_TEST_DIR);
    FieldSpec fieldSpec = new DimensionFieldSpec("test", DataType.STRING, true);

    String[] inputStrings = {"null"};
    Arrays.sort(inputStrings); // Sorted order: {"null"}

    try {
      SegmentDictionaryCreator dictionaryCreator =
          new SegmentDictionaryCreator(false, inputStrings, fieldSpec, indexDir,
              V1Constants.Str.DEFAULT_STRING_PAD_CHAR);
      try {
        dictionaryCreator.build(new boolean[]{true});

        Assert.assertEquals(dictionaryCreator.getStringColumnMaxLength(), 4);

        String[] paddedStrings =
            assertIndexOfMatchesPaddedOrder(dictionaryCreator, inputStrings,
                V1Constants.Str.DEFAULT_STRING_PAD_CHAR);

        // Verify that the string "null" did not get changed
        Assert.assertEquals(paddedStrings[0], "null");
      } finally {
        dictionaryCreator.close();
      }
    } finally {
      FileUtils.deleteQuietly(indexDir);
    }
  }

  /**
   * Shared scenario for the numeric stats collectors: collects a mix of boxed numeric values
   * that is non-decreasing for the first five entries and then goes out of order, checking
   * {@code isSorted()} after every value and cardinality/min/max after sealing.
   *
   * @param dataType Numeric data type of the column under test
   */
  private void verifyNumericStatsCollector(DataType dataType) throws Exception {
    AbstractColumnStatisticsCollector statsCollector = buildStatsCollector("column1", dataType);

    collectAndAssertSorted(statsCollector, Integer.valueOf(1), true);
    collectAndAssertSorted(statsCollector, Float.valueOf(2f), true);
    collectAndAssertSorted(statsCollector, Long.valueOf(3L), true);
    collectAndAssertSorted(statsCollector, Double.valueOf(4d), true);
    collectAndAssertSorted(statsCollector, Integer.valueOf(4), true);
    // First out-of-order value: the collector must report unsorted from here on.
    collectAndAssertSorted(statsCollector, Float.valueOf(2f), false);
    collectAndAssertSorted(statsCollector, Double.valueOf(40d), false);
    collectAndAssertSorted(statsCollector, Double.valueOf(20d), false);
    statsCollector.seal();

    Assert.assertEquals(statsCollector.getCardinality(), 6);
    Assert.assertEquals(((Number) statsCollector.getMinValue()).intValue(), 1);
    Assert.assertEquals(((Number) statsCollector.getMaxValue()).intValue(), 40);
    Assert.assertFalse(statsCollector.isSorted());
  }

  /**
   * Collects one value and asserts the collector's sortedness flag afterwards.
   *
   * @param statsCollector Collector under test
   * @param value Value to collect
   * @param expectedSorted Expected result of {@code isSorted()} after collecting the value
   */
  private static void collectAndAssertSorted(AbstractColumnStatisticsCollector statsCollector, Object value,
      boolean expectedSorted) throws Exception {
    statsCollector.collect(value);
    Assert.assertEquals(statsCollector.isSorted(), expectedSorted,
        "Unexpected isSorted() after collecting " + value);
  }

  /**
   * Pads every input string to the dictionary's string column max length, sorts the padded
   * strings, and asserts that indexOfSV for each un-padded string returns the index of the
   * corresponding padded string in that sorted order.
   *
   * @param dictionaryCreator Dictionary creator on which build() has already been called
   * @param inputStrings Un-padded values the dictionary was built from
   * @param paddingChar Padding character the dictionary was built with
   * @return The padded strings in sorted order
   */
  private static String[] assertIndexOfMatchesPaddedOrder(SegmentDictionaryCreator dictionaryCreator,
      String[] inputStrings, char paddingChar) throws Exception {
    // Get the padded string as stored in the dictionary.
    int targetPaddedLength = dictionaryCreator.getStringColumnMaxLength();
    String[] paddedStrings = new String[inputStrings.length];
    for (int i = 0; i < inputStrings.length; i++) {
      paddedStrings[i] = SegmentDictionaryCreator.getPaddedString(inputStrings[i], targetPaddedLength, paddingChar);
    }
    Arrays.sort(paddedStrings);

    for (String inputString : inputStrings) {
      int paddedIndex = dictionaryCreator.indexOfSV(inputString);
      // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
      Assert.assertTrue(paddedIndex >= 0 && paddedIndex < paddedStrings.length,
          "Unexpected dictionary index " + paddedIndex + " for value '" + inputString + "'");
      Assert.assertEquals(paddedStrings[paddedIndex],
          SegmentDictionaryCreator.getPaddedString(inputString, targetPaddedLength, paddingChar));
    }
    return paddedStrings;
  }

  /**
   * Helper method to build stats collector for a given column.
   *
   * @param column Column name
   * @param dataType Data type for the column
   * @return StatsCollector for the column
   * @throws IllegalArgumentException if no stats collector exists for the data type
   */
  private static AbstractColumnStatisticsCollector buildStatsCollector(String column, DataType dataType) {
    Schema schema = new Schema();
    schema.addField(new DimensionFieldSpec(column, dataType, true));
    StatsCollectorConfig statsCollectorConfig = new StatsCollectorConfig(schema, null);

    switch (dataType) {
      case INT:
        return new IntColumnPreIndexStatsCollector(column, statsCollectorConfig);
      case LONG:
        return new LongColumnPreIndexStatsCollector(column, statsCollectorConfig);
      case FLOAT:
        return new FloatColumnPreIndexStatsCollector(column, statsCollectorConfig);
      case DOUBLE:
        return new DoubleColumnPreIndexStatsCollector(column, statsCollectorConfig);
      case STRING:
      case BOOLEAN:
        // Booleans are collected with the string collector.
        return new StringColumnPreIndexStatsCollector(column, statsCollectorConfig);
      default:
        throw new IllegalArgumentException("Illegal data type for stats builder: " + dataType);
    }
  }
}