// DictionariesTest.java example
//
// Explorer
// pinot-master
/**
 * Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.linkedin.pinot.segments.v1.creator;

import com.linkedin.pinot.common.data.DimensionFieldSpec;
import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.data.FieldSpec.DataType;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.common.segment.ReadMode;
import com.linkedin.pinot.core.indexsegment.columnar.ColumnarSegmentLoader;
import com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig;
import com.linkedin.pinot.core.indexsegment.utils.AvroUtils;
import com.linkedin.pinot.core.segment.creator.impl.stats.AbstractColumnStatisticsCollector;
import com.linkedin.pinot.core.segment.creator.SegmentIndexCreationDriver;
import com.linkedin.pinot.core.segment.creator.StatsCollectorConfig;
import com.linkedin.pinot.core.segment.creator.impl.SegmentCreationDriverFactory;
import com.linkedin.pinot.core.segment.creator.impl.SegmentDictionaryCreator;
import com.linkedin.pinot.core.segment.creator.impl.V1Constants;
import com.linkedin.pinot.core.segment.creator.impl.stats.DoubleColumnPreIndexStatsCollector;
import com.linkedin.pinot.core.segment.creator.impl.stats.FloatColumnPreIndexStatsCollector;
import com.linkedin.pinot.core.segment.creator.impl.stats.IntColumnPreIndexStatsCollector;
import com.linkedin.pinot.core.segment.creator.impl.stats.LongColumnPreIndexStatsCollector;
import com.linkedin.pinot.core.segment.creator.impl.stats.StringColumnPreIndexStatsCollector;
import com.linkedin.pinot.core.segment.index.ColumnMetadata;
import com.linkedin.pinot.core.segment.index.IndexSegmentImpl;
import com.linkedin.pinot.core.segment.index.SegmentMetadataImpl;
import com.linkedin.pinot.core.segment.index.readers.DoubleDictionary;
import com.linkedin.pinot.core.segment.index.readers.FloatDictionary;
import com.linkedin.pinot.core.segment.index.readers.ImmutableDictionaryReader;
import com.linkedin.pinot.core.segment.index.readers.IntDictionary;
import com.linkedin.pinot.core.segment.index.readers.LongDictionary;
import com.linkedin.pinot.core.segment.index.readers.StringDictionary;
import com.linkedin.pinot.util.TestUtils;
import java.io.File;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.apache.avro.Schema.Field;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.util.Utf8;
import org.apache.commons.io.FileUtils;
import org.testng.Assert;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;


public class DictionariesTest {
  /** Classpath-relative location of the Avro sample data used to build the test segment. */
  private static final String AVRO_DATA = "data/test_sample_data.avro";

  /**
   * Directory (relative to the JVM working directory) into which the test segment is generated.
   * Named with {@link Class#getName()} rather than {@code toString()}, which would prefix the name
   * with "class " and produce a path containing a space.
   */
  private static final File INDEX_DIR = new File(DictionariesTest.class.getName());

  /** Distinct values per column as read from the Avro input; assigned in {@link #before()}. */
  static Map<String, Set<Object>> uniqueEntries;

  /** Directory of the generated segment; assigned in {@link #before()}. */
  private static File segmentDirectory;

  /**
   * Deletes the index directory generated for this test class after all tests have run.
   * Deletion failures are ignored.
   */
  @AfterClass
  public static void cleanup() {
    final File generatedIndexDir = INDEX_DIR;
    FileUtils.deleteQuietly(generatedIndexDir);
  }

  /**
   * Builds the test segment from the Avro sample file and records, for every Avro column, the set
   * of distinct values found in the input so the tests can look each value up in the dictionaries.
   *
   * @throws Exception if the sample data cannot be located, read, or indexed
   */
  @BeforeClass
  public static void before()
      throws Exception {
    final String filePath =
        TestUtils.getFileFromResourceUrl(DictionariesTest.class.getClassLoader().getResource(AVRO_DATA));
    final File avroFile = new File(filePath);

    // deleteQuietly is a no-op for a missing directory, so no exists() check is needed.
    FileUtils.deleteQuietly(INDEX_DIR);

    final SegmentGeneratorConfig config =
        SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(avroFile, INDEX_DIR, "time_day",
            TimeUnit.DAYS, "test");

    final SegmentIndexCreationDriver driver = SegmentCreationDriverFactory.get(null);
    driver.init(config);
    driver.build();
    segmentDirectory = new File(INDEX_DIR, driver.getSegmentName());
    final Schema schema = AvroUtils.extractSchemaFromAvro(avroFile);

    final DataFileStream<GenericRecord> avroReader = AvroUtils.getAvroReader(avroFile);
    try {
      final List<Field> avroFields = avroReader.getSchema().getFields();
      final String[] columns = new String[avroFields.size()];
      for (int i = 0; i < columns.length; i++) {
        columns[i] = avroFields.get(i).name();
      }

      uniqueEntries = new HashMap<String, Set<Object>>();
      for (final String column : columns) {
        uniqueEntries.put(column, new HashSet<Object>());
      }

      while (avroReader.hasNext()) {
        final GenericRecord rec = avroReader.next();
        for (final String column : columns) {
          Object val = rec.get(column);
          // Avro hands strings back as Utf8; store them as java.lang.String.
          if (val instanceof Utf8) {
            val = val.toString();
          }
          uniqueEntries.get(column).add(getAppropriateType(schema.getFieldSpecFor(column).getDataType(), val));
        }
      }
    } finally {
      // The reader holds an open stream on the Avro file; release it even if reading fails.
      avroReader.close();
    }
  }

  /**
   * Substitutes the type-specific null placeholder for a missing value.
   *
   * @param spec data type of the column the value belongs to
   * @param val raw value read from the input, possibly {@code null}
   * @return {@code val} itself when it is non-null; otherwise the {@code V1Constants} null
   *         placeholder for DOUBLE, FLOAT, INT or LONG, and the null string for any other type
   */
  private static Object getAppropriateType(DataType spec, Object val) {
    if (val != null) {
      return val;
    }

    final Object placeholder;
    switch (spec) {
      case INT:
        placeholder = V1Constants.Numbers.NULL_INT;
        break;
      case LONG:
        placeholder = V1Constants.Numbers.NULL_LONG;
        break;
      case FLOAT:
        placeholder = V1Constants.Numbers.NULL_FLOAT;
        break;
      case DOUBLE:
        placeholder = V1Constants.Numbers.NULL_DOUBLE;
        break;
      default:
        placeholder = V1Constants.Str.NULL_STRING;
        break;
    }
    return placeholder;
  }

  /**
   * Loads the segment in heap and mmap modes and checks, column by column, that both dictionaries
   * are of the reader type matching the column's data type and contain identical entries.
   *
   * @throws Exception if the segment cannot be loaded
   */
  @Test
  public void test1()
      throws Exception {
    final IndexSegmentImpl heapSegment = (IndexSegmentImpl) ColumnarSegmentLoader.load(segmentDirectory, ReadMode.heap);
    final IndexSegmentImpl mmapSegment = (IndexSegmentImpl) ColumnarSegmentLoader.load(segmentDirectory, ReadMode.mmap);

    final Map<String, ColumnMetadata> columnMetadataMap =
        ((SegmentMetadataImpl) mmapSegment.getSegmentMetadata()).getColumnMetadataMap();

    for (final Map.Entry<String, ColumnMetadata> columnEntry : columnMetadataMap.entrySet()) {
      final String column = columnEntry.getKey();
      final ImmutableDictionaryReader heapDictionary = heapSegment.getDictionaryFor(column);
      final ImmutableDictionaryReader mmapDictionary = mmapSegment.getDictionaryFor(column);

      // Reader type expected for this column; null means "no expectation" for other data types.
      final Class<?> expectedReaderType;
      switch (columnEntry.getValue().getDataType()) {
        case INT:
          expectedReaderType = IntDictionary.class;
          break;
        case LONG:
          expectedReaderType = LongDictionary.class;
          break;
        case FLOAT:
          expectedReaderType = FloatDictionary.class;
          break;
        case DOUBLE:
          expectedReaderType = DoubleDictionary.class;
          break;
        case BOOLEAN:
        case STRING:
          expectedReaderType = StringDictionary.class;
          break;
        default:
          expectedReaderType = null;
          break;
      }
      if (expectedReaderType != null) {
        Assert.assertTrue(expectedReaderType.isInstance(heapDictionary));
        Assert.assertTrue(expectedReaderType.isInstance(mmapDictionary));
      }

      Assert.assertEquals(mmapDictionary.length(), heapDictionary.length());
      for (int dictId = 0; dictId < heapDictionary.length(); dictId++) {
        Assert.assertEquals(mmapDictionary.get(dictId), heapDictionary.get(dictId));
      }
    }
  }

  /**
   * Looks up every distinct input value (in random order) in both the heap and the mmap
   * dictionary and checks that they agree on its index. For every column except "pageKey" the
   * value must additionally be present (non-negative index) in both dictionaries.
   *
   * @throws Exception if the segment cannot be loaded
   */
  @Test
  public void test2()
      throws Exception {
    final IndexSegmentImpl heapSegment = (IndexSegmentImpl) ColumnarSegmentLoader.load(segmentDirectory, ReadMode.heap);
    final IndexSegmentImpl mmapSegment = (IndexSegmentImpl) ColumnarSegmentLoader.load(segmentDirectory, ReadMode.mmap);

    final SegmentMetadataImpl segmentMetadata = (SegmentMetadataImpl) mmapSegment.getSegmentMetadata();
    final Map<String, ColumnMetadata> metadataMap = segmentMetadata.getColumnMetadataMap();

    for (final String column : metadataMap.keySet()) {
      final ImmutableDictionaryReader heapDictionary = heapSegment.getDictionaryFor(column);
      final ImmutableDictionaryReader mmapDictionary = mmapSegment.getDictionaryFor(column);
      final boolean mustBePresent = !column.equals("pageKey");

      // Arrays.asList is a write-through view, so shuffling it shuffles the backing array.
      final Object[] shuffledValues = uniqueEntries.get(column).toArray();
      Collections.shuffle(Arrays.asList(shuffledValues));

      for (final Object value : shuffledValues) {
        Assert.assertEquals(mmapDictionary.indexOf(value), heapDictionary.indexOf(value));
        if (mustBePresent) {
          Assert.assertFalse(heapDictionary.indexOf(value) < 0);
          Assert.assertFalse(mmapDictionary.indexOf(value) < 0);
        }
      }
    }
  }

  /**
   * Checks sortedness, cardinality and min/max tracking of the INT stats collector.
   *
   * @throws Exception
   */
  @Test
  public void testIntColumnPreIndexStatsCollector()
      throws Exception {
    verifyNumericStatsCollector(DataType.INT);
  }

  /**
   * Checks sortedness, cardinality and min/max tracking of the FLOAT stats collector.
   *
   * @throws Exception
   */
  @Test
  public void testFloatColumnPreIndexStatsCollector()
      throws Exception {
    verifyNumericStatsCollector(DataType.FLOAT);
  }

  /**
   * Checks sortedness, cardinality and min/max tracking of the LONG stats collector.
   *
   * @throws Exception
   */
  @Test
  public void testLongColumnPreIndexStatsCollector()
      throws Exception {
    verifyNumericStatsCollector(DataType.LONG);
  }

  /**
   * Checks sortedness, cardinality and min/max tracking of the DOUBLE stats collector.
   *
   * @throws Exception
   */
  @Test
  public void testDoubleColumnPreIndexStatsCollector()
      throws Exception {
    verifyNumericStatsCollector(DataType.DOUBLE);
  }

  /**
   * Shared body of the numeric stats-collector tests. Feeds the collector for {@code dataType} the
   * values 1, 2, 3, 4, 4, 2, 40, 20 (deliberately boxed as a mix of Integer, Float, Long and
   * Double) and asserts that the collector reports sorted input up to and including the second 4,
   * unsorted from the following 2 onwards, and after sealing a cardinality of 6, a minimum of 1
   * and a maximum of 40.
   *
   * @param dataType numeric data type of the column whose collector is exercised
   * @throws Exception
   */
  private void verifyNumericStatsCollector(DataType dataType)
      throws Exception {
    // valueOf replaces the boxed-type constructors, which are deprecated.
    final Object[] values = {
        Integer.valueOf(1), Float.valueOf(2f), Long.valueOf(3L), Double.valueOf(4d), Integer.valueOf(4),
        Float.valueOf(2f), Double.valueOf(40d), Double.valueOf(20d)
    };
    // The first five values are non-decreasing; the sixth (2 after 4) breaks the order.
    final int sortedPrefixLength = 5;

    AbstractColumnStatisticsCollector statsCollector = buildStatsCollector("column1", dataType);
    for (int i = 0; i < values.length; i++) {
      statsCollector.collect(values[i]);
      final String context = dataType + " collector after value #" + i + " (" + values[i] + ")";
      if (i < sortedPrefixLength) {
        Assert.assertTrue(statsCollector.isSorted(), "Expected sorted: " + context);
      } else {
        Assert.assertFalse(statsCollector.isSorted(), "Expected unsorted: " + context);
      }
    }

    statsCollector.seal();
    Assert.assertEquals(statsCollector.getCardinality(), 6);
    Assert.assertEquals(((Number) statsCollector.getMinValue()).intValue(), 1);
    Assert.assertEquals(((Number) statsCollector.getMaxValue()).intValue(), 40);
    Assert.assertFalse(statsCollector.isSorted());
  }

  /**
   * Feeds the STRING stats collector an ascending run followed by out-of-order values and checks
   * the sorted flag after every value, then cardinality, min and max once sealed.
   *
   * @throws Exception
   */
  @Test
  public void testStringColumnPreIndexStatsCollectorForRandomString()
      throws Exception {
    final String[] ascendingRun = {"a", "b", "c", "d", "d"};
    final String[] outOfOrderTail = {"b", "z", "u"};

    AbstractColumnStatisticsCollector collector = buildStatsCollector("column1", DataType.STRING);

    for (final String value : ascendingRun) {
      collector.collect(value);
      Assert.assertTrue(collector.isSorted());
    }
    for (final String value : outOfOrderTail) {
      collector.collect(value);
      Assert.assertFalse(collector.isSorted());
    }

    collector.seal();
    Assert.assertEquals(collector.getCardinality(), 6);
    Assert.assertEquals((collector.getMinValue()).toString(), "a");
    Assert.assertEquals((collector.getMaxValue()).toString(), "z");
    Assert.assertFalse(collector.isSorted());
  }

  /**
   * Feeds the BOOLEAN stats collector "false"/"true" strings, first in ascending order and then
   * out of order, checking the sorted flag after every value and cardinality, min and max once
   * sealed.
   *
   * @throws Exception
   */
  @Test
  public void testStringColumnPreIndexStatsCollectorForBoolean()
      throws Exception {
    final String[] ascendingRun = {"false", "false", "false", "true", "true"};
    final String[] outOfOrderTail = {"false", "false", "true"};

    AbstractColumnStatisticsCollector collector = buildStatsCollector("column1", DataType.BOOLEAN);

    for (final String value : ascendingRun) {
      collector.collect(value);
      Assert.assertTrue(collector.isSorted());
    }
    for (final String value : outOfOrderTail) {
      collector.collect(value);
      Assert.assertFalse(collector.isSorted());
    }

    collector.seal();
    Assert.assertEquals(collector.getCardinality(), 2);
    Assert.assertEquals((collector.getMinValue()).toString(), "false");
    Assert.assertEquals((collector.getMaxValue()).toString(), "true");
    Assert.assertFalse(collector.isSorted());
  }

  /**
   * Tests DictionaryCreator for case when one value is a substring of another.
   * For example, in case of sorted values {"abc", "abc def"} after padding,
   * the sorted order would change to {"abc def%%%%", "abc%%%%%%%"}
   *
   * This test asserts that DictionaryCreator.indexOfSV("abc") returns 1 (ie index of "abc%%%%%%%"
   * in actual padded dictionary), and not 0.
   *
   * The dictionary creator is closed and the working directory removed even when an assertion
   * fails, so a failure here does not leave files behind for the other dictionary tests.
   *
   * @throws Exception
   */
  @Test
  public void testStringsValuesWithPadding()
      throws Exception {
    // Resolve against the platform temp directory instead of a hard-coded "/tmp".
    final File indexDir = new File(System.getProperty("java.io.tmpdir"), "dict.test");
    final FieldSpec fieldSpec = new DimensionFieldSpec("test", DataType.STRING, true);
    final char paddingChar = '%';

    final String[] inputStrings = {"abc def", "abc"};
    Arrays.sort(inputStrings); // Sorted order: {"abc", "abc def"}

    final boolean[] isSorted = {true};
    try {
      final SegmentDictionaryCreator dictionaryCreator =
          new SegmentDictionaryCreator(false, inputStrings, fieldSpec, indexDir, paddingChar);
      try {
        dictionaryCreator.build(isSorted);
        Assert.assertFalse(isSorted[0], "Padding should have changed the sort order");

        // Get the padded string as stored in the dictionary.
        final int targetPaddedLength = dictionaryCreator.getStringColumnMaxLength();
        final String[] paddedStrings = new String[inputStrings.length];
        for (int i = 0; i < inputStrings.length; i++) {
          paddedStrings[i] =
              SegmentDictionaryCreator.getPaddedString(inputStrings[i], targetPaddedLength, paddingChar);
        }
        Arrays.sort(paddedStrings); // Sorted Order: {"abc def%%%%", "abc%%%%%%%"}

        // Assert that indexOfSV for un-padded string returns the index of the corresponding padded string.
        for (final String inputString : inputStrings) {
          final int paddedIndex = dictionaryCreator.indexOfSV(inputString);
          Assert.assertEquals(paddedStrings[paddedIndex],
              SegmentDictionaryCreator.getPaddedString(inputString, targetPaddedLength, paddingChar));
        }
      } finally {
        dictionaryCreator.close();
      }
    } finally {
      FileUtils.deleteQuietly(indexDir);
    }
  }

  /**
   * Test for ensuring that Strings with special characters can be handled
   * correctly.
   *
   * The input strings are decoded explicitly as UTF-8 so the test exercises the same multi-byte
   * characters regardless of the platform default charset.
   *
   * @throws Exception
   */
  @Test
  public void testUTF8Characters()
      throws Exception {
    // Resolve against the platform temp directory instead of a hard-coded "/tmp".
    final File indexDir = new File(System.getProperty("java.io.tmpdir"), "dict.test");
    final FieldSpec fieldSpec = new DimensionFieldSpec("test", DataType.STRING, true);
    final char paddingChar = '%';
    final String utf8 = "UTF-8";

    final String[] inputStrings = new String[3];
    inputStrings[0] = new String(new byte[]{67, 97, 102, -61, -87}, utf8); // "Café";
    inputStrings[1] = new String(new byte[]{70, 114, 97, 110, -61, -89, 111, 105, 115}, utf8); // "François";
    inputStrings[2] = new String(
        new byte[]{67, -61, -76, 116, 101, 32, 100, 39, 73, 118, 111, 105, 114, 101}, utf8); // "Côte d'Ivoire";
    Arrays.sort(inputStrings);

    try {
      final SegmentDictionaryCreator dictionaryCreator =
          new SegmentDictionaryCreator(false, inputStrings, fieldSpec, indexDir, paddingChar);
      try {
        dictionaryCreator.build(new boolean[]{false});

        for (final String inputString : inputStrings) {
          Assert.assertTrue(dictionaryCreator.indexOfSV(inputString) >= 0,
              "Value not found in dictionary " + inputString);
        }
      } finally {
        dictionaryCreator.close();
      }
    } finally {
      FileUtils.deleteQuietly(indexDir);
    }
  }

  /**
   * Tests SegmentDictionaryCreator for case when there is only one string
   * and it is empty
   *
   * This test asserts that the padded length reported for the empty string is 1 (and not 0),
   * that the padded form is therefore non-empty, and that indexOfSV for the un-padded value
   * resolves to its padded form.
   *
   * @throws Exception
   */
  @Test
  public void testSingleEmptyString()
      throws Exception {
    // Resolve against the platform temp directory instead of a hard-coded "/tmp".
    final File indexDir = new File(System.getProperty("java.io.tmpdir"), "dict.test");
    final FieldSpec fieldSpec = new DimensionFieldSpec("test", DataType.STRING, true);

    final String[] inputStrings = {""};
    final String[] paddedStrings = new String[inputStrings.length];
    final boolean[] isSorted = {true};

    try {
      final SegmentDictionaryCreator dictionaryCreator =
          new SegmentDictionaryCreator(false, inputStrings, fieldSpec, indexDir,
              V1Constants.Str.DEFAULT_STRING_PAD_CHAR);
      try {
        dictionaryCreator.build(isSorted);

        // Get the padded string as stored in the dictionary.
        final int targetPaddedLength = dictionaryCreator.getStringColumnMaxLength();
        Assert.assertEquals(targetPaddedLength, 1);
        for (int i = 0; i < inputStrings.length; i++) {
          paddedStrings[i] = SegmentDictionaryCreator.getPaddedString(inputStrings[i], targetPaddedLength,
              V1Constants.Str.DEFAULT_STRING_PAD_CHAR);
        }
        Arrays.sort(paddedStrings);

        // Assert that indexOfSV for un-padded string returns the index of the corresponding padded string.
        for (final String inputString : inputStrings) {
          final int paddedIndex = dictionaryCreator.indexOfSV(inputString);
          Assert.assertEquals(paddedStrings[paddedIndex],
              SegmentDictionaryCreator.getPaddedString(inputString, targetPaddedLength,
                  V1Constants.Str.DEFAULT_STRING_PAD_CHAR));
        }

        // Verify that empty string got padded. (The previous check compared the padded value with
        // a recomputation of itself and could never fail.)
        Assert.assertFalse(paddedStrings[0].isEmpty(), "Empty string was not padded");
      } finally {
        dictionaryCreator.close();
      }
    } finally {
      FileUtils.deleteQuietly(indexDir);
    }
  }

  /**
   * Tests SegmentDictionaryCreator for case when there is one empty string
   * and a string consisting of a single padding character.
   *
   * With '%' as the padding character, "" and "%" are indistinguishable once padded, so this test
   * expects {@code build} to fail with the "Number of entries in dictionary != number of unique
   * values" message. The test fails if {@code build} completes without throwing.
   *
   * @throws Exception
   */
  @Test
  public void testPaddedConflict()
      throws Exception {
    // Resolve against the platform temp directory instead of a hard-coded "/tmp".
    final File indexDir = new File(System.getProperty("java.io.tmpdir"), "dict.test");
    final FieldSpec fieldSpec = new DimensionFieldSpec("test", DataType.STRING, true);
    final char paddingChar = '%';

    final String[] inputStrings = {"", "%"};
    Arrays.sort(inputStrings); // Sorted order: {"", "%"}

    try {
      SegmentDictionaryCreator dictionaryCreator =
          new SegmentDictionaryCreator(false, inputStrings, fieldSpec, indexDir, paddingChar);
      final boolean[] isSorted = {true};
      dictionaryCreator.build(isSorted);
      // Assert.fail throws an AssertionError, which the catch (Exception) below does not intercept.
      Assert.fail("Expected build to reject values that collide after padding");
    } catch (Exception e) {
      Assert.assertEquals(e.getMessage(),
          "Number of entries in dictionary != number of unique values in the data in column test");
    } finally {
      FileUtils.deleteQuietly(indexDir);
    }
  }

  /**
   * Tests SegmentDictionaryCreator for case when there is one empty string
   * and a string with a single '%' character
   *
   * The padding character here is '\0' rather than '%', so the two values remain distinct after
   * padding. This test asserts that indexOfSV for each un-padded value returns the index of the
   * corresponding padded string.
   *
   * @throws Exception
   */
  @Test
  public void testPaddedNoConflict()
      throws Exception {
    // Resolve against the platform temp directory instead of a hard-coded "/tmp".
    final File indexDir = new File(System.getProperty("java.io.tmpdir"), "dict.test");
    final FieldSpec fieldSpec = new DimensionFieldSpec("test", DataType.STRING, true);
    final char paddingChar = '\0';

    final String[] inputStrings = {"", "%"};
    Arrays.sort(inputStrings); // Sorted order: {"", "%"}

    final boolean[] isSorted = {true};
    try {
      final SegmentDictionaryCreator dictionaryCreator =
          new SegmentDictionaryCreator(false, inputStrings, fieldSpec, indexDir, paddingChar);
      try {
        dictionaryCreator.build(isSorted);

        // Get the padded string as stored in the dictionary.
        final int targetPaddedLength = dictionaryCreator.getStringColumnMaxLength();
        final String[] paddedStrings = new String[inputStrings.length];
        for (int i = 0; i < inputStrings.length; i++) {
          paddedStrings[i] =
              SegmentDictionaryCreator.getPaddedString(inputStrings[i], targetPaddedLength, paddingChar);
        }
        Arrays.sort(paddedStrings); // Natural String order of the padded values.

        // Assert that indexOfSV for un-padded string returns the index of the corresponding padded string.
        for (final String inputString : inputStrings) {
          final int paddedIndex = dictionaryCreator.indexOfSV(inputString);
          Assert.assertEquals(paddedStrings[paddedIndex],
              SegmentDictionaryCreator.getPaddedString(inputString, targetPaddedLength, paddingChar));
        }
      } finally {
        dictionaryCreator.close();
      }
    } finally {
      FileUtils.deleteQuietly(indexDir);
    }
  }

  /**
   * Tests SegmentDictionaryCreator for case when there is only one string
   * and it is "null"
   *
   * This test asserts that the padded length of the null string is 4 and that the
   * value is stored unchanged.
   *
   * @throws Exception
   */
  @Test
  public void testSingleNullString()
      throws Exception {
    // Resolve against the platform temp directory instead of a hard-coded "/tmp".
    final File indexDir = new File(System.getProperty("java.io.tmpdir"), "dict.test");
    final FieldSpec fieldSpec = new DimensionFieldSpec("test", DataType.STRING, true);

    final String[] inputStrings = {"null"};
    final String[] paddedStrings = new String[inputStrings.length];
    final boolean[] isSorted = {true};

    try {
      final SegmentDictionaryCreator dictionaryCreator =
          new SegmentDictionaryCreator(false, inputStrings, fieldSpec, indexDir,
              V1Constants.Str.DEFAULT_STRING_PAD_CHAR);
      try {
        dictionaryCreator.build(isSorted);

        // Get the padded string as stored in the dictionary.
        final int targetPaddedLength = dictionaryCreator.getStringColumnMaxLength();
        Assert.assertEquals(targetPaddedLength, 4);
        for (int i = 0; i < inputStrings.length; i++) {
          paddedStrings[i] = SegmentDictionaryCreator.getPaddedString(inputStrings[i], targetPaddedLength,
              V1Constants.Str.DEFAULT_STRING_PAD_CHAR);
        }
        Arrays.sort(paddedStrings); // Sorted Order: {"null"}

        // Assert that indexOfSV for un-padded string returns the index of the corresponding padded string.
        for (final String inputString : inputStrings) {
          final int paddedIndex = dictionaryCreator.indexOfSV(inputString);
          Assert.assertEquals(paddedStrings[paddedIndex],
              SegmentDictionaryCreator.getPaddedString(inputString, targetPaddedLength,
                  V1Constants.Str.DEFAULT_STRING_PAD_CHAR));
        }

        // Verify that the string "null" did not get changed
        Assert.assertEquals(paddedStrings[0], "null");
      } finally {
        dictionaryCreator.close();
      }
    } finally {
      FileUtils.deleteQuietly(indexDir);
    }
  }

  /**
   * Creates the pre-index statistics collector matching a column's data type, backed by a
   * schema that contains only that column as a dimension.
   *
   * @param column Name of the column
   * @param dataType Data type of the column; STRING and BOOLEAN both use the string collector
   * @return Stats collector for the column
   * @throws IllegalArgumentException if no collector exists for the data type
   */
  private AbstractColumnStatisticsCollector buildStatsCollector(String column, DataType dataType) {
    final Schema singleColumnSchema = new Schema();
    singleColumnSchema.addField(new DimensionFieldSpec(column, dataType, true));
    final StatsCollectorConfig collectorConfig = new StatsCollectorConfig(singleColumnSchema, null);

    final AbstractColumnStatisticsCollector collector;
    switch (dataType) {
      case INT:
        collector = new IntColumnPreIndexStatsCollector(column, collectorConfig);
        break;
      case LONG:
        collector = new LongColumnPreIndexStatsCollector(column, collectorConfig);
        break;
      case FLOAT:
        collector = new FloatColumnPreIndexStatsCollector(column, collectorConfig);
        break;
      case DOUBLE:
        collector = new DoubleColumnPreIndexStatsCollector(column, collectorConfig);
        break;
      case STRING:
      case BOOLEAN:
        collector = new StringColumnPreIndexStatsCollector(column, collectorConfig);
        break;
      default:
        throw new IllegalArgumentException("Illegal data type for stats builder: " + dataType);
    }
    return collector;
  }

}