/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.ql.exec.vector; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Properties; import java.util.Random; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.SerDeException; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.serde2.ByteStream.Output; import org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe; import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableDeserializeRead; import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite; import org.apache.hadoop.hive.serde2.fast.DeserializeRead; import org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters; import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; import org.apache.hadoop.hive.serde2.lazy.VerifyLazy; import org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleDeserializeRead; import org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleSerializeWrite; import org.apache.hadoop.hive.serde2.lazy.fast.StringToDouble; import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead; import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinarySerializeWrite; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.UnionObject; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.fast.SerializeWrite; import junit.framework.TestCase; /** * Unit test for the vectorized serialize and deserialize row. */ public class TestVectorSerDeRow extends TestCase { public static enum SerializationType { NONE, BINARY_SORTABLE, LAZY_BINARY, LAZY_SIMPLE } private void verifyRead( DeserializeRead deserializeRead, TypeInfo typeInfo, Object expectedObject) throws IOException { if (typeInfo.getCategory() == ObjectInspector.Category.PRIMITIVE) { VectorVerifyFast.verifyDeserializeRead(deserializeRead, typeInfo, expectedObject); } else { Object complexFieldObj = VectorVerifyFast.deserializeReadComplexType(deserializeRead, typeInfo); if (expectedObject == null) { if (complexFieldObj != null) { TestCase.fail("Field reports not null but object is null (class " + complexFieldObj.getClass().getName() + ", " + complexFieldObj.toString() + ")"); } } else { if (complexFieldObj == null) { // It's hard to distinguish a union with null from a null union. if (expectedObject instanceof UnionObject) { UnionObject expectedUnion = (UnionObject) expectedObject; if (expectedUnion.getObject() == null) { return; } } TestCase.fail("Field reports null but object is not null (class " + expectedObject.getClass().getName() + ", " + expectedObject.toString() + ")"); } } if (!VerifyLazy.lazyCompare(typeInfo, complexFieldObj, expectedObject)) { TestCase.fail("Comparision failed typeInfo " + typeInfo.toString()); } } } void deserializeAndVerify( Output output, DeserializeRead deserializeRead, VectorRandomRowSource source, Object[] expectedRow) throws HiveException, IOException { deserializeRead.set(output.getData(), 0, output.getLength()); TypeInfo[] typeInfos = source.typeInfos(); for (int i = 0; i < typeInfos.length; i++) { Object expected = expectedRow[i]; TypeInfo typeInfo = typeInfos[i]; verifyRead(deserializeRead, typeInfo, expected); } TestCase.assertTrue(deserializeRead.isEndOfInputReached()); } void serializeBatch( VectorizedRowBatch batch, VectorSerializeRow vectorSerializeRow, DeserializeRead deserializeRead, VectorRandomRowSource source, Object[][] randomRows, int firstRandomRowIndex) throws HiveException, IOException { Output output = new Output(); for (int i = 0; i < batch.size; i++) { output.reset(); vectorSerializeRow.setOutput(output); vectorSerializeRow.serializeWrite(batch, i); Object[] expectedRow = randomRows[firstRandomRowIndex + i]; byte[] bytes = output.getData(); int length = output.getLength(); char[] chars = new char[length]; for (int c = 0; c < chars.length; c++) { chars[c] = (char) (bytes[c] & 0xFF); } deserializeAndVerify(output, deserializeRead, source, expectedRow); } } void testVectorSerializeRow(Random r, SerializationType serializationType) throws HiveException, IOException, SerDeException { for (int i = 0; i < 20; i++) { innerTestVectorSerializeRow(r, serializationType); } } void innerTestVectorSerializeRow( Random r, SerializationType serializationType) throws HiveException, IOException, SerDeException { String[] emptyScratchTypeNames = new String[0]; VectorRandomRowSource source = new VectorRandomRowSource(); source.init(r, VectorRandomRowSource.SupportedTypes.ALL, 4, false); VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(); batchContext.init(source.rowStructObjectInspector(), emptyScratchTypeNames); VectorizedRowBatch batch = batchContext.createVectorizedRowBatch(); VectorAssignRow vectorAssignRow = new VectorAssignRow(); vectorAssignRow.init(source.typeNames()); int fieldCount = source.typeNames().size(); DeserializeRead deserializeRead; SerializeWrite serializeWrite; switch (serializationType) { case BINARY_SORTABLE: deserializeRead = new BinarySortableDeserializeRead(source.typeInfos(), /* useExternalBuffer */ false); serializeWrite = new BinarySortableSerializeWrite(fieldCount); break; case LAZY_BINARY: deserializeRead = new LazyBinaryDeserializeRead(source.typeInfos(), /* useExternalBuffer */ false); serializeWrite = new LazyBinarySerializeWrite(fieldCount); break; case LAZY_SIMPLE: { StructObjectInspector rowObjectInspector = source.rowStructObjectInspector(); // Use different separator values. byte[] separators = new byte[] {(byte) 9, (byte) 2, (byte) 3, (byte) 4, (byte) 5, (byte) 6, (byte) 7, (byte) 8}; LazySerDeParameters lazySerDeParams = getSerDeParams(rowObjectInspector, separators); deserializeRead = new LazySimpleDeserializeRead( source.typeInfos(), /* useExternalBuffer */ false, lazySerDeParams); serializeWrite = new LazySimpleSerializeWrite(fieldCount, lazySerDeParams); } break; default: throw new Error("Unknown serialization type " + serializationType); } VectorSerializeRow vectorSerializeRow = new VectorSerializeRow(serializeWrite); vectorSerializeRow.init(source.typeNames()); Object[][] randomRows = source.randomRows(2000); int firstRandomRowIndex = 0; for (int i = 0; i < randomRows.length; i++) { Object[] row = randomRows[i]; vectorAssignRow.assignRow(batch, batch.size, row); batch.size++; if (batch.size == batch.DEFAULT_SIZE) { serializeBatch(batch, vectorSerializeRow, deserializeRead, source, randomRows, firstRandomRowIndex); firstRandomRowIndex = i + 1; batch.reset(); } } if (batch.size > 0) { serializeBatch(batch, vectorSerializeRow, deserializeRead, source, randomRows, firstRandomRowIndex); } } void examineBatch(VectorizedRowBatch batch, VectorExtractRow vectorExtractRow, TypeInfo[] typeInfos, Object[][] randomRows, int firstRandomRowIndex ) { int rowSize = vectorExtractRow.getCount(); Object[] row = new Object[rowSize]; for (int i = 0; i < batch.size; i++) { vectorExtractRow.extractRow(batch, i, row); Object[] expectedRow = randomRows[firstRandomRowIndex + i]; for (int c = 0; c < rowSize; c++) { Object rowObj = row[c]; Object expectedObj = expectedRow[c]; if (rowObj == null) { if (expectedObj == null) { continue; } fail("Unexpected NULL from extractRow. Expected class " + typeInfos[c].getCategory() + " value " + expectedObj + " batch index " + i + " firstRandomRowIndex " + firstRandomRowIndex); } if (!rowObj.equals(expectedObj)) { fail("Row " + (firstRandomRowIndex + i) + " and column " + c + " mismatch (" + typeInfos[c].getCategory() + " actual value " + rowObj + " and expected value " + expectedObj + ")"); } } } } private Output serializeRow(Object[] row, VectorRandomRowSource source, SerializeWrite serializeWrite) throws HiveException, IOException { Output output = new Output(); serializeWrite.set(output); TypeInfo[] typeInfos = source.typeInfos(); for (int i = 0; i < typeInfos.length; i++) { VectorVerifyFast.serializeWrite(serializeWrite, typeInfos[i], row[i]); } return output; } private void addToProperties(Properties tbl, String fieldNames, String fieldTypes) { // Set the configuration parameters tbl.setProperty(serdeConstants.SERIALIZATION_FORMAT, "9"); tbl.setProperty("columns", fieldNames); tbl.setProperty("columns.types", fieldTypes); tbl.setProperty(serdeConstants.SERIALIZATION_NULL_FORMAT, "\\N"); } private LazySerDeParameters getSerDeParams( StructObjectInspector rowObjectInspector, byte[] separators) throws SerDeException { return getSerDeParams(new Configuration(), new Properties(), rowObjectInspector, separators); } private LazySerDeParameters getSerDeParams( Configuration conf, Properties tbl, StructObjectInspector rowObjectInspector, byte[] separators) throws SerDeException { String fieldNames = ObjectInspectorUtils.getFieldNames(rowObjectInspector); String fieldTypes = ObjectInspectorUtils.getFieldTypes(rowObjectInspector); addToProperties(tbl, fieldNames, fieldTypes); LazySerDeParameters lazySerDeParams = new LazySerDeParameters(conf, tbl, LazySimpleSerDe.class.getName()); for (int i = 0; i < separators.length; i++) { lazySerDeParams.setSeparator(i, separators[i]); } return lazySerDeParams; } void testVectorDeserializeRow( Random r, SerializationType serializationType, boolean alternate1, boolean alternate2, boolean useExternalBuffer) throws HiveException, IOException, SerDeException { for (int i = 0; i < 20; i++) { innerTestVectorDeserializeRow(r, serializationType, alternate1, alternate2, useExternalBuffer); } } void innerTestVectorDeserializeRow( Random r, SerializationType serializationType, boolean alternate1, boolean alternate2, boolean useExternalBuffer) throws HiveException, IOException, SerDeException { String[] emptyScratchTypeNames = new String[0]; VectorRandomRowSource source = new VectorRandomRowSource(); source.init(r, VectorRandomRowSource.SupportedTypes.ALL, 4, false); VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx(); batchContext.init(source.rowStructObjectInspector(), emptyScratchTypeNames); VectorizedRowBatch batch = batchContext.createVectorizedRowBatch(); // junk the destination for the 1st pass for (ColumnVector cv : batch.cols) { Arrays.fill(cv.isNull, true); } TypeInfo[] typeInfos = source.typeInfos(); int fieldCount = source.typeNames().size(); DeserializeRead deserializeRead; SerializeWrite serializeWrite; switch (serializationType) { case BINARY_SORTABLE: boolean useColumnSortOrderIsDesc = alternate1; if (!useColumnSortOrderIsDesc) { deserializeRead = new BinarySortableDeserializeRead(source.typeInfos(), useExternalBuffer); serializeWrite = new BinarySortableSerializeWrite(fieldCount); } else { boolean[] columnSortOrderIsDesc = new boolean[fieldCount]; for (int i = 0; i < fieldCount; i++) { columnSortOrderIsDesc[i] = r.nextBoolean(); } byte[] columnNullMarker = new byte[fieldCount]; byte[] columnNotNullMarker = new byte[fieldCount]; for (int i = 0; i < fieldCount; i++) { if (columnSortOrderIsDesc[i]) { // Descending // Null last (default for descending order) columnNullMarker[i] = BinarySortableSerDe.ZERO; columnNotNullMarker[i] = BinarySortableSerDe.ONE; } else { // Ascending // Null first (default for ascending order) columnNullMarker[i] = BinarySortableSerDe.ZERO; columnNotNullMarker[i] = BinarySortableSerDe.ONE; } } serializeWrite = new BinarySortableSerializeWrite(columnSortOrderIsDesc, columnNullMarker, columnNotNullMarker); deserializeRead = new BinarySortableDeserializeRead(source.typeInfos(), useExternalBuffer, columnSortOrderIsDesc, columnNullMarker, columnNotNullMarker); } boolean useBinarySortableCharsNeedingEscape = alternate2; if (useBinarySortableCharsNeedingEscape) { source.addBinarySortableAlphabets(); } break; case LAZY_BINARY: deserializeRead = new LazyBinaryDeserializeRead(source.typeInfos(), useExternalBuffer); serializeWrite = new LazyBinarySerializeWrite(fieldCount); break; case LAZY_SIMPLE: { StructObjectInspector rowObjectInspector = source.rowStructObjectInspector(); Configuration conf = new Configuration(); Properties tbl = new Properties(); tbl.setProperty(serdeConstants.FIELD_DELIM, "\t"); tbl.setProperty(serdeConstants.LINE_DELIM, "\n"); byte separator = (byte) '\t'; boolean useLazySimpleEscapes = alternate1; if (useLazySimpleEscapes) { tbl.setProperty(serdeConstants.QUOTE_CHAR, "'"); String escapeString = "\\"; tbl.setProperty(serdeConstants.ESCAPE_CHAR, escapeString); } LazySerDeParameters lazySerDeParams = getSerDeParams(conf, tbl, rowObjectInspector, new byte[] { separator }); if (useLazySimpleEscapes) { // LazySimple seems to throw away everything but \n and \r. boolean[] needsEscape = lazySerDeParams.getNeedsEscape(); StringBuilder sb = new StringBuilder(); if (needsEscape['\n']) { sb.append('\n'); } if (needsEscape['\r']) { sb.append('\r'); } // for (int i = 0; i < needsEscape.length; i++) { // if (needsEscape[i]) { // sb.append((char) i); // } // } String needsEscapeStr = sb.toString(); if (needsEscapeStr.length() > 0) { source.addEscapables(needsEscapeStr); } } deserializeRead = new LazySimpleDeserializeRead(source.typeInfos(), useExternalBuffer, lazySerDeParams); serializeWrite = new LazySimpleSerializeWrite(fieldCount, lazySerDeParams); } break; default: throw new Error("Unknown serialization type " + serializationType); } VectorDeserializeRow vectorDeserializeRow = new VectorDeserializeRow(deserializeRead); vectorDeserializeRow.init(); // junk the destination for the 1st pass for (ColumnVector cv : batch.cols) { Arrays.fill(cv.isNull, true); cv.noNulls = false; } VectorExtractRow vectorExtractRow = new VectorExtractRow(); vectorExtractRow.init(source.typeNames()); Object[][] randomRows = source.randomRows(2000); int firstRandomRowIndex = 0; for (int i = 0; i < randomRows.length; i++) { Object[] row = randomRows[i]; Output output = serializeRow(row, source, serializeWrite); vectorDeserializeRow.setBytes(output.getData(), 0, output.getLength()); try { vectorDeserializeRow.deserialize(batch, batch.size); } catch (Exception e) { throw new HiveException( "\nDeserializeRead details: " + vectorDeserializeRow.getDetailedReadPositionString(), e); } batch.size++; if (batch.size == batch.DEFAULT_SIZE) { examineBatch(batch, vectorExtractRow, typeInfos, randomRows, firstRandomRowIndex); firstRandomRowIndex = i + 1; batch.reset(); } } if (batch.size > 0) { examineBatch(batch, vectorExtractRow, typeInfos, randomRows, firstRandomRowIndex); } } public void testVectorBinarySortableSerializeRow() throws Throwable { Random r = new Random(8732); testVectorSerializeRow(r, SerializationType.BINARY_SORTABLE); } public void testVectorLazyBinarySerializeRow() throws Throwable { Random r = new Random(8732); testVectorSerializeRow(r, SerializationType.LAZY_BINARY); } public void testVectorLazySimpleSerializeRow() throws Throwable { Random r = new Random(8732); testVectorSerializeRow(r, SerializationType.LAZY_SIMPLE); } public void testVectorBinarySortableDeserializeRow() throws Throwable { Random r = new Random(8732); testVectorDeserializeRow(r, SerializationType.BINARY_SORTABLE, /* alternate1 = useColumnSortOrderIsDesc */ false, /* alternate2 = useBinarySortableCharsNeedingEscape */ false, /* useExternalBuffer */ false); testVectorDeserializeRow(r, SerializationType.BINARY_SORTABLE, /* alternate1 = useColumnSortOrderIsDesc */ true, /* alternate2 = useBinarySortableCharsNeedingEscape */ false, /* useExternalBuffer */ false); testVectorDeserializeRow(r, SerializationType.BINARY_SORTABLE, /* alternate1 = useColumnSortOrderIsDesc */ false, /* alternate2 = useBinarySortableCharsNeedingEscape */ false, /* useExternalBuffer */ true); testVectorDeserializeRow(r, SerializationType.BINARY_SORTABLE, /* alternate1 = useColumnSortOrderIsDesc */ true, /* alternate2 = useBinarySortableCharsNeedingEscape */ false, /* useExternalBuffer */ true); testVectorDeserializeRow(r, SerializationType.BINARY_SORTABLE, /* alternate1 = useColumnSortOrderIsDesc */ false, /* alternate2 = useBinarySortableCharsNeedingEscape */ true, /* useExternalBuffer */ false); testVectorDeserializeRow(r, SerializationType.BINARY_SORTABLE, /* alternate1 = useColumnSortOrderIsDesc */ true, /* alternate2 = useBinarySortableCharsNeedingEscape */ true, /* useExternalBuffer */ false); testVectorDeserializeRow(r, SerializationType.BINARY_SORTABLE, /* alternate1 = useColumnSortOrderIsDesc */ false, /* alternate2 = useBinarySortableCharsNeedingEscape */ true, /* useExternalBuffer */ true); testVectorDeserializeRow(r, SerializationType.BINARY_SORTABLE, /* alternate1 = useColumnSortOrderIsDesc */ true, /* alternate2 = useBinarySortableCharsNeedingEscape */ true, /* useExternalBuffer */ true); } public void testVectorLazyBinaryDeserializeRow() throws Throwable { Random r = new Random(8732); testVectorDeserializeRow(r, SerializationType.LAZY_BINARY, /* alternate1 = unused */ false, /* alternate2 = unused */ false, /* useExternalBuffer */ false); testVectorDeserializeRow(r, SerializationType.LAZY_BINARY, /* alternate1 = unused */ false, /* alternate2 = unused */ false, /* useExternalBuffer */ true); } public void testVectorLazySimpleDeserializeRow() throws Throwable { Random r = new Random(8732); testVectorDeserializeRow(r, SerializationType.LAZY_SIMPLE, /* alternate1 = useLazySimpleEscapes */ false, /* alternate2 = unused */ false, /* useExternalBuffer */ false); testVectorDeserializeRow(r, SerializationType.LAZY_SIMPLE, /* alternate1 = useLazySimpleEscapes */ false, /* alternate2 = unused */ false, /* useExternalBuffer */ true); testVectorDeserializeRow(r, SerializationType.LAZY_SIMPLE, /* alternate1 = useLazySimpleEscapes */ true, /* alternate2 = unused */ false, /* useExternalBuffer */ false); testVectorDeserializeRow(r, SerializationType.LAZY_SIMPLE, /* alternate1 = useLazySimpleEscapes */ true, /* alternate2 = unused */ false, /* useExternalBuffer */ true); } }