/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.io.parquet;

import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport;
import org.apache.hadoop.hive.ql.io.parquet.serde.ArrayWritableObjectInspector;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetTableUtils;
import org.apache.hadoop.hive.ql.io.parquet.vector.VectorizedParquetRecordReader;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.apache.parquet.hadoop.ParquetInputSplit;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.hadoop.example.GroupWriteSupport;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.MessageType;

import java.io.IOException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.List;
import java.util.TimeZone;

import static junit.framework.Assert.assertTrue;
import static junit.framework.TestCase.assertFalse;
import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_1_0;
import static org.apache.parquet.hadoop.api.ReadSupport.PARQUET_READ_SCHEMA;
import static org.apache.parquet.hadoop.metadata.CompressionCodecName.GZIP;
import static org.apache.parquet.schema.MessageTypeParser.parseMessageType;
import static org.junit.Assert.assertEquals;

public class VectorizedColumnReaderTestBase {
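
  // Shape of the generated test data: nElements rows are written per file; when
  // dictionary encoding is exercised the value generators cycle through UNIQUE_NUM
  // distinct values, and isNull() marks every NULL_FREQUENCY-th row as null for
  // some_null_field and binary_field_some_null.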
  protected final static int nElements = 2500;
  protected final static int UNIQUE_NUM = 10;
  protected final static int NULL_FREQUENCY = 13;

  protected final static Configuration conf = new Configuration();
  protected final static Path file =
      new Path("target/test/TestParquetVectorReader/testParquetFile");

  protected static final MessageType schema = parseMessageType(
      "message hive_schema { "
          + "required int32 int32_field; "
          + "required int64 int64_field; "
          + "required int96 int96_field; "
          + "required double double_field; "
          + "required float float_field; "
          + "required boolean boolean_field; "
          + "required fixed_len_byte_array(3) flba_field; "
          + "optional fixed_len_byte_array(1) some_null_field; "
          + "optional fixed_len_byte_array(1) all_null_field; "
          + "required binary binary_field; "
          + "optional binary binary_field_some_null; "
          + "required binary value (DECIMAL(5,2)); "
          + "required group struct_field {"
          + "  required int32 a;\n"
          + "  required double b;\n"
          + "}\n"
          + "optional group nested_struct_field {"
          + "  optional group nsf {"
          + "    optional int32 c;\n"
          + "    optional int32 d;\n"
          + "  }\n"
          + "  optional double e;\n"
          + "}\n"
          + "optional group struct_field_some_null {"
          + "  optional int32 f;\n"
          + "  optional double g;\n"
          + "}\n"
          + "optional group map_field (MAP) {\n"
          + "  repeated group map (MAP_KEY_VALUE) {\n"
          + "    required binary key;\n"
          + "    optional binary value;\n"
          + "  }\n"
          + "}\n"
          + "optional group array_list (LIST) {\n"
          + "  repeated group bag {\n"
          + "    optional int32 array_element;\n"
          + "  }\n"
          + "}\n"
          + "} ");

  protected static void removeFile() throws IOException {
    FileSystem fs = file.getFileSystem(conf);
    if (fs.exists(file)) {
      fs.delete(file, true);
    }
  }

  protected static ParquetWriter<Group> initWriterFromFile() throws IOException {
    GroupWriteSupport.setSchema(schema, conf);
    return new ParquetWriter<>(
        file,
        new GroupWriteSupport(),
        GZIP, 1024 * 1024, 1024, 1024 * 1024,
        true, false, PARQUET_1_0, conf);
  }

  protected static int getIntValue(
      boolean isDictionaryEncoding,
      int index) {
    return isDictionaryEncoding ? index % UNIQUE_NUM : index;
  }

  protected static double getDoubleValue(
      boolean isDictionaryEncoding,
      int index) {
    return isDictionaryEncoding ? index % UNIQUE_NUM : index;
  }

  protected static long getLongValue(
      boolean isDictionaryEncoding,
      int index) {
    return isDictionaryEncoding ? (long) 2 * index % UNIQUE_NUM : (long) 2 * index;
  }

  protected static float getFloatValue(
      boolean isDictionaryEncoding,
      int index) {
    return (float) (isDictionaryEncoding ? index % UNIQUE_NUM * 2.0 : index * 2.0);
  }

  protected static boolean getBooleanValue(
      float index) {
    return (index % 2 == 0);
  }

  protected static String getTimestampStr(int index) {
    String s = String.valueOf(index);
    int l = 4 - s.length();
    for (int i = 0; i < l; i++) {
      s = "0" + s;
    }
    return "99999999" + s;
  }

  protected static HiveDecimal getDecimal(
      boolean isDictionaryEncoding,
      int index) {
    int decimalVal = index % 100;
    String decimalStr = (decimalVal < 10) ? "0" + String.valueOf(decimalVal) : String.valueOf(decimalVal);
    int intVal = (isDictionaryEncoding) ? index % UNIQUE_NUM : index / 100;
    String d = String.valueOf(intVal) + decimalStr;
    BigInteger bi = new BigInteger(d);
    BigDecimal bd = new BigDecimal(bi);
    return HiveDecimal.create(bd);
  }

  protected static Binary getTimestamp(
      boolean isDictionaryEncoding,
      int index) {
    String s = isDictionaryEncoding ? getTimestampStr(index % UNIQUE_NUM) : getTimestampStr(index);
    return Binary.fromReusedByteArray(s.getBytes());
  }

  protected static String getStr(
      boolean isDictionaryEncoding,
      int index) {
    int binaryLen = isDictionaryEncoding ? index % UNIQUE_NUM : index;
    String v = "";
    while (binaryLen > 0) {
      char t = (char) ('a' + binaryLen % 26);
      binaryLen /= 26;
      v = t + v;
    }
    return v;
  }

  protected static Binary getBinaryValue(
      boolean isDictionaryEncoding,
      int index) {
    return Binary.fromString(getStr(isDictionaryEncoding, index));
  }

  protected static boolean isNull(int index) {
    return (index % NULL_FREQUENCY == 0);
  }
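
  // Builds a VectorizedParquetRecordReader over the single test file: the given
  // schema string is set as the Parquet read schema, vectorized execution is
  // switched on, and a VectorizedRowBatchCtx is registered for the projected columns.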
  protected VectorizedParquetRecordReader createParquetReader(String schemaString, Configuration conf)
      throws IOException, InterruptedException, HiveException {
    conf.set(PARQUET_READ_SCHEMA, schemaString);
    HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true);
    HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, "//tmp");

    Job vectorJob = new Job(conf, "read vector");
    ParquetInputFormat.setInputPaths(vectorJob, file);
    ParquetInputFormat parquetInputFormat = new ParquetInputFormat(GroupReadSupport.class);
    ParquetInputSplit split = (ParquetInputSplit) parquetInputFormat.getSplits(vectorJob).get(0);
    initialVectorizedRowBatchCtx(conf);
    return new VectorizedParquetRecordReader(split, new JobConf(conf));
  }
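
  // Writes nElements rows covering every column of the write schema. Optional
  // columns are skipped on selected rows (isNull(), i % 2, i % 3, i % 13) so the
  // read checks can exercise null handling; list lengths vary with i % 4.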
  protected static void writeData(ParquetWriter<Group> writer, boolean isDictionaryEncoding) throws IOException {
    SimpleGroupFactory f = new SimpleGroupFactory(schema);
    for (int i = 0; i < nElements; i++) {
      boolean isNull = isNull(i);
      int intVal = getIntValue(isDictionaryEncoding, i);
      long longVal = getLongValue(isDictionaryEncoding, i);
      Binary timeStamp = getTimestamp(isDictionaryEncoding, i);
      HiveDecimal decimalVal = getDecimal(isDictionaryEncoding, i).setScale(2);
      double doubleVal = getDoubleValue(isDictionaryEncoding, i);
      float floatVal = getFloatValue(isDictionaryEncoding, i);
      boolean booleanVal = getBooleanValue(i);
      Binary binary = getBinaryValue(isDictionaryEncoding, i);
      Group group = f.newGroup()
          .append("int32_field", intVal)
          .append("int64_field", longVal)
          .append("int96_field", timeStamp)
          .append("double_field", doubleVal)
          .append("float_field", floatVal)
          .append("boolean_field", booleanVal)
          .append("flba_field", "abc");
      if (!isNull) {
        group.append("some_null_field", "x");
      }
      group.append("binary_field", binary);
      if (!isNull) {
        group.append("binary_field_some_null", binary);
      }
      HiveDecimalWritable w = new HiveDecimalWritable(decimalVal);
      group.append("value", Binary.fromConstantByteArray(w.getInternalStorage()));
      group.addGroup("struct_field")
          .append("a", intVal)
          .append("b", doubleVal);

      Group g = group.addGroup("nested_struct_field");
      g.addGroup("nsf").append("c", intVal).append("d", intVal);
      g.append("e", doubleVal);

      Group some_null_g = group.addGroup("struct_field_some_null");
      if (i % 2 != 0) {
        some_null_g.append("f", intVal);
      }
      if (i % 3 != 0) {
        some_null_g.append("g", doubleVal);
      }

      Group mapGroup = group.addGroup("map_field");
      if (i % 13 != 1) {
        mapGroup.addGroup("map").append("key", binary).append("value", "abc");
      } else {
        mapGroup.addGroup("map").append("key", binary);
      }

      Group arrayGroup = group.addGroup("array_list");
      for (int j = 0; j < i % 4; j++) {
        arrayGroup.addGroup("bag").append("array_element", intVal);
      }
      writer.write(group);
    }
    writer.close();
  }

  protected void initialVectorizedRowBatchCtx(Configuration conf) throws HiveException {
    MapWork mapWork = new MapWork();
    VectorizedRowBatchCtx rbCtx = new VectorizedRowBatchCtx();
    rbCtx.init(createStructObjectInspector(conf), new String[0]);
    mapWork.setVectorMode(true);
    mapWork.setVectorizedRowBatchCtx(rbCtx);
    Utilities.setMapWork(conf, mapWork);
  }

  private StructObjectInspector createStructObjectInspector(Configuration conf) {
    // Create row related objects
    String columnNames = conf.get(IOConstants.COLUMNS);
    List<String> columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
    String columnTypes = conf.get(IOConstants.COLUMNS_TYPES);
    List<TypeInfo> columnTypesList = DataWritableReadSupport.getColumnTypes(columnTypes);
    TypeInfo rowTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNamesList, columnTypesList);
    return new ArrayWritableObjectInspector((StructTypeInfo) rowTypeInfo);
  }
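
  // Each of the *Read methods below projects a single column, scans the whole
  // file with the vectorized reader, and asserts every decoded value (and its
  // null flag) against the generator that produced it in writeData().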
  protected void intRead(boolean isDictionaryEncoding) throws InterruptedException, HiveException, IOException {
    Configuration conf = new Configuration();
    conf.set(IOConstants.COLUMNS, "int32_field");
    conf.set(IOConstants.COLUMNS_TYPES, "int");
    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    conf.set(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY, TimeZone.getDefault().getID());

    VectorizedParquetRecordReader reader =
        createParquetReader("message test { required int32 int32_field;}", conf);
    VectorizedRowBatch previous = reader.createValue();
    try {
      int c = 0;
      while (reader.next(NullWritable.get(), previous)) {
        LongColumnVector vector = (LongColumnVector) previous.cols[0];
        assertTrue(vector.noNulls);
        for (int i = 0; i < vector.vector.length; i++) {
          if (c == nElements) {
            break;
          }
          assertEquals("Failed at " + c, getIntValue(isDictionaryEncoding, c), vector.vector[i]);
          assertFalse(vector.isNull[i]);
          c++;
        }
      }
      assertEquals(nElements, c);
    } finally {
      reader.close();
    }
  }

  protected void longRead(boolean isDictionaryEncoding) throws Exception {
    Configuration conf = new Configuration();
    conf.set(IOConstants.COLUMNS, "int64_field");
    conf.set(IOConstants.COLUMNS_TYPES, "bigint");
    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    conf.set(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY, TimeZone.getDefault().getID());

    VectorizedParquetRecordReader reader =
        createParquetReader("message test { required int64 int64_field;}", conf);
    VectorizedRowBatch previous = reader.createValue();
    try {
      int c = 0;
      while (reader.next(NullWritable.get(), previous)) {
        LongColumnVector vector = (LongColumnVector) previous.cols[0];
        assertTrue(vector.noNulls);
        for (int i = 0; i < vector.vector.length; i++) {
          if (c == nElements) {
            break;
          }
          assertEquals("Failed at " + c, getLongValue(isDictionaryEncoding, c), vector.vector[i]);
          assertFalse(vector.isNull[i]);
          c++;
        }
      }
      assertEquals(nElements, c);
    } finally {
      reader.close();
    }
  }

  protected void doubleRead(boolean isDictionaryEncoding) throws Exception {
    Configuration conf = new Configuration();
    conf.set(IOConstants.COLUMNS, "double_field");
    conf.set(IOConstants.COLUMNS_TYPES, "double");
    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    conf.set(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY, TimeZone.getDefault().getID());

    VectorizedParquetRecordReader reader =
        createParquetReader("message test { required double double_field;}", conf);
    VectorizedRowBatch previous = reader.createValue();
    try {
      int c = 0;
      while (reader.next(NullWritable.get(), previous)) {
        DoubleColumnVector vector = (DoubleColumnVector) previous.cols[0];
        assertTrue(vector.noNulls);
        for (int i = 0; i < vector.vector.length; i++) {
          if (c == nElements) {
            break;
          }
          assertEquals("Failed at " + c, getDoubleValue(isDictionaryEncoding, c), vector.vector[i], 0);
          assertFalse(vector.isNull[i]);
          c++;
        }
      }
      assertEquals(nElements, c);
    } finally {
      reader.close();
    }
  }

  protected void floatRead(boolean isDictionaryEncoding) throws Exception {
    Configuration conf = new Configuration();
    conf.set(IOConstants.COLUMNS, "float_field");
    conf.set(IOConstants.COLUMNS_TYPES, "float");
    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    conf.set(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY, TimeZone.getDefault().getID());

    VectorizedParquetRecordReader reader =
        createParquetReader("message test { required float float_field;}", conf);
    VectorizedRowBatch previous = reader.createValue();
    try {
      int c = 0;
      while (reader.next(NullWritable.get(), previous)) {
        DoubleColumnVector vector = (DoubleColumnVector) previous.cols[0];
        assertTrue(vector.noNulls);
        for (int i = 0; i < vector.vector.length; i++) {
          if (c == nElements) {
            break;
          }
          assertEquals("Failed at " + c, getFloatValue(isDictionaryEncoding, c), vector.vector[i], 0);
          assertFalse(vector.isNull[i]);
          c++;
        }
      }
      assertEquals(nElements, c);
    } finally {
      reader.close();
    }
  }

  protected void booleanRead() throws Exception {
    Configuration conf = new Configuration();
    conf.set(IOConstants.COLUMNS, "boolean_field");
    conf.set(IOConstants.COLUMNS_TYPES, "boolean");
    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    conf.set(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY, TimeZone.getDefault().getID());

    VectorizedParquetRecordReader reader =
        createParquetReader("message test { required boolean boolean_field;}", conf);
    VectorizedRowBatch previous = reader.createValue();
    try {
      int c = 0;
      while (reader.next(NullWritable.get(), previous)) {
        LongColumnVector vector = (LongColumnVector) previous.cols[0];
        assertTrue(vector.noNulls);
        for (int i = 0; i < vector.vector.length; i++) {
          if (c == nElements) {
            break;
          }
          assertEquals("Failed at " + c, (getBooleanValue(c) ? 1 : 0), vector.vector[i]);
          assertFalse(vector.isNull[i]);
          c++;
        }
      }
      assertEquals(nElements, c);
    } finally {
      reader.close();
    }
  }
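
  // binary_field_some_null mixes nulls with real values, so in addition to the
  // decoded strings this check verifies the per-row isNull flags and the
  // batch-level noNulls bookkeeping.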
  protected void binaryRead(boolean isDictionaryEncoding) throws Exception {
    Configuration conf = new Configuration();
    conf.set(IOConstants.COLUMNS, "binary_field_some_null");
    conf.set(IOConstants.COLUMNS_TYPES, "string");
    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    conf.set(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY, TimeZone.getDefault().getID());

    VectorizedParquetRecordReader reader =
        createParquetReader("message test { required binary binary_field_some_null;}", conf);
    VectorizedRowBatch previous = reader.createValue();
    int c = 0;
    try {
      while (reader.next(NullWritable.get(), previous)) {
        BytesColumnVector vector = (BytesColumnVector) previous.cols[0];
        boolean noNull = true;
        for (int i = 0; i < vector.vector.length; i++) {
          if (c == nElements) {
            break;
          }
          String actual;
          assertEquals("Null assert failed at " + c, isNull(c), vector.isNull[i]);
          if (!vector.isNull[i]) {
            actual = new String(ArrayUtils
                .subarray(vector.vector[i], vector.start[i], vector.start[i] + vector.length[i]));
            assertEquals("failed at " + c, getStr(isDictionaryEncoding, c), actual);
          } else {
            noNull = false;
          }
          c++;
        }
        assertEquals("No Null check failed at " + c, noNull, vector.noNulls);
        assertFalse(vector.isRepeating);
      }
      assertEquals("It doesn't exit at expected position", nElements, c);
    } finally {
      reader.close();
    }
  }

  protected void structRead(boolean isDictionaryEncoding) throws Exception {
    Configuration conf = new Configuration();
    conf.set(IOConstants.COLUMNS, "struct_field");
    conf.set(IOConstants.COLUMNS_TYPES, "struct<a:int,b:double>");
    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    conf.set(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY, TimeZone.getDefault().getID());

    String schema = "message hive_schema {\n"
        + "group struct_field {\n"
        + "  optional int32 a;\n"
        + "  optional double b;\n"
        + "}\n"
        + "}\n";
    VectorizedParquetRecordReader reader = createParquetReader(schema, conf);
    VectorizedRowBatch previous = reader.createValue();
    int c = 0;
    try {
      while (reader.next(NullWritable.get(), previous)) {
        StructColumnVector vector = (StructColumnVector) previous.cols[0];
        LongColumnVector cv = (LongColumnVector) vector.fields[0];
        DoubleColumnVector dv = (DoubleColumnVector) vector.fields[1];
        for (int i = 0; i < cv.vector.length; i++) {
          if (c == nElements) {
            break;
          }
          assertEquals(getIntValue(isDictionaryEncoding, c), cv.vector[i]);
          assertEquals(getDoubleValue(isDictionaryEncoding, c), dv.vector[i], 0);
          assertFalse(vector.isNull[i]);
          assertFalse(vector.isRepeating);
          c++;
        }
      }
      assertEquals("It doesn't exit at expected position", nElements, c);
    } finally {
      reader.close();
    }
  }
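
  // Nested-struct projections: nestedStructRead0 reads the whole
  // nested_struct_field (nsf.c, nsf.d and e), while nestedStructRead1 projects
  // only the inner nsf.c leaf.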
  protected void nestedStructRead0(boolean isDictionaryEncoding) throws Exception {
    Configuration conf = new Configuration();
    conf.set(IOConstants.COLUMNS, "nested_struct_field");
    conf.set(IOConstants.COLUMNS_TYPES, "struct<nsf:struct<c:int,d:int>,e:double>");
    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    conf.set(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY, TimeZone.getDefault().getID());

    String schema = "message hive_schema {\n"
        + "group nested_struct_field {\n"
        + "  optional group nsf {\n"
        + "    optional int32 c;\n"
        + "    optional int32 d;\n"
        + "  }"
        + "optional double e;\n"
        + "}\n";
    VectorizedParquetRecordReader reader = createParquetReader(schema, conf);
    VectorizedRowBatch previous = reader.createValue();
    int c = 0;
    try {
      while (reader.next(NullWritable.get(), previous)) {
        StructColumnVector vector = (StructColumnVector) previous.cols[0];
        StructColumnVector sv = (StructColumnVector) vector.fields[0];
        LongColumnVector cv = (LongColumnVector) sv.fields[0];
        LongColumnVector dv = (LongColumnVector) sv.fields[1];
        DoubleColumnVector ev = (DoubleColumnVector) vector.fields[1];
        for (int i = 0; i < cv.vector.length; i++) {
          if (c == nElements) {
            break;
          }
          assertEquals(getIntValue(isDictionaryEncoding, c), cv.vector[i]);
          assertEquals(getIntValue(isDictionaryEncoding, c), dv.vector[i]);
          assertEquals(getDoubleValue(isDictionaryEncoding, c), ev.vector[i], 0);
          assertFalse(vector.isNull[i]);
          assertFalse(vector.isRepeating);
          c++;
        }
      }
      assertEquals("It doesn't exit at expected position", nElements, c);
    } finally {
      reader.close();
    }
  }

  protected void nestedStructRead1(boolean isDictionaryEncoding) throws Exception {
    Configuration conf = new Configuration();
    conf.set(IOConstants.COLUMNS, "nested_struct_field");
    conf.set(IOConstants.COLUMNS_TYPES, "struct<nsf:struct<c:int>>");
    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    conf.set(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY, TimeZone.getDefault().getID());

    String schema = "message hive_schema {\n"
        + "group nested_struct_field {\n"
        + "  optional group nsf {\n"
        + "    optional int32 c;\n"
        + "  }"
        + "}\n";
    VectorizedParquetRecordReader reader = createParquetReader(schema, conf);
    VectorizedRowBatch previous = reader.createValue();
    int c = 0;
    try {
      while (reader.next(NullWritable.get(), previous)) {
        StructColumnVector vector = (StructColumnVector) previous.cols[0];
        StructColumnVector sv = (StructColumnVector) vector.fields[0];
        LongColumnVector cv = (LongColumnVector) sv.fields[0];
        for (int i = 0; i < cv.vector.length; i++) {
          if (c == nElements) {
            break;
          }
          assertEquals(getIntValue(isDictionaryEncoding, c), cv.vector[i]);
          assertFalse(vector.isNull[i]);
          assertFalse(vector.isRepeating);
          c++;
        }
      }
      assertEquals("It doesn't exit at expected position", nElements, c);
    } finally {
      reader.close();
    }
  }

  protected void structReadSomeNull(boolean isDictionaryEncoding) throws Exception {
    Configuration conf = new Configuration();
    conf.set(IOConstants.COLUMNS, "struct_field_some_null");
    conf.set(IOConstants.COLUMNS_TYPES, "struct<f:int,g:double>");
    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    conf.set(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY, TimeZone.getDefault().getID());

    String schema = "message hive_schema {\n"
        + "group struct_field_some_null {\n"
        + "  optional int32 f;\n"
        + "  optional double g;\n"
        + "}\n";
    VectorizedParquetRecordReader reader = createParquetReader(schema, conf);
    VectorizedRowBatch previous = reader.createValue();
    int c = 0;
    try {
      while (reader.next(NullWritable.get(), previous)) {
        StructColumnVector sv = (StructColumnVector) previous.cols[0];
        LongColumnVector fv = (LongColumnVector) sv.fields[0];
        DoubleColumnVector gv = (DoubleColumnVector) sv.fields[1];
        for (int i = 0; i < fv.vector.length; i++) {
          if (c == nElements) {
            break;
          }
          assertEquals(c % 2 == 0, fv.isNull[i]);
          assertEquals(c % 3 == 0, gv.isNull[i]);
          assertEquals(c % /* 2*3 = */ 6 == 0, sv.isNull[i]);
          if (!sv.isNull[i]) {
            if (!fv.isNull[i]) {
              assertEquals(getIntValue(isDictionaryEncoding, c), fv.vector[i]);
            }
            if (!gv.isNull[i]) {
              assertEquals(getDoubleValue(isDictionaryEncoding, c), gv.vector[i], 0);
            }
          }
          assertFalse(fv.isRepeating);
          c++;
        }
      }
      assertEquals("It doesn't exit at expected position", nElements, c);
    } finally {
      reader.close();
    }
  }
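
  // Decimal values are written to the binary "value (DECIMAL(5,2))" column from
  // HiveDecimalWritable's internal storage; this check decodes them back through
  // a DecimalColumnVector and compares the resulting HiveDecimals.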
  protected void decimalRead(boolean isDictionaryEncoding) throws Exception {
    Configuration conf = new Configuration();
    conf.set(IOConstants.COLUMNS, "value");
    conf.set(IOConstants.COLUMNS_TYPES, "decimal(5,2)");
    conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
    conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
    conf.set(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY, TimeZone.getDefault().getID());

    VectorizedParquetRecordReader reader =
        createParquetReader("message hive_schema { required value (DECIMAL(5,2));}", conf);
    VectorizedRowBatch previous = reader.createValue();
    try {
      int c = 0;
      while (reader.next(NullWritable.get(), previous)) {
        DecimalColumnVector vector = (DecimalColumnVector) previous.cols[0];
        assertTrue(vector.noNulls);
        for (int i = 0; i < vector.vector.length; i++) {
          if (c == nElements) {
            break;
          }
          assertEquals("Check failed at pos " + c,
              getDecimal(isDictionaryEncoding, c), vector.vector[i].getHiveDecimal());
          assertFalse(vector.isNull[i]);
          c++;
        }
      }
      assertEquals(nElements, c);
    } finally {
      reader.close();
    }
  }
}