/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.ql.io.orc; import java.io.File; import java.sql.Date; import java.sql.Timestamp; import java.util.Calendar; import java.util.Random; import junit.framework.Assert; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.serde2.io.ByteWritable; import org.apache.hadoop.hive.serde2.io.DateWritable; import org.apache.hadoop.hive.serde2.io.DoubleWritable; import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import org.apache.hadoop.hive.serde2.io.ShortWritable; import org.apache.hadoop.hive.serde2.io.TimestampWritable; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.io.BooleanWritable; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.junit.Before; import org.junit.Test; import static org.junit.Assert.assertEquals; /** * * Class that tests ORC reader vectorization by comparing records that are * returned by "row by row" reader with batch reader. * */ public class TestVectorizedORCReader { private Configuration conf; private FileSystem fs; private Path testFilePath; @Before public void openFileSystem() throws Exception { conf = new Configuration(); fs = FileSystem.getLocal(conf); Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp")); fs.setWorkingDirectory(workDir); testFilePath = new Path("TestVectorizedORCReader.testDump.orc"); fs.delete(testFilePath, false); } @SuppressWarnings("unused") static class MyRecord { private final Boolean bo; private final Byte by; private final Integer i; private final Long l; private final Short s; private final Double d; private final String k; private final Timestamp t; private final Date dt; private final HiveDecimal hd; MyRecord(Boolean bo, Byte by, Integer i, Long l, Short s, Double d, String k, Timestamp t, Date dt, HiveDecimal hd) { this.bo = bo; this.by = by; this.i = i; this.l = l; this.s = s; this.d = d; this.k = k; this.t = t; this.dt = dt; this.hd = hd; } } @Test public void createFile() throws Exception { ObjectInspector inspector; synchronized (TestVectorizedORCReader.class) { inspector = ObjectInspectorFactory.getReflectionObjectInspector (MyRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, 100000, CompressionKind.ZLIB, 10000, 10000); Random r1 = new Random(1); String[] words = new String[] {"It", "was", "the", "best", "of", "times,", "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age", "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it", "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch", "of", "incredulity,", "it", "was", "the", "season", "of", "Light,", "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the", "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,", "we", "had", "everything", "before", "us,", "we", "had", "nothing", "before", "us,", "we", "were", "all", "going", "direct", "to", "Heaven,", "we", "were", "all", "going", "direct", "the", "other", "way"}; String[] dates = new String[] {"1991-02-28", "1970-01-31", "1950-04-23"}; String[] decimalStrings = new String[] {"234.443", "10001000", "0.3333367", "67788798.0", "-234.443", "-10001000", "-0.3333367", "-67788798.0", "0"}; for (int i = 0; i < 21000; ++i) { if ((i % 7) != 0) { writer.addRow(new MyRecord(((i % 3) == 0), (byte)(i % 5), i, (long) 200, (short) (300 + i), (double) (400 + i), words[r1.nextInt(words.length)], new Timestamp(Calendar.getInstance().getTime().getTime()), Date.valueOf(dates[i % 3]), HiveDecimal.create(decimalStrings[i % decimalStrings.length]))); } else { writer.addRow(new MyRecord(null, null, i, (long) 200, null, null, null, null, null, null)); } } writer.close(); checkVectorizedReader(); } private void checkVectorizedReader() throws Exception { Reader vreader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf)); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf)); RecordReaderImpl vrr = (RecordReaderImpl) vreader.rows(); RecordReaderImpl rr = (RecordReaderImpl) reader.rows(); VectorizedRowBatch batch = reader.getSchema().createRowBatch(); OrcStruct row = null; // Check Vectorized ORC reader against ORC row reader while (vrr.nextBatch(batch)) { for (int i = 0; i < batch.size; i++) { row = (OrcStruct) rr.next(row); for (int j = 0; j < batch.cols.length; j++) { Object a = (row.getFieldValue(j)); ColumnVector cv = batch.cols[j]; // if the value is repeating, use row 0 int rowId = cv.isRepeating ? 0 : i; // make sure the null flag agrees if (a == null) { Assert.assertEquals(true, !cv.noNulls && cv.isNull[rowId]); } else if (a instanceof BooleanWritable) { // Boolean values are stores a 1's and 0's, so convert and compare Long temp = (long) (((BooleanWritable) a).get() ? 1 : 0); long b = ((LongColumnVector) cv).vector[rowId]; Assert.assertEquals(temp.toString(), Long.toString(b)); } else if (a instanceof TimestampWritable) { // Timestamps are stored as long, so convert and compare TimestampWritable t = ((TimestampWritable) a); TimestampColumnVector tcv = ((TimestampColumnVector) cv); Assert.assertEquals(t.getTimestamp(), tcv.asScratchTimestamp(rowId)); } else if (a instanceof DateWritable) { // Dates are stored as long, so convert and compare DateWritable adt = (DateWritable) a; long b = ((LongColumnVector) cv).vector[rowId]; Assert.assertEquals(adt.get().getTime(), DateWritable.daysToMillis((int) b)); } else if (a instanceof HiveDecimalWritable) { // Decimals are stored as BigInteger, so convert and compare HiveDecimalWritable dec = (HiveDecimalWritable) a; HiveDecimalWritable b = ((DecimalColumnVector) cv).vector[i]; Assert.assertEquals(dec, b); } else if (a instanceof DoubleWritable) { double b = ((DoubleColumnVector) cv).vector[rowId]; assertEquals(a.toString(), Double.toString(b)); } else if (a instanceof Text) { BytesColumnVector bcv = (BytesColumnVector) cv; Text b = new Text(); b.set(bcv.vector[rowId], bcv.start[rowId], bcv.length[rowId]); assertEquals(a, b); } else if (a instanceof IntWritable || a instanceof LongWritable || a instanceof ByteWritable || a instanceof ShortWritable) { assertEquals(a.toString(), Long.toString(((LongColumnVector) cv).vector[rowId])); } else { assertEquals("huh", a.getClass().getName()); } } } // Check repeating Assert.assertEquals(false, batch.cols[0].isRepeating); Assert.assertEquals(false, batch.cols[1].isRepeating); Assert.assertEquals(false, batch.cols[2].isRepeating); Assert.assertEquals(true, batch.cols[3].isRepeating); Assert.assertEquals(false, batch.cols[4].isRepeating); Assert.assertEquals(false, batch.cols[5].isRepeating); Assert.assertEquals(false, batch.cols[6].isRepeating); Assert.assertEquals(false, batch.cols[7].isRepeating); Assert.assertEquals(false, batch.cols[8].isRepeating); Assert.assertEquals(false, batch.cols[9].isRepeating); // Check non null Assert.assertEquals(false, batch.cols[0].noNulls); Assert.assertEquals(false, batch.cols[1].noNulls); Assert.assertEquals(true, batch.cols[2].noNulls); Assert.assertEquals(true, batch.cols[3].noNulls); Assert.assertEquals(false, batch.cols[4].noNulls); Assert.assertEquals(false, batch.cols[5].noNulls); Assert.assertEquals(false, batch.cols[6].noNulls); Assert.assertEquals(false, batch.cols[7].noNulls); Assert.assertEquals(false, batch.cols[8].noNulls); Assert.assertEquals(false, batch.cols[9].noNulls); } Assert.assertEquals(false, rr.nextBatch(batch)); } }