/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.io.orc;
import static junit.framework.Assert.assertEquals;
import static junit.framework.Assert.assertNotNull;
import static junit.framework.Assert.assertNull;
import java.io.File;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hive.common.util.HiveTestUtils;
import org.apache.orc.BinaryColumnStatistics;
import org.apache.orc.BooleanColumnStatistics;
import org.apache.orc.ColumnStatistics;
import org.apache.orc.DoubleColumnStatistics;
import org.apache.orc.IntegerColumnStatistics;
import org.apache.orc.StringColumnStatistics;
import org.apache.orc.StripeInformation;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TestName;
import com.google.common.collect.Lists;
public class TestOrcSerDeStats {
public static class ListStruct {
List<String> list1;
public ListStruct(List<String> l1) {
this.list1 = l1;
}
}
public static class MapStruct {
Map<String, Double> map1;
public MapStruct(Map<String, Double> m1) {
this.map1 = m1;
}
}
public static class SimpleStruct {
BytesWritable bytes1;
Text string1;
SimpleStruct(BytesWritable b1, String s1) {
this.bytes1 = b1;
if (s1 == null) {
this.string1 = null;
} else {
this.string1 = new Text(s1);
}
}
}
public static class InnerStruct {
int int1;
Text string1 = new Text();
InnerStruct(int int1, String string1) {
this.int1 = int1;
this.string1.set(string1);
}
}
public static class MiddleStruct {
List<InnerStruct> list = new ArrayList<InnerStruct>();
MiddleStruct(InnerStruct... items) {
list.clear();
list.addAll(Arrays.asList(items));
}
}
public static class BigRow {
Boolean boolean1;
Byte byte1;
Short short1;
Integer int1;
Long long1;
Float float1;
Double double1;
BytesWritable bytes1;
Text string1;
List<InnerStruct> list = new ArrayList<InnerStruct>();
Map<Text, InnerStruct> map = new HashMap<Text, InnerStruct>();
Timestamp ts;
HiveDecimal decimal1;
MiddleStruct middle;
BigRow(Boolean b1, Byte b2, Short s1, Integer i1, Long l1, Float f1,
Double d1,
BytesWritable b3, String s2, MiddleStruct m1,
List<InnerStruct> l2, Map<Text, InnerStruct> m2, Timestamp ts1,
HiveDecimal dec1) {
this.boolean1 = b1;
this.byte1 = b2;
this.short1 = s1;
this.int1 = i1;
this.long1 = l1;
this.float1 = f1;
this.double1 = d1;
this.bytes1 = b3;
if (s2 == null) {
this.string1 = null;
} else {
this.string1 = new Text(s2);
}
this.middle = m1;
this.list = l2;
this.map = m2;
this.ts = ts1;
this.decimal1 = dec1;
}
}
private static InnerStruct inner(int i, String s) {
return new InnerStruct(i, s);
}
private static Map<Text, InnerStruct> map(InnerStruct... items) {
Map<Text, InnerStruct> result = new HashMap<Text, InnerStruct>();
for (InnerStruct i : items) {
result.put(new Text(i.string1), i);
}
return result;
}
private static List<InnerStruct> list(InnerStruct... items) {
List<InnerStruct> result = new ArrayList<InnerStruct>();
result.addAll(Arrays.asList(items));
return result;
}
private static BytesWritable bytes(int... items) {
BytesWritable result = new BytesWritable();
result.setSize(items.length);
for (int i = 0; i < items.length; ++i) {
result.getBytes()[i] = (byte) items[i];
}
return result;
}
Path workDir = new Path(System.getProperty("test.tmp.dir",
"target" + File.separator + "test" + File.separator + "tmp"));
Configuration conf;
FileSystem fs;
Path testFilePath;
@Rule
public TestName testCaseName = new TestName();
@Before
public void openFileSystem() throws Exception {
conf = new Configuration();
fs = FileSystem.getLocal(conf);
testFilePath = new Path(workDir, "TestOrcSerDeStats." +
testCaseName.getMethodName() + ".orc");
fs.delete(testFilePath, false);
}
@Test
public void testStringAndBinaryStatistics() throws Exception {
ObjectInspector inspector;
synchronized (TestOrcSerDeStats.class) {
inspector = ObjectInspectorFactory.getReflectionObjectInspector
(SimpleStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
Writer writer = OrcFile.createWriter(testFilePath,
OrcFile.writerOptions(conf)
.inspector(inspector)
.stripeSize(100000)
.bufferSize(10000));
writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3, 4), "foo"));
writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3), "bar"));
writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3, 4, 5), null));
writer.addRow(new SimpleStruct(null, "hi"));
writer.close();
assertEquals(4, writer.getNumberOfRows());
assertEquals(273, writer.getRawDataSize());
Reader reader = OrcFile.createReader(testFilePath,
OrcFile.readerOptions(conf).filesystem(fs));
assertEquals(4, reader.getNumberOfRows());
assertEquals(273, reader.getRawDataSize());
assertEquals(15, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1")));
assertEquals(258, reader.getRawDataSizeOfColumns(Lists.newArrayList("string1")));
assertEquals(273, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "string1")));
// check the stats
ColumnStatistics[] stats = reader.getStatistics();
assertEquals(4, stats[0].getNumberOfValues());
assertEquals("count: 4 hasNull: false", stats[0].toString());
assertEquals(3, stats[1].getNumberOfValues());
assertEquals(15, ((BinaryColumnStatistics) stats[1]).getSum());
assertEquals("count: 3 hasNull: true sum: 15", stats[1].toString());
assertEquals(3, stats[2].getNumberOfValues());
assertEquals("bar", ((StringColumnStatistics) stats[2]).getMinimum());
assertEquals("hi", ((StringColumnStatistics) stats[2]).getMaximum());
assertEquals(8, ((StringColumnStatistics) stats[2]).getSum());
assertEquals("count: 3 hasNull: true min: bar max: hi sum: 8",
stats[2].toString());
// check the inspectors
StructObjectInspector readerInspector =
(StructObjectInspector) reader.getObjectInspector();
assertEquals(ObjectInspector.Category.STRUCT,
readerInspector.getCategory());
assertEquals("struct<bytes1:binary,string1:string>",
readerInspector.getTypeName());
List<? extends StructField> fields =
readerInspector.getAllStructFieldRefs();
BinaryObjectInspector bi = (BinaryObjectInspector) readerInspector.
getStructFieldRef("bytes1").getFieldObjectInspector();
StringObjectInspector st = (StringObjectInspector) readerInspector.
getStructFieldRef("string1").getFieldObjectInspector();
RecordReader rows = reader.rows();
Object row = rows.next(null);
assertNotNull(row);
// check the contents of the first row
assertEquals(bytes(0, 1, 2, 3, 4), bi.getPrimitiveWritableObject(
readerInspector.getStructFieldData(row, fields.get(0))));
assertEquals("foo", st.getPrimitiveJavaObject(readerInspector.
getStructFieldData(row, fields.get(1))));
// check the contents of second row
assertEquals(true, rows.hasNext());
row = rows.next(row);
assertEquals(bytes(0, 1, 2, 3), bi.getPrimitiveWritableObject(
readerInspector.getStructFieldData(row, fields.get(0))));
assertEquals("bar", st.getPrimitiveJavaObject(readerInspector.
getStructFieldData(row, fields.get(1))));
// check the contents of second row
assertEquals(true, rows.hasNext());
row = rows.next(row);
assertEquals(bytes(0, 1, 2, 3, 4, 5), bi.getPrimitiveWritableObject(
readerInspector.getStructFieldData(row, fields.get(0))));
assertNull(st.getPrimitiveJavaObject(readerInspector.
getStructFieldData(row, fields.get(1))));
// check the contents of second row
assertEquals(true, rows.hasNext());
row = rows.next(row);
assertNull(bi.getPrimitiveWritableObject(
readerInspector.getStructFieldData(row, fields.get(0))));
assertEquals("hi", st.getPrimitiveJavaObject(readerInspector.
getStructFieldData(row, fields.get(1))));
// handle the close up
assertEquals(false, rows.hasNext());
rows.close();
}
@Test
public void testOrcSerDeStatsList() throws Exception {
ObjectInspector inspector;
synchronized (TestOrcSerDeStats.class) {
inspector = ObjectInspectorFactory.getReflectionObjectInspector
(ListStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
Writer writer = OrcFile.createWriter(testFilePath,
OrcFile.writerOptions(conf)
.inspector(inspector)
.stripeSize(10000)
.bufferSize(10000));
for (int row = 0; row < 5000; row++) {
List<String> test = new ArrayList<String>();
for (int i = 0; i < 1000; i++) {
test.add("hi");
}
writer.addRow(new ListStruct(test));
}
writer.close();
assertEquals(5000, writer.getNumberOfRows());
assertEquals(430000000, writer.getRawDataSize());
Reader reader = OrcFile.createReader(testFilePath,
OrcFile.readerOptions(conf).filesystem(fs));
// stats from reader
assertEquals(5000, reader.getNumberOfRows());
assertEquals(430000000, reader.getRawDataSize());
assertEquals(430000000, reader.getRawDataSizeOfColumns(Lists.newArrayList("list1")));
}
@Test
public void testOrcSerDeStatsMap() throws Exception {
ObjectInspector inspector;
synchronized (TestOrcSerDeStats.class) {
inspector = ObjectInspectorFactory.getReflectionObjectInspector
(MapStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
Writer writer = OrcFile.createWriter(testFilePath,
OrcFile.writerOptions(conf)
.inspector(inspector)
.stripeSize(10000)
.bufferSize(10000));
for (int row = 0; row < 1000; row++) {
Map<String, Double> test = new HashMap<String, Double>();
for (int i = 0; i < 10; i++) {
test.put("hi" + i, 2.0);
}
writer.addRow(new MapStruct(test));
}
writer.close();
// stats from writer
assertEquals(1000, writer.getNumberOfRows());
assertEquals(950000, writer.getRawDataSize());
Reader reader = OrcFile.createReader(testFilePath,
OrcFile.readerOptions(conf).filesystem(fs));
// stats from reader
assertEquals(1000, reader.getNumberOfRows());
assertEquals(950000, reader.getRawDataSize());
assertEquals(950000, reader.getRawDataSizeOfColumns(Lists.newArrayList("map1")));
}
@Test
public void testOrcSerDeStatsSimpleWithNulls() throws Exception {
ObjectInspector inspector;
synchronized (TestOrcSerDeStats.class) {
inspector = ObjectInspectorFactory.getReflectionObjectInspector
(SimpleStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
Writer writer = OrcFile.createWriter(testFilePath,
OrcFile.writerOptions(conf)
.inspector(inspector)
.stripeSize(10000)
.bufferSize(10000));
for (int row = 0; row < 1000; row++) {
if (row % 2 == 0) {
writer.addRow(new SimpleStruct(new BytesWritable(new byte[] {1, 2, 3}), "hi"));
} else {
writer.addRow(null);
}
}
writer.close();
// stats from writer
assertEquals(1000, writer.getNumberOfRows());
assertEquals(44500, writer.getRawDataSize());
Reader reader = OrcFile.createReader(testFilePath,
OrcFile.readerOptions(conf).filesystem(fs));
// stats from reader
assertEquals(1000, reader.getNumberOfRows());
assertEquals(44500, reader.getRawDataSize());
assertEquals(1500, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1")));
assertEquals(43000, reader.getRawDataSizeOfColumns(Lists.newArrayList("string1")));
assertEquals(44500, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "string1")));
}
@Test
public void testOrcSerDeStatsComplex() throws Exception {
ObjectInspector inspector;
synchronized (TestOrcSerDeStats.class) {
inspector = ObjectInspectorFactory.getReflectionObjectInspector
(BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
Writer writer = OrcFile.createWriter(testFilePath,
OrcFile.writerOptions(conf)
.inspector(inspector)
.stripeSize(100000)
.bufferSize(10000));
// 1 + 2 + 4 + 8 + 4 + 8 + 5 + 2 + 4 + 3 + 4 + 4 + 4 + 4 + 4 + 3 = 64
writer.addRow(new BigRow(false, (byte) 1, (short) 1024, 65536,
Long.MAX_VALUE, (float) 1.0, -15.0, bytes(0, 1, 2, 3, 4), "hi",
new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
list(inner(3, "good"), inner(4, "bad")),
map(), Timestamp.valueOf("2000-03-12 15:00:00"), HiveDecimal.create(
"12345678.6547456")));
// 1 + 2 + 4 + 8 + 4 + 8 + 3 + 4 + 3 + 4 + 4 + 4 + 3 + 4 + 2 + 4 + 3 + 5 + 4 + 5 + 7 + 4 + 7 =
// 97
writer.addRow(new BigRow(true, (byte) 100, (short) 2048, 65536,
Long.MAX_VALUE, (float) 2.0, -5.0, bytes(), "bye",
new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
list(inner(100000000, "cat"), inner(-100000, "in"), inner(1234, "hat")),
map(inner(5, "chani"), inner(1, "mauddib")), Timestamp.valueOf("2000-03-11 15:00:00"),
HiveDecimal.create("12345678.6547452")));
writer.close();
long rowCount = writer.getNumberOfRows();
long rawDataSize = writer.getRawDataSize();
assertEquals(2, rowCount);
assertEquals(1740, rawDataSize);
Reader reader = OrcFile.createReader(testFilePath,
OrcFile.readerOptions(conf).filesystem(fs));
assertEquals(2, reader.getNumberOfRows());
assertEquals(1740, reader.getRawDataSize());
assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("boolean1")));
assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("byte1")));
assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("short1")));
assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("int1")));
assertEquals(16, reader.getRawDataSizeOfColumns(Lists.newArrayList("long1")));
assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("float1")));
assertEquals(16, reader.getRawDataSizeOfColumns(Lists.newArrayList("double1")));
assertEquals(5, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1")));
assertEquals(172, reader.getRawDataSizeOfColumns(Lists.newArrayList("string1")));
assertEquals(455, reader.getRawDataSizeOfColumns(Lists.newArrayList("list")));
assertEquals(368, reader.getRawDataSizeOfColumns(Lists.newArrayList("map")));
assertEquals(364, reader.getRawDataSizeOfColumns(Lists.newArrayList("middle")));
assertEquals(80, reader.getRawDataSizeOfColumns(Lists.newArrayList("ts")));
assertEquals(224, reader.getRawDataSizeOfColumns(Lists.newArrayList("decimal1")));
assertEquals(88, reader.getRawDataSizeOfColumns(Lists.newArrayList("ts", "int1")));
assertEquals(1195,
reader.getRawDataSizeOfColumns(Lists.newArrayList("middle", "list", "map", "float1")));
assertEquals(185,
reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "byte1", "string1")));
assertEquals(rawDataSize, reader.getRawDataSizeOfColumns(Lists.newArrayList("boolean1",
"byte1", "short1", "int1", "long1", "float1", "double1", "bytes1", "string1", "list",
"map", "middle", "ts", "decimal1")));
// check the stats
ColumnStatistics[] stats = reader.getStatistics();
assertEquals(2, stats[1].getNumberOfValues());
assertEquals(1, ((BooleanColumnStatistics) stats[1]).getFalseCount());
assertEquals(1, ((BooleanColumnStatistics) stats[1]).getTrueCount());
assertEquals("count: 2 hasNull: false true: 1", stats[1].toString());
assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum());
assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum());
assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined());
assertEquals(3072, ((IntegerColumnStatistics) stats[3]).getSum());
assertEquals("count: 2 hasNull: false min: 1024 max: 2048 sum: 3072",
stats[3].toString());
assertEquals(Long.MAX_VALUE,
((IntegerColumnStatistics) stats[5]).getMaximum());
assertEquals(Long.MAX_VALUE,
((IntegerColumnStatistics) stats[5]).getMinimum());
assertEquals(false, ((IntegerColumnStatistics) stats[5]).isSumDefined());
assertEquals("count: 2 hasNull: false min: 9223372036854775807 max: 9223372036854775807",
stats[5].toString());
assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum());
assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum());
assertEquals(-20.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001);
assertEquals("count: 2 hasNull: false min: -15.0 max: -5.0 sum: -20.0",
stats[7].toString());
assertEquals("count: 2 hasNull: false min: bye max: hi sum: 5", stats[9].toString());
}
@Test
public void testOrcSerDeStatsComplexOldFormat() throws Exception {
ObjectInspector inspector;
synchronized (TestOrcSerDeStats.class) {
inspector = ObjectInspectorFactory.getReflectionObjectInspector
(BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
Writer writer = OrcFile.createWriter(testFilePath,
OrcFile.writerOptions(conf)
.inspector(inspector)
.stripeSize(100000)
.version(OrcFile.Version.V_0_11)
.bufferSize(10000));
// 1 + 2 + 4 + 8 + 4 + 8 + 5 + 2 + 4 + 3 + 4 + 4 + 4 + 4 + 4 + 3 = 64
writer.addRow(new BigRow(false, (byte) 1, (short) 1024, 65536,
Long.MAX_VALUE, (float) 1.0, -15.0, bytes(0, 1, 2, 3, 4), "hi",
new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
list(inner(3, "good"), inner(4, "bad")),
map(), Timestamp.valueOf("2000-03-12 15:00:00"), HiveDecimal.create(
"12345678.6547456")));
// 1 + 2 + 4 + 8 + 4 + 8 + 3 + 4 + 3 + 4 + 4 + 4 + 3 + 4 + 2 + 4 + 3 + 5 + 4 + 5 + 7 + 4 + 7 =
// 97
writer.addRow(new BigRow(true, (byte) 100, (short) 2048, 65536,
Long.MAX_VALUE, (float) 2.0, -5.0, bytes(), "bye",
new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
list(inner(100000000, "cat"), inner(-100000, "in"), inner(1234, "hat")),
map(inner(5, "chani"), inner(1, "mauddib")), Timestamp.valueOf("2000-03-11 15:00:00"),
HiveDecimal.create("12345678.6547452")));
writer.close();
long rowCount = writer.getNumberOfRows();
long rawDataSize = writer.getRawDataSize();
assertEquals(2, rowCount);
assertEquals(1740, rawDataSize);
Reader reader = OrcFile.createReader(testFilePath,
OrcFile.readerOptions(conf).filesystem(fs));
assertEquals(2, reader.getNumberOfRows());
assertEquals(1740, reader.getRawDataSize());
assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("boolean1")));
assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("byte1")));
assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("short1")));
assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("int1")));
assertEquals(16, reader.getRawDataSizeOfColumns(Lists.newArrayList("long1")));
assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("float1")));
assertEquals(16, reader.getRawDataSizeOfColumns(Lists.newArrayList("double1")));
assertEquals(5, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1")));
assertEquals(172, reader.getRawDataSizeOfColumns(Lists.newArrayList("string1")));
assertEquals(455, reader.getRawDataSizeOfColumns(Lists.newArrayList("list")));
assertEquals(368, reader.getRawDataSizeOfColumns(Lists.newArrayList("map")));
assertEquals(364, reader.getRawDataSizeOfColumns(Lists.newArrayList("middle")));
assertEquals(80, reader.getRawDataSizeOfColumns(Lists.newArrayList("ts")));
assertEquals(224, reader.getRawDataSizeOfColumns(Lists.newArrayList("decimal1")));
assertEquals(88, reader.getRawDataSizeOfColumns(Lists.newArrayList("ts", "int1")));
assertEquals(1195,
reader.getRawDataSizeOfColumns(Lists.newArrayList("middle", "list", "map", "float1")));
assertEquals(185,
reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "byte1", "string1")));
assertEquals(rawDataSize, reader.getRawDataSizeOfColumns(Lists.newArrayList("boolean1",
"byte1", "short1", "int1", "long1", "float1", "double1", "bytes1", "string1", "list",
"map", "middle", "ts", "decimal1")));
// check the stats
ColumnStatistics[] stats = reader.getStatistics();
assertEquals(2, stats[1].getNumberOfValues());
assertEquals(1, ((BooleanColumnStatistics) stats[1]).getFalseCount());
assertEquals(1, ((BooleanColumnStatistics) stats[1]).getTrueCount());
assertEquals("count: 2 hasNull: false true: 1", stats[1].toString());
assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum());
assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum());
assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined());
assertEquals(3072, ((IntegerColumnStatistics) stats[3]).getSum());
assertEquals("count: 2 hasNull: false min: 1024 max: 2048 sum: 3072",
stats[3].toString());
assertEquals(Long.MAX_VALUE,
((IntegerColumnStatistics) stats[5]).getMaximum());
assertEquals(Long.MAX_VALUE,
((IntegerColumnStatistics) stats[5]).getMinimum());
assertEquals(false, ((IntegerColumnStatistics) stats[5]).isSumDefined());
assertEquals("count: 2 hasNull: false min: 9223372036854775807 max: 9223372036854775807",
stats[5].toString());
assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum());
assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum());
assertEquals(-20.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001);
assertEquals("count: 2 hasNull: false min: -15.0 max: -5.0 sum: -20.0",
stats[7].toString());
assertEquals(5, ((BinaryColumnStatistics) stats[8]).getSum());
assertEquals("count: 2 hasNull: false sum: 5", stats[8].toString());
assertEquals("bye", ((StringColumnStatistics) stats[9]).getMinimum());
assertEquals("hi", ((StringColumnStatistics) stats[9]).getMaximum());
assertEquals(5, ((StringColumnStatistics) stats[9]).getSum());
assertEquals("count: 2 hasNull: false min: bye max: hi sum: 5", stats[9].toString());
}
@Test(expected = ClassCastException.class)
public void testSerdeStatsOldFormat() throws Exception {
Path oldFilePath = new Path(HiveTestUtils.getFileFromClasspath("orc-file-11-format.orc"));
Reader reader = OrcFile.createReader(oldFilePath,
OrcFile.readerOptions(conf).filesystem(fs));
int stripeCount = 0;
int rowCount = 0;
long currentOffset = -1;
for (StripeInformation stripe : reader.getStripes()) {
stripeCount += 1;
rowCount += stripe.getNumberOfRows();
if (currentOffset < 0) {
currentOffset = stripe.getOffset() + stripe.getIndexLength()
+ stripe.getDataLength() + stripe.getFooterLength();
} else {
assertEquals(currentOffset, stripe.getOffset());
currentOffset += stripe.getIndexLength() + stripe.getDataLength()
+ stripe.getFooterLength();
}
}
assertEquals(reader.getNumberOfRows(), rowCount);
assertEquals(6300000, reader.getRawDataSize());
assertEquals(2, stripeCount);
// check the stats
ColumnStatistics[] stats = reader.getStatistics();
assertEquals(7500, stats[1].getNumberOfValues());
assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getFalseCount());
assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getTrueCount());
assertEquals("count: 7500 hasNull: true true: 3750", stats[1].toString());
assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum());
assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum());
assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined());
assertEquals(11520000, ((IntegerColumnStatistics) stats[3]).getSum());
assertEquals("count: 7500 hasNull: true min: 1024 max: 2048 sum: 11520000",
stats[3].toString());
assertEquals(Long.MAX_VALUE,
((IntegerColumnStatistics) stats[5]).getMaximum());
assertEquals(Long.MAX_VALUE,
((IntegerColumnStatistics) stats[5]).getMinimum());
assertEquals(false, ((IntegerColumnStatistics) stats[5]).isSumDefined());
assertEquals(
"count: 7500 hasNull: true min: 9223372036854775807 max: 9223372036854775807",
stats[5].toString());
assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum());
assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum());
assertEquals(-75000.0, ((DoubleColumnStatistics) stats[7]).getSum(),
0.00001);
assertEquals("count: 7500 hasNull: true min: -15.0 max: -5.0 sum: -75000.0",
stats[7].toString());
assertEquals("bye", ((StringColumnStatistics) stats[9]).getMinimum());
assertEquals("hi", ((StringColumnStatistics) stats[9]).getMaximum());
assertEquals(0, ((StringColumnStatistics) stats[9]).getSum());
assertEquals("count: 7500 hasNull: true min: bye max: hi sum: 0", stats[9].toString());
// old orc format will not have binary statistics. toString() will show only
// the general column statistics
assertEquals("count: 7500 hasNull: true", stats[8].toString());
// since old orc format doesn't support binary statistics,
// this should throw ClassCastException
assertEquals(5, ((BinaryColumnStatistics) stats[8]).getSum());
}
}