/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.io.orc;

import static org.junit.Assert.*;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.security.PrivilegedExceptionAction;
import java.sql.Date;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TimeZone;
import java.util.TreeSet;

import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FSInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hive.common.ValidTxnList;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.ql.exec.SerializationUtilities;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.mr.ExecMapper;
import org.apache.hadoop.hive.ql.exec.tez.ColumnarSplitSizeEstimator;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
import org.apache.hadoop.hive.ql.io.AcidInputFormat;
import org.apache.hadoop.hive.ql.io.AcidOutputFormat;
import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
import org.apache.hadoop.hive.ql.io.HiveInputFormat;
import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.InputFormatChecker;
import org.apache.hadoop.hive.ql.io.RecordIdentifier;
import org.apache.hadoop.hive.ql.io.RecordUpdater;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.Context;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.SplitStrategy;
import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg;
import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.VectorPartitionDesc;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.shims.CombineHiveKey;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.Progressable;
import org.apache.orc.OrcConf;
import org.apache.orc.OrcProto;
import org.apache.orc.TypeDescription;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TestName;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Output;

@SuppressWarnings({ "deprecation", "unchecked", "rawtypes" })
public class TestInputOutputFormat {

  private static final Logger LOG = LoggerFactory.getLogger(TestInputOutputFormat.class);

  public static String toKryo(SearchArgument sarg) {
    Output out = new Output(4 * 1024, 10 * 1024 * 1024);
    new Kryo().writeObject(out, sarg);
    out.close();
    return Base64.encodeBase64String(out.toBytes());
  }

  Path workDir = new Path(System.getProperty("test.tmp.dir","target/tmp"));

  static final int MILLIS_IN_DAY = 1000 * 60 * 60 * 24;
  private static final SimpleDateFormat DATE_FORMAT =
      new SimpleDateFormat("yyyy/MM/dd");
  private static final SimpleDateFormat TIME_FORMAT =
      new SimpleDateFormat("yyyy/MM/dd HH:mm:ss.SSS");
  private static final TimeZone LOCAL_TIMEZONE = TimeZone.getDefault();

  static {
    TimeZone gmt = TimeZone.getTimeZone("GMT+0");
    DATE_FORMAT.setTimeZone(gmt);
    TIME_FORMAT.setTimeZone(gmt);
  }

  public static class BigRow implements Writable {
    boolean booleanValue;
    byte byteValue;
    short shortValue;
    int intValue;
    long longValue;
    float
floatValue; double doubleValue; String stringValue; HiveDecimal decimalValue; Date dateValue; Timestamp timestampValue; BigRow(long x) { booleanValue = x % 2 == 0; byteValue = (byte) x; shortValue = (short) x; intValue = (int) x; longValue = x; floatValue = x; doubleValue = x; stringValue = Long.toHexString(x); decimalValue = HiveDecimal.create(x); long millisUtc = x * MILLIS_IN_DAY; millisUtc -= LOCAL_TIMEZONE.getOffset(millisUtc); dateValue = new Date(millisUtc); timestampValue = new Timestamp(millisUtc); } @Override public void write(DataOutput dataOutput) throws IOException { throw new UnsupportedOperationException("no write"); } @Override public void readFields(DataInput dataInput) throws IOException { throw new UnsupportedOperationException("no read"); } @Override public String toString() { StringBuilder builder = new StringBuilder(); builder.append("bigrow{booleanValue: "); builder.append(booleanValue); builder.append(", byteValue: "); builder.append(byteValue); builder.append(", shortValue: "); builder.append(shortValue); builder.append(", intValue: "); builder.append(intValue); builder.append(", longValue: "); builder.append(longValue); builder.append(", floatValue: "); builder.append(floatValue); builder.append(", doubleValue: "); builder.append(doubleValue); builder.append(", stringValue: "); builder.append(stringValue); builder.append(", decimalValue: "); builder.append(decimalValue); builder.append(", dateValue: "); builder.append(DATE_FORMAT.format(dateValue)); builder.append(", timestampValue: "); builder.append(TIME_FORMAT.format(timestampValue)); builder.append("}"); return builder.toString(); } static String getColumnNamesProperty() { return "booleanValue,byteValue,shortValue,intValue,longValue,floatValue,doubleValue,stringValue,decimalValue,dateValue,timestampValue"; } static String getColumnTypesProperty() { return "boolean:tinyint:smallint:int:bigint:float:double:string:decimal:date:timestamp"; } } public static class BigRowField implements StructField { private final int id; private final String fieldName; private final ObjectInspector inspector; BigRowField(int id, String fieldName, ObjectInspector inspector) { this.id = id; this.fieldName = fieldName; this.inspector = inspector; } @Override public String getFieldName() { return fieldName; } @Override public ObjectInspector getFieldObjectInspector() { return inspector; } @Override public String getFieldComment() { return null; } @Override public int getFieldID() { return id; } @Override public String toString() { return "field " + id + " " + fieldName; } } public static class BigRowInspector extends StructObjectInspector { static final List<BigRowField> FIELDS = new ArrayList<BigRowField>(); static { FIELDS.add(new BigRowField(0, "booleanValue", PrimitiveObjectInspectorFactory.javaBooleanObjectInspector)); FIELDS.add(new BigRowField(1, "byteValue", PrimitiveObjectInspectorFactory.javaByteObjectInspector)); FIELDS.add(new BigRowField(2, "shortValue", PrimitiveObjectInspectorFactory.javaShortObjectInspector)); FIELDS.add(new BigRowField(3, "intValue", PrimitiveObjectInspectorFactory.javaIntObjectInspector)); FIELDS.add(new BigRowField(4, "longValue", PrimitiveObjectInspectorFactory.javaLongObjectInspector)); FIELDS.add(new BigRowField(5, "floatValue", PrimitiveObjectInspectorFactory.javaFloatObjectInspector)); FIELDS.add(new BigRowField(6, "doubleValue", PrimitiveObjectInspectorFactory.javaDoubleObjectInspector)); FIELDS.add(new BigRowField(7, "stringValue", PrimitiveObjectInspectorFactory.javaStringObjectInspector)); 
FIELDS.add(new BigRowField(8, "decimalValue", PrimitiveObjectInspectorFactory.javaHiveDecimalObjectInspector)); FIELDS.add(new BigRowField(9, "dateValue", PrimitiveObjectInspectorFactory.javaDateObjectInspector)); FIELDS.add(new BigRowField(10, "timestampValue", PrimitiveObjectInspectorFactory.javaTimestampObjectInspector)); } @Override public List<? extends StructField> getAllStructFieldRefs() { return FIELDS; } @Override public StructField getStructFieldRef(String fieldName) { for(StructField field: FIELDS) { if (field.getFieldName().equals(fieldName)) { return field; } } throw new IllegalArgumentException("Can't find field " + fieldName); } @Override public Object getStructFieldData(Object data, StructField fieldRef) { BigRow obj = (BigRow) data; switch (((BigRowField) fieldRef).id) { case 0: return obj.booleanValue; case 1: return obj.byteValue; case 2: return obj.shortValue; case 3: return obj.intValue; case 4: return obj.longValue; case 5: return obj.floatValue; case 6: return obj.doubleValue; case 7: return obj.stringValue; case 8: return obj.decimalValue; case 9: return obj.dateValue; case 10: return obj.timestampValue; } throw new IllegalArgumentException("No such field " + fieldRef); } @Override public List<Object> getStructFieldsDataAsList(Object data) { BigRow obj = (BigRow) data; List<Object> result = new ArrayList<Object>(11); result.add(obj.booleanValue); result.add(obj.byteValue); result.add(obj.shortValue); result.add(obj.intValue); result.add(obj.longValue); result.add(obj.floatValue); result.add(obj.doubleValue); result.add(obj.stringValue); result.add(obj.decimalValue); result.add(obj.dateValue); result.add(obj.timestampValue); return result; } @Override public String getTypeName() { return "struct<booleanValue:boolean,byteValue:tinyint," + "shortValue:smallint,intValue:int,longValue:bigint," + "floatValue:float,doubleValue:double,stringValue:string," + "decimalValue:decimal>"; } @Override public Category getCategory() { return Category.STRUCT; } } public static class MyRow implements Writable { int x; int y; MyRow(int x, int y) { this.x = x; this.y = y; } @Override public void write(DataOutput dataOutput) throws IOException { throw new UnsupportedOperationException("no write"); } @Override public void readFields(DataInput dataInput) throws IOException { throw new UnsupportedOperationException("no read"); } static String getColumnNamesProperty() { return "x,y"; } static String getColumnTypesProperty() { return "int:int"; } } @Rule public TestName testCaseName = new TestName(); JobConf conf; FileSystem fs; Path testFilePath; @Before public void openFileSystem () throws Exception { conf = new JobConf(); fs = FileSystem.getLocal(conf); testFilePath = new Path(workDir, "TestInputOutputFormat." 
+ testCaseName.getMethodName() + ".orc"); fs.delete(testFilePath, false); } @Test public void testOverlap() throws Exception { assertEquals(0, OrcInputFormat.SplitGenerator.getOverlap(100, 100, 200, 100)); assertEquals(0, OrcInputFormat.SplitGenerator.getOverlap(0, 1000, 2000, 100)); assertEquals(100, OrcInputFormat.SplitGenerator.getOverlap(1000, 1000, 1500, 100)); assertEquals(250, OrcInputFormat.SplitGenerator.getOverlap(1000, 250, 500, 2000)); assertEquals(100, OrcInputFormat.SplitGenerator.getOverlap(1000, 1000, 1900, 1000)); assertEquals(500, OrcInputFormat.SplitGenerator.getOverlap(2000, 1000, 2500, 2000)); } @Test public void testGetInputPaths() throws Exception { conf.set("mapred.input.dir", "a,b,c"); assertArrayEquals(new Path[]{new Path("a"), new Path("b"), new Path("c")}, OrcInputFormat.getInputPaths(conf)); conf.set("mapred.input.dir", "/a/b/c/d/e"); assertArrayEquals(new Path[]{new Path("/a/b/c/d/e")}, OrcInputFormat.getInputPaths(conf)); conf.set("mapred.input.dir", "/a/b/c\\,d,/e/f\\,g/h"); assertArrayEquals(new Path[]{new Path("/a/b/c,d"), new Path("/e/f,g/h")}, OrcInputFormat.getInputPaths(conf)); } private FileSystem generateMockFiles(final int count, final int size) { final byte[] data = new byte[size]; MockFile[] files = new MockFile[count]; for (int i = 0; i < count; i++) { files[i] = new MockFile(String.format("mock:/a/b/part-%d", i), size, data); } return new MockFileSystem(conf, files); } @Test public void testSplitStrategySelection() throws Exception { conf.set("mapreduce.input.fileinputformat.split.maxsize", "500"); conf.set(HiveConf.ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_MEMORY_SIZE.varname, "10Mb"); final int[] counts = { 1, 10, 100, 256 }; final int[] sizes = { 100, 1000 }; final int[] numSplits = { 1, 9, 10, 11, 99, 111 }; final String[] strategyResults = new String[] { "ETLSplitStrategy", /* 1 files x 100 size for 1 splits */ "ETLSplitStrategy", /* 1 files x 100 size for 9 splits */ "ETLSplitStrategy", /* 1 files x 100 size for 10 splits */ "ETLSplitStrategy", /* 1 files x 100 size for 11 splits */ "ETLSplitStrategy", /* 1 files x 100 size for 99 splits */ "ETLSplitStrategy", /* 1 files x 100 size for 111 splits */ "ETLSplitStrategy", /* 1 files x 1000 size for 1 splits */ "ETLSplitStrategy", /* 1 files x 1000 size for 9 splits */ "ETLSplitStrategy", /* 1 files x 1000 size for 10 splits */ "ETLSplitStrategy", /* 1 files x 1000 size for 11 splits */ "ETLSplitStrategy", /* 1 files x 1000 size for 99 splits */ "ETLSplitStrategy", /* 1 files x 1000 size for 111 splits */ "BISplitStrategy", /* 10 files x 100 size for 1 splits */ "BISplitStrategy", /* 10 files x 100 size for 9 splits */ "ETLSplitStrategy", /* 10 files x 100 size for 10 splits */ "ETLSplitStrategy", /* 10 files x 100 size for 11 splits */ "ETLSplitStrategy", /* 10 files x 100 size for 99 splits */ "ETLSplitStrategy", /* 10 files x 100 size for 111 splits */ "ETLSplitStrategy", /* 10 files x 1000 size for 1 splits */ "ETLSplitStrategy", /* 10 files x 1000 size for 9 splits */ "ETLSplitStrategy", /* 10 files x 1000 size for 10 splits */ "ETLSplitStrategy", /* 10 files x 1000 size for 11 splits */ "ETLSplitStrategy", /* 10 files x 1000 size for 99 splits */ "ETLSplitStrategy", /* 10 files x 1000 size for 111 splits */ "BISplitStrategy", /* 100 files x 100 size for 1 splits */ "BISplitStrategy", /* 100 files x 100 size for 9 splits */ "BISplitStrategy", /* 100 files x 100 size for 10 splits */ "BISplitStrategy", /* 100 files x 100 size for 11 splits */ "BISplitStrategy", /* 100 files x 100 size for 99 
splits */ "ETLSplitStrategy", /* 100 files x 100 size for 111 splits */ "ETLSplitStrategy", /* 100 files x 1000 size for 1 splits */ "ETLSplitStrategy", /* 100 files x 1000 size for 9 splits */ "ETLSplitStrategy", /* 100 files x 1000 size for 10 splits */ "ETLSplitStrategy", /* 100 files x 1000 size for 11 splits */ "ETLSplitStrategy", /* 100 files x 1000 size for 99 splits */ "ETLSplitStrategy", /* 100 files x 1000 size for 111 splits */ "BISplitStrategy", /* 256 files x 100 size for 1 splits */ "BISplitStrategy", /* 256 files x 100 size for 9 splits */ "BISplitStrategy", /* 256 files x 100 size for 10 splits */ "BISplitStrategy", /* 256 files x 100 size for 11 splits */ "BISplitStrategy", /* 256 files x 100 size for 99 splits */ "BISplitStrategy", /* 256 files x 100 size for 111 splits */ "ETLSplitStrategy", /* 256 files x 1000 size for 1 splits */ "ETLSplitStrategy", /* 256 files x 1000 size for 9 splits */ "ETLSplitStrategy", /* 256 files x 1000 size for 10 splits */ "ETLSplitStrategy", /* 256 files x 1000 size for 11 splits */ "ETLSplitStrategy", /* 256 files x 1000 size for 99 splits */ "ETLSplitStrategy", /* 256 files x 1000 size for 111 splits */ }; int k = 0; for (int c : counts) { for (int s : sizes) { final FileSystem fs = generateMockFiles(c, s); for (int n : numSplits) { final OrcInputFormat.Context context = new OrcInputFormat.Context( conf, n); OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator( context, fs, new MockPath(fs, "mock:/a/b"), false, null); List<SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen); assertEquals(1, splitStrategies.size()); final SplitStrategy splitStrategy = splitStrategies.get(0); assertTrue( String.format( "Split strategy for %d files x %d size for %d splits", c, s, n), splitStrategy.getClass().getSimpleName() .equals(strategyResults[k++])); } } } k = 0; conf.set(ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_MEMORY_SIZE.varname, "0"); for (int c : counts) { for (int s : sizes) { final FileSystem fs = generateMockFiles(c, s); for (int n : numSplits) { final OrcInputFormat.Context context = new OrcInputFormat.Context( conf, n); OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator( context, fs, new MockPath(fs, "mock:/a/b"), false, null); List<SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen); assertEquals(1, splitStrategies.size()); final SplitStrategy splitStrategy = splitStrategies.get(0); assertTrue( String.format( "Split strategy for %d files x %d size for %d splits", c, s, n), splitStrategy.getClass().getSimpleName() .equals(strategyResults[k++])); } } } } @Test public void testFileGenerator() throws Exception { OrcInputFormat.Context context = new OrcInputFormat.Context(conf); MockFileSystem fs = new MockFileSystem(conf, new MockFile("mock:/a/b/part-00", 1000, new byte[1]), new MockFile("mock:/a/b/part-01", 1000, new byte[1]), new MockFile("mock:/a/b/_part-02", 1000, new byte[1]), new MockFile("mock:/a/b/.part-03", 1000, new byte[1]), new MockFile("mock:/a/b/part-04", 1000, new byte[1])); OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a/b"), false, null); List<OrcInputFormat.SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen); assertEquals(1, splitStrategies.size()); assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.BISplitStrategy); conf.set("mapreduce.input.fileinputformat.split.maxsize", "500"); context = new OrcInputFormat.Context(conf); fs = new MockFileSystem(conf, new 
MockFile("mock:/a/b/part-00", 1000, new byte[1000]), new MockFile("mock:/a/b/part-01", 1000, new byte[1000]), new MockFile("mock:/a/b/_part-02", 1000, new byte[1000]), new MockFile("mock:/a/b/.part-03", 1000, new byte[1000]), new MockFile("mock:/a/b/part-04", 1000, new byte[1000])); gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a/b"), false, null); splitStrategies = createSplitStrategies(context, gen); assertEquals(1, splitStrategies.size()); assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.ETLSplitStrategy); } @Test public void testACIDSplitStrategy() throws Exception { conf.set("bucket_count", "2"); OrcInputFormat.Context context = new OrcInputFormat.Context(conf); MockFileSystem fs = new MockFileSystem(conf, new MockFile("mock:/a/delta_000_001/part-00", 1000, new byte[1], new MockBlock("host1")), new MockFile("mock:/a/delta_000_001/part-01", 1000, new byte[1], new MockBlock("host1")), new MockFile("mock:/a/delta_001_002/part-02", 1000, new byte[1], new MockBlock("host1")), new MockFile("mock:/a/delta_001_002/part-03", 1000, new byte[1], new MockBlock("host1"))); OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a"), false, null); List<OrcInputFormat.SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen); assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.ACIDSplitStrategy); List<OrcSplit> splits = ((OrcInputFormat.ACIDSplitStrategy)splitStrategies.get(0)).getSplits(); ColumnarSplitSizeEstimator splitSizeEstimator = new ColumnarSplitSizeEstimator(); for (OrcSplit split: splits) { assertEquals(Integer.MAX_VALUE, splitSizeEstimator.getEstimatedSize(split)); } assertEquals(2, splits.size()); } @Test public void testACIDSplitStrategyForSplitUpdate() throws Exception { conf.set("bucket_count", "2"); conf.set(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, "true"); conf.set(hive_metastoreConstants.TABLE_TRANSACTIONAL_PROPERTIES, "default"); OrcInputFormat.Context context = new OrcInputFormat.Context(conf); // Case 1: Test with just originals => Single split strategy with two splits. MockFileSystem fs = new MockFileSystem(conf, new MockFile("mock:/a/b/000000_0", 1000, new byte[1], new MockBlock("host1")), new MockFile("mock:/a/b/000000_1", 1000, new byte[1], new MockBlock("host1"))); OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a"), false, null); List<OrcInputFormat.SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen); assertEquals(1, splitStrategies.size()); assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.ACIDSplitStrategy); List<OrcSplit> splits = ((OrcInputFormat.ACIDSplitStrategy)splitStrategies.get(0)).getSplits(); assertEquals(2, splits.size()); assertEquals("mock:/a/b/000000_0", splits.get(0).getPath().toUri().toString()); assertEquals("mock:/a/b/000000_1", splits.get(1).getPath().toUri().toString()); assertTrue(splits.get(0).isOriginal()); assertTrue(splits.get(1).isOriginal()); // Case 2: Test with originals and base => Single split strategy with two splits on compacted // base since the presence of a base will make the originals obsolete. 
fs = new MockFileSystem(conf, new MockFile("mock:/a/b/000000_0", 1000, new byte[1], new MockBlock("host1")), new MockFile("mock:/a/b/000000_1", 1000, new byte[1], new MockBlock("host1")), new MockFile("mock:/a/base_0000001/bucket_00000", 1000, new byte[1], new MockBlock("host1")), new MockFile("mock:/a/base_0000001/bucket_00001", 1000, new byte[1], new MockBlock("host1"))); gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a"), false, null); splitStrategies = createSplitStrategies(context, gen); assertEquals(1, splitStrategies.size()); assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.ACIDSplitStrategy); splits = ((OrcInputFormat.ACIDSplitStrategy)splitStrategies.get(0)).getSplits(); assertEquals(2, splits.size()); assertEquals("mock:/a/base_0000001/bucket_00000", splits.get(0).getPath().toUri().toString()); assertEquals("mock:/a/base_0000001/bucket_00001", splits.get(1).getPath().toUri().toString()); assertFalse(splits.get(0).isOriginal()); assertFalse(splits.get(1).isOriginal()); // Case 3: Test with originals and deltas => Two split strategies with two splits for each. fs = new MockFileSystem(conf, new MockFile("mock:/a/b/000000_0", 1000, new byte[1], new MockBlock("host1")), new MockFile("mock:/a/b/000000_1", 1000, new byte[1], new MockBlock("host1")), new MockFile("mock:/a/delta_0000001_0000001_0000/bucket_00000", 1000, new byte[1], new MockBlock("host1")), new MockFile("mock:/a/delta_0000001_0000001_0000/bucket_00001", 1000, new byte[1], new MockBlock("host1")), new MockFile("mock:/a/delete_delta_0000001_0000001_0000/bucket_00000", 1000, new byte[1], new MockBlock("host1")), new MockFile("mock:/a/delete_delta_0000001_0000001_0000/bucket_00001", 1000, new byte[1], new MockBlock("host1"))); gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a"), false, null); splitStrategies = createSplitStrategies(context, gen); assertEquals(2, splitStrategies.size()); assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.ACIDSplitStrategy); splits = ((OrcInputFormat.ACIDSplitStrategy)splitStrategies.get(0)).getSplits(); assertEquals(2, splits.size()); assertEquals("mock:/a/b/000000_0", splits.get(0).getPath().toUri().toString()); assertEquals("mock:/a/b/000000_1", splits.get(1).getPath().toUri().toString()); assertTrue(splits.get(0).isOriginal()); assertTrue(splits.get(1).isOriginal()); assertEquals(true, splitStrategies.get(1) instanceof OrcInputFormat.ACIDSplitStrategy); splits = ((OrcInputFormat.ACIDSplitStrategy)splitStrategies.get(1)).getSplits(); assertEquals(2, splits.size()); assertEquals("mock:/a/delta_0000001_0000001_0000/bucket_00000", splits.get(0).getPath().toUri().toString()); assertEquals("mock:/a/delta_0000001_0000001_0000/bucket_00001", splits.get(1).getPath().toUri().toString()); assertFalse(splits.get(0).isOriginal()); assertFalse(splits.get(1).isOriginal()); // Case 4: Test with originals and deltas but now with only one bucket covered, i.e. we will // have originals & insert_deltas for only one bucket, but the delete_deltas will be for two // buckets => Two strategies with one split for each. // When split-update is enabled, we do not need to account for buckets that aren't covered. // The reason why we are able to do so is because the valid user data has already been considered // as base for the covered buckets. Hence, the uncovered buckets do not have any relevant // data and we can just ignore them. 
fs = new MockFileSystem(conf, new MockFile("mock:/a/b/000000_0", 1000, new byte[1], new MockBlock("host1")), new MockFile("mock:/a/delta_0000001_0000001_0000/bucket_00000", 1000, new byte[1], new MockBlock("host1")), new MockFile("mock:/a/delete_delta_0000001_0000001_0000/bucket_00000", 1000, new byte[1], new MockBlock("host1")), new MockFile("mock:/a/delete_delta_0000001_0000001_0000/bucket_00001", 1000, new byte[1], new MockBlock("host1"))); gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a"), false, null); splitStrategies = createSplitStrategies(context, gen); assertEquals(2, splitStrategies.size()); assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.ACIDSplitStrategy); splits = ((OrcInputFormat.ACIDSplitStrategy)splitStrategies.get(0)).getSplits(); assertEquals(1, splits.size()); assertEquals("mock:/a/b/000000_0", splits.get(0).getPath().toUri().toString()); assertTrue(splits.get(0).isOriginal()); assertEquals(true, splitStrategies.get(1) instanceof OrcInputFormat.ACIDSplitStrategy); splits = ((OrcInputFormat.ACIDSplitStrategy)splitStrategies.get(1)).getSplits(); assertEquals(1, splits.size()); assertEquals("mock:/a/delta_0000001_0000001_0000/bucket_00000", splits.get(0).getPath().toUri().toString()); assertFalse(splits.get(0).isOriginal()); // Case 5: Test with originals, compacted_base, insert_deltas, delete_deltas (exhaustive test) // This should just generate one strategy with splits for base and insert_deltas. fs = new MockFileSystem(conf, new MockFile("mock:/a/b/000000_0", 1000, new byte[1], new MockBlock("host1")), new MockFile("mock:/a/b/000000_1", 1000, new byte[1], new MockBlock("host1")), new MockFile("mock:/a/base_0000001/bucket_00000", 1000, new byte[1], new MockBlock("host1")), new MockFile("mock:/a/base_0000001/bucket_00001", 1000, new byte[1], new MockBlock("host1")), new MockFile("mock:/a/delta_0000002_0000002_0000/bucket_00000", 1000, new byte[1], new MockBlock("host1")), new MockFile("mock:/a/delta_0000002_0000002_0000/bucket_00001", 1000, new byte[1], new MockBlock("host1")), new MockFile("mock:/a/delete_delta_0000002_0000002_0000/bucket_00000", 1000, new byte[1], new MockBlock("host1")), new MockFile("mock:/a/delete_delta_0000002_0000002_0000/bucket_00001", 1000, new byte[1], new MockBlock("host1"))); gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a"), false, null); splitStrategies = createSplitStrategies(context, gen); assertEquals(1, splitStrategies.size()); assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.ACIDSplitStrategy); splits = ((OrcInputFormat.ACIDSplitStrategy)splitStrategies.get(0)).getSplits(); assertEquals(4, splits.size()); assertEquals("mock:/a/base_0000001/bucket_00000", splits.get(0).getPath().toUri().toString()); assertEquals("mock:/a/base_0000001/bucket_00001", splits.get(1).getPath().toUri().toString()); assertEquals("mock:/a/delta_0000002_0000002_0000/bucket_00000", splits.get(2).getPath().toUri().toString()); assertEquals("mock:/a/delta_0000002_0000002_0000/bucket_00001", splits.get(3).getPath().toUri().toString()); assertFalse(splits.get(0).isOriginal()); assertFalse(splits.get(1).isOriginal()); assertFalse(splits.get(2).isOriginal()); assertFalse(splits.get(3).isOriginal()); } @Test public void testBIStrategySplitBlockBoundary() throws Exception { conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "BI"); OrcInputFormat.Context context = new OrcInputFormat.Context(conf); MockFileSystem fs = new MockFileSystem(conf, new 
MockFile("mock:/a/b/part-00", 1000, new byte[1], new MockBlock("host1", "host2")), new MockFile("mock:/a/b/part-01", 1000, new byte[1], new MockBlock("host1", "host2")), new MockFile("mock:/a/b/part-02", 1000, new byte[1], new MockBlock("host1", "host2")), new MockFile("mock:/a/b/part-03", 1000, new byte[1], new MockBlock("host1", "host2")), new MockFile("mock:/a/b/part-04", 1000, new byte[1], new MockBlock("host1", "host2"))); OrcInputFormat.FileGenerator gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a/b"), false, null); List<OrcInputFormat.SplitStrategy<?>> splitStrategies = createSplitStrategies(context, gen); assertEquals(1, splitStrategies.size()); assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.BISplitStrategy); List<OrcSplit> splits = ((OrcInputFormat.BISplitStrategy)splitStrategies.get(0)).getSplits(); int numSplits = splits.size(); assertEquals(5, numSplits); context = new OrcInputFormat.Context(conf); fs = new MockFileSystem(conf, new MockFile("mock:/a/b/part-00", 1000, new byte[1000], new MockBlock("host1", "host2")), new MockFile("mock:/a/b/part-01", 1000, new byte[1000], new MockBlock("host1", "host2")), new MockFile("mock:/a/b/part-02", 1000, new byte[1000], new MockBlock("host1", "host2")), new MockFile("mock:/a/b/part-03", 1000, new byte[1000], new MockBlock("host1", "host2")), new MockFile("mock:/a/b/part-04", 1000, new byte[1000], new MockBlock("host1", "host2"))); gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a/b"), false, null); splitStrategies = createSplitStrategies(context, gen); assertEquals(1, splitStrategies.size()); assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.BISplitStrategy); splits = ((OrcInputFormat.BISplitStrategy)splitStrategies.get(0)).getSplits(); numSplits = splits.size(); assertEquals(5, numSplits); context = new OrcInputFormat.Context(conf); fs = new MockFileSystem(conf, new MockFile("mock:/a/b/part-00", 1000, new byte[1100], new MockBlock("host1", "host2"), new MockBlock("host1", "host2")), new MockFile("mock:/a/b/part-01", 1000, new byte[1100], new MockBlock("host1", "host2"), new MockBlock("host1", "host2")), new MockFile("mock:/a/b/part-02", 1000, new byte[1100], new MockBlock("host1", "host2"), new MockBlock("host1", "host2")), new MockFile("mock:/a/b/part-03", 1000, new byte[1100], new MockBlock("host1", "host2"), new MockBlock("host1", "host2")), new MockFile("mock:/a/b/part-04", 1000, new byte[1100], new MockBlock("host1", "host2"), new MockBlock("host1", "host2"))); gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a/b"), false, null); splitStrategies = createSplitStrategies(context, gen); assertEquals(1, splitStrategies.size()); assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.BISplitStrategy); splits = ((OrcInputFormat.BISplitStrategy)splitStrategies.get(0)).getSplits(); numSplits = splits.size(); assertEquals(10, numSplits); context = new OrcInputFormat.Context(conf); fs = new MockFileSystem(conf, new MockFile("mock:/a/b/part-00", 1000, new byte[2000], new MockBlock("host1", "host2"), new MockBlock("host1", "host2")), new MockFile("mock:/a/b/part-01", 1000, new byte[2000], new MockBlock("host1", "host2"), new MockBlock("host1", "host2")), new MockFile("mock:/a/b/part-02", 1000, new byte[2000], new MockBlock("host1", "host2"), new MockBlock("host1", "host2")), new MockFile("mock:/a/b/part-03", 1000, new byte[2000], new MockBlock("host1", "host2"), new MockBlock("host1", "host2")), new 
MockFile("mock:/a/b/part-04", 1000, new byte[2000], new MockBlock("host1", "host2"), new MockBlock("host1", "host2"))); gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a/b"), false, null); splitStrategies = createSplitStrategies(context, gen); assertEquals(1, splitStrategies.size()); assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.BISplitStrategy); splits = ((OrcInputFormat.BISplitStrategy)splitStrategies.get(0)).getSplits(); numSplits = splits.size(); assertEquals(10, numSplits); context = new OrcInputFormat.Context(conf); fs = new MockFileSystem(conf, new MockFile("mock:/a/b/part-00", 1000, new byte[2200], new MockBlock("host1", "host2"), new MockBlock("host1", "host2"), new MockBlock("host1", "host2")), new MockFile("mock:/a/b/part-01", 1000, new byte[2200], new MockBlock("host1", "host2"), new MockBlock("host1", "host2"), new MockBlock("host1", "host2")), new MockFile("mock:/a/b/part-02", 1000, new byte[2200], new MockBlock("host1", "host2"), new MockBlock("host1", "host2"), new MockBlock("host1", "host2")), new MockFile("mock:/a/b/part-03", 1000, new byte[2200], new MockBlock("host1", "host2"), new MockBlock("host1", "host2"), new MockBlock("host1", "host2")), new MockFile("mock:/a/b/part-04", 1000, new byte[2200], new MockBlock("host1", "host2"), new MockBlock("host1", "host2"), new MockBlock("host1", "host2"))); gen = new OrcInputFormat.FileGenerator(context, fs, new MockPath(fs, "mock:/a/b"), false, null); splitStrategies = createSplitStrategies(context, gen); assertEquals(1, splitStrategies.size()); assertEquals(true, splitStrategies.get(0) instanceof OrcInputFormat.BISplitStrategy); splits = ((OrcInputFormat.BISplitStrategy)splitStrategies.get(0)).getSplits(); numSplits = splits.size(); assertEquals(15, numSplits); } @Test public void testEtlCombinedStrategy() throws Exception { conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "ETL"); conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_DIRECTORY_BATCH_MS.varname, "1000000"); OrcInputFormat.Context context = new OrcInputFormat.Context(conf); MockFileSystem fs = new MockFileSystem(conf, new MockFile("mock:/a/1/part-00", 1000, new byte[1]), new MockFile("mock:/a/1/part-01", 1000, new byte[1]), new MockFile("mock:/a/2/part-00", 1000, new byte[1]), new MockFile("mock:/a/2/part-01", 1000, new byte[1]), new MockFile("mock:/a/3/base_0/1", 1000, new byte[1]), new MockFile("mock:/a/4/base_0/1", 1000, new byte[1]), new MockFile("mock:/a/5/base_0/1", 1000, new byte[1]), new MockFile("mock:/a/5/delta_0_25/1", 1000, new byte[1]) ); OrcInputFormat.CombinedCtx combineCtx = new OrcInputFormat.CombinedCtx(); // The first directory becomes the base for combining. List<SplitStrategy<?>> ss = createOrCombineStrategies(context, fs, "mock:/a/1", combineCtx); assertTrue(ss.isEmpty()); assertTrue(combineCtx.combined instanceof OrcInputFormat.ETLSplitStrategy); OrcInputFormat.ETLSplitStrategy etlSs = combineCtx.combined; assertEquals(2, etlSs.files.size()); assertTrue(etlSs.isOriginal); assertEquals(1, etlSs.dirs.size()); // The second one should be combined into the first. ss = createOrCombineStrategies(context, fs, "mock:/a/2", combineCtx); assertTrue(ss.isEmpty()); assertTrue(combineCtx.combined instanceof OrcInputFormat.ETLSplitStrategy); assertEquals(4, etlSs.files.size()); assertEquals(2, etlSs.dirs.size()); // The third one has the base file, so it shouldn't be combined but could be a base. 
ss = createOrCombineStrategies(context, fs, "mock:/a/3", combineCtx); assertEquals(1, ss.size()); assertSame(etlSs, ss.get(0)); assertEquals(4, etlSs.files.size()); assertEquals(2, etlSs.dirs.size()); assertTrue(combineCtx.combined instanceof OrcInputFormat.ETLSplitStrategy); etlSs = combineCtx.combined; assertEquals(1, etlSs.files.size()); assertFalse(etlSs.isOriginal); assertEquals(1, etlSs.dirs.size()); // Try the first again, it would not be combined and we'd retain the old base (less files). ss = createOrCombineStrategies(context, fs, "mock:/a/1", combineCtx); assertEquals(1, ss.size()); assertTrue(ss.get(0) instanceof OrcInputFormat.ETLSplitStrategy); assertNotSame(etlSs, ss.get(0)); OrcInputFormat.ETLSplitStrategy rejectedEtlSs = (OrcInputFormat.ETLSplitStrategy)ss.get(0); assertEquals(2, rejectedEtlSs.files.size()); assertEquals(1, rejectedEtlSs.dirs.size()); assertTrue(rejectedEtlSs.isOriginal); assertEquals(1, etlSs.files.size()); assertEquals(1, etlSs.dirs.size()); // The fourth could be combined again. ss = createOrCombineStrategies(context, fs, "mock:/a/4", combineCtx); assertTrue(ss.isEmpty()); assertTrue(combineCtx.combined instanceof OrcInputFormat.ETLSplitStrategy); assertEquals(2, etlSs.files.size()); assertEquals(2, etlSs.dirs.size()); // The fifth will not be combined because of delta files. ss = createOrCombineStrategies(context, fs, "mock:/a/5", combineCtx); assertEquals(1, ss.size()); assertTrue(ss.get(0) instanceof OrcInputFormat.ETLSplitStrategy); assertNotSame(etlSs, ss); assertEquals(2, etlSs.files.size()); assertEquals(2, etlSs.dirs.size()); } public List<SplitStrategy<?>> createOrCombineStrategies(OrcInputFormat.Context context, MockFileSystem fs, String path, OrcInputFormat.CombinedCtx combineCtx) throws IOException { OrcInputFormat.AcidDirInfo adi = createAdi(context, fs, path); return OrcInputFormat.determineSplitStrategies(combineCtx, context, adi.fs, adi.splitPath, adi.acidInfo, adi.baseFiles, adi.parsedDeltas, null, null, true); } public OrcInputFormat.AcidDirInfo createAdi( OrcInputFormat.Context context, MockFileSystem fs, String path) throws IOException { return new OrcInputFormat.FileGenerator( context, fs, new MockPath(fs, path), false, null).call(); } private List<OrcInputFormat.SplitStrategy<?>> createSplitStrategies( OrcInputFormat.Context context, OrcInputFormat.FileGenerator gen) throws IOException { OrcInputFormat.AcidDirInfo adi = gen.call(); return OrcInputFormat.determineSplitStrategies( null, context, adi.fs, adi.splitPath, adi.acidInfo, adi.baseFiles, adi.parsedDeltas, null, null, true); } public static class MockBlock { int offset; int length; final String[] hosts; public MockBlock(String... hosts) { this.hosts = hosts; } public void setOffset(int offset) { this.offset = offset; } public void setLength(int length) { this.length = length; } @Override public String toString() { StringBuilder buffer = new StringBuilder(); buffer.append("block{offset: "); buffer.append(offset); buffer.append(", length: "); buffer.append(length); buffer.append(", hosts: ["); for(int i=0; i < hosts.length; i++) { if (i != 0) { buffer.append(", "); } buffer.append(hosts[i]); } buffer.append("]}"); return buffer.toString(); } } public static class MockFile { final Path path; int blockSize; int length; MockBlock[] blocks; byte[] content; public MockFile(String path, int blockSize, byte[] content, MockBlock... 
blocks) { this.path = new Path(path); this.blockSize = blockSize; this.blocks = blocks; this.content = content; this.length = content.length; int offset = 0; for(MockBlock block: blocks) { block.offset = offset; block.length = Math.min(length - offset, blockSize); offset += block.length; } } @Override public int hashCode() { return path.hashCode() + 31 * length; } @Override public boolean equals(final Object obj) { if (!(obj instanceof MockFile)) { return false; } return ((MockFile) obj).path.equals(this.path) && ((MockFile) obj).length == this.length; } @Override public String toString() { StringBuilder buffer = new StringBuilder(); buffer.append("mockFile{path: "); buffer.append(path.toString()); buffer.append(", blkSize: "); buffer.append(blockSize); buffer.append(", len: "); buffer.append(length); buffer.append(", blocks: ["); for(int i=0; i < blocks.length; i++) { if (i != 0) { buffer.append(", "); } buffer.append(blocks[i]); } buffer.append("]}"); return buffer.toString(); } } static class MockInputStream extends FSInputStream { final MockFile file; int offset = 0; public MockInputStream(MockFile file) throws IOException { this.file = file; } @Override public void seek(long offset) throws IOException { this.offset = (int) offset; } @Override public long getPos() throws IOException { return offset; } @Override public boolean seekToNewSource(long l) throws IOException { return false; } @Override public int read() throws IOException { if (offset < file.length) { return file.content[offset++] & 0xff; } return -1; } } public static class MockPath extends Path { private final FileSystem fs; public MockPath(FileSystem fs, String path) { super(path); this.fs = fs; } @Override public FileSystem getFileSystem(Configuration conf) { return fs; } } public static class MockOutputStream extends FSDataOutputStream { private final MockFile file; public MockOutputStream(MockFile file) throws IOException { super(new DataOutputBuffer(), null); this.file = file; } /** * Set the blocks and their location for the file. * Must be called after the stream is closed or the block length will be * wrong. * @param blocks the list of blocks */ public void setBlocks(MockBlock... blocks) { file.blocks = blocks; int offset = 0; int i = 0; while (offset < file.length && i < blocks.length) { blocks[i].offset = offset; blocks[i].length = Math.min(file.length - offset, file.blockSize); offset += blocks[i].length; i += 1; } } @Override public void close() throws IOException { super.close(); DataOutputBuffer buf = (DataOutputBuffer) getWrappedStream(); file.length = buf.getLength(); file.content = new byte[file.length]; MockBlock block = new MockBlock("host1"); block.setLength(file.length); setBlocks(block); System.arraycopy(buf.getData(), 0, file.content, 0, file.length); } @Override public String toString() { return "Out stream to " + file.toString(); } } public static class MockFileSystem extends FileSystem { final List<MockFile> files = new ArrayList<MockFile>(); final Map<MockFile, FileStatus> fileStatusMap = new HashMap<>(); Path workingDir = new Path("/"); // statics for when the mock fs is created via FileSystem.get private static String blockedUgi = null; private final static List<MockFile> globalFiles = new ArrayList<MockFile>(); protected Statistics statistics; public MockFileSystem() { // empty } @Override public void initialize(URI uri, Configuration conf) { setConf(conf); statistics = getStatistics("mock", getClass()); } public MockFileSystem(Configuration conf, MockFile... 
files) { setConf(conf); this.files.addAll(Arrays.asList(files)); statistics = getStatistics("mock", getClass()); } public static void setBlockedUgi(String s) { blockedUgi = s; } void clear() { files.clear(); } @Override public URI getUri() { try { return new URI("mock:///"); } catch (URISyntaxException err) { throw new IllegalArgumentException("huh?", err); } } // increments file modification time public void touch(MockFile file) { if (fileStatusMap.containsKey(file)) { FileStatus fileStatus = fileStatusMap.get(file); FileStatus fileStatusNew = new FileStatus(fileStatus.getLen(), fileStatus.isDirectory(), fileStatus.getReplication(), fileStatus.getBlockSize(), fileStatus.getModificationTime() + 1, fileStatus.getAccessTime(), fileStatus.getPermission(), fileStatus.getOwner(), fileStatus.getGroup(), fileStatus.getPath()); fileStatusMap.put(file, fileStatusNew); } } @SuppressWarnings("serial") public static class MockAccessDenied extends IOException { } @Override public FSDataInputStream open(Path path, int i) throws IOException { statistics.incrementReadOps(1); checkAccess(); MockFile file = findFile(path); if (file != null) return new FSDataInputStream(new MockInputStream(file)); throw new IOException("File not found: " + path); } private MockFile findFile(Path path) { for (MockFile file: files) { if (file.path.equals(path)) { return file; } } for (MockFile file: globalFiles) { if (file.path.equals(path)) { return file; } } return null; } private void checkAccess() throws IOException { if (blockedUgi == null) return; if (!blockedUgi.equals(UserGroupInformation.getCurrentUser().getShortUserName())) return; throw new MockAccessDenied(); } @Override public FSDataOutputStream create(Path path, FsPermission fsPermission, boolean overwrite, int bufferSize, short replication, long blockSize, Progressable progressable ) throws IOException { statistics.incrementWriteOps(1); checkAccess(); MockFile file = findFile(path); if (file == null) { file = new MockFile(path.toString(), (int) blockSize, new byte[0]); files.add(file); } return new MockOutputStream(file); } @Override public FSDataOutputStream append(Path path, int bufferSize, Progressable progressable ) throws IOException { statistics.incrementWriteOps(1); checkAccess(); return create(path, FsPermission.getDefault(), true, bufferSize, (short) 3, 256 * 1024, progressable); } @Override public boolean rename(Path path, Path path2) throws IOException { statistics.incrementWriteOps(1); checkAccess(); return false; } @Override public boolean delete(Path path) throws IOException { statistics.incrementWriteOps(1); checkAccess(); return false; } @Override public boolean delete(Path path, boolean b) throws IOException { statistics.incrementWriteOps(1); checkAccess(); return false; } @Override public RemoteIterator<LocatedFileStatus> listLocatedStatus(final Path f) throws IOException { return new RemoteIterator<LocatedFileStatus>() { private Iterator<LocatedFileStatus> iterator = listLocatedFileStatuses(f).iterator(); @Override public boolean hasNext() throws IOException { return iterator.hasNext(); } @Override public LocatedFileStatus next() throws IOException { return iterator.next(); } }; } private List<LocatedFileStatus> listLocatedFileStatuses(Path path) throws IOException { statistics.incrementReadOps(1); checkAccess(); path = path.makeQualified(this); List<LocatedFileStatus> result = new ArrayList<>(); String pathname = path.toString(); String pathnameAsDir = pathname + "/"; Set<String> dirs = new TreeSet<String>(); MockFile file = findFile(path); 
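      // If the path names a mock file exactly, list just that file; otherwise treat the path as
      // a directory and list its immediate children (matching files plus one entry per
      // subdirectory discovered from the file paths).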
if (file != null) { result.add(createLocatedStatus(file)); return result; } findMatchingLocatedFiles(files, pathnameAsDir, dirs, result); findMatchingLocatedFiles(globalFiles, pathnameAsDir, dirs, result); // for each directory add it once for(String dir: dirs) { result.add(createLocatedDirectory(new MockPath(this, pathnameAsDir + dir))); } return result; } @Override public FileStatus[] listStatus(Path path) throws IOException { statistics.incrementReadOps(1); checkAccess(); path = path.makeQualified(this); List<FileStatus> result = new ArrayList<FileStatus>(); String pathname = path.toString(); String pathnameAsDir = pathname + "/"; Set<String> dirs = new TreeSet<String>(); MockFile file = findFile(path); if (file != null) { return new FileStatus[]{createStatus(file)}; } findMatchingFiles(files, pathnameAsDir, dirs, result); findMatchingFiles(globalFiles, pathnameAsDir, dirs, result); // for each directory add it once for(String dir: dirs) { result.add(createDirectory(new MockPath(this, pathnameAsDir + dir))); } return result.toArray(new FileStatus[result.size()]); } private void findMatchingFiles( List<MockFile> files, String pathnameAsDir, Set<String> dirs, List<FileStatus> result) { for (MockFile file: files) { String filename = file.path.toString(); if (filename.startsWith(pathnameAsDir)) { String tail = filename.substring(pathnameAsDir.length()); int nextSlash = tail.indexOf('/'); if (nextSlash > 0) { dirs.add(tail.substring(0, nextSlash)); } else { result.add(createStatus(file)); } } } } private void findMatchingLocatedFiles( List<MockFile> files, String pathnameAsDir, Set<String> dirs, List<LocatedFileStatus> result) throws IOException { for (MockFile file: files) { String filename = file.path.toString(); if (filename.startsWith(pathnameAsDir)) { String tail = filename.substring(pathnameAsDir.length()); int nextSlash = tail.indexOf('/'); if (nextSlash > 0) { dirs.add(tail.substring(0, nextSlash)); } else { result.add(createLocatedStatus(file)); } } } } @Override public void setWorkingDirectory(Path path) { workingDir = path; } @Override public Path getWorkingDirectory() { return workingDir; } @Override public boolean mkdirs(Path path, FsPermission fsPermission) { statistics.incrementWriteOps(1); return false; } private FileStatus createStatus(MockFile file) { if (fileStatusMap.containsKey(file)) { return fileStatusMap.get(file); } FileStatus fileStatus = new FileStatus(file.length, false, 1, file.blockSize, 0, 0, FsPermission.createImmutable((short) 644), "owen", "group", file.path); fileStatusMap.put(file, fileStatus); return fileStatus; } private FileStatus createDirectory(Path dir) { return new FileStatus(0, true, 0, 0, 0, 0, FsPermission.createImmutable((short) 755), "owen", "group", dir); } private LocatedFileStatus createLocatedStatus(MockFile file) throws IOException { FileStatus fileStatus = createStatus(file); return new LocatedFileStatus(fileStatus, getFileBlockLocationsImpl(fileStatus, 0, fileStatus.getLen(), false)); } private LocatedFileStatus createLocatedDirectory(Path dir) throws IOException { FileStatus fileStatus = createDirectory(dir); return new LocatedFileStatus(fileStatus, getFileBlockLocationsImpl(fileStatus, 0, fileStatus.getLen(), false)); } @Override public FileStatus getFileStatus(Path path) throws IOException { statistics.incrementReadOps(1); checkAccess(); path = path.makeQualified(this); String pathnameAsDir = path.toString() + "/"; MockFile file = findFile(path); if (file != null) return createStatus(file); for (MockFile dir : files) { if 
(dir.path.toString().startsWith(pathnameAsDir)) { return createDirectory(path); } } for (MockFile dir : globalFiles) { if (dir.path.toString().startsWith(pathnameAsDir)) { return createDirectory(path); } } throw new FileNotFoundException("File " + path + " does not exist"); } @Override public BlockLocation[] getFileBlockLocations(FileStatus stat, long start, long len) throws IOException { return getFileBlockLocationsImpl(stat, start, len, true); } private BlockLocation[] getFileBlockLocationsImpl(final FileStatus stat, final long start, final long len, final boolean updateStats) throws IOException { if (updateStats) { statistics.incrementReadOps(1); } checkAccess(); List<BlockLocation> result = new ArrayList<BlockLocation>(); MockFile file = findFile(stat.getPath()); if (file != null) { for(MockBlock block: file.blocks) { if (OrcInputFormat.SplitGenerator.getOverlap(block.offset, block.length, start, len) > 0) { String[] topology = new String[block.hosts.length]; for(int i=0; i < topology.length; ++i) { topology[i] = "/rack/ " + block.hosts[i]; } result.add(new BlockLocation(block.hosts, block.hosts, topology, block.offset, block.length)); } } return result.toArray(new BlockLocation[result.size()]); } return new BlockLocation[0]; } @Override public String toString() { StringBuilder buffer = new StringBuilder(); buffer.append("mockFs{files:["); for(int i=0; i < files.size(); ++i) { if (i != 0) { buffer.append(", "); } buffer.append(files.get(i)); } buffer.append("]}"); return buffer.toString(); } public static void addGlobalFile(MockFile mockFile) { globalFiles.add(mockFile); } public static void clearGlobalFiles() { globalFiles.clear(); } } static void fill(DataOutputBuffer out, long length) throws IOException { for(int i=0; i < length; ++i) { out.write(0); } } /** * Create the binary contents of an ORC file that just has enough information * to test the getInputSplits. * @param stripeLengths the length of each stripe * @return the bytes of the file * @throws IOException */ static byte[] createMockOrcFile(long... 
stripeLengths) throws IOException { OrcProto.Footer.Builder footer = OrcProto.Footer.newBuilder(); final long headerLen = 3; long offset = headerLen; DataOutputBuffer buffer = new DataOutputBuffer(); for(long stripeLength: stripeLengths) { footer.addStripes(OrcProto.StripeInformation.newBuilder() .setOffset(offset) .setIndexLength(0) .setDataLength(stripeLength-10) .setFooterLength(10) .setNumberOfRows(1000)); offset += stripeLength; } fill(buffer, offset); footer.addTypes(OrcProto.Type.newBuilder() .setKind(OrcProto.Type.Kind.STRUCT) .addFieldNames("col1") .addSubtypes(1)); footer.addTypes(OrcProto.Type.newBuilder() .setKind(OrcProto.Type.Kind.STRING)); footer.setNumberOfRows(1000 * stripeLengths.length) .setHeaderLength(headerLen) .setContentLength(offset - headerLen); footer.addStatistics(OrcProto.ColumnStatistics.newBuilder() .setNumberOfValues(1000 * stripeLengths.length).build()); footer.addStatistics(OrcProto.ColumnStatistics.newBuilder() .setNumberOfValues(1000 * stripeLengths.length) .setStringStatistics( OrcProto.StringStatistics.newBuilder() .setMaximum("zzz") .setMinimum("aaa") .setSum(1000 * 3 * stripeLengths.length) .build() ).build()); footer.build().writeTo(buffer); int footerEnd = buffer.getLength(); OrcProto.PostScript ps = OrcProto.PostScript.newBuilder() .setCompression(OrcProto.CompressionKind.NONE) .setFooterLength(footerEnd - offset) .setMagic("ORC") .build(); ps.writeTo(buffer); buffer.write(buffer.getLength() - footerEnd); byte[] result = new byte[buffer.getLength()]; System.arraycopy(buffer.getData(), 0, result, 0, buffer.getLength()); return result; } @Test public void testAddSplit() throws Exception { // create a file with 5 blocks spread around the cluster MockFileSystem fs = new MockFileSystem(conf, new MockFile("mock:/a/file", 500, createMockOrcFile(197, 300, 600, 200, 200, 100, 100, 100, 100, 100), new MockBlock("host1-1", "host1-2", "host1-3"), new MockBlock("host2-1", "host0", "host2-3"), new MockBlock("host0", "host3-2", "host3-3"), new MockBlock("host4-1", "host4-2", "host4-3"), new MockBlock("host5-1", "host5-2", "host5-3"))); OrcInputFormat.Context context = new OrcInputFormat.Context(conf); OrcInputFormat.SplitGenerator splitter = new OrcInputFormat.SplitGenerator(new OrcInputFormat.SplitInfo(context, fs, fs.getFileStatus(new Path("/a/file")), null, null, true, new ArrayList<AcidInputFormat.DeltaMetaData>(), true, null, null), null, true); OrcSplit result = splitter.createSplit(0, 200, null); assertEquals(0, result.getStart()); assertEquals(200, result.getLength()); assertEquals("mock:/a/file", result.getPath().toString()); String[] locs = result.getLocations(); assertEquals(3, locs.length); assertEquals("host1-1", locs[0]); assertEquals("host1-2", locs[1]); assertEquals("host1-3", locs[2]); result = splitter.createSplit(500, 600, null); locs = result.getLocations(); assertEquals(3, locs.length); assertEquals("host2-1", locs[0]); assertEquals("host0", locs[1]); assertEquals("host2-3", locs[2]); result = splitter.createSplit(0, 2500, null); locs = result.getLocations(); assertEquals(1, locs.length); assertEquals("host0", locs[0]); } @Test public void testSplitGenerator() throws Exception { // create a file with 5 blocks spread around the cluster long[] stripeSizes = new long[]{197, 300, 600, 200, 200, 100, 100, 100, 100, 100}; MockFileSystem fs = new MockFileSystem(conf, new MockFile("mock:/a/file", 500, createMockOrcFile(stripeSizes), new MockBlock("host1-1", "host1-2", "host1-3"), new MockBlock("host2-1", "host0", "host2-3"), new MockBlock("host0", 
"host3-2", "host3-3"), new MockBlock("host4-1", "host4-2", "host4-3"), new MockBlock("host5-1", "host5-2", "host5-3"))); HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 300); HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 200); OrcInputFormat.Context context = new OrcInputFormat.Context(conf); OrcInputFormat.SplitGenerator splitter = new OrcInputFormat.SplitGenerator(new OrcInputFormat.SplitInfo(context, fs, fs.getFileStatus(new Path("/a/file")), null, null, true, new ArrayList<AcidInputFormat.DeltaMetaData>(), true, null, null), null, true); List<OrcSplit> results = splitter.call(); OrcSplit result = results.get(0); assertEquals(3, result.getStart()); assertEquals(497, result.getLength()); result = results.get(1); assertEquals(500, result.getStart()); assertEquals(600, result.getLength()); result = results.get(2); assertEquals(1100, result.getStart()); assertEquals(400, result.getLength()); result = results.get(3); assertEquals(1500, result.getStart()); assertEquals(300, result.getLength()); result = results.get(4); assertEquals(1800, result.getStart()); assertEquals(200, result.getLength()); // test min = 0, max = 0 generates each stripe HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 0); HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 0); context = new OrcInputFormat.Context(conf); splitter = new OrcInputFormat.SplitGenerator(new OrcInputFormat.SplitInfo(context, fs, fs.getFileStatus(new Path("/a/file")), null, null, true, new ArrayList<AcidInputFormat.DeltaMetaData>(), true, null, null), null, true); results = splitter.call(); for(int i=0; i < stripeSizes.length; ++i) { assertEquals("checking stripe " + i + " size", stripeSizes[i], results.get(i).getLength()); } } @Test public void testProjectedColumnSize() throws Exception { long[] stripeSizes = new long[]{200, 200, 200, 200, 100}; MockFileSystem fs = new MockFileSystem(conf, new MockFile("mock:/a/file", 500, createMockOrcFile(stripeSizes), new MockBlock("host1-1", "host1-2", "host1-3"), new MockBlock("host2-1", "host0", "host2-3"), new MockBlock("host0", "host3-2", "host3-3"), new MockBlock("host4-1", "host4-2", "host4-3"), new MockBlock("host5-1", "host5-2", "host5-3"))); HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 300); HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 200); conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false); conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0"); OrcInputFormat.Context context = new OrcInputFormat.Context(conf); OrcInputFormat.SplitGenerator splitter = new OrcInputFormat.SplitGenerator(new OrcInputFormat.SplitInfo(context, fs, fs.getFileStatus(new Path("/a/file")), null, null, true, new ArrayList<AcidInputFormat.DeltaMetaData>(), true, null, null), null, true); List<OrcSplit> results = splitter.call(); OrcSplit result = results.get(0); assertEquals(3, results.size()); assertEquals(3, result.getStart()); assertEquals(400, result.getLength()); assertEquals(167468, result.getProjectedColumnsUncompressedSize()); result = results.get(1); assertEquals(403, result.getStart()); assertEquals(400, result.getLength()); assertEquals(167468, result.getProjectedColumnsUncompressedSize()); result = results.get(2); assertEquals(803, result.getStart()); assertEquals(100, result.getLength()); assertEquals(41867, result.getProjectedColumnsUncompressedSize()); // test min = 0, max = 0 generates each stripe HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 0); HiveConf.setLongVar(conf, 
HiveConf.ConfVars.MAPREDMINSPLITSIZE, 0);
    context = new OrcInputFormat.Context(conf);
    splitter = new OrcInputFormat.SplitGenerator(new OrcInputFormat.SplitInfo(context, fs,
        fs.getFileStatus(new Path("/a/file")), null, null, true,
        new ArrayList<AcidInputFormat.DeltaMetaData>(), true, null, null), null, true);
    results = splitter.call();
    assertEquals(5, results.size());
    for (int i = 0; i < stripeSizes.length; ++i) {
      assertEquals("checking stripe " + i + " size",
          stripeSizes[i], results.get(i).getLength());
      if (i == stripeSizes.length - 1) {
        assertEquals(41867, results.get(i).getProjectedColumnsUncompressedSize());
      } else {
        assertEquals(83734, results.get(i).getProjectedColumnsUncompressedSize());
      }
    }

    // single split
    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 1000);
    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 100000);
    context = new OrcInputFormat.Context(conf);
    splitter = new OrcInputFormat.SplitGenerator(new OrcInputFormat.SplitInfo(context, fs,
        fs.getFileStatus(new Path("/a/file")), null, null, true,
        new ArrayList<AcidInputFormat.DeltaMetaData>(), true, null, null), null, true);
    results = splitter.call();
    assertEquals(1, results.size());
    result = results.get(0);
    assertEquals(3, result.getStart());
    assertEquals(900, result.getLength());
    assertEquals(376804, result.getProjectedColumnsUncompressedSize());
  }

  @Test
  public void testInOutFormat() throws Exception {
    Properties properties = new Properties();
    properties.setProperty("columns", "x,y");
    properties.setProperty("columns.types", "int:int");
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
      inspector = (StructObjectInspector)
          ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
              ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    AbstractSerDe serde = new OrcSerde();
    HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
    org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter writer =
        outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
            properties, Reporter.NULL);
    writer.write(serde.serialize(new MyRow(1,2), inspector));
    writer.write(serde.serialize(new MyRow(2,2), inspector));
    writer.write(serde.serialize(new MyRow(3,2), inspector));
    writer.close(true);
    serde = new OrcSerde();
    SerDeUtils.initializeSerDe(serde, conf, properties, null);
    assertEquals(OrcSerde.OrcSerdeRow.class, serde.getSerializedClass());
    inspector = (StructObjectInspector) serde.getObjectInspector();
    assertEquals("struct<x:int,y:int>", inspector.getTypeName());
    InputFormat<?,?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);

    // test the validateInput method
    ArrayList<FileStatus> fileList = new ArrayList<FileStatus>();
    assertEquals(false,
        ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));
    fileList.add(fs.getFileStatus(testFilePath));
    assertEquals(true,
        ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));
    fileList.add(fs.getFileStatus(workDir));
    assertEquals(false,
        ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));

    // read the whole file
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
    org.apache.hadoop.mapred.RecordReader reader =
        in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Writable value = (Writable) reader.createValue();
    int rowNum = 0;
    List<?
extends StructField> fields =inspector.getAllStructFieldRefs(); IntObjectInspector intInspector = (IntObjectInspector) fields.get(0).getFieldObjectInspector(); // UNDONE: Don't know why HIVE-12894 causes this to return 0? // assertEquals(0.33, reader.getProgress(), 0.01); while (reader.next(key, value)) { assertEquals(++rowNum, intInspector.get(inspector. getStructFieldData(serde.deserialize(value), fields.get(0)))); assertEquals(2, intInspector.get(inspector. getStructFieldData(serde.deserialize(value), fields.get(1)))); } assertEquals(3, rowNum); assertEquals(1.0, reader.getProgress(), 0.00001); reader.close(); // read just the first column ColumnProjectionUtils.appendReadColumns(conf, Collections.singletonList(0)); reader = in.getRecordReader(splits[0], conf, Reporter.NULL); key = reader.createKey(); value = (Writable) reader.createValue(); rowNum = 0; fields = inspector.getAllStructFieldRefs(); while (reader.next(key, value)) { assertEquals(++rowNum, intInspector.get(inspector. getStructFieldData(value, fields.get(0)))); assertEquals(null, inspector.getStructFieldData(value, fields.get(1))); } assertEquals(3, rowNum); reader.close(); // test the mapping of empty string to all columns ColumnProjectionUtils.setReadAllColumns(conf); reader = in.getRecordReader(splits[0], conf, Reporter.NULL); key = reader.createKey(); value = (Writable) reader.createValue(); rowNum = 0; fields = inspector.getAllStructFieldRefs(); while (reader.next(key, value)) { assertEquals(++rowNum, intInspector.get(inspector. getStructFieldData(value, fields.get(0)))); assertEquals(2, intInspector.get(inspector. getStructFieldData(serde.deserialize(value), fields.get(1)))); } assertEquals(3, rowNum); reader.close(); } static class SimpleRow implements Writable { Text z; public SimpleRow(Text t) { this.z = t; } @Override public void write(DataOutput dataOutput) throws IOException { throw new UnsupportedOperationException("unsupported"); } @Override public void readFields(DataInput dataInput) throws IOException { throw new UnsupportedOperationException("unsupported"); } } static class NestedRow implements Writable { int z; MyRow r; NestedRow(int x, int y, int z) { this.z = z; this.r = new MyRow(x,y); } @Override public void write(DataOutput dataOutput) throws IOException { throw new UnsupportedOperationException("unsupported"); } @Override public void readFields(DataInput dataInput) throws IOException { throw new UnsupportedOperationException("unsupported"); } } @Test public void testMROutput() throws Exception { Properties properties = new Properties(); StructObjectInspector inspector; synchronized (TestOrcFile.class) { inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(NestedRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } AbstractSerDe serde = new OrcSerde(); OutputFormat<?, ?> outFormat = new OrcOutputFormat(); RecordWriter writer = outFormat.getRecordWriter(fs, conf, testFilePath.toString(), Reporter.NULL); writer.write(NullWritable.get(), serde.serialize(new NestedRow(1,2,3), inspector)); writer.write(NullWritable.get(), serde.serialize(new NestedRow(4,5,6), inspector)); writer.write(NullWritable.get(), serde.serialize(new NestedRow(7,8,9), inspector)); writer.close(Reporter.NULL); serde = new OrcSerde(); properties.setProperty("columns", "z,r"); properties.setProperty("columns.types", "int:struct<x:int,y:int>"); SerDeUtils.initializeSerDe(serde, conf, properties, null); inspector = (StructObjectInspector) serde.getObjectInspector(); InputFormat<?,?> in = new 
OrcInputFormat(); FileInputFormat.setInputPaths(conf, testFilePath.toString()); InputSplit[] splits = in.getSplits(conf, 1); assertEquals(1, splits.length); ColumnProjectionUtils.appendReadColumns(conf, Collections.singletonList(1)); conf.set("columns", "z,r"); conf.set("columns.types", "int:struct<x:int,y:int>"); org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL); Object key = reader.createKey(); Object value = reader.createValue(); int rowNum = 0; List<? extends StructField> fields = inspector.getAllStructFieldRefs(); StructObjectInspector inner = (StructObjectInspector) fields.get(1).getFieldObjectInspector(); List<? extends StructField> inFields = inner.getAllStructFieldRefs(); IntObjectInspector intInspector = (IntObjectInspector) fields.get(0).getFieldObjectInspector(); while (reader.next(key, value)) { assertEquals(null, inspector.getStructFieldData(value, fields.get(0))); Object sub = inspector.getStructFieldData(value, fields.get(1)); assertEquals(3*rowNum+1, intInspector.get(inner.getStructFieldData(sub, inFields.get(0)))); assertEquals(3*rowNum+2, intInspector.get(inner.getStructFieldData(sub, inFields.get(1)))); rowNum += 1; } assertEquals(3, rowNum); reader.close(); } @Test public void testEmptyFile() throws Exception { Properties properties = new Properties(); properties.setProperty("columns", "x,y"); properties.setProperty("columns.types", "int:int"); HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat(); org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true, properties, Reporter.NULL); writer.close(true); AbstractSerDe serde = new OrcSerde(); SerDeUtils.initializeSerDe(serde, conf, properties, null); InputFormat<?,?> in = new OrcInputFormat(); FileInputFormat.setInputPaths(conf, testFilePath.toString()); InputSplit[] splits = in.getSplits(conf, 1); assertTrue(0 == splits.length); assertEquals(null, serde.getSerDeStats()); } @Test(expected = RuntimeException.class) public void testSplitGenFailure() throws IOException { Properties properties = new Properties(); HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat(); org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true, properties, Reporter.NULL); writer.write(new OrcSerde().serialize(null,null)); writer.close(true); InputFormat<?,?> in = new OrcInputFormat(); fs.setPermission(testFilePath, FsPermission.createImmutable((short) 0333)); FileInputFormat.setInputPaths(conf, testFilePath.toString()); try { in.getSplits(conf, 1); } catch (RuntimeException e) { assertEquals(true, e.getMessage().contains("Permission denied")); throw e; } } static class StringRow implements Writable { String str; String str2; StringRow(String s) { str = s; str2 = s; } @Override public void write(DataOutput dataOutput) throws IOException { throw new UnsupportedOperationException("no write"); } @Override public void readFields(DataInput dataInput) throws IOException { throw new UnsupportedOperationException("no read"); } static String getColumnNamesProperty() { return "str,str2"; } static String getColumnTypesProperty() { return "string:string"; } } @Test public void testDefaultTypes() throws Exception { Properties properties = new Properties(); properties.setProperty("columns", "str,str2"); properties.setProperty("columns.types", "string:string"); StructObjectInspector inspector; synchronized (TestOrcFile.class) { inspector = 
(StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(StringRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } AbstractSerDe serde = new OrcSerde(); HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat(); org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, StringRow.class, true, properties, Reporter.NULL); writer.write(serde.serialize(new StringRow("owen"), inspector)); writer.write(serde.serialize(new StringRow("beth"), inspector)); writer.write(serde.serialize(new StringRow("laurel"), inspector)); writer.write(serde.serialize(new StringRow("hazen"), inspector)); writer.write(serde.serialize(new StringRow("colin"), inspector)); writer.write(serde.serialize(new StringRow("miles"), inspector)); writer.close(true); serde = new OrcSerde(); SerDeUtils.initializeSerDe(serde, conf, properties, null); inspector = (StructObjectInspector) serde.getObjectInspector(); assertEquals("struct<str:string,str2:string>", inspector.getTypeName()); InputFormat<?,?> in = new OrcInputFormat(); FileInputFormat.setInputPaths(conf, testFilePath.toString()); InputSplit[] splits = in.getSplits(conf, 1); assertEquals(1, splits.length); // read the whole file conf.set("columns", StringRow.getColumnNamesProperty()); conf.set("columns.types", StringRow.getColumnTypesProperty()); org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL); Object key = reader.createKey(); Writable value = (Writable) reader.createValue(); List<? extends StructField> fields =inspector.getAllStructFieldRefs(); StringObjectInspector strInspector = (StringObjectInspector) fields.get(0).getFieldObjectInspector(); assertEquals(true, reader.next(key, value)); assertEquals("owen", strInspector.getPrimitiveJavaObject(inspector. getStructFieldData(value, fields.get(0)))); assertEquals(true, reader.next(key, value)); assertEquals("beth", strInspector.getPrimitiveJavaObject(inspector. getStructFieldData(value, fields.get(0)))); assertEquals(true, reader.next(key, value)); assertEquals("laurel", strInspector.getPrimitiveJavaObject(inspector. getStructFieldData(value, fields.get(0)))); assertEquals(true, reader.next(key, value)); assertEquals("hazen", strInspector.getPrimitiveJavaObject(inspector. getStructFieldData(value, fields.get(0)))); assertEquals(true, reader.next(key, value)); assertEquals("colin", strInspector.getPrimitiveJavaObject(inspector. getStructFieldData(value, fields.get(0)))); assertEquals(true, reader.next(key, value)); assertEquals("miles", strInspector.getPrimitiveJavaObject(inspector. getStructFieldData(value, fields.get(0)))); assertEquals(false, reader.next(key, value)); reader.close(); } /** * Create a mock execution environment that has enough detail that * ORC, vectorization, HiveInputFormat, and CombineHiveInputFormat don't * explode. 
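   * It points mapred.input.dir at the mock: partition directories, registers
   * MockFileSystem for the mock: scheme, and serializes a MapWork plan to
   * map.xml in the work directory so the input formats can pick it up.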
* @param workDir a local filesystem work directory * @param warehouseDir a mock filesystem warehouse directory * @param tableName the table name * @param objectInspector object inspector for the row * @param isVectorized should run vectorized * @return a JobConf that contains the necessary information * @throws IOException * @throws HiveException */ JobConf createMockExecutionEnvironment(Path workDir, Path warehouseDir, String tableName, ObjectInspector objectInspector, boolean isVectorized, int partitions ) throws IOException, HiveException { JobConf conf = new JobConf(); Utilities.clearWorkMap(conf); conf.set("hive.exec.plan", workDir.toString()); conf.set("mapred.job.tracker", "local"); String isVectorizedString = Boolean.toString(isVectorized); conf.set("hive.vectorized.execution.enabled", isVectorizedString); conf.set(Utilities.VECTOR_MODE, isVectorizedString); conf.set(Utilities.USE_VECTORIZED_INPUT_FILE_FORMAT, isVectorizedString); conf.set("fs.mock.impl", MockFileSystem.class.getName()); conf.set("mapred.mapper.class", ExecMapper.class.getName()); Path root = new Path(warehouseDir, tableName); // clean out previous contents ((MockFileSystem) root.getFileSystem(conf)).clear(); // build partition strings String[] partPath = new String[partitions]; StringBuilder buffer = new StringBuilder(); for(int p=0; p < partitions; ++p) { partPath[p] = new Path(root, "p=" + p).toString(); if (p != 0) { buffer.append(','); } buffer.append(partPath[p]); } conf.set("mapred.input.dir", buffer.toString()); StringBuilder columnIds = new StringBuilder(); StringBuilder columnNames = new StringBuilder(); StringBuilder columnTypes = new StringBuilder(); StructObjectInspector structOI = (StructObjectInspector) objectInspector; List<? extends StructField> fields = structOI.getAllStructFieldRefs(); int numCols = fields.size(); for(int i=0; i < numCols; ++i) { if (i != 0) { columnIds.append(','); columnNames.append(','); columnTypes.append(','); } columnIds.append(i); columnNames.append(fields.get(i).getFieldName()); columnTypes.append(fields.get(i).getFieldObjectInspector().getTypeName()); } conf.set("hive.io.file.readcolumn.ids", columnIds.toString()); conf.set("partition_columns", "p"); conf.set(serdeConstants.LIST_COLUMNS, columnNames.toString()); conf.set(serdeConstants.LIST_COLUMN_TYPES, columnTypes.toString()); MockFileSystem fs = (MockFileSystem) warehouseDir.getFileSystem(conf); fs.clear(); Properties tblProps = new Properties(); tblProps.put("name", tableName); tblProps.put("serialization.lib", OrcSerde.class.getName()); tblProps.put("columns", columnNames.toString()); tblProps.put("columns.types", columnTypes.toString()); TableDesc tbl = new TableDesc(OrcInputFormat.class, OrcOutputFormat.class, tblProps); MapWork mapWork = new MapWork(); mapWork.setVectorMode(isVectorized); if (isVectorized) { VectorizedRowBatchCtx vectorizedRowBatchCtx = new VectorizedRowBatchCtx(); vectorizedRowBatchCtx.init(structOI, new String[0]); mapWork.setVectorizedRowBatchCtx(vectorizedRowBatchCtx); } mapWork.setUseBucketizedHiveInputFormat(false); LinkedHashMap<Path, ArrayList<String>> aliasMap = new LinkedHashMap<>(); ArrayList<String> aliases = new ArrayList<String>(); aliases.add(tableName); LinkedHashMap<Path, PartitionDesc> partMap = new LinkedHashMap<>(); for(int p=0; p < partitions; ++p) { Path path = new Path(partPath[p]); aliasMap.put(path, aliases); LinkedHashMap<String, String> partSpec = new LinkedHashMap<String, String>(); PartitionDesc part = new PartitionDesc(tbl, partSpec); if (isVectorized) { 
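          // "MockInputFileFormatClassName" appears to be only a placeholder recorded in the
          // PartitionDesc for the vectorized path; these tests never instantiate that class.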
part.setVectorPartitionDesc(
            VectorPartitionDesc.createVectorizedInputFileFormat(
                "MockInputFileFormatClassName", false));
      }
      partMap.put(path, part);
    }
    mapWork.setPathToAliases(aliasMap);
    mapWork.setPathToPartitionInfo(partMap);

    // write the plan out
    FileSystem localFs = FileSystem.getLocal(conf).getRaw();
    Path mapXml = new Path(workDir, "map.xml");
    localFs.delete(mapXml, true);
    FSDataOutputStream planStream = localFs.create(mapXml);
    SerializationUtilities.serializePlan(mapWork, planStream);
    conf.setBoolean(Utilities.HAS_MAP_WORK, true);
    planStream.close();
    return conf;
  }

  /**
   * Set the mock blocks for a file after it has been written.
   * @param path the path to modify
   * @param conf the configuration
   * @param blocks the blocks to use
   * @throws IOException
   */
  static void setBlocks(Path path, Configuration conf,
                        MockBlock... blocks) throws IOException {
    FileSystem mockFs = path.getFileSystem(conf);
    MockOutputStream stream = (MockOutputStream) mockFs.create(path);
    stream.setBlocks(blocks);
  }

  static int getLength(Path path, Configuration conf) throws IOException {
    FileSystem mockFs = path.getFileSystem(conf);
    FileStatus stat = mockFs.getFileStatus(path);
    return (int) stat.getLen();
  }

  /**
   * Test vectorization, non-acid, non-combine.
   * @throws Exception
   */
  @Test
  public void testVectorization() throws Exception {
    // get the object inspector for MyRow
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
      inspector = (StructObjectInspector)
          ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
              ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    JobConf conf = createMockExecutionEnvironment(workDir, new Path("mock:///"),
        "vectorization", inspector, true, 1);

    // write the orc file to the mock file system
    Path path = new Path(conf.get("mapred.input.dir") + "/0_0");
    Writer writer = OrcFile.createWriter(path,
        OrcFile.writerOptions(conf).blockPadding(false)
            .bufferSize(1024).inspector(inspector));
    for(int i=0; i < 10; ++i) {
      writer.addRow(new MyRow(i, 2*i));
    }
    writer.close();
    setBlocks(path, conf, new MockBlock("host0", "host1"));

    // call getsplits
    HiveInputFormat<?,?> inputFormat =
        new HiveInputFormat<WritableComparable, Writable>();
    InputSplit[] splits = inputFormat.getSplits(conf, 10);
    assertEquals(1, splits.length);
    org.apache.hadoop.mapred.RecordReader<NullWritable, VectorizedRowBatch> reader =
        inputFormat.getRecordReader(splits[0], conf, Reporter.NULL);
    NullWritable key = reader.createKey();
    VectorizedRowBatch value = reader.createValue();
    assertEquals(true, reader.next(key, value));
    assertEquals(10, value.count());
    LongColumnVector col0 = (LongColumnVector) value.cols[0];
    for(int i=0; i < 10; i++) {
      assertEquals("checking " + i, i, col0.vector[i]);
    }
    assertEquals(false, reader.next(key, value));
  }

  /**
   * Test vectorization with buckets, non-acid, non-combine.
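   * Same flow as {@link #testVectorization}, but hive_metastoreConstants.BUCKET_COUNT
   * is set on the JobConf before calling getSplits.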
* @throws Exception */ @Test public void testVectorizationWithBuckets() throws Exception { // get the object inspector for MyRow StructObjectInspector inspector; synchronized (TestOrcFile.class) { inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } JobConf conf = createMockExecutionEnvironment(workDir, new Path("mock:///"), "vectorBuckets", inspector, true, 1); // write the orc file to the mock file system Path path = new Path(conf.get("mapred.input.dir") + "/0_0"); Writer writer = OrcFile.createWriter(path, OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for(int i=0; i < 10; ++i) { writer.addRow(new MyRow(i, 2*i)); } writer.close(); setBlocks(path, conf, new MockBlock("host0", "host1")); // call getsplits conf.setInt(hive_metastoreConstants.BUCKET_COUNT, 3); HiveInputFormat<?,?> inputFormat = new HiveInputFormat<WritableComparable, Writable>(); InputSplit[] splits = inputFormat.getSplits(conf, 10); assertEquals(1, splits.length); org.apache.hadoop.mapred.RecordReader<NullWritable, VectorizedRowBatch> reader = inputFormat.getRecordReader(splits[0], conf, Reporter.NULL); NullWritable key = reader.createKey(); VectorizedRowBatch value = reader.createValue(); assertEquals(true, reader.next(key, value)); assertEquals(10, value.count()); LongColumnVector col0 = (LongColumnVector) value.cols[0]; for(int i=0; i < 10; i++) { assertEquals("checking " + i, i, col0.vector[i]); } assertEquals(false, reader.next(key, value)); } // test acid with vectorization, no combine @Test public void testVectorizationWithAcid() throws Exception { StructObjectInspector inspector = new BigRowInspector(); JobConf conf = createMockExecutionEnvironment(workDir, new Path("mock:///"), "vectorizationAcid", inspector, true, 1); // write the orc file to the mock file system Path partDir = new Path(conf.get("mapred.input.dir")); OrcRecordUpdater writer = new OrcRecordUpdater(partDir, new AcidOutputFormat.Options(conf).maximumTransactionId(10) .writingBase(true).bucket(0).inspector(inspector).finalDestination(partDir)); for (int i = 0; i < 100; ++i) { BigRow row = new BigRow(i); writer.insert(10, row); } writer.close(false); Path path = new Path("mock:/vectorizationAcid/p=0/base_0000010/bucket_00000"); setBlocks(path, conf, new MockBlock("host0", "host1")); // call getsplits HiveInputFormat<?, ?> inputFormat = new HiveInputFormat<WritableComparable, Writable>(); InputSplit[] splits = inputFormat.getSplits(conf, 10); assertEquals(1, splits.length); conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, BigRow.getColumnNamesProperty()); conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, BigRow.getColumnTypesProperty()); HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true); org.apache.hadoop.mapred.RecordReader<NullWritable, VectorizedRowBatch> reader = inputFormat.getRecordReader(splits[0], conf, Reporter.NULL); NullWritable key = reader.createKey(); VectorizedRowBatch value = reader.createValue(); assertEquals(true, reader.next(key, value)); assertEquals(100, value.count()); LongColumnVector booleanColumn = (LongColumnVector) value.cols[0]; LongColumnVector byteColumn = (LongColumnVector) value.cols[1]; LongColumnVector shortColumn = (LongColumnVector) value.cols[2]; LongColumnVector intColumn = (LongColumnVector) value.cols[3]; LongColumnVector longColumn = (LongColumnVector) value.cols[4]; DoubleColumnVector floatColumn = (DoubleColumnVector) value.cols[5]; 
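    // BigRow's remaining fields surface as: double -> DoubleColumnVector,
    // string -> BytesColumnVector, decimal -> DecimalColumnVector,
    // date -> LongColumnVector (epoch days), timestamp -> TimestampColumnVector.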
DoubleColumnVector doubleColumn = (DoubleColumnVector) value.cols[6];
    BytesColumnVector stringColumn = (BytesColumnVector) value.cols[7];
    DecimalColumnVector decimalColumn = (DecimalColumnVector) value.cols[8];
    LongColumnVector dateColumn = (LongColumnVector) value.cols[9];
    TimestampColumnVector timestampColumn = (TimestampColumnVector) value.cols[10];
    for(int i=0; i < 100; i++) {
      assertEquals("checking boolean " + i, i % 2 == 0 ? 1 : 0, booleanColumn.vector[i]);
      assertEquals("checking byte " + i, (byte) i, byteColumn.vector[i]);
      assertEquals("checking short " + i, (short) i, shortColumn.vector[i]);
      assertEquals("checking int " + i, i, intColumn.vector[i]);
      assertEquals("checking long " + i, i, longColumn.vector[i]);
      assertEquals("checking float " + i, i, floatColumn.vector[i], 0.0001);
      assertEquals("checking double " + i, i, doubleColumn.vector[i], 0.0001);
      Text strValue = new Text();
      strValue.set(stringColumn.vector[i], stringColumn.start[i], stringColumn.length[i]);
      assertEquals("checking string " + i, new Text(Long.toHexString(i)), strValue);
      assertEquals("checking decimal " + i, HiveDecimal.create(i),
          decimalColumn.vector[i].getHiveDecimal());
      assertEquals("checking date " + i, i, dateColumn.vector[i]);
      long millis = (long) i * MILLIS_IN_DAY;
      millis -= LOCAL_TIMEZONE.getOffset(millis);
      assertEquals("checking timestamp " + i, millis, timestampColumn.getTime(i));
    }
    assertEquals(false, reader.next(key, value));
  }

  // test non-vectorized, non-acid, combine
  @Test
  public void testCombinationInputFormat() throws Exception {
    // get the object inspector for MyRow
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
      inspector = (StructObjectInspector)
          ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
              ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    JobConf conf = createMockExecutionEnvironment(workDir, new Path("mock:///"),
        "combination", inspector, false, 1);

    // write the orc file to the mock file system
    Path partDir = new Path(conf.get("mapred.input.dir"));
    Writer writer = OrcFile.createWriter(new Path(partDir, "0_0"),
        OrcFile.writerOptions(conf).blockPadding(false)
            .bufferSize(1024).inspector(inspector));
    for(int i=0; i < 10; ++i) {
      writer.addRow(new MyRow(i, 2*i));
    }
    writer.close();
    Path path = new Path("mock:/combination/p=0/0_0");
    setBlocks(path, conf, new MockBlock("host0", "host1"));
    MockFileSystem mockFs = (MockFileSystem) partDir.getFileSystem(conf);
    int length0 = getLength(path, conf);
    writer = OrcFile.createWriter(new Path(partDir, "1_0"),
        OrcFile.writerOptions(conf).blockPadding(false)
            .bufferSize(1024).inspector(inspector));
    for(int i=10; i < 20; ++i) {
      writer.addRow(new MyRow(i, 2*i));
    }
    writer.close();
    Path path1 = new Path("mock:/combination/p=0/1_0");
    setBlocks(path1, conf, new MockBlock("host1", "host2"));

    // call getsplits
    HiveInputFormat<?,?> inputFormat =
        new CombineHiveInputFormat<WritableComparable, Writable>();
    InputSplit[] splits = inputFormat.getSplits(conf, 1);
    assertEquals(1, splits.length);
    CombineHiveInputFormat.CombineHiveInputSplit split =
        (CombineHiveInputFormat.CombineHiveInputSplit) splits[0];

    // check split
    assertEquals(2, split.getNumPaths());
    assertEquals(partDir.toString() + "/0_0", split.getPath(0).toString());
    assertEquals(partDir.toString() + "/1_0", split.getPath(1).toString());
    assertEquals(length0, split.getLength(0));
    assertEquals(getLength(path1, conf), split.getLength(1));
    assertEquals(0, split.getOffset(0));
    assertEquals(0, split.getOffset(1));
    // hadoop-1 gets 3 and hadoop-2 gets 0.
*sigh* // best answer would be 1. assertTrue(3 >= split.getLocations().length); // read split org.apache.hadoop.mapred.RecordReader<CombineHiveKey, OrcStruct> reader = inputFormat.getRecordReader(split, conf, Reporter.NULL); CombineHiveKey key = reader.createKey(); OrcStruct value = reader.createValue(); for(int i=0; i < 20; i++) { assertEquals(true, reader.next(key, value)); assertEquals(i, ((IntWritable) value.getFieldValue(0)).get()); } assertEquals(false, reader.next(key, value)); } // test non-vectorized, acid, combine @Test public void testCombinationInputFormatWithAcid() throws Exception { // get the object inspector for MyRow StructObjectInspector inspector; final int PARTITIONS = 2; final int BUCKETS = 3; synchronized (TestOrcFile.class) { inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } JobConf conf = createMockExecutionEnvironment(workDir, new Path("mock:///"), "combinationAcid", inspector, false, PARTITIONS); // write the orc file to the mock file system Path[] partDir = new Path[PARTITIONS]; String[] paths = conf.getStrings("mapred.input.dir"); for(int p=0; p < PARTITIONS; ++p) { partDir[p] = new Path(paths[p]); } // write a base file in partition 0 OrcRecordUpdater writer = new OrcRecordUpdater(partDir[0], new AcidOutputFormat.Options(conf).maximumTransactionId(10) .writingBase(true).bucket(0).inspector(inspector).finalDestination(partDir[0])); for(int i=0; i < 10; ++i) { writer.insert(10, new MyRow(i, 2 * i)); } writer.close(false); // base file Path base0 = new Path("mock:/combinationAcid/p=0/base_0000010/bucket_00000"); setBlocks(base0, conf, new MockBlock("host1", "host2")); // write a delta file in partition 0 writer = new OrcRecordUpdater(partDir[0], new AcidOutputFormat.Options(conf).maximumTransactionId(10) .writingBase(true).bucket(1).inspector(inspector).finalDestination(partDir[0])); for(int i=10; i < 20; ++i) { writer.insert(10, new MyRow(i, 2*i)); } writer.close(false); Path base1 = new Path("mock:/combinationAcid/p=0/base_0000010/bucket_00001"); setBlocks(base1, conf, new MockBlock("host1", "host2")); // write three files in partition 1 for(int bucket=0; bucket < BUCKETS; ++bucket) { Path path = new Path(partDir[1], "00000" + bucket + "_0"); Writer orc = OrcFile.createWriter( path, OrcFile.writerOptions(conf) .blockPadding(false) .bufferSize(1024) .inspector(inspector)); orc.addRow(new MyRow(1, 2)); orc.close(); setBlocks(path, conf, new MockBlock("host3", "host4")); } // call getsplits conf.setInt(hive_metastoreConstants.BUCKET_COUNT, BUCKETS); HiveInputFormat<?,?> inputFormat = new CombineHiveInputFormat<WritableComparable, Writable>(); InputSplit[] splits = inputFormat.getSplits(conf, 1); assertEquals(3, splits.length); HiveInputFormat.HiveInputSplit split = (HiveInputFormat.HiveInputSplit) splits[0]; assertEquals("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", split.inputFormatClassName()); assertEquals("mock:/combinationAcid/p=0/base_0000010/bucket_00000", split.getPath().toString()); assertEquals(0, split.getStart()); assertEquals(607, split.getLength()); split = (HiveInputFormat.HiveInputSplit) splits[1]; assertEquals("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", split.inputFormatClassName()); assertEquals("mock:/combinationAcid/p=0/base_0000010/bucket_00001", split.getPath().toString()); assertEquals(0, split.getStart()); assertEquals(629, split.getLength()); CombineHiveInputFormat.CombineHiveInputSplit combineSplit = 
(CombineHiveInputFormat.CombineHiveInputSplit) splits[2]; assertEquals(BUCKETS, combineSplit.getNumPaths()); for(int bucket=0; bucket < BUCKETS; ++bucket) { assertEquals("mock:/combinationAcid/p=1/00000" + bucket + "_0", combineSplit.getPath(bucket).toString()); assertEquals(0, combineSplit.getOffset(bucket)); assertEquals(241, combineSplit.getLength(bucket)); } String[] hosts = combineSplit.getLocations(); assertEquals(2, hosts.length); } @Test public void testSetSearchArgument() throws Exception { Reader.Options options = new Reader.Options(); List<OrcProto.Type> types = new ArrayList<OrcProto.Type>(); OrcProto.Type.Builder builder = OrcProto.Type.newBuilder(); builder.setKind(OrcProto.Type.Kind.STRUCT) .addAllFieldNames(Arrays.asList("op", "otid", "bucket", "rowid", "ctid", "row")) .addAllSubtypes(Arrays.asList(1,2,3,4,5,6)); types.add(builder.build()); builder.clear().setKind(OrcProto.Type.Kind.INT); types.add(builder.build()); types.add(builder.build()); types.add(builder.build()); types.add(builder.build()); types.add(builder.build()); builder.clear().setKind(OrcProto.Type.Kind.STRUCT) .addAllFieldNames(Arrays.asList("url", "purchase", "cost", "store")) .addAllSubtypes(Arrays.asList(7, 8, 9, 10)); types.add(builder.build()); builder.clear().setKind(OrcProto.Type.Kind.STRING); types.add(builder.build()); builder.clear().setKind(OrcProto.Type.Kind.INT); types.add(builder.build()); types.add(builder.build()); types.add(builder.build()); SearchArgument isNull = SearchArgumentFactory.newBuilder() .startAnd().isNull("cost", PredicateLeaf.Type.LONG).end().build(); conf.set(ConvertAstToSearchArg.SARG_PUSHDOWN, toKryo(isNull)); conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, "url,cost"); options.include(new boolean[]{true, true, false, true, false}); OrcInputFormat.setSearchArgument(options, types, conf, false); String[] colNames = options.getColumnNames(); assertEquals(null, colNames[0]); assertEquals("url", colNames[1]); assertEquals(null, colNames[2]); assertEquals("cost", colNames[3]); assertEquals(null, colNames[4]); SearchArgument arg = options.getSearchArgument(); List<PredicateLeaf> leaves = arg.getLeaves(); assertEquals("cost", leaves.get(0).getColumnName()); assertEquals(PredicateLeaf.Operator.IS_NULL, leaves.get(0).getOperator()); } @Test public void testSplitElimination() throws Exception { Properties properties = new Properties(); properties.setProperty("columns", "z,r"); properties.setProperty("columns.types", "int:struct<x:int,y:int>"); StructObjectInspector inspector; synchronized (TestOrcFile.class) { inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(NestedRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } AbstractSerDe serde = new OrcSerde(); OutputFormat<?, ?> outFormat = new OrcOutputFormat(); conf.setInt("mapred.max.split.size", 50); RecordWriter writer = outFormat.getRecordWriter(fs, conf, testFilePath.toString(), Reporter.NULL); writer.write(NullWritable.get(), serde.serialize(new NestedRow(1,2,3), inspector)); writer.write(NullWritable.get(), serde.serialize(new NestedRow(4,5,6), inspector)); writer.write(NullWritable.get(), serde.serialize(new NestedRow(7,8,9), inspector)); writer.close(Reporter.NULL); serde = new OrcSerde(); SearchArgument sarg = SearchArgumentFactory.newBuilder() .startAnd() .lessThan("z", PredicateLeaf.Type.LONG, new Long(0)) .end() .build(); conf.set("sarg.pushdown", toKryo(sarg)); conf.set("hive.io.file.readcolumn.names", "z,r"); SerDeUtils.initializeSerDe(serde, conf, properties, 
null); inspector = (StructObjectInspector) serde.getObjectInspector(); InputFormat<?,?> in = new OrcInputFormat(); FileInputFormat.setInputPaths(conf, testFilePath.toString()); InputSplit[] splits = in.getSplits(conf, 1); assertEquals(0, splits.length); } @Test public void testSplitEliminationNullStats() throws Exception { Properties properties = new Properties(); StructObjectInspector inspector = createSoi(); AbstractSerDe serde = new OrcSerde(); OutputFormat<?, ?> outFormat = new OrcOutputFormat(); conf.setInt("mapred.max.split.size", 50); RecordWriter writer = outFormat.getRecordWriter(fs, conf, testFilePath.toString(), Reporter.NULL); writer.write(NullWritable.get(), serde.serialize(new SimpleRow(null), inspector)); writer.write(NullWritable.get(), serde.serialize(new SimpleRow(null), inspector)); writer.write(NullWritable.get(), serde.serialize(new SimpleRow(null), inspector)); writer.close(Reporter.NULL); serde = new OrcSerde(); SearchArgument sarg = SearchArgumentFactory.newBuilder() .startAnd() .lessThan("z", PredicateLeaf.Type.STRING, new String("foo")) .end() .build(); conf.set("sarg.pushdown", toKryo(sarg)); conf.set("hive.io.file.readcolumn.names", "z"); properties.setProperty("columns", "z"); properties.setProperty("columns.types", "string"); SerDeUtils.initializeSerDe(serde, conf, properties, null); inspector = (StructObjectInspector) serde.getObjectInspector(); InputFormat<?,?> in = new OrcInputFormat(); FileInputFormat.setInputPaths(conf, testFilePath.toString()); InputSplit[] splits = in.getSplits(conf, 1); assertEquals(0, splits.length); } @Test public void testDoAs() throws Exception { conf.setInt(ConfVars.HIVE_ORC_COMPUTE_SPLITS_NUM_THREADS.varname, 1); conf.set(ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "ETL"); conf.setBoolean(ConfVars.HIVE_IN_TEST.varname, true); conf.setClass("fs.mock.impl", MockFileSystem.class, FileSystem.class); String badUser = UserGroupInformation.getCurrentUser().getShortUserName() + "-foo"; MockFileSystem.setBlockedUgi(badUser); MockFileSystem.clearGlobalFiles(); OrcInputFormat.Context.resetThreadPool(); // We need the size above to take effect. try { // OrcInputFormat will get a mock fs from FileSystem.get; add global files. MockFileSystem.addGlobalFile(new MockFile("mock:/ugi/1/file", 10000, createMockOrcFile(197, 300, 600), new MockBlock("host1-1", "host1-2", "host1-3"))); MockFileSystem.addGlobalFile(new MockFile("mock:/ugi/2/file", 10000, createMockOrcFile(197, 300, 600), new MockBlock("host1-1", "host1-2", "host1-3"))); FileInputFormat.setInputPaths(conf, "mock:/ugi/1"); UserGroupInformation ugi = UserGroupInformation.createUserForTesting(badUser, new String[0]); assertEquals(0, OrcInputFormat.Context.getCurrentThreadPoolSize()); try { ugi.doAs(new PrivilegedExceptionAction<Void>() { @Override public Void run() throws Exception { OrcInputFormat.generateSplitsInfo(conf, new Context(conf, -1, null)); return null; } }); fail("Didn't throw"); } catch (Exception ex) { Throwable cause = ex; boolean found = false; while (cause != null) { if (cause instanceof MockFileSystem.MockAccessDenied) { found = true; // Expected. break; } cause = cause.getCause(); } if (!found) throw ex; // Unexpected. 
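      // Only MockFileSystem.MockAccessDenied anywhere in the cause chain counts as the
      // expected failure for the blocked user; any other exception is rethrown above.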
} assertEquals(1, OrcInputFormat.Context.getCurrentThreadPoolSize()); FileInputFormat.setInputPaths(conf, "mock:/ugi/2"); List<OrcSplit> splits = OrcInputFormat.generateSplitsInfo(conf, new Context(conf, -1, null)); assertEquals(1, splits.size()); } finally { MockFileSystem.clearGlobalFiles(); } } private StructObjectInspector createSoi() { synchronized (TestOrcFile.class) { return (StructObjectInspector)ObjectInspectorFactory.getReflectionObjectInspector( SimpleRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } } @Test public void testSplitGenReadOps() throws Exception { MockFileSystem fs = new MockFileSystem(conf); conf.set("mapred.input.dir", "mock:///mocktable"); conf.set("fs.defaultFS", "mock:///"); conf.set("fs.mock.impl", MockFileSystem.class.getName()); MockPath mockPath = new MockPath(fs, "mock:///mocktable"); StructObjectInspector inspector; synchronized (TestOrcFile.class) { inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for(int i=0; i < 10; ++i) { writer.addRow(new MyRow(i, 2*i)); } writer.close(); writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for(int i=0; i < 10; ++i) { writer.addRow(new MyRow(i, 2*i)); } writer.close(); int readOpsBefore = -1; for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsBefore = statistics.getReadOps(); } } assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1); OrcInputFormat orcInputFormat = new OrcInputFormat(); InputSplit[] splits = orcInputFormat.getSplits(conf, 2); int readOpsDelta = -1; for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsDelta = statistics.getReadOps() - readOpsBefore; } } // call-1: listLocatedStatus - mock:/mocktable // call-2: open - mock:/mocktable/0_0 // call-3: open - mock:/mocktable/0_1 assertEquals(3, readOpsDelta); assertEquals(2, splits.length); // revert back to local fs conf.set("fs.defaultFS", "file:///"); } @Test public void testSplitGenReadOpsLocalCache() throws Exception { MockFileSystem fs = new MockFileSystem(conf); // creates the static cache MockPath mockPath = new MockPath(fs, "mock:///mocktbl"); conf.set(ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_MEMORY_SIZE.varname, "0"); conf.set("mapred.input.dir", mockPath.toString()); conf.set("fs.defaultFS", "mock:///"); conf.set("fs.mock.impl", MockFileSystem.class.getName()); StructObjectInspector inspector; synchronized (TestOrcFile.class) { inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for (int i = 0; i < 10; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for (int i = 0; i < 10; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); int readOpsBefore = -1; for 
(FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsBefore = statistics.getReadOps(); } } assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1); OrcInputFormat orcInputFormat = new OrcInputFormat(); InputSplit[] splits = orcInputFormat.getSplits(conf, 2); assertEquals(2, splits.length); int readOpsDelta = -1; for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsDelta = statistics.getReadOps() - readOpsBefore; } } // call-1: listLocatedStatus - mock:/mocktbl // call-2: open - mock:/mocktbl/0_0 // call-3: open - mock:/mocktbl/0_1 assertEquals(3, readOpsDelta); // force BI to avoid reading footers conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "BI"); for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsBefore = statistics.getReadOps(); } } orcInputFormat = new OrcInputFormat(); splits = orcInputFormat.getSplits(conf, 2); assertEquals(2, splits.length); for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsDelta = statistics.getReadOps() - readOpsBefore; } } // call-1: listLocatedStatus - mock:/mocktbl assertEquals(1, readOpsDelta); // enable cache and use default strategy conf.set(ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_MEMORY_SIZE.varname, "10Mb"); conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "HYBRID"); for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsBefore = statistics.getReadOps(); } } orcInputFormat = new OrcInputFormat(); splits = orcInputFormat.getSplits(conf, 2); assertEquals(2, splits.length); for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsDelta = statistics.getReadOps() - readOpsBefore; } } // call-1: listLocatedStatus - mock:/mocktbl // call-2: open - mock:/mocktbl/0_0 // call-3: open - mock:/mocktbl/0_1 assertEquals(3, readOpsDelta); for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsBefore = statistics.getReadOps(); } } orcInputFormat = new OrcInputFormat(); splits = orcInputFormat.getSplits(conf, 2); assertEquals(2, splits.length); for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsDelta = statistics.getReadOps() - readOpsBefore; } } // call-1: listLocatedStatus - mock:/mocktbl assertEquals(1, readOpsDelta); // revert back to local fs conf.set("fs.defaultFS", "file:///"); } @Test public void testSplitGenReadOpsLocalCacheChangeFileLen() throws Exception { MockFileSystem fs = new MockFileSystem(conf); // creates the static cache MockPath mockPath = new MockPath(fs, "mock:///mocktbl1"); conf.set("mapred.input.dir", mockPath.toString()); conf.set("fs.defaultFS", "mock:///"); conf.set("fs.mock.impl", MockFileSystem.class.getName()); StructObjectInspector inspector; synchronized (TestOrcFile.class) { inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false) 
.bufferSize(1024).inspector(inspector)); for (int i = 0; i < 10; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for (int i = 0; i < 10; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); int readOpsBefore = -1; for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsBefore = statistics.getReadOps(); } } assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1); OrcInputFormat orcInputFormat = new OrcInputFormat(); InputSplit[] splits = orcInputFormat.getSplits(conf, 2); assertEquals(2, splits.length); int readOpsDelta = -1; for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsDelta = statistics.getReadOps() - readOpsBefore; } } // call-1: listLocatedStatus - mock:/mocktable // call-2: open - mock:/mocktbl1/0_0 // call-3: open - mock:/mocktbl1/0_1 assertEquals(3, readOpsDelta); // change file length and look for cache misses fs.clear(); writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for (int i = 0; i < 100; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for (int i = 0; i < 100; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsBefore = statistics.getReadOps(); } } orcInputFormat = new OrcInputFormat(); splits = orcInputFormat.getSplits(conf, 2); assertEquals(2, splits.length); for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsDelta = statistics.getReadOps() - readOpsBefore; } } // call-1: listLocatedStatus - mock:/mocktable // call-2: open - mock:/mocktbl1/0_0 // call-3: open - mock:/mocktbl1/0_1 assertEquals(3, readOpsDelta); for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsBefore = statistics.getReadOps(); } } orcInputFormat = new OrcInputFormat(); splits = orcInputFormat.getSplits(conf, 2); assertEquals(2, splits.length); for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsDelta = statistics.getReadOps() - readOpsBefore; } } // call-1: listLocatedStatus - mock:/mocktbl1 assertEquals(1, readOpsDelta); // revert back to local fs conf.set("fs.defaultFS", "file:///"); } @Test public void testSplitGenReadOpsLocalCacheChangeModificationTime() throws Exception { MockFileSystem fs = new MockFileSystem(conf); // creates the static cache MockPath mockPath = new MockPath(fs, "mock:///mocktbl2"); conf.set("hive.orc.cache.use.soft.references", "true"); conf.set("mapred.input.dir", mockPath.toString()); conf.set("fs.defaultFS", "mock:///"); conf.set("fs.mock.impl", MockFileSystem.class.getName()); StructObjectInspector inspector; synchronized (TestOrcFile.class) { inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, 
ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for (int i = 0; i < 10; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for (int i = 0; i < 10; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); int readOpsBefore = -1; for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsBefore = statistics.getReadOps(); } } assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1); OrcInputFormat orcInputFormat = new OrcInputFormat(); InputSplit[] splits = orcInputFormat.getSplits(conf, 2); assertEquals(2, splits.length); int readOpsDelta = -1; for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsDelta = statistics.getReadOps() - readOpsBefore; } } // call-1: listLocatedStatus - mock:/mocktbl2 // call-2: open - mock:/mocktbl2/0_0 // call-3: open - mock:/mocktbl2/0_1 assertEquals(3, readOpsDelta); // change file modification time and look for cache misses FileSystem fs1 = FileSystem.get(conf); MockFile mockFile = ((MockFileSystem) fs1).findFile(new Path(mockPath + "/0_0")); ((MockFileSystem) fs1).touch(mockFile); for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsBefore = statistics.getReadOps(); } } orcInputFormat = new OrcInputFormat(); splits = orcInputFormat.getSplits(conf, 2); assertEquals(2, splits.length); for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsDelta = statistics.getReadOps() - readOpsBefore; } } // call-1: listLocatedStatus - mock:/mocktbl2 // call-2: open - mock:/mocktbl2/0_1 assertEquals(2, readOpsDelta); // touch the next file fs1 = FileSystem.get(conf); mockFile = ((MockFileSystem) fs1).findFile(new Path(mockPath + "/0_1")); ((MockFileSystem) fs1).touch(mockFile); for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsBefore = statistics.getReadOps(); } } orcInputFormat = new OrcInputFormat(); splits = orcInputFormat.getSplits(conf, 2); assertEquals(2, splits.length); for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsDelta = statistics.getReadOps() - readOpsBefore; } } // call-1: listLocatedStatus - mock:/mocktbl2 // call-2: open - mock:/mocktbl2/0_0 assertEquals(2, readOpsDelta); for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsBefore = statistics.getReadOps(); } } orcInputFormat = new OrcInputFormat(); splits = orcInputFormat.getSplits(conf, 2); assertEquals(2, splits.length); for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsDelta = statistics.getReadOps() - readOpsBefore; } } // call-1: listLocatedStatus - mock:/mocktbl2 assertEquals(1, readOpsDelta); // revert back to local fs conf.set("fs.defaultFS", "file:///"); } @Test public void 
testNonVectorReaderNoFooterSerialize() throws Exception { MockFileSystem fs = new MockFileSystem(conf); MockPath mockPath = new MockPath(fs, "mock:///mocktable1"); conf.set("hive.orc.splits.include.file.footer", "false"); conf.set("mapred.input.dir", mockPath.toString()); conf.set("fs.defaultFS", "mock:///"); conf.set("fs.mock.impl", MockFileSystem.class.getName()); StructObjectInspector inspector; synchronized (TestOrcFile.class) { inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for (int i = 0; i < 10; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for (int i = 0; i < 10; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); OrcInputFormat orcInputFormat = new OrcInputFormat(); InputSplit[] splits = orcInputFormat.getSplits(conf, 2); assertEquals(2, splits.length); int readOpsBefore = -1; for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsBefore = statistics.getReadOps(); } } assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1); for (InputSplit split : splits) { assertTrue("OrcSplit is expected", split instanceof OrcSplit); // ETL strategies will have start=3 (start of first stripe) assertTrue(split.toString().contains("start=3")); assertTrue(split.toString().contains("hasFooter=false")); assertTrue(split.toString().contains("hasBase=true")); assertTrue(split.toString().contains("deltas=0")); if (split instanceof OrcSplit) { assertFalse("No footer serialize test for non-vector reader, hasFooter is not expected in" + " orc splits.", ((OrcSplit) split).hasFooter()); } orcInputFormat.getRecordReader(split, conf, null); } int readOpsDelta = -1; for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsDelta = statistics.getReadOps() - readOpsBefore; } } // call-1: open to read footer - split 1 => mock:/mocktable1/0_0 // call-2: open to read data - split 1 => mock:/mocktable1/0_0 // call-3: open to read footer - split 2 => mock:/mocktable1/0_1 // call-4: open to read data - split 2 => mock:/mocktable1/0_1 assertEquals(4, readOpsDelta); // revert back to local fs conf.set("fs.defaultFS", "file:///"); } @Test public void testNonVectorReaderFooterSerialize() throws Exception { MockFileSystem fs = new MockFileSystem(conf); MockPath mockPath = new MockPath(fs, "mock:///mocktable2"); conf.set("hive.orc.splits.include.file.footer", "true"); conf.set("mapred.input.dir", mockPath.toString()); conf.set("fs.defaultFS", "mock:///"); conf.set("fs.mock.impl", MockFileSystem.class.getName()); StructObjectInspector inspector; synchronized (TestOrcFile.class) { inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for (int i = 0; i < 10; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); writer = OrcFile.createWriter(new Path(mockPath + 
"/0_1"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for (int i = 0; i < 10; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); OrcInputFormat orcInputFormat = new OrcInputFormat(); InputSplit[] splits = orcInputFormat.getSplits(conf, 2); assertEquals(2, splits.length); int readOpsBefore = -1; for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsBefore = statistics.getReadOps(); } } assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1); for (InputSplit split : splits) { assertTrue("OrcSplit is expected", split instanceof OrcSplit); // ETL strategies will have start=3 (start of first stripe) assertTrue(split.toString().contains("start=3")); assertTrue(split.toString().contains("hasFooter=true")); assertTrue(split.toString().contains("hasBase=true")); assertTrue(split.toString().contains("deltas=0")); if (split instanceof OrcSplit) { assertTrue("Footer serialize test for non-vector reader, hasFooter is expected in" + " orc splits.", ((OrcSplit) split).hasFooter()); } orcInputFormat.getRecordReader(split, conf, null); } int readOpsDelta = -1; for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsDelta = statistics.getReadOps() - readOpsBefore; } } // call-1: open to read data - split 1 => mock:/mocktable2/0_0 // call-2: open to read data - split 2 => mock:/mocktable2/0_1 assertEquals(2, readOpsDelta); // revert back to local fs conf.set("fs.defaultFS", "file:///"); } @Test public void testVectorReaderNoFooterSerialize() throws Exception { MockFileSystem fs = new MockFileSystem(conf); MockPath mockPath = new MockPath(fs, "mock:///mocktable3"); conf.set("hive.orc.splits.include.file.footer", "false"); conf.set("mapred.input.dir", mockPath.toString()); conf.set("fs.defaultFS", "mock:///"); conf.set("fs.mock.impl", MockFileSystem.class.getName()); StructObjectInspector inspector; synchronized (TestOrcFile.class) { inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } JobConf jobConf = createMockExecutionEnvironment(workDir, new Path("mock:///"), "mocktable3", inspector, true, 0); Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for (int i = 0; i < 10; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for (int i = 0; i < 10; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); OrcInputFormat orcInputFormat = new OrcInputFormat(); InputSplit[] splits = orcInputFormat.getSplits(conf, 2); assertEquals(2, splits.length); int readOpsBefore = -1; for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsBefore = statistics.getReadOps(); } } assertTrue("MockFS has stats. 
Read ops not expected to be -1", readOpsBefore != -1); for (InputSplit split : splits) { assertTrue("OrcSplit is expected", split instanceof OrcSplit); // ETL strategies will have start=3 (start of first stripe) assertTrue(split.toString().contains("start=3")); assertTrue(split.toString().contains("hasFooter=false")); assertTrue(split.toString().contains("hasBase=true")); assertTrue(split.toString().contains("deltas=0")); if (split instanceof OrcSplit) { assertFalse("No footer serialize test for vector reader, hasFooter is not expected in" + " orc splits.", ((OrcSplit) split).hasFooter()); } orcInputFormat.getRecordReader(split, jobConf, Reporter.NULL); } int readOpsDelta = -1; for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsDelta = statistics.getReadOps() - readOpsBefore; } } // call-1: open to read footer - split 1 => mock:/mocktable3/0_0 // call-2: open to read data - split 1 => mock:/mocktable3/0_0 // call-3: open to read footer - split 2 => mock:/mocktable3/0_1 // call-4: open to read data - split 2 => mock:/mocktable3/0_1 assertEquals(4, readOpsDelta); // revert back to local fs conf.set("fs.defaultFS", "file:///"); } @Test public void testVectorReaderFooterSerialize() throws Exception { MockFileSystem fs = new MockFileSystem(conf); MockPath mockPath = new MockPath(fs, "mock:///mocktable4"); conf.set("hive.orc.splits.include.file.footer", "true"); conf.set("mapred.input.dir", mockPath.toString()); conf.set("fs.defaultFS", "mock:///"); conf.set("fs.mock.impl", MockFileSystem.class.getName()); StructObjectInspector inspector; synchronized (TestOrcFile.class) { inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } JobConf jobConf = createMockExecutionEnvironment(workDir, new Path("mock:///"), "mocktable4", inspector, true, 0); Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for (int i = 0; i < 10; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for (int i = 0; i < 10; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); OrcInputFormat orcInputFormat = new OrcInputFormat(); InputSplit[] splits = orcInputFormat.getSplits(conf, 2); assertEquals(2, splits.length); int readOpsBefore = -1; for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsBefore = statistics.getReadOps(); } } assertTrue("MockFS has stats. 
Read ops not expected to be -1", readOpsBefore != -1); for (InputSplit split : splits) { assertTrue("OrcSplit is expected", split instanceof OrcSplit); // ETL strategies will have start=3 (start of first stripe) assertTrue(split.toString().contains("start=3")); assertTrue(split.toString().contains("hasFooter=true")); assertTrue(split.toString().contains("hasBase=true")); assertTrue(split.toString().contains("deltas=0")); if (split instanceof OrcSplit) { assertTrue("Footer serialize test for vector reader, hasFooter is expected in" + " orc splits.", ((OrcSplit) split).hasFooter()); } orcInputFormat.getRecordReader(split, jobConf, Reporter.NULL); } int readOpsDelta = -1; for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsDelta = statistics.getReadOps() - readOpsBefore; } } // call-1: open to read data - split 1 => mock:/mocktable4/0_0 // call-2: open to read data - split 2 => mock:/mocktable4/0_1 assertEquals(2, readOpsDelta); // revert back to local fs conf.set("fs.defaultFS", "file:///"); } @Test public void testACIDReaderNoFooterSerialize() throws Exception { MockFileSystem fs = new MockFileSystem(conf); MockPath mockPath = new MockPath(fs, "mock:///mocktable5"); conf.set("hive.transactional.table.scan", "true"); conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty()); conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty()); conf.set("hive.orc.splits.include.file.footer", "false"); conf.set("mapred.input.dir", mockPath.toString()); conf.set("fs.defaultFS", "mock:///"); conf.set("fs.mock.impl", MockFileSystem.class.getName()); StructObjectInspector inspector; synchronized (TestOrcFile.class) { inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for (int i = 0; i < 10; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for (int i = 0; i < 10; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); OrcInputFormat orcInputFormat = new OrcInputFormat(); InputSplit[] splits = orcInputFormat.getSplits(conf, 2); assertEquals(2, splits.length); int readOpsBefore = -1; for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsBefore = statistics.getReadOps(); } } assertTrue("MockFS has stats. 
Read ops not expected to be -1", readOpsBefore != -1); for (InputSplit split : splits) { assertTrue("OrcSplit is expected", split instanceof OrcSplit); // ETL strategies will have start=3 (start of first stripe) assertTrue(split.toString().contains("start=3")); assertTrue(split.toString().contains("hasFooter=false")); assertTrue(split.toString().contains("hasBase=true")); assertTrue(split.toString().contains("deltas=0")); if (split instanceof OrcSplit) { assertFalse("No footer serialize test for non-vector reader, hasFooter is not expected in" + " orc splits.", ((OrcSplit) split).hasFooter()); } orcInputFormat.getRecordReader(split, conf, Reporter.NULL); } int readOpsDelta = -1; for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsDelta = statistics.getReadOps() - readOpsBefore; } } // call-1: open to read footer - split 1 => mock:/mocktable5/0_0 // call-2: open to read data - split 1 => mock:/mocktable5/0_0 // call-3: open to read footer - split 2 => mock:/mocktable5/0_1 // call-4: open to read data - split 2 => mock:/mocktable5/0_1 assertEquals(4, readOpsDelta); // revert back to local fs conf.set("fs.defaultFS", "file:///"); } @Test public void testACIDReaderFooterSerialize() throws Exception { MockFileSystem fs = new MockFileSystem(conf); MockPath mockPath = new MockPath(fs, "mock:///mocktable6"); conf.set("hive.transactional.table.scan", "true"); conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty()); conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty()); conf.set("hive.orc.splits.include.file.footer", "true"); conf.set("mapred.input.dir", mockPath.toString()); conf.set("fs.defaultFS", "mock:///"); conf.set("fs.mock.impl", MockFileSystem.class.getName()); StructObjectInspector inspector; synchronized (TestOrcFile.class) { inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for (int i = 0; i < 10; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); writer = OrcFile.createWriter(new Path(mockPath + "/0_1"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for (int i = 0; i < 10; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); OrcInputFormat orcInputFormat = new OrcInputFormat(); InputSplit[] splits = orcInputFormat.getSplits(conf, 2); assertEquals(2, splits.length); int readOpsBefore = -1; for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsBefore = statistics.getReadOps(); } } assertTrue("MockFS has stats. 
Read ops not expected to be -1", readOpsBefore != -1); for (InputSplit split : splits) { assertTrue("OrcSplit is expected", split instanceof OrcSplit); // ETL strategies will have start=3 (start of first stripe) assertTrue(split.toString().contains("start=3")); assertTrue(split.toString().contains("hasFooter=true")); assertTrue(split.toString().contains("hasBase=true")); assertTrue(split.toString().contains("deltas=0")); if (split instanceof OrcSplit) { assertTrue("Footer serialize test for ACID reader, hasFooter is expected in" + " orc splits.", ((OrcSplit) split).hasFooter()); } orcInputFormat.getRecordReader(split, conf, Reporter.NULL); } int readOpsDelta = -1; for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsDelta = statistics.getReadOps() - readOpsBefore; } } // call-1: open to read data - split 1 => mock:/mocktable6/0_0 // call-2: open to read data - split 2 => mock:/mocktable6/0_1 assertEquals(2, readOpsDelta); // revert back to local fs conf.set("fs.defaultFS", "file:///"); } @Test public void testACIDReaderNoFooterSerializeWithDeltas() throws Exception { MockFileSystem fs = new MockFileSystem(conf); MockPath mockPath = new MockPath(fs, "mock:///mocktable7"); conf.set("hive.transactional.table.scan", "true"); conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty()); conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty()); conf.set("hive.orc.splits.include.file.footer", "false"); conf.set("mapred.input.dir", mockPath.toString()); conf.set("fs.defaultFS", "mock:///"); conf.set("fs.mock.impl", MockFileSystem.class.getName()); StructObjectInspector inspector; synchronized (TestOrcFile.class) { inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for (int i = 0; i < 10; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); writer = OrcFile.createWriter(new Path(new Path(mockPath + "/delta_001_002") + "/0_1"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for (int i = 0; i < 10; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); OrcInputFormat orcInputFormat = new OrcInputFormat(); InputSplit[] splits = orcInputFormat.getSplits(conf, 2); assertEquals(1, splits.length); int readOpsBefore = -1; for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsBefore = statistics.getReadOps(); } } assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1); for (InputSplit split : splits) { assertTrue("OrcSplit is expected", split instanceof OrcSplit); // ETL strategies will have start=3 (start of first stripe) assertTrue(split.toString().contains("start=3")); assertTrue(split.toString().contains("hasFooter=false")); assertTrue(split.toString().contains("hasBase=true")); // NOTE: don't be surprised if deltas value is different // in older release deltas=2 as min and max transaction are added separately to delta list. 
// in newer release since both of them are put together deltas=1 assertTrue(split.toString().contains("deltas=1")); if (split instanceof OrcSplit) { assertFalse("No footer serialize test for ACID reader, hasFooter is not expected in" + " orc splits.", ((OrcSplit) split).hasFooter()); } orcInputFormat.getRecordReader(split, conf, Reporter.NULL); } int readOpsDelta = -1; for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsDelta = statistics.getReadOps() - readOpsBefore; } } // call-1: open to read footer - split 1 => mock:/mocktable7/0_0 // call-2: open to read data - split 1 => mock:/mocktable7/0_0 // call-3: open side file (flush length) of delta directory // call-4: fs.exists() check for delta_xxx_xxx/bucket_00000 file assertEquals(4, readOpsDelta); // revert back to local fs conf.set("fs.defaultFS", "file:///"); } @Test public void testACIDReaderFooterSerializeWithDeltas() throws Exception { MockFileSystem fs = new MockFileSystem(conf); MockPath mockPath = new MockPath(fs, "mock:///mocktable8"); conf.set("hive.transactional.table.scan", "true"); conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty()); conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty()); conf.set("hive.orc.splits.include.file.footer", "true"); conf.set("mapred.input.dir", mockPath.toString()); conf.set("fs.defaultFS", "mock:///"); conf.set("fs.mock.impl", MockFileSystem.class.getName()); StructObjectInspector inspector; synchronized (TestOrcFile.class) { inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for (int i = 0; i < 10; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); writer = OrcFile.createWriter(new Path(new Path(mockPath + "/delta_001_002") + "/0_1"), OrcFile.writerOptions(conf).blockPadding(false) .bufferSize(1024).inspector(inspector)); for (int i = 0; i < 10; ++i) { writer.addRow(new MyRow(i, 2 * i)); } writer.close(); OrcInputFormat orcInputFormat = new OrcInputFormat(); InputSplit[] splits = orcInputFormat.getSplits(conf, 2); assertEquals(1, splits.length); int readOpsBefore = -1; for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) { if (statistics.getScheme().equalsIgnoreCase("mock")) { readOpsBefore = statistics.getReadOps(); } } assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1); for (InputSplit split : splits) { assertTrue("OrcSplit is expected", split instanceof OrcSplit); // ETL strategies will have start=3 (start of first stripe) assertTrue(split.toString().contains("start=3")); assertTrue(split.toString().contains("hasFooter=true")); assertTrue(split.toString().contains("hasBase=true")); // NOTE: don't be surprised if deltas value is different // in older release deltas=2 as min and max transaction are added separately to delta list. 
      // in newer release since both of them are put together deltas=1
      assertTrue(split.toString().contains("deltas=1"));
      if (split instanceof OrcSplit) {
        assertTrue("Footer serialize test for ACID reader, hasFooter is expected in" +
            " orc splits.", ((OrcSplit) split).hasFooter());
      }
      orcInputFormat.getRecordReader(split, conf, Reporter.NULL);
    }

    int readOpsDelta = -1;
    for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
      if (statistics.getScheme().equalsIgnoreCase("mock")) {
        readOpsDelta = statistics.getReadOps() - readOpsBefore;
      }
    }
    // call-1: open to read data - split 1 => mock:/mocktable8/0_0
    // call-2: open side file (flush length) of delta directory
    // call-3: fs.exists() check for delta_xxx_xxx/bucket_00000 file
    assertEquals(3, readOpsDelta);

    // revert back to local fs
    conf.set("fs.defaultFS", "file:///");
  }

  /**
   * Also see {@link TestOrcFile#testPredicatePushdown()}.
   * This tests that {@link RecordReader#getRowNumber()} works with multiple splits.
   * @throws Exception
   */
  @Test
  public void testRowNumberUniquenessInDifferentSplits() throws Exception {
    Properties properties = new Properties();
    properties.setProperty("columns", "x,y");
    properties.setProperty("columns.types", "int:int");
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
      inspector = (StructObjectInspector)
          ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
              ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }

    // Save the conf variable values so that they can be restored later.
    long oldDefaultStripeSize = conf.getLong(OrcConf.STRIPE_SIZE.getHiveConfName(), -1L);
    long oldMaxSplitSize = conf.getLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, -1L);

    // Set the conf variable values for this test.
    long newStripeSize = 10000L; // 10000 bytes per stripe
    long newMaxSplitSize = 100L; // 100 bytes per split
    conf.setLong(OrcConf.STRIPE_SIZE.getHiveConfName(), newStripeSize);
    conf.setLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, newMaxSplitSize);

    AbstractSerDe serde = new OrcSerde();
    HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
    org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter writer =
        outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
            properties, Reporter.NULL);
    // The following loop should create 20 stripes in the orc file.
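    // Descriptive note: the loop writes newStripeSize * 10 = 100,000 rows. With the small stripe
    // size and 100-byte max split size set above, getSplits() below is expected to return
    // numExpectedSplits (20) splits, i.e. roughly 5,000 rows per split, which is why the expected
    // row number inside split i is computed as (i * 5000) + j. The exact stripe boundaries depend
    // on the ORC writer's buffering, so these counts reflect what this test assumes rather than a
    // general guarantee.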
    for (int i = 0; i < newStripeSize * 10; ++i) {
      writer.write(serde.serialize(new MyRow(i, i + 1), inspector));
    }
    writer.close(true);

    serde = new OrcSerde();
    SerDeUtils.initializeSerDe(serde, conf, properties, null);
    assertEquals(OrcSerde.OrcSerdeRow.class, serde.getSerializedClass());
    inspector = (StructObjectInspector) serde.getObjectInspector();
    assertEquals("struct<x:int,y:int>", inspector.getTypeName());

    InputFormat<?,?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    int numExpectedSplits = 20;
    InputSplit[] splits = in.getSplits(conf, numExpectedSplits);
    assertEquals(numExpectedSplits, splits.length);

    for (int i = 0; i < numExpectedSplits; ++i) {
      OrcSplit split = (OrcSplit) splits[i];
      Reader.Options orcReaderOptions = new Reader.Options();
      orcReaderOptions.range(split.getStart(), split.getLength());
      OrcFile.ReaderOptions qlReaderOptions =
          OrcFile.readerOptions(conf).maxLength(split.getFileLength());
      Reader reader = OrcFile.createReader(split.getPath(), qlReaderOptions);
      RecordReader recordReader = reader.rowsOptions(orcReaderOptions);
      for (int j = 0; recordReader.hasNext(); j++) {
        long rowNum = (i * 5000) + j;
        long rowNumActual = recordReader.getRowNumber();
        assertEquals("rowNum=" + rowNum, rowNum, rowNumActual);
        Object row = recordReader.next(null);
      }
      recordReader.close();
    }

    // Reset the conf variable values that we changed for this test.
    if (oldDefaultStripeSize != -1L) {
      conf.setLong(OrcConf.STRIPE_SIZE.getHiveConfName(), oldDefaultStripeSize);
    } else {
      // this means that nothing was set for the default stripe size previously, so we should unset it.
      conf.unset(OrcConf.STRIPE_SIZE.getHiveConfName());
    }
    if (oldMaxSplitSize != -1L) {
      conf.setLong(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname, oldMaxSplitSize);
    } else {
      // this means that nothing was set for the max split size previously, so we should unset it.
      conf.unset(HiveConf.ConfVars.MAPREDMAXSPLITSIZE.varname);
    }
  }

  /**
   * Test schema evolution when using the reader directly.
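   * <p>
   * A file is written with columns a, b.c, and d, then read back with a reader schema that adds
   * b.future1 and a top-level future2 column; the added columns are expected to come back as null
   * (isRepeating set with isNull[0] true). A second pass re-reads the file with an include vector
   * that additionally drops column d. See the assertions in the method body for the exact
   * expectations.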
   */
  @Test
  public void testSchemaEvolution() throws Exception {
    TypeDescription fileSchema =
        TypeDescription.fromString("struct<a:int,b:struct<c:int>,d:string>");
    Writer writer = OrcFile.createWriter(testFilePath,
        OrcFile.writerOptions(conf)
            .fileSystem(fs)
            .setSchema(fileSchema)
            .compress(org.apache.orc.CompressionKind.NONE));
    VectorizedRowBatch batch = fileSchema.createRowBatch(1000);
    batch.size = 1000;
    LongColumnVector lcv =
        ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[0]);
    for (int r = 0; r < 1000; r++) {
      ((LongColumnVector) batch.cols[0]).vector[r] = r * 42;
      lcv.vector[r] = r * 10001;
      ((BytesColumnVector) batch.cols[2]).setVal(r,
          Integer.toHexString(r).getBytes(StandardCharsets.UTF_8));
    }
    writer.addRowBatch(batch);
    writer.close();

    TypeDescription readerSchema = TypeDescription.fromString(
        "struct<a:int,b:struct<c:int,future1:int>,d:string,future2:int>");
    Reader reader = OrcFile.createReader(testFilePath,
        OrcFile.readerOptions(conf).filesystem(fs));
    RecordReader rows = reader.rowsOptions(new Reader.Options()
        .schema(readerSchema));
    batch = readerSchema.createRowBatch();
    lcv = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[0]);
    LongColumnVector future1 =
        ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[1]);
    assertEquals(true, rows.nextBatch(batch));
    assertEquals(1000, batch.size);
    assertEquals(true, future1.isRepeating);
    assertEquals(true, future1.isNull[0]);
    assertEquals(true, batch.cols[3].isRepeating);
    assertEquals(true, batch.cols[3].isNull[0]);
    for (int r = 0; r < batch.size; ++r) {
      assertEquals("row " + r, r * 42, ((LongColumnVector) batch.cols[0]).vector[r]);
      assertEquals("row " + r, r * 10001, lcv.vector[r]);
      assertEquals("row " + r, Integer.toHexString(r),
          ((BytesColumnVector) batch.cols[2]).toString(r));
    }
    assertEquals(false, rows.nextBatch(batch));
    rows.close();

    // try it again with an include vector
    rows = reader.rowsOptions(new Reader.Options()
        .schema(readerSchema)
        .include(new boolean[]{false, true, true, true, false, false, true}));
    batch = readerSchema.createRowBatch();
    lcv = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[0]);
    future1 = ((LongColumnVector) ((StructColumnVector) batch.cols[1]).fields[1]);
    assertEquals(true, rows.nextBatch(batch));
    assertEquals(1000, batch.size);
    assertEquals(true, future1.isRepeating);
    assertEquals(true, future1.isNull[0]);
    assertEquals(true, batch.cols[3].isRepeating);
    assertEquals(true, batch.cols[3].isNull[0]);
    assertEquals(true, batch.cols[2].isRepeating);
    assertEquals(true, batch.cols[2].isNull[0]);
    for (int r = 0; r < batch.size; ++r) {
      assertEquals("row " + r, r * 42, ((LongColumnVector) batch.cols[0]).vector[r]);
      assertEquals("row " + r, r * 10001, lcv.vector[r]);
    }
    assertEquals(false, rows.nextBatch(batch));
    rows.close();
  }

  /**
   * Test column projection when using ACID.
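   * <p>
   * An ACID base file (base_00100/bucket_00000) is written with the ACID event schema
   * (operation, originalTransaction, bucket, rowId, currentTransaction, row) and then read
   * through OrcInputFormat.getReader() twice: first projecting columns a and d of the row
   * struct, then again with an evolved schema that adds b.e and a new top-level column f,
   * which must read back as null. See the assertions below for the exact expectations.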
*/ @Test public void testColumnProjectionWithAcid() throws Exception { Path baseDir = new Path(workDir, "base_00100"); testFilePath = new Path(baseDir, "bucket_00000"); fs.mkdirs(baseDir); fs.delete(testFilePath, true); TypeDescription fileSchema = TypeDescription.fromString("struct<operation:int," + "originalTransaction:bigint,bucket:int,rowId:bigint," + "currentTransaction:bigint," + "row:struct<a:int,b:struct<c:int>,d:string>>"); Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) .fileSystem(fs) .setSchema(fileSchema) .compress(org.apache.orc.CompressionKind.NONE)); VectorizedRowBatch batch = fileSchema.createRowBatch(1000); batch.size = 1000; StructColumnVector scv = (StructColumnVector)batch.cols[5]; // operation batch.cols[0].isRepeating = true; ((LongColumnVector) batch.cols[0]).vector[0] = 0; // original transaction batch.cols[1].isRepeating = true; ((LongColumnVector) batch.cols[1]).vector[0] = 1; // bucket batch.cols[2].isRepeating = true; ((LongColumnVector) batch.cols[2]).vector[0] = 0; // current transaction batch.cols[4].isRepeating = true; ((LongColumnVector) batch.cols[4]).vector[0] = 1; LongColumnVector lcv = (LongColumnVector) ((StructColumnVector) scv.fields[1]).fields[0]; for(int r=0; r < 1000; r++) { // row id ((LongColumnVector) batch.cols[3]).vector[r] = r; // a ((LongColumnVector) scv.fields[0]).vector[r] = r * 42; // b.c lcv.vector[r] = r * 10001; // d ((BytesColumnVector) scv.fields[2]).setVal(r, Integer.toHexString(r).getBytes(StandardCharsets.UTF_8)); } writer.addRowBatch(batch); writer.addUserMetadata(OrcRecordUpdater.ACID_KEY_INDEX_NAME, ByteBuffer.wrap("0,0,999".getBytes(StandardCharsets.UTF_8))); writer.close(); long fileLength = fs.getFileStatus(testFilePath).getLen(); // test with same schema with include conf.set(ValidTxnList.VALID_TXNS_KEY, "100:99:"); conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "a,b,d"); conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "int,struct<c:int>,string"); conf.set(ColumnProjectionUtils.READ_ALL_COLUMNS, "false"); conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2"); OrcSplit split = new OrcSplit(testFilePath, null, 0, fileLength, new String[0], null, false, true, new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength); OrcInputFormat inputFormat = new OrcInputFormat(); AcidInputFormat.RowReader<OrcStruct> reader = inputFormat.getReader(split, new AcidInputFormat.Options(conf)); int record = 0; RecordIdentifier id = reader.createKey(); OrcStruct struct = reader.createValue(); while (reader.next(id, struct)) { assertEquals("id " + record, record, id.getRowId()); assertEquals("bucket " + record, 0, id.getBucketId()); assertEquals("trans " + record, 1, id.getTransactionId()); assertEquals("a " + record, 42 * record, ((IntWritable) struct.getFieldValue(0)).get()); assertEquals(null, struct.getFieldValue(1)); assertEquals("d " + record, Integer.toHexString(record), struct.getFieldValue(2).toString()); record += 1; } assertEquals(1000, record); reader.close(); // test with schema evolution and include conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "a,b,d,f"); conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "int,struct<c:int,e:string>,string,int"); conf.set(ColumnProjectionUtils.READ_ALL_COLUMNS, "false"); conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0,2,3"); split = new OrcSplit(testFilePath, null, 0, fileLength, new String[0], null, false, true, new ArrayList<AcidInputFormat.DeltaMetaData>(), fileLength, fileLength); inputFormat = new OrcInputFormat(); 
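 // Second read pass: same base file, but with the evolved schema configured above (b gains a
 // nested e field and a top-level column f is appended) and projection "0,2,3"; the rows should
 // match the first pass and f should surface as null since it does not exist in the file.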
reader = inputFormat.getReader(split, new AcidInputFormat.Options(conf)); record = 0; id = reader.createKey(); struct = reader.createValue(); while (reader.next(id, struct)) { assertEquals("id " + record, record, id.getRowId()); assertEquals("bucket " + record, 0, id.getBucketId()); assertEquals("trans " + record, 1, id.getTransactionId()); assertEquals("a " + record, 42 * record, ((IntWritable) struct.getFieldValue(0)).get()); assertEquals(null, struct.getFieldValue(1)); assertEquals("d " + record, Integer.toHexString(record), struct.getFieldValue(2).toString()); assertEquals("f " + record, null, struct.getFieldValue(3)); record += 1; } assertEquals(1000, record); reader.close(); } }