/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.orc;
import com.facebook.presto.orc.OrcTester.Compression;
import com.facebook.presto.orc.OrcTester.Format;
import com.facebook.presto.orc.memory.AggregatedMemoryContext;
import com.facebook.presto.orc.metadata.OrcMetadataReader;
import com.facebook.presto.orc.metadata.StripeInformation;
import com.facebook.presto.spi.block.Block;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import io.airlift.slice.FixedLengthSliceInput;
import io.airlift.units.DataSize;
import io.airlift.units.DataSize.Unit;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Random;
import java.util.stream.Stream;
import static com.facebook.presto.orc.OrcRecordReader.LinearProbeRangeFinder.createTinyStripesRangeFinder;
import static com.facebook.presto.orc.OrcRecordReader.wrapWithCacheIfTinyStripes;
import static com.facebook.presto.orc.OrcTester.Compression.NONE;
import static com.facebook.presto.orc.OrcTester.Compression.ZLIB;
import static com.facebook.presto.orc.OrcTester.Format.ORC_12;
import static com.facebook.presto.orc.OrcTester.HIVE_STORAGE_TIME_ZONE;
import static com.facebook.presto.orc.OrcTester.writeOrcFileColumnOld;
import static com.facebook.presto.spi.type.VarcharType.VARCHAR;
import static io.airlift.testing.Assertions.assertGreaterThanOrEqual;
import static io.airlift.testing.Assertions.assertInstanceOf;
import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory.javaStringObjectInspector;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertNotNull;
import static org.testng.Assert.fail;
public class TestCachingOrcDataSource
{
    private static final int POSITION_COUNT = 50000;

    // ORC file shared by all test methods; created once in setUp, deleted in tearDown.
    private TempFile tempFile;

    /**
     * Writes a single-VARCHAR-column ORC file (ZLIB, format 0.12) with
     * {@link #POSITION_COUNT} random hex-string rows for the integration tests.
     */
    @BeforeClass
    public void setUp()
            throws Exception
    {
        tempFile = new TempFile();
        Random random = new Random();
        Iterator<String> iterator = Stream.generate(() -> Long.toHexString(random.nextLong())).limit(POSITION_COUNT).iterator();
        writeOrcFileColumnOld(
                tempFile.getFile(),
                ORC_12,
                createOrcRecordWriter(tempFile.getFile(), ORC_12, ZLIB, javaStringObjectInspector),
                VARCHAR,
                iterator);
    }

    @AfterClass
    public void tearDown()
            throws Exception
    {
        // Guard against setUp having failed before tempFile was assigned;
        // an unconditional dereference would mask the original failure with an NPE.
        if (tempFile != null) {
            tempFile.close();
        }
    }

    /**
     * Verifies that {@code wrapWithCacheIfTinyStripes} wraps the data source in a
     * {@link CachingOrcDataSource} exactly when all stripes fit within the configured
     * max read size (8MB here), and leaves it unwrapped otherwise.
     */
    @Test
    public void testWrapWithCacheIfTinyStripes()
            throws IOException
    {
        DataSize maxMergeDistance = new DataSize(1, Unit.MEGABYTE);
        DataSize maxReadSize = new DataSize(8, Unit.MEGABYTE);

        // No stripes: trivially tiny, should be cached
        OrcDataSource actual = wrapWithCacheIfTinyStripes(
                FakeOrcDataSource.INSTANCE,
                ImmutableList.of(),
                maxMergeDistance,
                maxReadSize);
        assertInstanceOf(actual, CachingOrcDataSource.class);

        // Single small stripe
        actual = wrapWithCacheIfTinyStripes(
                FakeOrcDataSource.INSTANCE,
                ImmutableList.of(new StripeInformation(123, 3, 10, 10, 10)),
                maxMergeDistance,
                maxReadSize);
        assertInstanceOf(actual, CachingOrcDataSource.class);

        // Several small stripes
        actual = wrapWithCacheIfTinyStripes(
                FakeOrcDataSource.INSTANCE,
                ImmutableList.of(new StripeInformation(123, 3, 10, 10, 10), new StripeInformation(123, 33, 10, 10, 10), new StripeInformation(123, 63, 10, 10, 10)),
                maxMergeDistance,
                maxReadSize);
        assertInstanceOf(actual, CachingOrcDataSource.class);

        // Last stripe total size exactly at the 8MB boundary: still cached
        actual = wrapWithCacheIfTinyStripes(
                FakeOrcDataSource.INSTANCE,
                ImmutableList.of(new StripeInformation(123, 3, 10, 10, 10), new StripeInformation(123, 33, 10, 10, 10), new StripeInformation(123, 63, 1048576 * 8 - 20, 10, 10)),
                maxMergeDistance,
                maxReadSize);
        assertInstanceOf(actual, CachingOrcDataSource.class);

        // One byte over the boundary: must NOT be cached
        actual = wrapWithCacheIfTinyStripes(
                FakeOrcDataSource.INSTANCE,
                ImmutableList.of(new StripeInformation(123, 3, 10, 10, 10), new StripeInformation(123, 33, 10, 10, 10), new StripeInformation(123, 63, 1048576 * 8 - 20 + 1, 10, 10)),
                maxMergeDistance,
                maxReadSize);
        assertNotInstanceOf(actual, CachingOrcDataSource.class);
    }

    /**
     * Verifies the exact disk ranges that {@code readCacheAt} issues against the
     * underlying data source for various stripe layouts and read positions.
     */
    @Test
    public void testTinyStripesReadCacheAt()
            throws IOException
    {
        DataSize maxMergeDistance = new DataSize(1, Unit.MEGABYTE);
        DataSize maxReadSize = new DataSize(8, Unit.MEGABYTE);

        TestingOrcDataSource testingOrcDataSource = new TestingOrcDataSource(FakeOrcDataSource.INSTANCE);
        CachingOrcDataSource cachingOrcDataSource = new CachingOrcDataSource(
                testingOrcDataSource,
                createTinyStripesRangeFinder(
                        ImmutableList.of(new StripeInformation(123, 3, 10, 10, 10), new StripeInformation(123, 33, 10, 10, 10), new StripeInformation(123, 63, 1048576 * 8 - 20, 10, 10)),
                        maxMergeDistance,
                        maxReadSize));
        // First two stripes (3..63) are merged into one 60-byte read
        cachingOrcDataSource.readCacheAt(3);
        assertEquals(testingOrcDataSource.getLastReadRanges(), ImmutableList.of(new DiskRange(3, 60)));
        // Third stripe is read on its own (8MB)
        cachingOrcDataSource.readCacheAt(63);
        assertEquals(testingOrcDataSource.getLastReadRanges(), ImmutableList.of(new DiskRange(63, 8 * 1048576)));

        testingOrcDataSource = new TestingOrcDataSource(FakeOrcDataSource.INSTANCE);
        cachingOrcDataSource = new CachingOrcDataSource(
                testingOrcDataSource,
                createTinyStripesRangeFinder(
                        ImmutableList.of(new StripeInformation(123, 3, 10, 10, 10), new StripeInformation(123, 33, 10, 10, 10), new StripeInformation(123, 63, 1048576 * 8 - 20, 10, 10)),
                        maxMergeDistance,
                        maxReadSize));
        cachingOrcDataSource.readCacheAt(62); // read at the end of a stripe
        assertEquals(testingOrcDataSource.getLastReadRanges(), ImmutableList.of(new DiskRange(3, 60)));
        cachingOrcDataSource.readCacheAt(63);
        assertEquals(testingOrcDataSource.getLastReadRanges(), ImmutableList.of(new DiskRange(63, 8 * 1048576)));

        testingOrcDataSource = new TestingOrcDataSource(FakeOrcDataSource.INSTANCE);
        cachingOrcDataSource = new CachingOrcDataSource(
                testingOrcDataSource,
                createTinyStripesRangeFinder(
                        ImmutableList.of(new StripeInformation(123, 3, 1, 0, 0), new StripeInformation(123, 4, 1048576, 1048576, 1048576 * 3), new StripeInformation(123, 4 + 1048576 * 5, 1048576, 1048576, 1048576)),
                        maxMergeDistance,
                        maxReadSize));
        // First two stripes merge into a single range ending at the second stripe's end
        cachingOrcDataSource.readCacheAt(3);
        assertEquals(testingOrcDataSource.getLastReadRanges(), ImmutableList.of(new DiskRange(3, 1 + 1048576 * 5)));
        // Third stripe is beyond the merge distance; read separately
        cachingOrcDataSource.readCacheAt(4 + 1048576 * 5);
        assertEquals(testingOrcDataSource.getLastReadRanges(), ImmutableList.of(new DiskRange(4 + 1048576 * 5, 3 * 1048576)));
    }

    /**
     * End-to-end read of the file written in {@link #setUp}, counting the physical
     * reads issued for different merge-distance / read-size configurations.
     */
    @Test
    public void testIntegration()
            throws IOException
    {
        // tiny file
        TestingOrcDataSource orcDataSource = new TestingOrcDataSource(
                new FileOrcDataSource(tempFile.getFile(), new DataSize(1, Unit.MEGABYTE), new DataSize(1, Unit.MEGABYTE), new DataSize(1, Unit.MEGABYTE)));
        doIntegration(orcDataSource, new DataSize(1, Unit.MEGABYTE), new DataSize(1, Unit.MEGABYTE));
        assertEquals(orcDataSource.getReadCount(), 1); // read entire file at once

        // tiny stripes
        orcDataSource = new TestingOrcDataSource(
                new FileOrcDataSource(tempFile.getFile(), new DataSize(1, Unit.MEGABYTE), new DataSize(1, Unit.MEGABYTE), new DataSize(1, Unit.MEGABYTE)));
        doIntegration(orcDataSource, new DataSize(400, Unit.KILOBYTE), new DataSize(400, Unit.KILOBYTE));
        assertEquals(orcDataSource.getReadCount(), 3); // footer, first few stripes, last few stripes
    }

    /**
     * Reads the whole test file through an {@link OrcRecordReader} and asserts all
     * {@link #POSITION_COUNT} rows come back.
     *
     * @param orcDataSource instrumented source whose read count callers inspect afterwards
     */
    public void doIntegration(TestingOrcDataSource orcDataSource, DataSize maxMergeDistance, DataSize maxReadSize)
            throws IOException
    {
        OrcReader orcReader = new OrcReader(orcDataSource, new OrcMetadataReader(), maxMergeDistance, maxReadSize);
        // 1 for reading file footer
        assertEquals(orcDataSource.getReadCount(), 1);
        List<StripeInformation> stripes = orcReader.getFooter().getStripes();
        // Sanity check number of stripes. This can be three or higher because of orc writer low memory mode.
        assertGreaterThanOrEqual(stripes.size(), 3);
        //verify wrapped by CachingOrcReader
        assertInstanceOf(wrapWithCacheIfTinyStripes(orcDataSource, stripes, maxMergeDistance, maxReadSize), CachingOrcDataSource.class);

        OrcRecordReader orcRecordReader = orcReader.createRecordReader(
                ImmutableMap.of(0, VARCHAR),
                (numberOfRows, statisticsByColumnIndex) -> true,
                HIVE_STORAGE_TIME_ZONE,
                new AggregatedMemoryContext());
        try {
            int positionCount = 0;
            while (true) {
                int batchSize = orcRecordReader.nextBatch();
                if (batchSize <= 0) {
                    break;
                }
                Block block = orcRecordReader.readBlock(VARCHAR, 0);
                positionCount += block.getPositionCount();
            }
            assertEquals(positionCount, POSITION_COUNT);
        }
        finally {
            // Release the reader's resources even if an assertion above fails;
            // previously the reader was never closed, leaking the underlying handle.
            orcRecordReader.close();
        }
    }

    /**
     * Inverse of TestNG's {@code assertInstanceOf}: fails when {@code actual}
     * IS an instance of {@code expectedType}.
     */
    public static <T, U extends T> void assertNotInstanceOf(T actual, Class<U> expectedType)
    {
        assertNotNull(actual, "actual is null");
        assertNotNull(expectedType, "expectedType is null");
        if (expectedType.isInstance(actual)) {
            fail(String.format("expected:<%s> to not be an instance of <%s>", actual, expectedType.getName()));
        }
    }

    /**
     * Builds a Hive ORC record writer for a single column named "test" with the
     * given format/compression, using a small (1.2MB) stripe size so the test file
     * contains multiple stripes.
     */
    private static FileSinkOperator.RecordWriter createOrcRecordWriter(File outputFile, Format format, Compression compression, ObjectInspector columnObjectInspector)
            throws IOException
    {
        JobConf jobConf = new JobConf();
        jobConf.set("hive.exec.orc.write.format", format == ORC_12 ? "0.12" : "0.11");
        jobConf.set("hive.exec.orc.default.compress", compression.name());

        Properties tableProperties = new Properties();
        tableProperties.setProperty("columns", "test");
        tableProperties.setProperty("columns.types", columnObjectInspector.getTypeName());
        tableProperties.setProperty("orc.stripe.size", "1200000");

        return new OrcOutputFormat().getHiveRecordWriter(
                jobConf,
                new Path(outputFile.toURI()),
                Text.class,
                compression != NONE,
                tableProperties,
                () -> { });
    }

    /**
     * An {@link OrcDataSource} stub whose reads are no-ops; used where only the
     * range-planning behavior of the caching wrapper is under test, never the data.
     */
    private static class FakeOrcDataSource
            implements OrcDataSource
    {
        public static final FakeOrcDataSource INSTANCE = new FakeOrcDataSource();

        @Override
        public OrcDataSourceId getId()
        {
            return new OrcDataSourceId("fake");
        }

        @Override
        public long getReadBytes()
        {
            throw new UnsupportedOperationException();
        }

        @Override
        public long getReadTimeNanos()
        {
            throw new UnsupportedOperationException();
        }

        @Override
        public long getSize()
        {
            throw new UnsupportedOperationException();
        }

        @Override
        public void readFully(long position, byte[] buffer)
                throws IOException
        {
            // do nothing
        }

        @Override
        public void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength)
                throws IOException
        {
            // do nothing
        }

        @Override
        public <K> Map<K, FixedLengthSliceInput> readFully(Map<K, DiskRange> diskRanges)
                throws IOException
        {
            throw new UnsupportedOperationException();
        }
    }
}