/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.facebook.presto.hive.parquet;

import com.facebook.presto.hive.parquet.memory.AggregatedMemoryContext;
import com.facebook.presto.hive.parquet.reader.ParquetMetadataReader;
import com.facebook.presto.hive.parquet.reader.ParquetReader;
import com.facebook.presto.spi.block.Block;
import com.facebook.presto.spi.type.Type;
import com.google.common.base.Function;
import com.google.common.base.Throwables;
import com.google.common.collect.AbstractIterator;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import io.airlift.units.DataSize;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.joda.time.DateTimeZone;
import parquet.column.ColumnDescriptor;
import parquet.column.ParquetProperties.WriterVersion;
import parquet.hadoop.ParquetOutputFormat;
import parquet.hadoop.metadata.CompressionCodecName;
import parquet.hadoop.metadata.FileMetaData;
import parquet.hadoop.metadata.ParquetMetadata;
import parquet.schema.MessageType;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.Set;

import static com.facebook.presto.hive.HiveTestUtils.TYPE_MANAGER;
import static com.facebook.presto.testing.TestingConnectorSession.SESSION;
import static com.google.common.base.Functions.constant;
import static com.google.common.collect.Iterables.transform;
import static io.airlift.units.DataSize.succinctBytes;
import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.getStandardStructObjectInspector;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertFalse;
import static org.testng.Assert.assertTrue;
import static parquet.column.ParquetProperties.WriterVersion.PARQUET_1_0;
import static parquet.hadoop.metadata.CompressionCodecName.GZIP;
import static parquet.hadoop.metadata.CompressionCodecName.LZO;
import static parquet.hadoop.metadata.CompressionCodecName.SNAPPY;
import static parquet.hadoop.metadata.CompressionCodecName.UNCOMPRESSED;
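
/**
 * Test harness that round-trips values through the Hive Parquet writer and the
 * Presto Parquet reader, covering multiple compression codecs, writer versions,
 * value orderings, and null patterns.
 */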
public class ParquetTester
{
    public static final DateTimeZone HIVE_STORAGE_TIME_ZONE = DateTimeZone.forID("Asia/Katmandu");

    private Set<CompressionCodecName> compressions = ImmutableSet.of();
    private Set<WriterVersion> versions = ImmutableSet.of();
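
    /**
     * Creates a tester that exercises a single configuration (GZIP compression,
     * Parquet writer version 1.0) for fast test runs.
     */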
    public static ParquetTester quickParquetTester()
    {
        ParquetTester parquetTester = new ParquetTester();
        parquetTester.compressions = ImmutableSet.of(GZIP);
        parquetTester.versions = ImmutableSet.of(PARQUET_1_0);
        return parquetTester;
    }
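
    /**
     * Creates a tester that exercises every supported compression codec and
     * every Parquet writer version.
     */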
    public static ParquetTester fullParquetTester()
    {
        ParquetTester parquetTester = new ParquetTester();
        parquetTester.compressions = ImmutableSet.of(GZIP, UNCOMPRESSED, SNAPPY, LZO);
        parquetTester.versions = ImmutableSet.copyOf(WriterVersion.values());
        return parquetTester;
    }
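
    /**
     * Writes the given values to a Parquet file and asserts that reading them
     * back produces the expected values.
     */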
    public void testRoundTrip(PrimitiveObjectInspector columnObjectInspector, Iterable<?> writeValues, Type parameterType)
            throws Exception
    {
        testRoundTrip(columnObjectInspector, writeValues, writeValues, parameterType);
    }
    public <W, R> void testRoundTrip(PrimitiveObjectInspector columnObjectInspector, Iterable<W> writeValues, Function<W, R> readTransform, Type parameterType)
            throws Exception
    {
        testRoundTrip(columnObjectInspector, writeValues, transform(writeValues, readTransform), parameterType);
    }

    public void testRoundTrip(ObjectInspector objectInspector, Iterable<?> writeValues, Iterable<?> readValues, Type type)
            throws Exception
    {
        // just the values
        testRoundTripType(objectInspector, writeValues, readValues, type);

        // all nulls
        assertRoundTrip(objectInspector, transform(writeValues, constant(null)), transform(readValues, constant(null)), type);
    }
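
    /**
     * Exercises the values in forward and reverse order, with and without
     * interleaved nulls.
     */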
    private void testRoundTripType(ObjectInspector objectInspector, Iterable<?> writeValues, Iterable<?> readValues, Type type)
            throws Exception
    {
        // forward order
        assertRoundTrip(objectInspector, writeValues, readValues, type);

        // reverse order
        assertRoundTrip(objectInspector, reverse(writeValues), reverse(readValues), type);

        // forward order with nulls
        assertRoundTrip(objectInspector, insertNullEvery(5, writeValues), insertNullEvery(5, readValues), type);

        // reverse order with nulls
        assertRoundTrip(objectInspector, insertNullEvery(5, reverse(writeValues)), insertNullEvery(5, reverse(readValues)), type);
    }
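
    /**
     * Writes the values once for every configured compression codec and writer
     * version, then verifies the file contents against the expected read values.
     */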
    public void assertRoundTrip(ObjectInspector objectInspector, Iterable<?> writeValues, Iterable<?> readValues, Type type)
            throws Exception
    {
        for (WriterVersion version : versions) {
            for (CompressionCodecName compressionCodecName : compressions) {
                try (TempFile tempFile = new TempFile("test", "parquet")) {
                    JobConf jobConf = new JobConf();
                    jobConf.setEnum(ParquetOutputFormat.COMPRESSION, compressionCodecName);
                    jobConf.setBoolean(ParquetOutputFormat.ENABLE_DICTIONARY, true);
                    jobConf.setEnum(ParquetOutputFormat.WRITER_VERSION, version);
                    writeParquetColumn(jobConf,
                            tempFile.getFile(),
                            compressionCodecName,
                            objectInspector,
                            writeValues.iterator());
                    assertFileContents(jobConf,
                            tempFile,
                            readValues,
                            type);
                }
            }
        }
    }
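
    /**
     * Reads the Parquet file back with the Presto ParquetReader and asserts that
     * the decoded values match the expected values, batch by batch.
     */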
    private static void assertFileContents(JobConf jobConf,
            TempFile tempFile,
            Iterable<?> expectedValues,
            Type type)
            throws IOException, InterruptedException
    {
        Path path = new Path(tempFile.getFile().toURI());
        FileSystem fileSystem = path.getFileSystem(jobConf);
        ParquetMetadata parquetMetadata = ParquetMetadataReader.readFooter(fileSystem, path);
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();

        long size = fileSystem.getFileStatus(path).getLen();
        FSDataInputStream inputStream = fileSystem.open(path);
        ParquetDataSource dataSource = new HdfsParquetDataSource(path, size, inputStream);

        ParquetReader parquetReader = new ParquetReader(fileSchema, fileSchema, parquetMetadata.getBlocks(), dataSource, TYPE_MANAGER, new AggregatedMemoryContext());
        assertEquals(parquetReader.getPosition(), 0);

        int rowsProcessed = 0;
        Iterator<?> iterator = expectedValues.iterator();
        for (int batchSize = parquetReader.nextBatch(); batchSize >= 0; batchSize = parquetReader.nextBatch()) {
            ColumnDescriptor columnDescriptor = fileSchema.getColumns().get(0);
            Block block = parquetReader.readPrimitive(columnDescriptor, type);
            for (int i = 0; i < batchSize; i++) {
                assertTrue(iterator.hasNext());
                Object expected = iterator.next();
                Object actual = decodeObject(type, block, i);
                assertEquals(actual, expected);
            }
            rowsProcessed += batchSize;
            assertEquals(parquetReader.getPosition(), rowsProcessed);
        }
        assertFalse(iterator.hasNext());
        assertEquals(parquetReader.getPosition(), rowsProcessed);
        parquetReader.close();
    }
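
    /**
     * Writes a single column of values through Hive's ParquetHiveSerDe and
     * MapredParquetOutputFormat and returns the size of the resulting file.
     */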
    private static DataSize writeParquetColumn(JobConf jobConf,
            File outputFile,
            CompressionCodecName compressionCodecName,
            ObjectInspector columnObjectInspector,
            Iterator<?> values)
            throws Exception
    {
        RecordWriter recordWriter = new MapredParquetOutputFormat().getHiveRecordWriter(jobConf,
                new Path(outputFile.toURI()),
                Text.class,
                compressionCodecName != UNCOMPRESSED,
                createTableProperties("test", columnObjectInspector.getTypeName()),
                () -> { });

        SettableStructObjectInspector objectInspector = createSettableStructObjectInspector("test", columnObjectInspector);
        Object row = objectInspector.create();
        List<StructField> fields = ImmutableList.copyOf(objectInspector.getAllStructFieldRefs());

        // initialize the SerDe once and reuse it for every row, mirroring how Hive writes a table
        ParquetHiveSerDe serde = new ParquetHiveSerDe();
        serde.initialize(jobConf, createTableProperties("test", columnObjectInspector.getTypeName()), null);

        while (values.hasNext()) {
            Object value = values.next();
            objectInspector.setStructFieldData(row, fields.get(0), value);
            Writable record = serde.serialize(row, objectInspector);
            recordWriter.write(record);
        }

        recordWriter.close(false);
        return succinctBytes(outputFile.length());
    }

    static SettableStructObjectInspector createSettableStructObjectInspector(String name, ObjectInspector objectInspector)
    {
        return getStandardStructObjectInspector(ImmutableList.of(name), ImmutableList.of(objectInspector));
    }

    private static Properties createTableProperties(String name, String type)
    {
        Properties tableProperties = new Properties();
        tableProperties.setProperty("columns", name);
        tableProperties.setProperty("columns.types", type);
        return tableProperties;
    }
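
    /**
     * Temporary file holder that reserves a unique path and deletes the file on close.
     */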
    static class TempFile
            implements Closeable
    {
        private final File file;

        public TempFile(String prefix, String suffix)
        {
            try {
                file = File.createTempFile(prefix, suffix);
                // delete the placeholder; the record writer expects to create the file itself
                file.delete();
            }
            catch (IOException e) {
                throw Throwables.propagate(e);
            }
        }

        public File getFile()
        {
            return file;
        }

        @Override
        public void close()
        {
            file.delete();
        }
    }

    private static <T> Iterable<T> reverse(Iterable<T> iterable)
    {
        return Lists.reverse(ImmutableList.copyOf(iterable));
    }
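
    /**
     * Returns an iterable that inserts a null after every {@code n} values of the
     * source iterable.
     */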
    private static <T> Iterable<T> insertNullEvery(int n, Iterable<T> iterable)
    {
        return () -> new AbstractIterator<T>()
        {
            private final Iterator<T> delegate = iterable.iterator();
            private int position;

            @Override
            protected T computeNext()
            {
                position++;
                if (position > n) {
                    position = 0;
                    return null;
                }

                if (!delegate.hasNext()) {
                    return endOfData();
                }

                return delegate.next();
            }
        };
    }

    private static Object decodeObject(Type type, Block block, int position)
    {
        if (block.isNull(position)) {
            return null;
        }

        return type.getObjectValue(SESSION, block, position);
    }
}