/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package parquet.hadoop.thrift;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.thrift.TBase;
import org.apache.thrift.TException;
import org.apache.thrift.protocol.TCompactProtocol;
import org.apache.thrift.protocol.TProtocol;
import org.apache.thrift.protocol.TProtocolFactory;
import org.apache.thrift.transport.TIOStreamTransport;
import org.junit.Test;

import parquet.Log;
import parquet.column.statistics.*;
import parquet.example.data.Group;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.ParquetReader;
import parquet.hadoop.example.GroupReadSupport;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ColumnChunkMetaData;
import parquet.hadoop.metadata.ParquetMetadata;
import parquet.hadoop.util.ContextUtil;
import parquet.io.api.Binary;
import parquet.schema.MessageType;
import parquet.thrift.test.RequiredPrimitiveFixture;
import parquet.thrift.test.TestListsInMap;

import com.twitter.data.proto.tutorial.thrift.AddressBook;
import com.twitter.data.proto.tutorial.thrift.Name;
import com.twitter.data.proto.tutorial.thrift.Person;
import com.twitter.data.proto.tutorial.thrift.PhoneNumber;
import com.twitter.elephantbird.thrift.test.TestListInMap;
import com.twitter.elephantbird.thrift.test.TestMapInList;

public class TestThriftToParquetFileWriter {
  private static final Log LOG = Log.getLog(TestThriftToParquetFileWriter.class);

  @Test
  public void testWriteFile() throws IOException, InterruptedException, TException {
    final AddressBook a = new AddressBook(
        Arrays.asList(
            new Person(
                new Name("Bob", "Roberts"),
                0,
                "bob.roberts@example.com",
                Arrays.asList(new PhoneNumber("1234567890")))));

    final Path fileToCreate = createFile(a);

    ParquetReader<Group> reader = createRecordReader(fileToCreate);

    Group g = null;
    int i = 0;
    while ((g = reader.read()) != null) {
      assertEquals(a.persons.size(),
          g.getFieldRepetitionCount("persons"));
      assertEquals(a.persons.get(0).email,
          g.getGroup("persons", 0).getGroup(0, 0).getString("email", 0));
      // just some sanity check, we're testing the various layers somewhere else
      ++i;
    }
    assertEquals("read 1 record", 1, i);
  }

  @Test
  public void testWriteStatistics() throws Exception {
    // create correct stats for small numbers
    IntStatistics intStatsSmall = new IntStatistics();
    intStatsSmall.setMinMax(2, 100);
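    // each expected Statistics object below mirrors the min/max of the fixture
    // rows written further down; the footer stats must match them exactly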
    LongStatistics longStatsSmall = new LongStatistics();
    longStatsSmall.setMinMax(-17L, 287L);
    DoubleStatistics doubleStatsSmall = new DoubleStatistics();
    doubleStatsSmall.setMinMax(-15.55d, 9.63d);
    BinaryStatistics binaryStatsSmall = new BinaryStatistics();
    binaryStatsSmall.setMinMax(Binary.fromString("as"), Binary.fromString("world"));
    BooleanStatistics boolStats = new BooleanStatistics();
    boolStats.setMinMax(false, true);

    // write rows to a file
    Path p = createFile(
        new RequiredPrimitiveFixture(false, (byte)32, (short)32, 2, 90L, -15.55d, "as"),
        new RequiredPrimitiveFixture(false, (byte)100, (short)100, 100, 287L, -9.0d, "world"),
        new RequiredPrimitiveFixture(true, (byte)2, (short)2, 9, -17L, 9.63d, "hello"));
    final Configuration configuration = new Configuration();
    final FileSystem fs = p.getFileSystem(configuration);
    FileStatus fileStatus = fs.getFileStatus(p);
    ParquetMetadata footer = ParquetFileReader.readFooter(configuration, p);
    for (BlockMetaData bmd : footer.getBlocks()) {
      for (ColumnChunkMetaData cmd : bmd.getColumns()) {
        switch (cmd.getType()) {
          case INT32:
            assertTrue(intStatsSmall.equals((IntStatistics) cmd.getStatistics()));
            break;
          case INT64:
            assertTrue(longStatsSmall.equals((LongStatistics) cmd.getStatistics()));
            break;
          case DOUBLE:
            assertTrue(doubleStatsSmall.equals((DoubleStatistics) cmd.getStatistics()));
            break;
          case BOOLEAN:
            assertTrue(boolStats.equals((BooleanStatistics) cmd.getStatistics()));
            break;
          case BINARY:
            // there is also info_string that has no statistics;
            // compare the column path by value, not by reference
            if ("[test_string]".equals(cmd.getPath().toString()))
              assertTrue(binaryStatsSmall.equals((BinaryStatistics) cmd.getStatistics()));
            break;
        }
      }
    }

    // create correct stats for large numbers
    IntStatistics intStatsLarge = new IntStatistics();
    intStatsLarge.setMinMax(-Integer.MAX_VALUE, Integer.MAX_VALUE);
    LongStatistics longStatsLarge = new LongStatistics();
    longStatsLarge.setMinMax(-Long.MAX_VALUE, Long.MAX_VALUE);
    DoubleStatistics doubleStatsLarge = new DoubleStatistics();
    doubleStatsLarge.setMinMax(-Double.MAX_VALUE, Double.MAX_VALUE);
    BinaryStatistics binaryStatsLarge = new BinaryStatistics();
    binaryStatsLarge.setMinMax(Binary.fromString("some small string"),
        Binary.fromString("some very large string here to test in this function"));

    // write rows to a file
    Path p_large = createFile(
        new RequiredPrimitiveFixture(false, (byte)2, (short)32, -Integer.MAX_VALUE,
            -Long.MAX_VALUE, -Double.MAX_VALUE, "some small string"),
        new RequiredPrimitiveFixture(false, (byte)100, (short)100, Integer.MAX_VALUE,
            Long.MAX_VALUE, Double.MAX_VALUE, "some very large string here to test in this function"),
        new RequiredPrimitiveFixture(true, (byte)2, (short)2, 9, -17L, 9.63d, "hello"));

    // make new configuration and create file with new large stats
    final Configuration configuration_large = new Configuration();
    final FileSystem fs_large = p_large.getFileSystem(configuration_large);
    FileStatus fileStatus_large = fs_large.getFileStatus(p_large);
    ParquetMetadata footer_large = ParquetFileReader.readFooter(configuration_large, p_large);
    for (BlockMetaData bmd : footer_large.getBlocks()) {
      for (ColumnChunkMetaData cmd : bmd.getColumns()) {
        switch (cmd.getType()) {
          case INT32:
            // testing the correct limits of an int32; there are also byte and short, tested earlier
            if ("[test_i32]".equals(cmd.getPath().toString()))
              assertTrue(intStatsLarge.equals((IntStatistics) cmd.getStatistics()));
            break;
          case INT64:
            assertTrue(longStatsLarge.equals((LongStatistics) cmd.getStatistics()));
            break;
          case DOUBLE:
            assertTrue(doubleStatsLarge.equals((DoubleStatistics) cmd.getStatistics()));
            break;
          case BOOLEAN:
            assertTrue(boolStats.equals((BooleanStatistics) cmd.getStatistics()));
            break;
          case BINARY:
            // there is also info_string that has no statistics;
            // compare the column path by value, not by reference
            if ("[test_string]".equals(cmd.getPath().toString()))
              assertTrue(binaryStatsLarge.equals((BinaryStatistics) cmd.getStatistics()));
            break;
        }
      }
    }
  }

  @Test
  public void testWriteFileListOfMap() throws IOException, InterruptedException, TException {
    Map<String, String> map1 = new HashMap<String, String>();
    map1.put("key11", "value11");
    map1.put("key12", "value12");
    Map<String, String> map2 = new HashMap<String, String>();
    map2.put("key21", "value21");
    final TestMapInList listMap = new TestMapInList("listmap",
        Arrays.asList(map1, map2));

    final Path fileToCreate = createFile(listMap);

    ParquetReader<Group> reader = createRecordReader(fileToCreate);

    Group g = null;
    while ((g = reader.read()) != null) {
      assertEquals(listMap.names.size(),
          g.getGroup("names", 0).getFieldRepetitionCount("names_tuple"));
      assertEquals(listMap.names.get(0).size(),
          g.getGroup("names", 0).getGroup("names_tuple", 0).getFieldRepetitionCount("map"));
      assertEquals(listMap.names.get(1).size(),
          g.getGroup("names", 0).getGroup("names_tuple", 1).getFieldRepetitionCount("map"));
    }
  }

  @Test
  public void testWriteFileMapOfList() throws IOException, InterruptedException, TException {
    Map<String, List<String>> map = new HashMap<String, List<String>>();
    map.put("key", Arrays.asList("val1", "val2"));
    final TestListInMap mapList = new TestListInMap("maplist", map);
    final Path fileToCreate = createFile(mapList);

    ParquetReader<Group> reader = createRecordReader(fileToCreate);

    Group g = null;
    while ((g = reader.read()) != null) {
      assertEquals("key",
          g.getGroup("names", 0).getGroup("map", 0).getBinary("key", 0).toStringUsingUTF8());
      assertEquals(map.get("key").size(),
          g.getGroup("names", 0).getGroup("map", 0).getGroup("value", 0).getFieldRepetitionCount(0));
    }
  }

  @Test
  public void testWriteFileMapOfLists() throws IOException, InterruptedException, TException {
    Map<List<String>, List<String>> map = new HashMap<List<String>, List<String>>();
    map.put(Arrays.asList("key1", "key2"), Arrays.asList("val1", "val2"));
    final TestListsInMap mapList = new TestListsInMap("maplists", map);
    final Path fileToCreate = createFile(mapList);

    ParquetReader<Group> reader = createRecordReader(fileToCreate);

    Group g = null;
    while ((g = reader.read()) != null) {
      assertEquals("key1",
          g.getGroup("names", 0).getGroup("map", 0).getGroup("key", 0).getBinary("key_tuple", 0).toStringUsingUTF8());
      assertEquals("key2",
          g.getGroup("names", 0).getGroup("map", 0).getGroup("key", 0).getBinary("key_tuple", 1).toStringUsingUTF8());
      assertEquals("val1",
          g.getGroup("names", 0).getGroup("map", 0).getGroup("value", 0).getBinary("value_tuple", 0).toStringUsingUTF8());
      assertEquals("val2",
          g.getGroup("names", 0).getGroup("map", 0).getGroup("value", 0).getBinary("value_tuple", 1).toStringUsingUTF8());
    }
  }

  private ParquetReader<Group> createRecordReader(Path parquetFilePath) throws IOException {
    Configuration configuration = new Configuration(true);

    GroupReadSupport readSupport = new GroupReadSupport();
    ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, parquetFilePath);
    MessageType schema = readFooter.getFileMetaData().getSchema();

    readSupport.init(configuration, null, schema);
    return new ParquetReader<Group>(parquetFilePath, readSupport);
  }
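  /**
   * Writes Thrift structs holding a list of maps and checks that the nested
   * repetition counts survive the round trip through Parquet.
   */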
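  /**
   * Serializes each given Thrift object with TCompactProtocol and writes the
   * resulting bytes to a fresh Parquet file under target/test, deleting any
   * previous file at that path, and returns the path written.
   */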
  private <T extends TBase<?, ?>> Path createFile(T... tObjs)
      throws IOException, InterruptedException, TException {
    final Path fileToCreate = new Path(
        "target/test/TestThriftToParquetFileWriter/" + tObjs[0].getClass() + ".parquet");
    LOG.info("File created: " + fileToCreate.toString());
    Configuration conf = new Configuration();
    final FileSystem fs = fileToCreate.getFileSystem(conf);
    if (fs.exists(fileToCreate)) {
      fs.delete(fileToCreate, true);
    }
    TProtocolFactory protocolFactory = new TCompactProtocol.Factory();
    TaskAttemptID taskId = new TaskAttemptID("local", 0, true, 0, 0);
    ThriftToParquetFileWriter w = new ThriftToParquetFileWriter(fileToCreate,
        ContextUtil.newTaskAttemptContext(conf, taskId), protocolFactory,
        (Class<? extends TBase<?, ?>>) tObjs[0].getClass());

    for (T tObj : tObjs) {
      final ByteArrayOutputStream baos = new ByteArrayOutputStream();
      final TProtocol protocol = protocolFactory.getProtocol(new TIOStreamTransport(baos));

      tObj.write(protocol);
      w.write(new BytesWritable(baos.toByteArray()));
    }
    w.close();

    return fileToCreate;
  }
}