/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package parquet.hadoop.thrift;
import static org.junit.Assert.assertEquals;
import java.io.ByteArrayOutputStream;
import java.util.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.thrift.TBase;
import org.apache.thrift.protocol.TCompactProtocol;
import org.apache.thrift.protocol.TProtocol;
import org.apache.thrift.protocol.TProtocolFactory;
import org.apache.thrift.transport.TIOStreamTransport;
import org.junit.Test;
import parquet.Log;
import parquet.hadoop.api.ReadSupport;
import parquet.hadoop.util.ContextUtil;
import com.twitter.data.proto.tutorial.thrift.AddressBook;
import com.twitter.data.proto.tutorial.thrift.Name;
import com.twitter.data.proto.tutorial.thrift.Person;
import com.twitter.data.proto.tutorial.thrift.PhoneNumber;
import parquet.thrift.test.*;
public class TestParquetToThriftReadWriteAndProjection {
private static final Log LOG = Log.getLog(TestParquetToThriftReadWriteAndProjection.class);
@Test
public void testThriftOptionalFieldsWithReadProjectionUsingParquetSchema() throws Exception {
// test with projection
Configuration conf = new Configuration();
final String readProjectionSchema = "message AddressBook {\n" +
" optional group persons {\n" +
" repeated group persons_tuple {\n" +
" required group name {\n" +
" optional binary first_name;\n" +
" optional binary last_name;\n" +
" }\n" +
" optional int32 id;\n" +
" }\n" +
" }\n" +
"}";
conf.set(ReadSupport.PARQUET_READ_SCHEMA, readProjectionSchema);
TBase toWrite=new AddressBook(
Arrays.asList(
new Person(
new Name("Bob", "Roberts"),
0,
"bob.roberts@example.com",
Arrays.asList(new PhoneNumber("1234567890")))));
TBase toRead=new AddressBook(
Arrays.asList(
new Person(
new Name("Bob", "Roberts"),
0,
null,
null)));
shouldDoProjection(conf,toWrite,toRead,AddressBook.class);
}
@Test
public void testPullingInRequiredStructWithFilter() throws Exception {
final String projectionFilterDesc = "persons/{id};persons/email";
TBase toWrite=new AddressBook(
Arrays.asList(
new Person(
new Name("Bob", "Roberts"),
0,
"bob.roberts@example.com",
Arrays.asList(new PhoneNumber("1234567890")))));
TBase toRead=new AddressBook(
Arrays.asList(
new Person(
new Name("", ""),
0,
"bob.roberts@example.com",
null)));
shouldDoProjectionWithThriftColumnFilter(projectionFilterDesc,toWrite,toRead,AddressBook.class);
}
@Test
public void testReorderdOptionalFields() throws Exception {
final String projectionFilter = "**";
StructWithReorderedOptionalFields toWrite = new StructWithReorderedOptionalFields();
toWrite.setFieldOne(1);
toWrite.setFieldTwo(2);
toWrite.setFieldThree(3);
shouldDoProjectionWithThriftColumnFilter(projectionFilter,toWrite,toWrite,StructWithReorderedOptionalFields.class);
}
@Test
public void testNotPullInOptionalFields() throws Exception {
final String projectionFilterDesc = "nomatch";
TBase toWrite=new AddressBook(
Arrays.asList(
new Person(
new Name("Bob", "Roberts"),
0,
"bob.roberts@example.com",
Arrays.asList(new PhoneNumber("1234567890")))));
TBase toRead=new AddressBook();
shouldDoProjectionWithThriftColumnFilter(projectionFilterDesc, toWrite, toRead,AddressBook.class);
}
@Test
public void testPullInRequiredMaps() throws Exception{
String filter="name";
Map<String,String> mapValue=new HashMap<String,String>();
mapValue.put("a","1");
mapValue.put("b","2");
RequiredMapFixture toWrite= new RequiredMapFixture(mapValue);
toWrite.setName("testName");
RequiredMapFixture toRead=new RequiredMapFixture(new HashMap<String,String>());
toRead.setName("testName");
shouldDoProjectionWithThriftColumnFilter(filter,toWrite,toRead,RequiredMapFixture.class);
}
@Test
public void testPullInRequiredLists() throws Exception{
String filter="info";
RequiredListFixture toWrite=new RequiredListFixture(Arrays.asList(new parquet.thrift.test.Name("first_name")));
toWrite.setInfo("test_info");
RequiredListFixture toRead=new RequiredListFixture(new ArrayList<parquet.thrift.test.Name>());
toRead.setInfo("test_info");
shouldDoProjectionWithThriftColumnFilter(filter,toWrite,toRead,RequiredListFixture.class);
}
@Test
public void testPullInRequiredSets() throws Exception{
String filter="info";
RequiredSetFixture toWrite=new RequiredSetFixture(new HashSet<parquet.thrift.test.Name>(Arrays.asList(new parquet.thrift.test.Name("first_name"))));
toWrite.setInfo("test_info");
RequiredSetFixture toRead=new RequiredSetFixture(new HashSet<parquet.thrift.test.Name>());
toRead.setInfo("test_info");
shouldDoProjectionWithThriftColumnFilter(filter,toWrite,toRead,RequiredSetFixture.class);
}
@Test
public void testPullInPrimitiveValues() throws Exception{
String filter="info_string";
RequiredPrimitiveFixture toWrite= new RequiredPrimitiveFixture(true,(byte)2,(short)3,4,(long)5,(double)6.0,"7");
toWrite.setInfo_string("it's info");
RequiredPrimitiveFixture toRead= new RequiredPrimitiveFixture(false,(byte)0,(short)0,0,(long)0,(double)0.0,"");
toRead.setInfo_string("it's info");
shouldDoProjectionWithThriftColumnFilter(filter,toWrite,toRead,RequiredPrimitiveFixture.class);
}
private void shouldDoProjectionWithThriftColumnFilter(String filterDesc,TBase toWrite, TBase toRead,Class<? extends TBase<?,?>> thriftClass) throws Exception {
Configuration conf = new Configuration();
conf.set(ThriftReadSupport.THRIFT_COLUMN_FILTER_KEY, filterDesc);
shouldDoProjection(conf,toWrite,toRead,thriftClass);
}
private <T extends TBase<?,?>> void shouldDoProjection(Configuration conf,T recordToWrite,T exptectedReadResult, Class<? extends TBase<?,?>> thriftClass) throws Exception {
final Path parquetFile = new Path("target/test/TestParquetToThriftReadWriteAndProjection/file.parquet");
final FileSystem fs = parquetFile.getFileSystem(conf);
if (fs.exists(parquetFile)) {
fs.delete(parquetFile, true);
}
//create a test file
final TProtocolFactory protocolFactory = new TCompactProtocol.Factory();
final TaskAttemptID taskId = new TaskAttemptID("local", 0, true, 0, 0);
final ThriftToParquetFileWriter w = new ThriftToParquetFileWriter(parquetFile, ContextUtil.newTaskAttemptContext(conf, taskId), protocolFactory, thriftClass);
final ByteArrayOutputStream baos = new ByteArrayOutputStream();
final TProtocol protocol = protocolFactory.getProtocol(new TIOStreamTransport(baos));
recordToWrite.write(protocol);
w.write(new BytesWritable(baos.toByteArray()));
w.close();
final ParquetThriftInputFormat<T> parquetThriftInputFormat = new ParquetThriftInputFormat<T>();
final Job job = new Job(conf, "read");
job.setInputFormatClass(ParquetThriftInputFormat.class);
ParquetThriftInputFormat.setInputPaths(job, parquetFile);
final JobID jobID = new JobID("local", 1);
List<InputSplit> splits = parquetThriftInputFormat.getSplits(ContextUtil.newJobContext(ContextUtil.getConfiguration(job), jobID));
T readValue = null;
for (InputSplit split : splits) {
TaskAttemptContext taskAttemptContext = ContextUtil.newTaskAttemptContext(ContextUtil.getConfiguration(job), new TaskAttemptID(new TaskID(jobID, true, 1), 0));
final RecordReader<Void, T> reader = parquetThriftInputFormat.createRecordReader(split, taskAttemptContext);
reader.initialize(split, taskAttemptContext);
if (reader.nextKeyValue()) {
readValue = reader.getCurrentValue();
LOG.info(readValue);
}
}
assertEquals(exptectedReadResult, readValue);
}
}