/*
* Copyright 2015 OpenCB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencb.hpg.bigdata.tools.io.parquet;
import java.util.Objects;

import org.apache.avro.Schema;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.parquet.avro.AvroParquetOutputFormat;
import org.opencb.hpg.bigdata.tools.utils.CompressionUtils;
/**
* Created by hpccoll1 on 05/05/15.
*/
/**
 * MapReduce driver that converts Avro container files into Parquet files.
 *
 * <p>The job is map-only: each {@code ParquetMapper} reads Avro records
 * (keyed input via {@code AvroKeyInputFormat}) and writes them out through
 * {@code AvroParquetOutputFormat} using the same Avro schema.
 */
public class ParquetMR {

    /** Avro schema of both the input records and the Parquet output. */
    private final Schema schema;

    /**
     * Creates a driver for the given Avro schema.
     *
     * @param schema Avro schema of the input records; must not be {@code null}
     * @throws NullPointerException if {@code schema} is {@code null}
     */
    public ParquetMR(Schema schema) {
        // Fail fast here rather than with an obscure NPE inside AvroJob at job-setup time.
        this.schema = Objects.requireNonNull(schema, "schema must not be null");
    }

    /**
     * Configures and submits a map-only MapReduce job that reads Avro files
     * from {@code input} and writes compressed Parquet files to {@code output},
     * blocking until the job finishes.
     *
     * @param input     path (or glob) of the Avro input data
     * @param output    path for the Parquet output; must not already exist
     * @param codecName compression codec name, resolved via
     *                  {@code CompressionUtils.getParquetCodec}
     * @return 0 if the job completed successfully, 1 otherwise
     * @throws Exception if job configuration, submission, or execution fails
     */
    public int run(String input, String output, String codecName) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "ParquetMR");
        // Pin to this class (not getClass()) so a subclass cannot redirect
        // the job jar away from the one that actually contains this driver.
        job.setJarByClass(ParquetMR.class);

        // Input: Avro container files whose records match the configured schema.
        FileInputFormat.addInputPath(job, new Path(input));
        job.setInputFormatClass(AvroKeyInputFormat.class);
        AvroJob.setInputKeySchema(job, schema);

        // Output: Parquet files written with the same schema and the requested codec.
        job.setOutputFormatClass(AvroParquetOutputFormat.class);
        AvroParquetOutputFormat.setOutputPath(job, new Path(output));
        AvroParquetOutputFormat.setSchema(job, schema);
        AvroParquetOutputFormat.setCompression(job, CompressionUtils.getParquetCodec(codecName));
        AvroParquetOutputFormat.setCompressOutput(job, true);

        // Large Parquet block size (500 MiB) so each mapper's output is likely
        // to fit in a single row group.
        AvroParquetOutputFormat.setBlockSize(job, 500 * 1024 * 1024);

        // Map-only job: the mapper performs the Avro -> Parquet conversion.
        job.setMapperClass(ParquetMapper.class);
        job.setNumReduceTasks(0);

        return job.waitForCompletion(true) ? 0 : 1;
    }
}