/*******************************************************************************
 * Copyright 2017 Capital One Services, LLC and Bitwise, Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License
 *******************************************************************************/
package hydrograph.engine.cascading.scheme.hive.parquet;

import cascading.flow.FlowProcess;
import cascading.scheme.Scheme;
import cascading.scheme.SinkCall;
import cascading.scheme.SourceCall;
import cascading.tap.Tap;
import cascading.tap.hive.HiveTableDescriptor;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
import org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriteSupport;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import parquet.hadoop.ParquetInputFormat;
import parquet.hadoop.ParquetOutputFormat;
import parquet.hadoop.mapred.Container;
import parquet.hadoop.mapred.DeprecatedParquetInputFormat;
import parquet.hadoop.mapred.DeprecatedParquetOutputFormat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Cascading {@link Scheme} that reads and writes Hive-managed Parquet data
 * through the Hive Parquet SerDe, using the deprecated mapred Parquet
 * input/output formats.
 */
@SuppressWarnings("rawtypes")
public class HiveParquetScheme extends Scheme<JobConf, RecordReader, OutputCollector, Object[], Object[]> {

    private static final long serialVersionUID = 9178222334142050386L;
    private static final Logger LOG = LoggerFactory.getLogger(HiveParquetScheme.class);

    private HiveTableDescriptor hiveTableDescriptor;

    public HiveParquetScheme(HiveTableDescriptor hiveTableDescriptor) {
        // super(new Fields(hiveTableDescriptor.getColumnNames()), new Fields(
        // hiveTableDescriptor.getColumnNames()));
        this.hiveTableDescriptor = hiveTableDescriptor;
    }

    @Override
    public void sourcePrepare(FlowProcess<? extends JobConf> flowProcess,
            SourceCall<Object[], RecordReader> sourceCall) throws IOException {
        // Pre-create the key/value containers once per task rather than per record.
        sourceCall.setContext(new Object[3]);
        sourceCall.getContext()[0] = sourceCall.getInput().createKey();
        sourceCall.getContext()[1] = sourceCall.getInput().createValue();
    }

    @Override
    public void sourceConfInit(FlowProcess<? extends JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap,
            JobConf jobConf) {
        jobConf.setInputFormat(DeprecatedParquetInputFormat.class);
        jobConf.set("hive.parquet.timestamp.skip.conversion", "false");
        ParquetInputFormat.setReadSupportClass(jobConf, DataWritableReadSupport.class);
    }

    @SuppressWarnings("unchecked")
    @Override
    public boolean source(FlowProcess<? extends JobConf> fp, SourceCall<Object[], RecordReader> sc)
            throws IOException {
        Container<ArrayWritable> value = (Container<ArrayWritable>) sc.getInput().createValue();
        boolean hasNext = sc.getInput().next(null, value);
        if (!hasNext) {
            return false;
        }
        // Skip null rows without ending the source.
        if (value == null) {
            return true;
        }
        Tuple tuple = WritableFactory.getTuple(value.get());
        sc.getIncomingEntry().setTuple(tuple);
        return true;
    }

    @Override
    public void sinkConfInit(FlowProcess<? extends JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap,
            JobConf jobConf) {
        jobConf.setOutputFormat(DeprecatedParquetOutputFormat.class);
        // Publish the Hive table schema so DataWritableWriteSupport can build the Parquet message type.
        jobConf.set(DataWritableWriteSupport.PARQUET_HIVE_SCHEMA,
                HiveParquetSchemeHelper.getParquetSchemeMessage(hiveTableDescriptor));
        ParquetOutputFormat.setWriteSupportClass(jobConf, DataWritableWriteSupport.class);
    }

    @SuppressWarnings("unchecked")
    @Override
    public void sink(FlowProcess<? extends JobConf> fp, SinkCall<Object[], OutputCollector> sink) throws IOException {
        TupleEntry entry = sink.getOutgoingEntry();
        OutputCollector outputCollector = sink.getOutput();
        Tuple tuple = entry.getTuple();

        // Copy tuple values into a list so they can be wrapped as a Hive struct.
        List<Object> obj = new ArrayList<Object>();
        for (int i = 0; i < tuple.size(); i++) {
            obj.add(tuple.getObject(i));
        }

        ArrayWritable writable;
        try {
            // Serialize the row through the Hive Parquet SerDe so field types match the table definition.
            ParquetHiveSerDe serde = new ParquetHiveSerDe();
            Configuration conf = new Configuration();
            serde.initialize(conf, HiveParquetSchemeHelper.getTableProperties(hiveTableDescriptor));
            ObjectInspector io = serde.getObjectInspector();
            writable = ParquetWritableUtils.createStruct(obj, (StructObjectInspector) io);
            Writable parRow = serde.serialize(writable, io);
            outputCollector.collect(null, parRow);
        } catch (SerDeException e) {
            LOG.error("Failed to serialize tuple to Parquet row", e);
            throw new RuntimeException(e);
        }
    }
}
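// Illustrative usage sketch (not part of the original source): wiring this scheme
// into a cascading-hive tap. The table name, column layout, and HiveTap wiring
// below are assumptions for demonstration only, based on the cascading-hive API
// this class already imports.
//
//   HiveTableDescriptor descriptor = new HiveTableDescriptor("employee",
//           new String[] { "id", "name" }, new String[] { "int", "string" });
//   HiveParquetScheme scheme = new HiveParquetScheme(descriptor);
//   cascading.tap.hive.HiveTap hiveTap = new cascading.tap.hive.HiveTap(descriptor, scheme);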