/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.parse.spark;

import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.optimizer.spark.SparkPartitionPruningSinkDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.serde2.Serializer;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.annotations.VisibleForTesting;

/**
 * This operator gets partition info from the upstream operators and writes it
 * to HDFS. The file is later read at the driver and used to prune the
 * partitions of the big-table side.
 */
public class SparkPartitionPruningSinkOperator extends Operator<SparkPartitionPruningSinkDesc> {

  @SuppressWarnings("deprecation")
  protected transient Serializer serializer;
  protected transient DataOutputBuffer buffer;
  protected static final Logger LOG =
      LoggerFactory.getLogger(SparkPartitionPruningSinkOperator.class);

  /** Kryo ctor. */
  @VisibleForTesting
  public SparkPartitionPruningSinkOperator() {
    super();
  }

  public SparkPartitionPruningSinkOperator(CompilationOpContext ctx) {
    super(ctx);
  }

  @Override
  @SuppressWarnings("deprecation")
  public void initializeOp(Configuration hconf) throws HiveException {
    super.initializeOp(hconf);
    // The table's serde implements both Serializer and Deserializer, so the
    // instance obtained via the deserializer class can serialize rows too.
    serializer = (Serializer) ReflectionUtils.newInstance(
        conf.getTable().getDeserializerClass(), null);
    buffer = new DataOutputBuffer();
  }

  @Override
  public void process(Object row, int tag) throws HiveException {
    ObjectInspector rowInspector = inputObjInspectors[0];
    try {
      // Serialize the row and append its bytes to the in-memory buffer;
      // the buffer is flushed to HDFS when the operator closes.
      Writable writableRow = serializer.serialize(row, rowInspector);
      writableRow.write(buffer);
    } catch (Exception e) {
      throw new HiveException(e);
    }
  }

  @Override
  public void closeOp(boolean abort) throws HiveException {
    if (!abort) {
      try {
        flushToFile();
      } catch (Exception e) {
        throw new HiveException(e);
      }
    }
  }

  private void flushToFile() throws IOException {
    // Write an intermediate file to the specified path.
    // The format of the path is: tmpPath/targetWorkId/sourceWorkId/randInt
    Path path = conf.getPath();
    FileSystem fs = path.getFileSystem(this.getConfiguration());
    fs.mkdirs(path);

    // Pick a random file name under the directory that doesn't exist yet.
    while (true) {
      path = new Path(path, String.valueOf(Utilities.randGen.nextInt()));
      if (!fs.exists(path)) {
        break;
      }
    }

    short numOfRepl = fs.getDefaultReplication(path);

    ObjectOutputStream out = null;
    FSDataOutputStream fsout = null;

    try {
      fsout = fs.create(path, numOfRepl);
      out = new ObjectOutputStream(new BufferedOutputStream(fsout, 4096));
      // The target column name comes first, followed by the raw bytes of the
      // rows serialized in process().
      out.writeUTF(conf.getTargetColumnName());
      buffer.writeTo(out);
    } catch (Exception e) {
      // Best-effort cleanup of the partial file before propagating the error.
      try {
        fs.delete(path, false);
      } catch (Exception ex) {
        LOG.warn("Exception happened while trying to clean partial file.", ex);
      }
      throw e;
    } finally {
      if (out != null) {
        LOG.info("Flushed to file: " + path);
        out.close();
      } else if (fsout != null) {
        fsout.close();
      }
    }
  }

  @Override
  public OperatorType getType() {
    return OperatorType.SPARKPRUNINGSINK;
  }

  @Override
  public String getName() {
    return SparkPartitionPruningSinkOperator.getOperatorName();
  }

  public static String getOperatorName() {
    return "SPARKPRUNINGSINK";
  }
}
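
/*
 * Illustrative sketch (not part of this operator): how a driver-side consumer
 * could read back the file produced by flushToFile() above. In Hive on Spark
 * the production reader is SparkDynamicPartitionPruner; the class and method
 * names below are hypothetical and only demonstrate the on-disk layout, which
 * is a writeUTF'd target column name followed by the concatenated bytes of
 * the serialized rows.
 *
 *   import java.io.BufferedInputStream;
 *   import java.io.EOFException;
 *   import java.io.ObjectInputStream;
 *
 *   class PruningFileReaderSketch {
 *     static void read(FileSystem fs, Path file, Writable reusableRow)
 *         throws Exception {
 *       try (ObjectInputStream in =
 *           new ObjectInputStream(new BufferedInputStream(fs.open(file)))) {
 *         // Written first by flushToFile() via writeUTF().
 *         String targetColumn = in.readUTF();
 *         while (true) {
 *           try {
 *             // Each row was appended with Writable.write(); read it back
 *             // with the matching readFields(). ObjectInputStream implements
 *             // DataInput, so it can be passed in directly.
 *             reusableRow.readFields(in);
 *           } catch (EOFException eof) {
 *             break;  // no row count is written, so EOF marks the end
 *           }
 *           // ... deserialize reusableRow with the table's serde and collect
 *           // the pruning value for the column named by targetColumn ...
 *         }
 *       }
 *     }
 *   }
 */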