/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.hbase;

import java.io.IOException;
import java.io.InterruptedIOException;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.commons.lang.NotImplementedException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellComparator;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.KeyValueUtil;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter;
import org.apache.hadoop.hive.ql.io.HiveOutputFormat;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Progressable;

/**
 * HiveHFileOutputFormat implements HiveOutputFormat for HFile bulk
 * loading.  Until HBASE-1861 is implemented, it can only be used
 * for loading a table with a single column family.
 */
public class HiveHFileOutputFormat extends
    HFileOutputFormat implements
    HiveOutputFormat<ImmutableBytesWritable, KeyValue> {

  public static final String HFILE_FAMILY_PATH = "hfile.family.path";

  static final Logger LOG = LoggerFactory.getLogger(HiveHFileOutputFormat.class.getName());

  private
  org.apache.hadoop.mapreduce.RecordWriter<ImmutableBytesWritable, KeyValue>
  getFileWriter(org.apache.hadoop.mapreduce.TaskAttemptContext tac)
  throws IOException {
    try {
      return super.getRecordWriter(tac);
    } catch (InterruptedException ex) {
      throw new IOException(ex);
    }
  }

  /**
   * Retrieve the family path, first check the JobConf, then the table properties.
   * @return the family path or null if not specified.
   */
  public static String getFamilyPath(Configuration jc, Properties tableProps) {
    return jc.get(HFILE_FAMILY_PATH, tableProps.getProperty(HFILE_FAMILY_PATH));
  }

  @Override
  public RecordWriter getHiveRecordWriter(
    final JobConf jc,
    final Path finalOutPath,
    Class<? extends Writable> valueClass,
    boolean isCompressed,
    Properties tableProperties,
    final Progressable progressable) throws IOException {

    // Read configuration for the target path, first from jobconf, then from table properties
    String hfilePath = getFamilyPath(jc, tableProperties);
    if (hfilePath == null) {
      throw new RuntimeException(
        "Please set " + HFILE_FAMILY_PATH + " to target location for HFiles");
    }

    // Target path's last component is also the column family name.
    final Path columnFamilyPath = new Path(hfilePath);
    final String columnFamilyName = columnFamilyPath.getName();
    final byte [] columnFamilyNameBytes = Bytes.toBytes(columnFamilyName);
    final Job job = new Job(jc);
    setCompressOutput(job, isCompressed);
    setOutputPath(job, finalOutPath);

    // Create the HFile writer
    final org.apache.hadoop.mapreduce.TaskAttemptContext tac =
      ShimLoader.getHadoopShims().newTaskAttemptContext(
          job.getConfiguration(), progressable);

    final Path outputdir = FileOutputFormat.getOutputPath(tac);
    final Path taskAttemptOutputdir = new FileOutputCommitter(outputdir, tac).getWorkPath();
    final org.apache.hadoop.mapreduce.RecordWriter<
      ImmutableBytesWritable, KeyValue> fileWriter = getFileWriter(tac);

    // Individual columns are going to be pivoted to HBase cells,
    // and for each row, they need to be written out in order
    // of column name, so sort the column names now, creating a
    // mapping to their column position.  However, the first
    // column is interpreted as the row key.
    String columnList = tableProperties.getProperty("columns");
    String [] columnArray = columnList.split(",");
    final SortedMap<byte [], Integer> columnMap =
      new TreeMap<byte [], Integer>(Bytes.BYTES_COMPARATOR);
    int i = 0;
    for (String columnName : columnArray) {
      if (i != 0) {
        columnMap.put(Bytes.toBytes(columnName), i);
      }
      ++i;
    }

    return new RecordWriter() {

      @Override
      public void close(boolean abort) throws IOException {
        try {
          fileWriter.close(null);
          if (abort) {
            return;
          }
          // Move the hfiles file(s) from the task output directory to the
          // location specified by the user.
          FileSystem fs = outputdir.getFileSystem(jc);
          fs.mkdirs(columnFamilyPath);
          Path srcDir = taskAttemptOutputdir;
          for (;;) {
            FileStatus [] files = fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER);
            if ((files == null) || (files.length == 0)) {
              throw new IOException("No family directories found in " + srcDir);
            }
            if (files.length != 1) {
              throw new IOException("Multiple family directories found in " + srcDir);
            }
            srcDir = files[0].getPath();
            if (srcDir.getName().equals(columnFamilyName)) {
              break;
            }
            if (files[0].isFile()) {
              throw new IOException("No family directories found in " + taskAttemptOutputdir + ". "
                  + "The last component in hfile path should match column family name "
                  + columnFamilyName);
            }
          }
          for (FileStatus regionFile : fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER)) {
            fs.rename(
              regionFile.getPath(),
              new Path(
                columnFamilyPath,
                regionFile.getPath().getName()));
          }
          // Hive actually wants a file as task output (not a directory), so
          // replace the empty directory with an empty file to keep it happy.
          fs.delete(taskAttemptOutputdir, true);
          fs.createNewFile(taskAttemptOutputdir);
        } catch (InterruptedException ex) {
          throw new IOException(ex);
        }
      }

      private void writeText(Text text) throws IOException {
        // Decompose the incoming text row into fields.
        String s = text.toString();
        String [] fields = s.split("\u0001");
        assert(fields.length <= (columnMap.size() + 1));
        // First field is the row key.
        byte [] rowKeyBytes = Bytes.toBytes(fields[0]);
        // Remaining fields are cells addressed by column name within row.
        for (Map.Entry<byte [], Integer> entry : columnMap.entrySet()) {
          byte [] columnNameBytes = entry.getKey();
          int iColumn = entry.getValue();
          String val;
          if (iColumn >= fields.length) {
            // trailing blank field
            val = "";
          } else {
            val = fields[iColumn];
            if ("\\N".equals(val)) {
              // omit nulls
              continue;
            }
          }
          byte [] valBytes = Bytes.toBytes(val);
          KeyValue kv = new KeyValue(
            rowKeyBytes,
            columnFamilyNameBytes,
            columnNameBytes,
            valBytes);
          try {
            fileWriter.write(null, kv);
          } catch (IOException e) {
            LOG.error("Failed while writing row: " + s);
            throw e;
          } catch (InterruptedException ex) {
            throw new IOException(ex);
          }
        }
      }

      private void writePut(PutWritable put) throws IOException {
        ImmutableBytesWritable row = new ImmutableBytesWritable(put.getPut().getRow());
        SortedMap<byte[], List<Cell>> cells = put.getPut().getFamilyCellMap();
        for (Map.Entry<byte[], List<Cell>> entry : cells.entrySet()) {
          Collections.sort(entry.getValue(), new CellComparator());
          for (Cell c : entry.getValue()) {
            try {
              fileWriter.write(row, KeyValueUtil.copyToNewKeyValue(c));
            } catch (InterruptedException e) {
              throw (InterruptedIOException) new InterruptedIOException().initCause(e);
            }
          }
        }
      }

      @Override
      public void write(Writable w) throws IOException {
        if (w instanceof Text) {
          writeText((Text) w);
        } else if (w instanceof PutWritable) {
          writePut((PutWritable) w);
        } else {
          throw new IOException("Unexpected writable " + w);
        }
      }
    };
  }

  @Override
  public void checkOutputSpecs(FileSystem ignored, JobConf jc) throws IOException {
    // delegate to the new api
    Job job = new Job(jc);
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);

    checkOutputSpecs(jobContext);
  }

  @Override
  public org.apache.hadoop.mapred.RecordWriter<ImmutableBytesWritable, KeyValue> getRecordWriter(
      FileSystem ignored, JobConf job, String name, Progressable progress) throws IOException {
    throw new NotImplementedException("This will not be invoked");
  }
}