/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.io.orc;

import java.io.IOException;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.orc.FileMetadata;
import org.apache.orc.MemoryManager;
import org.apache.orc.PhysicalWriter;
import org.apache.orc.TypeDescription;
import org.apache.orc.impl.OrcTail;

/**
 * Contains factory methods to read or write ORC files.
 */
public final class OrcFile extends org.apache.orc.OrcFile {

  // unused
  protected OrcFile() {}

  /**
   * Create an ORC file reader.
   * @param fs file system
   * @param path file name to read from
   * @return a new ORC file reader.
   * @throws IOException
   */
  public static Reader createReader(FileSystem fs,
                                    Path path) throws IOException {
    ReaderOptions opts = new ReaderOptions(new Configuration());
    opts.filesystem(fs);
    return new ReaderImpl(path, opts);
  }

  public static class ReaderOptions
      extends org.apache.orc.OrcFile.ReaderOptions {
    public ReaderOptions(Configuration conf) {
      super(conf);
    }

    public ReaderOptions filesystem(FileSystem fs) {
      super.filesystem(fs);
      return this;
    }

    public ReaderOptions maxLength(long val) {
      super.maxLength(val);
      return this;
    }

    public ReaderOptions fileMetadata(FileMetadata metadata) {
      super.fileMetadata(metadata);
      return this;
    }

    public ReaderOptions orcTail(OrcTail orcTail) {
      super.orcTail(orcTail);
      return this;
    }
  }

  public static ReaderOptions readerOptions(Configuration conf) {
    return new ReaderOptions(conf);
  }

  public static Reader createReader(Path path,
                                    ReaderOptions options) throws IOException {
    return new ReaderImpl(path, options);
  }
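  /*
   * Usage sketch (illustrative only, not part of this class): opening a
   * reader with the factory methods above. The path is a hypothetical
   * example value.
   *
   *   Configuration conf = new Configuration();
   *   Path path = new Path("/tmp/example.orc");   // hypothetical path
   *   Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
   *   long rows = reader.getNumberOfRows();       // file-level metadata
   */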
  /**
   * Options for creating ORC file writers.
   */
  public static class WriterOptions extends org.apache.orc.OrcFile.WriterOptions {
    private boolean explicitSchema = false;
    private ObjectInspector inspector = null;
    // Setting the default batch size to 1000 makes the memory check at 5000
    // rows work the same as the row by row writer. (If it was the default 1024,
    // the smallest stripe size would be 5120 rows, which changes the output
    // of some of the tests.)
    private int batchSize = 1000;

    WriterOptions(Properties tableProperties, Configuration conf) {
      super(tableProperties, conf);
    }

    /**
     * A required option that sets the object inspector for the rows. If
     * setSchema is not called, it also defines the schema.
     */
    public WriterOptions inspector(ObjectInspector value) {
      this.inspector = value;
      if (!explicitSchema) {
        super.setSchema(OrcInputFormat.convertTypeInfo(
            TypeInfoUtils.getTypeInfoFromObjectInspector(value)));
      }
      return this;
    }

    /**
     * Set the schema for the file. This is a required parameter.
     * @param schema the schema for the file.
     * @return this
     */
    public WriterOptions setSchema(TypeDescription schema) {
      this.explicitSchema = true;
      super.setSchema(schema);
      return this;
    }

    /**
     * Provide the filesystem for the path, if the client has it available.
     * If it is not provided, it will be found from the path.
     */
    public WriterOptions fileSystem(FileSystem value) {
      super.fileSystem(value);
      return this;
    }

    /**
     * Set the stripe size for the file. The writer stores the contents of the
     * stripe in memory until this memory limit is reached, at which point the
     * stripe is flushed to the HDFS file and the next stripe is started.
     */
    public WriterOptions stripeSize(long value) {
      super.stripeSize(value);
      return this;
    }

    /**
     * Set the file system block size for the file. For optimal performance,
     * set the block size to be a multiple of the stripe size.
     */
    public WriterOptions blockSize(long value) {
      super.blockSize(value);
      return this;
    }

    /**
     * Set the distance between entries in the row index. The minimum value is
     * 1000 to prevent the index from overwhelming the data. If the stride is
     * set to 0, no indexes will be included in the file.
     */
    public WriterOptions rowIndexStride(int value) {
      super.rowIndexStride(value);
      return this;
    }

    /**
     * The size of the memory buffers used for compressing and storing the
     * stripe in memory.
     */
    public WriterOptions bufferSize(int value) {
      super.bufferSize(value);
      return this;
    }

    /**
     * Sets whether the HDFS blocks are padded to prevent stripes from
     * straddling blocks. Padding improves locality and thus the speed of
     * reading, but costs space.
     */
    public WriterOptions blockPadding(boolean value) {
      super.blockPadding(value);
      return this;
    }

    /**
     * Sets the encoding strategy that is used to encode the data.
     */
    public WriterOptions encodingStrategy(EncodingStrategy strategy) {
      super.encodingStrategy(strategy);
      return this;
    }

    /**
     * Sets the tolerance for block padding as a percentage of stripe size.
     */
    public WriterOptions paddingTolerance(double value) {
      super.paddingTolerance(value);
      return this;
    }

    /**
     * Comma-separated list of column names for which bloom filters are to be
     * created.
     */
    public WriterOptions bloomFilterColumns(String columns) {
      super.bloomFilterColumns(columns);
      return this;
    }

    /**
     * Specify the false positive probability for the bloom filter.
     * @param fpp - false positive probability
     * @return this
     */
    public WriterOptions bloomFilterFpp(double fpp) {
      super.bloomFilterFpp(fpp);
      return this;
    }

    /**
     * Sets the generic compression that is used to compress the data.
     */
    public WriterOptions compress(CompressionKind value) {
      super.compress(value.getUnderlying());
      return this;
    }

    /**
     * Sets the generic compression that is used to compress the data.
     */
    public WriterOptions compress(org.apache.orc.CompressionKind value) {
      super.compress(value);
      return this;
    }

    /**
     * Sets the version of the file that will be written.
     */
    public WriterOptions version(Version value) {
      super.version(value);
      return this;
    }
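    /*
     * Usage sketch (illustrative only): a typical chained configuration of
     * these options. The schema string and sizes are hypothetical example
     * values, not defaults.
     *
     *   WriterOptions opts = OrcFile.writerOptions(conf)
     *       .setSchema(TypeDescription.fromString("struct<x:int,y:string>"))
     *       .stripeSize(64L * 1024 * 1024)                  // 64 MB stripes
     *       .compress(org.apache.orc.CompressionKind.ZLIB)
     *       .rowIndexStride(10000);
     */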
    /**
     * Add a listener for when the stripe and file are about to be closed.
     * @param callback the object to be called when the stripe is closed
     * @return this
     */
    public WriterOptions callback(WriterCallback callback) {
      super.callback(callback);
      return this;
    }

    /**
     * A package local option to set the memory manager.
     */
    public WriterOptions memory(MemoryManager value) {
      super.memory(value);
      return this;
    }

    protected WriterOptions batchSize(int maxSize) {
      batchSize = maxSize;
      return this;
    }

    public WriterOptions physicalWriter(PhysicalWriter writer) {
      super.physicalWriter(writer);
      return this;
    }

    ObjectInspector getInspector() {
      return inspector;
    }

    int getBatchSize() {
      return batchSize;
    }
  }

  /**
   * Create a set of writer options based on a configuration.
   * @param conf the configuration to use for values
   * @return a WriterOptions object that can be modified
   */
  public static WriterOptions writerOptions(Configuration conf) {
    return new WriterOptions(null, conf);
  }

  /**
   * Create a set of write options based on a set of table properties and
   * configuration.
   * @param tableProperties the properties of the table
   * @param conf the configuration of the query
   * @return a WriterOptions object that can be modified
   */
  public static WriterOptions writerOptions(Properties tableProperties,
                                            Configuration conf) {
    return new WriterOptions(tableProperties, conf);
  }

  /**
   * Create an ORC file writer. This is the public interface for creating
   * writers going forward and new options will only be added to this method.
   * @param path filename to write to
   * @param opts the options
   * @return a new ORC file writer
   * @throws IOException
   */
  public static Writer createWriter(Path path,
                                    WriterOptions opts) throws IOException {
    FileSystem fs = opts.getFileSystem() == null ?
        path.getFileSystem(opts.getConfiguration()) : opts.getFileSystem();
    return new WriterImpl(fs, path, opts);
  }

  /**
   * Create an ORC file writer. This method is provided for API backward
   * compatibility with Hive 0.11.
   * @param fs file system
   * @param path filename to write to
   * @param conf the configuration for the writer
   * @param inspector the ObjectInspector that inspects the rows
   * @param stripeSize the number of bytes in a stripe
   * @param compress how to compress the file
   * @param bufferSize the number of bytes to compress at once
   * @param rowIndexStride the number of rows between row index entries or
   *                       0 to suppress all indexes
   * @return a new ORC file writer
   * @throws IOException
   */
  public static Writer createWriter(FileSystem fs,
                                    Path path,
                                    Configuration conf,
                                    ObjectInspector inspector,
                                    long stripeSize,
                                    CompressionKind compress,
                                    int bufferSize,
                                    int rowIndexStride) throws IOException {
    return createWriter(path,
        writerOptions(conf)
            .inspector(inspector)
            .fileSystem(fs)
            .stripeSize(stripeSize)
            .compress(compress)
            .bufferSize(bufferSize)
            .rowIndexStride(rowIndexStride));
  }
}
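/*
 * End-to-end usage sketch (illustrative only): writing rows through the
 * Hive 0.11-compatible factory method above. The ObjectInspector would
 * normally come from the table's SerDe and is assumed to exist here; the
 * path and sizes are hypothetical example values.
 *
 *   Writer writer = OrcFile.createWriter(fs, new Path("/tmp/example.orc"),
 *       conf, inspector, 256L * 1024 * 1024, CompressionKind.ZLIB,
 *       256 * 1024, 10000);
 *   writer.addRow(row);   // repeated once per row
 *   writer.close();       // flushes the final stripe and writes the footer
 */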