/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.io.orc;

import java.io.IOException;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.orc.FileMetadata;
import org.apache.orc.MemoryManager;
import org.apache.orc.PhysicalWriter;
import org.apache.orc.TypeDescription;
import org.apache.orc.impl.OrcTail;

/**
 * Contains factory methods to read or write ORC files.
 */
public final class OrcFile extends org.apache.orc.OrcFile {

  // unused
  protected OrcFile() {}

  /**
   * Create an ORC file reader.
   * @param fs file system
   * @param path file name to read from
   * @return a new ORC file reader.
   * @throws IOException
   */
  public static Reader createReader(FileSystem fs,
                                    Path path) throws IOException {
    ReaderOptions opts = new ReaderOptions(new Configuration());
    opts.filesystem(fs);
    return new ReaderImpl(path, opts);
  }

  public static class ReaderOptions
      extends org.apache.orc.OrcFile.ReaderOptions {
    public ReaderOptions(Configuration conf) {
      super(conf);
    }

    public ReaderOptions filesystem(FileSystem fs) {
      super.filesystem(fs);
      return this;
    }

    public ReaderOptions maxLength(long val) {
      super.maxLength(val);
      return this;
    }

    public ReaderOptions fileMetadata(FileMetadata metadata) {
      super.fileMetadata(metadata);
      return this;
    }

    public ReaderOptions orcTail(OrcTail orcTail) {
      super.orcTail(orcTail);
      return this;
    }
  }

  public static ReaderOptions readerOptions(Configuration conf) {
    return new ReaderOptions(conf);
  }

  public static Reader createReader(Path path,
                                    ReaderOptions options) throws IOException {
    return new ReaderImpl(path, options);
  }
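  /*
   * Usage sketch (illustrative only, not part of this class): opening a
   * reader with the factory methods above. The path is a hypothetical
   * example value.
   *
   *   Configuration conf = new Configuration();
   *   Path path = new Path("/tmp/example.orc");   // hypothetical path
   *   Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
   *   long rows = reader.getNumberOfRows();       // file-level metadata
   */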
  /**
   * Options for creating ORC file writers.
   */
  public static class WriterOptions extends org.apache.orc.OrcFile.WriterOptions {
    private boolean explicitSchema = false;
    private ObjectInspector inspector = null;
    // Setting the default batch size to 1000 makes the memory check at 5000
    // rows work the same as the row by row writer. (If it was the default 1024,
    // the smallest stripe size would be 5120 rows, which changes the output
    // of some of the tests.)
    private int batchSize = 1000;

    WriterOptions(Properties tableProperties, Configuration conf) {
      super(tableProperties, conf);
    }

    /**
     * A required option that sets the object inspector for the rows. If
     * setSchema is not called, it also defines the schema.
     */
    public WriterOptions inspector(ObjectInspector value) {
      this.inspector = value;
      if (!explicitSchema) {
        super.setSchema(OrcInputFormat.convertTypeInfo(
            TypeInfoUtils.getTypeInfoFromObjectInspector(value)));
      }
      return this;
    }

    /**
     * Set the schema for the file. This is a required parameter.
     * @param schema the schema for the file.
     * @return this
     */
    public WriterOptions setSchema(TypeDescription schema) {
      this.explicitSchema = true;
      super.setSchema(schema);
      return this;
    }

    /**
     * Provide the filesystem for the path, if the client has it available.
     * If it is not provided, it will be found from the path.
     */
    public WriterOptions fileSystem(FileSystem value) {
      super.fileSystem(value);
      return this;
    }

    /**
     * Set the stripe size for the file. The writer stores the contents of the
     * stripe in memory until this memory limit is reached, at which point the
     * stripe is flushed to the HDFS file and the next stripe is started.
     */
    public WriterOptions stripeSize(long value) {
      super.stripeSize(value);
      return this;
    }

    /**
     * Set the file system block size for the file. For optimal performance,
     * set the block size to be a multiple of the stripe size.
     */
    public WriterOptions blockSize(long value) {
      super.blockSize(value);
      return this;
    }

    /**
     * Set the distance between entries in the row index. The minimum value is
     * 1000 to prevent the index from overwhelming the data. If the stride is
     * set to 0, no indexes will be included in the file.
     */
    public WriterOptions rowIndexStride(int value) {
      super.rowIndexStride(value);
      return this;
    }

    /**
     * The size of the memory buffers used for compressing and storing the
     * stripe in memory.
     */
    public WriterOptions bufferSize(int value) {
      super.bufferSize(value);
      return this;
    }

    /**
     * Sets whether the HDFS blocks are padded to prevent stripes from
     * straddling blocks. Padding improves locality and thus the speed of
     * reading, but costs space.
     */
    public WriterOptions blockPadding(boolean value) {
      super.blockPadding(value);
      return this;
    }

    /**
     * Sets the encoding strategy that is used to encode the data.
     */
    public WriterOptions encodingStrategy(EncodingStrategy strategy) {
      super.encodingStrategy(strategy);
      return this;
    }

    /**
     * Sets the tolerance for block padding as a percentage of stripe size.
     */
    public WriterOptions paddingTolerance(double value) {
      super.paddingTolerance(value);
      return this;
    }

    /**
     * Comma-separated list of column names for which bloom filters are to be
     * created.
     */
    public WriterOptions bloomFilterColumns(String columns) {
      super.bloomFilterColumns(columns);
      return this;
    }

    /**
     * Specify the false positive probability for the bloom filter.
     * @param fpp - false positive probability
     * @return this
     */
    public WriterOptions bloomFilterFpp(double fpp) {
      super.bloomFilterFpp(fpp);
      return this;
    }

    /**
     * Sets the generic compression that is used to compress the data.
     */
    public WriterOptions compress(CompressionKind value) {
      super.compress(value.getUnderlying());
      return this;
    }

    /**
     * Sets the generic compression that is used to compress the data.
     */
    public WriterOptions compress(org.apache.orc.CompressionKind value) {
      super.compress(value);
      return this;
    }

    /**
     * Sets the version of the file that will be written.
     */
    public WriterOptions version(Version value) {
      super.version(value);
      return this;
    }
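    /*
     * Usage sketch (illustrative only): a typical chained configuration of
     * these options. The schema string and sizes are hypothetical example
     * values, not defaults.
     *
     *   WriterOptions opts = OrcFile.writerOptions(conf)
     *       .setSchema(TypeDescription.fromString("struct<x:int,y:string>"))
     *       .stripeSize(64L * 1024 * 1024)                  // 64 MB stripes
     *       .compress(org.apache.orc.CompressionKind.ZLIB)
     *       .rowIndexStride(10000);
     */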
    /**
     * Add a listener for when the stripe and file are about to be closed.
     * @param callback the object to be called when the stripe is closed
     * @return this
     */
    public WriterOptions callback(WriterCallback callback) {
      super.callback(callback);
      return this;
    }

    /**
     * A package local option to set the memory manager.
     */
    public WriterOptions memory(MemoryManager value) {
      super.memory(value);
      return this;
    }

    protected WriterOptions batchSize(int maxSize) {
      batchSize = maxSize;
      return this;
    }

    public WriterOptions physicalWriter(PhysicalWriter writer) {
      super.physicalWriter(writer);
      return this;
    }

    ObjectInspector getInspector() {
      return inspector;
    }

    int getBatchSize() {
      return batchSize;
    }
  }

  /**
   * Create a set of writer options based on a configuration.
   * @param conf the configuration to use for values
   * @return a WriterOptions object that can be modified
   */
  public static WriterOptions writerOptions(Configuration conf) {
    return new WriterOptions(null, conf);
  }

  /**
   * Create a set of write options based on a set of table properties and
   * configuration.
   * @param tableProperties the properties of the table
   * @param conf the configuration of the query
   * @return a WriterOptions object that can be modified
   */
  public static WriterOptions writerOptions(Properties tableProperties,
                                            Configuration conf) {
    return new WriterOptions(tableProperties, conf);
  }

  /**
   * Create an ORC file writer. This is the public interface for creating
   * writers going forward and new options will only be added to this method.
   * @param path filename to write to
   * @param opts the options
   * @return a new ORC file writer
   * @throws IOException
   */
  public static Writer createWriter(Path path,
                                    WriterOptions opts) throws IOException {
    FileSystem fs = opts.getFileSystem() == null ?
        path.getFileSystem(opts.getConfiguration()) : opts.getFileSystem();
    return new WriterImpl(fs, path, opts);
  }

  /**
   * Create an ORC file writer. This method is provided for API backward
   * compatibility with Hive 0.11.
   * @param fs file system
   * @param path filename to write to
   * @param conf the configuration for the writer
   * @param inspector the ObjectInspector that inspects the rows
   * @param stripeSize the number of bytes in a stripe
   * @param compress how to compress the file
   * @param bufferSize the number of bytes to compress at once
   * @param rowIndexStride the number of rows between row index entries or
   *                       0 to suppress all indexes
   * @return a new ORC file writer
   * @throws IOException
   */
  public static Writer createWriter(FileSystem fs,
                                    Path path,
                                    Configuration conf,
                                    ObjectInspector inspector,
                                    long stripeSize,
                                    CompressionKind compress,
                                    int bufferSize,
                                    int rowIndexStride) throws IOException {
    return createWriter(path,
        writerOptions(conf)
            .inspector(inspector)
            .fileSystem(fs)
            .stripeSize(stripeSize)
            .compress(compress)
            .bufferSize(bufferSize)
            .rowIndexStride(rowIndexStride));
  }
}
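/*
 * End-to-end usage sketch (illustrative only): writing rows through the
 * Hive 0.11-compatible factory method above. The ObjectInspector would
 * normally come from the table's SerDe and is assumed to exist here; the
 * path and sizes are hypothetical example values.
 *
 *   Writer writer = OrcFile.createWriter(fs, new Path("/tmp/example.orc"),
 *       conf, inspector, 256L * 1024 * 1024, CompressionKind.ZLIB,
 *       256 * 1024, 10000);
 *   writer.addRow(row);   // repeated once per row
 *   writer.close();       // flushes the final stripe and writes the footer
 */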