/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.io.orc;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.TreeMap;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.ValidReadTxnList;
import org.apache.hadoop.hive.common.ValidTxnList;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.metastore.Metastore;
import org.apache.hadoop.hive.metastore.Metastore.SplitInfos;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedInputFormatInterface;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
import org.apache.hadoop.hive.ql.io.AcidInputFormat;
import org.apache.hadoop.hive.ql.io.AcidOutputFormat;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.AcidUtils.AcidBaseFileInfo;
import org.apache.hadoop.hive.ql.io.AcidUtils.AcidOperationalProperties;
import org.apache.hadoop.hive.ql.io.AcidUtils.Directory;
import org.apache.hadoop.hive.ql.io.AcidUtils.ParsedDelta;
import org.apache.hadoop.hive.ql.io.BatchToRowInputFormat;
import org.apache.hadoop.hive.ql.io.BatchToRowReader;
import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
import org.apache.hadoop.hive.ql.io.HdfsUtils;
import org.apache.hadoop.hive.ql.io.HiveInputFormat;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.InputFormatChecker;
import org.apache.hadoop.hive.ql.io.LlapWrappableInputFormatInterface;
import org.apache.hadoop.hive.ql.io.RecordIdentifier;
import org.apache.hadoop.hive.ql.io.SelfDescribingInputFormatInterface;
import org.apache.hadoop.hive.ql.io.StatsProvidingRecordReader;
import org.apache.hadoop.hive.ql.io.SyntheticFileId;
import org.apache.hadoop.hive.ql.io.orc.ExternalCache.ExternalFooterCachesByConf;
import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg;
import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
import org.apache.hadoop.hive.shims.HadoopShims;
import org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.StringUtils;
import org.apache.hive.common.util.Ref;
import org.apache.orc.ColumnStatistics;
import org.apache.orc.OrcProto;
import org.apache.orc.OrcUtils;
import org.apache.orc.StripeInformation;
import org.apache.orc.StripeStatistics;
import org.apache.orc.TypeDescription;
import org.apache.orc.impl.InStream;
import org.apache.orc.impl.OrcTail;
import org.apache.orc.impl.SchemaEvolution;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.google.protobuf.CodedInputStream;
/**
* A MapReduce/Hive input format for ORC files.
* <p>
* This class implements both the classic InputFormat, which stores the rows
* directly, and AcidInputFormat, which stores a series of events with the
* following schema:
* <pre>
 * class AcidEvent&lt;ROW&gt; {
 *   enum ACTION {INSERT, UPDATE, DELETE}
 *   ACTION operation;
 *   long originalTransaction;
 *   int bucket;
 *   long rowId;
 *   long currentTransaction;
 *   ROW row;
 * }
* </pre>
 * Each AcidEvent object corresponds to a single event on a row. The
 * originalTransaction, bucket, and rowId are the unique identifier for the row.
* The operation and currentTransaction are the operation and the transaction
* that added this event. Insert and update events include the entire row, while
* delete events have null for row.
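 * <p>
 * For example (an illustrative sketch of the logical content, not the physical
 * encoding): a delete, applied by transaction 7, of the row that transaction 1
 * wrote into bucket 0 with rowId 5 would carry
 * {DELETE, originalTransaction=1, bucket=0, rowId=5, currentTransaction=7, row=null}.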
*/
public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>,
InputFormatChecker, VectorizedInputFormatInterface, LlapWrappableInputFormatInterface,
SelfDescribingInputFormatInterface, AcidInputFormat<NullWritable, OrcStruct>,
CombineHiveInputFormat.AvoidSplitCombination, BatchToRowInputFormat {
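  /**
   * How splits are generated: BI creates splits from block boundaries without
   * reading or caching file footers; ETL reads (and caches) footers so it can
   * split on stripe boundaries; HYBRID picks between the two per directory
   * based on file count and size heuristics.
   */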
static enum SplitStrategyKind {
HYBRID,
BI,
ETL
}
private static final Logger LOG = LoggerFactory.getLogger(OrcInputFormat.class);
private static final boolean isDebugEnabled = LOG.isDebugEnabled();
static final HadoopShims SHIMS = ShimLoader.getHadoopShims();
private static final long DEFAULT_MIN_SPLIT_SIZE = 16 * 1024 * 1024;
private static final long DEFAULT_MAX_SPLIT_SIZE = 256 * 1024 * 1024;
private static final int DEFAULT_ETL_FILE_THRESHOLD = 100;
  /**
   * When picking the hosts for a split that crosses block boundaries,
   * drop any host that has fewer than MIN_INCLUDED_LOCATION (80%) of the
   * bytes available on the host with the most.
   * For example, if host1 has 10MB of the split, host2 has 20MB, and host3 has
   * 18MB, the split will contain host2 (100% of host2) and host3 (90% of
   * host2). Host1, with 50%, will be dropped.
   */
private static final double MIN_INCLUDED_LOCATION = 0.80;
@Override
public boolean shouldSkipCombine(Path path,
Configuration conf) throws IOException {
return (conf.get(AcidUtils.CONF_ACID_KEY) != null) || AcidUtils.isAcid(path, conf);
}
  /**
   * We can derive whether a split is ACID or not from the flags encoded in OrcSplit.
   * If the file split is not an instance of OrcSplit, then it is definitely not ACID.
   * If the file split is an instance of OrcSplit and the flags contain hasBase or deltas,
   * then it is definitely ACID.
   * Otherwise, fall back to the configuration object/table property.
   * @param conf the job configuration
   * @param inputSplit the split being read
   * @return true if the split must be read as ACID
   */
public boolean isAcidRead(Configuration conf, InputSplit inputSplit) {
if (!(inputSplit instanceof OrcSplit)) {
return false;
}
/*
* If OrcSplit.isAcid returns true, we know for sure it is ACID.
*/
// if (((OrcSplit) inputSplit).isAcid()) {
// return true;
// }
/*
* Fallback for the case when OrcSplit flags do not contain hasBase and deltas
*/
return HiveConf.getBoolVar(conf, ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN);
}
private static class OrcRecordReader
implements org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct>,
StatsProvidingRecordReader {
private final RecordReader reader;
private final long offset;
private final long length;
private final int numColumns;
private float progress = 0.0f;
private final Reader file;
private final SerDeStats stats;
OrcRecordReader(Reader file, Configuration conf,
FileSplit split) throws IOException {
List<OrcProto.Type> types = file.getTypes();
this.file = file;
numColumns = (types.size() == 0) ? 0 : types.get(0).getSubtypesCount();
this.offset = split.getStart();
this.length = split.getLength();
this.reader = createReaderFromFile(file, conf, offset, length);
this.stats = new SerDeStats();
}
@Override
public boolean next(NullWritable key, OrcStruct value) throws IOException {
if (reader.hasNext()) {
reader.next(value);
progress = reader.getProgress();
return true;
} else {
return false;
}
}
@Override
public NullWritable createKey() {
return NullWritable.get();
}
@Override
public OrcStruct createValue() {
return new OrcStruct(numColumns);
}
@Override
public long getPos() throws IOException {
return offset + (long) (progress * length);
}
@Override
public void close() throws IOException {
reader.close();
}
@Override
public float getProgress() throws IOException {
return progress;
}
@Override
public SerDeStats getStats() {
stats.setRawDataSize(file.getRawDataSize());
stats.setRowCount(file.getNumberOfRows());
return stats;
}
}
/**
* Get the root column for the row. In ACID format files, it is offset by
* the extra metadata columns.
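   * <p>
   * For example, with the event schema shown in the class javadoc, the user row
   * is the last of the six event fields (OrcRecordUpdater.ROW == 5), so in the
   * flattened ORC type tree getRootColumn(false) returns 6, while
   * getRootColumn(true) returns 0.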
* @param isOriginal is the file in the original format?
* @return the column number for the root of row.
*/
public static int getRootColumn(boolean isOriginal) {
return isOriginal ? 0 : (OrcRecordUpdater.ROW + 1);
}
public static void raiseAcidTablesMustBeReadWithAcidReaderException(Configuration conf)
throws IOException {
String hiveInputFormat = HiveConf.getVar(conf, ConfVars.HIVEINPUTFORMAT);
if (hiveInputFormat.equals(HiveInputFormat.class.getName())) {
throw new IOException(ErrorMsg.ACID_TABLES_MUST_BE_READ_WITH_ACID_READER.getErrorCodedMsg());
} else {
throw new IOException(ErrorMsg.ACID_TABLES_MUST_BE_READ_WITH_HIVEINPUTFORMAT.getErrorCodedMsg());
}
}
public static RecordReader createReaderFromFile(Reader file,
Configuration conf,
long offset, long length
) throws IOException {
boolean isTransactionalTableScan = HiveConf.getBoolVar(conf, ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN);
if (isTransactionalTableScan) {
raiseAcidTablesMustBeReadWithAcidReaderException(conf);
}
    // Do we have schema on read in the configuration variables?
TypeDescription schema = getDesiredRowTypeDescr(conf, false, Integer.MAX_VALUE);
Reader.Options options = new Reader.Options().range(offset, length);
options.schema(schema);
boolean isOriginal = isOriginal(file);
if (schema == null) {
schema = file.getSchema();
}
List<OrcProto.Type> types = OrcUtils.getOrcTypes(schema);
options.include(genIncludedColumns(schema, conf));
setSearchArgument(options, types, conf, isOriginal);
return file.rowsOptions(options);
}
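  /**
   * A file is "original" (pre-ACID layout) if it does not carry the ACID key
   * index metadata that OrcRecordUpdater writes into the file footer.
   */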
public static boolean isOriginal(Reader file) {
return !file.hasMetadataValue(OrcRecordUpdater.ACID_KEY_INDEX_NAME);
}
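  /**
   * Compute the ORC file-level include array from top-level column numbers.
   * <p>
   * For example (a hypothetical schema, for illustration only): for reader schema
   * struct&lt;a:int,b:struct&lt;c:int,d:int&gt;&gt; the flattened ids are root=0,
   * a=1, b=2, c=3, d=4; with included = [1] (the top-level index of b), the
   * result is [true, false, true, true, true], i.e. the root plus b and all of
   * its children.
   * </p>
   * @param readerSchema the reader schema
   * @param included the top-level column numbers to include, or null for all
   * @return a flag per flattened ORC column id
   */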
public static boolean[] genIncludedColumns(TypeDescription readerSchema,
List<Integer> included) {
boolean[] result = new boolean[readerSchema.getMaximumId() + 1];
if (included == null) {
Arrays.fill(result, true);
return result;
}
result[0] = true;
List<TypeDescription> children = readerSchema.getChildren();
for (int columnNumber = 0; columnNumber < children.size(); ++columnNumber) {
if (included.contains(columnNumber)) {
TypeDescription child = children.get(columnNumber);
for(int col = child.getId(); col <= child.getMaximumId(); ++col) {
result[col] = true;
}
}
}
return result;
}
/**
   * Reverses genIncludedColumns; produces the table column indexes from ORC included columns.
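   * <p>
   * For example (the same hypothetical schema used above, for illustration only):
   * with reader schema struct&lt;a:int,b:struct&lt;c:int,d:int&gt;&gt; and
   * included = [true, false, true, true, true] (only b and its children marked),
   * the result is [1], i.e. the top-level index of column b.
   * </p>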
* @param readerSchema The ORC reader schema for the table.
* @param included The included ORC columns.
   * @param isFullColumnMatch Whether full column match should be enforced (i.e. whether to expect
   *        that all the sub-columns of a complex type column are included or excluded
   *        together in the included array). If false, any sub-column being included for a complex
   *        type is sufficient for the entire complex column to be included in the result.
* @return The list of table column indexes.
*/
public static List<Integer> genIncludedColumnsReverse(
TypeDescription readerSchema, boolean[] included, boolean isFullColumnMatch) {
assert included != null;
List<Integer> result = new ArrayList<>();
List<TypeDescription> children = readerSchema.getChildren();
for (int columnNumber = 0; columnNumber < children.size(); ++columnNumber) {
TypeDescription child = children.get(columnNumber);
int id = child.getId();
int maxId = child.getMaximumId();
if (id >= included.length || maxId >= included.length) {
throw new AssertionError("Inconsistent includes: " + included.length
+ " elements; found column ID " + id);
}
boolean isIncluded = included[id];
for (int col = id + 1; col <= maxId; ++col) {
if (isFullColumnMatch && included[col] != isIncluded) {
throw new AssertionError("Inconsistent includes: root column IDs are [" + id + ", "
            + maxId + "]; included[" + col + "] = " + included[col] + ", which is different"
            + " from the previous IDs of the same root column.");
}
isIncluded = isIncluded || included[col];
}
if (isIncluded) {
result.add(columnNumber);
}
}
return result;
}
/**
* Take the configuration and figure out which columns we need to include.
* @param readerSchema the types for the reader
* @param conf the configuration
*/
public static boolean[] genIncludedColumns(TypeDescription readerSchema,
Configuration conf) {
if (!ColumnProjectionUtils.isReadAllColumns(conf)) {
List<Integer> included = ColumnProjectionUtils.getReadColumnIDs(conf);
return genIncludedColumns(readerSchema, included);
} else {
return null;
}
}
public static String[] getSargColumnNames(String[] originalColumnNames,
List<OrcProto.Type> types, boolean[] includedColumns, boolean isOriginal) {
int rootColumn = getRootColumn(isOriginal);
String[] columnNames = new String[types.size() - rootColumn];
int i = 0;
    // This works as follows: originalColumnNames is the equivalent of getNeededColumns
    // from TSOP. They are assumed to be in the same order as the columns in the ORC file, AND
    // they are assumed to be equivalent to the columns in includedColumns (because both were
    // generated from the same column list at some point in the past), minus the subtype columns.
    // Therefore, when we go through all the top-level ORC file columns that are included, in
    // order, they match originalColumnNames. This way, we do not depend on names stored inside
    // ORC for SARG leaf column name resolution (see the mapSargColumns method).
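    // For example (hypothetical, with isOriginal == true so rootColumn == 0): for a file with
    // top-level columns {1: a, 2: b, 3: c}, includedColumns marking only a and c, and
    // originalColumnNames == ["a", "c"], the result is [null, "a", null, "c"], i.e. SARG
    // column names indexed by top-level column id.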
for(int columnId: types.get(rootColumn).getSubtypesList()) {
if (includedColumns == null || includedColumns[columnId - rootColumn]) {
// this is guaranteed to be positive because types only have children
// ids greater than their own id.
columnNames[columnId - rootColumn] = originalColumnNames[i++];
}
}
return columnNames;
}
static void setSearchArgument(Reader.Options options,
List<OrcProto.Type> types,
Configuration conf,
boolean isOriginal) {
String neededColumnNames = getNeededColumnNamesString(conf);
if (neededColumnNames == null) {
LOG.debug("No ORC pushdown predicate - no column names");
options.searchArgument(null, null);
return;
}
SearchArgument sarg = ConvertAstToSearchArg.createFromConf(conf);
if (sarg == null) {
LOG.debug("No ORC pushdown predicate");
options.searchArgument(null, null);
return;
}
if (LOG.isInfoEnabled()) {
LOG.info("ORC pushdown predicate: " + sarg);
}
options.searchArgument(sarg, getSargColumnNames(
neededColumnNames.split(","), types, options.getInclude(), isOriginal));
}
static boolean canCreateSargFromConf(Configuration conf) {
if (getNeededColumnNamesString(conf) == null) {
LOG.debug("No ORC pushdown predicate - no column names");
return false;
}
if (!ConvertAstToSearchArg.canCreateFromConf(conf)) {
LOG.debug("No ORC pushdown predicate");
return false;
}
return true;
}
private static String[] extractNeededColNames(
List<OrcProto.Type> types, Configuration conf, boolean[] include, boolean isOriginal) {
String colNames = getNeededColumnNamesString(conf);
if (colNames == null) {
return null;
}
return extractNeededColNames(types, colNames, include, isOriginal);
}
private static String[] extractNeededColNames(
List<OrcProto.Type> types, String columnNamesString, boolean[] include, boolean isOriginal) {
return getSargColumnNames(columnNamesString.split(","), types, include, isOriginal);
}
static String getNeededColumnNamesString(Configuration conf) {
return conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR);
}
static String getSargColumnIDsString(Configuration conf) {
return conf.getBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, true) ? null
: conf.get(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR);
}
@Override
public boolean validateInput(FileSystem fs, HiveConf conf,
List<FileStatus> files
) throws IOException {
if (Utilities.getUseVectorizedInputFileFormat(conf)) {
return new VectorizedOrcInputFormat().validateInput(fs, conf, files);
}
if (files.size() <= 0) {
return false;
}
for (FileStatus file : files) {
// 0 length files cannot be ORC files
if (file.getLen() == 0) {
return false;
}
try {
OrcFile.createReader(file.getPath(),
OrcFile.readerOptions(conf).filesystem(fs).maxLength(file.getLen()));
} catch (IOException e) {
return false;
}
}
return true;
}
/**
* Get the list of input {@link Path}s for the map-reduce job.
*
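   * For example, "mapred.input.dir" set to "/warehouse/t/p1,/warehouse/t/p2" yields two
   * paths; a comma inside a path must be escaped as "\," since the value is split with
   * StringUtils.split and unescaped with StringUtils.unEscapeString.
   *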
* @param conf The configuration of the job
* @return the list of input {@link Path}s for the map-reduce job.
*/
static Path[] getInputPaths(Configuration conf) throws IOException {
String dirs = conf.get("mapred.input.dir");
if (dirs == null) {
throw new IOException("Configuration mapred.input.dir is not defined.");
}
String [] list = StringUtils.split(dirs);
Path[] result = new Path[list.length];
for (int i = 0; i < list.length; i++) {
result[i] = new Path(StringUtils.unEscapeString(list[i]));
}
return result;
}
/**
* The global information about the split generation that we pass around to
* the different worker threads.
*/
static class Context {
private final Configuration conf;
// We store all caches in variables to change the main one based on config.
// This is not thread safe between different split generations (and wasn't anyway).
private FooterCache footerCache;
private static LocalCache localCache;
private static ExternalCache metaCache;
static ExecutorService threadPool = null;
private final int numBuckets;
private final int splitStrategyBatchMs;
private final long maxSize;
private final long minSize;
private final int etlFileThreshold;
private final boolean footerInSplits;
private final boolean cacheStripeDetails;
private final boolean forceThreadpool;
private final AtomicInteger cacheHitCounter = new AtomicInteger(0);
private final AtomicInteger numFilesCounter = new AtomicInteger(0);
private final ValidTxnList transactionList;
private SplitStrategyKind splitStrategyKind;
private final SearchArgument sarg;
private final AcidOperationalProperties acidOperationalProperties;
Context(Configuration conf) throws IOException {
this(conf, 1, null);
}
Context(Configuration conf, final int minSplits) throws IOException {
this(conf, minSplits, null);
}
@VisibleForTesting
Context(Configuration conf, final int minSplits, ExternalFooterCachesByConf efc)
throws IOException {
this.conf = conf;
this.forceThreadpool = HiveConf.getBoolVar(conf, ConfVars.HIVE_IN_TEST);
this.sarg = ConvertAstToSearchArg.createFromConf(conf);
minSize = HiveConf.getLongVar(conf, ConfVars.MAPREDMINSPLITSIZE, DEFAULT_MIN_SPLIT_SIZE);
maxSize = HiveConf.getLongVar(conf, ConfVars.MAPREDMAXSPLITSIZE, DEFAULT_MAX_SPLIT_SIZE);
String ss = conf.get(ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname);
if (ss == null || ss.equals(SplitStrategyKind.HYBRID.name())) {
splitStrategyKind = SplitStrategyKind.HYBRID;
} else {
LOG.info("Enforcing " + ss + " ORC split strategy");
splitStrategyKind = SplitStrategyKind.valueOf(ss);
}
footerInSplits = HiveConf.getBoolVar(conf,
ConfVars.HIVE_ORC_INCLUDE_FILE_FOOTER_IN_SPLITS);
numBuckets =
Math.max(conf.getInt(hive_metastoreConstants.BUCKET_COUNT, 0), 0);
splitStrategyBatchMs = HiveConf.getIntVar(conf, ConfVars.HIVE_ORC_SPLIT_DIRECTORY_BATCH_MS);
LOG.debug("Number of buckets specified by conf file is " + numBuckets);
long cacheMemSize = HiveConf.getSizeVar(
conf, ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_MEMORY_SIZE);
int numThreads = HiveConf.getIntVar(conf, ConfVars.HIVE_ORC_COMPUTE_SPLITS_NUM_THREADS);
boolean useSoftReference = HiveConf.getBoolVar(
conf, ConfVars.HIVE_ORC_CACHE_USE_SOFT_REFERENCES);
cacheStripeDetails = (cacheMemSize > 0);
this.etlFileThreshold = minSplits <= 0 ? DEFAULT_ETL_FILE_THRESHOLD : minSplits;
synchronized (Context.class) {
if (threadPool == null) {
threadPool = Executors.newFixedThreadPool(numThreads,
new ThreadFactoryBuilder().setDaemon(true)
.setNameFormat("ORC_GET_SPLITS #%d").build());
}
// TODO: local cache is created once, so the configs for future queries will not be honored.
if (cacheStripeDetails) {
// Note that there's no FS check here; we implicitly only use metastore cache for
// HDFS, because only HDFS would return fileIds for us. If fileId is extended using
// size/mod time/etc. for other FSes, we might need to check FSes explicitly because
// using such an aggregate fileId cache is not bulletproof and should be disable-able.
boolean useExternalCache = HiveConf.getBoolVar(
conf, HiveConf.ConfVars.HIVE_ORC_MS_FOOTER_CACHE_ENABLED);
if (useExternalCache) {
if (LOG.isDebugEnabled()) {
LOG.debug(
"Turning off hive.orc.splits.ms.footer.cache.enabled since it is not fully supported yet");
}
useExternalCache = false;
}
if (localCache == null) {
localCache = new LocalCache(numThreads, cacheMemSize, useSoftReference);
}
if (useExternalCache) {
if (metaCache == null) {
metaCache = new ExternalCache(localCache,
efc == null ? new MetastoreExternalCachesByConf() : efc);
}
assert conf instanceof HiveConf;
metaCache.configure((HiveConf)conf);
}
// Set footer cache for current split generation. See field comment - not thread safe.
// TODO: we should be able to enable caches separately
footerCache = useExternalCache ? metaCache : localCache;
}
}
String value = conf.get(ValidTxnList.VALID_TXNS_KEY);
transactionList = value == null ? new ValidReadTxnList() : new ValidReadTxnList(value);
      // Determine the transactional_properties of the table from the job conf stored in context.
      // The table properties are copied to the job conf at HiveInputFormat::addSplitsForGroup(),
      // and therefore we should be able to retrieve them here and determine appropriate behavior.
      // Note that this will be meaningless for non-ACID tables and will be set to null.
boolean isTableTransactional = conf.getBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, false);
String transactionalProperties = conf.get(hive_metastoreConstants.TABLE_TRANSACTIONAL_PROPERTIES);
this.acidOperationalProperties = isTableTransactional ?
AcidOperationalProperties.parseString(transactionalProperties) : null;
}
@VisibleForTesting
static int getCurrentThreadPoolSize() {
synchronized (Context.class) {
return (threadPool instanceof ThreadPoolExecutor)
? ((ThreadPoolExecutor)threadPool).getPoolSize() : ((threadPool == null) ? 0 : -1);
}
}
@VisibleForTesting
public static void resetThreadPool() {
synchronized (Context.class) {
threadPool = null;
}
}
@VisibleForTesting
public static void clearLocalCache() {
if (localCache == null) return;
localCache.clear();
}
}
/**
* The full ACID directory information needed for splits; no more calls to HDFS needed.
* We could just live with AcidUtils.Directory but...
* 1) That doesn't have base files for the base-directory case.
* 2) We save fs for convenience to avoid getting it twice.
*/
@VisibleForTesting
static final class AcidDirInfo {
public AcidDirInfo(FileSystem fs, Path splitPath, Directory acidInfo,
List<AcidBaseFileInfo> baseFiles,
List<ParsedDelta> parsedDeltas) {
this.splitPath = splitPath;
this.acidInfo = acidInfo;
this.baseFiles = baseFiles;
this.fs = fs;
this.parsedDeltas = parsedDeltas;
}
final FileSystem fs;
final Path splitPath;
final AcidUtils.Directory acidInfo;
final List<AcidBaseFileInfo> baseFiles;
final List<ParsedDelta> parsedDeltas;
}
@VisibleForTesting
interface SplitStrategy<T> {
List<T> getSplits() throws IOException;
}
@VisibleForTesting
static final class SplitInfo extends ACIDSplitStrategy {
private final Context context;
private final FileSystem fs;
private final HdfsFileStatusWithId fileWithId;
private final OrcTail orcTail;
private final List<OrcProto.Type> readerTypes;
private final boolean isOriginal;
private final List<DeltaMetaData> deltas;
private final boolean hasBase;
private final ByteBuffer ppdResult;
SplitInfo(Context context, FileSystem fs, HdfsFileStatusWithId fileWithId, OrcTail orcTail,
List<OrcProto.Type> readerTypes, boolean isOriginal, List<DeltaMetaData> deltas,
boolean hasBase, Path dir, boolean[] covered, ByteBuffer ppdResult) throws IOException {
super(dir, context.numBuckets, deltas, covered, context.acidOperationalProperties);
this.context = context;
this.fs = fs;
this.fileWithId = fileWithId;
this.orcTail = orcTail;
this.readerTypes = readerTypes;
this.isOriginal = isOriginal;
this.deltas = deltas;
this.hasBase = hasBase;
this.ppdResult = ppdResult;
}
@VisibleForTesting
public SplitInfo(Context context, FileSystem fs, FileStatus fileStatus, OrcTail orcTail,
List<OrcProto.Type> readerTypes, boolean isOriginal, ArrayList<DeltaMetaData> deltas,
boolean hasBase, Path dir, boolean[] covered) throws IOException {
this(context, fs, AcidUtils.createOriginalObj(null, fileStatus),
orcTail, readerTypes, isOriginal, deltas, hasBase, dir, covered, null);
}
}
  /**
   * The ETL strategy is used when spending a little more time in split generation is acceptable
   * (split generation reads and caches file footers).
   */
static final class ETLSplitStrategy implements SplitStrategy<SplitInfo>, Callable<Void> {
private static final int ETL_COMBINE_FILE_LIMIT = 500;
private static class ETLDir {
public ETLDir(Path dir, FileSystem fs, int fileCount) {
this.dir = dir;
this.fs = fs;
this.fileCount = fileCount;
}
private final int fileCount;
private final Path dir;
private final FileSystem fs;
}
Context context;
final List<ETLDir> dirs;
List<HdfsFileStatusWithId> files;
private final List<DeltaMetaData> deltas;
private final boolean[] covered;
final boolean isOriginal;
final List<OrcProto.Type> readerTypes;
// References to external fields for async SplitInfo generation.
private List<Future<List<OrcSplit>>> splitFuturesRef = null;
private List<OrcSplit> splitsRef = null;
private final UserGroupInformation ugi;
private final boolean allowSyntheticFileIds;
public ETLSplitStrategy(Context context, FileSystem fs, Path dir,
List<HdfsFileStatusWithId> children, List<OrcProto.Type> readerTypes, boolean isOriginal,
List<DeltaMetaData> deltas, boolean[] covered, UserGroupInformation ugi, boolean allowSyntheticFileIds) {
assert !children.isEmpty();
this.context = context;
this.dirs = Lists.newArrayList(new ETLDir(dir, fs, children.size()));
this.files = children;
this.isOriginal = isOriginal;
this.readerTypes = readerTypes;
this.deltas = deltas;
this.covered = covered;
this.ugi = ugi;
this.allowSyntheticFileIds = allowSyntheticFileIds;
}
@Override
public List<SplitInfo> getSplits() throws IOException {
List<SplitInfo> result = new ArrayList<>(files.size());
// Force local cache if we have deltas.
FooterCache cache = context.cacheStripeDetails ? ((deltas == null || deltas.isEmpty())
? context.footerCache : Context.localCache) : null;
if (cache != null) {
OrcTail[] orcTails = new OrcTail[files.size()];
ByteBuffer[] ppdResults = null;
if (cache.hasPpd()) {
ppdResults = new ByteBuffer[files.size()];
}
try {
cache.getAndValidate(files, isOriginal, orcTails, ppdResults);
} catch (HiveException e) {
throw new IOException(e);
}
int dirIx = -1, fileInDirIx = -1, filesInDirCount = 0;
ETLDir dir = null;
for (int i = 0; i < files.size(); ++i) {
if ((++fileInDirIx) == filesInDirCount) {
dir = dirs.get(++dirIx);
filesInDirCount = dir.fileCount;
}
OrcTail orcTail = orcTails[i];
ByteBuffer ppdResult = ppdResults == null ? null : ppdResults[i];
HdfsFileStatusWithId file = files.get(i);
if (orcTail != null) {
// Cached copy is valid
context.cacheHitCounter.incrementAndGet();
}
// Ignore files eliminated by PPD, or of 0 length.
if (ppdResult != FooterCache.NO_SPLIT_AFTER_PPD && file.getFileStatus().getLen() > 0) {
result.add(new SplitInfo(context, dir.fs, file, orcTail, readerTypes,
isOriginal, deltas, true, dir.dir, covered, ppdResult));
}
}
} else {
int dirIx = -1, fileInDirIx = -1, filesInDirCount = 0;
ETLDir dir = null;
for (HdfsFileStatusWithId file : files) {
if ((++fileInDirIx) == filesInDirCount) {
dir = dirs.get(++dirIx);
filesInDirCount = dir.fileCount;
}
// ignore files of 0 length
if (file.getFileStatus().getLen() > 0) {
result.add(new SplitInfo(context, dir.fs, file, null, readerTypes,
isOriginal, deltas, true, dir.dir, covered, null));
}
}
}
return result;
}
@Override
public String toString() {
if (dirs.size() == 1) {
return ETLSplitStrategy.class.getSimpleName() + " strategy for " + dirs.get(0).dir;
} else {
StringBuilder sb = new StringBuilder(ETLSplitStrategy.class.getSimpleName()
+ " strategy for ");
boolean isFirst = true;
for (ETLDir dir : dirs) {
if (!isFirst) sb.append(", ");
isFirst = false;
sb.append(dir.dir);
}
return sb.toString();
}
}
enum CombineResult {
YES, // Combined, all good.
NO_AND_CONTINUE, // Don't combine with that, but may combine with others.
      NO_AND_SWAP // Don't combine with that, and make that a base for new combines.
      // We may add NO_AND_STOP in the future, where combining is impossible and the other
      // should not become a base.
}
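    // A sketch of the protocol with illustrative numbers: if this strategy already covers
    // 400 files and another directory brings 200 more, the 500-file ETL_COMBINE_FILE_LIMIT
    // is exceeded; since this side is larger, combineWith returns NO_AND_SWAP so the caller
    // makes the new, smaller strategy the base for future combines. Had this side been the
    // smaller one, NO_AND_CONTINUE would keep the current base.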
public CombineResult combineWith(FileSystem fs, Path dir,
List<HdfsFileStatusWithId> otherFiles, boolean isOriginal) {
if ((files.size() + otherFiles.size()) > ETL_COMBINE_FILE_LIMIT
|| this.isOriginal != isOriginal) {
return (files.size() > otherFiles.size())
? CombineResult.NO_AND_SWAP : CombineResult.NO_AND_CONTINUE;
}
// All good, combine the base/original only ETL strategies.
files.addAll(otherFiles);
dirs.add(new ETLDir(dir, fs, otherFiles.size()));
return CombineResult.YES;
}
public Future<Void> generateSplitWork(Context context,
List<Future<List<OrcSplit>>> splitFutures, List<OrcSplit> splits) throws IOException {
if ((context.cacheStripeDetails && context.footerCache.isBlocking())
|| context.forceThreadpool) {
this.splitFuturesRef = splitFutures;
this.splitsRef = splits;
return Context.threadPool.submit(this);
} else {
runGetSplitsSync(splitFutures, splits, null);
return null;
}
}
@Override
public Void call() throws IOException {
if (ugi == null) {
runGetSplitsSync(splitFuturesRef, splitsRef, null);
return null;
}
try {
return ugi.doAs(new PrivilegedExceptionAction<Void>() {
@Override
public Void run() throws Exception {
runGetSplitsSync(splitFuturesRef, splitsRef, ugi);
return null;
}
});
} catch (InterruptedException e) {
throw new IOException(e);
}
}
private void runGetSplitsSync(List<Future<List<OrcSplit>>> splitFutures,
List<OrcSplit> splits, UserGroupInformation ugi) throws IOException {
UserGroupInformation tpUgi = ugi == null ? UserGroupInformation.getCurrentUser() : ugi;
List<SplitInfo> splitInfos = getSplits();
List<Future<List<OrcSplit>>> localListF = null;
List<OrcSplit> localListS = null;
for (SplitInfo splitInfo : splitInfos) {
SplitGenerator sg = new SplitGenerator(splitInfo, tpUgi, allowSyntheticFileIds);
if (!sg.isBlocking()) {
if (localListS == null) {
localListS = new ArrayList<>(splits.size());
}
// Already called in doAs, so no need to doAs here.
localListS.addAll(sg.call());
} else {
if (localListF == null) {
localListF = new ArrayList<>(splits.size());
}
localListF.add(Context.threadPool.submit(sg));
}
}
if (localListS != null) {
synchronized (splits) {
splits.addAll(localListS);
}
}
if (localListF != null) {
synchronized (splitFutures) {
splitFutures.addAll(localListF);
}
}
}
}
/**
* BI strategy is used when the requirement is to spend less time in split generation
* as opposed to query execution (split generation does not read or cache file footers).
*/
static final class BISplitStrategy extends ACIDSplitStrategy {
private final List<HdfsFileStatusWithId> fileStatuses;
private final boolean isOriginal;
private final List<DeltaMetaData> deltas;
private final FileSystem fs;
private final Path dir;
private final boolean allowSyntheticFileIds;
public BISplitStrategy(Context context, FileSystem fs,
Path dir, List<HdfsFileStatusWithId> fileStatuses, boolean isOriginal,
List<DeltaMetaData> deltas, boolean[] covered, boolean allowSyntheticFileIds) {
super(dir, context.numBuckets, deltas, covered, context.acidOperationalProperties);
this.fileStatuses = fileStatuses;
this.isOriginal = isOriginal;
this.deltas = deltas;
this.fs = fs;
this.dir = dir;
this.allowSyntheticFileIds = allowSyntheticFileIds;
}
@Override
public List<OrcSplit> getSplits() throws IOException {
List<OrcSplit> splits = Lists.newArrayList();
for (HdfsFileStatusWithId file : fileStatuses) {
FileStatus fileStatus = file.getFileStatus();
if (fileStatus.getLen() != 0) {
Object fileKey = file.getFileId();
if (fileKey == null && allowSyntheticFileIds) {
fileKey = new SyntheticFileId(fileStatus);
}
TreeMap<Long, BlockLocation> blockOffsets = SHIMS.getLocationsWithOffset(fs, fileStatus);
for (Map.Entry<Long, BlockLocation> entry : blockOffsets.entrySet()) {
OrcSplit orcSplit = new OrcSplit(fileStatus.getPath(), fileKey, entry.getKey(),
entry.getValue().getLength(), entry.getValue().getHosts(), null, isOriginal, true,
deltas, -1, fileStatus.getLen());
splits.add(orcSplit);
}
}
}
// add uncovered ACID delta splits
splits.addAll(super.getSplits());
return splits;
}
@Override
public String toString() {
return BISplitStrategy.class.getSimpleName() + " strategy for " + dir;
}
}
/**
* ACID split strategy is used when there is no base directory (when transactions are enabled).
*/
static class ACIDSplitStrategy implements SplitStrategy<OrcSplit> {
Path dir;
List<DeltaMetaData> deltas;
boolean[] covered;
int numBuckets;
AcidOperationalProperties acidOperationalProperties;
public ACIDSplitStrategy(Path dir, int numBuckets, List<DeltaMetaData> deltas, boolean[] covered,
AcidOperationalProperties acidOperationalProperties) {
this.dir = dir;
this.numBuckets = numBuckets;
this.deltas = deltas;
this.covered = covered;
this.acidOperationalProperties = acidOperationalProperties;
}
@Override
public List<OrcSplit> getSplits() throws IOException {
List<OrcSplit> splits = Lists.newArrayList();
      // When split-update is enabled, we do not need to account for buckets that aren't covered.
      // This is a huge performance benefit of split-update. The reason we are able to do so is
      // that the 'deltas' here are actually only the delete_deltas. All the insert_deltas with
      // valid user payload data have already been considered as base for the covered buckets.
      // Hence, the uncovered buckets do not have any relevant data and we can just ignore them.
if (acidOperationalProperties != null && acidOperationalProperties.isSplitUpdate()) {
return splits; // return an empty list.
}
// Generate a split for any buckets that weren't covered.
// This happens in the case where a bucket just has deltas and no
// base.
if (!deltas.isEmpty()) {
for (int b = 0; b < numBuckets; ++b) {
if (!covered[b]) {
splits.add(new OrcSplit(dir, null, b, 0, new String[0], null, false, false, deltas, -1, -1));
}
}
}
return splits;
}
@Override
public String toString() {
return ACIDSplitStrategy.class.getSimpleName() + " strategy for " + dir;
}
}
  /**
   * Given a directory, get the list of files and blocks in those files.
   * To parallelize the file generator, use
   * "mapreduce.input.fileinputformat.list-status.num-threads".
   */
static final class FileGenerator implements Callable<AcidDirInfo> {
private final Context context;
private final FileSystem fs;
private final Path dir;
private final Ref<Boolean> useFileIds;
private final UserGroupInformation ugi;
FileGenerator(Context context, FileSystem fs, Path dir, boolean useFileIds,
UserGroupInformation ugi) {
this(context, fs, dir, Ref.from(useFileIds), ugi);
}
FileGenerator(Context context, FileSystem fs, Path dir, Ref<Boolean> useFileIds,
UserGroupInformation ugi) {
this.context = context;
this.fs = fs;
this.dir = dir;
this.useFileIds = useFileIds;
this.ugi = ugi;
}
@Override
public AcidDirInfo call() throws IOException {
if (ugi == null) {
return callInternal();
}
try {
return ugi.doAs(new PrivilegedExceptionAction<AcidDirInfo>() {
@Override
public AcidDirInfo run() throws Exception {
return callInternal();
}
});
} catch (InterruptedException e) {
throw new IOException(e);
}
}
private AcidDirInfo callInternal() throws IOException {
AcidUtils.Directory dirInfo = AcidUtils.getAcidState(dir, context.conf,
context.transactionList, useFileIds, true);
Path base = dirInfo.getBaseDirectory();
// find the base files (original or new style)
List<AcidBaseFileInfo> baseFiles = new ArrayList<AcidBaseFileInfo>();
if (base == null) {
for (HdfsFileStatusWithId fileId : dirInfo.getOriginalFiles()) {
baseFiles.add(new AcidBaseFileInfo(fileId, AcidUtils.AcidBaseFileType.ORIGINAL_BASE));
}
} else {
List<HdfsFileStatusWithId> compactedBaseFiles = findBaseFiles(base, useFileIds);
for (HdfsFileStatusWithId fileId : compactedBaseFiles) {
baseFiles.add(new AcidBaseFileInfo(fileId, AcidUtils.AcidBaseFileType.COMPACTED_BASE));
}
}
      // Find the parsed deltas; some of them, containing only insert delta events, may get
      // treated as base if split-update is enabled for ACID. (See HIVE-14035 for details.)
List<ParsedDelta> parsedDeltas = new ArrayList<ParsedDelta>();
if (context.acidOperationalProperties != null &&
context.acidOperationalProperties.isSplitUpdate()) {
        // If we have split-update turned on for this table, then the delta events have already
        // been split into two directories: delta_x_y/ and delete_delta_x_y/.
        // With split-update, the insert events go to the delta_x_y/ directory and all the
        // delete events go to delete_delta_x_y/. An update event generates two events:
        // a delete event for the old record that is put into delete_delta_x_y/,
        // followed by an insert event for the updated record put into the usual delta_x_y/.
        // Therefore, everything inside delta_x_y/ is an insert event and all the files in
        // delta_x_y/ can be treated like base files; hence, each of them is added to baseFiles.
for (ParsedDelta parsedDelta : dirInfo.getCurrentDirectories()) {
if (parsedDelta.isDeleteDelta()) {
parsedDeltas.add(parsedDelta);
} else {
// This is a normal insert delta, which only has insert events and hence all the files
// in this delta directory can be considered as a base.
Boolean val = useFileIds.value;
if (val == null || val) {
try {
List<HdfsFileStatusWithId> insertDeltaFiles =
SHIMS.listLocatedHdfsStatus(fs, parsedDelta.getPath(), AcidUtils.hiddenFileFilter);
for (HdfsFileStatusWithId fileId : insertDeltaFiles) {
baseFiles.add(new AcidBaseFileInfo(fileId, AcidUtils.AcidBaseFileType.INSERT_DELTA));
}
if (val == null) {
useFileIds.value = true; // The call succeeded, so presumably the API is there.
}
continue; // move on to process to the next parsedDelta.
} catch (Throwable t) {
LOG.error("Failed to get files with ID; using regular API: " + t.getMessage());
if (val == null && t instanceof UnsupportedOperationException) {
useFileIds.value = false;
}
}
}
// Fall back to regular API and create statuses without ID.
List<FileStatus> children = HdfsUtils.listLocatedStatus(fs, parsedDelta.getPath(), AcidUtils.hiddenFileFilter);
for (FileStatus child : children) {
HdfsFileStatusWithId fileId = AcidUtils.createOriginalObj(null, child);
baseFiles.add(new AcidBaseFileInfo(fileId, AcidUtils.AcidBaseFileType.INSERT_DELTA));
}
}
}
} else {
// When split-update is not enabled, then all the deltas in the current directories
// should be considered as usual.
parsedDeltas.addAll(dirInfo.getCurrentDirectories());
}
return new AcidDirInfo(fs, dir, dirInfo, baseFiles, parsedDeltas);
}
private List<HdfsFileStatusWithId> findBaseFiles(
Path base, Ref<Boolean> useFileIds) throws IOException {
Boolean val = useFileIds.value;
if (val == null || val) {
try {
List<HdfsFileStatusWithId> result = SHIMS.listLocatedHdfsStatus(
fs, base, AcidUtils.hiddenFileFilter);
if (val == null) {
useFileIds.value = true; // The call succeeded, so presumably the API is there.
}
return result;
} catch (Throwable t) {
LOG.error("Failed to get files with ID; using regular API: " + t.getMessage());
if (val == null && t instanceof UnsupportedOperationException) {
useFileIds.value = false;
}
}
}
      // Fall back to regular API and create statuses without ID.
List<FileStatus> children = HdfsUtils.listLocatedStatus(fs, base, AcidUtils.hiddenFileFilter);
List<HdfsFileStatusWithId> result = new ArrayList<>(children.size());
for (FileStatus child : children) {
result.add(AcidUtils.createOriginalObj(null, child));
}
return result;
}
}
/**
* Split the stripes of a given file into input splits.
* A thread is used for each file.
*/
static final class SplitGenerator implements Callable<List<OrcSplit>> {
private final Context context;
private final FileSystem fs;
private final FileStatus file;
private final Long fsFileId;
private final long blockSize;
private final TreeMap<Long, BlockLocation> locations;
private OrcTail orcTail;
private List<OrcProto.Type> readerTypes;
private List<StripeInformation> stripes;
private List<StripeStatistics> stripeStats;
private List<OrcProto.Type> fileTypes;
private boolean[] readerIncluded; // The included columns of the reader / file schema that
// include ACID columns if present.
private final boolean isOriginal;
private final List<DeltaMetaData> deltas;
private final boolean hasBase;
private OrcFile.WriterVersion writerVersion;
private long projColsUncompressedSize;
private final List<OrcSplit> deltaSplits;
private final ByteBuffer ppdResult;
private final UserGroupInformation ugi;
private final boolean allowSyntheticFileIds;
private SchemaEvolution evolution;
public SplitGenerator(SplitInfo splitInfo, UserGroupInformation ugi,
boolean allowSyntheticFileIds) throws IOException {
this.ugi = ugi;
this.context = splitInfo.context;
this.fs = splitInfo.fs;
this.file = splitInfo.fileWithId.getFileStatus();
this.fsFileId = splitInfo.fileWithId.getFileId();
this.blockSize = this.file.getBlockSize();
this.orcTail = splitInfo.orcTail;
this.readerTypes = splitInfo.readerTypes;
// TODO: potential DFS call
this.locations = SHIMS.getLocationsWithOffset(fs, file);
this.isOriginal = splitInfo.isOriginal;
this.deltas = splitInfo.deltas;
this.hasBase = splitInfo.hasBase;
this.projColsUncompressedSize = -1;
this.deltaSplits = splitInfo.getSplits();
this.allowSyntheticFileIds = allowSyntheticFileIds;
this.ppdResult = splitInfo.ppdResult;
}
public boolean isBlocking() {
return true;
}
Path getPath() {
return file.getPath();
}
@Override
public String toString() {
return "splitter(" + file.getPath() + ")";
}
/**
* Compute the number of bytes that overlap between the two ranges.
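     * <p>
     * For example, getOverlap(100, 300, 300, 400) returns 100: the ranges
     * [100, 400) and [300, 700) share the bytes [300, 400). Disjoint ranges
     * return 0.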
* @param offset1 start of range1
* @param length1 length of range1
* @param offset2 start of range2
* @param length2 length of range2
* @return the number of bytes in the overlap range
*/
static long getOverlap(long offset1, long length1,
long offset2, long length2) {
long end1 = offset1 + length1;
long end2 = offset2 + length2;
if (end2 <= offset1 || end1 <= offset2) {
return 0;
} else {
return Math.min(end1, end2) - Math.max(offset1, offset2);
}
}
/**
* Create an input split over the given range of bytes. The location of the
     * split is based on where the majority of the bytes are coming from. ORC
* files are unlikely to have splits that cross between blocks because they
* are written with large block sizes.
* @param offset the start of the split
* @param length the length of the split
* @param orcTail orc tail
* @throws IOException
*/
OrcSplit createSplit(long offset, long length, OrcTail orcTail) throws IOException {
String[] hosts;
Map.Entry<Long, BlockLocation> startEntry = locations.floorEntry(offset);
BlockLocation start = startEntry.getValue();
if (offset + length <= start.getOffset() + start.getLength()) {
// handle the single block case
hosts = start.getHosts();
} else {
Map.Entry<Long, BlockLocation> endEntry = locations.floorEntry(offset + length);
        // get the submap
NavigableMap<Long, BlockLocation> navigableMap = locations.subMap(startEntry.getKey(),
true, endEntry.getKey(), true);
// Calculate the number of bytes in the split that are local to each
// host.
Map<String, LongWritable> sizes = new HashMap<String, LongWritable>();
long maxSize = 0;
for (BlockLocation block : navigableMap.values()) {
long overlap = getOverlap(offset, length, block.getOffset(),
block.getLength());
if (overlap > 0) {
for(String host: block.getHosts()) {
LongWritable val = sizes.get(host);
if (val == null) {
val = new LongWritable();
sizes.put(host, val);
}
val.set(val.get() + overlap);
maxSize = Math.max(maxSize, val.get());
}
} else {
throw new IOException("File " + file.getPath().toString() +
" should have had overlap on block starting at " + block.getOffset());
}
}
// filter the list of locations to those that have at least 80% of the
// max
long threshold = (long) (maxSize * MIN_INCLUDED_LOCATION);
List<String> hostList = new ArrayList<String>();
// build the locations in a predictable order to simplify testing
for(BlockLocation block: navigableMap.values()) {
for(String host: block.getHosts()) {
if (sizes.containsKey(host)) {
if (sizes.get(host).get() >= threshold) {
hostList.add(host);
}
sizes.remove(host);
}
}
}
hosts = new String[hostList.size()];
hostList.toArray(hosts);
}
      // scale the raw data size to the split level based on the ratio of split length to file length
final long fileLen = file.getLen();
final double splitRatio = (double) length / (double) fileLen;
final long scaledProjSize = projColsUncompressedSize > 0 ?
(long) (splitRatio * projColsUncompressedSize) : fileLen;
Object fileKey = fsFileId;
if (fileKey == null && allowSyntheticFileIds) {
fileKey = new SyntheticFileId(file);
}
return new OrcSplit(file.getPath(), fileKey, offset, length, hosts,
orcTail, isOriginal, hasBase, deltas, scaledProjSize, fileLen);
}
private static final class OffsetAndLength { // Java cruft; pair of long.
public OffsetAndLength() {
this.offset = -1;
this.length = 0;
}
long offset, length;
@Override
public String toString() {
return "[offset=" + offset + ", length=" + length + "]";
}
}
/**
* Divide the adjacent stripes in the file into input splits based on the
* block size and the configured minimum and maximum sizes.
*/
@Override
public List<OrcSplit> call() throws IOException {
if (ugi == null) {
return callInternal();
}
try {
return ugi.doAs(new PrivilegedExceptionAction<List<OrcSplit>>() {
@Override
public List<OrcSplit> run() throws Exception {
return callInternal();
}
});
} catch (InterruptedException e) {
throw new IOException(e);
}
}
private List<OrcSplit> callInternal() throws IOException {
// Figure out which stripes we need to read.
if (ppdResult != null) {
assert deltaSplits.isEmpty();
assert ppdResult.hasArray();
// TODO: when PB is upgraded to 2.6, newInstance(ByteBuffer) method should be used here.
CodedInputStream cis = CodedInputStream.newInstance(
ppdResult.array(), ppdResult.arrayOffset(), ppdResult.remaining());
cis.setSizeLimit(InStream.PROTOBUF_MESSAGE_MAX_LIMIT);
return generateSplitsFromPpd(SplitInfos.parseFrom(cis));
} else {
populateAndCacheStripeDetails();
boolean[] includeStripe = null;
// We can't eliminate stripes if there are deltas because the
// deltas may change the rows making them match the predicate.
if ((deltas == null || deltas.isEmpty()) && context.sarg != null) {
String[] colNames =
extractNeededColNames((readerTypes == null ? fileTypes : readerTypes),
context.conf, readerIncluded, isOriginal);
if (colNames == null) {
LOG.warn("Skipping split elimination for {} as column names is null", file.getPath());
} else {
includeStripe = pickStripes(context.sarg, writerVersion,
stripeStats, stripes.size(), file.getPath(), evolution);
}
}
return generateSplitsFromStripes(includeStripe);
}
}
private List<OrcSplit> generateSplitsFromPpd(SplitInfos ppdResult) throws IOException {
OffsetAndLength current = new OffsetAndLength();
List<OrcSplit> splits = new ArrayList<>(ppdResult.getInfosCount());
int lastIdx = -1;
for (Metastore.SplitInfo si : ppdResult.getInfosList()) {
int index = si.getIndex();
if (lastIdx >= 0 && lastIdx + 1 != index && current.offset != -1) {
// Create split for the previous unfinished stripe.
splits.add(createSplit(current.offset, current.length, orcTail));
current.offset = -1;
}
lastIdx = index;
String debugStr = null;
if (LOG.isDebugEnabled()) {
debugStr = current.toString();
}
current = generateOrUpdateSplit(splits, current, si.getOffset(), si.getLength(), null);
if (LOG.isDebugEnabled()) {
LOG.debug("Updated split from {" + index + ": " + si.getOffset() + ", "
+ si.getLength() + "} and "+ debugStr + " to " + current);
}
}
generateLastSplit(splits, current, null);
return splits;
}
private List<OrcSplit> generateSplitsFromStripes(boolean[] includeStripe) throws IOException {
List<OrcSplit> splits = new ArrayList<>(stripes.size());
// if we didn't have predicate pushdown, read everything
if (includeStripe == null) {
includeStripe = new boolean[stripes.size()];
Arrays.fill(includeStripe, true);
}
OffsetAndLength current = new OffsetAndLength();
int idx = -1;
for (StripeInformation stripe : stripes) {
idx++;
if (!includeStripe[idx]) {
// create split for the previous unfinished stripe
if (current.offset != -1) {
splits.add(createSplit(current.offset, current.length, orcTail));
current.offset = -1;
}
continue;
}
current = generateOrUpdateSplit(
splits, current, stripe.getOffset(), stripe.getLength(), orcTail);
}
generateLastSplit(splits, current, orcTail);
// Add uncovered ACID delta splits.
splits.addAll(deltaSplits);
return splits;
}
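    // A worked example with illustrative numbers (assuming the defaults minSize == 16MB,
    // maxSize == 256MB, and an HDFS blockSize of 256MB): adjacent ~64MB stripes accumulate
    // into 'current'; a split is cut early when more than 16MB has accrued and the incoming
    // stripe starts in a different block than the split's first byte, and unconditionally
    // once the running length reaches 256MB.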
private OffsetAndLength generateOrUpdateSplit(
List<OrcSplit> splits, OffsetAndLength current, long offset,
long length, OrcTail orcTail) throws IOException {
// if we are working on a stripe, over the min stripe size, and
// crossed a block boundary, cut the input split here.
if (current.offset != -1 && current.length > context.minSize &&
(current.offset / blockSize != offset / blockSize)) {
splits.add(createSplit(current.offset, current.length, orcTail));
current.offset = -1;
}
// if we aren't building a split, start a new one.
if (current.offset == -1) {
current.offset = offset;
current.length = length;
} else {
current.length = (offset + length) - current.offset;
}
if (current.length >= context.maxSize) {
splits.add(createSplit(current.offset, current.length, orcTail));
current.offset = -1;
}
return current;
}
private void generateLastSplit(List<OrcSplit> splits, OffsetAndLength current,
OrcTail orcTail) throws IOException {
if (current.offset == -1) return;
splits.add(createSplit(current.offset, current.length, orcTail));
}
private void populateAndCacheStripeDetails() throws IOException {
      // When reading the file for the first time, we get the orc tail from the orc reader and
      // cache it in the footer cache. Subsequent requests will get the orc tail from the cache
      // (if the file length and modification time have not changed) and populate the split info.
      // If the split info object contains the orc tail from the cache, then we can skip creating
      // the orc reader, avoiding filesystem calls.
if (orcTail == null) {
Reader orcReader = OrcFile.createReader(file.getPath(),
OrcFile.readerOptions(context.conf)
.filesystem(fs)
.maxLength(file.getLen()));
orcTail = new OrcTail(orcReader.getFileTail(), orcReader.getSerializedFileFooter(),
file.getModificationTime());
if (context.cacheStripeDetails) {
context.footerCache.put(new FooterCacheKey(fsFileId, file.getPath()), orcTail);
}
}
stripes = orcTail.getStripes();
stripeStats = orcTail.getStripeStatistics();
fileTypes = orcTail.getTypes();
TypeDescription fileSchema = OrcUtils.convertTypeFromProtobuf(fileTypes, 0);
Reader.Options readerOptions = new Reader.Options(context.conf);
if (readerTypes == null) {
readerIncluded = genIncludedColumns(fileSchema, context.conf);
evolution = new SchemaEvolution(fileSchema, null, readerOptions.include(readerIncluded));
} else {
// The reader schema always comes in without ACID columns.
TypeDescription readerSchema = OrcUtils.convertTypeFromProtobuf(readerTypes, 0);
readerIncluded = genIncludedColumns(readerSchema, context.conf);
evolution = new SchemaEvolution(fileSchema, readerSchema, readerOptions.include(readerIncluded));
if (!isOriginal) {
// The SchemaEvolution class has added the ACID metadata columns. Let's update our
// readerTypes so PPD code will work correctly.
readerTypes = OrcUtils.getOrcTypes(evolution.getReaderSchema());
}
}
writerVersion = orcTail.getWriterVersion();
List<OrcProto.ColumnStatistics> fileColStats = orcTail.getFooter().getStatisticsList();
boolean[] fileIncluded;
if (readerTypes == null) {
fileIncluded = readerIncluded;
} else {
fileIncluded = new boolean[fileTypes.size()];
final int readerSchemaSize = readerTypes.size();
for (int i = 0; i < readerSchemaSize; i++) {
TypeDescription fileType = evolution.getFileType(i);
if (fileType != null) {
fileIncluded[fileType.getId()] = true;
}
}
}
projColsUncompressedSize = computeProjectionSize(fileTypes, fileColStats, fileIncluded);
if (!context.footerInSplits) {
orcTail = null;
}
}
private long computeProjectionSize(List<OrcProto.Type> fileTypes,
List<OrcProto.ColumnStatistics> stats, boolean[] fileIncluded) {
List<Integer> internalColIds = Lists.newArrayList();
if (fileIncluded == null) {
// Add all.
for (int i = 0; i < fileTypes.size(); i++) {
internalColIds.add(i);
}
} else {
for (int i = 0; i < fileIncluded.length; i++) {
if (fileIncluded[i]) {
internalColIds.add(i);
}
}
}
return ReaderImpl.getRawDataSizeFromColIndices(internalColIds, fileTypes, stats);
}
private boolean[] shiftReaderIncludedForAcid(boolean[] included) {
// We always need the base row
included[0] = true;
boolean[] newIncluded = new boolean[included.length + OrcRecordUpdater.FIELDS];
Arrays.fill(newIncluded, 0, OrcRecordUpdater.FIELDS, true);
for(int i= 0; i < included.length; ++i) {
newIncluded[i + OrcRecordUpdater.FIELDS] = included[i];
}
return newIncluded;
}
}
/** Class intended to update two values from methods... Java-related cruft. */
@VisibleForTesting
static final class CombinedCtx {
ETLSplitStrategy combined;
long combineStartUs;
}
static List<OrcSplit> generateSplitsInfo(Configuration conf, Context context)
throws IOException {
if (LOG.isInfoEnabled()) {
LOG.info("ORC pushdown predicate: " + context.sarg);
}
boolean useFileIdsConfig = HiveConf.getBoolVar(
conf, ConfVars.HIVE_ORC_INCLUDE_FILE_ID_IN_SPLITS);
    // Sharing this state assumes all splits will succeed or fail to get file IDs together
    // (same FS). We also start with null and only set it to true on the first call, so we
    // would only do the global-disable thing on the first failure with the API error, not on
    // any random failure.
Ref<Boolean> useFileIds = Ref.from(useFileIdsConfig ? null : false);
boolean allowSyntheticFileIds = useFileIdsConfig && HiveConf.getBoolVar(
conf, ConfVars.HIVE_ORC_ALLOW_SYNTHETIC_FILE_ID_IN_SPLITS);
List<OrcSplit> splits = Lists.newArrayList();
List<Future<AcidDirInfo>> pathFutures = Lists.newArrayList();
List<Future<Void>> strategyFutures = Lists.newArrayList();
final List<Future<List<OrcSplit>>> splitFutures = Lists.newArrayList();
UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
// multi-threaded file statuses and split strategy
Path[] paths = getInputPaths(conf);
CompletionService<AcidDirInfo> ecs = new ExecutorCompletionService<>(Context.threadPool);
for (Path dir : paths) {
FileSystem fs = dir.getFileSystem(conf);
FileGenerator fileGenerator = new FileGenerator(context, fs, dir, useFileIds, ugi);
pathFutures.add(ecs.submit(fileGenerator));
}
boolean isTransactionalTableScan =
HiveConf.getBoolVar(conf, ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN);
boolean isSchemaEvolution = HiveConf.getBoolVar(conf, ConfVars.HIVE_SCHEMA_EVOLUTION);
TypeDescription readerSchema =
OrcInputFormat.getDesiredRowTypeDescr(conf, isTransactionalTableScan, Integer.MAX_VALUE);
List<OrcProto.Type> readerTypes = null;
if (readerSchema != null) {
readerTypes = OrcUtils.getOrcTypes(readerSchema);
}
if (LOG.isDebugEnabled()) {
LOG.debug("Generate splits schema evolution property " + isSchemaEvolution +
" reader schema " + (readerSchema == null ? "NULL" : readerSchema.toString()) +
" transactional scan property " + isTransactionalTableScan);
}
// complete path futures and schedule split generation
try {
CombinedCtx combinedCtx = (context.splitStrategyBatchMs > 0) ? new CombinedCtx() : null;
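// Note: despite the "Us" suffix in these names, the values are nanoseconds; they are
// derived from and compared against System.nanoTime().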
long maxWaitUs = context.splitStrategyBatchMs * 1000000;
int resultsLeft = paths.length;
while (resultsLeft > 0) {
AcidDirInfo adi = null;
if (combinedCtx != null && combinedCtx.combined != null) {
long waitTimeUs = combinedCtx.combineStartUs + maxWaitUs - System.nanoTime();
if (waitTimeUs >= 0) {
Future<AcidDirInfo> f = ecs.poll(waitTimeUs, TimeUnit.NANOSECONDS);
adi = (f == null) ? null : f.get();
}
} else {
adi = ecs.take().get();
}
if (adi == null) {
// We were combining SS-es and the time has expired.
assert combinedCtx.combined != null;
scheduleSplits(combinedCtx.combined, context, splitFutures, strategyFutures, splits);
combinedCtx.combined = null;
continue;
}
// We have received a new directory information, make split strategies.
--resultsLeft;
// We can get a list of split strategies here because, in the ACID split-update case
// with a mix of original base files and insert deltas, we produce two independent
// split strategies for them. The 'isOriginal' flag is set per split strategy and has
// to be the same for all the files in that strategy.
List<SplitStrategy<?>> splitStrategies = determineSplitStrategies(combinedCtx, context, adi.fs,
adi.splitPath, adi.acidInfo, adi.baseFiles, adi.parsedDeltas, readerTypes, ugi,
allowSyntheticFileIds);
for (SplitStrategy<?> splitStrategy : splitStrategies) {
if (isDebugEnabled) {
LOG.debug("Split strategy: {}", splitStrategy);
}
// Hack note - different split strategies return differently typed lists, yay Java.
// This works purely by magic, because we know which strategy produces which type.
if (splitStrategy instanceof ETLSplitStrategy) {
scheduleSplits((ETLSplitStrategy)splitStrategy,
context, splitFutures, strategyFutures, splits);
} else {
@SuppressWarnings("unchecked")
List<OrcSplit> readySplits = (List<OrcSplit>)splitStrategy.getSplits();
splits.addAll(readySplits);
}
}
}
// Run the last combined strategy, if any.
if (combinedCtx != null && combinedCtx.combined != null) {
scheduleSplits(combinedCtx.combined, context, splitFutures, strategyFutures, splits);
combinedCtx.combined = null;
}
// complete split futures
for (Future<Void> ssFuture : strategyFutures) {
ssFuture.get(); // Make sure we get exceptions strategies might have thrown.
}
// All the split strategies are done, so it must be safe to access splitFutures.
for (Future<List<OrcSplit>> splitFuture : splitFutures) {
splits.addAll(splitFuture.get());
}
} catch (Exception e) {
cancelFutures(pathFutures);
cancelFutures(strategyFutures);
cancelFutures(splitFutures);
throw new RuntimeException("ORC split generation failed with exception: " + e.getMessage(), e);
}
if (context.cacheStripeDetails) {
LOG.info("FooterCacheHitRatio: " + context.cacheHitCounter.get() + "/"
+ context.numFilesCounter.get());
}
if (isDebugEnabled) {
for (OrcSplit split : splits) {
LOG.debug(split + " projected_columns_uncompressed_size: "
+ split.getColumnarProjectionSize());
}
}
return splits;
}
@VisibleForTesting
// We could have this as a protected method w/no class, but half of Hive is static, so there.
public static class ContextFactory {
public Context create(Configuration conf, int numSplits) throws IOException {
return new Context(conf, numSplits);
}
}
private static void scheduleSplits(ETLSplitStrategy splitStrategy, Context context,
List<Future<List<OrcSplit>>> splitFutures, List<Future<Void>> strategyFutures,
List<OrcSplit> splits) throws IOException {
Future<Void> ssFuture = splitStrategy.generateSplitWork(context, splitFutures, splits);
if (ssFuture == null) return;
strategyFutures.add(ssFuture);
}
private static <T> void cancelFutures(List<Future<T>> futures) {
for (Future<T> future : futures) {
future.cancel(true);
}
}
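/**
* Either starts a new ETL split strategy for this directory or combines it into the
* pending one. Directories with deltas are never combined. The possible outcomes of
* combineWith are:
* YES - the directory was absorbed into the pending strategy (nothing to schedule yet);
* NO_AND_CONTINUE - not combinable; a fresh strategy is returned and the pending one kept;
* NO_AND_SWAP - the pending strategy is returned for scheduling and this directory
* becomes the new pending strategy.
* A null return value means no strategy is ready to be scheduled yet.
*/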
private static SplitStrategy<?> combineOrCreateETLStrategy(CombinedCtx combinedCtx,
Context context, FileSystem fs, Path dir, List<HdfsFileStatusWithId> files,
List<DeltaMetaData> deltas, boolean[] covered, List<OrcProto.Type> readerTypes,
boolean isOriginal, UserGroupInformation ugi, boolean allowSyntheticFileIds) {
if (!deltas.isEmpty() || combinedCtx == null) {
return new ETLSplitStrategy(
context, fs, dir, files, readerTypes, isOriginal, deltas, covered, ugi,
allowSyntheticFileIds);
} else if (combinedCtx.combined == null) {
combinedCtx.combined = new ETLSplitStrategy(
context, fs, dir, files, readerTypes, isOriginal, deltas, covered, ugi,
allowSyntheticFileIds);
combinedCtx.combineStartUs = System.nanoTime();
return null;
} else {
ETLSplitStrategy.CombineResult r =
combinedCtx.combined.combineWith(fs, dir, files, isOriginal);
switch (r) {
case YES: return null;
case NO_AND_CONTINUE:
return new ETLSplitStrategy(
context, fs, dir, files, readerTypes, isOriginal, deltas, covered, ugi,
allowSyntheticFileIds);
case NO_AND_SWAP: {
ETLSplitStrategy oldBase = combinedCtx.combined;
combinedCtx.combined = new ETLSplitStrategy(
context, fs, dir, files, readerTypes, isOriginal, deltas, covered, ugi,
allowSyntheticFileIds);
combinedCtx.combineStartUs = System.nanoTime();
return oldBase;
}
default: throw new AssertionError("Unknown result " + r);
}
}
}
@Override
public InputSplit[] getSplits(JobConf job,
int numSplits) throws IOException {
if (isDebugEnabled) {
LOG.debug("getSplits started");
}
Configuration conf = job;
if (HiveConf.getBoolVar(job, HiveConf.ConfVars.HIVE_ORC_MS_FOOTER_CACHE_ENABLED)) {
// Create HiveConf once, since this is expensive.
conf = new HiveConf(conf, OrcInputFormat.class);
}
List<OrcSplit> result = generateSplitsInfo(conf,
new Context(conf, numSplits, createExternalCaches()));
if (isDebugEnabled) {
LOG.debug("getSplits finished");
}
return result.toArray(new InputSplit[result.size()]);
}
@SuppressWarnings("unchecked")
private org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct>
createVectorizedReader(InputSplit split, JobConf conf, Reporter reporter
) throws IOException {
return (org.apache.hadoop.mapred.RecordReader)
new VectorizedOrcInputFormat().getRecordReader(split, conf, reporter);
}
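// getRecordReader below picks one of several readers:
// - non-ACID, vectorized: VectorizedOrcInputFormat
// - non-ACID, row mode: OrcRecordReader
// - ACID, vectorized: VectorizedOrcAcidRowBatchReader when the split qualifies,
// otherwise VectorizedOrcAcidRowReader (row-by-row stitching)
// - ACID, row mode: NullKeyRecordReader over an OrcRawRecordMerger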
@Override
public org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct>
getRecordReader(InputSplit inputSplit, JobConf conf,
Reporter reporter) throws IOException {
boolean vectorMode = Utilities.getUseVectorizedInputFileFormat(conf);
boolean isAcidRead = isAcidRead(conf, inputSplit);
if (!isAcidRead) {
if (vectorMode) {
return createVectorizedReader(inputSplit, conf, reporter);
} else {
OrcFile.ReaderOptions readerOptions = OrcFile.readerOptions(conf);
if (inputSplit instanceof OrcSplit) {
OrcSplit split = (OrcSplit) inputSplit;
readerOptions.maxLength(split.getFileLength()).orcTail(split.getOrcTail());
}
return new OrcRecordReader(OrcFile.createReader(
((FileSplit) inputSplit).getPath(),
readerOptions),
conf, (FileSplit) inputSplit);
}
}
reporter.setStatus(inputSplit.toString());
boolean isFastVectorizedReaderAvailable =
VectorizedOrcAcidRowBatchReader.canCreateVectorizedAcidRowBatchReaderOnSplit(conf, inputSplit);
if (vectorMode && isFastVectorizedReaderAvailable) {
// Faster vectorized ACID row batch reader is available that avoids row-by-row stitching.
return (org.apache.hadoop.mapred.RecordReader)
new VectorizedOrcAcidRowBatchReader(inputSplit, conf, reporter);
}
Options options = new Options(conf).reporter(reporter);
final RowReader<OrcStruct> inner = getReader(inputSplit, options);
if (vectorMode && !isFastVectorizedReaderAvailable) {
// Vectorized regular ACID reader that does row-by-row stitching.
return (org.apache.hadoop.mapred.RecordReader)
new VectorizedOrcAcidRowReader(inner, conf,
Utilities.getMapWork(conf).getVectorizedRowBatchCtx(), (FileSplit) inputSplit);
} else {
// Non-vectorized regular ACID reader.
return new NullKeyRecordReader(inner, conf);
}
}
/**
* Return a RecordReader that is compatible with the Hive 0.12 reader
* with NullWritable for the key instead of RecordIdentifier.
*/
public static final class NullKeyRecordReader implements AcidRecordReader<NullWritable, OrcStruct> {
private final RecordIdentifier id;
private final RowReader<OrcStruct> inner;
@Override
public RecordIdentifier getRecordIdentifier() {
return id;
}
private NullKeyRecordReader(RowReader<OrcStruct> inner, Configuration conf) {
this.inner = inner;
id = inner.createKey();
}
@Override
public boolean next(NullWritable nullWritable,
OrcStruct orcStruct) throws IOException {
return inner.next(id, orcStruct);
}
@Override
public NullWritable createKey() {
return NullWritable.get();
}
@Override
public OrcStruct createValue() {
return inner.createValue();
}
@Override
public long getPos() throws IOException {
return inner.getPos();
}
@Override
public void close() throws IOException {
inner.close();
}
@Override
public float getProgress() throws IOException {
return inner.getProgress();
}
}
@Override
public RowReader<OrcStruct> getReader(InputSplit inputSplit,
Options options)
throws IOException {
final OrcSplit split = (OrcSplit) inputSplit;
final Path path = split.getPath();
Path root;
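// Derive the ACID directory root from the split path. Illustration of the layouts
// this expects (the names below are examples only):
// original file: <root>/000000_0 -> root is the parent
// base or delta: <root>/base_0000005/bucket_00000 -> root is the grandparent
// no base: the split path itself is the root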
if (split.hasBase()) {
if (split.isOriginal()) {
root = path.getParent();
} else {
root = path.getParent().getParent();
}
} else {
root = path;
}
// Retrieve the acidOperationalProperties for the table, initialized in HiveInputFormat.
AcidUtils.AcidOperationalProperties acidOperationalProperties
= AcidUtils.getAcidOperationalProperties(options.getConfiguration());
// The deltas are decided based on whether split-update has been turned on for the table.
// When split-update is off, everything in a delta_x_y/ directory is treated as delta.
// However, when split-update is on, only the files in delete_delta_x_y/ directories
// need to be treated as deltas, because the files in delta_x_y/ contain only insert
// events and are processed as base files.
final Path[] deltas =
acidOperationalProperties.isSplitUpdate() ?
AcidUtils.deserializeDeleteDeltas(root, split.getDeltas())
: AcidUtils.deserializeDeltas(root, split.getDeltas());
final Configuration conf = options.getConfiguration();
final Reader reader = OrcInputFormat.createOrcReaderForSplit(conf, split);
final int bucket = OrcInputFormat.getBucketForSplit(conf, split);
final Reader.Options readOptions = OrcInputFormat.createOptionsForReader(conf);
readOptions.range(split.getStart(), split.getLength());
String txnString = conf.get(ValidTxnList.VALID_TXNS_KEY);
ValidTxnList validTxnList = txnString == null ? new ValidReadTxnList() :
new ValidReadTxnList(txnString);
final OrcRawRecordMerger records =
new OrcRawRecordMerger(conf, true, reader, split.isOriginal(), bucket,
validTxnList, readOptions, deltas);
return new RowReader<OrcStruct>() {
OrcStruct innerRecord = records.createValue();
@Override
public ObjectInspector getObjectInspector() {
return OrcStruct.createObjectInspector(0, OrcUtils.getOrcTypes(readOptions.getSchema()));
}
@Override
public boolean next(RecordIdentifier recordIdentifier,
OrcStruct orcStruct) throws IOException {
boolean result;
// filter out the deleted records
do {
result = records.next(recordIdentifier, innerRecord);
} while (result &&
OrcRecordUpdater.getOperation(innerRecord) ==
OrcRecordUpdater.DELETE_OPERATION);
if (result) {
// swap the fields with the passed in orcStruct
orcStruct.linkFields(OrcRecordUpdater.getRow(innerRecord));
}
return result;
}
@Override
public RecordIdentifier createKey() {
return records.createKey();
}
@Override
public OrcStruct createValue() {
return new OrcStruct(records.getColumns());
}
@Override
public long getPos() throws IOException {
return records.getPos();
}
@Override
public void close() throws IOException {
records.close();
}
@Override
public float getProgress() throws IOException {
return records.getProgress();
}
};
}
static Path findOriginalBucket(FileSystem fs,
Path directory,
int bucket) throws IOException {
for (FileStatus stat: fs.listStatus(directory)) {
String name = stat.getPath().getName();
int underscore = name.indexOf('_');
if (underscore < 0) {
continue; // not a bucket file of the form <number>_<copy>, e.g. 000000_0
}
String numberPart = name.substring(0, underscore);
if (org.apache.commons.lang3.StringUtils.isNumeric(numberPart) &&
Integer.parseInt(numberPart) == bucket) {
return stat.getPath();
}
}
throw new IllegalArgumentException("Can't find bucket " + bucket + " in " +
directory);
}
static Reader.Options createOptionsForReader(Configuration conf) {
// Do we have schema-on-read in the configuration variables?
TypeDescription schema =
OrcInputFormat.getDesiredRowTypeDescr(conf, true, Integer.MAX_VALUE);
Reader.Options readerOptions = new Reader.Options().schema(schema);
// TODO: Convert genIncludedColumns and setSearchArgument to use TypeDescription.
final List<OrcProto.Type> schemaTypes = OrcUtils.getOrcTypes(schema);
readerOptions.include(OrcInputFormat.genIncludedColumns(schema, conf));
OrcInputFormat.setSearchArgument(readerOptions, schemaTypes, conf, true);
return readerOptions;
}
static Reader createOrcReaderForSplit(Configuration conf, OrcSplit orcSplit) throws IOException {
Path path = orcSplit.getPath();
Reader reader;
if (orcSplit.hasBase()) {
OrcFile.ReaderOptions readerOptions = OrcFile.readerOptions(conf);
readerOptions.maxLength(orcSplit.getFileLength());
if (orcSplit.hasFooter()) {
readerOptions.orcTail(orcSplit.getOrcTail());
}
reader = OrcFile.createReader(path, readerOptions);
} else {
reader = null;
}
return reader;
}
static int getBucketForSplit(Configuration conf, OrcSplit orcSplit) {
if (orcSplit.hasBase()) {
return AcidUtils.parseBaseOrDeltaBucketFilename(orcSplit.getPath(), conf).getBucket();
} else {
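// Delta-only splits have no bucket file to parse; the bucket id is encoded in the
// split's start offset instead (see ACIDSplitStrategy).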
return (int) orcSplit.getStart();
}
}
public static boolean[] pickStripesViaTranslatedSarg(SearchArgument sarg,
OrcFile.WriterVersion writerVersion, List<OrcProto.Type> types,
List<StripeStatistics> stripeStats, int stripeCount) {
LOG.info("Translated ORC pushdown predicate: " + sarg);
assert sarg != null;
if (stripeStats == null || writerVersion == OrcFile.WriterVersion.ORIGINAL) {
return null; // only do split pruning if HIVE-8732 has been fixed in the writer
}
// eliminate stripes that don't satisfy the predicate condition
List<PredicateLeaf> sargLeaves = sarg.getLeaves();
int[] filterColumns = RecordReaderImpl.mapTranslatedSargColumns(types, sargLeaves);
TypeDescription schema = OrcUtils.convertTypeFromProtobuf(types, 0);
SchemaEvolution evolution = new SchemaEvolution(schema, null);
return pickStripesInternal(sarg, filterColumns, stripeStats, stripeCount, null, evolution);
}
private static boolean[] pickStripes(SearchArgument sarg,
OrcFile.WriterVersion writerVersion,
List<StripeStatistics> stripeStats,
int stripeCount, Path filePath, final SchemaEvolution evolution) {
if (sarg == null || stripeStats == null || writerVersion == OrcFile.WriterVersion.ORIGINAL) {
return null; // only do split pruning if HIVE-8732 has been fixed in the writer
}
// eliminate stripes that don't satisfy the predicate condition
List<PredicateLeaf> sargLeaves = sarg.getLeaves();
int[] filterColumns = RecordReaderImpl.mapSargColumnsToOrcInternalColIdx(sargLeaves,
evolution);
return pickStripesInternal(sarg, filterColumns, stripeStats, stripeCount, filePath, evolution);
}
private static boolean[] pickStripesInternal(SearchArgument sarg, int[] filterColumns,
List<StripeStatistics> stripeStats, int stripeCount, Path filePath,
final SchemaEvolution evolution) {
boolean[] includeStripe = new boolean[stripeCount];
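// Stripes beyond the available statistics are conservatively included.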
for (int i = 0; i < includeStripe.length; ++i) {
includeStripe[i] = (i >= stripeStats.size()) ||
isStripeSatisfyPredicate(stripeStats.get(i), sarg, filterColumns, evolution);
if (isDebugEnabled && !includeStripe[i]) {
LOG.debug("Eliminating ORC stripe-" + i + " of file '" + filePath
+ "' as it did not satisfy predicate condition.");
}
}
return includeStripe;
}
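/**
* Evaluates the SearchArgument against a single stripe's column statistics. Leaves on
* partition columns, or on columns whose type conversion is not PPD-safe, get the
* unknown value YES_NO_NULL. The stripe is kept unless the overall evaluation proves
* the predicate can never match: e.g. for "x &lt; 10 AND y = 'a'", a stripe whose
* statistics show min(x) = 20 evaluates to NO and is eliminated regardless of y.
*/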
private static boolean isStripeSatisfyPredicate(
StripeStatistics stripeStatistics, SearchArgument sarg, int[] filterColumns,
final SchemaEvolution evolution) {
List<PredicateLeaf> predLeaves = sarg.getLeaves();
TruthValue[] truthValues = new TruthValue[predLeaves.size()];
for (int pred = 0; pred < truthValues.length; pred++) {
if (filterColumns[pred] != -1) {
if (evolution != null && !evolution.isPPDSafeConversion(filterColumns[pred])) {
truthValues[pred] = TruthValue.YES_NO_NULL;
} else {
// the column statistics at index 0 contain only the number of rows
ColumnStatistics stats = stripeStatistics.getColumnStatistics()[filterColumns[pred]];
truthValues[pred] = RecordReaderImpl.evaluatePredicate(stats, predLeaves.get(pred), null);
}
} else {
// Partition column case: the partition filter is evaluated by the partition
// pruner, so we do not evaluate it here.
truthValues[pred] = TruthValue.YES_NO_NULL;
}
}
return sarg.evaluate(truthValues).isNeeded();
}
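/**
* Creates the split strategies for one directory. ACID-schema base files and
* "original" (pre-ACID schema) files cannot share a strategy, because isOriginal is a
* per-strategy flag, so a directory containing both yields two strategies; with no
* base files at all, a single delta-only strategy is returned.
*/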
@VisibleForTesting
static List<SplitStrategy<?>> determineSplitStrategies(CombinedCtx combinedCtx, Context context,
FileSystem fs, Path dir, AcidUtils.Directory dirInfo,
List<AcidBaseFileInfo> baseFiles,
List<ParsedDelta> parsedDeltas,
List<OrcProto.Type> readerTypes,
UserGroupInformation ugi, boolean allowSyntheticFileIds) {
List<SplitStrategy<?>> splitStrategies = new ArrayList<SplitStrategy<?>>();
SplitStrategy<?> splitStrategy;
// When there are no base files, we just generate a single split strategy and return.
List<HdfsFileStatusWithId> acidSchemaFiles = new ArrayList<HdfsFileStatusWithId>();
if (baseFiles.isEmpty()) {
splitStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, dirInfo,
acidSchemaFiles, false, parsedDeltas, readerTypes, ugi, allowSyntheticFileIds);
if (splitStrategy != null) {
splitStrategies.add(splitStrategy);
}
return splitStrategies; // return here
}
List<HdfsFileStatusWithId> originalSchemaFiles = new ArrayList<HdfsFileStatusWithId>();
// Separate the base files into acid schema and non-acid(original) schema files.
for (AcidBaseFileInfo acidBaseFileInfo : baseFiles) {
if (acidBaseFileInfo.isOriginal()) {
originalSchemaFiles.add(acidBaseFileInfo.getHdfsFileStatusWithId());
} else {
acidSchemaFiles.add(acidBaseFileInfo.getHdfsFileStatusWithId());
}
}
// Generate split strategy for non-acid schema original files, if any.
if (!originalSchemaFiles.isEmpty()) {
splitStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, dirInfo,
originalSchemaFiles, true, parsedDeltas, readerTypes, ugi, allowSyntheticFileIds);
if (splitStrategy != null) {
splitStrategies.add(splitStrategy);
}
}
// Generate split strategy for acid schema files, if any.
if (!acidSchemaFiles.isEmpty()) {
splitStrategy = determineSplitStrategy(combinedCtx, context, fs, dir, dirInfo,
acidSchemaFiles, false, parsedDeltas, readerTypes, ugi, allowSyntheticFileIds);
if (splitStrategy != null) {
splitStrategies.add(splitStrategy);
}
}
return splitStrategies;
}
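/**
* Picks a split strategy for a set of base files:
* BI - split by HDFS blocks, without reading file footers;
* ETL - read footers and split by stripes (may be combined across directories);
* HYBRID - ETL when the average file size exceeds the max split size or the total
* number of files seen so far is at most etlFileThreshold, BI otherwise.
* With no base files at all, an ACIDSplitStrategy covers the deltas.
*/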
@VisibleForTesting
static SplitStrategy<?> determineSplitStrategy(CombinedCtx combinedCtx, Context context,
FileSystem fs, Path dir, AcidUtils.Directory dirInfo,
List<HdfsFileStatusWithId> baseFiles,
boolean isOriginal,
List<ParsedDelta> parsedDeltas,
List<OrcProto.Type> readerTypes,
UserGroupInformation ugi, boolean allowSyntheticFileIds) {
List<DeltaMetaData> deltas = AcidUtils.serializeDeltas(parsedDeltas);
boolean[] covered = new boolean[context.numBuckets];
// if we have a base to work from
if (!baseFiles.isEmpty()) {
long totalFileSize = 0;
for (HdfsFileStatusWithId child : baseFiles) {
totalFileSize += child.getFileStatus().getLen();
AcidOutputFormat.Options opts = AcidUtils.parseBaseOrDeltaBucketFilename
(child.getFileStatus().getPath(), context.conf);
opts.writingBase(true);
int b = opts.getBucket();
// If the bucket is in the valid range, mark it as covered.
// I wish Hive actually enforced bucketing all of the time.
if (b >= 0 && b < covered.length) {
covered[b] = true;
}
}
int numFiles = baseFiles.size();
long avgFileSize = totalFileSize / numFiles;
int totalFiles = context.numFilesCounter.addAndGet(numFiles);
switch(context.splitStrategyKind) {
case BI:
// BI strategy requested through config
return new BISplitStrategy(context, fs, dir, baseFiles,
isOriginal, deltas, covered, allowSyntheticFileIds);
case ETL:
// ETL strategy requested through config
return combineOrCreateETLStrategy(combinedCtx, context, fs, dir, baseFiles,
deltas, covered, readerTypes, isOriginal, ugi, allowSyntheticFileIds);
default:
// HYBRID strategy
if (avgFileSize > context.maxSize || totalFiles <= context.etlFileThreshold) {
return combineOrCreateETLStrategy(combinedCtx, context, fs, dir, baseFiles,
deltas, covered, readerTypes, isOriginal, ugi, allowSyntheticFileIds);
} else {
return new BISplitStrategy(context, fs, dir, baseFiles,
isOriginal, deltas, covered, allowSyntheticFileIds);
}
}
} else {
// no base, only deltas
return new ACIDSplitStrategy(dir, context.numBuckets, deltas, covered,
context.acidOperationalProperties);
}
}
@Override
public RawReader<OrcStruct> getRawReader(Configuration conf,
boolean collapseEvents,
int bucket,
ValidTxnList validTxnList,
Path baseDirectory,
Path[] deltaDirectory
) throws IOException {
Reader reader = null;
boolean isOriginal = false;
if (baseDirectory != null) {
Path bucketFile;
if (baseDirectory.getName().startsWith(AcidUtils.BASE_PREFIX)) {
bucketFile = AcidUtils.createBucketFile(baseDirectory, bucket);
} else {
isOriginal = true;
bucketFile = findOriginalBucket(baseDirectory.getFileSystem(conf),
baseDirectory, bucket);
}
reader = OrcFile.createReader(bucketFile, OrcFile.readerOptions(conf));
}
return new OrcRawRecordMerger(conf, collapseEvents, reader, isOriginal,
bucket, validTxnList, new Reader.Options(), deltaDirectory);
}
/**
* Represents a cache of ORC file footers (tails). Implementations that support PPD
* may also return pre-evaluated pushdown results.
*/
public interface FooterCache {
ByteBuffer NO_SPLIT_AFTER_PPD = ByteBuffer.wrap(new byte[0]);
void getAndValidate(List<HdfsFileStatusWithId> files, boolean isOriginal,
OrcTail[] result, ByteBuffer[] ppdResult) throws IOException, HiveException;
boolean hasPpd();
boolean isBlocking();
void put(FooterCacheKey cacheKey, OrcTail orcTail) throws IOException;
}
public static class FooterCacheKey {
Long fileId; // used by external cache
Path path; // used by local cache
FooterCacheKey(Long fileId, Path path) {
this.fileId = fileId;
this.path = path;
}
public Long getFileId() {
return fileId;
}
public Path getPath() {
return path;
}
}
/**
* Convert a Hive type property string containing a list of type names into a list of
* TypeDescription objects.
* @param hiveTypeProperty the desired types from hive
* @param maxColumns the maximum number of desired columns
* @return the list of TypeDescription objects.
*/
public static ArrayList<TypeDescription>
typeDescriptionsFromHiveTypeProperty(String hiveTypeProperty,
int maxColumns) {
// CONSIDER: We need a type name parser for TypeDescription.
ArrayList<TypeInfo> typeInfoList = TypeInfoUtils.getTypeInfosFromTypeString(hiveTypeProperty);
ArrayList<TypeDescription> typeDescrList = new ArrayList<TypeDescription>(typeInfoList.size());
for (TypeInfo typeInfo : typeInfoList) {
typeDescrList.add(convertTypeInfo(typeInfo));
if (typeDescrList.size() >= maxColumns) {
break;
}
}
return typeDescrList;
}
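/**
* Converts a Hive TypeInfo into the equivalent ORC TypeDescription, recursing into
* complex types. As a small illustration, the Hive type struct&lt;a:int,b:decimal(10,2)&gt;
* maps to:
* TypeDescription.createStruct()
* .addField("a", TypeDescription.createInt())
* .addField("b", TypeDescription.createDecimal().withScale(2).withPrecision(10))
*/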
public static TypeDescription convertTypeInfo(TypeInfo info) {
switch (info.getCategory()) {
case PRIMITIVE: {
PrimitiveTypeInfo pinfo = (PrimitiveTypeInfo) info;
switch (pinfo.getPrimitiveCategory()) {
case BOOLEAN:
return TypeDescription.createBoolean();
case BYTE:
return TypeDescription.createByte();
case SHORT:
return TypeDescription.createShort();
case INT:
return TypeDescription.createInt();
case LONG:
return TypeDescription.createLong();
case FLOAT:
return TypeDescription.createFloat();
case DOUBLE:
return TypeDescription.createDouble();
case STRING:
return TypeDescription.createString();
case DATE:
return TypeDescription.createDate();
case TIMESTAMP:
return TypeDescription.createTimestamp();
case BINARY:
return TypeDescription.createBinary();
case DECIMAL: {
DecimalTypeInfo dinfo = (DecimalTypeInfo) pinfo;
return TypeDescription.createDecimal()
.withScale(dinfo.getScale())
.withPrecision(dinfo.getPrecision());
}
case VARCHAR: {
BaseCharTypeInfo cinfo = (BaseCharTypeInfo) pinfo;
return TypeDescription.createVarchar()
.withMaxLength(cinfo.getLength());
}
case CHAR: {
BaseCharTypeInfo cinfo = (BaseCharTypeInfo) pinfo;
return TypeDescription.createChar()
.withMaxLength(cinfo.getLength());
}
default:
throw new IllegalArgumentException("ORC doesn't handle primitive" +
" category " + pinfo.getPrimitiveCategory());
}
}
case LIST: {
ListTypeInfo linfo = (ListTypeInfo) info;
return TypeDescription.createList
(convertTypeInfo(linfo.getListElementTypeInfo()));
}
case MAP: {
MapTypeInfo minfo = (MapTypeInfo) info;
return TypeDescription.createMap
(convertTypeInfo(minfo.getMapKeyTypeInfo()),
convertTypeInfo(minfo.getMapValueTypeInfo()));
}
case UNION: {
UnionTypeInfo minfo = (UnionTypeInfo) info;
TypeDescription result = TypeDescription.createUnion();
for (TypeInfo child: minfo.getAllUnionObjectTypeInfos()) {
result.addUnionChild(convertTypeInfo(child));
}
return result;
}
case STRUCT: {
StructTypeInfo sinfo = (StructTypeInfo) info;
TypeDescription result = TypeDescription.createStruct();
for(String fieldName: sinfo.getAllStructFieldNames()) {
result.addField(fieldName,
convertTypeInfo(sinfo.getStructFieldTypeInfo(fieldName)));
}
return result;
}
default:
throw new IllegalArgumentException("ORC doesn't handle " +
info.getCategory());
}
}
/**
* Generate the desired schema for reading the file.
* @param conf the configuration
* @param isAcidRead is this an acid format?
* @param dataColumns the desired number of data columns for vectorized read
* @return the desired schema or null if schema evolution isn't enabled
* @throws IllegalArgumentException
*/
public static TypeDescription getDesiredRowTypeDescr(Configuration conf,
boolean isAcidRead,
int dataColumns) {
String columnNameProperty = null;
String columnTypeProperty = null;
ArrayList<String> schemaEvolutionColumnNames = null;
ArrayList<TypeDescription> schemaEvolutionTypeDescrs = null;
boolean haveSchemaEvolutionProperties = false;
if (isAcidRead || HiveConf.getBoolVar(conf, ConfVars.HIVE_SCHEMA_EVOLUTION) ) {
columnNameProperty = conf.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS);
columnTypeProperty = conf.get(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES);
haveSchemaEvolutionProperties =
(columnNameProperty != null && columnTypeProperty != null);
if (haveSchemaEvolutionProperties) {
schemaEvolutionColumnNames = Lists.newArrayList(columnNameProperty.split(","));
if (schemaEvolutionColumnNames.size() == 0) {
haveSchemaEvolutionProperties = false;
} else {
schemaEvolutionTypeDescrs =
typeDescriptionsFromHiveTypeProperty(columnTypeProperty,
dataColumns);
if (schemaEvolutionTypeDescrs.size() !=
Math.min(dataColumns, schemaEvolutionColumnNames.size())) {
haveSchemaEvolutionProperties = false;
}
}
} else if (isAcidRead) {
throw new IllegalArgumentException(ErrorMsg.SCHEMA_REQUIRED_TO_READ_ACID_TABLES.getErrorCodedMsg());
}
}
if (haveSchemaEvolutionProperties) {
if (LOG.isInfoEnabled()) {
LOG.info("Using schema evolution configuration variables schema.evolution.columns " +
schemaEvolutionColumnNames.toString() +
" / schema.evolution.columns.types " +
schemaEvolutionTypeDescrs.toString() +
" (isAcidRead " + isAcidRead + ")");
}
} else {
// Try regular properties.
columnNameProperty = conf.get(serdeConstants.LIST_COLUMNS);
columnTypeProperty = conf.get(serdeConstants.LIST_COLUMN_TYPES);
if (columnTypeProperty == null || columnNameProperty == null) {
return null;
}
schemaEvolutionColumnNames = Lists.newArrayList(columnNameProperty.split(","));
if (schemaEvolutionColumnNames.size() == 0) {
return null;
}
schemaEvolutionTypeDescrs =
typeDescriptionsFromHiveTypeProperty(columnTypeProperty, dataColumns);
if (schemaEvolutionTypeDescrs.size() !=
Math.min(dataColumns, schemaEvolutionColumnNames.size())) {
return null;
}
// Find the first virtual column and clip it and everything after it off.
int virtualColumnClipNum = -1;
int columnNum = 0;
for (String columnName : schemaEvolutionColumnNames) {
if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(columnName)) {
virtualColumnClipNum = columnNum;
break;
}
columnNum++;
}
if (virtualColumnClipNum != -1 && virtualColumnClipNum < dataColumns) {
schemaEvolutionColumnNames =
Lists.newArrayList(schemaEvolutionColumnNames.subList(0, virtualColumnClipNum));
schemaEvolutionTypeDescrs = Lists.newArrayList(schemaEvolutionTypeDescrs.subList(0, virtualColumnClipNum));
}
if (LOG.isInfoEnabled()) {
LOG.info("Using column configuration variables columns " +
schemaEvolutionColumnNames.toString() +
" / columns.types " +
schemaEvolutionTypeDescrs.toString() +
" (isAcidRead " + isAcidRead + ")");
}
}
// Desired schema does not include virtual columns or partition columns.
TypeDescription result = TypeDescription.createStruct();
for (int i = 0; i < schemaEvolutionTypeDescrs.size(); i++) {
result.addField(schemaEvolutionColumnNames.get(i), schemaEvolutionTypeDescrs.get(i));
}
return result;
}
@VisibleForTesting
protected ExternalFooterCachesByConf createExternalCaches() {
return null; // The default ones are created in case of null; tests override this.
}
@Override
public BatchToRowReader<?, ?> getWrapper(
org.apache.hadoop.mapred.RecordReader<NullWritable, VectorizedRowBatch> vrr,
VectorizedRowBatchCtx vrbCtx, List<Integer> includedCols) {
return new OrcOiBatchToRowReader(vrr, vrbCtx, includedCols);
}
}