/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.plan;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.plan.Explain.Level;
import org.apache.hadoop.hive.ql.plan.Explain.Vectorization;
/**
* FileSinkDesc.
*
*/
@Explain(displayName = "File Output Operator", explainLevels = { Level.USER, Level.DEFAULT, Level.EXTENDED })
public class FileSinkDesc extends AbstractOperatorDesc {
private static final long serialVersionUID = 1L;
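/**
* How rows arriving at this sink are sorted with respect to dynamic
* partitions: not sorted, sorted by partition values, or sorted by partition
* values and bucket number.
*/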
public enum DPSortState {
NONE, PARTITION_SORTED, PARTITION_BUCKET_SORTED
}
private DPSortState dpSortState;
private Path dirName;
// Normally statsKeyPref is the same as dirName, but dirName may be
// changed by the local execution optimization.
private String statsKeyPref;
private TableDesc tableInfo;
private boolean compressed;
private int destTableId;
private String compressCodec;
private String compressType;
private boolean multiFileSpray;
private boolean temporary;
private boolean materialization;
// Whether the files output by this FileSink can be merged: for example, files
// destined for a bucketed or sorted table/partition cannot be merged.
private boolean canBeMerged;
private int totalFiles;
private ArrayList<ExprNodeDesc> partitionCols;
private int numFiles;
private DynamicPartitionCtx dpCtx;
private String staticSpec; // static partition spec ends with a '/'
private boolean gatherStats;
// Consider a query like:
// insert overwrite table T3 select ... from T1 join T2 on T1.key = T2.key;
// where T1, T2 and T3 are sorted and bucketed by key into the same number of
// buckets: no reducer is needed to enforce the bucketing and sorting of T3.
// The field below records that the reducer introduced to enforce the sorting/
// bucketing of T3 has been removed. In that case a sort-merge join is needed,
// and so the sort-merge join between T1 and T2 cannot be performed as a
// map-only job.
private transient boolean removedReduceSinkBucketSort;
// This file descriptor is linked to other file descriptors.
// One use case is a union -> select (star) -> file sink plan that is broken down.
// For example, consider a query like:
// select * from (subq1 union all subq2) x;
// where subq1 or subq2 involves a map-reduce job.
// It is broken into two independent queries involving subq1 and subq2 directly, and
// the sub-queries write to sub-directories of a common directory. So, the file sink
// descriptors for subq1 and subq2 are linked.
private boolean linkedFileSink = false;
private Path parentDir;
private transient List<FileSinkDesc> linkedFileSinkDesc;
private boolean statsReliable;
private ListBucketingCtx lbCtx;
private String statsTmpDir;
// Record what type of write this is. Default is non-ACID (i.e. the old style).
private AcidUtils.Operation writeType = AcidUtils.Operation.NOT_ACID;
private long txnId = 0; // transaction id for this operation
private int statementId = -1;
private transient Table table;
private Path destPath;
private boolean isHiveServerQuery;
/**
* Whether this is a HiveServer query whose destination table is
* written using ThriftJDBCBinarySerDe.
*/
private boolean isUsingThriftJDBCBinarySerDe = false;
public FileSinkDesc() {
}
/**
* @param dirName - the directory this file sink writes to
* @param destPath - the final destination for data
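* <p>An illustrative call, with hypothetical arguments (not from this file):
* {@code new FileSinkDesc(new Path("/tmp/hive/out"), tblDesc, false, 1, false,
* true, 1, 1, null, null, new Path("/warehouse/t1"))}.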
*/
public FileSinkDesc(final Path dirName, final TableDesc tableInfo,
final boolean compressed, final int destTableId, final boolean multiFileSpray,
final boolean canBeMerged, final int numFiles, final int totalFiles,
final ArrayList<ExprNodeDesc> partitionCols, final DynamicPartitionCtx dpCtx, Path destPath) {
this.dirName = dirName;
this.tableInfo = tableInfo;
this.compressed = compressed;
this.destTableId = destTableId;
this.multiFileSpray = multiFileSpray;
this.canBeMerged = canBeMerged;
this.numFiles = numFiles;
this.totalFiles = totalFiles;
this.partitionCols = partitionCols;
this.dpCtx = dpCtx;
this.dpSortState = DPSortState.NONE;
this.destPath = destPath;
}
public FileSinkDesc(final Path dirName, final TableDesc tableInfo,
final boolean compressed) {
this.dirName = dirName;
this.tableInfo = tableInfo;
this.compressed = compressed;
destTableId = 0;
this.multiFileSpray = false;
this.canBeMerged = false;
this.numFiles = 1;
this.totalFiles = 1;
this.partitionCols = null;
this.dpSortState = DPSortState.NONE;
}
@Override
public Object clone() throws CloneNotSupportedException {
FileSinkDesc ret = new FileSinkDesc(dirName, tableInfo, compressed,
destTableId, multiFileSpray, canBeMerged, numFiles, totalFiles,
partitionCols, dpCtx, destPath);
ret.setCompressCodec(compressCodec);
ret.setCompressType(compressType);
ret.setGatherStats(gatherStats);
ret.setStaticSpec(staticSpec);
ret.setStatsAggPrefix(statsKeyPref);
ret.setLinkedFileSink(linkedFileSink);
ret.setParentDir(parentDir);
ret.setLinkedFileSinkDesc(linkedFileSinkDesc);
ret.setStatsReliable(statsReliable);
ret.setDpSortState(dpSortState);
ret.setWriteType(writeType);
ret.setTransactionId(txnId);
ret.setStatsTmpDir(statsTmpDir);
return ret;
}
public boolean isHiveServerQuery() {
return this.isHiveServerQuery;
}
public void setHiveServerQuery(boolean isHiveServerQuery) {
this.isHiveServerQuery = isHiveServerQuery;
}
public boolean isUsingThriftJDBCBinarySerDe() {
return this.isUsingThriftJDBCBinarySerDe;
}
public void setIsUsingThriftJDBCBinarySerDe(boolean isUsingThriftJDBCBinarySerDe) {
this.isUsingThriftJDBCBinarySerDe = isUsingThriftJDBCBinarySerDe;
}
@Explain(displayName = "directory", explainLevels = { Level.EXTENDED })
public Path getDirName() {
return dirName;
}
public void setDirName(final Path dirName) {
this.dirName = dirName;
}
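/**
* Returns the directory the data finally lands in: for a linked file sink the
* linked sinks write to sub-directories of a common parent, so the parent is
* the final directory; otherwise it is dirName itself.
*/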
public Path getFinalDirName() {
return linkedFileSink ? parentDir : dirName;
}
@Explain(displayName = "table", explainLevels = { Level.USER, Level.DEFAULT, Level.EXTENDED })
public TableDesc getTableInfo() {
return tableInfo;
}
public void setTableInfo(final TableDesc tableInfo) {
this.tableInfo = tableInfo;
}
@Explain(displayName = "compressed")
public boolean getCompressed() {
return compressed;
}
public void setCompressed(boolean compressed) {
this.compressed = compressed;
}
@Explain(displayName = "GlobalTableId", explainLevels = { Level.EXTENDED })
public int getDestTableId() {
return destTableId;
}
public void setDestTableId(int destTableId) {
this.destTableId = destTableId;
}
public String getCompressCodec() {
return compressCodec;
}
public void setCompressCodec(String intermediateCompressorCodec) {
compressCodec = intermediateCompressorCodec;
}
public String getCompressType() {
return compressType;
}
public void setCompressType(String intermediateCompressType) {
compressType = intermediateCompressType;
}
/**
* @return the multiFileSpray
*/
@Explain(displayName = "MultiFileSpray", explainLevels = { Level.EXTENDED })
public boolean isMultiFileSpray() {
return multiFileSpray;
}
/**
* @param multiFileSpray the multiFileSpray to set
*/
public void setMultiFileSpray(boolean multiFileSpray) {
this.multiFileSpray = multiFileSpray;
}
/**
* @return destination is temporary
*/
public boolean isTemporary() {
return temporary;
}
public void setTemporary(boolean temporary) {
this.temporary = temporary;
}
public boolean isMaterialization() {
return materialization;
}
public void setMaterialization(boolean materialization) {
this.materialization = materialization;
}
public boolean canBeMerged() {
return canBeMerged;
}
public void setCanBeMerged(boolean canBeMerged) {
this.canBeMerged = canBeMerged;
}
/**
* @return the totalFiles
*/
@Explain(displayName = "TotalFiles", explainLevels = { Level.EXTENDED })
public int getTotalFiles() {
return totalFiles;
}
/**
* @param totalFiles the totalFiles to set
*/
public void setTotalFiles(int totalFiles) {
this.totalFiles = totalFiles;
}
/**
* @return the partitionCols
*/
public ArrayList<ExprNodeDesc> getPartitionCols() {
return partitionCols;
}
/**
* @param partitionCols the partitionCols to set
*/
public void setPartitionCols(ArrayList<ExprNodeDesc> partitionCols) {
this.partitionCols = partitionCols;
}
/**
* @return the numFiles
*/
@Explain(displayName = "NumFilesPerFileSink", explainLevels = { Level.EXTENDED })
public int getNumFiles() {
return numFiles;
}
/**
* @param numFiles the numFiles to set
*/
public void setNumFiles(int numFiles) {
this.numFiles = numFiles;
}
public void setDynPartCtx(DynamicPartitionCtx dpc) {
this.dpCtx = dpc;
}
public DynamicPartitionCtx getDynPartCtx() {
return this.dpCtx;
}
public void setStaticSpec(String staticSpec) {
this.staticSpec = staticSpec;
}
@Explain(displayName = "Static Partition Specification", explainLevels = { Level.EXTENDED })
public String getStaticSpec() {
return staticSpec;
}
public void setGatherStats(boolean gatherStats) {
this.gatherStats = gatherStats;
}
@Explain(displayName = "GatherStats", explainLevels = { Level.EXTENDED })
public boolean isGatherStats() {
return gatherStats;
}
/**
* Construct the key prefix used for (intermediate) statistics publishing
* and aggregation. During the stats publishing phase, the optional dynamic
* partition spec and the task ID are appended to this prefix; the whole key
* uniquely identifies the output of a task for this job. In the stats
* aggregation phase, the prefix plus the dynamic partition specs (obtained
* at run time, after the MR job finishes) serves as the aggregation prefix:
* all rows sharing that prefix, i.e. the output of all tasks for this job,
* are aggregated together.
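* <p>For example (hypothetical values), a complete key may look like
* {@code <prefix>/ds=2010-01-01/000000_0}: the prefix, followed by a dynamic
* partition spec and a task ID.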
* @return key prefix used for stats publishing and aggregation.
*/
@Explain(displayName = "Stats Publishing Key Prefix", explainLevels = { Level.EXTENDED })
public String getStatsAggPrefix() {
// dirName uniquely identifies the destination directory of a FileSinkOperator.
// If more than one FileSinkOperator writes to the same partition, each
// operator's dirName should be different.
return statsKeyPref;
}
/**
* Set the stats aggregation key prefix. If the input string does not end with
* Path.SEPARATOR, one is appended so that the key names a directory.
* @param k input directory name.
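* <p>For example (hypothetical path), {@code setStatsAggPrefix("/tmp/stats")}
* and {@code setStatsAggPrefix("/tmp/stats/")} both store {@code "/tmp/stats/"}.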
*/
public void setStatsAggPrefix(String k) {
if (k.endsWith(Path.SEPARATOR)) {
statsKeyPref = k;
} else {
statsKeyPref = k + Path.SEPARATOR;
}
}
public boolean isLinkedFileSink() {
return linkedFileSink;
}
public void setLinkedFileSink(boolean linkedFileSink) {
this.linkedFileSink = linkedFileSink;
}
public Path getParentDir() {
return parentDir;
}
public void setParentDir(Path parentDir) {
this.parentDir = parentDir;
}
public boolean isStatsReliable() {
return statsReliable;
}
public void setStatsReliable(boolean statsReliable) {
this.statsReliable = statsReliable;
}
/**
* @return the lbCtx
*/
public ListBucketingCtx getLbCtx() {
return lbCtx;
}
/**
* @param lbCtx the lbCtx to set
*/
public void setLbCtx(ListBucketingCtx lbCtx) {
this.lbCtx = lbCtx;
}
public List<FileSinkDesc> getLinkedFileSinkDesc() {
return linkedFileSinkDesc;
}
public void setLinkedFileSinkDesc(List<FileSinkDesc> linkedFileSinkDesc) {
this.linkedFileSinkDesc = linkedFileSinkDesc;
}
public boolean isRemovedReduceSinkBucketSort() {
return removedReduceSinkBucketSort;
}
public void setRemovedReduceSinkBucketSort(boolean removedReduceSinkBucketSort) {
this.removedReduceSinkBucketSort = removedReduceSinkBucketSort;
}
public DPSortState getDpSortState() {
return dpSortState;
}
@Explain(displayName = "Dp Sort State")
public String getDpSortStateString() {
return getDpSortState() == DPSortState.NONE ? null : getDpSortState().toString();
}
public void setDpSortState(DPSortState dpSortState) {
this.dpSortState = dpSortState;
}
public void setWriteType(AcidUtils.Operation type) {
writeType = type;
}
public AcidUtils.Operation getWriteType() {
return writeType;
}
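/**
* String form of the write type for Explain output; null when the write is
* NOT_ACID, so the entry can be omitted from the plan.
*/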
@Explain(displayName = "Write Type")
public String getWriteTypeString() {
return getWriteType() == AcidUtils.Operation.NOT_ACID ? null : getWriteType().toString();
}
public void setTransactionId(long id) {
txnId = id;
}
public long getTransactionId() {
return txnId;
}
public void setStatementId(int id) {
statementId = id;
}
/**
* See {@link org.apache.hadoop.hive.ql.io.AcidOutputFormat.Options#statementId(int)}
*/
public int getStatementId() {
return statementId;
}
public Path getDestPath() {
return destPath;
}
public Table getTable() {
return table;
}
public void setTable(Table table) {
this.table = table;
}
public String getStatsTmpDir() {
return statsTmpDir;
}
public void setStatsTmpDir(String statsCollectionTempDir) {
this.statsTmpDir = statsCollectionTempDir;
}
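/**
* Explain-plan summary of vectorization for this operator; the file sink has
* no native vectorized implementation.
*/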
public class FileSinkOperatorExplainVectorization extends OperatorExplainVectorization {
public FileSinkOperatorExplainVectorization(VectorDesc vectorDesc) {
// Native vectorization not supported.
super(vectorDesc, false);
}
}
@Explain(vectorization = Vectorization.OPERATOR, displayName = "File Sink Vectorization", explainLevels = { Level.DEFAULT, Level.EXTENDED })
public FileSinkOperatorExplainVectorization getFileSinkVectorization() {
if (vectorDesc == null) {
return null;
}
return new FileSinkOperatorExplainVectorization(vectorDesc);
}
}