/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.plan;

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.plan.Explain.Level;
import org.apache.hadoop.hive.ql.plan.Explain.Vectorization;

/**
 * FileSinkDesc.
 *
 */
@Explain(displayName = "File Output Operator",
    explainLevels = { Level.USER, Level.DEFAULT, Level.EXTENDED })
public class FileSinkDesc extends AbstractOperatorDesc {
  private static final long serialVersionUID = 1L;

  public enum DPSortState {
    NONE, PARTITION_SORTED, PARTITION_BUCKET_SORTED
  }

  private DPSortState dpSortState;
  private Path dirName;
  // normally statsKeyPref will be the same as dirName, but the latter
  // could be changed in local execution optimization
  private String statsKeyPref;
  private TableDesc tableInfo;
  private boolean compressed;
  private int destTableId;
  private String compressCodec;
  private String compressType;
  private boolean multiFileSpray;
  private boolean temporary;
  private boolean materialization;
  // Whether the files output by this FileSink can be merged, e.g. if they are to be put into a
  // bucketed or sorted table/partition they cannot be merged.
  private boolean canBeMerged;
  private int totalFiles;
  private ArrayList<ExprNodeDesc> partitionCols;
  private int numFiles;
  private DynamicPartitionCtx dpCtx;
  private String staticSpec; // static partition spec ends with a '/'
  private boolean gatherStats;

  // Consider a query like:
  //   insert overwrite table T3 select ... from T1 join T2 on T1.key = T2.key;
  // where T1, T2 and T3 are sorted and bucketed by key into the same number of buckets.
  // We don't need a reducer to enforce bucketing and sorting for T3.
  // The field below captures the fact that the reducer introduced to enforce sorting/
  // bucketing of T3 has been removed.
  // In this case, a sort-merge join is needed, and so the sort-merge join between T1 and T2
  // cannot be performed as a map-only job.
  private transient boolean removedReduceSinkBucketSort;

  // This file descriptor is linked to other file descriptors.
  // One use case is that a union -> select (star) -> file sink is broken down.
  // For example, consider a query like:
  //   select * from (subq1 union all subq2) x;
  // where subq1 or subq2 involves a map-reduce job.
  // It is broken into two independent queries involving subq1 and subq2 directly, and
  // the sub-queries write to sub-directories of a common directory. So, the file sink
  // descriptors for subq1 and subq2 are linked.
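  // Hedged illustration only (the directory names below are hypothetical, not values produced by
  // this class): for the union query above, both rewritten sub-queries might share a parentDir
  // such as .../-ext-10000 and write to child directories like .../-ext-10000/1 and
  // .../-ext-10000/2, while getFinalDirName() on either linked sink returns the shared parentDir.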
  private boolean linkedFileSink = false;
  private Path parentDir;
  transient private List<FileSinkDesc> linkedFileSinkDesc;

  private boolean statsReliable;
  private ListBucketingCtx lbCtx;
  private String statsTmpDir;

  // Record what type of write this is. Default is non-ACID (i.e. old style).
  private AcidUtils.Operation writeType = AcidUtils.Operation.NOT_ACID;
  private long txnId = 0;  // transaction id for this operation
  private int statementId = -1;

  private transient Table table;
  private Path destPath;
  private boolean isHiveServerQuery;

  /**
   * Whether this is a HiveServer query and the destination table is
   * written using ThriftJDBCBinarySerDe.
   */
  private boolean isUsingThriftJDBCBinarySerDe = false;

  public FileSinkDesc() {
  }

  /**
   * @param destPath - the final destination for data
   */
  public FileSinkDesc(final Path dirName, final TableDesc tableInfo,
      final boolean compressed, final int destTableId, final boolean multiFileSpray,
      final boolean canBeMerged, final int numFiles, final int totalFiles,
      final ArrayList<ExprNodeDesc> partitionCols, final DynamicPartitionCtx dpCtx, Path destPath) {
    this.dirName = dirName;
    this.tableInfo = tableInfo;
    this.compressed = compressed;
    this.destTableId = destTableId;
    this.multiFileSpray = multiFileSpray;
    this.canBeMerged = canBeMerged;
    this.numFiles = numFiles;
    this.totalFiles = totalFiles;
    this.partitionCols = partitionCols;
    this.dpCtx = dpCtx;
    this.dpSortState = DPSortState.NONE;
    this.destPath = destPath;
  }

  public FileSinkDesc(final Path dirName, final TableDesc tableInfo,
      final boolean compressed) {
    this.dirName = dirName;
    this.tableInfo = tableInfo;
    this.compressed = compressed;
    destTableId = 0;
    this.multiFileSpray = false;
    this.canBeMerged = false;
    this.numFiles = 1;
    this.totalFiles = 1;
    this.partitionCols = null;
    this.dpSortState = DPSortState.NONE;
  }

  @Override
  public Object clone() throws CloneNotSupportedException {
    FileSinkDesc ret = new FileSinkDesc(dirName, tableInfo, compressed, destTableId,
        multiFileSpray, canBeMerged, numFiles, totalFiles, partitionCols, dpCtx, destPath);
    ret.setCompressCodec(compressCodec);
    ret.setCompressType(compressType);
    ret.setGatherStats(gatherStats);
    ret.setStaticSpec(staticSpec);
    ret.setStatsAggPrefix(statsKeyPref);
    ret.setLinkedFileSink(linkedFileSink);
    ret.setParentDir(parentDir);
    ret.setLinkedFileSinkDesc(linkedFileSinkDesc);
    ret.setStatsReliable(statsReliable);
    ret.setDpSortState(dpSortState);
    ret.setWriteType(writeType);
    ret.setTransactionId(txnId);
    ret.setStatsTmpDir(statsTmpDir);
    return ret;
  }

  public boolean isHiveServerQuery() {
    return this.isHiveServerQuery;
  }

  public void setHiveServerQuery(boolean isHiveServerQuery) {
    this.isHiveServerQuery = isHiveServerQuery;
  }

  public boolean isUsingThriftJDBCBinarySerDe() {
    return this.isUsingThriftJDBCBinarySerDe;
  }

  public void setIsUsingThriftJDBCBinarySerDe(boolean isUsingThriftJDBCBinarySerDe) {
    this.isUsingThriftJDBCBinarySerDe = isUsingThriftJDBCBinarySerDe;
  }

  @Explain(displayName = "directory", explainLevels = { Level.EXTENDED })
  public Path getDirName() {
    return dirName;
  }

  public void setDirName(final Path dirName) {
    this.dirName = dirName;
  }

  public Path getFinalDirName() {
    return linkedFileSink ? parentDir : dirName;
  }
  @Explain(displayName = "table", explainLevels = { Level.USER, Level.DEFAULT, Level.EXTENDED })
  public TableDesc getTableInfo() {
    return tableInfo;
  }

  public void setTableInfo(final TableDesc tableInfo) {
    this.tableInfo = tableInfo;
  }

  @Explain(displayName = "compressed")
  public boolean getCompressed() {
    return compressed;
  }

  public void setCompressed(boolean compressed) {
    this.compressed = compressed;
  }

  @Explain(displayName = "GlobalTableId", explainLevels = { Level.EXTENDED })
  public int getDestTableId() {
    return destTableId;
  }

  public void setDestTableId(int destTableId) {
    this.destTableId = destTableId;
  }

  public String getCompressCodec() {
    return compressCodec;
  }

  public void setCompressCodec(String intermediateCompressorCodec) {
    compressCodec = intermediateCompressorCodec;
  }

  public String getCompressType() {
    return compressType;
  }

  public void setCompressType(String intermediateCompressType) {
    compressType = intermediateCompressType;
  }

  /**
   * @return the multiFileSpray
   */
  @Explain(displayName = "MultiFileSpray", explainLevels = { Level.EXTENDED })
  public boolean isMultiFileSpray() {
    return multiFileSpray;
  }

  /**
   * @param multiFileSpray the multiFileSpray to set
   */
  public void setMultiFileSpray(boolean multiFileSpray) {
    this.multiFileSpray = multiFileSpray;
  }

  /**
   * @return true if the destination is temporary
   */
  public boolean isTemporary() {
    return temporary;
  }

  public void setTemporary(boolean temporary) {
    this.temporary = temporary;
  }

  public boolean isMaterialization() {
    return materialization;
  }

  public void setMaterialization(boolean materialization) {
    this.materialization = materialization;
  }

  public boolean canBeMerged() {
    return canBeMerged;
  }

  public void setCanBeMerged(boolean canBeMerged) {
    this.canBeMerged = canBeMerged;
  }

  /**
   * @return the totalFiles
   */
  @Explain(displayName = "TotalFiles", explainLevels = { Level.EXTENDED })
  public int getTotalFiles() {
    return totalFiles;
  }

  /**
   * @param totalFiles the totalFiles to set
   */
  public void setTotalFiles(int totalFiles) {
    this.totalFiles = totalFiles;
  }

  /**
   * @return the partitionCols
   */
  public ArrayList<ExprNodeDesc> getPartitionCols() {
    return partitionCols;
  }

  /**
   * @param partitionCols the partitionCols to set
   */
  public void setPartitionCols(ArrayList<ExprNodeDesc> partitionCols) {
    this.partitionCols = partitionCols;
  }

  /**
   * @return the numFiles
   */
  @Explain(displayName = "NumFilesPerFileSink", explainLevels = { Level.EXTENDED })
  public int getNumFiles() {
    return numFiles;
  }

  /**
   * @param numFiles the numFiles to set
   */
  public void setNumFiles(int numFiles) {
    this.numFiles = numFiles;
  }

  public void setDynPartCtx(DynamicPartitionCtx dpc) {
    this.dpCtx = dpc;
  }

  public DynamicPartitionCtx getDynPartCtx() {
    return this.dpCtx;
  }

  public void setStaticSpec(String staticSpec) {
    this.staticSpec = staticSpec;
  }

  @Explain(displayName = "Static Partition Specification", explainLevels = { Level.EXTENDED })
  public String getStaticSpec() {
    return staticSpec;
  }

  public void setGatherStats(boolean gatherStats) {
    this.gatherStats = gatherStats;
  }

  @Explain(displayName = "GatherStats", explainLevels = { Level.EXTENDED })
  public boolean isGatherStats() {
    return gatherStats;
  }

  /**
   * Construct the key prefix used for (intermediate) statistics publishing
   * and aggregation. During the stats publishing phase, this key prefix is
   * appended with the optional dynamic partition spec and the task ID; the
   * whole key uniquely identifies the output of a task for this job. In the
   * stats aggregation phase, this prefix plus the dynamic partition specs
   * (obtained at run-time after the MR job finishes) serves as the aggregation
   * prefix: all rows with the same prefix (the output of all tasks for this job)
   * will be aggregated.
   * @return key prefix used for stats publishing and aggregation.
   */
  @Explain(displayName = "Stats Publishing Key Prefix", explainLevels = { Level.EXTENDED })
  public String getStatsAggPrefix() {
    // dirName uniquely identifies the destination directory of a FileSinkOperator.
    // If more than one FileSinkOperator writes to the same partition, this dirName
    // should be different.
    return statsKeyPref;
  }

  /**
   * Set the stats aggregation key. If the input string is not terminated by Path.SEPARATOR,
   * one is appended so that the key forms a directory name.
   * @param k input directory name.
   */
  public void setStatsAggPrefix(String k) {
    if (k.endsWith(Path.SEPARATOR)) {
      statsKeyPref = k;
    } else {
      statsKeyPref = k + Path.SEPARATOR;
    }
  }
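  // Hedged illustration (the concrete values are hypothetical, not emitted by this class): with
  // statsKeyPref set to "/tmp/hive/-ext-10000/", a task writing dynamic partition ds=2024-01-01
  // might publish intermediate stats under a key shaped like
  // "/tmp/hive/-ext-10000/ds=2024-01-01/000001_0", and aggregation later combines all rows that
  // share the "/tmp/hive/-ext-10000/ds=2024-01-01/" prefix.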
  public boolean isLinkedFileSink() {
    return linkedFileSink;
  }

  public void setLinkedFileSink(boolean linkedFileSink) {
    this.linkedFileSink = linkedFileSink;
  }

  public Path getParentDir() {
    return parentDir;
  }

  public void setParentDir(Path parentDir) {
    this.parentDir = parentDir;
  }

  public boolean isStatsReliable() {
    return statsReliable;
  }

  public void setStatsReliable(boolean statsReliable) {
    this.statsReliable = statsReliable;
  }

  /**
   * @return the lbCtx
   */
  public ListBucketingCtx getLbCtx() {
    return lbCtx;
  }

  /**
   * @param lbCtx the lbCtx to set
   */
  public void setLbCtx(ListBucketingCtx lbCtx) {
    this.lbCtx = lbCtx;
  }

  public List<FileSinkDesc> getLinkedFileSinkDesc() {
    return linkedFileSinkDesc;
  }

  public void setLinkedFileSinkDesc(List<FileSinkDesc> linkedFileSinkDesc) {
    this.linkedFileSinkDesc = linkedFileSinkDesc;
  }

  public boolean isRemovedReduceSinkBucketSort() {
    return removedReduceSinkBucketSort;
  }

  public void setRemovedReduceSinkBucketSort(boolean removedReduceSinkBucketSort) {
    this.removedReduceSinkBucketSort = removedReduceSinkBucketSort;
  }

  public DPSortState getDpSortState() {
    return dpSortState;
  }

  @Explain(displayName = "Dp Sort State")
  public String getDpSortStateString() {
    return getDpSortState() == DPSortState.NONE ? null : getDpSortState().toString();
  }

  public void setDpSortState(DPSortState dpSortState) {
    this.dpSortState = dpSortState;
  }

  public void setWriteType(AcidUtils.Operation type) {
    writeType = type;
  }

  public AcidUtils.Operation getWriteType() {
    return writeType;
  }

  @Explain(displayName = "Write Type")
  public String getWriteTypeString() {
    return getWriteType() == AcidUtils.Operation.NOT_ACID ? null : getWriteType().toString();
  }

  public void setTransactionId(long id) {
    txnId = id;
  }

  public long getTransactionId() {
    return txnId;
  }

  public void setStatementId(int id) {
    statementId = id;
  }

  /**
   * See {@link org.apache.hadoop.hive.ql.io.AcidOutputFormat.Options#statementId(int)}
   */
  public int getStatementId() {
    return statementId;
  }

  public Path getDestPath() {
    return destPath;
  }

  public Table getTable() {
    return table;
  }

  public void setTable(Table table) {
    this.table = table;
  }

  public String getStatsTmpDir() {
    return statsTmpDir;
  }

  public void setStatsTmpDir(String statsCollectionTempDir) {
    this.statsTmpDir = statsCollectionTempDir;
  }
  public class FileSinkOperatorExplainVectorization extends OperatorExplainVectorization {

    public FileSinkOperatorExplainVectorization(VectorDesc vectorDesc) {
      // Native vectorization not supported.
      super(vectorDesc, false);
    }
  }

  @Explain(vectorization = Vectorization.OPERATOR, displayName = "File Sink Vectorization",
      explainLevels = { Level.DEFAULT, Level.EXTENDED })
  public FileSinkOperatorExplainVectorization getFileSinkVectorization() {
    if (vectorDesc == null) {
      return null;
    }
    return new FileSinkOperatorExplainVectorization(vectorDesc);
  }
}
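// Minimal usage sketch (hedged: the path, the TableDesc, and the settings shown are hypothetical,
// and production code obtains FileSinkDesc instances from query compilation rather than building
// them by hand):
//
//   FileSinkDesc fsd = new FileSinkDesc(new Path("/tmp/warehouse/t3/-ext-10000"), tableDesc, false);
//   fsd.setGatherStats(true);
//   fsd.setStatsAggPrefix("/tmp/warehouse/t3/-ext-10000");
//   fsd.setWriteType(AcidUtils.Operation.INSERT);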