/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.exec;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.ql.stats.StatsCollectionContext;
import org.apache.hadoop.hive.ql.stats.StatsPublisher;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapred.JobConf;

/**
 * Table Scan Operator. If the data is coming from the map-reduce framework, it is simply
 * forwarded. This operator is also needed as part of local work, when data is not being
 * read through the map-reduce framework.
 **/
public class TableScanOperator extends Operator<TableScanDesc> implements
    Serializable {
  private static final long serialVersionUID = 1L;

  protected transient JobConf jc;
  private transient boolean inputFileChanged = false;
  private TableDesc tableDesc;

  private transient Stat currentStat;
  private transient Map<String, Stat> stats;

  private transient int rowLimit = -1;
  private transient int currCount = 0;
  // insideView tells whether this TableScan is inside a view or not.
  private transient boolean insideView;

  private String defaultPartitionName;

  /**
   * These values are saved during MapWork, FetchWork, etc. preparation and later added to the
   * JobConf of each task.
   */
  private String schemaEvolutionColumns;
  private String schemaEvolutionColumnsTypes;

  public TableDesc getTableDesc() {
    return tableDesc;
  }

  public void setTableDesc(TableDesc tableDesc) {
    this.tableDesc = tableDesc;
  }

  public void setSchemaEvolution(String schemaEvolutionColumns, String schemaEvolutionColumnsTypes) {
    this.schemaEvolutionColumns = schemaEvolutionColumns;
    this.schemaEvolutionColumnsTypes = schemaEvolutionColumnsTypes;
  }

  public String getSchemaEvolutionColumns() {
    return schemaEvolutionColumns;
  }

  public String getSchemaEvolutionColumnsTypes() {
    return schemaEvolutionColumnsTypes;
  }

  /**
   * Other than gathering statistics for the ANALYZE command, the table scan operator does not do
   * anything special beyond forwarding the row, since the table data is always read by the mapper
   * as part of the map-reduce framework. When that assumption stops being true, i.e. when table
   * data is no longer read only by the mapper, this operator will be enhanced to read the table.
   **/
  @Override
  public void process(Object row, int tag) throws HiveException {
    if (rowLimit >= 0) {
      if (row instanceof VectorizedRowBatch) {
        VectorizedRowBatch batch = (VectorizedRowBatch) row;
        if (currCount >= rowLimit) {
          setDone(true);
          return;
        }
        if (currCount + batch.size > rowLimit) {
          batch.size = rowLimit - currCount;
        }
        currCount += batch.size;
      } else if (currCount++ >= rowLimit) {
        setDone(true);
        return;
      }
    }
    if (conf != null && conf.isGatherStats()) {
      gatherStats(row);
    }
    forward(row, inputObjInspectors[tag]);
  }

  // Change the table partition for collecting stats
  @Override
  public void cleanUpInputFileChangedOp() throws HiveException {
    inputFileChanged = true;
    // If the file name to bucket number mapping is maintained, store the bucket number
    // in the execution context. This is needed for the following scenario:
    //   insert overwrite table T1 select * from T2;
    // where T1 and T2 are sorted/bucketed by the same keys into the same number of buckets.
    // Although one mapper per file is used (BucketizedHiveInputFormat), it is possible that
    // any mapper can pick up any file (depending on the size of the files). The bucket number
    // corresponding to the input file is stored to name the output bucket file appropriately.
    Map<String, Integer> bucketNameMapping =
        (conf != null) ? conf.getBucketFileNameMapping() : null;
    if ((bucketNameMapping != null) && (!bucketNameMapping.isEmpty())) {
      Path currentInputPath = getExecContext().getCurrentInputPath();
      getExecContext().setFileId(Integer.toString(bucketNameMapping.get(
          currentInputPath.getName())));
    }
  }
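  /**
   * Gathers basic statistics (row count and raw data size) for the current row. Statistics are
   * kept per partition spec; the spec is recomputed on the first row and whenever the input file
   * changes, since all rows seen by a mapper between file switches belong to the same partition.
   */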
  private void gatherStats(Object row) {
    // first row/call or a new partition
    if ((currentStat == null) || inputFileChanged) {
      String partitionSpecs;
      inputFileChanged = false;
      if (conf.getPartColumns() == null || conf.getPartColumns().size() == 0) {
        partitionSpecs = ""; // non-partitioned
      } else {
        // Figure out the partition spec from the input.
        // This is only done once for the first row (when stat == null)
        // since all rows in the same mapper should be from the same partition.
        List<Object> writable;
        List<String> values;
        int dpStartCol; // the first position of partition column
        assert inputObjInspectors[0].getCategory() == ObjectInspector.Category.STRUCT
            : "input object inspector is not struct";

        writable = new ArrayList<Object>(conf.getPartColumns().size());
        values = new ArrayList<String>(conf.getPartColumns().size());
        dpStartCol = 0;
        StructObjectInspector soi = (StructObjectInspector) inputObjInspectors[0];
        for (StructField sf : soi.getAllStructFieldRefs()) {
          String fn = sf.getFieldName();
          if (!conf.getPartColumns().contains(fn)) {
            dpStartCol++;
          } else {
            break;
          }
        }
        ObjectInspectorUtils.partialCopyToStandardObject(writable, row, dpStartCol, conf
            .getPartColumns().size(), (StructObjectInspector) inputObjInspectors[0],
            ObjectInspectorCopyOption.WRITABLE);

        for (Object o : writable) {
          // It's possible that a partition column may have a NULL value, in which case the row
          // belongs to the special partition, __HIVE_DEFAULT_PARTITION__.
          values.add(o == null ? defaultPartitionName : o.toString());
        }
        partitionSpecs = FileUtils.makePartName(conf.getPartColumns(), values);
        if (isLogInfoEnabled) {
          LOG.info("Stats Gathering found a new partition spec = " + partitionSpecs);
        }
      }
      // find which column contains the raw data size (both partitioned and non-partitioned)
      int uSizeColumn = -1;
      StructObjectInspector soi = (StructObjectInspector) inputObjInspectors[0];
      for (int i = 0; i < soi.getAllStructFieldRefs().size(); i++) {
        if (soi.getAllStructFieldRefs().get(i).getFieldName()
            .equals(VirtualColumn.RAWDATASIZE.getName().toLowerCase())) {
          uSizeColumn = i;
          break;
        }
      }
      currentStat = stats.get(partitionSpecs);
      if (currentStat == null) {
        currentStat = new Stat();
        currentStat.setBookkeepingInfo(StatsSetupConst.RAW_DATA_SIZE, uSizeColumn);
        stats.put(partitionSpecs, currentStat);
      }
    }

    // increase the row count
    currentStat.addToStat(StatsSetupConst.ROW_COUNT, 1);

    // extract the raw data size, and update the stats for the current partition
    int rdSizeColumn = currentStat.getBookkeepingInfo(StatsSetupConst.RAW_DATA_SIZE);
    if (rdSizeColumn != -1) {
      List<Object> rdSize = new ArrayList<Object>(1);
      ObjectInspectorUtils.partialCopyToStandardObject(rdSize, row,
          rdSizeColumn, 1, (StructObjectInspector) inputObjInspectors[0],
          ObjectInspectorCopyOption.WRITABLE);
      currentStat.addToStat(StatsSetupConst.RAW_DATA_SIZE, (((LongWritable) rdSize.get(0)).get()));
    }
  }

  /** Kryo ctor. */
  protected TableScanOperator() {
    super();
  }

  public TableScanOperator(CompilationOpContext ctx) {
    super(ctx);
  }
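  /**
   * Reads the row limit from the descriptor and, when stats gathering is enabled, caches the
   * JobConf and initializes the per-partition stats map along with the default partition name
   * used for NULL partition column values.
   */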
  @Override
  protected void initializeOp(Configuration hconf) throws HiveException {
    super.initializeOp(hconf);
    inputFileChanged = false;

    if (conf == null) {
      return;
    }

    rowLimit = conf.getRowLimit();
    if (!conf.isGatherStats()) {
      return;
    }

    if (hconf instanceof JobConf) {
      jc = (JobConf) hconf;
    } else {
      // test code path
      jc = new JobConf(hconf);
    }

    defaultPartitionName = HiveConf.getVar(hconf, HiveConf.ConfVars.DEFAULTPARTITIONNAME);
    currentStat = null;
    stats = new HashMap<String, Stat>();
  }

  @Override
  public void closeOp(boolean abort) throws HiveException {
    if (conf != null) {
      if (conf.isGatherStats() && stats.size() != 0) {
        publishStats();
      }
    }
    super.closeOp(abort);
  }

  /**
   * The operator name for this operator type. This is used to construct the rule for an operator.
   *
   * @return the operator name
   **/
  @Override
  public String getName() {
    return TableScanOperator.getOperatorName();
  }

  public static String getOperatorName() {
    return "TS";
  }

  public void setNeededColumnIDs(List<Integer> orign_columns) {
    conf.setNeededColumnIDs(orign_columns);
  }

  public List<Integer> getNeededColumnIDs() {
    return conf.getNeededColumnIDs();
  }

  public void setNeededColumns(List<String> columnNames) {
    conf.setNeededColumns(columnNames);
  }

  public List<String> getNeededNestedColumnPaths() {
    return conf.getNeededNestedColumnPaths();
  }

  public void setNeededNestedColumnPaths(List<String> nestedColumnPaths) {
    conf.setNeededNestedColumnPaths(nestedColumnPaths);
  }

  public List<String> getNeededColumns() {
    return conf.getNeededColumns();
  }

  public void setReferencedColumns(List<String> referencedColumns) {
    conf.setReferencedColumns(referencedColumns);
  }

  public List<String> getReferencedColumns() {
    return conf.getReferencedColumns();
  }

  @Override
  public OperatorType getType() {
    return OperatorType.TABLESCAN;
  }
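  /**
   * Publishes the per-partition statistics collected by gatherStats() through the configured
   * StatsPublisher when the operator is closed. Publishing failures do not block the main query
   * unless reliable stats are requested (isStatsReliable), in which case a HiveException is
   * thrown.
   */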
  private void publishStats() throws HiveException {
    boolean isStatsReliable = conf.isStatsReliable();

    // Initializing a stats publisher
    StatsPublisher statsPublisher = Utilities.getStatsPublisher(jc);
    StatsCollectionContext sc = new StatsCollectionContext(jc);
    sc.setStatsTmpDir(conf.getTmpStatsDir());
    if (!statsPublisher.connect(sc)) {
      // just return, stats gathering should not block the main query.
      if (isLogInfoEnabled) {
        LOG.info("StatsPublishing error: cannot connect to database.");
      }
      if (isStatsReliable) {
        throw new HiveException(ErrorMsg.STATSPUBLISHER_CONNECTION_ERROR.getErrorCodedMsg());
      }
      return;
    }

    Map<String, String> statsToPublish = new HashMap<String, String>();
    for (String pspecs : stats.keySet()) {
      statsToPublish.clear();
      String prefix = Utilities.join(conf.getStatsAggPrefix(), pspecs);

      String key = prefix.endsWith(Path.SEPARATOR) ? prefix : prefix + Path.SEPARATOR;
      for (String statType : stats.get(pspecs).getStoredStats()) {
        statsToPublish.put(statType, Long.toString(stats.get(pspecs).getStat(statType)));
      }
      if (!statsPublisher.publishStat(key, statsToPublish)) {
        if (isStatsReliable) {
          throw new HiveException(ErrorMsg.STATSPUBLISHER_PUBLISHING_ERROR.getErrorCodedMsg());
        }
      }
      if (isLogInfoEnabled) {
        LOG.info("publishing : " + key + " : " + statsToPublish.toString());
      }
    }
    if (!statsPublisher.closeConnection(sc)) {
      if (isStatsReliable) {
        throw new HiveException(ErrorMsg.STATSPUBLISHER_CLOSING_ERROR.getErrorCodedMsg());
      }
    }
  }

  @Override
  public boolean supportSkewJoinOptimization() {
    return true;
  }

  @Override
  public boolean supportAutomaticSortMergeJoin() {
    return true;
  }

  @Override
  public Operator<? extends OperatorDesc> clone() throws CloneNotSupportedException {
    TableScanOperator ts = (TableScanOperator) super.clone();
    ts.setNeededColumnIDs(new ArrayList<Integer>(getNeededColumnIDs()));
    ts.setNeededColumns(new ArrayList<String>(getNeededColumns()));
    ts.setReferencedColumns(new ArrayList<String>(getReferencedColumns()));
    return ts;
  }

  public boolean isInsideView() {
    return insideView;
  }

  public void setInsideView(boolean insiderView) {
    this.insideView = insiderView;
  }
}