/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.objectinspector.InspectableObject;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.ReflectionUtils;
/**
 * Iterates over the rows stored under the table directory or the partition directories named by
 * a {@link FetchWork}.
 *
 * <p>
 * For every path that contains at least one non-empty file, the operator asks the descriptor's
 * {@link InputFormat} for splits, opens one {@link RecordReader} per split and deserializes each
 * value with the descriptor's {@link Deserializer}. When the current path belongs to a partition,
 * each returned row is a two element array: the deserialized row followed by the list of
 * partition values.
 *
 * <p>
 * NOTE(review): instances hold unsynchronized mutable state and the InputFormat cache is a plain
 * static HashMap; this class appears to be intended for single-threaded use - confirm before
 * sharing instances across threads.
 **/
public class FetchOperator implements Serializable {

  static Log LOG = LogFactory.getLog(FetchOperator.class.getName());
  static LogHelper console = new LogHelper(LOG);

  private boolean isEmptyTable;
  private boolean isNativeTable;
  private FetchWork work;
  private int splitNum;
  private PartitionDesc currPart;
  private TableDesc currTbl;
  private boolean tblDataDone;

  private transient RecordReader<WritableComparable, Writable> currRecReader;
  private transient InputSplit[] inputSplits;
  private transient InputFormat inputFormat;
  private transient JobConf job;
  private transient WritableComparable key;
  private transient Writable value;
  private transient Deserializer serde;
  private transient Iterator<Path> iterPath;
  private transient Iterator<PartitionDesc> iterPartDesc;
  private transient Path currPath;
  private transient StructObjectInspector rowObjectInspector;
  // rowWithPart[0] is the deserialized row, rowWithPart[1] the list of partition values.
  private transient Object[] rowWithPart;

  /**
   * Creates an operator without work or job configuration; {@link #setWork(FetchWork)} and
   * {@link #initialize(JobConf)} have to be called before rows can be fetched.
   */
  public FetchOperator() {
  }

  /**
   * Creates an operator for the given work and initializes it with the given job configuration.
   *
   * @param work
   *          description of the table or partitions to read
   * @param job
   *          job configuration used for file system access, input formats and deserializers
   */
  public FetchOperator(FetchWork work, JobConf job) {
    this.work = work;
    initialize(job);
  }

  /**
   * Stores the job configuration and resets the per-fetch bookkeeping. The work must already be
   * set because its table descriptor is consulted to decide whether the table is native.
   *
   * @param job
   *          job configuration to use from now on
   */
  public void initialize(JobConf job) {
    this.job = job;
    tblDataDone = false;
    rowWithPart = new Object[2];
    // Without a table descriptor (partitioned fetch) the table is treated as native.
    TableDesc tblDesc = work.getTblDesc();
    isNativeTable = (tblDesc == null) || !tblDesc.isNonNative();
  }

  public FetchWork getWork() {
    return work;
  }

  public void setWork(FetchWork work) {
    this.work = work;
  }

  public int getSplitNum() {
    return splitNum;
  }

  public void setSplitNum(int splitNum) {
    this.splitNum = splitNum;
  }

  public PartitionDesc getCurrPart() {
    return currPart;
  }

  public void setCurrPart(PartitionDesc currPart) {
    this.currPart = currPart;
  }

  public TableDesc getCurrTbl() {
    return currTbl;
  }

  public void setCurrTbl(TableDesc currTbl) {
    this.currTbl = currTbl;
  }

  public boolean isTblDataDone() {
    return tblDataDone;
  }

  public void setTblDataDone(boolean tblDataDone) {
    this.tblDataDone = tblDataDone;
  }

  public boolean isEmptyTable() {
    return isEmptyTable;
  }

  public void setEmptyTable(boolean isEmptyTable) {
    this.isEmptyTable = isEmptyTable;
  }

  /**
   * A cache of InputFormat instances.
   */
  private static Map<Class, InputFormat<WritableComparable, Writable>> inputFormats = new HashMap<Class, InputFormat<WritableComparable, Writable>>();

  /**
   * Returns the cached InputFormat instance for the given class, creating and caching it with
   * the supplied configuration on first use.
   *
   * <p>
   * NOTE(review): the cache is an unsynchronized static map - confirm that callers never invoke
   * this concurrently.
   *
   * @param inputFormatClass
   *          class of the input format to look up
   * @param conf
   *          configuration handed to the instance when it has to be created
   * @return the shared InputFormat instance for that class
   * @throws IOException
   *           if the class cannot be instantiated
   */
  static InputFormat<WritableComparable, Writable> getInputFormatFromCache(Class inputFormatClass,
      Configuration conf) throws IOException {
    InputFormat<WritableComparable, Writable> cached = inputFormats.get(inputFormatClass);
    if (cached == null) {
      try {
        cached = (InputFormat<WritableComparable, Writable>) ReflectionUtils.newInstance(
            inputFormatClass, conf);
        inputFormats.put(inputFormatClass, cached);
      } catch (Exception e) {
        throw new IOException("Cannot create an instance of InputFormat class "
            + inputFormatClass.getName() + " as specified in mapredWork!", e);
      }
    }
    return cached;
  }

  /**
   * Builds the inspector for rows of a partition: the deserializer's own struct inspector
   * followed by a struct of string-typed partition columns. This method touches no operator
   * state; the partition values are appended to the supplied list.
   *
   * @param part
   *          partition whose column names and values are used
   * @param deserializer
   *          initialized deserializer providing the inspector for the stored columns
   * @param partValues
   *          output list receiving one value per partition column, in column order
   * @return union inspector over the stored columns and the partition columns
   */
  private static StructObjectInspector createRowInspector(PartitionDesc part,
      Deserializer deserializer, List<String> partValues) throws Exception {
    String pcols = part.getTableDesc().getProperties().getProperty(
        org.apache.hadoop.hive.metastore.api.Constants.META_TABLE_PARTITION_COLUMNS);
    LinkedHashMap<String, String> partSpec = part.getPartSpec();
    // The partition column names are stored as a single '/'-separated property value.
    String[] partKeys = pcols.trim().split("/");

    List<String> partNames = new ArrayList<String>(partKeys.length);
    List<ObjectInspector> partObjectInspectors = new ArrayList<ObjectInspector>(partKeys.length);
    for (String partKey : partKeys) {
      partNames.add(partKey);
      partValues.add(partSpec.get(partKey));
      partObjectInspectors.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
    }

    StructObjectInspector partObjectInspector = ObjectInspectorFactory
        .getStandardStructObjectInspector(partNames, partObjectInspectors);
    StructObjectInspector dataObjectInspector = (StructObjectInspector) deserializer
        .getObjectInspector();
    return ObjectInspectorFactory.getUnionStructObjectInspector(Arrays
        .asList(new StructObjectInspector[] {dataObjectInspector, partObjectInspector}));
  }

  /**
   * Prepares {@code rowObjectInspector} and the partition value slot of {@code rowWithPart} for
   * the current partition, using the current deserializer.
   */
  private void setPrtnDesc() throws Exception {
    List<String> partValues = new ArrayList<String>();
    rowObjectInspector = createRowInspector(currPart, serde, partValues);
    rowWithPart[1] = partValues;
  }

  /**
   * Tells whether the given path exists and has at least one file of non-zero length under it
   * (as listed by {@link #listStatusUnderPath(FileSystem, Path)}).
   */
  private boolean containsData(Path path) throws IOException {
    FileSystem fs = path.getFileSystem(job);
    if (!fs.exists(path)) {
      return false;
    }
    for (FileStatus fStat : listStatusUnderPath(fs, path)) {
      if (fStat.getLen() > 0) {
        return true;
      }
    }
    return false;
  }

  /**
   * Advances {@code currPath} to the next location that holds data, or leaves it {@code null}
   * when nothing is left.
   *
   * <p>
   * For a table fetch the table directory is offered exactly once: {@code tblDataDone} is set as
   * soon as the directory is handed out (non-native tables are handed out without inspecting the
   * file system), and the following call clears {@code currTbl} and {@code currPath}. For a
   * partition fetch the path and partition iterators are advanced in lock step, skipping paths
   * that are missing or contain only empty files.
   */
  private void getNextPath() throws Exception {
    // first time
    if (iterPath == null) {
      if (work.getTblDir() != null) {
        if (tblDataDone) {
          // The table directory has already been handed out; nothing more to read.
          currTbl = null;
          currPath = null;
          return;
        }
        currPath = work.getTblDirPath();
        currTbl = work.getTblDesc();
        tblDataDone = !isNativeTable || containsData(currPath);
        if (!tblDataDone) {
          currPath = null;
        }
        return;
      }
      iterPath = FetchWork.convertStringToPathArray(work.getPartDir()).iterator();
      iterPartDesc = work.getPartDesc().iterator();
    }

    while (iterPath.hasNext()) {
      Path candidate = iterPath.next();
      // Keep the partition iterator aligned with the path iterator even for skipped paths.
      PartitionDesc candidatePart = null;
      if (iterPartDesc != null) {
        candidatePart = iterPartDesc.next();
      }
      if (containsData(candidate)) {
        currPath = candidate;
        if (iterPartDesc != null) {
          currPart = candidatePart;
        }
        return;
      }
    }
  }

  /**
   * Computes the splits for {@code currPath} and creates the deserializer (and, for partitions,
   * the partition-aware row inspector) that goes with it.
   */
  private void prepareCurrentPath() throws Exception {
    // not using FileInputFormat.setInputPaths() here because it forces a
    // connection to the default file system - which may or may not be online
    // during pure metadata operations
    job.set("mapred.input.dir", org.apache.hadoop.util.StringUtils.escapeString(currPath
        .toString()));

    PartitionDesc desc;
    if (currTbl == null) {
      desc = currPart;
    } else {
      desc = new PartitionDesc(currTbl, null);
    }

    inputFormat = getInputFormatFromCache(desc.getInputFileFormatClass(), job);
    Utilities.copyTableJobPropertiesToConf(desc.getTableDesc(), job);
    inputSplits = inputFormat.getSplits(job, 1);
    splitNum = 0;

    serde = desc.getDeserializerClass().newInstance();
    serde.initialize(job, desc.getProperties());
    if (LOG.isDebugEnabled()) {
      LOG.debug("Creating fetchTask with deserializer typeinfo: "
          + serde.getObjectInspector().getTypeName());
      LOG.debug("deserializer properties: " + desc.getProperties());
    }

    if (currPart != null) {
      setPrtnDesc();
    }
  }

  /**
   * Opens a reader on the next unread split, moving on to the next path with data whenever the
   * splits of the current path are exhausted.
   *
   * @return the new current record reader, or {@code null} when no path with data is left
   */
  private RecordReader<WritableComparable, Writable> getRecordReader() throws Exception {
    while (true) {
      if (currPath == null) {
        getNextPath();
        if (currPath == null) {
          return null;
        }
        prepareCurrentPath();
      }
      if (splitNum < inputSplits.length) {
        break;
      }
      // Every split of this path has been consumed: release the reader and try the next path.
      if (currRecReader != null) {
        currRecReader.close();
        currRecReader = null;
      }
      currPath = null;
    }

    currRecReader = inputFormat.getRecordReader(inputSplits[splitNum++], job, Reporter.NULL);
    key = currRecReader.createKey();
    value = currRecReader.createValue();
    return currRecReader;
  }

  /**
   * Get the next row. The fetch context is modified appropriately.
   *
   * @return the next row together with its inspector, or {@code null} when there are no more
   *         rows. For partitioned data the row object is the shared two element array holding
   *         the deserialized row and the partition values.
   * @throws IOException
   *           if reading fails; failures that are not IOExceptions are wrapped in one
   **/
  public InspectableObject getNextRow() throws IOException {
    try {
      while (true) {
        if (currRecReader == null) {
          currRecReader = getRecordReader();
          if (currRecReader == null) {
            return null;
          }
        }

        if (currRecReader.next(key, value)) {
          if (this.currPart == null) {
            Object row = serde.deserialize(value);
            return new InspectableObject(row, serde.getObjectInspector());
          }
          rowWithPart[0] = serde.deserialize(value);
          return new InspectableObject(rowWithPart, rowObjectInspector);
        }

        // Current split is exhausted; the next iteration opens the following one.
        currRecReader.close();
        currRecReader = null;
      }
    } catch (IOException e) {
      // Already the declared type: do not bury it inside another IOException.
      throw e;
    } catch (Exception e) {
      throw new IOException(e);
    }
  }

  /**
   * Clear the context, if anything needs to be done. The reader reference, current path and
   * iterators are reset even when closing the reader fails.
   *
   * @throws HiveException
   *           if closing the current record reader fails
   **/
  public void clearFetchContext() throws HiveException {
    try {
      if (currRecReader != null) {
        currRecReader.close();
      }
    } catch (Exception e) {
      throw new HiveException("Failed with exception " + e.getMessage()
          + org.apache.hadoop.util.StringUtils.stringifyException(e), e);
    } finally {
      this.currRecReader = null;
      this.currPath = null;
      this.iterPath = null;
      this.iterPartDesc = null;
    }
  }

  /**
   * used for bucket map join. there is a hack for getting partitionDesc. bucket map join right now
   * only allow one partition present in bucket map join.
   *
   * @param iterPath
   *          paths to read
   * @param iterPartDesc
   *          partition descriptors matching the paths one to one, or {@code null}; when
   *          {@code null} the work's table descriptor (table fetch) or its first partition
   *          descriptor (partition fetch) is used for every path
   */
  public void setupContext(Iterator<Path> iterPath, Iterator<PartitionDesc> iterPartDesc) {
    this.iterPath = iterPath;
    this.iterPartDesc = iterPartDesc;
    if (iterPartDesc == null) {
      if (work.getTblDir() != null) {
        this.currTbl = work.getTblDesc();
      } else {
        // hack, get the first.
        List<PartitionDesc> listParts = work.getPartDesc();
        currPart = listParts.get(0);
      }
    }
  }

  /**
   * Returns the inspector describing the rows this operator produces. For a table fetch this is
   * the inspector of the table's deserializer; for a partition fetch it is derived from the
   * first partition and includes the partition columns. The current fetch state (current
   * partition, deserializer, row inspector) is left untouched.
   *
   * @return the row inspector, or {@code null} when the work names neither a table directory nor
   *         any partition
   * @throws HiveException
   *           if the deserializer cannot be created or initialized
   */
  public ObjectInspector getOutputObjectInspector() throws HiveException {
    try {
      if (work.getTblDir() != null) {
        TableDesc tbl = work.getTblDesc();
        Deserializer tblSerde = tbl.getDeserializerClass().newInstance();
        tblSerde.initialize(job, tbl.getProperties());
        return tblSerde.getObjectInspector();
      }

      List<PartitionDesc> listParts = work.getPartDesc();
      if (listParts == null || listParts.size() == 0) {
        return null;
      }

      // Work on locals only so that an in-progress fetch is not disturbed.
      PartitionDesc firstPart = listParts.get(0);
      TableDesc partTbl = firstPart.getTableDesc();
      Deserializer partSerde = partTbl.getDeserializerClass().newInstance();
      partSerde.initialize(job, partTbl.getProperties());
      return createRowInspector(firstPart, partSerde, new ArrayList<String>());
    } catch (Exception e) {
      throw new HiveException("Failed with exception " + e.getMessage()
          + org.apache.hadoop.util.StringUtils.stringifyException(e), e);
    }
  }

  /**
   * Lists status for all files under a given path. Whether or not this is recursive depends on the
   * setting of job configuration parameter mapred.input.dir.recursive.
   *
   * @param fs
   *          file system
   *
   * @param p
   *          path in file system
   *
   * @return list of file status entries
   */
  private FileStatus[] listStatusUnderPath(FileSystem fs, Path p) throws IOException {
    HiveConf hiveConf = new HiveConf(job, FetchOperator.class);
    boolean recursive = hiveConf.getBoolVar(HiveConf.ConfVars.HADOOPMAPREDINPUTDIRRECURSIVE);
    if (!recursive) {
      return fs.listStatus(p);
    }
    List<FileStatus> results = new ArrayList<FileStatus>();
    for (FileStatus stat : fs.listStatus(p)) {
      FileUtils.listStatusRecursively(fs, stat, results);
    }
    return results.toArray(new FileStatus[results.size()]);
  }
}