/*
* Copyright © 2014 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.hive.stream;

import co.cask.cdap.common.conf.Constants;
import co.cask.cdap.data.stream.StreamInputSplitFactory;
import co.cask.cdap.data.stream.StreamInputSplitFinder;
import co.cask.cdap.data.stream.StreamUtils;
import co.cask.cdap.data2.transaction.stream.StreamConfig;
import co.cask.cdap.hive.context.ContextManager;
import co.cask.cdap.proto.Id;
import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.index.IndexPredicateAnalyzer;
import org.apache.hadoop.hive.ql.index.IndexSearchCondition;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.twill.filesystem.Location;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.List;
import javax.annotation.Nullable;

/**
 * Stream input format for use in Hive queries only; it will not work outside of Hive.
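 * <p>
 * Hive reaches this class through CDAP's stream storage handler, which records the stream name and
 * namespace in the table's job properties. As an illustrative sketch (the table name below is
 * hypothetical; {@code ts} is the stream timestamp column this format indexes on), a query against
 * a stream table might look like:
 * <pre>
 *   SELECT * FROM stream_purchases WHERE ts &gt;= 1000 AND ts &lt; 5000;
 * </pre>
 * Predicates on {@code ts} are pushed down and used to limit which stream files are scanned; see
 * {@link #setupBuilder}.
 * </p>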
*/
public class HiveStreamInputFormat implements InputFormat<Void, ObjectWritable> {
  private static final Logger LOG = LoggerFactory.getLogger(HiveStreamInputFormat.class);

  @Override
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
    // Right before this method is called, Hive copies everything that the storage handler put into the
    // properties map in its configureTableJobProperties method into the job conf. We put the stream name
    // in there so that we can derive the stream path and properties from it.
    // This MUST be done in the input format and not in StreamSerDe's initialize method, because we have
    // no control over when initialize is called. If we set job conf settings there, the settings for one
    // stream would get clobbered by the settings for another stream when a join over streams is performed.
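    // As a concrete illustration: a join over two stream tables consults this input format once per table,
    // and each invocation reads its own Constants.Explore.STREAM_NAME / STREAM_NAMESPACE entries from the
    // conf, so neither table's settings can clobber the other's.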
StreamInputSplitFinder<InputSplit> splitFinder = getSplitFinder(conf);
List<InputSplit> splits = splitFinder.getSplits(conf);
return splits.toArray(new InputSplit[splits.size()]);
  }

  @Override
public RecordReader<Void, ObjectWritable> getRecordReader(InputSplit split, JobConf conf, Reporter reporter)
throws IOException {
return new StreamRecordReader(split, conf);
  }

  private StreamInputSplitFinder<InputSplit> getSplitFinder(JobConf conf) throws IOException {
    // First, get the context we are in.
ContextManager.Context context = ContextManager.getContext(conf);
String streamName = conf.get(Constants.Explore.STREAM_NAME);
String streamNamespace = conf.get(Constants.Explore.STREAM_NAMESPACE);
Id.Stream streamId = Id.Stream.from(streamNamespace, streamName);
StreamConfig streamConfig = context.getStreamConfig(streamId);
    // Make sure we use the current generation, so that we don't read events written before a truncate.
Location streamPath = StreamUtils.createGenerationLocation(streamConfig.getLocation(),
StreamUtils.getGeneration(streamConfig));
StreamInputSplitFinder.Builder builder = StreamInputSplitFinder.builder(streamPath.toURI());
    // Get the Hive table path for the InputSplits we create. It exists only to satisfy Hive; this
    // InputFormat never uses it.
JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(Job.getInstance(conf));
final Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);
return setupBuilder(conf, streamConfig, builder).build(new StreamInputSplitFactory<InputSplit>() {
@Override
public InputSplit createSplit(Path eventPath, Path indexPath, long startTime, long endTime,
long start, long length, @Nullable String[] locations) {
return new StreamInputSplit(tablePaths[0], eventPath, indexPath, startTime, endTime, start, length, locations);
}
});
  }

  /**
   * Sets up the given {@link StreamInputSplitFinder.Builder} by analyzing the query's filter predicate.
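   * <p>
   * For example, a predicate of {@code ts >= 1000 AND ts < 5000} narrows the scan to the half-open
   * time range {@code [1000, 5000)}, {@code ts = 1000} narrows it to {@code [1000, 1001)}, and a
   * query with no usable predicate on {@code ts} falls back to the stream's full TTL window.
   * </p>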
*/
private StreamInputSplitFinder.Builder setupBuilder(Configuration conf, StreamConfig streamConfig,
StreamInputSplitFinder.Builder builder) {
    // The conf may contain a 'hive.io.filter.expr.serialized' key (TableScanDesc.FILTER_EXPR_CONF_STR),
    // which holds the serialized form of the filter ExprNodeDesc that Hive pushed down.
long startTime = Math.max(0L, System.currentTimeMillis() - streamConfig.getTTL());
long endTime = System.currentTimeMillis();
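    // By default, scan the stream's entire TTL window ending now; the predicate analysis below may
    // narrow this range.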
String serializedExpr = conf.get(TableScanDesc.FILTER_EXPR_CONF_STR);
if (serializedExpr == null) {
return builder.setStartTime(startTime).setEndTime(endTime);
}
try {
ExprNodeGenericFuncDesc expr;
// Hack to deal with the fact that older versions of Hive use
// Utilities.deserializeExpression(String, Configuration),
// whereas newer versions use Utilities.deserializeExpression(String).
try {
expr = Utilities.deserializeExpression(serializedExpr);
} catch (NoSuchMethodError e) {
expr = (ExprNodeGenericFuncDesc) Utilities.class.getMethod(
"deserializeExpression", String.class, Configuration.class).invoke(null, serializedExpr, conf);
}
// Analyze the query to extract predicates that can be used for indexing (i.e. setting start/end time)
IndexPredicateAnalyzer analyzer = new IndexPredicateAnalyzer();
for (CompareOp op : CompareOp.values()) {
analyzer.addComparisonOp(op.getOpClassName());
}
      // Streams can only be indexed by timestamp, exposed to Hive as the 'ts' column.
      analyzer.clearAllowedColumnNames();
      analyzer.allowColumnName("ts");
List<IndexSearchCondition> conditions = Lists.newArrayList();
analyzer.analyzePredicate(expr, conditions);
for (IndexSearchCondition condition : conditions) {
CompareOp op = CompareOp.from(condition.getComparisonOp());
if (op == null) {
// Not a supported operation
continue;
}
ExprNodeConstantDesc value = condition.getConstantDesc();
if (value == null || !(value.getValue() instanceof Long)) {
// Not a supported value
continue;
}
long timestamp = (Long) value.getValue();
        // If there is an equality predicate, set both start and end time; no need to inspect further.
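        // For example, ts = 100 yields the half-open range [100, 101); the Long.MAX_VALUE check below
        // only guards against overflow when computing the exclusive end time.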
if (op == CompareOp.EQUAL) {
startTime = timestamp;
endTime = (timestamp < Long.MAX_VALUE) ? timestamp + 1L : timestamp;
break;
}
if (op == CompareOp.GREATER || op == CompareOp.EQUAL_OR_GREATER) {
          // Add 1 to the start time for a strict greater-than, since the stream's start time is inclusive.
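          // For example, ts > 100 gives startTime = 101, while ts >= 100 gives startTime = 100.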
startTime = Math.max(startTime,
timestamp + (timestamp < Long.MAX_VALUE && op == CompareOp.GREATER ? 1L : 0L));
} else {
          // Add 1 to the end time for equal-or-less, since the stream's end time is exclusive.
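          // For example, ts <= 100 gives endTime = 101, while ts < 100 gives endTime = 100.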
endTime = Math.min(endTime,
timestamp + (timestamp < Long.MAX_VALUE && op == CompareOp.EQUAL_OR_LESS ? 1L : 0L));
}
}
} catch (Throwable t) {
LOG.warn("Exception analyzing query predicate. A full table scan will be performed.", t);
}
return builder.setStartTime(startTime).setEndTime(endTime);
}
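
  /**
   * Comparison operations on the timestamp column that this input format can translate into a
   * start/end time range. Each value holds the class name of the corresponding Hive generic UDF.
   */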
private enum CompareOp {
EQUAL(GenericUDFOPEqual.class.getName()),
EQUAL_OR_GREATER(GenericUDFOPEqualOrGreaterThan.class.getName()),
EQUAL_OR_LESS(GenericUDFOPEqualOrLessThan.class.getName()),
GREATER(GenericUDFOPGreaterThan.class.getName()),
    LESS(GenericUDFOPLessThan.class.getName());

    private final String opClassName;

    CompareOp(String opClassName) {
      this.opClassName = opClassName;
    }

    public String getOpClassName() {
      return opClassName;
    }

/**
     * Returns the {@link CompareOp} matching the given class name, or {@code null} if none matches.
*/
@Nullable
public static CompareOp from(String opClassName) {
for (CompareOp op : values()) {
if (op.getOpClassName().equals(opClassName)) {
return op;
}
}
return null;
}
}
}