/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.optimizer.dag;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.apache.flink.api.common.ExecutionMode;
import org.apache.flink.api.common.functions.Partitioner;
import org.apache.flink.api.common.io.FileInputFormat;
import org.apache.flink.api.common.io.InputFormat;
import org.apache.flink.api.common.io.NonParallelInput;
import org.apache.flink.api.common.io.ReplicatingInputFormat;
import org.apache.flink.api.common.io.statistics.BaseStatistics;
import org.apache.flink.api.common.operators.GenericDataSourceBase;
import org.apache.flink.api.common.operators.GenericDataSourceBase.SplitDataProperties;
import org.apache.flink.api.common.operators.Operator;
import org.apache.flink.api.common.operators.Ordering;
import org.apache.flink.api.common.operators.SemanticProperties;
import org.apache.flink.api.common.operators.SemanticProperties.EmptySemanticProperties;
import org.apache.flink.api.common.operators.util.FieldList;
import org.apache.flink.optimizer.DataStatistics;
import org.apache.flink.optimizer.Optimizer;
import org.apache.flink.optimizer.costs.CostEstimator;
import org.apache.flink.optimizer.costs.Costs;
import org.apache.flink.optimizer.dataproperties.GlobalProperties;
import org.apache.flink.optimizer.dataproperties.LocalProperties;
import org.apache.flink.optimizer.plan.PlanNode;
import org.apache.flink.optimizer.plan.SourcePlanNode;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.util.Visitor;
/**
 * The optimizer's internal representation of a data source.
 */
public class DataSourceNode extends OptimizerNode {

	/** True if the input format implements {@link NonParallelInput} and must run with parallelism 1. */
	private final boolean sequentialInput;

	/** True if the input format is a {@link ReplicatingInputFormat}, i.e., each parallel task reads the full input. */
	private final boolean replicatedInput;

	/** The global properties (partitioning / replication) of the data produced by this source. */
	private GlobalProperties gprops;

	/** The local properties (grouping / ordering) of the data produced by this source. */
	private LocalProperties lprops;

	/**
	 * Creates a new DataSourceNode for the given contract.
	 *
	 * @param pactContract
	 *        The data source contract object.
	 *
	 * @throws IllegalArgumentException If no input format class has been set on the contract.
	 */
	public DataSourceNode(GenericDataSourceBase<?, ?> pactContract) {
		super(pactContract);

		// the input format class is needed several times below; fetch it once
		final Class<?> inputFormatClass = pactContract.getUserCodeWrapper().getUserCodeClass();

		if (inputFormatClass == null) {
			throw new IllegalArgumentException("Input format has not been set.");
		}

		if (NonParallelInput.class.isAssignableFrom(inputFormatClass)) {
			// NOTE: setParallelism(1) must be called BEFORE sequentialInput is set to true,
			// because the overridden setParallelism() ignores updates once sequentialInput is true
			setParallelism(1);
			this.sequentialInput = true;
		} else {
			this.sequentialInput = false;
		}

		this.replicatedInput = ReplicatingInputFormat.class.isAssignableFrom(inputFormatClass);

		this.gprops = new GlobalProperties();
		this.lprops = new LocalProperties();

		SplitDataProperties<?> splitProps = pactContract.getSplitDataProperties();

		if (this.replicatedInput) {
			// a replicated source has no per-split local properties; lprops stays
			// the default LocalProperties created above
			this.gprops.setFullyReplicated();
		} else if (splitProps != null) {
			// configure data properties of data source using split properties
			setDataPropertiesFromSplitProperties(splitProps);
		}
	}

	/**
	 * Gets the contract object for this data source node.
	 *
	 * @return The contract.
	 */
	@Override
	public GenericDataSourceBase<?, ?> getOperator() {
		return (GenericDataSourceBase<?, ?>) super.getOperator();
	}

	@Override
	public String getOperatorName() {
		return "Data Source";
	}

	@Override
	public void setParallelism(int parallelism) {
		// if unsplittable, parallelism remains at 1
		if (!this.sequentialInput) {
			super.setParallelism(parallelism);
		}
	}

	@Override
	public List<DagConnection> getIncomingConnections() {
		// a data source is a leaf of the DAG and never has incoming connections
		return Collections.<DagConnection>emptyList();
	}

	@Override
	public void setInput(Map<Operator<?>, OptimizerNode> contractToNode, ExecutionMode defaultDataExchangeMode) {
		// sources have no inputs to wire up
	}

	/**
	 * Computes size and cardinality estimates for this source by asking the input
	 * format for its statistics. All failures are logged and swallowed on purpose:
	 * statistics are best-effort and a misbehaving user format must not abort the
	 * optimization.
	 */
	@Override
	protected void computeOperatorSpecificDefaultEstimates(DataStatistics statistics) {
		// see, if we have a statistics object that can tell us a bit about the file
		if (statistics == null) {
			return;
		}

		// instantiate the input format, as this is needed by the statistics
		InputFormat<?, ?> format;
		String inFormatDescription = "<unknown>";

		try {
			format = getOperator().getFormatWrapper().getUserCodeObject();
			Configuration config = getOperator().getParameters();
			format.configure(config);
		}
		catch (Throwable t) {
			if (Optimizer.LOG.isWarnEnabled()) {
				Optimizer.LOG.warn("Could not instantiate InputFormat to obtain statistics."
					+ " Limited statistics will be available.", t);
			}
			return;
		}

		try {
			inFormatDescription = format.toString();
		}
		catch (Throwable t) {
			// we can ignore this error, as it only prevents us to use a cosmetic string
		}

		// first of all, get the statistics from the cache
		final String statisticsKey = getOperator().getStatisticsKey();
		final BaseStatistics cachedStatistics = statistics.getBaseStatistics(statisticsKey);

		BaseStatistics bs = null;
		try {
			bs = format.getStatistics(cachedStatistics);
		}
		catch (Throwable t) {
			if (Optimizer.LOG.isWarnEnabled()) {
				Optimizer.LOG.warn("Error obtaining statistics from input format: " + t.getMessage(), t);
			}
		}

		if (bs != null) {
			final long len = bs.getTotalInputSize();
			if (len == BaseStatistics.SIZE_UNKNOWN) {
				if (Optimizer.LOG.isInfoEnabled()) {
					Optimizer.LOG.info("Compiler could not determine the size of input '" + inFormatDescription + "'. Using default estimates.");
				}
			}
			else if (len >= 0) {
				this.estimatedOutputSize = len;
			}

			final long card = bs.getNumberOfRecords();
			if (card != BaseStatistics.NUM_RECORDS_UNKNOWN) {
				this.estimatedNumRecords = card;
			}
		}
	}

	@Override
	public void computeInterestingPropertiesForInputs(CostEstimator estimator) {
		// no children, so nothing to compute
	}

	@Override
	public void computeUnclosedBranchStack() {
		// because there are no inputs, there are no unclosed branches.
		this.openBranches = Collections.emptyList();
	}

	/**
	 * Builds the single candidate plan for this source (a {@link SourcePlanNode})
	 * with its input costs, caching and returning it.
	 *
	 * @param estimator The cost estimator used to compute the file input cost.
	 * @return A singleton list containing the source plan node.
	 */
	@Override
	public List<PlanNode> getAlternativePlans(CostEstimator estimator) {
		if (this.cachedPlans != null) {
			return this.cachedPlans;
		}

		SourcePlanNode candidate = new SourcePlanNode(this, "DataSource ("+this.getOperator().getName()+")",
				this.gprops, this.lprops);

		if (!this.replicatedInput) {
			candidate.updatePropertiesWithUniqueSets(getUniqueFields());

			final Costs costs = new Costs();
			if (FileInputFormat.class.isAssignableFrom(getOperator().getFormatWrapper().getUserCodeClass()) &&
					this.estimatedOutputSize >= 0) {
				estimator.addFileInputCost(this.estimatedOutputSize, costs);
			}
			candidate.setCosts(costs);
		} else {
			// replicated input: every parallel instance reads the complete input,
			// so the file input cost scales with the parallelism
			final Costs costs = new Costs();
			InputFormat<?, ?> inputFormat =
					((ReplicatingInputFormat<?, ?>) getOperator().getFormatWrapper().getUserCodeObject()).getReplicatedInputFormat();
			if (FileInputFormat.class.isAssignableFrom(inputFormat.getClass()) &&
					this.estimatedOutputSize >= 0) {
				estimator.addFileInputCost(this.estimatedOutputSize * this.getParallelism(), costs);
			}
			candidate.setCosts(costs);
		}

		// since there is only a single plan for the data-source, return a list with that element only
		List<PlanNode> plans = new ArrayList<PlanNode>(1);
		plans.add(candidate);

		this.cachedPlans = plans;
		return plans;
	}

	@Override
	public SemanticProperties getSemanticProperties() {
		// a source reads from nowhere, so it has no field-forwarding information
		return new EmptySemanticProperties();
	}

	@Override
	public void accept(Visitor<OptimizerNode> visitor) {
		if (visitor.preVisit(this)) {
			visitor.postVisit(this);
		}
	}

	/**
	 * Derives the global and local properties of this source from the
	 * {@link SplitDataProperties} declared on the operator.
	 *
	 * @param splitProps The split data properties attached to the input format.
	 */
	private void setDataPropertiesFromSplitProperties(SplitDataProperties<?> splitProps) {

		// set global properties
		int[] partitionKeys = splitProps.getSplitPartitionKeys();
		Partitioner<?> partitioner = splitProps.getSplitPartitioner();

		if (partitionKeys != null && partitioner != null) {
			this.gprops.setCustomPartitioned(new FieldList(partitionKeys), partitioner);
		}
		else if (partitionKeys != null) {
			this.gprops.setAnyPartitioning(new FieldList(partitionKeys));
		}

		// set local properties
		int[] groupingKeys = splitProps.getSplitGroupKeys();
		Ordering ordering = splitProps.getSplitOrder();

		// more than one split per source task is possible, so per-split
		// grouping and sorting guarantees must be adapted
		if (ordering != null) {
			// sorting falls back to grouping because a source can read multiple,
			// randomly assigned splits
			groupingKeys = ordering.getFieldPositions();
		}

		if (groupingKeys != null && partitionKeys != null) {
			// a per-split grouping is also valid across splits only if the grouping
			// keys are a superset of the partition keys
			if (containsAllKeys(groupingKeys, partitionKeys)) {
				this.lprops = LocalProperties.forGrouping(new FieldList(groupingKeys));
			} else {
				this.lprops = new LocalProperties();
			}
		} else {
			this.lprops = new LocalProperties();
		}
	}

	/**
	 * Checks whether every key in {@code required} also occurs in {@code keys}.
	 */
	private static boolean containsAllKeys(int[] keys, int[] required) {
		for (int r : required) {
			boolean found = false;
			for (int k : keys) {
				if (r == k) {
					found = true;
					break;
				}
			}
			if (!found) {
				return false;
			}
		}
		return true;
	}
}