/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.api.common.operators;
import java.util.ArrayList;
import java.util.List;
import org.apache.flink.annotation.Internal;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.functions.Partitioner;
import org.apache.flink.api.common.functions.RuntimeContext;
import org.apache.flink.api.common.io.InputFormat;
import org.apache.flink.api.common.io.RichInputFormat;
import org.apache.flink.api.common.operators.util.UserCodeClassWrapper;
import org.apache.flink.api.common.operators.util.UserCodeObjectWrapper;
import org.apache.flink.api.common.operators.util.UserCodeWrapper;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.core.io.InputSplit;
import org.apache.flink.util.Visitor;
/**
 * Base class for data sources in a Pact plan. A data source has no inputs; it produces its
 * records by reading them through an {@link org.apache.flink.api.common.io.InputFormat}.
 *
 * @param <OUT> The output type of the data source
 * @param <T> The type of input format invoked by instances of this data source.
 */
@Internal
public class GenericDataSourceBase<OUT, T extends InputFormat<OUT, ?>> extends Operator<OUT> {

    private static final String DEFAULT_NAME = "<Unnamed Generic Data Source>";

    /** Wrapper around the input format, given either as an instance or as a class. */
    protected final UserCodeWrapper<? extends T> formatWrapper;

    /** Key under which statistics for this source are looked up; {@code null} until set. */
    protected String statisticsKey;

    /** Optional properties of the input splits; {@code null} until set. */
    private SplitDataProperties<OUT> splitProperties;

    /**
     * Creates a new instance using the given input format object.
     *
     * @param format The {@link org.apache.flink.api.common.io.InputFormat} implementation used to read the data.
     * @param operatorInfo The type information for the operator.
     * @param name The given name for the Pact, used in plans, logs and progress messages.
     * @throws IllegalArgumentException Thrown, if the format is {@code null}.
     */
    public GenericDataSourceBase(T format, OperatorInformation<OUT> operatorInfo, String name) {
        super(operatorInfo, name);

        if (format == null) {
            throw new IllegalArgumentException("Input format may not be null.");
        }

        this.formatWrapper = new UserCodeObjectWrapper<T>(format);
    }

    /**
     * Creates a new instance using the given input format object and the default name.
     *
     * @param format The {@link org.apache.flink.api.common.io.InputFormat} implementation used to read the data.
     * @param operatorInfo The type information for the operator.
     * @throws IllegalArgumentException Thrown, if the format is {@code null}.
     */
    public GenericDataSourceBase(T format, OperatorInformation<OUT> operatorInfo) {
        this(format, operatorInfo, DEFAULT_NAME);
    }

    /**
     * Creates a new instance using the given input format class.
     *
     * @param format The class of the {@link org.apache.flink.api.common.io.InputFormat} implementation used to read the data.
     * @param operatorInfo The type information for the operator.
     * @param name The given name for the Pact, used in plans, logs and progress messages.
     * @throws IllegalArgumentException Thrown, if the format class is {@code null}.
     */
    public GenericDataSourceBase(Class<? extends T> format, OperatorInformation<OUT> operatorInfo, String name) {
        super(operatorInfo, name);

        if (format == null) {
            throw new IllegalArgumentException("Input format may not be null.");
        }

        this.formatWrapper = new UserCodeClassWrapper<T>(format);
    }

    /**
     * Creates a new instance using the given input format class and the default name.
     *
     * @param format The class of the {@link org.apache.flink.api.common.io.InputFormat} implementation used to read the data.
     * @param operatorInfo The type information for the operator.
     * @throws IllegalArgumentException Thrown, if the format class is {@code null}.
     */
    public GenericDataSourceBase(Class<? extends T> format, OperatorInformation<OUT> operatorInfo) {
        this(format, operatorInfo, DEFAULT_NAME);
    }

    // --------------------------------------------------------------------------------------------

    /**
     * Gets the wrapper around the input format.
     *
     * @return The wrapper holding the input format object or class.
     */
    public UserCodeWrapper<? extends T> getFormatWrapper() {
        return this.formatWrapper;
    }

    /**
     * Gets the wrapper around the input format.
     * <p>
     * This method returns the same object as {@link #getFormatWrapper()}.
     *
     * @return The wrapper holding the input format object or class.
     *
     * @see org.apache.flink.api.common.operators.Operator#getUserCodeWrapper()
     */
    @Override
    public UserCodeWrapper<? extends T> getUserCodeWrapper() {
        return this.formatWrapper;
    }

    // --------------------------------------------------------------------------------------------

    /**
     * Gets the key under which statistics about this data source may be obtained from the
     * statistics cache.
     *
     * @return The statistics cache key, or {@code null} if none has been set.
     */
    public String getStatisticsKey() {
        return this.statisticsKey;
    }

    /**
     * Sets the key under which statistics about this data source may be obtained from the
     * statistics cache. Useful for testing purposes, when providing mock statistics.
     *
     * @param statisticsKey The key for the statistics object.
     */
    public void setStatisticsKey(String statisticsKey) {
        this.statisticsKey = statisticsKey;
    }

    /**
     * Sets properties of input splits for this data source.
     * Split properties can help to generate more efficient execution plans.
     * <br>
     * <b>
     * IMPORTANT: Providing wrong split data properties can cause wrong results!
     * </b>
     *
     * @param splitDataProperties The data properties of this data source's splits.
     */
    public void setSplitDataProperties(SplitDataProperties<OUT> splitDataProperties) {
        this.splitProperties = splitDataProperties;
    }

    /**
     * Returns the data properties of this data source's splits.
     *
     * @return The data properties of this data source's splits or null if no properties have been set.
     */
    public SplitDataProperties<OUT> getSplitDataProperties() {
        return this.splitProperties;
    }

    // --------------------------------------------------------------------------------------------

    /**
     * Accepts the visitor and applies it to this instance. Since data sources have no inputs, no
     * recursive descent happens. The visitor's pre-visit method is called and, if it returns
     * <tt>true</tt>, the post-visit method is called.
     *
     * @param visitor The visitor.
     *
     * @see org.apache.flink.util.Visitable#accept(org.apache.flink.util.Visitor)
     */
    @Override
    public void accept(Visitor<Operator<?>> visitor) {
        if (visitor.preVisit(this)) {
            visitor.postVisit(this);
        }
    }

    // --------------------------------------------------------------------------------------------

    /**
     * Reads all records of this data source into a list by driving the input format directly:
     * the format is configured with this operator's parameters, asked for its input splits, and
     * every split is read to its end. Each non-null record is copied with the output type's
     * serializer before it is added to the result.
     * <p>
     * The format is closed after each split, and a {@link RichInputFormat} is additionally closed
     * via {@code closeInputFormat()} at the end, even when reading fails with an exception.
     *
     * @param ctx The runtime context handed to the format, if it is a {@link RichInputFormat}.
     * @param executionConfig The execution config used to create the output type's serializer.
     * @return The list of all records produced by the input format.
     * @throws Exception Thrown, if the input format or the serializer fails.
     */
    protected List<OUT> executeOnCollections(RuntimeContext ctx, ExecutionConfig executionConfig) throws Exception {
        @SuppressWarnings("unchecked")
        InputFormat<OUT, InputSplit> inputFormat = (InputFormat<OUT, InputSplit>) this.formatWrapper.getUserCodeObject();

        // configure the input format
        inputFormat.configure(this.parameters);

        // rich formats have an additional open/close life cycle around all splits
        final RichInputFormat<OUT, InputSplit> richFormat = inputFormat instanceof RichInputFormat
                ? (RichInputFormat<OUT, InputSplit>) inputFormat
                : null;

        if (richFormat != null) {
            richFormat.setRuntimeContext(ctx);
            richFormat.openInputFormat();
        }

        try {
            List<OUT> result = new ArrayList<OUT>();

            // splits
            InputSplit[] splits = inputFormat.createInputSplits(1);
            TypeSerializer<OUT> serializer = getOperatorInfo().getOutputType().createSerializer(executionConfig);

            for (InputSplit split : splits) {
                inputFormat.open(split);
                try {
                    while (!inputFormat.reachedEnd()) {
                        OUT next = inputFormat.nextRecord(serializer.createInstance());
                        if (next != null) {
                            result.add(serializer.copy(next));
                        }
                    }
                } finally {
                    // release the split's resources even if reading it failed
                    inputFormat.close();
                }
            }

            return result;
        } finally {
            // close the input format, also on failure, to pair with openInputFormat()
            if (richFormat != null) {
                richFormat.closeInputFormat();
            }
        }
    }

    // --------------------------------------------------------------------------------------------

    /**
     * Returns the name of this data source.
     *
     * @return The operator name.
     */
    @Override
    public String toString() {
        return this.name;
    }

    /**
     * Describes properties of the data in the input splits of a data source, as set through
     * {@link GenericDataSourceBase#setSplitDataProperties(SplitDataProperties)}.
     *
     * @param <T> The type of the records produced by the data source.
     */
    public static interface SplitDataProperties<T> {

        /**
         * Gets the partition keys of the splits.
         * NOTE(review): presumably field positions on which the splits are partitioned - confirm
         * against implementations.
         *
         * @return The split partition keys.
         */
        public int[] getSplitPartitionKeys();

        /**
         * Gets the partitioner associated with the splits.
         *
         * @return The split partitioner.
         */
        public Partitioner<T> getSplitPartitioner();

        /**
         * Gets the group keys of the splits.
         * NOTE(review): presumably field positions on which records within a split are grouped -
         * confirm against implementations.
         *
         * @return The split group keys.
         */
        public int[] getSplitGroupKeys();

        /**
         * Gets the ordering associated with the splits.
         *
         * @return The split order.
         */
        public Ordering getSplitOrder();
    }
}