/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.api.java.operators;
import org.apache.flink.annotation.Internal;
import org.apache.flink.annotation.Public;
import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.InvalidProgramException;
import org.apache.flink.api.common.io.OutputFormat;
import org.apache.flink.api.common.operators.GenericDataSinkBase;
import org.apache.flink.api.common.operators.Keys;
import org.apache.flink.api.common.operators.Operator;
import org.apache.flink.api.common.operators.Order;
import org.apache.flink.api.common.operators.Ordering;
import org.apache.flink.api.common.operators.ResourceSpec;
import org.apache.flink.api.common.operators.UnaryOperatorInformation;
import org.apache.flink.api.common.typeinfo.NothingTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.util.Preconditions;
import java.util.Arrays;
/**
 * Describes a sink of a data flow: a {@link DataSet} whose elements are handed to an
 * {@link OutputFormat}.
 *
 * <p>Besides the data set, the output format and the element type, a sink carries optional
 * settings: a name, a parallelism, a {@link Configuration} with parameters, a local sort order
 * and minimum/preferred {@link ResourceSpec}s. All setters return {@code this} so that calls
 * can be chained.
 *
 * @param <T> The type of the elements consumed by this sink.
 */
@Public
public class DataSink<T> {

    private final OutputFormat<T> format;

    private final TypeInformation<T> type;

    private final DataSet<T> data;

    /** User-defined name; {@code null} until {@link #name(String)} is called. */
    private String name;

    private int parallelism = ExecutionConfig.PARALLELISM_DEFAULT;

    private ResourceSpec minResources = ResourceSpec.DEFAULT;

    private ResourceSpec preferredResources = ResourceSpec.DEFAULT;

    /** Parameters set via {@link #withParameters(Configuration)}; may be {@code null}. */
    private Configuration parameters;

    /**
     * Flat (logical) key positions of the local output sorting. {@code null} as long as no sorting
     * was requested; otherwise always of the same length as {@link #sortOrders}.
     */
    private int[] sortKeyPositions;

    /** Sort order per entry of {@link #sortKeyPositions}; {@code null} if no sorting was requested. */
    private Order[] sortOrders;

    /**
     * Creates a data sink that emits the given data set using the given output format.
     *
     * @param data The data set that is written by this sink.
     * @param format The output format that consumes the elements.
     * @param type The type information of the elements of the data set.
     * @throws IllegalArgumentException If any of the arguments is {@code null}.
     */
    public DataSink(DataSet<T> data, OutputFormat<T> format, TypeInformation<T> type) {
        if (format == null) {
            throw new IllegalArgumentException("The output format must not be null.");
        }
        if (type == null) {
            throw new IllegalArgumentException("The input type information must not be null.");
        }
        if (data == null) {
            throw new IllegalArgumentException("The data set must not be null.");
        }

        this.format = format;
        this.data = data;
        this.type = type;
    }

    /**
     * @return The output format of this data sink.
     */
    @Internal
    public OutputFormat<T> getFormat() {
        return format;
    }

    /**
     * @return The type information of the elements consumed by this data sink.
     */
    @Internal
    public TypeInformation<T> getType() {
        return type;
    }

    /**
     * @return The data set that is written by this data sink.
     */
    @Internal
    public DataSet<T> getDataSet() {
        return data;
    }

    /**
     * Pass a configuration to the OutputFormat.
     *
     * <p>The given object is stored as is (no copy is taken) and replaces any configuration
     * passed earlier.
     *
     * @param parameters Configuration parameters
     * @return This data sink, to allow chaining.
     */
    public DataSink<T> withParameters(Configuration parameters) {
        this.parameters = parameters;
        return this;
    }

    /**
     * Sorts each local partition of a {@link org.apache.flink.api.java.tuple.Tuple} data set
     * on the specified field in the specified {@link Order} before it is emitted by the output format.<br>
     * <b>Note: Only tuple data sets can be sorted using integer field indices.</b><br>
     * The tuple data set can be sorted on multiple fields in different orders
     * by chaining {@link #sortLocalOutput(int, Order)} calls.
     *
     * @param field The Tuple field on which the data set is locally sorted.
     * @param order The Order in which the specified Tuple field is locally sorted.
     * @return This data sink operator with specified output order.
     * @throws InvalidProgramException If the selected field is not a sortable type.
     *
     * @see org.apache.flink.api.java.tuple.Tuple
     * @see Order
     *
     * @deprecated Use {@link DataSet#sortPartition(int, Order)} instead
     */
    @Deprecated
    @PublicEvolving
    public DataSink<T> sortLocalOutput(int field, Order order) {
        // Resolve the (possibly nested) field into flat key positions first; the sort-key
        // check intentionally runs afterwards so that the order of validation is unchanged.
        Keys.ExpressionKeys<T> expressionKeys = new Keys.ExpressionKeys<>(field, this.type);
        int[] flatKeys = expressionKeys.computeLogicalKeyPositions();

        if (!Keys.ExpressionKeys.isSortKey(field, this.type)) {
            throw new InvalidProgramException("Selected sort key is not a sortable type");
        }

        appendSortKeys(flatKeys, order);
        return this;
    }

    /**
     * Sorts each local partition of a data set on the field(s) specified by the field expression
     * in the specified {@link Order} before it is emitted by the output format.<br>
     * <b>Note: Non-composite types can only be sorted on the full element which is specified by
     * a wildcard expression ("*" or "_").</b><br>
     * Data sets of composite types (Tuple or Pojo) can be sorted on multiple fields in different orders
     * by chaining {@link #sortLocalOutput(String, Order)} calls.
     *
     * @param fieldExpression The field expression for the field(s) on which the data set is locally sorted.
     * @param order The Order in which the specified field(s) are locally sorted.
     * @return This data sink operator with specified output order.
     * @throws InvalidProgramException If the selected field expression is not a sortable type.
     *
     * @see Order
     *
     * @deprecated Use {@link DataSet#sortPartition(String, Order)} instead
     */
    @Deprecated
    @PublicEvolving
    public DataSink<T> sortLocalOutput(String fieldExpression, Order order) {
        // compute flat field positions for (nested) sorting fields
        Keys.ExpressionKeys<T> expressionKeys = new Keys.ExpressionKeys<>(fieldExpression, this.type);
        int[] flatKeys = expressionKeys.computeLogicalKeyPositions();

        if (!Keys.ExpressionKeys.isSortKey(fieldExpression, this.type)) {
            throw new InvalidProgramException("Selected sort key is not a sortable type");
        }

        appendSortKeys(flatKeys, order);
        return this;
    }

    /**
     * Records the given flat key positions, all with the same order, as local sort keys.
     *
     * <p>The first call installs the keys; later calls append to the existing sorting info so
     * that chained {@code sortLocalOutput} calls result in a multi-field ordering.
     *
     * @param keyPositions The flat key positions to sort on.
     * @param order The order applied to every one of the given key positions.
     */
    private void appendSortKeys(int[] keyPositions, Order order) {
        if (this.sortKeyPositions == null) {
            // set sorting info
            this.sortKeyPositions = keyPositions;
            this.sortOrders = new Order[keyPositions.length];
            Arrays.fill(this.sortOrders, order);
            return;
        }

        // append sorting info to existing info
        final int oldLength = this.sortKeyPositions.length;
        final int newLength = oldLength + keyPositions.length;

        this.sortKeyPositions = Arrays.copyOf(this.sortKeyPositions, newLength);
        this.sortOrders = Arrays.copyOf(this.sortOrders, newLength);

        System.arraycopy(keyPositions, 0, this.sortKeyPositions, oldLength, keyPositions.length);
        Arrays.fill(this.sortOrders, oldLength, newLength, order);
    }

    /**
     * @return Configuration for the OutputFormat.
     */
    public Configuration getParameters() {
        return this.parameters;
    }

    // --------------------------------------------------------------------------------------------

    /**
     * Sets the name of this data sink. The name is used for the translated sink and in
     * {@link #toString()}.
     *
     * @param name The name of this data sink.
     * @return This data sink, to allow chaining.
     */
    public DataSink<T> name(String name) {
        this.name = name;
        return this;
    }

    // --------------------------------------------------------------------------------------------

    /**
     * Translates this sink into a {@link GenericDataSinkBase} that reads from the given input.
     *
     * <p>The translated sink receives the name (falling back to the output format's
     * {@code toString()} if none was set), the parameters (if any), the parallelism and the
     * local sort order (if any) of this data sink.
     *
     * @param input The translated operator that produces the input of the sink.
     * @return The translated sink.
     */
    protected GenericDataSinkBase<T> translateToDataFlow(Operator<T> input) {
        // select the name (or create a default one)
        String name = this.name != null ? this.name : this.format.toString();
        GenericDataSinkBase<T> sink = new GenericDataSinkBase<>(
                this.format,
                new UnaryOperatorInformation<>(this.type, new NothingTypeInfo()),
                name);

        // set input
        sink.setInput(input);

        // set parameters
        if (this.parameters != null) {
            sink.getParameters().addAll(this.parameters);
        }

        // set parallelism
        if (this.parallelism > 0) {
            // use specified parallelism
            sink.setParallelism(this.parallelism);
        } else {
            // if no parallelism has been specified, use parallelism of input operator to enable chaining
            sink.setParallelism(input.getParallelism());
        }

        if (this.sortKeyPositions != null) {
            // configure output sorting
            Ordering ordering = new Ordering();
            for (int i = 0; i < this.sortKeyPositions.length; i++) {
                ordering.appendOrdering(this.sortKeyPositions[i], null, this.sortOrders[i]);
            }
            sink.setLocalOrder(ordering);
        }

        return sink;
    }

    // --------------------------------------------------------------------------------------------

    /**
     * @return A description consisting of the sink's name (or {@code <unnamed>}) and its output format.
     */
    @Override
    public String toString() {
        return "DataSink '" + (this.name == null ? "<unnamed>" : this.name) + "' (" + this.format.toString() + ")";
    }

    /**
     * Returns the parallelism of this data sink.
     *
     * @return The parallelism of this data sink.
     */
    public int getParallelism() {
        return this.parallelism;
    }

    /**
     * Sets the parallelism for this data sink.
     * The degree must be 1 or more.
     *
     * @param parallelism The parallelism for this data sink. A value equal to {@link ExecutionConfig#PARALLELISM_DEFAULT}
     *        will use the system default.
     * @return This data sink with set parallelism.
     */
    public DataSink<T> setParallelism(int parallelism) {
        Preconditions.checkArgument(parallelism > 0 || parallelism == ExecutionConfig.PARALLELISM_DEFAULT,
                "The parallelism of an operator must be at least 1.");

        this.parallelism = parallelism;
        return this;
    }

    /**
     * Returns the minimum resources of this data sink. If no minimum resources have been set,
     * this returns the default resource profile.
     *
     * @return The minimum resources of this data sink.
     */
    @PublicEvolving
    public ResourceSpec getMinResources() {
        return this.minResources;
    }

    /**
     * Returns the preferred resources of this data sink. If no preferred resources have been set,
     * this returns the default resource profile.
     *
     * @return The preferred resources of this data sink.
     */
    @PublicEvolving
    public ResourceSpec getPreferredResources() {
        return this.preferredResources;
    }

    // ---------------------------------------------------------------------------
    //  Fine-grained resource profiles are an incomplete work-in-progress feature
    //  The setters are hence private at this point.
    // ---------------------------------------------------------------------------

    /**
     * Sets the minimum and preferred resources for this data sink. The lower and upper resource
     * limits will be considered by a future resource resize feature.
     *
     * <p>Both resource specs must be valid, and the minimum resources must be less than or equal
     * to the preferred resources.
     *
     * @param minResources The minimum resources for this data sink.
     * @param preferredResources The preferred resources for this data sink.
     * @return The data sink with set minimum and preferred resources.
     */
    private DataSink<T> setResources(ResourceSpec minResources, ResourceSpec preferredResources) {
        Preconditions.checkNotNull(minResources, "The min resources must be not null.");
        Preconditions.checkNotNull(preferredResources, "The preferred resources must be not null.");
        Preconditions.checkArgument(minResources.isValid() && preferredResources.isValid() && minResources.lessThanOrEqual(preferredResources),
                "The values in resources must be not less than 0 and the preferred resources must be greater than the min resources.");

        this.minResources = minResources;
        this.preferredResources = preferredResources;
        return this;
    }

    /**
     * Sets the resources for this data sink; the minimum and preferred resources are both set
     * to the given value.
     *
     * @param resources The resources for this data sink.
     * @return The data sink with set minimum and preferred resources.
     */
    private DataSink<T> setResources(ResourceSpec resources) {
        Preconditions.checkNotNull(resources, "The resources must be not null.");
        Preconditions.checkArgument(resources.isValid(), "The values in resources must be not less than 0.");

        this.minResources = resources;
        this.preferredResources = resources;
        return this;
    }
}