/***********************************************************************************************************************
 *
 * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 *
 **********************************************************************************************************************/
package eu.stratosphere.api.java;

import org.apache.commons.lang3.Validate;

import eu.stratosphere.api.common.io.FileOutputFormat;
import eu.stratosphere.api.common.io.OutputFormat;
import eu.stratosphere.api.java.aggregation.Aggregations;
import eu.stratosphere.api.java.functions.CoGroupFunction;
import eu.stratosphere.api.java.functions.CrossFunction;
import eu.stratosphere.api.java.functions.FilterFunction;
import eu.stratosphere.api.java.functions.FlatMapFunction;
import eu.stratosphere.api.java.functions.GroupReduceFunction;
import eu.stratosphere.api.java.functions.KeySelector;
import eu.stratosphere.api.java.functions.MapFunction;
import eu.stratosphere.api.java.functions.ReduceFunction;
import eu.stratosphere.api.java.io.CsvOutputFormat;
import eu.stratosphere.api.java.io.PrintingOutputFormat;
import eu.stratosphere.api.java.io.TextOutputFormat;
import eu.stratosphere.api.java.operators.AggregateOperator;
import eu.stratosphere.api.java.operators.CoGroupOperator;
import eu.stratosphere.api.java.operators.CoGroupOperator.CoGroupOperatorSets;
import eu.stratosphere.api.java.operators.CrossOperator;
import eu.stratosphere.api.java.operators.CrossOperator.DefaultCross;
import eu.stratosphere.api.java.operators.CustomUnaryOperation;
import eu.stratosphere.api.java.operators.DataSink;
import eu.stratosphere.api.java.operators.FilterOperator;
import eu.stratosphere.api.java.operators.FlatMapOperator;
import eu.stratosphere.api.java.operators.Grouping;
import eu.stratosphere.api.java.operators.JoinOperator;
import eu.stratosphere.api.java.operators.JoinOperator.JoinHint;
import eu.stratosphere.api.java.operators.JoinOperator.JoinOperatorSets;
import eu.stratosphere.api.java.operators.Keys;
import eu.stratosphere.api.java.operators.MapOperator;
import eu.stratosphere.api.java.operators.ProjectOperator;
import eu.stratosphere.api.java.operators.ProjectOperator.Projection;
import eu.stratosphere.api.java.operators.ReduceGroupOperator;
import eu.stratosphere.api.java.operators.ReduceOperator;
import eu.stratosphere.api.java.operators.SortedGrouping;
import eu.stratosphere.api.java.operators.UnionOperator;
import eu.stratosphere.api.java.operators.UnsortedGrouping;
import eu.stratosphere.api.java.tuple.Tuple;
import eu.stratosphere.api.java.tuple.Tuple2;
import eu.stratosphere.api.java.typeutils.InputTypeConfigurable;
import eu.stratosphere.types.TypeInformation;
import eu.stratosphere.core.fs.FileSystem.WriteMode;
import eu.stratosphere.core.fs.Path;

/**
 * A DataSet represents a collection of elements of the same type.<br/>
 * A DataSet can be transformed into another DataSet by applying a transformation such as
 * <ul>
 *   <li>{@link DataSet#map(MapFunction)},</li>
 *   <li>{@link DataSet#reduce(ReduceFunction)},</li>
 *   <li>{@link DataSet#join(DataSet)}, or</li>
 *   <li>{@link DataSet#coGroup(DataSet)}.</li>
 * </ul>
 *
 * @param <T> The type of the DataSet, i.e., the type of the elements of the DataSet.
 */
public abstract class DataSet<T> {

	private final ExecutionEnvironment context;

	private final TypeInformation<T> type;

	protected DataSet(ExecutionEnvironment context, TypeInformation<T> type) {
		if (context == null) {
			throw new NullPointerException("context is null");
		}
		if (type == null) {
			throw new NullPointerException("type is null");
		}

		this.context = context;
		this.type = type;
	}

	/**
	 * Returns the {@link ExecutionEnvironment} in which this DataSet is registered.
	 *
	 * @return The ExecutionEnvironment in which this DataSet is registered.
	 *
	 * @see ExecutionEnvironment
	 */
	public ExecutionEnvironment getExecutionEnvironment() {
		return this.context;
	}

	/**
	 * Returns the {@link TypeInformation} for the type of this DataSet.
	 *
	 * @return The TypeInformation for the type of this DataSet.
	 *
	 * @see TypeInformation
	 */
	public TypeInformation<T> getType() {
		return this.type;
	}

	// --------------------------------------------------------------------------------------------
	//  Filter & Transformations
	// --------------------------------------------------------------------------------------------

	/**
	 * Applies a Map transformation on a {@link DataSet}.<br/>
	 * The transformation calls a {@link MapFunction} for each element of the DataSet.
	 * Each MapFunction call returns exactly one element.
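	 * <p>
	 * A minimal sketch (the data set contents and variable names are illustrative), using an
	 * anonymous MapFunction to compute the length of each String element:
	 * <pre>
	 * {@code
	 * DataSet<String> lines = ...;
	 * DataSet<Integer> lengths = lines.map(new MapFunction<String, Integer>() {
	 *     public Integer map(String line) {
	 *         return line.length();
	 *     }
	 * });
	 * }
	 * </pre>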
	 *
	 * @param mapper The MapFunction that is called for each element of the DataSet.
	 * @return A MapOperator that represents the transformed DataSet.
	 *
	 * @see MapFunction
	 * @see MapOperator
	 * @see DataSet
	 */
	public <R> MapOperator<T, R> map(MapFunction<T, R> mapper) {
		if (mapper == null) {
			throw new NullPointerException("Map function must not be null.");
		}
		return new MapOperator<T, R>(this, mapper);
	}

	/**
	 * Applies a FlatMap transformation on a {@link DataSet}.<br/>
	 * The transformation calls a {@link FlatMapFunction} for each element of the DataSet.
	 * Each FlatMapFunction call can return any number of elements, including none.
	 *
	 * @param flatMapper The FlatMapFunction that is called for each element of the DataSet.
	 * @return A FlatMapOperator that represents the transformed DataSet.
	 *
	 * @see FlatMapFunction
	 * @see FlatMapOperator
	 * @see DataSet
	 */
	public <R> FlatMapOperator<T, R> flatMap(FlatMapFunction<T, R> flatMapper) {
		if (flatMapper == null) {
			throw new NullPointerException("FlatMap function must not be null.");
		}
		return new FlatMapOperator<T, R>(this, flatMapper);
	}

	/**
	 * Applies a Filter transformation on a {@link DataSet}.<br/>
	 * The transformation calls a {@link FilterFunction} for each element of the DataSet
	 * and retains only those elements for which the function returns true. Elements for
	 * which the function returns false are filtered out.
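	 * <p>
	 * A minimal sketch (data and names are illustrative) that drops comment lines:
	 * <pre>
	 * {@code
	 * DataSet<String> lines = ...;
	 * DataSet<String> nonComments = lines.filter(new FilterFunction<String>() {
	 *     public boolean filter(String line) {
	 *         return !line.startsWith("#");
	 *     }
	 * });
	 * }
	 * </pre>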
	 *
	 * @param filter The FilterFunction that is called for each element of the DataSet.
	 * @return A FilterOperator that represents the filtered DataSet.
	 *
	 * @see FilterFunction
	 * @see FilterOperator
	 * @see DataSet
	 */
	public FilterOperator<T> filter(FilterFunction<T> filter) {
		if (filter == null) {
			throw new NullPointerException("Filter function must not be null.");
		}
		return new FilterOperator<T>(this, filter);
	}

	// --------------------------------------------------------------------------------------------
	//  Projections
	// --------------------------------------------------------------------------------------------

	/**
	 * Initiates a Project transformation on a {@link Tuple} {@link DataSet}.<br/>
	 * <b>Note: Only Tuple DataSets can be projected.</b><br/>
	 * The transformation projects each Tuple of the DataSet onto a (sub)set of fields.<br/>
	 * This method returns a {@link Projection} on which {@link Projection#types()} needs to
	 * be called to complete the transformation.
	 *
	 * @param fieldIndexes The field indexes of the input tuples that are retained.
	 *                     The order of fields in the output tuple corresponds to the order of field indexes.
	 * @return A Projection that needs to be converted into a {@link ProjectOperator} to complete the
	 *         Project transformation by calling {@link Projection#types()}.
	 *
	 * @see Tuple
	 * @see DataSet
	 * @see Projection
	 * @see ProjectOperator
	 */
	public Projection<T> project(int... fieldIndexes) {
		return new Projection<T>(this, fieldIndexes);
	}

	// --------------------------------------------------------------------------------------------
	//  Non-grouped aggregations
	// --------------------------------------------------------------------------------------------

	/**
	 * Applies an Aggregate transformation on a non-grouped {@link Tuple} {@link DataSet}.<br/>
	 * <b>Note: Only Tuple DataSets can be aggregated.</b>
	 * The transformation applies a built-in {@link Aggregations Aggregation} on a specified field
	 * of a Tuple DataSet. Additional aggregation functions can be added to the resulting
	 * {@link AggregateOperator} by calling {@link AggregateOperator#and(Aggregations, int)}.
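	 * <p>
	 * A minimal sketch (data and names are illustrative), assuming the built-in SUM and MAX
	 * aggregation functions, that sums field 1 and takes the maximum of field 2:
	 * <pre>
	 * {@code
	 * DataSet<Tuple3<String, Integer, Double>> input = ...;
	 * DataSet<Tuple3<String, Integer, Double>> result =
	 *     input.aggregate(Aggregations.SUM, 1).and(Aggregations.MAX, 2);
	 * }
	 * </pre>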
	 *
	 * @param agg The built-in aggregation function that is computed.
	 * @param field The index of the Tuple field on which the aggregation function is applied.
	 * @return An AggregateOperator that represents the aggregated DataSet.
	 *
	 * @see Tuple
	 * @see Aggregations
	 * @see AggregateOperator
	 * @see DataSet
	 */
	public AggregateOperator<T> aggregate(Aggregations agg, int field) {
		return new AggregateOperator<T>(this, agg, field);
	}

	/**
	 * Applies a Reduce transformation on a non-grouped {@link DataSet}.<br/>
	 * The transformation consecutively calls a {@link ReduceFunction}
	 * until only a single element remains, which is the result of the transformation.
	 * A ReduceFunction combines two elements into one new element of the same type.
	 *
	 * @param reducer The ReduceFunction that is applied on the DataSet.
	 * @return A ReduceOperator that represents the reduced DataSet.
	 *
	 * @see ReduceFunction
	 * @see ReduceOperator
	 * @see DataSet
	 */
	public ReduceOperator<T> reduce(ReduceFunction<T> reducer) {
		if (reducer == null) {
			throw new NullPointerException("Reduce function must not be null.");
		}
		return new ReduceOperator<T>(this, reducer);
	}

	/**
	 * Applies a GroupReduce transformation on a non-grouped {@link DataSet}.<br/>
	 * The transformation calls a {@link GroupReduceFunction} once with the full DataSet.
	 * The GroupReduceFunction can iterate over all elements of the DataSet and emit any
	 * number of output elements, including none.
	 *
	 * @param reducer The GroupReduceFunction that is applied on the DataSet.
	 * @return A ReduceGroupOperator that represents the reduced DataSet.
	 *
	 * @see GroupReduceFunction
	 * @see ReduceGroupOperator
	 * @see DataSet
	 */
	public <R> ReduceGroupOperator<T, R> reduceGroup(GroupReduceFunction<T, R> reducer) {
		if (reducer == null) {
			throw new NullPointerException("GroupReduce function must not be null.");
		}
		return new ReduceGroupOperator<T, R>(this, reducer);
	}

	// --------------------------------------------------------------------------------------------
	//  distinct
	// --------------------------------------------------------------------------------------------

//	public <K extends Comparable<K>> DistinctOperator<T> distinct(KeySelector<T, K> keyExtractor) {
//		return new DistinctOperator<T>(this, new Keys.SelectorFunctionKeys<T, K>(keyExtractor, getType()));
//	}

//	public DistinctOperator<T> distinct(int... fields) {
//		return new DistinctOperator<T>(this, new Keys.FieldPositionKeys<T>(fields, getType(), true));
//	}

	// --------------------------------------------------------------------------------------------
	//  Grouping
	// --------------------------------------------------------------------------------------------

	/**
	 * Groups a {@link DataSet} using a {@link KeySelector} function.
	 * The KeySelector function is called for each element of the DataSet and extracts a single
	 * key value on which the DataSet is grouped.<br/>
	 * This method returns an {@link UnsortedGrouping} on which one of the following grouping
	 * transformations can be applied:
	 * <ul>
	 *   <li>{@link UnsortedGrouping#sortGroup(int, eu.stratosphere.api.common.operators.Order)} to get a {@link SortedGrouping}.
	 *   <li>{@link Grouping#aggregate(Aggregations, int)} to apply an Aggregate transformation.
	 *   <li>{@link Grouping#reduce(ReduceFunction)} to apply a Reduce transformation.
	 *   <li>{@link Grouping#reduceGroup(GroupReduceFunction)} to apply a GroupReduce transformation.
	 * </ul>
	 *
	 * @param keyExtractor The KeySelector function which extracts the key values from the DataSet on which it is grouped.
	 * @return An UnsortedGrouping on which a transformation needs to be applied to obtain a transformed DataSet.
	 *
	 * @see KeySelector
	 * @see Grouping
	 * @see UnsortedGrouping
	 * @see SortedGrouping
	 * @see AggregateOperator
	 * @see ReduceOperator
	 * @see ReduceGroupOperator
	 * @see DataSet
	 */
	public <K extends Comparable<K>> UnsortedGrouping<T> groupBy(KeySelector<T, K> keyExtractor) {
		return new UnsortedGrouping<T>(this, new Keys.SelectorFunctionKeys<T, K>(keyExtractor, getType()));
	}

	/**
	 * Groups a {@link Tuple} {@link DataSet} using field position keys.<br/>
	 * <b>Note: Field position keys can only be specified for Tuple DataSets.</b><br/>
	 * The field position keys specify the fields of Tuples on which the DataSet is grouped.
	 * This method returns an {@link UnsortedGrouping} on which one of the following grouping
	 * transformations can be applied (see the sketch after this list):
	 * <ul>
	 *   <li>{@link UnsortedGrouping#sortGroup(int, eu.stratosphere.api.common.operators.Order)} to get a {@link SortedGrouping}.
	 *   <li>{@link Grouping#aggregate(Aggregations, int)} to apply an Aggregate transformation.
	 *   <li>{@link Grouping#reduce(ReduceFunction)} to apply a Reduce transformation.
	 *   <li>{@link Grouping#reduceGroup(GroupReduceFunction)} to apply a GroupReduce transformation.
	 * </ul>
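	 * <p>
	 * A minimal sketch (data and names are illustrative) that sums the counts per word by
	 * grouping on the first field and reducing each group:
	 * <pre>
	 * {@code
	 * DataSet<Tuple2<String, Integer>> wordCounts = ...;
	 * DataSet<Tuple2<String, Integer>> summed = wordCounts.groupBy(0)
	 *     .reduce(new ReduceFunction<Tuple2<String, Integer>>() {
	 *         public Tuple2<String, Integer> reduce(Tuple2<String, Integer> v1, Tuple2<String, Integer> v2) {
	 *             return new Tuple2<String, Integer>(v1.f0, v1.f1 + v2.f1);
	 *         }
	 *     });
	 * }
	 * </pre>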
	 *
	 * @param fields One or more field positions on which the DataSet will be grouped.
	 * @return A Grouping on which a transformation needs to be applied to obtain a transformed DataSet.
	 *
	 * @see Tuple
	 * @see Grouping
	 * @see UnsortedGrouping
	 * @see SortedGrouping
	 * @see AggregateOperator
	 * @see ReduceOperator
	 * @see ReduceGroupOperator
	 * @see DataSet
	 */
	public UnsortedGrouping<T> groupBy(int... fields) {
		return new UnsortedGrouping<T>(this, new Keys.FieldPositionKeys<T>(fields, getType(), false));
	}

	// --------------------------------------------------------------------------------------------
	//  Joining
	// --------------------------------------------------------------------------------------------

	/**
	 * Initiates a Join transformation.<br/>
	 * A Join transformation joins the elements of two
	 * {@link DataSet DataSets} on key equality and provides multiple ways to combine
	 * joining elements into one DataSet.<br/>
	 * This method returns a {@link JoinOperatorSets} on which
	 * {@link JoinOperatorSets#where()} needs to be called to define the join key of the first
	 * joining (i.e., this) DataSet.
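	 * <p>
	 * A minimal sketch (data and names are illustrative). The default join wraps each pair of
	 * joining elements into a {@link Tuple2}:
	 * <pre>
	 * {@code
	 * DataSet<Tuple2<Integer, String>> pages = ...;
	 * DataSet<Tuple2<Integer, Double>> ranks = ...;
	 * DataSet<Tuple2<Tuple2<Integer, String>, Tuple2<Integer, Double>>> joined =
	 *     pages.join(ranks).where(0).equalTo(0);
	 * }
	 * </pre>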
	 *
	 * @param other The other DataSet with which this DataSet is joined.
	 * @return A JoinOperatorSets to continue the definition of the Join transformation.
	 *
	 * @see JoinOperatorSets
	 * @see JoinOperator
	 * @see DataSet
	 */
	public <R> JoinOperatorSets<T, R> join(DataSet<R> other) {
		return new JoinOperatorSets<T, R>(this, other);
	}

	/**
	 * Initiates a Join transformation.<br/>
	 * A Join transformation joins the elements of two
	 * {@link DataSet DataSets} on key equality and provides multiple ways to combine
	 * joining elements into one DataSet.<br/>
	 * This method also gives the hint to the optimizer that the second DataSet to join is much
	 * smaller than the first one.<br/>
	 * This method returns a {@link JoinOperatorSets} on which
	 * {@link JoinOperatorSets#where()} needs to be called to define the join key of the first
	 * joining (i.e., this) DataSet.
	 *
	 * @param other The other DataSet with which this DataSet is joined.
	 * @return A JoinOperatorSets to continue the definition of the Join transformation.
	 *
	 * @see JoinOperatorSets
	 * @see JoinOperator
	 * @see DataSet
	 */
	public <R> JoinOperatorSets<T, R> joinWithTiny(DataSet<R> other) {
		return new JoinOperatorSets<T, R>(this, other, JoinHint.BROADCAST_HASH_SECOND);
	}

	/**
	 * Initiates a Join transformation.<br/>
	 * A Join transformation joins the elements of two
	 * {@link DataSet DataSets} on key equality and provides multiple ways to combine
	 * joining elements into one DataSet.<br/>
	 * This method also gives the hint to the optimizer that the second DataSet to join is much
	 * larger than the first one.<br/>
	 * This method returns a {@link JoinOperatorSets} on which
	 * {@link JoinOperatorSets#where()} needs to be called to define the join key of the first
	 * joining (i.e., this) DataSet.
	 *
	 * @param other The other DataSet with which this DataSet is joined.
	 * @return A JoinOperatorSets to continue the definition of the Join transformation.
	 *
	 * @see JoinOperatorSets
	 * @see JoinOperator
	 * @see DataSet
	 */
	public <R> JoinOperatorSets<T, R> joinWithHuge(DataSet<R> other) {
		return new JoinOperatorSets<T, R>(this, other, JoinHint.BROADCAST_HASH_FIRST);
	}

	// --------------------------------------------------------------------------------------------
	//  Co-Grouping
	// --------------------------------------------------------------------------------------------

	/**
	 * Initiates a CoGroup transformation.<br/>
	 * A CoGroup transformation combines the elements of
	 * two {@link DataSet DataSets} into one DataSet. It groups each DataSet individually on a key and
	 * gives groups of both DataSets with equal keys together into a {@link CoGroupFunction}.
	 * If a DataSet has a group with no matching key in the other DataSet, the CoGroupFunction
	 * is called with an empty group for the non-existing group.<br/>
	 * The CoGroupFunction can iterate over the elements of both groups and return any number
	 * of elements, including none.<br/>
	 * This method returns a {@link CoGroupOperatorSets} on which
	 * {@link CoGroupOperatorSets#where()} needs to be called to define the grouping key of the first
	 * (i.e., this) DataSet.
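	 * <p>
	 * A minimal sketch (data and names are illustrative); MyCoGroupFunction stands for a
	 * user-defined subclass of CoGroupFunction:
	 * <pre>
	 * {@code
	 * DataSet<Tuple2<Long, String>> users = ...;
	 * DataSet<Tuple2<Long, Double>> payments = ...;
	 * DataSet<Tuple2<String, Double>> result =
	 *     users.coGroup(payments).where(0).equalTo(0).with(new MyCoGroupFunction());
	 * }
	 * </pre>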
	 *
	 * @param other The other DataSet of the CoGroup transformation.
	 * @return A CoGroupOperatorSets to continue the definition of the CoGroup transformation.
	 *
	 * @see CoGroupOperatorSets
	 * @see CoGroupOperator
	 * @see DataSet
	 */
	public <R> CoGroupOperator.CoGroupOperatorSets<T, R> coGroup(DataSet<R> other) {
		return new CoGroupOperator.CoGroupOperatorSets<T, R>(this, other);
	}

	// --------------------------------------------------------------------------------------------
	//  Cross
	// --------------------------------------------------------------------------------------------

	/**
	 * Initiates a Cross transformation.<br/>
	 * A Cross transformation combines the elements of two
	 * {@link DataSet DataSets} into one DataSet. It builds all pair combinations of elements of
	 * both DataSets, i.e., it builds a Cartesian product.
	 *
	 * <p>
	 * The resulting {@link DefaultCross} wraps each pair of crossed elements into a {@link Tuple2}, with
	 * the element of the first input being the first field of the tuple and the element of the
	 * second input being the second field of the tuple.
	 *
	 * <p>
	 * Call {@link DefaultCross#with(CrossFunction)} to define a {@link CrossFunction} which is called for
	 * each pair of crossed elements. The CrossFunction returns exactly one element for each pair of input elements.
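	 * <p>
	 * A minimal sketch (data and names are illustrative) that builds all (word, number) pairs:
	 * <pre>
	 * {@code
	 * DataSet<String> words = ...;
	 * DataSet<Integer> numbers = ...;
	 * DataSet<Tuple2<String, Integer>> pairs = words.cross(numbers);
	 * }
	 * </pre>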
	 *
	 * @param other The other DataSet with which this DataSet is crossed.
	 * @return A DefaultCross that returns a Tuple2 for each pair of crossed elements.
	 *
	 * @see DefaultCross
	 * @see CrossFunction
	 * @see DataSet
	 * @see Tuple2
	 */
	public <R> CrossOperator.DefaultCross<T, R> cross(DataSet<R> other) {
		return new CrossOperator.DefaultCross<T, R>(this, other);
	}

	/**
	 * Initiates a Cross transformation.<br/>
	 * A Cross transformation combines the elements of two
	 * {@link DataSet DataSets} into one DataSet. It builds all pair combinations of elements of
	 * both DataSets, i.e., it builds a Cartesian product.
	 * This method also gives the hint to the optimizer that the second DataSet to cross is much
	 * smaller than the first one.
	 *
	 * <p>
	 * The resulting {@link DefaultCross} wraps each pair of crossed elements into a {@link Tuple2}, with
	 * the element of the first input being the first field of the tuple and the element of the
	 * second input being the second field of the tuple.
	 *
	 * <p>
	 * Call {@link DefaultCross#with(CrossFunction)} to define a {@link CrossFunction} which is called for
	 * each pair of crossed elements. The CrossFunction returns exactly one element for each pair of input elements.
	 *
	 * @param other The other DataSet with which this DataSet is crossed.
	 * @return A DefaultCross that returns a Tuple2 for each pair of crossed elements.
	 *
	 * @see DefaultCross
	 * @see CrossFunction
	 * @see DataSet
	 * @see Tuple2
	 */
	public <R> CrossOperator.DefaultCross<T, R> crossWithTiny(DataSet<R> other) {
		return new CrossOperator.DefaultCross<T, R>(this, other);
	}

	/**
	 * Initiates a Cross transformation.<br/>
	 * A Cross transformation combines the elements of two
	 * {@link DataSet DataSets} into one DataSet. It builds all pair combinations of elements of
	 * both DataSets, i.e., it builds a Cartesian product.
	 * This method also gives the hint to the optimizer that the second DataSet to cross is much
	 * larger than the first one.
	 *
	 * <p>
	 * The resulting {@link DefaultCross} wraps each pair of crossed elements into a {@link Tuple2}, with
	 * the element of the first input being the first field of the tuple and the element of the
	 * second input being the second field of the tuple.
	 *
	 * <p>
	 * Call {@link DefaultCross#with(CrossFunction)} to define a {@link CrossFunction} which is called for
	 * each pair of crossed elements. The CrossFunction returns exactly one element for each pair of input elements.
	 *
	 * @param other The other DataSet with which this DataSet is crossed.
	 * @return A DefaultCross that returns a Tuple2 for each pair of crossed elements.
	 *
	 * @see DefaultCross
	 * @see CrossFunction
	 * @see DataSet
	 * @see Tuple2
	 */
	public <R> CrossOperator.DefaultCross<T, R> crossWithHuge(DataSet<R> other) {
		return new CrossOperator.DefaultCross<T, R>(this, other);
	}

	// --------------------------------------------------------------------------------------------
	//  Iterations
	// --------------------------------------------------------------------------------------------

	/**
	 * Initiates an iterative part of the program that executes multiple times and feeds back data sets.
	 * The iterative part needs to be closed by calling {@link IterativeDataSet#closeWith(DataSet)}. The data set
	 * given to the {@code closeWith(DataSet)} method is the data set that will be fed back and used as the input
	 * to the next iteration. The return value of the {@code closeWith(DataSet)} method is the resulting
	 * data set after the iteration has terminated.
	 * <p>
	 * An example of an iterative computation is as follows:
	 *
	 * <pre>
	 * {@code
	 * DataSet<Double> input = ...;
	 *
	 * IterativeDataSet<Double> startOfIteration = input.iterate(10);
	 * DataSet<Double> toBeFedBack = startOfIteration
	 *                               .map(new MyMapper())
	 *                               .groupBy(...).reduceGroup(new MyReducer());
	 * DataSet<Double> result = startOfIteration.closeWith(toBeFedBack);
	 * }
	 * </pre>
	 * <p>
	 * The iteration has a maximum number of times that it executes. A dynamic termination can be realized by using a
	 * termination criterion (see {@link IterativeDataSet#closeWith(DataSet, DataSet)}).
	 *
	 * @param maxIterations The maximum number of times that the iteration is executed.
	 * @return An IterativeDataSet that marks the start of the iterative part and needs to be closed by
	 *         {@link IterativeDataSet#closeWith(DataSet)}.
	 *
	 * @see eu.stratosphere.api.java.IterativeDataSet
	 */
	public IterativeDataSet<T> iterate(int maxIterations) {
		return new IterativeDataSet<T>(getExecutionEnvironment(), getType(), this, maxIterations);
	}

	/**
	 * Initiates a delta iteration. A delta iteration is similar to a regular iteration (as started by {@link #iterate(int)}),
	 * but maintains state across the individual iteration steps. The solution set, which represents the current state
	 * at the beginning of each iteration, can be obtained via {@link DeltaIteration#getSolutionSet()}.
	 * It can be accessed by joining (or CoGrouping) with it. The DataSet that represents the workset of an iteration
	 * can be obtained via {@link DeltaIteration#getWorkset()}.
	 * The solution set is updated by producing a delta for it, which is merged into the solution set at the end of each
	 * iteration step.
	 * <p>
	 * The delta iteration must be closed by calling {@link DeltaIteration#closeWith(DataSet, DataSet)}. The two
	 * parameters are the delta for the solution set and the new workset (the data set that will be fed back).
	 * The return value of the {@code closeWith(DataSet, DataSet)} method is the resulting
	 * data set after the iteration has terminated. Delta iterations terminate when the feedback data set
	 * (the workset) is empty. In addition, a maximum number of steps is given as a fallback termination guard.
	 * <p>
	 * Elements in the solution set are uniquely identified by a key. When merging the solution set delta, contained elements
	 * with the same key are replaced.
	 * <p>
	 * <b>NOTE:</b> Delta iterations currently support only tuple-valued data types. This restriction
	 * will be removed in the future. The key is specified by the tuple position.
	 * <p>
	 * A code example for a delta iteration is as follows:
	 * <pre>
	 * {@code
	 * DeltaIteration<Tuple2<Long, Long>, Tuple2<Long, Long>> iteration =
	 *     initialState.iterateDelta(initialFeedbackSet, 100, 0);
	 *
	 * DataSet<Tuple2<Long, Long>> delta = iteration.groupBy(0).aggregate(Aggregations.AVG, 1)
	 *     .join(iteration.getSolutionSet()).where(0).equalTo(0)
	 *     .flatMap(new ProjectAndFilter());
	 *
	 * DataSet<Tuple2<Long, Long>> feedBack = delta.join(someOtherSet).where(...).equalTo(...).with(...);
	 *
	 * // close the delta iteration (delta and new workset are identical)
	 * DataSet<Tuple2<Long, Long>> result = iteration.closeWith(delta, feedBack);
	 * }
	 * </pre>
	 *
	 * @param workset The initial version of the data set that is fed back to the next iteration step (the workset).
	 * @param maxIterations The maximum number of iteration steps, as a fallback safeguard.
	 * @param keyPositions The positions of the tuple fields that are used as the key of the solution set.
	 *
	 * @return The DeltaIteration that marks the start of a delta iteration.
	 *
	 * @see DeltaIteration
	 */
	public <R> DeltaIteration<T, R> iterateDelta(DataSet<R> workset, int maxIterations, int... keyPositions) {
		Keys.FieldPositionKeys<T> keys = new Keys.FieldPositionKeys<T>(keyPositions, getType(), false);
		return new DeltaIteration<T, R>(getExecutionEnvironment(), getType(), this, workset, keys, maxIterations);
	}

	// --------------------------------------------------------------------------------------------
	//  Custom Operators
	// --------------------------------------------------------------------------------------------

	/**
	 * Runs a {@link CustomUnaryOperation} on the data set. Custom operations are typically complex
	 * operators that are composed of multiple steps.
	 *
	 * @param operation The operation to run.
	 * @return The data set produced by the operation.
	 */
	public <X> DataSet<X> runOperation(CustomUnaryOperation<T, X> operation) {
		Validate.notNull(operation, "The custom operator must not be null.");
		operation.setInput(this);
		return operation.createResult();
	}

	// --------------------------------------------------------------------------------------------
	//  Union
	// --------------------------------------------------------------------------------------------

	/**
	 * Creates a union of this DataSet with another DataSet. The other DataSet must be of the same data type.
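	 * <p>
	 * A minimal sketch (data and names are illustrative):
	 * <pre>
	 * {@code
	 * DataSet<String> first = ...;
	 * DataSet<String> second = ...;
	 * DataSet<String> all = first.union(second);
	 * }
	 * </pre>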
	 *
	 * @param other The other DataSet which is unioned with the current DataSet.
	 * @return The resulting DataSet.
	 */
	public UnionOperator<T> union(DataSet<T> other) {
		return new UnionOperator<T>(this, other);
	}

	// --------------------------------------------------------------------------------------------
	//  Result writing
	// --------------------------------------------------------------------------------------------

	/**
	 * Writes a DataSet as a text file to the specified location.<br/>
	 * For each element of the DataSet the result of {@link Object#toString()} is written.
	 *
	 * @param filePath The path pointing to the location the text file is written to.
	 * @return The DataSink that writes the DataSet.
	 *
	 * @see TextOutputFormat
	 */
	public DataSink<T> writeAsText(String filePath) {
		return output(new TextOutputFormat<T>(new Path(filePath)));
	}

	/**
	 * Writes a DataSet as a text file to the specified location.<br/>
	 * For each element of the DataSet the result of {@link Object#toString()} is written.
	 *
	 * @param filePath The path pointing to the location the text file is written to.
	 * @param writeMode Controls the behavior for existing files. Options are NO_OVERWRITE and OVERWRITE.
	 * @return The DataSink that writes the DataSet.
	 *
	 * @see TextOutputFormat
	 */
	public DataSink<T> writeAsText(String filePath, WriteMode writeMode) {
		TextOutputFormat<T> tof = new TextOutputFormat<T>(new Path(filePath));
		tof.setWriteMode(writeMode);
		return output(tof);
	}

	/**
	 * Writes a {@link Tuple} DataSet as a CSV file to the specified location.<br/>
	 * <b>Note: Only a Tuple DataSet can be written as a CSV file.</b><br/>
	 * For each Tuple field the result of {@link Object#toString()} is written.
	 * Tuple fields are separated by the default field delimiter {@link CsvOutputFormat#DEFAULT_FIELD_DELIMITER}.<br/>
	 * Tuples are separated by the default line delimiter {@link CsvOutputFormat#DEFAULT_LINE_DELIMITER}.
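	 * <p>
	 * A minimal sketch (data and the output path are illustrative):
	 * <pre>
	 * {@code
	 * DataSet<Tuple2<String, Integer>> counts = ...;
	 * counts.writeAsCsv("file:///path/to/result");
	 * }
	 * </pre>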
	 *
	 * @param filePath The path pointing to the location the CSV file is written to.
	 * @return The DataSink that writes the DataSet.
	 *
	 * @see Tuple
	 * @see CsvOutputFormat
	 */
	public DataSink<T> writeAsCsv(String filePath) {
		return writeAsCsv(filePath, CsvOutputFormat.DEFAULT_LINE_DELIMITER, CsvOutputFormat.DEFAULT_FIELD_DELIMITER);
	}

	/**
	 * Writes a {@link Tuple} DataSet as a CSV file to the specified location with the specified field and line delimiters.<br/>
	 * <b>Note: Only a Tuple DataSet can be written as a CSV file.</b><br/>
	 * For each Tuple field the result of {@link Object#toString()} is written.
	 *
	 * @param filePath The path pointing to the location the CSV file is written to.
	 * @param rowDelimiter The row delimiter to separate Tuples.
	 * @param fieldDelimiter The field delimiter to separate Tuple fields.
	 * @return The DataSink that writes the DataSet.
	 *
	 * @see Tuple
	 * @see CsvOutputFormat
	 */
	public DataSink<T> writeAsCsv(String filePath, String rowDelimiter, String fieldDelimiter) {
		return internalWriteAsCsv(new Path(filePath), rowDelimiter, fieldDelimiter, null);
	}

	/**
	 * Writes a {@link Tuple} DataSet as a CSV file to the specified location with the specified field and line delimiters.<br/>
	 * <b>Note: Only a Tuple DataSet can be written as a CSV file.</b><br/>
	 * For each Tuple field the result of {@link Object#toString()} is written.
	 *
	 * @param filePath The path pointing to the location the CSV file is written to.
	 * @param rowDelimiter The row delimiter to separate Tuples.
	 * @param fieldDelimiter The field delimiter to separate Tuple fields.
	 * @param writeMode Controls the behavior for existing files. Options are NO_OVERWRITE and OVERWRITE.
	 * @return The DataSink that writes the DataSet.
	 *
	 * @see Tuple
	 * @see CsvOutputFormat
	 */
	public DataSink<T> writeAsCsv(String filePath, String rowDelimiter, String fieldDelimiter, WriteMode writeMode) {
		return internalWriteAsCsv(new Path(filePath), rowDelimiter, fieldDelimiter, writeMode);
	}

	@SuppressWarnings("unchecked")
	private <X extends Tuple> DataSink<T> internalWriteAsCsv(Path filePath, String rowDelimiter, String fieldDelimiter, WriteMode wm) {
		Validate.isTrue(this.type.isTupleType(), "The writeAsCsv() method can only be used on data sets of tuples.");
		CsvOutputFormat<X> of = new CsvOutputFormat<X>(filePath, rowDelimiter, fieldDelimiter);
		if (wm != null) {
			of.setWriteMode(wm);
		}
		return output((OutputFormat<T>) of);
	}

	/**
	 * Writes a DataSet to the standard output stream (stdout).<br/>
	 * For each element of the DataSet the result of {@link Object#toString()} is written.
	 *
	 * @return The DataSink that writes the DataSet.
	 */
	public DataSink<T> print() {
		return output(new PrintingOutputFormat<T>(false));
	}

	/**
	 * Writes a DataSet to the standard error stream (stderr).<br/>
	 * For each element of the DataSet the result of {@link Object#toString()} is written.
	 *
	 * @return The DataSink that writes the DataSet.
	 */
	public DataSink<T> printToErr() {
		return output(new PrintingOutputFormat<T>(true));
	}

	/**
	 * Writes a DataSet using a {@link FileOutputFormat} to a specified location.
	 * This method adds a data sink to the program.
	 *
	 * @param outputFormat The FileOutputFormat to write the DataSet.
	 * @param filePath The path to the location where the DataSet is written.
	 * @return The DataSink that writes the DataSet.
	 *
	 * @see FileOutputFormat
	 */
	public DataSink<T> write(FileOutputFormat<T> outputFormat, String filePath) {
		Validate.notNull(filePath, "File path must not be null.");
		Validate.notNull(outputFormat, "Output format must not be null.");

		outputFormat.setOutputFilePath(new Path(filePath));
		return output(outputFormat);
	}

	/**
	 * Writes a DataSet using a {@link FileOutputFormat} to a specified location.
	 * This method adds a data sink to the program.
	 *
	 * @param outputFormat The FileOutputFormat to write the DataSet.
	 * @param filePath The path to the location where the DataSet is written.
	 * @param writeMode The mode of writing, indicating whether to overwrite existing files.
	 * @return The DataSink that writes the DataSet.
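	 * <p>
	 * A minimal sketch (MyOutputFormat stands for a user-defined FileOutputFormat subclass;
	 * data and the output path are illustrative):
	 * <pre>
	 * {@code
	 * DataSet<MyType> data = ...;
	 * data.write(new MyOutputFormat(), "file:///path/to/result", WriteMode.OVERWRITE);
	 * }
	 * </pre>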
	 *
	 * @see FileOutputFormat
	 */
	public DataSink<T> write(FileOutputFormat<T> outputFormat, String filePath, WriteMode writeMode) {
		Validate.notNull(filePath, "File path must not be null.");
		Validate.notNull(writeMode, "Write mode must not be null.");
		Validate.notNull(outputFormat, "Output format must not be null.");

		outputFormat.setOutputFilePath(new Path(filePath));
		outputFormat.setWriteMode(writeMode);
		return output(outputFormat);
	}

	/**
	 * Emits a DataSet using an {@link OutputFormat}. This method adds a data sink to the program.
	 * Programs may have multiple data sinks. A DataSet may also have multiple consumers (data sinks
	 * or transformations) at the same time.
	 *
	 * @param outputFormat The OutputFormat to process the DataSet.
	 * @return The DataSink that processes the DataSet.
	 *
	 * @see OutputFormat
	 * @see DataSink
	 */
	public DataSink<T> output(OutputFormat<T> outputFormat) {
		Validate.notNull(outputFormat);

		// configure the type if needed
		if (outputFormat instanceof InputTypeConfigurable) {
			((InputTypeConfigurable) outputFormat).setInputType(this.type);
		}

		DataSink<T> sink = new DataSink<T>(this, outputFormat, this.type);
		this.context.registerDataSink(sink);
		return sink;
	}

	// --------------------------------------------------------------------------------------------
	//  Utilities
	// --------------------------------------------------------------------------------------------

	protected static void checkSameExecutionContext(DataSet<?> set1, DataSet<?> set2) {
		if (set1.context != set2.context) {
			throw new IllegalArgumentException("The two inputs have different execution contexts.");
		}
	}
}