/***********************************************************************************************************************
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
**********************************************************************************************************************/
package eu.stratosphere.test.recordJobs.sort;
import eu.stratosphere.api.common.Plan;
import eu.stratosphere.api.common.Program;
import eu.stratosphere.api.common.ProgramDescription;
import eu.stratosphere.api.java.record.operators.FileDataSink;
import eu.stratosphere.api.java.record.operators.FileDataSource;
import eu.stratosphere.api.common.operators.Order;
import eu.stratosphere.api.common.operators.Ordering;
import eu.stratosphere.test.recordJobs.sort.tsUtil.TeraDistribution;
import eu.stratosphere.test.recordJobs.sort.tsUtil.TeraInputFormat;
import eu.stratosphere.test.recordJobs.sort.tsUtil.TeraKey;
import eu.stratosphere.test.recordJobs.sort.tsUtil.TeraOutputFormat;
/**
* This is an example implementation of the well-known TeraSort benchmark using the Stratosphere system. The benchmark
* requires the input data to be generated according to the rules of Jim Gray's sort benchmark. A possible way to generate
* such input data is the Hadoop TeraGen program. For more details see <a
* href="http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/examples/terasort/TeraGen.html">
* http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/examples/terasort/TeraGen.html</a>.
*
* Note: this example job is currently not included in the build, because of problems with the RangePartitioner (see
* https://github.com/stratosphere/stratosphere/issues/7). It should be included again after fixing the issue.
*
*/
public final class TeraSort implements Program, ProgramDescription {

    private static final long serialVersionUID = 1L;

    /**
     * Returns the usage string listing the positional parameters read by {@link #getPlan(String...)}.
     *
     * @return the parameter description
     */
    @Override
    public String getDescription() {
        // NOTE(review): "numSubStasks" looks like a typo for "numSubTasks"; the text is kept
        // unchanged here because it is emitted at runtime.
        return "Parameters: [numSubStasks] [input] [output]";
    }

    /**
     * Assembles the plan: a file data source connected directly to a file data sink on which a
     * global ascending order is requested.
     *
     * @param args
     *        positional parameters: {@code args[0]} degree of parallelism (1 if absent),
     *        {@code args[1]} input path (empty string if absent), {@code args[2]} output path (empty
     *        string if absent)
     * @return a plan named "TeraSort" with the data sink as its sink
     * @throws IllegalArgumentException
     *         if {@code args[0]} is present but cannot be parsed as an integer
     */
    @Override
    public Plan getPlan(String... args) throws IllegalArgumentException {
        // Resolve the job parameters, falling back to defaults for anything not supplied.
        final int parallelism;
        if (args.length > 0) {
            parallelism = Integer.parseInt(args[0]);
        } else {
            parallelism = 1;
        }
        final String inputPath = argumentOrEmpty(args, 1);
        final String outputPath = argumentOrEmpty(args, 2);

        // Reader side: parses the input file with the Tera input format.
        final TeraInputFormat inputFormat = new TeraInputFormat();
        final FileDataSource dataSource = new FileDataSource(inputFormat, inputPath, "Data Source");
        dataSource.setDegreeOfParallelism(parallelism);

        // Writer side: emits the records with the Tera output format.
        final TeraOutputFormat outputFormat = new TeraOutputFormat();
        final FileDataSink dataSink = new FileDataSink(outputFormat, outputPath, "Data Sink");
        dataSink.setDegreeOfParallelism(parallelism);

        // Request a global ascending order (Ordering built with index 0 and TeraKey) together
        // with the Tera data distribution, then attach the source as the sink's input.
        final Ordering ascendingByKey = new Ordering(0, TeraKey.class, Order.ASCENDING);
        dataSink.setGlobalOrder(ascendingByKey, new TeraDistribution());
        dataSink.setInput(dataSource);

        return new Plan(dataSink, "TeraSort");
    }

    /**
     * Returns the argument at the given position, or the empty string when fewer arguments were
     * passed.
     */
    private static String argumentOrEmpty(String[] args, int position) {
        if (position < args.length) {
            return args[position];
        }
        return "";
    }
}