/***********************************************************************************************************************
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
**********************************************************************************************************************/
package eu.stratosphere.test.recordJobs.relational;
import java.io.Serializable;
import java.util.Iterator;
import eu.stratosphere.api.common.Plan;
import eu.stratosphere.api.common.Program;
import eu.stratosphere.api.common.ProgramDescription;
import eu.stratosphere.api.java.record.operators.FileDataSink;
import eu.stratosphere.api.java.record.operators.FileDataSource;
import eu.stratosphere.api.java.record.functions.JoinFunction;
import eu.stratosphere.api.java.record.functions.ReduceFunction;
import eu.stratosphere.api.java.record.functions.FunctionAnnotation.ConstantFields;
import eu.stratosphere.api.java.record.functions.FunctionAnnotation.ConstantFieldsSecondExcept;
import eu.stratosphere.api.java.record.io.CsvInputFormat;
import eu.stratosphere.api.java.record.io.CsvOutputFormat;
import eu.stratosphere.api.java.record.operators.JoinOperator;
import eu.stratosphere.api.java.record.operators.ReduceOperator;
import eu.stratosphere.api.java.record.operators.ReduceOperator.Combinable;
import eu.stratosphere.types.IntValue;
import eu.stratosphere.types.Record;
import eu.stratosphere.types.StringValue;
import eu.stratosphere.util.Collector;
/**
* The TPC-H is a decision support benchmark on relational data.
* Its documentation and the data generator (DBGEN) can be found
 * on http://www.tpc.org/tpch/. This implementation is tested with
* the DB2 data format.
*
* This program implements a query on the TPC-H schema
* including one join and an aggregation.
* This query is used as example in the Asterix project (http://asterix.ics.uci.edu/).
*
* SELECT c_mktsegment, COUNT(o_orderkey)
* FROM orders, customer
* WHERE c_custkey = o_custkey
* GROUP BY c_mktsegment;
*
*/
public class TPCHQueryAsterix implements Program, ProgramDescription {

    private static final long serialVersionUID = 1L;

    /**
     * Realizes the join between the Customer and Orders tables.
     * For every matching (order, customer) pair one record is emitted that
     * carries a partial count of 1 and the customer's market segment.
     */
    @ConstantFieldsSecondExcept(0)
    public static class JoinCO extends JoinFunction implements Serializable {
        private static final long serialVersionUID = 1L;

        private final IntValue one = new IntValue(1);

        /**
         * Output Schema:
         *  0: PARTIAL_COUNT=1
         *  1: C_MKTSEGMENT
         */
        @Override
        public void join(Record order, Record cust, Collector<Record> out)
                throws Exception {
            // Overwrite the customer key (field 0) with the partial count of 1;
            // the market segment in field 1 is untouched, which is what
            // @ConstantFieldsSecondExcept(0) declares.
            cust.setField(0, one);
            out.collect(cust);
        }
    }

    /**
     * Reduce implements the aggregation of the partial counts per market
     * segment. The {@link Combinable} annotation is set because partial
     * counts can already be summed in the combiner.
     */
    @Combinable
    @ConstantFields(1)
    public static class AggCO extends ReduceFunction implements Serializable {
        private static final long serialVersionUID = 1L;

        private final IntValue integer = new IntValue();
        private Record record = new Record();

        /**
         * Output Schema:
         *  0: COUNT
         *  1: C_MKTSEGMENT
         */
        @Override
        public void reduce(Iterator<Record> records, Collector<Record> out)
                throws Exception {
            int count = 0;
            while (records.hasNext()) {
                record = records.next();
                count += record.getField(0, integer).getValue();
            }
            // Reuse the last record of the group: field 1 already holds the
            // market segment, so only the count in field 0 must be replaced.
            integer.setValue(count);
            record.setField(0, integer);
            out.collect(record);
        }

        /**
         * Computes partial counts by reusing the reduce logic; summing
         * partial counts is associative, so combining is safe.
         */
        @Override
        public void combine(Iterator<Record> records, Collector<Record> out)
                throws Exception {
            reduce(records, out);
        }
    }

    @Override
    public Plan getPlan(final String... args) {
        // parse program parameters
        int numSubtasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
        String ordersPath = (args.length > 1 ? args[1] : "");
        String customerPath = (args.length > 2 ? args[2] : "");
        String output = (args.length > 3 ? args[3] : "");

        // create DataSourceContract for the Orders input;
        // only the customer key (column 1 of the file) is read.
        // Output Schema: 0: CUSTOMER_ID
        FileDataSource orders = new FileDataSource(new CsvInputFormat(), ordersPath, "Orders");
        orders.setDegreeOfParallelism(numSubtasks);
        CsvInputFormat.configureRecordFormat(orders)
            .recordDelimiter('\n')
            .fieldDelimiter('|')
            .field(IntValue.class, 1);

        // create DataSourceContract for the Customer input;
        // reads the customer key (column 0) and market segment (column 6).
        // Output Schema: 0: CUSTOMER_ID, 1: MKT_SEGMENT
        FileDataSource customers = new FileDataSource(new CsvInputFormat(), customerPath, "Customers");
        customers.setDegreeOfParallelism(numSubtasks);
        CsvInputFormat.configureRecordFormat(customers)
            .recordDelimiter('\n')
            .fieldDelimiter('|')
            .field(IntValue.class, 0)
            .field(StringValue.class, 6);

        // create JoinOperator for joining Orders and Customers on the customer key
        JoinOperator joinCO = JoinOperator.builder(new JoinCO(), IntValue.class, 0, 0)
            .name("JoinCO")
            .build();
        joinCO.setDegreeOfParallelism(numSubtasks);

        // create ReduceOperator for aggregating the counts per market segment
        ReduceOperator aggCO = ReduceOperator.builder(new AggCO(), StringValue.class, 1)
            .name("AggCo")
            .build();
        aggCO.setDegreeOfParallelism(numSubtasks);

        // create DataSinkContract for writing the result as "count|segment" lines
        FileDataSink result = new FileDataSink(new CsvOutputFormat(), output, "Output");
        result.setDegreeOfParallelism(numSubtasks);
        CsvOutputFormat.configureRecordFormat(result)
            .recordDelimiter('\n')
            .fieldDelimiter('|')
            .field(IntValue.class, 0)
            .field(StringValue.class, 1);

        // assemble the plan: sources -> join -> aggregate -> sink
        result.setInput(aggCO);
        aggCO.setInput(joinCO);
        joinCO.setFirstInput(orders);
        joinCO.setSecondInput(customers);

        return new Plan(result, "TPCH Asterix");
    }

    @Override
    public String getDescription() {
        return "Parameters: [numSubTasks], [orders], [customer], [output]";
    }
}