/***********************************************************************************************************************
 * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 **********************************************************************************************************************/

package eu.stratosphere.test.recordJobs.relational;

import java.io.Serializable;
import java.util.Iterator;

import eu.stratosphere.api.common.Plan;
import eu.stratosphere.api.common.Program;
import eu.stratosphere.api.common.ProgramDescription;
import eu.stratosphere.api.java.record.operators.FileDataSink;
import eu.stratosphere.api.java.record.operators.FileDataSource;
import eu.stratosphere.api.java.record.functions.JoinFunction;
import eu.stratosphere.api.java.record.functions.ReduceFunction;
import eu.stratosphere.api.java.record.functions.FunctionAnnotation.ConstantFields;
import eu.stratosphere.api.java.record.functions.FunctionAnnotation.ConstantFieldsSecondExcept;
import eu.stratosphere.api.java.record.io.CsvInputFormat;
import eu.stratosphere.api.java.record.io.CsvOutputFormat;
import eu.stratosphere.api.java.record.operators.JoinOperator;
import eu.stratosphere.api.java.record.operators.ReduceOperator;
import eu.stratosphere.api.java.record.operators.ReduceOperator.Combinable;
import eu.stratosphere.types.IntValue;
import eu.stratosphere.types.Record;
import eu.stratosphere.types.StringValue;
import eu.stratosphere.util.Collector;

/**
 * TPC-H is a decision support benchmark on relational data.
 * Its documentation and the data generator (DBGEN) can be found
 * on http://www.tpc.org/tpch/ . This implementation is tested with
 * the DB2 data format.
 *
 * This program implements a query on the TPC-H schema
 * including one join and an aggregation.
 * The query is used as an example in the Asterix project (http://asterix.ics.uci.edu/).
 *
 * SELECT c_mktsegment, COUNT(o_orderkey)
 *   FROM orders, customer
 *   WHERE c_custkey = o_custkey
 *   GROUP BY c_mktsegment;
 *
 */
public class TPCHQueryAsterix implements Program, ProgramDescription {

	private static final long serialVersionUID = 1L;

	/**
	 * Realizes the join between the Customer and Orders tables.
	 */
	@ConstantFieldsSecondExcept(0)
	public static class JoinCO extends JoinFunction implements Serializable {
		private static final long serialVersionUID = 1L;

		private final IntValue one = new IntValue(1);

		/**
		 * Output Schema:
		 *  0: PARTIAL_COUNT = 1
		 *  1: C_MKTSEGMENT
		 */
		@Override
		public void join(Record order, Record cust, Collector<Record> out) throws Exception {
			cust.setField(0, one);
			out.collect(cust);
		}
	}

	/**
	 * Reduce implements the aggregation of the results.
	 * The Combinable annotation is set because the partial counts can already
	 * be computed in the combiner.
	 */
	@Combinable
	@ConstantFields(1)
	public static class AggCO extends ReduceFunction implements Serializable {
		private static final long serialVersionUID = 1L;

		private final IntValue integer = new IntValue();
		private Record record = new Record();

		/**
		 * Output Schema:
		 *  0: COUNT
		 *  1: C_MKTSEGMENT
		 */
		@Override
		public void reduce(Iterator<Record> records, Collector<Record> out) throws Exception {
			int count = 0;

			while (records.hasNext()) {
				record = records.next();
				count += record.getField(0, integer).getValue();
			}

			integer.setValue(count);
			record.setField(0, integer);
			out.collect(record);
		}

		/**
		 * Computes partial counts.
		 */
		public void combine(Iterator<Record> records, Collector<Record> out) throws Exception {
			reduce(records, out);
		}
	}

	@Override
	public Plan getPlan(final String... args) {

		// parse program parameters
		int numSubtasks     = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
		String ordersPath   = (args.length > 1 ? args[1] : "");
		String customerPath = (args.length > 2 ? args[2] : "");
		String output       = (args.length > 3 ? args[3] : "");

		/*
		 * Output Schema:
		 *  0: CUSTOMER_ID
		 */
		// create DataSourceContract for the Orders input
		FileDataSource orders = new FileDataSource(new CsvInputFormat(), ordersPath, "Orders");
		orders.setDegreeOfParallelism(numSubtasks);
		CsvInputFormat.configureRecordFormat(orders)
			.recordDelimiter('\n')
			.fieldDelimiter('|')
			.field(IntValue.class, 1);

		/*
		 * Output Schema:
		 *  0: CUSTOMER_ID
		 *  1: MKT_SEGMENT
		 */
		// create DataSourceContract for the Customer input
		FileDataSource customers = new FileDataSource(new CsvInputFormat(), customerPath, "Customers");
		customers.setDegreeOfParallelism(numSubtasks);
		CsvInputFormat.configureRecordFormat(customers)
			.recordDelimiter('\n')
			.fieldDelimiter('|')
			.field(IntValue.class, 0)
			.field(StringValue.class, 6);

		// create JoinOperator for joining Orders and Customers
		JoinOperator joinCO = JoinOperator.builder(new JoinCO(), IntValue.class, 0, 0)
			.name("JoinCO")
			.build();
		joinCO.setDegreeOfParallelism(numSubtasks);

		// create ReduceOperator for aggregating the result
		ReduceOperator aggCO = ReduceOperator.builder(new AggCO(), StringValue.class, 1)
			.name("AggCo")
			.build();
		aggCO.setDegreeOfParallelism(numSubtasks);

		// create DataSinkContract for writing the result
		FileDataSink result = new FileDataSink(new CsvOutputFormat(), output, "Output");
		result.setDegreeOfParallelism(numSubtasks);
		CsvOutputFormat.configureRecordFormat(result)
			.recordDelimiter('\n')
			.fieldDelimiter('|')
			.field(IntValue.class, 0)
			.field(StringValue.class, 1);

		// assemble the plan
		result.setInput(aggCO);
		aggCO.setInput(joinCO);
		joinCO.setFirstInput(orders);
		joinCO.setSecondInput(customers);

		return new Plan(result, "TPCH Asterix");
	}

	@Override
	public String getDescription() {
		return "Parameters: [numSubTasks], [orders], [customer], [output]";
	}

}
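
/**
 * Minimal usage sketch, not part of the original program: it shows how the plan above could be
 * assembled and run locally. The LocalExecutor entry point and the file paths below are assumptions
 * for illustration only; adjust them to the actual environment.
 */
class TPCHQueryAsterixLocalRunner {

	public static void main(String[] args) throws Exception {
		// build the plan with a parallelism of 1 and placeholder '|'-delimited TPC-H input files (assumed paths)
		Plan plan = new TPCHQueryAsterix().getPlan(
			"1",
			"file:///tmp/orders.tbl",
			"file:///tmp/customer.tbl",
			"file:///tmp/asterix_result");

		// execute the plan with Stratosphere's local executor (assumed to be available on the classpath)
		eu.stratosphere.client.LocalExecutor.execute(plan);
	}
}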