/*********************************************************************************************************************** * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu) * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the * specific language governing permissions and limitations under the License. **********************************************************************************************************************/ package eu.stratosphere.test.recordJobs.relational; import eu.stratosphere.api.common.Plan; import eu.stratosphere.api.common.Program; import eu.stratosphere.api.common.ProgramDescription; import eu.stratosphere.api.java.record.operators.FileDataSink; import eu.stratosphere.api.java.record.operators.FileDataSource; import eu.stratosphere.api.java.record.io.CsvInputFormat; import eu.stratosphere.api.java.record.io.CsvOutputFormat; import eu.stratosphere.api.java.record.operators.JoinOperator; import eu.stratosphere.api.java.record.operators.MapOperator; import eu.stratosphere.api.java.record.operators.ReduceOperator; import eu.stratosphere.test.recordJobs.relational.TPCHQuery3.AggLiO; import eu.stratosphere.test.recordJobs.relational.TPCHQuery3.FilterO; import eu.stratosphere.test.recordJobs.relational.TPCHQuery3.JoinLiO; import eu.stratosphere.types.DoubleValue; import eu.stratosphere.types.IntValue; import eu.stratosphere.types.LongValue; import eu.stratosphere.types.StringValue; /** * The TPC-H is a decision support benchmark on relational data. * Its documentation and the data generator (DBGEN) can be found * on http://www.tpc.org/tpch/ .This implementation is tested with * the DB2 data format. * THe PACT program implements a modified version of the query 3 of * the TPC-H benchmark including one join, some filtering and an * aggregation. * * SELECT l_orderkey, o_shippriority, sum(l_extendedprice) as revenue * FROM orders, lineitem * WHERE l_orderkey = o_orderkey * AND o_orderstatus = "X" * AND YEAR(o_orderdate) > Y * AND o_orderpriority LIKE "Z%" * GROUP BY l_orderkey, o_shippriority; */ public class TPCHQuery3Unioned implements Program, ProgramDescription { private static final long serialVersionUID = 1L; @Override public Plan getPlan(final String... args) { // parse program parameters final int numSubtasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1); String orders1Path = (args.length > 1 ? args[1] : ""); String orders2Path = (args.length > 2 ? args[2] : ""); String partJoin1Path = (args.length > 3 ? args[3] : ""); String partJoin2Path = (args.length > 4 ? args[4] : ""); String lineitemsPath = (args.length > 5 ? args[5] : ""); String output = (args.length > 6 ? args[6] : ""); // create DataSourceContract for Orders input FileDataSource orders1 = new FileDataSource(new CsvInputFormat(), orders1Path, "Orders 1"); CsvInputFormat.configureRecordFormat(orders1) .recordDelimiter('\n') .fieldDelimiter('|') .field(LongValue.class, 0) // order id .field(IntValue.class, 7) // ship prio .field(StringValue.class, 2, 2) // order status .field(StringValue.class, 4, 10) // order date .field(StringValue.class, 5, 8); // order prio FileDataSource orders2 = new FileDataSource(new CsvInputFormat(), orders2Path, "Orders 2"); CsvInputFormat.configureRecordFormat(orders2) .recordDelimiter('\n') .fieldDelimiter('|') .field(LongValue.class, 0) // order id .field(IntValue.class, 7) // ship prio .field(StringValue.class, 2, 2) // order status .field(StringValue.class, 4, 10) // order date .field(StringValue.class, 5, 8); // order prio // create DataSourceContract for LineItems input FileDataSource lineitems = new FileDataSource(new CsvInputFormat(), lineitemsPath, "LineItems"); CsvInputFormat.configureRecordFormat(lineitems) .recordDelimiter('\n') .fieldDelimiter('|') .field(LongValue.class, 0) .field(DoubleValue.class, 5); // create MapOperator for filtering Orders tuples MapOperator filterO1 = MapOperator.builder(new FilterO()) .name("FilterO") .input(orders1) .build(); // filter configuration filterO1.setParameter(TPCHQuery3.YEAR_FILTER, 1993); filterO1.setParameter(TPCHQuery3.PRIO_FILTER, "5"); filterO1.getCompilerHints().setFilterFactor(0.05f); // create MapOperator for filtering Orders tuples MapOperator filterO2 = MapOperator.builder(new FilterO()) .name("FilterO") .input(orders2) .build(); // filter configuration filterO2.setParameter(TPCHQuery3.YEAR_FILTER, 1993); filterO2.setParameter(TPCHQuery3.PRIO_FILTER, "5"); // create JoinOperator for joining Orders and LineItems @SuppressWarnings("unchecked") JoinOperator joinLiO = JoinOperator.builder(new JoinLiO(), LongValue.class, 0, 0) .input1(filterO2, filterO1) .input2(lineitems) .name("JoinLiO") .build(); FileDataSource partJoin1 = new FileDataSource(new CsvInputFormat(), partJoin1Path, "Part Join 1"); CsvInputFormat.configureRecordFormat(partJoin1) .recordDelimiter('\n') .fieldDelimiter('|') .field(LongValue.class, 0) .field(IntValue.class, 1) .field(DoubleValue.class, 2); FileDataSource partJoin2 = new FileDataSource(new CsvInputFormat(), partJoin2Path, "Part Join 2"); CsvInputFormat.configureRecordFormat(partJoin2) .recordDelimiter('\n') .fieldDelimiter('|') .field(LongValue.class, 0) .field(IntValue.class, 1) .field(DoubleValue.class, 2); // create ReduceOperator for aggregating the result // the reducer has a composite key, consisting of the fields 0 and 1 @SuppressWarnings("unchecked") ReduceOperator aggLiO = ReduceOperator.builder(new AggLiO()) .keyField(LongValue.class, 0) .keyField(StringValue.class, 1) .input(joinLiO, partJoin2, partJoin1) .name("AggLio") .build(); // create DataSinkContract for writing the result FileDataSink result = new FileDataSink(new CsvOutputFormat(), output, aggLiO, "Output"); CsvOutputFormat.configureRecordFormat(result) .recordDelimiter('\n') .fieldDelimiter('|') .lenient(true) .field(LongValue.class, 0) .field(IntValue.class, 1) .field(DoubleValue.class, 2); // assemble the PACT plan Plan plan = new Plan(result, "TPCH Q3 Unioned"); plan.setDefaultParallelism(numSubtasks); return plan; } @Override public String getDescription() { return "Parameters: [numSubStasks], [orders1], [orders2], [partJoin1], [partJoin2], [lineitem], [output]"; } }