package com.github.projectflink.spark; import java.text.DateFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Calendar; import java.util.Date; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.function.Function2; import org.apache.spark.api.java.function.PairFunction; import com.github.projectflink.spark.util.TPCH3ScalaReg; import scala.Tuple2; import scala.Tuple3; import scala.Tuple4; import scala.Tuple5; /** * local[4] file:///home/robert/flink-workdir/flink-perf/automation/workdir/testjob-data/lineitem.tbl file:///home/robert/flink-workdir/flink-perf/automation/workdir/testjob-data/order.tbl file:///home/robert/flink-workdir/flink-perf/automation/workdir/testjob-data/customer.tbl file:///home/robert/flink-workdir/flink-perf/localsparkout/tpch3/ * * * */ public class TPCH3Spark { // ************************************************************************* // PROGRAM // ************************************************************************* public static String SPLIT = "\\|"; public static void main(String[] args) throws Exception { String master = args[0]; String lineitem = args[1]; String order = args[2]; String customer = args[3]; String output = args[4]; System.err.println("Starting spark with master="+master); SparkConf conf = new SparkConf().setAppName("TPCH 3").setMaster(master).set("spark.hadoop.validateOutputSpecs", "false"); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf.set("spark.kryo.registrator", TPCH3ScalaReg.class.getName()); JavaSparkContext sc = new JavaSparkContext(conf); // get input data JavaRDD<String> liStr = sc.textFile(lineitem); JavaRDD<String> orStr = sc.textFile(order); JavaRDD<String> custStr = sc.textFile(customer); JavaPairRDD<Integer, Lineitem> li = liStr.mapToPair(new PairFunction<String, Integer, Lineitem>() { @Override public Tuple2<Integer, Lineitem> call(String t) throws Exception { String[] el = t.split(SPLIT); // 1000011000100000 // 0123456789012345 // 0 12 3 System.err.println("Line item keys = "+el[0]); Lineitem li = new Lineitem(Integer.getInteger(el[0]), Double.valueOf(el[6]), Double.valueOf(el[5]), el[10]); return new Tuple2<Integer, TPCH3Spark.Lineitem>(li.getOrderkey(), li); } }); JavaPairRDD<Integer, Order> or = orStr.mapToPair(new PairFunction<String, Integer, Order>() { @Override public Tuple2<Integer, Order> call(String t) throws Exception { String[] el = t.split(SPLIT); // 100010010 // 012345678 Order o = new Order(Integer.valueOf(el[0]), el[4], Integer.valueOf(el[7])); return new Tuple2<Integer, Order>(o.getOrderkey(), o); } }); JavaPairRDD<Integer, Customer> cust = custStr.mapToPair(new PairFunction<String, Integer, Customer>() { @Override public Tuple2<Integer, Customer> call(String t) throws Exception { String[] el = t.split(SPLIT); // 10000010 // 01234567 Customer c = new Customer(Integer.valueOf(el[0]), el[6]); return new Tuple2<Integer, Customer> (c.getCustKey(), c); }}); // Filter market segment "AUTOMOBILE" cust = cust.filter( new Function<Tuple2<Integer,Customer>, Boolean>() { @Override public Boolean call(Tuple2<Integer, Customer> v1) throws Exception { return v1._2().getMktsegment().equals("AUTOMOBILE"); }} ); // Filter all Orders with o_orderdate < 12.03.1995 or = or.filter( new Function<Tuple2<Integer,Order>, Boolean>() { private DateFormat format = new SimpleDateFormat("yyyy-MM-dd"); private Date date; { Calendar cal = Calendar.getInstance(); cal.set(1995, 3, 12); date = cal.getTime(); } @Override public Boolean call(Tuple2<Integer, Order> v1) throws Exception { Date orderDate = format.parse(v1._2().getOrderdate()); return orderDate.before(date); }} ); // Filter all Lineitems with l_shipdate > 12.03.1995 li = li.filter(new Function<Tuple2<Integer,Lineitem>, Boolean>() { private DateFormat format = new SimpleDateFormat("yyyy-MM-dd"); private Date date; { Calendar cal = Calendar.getInstance(); cal.set(1995, 3, 12); date = cal.getTime(); } @Override public Boolean call(Tuple2<Integer,Lineitem> value) throws ParseException { Date shipDate = format.parse(value._2().getShipdate()); return shipDate.after(date); } } ); System.err.println("++++++ Li count = "+li.count()+" cust count = "+cust.count()+" orders count = "+or.count()); JavaPairRDD<Integer, ShippingPriorityItem> customerWithOrders = cust.join(or) // set orderkey to key (for upcoming join) .mapToPair(new PairFunction<Tuple2<Integer,Tuple2<Customer,Order>>, Integer, ShippingPriorityItem>() { @Override public Tuple2<Integer, ShippingPriorityItem> call( Tuple2<Integer, Tuple2<Customer, Order>> t) throws Exception { final Order second = t._2()._2(); ShippingPriorityItem spi = new ShippingPriorityItem(0, 0.0, second.getOrderdate(), second.getShippriority(), second.getOrderkey()); System.err.println("C with O keys = "+second.getOrderkey()); return new Tuple2<Integer, ShippingPriorityItem>(second.getOrderkey(), spi); } /** * new JoinFunction<Customer, Order, ShippingPriorityItem>() { @Override public ShippingPriorityItem join(Customer first, Order second) { return new ShippingPriorityItem(0, 0.0, second.getOrderdate(), second.getShippriority(), second.getOrderkey()); } }); */ }); System.err.println("++++++ customerWithOrders count "+customerWithOrders.count()); JavaPairRDD<Integer, Tuple2<ShippingPriorityItem, Lineitem>> joined = customerWithOrders.join(li); System.err.println("++++++ joined count "+joined.count()); // .groupBy(0, 2, 3) JavaPairRDD<Tuple3<Integer, String, Integer>, ShippingPriorityItem> joined1 = joined.mapToPair( new PairFunction<Tuple2<Integer,Tuple2<ShippingPriorityItem,Lineitem>>, Tuple3<Integer, String, Integer>, ShippingPriorityItem>() { @Override public Tuple2<Tuple3<Integer, String, Integer>, ShippingPriorityItem> call( Tuple2<Integer, Tuple2<ShippingPriorityItem, Lineitem>> t) throws Exception { final ShippingPriorityItem spi = t._2()._1(); final Lineitem second = t._2()._2(); ShippingPriorityItem spiImmu = new ShippingPriorityItem(second.getOrderkey(), second.getExtendedprice() * (1 - second.getDiscount()), spi._3(), spi._4(), spi._5()); return new Tuple2<Tuple3<Integer, String, Integer>, ShippingPriorityItem>(new Tuple3<Integer, String, Integer>(spiImmu._1(),spiImmu._3(),spiImmu._4()), spiImmu); } }); System.err.println("++++++ joined1 count "+joined1.count()); JavaPairRDD<Tuple3<Integer, String, Integer>, ShippingPriorityItem> finalDs = joined1.reduceByKey(new Function2<TPCH3Spark.ShippingPriorityItem, TPCH3Spark.ShippingPriorityItem, TPCH3Spark.ShippingPriorityItem>() { @Override public ShippingPriorityItem call( ShippingPriorityItem v1, ShippingPriorityItem v2) throws Exception { return new ShippingPriorityItem(v1._1(), v1._2()+v2._2(), v1._3(), v1._4(), v1._5()); } }); System.err.println("++++++ finalDs count "+finalDs.count()); finalDs.saveAsTextFile(output); // Join the last join result with Lineitems /* DataSet<ShippingPriorityItem> joined = customerWithOrders.join(li) .where(4) .equalTo(0) .with( new JoinFunction<ShippingPriorityItem, Lineitem, ShippingPriorityItem>() { @Override public ShippingPriorityItem join(ShippingPriorityItem first, Lineitem second) { first.setL_Orderkey(second.getOrderkey()); first.setRevenue(second.getExtendedprice() * (1 - second.getDiscount())); return first; } }); */ // Group by l_orderkey, o_orderdate and o_shippriority and compute revenue sum /* joined = joined .groupBy(0, 2, 3) .aggregate(Aggregations.SUM, 1); // emit result joined.writeAsCsv(outputPath, "\n", "|"); */ } // ************************************************************************* // DATA TYPES // ************************************************************************* public static class Lineitem extends Tuple4<Integer, Double, Double, String> { public Lineitem(Integer arg0, Double arg1, Double arg2, String arg3) { super(arg0, arg1, arg2, arg3); } public Integer getOrderkey() { return this._1(); } public Double getDiscount() { return this._3(); } public Double getExtendedprice() { return this._2(); } public String getShipdate() { return this._4(); } } public static class Customer extends Tuple2<Integer, String> { public Customer(Integer arg0, String arg1) { super(arg0, arg1); } public Integer getCustKey() { return this._1(); } public String getMktsegment() { return this._2(); } } public static class Order extends Tuple3<Integer, String, Integer> { public Order(Integer arg0, String arg1, Integer arg2) { super(arg0, arg1, arg2); } public Integer getOrderkey() { return this._1(); } public String getOrderdate() { return this._2(); } public Integer getShippriority() { return this._3(); } } // 0 1 2 3 4 public static class ShippingPriorityItem extends Tuple5<Integer, Double, String, Integer, Integer> { public ShippingPriorityItem(Integer arg0, Double arg1, String arg2, Integer arg3, Integer arg4) { super(arg0, arg1, arg2, arg3, arg4); } public Integer getL_Orderkey() { return this._1(); } // public void setL_Orderkey(Integer l_orderkey) { this.f0 = l_orderkey; } public Double getRevenue() { return this._2(); } // public void setRevenue(Double revenue) { this.f1 = revenue; } public String getOrderdate() { return this._3(); } public Integer getShippriority() { return this._4(); } public Integer getO_Orderkey() { return this._5(); } @Override public String toString() { return this._1()+"|"+this._2()+"|"+this._3()+"|"+this._4()+"|"+this._5()+"\n"; } } }