package dstream.tez.examples;

import static io.dstream.utils.Tuples.Tuple2.tuple2;
import static io.dstream.utils.Tuples.Tuple4.tuple4;

import java.util.concurrent.Future;
import java.util.stream.Stream;

import io.dstream.DStream;
import io.dstream.utils.Tuples.Tuple2;
import io.dstream.utils.Tuples.Tuple4;

/**
 * Examples of joining classified {@link DStream}s. Each nested class is a
 * self-contained, runnable example; {@link #main(String[])} runs them one
 * after another under the shared execution name {@link #EXECUTION_NAME}.
 */
public class Join {

	/** Execution name passed to {@code executeAs(..)} and {@code SampleUtils.clean(..)} by every example. */
	static String EXECUTION_NAME = "Join";

	/**
	 * Runs all join examples in sequence and then cleans up.
	 *
	 * @param args ignored
	 * @throws Exception if any of the examples fails
	 */
	public static void main(String[] args) throws Exception {
		// run all
		TwoWayJoin.main();
		FourWayJoin.main();
		SampleUtils.clean(EXECUTION_NAME);
	}

	/**
	 * This example demonstrates a simple join between two streams.
	 * To ensure correctness of joining data in a distributed environment, classification must
	 * precede any type of stream combination (i.e., join and/or union). This ensures that
	 * the two or more streams represented as individual partitions have comparable data.
	 *
	 * The following case has two data sets:
	 * - one -
	 * 1 Oracle
	 * 2 Amazon
	 * . . .
	 *
	 * - two -
	 * Arun Murthy 3
	 * Larry Ellison 1
	 * . . .
	 *
	 * Classification is performed using the common "id", thus ensuring that
	 * '1 Oracle' and 'Larry Ellison 1' will end up in the same partition.
	 */
	public static class TwoWayJoin {

		/**
		 * Classifies both data sets by id, joins them on that id and prints
		 * every joined pair of every result partition to standard output.
		 *
		 * @param args ignored
		 * @throws Exception if the execution fails or its result cannot be obtained
		 */
		public static void main(String... args) throws Exception {
			SampleUtils.clean(EXECUTION_NAME);

			// "one" carries the id as its first token, "two" as its third token.
			DStream<String> hash = DStream.ofType(String.class, "one").classify(a -> a.split("\\s+")[0]);
			DStream<String> probe = DStream.ofType(String.class, "two").classify(a -> a.split("\\s+")[2]);

			// The join predicate extracts the id exactly the way the classifiers do,
			// so ids longer than a single character are compared as a whole.
			Future<Stream<Stream<Tuple2<String, String>>>> resultFuture = hash
					.join(probe).on(t2 -> t2._1().split("\\s+")[0].equals(t2._2().split("\\s+")[2]))
					.executeAs(EXECUTION_NAME);

			// try-with-resources guarantees the result stream is closed
			// (will close Tez client) even if printing fails part-way through.
			try (Stream<Stream<Tuple2<String, String>>> result = resultFuture.get()) {
				result.forEach(resultPartitionStream -> resultPartitionStream.forEach(System.out::println));
			}

			SampleUtils.clean(EXECUTION_NAME);
		}
	}

	/**
	 * This example shows a sample of joining more than two data sets with some transformation
	 * as well as multiple predicates.
	 */
	public static class FourWayJoin {

		/**
		 * Classifies four data sets by id and combines them in a single pipeline:
		 * "one" is joined with "two", the pairs whose first element contains
		 * "Hortonworks" are kept and upper-cased, and the outcome is joined with
		 * "three" and "four" on equality of all four ids. Each match is flattened
		 * into a {@link Tuple4} and printed to standard output.
		 *
		 * @param args ignored
		 * @throws Exception if the execution fails or its result cannot be obtained
		 */
		public static void main(String... args) throws Exception {
			SampleUtils.clean(EXECUTION_NAME);

			// Only "two" carries its id as the third token; the others lead with it.
			DStream<String> one = DStream.ofType(String.class, "one").classify(a -> a.split("\\s+")[0]);
			DStream<String> two = DStream.ofType(String.class, "two").classify(a -> a.split("\\s+")[2]);
			DStream<String> three = DStream.ofType(String.class, "three").classify(a -> a.split("\\s+")[0]);
			DStream<String> four = DStream.ofType(String.class, "four").classify(a -> a.split("\\s+")[0]);

			Future<Stream<Stream<Tuple4<String, String, String, String>>>> resultFuture = one
					.join(two)
					// the filter runs before the upper-casing below, so it matches the original spelling
					.filter(t2 -> t2._1().contains("Hortonworks"))
					.map(t2 -> tuple2(t2._1().toUpperCase(), t2._2().toUpperCase()))
					.join(three)
					.join(four).on(t3 -> {
						// t3._1() is the (one, two) pair produced by the map above
						String v1 = t3._1()._1().split("\\s+")[0];
						String v2 = t3._1()._2().split("\\s+")[2];
						String v3 = t3._2().split("\\s+")[0];
						String v4 = t3._3().split("\\s+")[0];
						return v1.equals(v2) && v1.equals(v3) && v1.equals(v4);
					})
					// flatten the nested pair into a single four-element tuple
					.map(t3 -> tuple4(t3._1()._1(), t3._1()._2(), t3._2(), t3._3()))
					.executeAs(EXECUTION_NAME);

			// try-with-resources guarantees the result stream is closed
			// (will close Tez client) even if printing fails part-way through.
			try (Stream<Stream<Tuple4<String, String, String, String>>> result = resultFuture.get()) {
				result.forEach(resultPartitionStream -> resultPartitionStream.forEach(System.out::println));
			}

			SampleUtils.clean(EXECUTION_NAME);
		}
	}
}