/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dstream.examples;
import static io.dstream.utils.Tuples.Tuple2.tuple2;
import static io.dstream.utils.Tuples.Tuple4.tuple4;
import java.util.List;
import java.util.Locale;
import java.util.Map.Entry;
import java.util.concurrent.Future;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import io.dstream.DStream;
import io.dstream.utils.ExecutionResultUtils;
import io.dstream.utils.Tuples.Tuple2;
import io.dstream.utils.Tuples.Tuple4;
/**
* Contains various examples of the join operation.
*/
public class Join {

    /** Execution name passed to {@code executeAs(..)} by every example in this class. */
    static final String EXECUTION_NAME = "Join";

    /**
     * Separator between the tokens of a record. Compiled once here rather than
     * on every {@code String.split("\\s+")} call made inside the per-record lambdas.
     */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Token index of the id in records that start with it, e.g. {@code "1 Oracle"}. */
    private static final int LEADING_ID = 0;

    /** Token index of the id in records that carry it as the third token, e.g. {@code "Larry Ellison 1"}. */
    private static final int THIRD_TOKEN_ID = 2;

    /**
     * Runs all join examples, one after another.
     *
     * @param args not used
     * @throws Exception if any of the examples fails
     */
    public static void main(String[] args) throws Exception {
        // run all
        TwoWayJoin.main();
        FourWayJoin.main();
    }

    /**
     * Returns the whitespace-separated token found at {@code index} in {@code record}.
     * Shared by the classifiers and the join predicates so that both always extract
     * the id in the same way.
     *
     * @param record the line to tokenize
     * @param index zero-based position of the wanted token
     * @return the token at {@code index}
     * @throws ArrayIndexOutOfBoundsException if {@code record} has no token at {@code index}
     */
    private static String token(String record, int index) {
        // Pattern.split(CharSequence) yields the same array as String.split(String).
        return WHITESPACE.split(record)[index];
    }

    /**
     * This example demonstrates a simple join between two streams.
     * To ensure correctness of joining data in a distributed environment, classification must
     * precede any type of stream combination (i.e., join and/or union). This ensures that
     * the two or more streams, represented as individual partitions, have comparable data.
     *
     * The following case has two data sets:
     * - one -
     * 1 Oracle
     * 2 Amazon
     * . . .
     *
     * - two -
     * Arun Murthy 3
     * Larry Ellison 1
     * . . .
     *
     * Classification is performed using the common "id", thus ensuring that
     * '1 Oracle' and 'Larry Ellison 1' will end up in the same partition.
     */
    public static class TwoWayJoin {

        /**
         * Classifies both streams by id, joins the records whose ids are equal
         * and prints the resulting partitions.
         *
         * @param args not used
         * @throws Exception if waiting for the execution result fails
         */
        public static void main(String... args) throws Exception {
            DStream<String> one = DStream.ofType(String.class, "one")
                    .classify(s -> token(s, LEADING_ID));
            DStream<String> two = DStream.ofType(String.class, "two")
                    .classify(s -> token(s, THIRD_TOKEN_ID));

            Future<Stream<Stream<Tuple2<String, String>>>> resultFuture = one
                    .join(two).on(t2 -> token(t2._1(), LEADING_ID).equals(token(t2._2(), THIRD_TOKEN_ID)))
                    .executeAs(EXECUTION_NAME);

            Stream<Stream<Tuple2<String, String>>> resultPartitionsStream = resultFuture.get();
            ExecutionResultUtils.printResults(resultPartitionsStream, true);
        }
    }

    /**
     * This example shows a sample of joining more than two data sets with some transformation
     * as well as multiple predicates.
     */
    public static class FourWayJoin {

        /**
         * Classifies four streams by id, joins them on equal ids, flattens every match
         * into a {@code Tuple4}, groups the tuples by their first element and prints
         * the resulting partitions.
         *
         * @param args not used
         * @throws Exception if waiting for the execution result fails
         */
        public static void main(String... args) throws Exception {
            DStream<String> one = DStream.ofType(String.class, "one")
                    .classify(a -> token(a, LEADING_ID));
            DStream<String> two = DStream.ofType(String.class, "two")
                    .classify(a -> token(a, THIRD_TOKEN_ID));
            DStream<String> three = DStream.ofType(String.class, "three")
                    .classify(a -> token(a, LEADING_ID));
            DStream<String> four = DStream.ofType(String.class, "four")
                    .classify(a -> token(a, LEADING_ID));

            Future<Stream<Stream<Entry<String, List<Tuple4<String, String, String, String>>>>>> resultFuture = one
                    // No predicate is attached to this first join. One could be added here,
                    // for example on(t2 -> t2._1().contains("Hortonworks")).
                    .join(two)
                    // Locale.ROOT keeps the upper-casing independent of the JVM default locale.
                    .map(t2 -> tuple2(t2._1().toUpperCase(Locale.ROOT), t2._2().toUpperCase(Locale.ROOT)))
                    .join(three)
                    .join(four).on(t3 -> {
                        // t3._1() is the (one, two) pair produced by the map above.
                        String idOne = token(t3._1()._1(), LEADING_ID);
                        String idTwo = token(t3._1()._2(), THIRD_TOKEN_ID);
                        String idThree = token(t3._2(), LEADING_ID);
                        String idFour = token(t3._3(), LEADING_ID);
                        return idOne.equals(idTwo) && idOne.equals(idThree) && idOne.equals(idFour);
                    })
                    .map(t3 -> tuple4(t3._1()._1(), t3._1()._2(), t3._2(), t3._3()))
                    .aggregateValues(t4 -> t4._1(), t4 -> t4)
                    .executeAs(EXECUTION_NAME);

            Stream<Stream<Entry<String, List<Tuple4<String, String, String, String>>>>> resultPartitionsStream = resultFuture.get();
            ExecutionResultUtils.printResults(resultPartitionsStream, true);
        }
    }
}