/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.dstream;

import java.util.List;
import java.util.Map.Entry;
import java.util.stream.Stream;

import io.dstream.SerializableStreamAssets.SerBinaryOperator;
import io.dstream.SerializableStreamAssets.SerComparator;
import io.dstream.SerializableStreamAssets.SerConsumer;
import io.dstream.SerializableStreamAssets.SerFunction;
import io.dstream.SerializableStreamAssets.SerPredicate;
import io.dstream.support.Classifier;
import io.dstream.support.HashClassifier;

/**
 * Base strategy for variants of {@link DStream}. It defines all common operations.
 *
 * @param <A> the type of the stream elements
 * @param <T> the actual type of the instance of this {@link BaseDStream}.
 */
interface BaseDStream<A, T> extends ExecutableDStream<A> {

    /**
     * Returns a stream consisting of the distinct elements (according to
     * {@link Object#equals(Object)}) of this stream.
     * <br>
     * Consistent with {@link Stream#distinct()}.<br>
     * <br>
     * This is an <i>intermediate</i> operation.
     * <br>
     * This is a <i>composable-transformation</i> operation.
     *
     * @return new {@link DStream} of the same type.
     */
    T distinct();

    /**
     * Returns a stream containing a single value representing the count of elements in
     * this stream.<br>
     * <br>
     * This operation is a non-terminal equivalent of <i>Stream.count()</i>.<br>
     * <br>
     * This is an <i>intermediate</i> operation.
     * <br>
     * This is a <i>composable-transformation</i> operation.
     *
     * @return new {@link DStream} of type {@link Long}
     */
    DStream<Long> count();

    /**
     * Combines the two streams of the <i><b>same type</b></i>, keeping only <b>distinct</b> elements,
     * and returns a new {@link DStream} of the same type.<br>
     * For example:
     * <pre>
     * DStream<String> d1 = ...
     * DStream<String> d2 = ...
     * DStream<Integer> d3 = ...
     *
     * d1.union(d2) - legal because both streams are of the same type
     * d1.union(d3) - illegal; will not compile since the two streams are of different types (String and Integer)
     * </pre>
     * <br>
     * This is an <i>intermediate</i> operation.
     * <br>
     * This is a <i>composable-shuffle</i> operation.
     *
     * @param stream {@link DStream} of the same type to combine with this stream.
     * @return new {@link DStream} of the same type.
     */
    T union(T stream);

    /**
     * Combines the two streams of the same type, preserving duplicates, and returns a new
     * {@link DStream} of the same type.
     * <pre>
     * DStream<String> d1 = ...
     * DStream<String> d2 = ...
     * DStream<Integer> d3 = ...
     *
     * d1.unionAll(d2) - legal because both streams are of the same type
     * d1.unionAll(d3) - illegal; will not compile since the two streams are of different types (String and Integer)
     * </pre>
     * <br>
     * This is an <i>intermediate</i> operation.
     * <br>
     * This is a <i>composable-shuffle</i> operation.
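     * <br>
     * For example, a sketch of how the result differs from <i>union</i>; the element values
     * below are illustrative only:
     * <pre>
     * // d1 contains: foo bar
     * // d2 contains: bar baz
     *
     * d1.union(d2)    - produces: foo bar baz (duplicates removed)
     * d1.unionAll(d2) - produces: foo bar bar baz (duplicates preserved)
     * </pre>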
     *
     * @param stream {@link DStream} of the same type to combine with this stream.
     * @return new {@link DStream} of the same type.
     */
    T unionAll(T stream);

    /**
     * Returns a new {@link DStream} consisting of the elements of this stream that were tested
     * true according to the given predicate.<br>
     * <br>
     * Consistent with {@link Stream#filter(java.util.function.Predicate)}.<br>
     * <br>
     * This is an <i>intermediate</i> operation.
     * <br>
     * This is a <i>composable-transformation</i> operation.
     *
     * @param predicate predicate to apply to each element to determine if it
     *                  should be included
     * @return new {@link DStream} of the same type.
     */
    T filter(SerPredicate<? super A> predicate);

    /**
     * Returns an equivalent {@link DStream} whose elements are classified
     * based on the values provided by the given <i>classifier</i>.<br>
     * Classification is performed by the {@link Classifier}.
     * The default implementation of the {@link Classifier} is {@link HashClassifier}; however, a
     * specialized implementation can be configured via the {@link DStreamConstants#CLASSIFIER}
     * configuration property.<br>
     * Classification can be looked at as the process of distributed grouping.<br>
     * For example:
     * <pre>
     * // Suppose you have a text file with the following contents:
     * foo bar foo bar foo
     * bar foo bar foo
     *
     * // Your pipeline
     * DStream.ofType(String.class, "wc")
     *     .flatMap(record -> Stream.of(record.split("\\s+")))
     *     .classify(word -> word)
     *
     * // Your Classifier is configured as 'dstream.grouper=FooBarClassifier'
     * // and its implementation looks like this:
     *
     * class FooBarClassifier implements Classifier {
     *     public Integer apply(Object input) {
     *         return input.equals("bar") ? 1 : 0;
     *     }
     *     . . .
     * }
     * </pre>
     * The above would result in a {@link DStream} which represents two classification groups:<br>
     * Group-1 - bar bar bar bar<br>
     * Group-2 - foo foo foo foo foo<br>
     * even though the stream would still appear continuous to you
     * (i.e., bar bar bar bar foo foo foo foo foo).<br>
     * <br>
     * In the "distributable" reality this often coincides with data <i>partitioning</i>, since a
     * {@link Classifier} is compliant with the general semantics of partitioners: it returns an
     * {@link Integer} representing a classification id, which could be treated by a
     * target partitioner as a partition id.<br>
     * <br>
     * However, the actual <b><i>data partitioning</i></b> is a function of the system and
     * exists primarily to facilitate greater parallelization of the actual data processing.
     * <b><i>Data classification</i></b>, on the other hand, is a function of the application,
     * deriving its requirements from the use case at hand (e.g., group all 'foo's and 'bar's together).<br>
     * So it is important to separate the two, since it is quite conceivable that, to facilitate
     * greater parallelization in a truly distributed environment, classification groups
     * could be further partitioned (e.g., 2 groups into 8 partitions).<br>
     * <br>
     * Another configuration property relevant to this and every other <i>shuffle</i>-style operation
     * is {@link DStreamConstants#PARALLELISM}, which allows you to provide a hint as to the level of
     * parallelization you want to accomplish. It is typically passed as one of the constructor arguments
     * to the instance of {@link Classifier}, but it could also be used by the target execution
     * environment to configure its partitioner.<br>
     * <br>
     * This is an <i>intermediate</i> operation.
     * <br>
     * This is a <i>shuffle</i> operation.
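     * <br>
     * For example, a sketch of the execution configuration that ties the pieces above together.
     * The 'dstream.grouper' key mirrors the example above; the parallelism key shown is only
     * illustrative of whatever {@link DStreamConstants#PARALLELISM} resolves to:
     * <pre>
     * # execution configuration (property keys shown are illustrative)
     * dstream.grouper=FooBarClassifier
     * dstream.parallelism=2
     * </pre>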
     *
     * @param classifier function to extract the value used by a target classifier to compute the classification id.
     * @return new {@link DStream} of the same type.
     */
    T classify(SerFunction<? super A, ?> classifier);

    /**
     * Returns a {@link DStream} consisting of the results of replacing each element of
     * this stream with the contents of a mapped stream produced by applying
     * the provided mapping function to each element.<br>
     * <br>
     * Consistent with {@link Stream#flatMap(java.util.function.Function)}.<br>
     * <br>
     * This is an <i>intermediate</i> operation.
     * <br>
     * This is a <i>composable-transformation</i> operation.
     *
     * @param <R> The element type of the returned {@link DStream}
     * @param mapper function to apply to each element which produces a stream
     *               of new values
     * @return new {@link DStream}
     */
    <R> DStream<R> flatMap(SerFunction<? super A, ? extends Stream<? extends R>> mapper);

    /**
     * Returns a {@link DStream} consisting of the results of applying the given
     * function to the elements of this stream.<br>
     * <br>
     * Consistent with {@link Stream#map(java.util.function.Function)}.<br>
     * <br>
     * This is an <i>intermediate</i> operation.
     * <br>
     * This is a <i>composable-transformation</i> operation.
     *
     * @param <R> The element type of the returned {@link DStream}
     * @param mapper function to apply to each element
     * @return new {@link DStream}
     */
    <R> DStream<R> map(SerFunction<? super A, ? extends R> mapper);

    /**
     * Returns a {@link DStream} consisting of the results of applying the given
     * function on the entire {@link Stream}, which typically represents a single partition/split
     * handled by the currently executing task.<br>
     * Essentially this is a gateway to using the standard {@link Stream} API on a given piece of
     * data represented by this stream.<br>
     * Below is a variant of the rudimentary WordCount. Even though the API provides configuration
     * for an implicit map-side combine, here you can see how an explicit map-side combine
     * could be accomplished via the <i>compute</i> operation and the standard {@link Stream} API.
     * <pre>
     * DStream.ofType(String.class, "wc")
     *     .compute(stream -> stream
     *         .flatMap(line -> Stream.of(line.split("\\s+")))
     *         .collect(Collectors.toMap(word -> word, word -> 1, Integer::sum)).entrySet().stream()
     *     ).reduceValues(entry -> entry.getKey(), entry -> entry.getValue(), Integer::sum)
     *     .executeAs("WordCount");
     * </pre>
     * <br>
     * This is an <i>intermediate</i> operation.
     * <br>
     * This is a <i>composable-transformation</i> operation.
     *
     * @param <R> The element type of the returned {@link DStream}
     * @param computeFunction function to apply on the entire {@link Stream}.
     * @return new {@link DStream} of type R
     */
    <R> DStream<R> compute(SerFunction<? super Stream<A>, ? extends Stream<? extends R>> computeFunction);

    /**
     * Performs a reduction on the elements of this stream using the given
     * accumulation function, returning a {@link DStream} with a single value
     * of the same type as the source stream.<br>
     * <br>
     * This operation is a non-terminal equivalent of <i>Stream.reduce(BinaryOperator)</i>.<br>
     * <br>
     * This is an <i>intermediate</i> operation.
     * <br>
     * This is a <i>composable-transformation</i> operation.
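     * <br>
     * For example, a sketch of totaling a stream of integers; the element values are
     * illustrative only:
     * <pre>
     * DStream<Integer> numbers = ... // e.g., 3, 5, 4
     *
     * numbers.reduce(Integer::sum) - produces a DStream<Integer> holding the single value 12
     * </pre>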
     *
     * @param accumulator a function for combining two values
     * @return new {@link DStream} of the same type
     */
    DStream<A> reduce(SerBinaryOperator<A> accumulator);

    /**
     * Returns a {@link DStream} containing a single element which represents the
     * minimum element of this stream according to the provided {@code SerComparator}.<br>
     * <br>
     * This operation is a non-terminal equivalent of <i>Stream.min(Comparator)</i>.<br>
     * <br>
     * This is an <i>intermediate</i> operation.
     * <br>
     * This is a <i>composable-transformation</i> operation.
     *
     * @param comparator a stateless {@code SerComparator} to compare elements of this stream
     * @return new {@link DStream} of the same type
     */
    DStream<A> min(SerComparator<? super A> comparator);

    /**
     * Returns a {@link DStream} containing a single element which represents the
     * maximum element of this stream according to the provided {@code SerComparator}.<br>
     * <br>
     * This operation is a non-terminal equivalent of <i>Stream.max(Comparator)</i>.<br>
     * <br>
     * This is an <i>intermediate</i> operation.
     * <br>
     * This is a <i>composable-transformation</i> operation.
     *
     * @param comparator a stateless {@code SerComparator} to compare elements of this stream
     * @return new {@link DStream} of the same type
     */
    DStream<A> max(SerComparator<? super A> comparator);

    /**
     * Returns a stream consisting of the elements of this stream, sorted
     * according to the provided {@code SerComparator}.<br>
     * <br>
     * This operation is consistent with <i>Stream.sorted(Comparator)</i>.<br>
     * <br>
     * This is an <i>intermediate</i> operation.
     * <br>
     * This is a <i>composable-transformation</i> operation.
     *
     * @param comparator a stateless {@code SerComparator} to be used to compare stream elements
     * @return new {@link DStream} of the same type
     */
    DStream<A> sorted(SerComparator<? super A> comparator);

    /**
     * Returns a stream consisting of the elements of this stream, additionally
     * performing the action provided by the {@code SerConsumer} on each element as elements
     * are consumed from the resulting stream.<br>
     * <br>
     * This operation is consistent with <i>Stream.peek(Consumer)</i>.<br>
     * <br>
     * This is an <i>intermediate</i> operation.
     * <br>
     *
     * @param action a non-interfering action to perform on the elements as they
     *               are consumed from the stream
     * @return new {@link DStream} of the same type
     */
    DStream<A> peek(SerConsumer<? super A> action);

    /**
     * Returns a {@link DStream} of Key/Value pairs, where values mapped from the individual
     * elements of this stream are grouped on the given <i>groupClassifier</i> (e.g., key) and
     * reduced by the given <i>valueReducer</i>.<br>
     * <br>
     * This operation is a non-terminal equivalent of
     * <i>Stream.collect(Collectors.toMap(Function, Function, BinaryOperator))</i>.<br>
     * <br>
     * This is an <i>intermediate</i> operation.
     * <br>
     * This is a <i>shuffle</i> operation.
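     * <br>
     * For example, a sketch of the rudimentary WordCount expressed through <i>reduceValues</i>,
     * shown next to its {@link #aggregateValues(SerFunction, SerFunction)} counterpart, which keeps
     * the mapped values instead of reducing them (the source name "wc" follows the examples above):
     * <pre>
     * DStream<String> words = DStream.ofType(String.class, "wc")
     *     .flatMap(line -> Stream.of(line.split("\\s+")));
     *
     * words.reduceValues(word -> word, word -> 1, Integer::sum) - produces Entry<String,Integer> (word counts)
     * words.aggregateValues(word -> word, word -> 1)            - produces Entry<String,List<Integer>> (grouped 1s)
     * </pre>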
     *
     * @param <K> The element type of the key
     * @param <V> The element type of the value
     * @param groupClassifier a mapping function to produce keys
     * @param valueMapper a mapping function to produce values
     * @param valueReducer a reduce function, used to resolve collisions between
     *                     values associated with the same key.
     * @return new {@link DStream} of Key/Value pairs represented as {@link Entry}<K,V>
     */
    <K,V> DStream<Entry<K,V>> reduceValues(SerFunction<? super A, ? extends K> groupClassifier,
            SerFunction<? super A, ? extends V> valueMapper, SerBinaryOperator<V> valueReducer);

    /**
     * Returns a {@link DStream} of Key/Value pairs, where values mapped from the individual
     * elements of this stream are grouped on the given <i>groupClassifier</i> (e.g., key) and
     * aggregated into a {@link List}.<br>
     * <br>
     * This is an <i>intermediate</i> operation.
     * <br>
     * This is a <i>shuffle</i> operation.
     *
     * @param <K> The element type of the key
     * @param <V> The element type of the value
     * @param groupClassifier a mapping function to produce keys
     * @param valueMapper a mapping function to produce values
     * @return new {@link DStream} of Key/Value pairs represented as {@link Entry}<K,List<V>>
     */
    <K,V> DStream<Entry<K,List<V>>> aggregateValues(SerFunction<? super A, ? extends K> groupClassifier,
            SerFunction<? super A, ? extends V> valueMapper);
}