/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.kafka.streams.processor;

import org.apache.kafka.common.serialization.Deserializer;
import org.apache.kafka.common.serialization.Serializer;
import org.apache.kafka.common.utils.Utils;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.errors.TopologyBuilderException;
import org.apache.kafka.streams.kstream.KStream;
import org.apache.kafka.streams.kstream.KTable;
import org.apache.kafka.streams.processor.internals.InternalTopicConfig;
import org.apache.kafka.streams.processor.internals.ProcessorNode;
import org.apache.kafka.streams.processor.internals.ProcessorStateManager;
import org.apache.kafka.streams.processor.internals.ProcessorTopology;
import org.apache.kafka.streams.processor.internals.QuickUnion;
import org.apache.kafka.streams.processor.internals.SinkNode;
import org.apache.kafka.streams.processor.internals.SourceNode;
import org.apache.kafka.streams.processor.internals.StreamPartitionAssignor.SubscriptionUpdates;
import org.apache.kafka.streams.state.KeyValueStore;
import org.apache.kafka.streams.state.internals.WindowStoreSupplier;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * A component that is used to build a {@link ProcessorTopology}. A topology contains an acyclic graph of sources, processors,
 * and sinks. A {@link SourceNode source} is a node in the graph that consumes one or more Kafka topics and forwards their
 * records to its child nodes. A {@link Processor processor} is a node in the graph that receives input records from upstream
 * nodes, processes those records, and optionally forwards new records to one or all of its children. Finally, a {@link SinkNode sink}
 * is a node in the graph that receives records from upstream nodes and writes them to a Kafka topic. This builder allows you
 * to construct an acyclic graph of these nodes, and the builder is then passed into a new {@link org.apache.kafka.streams.KafkaStreams}
 * instance that will {@link org.apache.kafka.streams.KafkaStreams#start() begin consuming, processing, and producing records}.
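 * <p>
 * For example, a minimal topology with one source, one processor, and one sink could be assembled as follows
 * (a sketch: the topic names and {@code MyProcessorSupplier} are hypothetical placeholders):
 * <pre>{@code
 * TopologyBuilder builder = new TopologyBuilder();
 * builder.addSource("Source", "input-topic")
 *        .addProcessor("Process", new MyProcessorSupplier(), "Source")
 *        .addSink("Sink", "output-topic", "Process");
 * KafkaStreams streams = new KafkaStreams(builder, config); // config: the application's StreamsConfig
 * streams.start();
 * }</pre>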
*/ public class TopologyBuilder { private static final Logger log = LoggerFactory.getLogger(TopologyBuilder.class); private static final Pattern EMPTY_ZERO_LENGTH_PATTERN = Pattern.compile(""); // node factories in a topological order private final LinkedHashMap<String, NodeFactory> nodeFactories = new LinkedHashMap<>(); // state factories private final Map<String, StateStoreFactory> stateFactories = new HashMap<>(); // global state factories private final Map<String, StateStore> globalStateStores = new LinkedHashMap<>(); // all topics subscribed from source processors (without application-id prefix for internal topics) private final Set<String> sourceTopicNames = new HashSet<>(); // all internal topics auto-created by the topology builder and used in source / sink processors private final Set<String> internalTopicNames = new HashSet<>(); // groups of source processors that need to be copartitioned private final List<Set<String>> copartitionSourceGroups = new ArrayList<>(); // map from source processor names to subscribed topics (without application-id prefix for internal topics) private final HashMap<String, List<String>> nodeToSourceTopics = new HashMap<>(); // map from source processor names to regex subscription patterns private final HashMap<String, Pattern> nodeToSourcePatterns = new LinkedHashMap<>(); // map from sink processor names to subscribed topic (without application-id prefix for internal topics) private final HashMap<String, String> nodeToSinkTopic = new HashMap<>(); // map from topics to their matched regex patterns, this is to ensure one topic is passed through on source node // even if it can be matched by multiple regex patterns private final HashMap<String, Pattern> topicToPatterns = new HashMap<>(); // map from state store names to all the topics subscribed from source processors that // are connected to these state stores private final Map<String, Set<String>> stateStoreNameToSourceTopics = new HashMap<>(); // map from state store names to all the regex subscribed topics from source processors that // are connected to these state stores private final Map<String, Set<Pattern>> stateStoreNameToSourceRegex = new HashMap<>(); // map from state store names to this state store's corresponding changelog topic if possible, // this is used in the extended KStreamBuilder. 
    private final Map<String, String> storeToChangelogTopic = new HashMap<>();

    // all global topics
    private final Set<String> globalTopics = new HashSet<>();

    private final Set<String> earliestResetTopics = new HashSet<>();
    private final Set<String> latestResetTopics = new HashSet<>();
    private final Set<Pattern> earliestResetPatterns = new HashSet<>();
    private final Set<Pattern> latestResetPatterns = new HashSet<>();
    private final QuickUnion<String> nodeGrouper = new QuickUnion<>();

    private SubscriptionUpdates subscriptionUpdates = new SubscriptionUpdates();
    private String applicationId = null;
    private Pattern topicPattern = null;
    private Map<Integer, Set<String>> nodeGroups = null;

    private static class StateStoreFactory {
        public final Set<String> users;
        public final StateStoreSupplier supplier;

        StateStoreFactory(StateStoreSupplier supplier) {
            this.supplier = supplier;
            this.users = new HashSet<>();
        }
    }

    private abstract static class NodeFactory {
        public final String name;

        NodeFactory(String name) {
            this.name = name;
        }

        public abstract ProcessorNode build();
    }

    private static class ProcessorNodeFactory extends NodeFactory {
        private final String[] parents;
        private final ProcessorSupplier<?, ?> supplier;
        private final Set<String> stateStoreNames = new HashSet<>();

        ProcessorNodeFactory(String name, String[] parents, ProcessorSupplier<?, ?> supplier) {
            super(name);
            this.parents = parents.clone();
            this.supplier = supplier;
        }

        public void addStateStore(String stateStoreName) {
            stateStoreNames.add(stateStoreName);
        }

        @Override
        public ProcessorNode build() {
            return new ProcessorNode<>(name, supplier.get(), stateStoreNames);
        }
    }

    private class SourceNodeFactory extends NodeFactory {
        private final List<String> topics;
        private final Pattern pattern;
        private final Deserializer<?> keyDeserializer;
        private final Deserializer<?> valDeserializer;
        private final TimestampExtractor timestampExtractor;

        private SourceNodeFactory(final String name,
                                  final String[] topics,
                                  final Pattern pattern,
                                  final TimestampExtractor timestampExtractor,
                                  final Deserializer<?> keyDeserializer,
                                  final Deserializer<?> valDeserializer) {
            super(name);
            this.topics = topics != null ? Arrays.asList(topics) : new ArrayList<String>();
            this.pattern = pattern;
            this.keyDeserializer = keyDeserializer;
            this.valDeserializer = valDeserializer;
            this.timestampExtractor = timestampExtractor;
        }

        List<String> getTopics(Collection<String> subscribedTopics) {
            // if the source is subscribed via a pattern, it is possible that the topic metadata has not been updated
            // yet and hence the map from source node to topics is stale; in this case we use the pattern itself as a placeholder.
            // This should only happen while debugging, since at runtime this function should always be called after the metadata has been updated.
            if (subscribedTopics.isEmpty())
                return Collections.singletonList(String.valueOf(pattern));

            List<String> matchedTopics = new ArrayList<>();
            for (String update : subscribedTopics) {
                if (this.pattern == topicToPatterns.get(update)) {
                    matchedTopics.add(update);
                } else if (topicToPatterns.containsKey(update) && isMatch(update)) {
                    // the same topic cannot be matched to more than one pattern
                    // TODO: we should lift this requirement in the future
                    throw new TopologyBuilderException("Topic " + update + " is already matched to another regex pattern " + topicToPatterns.get(update) + " and hence cannot be matched to this regex pattern " + pattern + " any more.");
                } else if (isMatch(update)) {
                    topicToPatterns.put(update, this.pattern);
                    matchedTopics.add(update);
                }
            }
            return matchedTopics;
        }

        @Override
        public ProcessorNode build() {
            final List<String> sourceTopics = nodeToSourceTopics.get(name);
            // if the source is subscribed via a pattern, it is possible that the topic metadata has not been updated
            // yet and hence the map from source node to topics is stale; in this case we use the pattern itself as a placeholder.
            // This should only happen while debugging, since at runtime this function should always be called after the metadata has been updated.
            if (sourceTopics == null)
                return new SourceNode<>(name, Collections.singletonList(String.valueOf(pattern)), timestampExtractor, keyDeserializer, valDeserializer);
            else
                return new SourceNode<>(name, maybeDecorateInternalSourceTopics(sourceTopics), timestampExtractor, keyDeserializer, valDeserializer);
        }

        private boolean isMatch(String topic) {
            return this.pattern.matcher(topic).matches();
        }
    }

    private class SinkNodeFactory<K, V> extends NodeFactory {
        private final String[] parents;
        private final String topic;
        private final Serializer<K> keySerializer;
        private final Serializer<V> valSerializer;
        private final StreamPartitioner<? super K, ? super V> partitioner;

        private SinkNodeFactory(String name, String[] parents, String topic, Serializer<K> keySerializer, Serializer<V> valSerializer, StreamPartitioner<? super K, ? super V> partitioner) {
            super(name);
            this.parents = parents.clone();
            this.topic = topic;
            this.keySerializer = keySerializer;
            this.valSerializer = valSerializer;
            this.partitioner = partitioner;
        }

        @Override
        public ProcessorNode build() {
            if (internalTopicNames.contains(topic)) {
                // prefix the internal topic name with the application id
                return new SinkNode<>(name, decorateTopic(topic), keySerializer, valSerializer, partitioner);
            } else {
                return new SinkNode<>(name, topic, keySerializer, valSerializer, partitioner);
            }
        }
    }

    public static class TopicsInfo {
        public Set<String> sinkTopics;
        public Set<String> sourceTopics;
        public Map<String, InternalTopicConfig> stateChangelogTopics;
        public Map<String, InternalTopicConfig> repartitionSourceTopics;

        TopicsInfo(Set<String> sinkTopics, Set<String> sourceTopics, Map<String, InternalTopicConfig> repartitionSourceTopics, Map<String, InternalTopicConfig> stateChangelogTopics) {
            this.sinkTopics = sinkTopics;
            this.sourceTopics = sourceTopics;
            this.stateChangelogTopics = stateChangelogTopics;
            this.repartitionSourceTopics = repartitionSourceTopics;
        }

        @Override
        public boolean equals(Object o) {
            if (o instanceof TopicsInfo) {
                TopicsInfo other = (TopicsInfo) o;
                return other.sourceTopics.equals(this.sourceTopics) && other.stateChangelogTopics.equals(this.stateChangelogTopics);
            } else {
                return false;
            }
        }

        @Override
        public int hashCode() {
            // mask the lower word so a negative hashCode cannot sign-extend over the sourceTopics bits
            long n = ((long) sourceTopics.hashCode() << 32) | (stateChangelogTopics.hashCode() & 0xFFFFFFFFL);
            return (int) (n % 0xFFFFFFFFL);
        }

        @Override
        public String toString() {
            return "TopicsInfo{" +
                    "sinkTopics=" + sinkTopics +
                    ", sourceTopics=" + sourceTopics +
                    ", repartitionSourceTopics=" + repartitionSourceTopics +
                    ", stateChangelogTopics=" + stateChangelogTopics +
                    '}';
        }
    }

    /**
     * Enum used to define auto offset reset policy when creating {@link KStream} or {@link KTable}
     */
    public enum AutoOffsetReset {
        EARLIEST, LATEST
    }

    /**
     * Create a new builder.
     */
    public TopologyBuilder() {}

    /**
     * Set the applicationId to be used for auto-generated internal topics.
     *
     * This is required before calling {@link #topicGroups}, {@link #copartitionSources},
     * {@link #stateStoreNameToSourceTopics} and {@link #build(Integer)}.
     *
     * @param applicationId the streams applicationId. Should be the same as set by
     * {@link org.apache.kafka.streams.StreamsConfig#APPLICATION_ID_CONFIG}
     */
    public synchronized final TopologyBuilder setApplicationId(final String applicationId) {
        Objects.requireNonNull(applicationId, "applicationId can't be null");
        this.applicationId = applicationId;
        return this;
    }

    /**
     * Add a new source that consumes the named topics and forwards the records to child processor and/or sink nodes.
     * The source will use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key deserializer} and
     * {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value deserializer} specified in the
     * {@link org.apache.kafka.streams.StreamsConfig stream configuration}.
     * The default {@link TimestampExtractor} as specified in the {@link StreamsConfig config} is used.
     *
     * @param name the unique name of the source used to reference this node when
     * {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}.
     * @param topics the name of one or more Kafka topics that this source is to consume
     * @return this builder instance so methods can be chained together; never null
     */
    public synchronized final TopologyBuilder addSource(final String name, final String... topics) {
        return addSource(null, name, null, null, null, topics);
    }

    /**
     * Add a new source that consumes the named topics and forwards the records to child processor and/or sink nodes.
     * The source will use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key deserializer} and
     * {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value deserializer} specified in the
     * {@link org.apache.kafka.streams.StreamsConfig stream configuration}.
     * The default {@link TimestampExtractor} as specified in the {@link StreamsConfig config} is used.
     *
     * @param offsetReset the auto offset reset policy to use for this source if no committed offsets found; acceptable values are earliest or latest
     * @param name the unique name of the source used to reference this node when
     * {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}.
     * @param topics the name of one or more Kafka topics that this source is to consume
     * @return this builder instance so methods can be chained together; never null
     */
    public synchronized final TopologyBuilder addSource(final AutoOffsetReset offsetReset, final String name, final String... topics) {
        return addSource(offsetReset, name, null, null, null, topics);
    }

    /**
     * Add a new source that consumes the named topics and forwards the records to child processor and/or sink nodes.
     * The source will use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key deserializer} and
     * {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value deserializer} specified in the
     * {@link org.apache.kafka.streams.StreamsConfig stream configuration}.
     *
     * @param timestampExtractor the stateless timestamp extractor used for this source,
     * if not specified the default extractor defined in the configs will be used
     * @param name the unique name of the source used to reference this node when
     * {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}.
     * @param topics the name of one or more Kafka topics that this source is to consume
     * @return this builder instance so methods can be chained together; never null
     */
    public synchronized final TopologyBuilder addSource(final TimestampExtractor timestampExtractor, final String name, final String... topics) {
        return addSource(null, name, timestampExtractor, null, null, topics);
    }

    /**
     * Add a new source that consumes the named topics and forwards the records to child processor and/or sink nodes.
     * The source will use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key deserializer} and
     * {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value deserializer} specified in the
     * {@link org.apache.kafka.streams.StreamsConfig stream configuration}.
     *
     * @param offsetReset the auto offset reset policy to use for this source if no committed offsets found;
     * acceptable values are earliest or latest
     * @param timestampExtractor the stateless timestamp extractor used for this source,
     * if not specified the default extractor defined in the configs will be used
     * @param name the unique name of the source used to reference this node when
     * {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}.
     * @param topics the name of one or more Kafka topics that this source is to consume
     * @return this builder instance so methods can be chained together; never null
     */
    public synchronized final TopologyBuilder addSource(final AutoOffsetReset offsetReset, final TimestampExtractor timestampExtractor, final String name, final String... topics) {
        return addSource(offsetReset, name, timestampExtractor, null, null, topics);
    }

    /**
     * Add a new source that consumes from topics matching the given pattern
     * and forwards the records to child processor and/or sink nodes.
     * The source will use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key deserializer} and
     * {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value deserializer} specified in the
     * {@link org.apache.kafka.streams.StreamsConfig stream configuration}.
     * The default {@link TimestampExtractor} as specified in the {@link StreamsConfig config} is used.
     *
     * @param name the unique name of the source used to reference this node when
     * {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}.
     * @param topicPattern regular expression pattern to match Kafka topics that this source is to consume
     * @return this builder instance so methods can be chained together; never null
     */
    public synchronized final TopologyBuilder addSource(final String name, final Pattern topicPattern) {
        return addSource(null, name, null, null, null, topicPattern);
    }

    /**
     * Add a new source that consumes from topics matching the given pattern
     * and forwards the records to child processor and/or sink nodes.
     * The source will use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key deserializer} and
     * {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value deserializer} specified in the
     * {@link org.apache.kafka.streams.StreamsConfig stream configuration}.
     * The default {@link TimestampExtractor} as specified in the {@link StreamsConfig config} is used.
     *
     * @param offsetReset the auto offset reset policy value for this source if no committed offsets found; acceptable values are earliest or latest.
     * @param name the unique name of the source used to reference this node when
     * {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}.
     * @param topicPattern regular expression pattern to match Kafka topics that this source is to consume
     * @return this builder instance so methods can be chained together; never null
     */
    public synchronized final TopologyBuilder addSource(final AutoOffsetReset offsetReset, final String name, final Pattern topicPattern) {
        return addSource(offsetReset, name, null, null, null, topicPattern);
    }

    /**
     * Add a new source that consumes from topics matching the given pattern
     * and forwards the records to child processor and/or sink nodes.
     * The source will use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key deserializer} and
     * {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value deserializer} specified in the
     * {@link org.apache.kafka.streams.StreamsConfig stream configuration}.
     *
     * @param timestampExtractor the stateless timestamp extractor used for this source,
     * if not specified the default extractor defined in the configs will be used
     * @param name the unique name of the source used to reference this node when
     * {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}.
     * @param topicPattern regular expression pattern to match Kafka topics that this source is to consume
     * @return this builder instance so methods can be chained together; never null
     */
    public synchronized final TopologyBuilder addSource(final TimestampExtractor timestampExtractor, final String name, final Pattern topicPattern) {
        return addSource(null, name, timestampExtractor, null, null, topicPattern);
    }

    /**
     * Add a new source that consumes from topics matching the given pattern
     * and forwards the records to child processor and/or sink nodes.
     * The source will use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key deserializer} and
     * {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value deserializer} specified in the
     * {@link org.apache.kafka.streams.StreamsConfig stream configuration}.
     *
     * @param offsetReset the auto offset reset policy value for this source if no committed offsets found;
     * acceptable values are earliest or latest.
     * @param timestampExtractor the stateless timestamp extractor used for this source,
     * if not specified the default extractor defined in the configs will be used
     * @param name the unique name of the source used to reference this node when
     * {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}.
     * @param topicPattern regular expression pattern to match Kafka topics that this source is to consume
     * @return this builder instance so methods can be chained together; never null
     */
    public synchronized final TopologyBuilder addSource(final AutoOffsetReset offsetReset, final TimestampExtractor timestampExtractor, final String name, final Pattern topicPattern) {
        return addSource(offsetReset, name, timestampExtractor, null, null, topicPattern);
    }

    /**
     * Add a new source that consumes the named topics and forwards the records to child processor and/or sink nodes.
     * The source will use the specified key and value deserializers.
     * The default {@link TimestampExtractor} as specified in the {@link StreamsConfig config} is used.
     *
     * @param name the unique name of the source used to reference this node when
     * {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}
     * @param keyDeserializer key deserializer used to read this source, if not specified the default
     * key deserializer defined in the configs will be used
     * @param valDeserializer value deserializer used to read this source,
     * if not specified the default value deserializer defined in the configs will be used
     * @param topics the name of one or more Kafka topics that this source is to consume
     * @return this builder instance so methods can be chained together; never null
     * @throws TopologyBuilderException if processor is already added or if topics have already been registered by another source
     */
    public synchronized final TopologyBuilder addSource(final String name, final Deserializer keyDeserializer, final Deserializer valDeserializer, final String... topics) {
        return addSource(null, name, null, keyDeserializer, valDeserializer, topics);
    }

    /**
     * Add a new source that consumes the named topics and forwards the records to child processor and/or sink nodes.
     * The source will use the specified key and value deserializers.
     *
     * @param offsetReset the auto offset reset policy to use for this stream if no committed offsets found;
     * acceptable values are earliest or latest.
* @param name the unique name of the source used to reference this node when * {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}. * @param timestampExtractor the stateless timestamp extractor used for this source, * if not specified the default extractor defined in the configs will be used * @param keyDeserializer key deserializer used to read this source, if not specified the default * key deserializer defined in the configs will be used * @param valDeserializer value deserializer used to read this source, * if not specified the default value deserializer defined in the configs will be used * @param topics the name of one or more Kafka topics that this source is to consume * @return this builder instance so methods can be chained together; never null * @throws TopologyBuilderException if processor is already added or if topics have already been registered by another source */ public synchronized final TopologyBuilder addSource(final AutoOffsetReset offsetReset, final String name, final TimestampExtractor timestampExtractor, final Deserializer keyDeserializer, final Deserializer valDeserializer, final String... topics) { if (topics.length == 0) { throw new TopologyBuilderException("You must provide at least one topic"); } Objects.requireNonNull(name, "name must not be null"); if (nodeFactories.containsKey(name)) throw new TopologyBuilderException("Processor " + name + " is already added."); for (String topic : topics) { Objects.requireNonNull(topic, "topic names cannot be null"); validateTopicNotAlreadyRegistered(topic); maybeAddToResetList(earliestResetTopics, latestResetTopics, offsetReset, topic); sourceTopicNames.add(topic); } nodeFactories.put(name, new SourceNodeFactory(name, topics, null, timestampExtractor, keyDeserializer, valDeserializer)); nodeToSourceTopics.put(name, Arrays.asList(topics)); nodeGrouper.add(name); return this; } /** * Adds a global {@link StateStore} to the topology. The {@link StateStore} sources its data * from all partitions of the provided input topic. There will be exactly one instance of this * {@link StateStore} per Kafka Streams instance. * <p> * A {@link SourceNode} with the provided sourceName will be added to consume the data arriving * from the partitions of the input topic. * <p> * The provided {@link ProcessorSupplier} will be used to create an {@link ProcessorNode} that will * receive all records forwarded from the {@link SourceNode}. This * {@link ProcessorNode} should be used to keep the {@link StateStore} up-to-date. * The default {@link TimestampExtractor} as specified in the {@link StreamsConfig config} is used. 
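     * <p>
     * A sketch of a typical call, assuming {@code globalStoreSupplier} is a {@link StateStoreSupplier} for a
     * {@link KeyValueStore} with logging disabled (required for global stores); the node names, topic, and
     * {@code GlobalStoreUpdater} below are hypothetical:
     * <pre>{@code
     * builder.addGlobalStore(globalStoreSupplier,
     *                        "GlobalSource",            // name of the auto-added source node
     *                        new StringDeserializer(),  // key deserializer for the source topic
     *                        new StringDeserializer(),  // value deserializer for the source topic
     *                        "global-topic",            // topic to load the store from
     *                        "GlobalProcessor",         // name of the processor that maintains the store
     *                        new GlobalStoreUpdater()); // hypothetical ProcessorSupplier that writes records into the store
     * }</pre>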
* * @param storeSupplier user defined state store supplier * @param sourceName name of the {@link SourceNode} that will be automatically added * @param keyDeserializer the {@link Deserializer} to deserialize keys with * @param valueDeserializer the {@link Deserializer} to deserialize values with * @param topic the topic to source the data from * @param processorName the name of the {@link ProcessorSupplier} * @param stateUpdateSupplier the instance of {@link ProcessorSupplier} * @return this builder instance so methods can be chained together; never null */ public synchronized TopologyBuilder addGlobalStore(final StateStoreSupplier<KeyValueStore> storeSupplier, final String sourceName, final Deserializer keyDeserializer, final Deserializer valueDeserializer, final String topic, final String processorName, final ProcessorSupplier stateUpdateSupplier) { return addGlobalStore(storeSupplier, sourceName, null, keyDeserializer, valueDeserializer, topic, processorName, stateUpdateSupplier); } /** * Adds a global {@link StateStore} to the topology. The {@link StateStore} sources its data * from all partitions of the provided input topic. There will be exactly one instance of this * {@link StateStore} per Kafka Streams instance. * <p> * A {@link SourceNode} with the provided sourceName will be added to consume the data arriving * from the partitions of the input topic. * <p> * The provided {@link ProcessorSupplier} will be used to create an {@link ProcessorNode} that will * receive all records forwarded from the {@link SourceNode}. This * {@link ProcessorNode} should be used to keep the {@link StateStore} up-to-date. * * @param storeSupplier user defined state store supplier * @param sourceName name of the {@link SourceNode} that will be automatically added * @param timestampExtractor the stateless timestamp extractor used for this source, * if not specified the default extractor defined in the configs will be used * @param keyDeserializer the {@link Deserializer} to deserialize keys with * @param valueDeserializer the {@link Deserializer} to deserialize values with * @param topic the topic to source the data from * @param processorName the name of the {@link ProcessorSupplier} * @param stateUpdateSupplier the instance of {@link ProcessorSupplier} * @return this builder instance so methods can be chained together; never null */ public synchronized TopologyBuilder addGlobalStore(final StateStoreSupplier<KeyValueStore> storeSupplier, final String sourceName, final TimestampExtractor timestampExtractor, final Deserializer keyDeserializer, final Deserializer valueDeserializer, final String topic, final String processorName, final ProcessorSupplier stateUpdateSupplier) { Objects.requireNonNull(storeSupplier, "store supplier must not be null"); Objects.requireNonNull(sourceName, "sourceName must not be null"); Objects.requireNonNull(topic, "topic must not be null"); Objects.requireNonNull(stateUpdateSupplier, "supplier must not be null"); Objects.requireNonNull(processorName, "processorName must not be null"); if (nodeFactories.containsKey(sourceName)) { throw new TopologyBuilderException("Processor " + sourceName + " is already added."); } if (nodeFactories.containsKey(processorName)) { throw new TopologyBuilderException("Processor " + processorName + " is already added."); } if (stateFactories.containsKey(storeSupplier.name()) || globalStateStores.containsKey(storeSupplier.name())) { throw new TopologyBuilderException("StateStore " + storeSupplier.name() + " is already added."); } if 
(storeSupplier.loggingEnabled()) { throw new TopologyBuilderException("StateStore " + storeSupplier.name() + " for global table must not have logging enabled."); } if (sourceName.equals(processorName)) { throw new TopologyBuilderException("sourceName and processorName must be different."); } validateTopicNotAlreadyRegistered(topic); globalTopics.add(topic); final String[] topics = {topic}; nodeFactories.put(sourceName, new SourceNodeFactory(sourceName, topics, null, timestampExtractor, keyDeserializer, valueDeserializer)); nodeToSourceTopics.put(sourceName, Arrays.asList(topics)); nodeGrouper.add(sourceName); final String[] parents = {sourceName}; final ProcessorNodeFactory nodeFactory = new ProcessorNodeFactory(processorName, parents, stateUpdateSupplier); nodeFactory.addStateStore(storeSupplier.name()); nodeFactories.put(processorName, nodeFactory); nodeGrouper.add(processorName); nodeGrouper.unite(processorName, parents); globalStateStores.put(storeSupplier.name(), storeSupplier.get()); connectSourceStoreAndTopic(storeSupplier.name(), topic); return this; } private void validateTopicNotAlreadyRegistered(final String topic) { if (sourceTopicNames.contains(topic) || globalTopics.contains(topic)) { throw new TopologyBuilderException("Topic " + topic + " has already been registered by another source."); } for (Pattern pattern : nodeToSourcePatterns.values()) { if (pattern.matcher(topic).matches()) { throw new TopologyBuilderException("Topic " + topic + " matches a Pattern already registered by another source."); } } } /** * Add a new source that consumes from topics matching the given pattern * and forwards the records to child processor and/or sink nodes. * The source will use the specified key and value deserializers. The provided * de-/serializers will be used for all matched topics, so care should be taken to specify patterns for * topics that share the same key-value data format. * The default {@link TimestampExtractor} as specified in the {@link StreamsConfig config} is used. * * @param name the unique name of the source used to reference this node when * {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children} * @param keyDeserializer key deserializer used to read this source, if not specified the default * key deserializer defined in the configs will be used * @param valDeserializer value deserializer used to read this source, * if not specified the default value deserializer defined in the configs will be used * @param topicPattern regular expression pattern to match Kafka topics that this source is to consume * @return this builder instance so methods can be chained together; never null * @throws TopologyBuilderException if processor is already added or if topics have already been registered by name */ public synchronized final TopologyBuilder addSource(final String name, final Deserializer keyDeserializer, final Deserializer valDeserializer, final Pattern topicPattern) { return addSource(null, name, null, keyDeserializer, valDeserializer, topicPattern); } /** * Add a new source that consumes from topics matching the given pattern * and forwards the records to child processor and/or sink nodes. * The source will use the specified key and value deserializers. The provided * de-/serializers will be used for all matched topics, so care should be taken to specify patterns for * topics that share the same key-value data format. 
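     * <p>
     * For instance, a sketch that subscribes to every topic matching {@code "metrics-.*"}, assuming all matched
     * topics carry String keys and values (the names below are hypothetical):
     * <pre>{@code
     * builder.addSource(AutoOffsetReset.EARLIEST,
     *                   "MetricsSource",
     *                   null,                       // null: use the default timestamp extractor
     *                   new StringDeserializer(),
     *                   new StringDeserializer(),
     *                   Pattern.compile("metrics-.*"));
     * }</pre>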
* * @param offsetReset the auto offset reset policy to use for this stream if no committed offsets found; * acceptable values are earliest or latest * @param name the unique name of the source used to reference this node when * {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}. * @param timestampExtractor the stateless timestamp extractor used for this source, * if not specified the default extractor defined in the configs will be used * @param keyDeserializer key deserializer used to read this source, if not specified the default * key deserializer defined in the configs will be used * @param valDeserializer value deserializer used to read this source, * if not specified the default value deserializer defined in the configs will be used * @param topicPattern regular expression pattern to match Kafka topics that this source is to consume * @return this builder instance so methods can be chained together; never null * @throws TopologyBuilderException if processor is already added or if topics have already been registered by name */ public synchronized final TopologyBuilder addSource(final AutoOffsetReset offsetReset, final String name, final TimestampExtractor timestampExtractor, final Deserializer keyDeserializer, final Deserializer valDeserializer, final Pattern topicPattern) { Objects.requireNonNull(topicPattern, "topicPattern can't be null"); Objects.requireNonNull(name, "name can't be null"); if (nodeFactories.containsKey(name)) { throw new TopologyBuilderException("Processor " + name + " is already added."); } for (String sourceTopicName : sourceTopicNames) { if (topicPattern.matcher(sourceTopicName).matches()) { throw new TopologyBuilderException("Pattern " + topicPattern + " will match a topic that has already been registered by another source."); } } maybeAddToResetList(earliestResetPatterns, latestResetPatterns, offsetReset, topicPattern); nodeFactories.put(name, new SourceNodeFactory(name, null, topicPattern, timestampExtractor, keyDeserializer, valDeserializer)); nodeToSourcePatterns.put(name, topicPattern); nodeGrouper.add(name); return this; } /** * Add a new source that consumes from topics matching the given pattern * and forwards the records to child processor and/or sink nodes. * The source will use the specified key and value deserializers. The provided * de-/serializers will be used for all matched topics, so care should be taken to specify patterns for * topics that share the same key-value data format. * * @param offsetReset the auto offset reset policy to use for this stream if no committed offsets found; * acceptable values are earliest or latest * @param name the unique name of the source used to reference this node when * {@link #addProcessor(String, ProcessorSupplier, String...) 
adding processor children}
     * @param keyDeserializer key deserializer used to read this source, if not specified the default
     * key deserializer defined in the configs will be used
     * @param valDeserializer value deserializer used to read this source,
     * if not specified the default value deserializer defined in the configs will be used
     * @param topicPattern regular expression pattern to match Kafka topics that this source is to consume
     * @return this builder instance so methods can be chained together; never null
     * @throws TopologyBuilderException if processor is already added or if topics have already been registered by name
     */
    public synchronized final TopologyBuilder addSource(final AutoOffsetReset offsetReset, final String name, final Deserializer keyDeserializer, final Deserializer valDeserializer, final Pattern topicPattern) {
        return addSource(offsetReset, name, null, keyDeserializer, valDeserializer, topicPattern);
    }

    /**
     * Add a new sink that forwards records from upstream parent processor and/or source nodes to the named Kafka topic.
     * The sink will use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key serializer} and
     * {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value serializer} specified in the
     * {@link org.apache.kafka.streams.StreamsConfig stream configuration}.
     *
     * @param name the unique name of the sink
     * @param topic the name of the Kafka topic to which this sink should write its records
     * @param parentNames the name of one or more source or processor nodes whose output records this sink should consume
     * and write to its topic
     * @return this builder instance so methods can be chained together; never null
     * @see #addSink(String, String, StreamPartitioner, String...)
     * @see #addSink(String, String, Serializer, Serializer, String...)
     * @see #addSink(String, String, Serializer, Serializer, StreamPartitioner, String...)
     */
    public synchronized final TopologyBuilder addSink(final String name, final String topic, final String... parentNames) {
        return addSink(name, topic, null, null, parentNames);
    }

    /**
     * Add a new sink that forwards records from upstream parent processor and/or source nodes to the named Kafka topic, using
     * the supplied partitioner.
     * The sink will use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key serializer} and
     * {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value serializer} specified in the
     * {@link org.apache.kafka.streams.StreamsConfig stream configuration}.
     * <p>
     * The sink will also use the specified {@link StreamPartitioner} to determine how records are distributed among
     * the named Kafka topic's partitions. Such control is often useful with topologies that use
     * {@link #addStateStore(StateStoreSupplier, String...) state stores}
     * in their processors. In most other cases, however, a partitioner need not be specified and Kafka will automatically distribute
     * records among partitions using its default partitioning logic.
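     * <p>
     * A minimal sketch of a custom partitioner, assuming {@code String} keys (the node and topic names below are
     * hypothetical):
     * <pre>{@code
     * StreamPartitioner<String, byte[]> partitioner = new StreamPartitioner<String, byte[]>() {
     *     public Integer partition(String key, byte[] value, int numPartitions) {
     *         // mask the sign bit so the result is always a valid partition index
     *         return (key.hashCode() & 0x7fffffff) % numPartitions;
     *     }
     * };
     * builder.addSink("Sink", "output-topic", partitioner, "Process");
     * }</pre>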
* * @param name the unique name of the sink * @param topic the name of the Kafka topic to which this sink should write its records * @param partitioner the function that should be used to determine the partition for each record processed by the sink * @param parentNames the name of one or more source or processor nodes whose output records this sink should consume * and write to its topic * @return this builder instance so methods can be chained together; never null * @see #addSink(String, String, String...) * @see #addSink(String, String, Serializer, Serializer, String...) * @see #addSink(String, String, Serializer, Serializer, StreamPartitioner, String...) */ public synchronized final TopologyBuilder addSink(final String name, final String topic, final StreamPartitioner partitioner, final String... parentNames) { return addSink(name, topic, null, null, partitioner, parentNames); } /** * Add a new sink that forwards records from upstream parent processor and/or source nodes to the named Kafka topic. * The sink will use the specified key and value serializers. * * @param name the unique name of the sink * @param topic the name of the Kafka topic to which this sink should write its records * @param keySerializer the {@link Serializer key serializer} used when consuming records; may be null if the sink * should use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key serializer} specified in the * {@link org.apache.kafka.streams.StreamsConfig stream configuration} * @param valSerializer the {@link Serializer value serializer} used when consuming records; may be null if the sink * should use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value serializer} specified in the * {@link org.apache.kafka.streams.StreamsConfig stream configuration} * @param parentNames the name of one or more source or processor nodes whose output records this sink should consume * and write to its topic * @return this builder instance so methods can be chained together; never null * @see #addSink(String, String, String...) * @see #addSink(String, String, StreamPartitioner, String...) * @see #addSink(String, String, Serializer, Serializer, StreamPartitioner, String...) */ public synchronized final TopologyBuilder addSink(final String name, final String topic, final Serializer keySerializer, final Serializer valSerializer, final String... parentNames) { return addSink(name, topic, keySerializer, valSerializer, null, parentNames); } /** * Add a new sink that forwards records from upstream parent processor and/or source nodes to the named Kafka topic. * The sink will use the specified key and value serializers, and the supplied partitioner. 
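     * <p>
     * For example (a sketch with hypothetical names, where {@code partitioner} is a {@link StreamPartitioner}
     * like the one sketched above):
     * <pre>{@code
     * builder.addSink("Sink", "output-topic",
     *                 new StringSerializer(),     // key serializer
     *                 new ByteArraySerializer(),  // value serializer
     *                 partitioner,
     *                 "Process");
     * }</pre>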
* * @param name the unique name of the sink * @param topic the name of the Kafka topic to which this sink should write its records * @param keySerializer the {@link Serializer key serializer} used when consuming records; may be null if the sink * should use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key serializer} specified in the * {@link org.apache.kafka.streams.StreamsConfig stream configuration} * @param valSerializer the {@link Serializer value serializer} used when consuming records; may be null if the sink * should use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value serializer} specified in the * {@link org.apache.kafka.streams.StreamsConfig stream configuration} * @param partitioner the function that should be used to determine the partition for each record processed by the sink * @param parentNames the name of one or more source or processor nodes whose output records this sink should consume * and write to its topic * @return this builder instance so methods can be chained together; never null * @see #addSink(String, String, String...) * @see #addSink(String, String, StreamPartitioner, String...) * @see #addSink(String, String, Serializer, Serializer, String...) * @throws TopologyBuilderException if parent processor is not added yet, or if this processor's name is equal to the parent's name */ public synchronized final <K, V> TopologyBuilder addSink(final String name, final String topic, final Serializer<K> keySerializer, final Serializer<V> valSerializer, final StreamPartitioner<? super K, ? super V> partitioner, final String... parentNames) { Objects.requireNonNull(name, "name must not be null"); Objects.requireNonNull(topic, "topic must not be null"); if (nodeFactories.containsKey(name)) throw new TopologyBuilderException("Processor " + name + " is already added."); for (final String parent : parentNames) { if (parent.equals(name)) { throw new TopologyBuilderException("Processor " + name + " cannot be a parent of itself."); } if (!nodeFactories.containsKey(parent)) { throw new TopologyBuilderException("Parent processor " + parent + " is not added yet."); } } nodeFactories.put(name, new SinkNodeFactory<>(name, parentNames, topic, keySerializer, valSerializer, partitioner)); nodeToSinkTopic.put(name, topic); nodeGrouper.add(name); nodeGrouper.unite(name, parentNames); return this; } /** * Add a new processor node that receives and processes records output by one or more parent source or processor node. * Any new record output by this processor will be forwarded to its child processor or sink nodes. * @param name the unique name of the processor node * @param supplier the supplier used to obtain this node's {@link Processor} instance * @param parentNames the name of one or more source or processor nodes whose output records this processor should receive * and process * @return this builder instance so methods can be chained together; never null * @throws TopologyBuilderException if parent processor is not added yet, or if this processor's name is equal to the parent's name */ public synchronized final TopologyBuilder addProcessor(final String name, final ProcessorSupplier supplier, final String... 
parentNames) {
        Objects.requireNonNull(name, "name must not be null");
        Objects.requireNonNull(supplier, "supplier must not be null");
        if (nodeFactories.containsKey(name))
            throw new TopologyBuilderException("Processor " + name + " is already added.");
        for (final String parent : parentNames) {
            if (parent.equals(name)) {
                throw new TopologyBuilderException("Processor " + name + " cannot be a parent of itself.");
            }
            if (!nodeFactories.containsKey(parent)) {
                throw new TopologyBuilderException("Parent processor " + parent + " is not added yet.");
            }
        }
        nodeFactories.put(name, new ProcessorNodeFactory(name, parentNames, supplier));
        nodeGrouper.add(name);
        nodeGrouper.unite(name, parentNames);
        return this;
    }

    /**
     * Adds a state store.
     *
     * @param supplier the supplier used to obtain this state store's {@link StateStore} instance
     * @param processorNames the names of the processors that should be able to access the provided store
     * @return this builder instance so methods can be chained together; never null
     * @throws TopologyBuilderException if state store supplier is already added
     */
    public synchronized final TopologyBuilder addStateStore(final StateStoreSupplier supplier, final String... processorNames) {
        Objects.requireNonNull(supplier, "supplier can't be null");
        if (stateFactories.containsKey(supplier.name())) {
            throw new TopologyBuilderException("StateStore " + supplier.name() + " is already added.");
        }
        stateFactories.put(supplier.name(), new StateStoreFactory(supplier));
        if (processorNames != null) {
            for (String processorName : processorNames) {
                connectProcessorAndStateStore(processorName, supplier.name());
            }
        }
        return this;
    }

    /**
     * Connects the processor and the state stores.
     *
     * @param processorName the name of the processor
     * @param stateStoreNames the names of state stores that the processor uses
     * @return this builder instance so methods can be chained together; never null
     */
    public synchronized final TopologyBuilder connectProcessorAndStateStores(final String processorName, final String... stateStoreNames) {
        Objects.requireNonNull(processorName, "processorName can't be null");
        if (stateStoreNames != null) {
            for (String stateStoreName : stateStoreNames) {
                connectProcessorAndStateStore(processorName, stateStoreName);
            }
        }
        return this;
    }

    /**
     * This is used only for KStreamBuilder: when adding a KTable from a source topic,
     * we need to add the topic as the KTable's materialized state store's changelog.
     */
    protected synchronized final TopologyBuilder connectSourceStoreAndTopic(final String sourceStoreName, final String topic) {
        if (storeToChangelogTopic.containsKey(sourceStoreName)) {
            throw new TopologyBuilderException("Source store " + sourceStoreName + " is already added.");
        }
        storeToChangelogTopic.put(sourceStoreName, topic);
        return this;
    }

    /**
     * Connects a list of processors.
     *
     * NOTE: this function is not needed by developers working with the Processor API, and is only used
     * for the high-level DSL parsing functionality.
     *
     * @param processorNames the names of the processors
     * @return this builder instance so methods can be chained together; never null
     * @throws TopologyBuilderException if fewer than two processors are specified, or if one of the processors is not added yet
     */
    public synchronized final TopologyBuilder connectProcessors(final String...
processorNames) { if (processorNames.length < 2) throw new TopologyBuilderException("At least two processors need to participate in the connection."); for (String processorName : processorNames) { if (!nodeFactories.containsKey(processorName)) throw new TopologyBuilderException("Processor " + processorName + " is not added yet."); } String firstProcessorName = processorNames[0]; nodeGrouper.unite(firstProcessorName, Arrays.copyOfRange(processorNames, 1, processorNames.length)); return this; } /** * Adds an internal topic * * @param topicName the name of the topic * @return this builder instance so methods can be chained together; never null */ public synchronized final TopologyBuilder addInternalTopic(final String topicName) { Objects.requireNonNull(topicName, "topicName can't be null"); this.internalTopicNames.add(topicName); return this; } /** * Asserts that the streams of the specified source nodes must be copartitioned. * * @param sourceNodes a set of source node names * @return this builder instance so methods can be chained together; never null */ public synchronized final TopologyBuilder copartitionSources(final Collection<String> sourceNodes) { copartitionSourceGroups.add(Collections.unmodifiableSet(new HashSet<>(sourceNodes))); return this; } private void connectProcessorAndStateStore(final String processorName, final String stateStoreName) { if (!stateFactories.containsKey(stateStoreName)) throw new TopologyBuilderException("StateStore " + stateStoreName + " is not added yet."); if (!nodeFactories.containsKey(processorName)) throw new TopologyBuilderException("Processor " + processorName + " is not added yet."); final StateStoreFactory stateStoreFactory = stateFactories.get(stateStoreName); final Iterator<String> iter = stateStoreFactory.users.iterator(); if (iter.hasNext()) { final String user = iter.next(); nodeGrouper.unite(user, processorName); } stateStoreFactory.users.add(processorName); NodeFactory nodeFactory = nodeFactories.get(processorName); if (nodeFactory instanceof ProcessorNodeFactory) { final ProcessorNodeFactory processorNodeFactory = (ProcessorNodeFactory) nodeFactory; processorNodeFactory.addStateStore(stateStoreName); connectStateStoreNameToSourceTopicsOrPattern(stateStoreName, processorNodeFactory); } else { throw new TopologyBuilderException("cannot connect a state store " + stateStoreName + " to a source node or a sink node."); } } private Set<SourceNodeFactory> findSourcesForProcessorParents(final String[] parents) { final Set<SourceNodeFactory> sourceNodes = new HashSet<>(); for (String parent : parents) { final NodeFactory nodeFactory = nodeFactories.get(parent); if (nodeFactory instanceof SourceNodeFactory) { sourceNodes.add((SourceNodeFactory) nodeFactory); } else if (nodeFactory instanceof ProcessorNodeFactory) { sourceNodes.addAll(findSourcesForProcessorParents(((ProcessorNodeFactory) nodeFactory).parents)); } } return sourceNodes; } private void connectStateStoreNameToSourceTopicsOrPattern(final String stateStoreName, final ProcessorNodeFactory processorNodeFactory) { // we should never update the mapping from state store names to source topics if the store name already exists // in the map; this scenario is possible, for example, that a state store underlying a source KTable is // connecting to a join operator whose source topic is not the original KTable's source topic but an internal repartition topic. 
if (stateStoreNameToSourceTopics.containsKey(stateStoreName) || stateStoreNameToSourceRegex.containsKey(stateStoreName)) { return; } final Set<String> sourceTopics = new HashSet<>(); final Set<Pattern> sourcePatterns = new HashSet<>(); final Set<SourceNodeFactory> sourceNodesForParent = findSourcesForProcessorParents(processorNodeFactory.parents); for (SourceNodeFactory sourceNodeFactory : sourceNodesForParent) { if (sourceNodeFactory.pattern != null) { sourcePatterns.add(sourceNodeFactory.pattern); } else { sourceTopics.addAll(sourceNodeFactory.topics); } } if (!sourceTopics.isEmpty()) { stateStoreNameToSourceTopics.put(stateStoreName, Collections.unmodifiableSet(sourceTopics)); } if (!sourcePatterns.isEmpty()) { stateStoreNameToSourceRegex.put(stateStoreName, Collections.unmodifiableSet(sourcePatterns)); } } private <T> void maybeAddToResetList(final Collection<T> earliestResets, final Collection<T> latestResets, final AutoOffsetReset offsetReset, final T item) { if (offsetReset != null) { switch (offsetReset) { case EARLIEST: earliestResets.add(item); break; case LATEST: latestResets.add(item); break; default: throw new TopologyBuilderException(String.format("Unrecognized reset format %s", offsetReset)); } } } /** * Returns the map of node groups keyed by the topic group id. * * @return groups of node names */ public synchronized Map<Integer, Set<String>> nodeGroups() { if (nodeGroups == null) nodeGroups = makeNodeGroups(); return nodeGroups; } private Map<Integer, Set<String>> makeNodeGroups() { final HashMap<Integer, Set<String>> nodeGroups = new LinkedHashMap<>(); final HashMap<String, Set<String>> rootToNodeGroup = new HashMap<>(); int nodeGroupId = 0; // Go through source nodes first. This makes the group id assignment easy to predict in tests final HashSet<String> allSourceNodes = new HashSet<>(nodeToSourceTopics.keySet()); allSourceNodes.addAll(nodeToSourcePatterns.keySet()); for (String nodeName : Utils.sorted(allSourceNodes)) { final String root = nodeGrouper.root(nodeName); Set<String> nodeGroup = rootToNodeGroup.get(root); if (nodeGroup == null) { nodeGroup = new HashSet<>(); rootToNodeGroup.put(root, nodeGroup); nodeGroups.put(nodeGroupId++, nodeGroup); } nodeGroup.add(nodeName); } // Go through non-source nodes for (String nodeName : Utils.sorted(nodeFactories.keySet())) { if (!nodeToSourceTopics.containsKey(nodeName)) { final String root = nodeGrouper.root(nodeName); Set<String> nodeGroup = rootToNodeGroup.get(root); if (nodeGroup == null) { nodeGroup = new HashSet<>(); rootToNodeGroup.put(root, nodeGroup); nodeGroups.put(nodeGroupId++, nodeGroup); } nodeGroup.add(nodeName); } } return nodeGroups; } /** * Build the topology for the specified topic group. This is called automatically when passing this builder into the * {@link org.apache.kafka.streams.KafkaStreams#KafkaStreams(TopologyBuilder, org.apache.kafka.streams.StreamsConfig)} constructor. 
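     * <p>
     * Passing {@code null} builds the complete topology minus any global state store groups, which can be useful
     * for inspecting the assembled {@link ProcessorTopology} directly (a sketch; the application id is hypothetical):
     * <pre>{@code
     * ProcessorTopology topology = builder.setApplicationId("my-app").build(null);
     * }</pre>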
* * @see org.apache.kafka.streams.KafkaStreams#KafkaStreams(TopologyBuilder, org.apache.kafka.streams.StreamsConfig) */ public synchronized ProcessorTopology build(final Integer topicGroupId) { Set<String> nodeGroup; if (topicGroupId != null) { nodeGroup = nodeGroups().get(topicGroupId); } else { // when topicGroupId is null, we build the full topology minus the global groups final Set<String> globalNodeGroups = globalNodeGroups(); final Collection<Set<String>> values = nodeGroups().values(); nodeGroup = new HashSet<>(); for (Set<String> value : values) { nodeGroup.addAll(value); } nodeGroup.removeAll(globalNodeGroups); } return build(nodeGroup); } /** * Builds the topology for any global state stores * @return ProcessorTopology */ public synchronized ProcessorTopology buildGlobalStateTopology() { final Set<String> globalGroups = globalNodeGroups(); if (globalGroups.isEmpty()) { return null; } return build(globalGroups); } private Set<String> globalNodeGroups() { final Set<String> globalGroups = new HashSet<>(); for (final Map.Entry<Integer, Set<String>> nodeGroup : nodeGroups().entrySet()) { final Set<String> nodes = nodeGroup.getValue(); for (String node : nodes) { final NodeFactory nodeFactory = nodeFactories.get(node); if (nodeFactory instanceof SourceNodeFactory) { final List<String> topics = ((SourceNodeFactory) nodeFactory).topics; if (topics != null && topics.size() == 1 && globalTopics.contains(topics.get(0))) { globalGroups.addAll(nodes); } } } } return globalGroups; } private ProcessorTopology build(final Set<String> nodeGroup) { final List<ProcessorNode> processorNodes = new ArrayList<>(nodeFactories.size()); final Map<String, ProcessorNode> processorMap = new HashMap<>(); final Map<String, SourceNode> topicSourceMap = new HashMap<>(); final Map<String, SinkNode> topicSinkMap = new HashMap<>(); final Map<String, StateStore> stateStoreMap = new LinkedHashMap<>(); // create processor nodes in a topological order ("nodeFactories" is already topologically sorted) for (NodeFactory factory : nodeFactories.values()) { if (nodeGroup == null || nodeGroup.contains(factory.name)) { final ProcessorNode node = factory.build(); processorNodes.add(node); processorMap.put(node.name(), node); if (factory instanceof ProcessorNodeFactory) { for (String parent : ((ProcessorNodeFactory) factory).parents) { final ProcessorNode<?, ?> parentNode = processorMap.get(parent); parentNode.addChild(node); } for (String stateStoreName : ((ProcessorNodeFactory) factory).stateStoreNames) { if (!stateStoreMap.containsKey(stateStoreName)) { StateStore stateStore; if (stateFactories.containsKey(stateStoreName)) { final StateStoreSupplier supplier = stateFactories.get(stateStoreName).supplier; stateStore = supplier.get(); // remember the changelog topic if this state store is change-logging enabled if (supplier.loggingEnabled() && !storeToChangelogTopic.containsKey(stateStoreName)) { final String changelogTopic = ProcessorStateManager.storeChangelogTopic(this.applicationId, stateStoreName); storeToChangelogTopic.put(stateStoreName, changelogTopic); } } else { stateStore = globalStateStores.get(stateStoreName); } stateStoreMap.put(stateStoreName, stateStore); } } } else if (factory instanceof SourceNodeFactory) { final SourceNodeFactory sourceNodeFactory = (SourceNodeFactory) factory; final List<String> topics = (sourceNodeFactory.pattern != null) ? 
                            sourceNodeFactory.getTopics(subscriptionUpdates.getUpdates()) :
                            sourceNodeFactory.topics;

                    for (String topic : topics) {
                        if (internalTopicNames.contains(topic)) {
                            // prefix the internal topic name with the application id
                            topicSourceMap.put(decorateTopic(topic), (SourceNode) node);
                        } else {
                            topicSourceMap.put(topic, (SourceNode) node);
                        }
                    }
                } else if (factory instanceof SinkNodeFactory) {
                    final SinkNodeFactory sinkNodeFactory = (SinkNodeFactory) factory;

                    for (String parent : sinkNodeFactory.parents) {
                        processorMap.get(parent).addChild(node);

                        if (internalTopicNames.contains(sinkNodeFactory.topic)) {
                            // prefix the internal topic name with the application id
                            topicSinkMap.put(decorateTopic(sinkNodeFactory.topic), (SinkNode) node);
                        } else {
                            topicSinkMap.put(sinkNodeFactory.topic, (SinkNode) node);
                        }
                    }
                } else {
                    throw new TopologyBuilderException("Unknown definition class: " + factory.getClass().getName());
                }
            }
        }

        return new ProcessorTopology(processorNodes, topicSourceMap, topicSinkMap, new ArrayList<>(stateStoreMap.values()), storeToChangelogTopic, new ArrayList<>(globalStateStores.values()));
    }

    /**
     * Get any global {@link StateStore}s that are part of the topology.
     *
     * @return map containing all global {@link StateStore}s
     */
    public Map<String, StateStore> globalStateStores() {
        return Collections.unmodifiableMap(globalStateStores);
    }

    /**
     * Returns the map of topic groups keyed by the group id.
     * A topic group is a group of topics in the same task.
     *
     * @return groups of topic names
     */
    public synchronized Map<Integer, TopicsInfo> topicGroups() {
        final Map<Integer, TopicsInfo> topicGroups = new LinkedHashMap<>();

        if (nodeGroups == null)
            nodeGroups = makeNodeGroups();

        for (Map.Entry<Integer, Set<String>> entry : nodeGroups.entrySet()) {
            final Set<String> sinkTopics = new HashSet<>();
            final Set<String> sourceTopics = new HashSet<>();
            final Map<String, InternalTopicConfig> internalSourceTopics = new HashMap<>();
            final Map<String, InternalTopicConfig> stateChangelogTopics = new HashMap<>();
            for (String node : entry.getValue()) {
                // if the node is a source node, add to the source topics
                final List<String> topics = nodeToSourceTopics.get(node);
                if (topics != null) {
                    // if some of the topics are internal, add them to the internal topics
                    for (String topic : topics) {
                        // skip global topics as they don't need partition assignment
                        if (globalTopics.contains(topic)) {
                            continue;
                        }
                        if (this.internalTopicNames.contains(topic)) {
                            // prefix the internal topic name with the application id
                            final String internalTopic = decorateTopic(topic);
                            internalSourceTopics.put(internalTopic, new InternalTopicConfig(internalTopic, Collections.singleton(InternalTopicConfig.CleanupPolicy.delete), Collections.<String, String>emptyMap()));
                            sourceTopics.add(internalTopic);
                        } else {
                            sourceTopics.add(topic);
                        }
                    }
                }

                // if the node is a sink node, add to the sink topics
                final String topic = nodeToSinkTopic.get(node);
                if (topic != null) {
                    if (internalTopicNames.contains(topic)) {
                        // prefix the internal topic name with the application id
                        sinkTopics.add(decorateTopic(topic));
                    } else {
                        sinkTopics.add(topic);
                    }
                }

                // if the node is connected to a state store, add that store's changelog to the state changelog topics
                for (StateStoreFactory stateFactory : stateFactories.values()) {
                    final StateStoreSupplier supplier = stateFactory.supplier;
                    if (supplier.loggingEnabled() && stateFactory.users.contains(node)) {
                        final String name = ProcessorStateManager.storeChangelogTopic(applicationId, supplier.name());
                        final InternalTopicConfig internalTopicConfig = createInternalTopicConfig(supplier, name);
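                        // register the changelog under its application-prefixed name so the partition
                        // assignor can create it with the cleanup policy chosen in createInternalTopicConfig
                        // (compact for key-value stores, compact+delete for window stores)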
                        stateChangelogTopics.put(name, internalTopicConfig);
                    }
                }
            }

            if (!sourceTopics.isEmpty()) {
                topicGroups.put(entry.getKey(), new TopicsInfo(
                        Collections.unmodifiableSet(sinkTopics),
                        Collections.unmodifiableSet(sourceTopics),
                        Collections.unmodifiableMap(internalSourceTopics),
                        Collections.unmodifiableMap(stateChangelogTopics)));
            }
        }

        return Collections.unmodifiableMap(topicGroups);
    }

    private void setRegexMatchedTopicsToSourceNodes() {
        if (subscriptionUpdates.hasUpdates()) {
            for (Map.Entry<String, Pattern> stringPatternEntry : nodeToSourcePatterns.entrySet()) {
                final SourceNodeFactory sourceNode = (SourceNodeFactory) nodeFactories.get(stringPatternEntry.getKey());
                // need to update nodeToSourceTopics with topics matched from the given regex
                nodeToSourceTopics.put(stringPatternEntry.getKey(), sourceNode.getTopics(subscriptionUpdates.getUpdates()));
                log.debug("nodeToSourceTopics {}", nodeToSourceTopics);
            }
        }
    }

    private void setRegexMatchedTopicToStateStore() {
        if (subscriptionUpdates.hasUpdates()) {
            for (Map.Entry<String, Set<Pattern>> storePattern : stateStoreNameToSourceRegex.entrySet()) {
                final Set<String> updatedTopicsForStateStore = new HashSet<>();
                for (String subscriptionUpdateTopic : subscriptionUpdates.getUpdates()) {
                    for (Pattern pattern : storePattern.getValue()) {
                        if (pattern.matcher(subscriptionUpdateTopic).matches()) {
                            updatedTopicsForStateStore.add(subscriptionUpdateTopic);
                        }
                    }
                }
                if (!updatedTopicsForStateStore.isEmpty()) {
                    Collection<String> storeTopics = stateStoreNameToSourceTopics.get(storePattern.getKey());
                    if (storeTopics != null) {
                        updatedTopicsForStateStore.addAll(storeTopics);
                    }
                    stateStoreNameToSourceTopics.put(storePattern.getKey(), Collections.unmodifiableSet(updatedTopicsForStateStore));
                }
            }
        }
    }

    private InternalTopicConfig createInternalTopicConfig(final StateStoreSupplier<?> supplier, final String name) {
        if (!(supplier instanceof WindowStoreSupplier)) {
            return new InternalTopicConfig(name, Collections.singleton(InternalTopicConfig.CleanupPolicy.compact), supplier.logConfig());
        }

        final WindowStoreSupplier windowStoreSupplier = (WindowStoreSupplier) supplier;
        final InternalTopicConfig config = new InternalTopicConfig(name, Utils.mkSet(InternalTopicConfig.CleanupPolicy.compact, InternalTopicConfig.CleanupPolicy.delete), supplier.logConfig());
        config.setRetentionMs(windowStoreSupplier.retentionPeriod());
        return config;
    }

    /**
     * Get the Pattern that matches all topics that need to start reading from the earliest available offset.
     *
     * @return the Pattern matching all topics that read from the earliest offset; never null
     */
    public synchronized Pattern earliestResetTopicsPattern() {
        final List<String> topics = maybeDecorateInternalSourceTopics(earliestResetTopics);
        final Pattern earliestPattern = buildPatternForOffsetResetTopics(topics, earliestResetPatterns);
        ensureNoRegexOverlap(earliestPattern, latestResetPatterns, latestResetTopics);
        return earliestPattern;
    }

    /**
     * Get the Pattern that matches all topics that need to start reading from the latest available offset.
     *
     * @return the Pattern matching all topics that read from the latest offset; never null
     */
    public synchronized Pattern latestResetTopicsPattern() {
        final List<String> topics = maybeDecorateInternalSourceTopics(latestResetTopics);
        final Pattern latestPattern = buildPatternForOffsetResetTopics(topics, latestResetPatterns);
        ensureNoRegexOverlap(latestPattern, earliestResetPatterns, earliestResetTopics);
        return latestPattern;
    }

    private void ensureNoRegexOverlap(final Pattern builtPattern, final Set<Pattern> otherPatterns, final Set<String> otherTopics) {
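        // a topic may not be subscribed with both EARLIEST and LATEST reset policies, so reject a
        // built pattern that textually contains, or literally matches, anything from the other policy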
        for (Pattern otherPattern : otherPatterns) {
            if (builtPattern.pattern().contains(otherPattern.pattern())) {
                throw new TopologyBuilderException(String.format("Found overlapping regex [%s] against [%s] for a KStream with auto offset resets", otherPattern.pattern(), builtPattern.pattern()));
            }
        }

        for (String otherTopic : otherTopics) {
            if (builtPattern.matcher(otherTopic).matches()) {
                throw new TopologyBuilderException(String.format("Found overlapping regex [%s] matching topic [%s] for a KStream with auto offset resets", builtPattern.pattern(), otherTopic));
            }
        }
    }

    /**
     * Builds a composite pattern out of literal topic names and Patterns for matching topic names. If both provided
     * collections are empty, a Pattern.compile("") instance is returned.
     *
     * @param sourceTopics the names of source topics to add to a composite pattern
     * @param sourcePatterns Patterns for matching source topics to add to a composite pattern
     * @return a Pattern that is composed of the literal source topic names and any Patterns for matching source topics
     */
    private static synchronized Pattern buildPatternForOffsetResetTopics(final Collection<String> sourceTopics, final Collection<Pattern> sourcePatterns) {
        final StringBuilder builder = new StringBuilder();

        for (String topic : sourceTopics) {
            builder.append(topic).append("|");
        }

        for (Pattern sourcePattern : sourcePatterns) {
            builder.append(sourcePattern.pattern()).append("|");
        }

        if (builder.length() > 0) {
            builder.setLength(builder.length() - 1);
            return Pattern.compile(builder.toString());
        }

        return EMPTY_ZERO_LENGTH_PATTERN;
    }

    /**
     * @return a mapping from state store name to a List of source topics
     */
    public Map<String, List<String>> stateStoreNameToSourceTopics() {
        final Map<String, List<String>> results = new HashMap<>();
        for (Map.Entry<String, Set<String>> entry : stateStoreNameToSourceTopics.entrySet()) {
            results.put(entry.getKey(), maybeDecorateInternalSourceTopics(entry.getValue()));
        }
        return results;
    }

    /**
     * Returns the copartition groups.
     * A copartition group is a group of source topics that are required to be copartitioned.
     *
     * @return groups of topic names
     */
    public synchronized Collection<Set<String>> copartitionGroups() {
        final List<Set<String>> list = new ArrayList<>(copartitionSourceGroups.size());
        for (Set<String> nodeNames : copartitionSourceGroups) {
            Set<String> copartitionGroup = new HashSet<>();
            for (String node : nodeNames) {
                final List<String> topics = nodeToSourceTopics.get(node);
                if (topics != null)
                    copartitionGroup.addAll(maybeDecorateInternalSourceTopics(topics));
            }
            list.add(Collections.unmodifiableSet(copartitionGroup));
        }
        return Collections.unmodifiableList(list);
    }

    private List<String> maybeDecorateInternalSourceTopics(final Collection<String> sourceTopics) {
        final List<String> decoratedTopics = new ArrayList<>();
        for (String topic : sourceTopics) {
            if (internalTopicNames.contains(topic)) {
                decoratedTopics.add(decorateTopic(topic));
            } else {
                decoratedTopics.add(topic);
            }
        }
        return decoratedTopics;
    }

    private String decorateTopic(final String topic) {
        if (applicationId == null) {
            throw new TopologyBuilderException("there are internal topics and "
                    + "applicationId hasn't been set. Call "
                    + "setApplicationId first");
        }

        return applicationId + "-" + topic;
    }
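    /**
     * Get a single Pattern matching every source topic of this topology, i.e. all statically
     * subscribed topics (with internal topics application-prefixed) plus all regex subscriptions.
     * <p>
     * A minimal illustrative sketch; the {@code builder} variable and the "orders" topic name are
     * hypothetical, not part of this class:
     * <pre>{@code
     * Pattern sources = builder.sourceTopicPattern();
     * boolean isSource = sources.matcher("orders").matches(); // true if "orders" is consumed by a source node
     * }</pre>
     */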
Call " + "setApplicationId first"); } return applicationId + "-" + topic; } public synchronized Pattern sourceTopicPattern() { if (this.topicPattern == null) { final List<String> allSourceTopics = new ArrayList<>(); if (!nodeToSourceTopics.isEmpty()) { for (List<String> topics : nodeToSourceTopics.values()) { allSourceTopics.addAll(maybeDecorateInternalSourceTopics(topics)); } } Collections.sort(allSourceTopics); this.topicPattern = buildPatternForOffsetResetTopics(allSourceTopics, nodeToSourcePatterns.values()); } return this.topicPattern; } public synchronized void updateSubscriptions(final SubscriptionUpdates subscriptionUpdates, final String threadId) { log.debug("stream-thread [{}] updating builder with {} topic(s) with possible matching regex subscription(s)", threadId, subscriptionUpdates); this.subscriptionUpdates = subscriptionUpdates; setRegexMatchedTopicsToSourceNodes(); setRegexMatchedTopicToStateStore(); } }