/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.kafka.streams.processor;
import org.apache.kafka.common.serialization.Deserializer;
import org.apache.kafka.common.serialization.Serializer;
import org.apache.kafka.common.utils.Utils;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.errors.TopologyBuilderException;
import org.apache.kafka.streams.kstream.KStream;
import org.apache.kafka.streams.kstream.KTable;
import org.apache.kafka.streams.processor.internals.InternalTopicConfig;
import org.apache.kafka.streams.processor.internals.ProcessorNode;
import org.apache.kafka.streams.processor.internals.ProcessorStateManager;
import org.apache.kafka.streams.processor.internals.ProcessorTopology;
import org.apache.kafka.streams.processor.internals.QuickUnion;
import org.apache.kafka.streams.processor.internals.SinkNode;
import org.apache.kafka.streams.processor.internals.SourceNode;
import org.apache.kafka.streams.processor.internals.StreamPartitionAssignor.SubscriptionUpdates;
import org.apache.kafka.streams.state.KeyValueStore;
import org.apache.kafka.streams.state.internals.WindowStoreSupplier;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.regex.Pattern;
/**
* A component that is used to build a {@link ProcessorTopology}. A topology contains an acyclic graph of sources, processors,
* and sinks. A {@link SourceNode source} is a node in the graph that consumes one or more Kafka topics and forwards them to
* its child nodes. A {@link Processor processor} is a node in the graph that receives input records from upstream nodes,
* processes those records, and optionally forwards new records to one or all of its children. Finally, a {@link SinkNode sink}
* is a node in the graph that receives records from upstream nodes and writes them to a Kafka topic. This builder allows you
* to construct an acyclic graph of these nodes, and the builder is then passed into a new {@link org.apache.kafka.streams.KafkaStreams}
* instance that will then {@link org.apache.kafka.streams.KafkaStreams#start() begin consuming, processing, and producing records}.
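* <p>
* A minimal usage sketch (the node names, topic names, and {@code MyProcessor} class are illustrative;
* {@code MyProcessor} is assumed to be a {@link Processor} implementation and {@code config} a valid
* {@link StreamsConfig}):
* <pre>{@code
* TopologyBuilder builder = new TopologyBuilder();
* builder.addSource("Source", "input-topic")
*        .addProcessor("Process", MyProcessor::new, "Source")
*        .addSink("Sink", "output-topic", "Process");
* KafkaStreams streams = new KafkaStreams(builder, config);
* streams.start();
* }</pre>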
*/
public class TopologyBuilder {
private static final Logger log = LoggerFactory.getLogger(TopologyBuilder.class);
private static final Pattern EMPTY_ZERO_LENGTH_PATTERN = Pattern.compile("");
// node factories in a topological order
private final LinkedHashMap<String, NodeFactory> nodeFactories = new LinkedHashMap<>();
// state factories
private final Map<String, StateStoreFactory> stateFactories = new HashMap<>();
// global state factories
private final Map<String, StateStore> globalStateStores = new LinkedHashMap<>();
// all topics subscribed from source processors (without application-id prefix for internal topics)
private final Set<String> sourceTopicNames = new HashSet<>();
// all internal topics auto-created by the topology builder and used in source / sink processors
private final Set<String> internalTopicNames = new HashSet<>();
// groups of source processors that need to be copartitioned
private final List<Set<String>> copartitionSourceGroups = new ArrayList<>();
// map from source processor names to subscribed topics (without application-id prefix for internal topics)
private final HashMap<String, List<String>> nodeToSourceTopics = new HashMap<>();
// map from source processor names to regex subscription patterns
private final HashMap<String, Pattern> nodeToSourcePatterns = new LinkedHashMap<>();
// map from sink processor names to subscribed topic (without application-id prefix for internal topics)
private final HashMap<String, String> nodeToSinkTopic = new HashMap<>();
// map from topics to their matched regex patterns; this is to ensure that a topic is consumed by only one source node
// even if it can be matched by multiple regex patterns
private final HashMap<String, Pattern> topicToPatterns = new HashMap<>();
// map from state store names to all the topics subscribed from source processors that
// are connected to these state stores
private final Map<String, Set<String>> stateStoreNameToSourceTopics = new HashMap<>();
// map from state store names to all the regex subscribed topics from source processors that
// are connected to these state stores
private final Map<String, Set<Pattern>> stateStoreNameToSourceRegex = new HashMap<>();
// map from state store names to their corresponding changelog topics, if any;
// this is used in the extended KStreamBuilder.
private final Map<String, String> storeToChangelogTopic = new HashMap<>();
// all global topics
private final Set<String> globalTopics = new HashSet<>();
private final Set<String> earliestResetTopics = new HashSet<>();
private final Set<String> latestResetTopics = new HashSet<>();
private final Set<Pattern> earliestResetPatterns = new HashSet<>();
private final Set<Pattern> latestResetPatterns = new HashSet<>();
private final QuickUnion<String> nodeGrouper = new QuickUnion<>();
private SubscriptionUpdates subscriptionUpdates = new SubscriptionUpdates();
private String applicationId = null;
private Pattern topicPattern = null;
private Map<Integer, Set<String>> nodeGroups = null;
private static class StateStoreFactory {
public final Set<String> users;
public final StateStoreSupplier supplier;
StateStoreFactory(StateStoreSupplier supplier) {
this.supplier = supplier;
this.users = new HashSet<>();
}
}
private abstract static class NodeFactory {
public final String name;
NodeFactory(String name) {
this.name = name;
}
public abstract ProcessorNode build();
}
private static class ProcessorNodeFactory extends NodeFactory {
private final String[] parents;
private final ProcessorSupplier<?, ?> supplier;
private final Set<String> stateStoreNames = new HashSet<>();
ProcessorNodeFactory(String name, String[] parents, ProcessorSupplier<?, ?> supplier) {
super(name);
this.parents = parents.clone();
this.supplier = supplier;
}
public void addStateStore(String stateStoreName) {
stateStoreNames.add(stateStoreName);
}
@Override
public ProcessorNode build() {
return new ProcessorNode<>(name, supplier.get(), stateStoreNames);
}
}
private class SourceNodeFactory extends NodeFactory {
private final List<String> topics;
private final Pattern pattern;
private final Deserializer<?> keyDeserializer;
private final Deserializer<?> valDeserializer;
private final TimestampExtractor timestampExtractor;
private SourceNodeFactory(final String name,
final String[] topics,
final Pattern pattern,
final TimestampExtractor timestampExtractor,
final Deserializer<?> keyDeserializer,
final Deserializer<?> valDeserializer) {
super(name);
this.topics = topics != null ? Arrays.asList(topics) : new ArrayList<String>();
this.pattern = pattern;
this.keyDeserializer = keyDeserializer;
this.valDeserializer = valDeserializer;
this.timestampExtractor = timestampExtractor;
}
List<String> getTopics(Collection<String> subscribedTopics) {
// if the source is subscribed via a pattern, the topic metadata may not have been updated yet, so the
// map from source node to topics can be stale; in that case we use the pattern itself as a placeholder.
// This should only happen during debugging, since at runtime this method is always called after the metadata has been updated.
if (subscribedTopics.isEmpty())
return Collections.singletonList("" + pattern + "");
List<String> matchedTopics = new ArrayList<>();
for (String update : subscribedTopics) {
if (this.pattern == topicToPatterns.get(update)) {
matchedTopics.add(update);
} else if (topicToPatterns.containsKey(update) && isMatch(update)) {
// the same topic cannot be matched to more than one pattern
// TODO: we should lift this requirement in the future
throw new TopologyBuilderException("Topic " + update +
" is already matched for another regex pattern " + topicToPatterns.get(update) +
" and hence cannot be matched to this regex pattern " + pattern + " any more.");
} else if (isMatch(update)) {
topicToPatterns.put(update, this.pattern);
matchedTopics.add(update);
}
}
return matchedTopics;
}
@Override
public ProcessorNode build() {
final List<String> sourceTopics = nodeToSourceTopics.get(name);
// if the source is subscribed via a pattern, the topic metadata may not have been updated yet, so the
// map from source node to topics can be stale; in that case we use the pattern itself as a placeholder.
// This should only happen during debugging, since at runtime this method is always called after the metadata has been updated.
if (sourceTopics == null)
return new SourceNode<>(name, Collections.singletonList(String.valueOf(pattern)), timestampExtractor, keyDeserializer, valDeserializer);
else
return new SourceNode<>(name, maybeDecorateInternalSourceTopics(sourceTopics), timestampExtractor, keyDeserializer, valDeserializer);
}
private boolean isMatch(String topic) {
return this.pattern.matcher(topic).matches();
}
}
private class SinkNodeFactory<K, V> extends NodeFactory {
private final String[] parents;
private final String topic;
private final Serializer<K> keySerializer;
private final Serializer<V> valSerializer;
private final StreamPartitioner<? super K, ? super V> partitioner;
private SinkNodeFactory(String name, String[] parents, String topic, Serializer<K> keySerializer, Serializer<V> valSerializer, StreamPartitioner<? super K, ? super V> partitioner) {
super(name);
this.parents = parents.clone();
this.topic = topic;
this.keySerializer = keySerializer;
this.valSerializer = valSerializer;
this.partitioner = partitioner;
}
@Override
public ProcessorNode build() {
if (internalTopicNames.contains(topic)) {
// prefix the internal topic name with the application id
return new SinkNode<>(name, decorateTopic(topic), keySerializer, valSerializer, partitioner);
} else {
return new SinkNode<>(name, topic, keySerializer, valSerializer, partitioner);
}
}
}
public static class TopicsInfo {
public Set<String> sinkTopics;
public Set<String> sourceTopics;
public Map<String, InternalTopicConfig> stateChangelogTopics;
public Map<String, InternalTopicConfig> repartitionSourceTopics;
TopicsInfo(Set<String> sinkTopics, Set<String> sourceTopics, Map<String, InternalTopicConfig> repartitionSourceTopics, Map<String, InternalTopicConfig> stateChangelogTopics) {
this.sinkTopics = sinkTopics;
this.sourceTopics = sourceTopics;
this.stateChangelogTopics = stateChangelogTopics;
this.repartitionSourceTopics = repartitionSourceTopics;
}
@Override
public boolean equals(Object o) {
if (o instanceof TopicsInfo) {
TopicsInfo other = (TopicsInfo) o;
return other.sourceTopics.equals(this.sourceTopics) && other.stateChangelogTopics.equals(this.stateChangelogTopics);
} else {
return false;
}
}
@Override
public int hashCode() {
// mask the lower hash code to keep sign extension from clobbering the upper 32 bits
long n = ((long) sourceTopics.hashCode() << 32) | (stateChangelogTopics.hashCode() & 0xFFFFFFFFL);
return (int) (n % 0xFFFFFFFFL);
}
@Override
public String toString() {
return "TopicsInfo{" +
"sinkTopics=" + sinkTopics +
", sourceTopics=" + sourceTopics +
", repartitionSourceTopics=" + repartitionSourceTopics +
", stateChangelogTopics=" + stateChangelogTopics +
'}';
}
}
/**
* Enum used to define auto offset reset policy when creating {@link KStream} or {@link KTable}
*/
public enum AutoOffsetReset {
EARLIEST, LATEST
}
/**
* Create a new builder.
*/
public TopologyBuilder() {}
/**
* Set the applicationId to be used for auto-generated internal topics.
*
* This is required before calling {@link #topicGroups}, {@link #copartitionSources},
* {@link #stateStoreNameToSourceTopics} and {@link #build(Integer)}.
*
* @param applicationId the streams applicationId. Should be the same as set by
* {@link org.apache.kafka.streams.StreamsConfig#APPLICATION_ID_CONFIG}
*/
public synchronized final TopologyBuilder setApplicationId(final String applicationId) {
Objects.requireNonNull(applicationId, "applicationId can't be null");
this.applicationId = applicationId;
return this;
}
/**
* Add a new source that consumes the named topics and forwards the records to child processor and/or sink nodes.
* The source will use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key deserializer} and
* {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value deserializer} specified in the
* {@link org.apache.kafka.streams.StreamsConfig stream configuration}.
* The default {@link TimestampExtractor} as specified in the {@link StreamsConfig config} is used.
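* <p>
* For example (a sketch; the node and topic names are illustrative):
* <pre>{@code
* builder.addSource("Source", "topic-a", "topic-b");
* }</pre>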
*
* @param name the unique name of the source used to reference this node when
* {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}.
* @param topics the name of one or more Kafka topics that this source is to consume
* @return this builder instance so methods can be chained together; never null
*/
public synchronized final TopologyBuilder addSource(final String name, final String... topics) {
return addSource(null, name, null, null, null, topics);
}
/**
* Add a new source that consumes the named topics and forwards the records to child processor and/or sink nodes.
* The source will use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key deserializer} and
* {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value deserializer} specified in the
* {@link org.apache.kafka.streams.StreamsConfig stream configuration}.
* The default {@link TimestampExtractor} as specified in the {@link StreamsConfig config} is used.
*
* @param offsetReset the auto offset reset policy to use for this source if no committed offsets are found; acceptable values are earliest or latest
* @param name the unique name of the source used to reference this node when
* {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}.
* @param topics the name of one or more Kafka topics that this source is to consume
* @return this builder instance so methods can be chained together; never null
*/
public synchronized final TopologyBuilder addSource(final AutoOffsetReset offsetReset, final String name, final String... topics) {
return addSource(offsetReset, name, null, null, null, topics);
}
/**
* Add a new source that consumes the named topics and forwards the records to child processor and/or sink nodes.
* The source will use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key deserializer} and
* {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value deserializer} specified in the
* {@link org.apache.kafka.streams.StreamsConfig stream configuration}.
*
* @param timestampExtractor the stateless timestamp extractor used for this source,
* if not specified the default extractor defined in the configs will be used
* @param name the unique name of the source used to reference this node when
* {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}.
* @param topics the name of one or more Kafka topics that this source is to consume
* @return this builder instance so methods can be chained together; never null
*/
public synchronized final TopologyBuilder addSource(final TimestampExtractor timestampExtractor, final String name, final String... topics) {
return addSource(null, name, timestampExtractor, null, null, topics);
}
/**
* Add a new source that consumes the named topics and forwards the records to child processor and/or sink nodes.
* The source will use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key deserializer} and
* {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value deserializer} specified in the
* {@link org.apache.kafka.streams.StreamsConfig stream configuration}.
*
* @param offsetReset the auto offset reset policy to use for this source if no committed offsets are found;
* acceptable values are earliest or latest
* @param timestampExtractor the stateless timestamp extractor used for this source,
* if not specified the default extractor defined in the configs will be used
* @param name the unique name of the source used to reference this node when
* {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}.
* @param topics the name of one or more Kafka topics that this source is to consume
* @return this builder instance so methods can be chained together; never null
*/
public synchronized final TopologyBuilder addSource(final AutoOffsetReset offsetReset, final TimestampExtractor timestampExtractor, final String name, final String... topics) {
return addSource(offsetReset, name, timestampExtractor, null, null, topics);
}
/**
* Add a new source that consumes from topics matching the given pattern
* and forwards the records to child processor and/or sink nodes.
* The source will use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key deserializer} and
* {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value deserializer} specified in the
* {@link org.apache.kafka.streams.StreamsConfig stream configuration}.
* The default {@link TimestampExtractor} as specified in the {@link StreamsConfig config} is used.
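* <p>
* For example (a sketch; the node name and pattern are illustrative):
* <pre>{@code
* // consume every topic whose name starts with "events-"
* builder.addSource("Source", Pattern.compile("events-.*"));
* }</pre>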
*
* @param name the unique name of the source used to reference this node when
* {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}.
* @param topicPattern regular expression pattern to match Kafka topics that this source is to consume
* @return this builder instance so methods can be chained together; never null
*/
public synchronized final TopologyBuilder addSource(final String name, final Pattern topicPattern) {
return addSource(null, name, null, null, null, topicPattern);
}
/**
* Add a new source that consumes from topics matching the given pattern
* and forwards the records to child processor and/or sink nodes.
* The source will use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key deserializer} and
* {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value deserializer} specified in the
* {@link org.apache.kafka.streams.StreamsConfig stream configuration}.
* The default {@link TimestampExtractor} as specified in the {@link StreamsConfig config} is used.
*
* @param offsetReset the auto offset reset policy to use for this source if no committed offsets are found; acceptable values are earliest or latest.
* @param name the unique name of the source used to reference this node when
* {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}.
* @param topicPattern regular expression pattern to match Kafka topics that this source is to consume
* @return this builder instance so methods can be chained together; never null
*/
public synchronized final TopologyBuilder addSource(final AutoOffsetReset offsetReset, final String name, final Pattern topicPattern) {
return addSource(offsetReset, name, null, null, null, topicPattern);
}
/**
* Add a new source that consumes from topics matching the given pattern
* and forwards the records to child processor and/or sink nodes.
* The source will use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key deserializer} and
* {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value deserializer} specified in the
* {@link org.apache.kafka.streams.StreamsConfig stream configuration}.
*
* @param timestampExtractor the stateless timestamp extractor used for this source,
* if not specified the default extractor defined in the configs will be used
* @param name the unique name of the source used to reference this node when
* {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}.
* @param topicPattern regular expression pattern to match Kafka topics that this source is to consume
* @return this builder instance so methods can be chained together; never null
*/
public synchronized final TopologyBuilder addSource(final TimestampExtractor timestampExtractor, final String name, final Pattern topicPattern) {
return addSource(null, name, timestampExtractor, null, null, topicPattern);
}
/**
* Add a new source that consumes from topics matching the given pattern
* and forwards the records to child processor and/or sink nodes.
* The source will use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key deserializer} and
* {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value deserializer} specified in the
* {@link org.apache.kafka.streams.StreamsConfig stream configuration}.
*
* @param offsetReset the auto offset reset policy to use for this source if no committed offsets are found;
* acceptable values are earliest or latest.
* @param timestampExtractor the stateless timestamp extractor used for this source,
* if not specified the default extractor defined in the configs will be used
* @param name the unique name of the source used to reference this node when
* {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}.
* @param topicPattern regular expression pattern to match Kafka topics that this source is to consume
* @return this builder instance so methods can be chained together; never null
*/
public synchronized final TopologyBuilder addSource(final AutoOffsetReset offsetReset, final TimestampExtractor timestampExtractor, final String name, final Pattern topicPattern) {
return addSource(offsetReset, name, timestampExtractor, null, null, topicPattern);
}
/**
* Add a new source that consumes the named topics and forwards the records to child processor and/or sink nodes.
* The source will use the specified key and value deserializers.
* The default {@link TimestampExtractor} as specified in the {@link StreamsConfig config} is used.
*
* @param name the unique name of the source used to reference this node when
* {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}
* @param keyDeserializer key deserializer used to read this source, if not specified the default
* key deserializer defined in the configs will be used
* @param valDeserializer value deserializer used to read this source,
* if not specified the default value deserializer defined in the configs will be used
* @param topics the name of one or more Kafka topics that this source is to consume
* @return this builder instance so methods can be chained together; never null
* @throws TopologyBuilderException if processor is already added or if topics have already been registered by another source
*/
public synchronized final TopologyBuilder addSource(final String name, final Deserializer keyDeserializer, final Deserializer valDeserializer, final String... topics) {
return addSource(null, name, null, keyDeserializer, valDeserializer, topics);
}
/**
* Add a new source that consumes the named topics and forwards the records to child processor and/or sink nodes.
* The source will use the specified key and value deserializers.
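* <p>
* For example (a sketch; the names are illustrative, and passing {@code null} for the extractor selects
* the default timestamp extractor from the configs):
* <pre>{@code
* builder.addSource(AutoOffsetReset.EARLIEST, "Source", null,
*                   new StringDeserializer(), new StringDeserializer(),
*                   "input-topic");
* }</pre>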
*
* @param offsetReset the auto offset reset policy to use for this stream if no committed offsets are found;
* acceptable values are earliest or latest.
* @param name the unique name of the source used to reference this node when
* {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}.
* @param timestampExtractor the stateless timestamp extractor used for this source,
* if not specified the default extractor defined in the configs will be used
* @param keyDeserializer key deserializer used to read this source, if not specified the default
* key deserializer defined in the configs will be used
* @param valDeserializer value deserializer used to read this source,
* if not specified the default value deserializer defined in the configs will be used
* @param topics the name of one or more Kafka topics that this source is to consume
* @return this builder instance so methods can be chained together; never null
* @throws TopologyBuilderException if processor is already added or if topics have already been registered by another source
*/
public synchronized final TopologyBuilder addSource(final AutoOffsetReset offsetReset,
final String name,
final TimestampExtractor timestampExtractor,
final Deserializer keyDeserializer,
final Deserializer valDeserializer,
final String... topics) {
if (topics.length == 0) {
throw new TopologyBuilderException("You must provide at least one topic");
}
Objects.requireNonNull(name, "name must not be null");
if (nodeFactories.containsKey(name))
throw new TopologyBuilderException("Processor " + name + " is already added.");
for (String topic : topics) {
Objects.requireNonNull(topic, "topic names cannot be null");
validateTopicNotAlreadyRegistered(topic);
maybeAddToResetList(earliestResetTopics, latestResetTopics, offsetReset, topic);
sourceTopicNames.add(topic);
}
nodeFactories.put(name, new SourceNodeFactory(name, topics, null, timestampExtractor, keyDeserializer, valDeserializer));
nodeToSourceTopics.put(name, Arrays.asList(topics));
nodeGrouper.add(name);
return this;
}
/**
* Adds a global {@link StateStore} to the topology. The {@link StateStore} sources its data
* from all partitions of the provided input topic. There will be exactly one instance of this
* {@link StateStore} per Kafka Streams instance.
* <p>
* A {@link SourceNode} with the provided sourceName will be added to consume the data arriving
* from the partitions of the input topic.
* <p>
* The provided {@link ProcessorSupplier} will be used to create a {@link ProcessorNode} that will
* receive all records forwarded from the {@link SourceNode}. This
* {@link ProcessorNode} should be used to keep the {@link StateStore} up-to-date.
* The default {@link TimestampExtractor} as specified in the {@link StreamsConfig config} is used.
*
* @param storeSupplier user defined state store supplier
* @param sourceName name of the {@link SourceNode} that will be automatically added
* @param keyDeserializer the {@link Deserializer} to deserialize keys with
* @param valueDeserializer the {@link Deserializer} to deserialize values with
* @param topic the topic to source the data from
* @param processorName the name of the {@link ProcessorNode} that maintains the store
* @param stateUpdateSupplier the instance of {@link ProcessorSupplier}
* @return this builder instance so methods can be chained together; never null
*/
public synchronized TopologyBuilder addGlobalStore(final StateStoreSupplier<KeyValueStore> storeSupplier,
final String sourceName,
final Deserializer keyDeserializer,
final Deserializer valueDeserializer,
final String topic,
final String processorName,
final ProcessorSupplier stateUpdateSupplier) {
return addGlobalStore(storeSupplier, sourceName, null, keyDeserializer, valueDeserializer, topic, processorName, stateUpdateSupplier);
}
/**
* Adds a global {@link StateStore} to the topology. The {@link StateStore} sources its data
* from all partitions of the provided input topic. There will be exactly one instance of this
* {@link StateStore} per Kafka Streams instance.
* <p>
* A {@link SourceNode} with the provided sourceName will be added to consume the data arriving
* from the partitions of the input topic.
* <p>
* The provided {@link ProcessorSupplier} will be used to create a {@link ProcessorNode} that will
* receive all records forwarded from the {@link SourceNode}. This
* {@link ProcessorNode} should be used to keep the {@link StateStore} up-to-date.
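* <p>
* For example (a sketch; {@code storeSupplier} is assumed to be a {@link KeyValueStore} supplier with logging
* disabled, e.g. built via {@code org.apache.kafka.streams.state.Stores}, and {@code GlobalStoreUpdater} an
* illustrative {@link ProcessorSupplier} whose processor writes each record into the store):
* <pre>{@code
* builder.addGlobalStore(storeSupplier, "GlobalSource", null,
*                        new StringDeserializer(), new StringDeserializer(),
*                        "global-topic", "GlobalProcessor", GlobalStoreUpdater::new);
* }</pre>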
*
* @param storeSupplier user defined state store supplier
* @param sourceName name of the {@link SourceNode} that will be automatically added
* @param timestampExtractor the stateless timestamp extractor used for this source,
* if not specified the default extractor defined in the configs will be used
* @param keyDeserializer the {@link Deserializer} to deserialize keys with
* @param valueDeserializer the {@link Deserializer} to deserialize values with
* @param topic the topic to source the data from
* @param processorName the name of the {@link ProcessorNode} that maintains the store
* @param stateUpdateSupplier the instance of {@link ProcessorSupplier}
* @return this builder instance so methods can be chained together; never null
*/
public synchronized TopologyBuilder addGlobalStore(final StateStoreSupplier<KeyValueStore> storeSupplier,
final String sourceName,
final TimestampExtractor timestampExtractor,
final Deserializer keyDeserializer,
final Deserializer valueDeserializer,
final String topic,
final String processorName,
final ProcessorSupplier stateUpdateSupplier) {
Objects.requireNonNull(storeSupplier, "store supplier must not be null");
Objects.requireNonNull(sourceName, "sourceName must not be null");
Objects.requireNonNull(topic, "topic must not be null");
Objects.requireNonNull(stateUpdateSupplier, "supplier must not be null");
Objects.requireNonNull(processorName, "processorName must not be null");
if (nodeFactories.containsKey(sourceName)) {
throw new TopologyBuilderException("Processor " + sourceName + " is already added.");
}
if (nodeFactories.containsKey(processorName)) {
throw new TopologyBuilderException("Processor " + processorName + " is already added.");
}
if (stateFactories.containsKey(storeSupplier.name()) || globalStateStores.containsKey(storeSupplier.name())) {
throw new TopologyBuilderException("StateStore " + storeSupplier.name() + " is already added.");
}
if (storeSupplier.loggingEnabled()) {
throw new TopologyBuilderException("StateStore " + storeSupplier.name() + " for global table must not have logging enabled.");
}
if (sourceName.equals(processorName)) {
throw new TopologyBuilderException("sourceName and processorName must be different.");
}
validateTopicNotAlreadyRegistered(topic);
globalTopics.add(topic);
final String[] topics = {topic};
nodeFactories.put(sourceName, new SourceNodeFactory(sourceName, topics, null, timestampExtractor, keyDeserializer, valueDeserializer));
nodeToSourceTopics.put(sourceName, Arrays.asList(topics));
nodeGrouper.add(sourceName);
final String[] parents = {sourceName};
final ProcessorNodeFactory nodeFactory = new ProcessorNodeFactory(processorName, parents, stateUpdateSupplier);
nodeFactory.addStateStore(storeSupplier.name());
nodeFactories.put(processorName, nodeFactory);
nodeGrouper.add(processorName);
nodeGrouper.unite(processorName, parents);
globalStateStores.put(storeSupplier.name(), storeSupplier.get());
connectSourceStoreAndTopic(storeSupplier.name(), topic);
return this;
}
private void validateTopicNotAlreadyRegistered(final String topic) {
if (sourceTopicNames.contains(topic) || globalTopics.contains(topic)) {
throw new TopologyBuilderException("Topic " + topic + " has already been registered by another source.");
}
for (Pattern pattern : nodeToSourcePatterns.values()) {
if (pattern.matcher(topic).matches()) {
throw new TopologyBuilderException("Topic " + topic + " matches a Pattern already registered by another source.");
}
}
}
/**
* Add a new source that consumes from topics matching the given pattern
* and forwards the records to child processor and/or sink nodes.
* The source will use the specified key and value deserializers. The provided
* deserializers will be used for all matched topics, so care should be taken to specify a pattern that
* matches only topics sharing the same key-value data format.
* The default {@link TimestampExtractor} as specified in the {@link StreamsConfig config} is used.
*
* @param name the unique name of the source used to reference this node when
* {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}
* @param keyDeserializer key deserializer used to read this source, if not specified the default
* key deserializer defined in the configs will be used
* @param valDeserializer value deserializer used to read this source,
* if not specified the default value deserializer defined in the configs will be used
* @param topicPattern regular expression pattern to match Kafka topics that this source is to consume
* @return this builder instance so methods can be chained together; never null
* @throws TopologyBuilderException if processor is already added or if topics have already been registered by name
*/
public synchronized final TopologyBuilder addSource(final String name, final Deserializer keyDeserializer, final Deserializer valDeserializer, final Pattern topicPattern) {
return addSource(null, name, null, keyDeserializer, valDeserializer, topicPattern);
}
/**
* Add a new source that consumes from topics matching the given pattern
* and forwards the records to child processor and/or sink nodes.
* The source will use the specified key and value deserializers. The provided
* deserializers will be used for all matched topics, so care should be taken to specify a pattern that
* matches only topics sharing the same key-value data format.
*
* @param offsetReset the auto offset reset policy to use for this stream if no committed offsets are found;
* acceptable values are earliest or latest
* @param name the unique name of the source used to reference this node when
* {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}.
* @param timestampExtractor the stateless timestamp extractor used for this source,
* if not specified the default extractor defined in the configs will be used
* @param keyDeserializer key deserializer used to read this source, if not specified the default
* key deserializer defined in the configs will be used
* @param valDeserializer value deserializer used to read this source,
* if not specified the default value deserializer defined in the configs will be used
* @param topicPattern regular expression pattern to match Kafka topics that this source is to consume
* @return this builder instance so methods can be chained together; never null
* @throws TopologyBuilderException if processor is already added or if topics have already been registered by name
*/
public synchronized final TopologyBuilder addSource(final AutoOffsetReset offsetReset,
final String name,
final TimestampExtractor timestampExtractor,
final Deserializer keyDeserializer,
final Deserializer valDeserializer,
final Pattern topicPattern) {
Objects.requireNonNull(topicPattern, "topicPattern can't be null");
Objects.requireNonNull(name, "name can't be null");
if (nodeFactories.containsKey(name)) {
throw new TopologyBuilderException("Processor " + name + " is already added.");
}
for (String sourceTopicName : sourceTopicNames) {
if (topicPattern.matcher(sourceTopicName).matches()) {
throw new TopologyBuilderException("Pattern " + topicPattern + " will match a topic that has already been registered by another source.");
}
}
maybeAddToResetList(earliestResetPatterns, latestResetPatterns, offsetReset, topicPattern);
nodeFactories.put(name, new SourceNodeFactory(name, null, topicPattern, timestampExtractor, keyDeserializer, valDeserializer));
nodeToSourcePatterns.put(name, topicPattern);
nodeGrouper.add(name);
return this;
}
/**
* Add a new source that consumes from topics matching the given pattern
* and forwards the records to child processor and/or sink nodes.
* The source will use the specified key and value deserializers. The provided
* deserializers will be used for all matched topics, so care should be taken to specify a pattern that
* matches only topics sharing the same key-value data format.
*
* @param offsetReset the auto offset reset policy to use for this stream if no committed offsets are found;
* acceptable values are earliest or latest
* @param name the unique name of the source used to reference this node when
* {@link #addProcessor(String, ProcessorSupplier, String...) adding processor children}
* @param keyDeserializer key deserializer used to read this source, if not specified the default
* key deserializer defined in the configs will be used
* @param valDeserializer value deserializer used to read this source,
* if not specified the default value deserializer defined in the configs will be used
* @param topicPattern regular expression pattern to match Kafka topics that this source is to consume
* @return this builder instance so methods can be chained together; never null
* @throws TopologyBuilderException if processor is already added or if topics have already been registered by name
*/
public synchronized final TopologyBuilder addSource(final AutoOffsetReset offsetReset,
final String name,
final Deserializer keyDeserializer,
final Deserializer valDeserializer,
final Pattern topicPattern) {
return addSource(offsetReset, name, null, keyDeserializer, valDeserializer, topicPattern);
}
/**
* Add a new sink that forwards records from upstream parent processor and/or source nodes to the named Kafka topic.
* The sink will use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key serializer} and
* {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value serializer} specified in the
* {@link org.apache.kafka.streams.StreamsConfig stream configuration}.
*
* @param name the unique name of the sink
* @param topic the name of the Kafka topic to which this sink should write its records
* @param parentNames the name of one or more source or processor nodes whose output records this sink should consume
* and write to its topic
* @return this builder instance so methods can be chained together; never null
* @see #addSink(String, String, StreamPartitioner, String...)
* @see #addSink(String, String, Serializer, Serializer, String...)
* @see #addSink(String, String, Serializer, Serializer, StreamPartitioner, String...)
*/
public synchronized final TopologyBuilder addSink(final String name, final String topic, final String... parentNames) {
return addSink(name, topic, null, null, parentNames);
}
/**
* Add a new sink that forwards records from upstream parent processor and/or source nodes to the named Kafka topic, using
* the supplied partitioner.
* The sink will use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key serializer} and
* {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value serializer} specified in the
* {@link org.apache.kafka.streams.StreamsConfig stream configuration}.
* <p>
* The sink will also use the specified {@link StreamPartitioner} to determine how records are distributed among
* the named Kafka topic's partitions. Such control is often useful with topologies that use
* {@link #addStateStore(StateStoreSupplier, String...) state stores}
* in their processors. In most other cases, however, a partitioner need not be specified and Kafka will automatically distribute
* records among partitions using Kafka's default partitioning logic.
*
* @param name the unique name of the sink
* @param topic the name of the Kafka topic to which this sink should write its records
* @param partitioner the function that should be used to determine the partition for each record processed by the sink
* @param parentNames the name of one or more source or processor nodes whose output records this sink should consume
* and write to its topic
* @return this builder instance so methods can be chained together; never null
* @see #addSink(String, String, String...)
* @see #addSink(String, String, Serializer, Serializer, String...)
* @see #addSink(String, String, Serializer, Serializer, StreamPartitioner, String...)
*/
public synchronized final TopologyBuilder addSink(final String name, final String topic, final StreamPartitioner partitioner, final String... parentNames) {
return addSink(name, topic, null, null, partitioner, parentNames);
}
/**
* Add a new sink that forwards records from upstream parent processor and/or source nodes to the named Kafka topic.
* The sink will use the specified key and value serializers.
*
* @param name the unique name of the sink
* @param topic the name of the Kafka topic to which this sink should write its records
* @param keySerializer the {@link Serializer key serializer} used when writing records to the topic; may be null if the sink
* should use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key serializer} specified in the
* {@link org.apache.kafka.streams.StreamsConfig stream configuration}
* @param valSerializer the {@link Serializer value serializer} used when writing records to the topic; may be null if the sink
* should use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value serializer} specified in the
* {@link org.apache.kafka.streams.StreamsConfig stream configuration}
* @param parentNames the name of one or more source or processor nodes whose output records this sink should consume
* and write to its topic
* @return this builder instance so methods can be chained together; never null
* @see #addSink(String, String, String...)
* @see #addSink(String, String, StreamPartitioner, String...)
* @see #addSink(String, String, Serializer, Serializer, StreamPartitioner, String...)
*/
public synchronized final TopologyBuilder addSink(final String name, final String topic, final Serializer keySerializer, final Serializer valSerializer, final String... parentNames) {
return addSink(name, topic, keySerializer, valSerializer, null, parentNames);
}
/**
* Add a new sink that forwards records from upstream parent processor and/or source nodes to the named Kafka topic.
* The sink will use the specified key and value serializers, and the supplied partitioner.
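* <p>
* For example (a sketch; the names are illustrative, and the lambda is a simple key-hash partitioner):
* <pre>{@code
* builder.addSink("Sink", "output-topic",
*                 new StringSerializer(), new StringSerializer(),
*                 (key, value, numPartitions) -> (key.hashCode() & 0x7fffffff) % numPartitions,
*                 "Process");
* }</pre>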
*
* @param name the unique name of the sink
* @param topic the name of the Kafka topic to which this sink should write its records
* @param keySerializer the {@link Serializer key serializer} used when writing records to the topic; may be null if the sink
* should use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_KEY_SERDE_CLASS_CONFIG default key serializer} specified in the
* {@link org.apache.kafka.streams.StreamsConfig stream configuration}
* @param valSerializer the {@link Serializer value serializer} used when writing records to the topic; may be null if the sink
* should use the {@link org.apache.kafka.streams.StreamsConfig#DEFAULT_VALUE_SERDE_CLASS_CONFIG default value serializer} specified in the
* {@link org.apache.kafka.streams.StreamsConfig stream configuration}
* @param partitioner the function that should be used to determine the partition for each record processed by the sink
* @param parentNames the name of one or more source or processor nodes whose output records this sink should consume
* and write to its topic
* @return this builder instance so methods can be chained together; never null
* @see #addSink(String, String, String...)
* @see #addSink(String, String, StreamPartitioner, String...)
* @see #addSink(String, String, Serializer, Serializer, String...)
* @throws TopologyBuilderException if parent processor is not added yet, or if this processor's name is equal to the parent's name
*/
public synchronized final <K, V> TopologyBuilder addSink(final String name, final String topic, final Serializer<K> keySerializer, final Serializer<V> valSerializer, final StreamPartitioner<? super K, ? super V> partitioner, final String... parentNames) {
Objects.requireNonNull(name, "name must not be null");
Objects.requireNonNull(topic, "topic must not be null");
if (nodeFactories.containsKey(name))
throw new TopologyBuilderException("Processor " + name + " is already added.");
for (final String parent : parentNames) {
if (parent.equals(name)) {
throw new TopologyBuilderException("Processor " + name + " cannot be a parent of itself.");
}
if (!nodeFactories.containsKey(parent)) {
throw new TopologyBuilderException("Parent processor " + parent + " is not added yet.");
}
}
nodeFactories.put(name, new SinkNodeFactory<>(name, parentNames, topic, keySerializer, valSerializer, partitioner));
nodeToSinkTopic.put(name, topic);
nodeGrouper.add(name);
nodeGrouper.unite(name, parentNames);
return this;
}
/**
* Add a new processor node that receives and processes records output by one or more parent source or processor nodes.
* Any new record output by this processor will be forwarded to its child processor or sink nodes.
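* <p>
* For example (a sketch; {@code MyProcessor} is an illustrative {@link Processor} implementation and
* "Source" an already-added source node):
* <pre>{@code
* builder.addProcessor("Process", MyProcessor::new, "Source");
* }</pre>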
* @param name the unique name of the processor node
* @param supplier the supplier used to obtain this node's {@link Processor} instance
* @param parentNames the name of one or more source or processor nodes whose output records this processor should receive
* and process
* @return this builder instance so methods can be chained together; never null
* @throws TopologyBuilderException if parent processor is not added yet, or if this processor's name is equal to the parent's name
*/
public synchronized final TopologyBuilder addProcessor(final String name, final ProcessorSupplier supplier, final String... parentNames) {
Objects.requireNonNull(name, "name must not be null");
Objects.requireNonNull(supplier, "supplier must not be null");
if (nodeFactories.containsKey(name))
throw new TopologyBuilderException("Processor " + name + " is already added.");
for (final String parent : parentNames) {
if (parent.equals(name)) {
throw new TopologyBuilderException("Processor " + name + " cannot be a parent of itself.");
}
if (!nodeFactories.containsKey(parent)) {
throw new TopologyBuilderException("Parent processor " + parent + " is not added yet.");
}
}
nodeFactories.put(name, new ProcessorNodeFactory(name, parentNames, supplier));
nodeGrouper.add(name);
nodeGrouper.unite(name, parentNames);
return this;
}
/**
* Adds a state store and optionally connects it to the named processors.
*
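* For example (a sketch; {@code storeSupplier} is assumed to come from
* {@code org.apache.kafka.streams.state.Stores} and "Process" to be an already-added processor):
* <pre>{@code
* builder.addStateStore(storeSupplier, "Process");
* }</pre>
*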
* @param supplier the supplier used to obtain this state store's {@link StateStore} instance
* @param processorNames the names of the processors that should be able to access the provided store
* @return this builder instance so methods can be chained together; never null
* @throws TopologyBuilderException if state store supplier is already added
*/
public synchronized final TopologyBuilder addStateStore(final StateStoreSupplier supplier, final String... processorNames) {
Objects.requireNonNull(supplier, "supplier can't be null");
if (stateFactories.containsKey(supplier.name())) {
throw new TopologyBuilderException("StateStore " + supplier.name() + " is already added.");
}
stateFactories.put(supplier.name(), new StateStoreFactory(supplier));
if (processorNames != null) {
for (String processorName : processorNames) {
connectProcessorAndStateStore(processorName, supplier.name());
}
}
return this;
}
/**
* Connects the processor and the state stores
*
* @param processorName the name of the processor
* @param stateStoreNames the names of state stores that the processor uses
* @return this builder instance so methods can be chained together; never null
*/
public synchronized final TopologyBuilder connectProcessorAndStateStores(final String processorName, final String... stateStoreNames) {
Objects.requireNonNull(processorName, "processorName can't be null");
if (stateStoreNames != null) {
for (String stateStoreName : stateStoreNames) {
connectProcessorAndStateStore(processorName, stateStoreName);
}
}
return this;
}
/**
* This is used only for KStreamBuilder: when adding a KTable from a source topic,
* we need to add the topic as the KTable's materialized state store's changelog.
*/
protected synchronized final TopologyBuilder connectSourceStoreAndTopic(final String sourceStoreName, final String topic) {
if (storeToChangelogTopic.containsKey(sourceStoreName)) {
throw new TopologyBuilderException("Source store " + sourceStoreName + " is already added.");
}
storeToChangelogTopic.put(sourceStoreName, topic);
return this;
}
/**
* Connects a list of processors.
*
* NOTE this function is not needed by developers working with the processor APIs; it is only used
* internally by the high-level DSL parsing functionality.
*
* @param processorNames the name of the processors
* @return this builder instance so methods can be chained together; never null
* @throws TopologyBuilderException if less than two processors are specified, or if one of the processors is not added yet
*/
public synchronized final TopologyBuilder connectProcessors(final String... processorNames) {
if (processorNames.length < 2)
throw new TopologyBuilderException("At least two processors need to participate in the connection.");
for (String processorName : processorNames) {
if (!nodeFactories.containsKey(processorName))
throw new TopologyBuilderException("Processor " + processorName + " is not added yet.");
}
String firstProcessorName = processorNames[0];
nodeGrouper.unite(firstProcessorName, Arrays.copyOfRange(processorNames, 1, processorNames.length));
return this;
}
/**
* Adds an internal topic
*
* @param topicName the name of the topic
* @return this builder instance so methods can be chained together; never null
*/
public synchronized final TopologyBuilder addInternalTopic(final String topicName) {
Objects.requireNonNull(topicName, "topicName can't be null");
this.internalTopicNames.add(topicName);
return this;
}
/**
* Asserts that the streams of the specified source nodes must be copartitioned.
*
* @param sourceNodes a set of source node names
* @return this builder instance so methods can be chained together; never null
*/
public synchronized final TopologyBuilder copartitionSources(final Collection<String> sourceNodes) {
copartitionSourceGroups.add(Collections.unmodifiableSet(new HashSet<>(sourceNodes)));
return this;
}
private void connectProcessorAndStateStore(final String processorName, final String stateStoreName) {
if (!stateFactories.containsKey(stateStoreName))
throw new TopologyBuilderException("StateStore " + stateStoreName + " is not added yet.");
if (!nodeFactories.containsKey(processorName))
throw new TopologyBuilderException("Processor " + processorName + " is not added yet.");
final StateStoreFactory stateStoreFactory = stateFactories.get(stateStoreName);
final Iterator<String> iter = stateStoreFactory.users.iterator();
if (iter.hasNext()) {
final String user = iter.next();
nodeGrouper.unite(user, processorName);
}
stateStoreFactory.users.add(processorName);
NodeFactory nodeFactory = nodeFactories.get(processorName);
if (nodeFactory instanceof ProcessorNodeFactory) {
final ProcessorNodeFactory processorNodeFactory = (ProcessorNodeFactory) nodeFactory;
processorNodeFactory.addStateStore(stateStoreName);
connectStateStoreNameToSourceTopicsOrPattern(stateStoreName, processorNodeFactory);
} else {
throw new TopologyBuilderException("cannot connect a state store " + stateStoreName + " to a source node or a sink node.");
}
}
private Set<SourceNodeFactory> findSourcesForProcessorParents(final String[] parents) {
final Set<SourceNodeFactory> sourceNodes = new HashSet<>();
for (String parent : parents) {
final NodeFactory nodeFactory = nodeFactories.get(parent);
if (nodeFactory instanceof SourceNodeFactory) {
sourceNodes.add((SourceNodeFactory) nodeFactory);
} else if (nodeFactory instanceof ProcessorNodeFactory) {
sourceNodes.addAll(findSourcesForProcessorParents(((ProcessorNodeFactory) nodeFactory).parents));
}
}
return sourceNodes;
}
private void connectStateStoreNameToSourceTopicsOrPattern(final String stateStoreName,
final ProcessorNodeFactory processorNodeFactory) {
// we should never update the mapping from state store names to source topics if the store name already exists
// in the map; this scenario is possible, for example, that a state store underlying a source KTable is
// connecting to a join operator whose source topic is not the original KTable's source topic but an internal repartition topic.
if (stateStoreNameToSourceTopics.containsKey(stateStoreName) || stateStoreNameToSourceRegex.containsKey(stateStoreName)) {
return;
}
final Set<String> sourceTopics = new HashSet<>();
final Set<Pattern> sourcePatterns = new HashSet<>();
final Set<SourceNodeFactory> sourceNodesForParent = findSourcesForProcessorParents(processorNodeFactory.parents);
for (SourceNodeFactory sourceNodeFactory : sourceNodesForParent) {
if (sourceNodeFactory.pattern != null) {
sourcePatterns.add(sourceNodeFactory.pattern);
} else {
sourceTopics.addAll(sourceNodeFactory.topics);
}
}
if (!sourceTopics.isEmpty()) {
stateStoreNameToSourceTopics.put(stateStoreName,
Collections.unmodifiableSet(sourceTopics));
}
if (!sourcePatterns.isEmpty()) {
stateStoreNameToSourceRegex.put(stateStoreName,
Collections.unmodifiableSet(sourcePatterns));
}
}
private <T> void maybeAddToResetList(final Collection<T> earliestResets, final Collection<T> latestResets, final AutoOffsetReset offsetReset, final T item) {
if (offsetReset != null) {
switch (offsetReset) {
case EARLIEST:
earliestResets.add(item);
break;
case LATEST:
latestResets.add(item);
break;
default:
throw new TopologyBuilderException(String.format("Unrecognized reset policy %s", offsetReset));
}
}
}
/**
* Returns the map of node groups keyed by the topic group id.
*
* @return groups of node names
*/
public synchronized Map<Integer, Set<String>> nodeGroups() {
if (nodeGroups == null)
nodeGroups = makeNodeGroups();
return nodeGroups;
}
private Map<Integer, Set<String>> makeNodeGroups() {
final HashMap<Integer, Set<String>> nodeGroups = new LinkedHashMap<>();
final HashMap<String, Set<String>> rootToNodeGroup = new HashMap<>();
int nodeGroupId = 0;
// Go through source nodes first. This makes the group id assignment easy to predict in tests
final HashSet<String> allSourceNodes = new HashSet<>(nodeToSourceTopics.keySet());
allSourceNodes.addAll(nodeToSourcePatterns.keySet());
for (String nodeName : Utils.sorted(allSourceNodes)) {
final String root = nodeGrouper.root(nodeName);
Set<String> nodeGroup = rootToNodeGroup.get(root);
if (nodeGroup == null) {
nodeGroup = new HashSet<>();
rootToNodeGroup.put(root, nodeGroup);
nodeGroups.put(nodeGroupId++, nodeGroup);
}
nodeGroup.add(nodeName);
}
// Go through non-source nodes
for (String nodeName : Utils.sorted(nodeFactories.keySet())) {
if (!nodeToSourceTopics.containsKey(nodeName)) {
final String root = nodeGrouper.root(nodeName);
Set<String> nodeGroup = rootToNodeGroup.get(root);
if (nodeGroup == null) {
nodeGroup = new HashSet<>();
rootToNodeGroup.put(root, nodeGroup);
nodeGroups.put(nodeGroupId++, nodeGroup);
}
nodeGroup.add(nodeName);
}
}
return nodeGroups;
}
/**
* Build the topology for the specified topic group. This is called automatically when passing this builder into the
* {@link org.apache.kafka.streams.KafkaStreams#KafkaStreams(TopologyBuilder, org.apache.kafka.streams.StreamsConfig)} constructor.
*
* @see org.apache.kafka.streams.KafkaStreams#KafkaStreams(TopologyBuilder, org.apache.kafka.streams.StreamsConfig)
*/
public synchronized ProcessorTopology build(final Integer topicGroupId) {
Set<String> nodeGroup;
if (topicGroupId != null) {
nodeGroup = nodeGroups().get(topicGroupId);
} else {
// when topicGroupId is null, we build the full topology minus the global groups
final Set<String> globalNodeGroups = globalNodeGroups();
final Collection<Set<String>> values = nodeGroups().values();
nodeGroup = new HashSet<>();
for (Set<String> value : values) {
nodeGroup.addAll(value);
}
nodeGroup.removeAll(globalNodeGroups);
}
return build(nodeGroup);
}
/**
* Builds the topology for any global state stores
* @return the ProcessorTopology containing all global state stores, or null if there are none
*/
public synchronized ProcessorTopology buildGlobalStateTopology() {
final Set<String> globalGroups = globalNodeGroups();
if (globalGroups.isEmpty()) {
return null;
}
return build(globalGroups);
}
private Set<String> globalNodeGroups() {
final Set<String> globalGroups = new HashSet<>();
for (final Map.Entry<Integer, Set<String>> nodeGroup : nodeGroups().entrySet()) {
final Set<String> nodes = nodeGroup.getValue();
for (String node : nodes) {
final NodeFactory nodeFactory = nodeFactories.get(node);
if (nodeFactory instanceof SourceNodeFactory) {
final List<String> topics = ((SourceNodeFactory) nodeFactory).topics;
if (topics != null && topics.size() == 1 && globalTopics.contains(topics.get(0))) {
globalGroups.addAll(nodes);
}
}
}
}
return globalGroups;
}
private ProcessorTopology build(final Set<String> nodeGroup) {
final List<ProcessorNode> processorNodes = new ArrayList<>(nodeFactories.size());
final Map<String, ProcessorNode> processorMap = new HashMap<>();
final Map<String, SourceNode> topicSourceMap = new HashMap<>();
final Map<String, SinkNode> topicSinkMap = new HashMap<>();
final Map<String, StateStore> stateStoreMap = new LinkedHashMap<>();
// create processor nodes in a topological order ("nodeFactories" is already topologically sorted)
for (NodeFactory factory : nodeFactories.values()) {
if (nodeGroup == null || nodeGroup.contains(factory.name)) {
final ProcessorNode node = factory.build();
processorNodes.add(node);
processorMap.put(node.name(), node);
if (factory instanceof ProcessorNodeFactory) {
for (String parent : ((ProcessorNodeFactory) factory).parents) {
final ProcessorNode<?, ?> parentNode = processorMap.get(parent);
parentNode.addChild(node);
}
for (String stateStoreName : ((ProcessorNodeFactory) factory).stateStoreNames) {
if (!stateStoreMap.containsKey(stateStoreName)) {
StateStore stateStore;
if (stateFactories.containsKey(stateStoreName)) {
final StateStoreSupplier supplier = stateFactories.get(stateStoreName).supplier;
stateStore = supplier.get();
// remember the changelog topic if this state store is change-logging enabled
if (supplier.loggingEnabled() && !storeToChangelogTopic.containsKey(stateStoreName)) {
final String changelogTopic = ProcessorStateManager.storeChangelogTopic(this.applicationId, stateStoreName);
storeToChangelogTopic.put(stateStoreName, changelogTopic);
}
} else {
stateStore = globalStateStores.get(stateStoreName);
}
stateStoreMap.put(stateStoreName, stateStore);
}
}
} else if (factory instanceof SourceNodeFactory) {
final SourceNodeFactory sourceNodeFactory = (SourceNodeFactory) factory;
final List<String> topics = (sourceNodeFactory.pattern != null) ?
sourceNodeFactory.getTopics(subscriptionUpdates.getUpdates()) :
sourceNodeFactory.topics;
for (String topic : topics) {
if (internalTopicNames.contains(topic)) {
// prefix the internal topic name with the application id
topicSourceMap.put(decorateTopic(topic), (SourceNode) node);
} else {
topicSourceMap.put(topic, (SourceNode) node);
}
}
} else if (factory instanceof SinkNodeFactory) {
final SinkNodeFactory sinkNodeFactory = (SinkNodeFactory) factory;
for (String parent : sinkNodeFactory.parents) {
processorMap.get(parent).addChild(node);
if (internalTopicNames.contains(sinkNodeFactory.topic)) {
// prefix the internal topic name with the application id
topicSinkMap.put(decorateTopic(sinkNodeFactory.topic), (SinkNode) node);
} else {
topicSinkMap.put(sinkNodeFactory.topic, (SinkNode) node);
}
}
} else {
throw new TopologyBuilderException("Unknown definition class: " + factory.getClass().getName());
}
}
}
return new ProcessorTopology(processorNodes, topicSourceMap, topicSinkMap, new ArrayList<>(stateStoreMap.values()), storeToChangelogTopic, new ArrayList<>(globalStateStores.values()));
}
/**
     * Get any global {@link StateStore}s that are part of the topology.
     * @return an unmodifiable map of all global {@link StateStore}s, keyed by store name
*/
public Map<String, StateStore> globalStateStores() {
return Collections.unmodifiableMap(globalStateStores);
}
/**
* Returns the map of topic groups keyed by the group id.
* A topic group is a group of topics in the same task.
*
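     * <p>
     * A sketch of consuming the result; the comments describe the four collections each
     * {@link TopicsInfo} carries, in the order they are passed to its constructor below:
     * <pre>{@code
     * for (Map.Entry<Integer, TopicsInfo> group : builder.topicGroups().entrySet()) {
     *     // each TopicsInfo holds the group's sink topics, source topics,
     *     // internal source topic configs, and state changelog topic configs
     * }
     * }</pre>
     *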
     * @return the topic groups, as {@link TopicsInfo} keyed by group id
*/
public synchronized Map<Integer, TopicsInfo> topicGroups() {
final Map<Integer, TopicsInfo> topicGroups = new LinkedHashMap<>();
if (nodeGroups == null)
nodeGroups = makeNodeGroups();
for (Map.Entry<Integer, Set<String>> entry : nodeGroups.entrySet()) {
final Set<String> sinkTopics = new HashSet<>();
final Set<String> sourceTopics = new HashSet<>();
final Map<String, InternalTopicConfig> internalSourceTopics = new HashMap<>();
final Map<String, InternalTopicConfig> stateChangelogTopics = new HashMap<>();
for (String node : entry.getValue()) {
// if the node is a source node, add to the source topics
final List<String> topics = nodeToSourceTopics.get(node);
if (topics != null) {
// if some of the topics are internal, add them to the internal topics
for (String topic : topics) {
                        // skip global topics, as they don't need partition assignment
if (globalTopics.contains(topic)) {
continue;
}
if (this.internalTopicNames.contains(topic)) {
// prefix the internal topic name with the application id
final String internalTopic = decorateTopic(topic);
internalSourceTopics.put(internalTopic, new InternalTopicConfig(internalTopic,
Collections.singleton(InternalTopicConfig.CleanupPolicy.delete),
Collections.<String, String>emptyMap()));
sourceTopics.add(internalTopic);
} else {
sourceTopics.add(topic);
}
}
}
// if the node is a sink node, add to the sink topics
final String topic = nodeToSinkTopic.get(node);
if (topic != null) {
if (internalTopicNames.contains(topic)) {
                        // prefix the internal sink topic name with the application id
sinkTopics.add(decorateTopic(topic));
} else {
sinkTopics.add(topic);
}
}
                // if the node is connected to a state store, add the store's changelog topic to the state changelog topics
for (StateStoreFactory stateFactory : stateFactories.values()) {
final StateStoreSupplier supplier = stateFactory.supplier;
if (supplier.loggingEnabled() && stateFactory.users.contains(node)) {
final String name = ProcessorStateManager.storeChangelogTopic(applicationId, supplier.name());
final InternalTopicConfig internalTopicConfig = createInternalTopicConfig(supplier, name);
stateChangelogTopics.put(name, internalTopicConfig);
}
}
}
if (!sourceTopics.isEmpty()) {
topicGroups.put(entry.getKey(), new TopicsInfo(
Collections.unmodifiableSet(sinkTopics),
Collections.unmodifiableSet(sourceTopics),
Collections.unmodifiableMap(internalSourceTopics),
Collections.unmodifiableMap(stateChangelogTopics)));
}
}
return Collections.unmodifiableMap(topicGroups);
}
private void setRegexMatchedTopicsToSourceNodes() {
if (subscriptionUpdates.hasUpdates()) {
for (Map.Entry<String, Pattern> stringPatternEntry : nodeToSourcePatterns.entrySet()) {
final SourceNodeFactory sourceNode = (SourceNodeFactory) nodeFactories.get(stringPatternEntry.getKey());
                // need to update nodeToSourceTopics with the topics matched by the given regex
nodeToSourceTopics.put(stringPatternEntry.getKey(), sourceNode.getTopics(subscriptionUpdates.getUpdates()));
log.debug("nodeToSourceTopics {}", nodeToSourceTopics);
}
}
}
private void setRegexMatchedTopicToStateStore() {
if (subscriptionUpdates.hasUpdates()) {
for (Map.Entry<String, Set<Pattern>> storePattern : stateStoreNameToSourceRegex.entrySet()) {
final Set<String> updatedTopicsForStateStore = new HashSet<>();
for (String subscriptionUpdateTopic : subscriptionUpdates.getUpdates()) {
for (Pattern pattern : storePattern.getValue()) {
if (pattern.matcher(subscriptionUpdateTopic).matches()) {
updatedTopicsForStateStore.add(subscriptionUpdateTopic);
}
}
}
if (!updatedTopicsForStateStore.isEmpty()) {
Collection<String> storeTopics = stateStoreNameToSourceTopics.get(storePattern.getKey());
if (storeTopics != null) {
updatedTopicsForStateStore.addAll(storeTopics);
}
stateStoreNameToSourceTopics.put(storePattern.getKey(), Collections.unmodifiableSet(updatedTopicsForStateStore));
}
}
}
}
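    // Changelog topics for key-value stores are compacted. Changelog topics for window
    // stores additionally get a delete cleanup policy, with retention set to the store's
    // retention period, so records of expired windows can eventually be purged.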
private InternalTopicConfig createInternalTopicConfig(final StateStoreSupplier<?> supplier, final String name) {
if (!(supplier instanceof WindowStoreSupplier)) {
return new InternalTopicConfig(name, Collections.singleton(InternalTopicConfig.CleanupPolicy.compact), supplier.logConfig());
}
final WindowStoreSupplier windowStoreSupplier = (WindowStoreSupplier) supplier;
final InternalTopicConfig config = new InternalTopicConfig(name,
Utils.mkSet(InternalTopicConfig.CleanupPolicy.compact,
InternalTopicConfig.CleanupPolicy.delete),
supplier.logConfig());
config.setRetentionMs(windowStoreSupplier.retentionPeriod());
return config;
}
/**
     * Get the Pattern that matches all topics that need to start reading from the earliest available offset.
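     * <p>
     * A sketch, assuming the source was registered through one of the
     * {@code AutoOffsetReset.EARLIEST} overloads of {@code addSource} in this class
     * (topic names are illustrative):
     * <pre>{@code
     * builder.addSource(AutoOffsetReset.EARLIEST, "source", "topic-a", "topic-b");
     * Pattern p = builder.earliestResetTopicsPattern(); // matches topic-a and topic-b
     * }</pre>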
     * @return the Pattern matching all topics that read from the earliest offset; never null
*/
public synchronized Pattern earliestResetTopicsPattern() {
final List<String> topics = maybeDecorateInternalSourceTopics(earliestResetTopics);
final Pattern earliestPattern = buildPatternForOffsetResetTopics(topics, earliestResetPatterns);
ensureNoRegexOverlap(earliestPattern, latestResetPatterns, latestResetTopics);
return earliestPattern;
}
/**
     * Get the Pattern that matches all topics that need to start reading from the latest available offset.
     * @return the Pattern matching all topics that read from the latest offset; never null
*/
public synchronized Pattern latestResetTopicsPattern() {
final List<String> topics = maybeDecorateInternalSourceTopics(latestResetTopics);
final Pattern latestPattern = buildPatternForOffsetResetTopics(topics, latestResetPatterns);
ensureNoRegexOverlap(latestPattern, earliestResetPatterns, earliestResetTopics);
return latestPattern;
}
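    // A topic may carry only one reset policy: fail fast if the pattern built for one
    // policy textually contains a pattern registered for the other policy, or matches a
    // topic name registered with the other policy.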
private void ensureNoRegexOverlap(final Pattern builtPattern, final Set<Pattern> otherPatterns, final Set<String> otherTopics) {
for (Pattern otherPattern : otherPatterns) {
if (builtPattern.pattern().contains(otherPattern.pattern())) {
throw new TopologyBuilderException(String.format("Found overlapping regex [%s] against [%s] for a KStream with auto offset resets", otherPattern.pattern(), builtPattern.pattern()));
}
}
for (String otherTopic : otherTopics) {
if (builtPattern.matcher(otherTopic).matches()) {
throw new TopologyBuilderException(String.format("Found overlapping regex [%s] matching topic [%s] for a KStream with auto offset resets", builtPattern.pattern(), otherTopic));
}
}
}
/**
     * Builds a composite pattern out of topic names and Pattern objects for matching topic names. If both
     * provided collections are empty, a pattern compiled from the empty string is returned.
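     * <p>
     * For example, topic names {@code [foo, bar]} plus the pattern {@code baz-.*} compose to:
     * <pre>{@code
     * Pattern.compile("foo|bar|baz-.*");
     * }</pre>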
*
* @param sourceTopics the name of source topics to add to a composite pattern
* @param sourcePatterns Patterns for matching source topics to add to a composite pattern
* @return a Pattern that is composed of the literal source topic names and any Patterns for matching source topics
*/
private static synchronized Pattern buildPatternForOffsetResetTopics(final Collection<String> sourceTopics, final Collection<Pattern> sourcePatterns) {
final StringBuilder builder = new StringBuilder();
for (String topic : sourceTopics) {
builder.append(topic).append("|");
}
for (Pattern sourcePattern : sourcePatterns) {
builder.append(sourcePattern.pattern()).append("|");
}
if (builder.length() > 0) {
builder.setLength(builder.length() - 1);
return Pattern.compile(builder.toString());
}
return EMPTY_ZERO_LENGTH_PATTERN;
}
/**
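     * Returns a mapping from each state store name to the source topics it is built from,
     * decorating any internal topic names with the application id. A minimal sketch of the
     * result shape (store and topic names are illustrative):
     * <pre>{@code
     * // e.g. {"counts-store" -> ["clicks-topic"]}
     * Map<String, List<String>> mapping = builder.stateStoreNameToSourceTopics();
     * }</pre>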
     * @return a mapping from state store name to the list of its source topics
*/
public Map<String, List<String>> stateStoreNameToSourceTopics() {
final Map<String, List<String>> results = new HashMap<>();
for (Map.Entry<String, Set<String>> entry : stateStoreNameToSourceTopics.entrySet()) {
results.put(entry.getKey(), maybeDecorateInternalSourceTopics(entry.getValue()));
}
return results;
}
/**
* Returns the copartition groups.
* A copartition group is a group of source topics that are required to be copartitioned.
*
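     * <p>
     * Copartitioned topics must have the same number of partitions, e.g. when two streams
     * are joined. A sketch of consuming the result (the partition-count check itself is
     * external to this class):
     * <pre>{@code
     * for (Set<String> copartitioned : builder.copartitionGroups()) {
     *     // all topics in 'copartitioned' must have equal partition counts
     * }
     * }</pre>
     *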
* @return groups of topic names
*/
public synchronized Collection<Set<String>> copartitionGroups() {
final List<Set<String>> list = new ArrayList<>(copartitionSourceGroups.size());
for (Set<String> nodeNames : copartitionSourceGroups) {
Set<String> copartitionGroup = new HashSet<>();
for (String node : nodeNames) {
final List<String> topics = nodeToSourceTopics.get(node);
if (topics != null)
copartitionGroup.addAll(maybeDecorateInternalSourceTopics(topics));
}
list.add(Collections.unmodifiableSet(copartitionGroup));
}
return Collections.unmodifiableList(list);
}
private List<String> maybeDecorateInternalSourceTopics(final Collection<String> sourceTopics) {
final List<String> decoratedTopics = new ArrayList<>();
for (String topic : sourceTopics) {
if (internalTopicNames.contains(topic)) {
decoratedTopics.add(decorateTopic(topic));
} else {
decoratedTopics.add(topic);
}
}
return decoratedTopics;
}
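    // Prefix an internal topic name with the application id; e.g. with applicationId
    // "my-app", the internal topic "counts-repartition" becomes "my-app-counts-repartition".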
private String decorateTopic(final String topic) {
if (applicationId == null) {
throw new TopologyBuilderException("there are internal topics and "
+ "applicationId hasn't been set. Call "
+ "setApplicationId first");
}
return applicationId + "-" + topic;
}
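    /**
     * Returns a Pattern matching every source topic of this topology, combining literal
     * topic names (internal ones decorated with the application id) with any source regex
     * patterns. The pattern is built once and then cached.
     */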
public synchronized Pattern sourceTopicPattern() {
if (this.topicPattern == null) {
final List<String> allSourceTopics = new ArrayList<>();
if (!nodeToSourceTopics.isEmpty()) {
for (List<String> topics : nodeToSourceTopics.values()) {
allSourceTopics.addAll(maybeDecorateInternalSourceTopics(topics));
}
}
Collections.sort(allSourceTopics);
this.topicPattern = buildPatternForOffsetResetTopics(allSourceTopics, nodeToSourcePatterns.values());
}
return this.topicPattern;
}
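    /**
     * Supplies the topics that matched any regex subscriptions and refreshes the
     * regex-derived source-topic and state-store mappings accordingly.
     */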
public synchronized void updateSubscriptions(final SubscriptionUpdates subscriptionUpdates, final String threadId) {
log.debug("stream-thread [{}] updating builder with {} topic(s) with possible matching regex subscription(s)", threadId, subscriptionUpdates);
this.subscriptionUpdates = subscriptionUpdates;
setRegexMatchedTopicsToSourceNodes();
setRegexMatchedTopicToStateStore();
}
}