/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.sdk.io.gcp.pubsub;
import static com.google.common.base.Preconditions.checkState;
import com.google.auto.value.AutoValue;
import com.google.common.collect.ImmutableMap;
import com.google.protobuf.Message;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
import org.apache.beam.sdk.PipelineRunner;
import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.CoderException;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.VoidCoder;
import org.apache.beam.sdk.extensions.protobuf.ProtoCoder;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.OutgoingMessage;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.ProjectPath;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.SubscriptionPath;
import org.apache.beam.sdk.io.gcp.pubsub.PubsubClient.TopicPath;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider;
import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.transforms.windowing.AfterWatermark;
import org.apache.beam.sdk.util.CoderUtils;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PDone;
import org.joda.time.Instant;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Read and Write {@link PTransform}s for Cloud Pub/Sub streams. These transforms create
* and consume unbounded {@link PCollection PCollections}.
*
* <h3>Using local emulator</h3>
*
* <p>To run against a local Pub/Sub emulator, call
* {@code PubsubOptions#setPubsubRootUrl(String)} with the host and port of the
* emulator.
*
* <h3>Permissions</h3>
*
* <p>Permission requirements depend on the {@link PipelineRunner} that is used to execute the
* Beam pipeline. Please refer to the documentation of corresponding
* {@link PipelineRunner PipelineRunners} for more details.
*/
public class PubsubIO {
/** Logger for this class; the path parsers use it to warn about legacy v1beta1 paths. */
private static final Logger LOG = LoggerFactory.getLogger(PubsubIO.class);
/** Factory for creating pubsub client to manage transport. */
private static final PubsubClient.PubsubClientFactory FACTORY = PubsubJsonClient.FACTORY;
/**
 * Project IDs must contain 6-63 lowercase letters, digits, or dashes.
 * IDs must start with a letter and may not end with a dash.
 * This regex isn't exact - it also admits ':' and '.' in the middle of the ID and so allows
 * patterns that would be rejected by the service, but it is sufficient for basic parsing of
 * topic and subscription paths.
 */
private static final Pattern PROJECT_ID_REGEXP =
Pattern.compile("[a-z][-a-z0-9:.]{4,61}[a-z0-9]");
/** Current subscription path format; group 1 is the project, group 2 the subscription name. */
private static final Pattern SUBSCRIPTION_REGEXP =
Pattern.compile("projects/([^/]+)/subscriptions/(.+)");
/** Current topic path format; group 1 is the project, group 2 the topic name. */
private static final Pattern TOPIC_REGEXP = Pattern.compile("projects/([^/]+)/topics/(.+)");
/** Legacy v1beta1 subscription path format; group 1 is the project, group 2 the name. */
private static final Pattern V1BETA1_SUBSCRIPTION_REGEXP =
Pattern.compile("/subscriptions/([^/]+)/(.+)");
/** Legacy v1beta1 topic path format; group 1 is the project, group 2 the name. */
private static final Pattern V1BETA1_TOPIC_REGEXP = Pattern.compile("/topics/([^/]+)/(.+)");
/**
 * Shape of a topic or subscription name: a letter followed by one or more letters, digits or
 * any of the characters {@code - . _ ~ % +}.
 */
private static final Pattern PUBSUB_NAME_REGEXP = Pattern.compile("[a-zA-Z][-._~%+a-zA-Z0-9]+");
/** Minimum accepted length of a topic or subscription name. */
private static final int PUBSUB_NAME_MIN_LENGTH = 3;
/** Maximum accepted length of a topic or subscription name. */
private static final int PUBSUB_NAME_MAX_LENGTH = 255;
/** Subscription paths with this prefix are treated as fake subscriptions and are not validated. */
private static final String SUBSCRIPTION_RANDOM_TEST_PREFIX = "_random/";
/** Subscription paths with this prefix are treated as fake subscriptions and are not validated. */
private static final String SUBSCRIPTION_STARTING_SIGNAL = "_starting_signal/";
/** A topic path equal to this value is treated as a fake topic and is not validated. */
private static final String TOPIC_DEV_NULL_TEST_NAME = "/topics/dev/null";
/**
 * Checks that {@code project} matches {@code PROJECT_ID_REGEXP} in its entirety.
 *
 * <p>Both the topic and the subscription path parsers call this method; the error message
 * mentions a subscription in either case.
 *
 * @param project the project id extracted from a topic or subscription path
 * @throws IllegalArgumentException if the id does not match the pattern
 */
private static void validateProjectName(String project) {
  if (PROJECT_ID_REGEXP.matcher(project).matches()) {
    return;
  }
  throw new IllegalArgumentException(
      "Illegal project name specified in Pubsub subscription: " + project);
}
/**
 * Verifies that {@code name} is an acceptable topic or subscription name.
 *
 * <p>The checks run in a fixed order and only the first failure is reported: minimum length,
 * maximum length, the reserved {@code goog} prefix, and finally a full match against
 * {@code PUBSUB_NAME_REGEXP}.
 *
 * @param name the topic or subscription name extracted from a path
 * @throws IllegalArgumentException if any of the checks fails
 */
private static void validatePubsubName(String name) {
  final int length = name.length();
  final String complaint;
  if (length < PUBSUB_NAME_MIN_LENGTH) {
    complaint = "Pubsub object name is shorter than 3 characters: " + name;
  } else if (length > PUBSUB_NAME_MAX_LENGTH) {
    complaint = "Pubsub object name is longer than 255 characters: " + name;
  } else if (name.startsWith("goog")) {
    complaint = "Pubsub object name cannot start with goog: " + name;
  } else if (!PUBSUB_NAME_REGEXP.matcher(name).matches()) {
    complaint = "Illegal Pubsub object name specified: " + name
        + " Please see Javadoc for naming rules.";
  } else {
    // Every rule is satisfied.
    return;
  }
  throw new IllegalArgumentException(complaint);
}
/**
 * Registers the {@link DisplayData} items that the Pubsub source and sink have in common.
 *
 * <p>The two attribute items are added only when their value is non-null. The topic item is
 * added only when {@code topic} is non-null; it shows the topic path when the provider is
 * accessible and the provider's string form otherwise.
 */
private static void populateCommonDisplayData(DisplayData.Builder builder,
    String timestampAttribute, String idAttribute, ValueProvider<PubsubTopic> topic) {
  builder
      .addIfNotNull(
          DisplayData.item("timestampAttribute", timestampAttribute)
              .withLabel("Timestamp Attribute"))
      .addIfNotNull(
          DisplayData.item("idAttribute", idAttribute)
              .withLabel("ID Attribute"));
  if (topic == null) {
    return;
  }
  final String topicString;
  if (topic.isAccessible()) {
    topicString = topic.get().asPath();
  } else {
    topicString = topic.toString();
  }
  builder.add(DisplayData.item("topic", topicString).withLabel("Pubsub Topic"));
}
/**
 * A parsed Cloud Pub/Sub subscription: a project plus a subscription name, or a fake
 * subscription whose raw path is kept verbatim.
 */
public static class PubsubSubscription implements Serializable {
  private enum Type {NORMAL, FAKE}

  private final Type type;
  private final String project;
  private final String subscription;

  private PubsubSubscription(Type type, String project, String subscription) {
    this.type = type;
    this.project = project;
    this.subscription = subscription;
  }

  /**
   * Parses a subscription path.
   *
   * <p>The expected form is {@code projects/<project>/subscriptions/<subscription>}. The legacy
   * form {@code /subscriptions/<project>/<subscription>} is also accepted, but a warning is
   * logged. Paths starting with {@code _random/} or {@code _starting_signal/} produce a fake
   * subscription with an empty project and are not validated.
   *
   * <p>For all other paths the {@code <subscription>} component must:
   *
   * <ul>
   *   <li>be between 3 and 255 characters long;</li>
   *   <li>begin with a letter;</li>
   *   <li>otherwise contain only letters, digits and the characters
   *       {@code - . _ ~ % +};</li>
   *   <li>not begin with the {@code goog} prefix.</li>
   * </ul>
   *
   * @throws IllegalArgumentException if the path has neither accepted form, or the project or
   *     subscription component fails validation
   */
  public static PubsubSubscription fromPath(String path) {
    final boolean isFakePath =
        path.startsWith(SUBSCRIPTION_RANDOM_TEST_PREFIX)
            || path.startsWith(SUBSCRIPTION_STARTING_SIGNAL);
    if (isFakePath) {
      return new PubsubSubscription(Type.FAKE, "", path);
    }

    // Try the legacy layout first; fall back to the current one.
    Matcher parsed = V1BETA1_SUBSCRIPTION_REGEXP.matcher(path);
    if (parsed.matches()) {
      LOG.warn("Saw subscription in v1beta1 format. Subscriptions should be in the format "
          + "projects/<project_id>/subscriptions/<subscription_name>");
    } else {
      parsed = SUBSCRIPTION_REGEXP.matcher(path);
      if (!parsed.matches()) {
        throw new IllegalArgumentException("Pubsub subscription is not in "
            + "projects/<project_id>/subscriptions/<subscription_name> format: " + path);
      }
    }

    // Both patterns capture the project in group 1 and the name in group 2.
    final String projectName = parsed.group(1);
    final String subscriptionName = parsed.group(2);
    validateProjectName(projectName);
    validatePubsubName(subscriptionName);
    return new PubsubSubscription(Type.NORMAL, projectName, subscriptionName);
  }

  /**
   * Renders this subscription as a Cloud Pub/Sub v1beta1 API path. A fake subscription is
   * rendered as the raw path it was created from.
   *
   * @deprecated the v1beta1 API for Cloud Pub/Sub is deprecated.
   */
  @Deprecated
  public String asV1Beta1Path() {
    return type == Type.NORMAL
        ? "/subscriptions/" + project + "/" + subscription
        : subscription;
  }

  /**
   * Renders this subscription as a Cloud Pub/Sub v1beta2 API path. A fake subscription is
   * rendered as the raw path it was created from.
   *
   * @deprecated the v1beta2 API for Cloud Pub/Sub is deprecated.
   */
  @Deprecated
  public String asV1Beta2Path() {
    return type == Type.NORMAL
        ? "projects/" + project + "/subscriptions/" + subscription
        : subscription;
  }

  /**
   * Renders this subscription as a Cloud Pub/Sub API path. A fake subscription is rendered as
   * the raw path it was created from.
   */
  public String asPath() {
    return type == Type.NORMAL
        ? "projects/" + project + "/subscriptions/" + subscription
        : subscription;
  }
}
/**
 * Parses a subscription path string into a {@link PubsubSubscription}; used to derive a
 * {@link ValueProvider} of subscriptions from a provider of strings.
 */
private static class SubscriptionTranslator
    implements SerializableFunction<String, PubsubSubscription> {
  @Override
  public PubsubSubscription apply(String from) {
    final PubsubSubscription parsed = PubsubSubscription.fromPath(from);
    return parsed;
  }
}
/**
 * Converts a {@link PubsubSubscription} into the client's {@link SubscriptionPath}; used to
 * derive a {@link ValueProvider} of subscription paths.
 */
private static class SubscriptionPathTranslator
    implements SerializableFunction<PubsubSubscription, SubscriptionPath> {
  @Override
  public SubscriptionPath apply(PubsubSubscription from) {
    final String projectId = from.project;
    final String subscriptionName = from.subscription;
    return PubsubClient.subscriptionPathFromName(projectId, subscriptionName);
  }
}
/**
 * Parses a topic path string into a {@link PubsubTopic}; used to derive a
 * {@link ValueProvider} of topics from a provider of strings.
 */
private static class TopicTranslator
    implements SerializableFunction<String, PubsubTopic> {
  @Override
  public PubsubTopic apply(String from) {
    final PubsubTopic parsed = PubsubTopic.fromPath(from);
    return parsed;
  }
}
/**
 * Converts a {@link PubsubTopic} into the client's {@link TopicPath}; used to derive a
 * {@link ValueProvider} of topic paths.
 */
private static class TopicPathTranslator
    implements SerializableFunction<PubsubTopic, TopicPath> {
  @Override
  public TopicPath apply(PubsubTopic from) {
    final String projectId = from.project;
    final String topicName = from.topic;
    return PubsubClient.topicPathFromName(projectId, topicName);
  }
}
/**
 * Extracts the project of a {@link PubsubTopic} as the client's {@link ProjectPath}; used to
 * derive a {@link ValueProvider} of project paths.
 */
private static class ProjectPathTranslator
    implements SerializableFunction<PubsubTopic, ProjectPath> {
  @Override
  public ProjectPath apply(PubsubTopic from) {
    final String projectId = from.project;
    return PubsubClient.projectPathFromId(projectId);
  }
}
/**
 * A parsed Cloud Pub/Sub topic: a project plus a topic name, or a fake topic whose raw path is
 * kept verbatim.
 */
public static class PubsubTopic implements Serializable {
  private enum Type {NORMAL, FAKE}

  private final Type type;
  private final String project;
  private final String topic;

  private PubsubTopic(Type type, String project, String topic) {
    this.type = type;
    this.project = project;
    this.topic = topic;
  }

  /**
   * Parses a topic path.
   *
   * <p>The expected form is {@code projects/<project>/topics/<topic>}, where {@code <project>}
   * is the name of the publishing project. The legacy form {@code /topics/<project>/<topic>} is
   * also accepted, but a warning is logged. The exact path {@code /topics/dev/null} produces a
   * fake topic with an empty project and is not validated.
   *
   * <p>For all other paths the {@code <topic>} component must:
   *
   * <ul>
   *   <li>be between 3 and 255 characters long;</li>
   *   <li>begin with a letter;</li>
   *   <li>otherwise contain only letters, digits and the characters
   *       {@code - . _ ~ % +};</li>
   *   <li>not begin with the {@code goog} prefix.</li>
   * </ul>
   *
   * @throws IllegalArgumentException if the path has neither accepted form, or the project or
   *     topic component fails validation
   */
  public static PubsubTopic fromPath(String path) {
    if (path.equals(TOPIC_DEV_NULL_TEST_NAME)) {
      return new PubsubTopic(Type.FAKE, "", path);
    }

    // Try the legacy layout first; fall back to the current one.
    Matcher parsed = V1BETA1_TOPIC_REGEXP.matcher(path);
    if (parsed.matches()) {
      LOG.warn("Saw topic in v1beta1 format. Topics should be in the format "
          + "projects/<project_id>/topics/<topic_name>");
    } else {
      parsed = TOPIC_REGEXP.matcher(path);
      if (!parsed.matches()) {
        throw new IllegalArgumentException(
            "Pubsub topic is not in projects/<project_id>/topics/<topic_name> format: " + path);
      }
    }

    // Both patterns capture the project in group 1 and the name in group 2.
    final String projectName = parsed.group(1);
    final String topicName = parsed.group(2);
    validateProjectName(projectName);
    validatePubsubName(topicName);
    return new PubsubTopic(Type.NORMAL, projectName, topicName);
  }

  /**
   * Renders this topic as a Cloud Pub/Sub v1beta1 API path. A fake topic is rendered as the raw
   * path it was created from.
   *
   * @deprecated the v1beta1 API for Cloud Pub/Sub is deprecated.
   */
  @Deprecated
  public String asV1Beta1Path() {
    return type == Type.NORMAL
        ? "/topics/" + project + "/" + topic
        : topic;
  }

  /**
   * Renders this topic as a Cloud Pub/Sub v1beta2 API path. A fake topic is rendered as the raw
   * path it was created from.
   *
   * @deprecated the v1beta2 API for Cloud Pub/Sub is deprecated.
   */
  @Deprecated
  public String asV1Beta2Path() {
    return type == Type.NORMAL
        ? "projects/" + project + "/topics/" + topic
        : topic;
  }

  /**
   * Renders this topic as a Cloud Pub/Sub API path. A fake topic is rendered as the raw path it
   * was created from.
   */
  public String asPath() {
    return type == Type.NORMAL
        ? "projects/" + project + "/topics/" + topic
        : topic;
  }
}
/**
 * Creates the base {@link Read} transform shared by the typed factory methods. It has no coder
 * or parse function yet and does not request message attributes.
 */
private static <T> Read<T> read() {
  final Read.Builder<T> builder =
      new AutoValue_PubsubIO_Read.Builder<T>().setNeedsAttributes(false);
  return builder.build();
}
/**
 * Returns a {@link PTransform} that continuously reads from a Google Cloud Pub/Sub stream,
 * producing messages that carry only a {@link PubsubMessage#getPayload() payload} and no
 * {@link PubsubMessage#getAttributeMap() attributes}.
 */
public static Read<PubsubMessage> readMessages() {
  final Read.Builder<PubsubMessage> builder =
      new AutoValue_PubsubIO_Read.Builder<PubsubMessage>()
          .setCoder(PubsubMessagePayloadOnlyCoder.of())
          .setParseFn(new IdentityMessageFn())
          .setNeedsAttributes(false);
  return builder.build();
}
/**
 * Returns a {@link PTransform} that continuously reads from a Google Cloud Pub/Sub stream,
 * producing messages that carry both a {@link PubsubMessage#getPayload() payload} and
 * {@link PubsubMessage#getAttributeMap() attributes}.
 */
public static Read<PubsubMessage> readMessagesWithAttributes() {
  final Read.Builder<PubsubMessage> builder =
      new AutoValue_PubsubIO_Read.Builder<PubsubMessage>()
          .setCoder(PubsubMessageWithAttributesCoder.of())
          .setParseFn(new IdentityMessageFn())
          .setNeedsAttributes(true);
  return builder.build();
}
/**
 * Returns a {@link PTransform} that continuously reads from a Google Cloud Pub/Sub stream and
 * decodes each message payload as a UTF-8 string.
 */
public static Read<String> readStrings() {
  final Read<String> base = read();
  return base.withCoderAndParseFn(StringUtf8Coder.of(), new ParsePayloadAsUtf8());
}
/**
 * Returns a {@link PTransform} that continuously reads from a Google Cloud Pub/Sub stream and
 * decodes each message payload as a binary encoded protobuf message of the given type.
 */
public static <T extends Message> Read<T> readProtos(Class<T> messageClass) {
  // TODO: Stop using ProtoCoder and instead parse the payload directly.
  // We should not be relying on the fact that ProtoCoder's wire format is identical to
  // the protobuf wire format, as the wire format is not part of a coder's API.
  final ProtoCoder<T> protoCoder = ProtoCoder.of(messageClass);
  final Read<T> base = read();
  return base.withCoderAndParseFn(protoCoder, new ParsePayloadUsingCoder<T>(protoCoder));
}
/**
 * Returns a {@link PTransform} that continuously reads from a Google Cloud Pub/Sub stream and
 * decodes each message payload as a binary encoded Avro message of the given type.
 */
public static <T> Read<T> readAvros(Class<T> clazz) {
  // TODO: Stop using AvroCoder and instead parse the payload directly.
  // We should not be relying on the fact that AvroCoder's wire format is identical to
  // the Avro wire format, as the wire format is not part of a coder's API.
  final AvroCoder<T> avroCoder = AvroCoder.of(clazz);
  final Read<T> base = read();
  return base.withCoderAndParseFn(avroCoder, new ParsePayloadUsingCoder<T>(avroCoder));
}
/**
 * Creates the base {@link Write} transform shared by the typed factory methods; no property
 * is set on it yet.
 */
private static <T> Write<T> write() {
  final Write.Builder<T> builder = new AutoValue_PubsubIO_Write.Builder<T>();
  return builder.build();
}
/**
 * Returns a {@link PTransform} that writes {@link PubsubMessage} elements, unchanged, to a
 * Google Cloud Pub/Sub stream.
 */
public static Write<PubsubMessage> writeMessages() {
  final Write<PubsubMessage> base = write();
  return base.withFormatFn(new IdentityMessageFn());
}
/**
 * Returns a {@link PTransform} that encodes each string as UTF-8 and writes it to a Google
 * Cloud Pub/Sub stream.
 */
public static Write<String> writeStrings() {
  final Write<String> base = write();
  return base.withFormatFn(new FormatPayloadAsUtf8());
}
/**
 * Returns a {@link PTransform} that writes protobuf messages of the given type, in binary
 * encoding, to a Google Cloud Pub/Sub stream.
 */
public static <T extends Message> Write<T> writeProtos(Class<T> messageClass) {
  // TODO: Like in readProtos(), stop using ProtoCoder and instead format the payload directly.
  final ProtoCoder<T> protoCoder = ProtoCoder.of(messageClass);
  final Write<T> base = write();
  return base.withFormatFn(new FormatPayloadUsingCoder<T>(protoCoder));
}
/**
 * Returns a {@link PTransform} that writes Avro messages of the given type, in binary
 * encoding, to a Google Cloud Pub/Sub stream.
 */
public static <T> Write<T> writeAvros(Class<T> clazz) {
  // TODO: Like in readAvros(), stop using AvroCoder and instead format the payload directly.
  final AvroCoder<T> avroCoder = AvroCoder.of(clazz);
  final Write<T> base = write();
  return base.withFormatFn(new FormatPayloadUsingCoder<T>(avroCoder));
}
/** Implementation of {@link #read}. */
@AutoValue
public abstract static class Read<T> extends PTransform<PBegin, PCollection<T>> {
/** The topic to read from, or {@code null} when none has been set via {@code fromTopic}. */
@Nullable
abstract ValueProvider<PubsubTopic> getTopicProvider();
/**
 * The subscription to read from, or {@code null} when none has been set via
 * {@code fromSubscription}.
 */
@Nullable
abstract ValueProvider<PubsubSubscription> getSubscriptionProvider();
/** The name of the message attribute to read timestamps from. */
@Nullable
abstract String getTimestampAttribute();
/** The name of the message attribute to read unique message IDs from. */
@Nullable
abstract String getIdAttribute();
/** The coder used to decode each record. */
@Nullable
abstract Coder<T> getCoder();
/** User function for parsing PubsubMessage object. */
@Nullable
abstract SimpleFunction<PubsubMessage, T> getParseFn();
/**
 * Flag handed to the {@code PubsubUnboundedSource} constructor in {@code expand}; it is
 * {@code true} only for transforms created by {@code readMessagesWithAttributes()}.
 */
abstract boolean getNeedsAttributes();
/**
 * Returns a builder used by the {@code from*} and {@code with*} methods to derive a modified
 * copy of this transform (presumably pre-populated with this instance's values, per the
 * AutoValue convention).
 */
abstract Builder<T> toBuilder();
/** AutoValue builder for {@link Read}; one setter per property plus {@link #build()}. */
@AutoValue.Builder
abstract static class Builder<T> {
/** Sets the topic to read from. */
abstract Builder<T> setTopicProvider(ValueProvider<PubsubTopic> topic);
/** Sets the subscription to read from. */
abstract Builder<T> setSubscriptionProvider(ValueProvider<PubsubSubscription> subscription);
/** Sets the name of the message attribute to read timestamps from. */
abstract Builder<T> setTimestampAttribute(String timestampAttribute);
/** Sets the name of the message attribute to read unique message IDs from. */
abstract Builder<T> setIdAttribute(String idAttribute);
/** Sets the coder reported as the default output coder of the transform. */
abstract Builder<T> setCoder(Coder<T> coder);
/** Sets the function that converts each {@code PubsubMessage} into an output element. */
abstract Builder<T> setParseFn(SimpleFunction<PubsubMessage, T> parseFn);
/** Sets the flag that is passed on to the unbounded source as its needs-attributes argument. */
abstract Builder<T> setNeedsAttributes(boolean needsAttributes);
/** Creates the {@link Read} transform from the values set so far. */
abstract Read<T> build();
}
/**
 * Returns a transform that reads from the given subscription.
 *
 * <p>See {@link PubsubIO.PubsubSubscription#fromPath(String)} for more details on the format
 * of the {@code subscription} string.
 *
 * <p>Multiple readers reading from the same subscription will each receive some arbitrary
 * portion of the data. Most likely, separate readers should use their own subscriptions.
 */
public Read<T> fromSubscription(String subscription) {
  final ValueProvider<String> provider = StaticValueProvider.of(subscription);
  return fromSubscription(provider);
}

/**
 * Like {@link #fromSubscription(String)} but with a {@link ValueProvider}. When the provider
 * is already accessible its value is parsed immediately, so a malformed path fails here.
 */
public Read<T> fromSubscription(ValueProvider<String> subscription) {
  if (subscription.isAccessible()) {
    // Parse only for validation; the result is discarded.
    PubsubSubscription.fromPath(subscription.get());
  }
  final ValueProvider<PubsubSubscription> parsed =
      NestedValueProvider.of(subscription, new SubscriptionTranslator());
  return toBuilder().setSubscriptionProvider(parsed).build();
}
/**
 * Returns a transform that reads from a Cloud Pub/Sub topic. Mutually exclusive with
 * {@link #fromSubscription(String)}.
 *
 * <p>See {@link PubsubIO.PubsubTopic#fromPath(String)} for more details on the format of the
 * {@code topic} string.
 *
 * <p>The Beam runner will start reading data published on this topic from the time the
 * pipeline is started. Any data published on the topic before the pipeline is started will
 * not be read by the runner.
 */
public Read<T> fromTopic(String topic) {
  final ValueProvider<String> provider = StaticValueProvider.of(topic);
  return fromTopic(provider);
}

/**
 * Like {@link #fromTopic(String)} but with a {@link ValueProvider}. When the provider is
 * already accessible its value is parsed immediately, so a malformed path fails here.
 */
public Read<T> fromTopic(ValueProvider<String> topic) {
  if (topic.isAccessible()) {
    // Parse only for validation; the result is discarded.
    PubsubTopic.fromPath(topic.get());
  }
  final ValueProvider<PubsubTopic> parsed =
      NestedValueProvider.of(topic, new TopicTranslator());
  return toBuilder().setTopicProvider(parsed).build();
}
/**
 * Names the Pub/Sub message attribute that carries each record's timestamp, for streams whose
 * timestamps are supplied as message attributes.
 *
 * <p>The attribute value is expected to be either:
 *
 * <ul>
 *   <li>a numerical value representing the number of milliseconds since the Unix epoch. For
 *       example, if using the Joda time classes, {@link Instant#getMillis()} returns the
 *       correct value for this attribute.
 *   <li>a String in RFC 3339 format. For example, {@code 2015-10-29T23:41:41.123Z}. The
 *       sub-second component of the timestamp is optional, and digits beyond the first three
 *       (i.e., time units smaller than milliseconds) will be ignored.
 * </ul>
 *
 * <p>If {@code timestampAttribute} is not provided, the system will generate record timestamps
 * the first time it sees each record. All windowing will be done relative to these timestamps.
 *
 * <p>By default, windows are emitted based on an estimate of when this source is likely done
 * producing data for a given timestamp (referred to as the Watermark; see
 * {@link AfterWatermark} for more details). Any late data will be handled by the trigger
 * specified with the windowing strategy – by default it will be output immediately.
 *
 * <p>Note that the system can guarantee that no late data will ever be seen when it assigns
 * timestamps by arrival time (i.e. {@code timestampAttribute} is not provided).
 *
 * @see <a href="https://www.ietf.org/rfc/rfc3339.txt">RFC 3339</a>
 */
public Read<T> withTimestampAttribute(String timestampAttribute) {
  final Builder<T> updated = toBuilder().setTimestampAttribute(timestampAttribute);
  return updated.build();
}
/**
 * Names the Pub/Sub message attribute that carries each record's unique identifier, for
 * streams whose identifiers are supplied as message attributes. The attribute value can be any
 * string that uniquely identifies the record.
 *
 * <p>Pub/Sub cannot guarantee that no duplicate data will be delivered on the Pub/Sub stream.
 * If {@code idAttribute} is not provided, Beam cannot guarantee that no duplicate data will be
 * delivered, and deduplication of the stream will be strictly best effort.
 */
public Read<T> withIdAttribute(String idAttribute) {
  final Builder<T> updated = toBuilder().setIdAttribute(idAttribute);
  return updated.build();
}
/**
 * Returns a copy of this transform that reports {@code coder} as its default output coder and
 * applies {@code parseFn} to every {@link PubsubMessage} read, to produce the output elements.
 */
private Read<T> withCoderAndParseFn(Coder<T> coder, SimpleFunction<PubsubMessage, T> parseFn) {
  final Builder<T> updated = toBuilder().setCoder(coder).setParseFn(parseFn);
  return updated.build();
}
/**
 * Builds the unbounded Pub/Sub source for the configured topic or subscription and maps every
 * message it produces through the parse function.
 *
 * @throws IllegalStateException if neither, or both, of topic and subscription are set
 */
@Override
public PCollection<T> expand(PBegin input) {
  final ValueProvider<PubsubTopic> topic = getTopicProvider();
  final ValueProvider<PubsubSubscription> subscription = getSubscriptionProvider();

  if (topic == null && subscription == null) {
    throw new IllegalStateException(
        "Need to set either the topic or the subscription for " + "a PubsubIO.Read transform");
  }
  if (topic != null && subscription != null) {
    throw new IllegalStateException(
        "Can't set both the topic and the subscription for " + "a PubsubIO.Read transform");
  }

  // Exactly one of topic and subscription is non-null from here on.
  @Nullable ValueProvider<ProjectPath> projectPath = null;
  @Nullable ValueProvider<TopicPath> topicPath = null;
  @Nullable ValueProvider<SubscriptionPath> subscriptionPath = null;
  if (topic != null) {
    projectPath = NestedValueProvider.of(topic, new ProjectPathTranslator());
    topicPath = NestedValueProvider.of(topic, new TopicPathTranslator());
  } else {
    subscriptionPath = NestedValueProvider.of(subscription, new SubscriptionPathTranslator());
  }

  final PubsubUnboundedSource source =
      new PubsubUnboundedSource(
          FACTORY,
          projectPath,
          topicPath,
          subscriptionPath,
          getTimestampAttribute(),
          getIdAttribute(),
          getNeedsAttributes());
  return input.apply(source).apply(MapElements.via(getParseFn()));
}
/**
 * Adds the items shared with the sink, plus a {@code subscription} item when a subscription is
 * configured. The item shows the subscription path when the provider is accessible and the
 * provider's string form otherwise.
 */
@Override
public void populateDisplayData(DisplayData.Builder builder) {
  super.populateDisplayData(builder);
  populateCommonDisplayData(
      builder, getTimestampAttribute(), getIdAttribute(), getTopicProvider());

  final ValueProvider<PubsubSubscription> subscription = getSubscriptionProvider();
  if (subscription == null) {
    return;
  }
  final String subscriptionString;
  if (subscription.isAccessible()) {
    subscriptionString = subscription.get().asPath();
  } else {
    subscriptionString = subscription.toString();
  }
  builder.add(
      DisplayData.item("subscription", subscriptionString).withLabel("Pubsub Subscription"));
}
/** Returns the coder configured on this transform, which may be {@code null}. */
@Override
protected Coder<T> getDefaultOutputCoder() {
  final Coder<T> configured = getCoder();
  return configured;
}
}
/////////////////////////////////////////////////////////////////////////////
/**
 * Disallow construction of utility class: everything {@code PubsubIO} offers is reached
 * through its static factory methods and nested types.
 */
private PubsubIO() {}
/** Implementation of {@link #write}. */
@AutoValue
public abstract static class Write<T> extends PTransform<PCollection<T>, PDone> {
/** The topic to publish to, or {@code null} when none has been set via {@code to}. */
@Nullable
abstract ValueProvider<PubsubTopic> getTopicProvider();
/** The name of the message attribute to publish message timestamps in. */
@Nullable
abstract String getTimestampAttribute();
/** The name of the message attribute to publish unique message IDs in. */
@Nullable
abstract String getIdAttribute();
/** The format function for input PubsubMessage objects. */
@Nullable
abstract SimpleFunction<T, PubsubMessage> getFormatFn();
/**
 * Returns a builder used by {@code to} and the {@code with*} methods to derive a modified copy
 * of this transform (presumably pre-populated with this instance's values, per the AutoValue
 * convention).
 */
abstract Builder<T> toBuilder();
/** AutoValue builder for {@link Write}; one setter per property plus {@link #build()}. */
@AutoValue.Builder
abstract static class Builder<T> {
/** Sets the topic to publish to. */
abstract Builder<T> setTopicProvider(ValueProvider<PubsubTopic> topicProvider);
/** Sets the name of the message attribute to publish message timestamps in. */
abstract Builder<T> setTimestampAttribute(String timestampAttribute);
/** Sets the name of the message attribute to publish unique message IDs in. */
abstract Builder<T> setIdAttribute(String idAttribute);
/** Sets the function that converts each input element into a {@code PubsubMessage}. */
abstract Builder<T> setFormatFn(SimpleFunction<T, PubsubMessage> formatFn);
/** Creates the {@link Write} transform from the values set so far. */
abstract Write<T> build();
}
/**
 * Publishes to the specified topic.
 *
 * <p>See {@link PubsubIO.PubsubTopic#fromPath(String)} for more details on the format of the
 * {@code topic} string.
 *
 * @throws IllegalArgumentException if {@code topic} is not a valid topic path
 */
public Write<T> to(String topic) {
  return to(StaticValueProvider.of(topic));
}

/**
 * Like {@link #to(String)} but with a {@link ValueProvider}.
 *
 * <p>When the provider is already accessible its value is parsed immediately, mirroring
 * {@code Read#fromTopic}, so that a malformed topic path is reported where the transform is
 * configured rather than when the provider is first resolved.
 *
 * @throws IllegalArgumentException if the provider is accessible and its value is not a valid
 *     topic path
 */
public Write<T> to(ValueProvider<String> topic) {
  if (topic.isAccessible()) {
    // Parse only for validation; the result is discarded.
    PubsubTopic.fromPath(topic.get());
  }
  return toBuilder()
      .setTopicProvider(NestedValueProvider.of(topic, new TopicTranslator()))
      .build();
}
/**
 * Publishes each record's timestamp in a message attribute with the given name. The attribute
 * value will be a number representing the number of milliseconds since the Unix epoch. For
 * example, if using the Joda time classes, {@link Instant#Instant(long)} can be used to parse
 * this value.
 *
 * <p>If the output from this sink is being read by another Beam pipeline, then
 * {@link PubsubIO.Read#withTimestampAttribute(String)} can be used to ensure the other source
 * reads these timestamps from the appropriate attribute.
 */
public Write<T> withTimestampAttribute(String timestampAttribute) {
  final Builder<T> updated = toBuilder().setTimestampAttribute(timestampAttribute);
  return updated.build();
}
/**
 * Publishes each record's unique identifier in a message attribute with the given name. The
 * attribute value is an opaque string.
 *
 * <p>If the output from this sink is being read by another Beam pipeline, then
 * {@link PubsubIO.Read#withIdAttribute(String)} can be used to ensure that the other source
 * reads these unique identifiers from the appropriate attribute.
 */
public Write<T> withIdAttribute(String idAttribute) {
  final Builder<T> updated = toBuilder().setIdAttribute(idAttribute);
  return updated.build();
}
/**
 * Returns a copy of this transform that uses {@code formatFn} to turn every input element of
 * type T into the {@link PubsubMessage} (payload plus attributes) that is published.
 */
private Write<T> withFormatFn(SimpleFunction<T, PubsubMessage> formatFn) {
  final Builder<T> updated = toBuilder().setFormatFn(formatFn);
  return updated.build();
}
/**
 * Publishes the input to the configured topic. Bounded input is written by a
 * {@code PubsubBoundedWriter} {@link ParDo}; unbounded input is formatted and handed to a
 * {@code PubsubUnboundedSink}.
 *
 * @throws IllegalStateException if no topic has been set
 */
@Override
public PDone expand(PCollection<T> input) {
  final ValueProvider<PubsubTopic> topic = getTopicProvider();
  if (topic == null) {
    throw new IllegalStateException("need to set the topic of a PubsubIO.Write transform");
  }
  switch (input.isBounded()) {
    case BOUNDED: {
      input.apply(ParDo.of(new PubsubBoundedWriter()));
      return PDone.in(input.getPipeline());
    }
    case UNBOUNDED: {
      final PCollection<PubsubMessage> formatted =
          input.apply(MapElements.via(getFormatFn()));
      final PubsubUnboundedSink sink =
          new PubsubUnboundedSink(
              FACTORY,
              NestedValueProvider.of(topic, new TopicPathTranslator()),
              getTimestampAttribute(),
              getIdAttribute(),
              100 /* numShards */);
      return formatted.apply(sink);
    }
  }
  throw new RuntimeException(); // cases are exhaustive.
}
/** Adds the timestamp attribute, id attribute and topic items shared with the source. */
@Override
public void populateDisplayData(DisplayData.Builder builder) {
  super.populateDisplayData(builder);
  final String timestampAttribute = getTimestampAttribute();
  final String idAttribute = getIdAttribute();
  final ValueProvider<PubsubTopic> topic = getTopicProvider();
  populateCommonDisplayData(builder, timestampAttribute, idAttribute, topic);
}
/** Returns a {@link VoidCoder}. */
@Override
protected Coder<Void> getDefaultOutputCoder() {
  final Coder<Void> voidCoder = VoidCoder.of();
  return voidCoder;
}
/**
 * Writer to Pubsub which batches messages from bounded collections.
 *
 * <p>Messages are buffered per bundle and published whenever the buffer reaches
 * {@code MAX_PUBLISH_BATCH_SIZE} elements, and once more when the bundle finishes. The client
 * is created at the start of each bundle and closed at its end.
 *
 * <p>Public so can be suppressed by runners.
 */
public class PubsubBoundedWriter extends DoFn<T, Void> {
  /** Number of buffered messages that triggers a publish call. */
  private static final int MAX_PUBLISH_BATCH_SIZE = 100;

  /** Messages buffered for the current bundle; {@code null} outside of a bundle. */
  private transient List<OutgoingMessage> output;
  /** Client used for the current bundle; {@code null} outside of a bundle. */
  private transient PubsubClient pubsubClient;

  /** Allocates the message buffer and opens a client for this bundle. */
  @StartBundle
  public void startBundle(StartBundleContext c) throws IOException {
    this.output = new ArrayList<>();
    // NOTE: idAttribute is ignored.
    this.pubsubClient =
        FACTORY.newClient(getTimestampAttribute(), null,
            c.getPipelineOptions().as(PubsubOptions.class));
  }

  /** Formats the element, buffers it, and publishes the buffer once it is full. */
  @ProcessElement
  public void processElement(ProcessContext c) throws IOException {
    PubsubMessage message = getFormatFn().apply(c.element());
    byte[] payload = message.getPayload();
    Map<String, String> attributes = message.getAttributeMap();
    // NOTE: The record id is always null.
    output.add(new OutgoingMessage(payload, attributes, c.timestamp().getMillis(), null));
    if (output.size() >= MAX_PUBLISH_BATCH_SIZE) {
      publish();
    }
  }

  /**
   * Publishes whatever is still buffered and releases the per-bundle state. The client is
   * closed even when the final publish fails, so a failing bundle does not leak it.
   */
  @FinishBundle
  public void finishBundle() throws IOException {
    try {
      if (!output.isEmpty()) {
        publish();
      }
    } finally {
      output = null;
      try {
        pubsubClient.close();
      } finally {
        pubsubClient = null;
      }
    }
  }

  /**
   * Publishes all buffered messages to the configured topic and clears the buffer.
   *
   * @throws IllegalStateException if the count returned by the client differs from the number
   *     of buffered messages
   */
  private void publish() throws IOException {
    PubsubTopic topic = getTopicProvider().get();
    int n =
        pubsubClient.publish(
            PubsubClient.topicPathFromName(topic.project, topic.topic),
            output);
    checkState(
        n == output.size(),
        "Tried to publish %s messages but the client reported %s",
        output.size(),
        n);
    output.clear();
  }

  /** Delegates display data to the enclosing {@link Write} transform. */
  @Override
  public void populateDisplayData(DisplayData.Builder builder) {
    super.populateDisplayData(builder);
    builder.delegate(Write.this);
  }
}
}
/** Decodes the payload of a {@link PubsubMessage} as a UTF-8 string. */
private static class ParsePayloadAsUtf8 extends SimpleFunction<PubsubMessage, String> {
  @Override
  public String apply(PubsubMessage input) {
    final byte[] payload = input.getPayload();
    return new String(payload, StandardCharsets.UTF_8);
  }
}
/** Decodes the payload of a {@link PubsubMessage} with the coder given at construction. */
private static class ParsePayloadUsingCoder<T> extends SimpleFunction<PubsubMessage, T> {
  private Coder<T> coder;

  public ParsePayloadUsingCoder(Coder<T> coder) {
    this.coder = coder;
  }

  /**
   * @throws RuntimeException wrapping the {@link CoderException} if the payload cannot be
   *     decoded
   */
  @Override
  public T apply(PubsubMessage input) {
    final byte[] payload = input.getPayload();
    final T decoded;
    try {
      decoded = CoderUtils.decodeFromByteArray(coder, payload);
    } catch (CoderException e) {
      throw new RuntimeException("Could not decode Pubsub message", e);
    }
    return decoded;
  }
}
/** Wraps a string, encoded as UTF-8, in a {@link PubsubMessage} with no attributes. */
private static class FormatPayloadAsUtf8 extends SimpleFunction<String, PubsubMessage> {
  @Override
  public PubsubMessage apply(String input) {
    final byte[] utf8 = input.getBytes(StandardCharsets.UTF_8);
    final Map<String, String> noAttributes = ImmutableMap.<String, String>of();
    return new PubsubMessage(utf8, noAttributes);
  }
}
/**
 * Encodes an element with the coder given at construction and wraps the bytes in a
 * {@link PubsubMessage} with no attributes.
 */
private static class FormatPayloadUsingCoder<T> extends SimpleFunction<T, PubsubMessage> {
  private Coder<T> coder;

  public FormatPayloadUsingCoder(Coder<T> coder) {
    this.coder = coder;
  }

  /**
   * @throws RuntimeException wrapping the {@link CoderException} if the element cannot be
   *     encoded
   */
  @Override
  public PubsubMessage apply(T input) {
    try {
      return new PubsubMessage(
          CoderUtils.encodeToByteArray(coder, input), ImmutableMap.<String, String>of());
    } catch (CoderException e) {
      // This is the encode direction; the message used to say "decode".
      throw new RuntimeException("Could not encode Pubsub message", e);
    }
  }
}
/**
 * Returns each {@link PubsubMessage} unchanged. Used as the parse function of the
 * {@code readMessages*} transforms and as the format function of {@code writeMessages()}.
 */
private static class IdentityMessageFn extends SimpleFunction<PubsubMessage, PubsubMessage> {
@Override
public PubsubMessage apply(PubsubMessage input) {
// Pass the message through as-is.
return input;
}
}
}