/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.streaming.connectors.cassandra;

import com.datastax.driver.core.Cluster;

import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.typeutils.TupleTypeInfo;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSink;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.operators.ChainingStrategy;
import org.apache.flink.streaming.api.transformations.SinkTransformation;
import org.apache.flink.streaming.api.transformations.StreamTransformation;
import org.apache.flink.streaming.runtime.operators.CheckpointCommitter;

/**
 * This class wraps different Cassandra sink implementations to provide a common interface for all of them.
 *
 * @param <IN> input type
 */
public class CassandraSink<IN> {
	private final boolean useDataStreamSink;
	private DataStreamSink<IN> sink1;
	private SingleOutputStreamOperator<IN> sink2;

	private CassandraSink(DataStreamSink<IN> sink) {
		sink1 = sink;
		useDataStreamSink = true;
	}

	private CassandraSink(SingleOutputStreamOperator<IN> sink) {
		sink2 = sink;
		useDataStreamSink = false;
	}

	private SinkTransformation<IN> getSinkTransformation() {
		return sink1.getTransformation();
	}

	private StreamTransformation<IN> getStreamTransformation() {
		return sink2.getTransformation();
	}

	/**
	 * Sets the name of this sink. This name is used by the visualization
	 * and logging during runtime.
	 *
	 * @return The named sink.
	 */
	public CassandraSink<IN> name(String name) {
		if (useDataStreamSink) {
			getSinkTransformation().setName(name);
		} else {
			getStreamTransformation().setName(name);
		}
		return this;
	}

	/**
	 * Sets an ID for this operator.
	 *
	 * <p>The specified ID is used to assign the same operator ID across job
	 * submissions (for example when starting a job from a savepoint).
	 *
	 * <p><strong>Important</strong>: this ID needs to be unique per
	 * transformation and job. Otherwise, job submission will fail.
	 *
	 * @param uid The unique user-specified ID of this transformation.
	 * @return The operator with the specified ID.
	 */
	@PublicEvolving
	public CassandraSink<IN> uid(String uid) {
		if (useDataStreamSink) {
			getSinkTransformation().setUid(uid);
		} else {
			getStreamTransformation().setUid(uid);
		}
		return this;
	}
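	/*
	 * A minimal usage sketch for name() and uid(): assigning a stable uid so the
	 * sink keeps its operator ID across job submissions (e.g. when restoring from
	 * a savepoint). The stream "sensorReadings", the query, and the host are
	 * hypothetical placeholders, not part of this file.
	 *
	 *   CassandraSink.addSink(sensorReadings)
	 *       .setQuery("INSERT INTO example.readings (id, value) VALUES (?, ?);")
	 *       .setHost("127.0.0.1")
	 *       .build()
	 *       .name("Cassandra readings sink")
	 *       .uid("cassandra-readings-sink-v1");
	 */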
	/**
	 * Sets a user-provided hash for this operator. This will be used AS IS to create the
	 * JobVertexID.
	 *
	 * <p>The user-provided hash is an alternative to the generated hashes, which is considered
	 * when identification of an operator through the default hash mechanics fails (e.g. because
	 * of changes between Flink versions).
	 *
	 * <p><strong>Important</strong>: this should be used as a workaround or for troubleshooting.
	 * The provided hash needs to be unique per transformation and job. Otherwise, job submission
	 * will fail. Furthermore, you cannot assign user-specified hashes to intermediate nodes in an
	 * operator chain, and trying to do so will cause your job to fail.
	 *
	 * <p>A use case for this is in migration between Flink versions or changing the jobs in a way
	 * that changes the automatically generated hashes. In this case, providing the previous hashes
	 * directly through this method (e.g. obtained from old logs) can help to reestablish a lost
	 * mapping from states to their target operator.
	 *
	 * @param uidHash The user provided hash for this operator. This will become the JobVertexID,
	 *                which is shown in the logs and web ui.
	 * @return The operator with the user provided hash.
	 */
	@PublicEvolving
	public CassandraSink<IN> setUidHash(String uidHash) {
		if (useDataStreamSink) {
			getSinkTransformation().setUidHash(uidHash);
		} else {
			getStreamTransformation().setUidHash(uidHash);
		}
		return this;
	}

	/**
	 * Sets the parallelism for this sink. The parallelism must be higher than zero.
	 *
	 * @param parallelism The parallelism for this sink.
	 * @return The sink with set parallelism.
	 */
	public CassandraSink<IN> setParallelism(int parallelism) {
		if (useDataStreamSink) {
			getSinkTransformation().setParallelism(parallelism);
		} else {
			getStreamTransformation().setParallelism(parallelism);
		}
		return this;
	}

	/**
	 * Turns off chaining for this operator so thread co-location will not be used as an
	 * optimization.
	 *
	 * <p>Chaining can be turned off for the whole job via
	 * {@link org.apache.flink.streaming.api.environment.StreamExecutionEnvironment#disableOperatorChaining()},
	 * however this is not advised for performance reasons.
	 *
	 * @return The sink with chaining disabled.
	 */
	public CassandraSink<IN> disableChaining() {
		if (useDataStreamSink) {
			getSinkTransformation().setChainingStrategy(ChainingStrategy.NEVER);
		} else {
			getStreamTransformation().setChainingStrategy(ChainingStrategy.NEVER);
		}
		return this;
	}

	/**
	 * Sets the slot sharing group of this operation. Parallel instances of operations that are in
	 * the same slot sharing group will be co-located in the same TaskManager slot, if possible.
	 *
	 * <p>Operations inherit the slot sharing group of input operations if all input operations are
	 * in the same slot sharing group and no slot sharing group was explicitly specified.
	 *
	 * <p>Initially an operation is in the default slot sharing group. An operation can be put into
	 * the default group explicitly by setting the slot sharing group to {@code "default"}.
	 *
	 * @param slotSharingGroup The slot sharing group name.
	 * @return The sink with the specified slot sharing group name.
	 */
	public CassandraSink<IN> slotSharingGroup(String slotSharingGroup) {
		if (useDataStreamSink) {
			getSinkTransformation().setSlotSharingGroup(slotSharingGroup);
		} else {
			getStreamTransformation().setSlotSharingGroup(slotSharingGroup);
		}
		return this;
	}
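	/*
	 * A sketch of how the tuning hooks above compose after build(); the
	 * parallelism of 4, the host name, and the slot sharing group name
	 * "cassandra" are arbitrary illustrative values.
	 *
	 *   CassandraSink<Tuple2<String, Integer>> sink = CassandraSink.addSink(tuples)
	 *       .setQuery("INSERT INTO example.counts (word, count) VALUES (?, ?);")
	 *       .setHost("cassandra.example.com", 9042)
	 *       .build();
	 *   sink.setParallelism(4)
	 *       .disableChaining()
	 *       .slotSharingGroup("cassandra");
	 */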
	/**
	 * Writes a DataStream into a Cassandra database.
	 *
	 * @param input input DataStream
	 * @param <IN>  input type
	 * @return CassandraSinkBuilder, to further configure the sink
	 */
	public static <IN, T extends Tuple> CassandraSinkBuilder<IN> addSink(DataStream<IN> input) {
		if (input.getType() instanceof TupleTypeInfo) {
			DataStream<T> tupleInput = (DataStream<T>) input;
			return (CassandraSinkBuilder<IN>) new CassandraTupleSinkBuilder<>(tupleInput, tupleInput.getType(), tupleInput.getType().createSerializer(tupleInput.getExecutionEnvironment().getConfig()));
		} else {
			return new CassandraPojoSinkBuilder<>(input, input.getType(), input.getType().createSerializer(input.getExecutionEnvironment().getConfig()));
		}
	}

	public abstract static class CassandraSinkBuilder<IN> {
		protected final DataStream<IN> input;
		protected final TypeSerializer<IN> serializer;
		protected final TypeInformation<IN> typeInfo;
		protected ClusterBuilder builder;
		protected String query;
		protected CheckpointCommitter committer;
		protected boolean isWriteAheadLogEnabled;

		public CassandraSinkBuilder(DataStream<IN> input, TypeInformation<IN> typeInfo, TypeSerializer<IN> serializer) {
			this.input = input;
			this.typeInfo = typeInfo;
			this.serializer = serializer;
		}

		/**
		 * Sets the query that is to be executed for every record.
		 *
		 * @param query query to use
		 * @return this builder
		 */
		public CassandraSinkBuilder<IN> setQuery(String query) {
			this.query = query;
			return this;
		}

		/**
		 * Sets the Cassandra host to connect to, using the default port 9042.
		 *
		 * @param host host to connect to
		 * @return this builder
		 */
		public CassandraSinkBuilder<IN> setHost(String host) {
			return setHost(host, 9042);
		}

		/**
		 * Sets the Cassandra host/port to connect to.
		 *
		 * @param host host to connect to
		 * @param port port to connect to
		 * @return this builder
		 */
		public CassandraSinkBuilder<IN> setHost(final String host, final int port) {
			if (this.builder != null) {
				throw new IllegalArgumentException("Builder was already set. You must use either setHost() or setClusterBuilder().");
			}
			this.builder = new ClusterBuilder() {
				@Override
				protected Cluster buildCluster(Cluster.Builder builder) {
					return builder.addContactPoint(host).withPort(port).build();
				}
			};
			return this;
		}

		/**
		 * Sets the ClusterBuilder for this sink. A ClusterBuilder is used to configure the
		 * connection to Cassandra.
		 *
		 * @param builder ClusterBuilder to configure the connection to Cassandra
		 * @return this builder
		 */
		public CassandraSinkBuilder<IN> setClusterBuilder(ClusterBuilder builder) {
			if (this.builder != null) {
				throw new IllegalArgumentException("Builder was already set. You must use either setHost() or setClusterBuilder().");
			}
			this.builder = builder;
			return this;
		}

		/**
		 * Enables the write-ahead log, which allows exactly-once processing for non-deterministic
		 * algorithms that use idempotent updates.
		 *
		 * @return this builder
		 */
		public CassandraSinkBuilder<IN> enableWriteAheadLog() {
			this.isWriteAheadLogEnabled = true;
			return this;
		}

		/**
		 * Enables the write-ahead log, which allows exactly-once processing for non-deterministic
		 * algorithms that use idempotent updates.
		 *
		 * @param committer CheckpointCommitter that stores information about completed checkpoints
		 *                  in an external resource. By default this information is stored within a
		 *                  separate table within Cassandra.
		 * @return this builder
		 */
		public CassandraSinkBuilder<IN> enableWriteAheadLog(CheckpointCommitter committer) {
			this.isWriteAheadLogEnabled = true;
			this.committer = committer;
			return this;
		}
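		/*
		 * A sketch of supplying a custom ClusterBuilder instead of setHost(), e.g.
		 * to pass credentials. withCredentials() is part of the DataStax driver's
		 * Cluster.Builder API; the contact point and credentials shown here are
		 * placeholder values.
		 *
		 *   builder.setClusterBuilder(new ClusterBuilder() {
		 *       @Override
		 *       protected Cluster buildCluster(Cluster.Builder builder) {
		 *           return builder
		 *               .addContactPoint("127.0.0.1")
		 *               .withCredentials("user", "pass")
		 *               .build();
		 *       }
		 *   });
		 */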
		/**
		 * Finalizes the configuration of this sink.
		 *
		 * @return finalized sink
		 * @throws Exception if the sanity checks fail or the sink cannot be constructed
		 */
		public abstract CassandraSink<IN> build() throws Exception;

		protected void sanityCheck() {
			if (builder == null) {
				throw new IllegalArgumentException("Cassandra host information must be supplied using either setHost() or setClusterBuilder().");
			}
		}
	}

	public static class CassandraTupleSinkBuilder<IN extends Tuple> extends CassandraSinkBuilder<IN> {
		public CassandraTupleSinkBuilder(DataStream<IN> input, TypeInformation<IN> typeInfo, TypeSerializer<IN> serializer) {
			super(input, typeInfo, serializer);
		}

		@Override
		protected void sanityCheck() {
			super.sanityCheck();
			if (query == null || query.length() == 0) {
				throw new IllegalArgumentException("Query must not be null or empty.");
			}
		}

		@Override
		public CassandraSink<IN> build() throws Exception {
			sanityCheck();
			if (isWriteAheadLogEnabled) {
				return committer == null
					? new CassandraSink<>(input.transform("Cassandra Sink", null, new CassandraTupleWriteAheadSink<>(query, serializer, builder, new CassandraCommitter(builder))))
					: new CassandraSink<>(input.transform("Cassandra Sink", null, new CassandraTupleWriteAheadSink<>(query, serializer, builder, committer)));
			} else {
				return new CassandraSink<>(input.addSink(new CassandraTupleSink<IN>(query, builder)).name("Cassandra Sink"));
			}
		}
	}

	public static class CassandraPojoSinkBuilder<IN> extends CassandraSinkBuilder<IN> {
		public CassandraPojoSinkBuilder(DataStream<IN> input, TypeInformation<IN> typeInfo, TypeSerializer<IN> serializer) {
			super(input, typeInfo, serializer);
		}

		@Override
		protected void sanityCheck() {
			super.sanityCheck();
			if (query != null) {
				throw new IllegalArgumentException("Specifying a query is not allowed when using a Pojo-Stream as input.");
			}
		}

		@Override
		public CassandraSink<IN> build() throws Exception {
			sanityCheck();
			if (isWriteAheadLogEnabled) {
				throw new IllegalArgumentException("Exactly-once guarantees can only be provided for tuple types.");
			} else {
				return new CassandraSink<>(input.addSink(new CassandraPojoSink<>(typeInfo.getTypeClass(), builder)).name("Cassandra Sink"));
			}
		}
	}
}
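/*
 * An end-to-end sketch combining the builders above. The environment setup,
 * keyspace/table, and data are hypothetical; enableWriteAheadLog() is only
 * supported for tuple input, as enforced by CassandraPojoSinkBuilder#build().
 *
 *   StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
 *   env.enableCheckpointing(5000); // WAL commits are tied to completed checkpoints
 *
 *   DataStream<Tuple2<String, Long>> events = env.fromElements(
 *       Tuple2.of("a", 1L), Tuple2.of("b", 2L));
 *
 *   CassandraSink.addSink(events)
 *       .setQuery("INSERT INTO example.events (key, value) VALUES (?, ?);")
 *       .setHost("127.0.0.1")
 *       .enableWriteAheadLog()
 *       .build();
 *
 *   env.execute("cassandra-sink-example");
 */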