/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.streaming.connectors.cassandra;
import com.datastax.driver.core.Cluster;
import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.typeutils.TupleTypeInfo;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSink;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.operators.ChainingStrategy;
import org.apache.flink.streaming.api.transformations.SinkTransformation;
import org.apache.flink.streaming.api.transformations.StreamTransformation;
import org.apache.flink.streaming.runtime.operators.CheckpointCommitter;
/**
 * This class wraps different Cassandra sink implementations to provide a common interface for all of them.
 *
 * <p>An instance is backed either by a {@link DataStreamSink} (created through
 * {@code DataStream#addSink}) or by a {@link SingleOutputStreamOperator} (created through
 * {@code DataStream#transform}, used for the write-ahead-log sink). Every configuration method
 * is forwarded to the transformation of whichever of the two is in use.
 *
 * @param <IN> input type
 */
public class CassandraSink<IN> {

    /** Operator/sink name used for every sink created by the builders in this class. */
    private static final String SINK_NAME = "Cassandra Sink";

    /** {@code true} if this instance is backed by {@link #sink1}, {@code false} if by {@link #sink2}. */
    private final boolean useDataStreamSink;

    /** Backing sink when created from a {@link DataStreamSink}; {@code null} otherwise. */
    private final DataStreamSink<IN> sink1;

    /** Backing operator when created from a {@link SingleOutputStreamOperator}; {@code null} otherwise. */
    private final SingleOutputStreamOperator<IN> sink2;

    private CassandraSink(DataStreamSink<IN> sink) {
        sink1 = sink;
        sink2 = null;
        useDataStreamSink = true;
    }

    private CassandraSink(SingleOutputStreamOperator<IN> sink) {
        sink1 = null;
        sink2 = sink;
        useDataStreamSink = false;
    }

    /**
     * Returns the transformation of whichever sink backs this instance.
     *
     * <p>Both backing types expose a {@link StreamTransformation} (the sink variant as the
     * {@link SinkTransformation} subclass), so all setters used by this class can be invoked
     * through the common supertype.
     */
    private StreamTransformation<?> getTransformation() {
        if (useDataStreamSink) {
            SinkTransformation<IN> sinkTransformation = sink1.getTransformation();
            return sinkTransformation;
        }
        return sink2.getTransformation();
    }

    /**
     * Sets the name of this sink. This name is
     * used by the visualization and logging during runtime.
     *
     * @param name The name to assign to this sink.
     * @return The named sink.
     */
    public CassandraSink<IN> name(String name) {
        getTransformation().setName(name);
        return this;
    }

    /**
     * Sets an ID for this operator.
     *
     * <p>The specified ID is used to assign the same operator ID across job
     * submissions (for example when starting a job from a savepoint).
     *
     * <p><strong>Important</strong>: this ID needs to be unique per
     * transformation and job. Otherwise, job submission will fail.
     *
     * @param uid The unique user-specified ID of this transformation.
     * @return The operator with the specified ID.
     */
    @PublicEvolving
    public CassandraSink<IN> uid(String uid) {
        getTransformation().setUid(uid);
        return this;
    }

    /**
     * Sets an user provided hash for this operator. This will be used AS IS to create the JobVertexID.
     *
     * <p>The user provided hash is an alternative to the generated hashes, that is considered when identifying an
     * operator through the default hash mechanics fails (e.g. because of changes between Flink versions).
     *
     * <p><strong>Important</strong>: this should be used as a workaround or for trouble shooting. The provided hash
     * needs to be unique per transformation and job. Otherwise, job submission will fail. Furthermore, you cannot
     * assign user-specified hash to intermediate nodes in an operator chain and trying so will let your job fail.
     *
     * <p>A use case for this is in migration between Flink versions or changing the jobs in a way that changes the
     * automatically generated hashes. In this case, providing the previous hashes directly through this method (e.g.
     * obtained from old logs) can help to reestablish a lost mapping from states to their target operator.
     *
     * @param uidHash The user provided hash for this operator. This will become the JobVertexID, which is shown in the
     *                logs and web ui.
     * @return The operator with the user provided hash.
     */
    @PublicEvolving
    public CassandraSink<IN> setUidHash(String uidHash) {
        getTransformation().setUidHash(uidHash);
        return this;
    }

    /**
     * Sets the parallelism for this sink. The degree must be higher than zero.
     *
     * @param parallelism The parallelism for this sink.
     * @return The sink with set parallelism.
     */
    public CassandraSink<IN> setParallelism(int parallelism) {
        getTransformation().setParallelism(parallelism);
        return this;
    }

    /**
     * Turns off chaining for this operator so thread co-location will not be
     * used as an optimization.
     *
     * <p>Chaining can be turned off for the whole
     * job by {@link org.apache.flink.streaming.api.environment.StreamExecutionEnvironment#disableOperatorChaining()}
     * however it is not advised for performance considerations.
     *
     * @return The sink with chaining disabled
     */
    public CassandraSink<IN> disableChaining() {
        getTransformation().setChainingStrategy(ChainingStrategy.NEVER);
        return this;
    }

    /**
     * Sets the slot sharing group of this operation. Parallel instances of
     * operations that are in the same slot sharing group will be co-located in the same
     * TaskManager slot, if possible.
     *
     * <p>Operations inherit the slot sharing group of input operations if all input operations
     * are in the same slot sharing group and no slot sharing group was explicitly specified.
     *
     * <p>Initially an operation is in the default slot sharing group. An operation can be put into
     * the default group explicitly by setting the slot sharing group to {@code "default"}.
     *
     * @param slotSharingGroup The slot sharing group name.
     * @return The sink with the slot sharing group set.
     */
    public CassandraSink<IN> slotSharingGroup(String slotSharingGroup) {
        getTransformation().setSlotSharingGroup(slotSharingGroup);
        return this;
    }

    /**
     * Writes a DataStream into a Cassandra database.
     *
     * <p>If the type information of the stream is a {@link TupleTypeInfo}, a
     * {@link CassandraTupleSinkBuilder} is returned; otherwise a {@link CassandraPojoSinkBuilder}
     * is returned.
     *
     * @param input input DataStream
     * @param <IN> input type
     * @param <T> tuple view of the input type, used only when the stream carries tuple type information
     * @return CassandraSinkBuilder, to further configure the sink
     */
    // The casts below are guarded by the TupleTypeInfo check on the stream's type information:
    // in that branch IN and T denote the same tuple type.
    @SuppressWarnings("unchecked")
    public static <IN, T extends Tuple> CassandraSinkBuilder<IN> addSink(DataStream<IN> input) {
        if (input.getType() instanceof TupleTypeInfo) {
            DataStream<T> tupleInput = (DataStream<T>) input;
            TypeInformation<T> tupleType = tupleInput.getType();
            TypeSerializer<T> tupleSerializer = tupleType.createSerializer(tupleInput.getExecutionEnvironment().getConfig());
            return (CassandraSinkBuilder<IN>) new CassandraTupleSinkBuilder<T>(tupleInput, tupleType, tupleSerializer);
        }

        TypeInformation<IN> pojoType = input.getType();
        TypeSerializer<IN> pojoSerializer = pojoType.createSerializer(input.getExecutionEnvironment().getConfig());
        return new CassandraPojoSinkBuilder<>(input, pojoType, pojoSerializer);
    }

    /**
     * Base class for the builders that configure and create a {@link CassandraSink}.
     *
     * <p>Connection information has to be supplied exactly once, through either
     * {@link #setHost(String)}, {@link #setHost(String, int)} or
     * {@link #setClusterBuilder(ClusterBuilder)}.
     *
     * @param <IN> input type
     */
    public abstract static class CassandraSinkBuilder<IN> {

        private static final String BUILDER_ALREADY_SET_MESSAGE =
            "Builder was already set. You must use either setHost() or setClusterBuilder().";

        protected final DataStream<IN> input;
        protected final TypeSerializer<IN> serializer;
        protected final TypeInformation<IN> typeInfo;
        protected ClusterBuilder builder;
        protected String query;
        protected CheckpointCommitter committer;
        protected boolean isWriteAheadLogEnabled;

        /**
         * Creates a builder for the given stream.
         *
         * @param input stream whose records are written to Cassandra
         * @param typeInfo type information of the stream's records
         * @param serializer serializer for the stream's records
         */
        public CassandraSinkBuilder(DataStream<IN> input, TypeInformation<IN> typeInfo, TypeSerializer<IN> serializer) {
            this.input = input;
            this.typeInfo = typeInfo;
            this.serializer = serializer;
        }

        /**
         * Sets the query that is to be executed for every record.
         *
         * @param query query to use
         * @return this builder
         */
        public CassandraSinkBuilder<IN> setQuery(String query) {
            this.query = query;
            return this;
        }

        /**
         * Sets the cassandra host to connect to, using port 9042.
         *
         * @param host host to connect to
         * @return this builder
         * @throws IllegalArgumentException if connection information was already set
         */
        public CassandraSinkBuilder<IN> setHost(String host) {
            return setHost(host, 9042);
        }

        /**
         * Sets the cassandra host/port to connect to.
         *
         * @param host host to connect to
         * @param port port to connect to
         * @return this builder
         * @throws IllegalArgumentException if connection information was already set
         */
        public CassandraSinkBuilder<IN> setHost(final String host, final int port) {
            checkBuilderNotSet();
            this.builder = new ClusterBuilder() {
                @Override
                protected Cluster buildCluster(Cluster.Builder builder) {
                    return builder.addContactPoint(host).withPort(port).build();
                }
            };
            return this;
        }

        /**
         * Sets the ClusterBuilder for this sink. A ClusterBuilder is used to configure the connection to cassandra.
         *
         * @param builder ClusterBuilder to configure the connection to cassandra
         * @return this builder
         * @throws IllegalArgumentException if connection information was already set
         */
        public CassandraSinkBuilder<IN> setClusterBuilder(ClusterBuilder builder) {
            checkBuilderNotSet();
            this.builder = builder;
            return this;
        }

        /**
         * Enables the write-ahead log, which allows exactly-once processing for non-deterministic algorithms that use
         * idempotent updates.
         *
         * @return this builder
         */
        public CassandraSinkBuilder<IN> enableWriteAheadLog() {
            this.isWriteAheadLogEnabled = true;
            return this;
        }

        /**
         * Enables the write-ahead log, which allows exactly-once processing for non-deterministic algorithms that use
         * idempotent updates.
         *
         * @param committer CheckpointCommitter, that stores information about completed checkpoints in an external
         *                  resource. By default this information is stored within a separate table within Cassandra.
         * @return this builder
         */
        public CassandraSinkBuilder<IN> enableWriteAheadLog(CheckpointCommitter committer) {
            this.isWriteAheadLogEnabled = true;
            this.committer = committer;
            return this;
        }

        /**
         * Finalizes the configuration of this sink.
         *
         * @return finalized sink
         * @throws Exception if building the sink fails
         */
        public abstract CassandraSink<IN> build() throws Exception;

        /**
         * Verifies that connection information has been supplied.
         *
         * @throws IllegalArgumentException if neither a host nor a ClusterBuilder was set
         */
        protected void sanityCheck() {
            if (builder == null) {
                throw new IllegalArgumentException("Cassandra host information must be supplied using either setHost() or setClusterBuilder().");
            }
        }

        /** Rejects a second attempt to supply connection information. */
        private void checkBuilderNotSet() {
            if (this.builder != null) {
                throw new IllegalArgumentException(BUILDER_ALREADY_SET_MESSAGE);
            }
        }
    }

    /**
     * Builder for sinks that write tuple streams; requires a query and supports the write-ahead log.
     *
     * @param <IN> tuple input type
     */
    public static class CassandraTupleSinkBuilder<IN extends Tuple> extends CassandraSinkBuilder<IN> {

        public CassandraTupleSinkBuilder(DataStream<IN> input, TypeInformation<IN> typeInfo, TypeSerializer<IN> serializer) {
            super(input, typeInfo, serializer);
        }

        /**
         * Verifies connection information and that a non-empty query was set.
         *
         * @throws IllegalArgumentException if connection information or the query is missing
         */
        @Override
        protected void sanityCheck() {
            super.sanityCheck();
            if (query == null || query.isEmpty()) {
                throw new IllegalArgumentException("Query must not be null or empty.");
            }
        }

        /**
         * Creates the tuple sink; with the write-ahead log enabled a write-ahead sink operator is
         * added instead of a regular sink.
         *
         * @return finalized sink
         * @throws Exception if building the sink fails
         */
        @Override
        public CassandraSink<IN> build() throws Exception {
            sanityCheck();
            if (!isWriteAheadLogEnabled) {
                return new CassandraSink<>(input.addSink(new CassandraTupleSink<IN>(query, builder)).name(SINK_NAME));
            }

            // Without a user-supplied committer, fall back to the Cassandra-based one.
            CheckpointCommitter effectiveCommitter = committer == null ? new CassandraCommitter(builder) : committer;
            return new CassandraSink<>(input.transform(SINK_NAME, null, new CassandraTupleWriteAheadSink<>(query, serializer, builder, effectiveCommitter)));
        }
    }

    /**
     * Builder for sinks that write POJO streams; a query must not be set and the write-ahead log
     * is not supported.
     *
     * @param <IN> POJO input type
     */
    public static class CassandraPojoSinkBuilder<IN> extends CassandraSinkBuilder<IN> {

        public CassandraPojoSinkBuilder(DataStream<IN> input, TypeInformation<IN> typeInfo, TypeSerializer<IN> serializer) {
            super(input, typeInfo, serializer);
        }

        /**
         * Verifies connection information and that no query was set.
         *
         * @throws IllegalArgumentException if connection information is missing or a query was specified
         */
        @Override
        protected void sanityCheck() {
            super.sanityCheck();
            if (query != null) {
                throw new IllegalArgumentException("Specifying a query is not allowed when using a Pojo-Stream as input.");
            }
        }

        /**
         * Creates the POJO sink.
         *
         * @return finalized sink
         * @throws IllegalArgumentException if the write-ahead log was enabled
         * @throws Exception if building the sink fails
         */
        @Override
        public CassandraSink<IN> build() throws Exception {
            sanityCheck();
            if (isWriteAheadLogEnabled) {
                throw new IllegalArgumentException("Exactly-once guarantees can only be provided for tuple types.");
            }
            return new CassandraSink<>(input.addSink(new CassandraPojoSink<>(typeInfo.getTypeClass(), builder)).name(SINK_NAME));
        }
    }
}