/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.source.extractor.extract.kafka;

import io.confluent.kafka.serializers.KafkaAvroDeserializer;
import io.confluent.kafka.serializers.KafkaJsonDeserializer;

import java.io.IOException;
import java.util.Properties;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Getter;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.lang3.reflect.ConstructorUtils;
import org.apache.kafka.common.serialization.ByteArrayDeserializer;
import org.apache.kafka.common.serialization.Deserializer;
import org.apache.kafka.common.serialization.StringDeserializer;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Enums;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;

import gobblin.annotation.Alias;
import gobblin.configuration.WorkUnitState;
import gobblin.kafka.client.ByteArrayBasedKafkaRecord;
import gobblin.metrics.kafka.KafkaSchemaRegistry;
import gobblin.metrics.kafka.SchemaRegistryException;
import gobblin.util.AvroUtils;
import gobblin.util.PropertiesUtils;


/**
 * <p>
 * Extension of {@link KafkaExtractor} that wraps Kafka's {@link Deserializer} API. Kafka's {@link Deserializer}
 * provides a generic way of converting Kafka {@link kafka.message.Message}s to {@link Object}s. Typically, a
 * {@link Deserializer} is used along with a {@link org.apache.kafka.common.serialization.Serializer}, which is
 * responsible for converting an {@link Object} to a Kafka {@link kafka.message.Message}. These APIs are useful for
 * reading from and writing to Kafka, since Kafka is primarily a byte-oriented system.
 * </p>
 *
 * <p>
 * This class wraps the {@link Deserializer} API, allowing any existing class that implements the
 * {@link Deserializer} interface to integrate seamlessly with Gobblin. The deserializer can be specified in the
 * following ways:
 *
 * <ul>
 *   <li>{@link #KAFKA_DESERIALIZER_TYPE} can be set to a pre-defined enum value from {@link Deserializers}, or to
 *   the fully-qualified name of a {@link Class} that implements the {@link Deserializer} interface. If this property
 *   is set to a class name, then a {@link KafkaSchemaRegistry} must also be specified using the
 *   {@link KafkaSchemaRegistry#KAFKA_SCHEMA_REGISTRY_CLASS} config key; see the example configuration below.</li>
 * </ul>
 * </p>
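 *
 * <p>
 * An illustrative configuration; the URL and the <code>com.example</code> class names are placeholders, and the
 * <code>kafka.schema.registry.*</code> keys are assumed to match
 * {@link KafkaSchemaRegistry#KAFKA_SCHEMA_REGISTRY_URL} and {@link KafkaSchemaRegistry#KAFKA_SCHEMA_REGISTRY_CLASS}:
 * <pre>{@code
 * # Pre-defined deserializer backed by Confluent's Schema Registry
 * kafka.deserializer.type=CONFLUENT_AVRO
 * kafka.schema.registry.url=http://localhost:8081
 *
 * # Alternatively, a custom Deserializer implementation with an explicit schema registry class
 * kafka.deserializer.type=com.example.MyDeserializer
 * kafka.schema.registry.class=com.example.MySchemaRegistry
 * }</pre>
 * </p>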
 */
@Getter(AccessLevel.PACKAGE)
@Alias(value = "DESERIALIZER")
public class KafkaDeserializerExtractor extends KafkaExtractor<Object, Object> {

  public static final String KAFKA_DESERIALIZER_TYPE = "kafka.deserializer.type";

  private static final String CONFLUENT_SCHEMA_REGISTRY_URL = "schema.registry.url";

  private final Deserializer<?> kafkaDeserializer;
  private final KafkaSchemaRegistry<?, ?> kafkaSchemaRegistry;
  private final Schema latestSchema;

  public KafkaDeserializerExtractor(WorkUnitState state) throws ReflectiveOperationException {
    this(state, getDeserializerType(state.getProperties()));
  }

  private KafkaDeserializerExtractor(WorkUnitState state, Optional<Deserializers> deserializerType)
      throws ReflectiveOperationException {
    this(state, deserializerType, getDeserializer(getProps(state), deserializerType),
        getKafkaSchemaRegistry(getProps(state)));
  }

  @VisibleForTesting
  KafkaDeserializerExtractor(WorkUnitState state, Optional<Deserializers> deserializerType,
      Deserializer<?> kafkaDeserializer, KafkaSchemaRegistry<?, ?> kafkaSchemaRegistry) {
    super(state);
    this.kafkaDeserializer = kafkaDeserializer;
    this.kafkaSchemaRegistry = kafkaSchemaRegistry;
    this.latestSchema =
        (deserializerType.equals(Optional.of(Deserializers.CONFLUENT_AVRO))) ? (Schema) getSchema() : null;
  }

  @Override
  protected Object decodeRecord(ByteArrayBasedKafkaRecord messageAndOffset) throws IOException {
    Object deserialized = this.kafkaDeserializer.deserialize(this.topicName, messageAndOffset.getMessageBytes());
    // For Confluent's Schema Registry, the read schema is the latest registered schema, to support schema evolution
    return (this.latestSchema == null) ? deserialized
        : AvroUtils.convertRecordSchema((GenericRecord) deserialized, this.latestSchema);
  }

  @Override
  public Object getSchema() {
    try {
      return this.kafkaSchemaRegistry.getLatestSchemaByTopic(this.topicName);
    } catch (SchemaRegistryException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Resolves {@link #KAFKA_DESERIALIZER_TYPE} to a pre-defined {@link Deserializers} enum value, if it matches one.
   */
  private static Optional<Deserializers> getDeserializerType(Properties props) {
    Preconditions.checkArgument(props.containsKey(KAFKA_DESERIALIZER_TYPE),
        "Missing required property " + KAFKA_DESERIALIZER_TYPE);
    return Enums.getIfPresent(Deserializers.class, props.getProperty(KAFKA_DESERIALIZER_TYPE).toUpperCase());
  }

  /**
   * Constructs a {@link Deserializer} based on the value of {@link #KAFKA_DESERIALIZER_TYPE}.
   */
  private static Deserializer<?> getDeserializer(Properties props, Optional<Deserializers> deserializerType)
      throws ReflectiveOperationException {
    Deserializer<?> deserializer;
    if (deserializerType.isPresent()) {
      deserializer = ConstructorUtils.invokeConstructor(deserializerType.get().getDeserializerClass());
    } else {
      deserializer = Deserializer.class
          .cast(ConstructorUtils.invokeConstructor(Class.forName(props.getProperty(KAFKA_DESERIALIZER_TYPE))));
    }
    deserializer.configure(PropertiesUtils.propsToStringKeyMap(props), false);
    return deserializer;
  }

  /**
   * Constructs a {@link KafkaSchemaRegistry} based on the value of {@link #KAFKA_DESERIALIZER_TYPE}: a pre-defined
   * deserializer type supplies its own registry class, a custom deserializer class falls back to the
   * {@link KafkaSchemaRegistry#KAFKA_SCHEMA_REGISTRY_CLASS} config key, and if neither applies this method defaults
   * to a {@link SimpleKafkaSchemaRegistry}.
   */
  private static KafkaSchemaRegistry<?, ?> getKafkaSchemaRegistry(Properties props)
      throws ReflectiveOperationException {
    Optional<Deserializers> deserializerType =
        Enums.getIfPresent(Deserializers.class, props.getProperty(KAFKA_DESERIALIZER_TYPE).toUpperCase());
    if (deserializerType.isPresent()) {
      return ConstructorUtils.invokeConstructor(deserializerType.get().getSchemaRegistryClass(), props);
    }
    if (props.containsKey(KafkaSchemaRegistry.KAFKA_SCHEMA_REGISTRY_CLASS)) {
      return KafkaSchemaRegistry.get(props);
    }
    return new SimpleKafkaSchemaRegistry(props);
  }

  /**
   * Gets the {@link Properties} from a {@link WorkUnitState} and sets the config <code>schema.registry.url</code> to
   * the value of {@link KafkaSchemaRegistry#KAFKA_SCHEMA_REGISTRY_URL}, if set. This way users don't need to specify
   * both properties, as <code>schema.registry.url</code> is required by the {@link ConfluentKafkaSchemaRegistry}.
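   *
   * <p>
   * A minimal sketch of the mapping (the URL value is a placeholder):
   * <pre>{@code
   * # given in the work unit:
   * kafka.schema.registry.url=http://localhost:8081
   * # this method then also sets:
   * schema.registry.url=http://localhost:8081
   * }</pre>
   * </p>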
   */
  private static Properties getProps(WorkUnitState workUnitState) {
    Properties properties = workUnitState.getProperties();
    if (properties.containsKey(KafkaSchemaRegistry.KAFKA_SCHEMA_REGISTRY_URL)) {
      properties.setProperty(CONFLUENT_SCHEMA_REGISTRY_URL,
          properties.getProperty(KafkaSchemaRegistry.KAFKA_SCHEMA_REGISTRY_URL));
    }
    return properties;
  }

  /**
   * Pre-defined {@link Deserializer}s that can be referenced by enum name.
   */
  @AllArgsConstructor
  @Getter
  public enum Deserializers {

    /**
     * Confluent's Avro {@link Deserializer}
     *
     * @see KafkaAvroDeserializer
     */
    CONFLUENT_AVRO(KafkaAvroDeserializer.class, ConfluentKafkaSchemaRegistry.class),

    /**
     * Confluent's JSON {@link Deserializer}
     *
     * @see KafkaJsonDeserializer
     */
    CONFLUENT_JSON(KafkaJsonDeserializer.class, SimpleKafkaSchemaRegistry.class),

    /**
     * A custom {@link Deserializer} for converting <code>byte[]</code> to {@link com.google.gson.JsonElement}s
     *
     * @see KafkaGsonDeserializer
     */
    GSON(KafkaGsonDeserializer.class, SimpleKafkaSchemaRegistry.class),

    /**
     * A standard Kafka {@link Deserializer} that does nothing; it simply returns the raw <code>byte[]</code>
     */
    BYTE_ARRAY(ByteArrayDeserializer.class, SimpleKafkaSchemaRegistry.class),

    /**
     * A standard Kafka {@link Deserializer} for converting <code>byte[]</code> to {@link String}s
     */
    STRING(StringDeserializer.class, SimpleKafkaSchemaRegistry.class);

    private final Class<? extends Deserializer> deserializerClass;
    private final Class<? extends KafkaSchemaRegistry> schemaRegistryClass;
  }
}