/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.service;

import java.io.ByteArrayInputStream;
import java.io.Closeable;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.Future;
import java.util.regex.Pattern;

import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.Decoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.specific.SpecificDatumReader;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.slf4j.Logger;

import com.google.common.base.Optional;
import com.google.common.collect.Lists;
import com.typesafe.config.Config;

import gobblin.kafka.client.ByteArrayBasedKafkaRecord;
import gobblin.kafka.client.DecodeableKafkaRecord;
import gobblin.kafka.client.GobblinKafkaConsumerClient;
import gobblin.kafka.client.Kafka08ConsumerClient;
import gobblin.kafka.client.KafkaConsumerRecord;
import gobblin.metrics.reporter.util.FixedSchemaVersionWriter;
import gobblin.metrics.reporter.util.SchemaVersionWriter;
import gobblin.runtime.api.JobSpec;
import gobblin.runtime.api.Spec;
import gobblin.runtime.api.SpecExecutorInstanceConsumer;
import gobblin.runtime.job_spec.AvroJobSpec;
import gobblin.source.extractor.extract.kafka.KafkaOffsetRetrievalFailureException;
import gobblin.source.extractor.extract.kafka.KafkaPartition;
import gobblin.source.extractor.extract.kafka.KafkaTopic;
import gobblin.util.CompletedFuture;


/**
 * A {@link SpecExecutorInstanceConsumer} that reads {@link AvroJobSpec} records from the Kafka topic
 * configured under {@code SPEC_KAFKA_TOPICS_KEY} and converts them into {@link JobSpec}s paired with
 * the {@link Verb} carried in each record's metadata.
 */
public class SimpleKafkaSpecExecutorInstanceConsumer extends SimpleKafkaSpecExecutorInstance
    implements SpecExecutorInstanceConsumer<Spec>, Closeable {

  // Consumer
  protected final GobblinKafkaConsumerClient _kafka08Consumer;
  protected final List<KafkaPartition> _partitions;
  protected final List<Long> _lowWatermark;
  protected final List<Long> _nextWatermark;
  protected final List<Long> _highWatermark;

  private Iterator<KafkaConsumerRecord> messageIterator = null;
  private int currentPartitionIdx = -1;
  private boolean isFirstRun = true;

  private final BinaryDecoder _decoder;
  private final SpecificDatumReader<AvroJobSpec> _reader;
  private final SchemaVersionWriter<?> _versionWriter;

  public SimpleKafkaSpecExecutorInstanceConsumer(Config config, Optional<Logger> log) {
    super(config, log);

    // Consumer
    _kafka08Consumer = new Kafka08ConsumerClient.Factory().create(config);
    List<KafkaTopic> kafkaTopics = _kafka08Consumer.getFilteredTopics(Collections.EMPTY_LIST,
        Lists.newArrayList(Pattern.compile(config.getString(SPEC_KAFKA_TOPICS_KEY))));
    _partitions = kafkaTopics.get(0).getPartitions();
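    // One watermark entry per partition of the spec topic, indexed by the partition's position in
    // _partitions. Low and high watermarks are refreshed on every changedSpecs() call; the next
    // watermark advances as records are consumed.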
    _lowWatermark = Lists.newArrayList(Collections.nCopies(_partitions.size(), 0L));
    _nextWatermark = Lists.newArrayList(Collections.nCopies(_partitions.size(), 0L));
    _highWatermark = Lists.newArrayList(Collections.nCopies(_partitions.size(), 0L));

    InputStream dummyInputStream = new ByteArrayInputStream(new byte[0]);
    _decoder = DecoderFactory.get().binaryDecoder(dummyInputStream, null);
    _reader = new SpecificDatumReader<AvroJobSpec>(AvroJobSpec.SCHEMA$);
    _versionWriter = new FixedSchemaVersionWriter();
  }

  public SimpleKafkaSpecExecutorInstanceConsumer(Config config, Logger log) {
    this(config, Optional.of(log));
  }

  /** Constructor with no logging */
  public SimpleKafkaSpecExecutorInstanceConsumer(Config config) {
    this(config, Optional.<Logger>absent());
  }

  /**
   * Polls every partition of the spec topic from its next watermark up to the current high
   * watermark and converts each decodable record into a (Verb, JobSpec) pair.
   */
  @Override
  public Future<? extends List<Pair<Verb, Spec>>> changedSpecs() {
    List<Pair<Verb, Spec>> changedSpecs = new ArrayList<>();
    initializeWatermarks();
    this.currentPartitionIdx = -1;
    while (!allPartitionsFinished()) {
      if (currentPartitionFinished()) {
        moveToNextPartition();
        continue;
      }
      if (this.messageIterator == null || !this.messageIterator.hasNext()) {
        try {
          this.messageIterator = fetchNextMessageBuffer();
        } catch (Exception e) {
          _log.error(String.format("Failed to fetch next message buffer for partition %s. Will skip this partition.",
              getCurrentPartition()), e);
          moveToNextPartition();
          continue;
        }
        if (this.messageIterator == null || !this.messageIterator.hasNext()) {
          moveToNextPartition();
          continue;
        }
      }
      while (!currentPartitionFinished()) {
        if (!this.messageIterator.hasNext()) {
          break;
        }

        KafkaConsumerRecord nextValidMessage = this.messageIterator.next();

        // Even though we ask Kafka to give us a message buffer starting from offset x, it may
        // return a buffer that starts from an offset smaller than x, so we need to skip messages
        // until we get to x.
        if (nextValidMessage.getOffset() < _nextWatermark.get(this.currentPartitionIdx)) {
          continue;
        }

        _nextWatermark.set(this.currentPartitionIdx, nextValidMessage.getNextOffset());
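        // A record may arrive either as raw bytes (decoded below via decodeRecord) or as an
        // already-deserialized AvroJobSpec, depending on which GobblinKafkaConsumerClient
        // implementation produced it.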
        try {
          final AvroJobSpec record;

          if (nextValidMessage instanceof ByteArrayBasedKafkaRecord) {
            record = decodeRecord((ByteArrayBasedKafkaRecord) nextValidMessage);
          } else if (nextValidMessage instanceof DecodeableKafkaRecord) {
            record = ((DecodeableKafkaRecord<?, AvroJobSpec>) nextValidMessage).getValue();
          } else {
            throw new IllegalStateException("Unsupported KafkaConsumerRecord type. The returned record can either be"
                + " ByteArrayBasedKafkaRecord or DecodeableKafkaRecord");
          }

          JobSpec.Builder jobSpecBuilder = JobSpec.builder(record.getUri());

          Properties props = new Properties();
          props.putAll(record.getProperties());
          jobSpecBuilder.withJobCatalogURI(record.getUri()).withVersion(record.getVersion())
              .withDescription(record.getDescription()).withConfigAsProperties(props);

          if (!record.getTemplateUri().isEmpty()) {
            jobSpecBuilder.withTemplate(new URI(record.getTemplateUri()));
          }

          String verbName = record.getMetadata().get(VERB_KEY);
          Verb verb = Verb.valueOf(verbName);

          changedSpecs.add(new ImmutablePair<Verb, Spec>(verb, jobSpecBuilder.build()));
        } catch (Throwable t) {
          _log.error("Could not decode record at partition " + this.currentPartitionIdx
              + " offset " + nextValidMessage.getOffset(), t);
        }
      }
    }

    return new CompletedFuture(changedSpecs, null);
  }

  private void initializeWatermarks() {
    initializeLowWatermarks();
    initializeHighWatermarks();
  }

  /**
   * On the first run the low watermarks start at the earliest available offsets; on subsequent
   * runs they resume from the high watermarks of the previous run.
   */
  private void initializeLowWatermarks() {
    try {
      int i = 0;
      for (KafkaPartition kafkaPartition : _partitions) {
        if (isFirstRun) {
          long earliestOffset = _kafka08Consumer.getEarliestOffset(kafkaPartition);
          _lowWatermark.set(i, earliestOffset);
        } else {
          _lowWatermark.set(i, _highWatermark.get(i));
        }
        i++;
      }
      isFirstRun = false;
    } catch (KafkaOffsetRetrievalFailureException e) {
      throw new RuntimeException(e);
    }
  }

  private void initializeHighWatermarks() {
    try {
      int i = 0;
      for (KafkaPartition kafkaPartition : _partitions) {
        long latestOffset = _kafka08Consumer.getLatestOffset(kafkaPartition);
        _highWatermark.set(i, latestOffset);
        i++;
      }
    } catch (KafkaOffsetRetrievalFailureException e) {
      throw new RuntimeException(e);
    }
  }

  private boolean allPartitionsFinished() {
    return this.currentPartitionIdx >= _nextWatermark.size();
  }

  private boolean currentPartitionFinished() {
    if (this.currentPartitionIdx == -1) {
      return true;
    } else if (_nextWatermark.get(this.currentPartitionIdx) >= _highWatermark.get(this.currentPartitionIdx)) {
      return true;
    } else {
      return false;
    }
  }

  private int moveToNextPartition() {
    this.messageIterator = null;
    return this.currentPartitionIdx++;
  }

  private KafkaPartition getCurrentPartition() {
    return _partitions.get(this.currentPartitionIdx);
  }

  private Iterator<KafkaConsumerRecord> fetchNextMessageBuffer() {
    return _kafka08Consumer.consume(_partitions.get(this.currentPartitionIdx),
        _nextWatermark.get(this.currentPartitionIdx), _highWatermark.get(this.currentPartitionIdx));
  }

  /** Reads past the fixed schema-version header, then Avro-decodes the remaining bytes into an {@link AvroJobSpec}. */
  private AvroJobSpec decodeRecord(ByteArrayBasedKafkaRecord kafkaConsumerRecord) throws IOException {
    InputStream is = new ByteArrayInputStream(kafkaConsumerRecord.getMessageBytes());
    _versionWriter.readSchemaVersioningInformation(new DataInputStream(is));

    Decoder decoder = DecoderFactory.get().binaryDecoder(is, _decoder);

    return _reader.read(null, decoder);
  }

  @Override
  public void close() throws IOException {
    _kafka08Consumer.close();
  }
}
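
// Illustrative usage sketch (comment only, not part of this class; exception handling omitted).
// The surrounding wiring is an assumption: the supplied Config must contain SPEC_KAFKA_TOPICS_KEY
// plus whatever broker/consumer settings the underlying Kafka08ConsumerClient expects.
//
//   SimpleKafkaSpecExecutorInstanceConsumer consumer = new SimpleKafkaSpecExecutorInstanceConsumer(config);
//   try {
//     for (Pair<Verb, Spec> change : consumer.changedSpecs().get()) {
//       Verb verb = change.getKey();     // taken from the record's VERB_KEY metadata
//       Spec spec = change.getValue();   // the reconstructed JobSpec
//     }
//   } finally {
//     consumer.close();
//   }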