/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.source.extractor.extract.kafka.workunit.packer;
import java.util.List;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Maps;
import gobblin.configuration.ConfigurationKeys;
import gobblin.configuration.SourceState;
import gobblin.configuration.WorkUnitState;
import gobblin.source.extractor.extract.kafka.KafkaPartition;
import gobblin.source.extractor.extract.kafka.KafkaUtils;
import gobblin.source.workunit.WorkUnit;
/**
* An implementation of {@link KafkaWorkUnitSizeEstimator} which uses the average record size of each partition to
 * estimate the sizes of {@link WorkUnit}s.
*
* Each partition pulled in the previous run should have an avg record size in its {@link WorkUnitState}. In the
* next run, for each partition the avg record size pulled in the previous run is considered the avg record size
* to be pulled in this run.
*
* If a partition was not pulled in the previous run, a default value of 1024 is used.
*
* @author Ziyang Liu
*/
public class KafkaAvgRecordSizeBasedWorkUnitSizeEstimator implements KafkaWorkUnitSizeEstimator {

  private static final Logger LOG = LoggerFactory.getLogger(KafkaAvgRecordSizeBasedWorkUnitSizeEstimator.class);

  /** Fallback avg record size (bytes) for partitions not pulled in the previous run. */
  private static final long DEFAULT_AVG_RECORD_SIZE = 1024;

  /** Avg record size observed per partition in the previous run, loaded once at construction. */
  private final Map<KafkaPartition, Long> estAvgSizes = Maps.newHashMap();

  KafkaAvgRecordSizeBasedWorkUnitSizeEstimator(SourceState state) {
    readPreAvgRecordSizes(state);
  }

  /**
   * Estimates the size of a {@link WorkUnit} as (avg record size from the previous run) multiplied
   * by the number of records to be pulled, i.e., the distance between the work unit's high and low
   * watermarks.
   *
   * @param workUnit the work unit whose size is to be estimated
   * @return the estimated size in bytes
   */
  @Override
  public double calcEstimatedSize(WorkUnit workUnit) {
    long avgSize = getEstAvgSizeForPartition(KafkaUtils.getPartition(workUnit));
    long numOfRecords = workUnit.getPropAsLong(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY)
        - workUnit.getPropAsLong(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY);
    return (double) avgSize * numOfRecords;
  }

  /**
   * Returns the estimated avg record size for the given partition, or
   * {@link #DEFAULT_AVG_RECORD_SIZE} if the partition was not pulled in the previous run.
   */
  private long getEstAvgSizeForPartition(KafkaPartition partition) {
    // Single map lookup instead of containsKey() followed by two get() calls.
    Long avgSize = this.estAvgSizes.get(partition);
    if (avgSize != null) {
      // Parameterized logging defers message construction until the level is enabled.
      LOG.info("Estimated avg record size for partition {} is {}", partition, avgSize);
      return avgSize;
    }
    LOG.warn("Avg record size for partition {} not available, using default size {}", partition,
        DEFAULT_AVG_RECORD_SIZE);
    return DEFAULT_AVG_RECORD_SIZE;
  }

  /**
   * Populates {@link #estAvgSizes} from the previous run's {@link WorkUnitState}s. Partitions
   * without a recorded avg record size are simply skipped (they fall back to the default).
   */
  private void readPreAvgRecordSizes(SourceState state) {
    this.estAvgSizes.clear();
    for (WorkUnitState workUnitState : state.getPreviousWorkUnitStates()) {
      List<KafkaPartition> partitions = KafkaUtils.getPartitions(workUnitState);
      for (KafkaPartition partition : partitions) {
        if (KafkaUtils.containsPartitionAvgRecordSize(workUnitState, partition)) {
          long previousAvgSize = KafkaUtils.getPartitionAvgRecordSize(workUnitState, partition);
          this.estAvgSizes.put(partition, previousAvgSize);
        }
      }
    }
  }
}