/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gobblin.source.extractor.extract.kafka.workunit.packer;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import com.google.common.collect.Lists;
import gobblin.configuration.SourceState;
import gobblin.configuration.State;
import gobblin.source.extractor.extract.AbstractSource;
import gobblin.source.workunit.MultiWorkUnit;
import gobblin.source.workunit.WorkUnit;
/**
* An implementation of {@link KafkaWorkUnitPacker} with two levels of bin packing.
*
* In the first level, some {@link WorkUnit}s corresponding to partitions
* of the same topic are grouped together into a single {@link WorkUnit}. The number of grouped {@link WorkUnit}s
* is approximately {@link #WORKUNIT_PRE_GROUPING_SIZE_FACTOR} * number of {@link MultiWorkUnit}s. The value of
* {@link #WORKUNIT_PRE_GROUPING_SIZE_FACTOR} should generally be 3.0 or higher, since the worst-fit-decreasing
* algorithm (used by the second level) may not achieve a good balance if the number of items
* is less than 3 times the number of bins.
*
* In the second level, these grouped {@link WorkUnit}s are assembled into {@link MultiWorkUnit}s
* using worst-fit-decreasing.
*
* Bi-level bin packing has two advantages: (1) reduce the number of small output files since it tends to pack
* partitions of the same topic together; (2) reduce the total number of workunits / tasks since multiple partitions
* of the same topic are assigned to the same task. A task has a non-trivial cost of initialization, tear down and
* task state persistence. However, bi-level bin packing has more mapper skew than single-level bin packing, because
* if we pack lots of partitions of the same topic to the same mapper, and we underestimate the avg time per record
* for this topic, then this mapper could be much slower than other mappers.
*
* @author Ziyang Liu
*/
public class KafkaBiLevelWorkUnitPacker extends KafkaWorkUnitPacker {

  /** Configuration key for the factor that divides the per-container size to get the first-level group size. */
  public static final String WORKUNIT_PRE_GROUPING_SIZE_FACTOR = "workunit.pre.grouping.size.factor";

  /** Value used when {@link #WORKUNIT_PRE_GROUPING_SIZE_FACTOR} is not set in the state. */
  public static final double DEFAULT_WORKUNIT_PRE_GROUPING_SIZE_FACTOR = 3.0;

  protected KafkaBiLevelWorkUnitPacker(AbstractSource<?, ?> source, SourceState state) {
    super(source, state);
  }

  /**
   * Packs the per-topic {@link WorkUnit}s in two passes: first the workunits of each topic are grouped
   * into {@link MultiWorkUnit}s of roughly {@code avgGroupSize}, then the groups are handed to the
   * inherited {@code squeezeMultiWorkUnits} and {@code worstFitDecreasingBinPacking} helpers.
   *
   * @param workUnitsByTopic workunits keyed by topic
   * @param numContainers number of containers to distribute the groups over; it is used as a divisor when
   *        computing the group size and is passed on to {@code worstFitDecreasingBinPacking}
   * @return the result of {@code worstFitDecreasingBinPacking} over the squeezed groups
   */
  @Override
  public List<WorkUnit> pack(Map<String, List<WorkUnit>> workUnitsByTopic, int numContainers) {
    double totalEstDataSize = setWorkUnitEstSizes(workUnitsByTopic);

    // Target size of a first-level group: per-container share divided by the pre-grouping factor.
    double avgGroupSize = totalEstDataSize / numContainers / getPreGroupingSizeFactor(this.state);

    List<MultiWorkUnit> mwuGroups = Lists.newArrayList();
    for (List<WorkUnit> workUnitsForTopic : workUnitsByTopic.values()) {
      double estimatedDataSizeForTopic = calcTotalEstSizeForTopic(workUnitsForTopic);
      if (estimatedDataSizeForTopic < avgGroupSize) {
        // If the total estimated size of a topic is smaller than group size, put all partitions of this
        // topic in a single group.
        MultiWorkUnit mwuGroup = MultiWorkUnit.createEmpty();
        addWorkUnitsToMultiWorkUnit(workUnitsForTopic, mwuGroup);
        mwuGroups.add(mwuGroup);
      } else {
        // Use best-fit-decreasing to group workunits for a topic into multiple groups.
        mwuGroups.addAll(bestFitDecreasingBinPacking(workUnitsForTopic, avgGroupSize));
      }
    }

    List<WorkUnit> groups = squeezeMultiWorkUnits(mwuGroups);
    return worstFitDecreasingBinPacking(groups, numContainers);
  }

  /**
   * Sums the estimated sizes, as reported by {@code getWorkUnitEstSize}, of the given workunits.
   *
   * @param workUnitsForTopic workunits of a single topic
   * @return the sum of the estimated sizes; {@code 0} for an empty list
   */
  private static double calcTotalEstSizeForTopic(List<WorkUnit> workUnitsForTopic) {
    double totalSize = 0;
    for (WorkUnit w : workUnitsForTopic) {
      totalSize += getWorkUnitEstSize(w);
    }
    return totalSize;
  }

  /**
   * Reads {@link #WORKUNIT_PRE_GROUPING_SIZE_FACTOR} from the state, falling back to
   * {@link #DEFAULT_WORKUNIT_PRE_GROUPING_SIZE_FACTOR}.
   */
  private static double getPreGroupingSizeFactor(State state) {
    return state.getPropAsDouble(WORKUNIT_PRE_GROUPING_SIZE_FACTOR, DEFAULT_WORKUNIT_PRE_GROUPING_SIZE_FACTOR);
  }

  /**
   * Group {@link WorkUnit}s into groups. Each group is a {@link MultiWorkUnit}. Each group has a capacity of
   * avgGroupSize. If there's a single {@link WorkUnit} whose size is larger than avgGroupSize, it forms a group itself.
   *
   * <p>The input list is not modified; sorting is done on a copy.
   *
   * @param workUnits workunits of a single topic
   * @param avgGroupSize capacity of each group
   * @return the groups that were formed; an empty list if {@code workUnits} is empty
   */
  private static List<MultiWorkUnit> bestFitDecreasingBinPacking(List<WorkUnit> workUnits, double avgGroupSize) {
    // PriorityQueue rejects an initial capacity of 0, so an empty input must be answered before building it.
    if (workUnits.isEmpty()) {
      return Lists.newArrayList();
    }

    // Sort a copy so the caller's list (a value of the map given to pack) keeps its order and may be unmodifiable.
    List<WorkUnit> sortedWorkUnits = Lists.newArrayList(workUnits);
    Collections.sort(sortedWorkUnits, LOAD_DESC_COMPARATOR);

    PriorityQueue<MultiWorkUnit> pQueue = new PriorityQueue<>(sortedWorkUnits.size(), LOAD_DESC_COMPARATOR);
    for (WorkUnit workUnit : sortedWorkUnits) {
      MultiWorkUnit bestGroup = findAndPopBestFitGroup(workUnit, pQueue, avgGroupSize);
      if (bestGroup == null) {
        // No existing group can take this workunit: open a new one.
        bestGroup = MultiWorkUnit.createEmpty();
      }
      addWorkUnitToMultiWorkUnit(workUnit, bestGroup);

      // The chosen group was removed from the queue (or is new), so (re-)insert it.
      pQueue.add(bestGroup);
    }
    return Lists.newArrayList(pQueue);
  }

  /**
   * Find the best group using the best-fit-decreasing algorithm.
   * The best group is the fullest group that has enough capacity for the new {@link WorkUnit}.
   * If no existing group has enough capacity for the new {@link WorkUnit}, return null.
   *
   * <p>Groups are polled from the queue in its priority order until one fits. The returned group is left
   * out of the queue; every group polled before it is put back.
   *
   * @param workUnit the workunit to place
   * @param pQueue existing groups; presumably ordered fullest-first by {@code LOAD_DESC_COMPARATOR}
   * @param avgGroupSize capacity of each group
   * @return the first polled group that can hold {@code workUnit}, or {@code null} if none can
   */
  private static MultiWorkUnit findAndPopBestFitGroup(WorkUnit workUnit, PriorityQueue<MultiWorkUnit> pQueue,
      double avgGroupSize) {
    List<MultiWorkUnit> fullWorkUnits = Lists.newArrayList();
    MultiWorkUnit bestFit = null;

    while (!pQueue.isEmpty()) {
      MultiWorkUnit candidate = pQueue.poll();
      if (getWorkUnitEstSize(candidate) + getWorkUnitEstSize(workUnit) <= avgGroupSize) {
        bestFit = candidate;
        break;
      }
      fullWorkUnits.add(candidate);
    }

    // Restore the groups that were polled but could not hold the workunit.
    for (MultiWorkUnit fullWorkUnit : fullWorkUnits) {
      pQueue.add(fullWorkUnit);
    }

    return bestFit;
  }
}