package org.apache.hadoop.hive.kafka.camus;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Properties;

/**
 * Default work allocator: distributes {@link CamusRequest}s across up to
 * "mapred.map.tasks" splits, assigning each request (largest first) to the
 * currently smallest split so that split sizes stay balanced.
 */
public class BaseAllocator extends WorkAllocator {

  protected Properties props;

  public void init(Properties props) {
    this.props = props;
  }

  /**
   * Sorts the requests by estimated data size in descending order.
   */
  protected void reverseSortRequests(List<CamusRequest> requests) {
    Collections.sort(requests, new Comparator<CamusRequest>() {
      @Override
      public int compare(CamusRequest o1, CamusRequest o2) {
        if (o2.estimateDataSize() == o1.estimateDataSize()) {
          return 0;
        }
        if (o2.estimateDataSize() < o1.estimateDataSize()) {
          return -1;
        } else {
          return 1;
        }
      }
    });
  }

  @Override
  public InputSplit[] allocateWork(List<CamusRequest> requests, JobConf conf) throws IOException {
    int numTasks = conf.getInt("mapred.map.tasks", 30);

    reverseSortRequests(requests);

    List<InputSplit> kafkaETLSplits = new ArrayList<InputSplit>();
    Path[] tablePaths = FileInputFormat.getInputPaths(conf);

    // Create one empty split per map task, but only if there is work to assign.
    for (int i = 0; i < numTasks; i++) {
      if (requests.size() > 0) {
        kafkaETLSplits.add(new KafkaSplit(tablePaths[0]));
      }
    }

    // Greedily assign each request (largest first) to the smallest split so far.
    for (CamusRequest r : requests) {
      getSmallestMultiSplit(kafkaETLSplits).addRequest(r);
    }

    InputSplit[] inputSplits = new InputSplit[kafkaETLSplits.size()];
    return kafkaETLSplits.toArray(inputSplits);
  }

  /**
   * Returns the split with the smallest estimated length; ties are broken in
   * favor of the split with fewer assigned requests.
   */
  protected KafkaSplit getSmallestMultiSplit(List<InputSplit> kafkaETLSplits) throws IOException {
    KafkaSplit smallest = (KafkaSplit) kafkaETLSplits.get(0);

    for (int i = 1; i < kafkaETLSplits.size(); i++) {
      KafkaSplit challenger = (KafkaSplit) kafkaETLSplits.get(i);
      if ((smallest.getLength() == challenger.getLength()
          && smallest.getNumRequests() > challenger.getNumRequests())
          || smallest.getLength() > challenger.getLength()) {
        smallest = challenger;
      }
    }

    return smallest;
  }
}