package com.linkedin.camus.workallocater;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;

import com.linkedin.camus.etl.kafka.common.EtlRequest;
import com.linkedin.camus.etl.kafka.mapred.EtlSplit;

/**
 * Allocates Kafka pull requests to map tasks, first grouping small requests
 * for the same topic so that many tiny per-partition requests can be
 * scheduled as a single unit.
 */
public class TopicGroupingAllocator extends BaseAllocator {

  public final static String CAMUS_MAX_GROUP_SIZE_REDUCTION_FACTOR = "camus.max.group.size.reduction.factor";

  @Override
  public List<InputSplit> allocateWork(List<CamusRequest> requests, JobContext context) throws IOException {
    int numTasks = context.getConfiguration().getInt("mapred.map.tasks", 30);

    List<InputSplit> kafkaETLSplits = new ArrayList<InputSplit>();

    // No work to allocate; returning early also avoids dividing by
    // requests.size() in groupSmallRequest().
    if (requests.size() == 0) {
      return kafkaETLSplits;
    }

    for (int i = 0; i < numTasks; i++) {
      kafkaETLSplits.add(new EtlSplit());
    }

    List<CamusRequest> groupedRequests = groupSmallRequest(requests, context);

    // Place the largest groups first so the smallest-split heuristic below
    // balances the load across splits.
    reverseSortRequests(groupedRequests);

    for (CamusRequest r : groupedRequests) {
      EtlSplit split = getSmallestMultiSplit(kafkaETLSplits);
      for (CamusRequest r1 : (GroupedRequest) r) {
        split.addRequest(r1);
      }
    }

    return kafkaETLSplits;
  }

  /**
   * Groups requests for the same topic into chunks whose estimated size stays
   * below the mean request size divided by the configured reduction factor.
   */
  private List<CamusRequest> groupSmallRequest(List<CamusRequest> requests, JobContext context) {
    List<CamusRequest> finalRequests = new ArrayList<CamusRequest>();

    // Bucket requests by topic and total up the estimated data size.
    Map<String, List<CamusRequest>> requestsTopicMap = new HashMap<String, List<CamusRequest>>();
    long totalEstimatedDataSize = 0;

    for (CamusRequest cr : requests) {
      if (!requestsTopicMap.containsKey(cr.getTopic())) {
        requestsTopicMap.put(cr.getTopic(), new ArrayList<CamusRequest>());
      }
      requestsTopicMap.get(cr.getTopic()).add(cr);
      totalEstimatedDataSize += cr.estimateDataSize();
    }

    // Cap each group at the mean request size divided by the reduction factor.
    long maxSize =
        totalEstimatedDataSize / requests.size()
            / context.getConfiguration().getInt(CAMUS_MAX_GROUP_SIZE_REDUCTION_FACTOR, 3);

    for (List<CamusRequest> topic : requestsTopicMap.values()) {
      long size = 0;
      List<CamusRequest> groupedRequests = new ArrayList<CamusRequest>();

      for (CamusRequest cr : topic) {
        // Close out the current group once adding this request would push it
        // past the cap.
        if (size + cr.estimateDataSize() >= maxSize) {
          if (groupedRequests.size() > 0) {
            finalRequests.add(new GroupedRequest(groupedRequests));
          }
          groupedRequests = new ArrayList<CamusRequest>();
          size = 0;
        }
        groupedRequests.add(cr);
        size += cr.estimateDataSize(); // track the running size of the current group
      }

      finalRequests.add(new GroupedRequest(groupedRequests));
    }

    return finalRequests;
  }

  /**
   * A composite request wrapping several small requests for the same topic so
   * they can be scheduled, serialized, and sized as one unit.
   */
  class GroupedRequest implements CamusRequest, Iterable<CamusRequest> {
    private List<CamusRequest> requests;
    private long size = -1;

    public GroupedRequest(List<CamusRequest> requests) {
      this.requests = requests;
    }

    @Override
    public void readFields(DataInput arg0) throws IOException {
      requests = new ArrayList<CamusRequest>();
      int size = arg0.readInt();
      for (int i = 0; i < size; i++) {
        // TODO: factor out kafka specific request functionality
        CamusRequest request = new EtlRequest();
        request.readFields(arg0);
        requests.add(request);
      }
    }

    @Override
    public void write(DataOutput arg0) throws IOException {
      arg0.writeInt(requests.size());
      for (CamusRequest cr : requests) {
        cr.write(arg0);
      }
    }

    @Override
    public void setLatestOffset(long latestOffset) {
      throw new UnsupportedOperationException();
    }

    @Override
    public void setEarliestOffset(long earliestOffset) {
      throw new UnsupportedOperationException();
    }

    @Override
    public void setOffset(long offset) {
      throw new UnsupportedOperationException();
    }

    @Override
    public void setURI(URI uri) {
      throw new UnsupportedOperationException();
    }
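    // The accessors below delegate to the first request in the group. Every
    // request in a group shares the same topic, but partition, offset, and
    // URI values are only meaningful for that first request.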
    @Override
    public String getTopic() {
      return requests.get(0).getTopic();
    }

    @Override
    public URI getURI() {
      return requests.get(0).getURI();
    }

    @Override
    public int getPartition() {
      return requests.get(0).getPartition();
    }

    @Override
    public long getOffset() {
      return requests.get(0).getOffset();
    }

    @Override
    public boolean isValidOffset() {
      return requests.get(0).isValidOffset();
    }

    @Override
    public long getEarliestOffset() {
      return requests.get(0).getEarliestOffset();
    }

    @Override
    public long getLastOffset() {
      return requests.get(0).getLastOffset();
    }

    @Override
    public long getLastOffset(long time) {
      return requests.get(0).getLastOffset(time);
    }

    @Override
    public long estimateDataSize() {
      // Lazily sum and cache the size of all grouped requests; reset the -1
      // sentinel to 0 before summing so the cached total is not off by one.
      if (size == -1) {
        size = 0;
        for (CamusRequest cr : requests) {
          size += cr.estimateDataSize();
        }
      }
      return size;
    }

    @Override
    public long estimateDataSize(long endTime) {
      throw new UnsupportedOperationException();
    }

    @Override
    public Iterator<CamusRequest> iterator() {
      return requests.iterator();
    }

    @Override
    public void setAvgMsgSize(long size) {
      for (CamusRequest cr : requests) {
        cr.setAvgMsgSize(size);
      }
    }
  }
}
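// A minimal usage sketch. It assumes the "camus.work.allocator.class"
// property read by EtlInputFormat is what selects the allocator
// implementation; that property name is an assumption, not confirmed by this
// file. The reduction-factor key and its default of 3 come from this class.
//
//   Job job = Job.getInstance(new Configuration(), "camus-pull");
//   job.getConfiguration().set("camus.work.allocator.class",
//       TopicGroupingAllocator.class.getName());
//   job.getConfiguration().setInt(
//       TopicGroupingAllocator.CAMUS_MAX_GROUP_SIZE_REDUCTION_FACTOR, 3);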