/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.sqoop.mapreduce.hcat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;
import org.apache.sqoop.mapreduce.ExportInputFormat;

/**
 * A combined HCatInputFormat equivalent that allows us to match the number
 * of splits to the number of map tasks.
 *
 * The logic is simple. We get the list of splits from HCatInputFormat. If it
 * is no more than the number of mappers, all is good. Else, we sort the
 * splits by size in descending order and deal them out to the mappers in a
 * serpentine scheme: after one split has been assigned to each mapper, the
 * next round starts with the mapper that got the last split. That way, the
 * total split size is distributed across the mappers more uniformly than
 * with a simple round-robin assignment.
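 *
 * For example, with five splits s0..s4 sorted by decreasing size and two
 * mappers m0 and m1, the rounds assign s0 to m0 and s1 to m1, then reverse
 * to assign s2 to m1 and s3 to m0, then reverse again to assign s4 to m0;
 * m0 ends up with {s0, s3, s4} and m1 with {s1, s2}, pairing the largest
 * splits with the smallest.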
*/
public class SqoopHCatExportFormat extends HCatInputFormat {
public static final Log LOG = LogFactory
.getLog(SqoopHCatExportFormat.class.getName());

  @Override
public List<InputSplit> getSplits(JobContext job)
throws IOException, InterruptedException {
List<InputSplit> hCatSplits = super.getSplits(job);
int hCatSplitCount = hCatSplits.size();
int expectedSplitCount = ExportInputFormat.getNumMapTasks(job);
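    // A map task count of zero means no explicit limit was configured, so
    // fall back to one combined split per HCatalog split.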
if (expectedSplitCount == 0) {
expectedSplitCount = hCatSplitCount;
}
LOG.debug("Expected split count " + expectedSplitCount);
LOG.debug("HCatInputFormat provided split count " + hCatSplitCount);
// Sort the splits by length descending.
Collections.sort(hCatSplits, new Comparator<InputSplit>() {
@Override
public int compare(InputSplit is1, InputSplit is2) {
        try {
          // Compare as longs; casting the difference to int can overflow
          // and mis-order very large splits.
          return Long.compare(is2.getLength(), is1.getLength());
        } catch (Exception e) {
          LOG.warn("Exception caught while sorting input splits: " + e);
        }
        return 0;
}
});
List<InputSplit> combinedSplits = new ArrayList<InputSplit>();
// The number of splits generated by HCatInputFormat is within
// our limits
if (hCatSplitCount <= expectedSplitCount) {
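      // Wrap each HCatalog split in its own single-element combined split.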
for (InputSplit split : hCatSplits) {
List<InputSplit> hcSplitList = new ArrayList<InputSplit>();
hcSplitList.add(split);
combinedSplits.add(new SqoopHCatInputSplit(hcSplitList));
}
return combinedSplits;
}
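    // Otherwise, prepare one bucket of underlying splits per expected
    // mapper and distribute the sorted splits among the buckets.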
List<List<InputSplit>> combinedSplitList =
new ArrayList<List<InputSplit>>();
for (int i = 0; i < expectedSplitCount; i++) {
combinedSplitList.add(new ArrayList<InputSplit>());
}
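    // Deal the sorted splits out in serpentine order: each time a full
    // round over the buckets completes, the direction reverses, so the
    // bucket that received the last (smallest) split of one round also
    // receives the first (largest) split of the next.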
    boolean ascendingAssignment = true;
    int lastSet = 0;
    for (int i = 0; i < hCatSplitCount; ++i) {
      int splitNum = i % expectedSplitCount;
      int currentSet = i / expectedSplitCount;
      if (currentSet != lastSet) {
        ascendingAssignment = !ascendingAssignment;
      }
      if (ascendingAssignment) {
        combinedSplitList.get(splitNum).add(hCatSplits.get(i));
      } else {
        combinedSplitList
            .get(expectedSplitCount - 1 - splitNum).add(hCatSplits.get(i));
      }
      lastSet = currentSet;
}
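    // Wrap each bucket of underlying splits in a single combined split.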
for (int i = 0; i < expectedSplitCount; i++) {
SqoopHCatInputSplit sqoopSplit =
new SqoopHCatInputSplit(combinedSplitList.get(i));
combinedSplits.add(sqoopSplit);
}
return combinedSplits;
}
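
  /**
   * Returns a {@link SqoopHCatRecordReader} for the given combined split,
   * backed by this input format.
   */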
@Override
public RecordReader<WritableComparable, HCatRecord>
createRecordReader(InputSplit split,
TaskAttemptContext taskContext)
throws IOException, InterruptedException {
LOG.debug("Creating a SqoopHCatRecordReader");
return new SqoopHCatRecordReader(split, taskContext, this);
}
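
  /**
   * Returns the stock {@link HCatInputFormat} record reader, used to read
   * a single underlying HCatalog split.
   */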
public RecordReader<WritableComparable, HCatRecord>
createHCatRecordReader(InputSplit split,
TaskAttemptContext taskContext)
throws IOException, InterruptedException {
LOG.debug("Creating a base HCatRecordReader");
return super.createRecordReader(split, taskContext);
}
}