/*
 * Copyright [2013-2015] PayPal Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ml.shifu.shifu.core.mr.input;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/**
 * {@link InputSplit} implementation to combine multiple {@link FileSplit}s into one split.
 *
 * <p>
 * For each worker task, the input {@link #fileSplits} are included; a <code>FileSplit</code> array is used so that
 * several small <code>FileSplit</code>s can be combined into one task.
 */
public class CombineInputSplit extends InputSplit implements Writable {

    /**
     * File splits used for the task. An array is used here to support combining small files into one
     * CombineInputSplit.
     */
    private FileSplit[] fileSplits;

    /**
     * Default constructor without any setting; needed when the framework deserializes the split.
     */
    public CombineInputSplit() {
    }

    /**
     * Constructor with {@link #fileSplits} setting.
     *
     * @param fileSplits
     *            File splits used for the mapper task.
     */
    public CombineInputSplit(FileSplit... fileSplits) {
        this.fileSplits = fileSplits;
    }

    /**
     * Constructor with a single {@link FileSplit} setting.
     *
     * @param fileSplit
     *            File split used for the mapper task.
     */
    public CombineInputSplit(FileSplit fileSplit) {
        this(new FileSplit[] { fileSplit });
    }

    /*
     * (non-Javadoc)
     *
     * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
     */
    @Override
    public void write(DataOutput out) throws IOException {
        int length = this.getFileSplits().length;
        out.writeInt(length);
        for(int i = 0; i < length; i++) {
            this.getFileSplits()[i].write(out);
        }
    }

    /*
     * (non-Javadoc)
     *
     * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        int len = in.readInt();
        FileSplit[] splits = new FileSplit[len];
        for(int i = 0; i < len; i++) {
            // Create an empty FileSplit shell, then let it populate itself from the stream.
            splits[i] = new FileSplit(null, 0, 0, (String[]) null);
            splits[i].readFields(in);
        }
        this.setFileSplits(splits);
    }
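
    /*
     * Wire-format note (illustrative, not from the original source): write(...) above emits an int count followed
     * by each FileSplit's own Writable encoding, and readFields(...) reverses that. A hypothetical round trip:
     *
     *   CombineInputSplit sent = new CombineInputSplit(splitA, splitB);
     *   sent.write(out);                         // writes 2, then splitA, then splitB
     *   CombineInputSplit received = new CombineInputSplit();
     *   received.readFields(in);                 // rebuilds both FileSplits
     *
     * splitA, splitB, out and in are placeholder names for this sketch only.
     */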

    /**
     * Returns the total length of all combined file splits. Hadoop uses this length to sort and schedule splits.
     */
    @Override
    public long getLength() throws IOException, InterruptedException {
        long len = 0;
        for(FileSplit split: this.getFileSplits()) {
            len += split.getLength();
        }
        return len;
    }

    /**
     * Data locality function: returns the hosts of all file splits combined in this split.
     */
    @Override
    public String[] getLocations() throws IOException, InterruptedException {
        if(this.getFileSplits() == null || this.getFileSplits().length == 0) {
            return new String[0];
        }
        List<String> hosts = new ArrayList<String>();
        for(FileSplit fileSplit: this.getFileSplits()) {
            if(fileSplit != null) {
                hosts.addAll(Arrays.asList(fileSplit.getLocations()));
            }
        }
        return hosts.toArray(new String[0]);
    }

    public FileSplit[] getFileSplits() {
        return fileSplits;
    }

    public void setFileSplits(FileSplit[] fileSplits) {
        this.fileSplits = fileSplits;
    }

    @Override
    public String toString() {
        return String.format("CombineInputSplit [fileSplits=%s]", Arrays.toString(this.fileSplits));
    }
}
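
/*
 * Hypothetical usage sketch (not part of the original Shifu source): shows how several small FileSplits might be
 * combined into one CombineInputSplit and round-tripped through its Writable methods, as the Hadoop framework does
 * between the job client and a task. All paths, lengths and host names below are made up; fully-qualified names are
 * used so the import block above stays untouched.
 */
class CombineInputSplitExample {

    public static void main(String[] args) throws Exception {
        // Two small, made-up FileSplits that would each be too small for a task of their own.
        FileSplit a = new FileSplit(new org.apache.hadoop.fs.Path("/tmp/part-00000"), 0, 128,
                new String[] { "host1" });
        FileSplit b = new FileSplit(new org.apache.hadoop.fs.Path("/tmp/part-00001"), 0, 256,
                new String[] { "host2" });

        CombineInputSplit sent = new CombineInputSplit(a, b);

        // Locations feed the scheduler on the client side. Note that FileSplit's own Writable
        // encoding does not carry hosts, so query them before serialization.
        System.out.println(Arrays.toString(sent.getLocations())); // [host1, host2]

        // Serialize the combined split, as Hadoop does when shipping splits to tasks.
        java.io.ByteArrayOutputStream bytes = new java.io.ByteArrayOutputStream();
        sent.write(new java.io.DataOutputStream(bytes));

        // Deserialize into a fresh instance on the "task" side.
        CombineInputSplit received = new CombineInputSplit();
        received.readFields(new java.io.DataInputStream(new java.io.ByteArrayInputStream(bytes.toByteArray())));

        // getLength() sums the lengths of both splits, which survive the round trip.
        System.out.println(received.getLength()); // 384
        System.out.println(received);
    }
}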