/*
* Copyright [2013-2014] PayPal Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ml.shifu.guagua.hadoop.io;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectInputStream;
import java.io.ObjectOutput;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* {@link InputSplit} implementation in guagua for Hadoop MapReduce job.
*
* <p>
* If mapper with {@link GuaguaInputSplit#isMaster} true means it is master, for master so far {@link #fileSplits} is
* {@code null}.
*
* <p>
* For worker, input {@link #fileSplits} are included, here <code>FileSplit</code> array is used to make guagua support
* combining <code>FileSplit</code>s in one task.
*/
public class GuaguaInputSplit extends InputSplit implements Writable {
private static final Logger LOG = LoggerFactory.getLogger(GuaguaInputSplit.class);
/**
* Whether the input split is master split.
*/
private boolean isMaster;
/**
* File splits used for the task. For master task, it is almost null. Using array here to make guagua
* support combining small files into one GuaguaInputSplit.
*/
private FileSplit[] fileSplits;
private Object[] extensions;
/**
* Default constructor without any setting.
*/
public GuaguaInputSplit() {
}
/**
* Constructor with {@link #isMaster} and {@link #fileSplits} settings.
*
* @param isMaster
* Whether the input split is master split.
* @param fileSplits
* File splits used for mapper task.
*/
public GuaguaInputSplit(boolean isMaster, FileSplit... fileSplits) {
this.isMaster = isMaster;
this.fileSplits = fileSplits;
}
/**
* Constructor with {@link #isMaster} and one FileSplit settings.
*
* @param isMaster
* Whether the input split is master split.
* @param fileSplit
* File split used for mapper task.
*/
public GuaguaInputSplit(boolean isMaster, FileSplit fileSplit) {
this(isMaster, new FileSplit[] { fileSplit });
}
/*
* (non-Javadoc)
*
* @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
*/
@Override
public void write(DataOutput out) throws IOException {
out.writeBoolean(this.isMaster());
if(!this.isMaster()) {
int length = this.getFileSplits().length;
out.writeInt(length);
for(int i = 0; i < length; i++) {
this.getFileSplits()[i].write(out);
}
if(this.extensions != null) {
out.writeInt(extensions.length);
for(int i = 0; i < extensions.length; i++) {
ByteArrayOutputStream bos = new ByteArrayOutputStream();
ObjectOutput ext = null;
try {
ext = new ObjectOutputStream(bos);
ext.writeObject(extensions[i]);
byte[] bytes = bos.toByteArray();
out.writeInt(bytes.length);
out.write(bytes);
} finally {
IOUtils.closeQuietly(bos);
}
}
}else{
out.writeInt(0);
}
}
}
/*
* (non-Javadoc)
*
* @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
*/
@Override
public void readFields(DataInput in) throws IOException {
this.setMaster(in.readBoolean());
if(!isMaster()) {
int len = in.readInt();
FileSplit[] splits = new FileSplit[len];
for(int i = 0; i < len; i++) {
splits[i] = new FileSplit(null, 0, 0, (String[]) null);
splits[i].readFields(in);
}
this.setFileSplits(splits);
int extLen = in.readInt();
if(extLen > 0) {
Object[] exts = new Object[extLen];
for(int i = 0; i < extLen; i++) {
int objectLen = in.readInt();
byte[] bytes = new byte[objectLen];
in.readFully(bytes);
ByteArrayInputStream bis = new ByteArrayInputStream(bytes);
ObjectInput ext = new ObjectInputStream(bis);
try {
Object extension = ext.readObject();
exts[i] = extension;
} catch (ClassNotFoundException ce) {
LOG.error(ce.getMessage(), ce);
} finally {
IOUtils.closeQuietly(bis);
}
}
this.setExtensions(exts);
}
}
}
/**
* For master split, use <code>Long.MAX_VALUE</code> as its length to make it is the first task for Hadoop job. It
* is convenient for users to check master in Hadoop UI.
*/
@Override
public long getLength() throws IOException, InterruptedException {
if(isMaster()) {
return Long.MAX_VALUE;
}
long len = 0;
for(FileSplit split: this.getFileSplits()) {
len += split.getLength();
}
return len;
}
/**
* Data locality functions, return all hosts for all file splits.
*/
@Override
public String[] getLocations() throws IOException, InterruptedException {
if(this.getFileSplits() == null || this.getFileSplits().length == 0) {
return new String[0];
}
List<String> hosts = new ArrayList<String>();
for(FileSplit fileSplit: this.getFileSplits()) {
if(fileSplit != null) {
hosts.addAll(Arrays.asList(fileSplit.getLocations()));
}
}
return hosts.toArray(new String[0]);
}
public boolean isMaster() {
return isMaster;
}
public void setMaster(boolean isMaster) {
this.isMaster = isMaster;
}
public FileSplit[] getFileSplits() {
return fileSplits;
}
public void setFileSplits(FileSplit[] fileSplits) {
this.fileSplits = fileSplits;
}
public Object[] getExtensions() {
return extensions;
}
public void setExtensions(Object[] extensions) {
this.extensions = extensions;
}
@Override
public String toString() {
return String
.format("GuaguaInputSplit [isMaster=%s, fileSplit=%s]", isMaster, Arrays.toString(this.fileSplits));
}
}