GuaguaInputSplit.java example

Explorer
guagua-master
/*
 * Copyright [2013-2014] PayPal Software Foundation
 *  
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *  
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ml.shifu.guagua.hadoop.io;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectInputStream;
import java.io.ObjectOutput;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * {@link InputSplit} implementation in guagua for Hadoop MapReduce job.
 * 
 * <p>
 * If {@link GuaguaInputSplit#isMaster} is {@code true}, the mapper that owns this split is the master; for the
 * master, {@link #fileSplits} is so far {@code null}.
 * 
 * <p>
 * For worker, input {@link #fileSplits} are included, here <code>FileSplit</code> array is used to make guagua support
 * combining <code>FileSplit</code>s in one task.
 */
public class GuaguaInputSplit extends InputSplit implements Writable {

    private static final Logger LOG = LoggerFactory.getLogger(GuaguaInputSplit.class);

    /**
     * Whether the input split is master split.
     */
    private boolean isMaster;

    /**
     * File splits used for the task. For a master task it is normally {@code null}. An array is used so that
     * guagua can combine several small files into one GuaguaInputSplit.
     */
    private FileSplit[] fileSplits;

    /**
     * Optional extension objects attached to a worker split, may be {@code null}. They are written with Java
     * serialization in {@link #write(DataOutput)}, so each element must be serializable by
     * {@link ObjectOutputStream}.
     */
    private Object[] extensions;

    /**
     * Default constructor without any setting.
     */
    public GuaguaInputSplit() {
    }

    /**
     * Constructor with {@link #isMaster} and {@link #fileSplits} settings.
     * 
     * @param isMaster
     *            Whether the input split is master split.
     * @param fileSplits
     *            File splits used for mapper task.
     */
    public GuaguaInputSplit(boolean isMaster, FileSplit... fileSplits) {
        this.isMaster = isMaster;
        this.fileSplits = fileSplits;
    }

    /**
     * Constructor with {@link #isMaster} and one FileSplit settings.
     * 
     * @param isMaster
     *            Whether the input split is master split.
     * @param fileSplit
     *            File split used for mapper task.
     */
    public GuaguaInputSplit(boolean isMaster, FileSplit fileSplit) {
        this(isMaster, new FileSplit[] { fileSplit });
    }

    /**
     * Serializes this split. The layout is: the master flag; then, only for a non-master split, the number of file
     * splits followed by each file split, the number of extensions and, per extension, the byte length followed by
     * its Java-serialized bytes. A {@code null} {@link #fileSplits} array is written as zero file splits and a
     * {@code null} {@link #extensions} array as zero extensions.
     * 
     * @param out
     *            the output to write to
     * @throws IOException
     *             if writing fails, an extension cannot be serialized or a file split element is {@code null}
     * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeBoolean(this.isMaster());
        if(this.isMaster()) {
            // nothing else is serialized for the master split
            return;
        }

        FileSplit[] splits = this.getFileSplits();
        int length = (splits == null) ? 0 : splits.length;
        out.writeInt(length);
        for(int i = 0; i < length; i++) {
            if(splits[i] == null) {
                throw new IOException("File split at index " + i + " is null and cannot be serialized.");
            }
            splits[i].write(out);
        }

        Object[] exts = this.extensions;
        if(exts == null) {
            out.writeInt(0);
            return;
        }
        out.writeInt(exts.length);
        for(int i = 0; i < exts.length; i++) {
            byte[] bytes = serializeExtension(exts[i]);
            out.writeInt(bytes.length);
            out.write(bytes);
        }
    }

    /**
     * Restores this split from the layout produced by {@link #write(DataOutput)}. All fields are overwritten, so
     * state left over from an earlier use of the same instance does not survive: a master split ends up with
     * {@code null} file splits and extensions, and a worker split without extensions ends up with {@code null}
     * extensions.
     * 
     * <p>
     * An extension whose class cannot be found is logged and stored as {@code null}.
     * 
     * @param in
     *            the input to read from
     * @throws IOException
     *             if reading fails or a negative length is found in the stream
     * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.setMaster(in.readBoolean());
        if(isMaster()) {
            // drop anything a previous readFields call may have left behind
            this.setFileSplits(null);
            this.setExtensions(null);
            return;
        }

        int len = in.readInt();
        if(len < 0) {
            throw new IOException("Invalid number of file splits: " + len);
        }
        FileSplit[] splits = new FileSplit[len];
        for(int i = 0; i < len; i++) {
            // placeholder instance, its real content is filled in by readFields
            splits[i] = new FileSplit(null, 0, 0, (String[]) null);
            splits[i].readFields(in);
        }
        this.setFileSplits(splits);

        int extLen = in.readInt();
        Object[] exts = null;
        if(extLen > 0) {
            exts = new Object[extLen];
            for(int i = 0; i < extLen; i++) {
                int objectLen = in.readInt();
                if(objectLen < 0) {
                    throw new IOException("Invalid extension length: " + objectLen);
                }
                byte[] bytes = new byte[objectLen];
                in.readFully(bytes);
                exts[i] = deserializeExtension(bytes);
            }
        }
        // always assign, so that a reused instance does not keep stale extensions
        this.setExtensions(exts);
    }

    /**
     * Turns one extension object into its Java-serialized byte form.
     */
    private static byte[] serializeExtension(Object extension) throws IOException {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        ObjectOutputStream oos = null;
        try {
            oos = new ObjectOutputStream(bos);
            oos.writeObject(extension);
            // ObjectOutputStream buffers internally; flush before taking the bytes
            oos.flush();
            return bos.toByteArray();
        } finally {
            IOUtils.closeQuietly(oos);
            IOUtils.closeQuietly(bos);
        }
    }

    /**
     * Rebuilds one extension object from its Java-serialized byte form; returns {@code null} (after logging) when
     * the class of the object is not available.
     */
    private static Object deserializeExtension(byte[] bytes) throws IOException {
        // NOTE(review): this is Java native deserialization; it presumes the split bytes come from a trusted job
        // submission - confirm, or restrict the accepted classes.
        ByteArrayInputStream bis = new ByteArrayInputStream(bytes);
        ObjectInputStream ois = null;
        try {
            ois = new ObjectInputStream(bis);
            return ois.readObject();
        } catch (ClassNotFoundException ce) {
            LOG.error(ce.getMessage(), ce);
            return null;
        } finally {
            IOUtils.closeQuietly(ois);
            IOUtils.closeQuietly(bis);
        }
    }

    /**
     * For master split, use <code>Long.MAX_VALUE</code> as its length to make it the first task for Hadoop job. It
     * is convenient for users to check master in Hadoop UI. For a worker split the length is the sum of the lengths
     * of all non-null file splits, {@code 0} when there are none.
     */
    @Override
    public long getLength() throws IOException, InterruptedException {
        if(isMaster()) {
            return Long.MAX_VALUE;
        }
        FileSplit[] splits = this.getFileSplits();
        if(splits == null) {
            return 0L;
        }
        long len = 0;
        for(FileSplit split: splits) {
            // same null tolerance as getLocations()
            if(split != null) {
                len += split.getLength();
            }
        }
        return len;
    }

    /**
     * Data locality functions, return all hosts for all file splits. Hosts are concatenated in file split order and
     * are not de-duplicated; an empty array is returned when there are no file splits.
     */
    @Override
    public String[] getLocations() throws IOException, InterruptedException {
        FileSplit[] splits = this.getFileSplits();
        if(splits == null || splits.length == 0) {
            return new String[0];
        }

        List<String> hosts = new ArrayList<String>();
        for(FileSplit fileSplit: splits) {
            if(fileSplit == null) {
                continue;
            }
            String[] locations = fileSplit.getLocations();
            if(locations != null) {
                hosts.addAll(Arrays.asList(locations));
            }
        }

        return hosts.toArray(new String[0]);
    }

    /**
     * @return whether this split is the master split
     */
    public boolean isMaster() {
        return isMaster;
    }

    /**
     * @param isMaster
     *            whether this split is the master split
     */
    public void setMaster(boolean isMaster) {
        this.isMaster = isMaster;
    }

    /**
     * @return the file splits of this split, may be {@code null}; the internal array is returned without copying
     */
    public FileSplit[] getFileSplits() {
        return fileSplits;
    }

    /**
     * @param fileSplits
     *            the file splits of this split, may be {@code null}; the array is stored without copying
     */
    public void setFileSplits(FileSplit[] fileSplits) {
        this.fileSplits = fileSplits;
    }

    /**
     * @return the extension objects of this split, may be {@code null}; the internal array is returned without
     *         copying
     */
    public Object[] getExtensions() {
        return extensions;
    }

    /**
     * @param extensions
     *            the extension objects of this split, may be {@code null}; the array is stored without copying
     */
    public void setExtensions(Object[] extensions) {
        this.extensions = extensions;
    }

    @Override
    public String toString() {
        return String
                .format("GuaguaInputSplit [isMaster=%s, fileSplit=%s]", isMaster, Arrays.toString(this.fileSplits));
    }

}