/*
 * Copyright [2013-2014] PayPal Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ml.shifu.guagua.hadoop;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import ml.shifu.guagua.io.Bytable;
import ml.shifu.guagua.io.GuaguaFileSplit;
import ml.shifu.guagua.unit.GuaguaUnitDriver;
import ml.shifu.guagua.yarn.GuaguaYarnConstants;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.util.StringUtils;

/**
 * {@link GuaguaMRUnitDriver} runs an in-memory guagua application using Hadoop MapReduce file splits.
 *
 * @param <MASTER_RESULT>
 *            master result for computation in each iteration.
 * @param <WORKER_RESULT>
 *            worker result for computation in each iteration.
 *
 * @see ml.shifu.guagua.mapreduce.example.sum.SumTest in the guagua-mapreduce-examples project.
 */
public class GuaguaMRUnitDriver<MASTER_RESULT extends Bytable, WORKER_RESULT extends Bytable> extends
        GuaguaUnitDriver<MASTER_RESULT, WORKER_RESULT> {

    /**
     * The only constructor, used for a local in-memory guagua job.
     *
     * @param props
     *            all the configuration properties, such as input and output settings.
     *
     * @see ml.shifu.guagua.mapreduce.example.sum.SumTest in the guagua-mapreduce-examples project.
     */
    public GuaguaMRUnitDriver(Properties props) {
        super(props);
    }

    /**
     * Whether the path is a pig or hadoop meta output file.
     */
    private boolean isPigOrHadoopMetaFile(Path path) {
        return path.toString().indexOf(GuaguaYarnConstants.HADOOP_SUCCESS) >= 0
                || path.toString().indexOf(GuaguaYarnConstants.PIG_HEADER) >= 0
                || path.toString().indexOf(GuaguaYarnConstants.PIG_SCHEMA) >= 0;
    }

    /**
     * Check whether a file is splittable in HDFS.
     */
    private boolean isSplitable(Configuration conf, Path file) {
        // bzip2 files can be split.
        if(file.getName().endsWith(GuaguaYarnConstants.BZ2)) {
            return true;
        }
        // Other compressed files cannot be split; lzo may be worth adding to the splittable list later.
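        // CompressionCodecFactory resolves a codec from the file name suffix (e.g. ".gz" -> GzipCodec,
        // ".deflate" -> DefaultCodec). A non-null codec here means the file is compressed in a format this
        // driver treats as non-splittable, so the whole file becomes a single split below.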
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
        return codec == null;
    }

    @Override
    public List<GuaguaFileSplit[]> generateWorkerSplits(String inputs) throws IOException {
        List<GuaguaFileSplit[]> splits = new ArrayList<GuaguaFileSplit[]>();
        Configuration conf = new Configuration();
        // generate splits from all input files
        List<FileStatus> files = listStatus(conf, inputs);
        for(FileStatus file: files) {
            Path path = file.getPath();
            if(isPigOrHadoopMetaFile(path)) {
                continue;
            }
            long length = file.getLen();
            if((length != 0) && isSplitable(conf, path)) {
                // split by block size; the last split may exceed the block size by up to SPLIT_SLOP
                long splitSize = file.getBlockSize();
                long bytesRemaining = length;
                while(((double) bytesRemaining) / splitSize > GuaguaYarnConstants.SPLIT_SLOP) {
                    splits.add(new GuaguaFileSplit[] { new GuaguaFileSplit(path.toString(), length - bytesRemaining,
                            splitSize) });
                    bytesRemaining -= splitSize;
                }
                if(bytesRemaining != 0) {
                    splits.add(new GuaguaFileSplit[] { new GuaguaFileSplit(path.toString(), length - bytesRemaining,
                            bytesRemaining) });
                }
            } else if(length != 0) {
                // non-splittable (compressed) file: the whole file becomes one split
                splits.add(new GuaguaFileSplit[] { new GuaguaFileSplit(path.toString(), 0, length) });
            }
        }
        return splits;
    }

    /**
     * Get the list of input {@link Path}s for the map-reduce job.
     */
    private static Path[] getInputPaths(String inputs) {
        String[] list = StringUtils.split(inputs);
        Path[] result = new Path[list.length];
        for(int i = 0; i < list.length; i++) {
            result[i] = new Path(StringUtils.unEscapeString(list[i]));
        }
        return result;
    }

    private static final PathFilter hiddenFileFilter = new PathFilter() {
        public boolean accept(Path p) {
            String name = p.getName();
            return !name.startsWith("_") && !name.startsWith(".");
        }
    };

    /**
     * Proxy PathFilter that accepts a path only if all filters given in the constructor do. Used by listStatus() to
     * apply the built-in hiddenFileFilter together with a user-provided one (if any).
     */
    private static class MultiPathFilter implements PathFilter {
        private List<PathFilter> filters;

        public MultiPathFilter(List<PathFilter> filters) {
            this.filters = filters;
        }

        public boolean accept(Path path) {
            for(PathFilter filter: filters) {
                if(!filter.accept(path)) {
                    return false;
                }
            }
            return true;
        }
    }

    /**
     * List input directories. Subclasses may override to, e.g., select only files matching a regular expression.
     *
     * @param conf
     *            the configuration used to resolve the file system of each input path
     * @param input
     *            comma-separated input paths; glob patterns are supported
     * @return list of FileStatus objects for all matched input files
     * @throws IOException
     *             if no input paths are specified, or if any input path does not exist or matches zero files
     */
    protected List<FileStatus> listStatus(Configuration conf, String input) throws IOException {
        List<FileStatus> result = new ArrayList<FileStatus>();
        Path[] dirs = getInputPaths(input);
        if(dirs.length == 0) {
            throw new IOException("No input paths specified in job");
        }

        List<IOException> errors = new ArrayList<IOException>();

        // create a MultiPathFilter with the hiddenFileFilter and the user-provided one (if any)
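        // Hidden entries are files whose names start with "_" or "." (e.g. MapReduce "_SUCCESS" and "_logs"
        // markers, and dot-files); filtering them out keeps bookkeeping output from being fed to workers.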
        List<PathFilter> filters = new ArrayList<PathFilter>();
        filters.add(hiddenFileFilter);
        PathFilter inputFilter = new MultiPathFilter(filters);

        for(int i = 0; i < dirs.length; ++i) {
            Path p = dirs[i];
            FileSystem fs = p.getFileSystem(conf);
            FileStatus[] matches = fs.globStatus(p, inputFilter);
            if(matches == null) {
                errors.add(new IOException("Input path does not exist: " + p));
            } else if(matches.length == 0) {
                errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
            } else {
                for(FileStatus globStat: matches) {
                    if(globStat.isDirectory()) {
                        for(FileStatus stat: fs.listStatus(globStat.getPath(), inputFilter)) {
                            result.add(stat);
                        }
                    } else {
                        result.add(globStat);
                    }
                }
            }
        }

        if(!errors.isEmpty()) {
            throw new IOException(errors.toString());
        }
        return result;
    }
}
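// A minimal usage sketch, assuming master/worker implementations like the SumMaster/SumWorker classes from
// the SumTest example referenced above. The property keys and the GuaguaWritableAdapter/run() API shown here
// are assumptions; check ml.shifu.guagua.GuaguaConstants and GuaguaUnitDriver for the exact names before use:
//
//     Properties props = new Properties();
//     props.setProperty("guagua.master.class", SumMaster.class.getName()); // hypothetical key
//     props.setProperty("guagua.worker.class", SumWorker.class.getName()); // hypothetical key
//     props.setProperty("guagua.iteration.count", "10"); // hypothetical key
//     props.setProperty("guagua.input.dir", "hdfs:///user/guagua/sum-input"); // hypothetical key
//
//     GuaguaUnitDriver<GuaguaWritableAdapter<LongWritable>, GuaguaWritableAdapter<LongWritable>> driver =
//             new GuaguaMRUnitDriver<GuaguaWritableAdapter<LongWritable>, GuaguaWritableAdapter<LongWritable>>(props);
//     driver.run();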