/*
 * Copyright [2013-2014] PayPal Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ml.shifu.guagua.yarn.util;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;

import ml.shifu.guagua.GuaguaRuntimeException;
import ml.shifu.guagua.hadoop.io.GuaguaInputSplit;
import ml.shifu.guagua.yarn.GuaguaYarnConstants;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Helper class to get input splits.
 */
public final class InputSplitUtils {

    private static final Logger LOG = LoggerFactory.getLogger(InputSplitUtils.class);

    // private constructor to prevent instantiation of this utility class
    private InputSplitUtils() {
    }

    /**
     * Copied from the Pig implementation; the combine logic still needs review.
     */
    public static List<InputSplit> getFinalCombineGuaguaSplits(List<InputSplit> newSplits, long combineSize)
            throws IOException {
        List<List<InputSplit>> combinePigSplits;
        try {
            combinePigSplits = getCombineGuaguaSplits(newSplits, combineSize);
        } catch (InterruptedException e) {
            throw new GuaguaRuntimeException(e);
        }
        newSplits = new ArrayList<InputSplit>();
        for(List<InputSplit> inputSplits: combinePigSplits) {
            FileSplit[] fss = new FileSplit[inputSplits.size()];
            for(int i = 0; i < inputSplits.size(); i++) {
                fss[i] = (FileSplit) (inputSplits.get(i));
            }
            newSplits.add(new GuaguaInputSplit(false, fss));
        }
        return newSplits;
    }

    /**
     * List all input files. Ideally this should follow FileInputFormat#listStatus.
     */
    public static FileStatus[] listStatus(Configuration conf) throws IOException {
        String newPath = expandInputFolder(conf);
        // get all files except Pig or Hadoop meta files
        FileStatus[] fileStatus = FileSystem.get(conf).globStatus(new Path(newPath), new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return !isPigOrHadoopMetaFile(path);
            }
        });
        return fileStatus;
    }

    /**
     * Expand a folder path into a glob over all files, so that every file in that folder is included.
     */
    public static String expandInputFolder(Configuration conf) throws IOException {
        Path path = new Path(conf.get(GuaguaYarnConstants.GUAGUA_YARN_INPUT_DIR));
        String newPath = path.toString();
        return FileSystem.get(conf).getFileStatus(path).isDirectory() ? newPath + Path.SEPARATOR + "*" : newPath;
    }
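    /*
     * A hypothetical usage sketch (not part of this class): build raw FileSplits capped at an
     * assumed 256MB, then combine them into GuaguaInputSplits with the same upper bound. The
     * input path value below is illustrative only; GUAGUA_YARN_INPUT_DIR must point at the
     * real input file or folder.
     *
     *   Configuration conf = new Configuration();
     *   conf.set(GuaguaYarnConstants.GUAGUA_YARN_INPUT_DIR, "/user/someone/input");
     *   long splitSize = 256L * 1024 * 1024;
     *   List<InputSplit> raw = InputSplitUtils.getFileSplits(conf, splitSize);
     *   List<InputSplit> combined = InputSplitUtils.getFinalCombineGuaguaSplits(raw, splitSize);
     */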
    /**
     * Generate the list of files and make them into FileSplits.
     */
    public static List<InputSplit> getFileSplits(Configuration conf, long splitSize) throws IOException {
        // generate splits
        List<InputSplit> splits = new ArrayList<InputSplit>();
        FileStatus[] files = listStatus(conf);
        for(FileStatus file: files) {
            Path path = file.getPath();
            if(isPigOrHadoopMetaFile(path)) {
                continue;
            }
            FileSystem fs = path.getFileSystem(conf);
            long length = file.getLen();
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
            if((length != 0) && isSplitable(conf, path)) {
                long bytesRemaining = length;
                while(((double) bytesRemaining) / splitSize > GuaguaYarnConstants.SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                            blkLocations[blkIndex].getHosts()));
                    bytesRemaining -= splitSize;
                }
                if(bytesRemaining != 0) {
                    splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                            blkLocations[blkLocations.length - 1].getHosts()));
                }
            } else if(length != 0) {
                splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
            } else {
                // create empty hosts array for zero length files
                splits.add(new FileSplit(path, 0, length, new String[0]));
            }
        }
        LOG.debug("Total # of splits: {}", splits.size());
        return splits;
    }

    /**
     * Generate the list of files and wrap each resulting FileSplit into a GuaguaInputSplit.
     */
    public static List<InputSplit> getGuaguaSplits(Configuration conf, long splitSize) throws IOException {
        // generate splits
        List<InputSplit> splits = new ArrayList<InputSplit>();
        FileStatus[] files = listStatus(conf);
        for(FileStatus file: files) {
            Path path = file.getPath();
            if(isPigOrHadoopMetaFile(path)) {
                continue;
            }
            FileSystem fs = path.getFileSystem(conf);
            long length = file.getLen();
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
            if((length != 0) && isSplitable(conf, path)) {
                long bytesRemaining = length;
                while(((double) bytesRemaining) / splitSize > GuaguaYarnConstants.SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(new GuaguaInputSplit(false, new FileSplit[] { new FileSplit(path,
                            length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts()) }));
                    bytesRemaining -= splitSize;
                }
                if(bytesRemaining != 0) {
                    splits.add(new GuaguaInputSplit(false, new FileSplit[] { new FileSplit(path,
                            length - bytesRemaining, bytesRemaining,
                            blkLocations[blkLocations.length - 1].getHosts()) }));
                }
            } else if(length != 0) {
                splits.add(new GuaguaInputSplit(false, new FileSplit[] { new FileSplit(path, 0, length,
                        blkLocations[0].getHosts()) }));
            } else {
                // create empty hosts array for zero length files
                splits.add(new GuaguaInputSplit(false, new FileSplit[] { new FileSplit(path, 0, length,
                        new String[0]) }));
            }
        }
        LOG.debug("Total # of splits: {}", splits.size());
        return splits;
    }

    public static int getBlockIndex(BlockLocation[] blkLocations, long offset) {
        for(int i = 0; i < blkLocations.length; i++) {
            // is the offset inside this block?
            if((blkLocations[i].getOffset() <= offset)
                    && (offset < blkLocations[i].getOffset() + blkLocations[i].getLength())) {
                return i;
            }
        }
        BlockLocation last = blkLocations[blkLocations.length - 1];
        long fileLength = last.getOffset() + last.getLength() - 1;
        throw new IllegalArgumentException("Offset " + offset + " is outside of file (0.." + fileLength + ")");
    }
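    /*
     * Worked example of the SPLIT_SLOP rule used in the two split loops above, assuming
     * GuaguaYarnConstants.SPLIT_SLOP carries the conventional Hadoop value of 1.1: with
     * splitSize = 128MB, a 140MB file gives 140/128 ~= 1.09 <= 1.1, so the loop never runs
     * and the file becomes a single 140MB split; a 150MB file gives 150/128 ~= 1.17 > 1.1,
     * so it is cut into a 128MB split plus a 22MB tail. The slop factor avoids producing
     * a final split that is only a small fraction of splitSize.
     */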
    private static final class ComparableSplit implements Comparable<ComparableSplit> {
        private InputSplit rawInputSplit;
        private HashSet<Node> nodes;
        // id used as a tie-breaker when two splits are of equal size.
        private long id;

        ComparableSplit(InputSplit split, long id) {
            rawInputSplit = split;
            nodes = new HashSet<Node>();
            this.id = id;
        }

        void add(Node node) {
            nodes.add(node);
        }

        void removeFromNodes() {
            for(Node node: nodes)
                node.remove(this);
        }

        public InputSplit getSplit() {
            return rawInputSplit;
        }

        @Override
        public boolean equals(Object other) {
            if(other == null || !(other instanceof ComparableSplit))
                return false;
            return (compareTo((ComparableSplit) other) == 0);
        }

        @Override
        public int hashCode() {
            return 41;
        }

        @Override
        public int compareTo(ComparableSplit other) {
            try {
                long cmp = rawInputSplit.getLength() - other.rawInputSplit.getLength();
                // in descending order
                return cmp == 0 ? (id == other.id ? 0 : id < other.id ? -1 : 1) : cmp < 0 ? 1 : -1;
            } catch (IOException e) {
                throw new RuntimeException(e);
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
        }
    }

    private static class DummySplit extends InputSplit {
        private long length;

        @Override
        public String[] getLocations() {
            return null;
        }

        @Override
        public long getLength() {
            return length;
        }

        public void setLength(long length) {
            this.length = length;
        }
    }

    private static class Node {
        private long length = 0;
        private ArrayList<ComparableSplit> splits;
        private boolean sorted;

        Node() throws IOException, InterruptedException {
            length = 0;
            splits = new ArrayList<ComparableSplit>();
            sorted = false;
        }

        void add(ComparableSplit split) throws IOException, InterruptedException {
            splits.add(split);
            length++;
        }

        void remove(ComparableSplit split) {
            if(!sorted)
                sort();
            int index = Collections.binarySearch(splits, split);
            if(index >= 0) {
                splits.remove(index);
                length--;
            }
        }

        void sort() {
            if(!sorted) {
                Collections.sort(splits);
                sorted = true;
            }
        }

        ArrayList<ComparableSplit> getSplits() {
            return splits;
        }

        @SuppressWarnings("unused")
        public long getLength() {
            return length;
        }
    }
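    /*
     * The combination below follows Pig's PIG-1518 strategy: small splits are indexed per
     * host via Node, each node's splits are kept in descending size order, and a DummySplit
     * binary search greedily packs the largest remaining splits that still fit under
     * maxCombinedSplitSize. A combination is emitted only once it is more than half full;
     * whatever remains is merged across nodes in the leftover pass at the end.
     */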
    public static List<List<InputSplit>> getCombineGuaguaSplits(List<InputSplit> oneInputSplits,
            long maxCombinedSplitSize) throws IOException, InterruptedException {
        ArrayList<Node> nodes = new ArrayList<Node>();
        HashMap<String, Node> nodeMap = new HashMap<String, Node>();
        List<List<InputSplit>> result = new ArrayList<List<InputSplit>>();
        List<Long> resultLengths = new ArrayList<Long>();
        long comparableSplitId = 0;

        int size = 0, nSplits = oneInputSplits.size();
        InputSplit lastSplit = null;
        int emptyCnt = 0;
        for(InputSplit split: oneInputSplits) {
            if(split.getLength() == 0) {
                emptyCnt++;
                continue;
            }
            if(split.getLength() >= maxCombinedSplitSize) {
                comparableSplitId++;
                ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
                combinedSplits.add(split);
                result.add(combinedSplits);
                resultLengths.add(split.getLength());
            } else {
                ComparableSplit csplit = new ComparableSplit(split, comparableSplitId++);
                String[] locations = split.getLocations();
                // sort the locations to stabilize the number of maps: PIG-1757
                Arrays.sort(locations);
                HashSet<String> locationSeen = new HashSet<String>();
                for(String location: locations) {
                    if(!locationSeen.contains(location)) {
                        Node node = nodeMap.get(location);
                        if(node == null) {
                            node = new Node();
                            nodes.add(node);
                            nodeMap.put(location, node);
                        }
                        node.add(csplit);
                        csplit.add(node);
                        locationSeen.add(location);
                    }
                }
                lastSplit = split;
                size++;
            }
        }

        if(nSplits > 0 && emptyCnt == nSplits) {
            // if all splits are empty, add a single empty split as currently an empty directory is
            // not properly handled somewhere
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            combinedSplits.add(oneInputSplits.get(0));
            result.add(combinedSplits);
        } else if(size == 1) {
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            combinedSplits.add(lastSplit);
            result.add(combinedSplits);
        } else if(size > 1) {
            // combine small splits
            Collections.sort(nodes, nodeComparator);
            DummySplit dummy = new DummySplit();
            // dummy is used to search for next split of suitable size to be combined
            ComparableSplit dummyComparableSplit = new ComparableSplit(dummy, -1);
            for(Node node: nodes) {
                // sort the splits on this node in descending order
                node.sort();
                long totalSize = 0;
                ArrayList<ComparableSplit> splits = node.getSplits();
                int idx;
                int lenSplits;
                ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
                ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();
                while(!splits.isEmpty()) {
                    combinedSplits.add(splits.get(0).getSplit());
                    combinedComparableSplits.add(splits.get(0));
                    int startIdx = 1;
                    lenSplits = splits.size();
                    totalSize += splits.get(0).getSplit().getLength();
                    long spaceLeft = maxCombinedSplitSize - totalSize;
                    dummy.setLength(spaceLeft);
                    idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                            dummyComparableSplit);
                    idx = -idx - 1 + startIdx;
                    while(idx < lenSplits) {
                        long thisLen = splits.get(idx).getSplit().getLength();
                        combinedSplits.add(splits.get(idx).getSplit());
                        combinedComparableSplits.add(splits.get(idx));
                        totalSize += thisLen;
                        spaceLeft -= thisLen;
                        if(spaceLeft <= 0)
                            break;
                        // find next combinable chunk
                        startIdx = idx + 1;
                        if(startIdx >= lenSplits)
                            break;
                        dummy.setLength(spaceLeft);
                        idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                                dummyComparableSplit);
                        idx = -idx - 1 + startIdx;
                    }
                    if(totalSize > maxCombinedSplitSize / 2) {
                        result.add(combinedSplits);
                        resultLengths.add(totalSize);
                        removeSplits(combinedComparableSplits);
                        totalSize = 0;
                        combinedSplits = new ArrayList<InputSplit>();
                        combinedComparableSplits.clear();
                        splits = node.getSplits();
                    } else {
                        if(combinedSplits.size() != lenSplits)
                            throw new AssertionError("Combined split logic error!");
                        break;
                    }
                }
            }
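            /*
             * Leftover pass: splits that never reached a more-than-half-full combination above
             * are gathered (deduplicated on the raw split) and packed greedily across nodes; a
             * possibly tiny final remainder is squeezed into an existing combined split when
             * one still has room.
             */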
            // handle leftovers
            ArrayList<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>();
            HashSet<InputSplit> seen = new HashSet<InputSplit>();
            for(Node node: nodes) {
                for(ComparableSplit split: node.getSplits()) {
                    if(!seen.contains(split.getSplit())) {
                        // remove duplicates; the set has to be keyed on the raw input split, not the
                        // comparable split, because the latter overrides compareTo and so has equality
                        // semantics we do not want here
                        seen.add(split.getSplit());
                        leftoverSplits.add(split);
                    }
                }
            }

            if(!leftoverSplits.isEmpty()) {
                long totalSize = 0;
                ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
                ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();

                int splitLen = leftoverSplits.size();
                for(int i = 0; i < splitLen; i++) {
                    ComparableSplit split = leftoverSplits.get(i);
                    long thisLen = split.getSplit().getLength();
                    if(totalSize + thisLen >= maxCombinedSplitSize) {
                        removeSplits(combinedComparableSplits);
                        result.add(combinedSplits);
                        resultLengths.add(totalSize);
                        combinedSplits = new ArrayList<InputSplit>();
                        combinedComparableSplits.clear();
                        totalSize = 0;
                    }
                    combinedSplits.add(split.getSplit());
                    combinedComparableSplits.add(split);
                    totalSize += split.getSplit().getLength();
                    if(i == splitLen - 1) {
                        // last piece: it could be very small, so try to squeeze it into any existing split
                        for(int j = 0; j < result.size(); j++) {
                            if(resultLengths.get(j) + totalSize <= maxCombinedSplitSize) {
                                List<InputSplit> isList = result.get(j);
                                for(InputSplit csplit: combinedSplits) {
                                    isList.add(csplit);
                                }
                                removeSplits(combinedComparableSplits);
                                combinedSplits.clear();
                                break;
                            }
                        }
                        if(!combinedSplits.isEmpty()) {
                            // the last piece cannot be squeezed in; create a new combined split for it
                            removeSplits(combinedComparableSplits);
                            result.add(combinedSplits);
                        }
                    }
                }
            }
        }
        LOG.info("Total input paths (combined) to process : {}", result.size());
        return result;
    }

    /*
     * The following code is for split combination: see PIG-1518
     */
    private static Comparator<Node> nodeComparator = new Comparator<Node>() {
        @Override
        public int compare(Node o1, Node o2) {
            long cmp = o1.length - o2.length;
            return cmp == 0 ? 0 : cmp < 0 ? -1 : 1;
        }
    };

    private static void removeSplits(List<ComparableSplit> splits) {
        for(ComparableSplit split: splits)
            split.removeFromNodes();
    }

    /**
     * Whether the given path is a Pig or Hadoop meta output file.
     */
    private static boolean isPigOrHadoopMetaFile(Path path) {
        return path.toString().indexOf(GuaguaYarnConstants.HADOOP_SUCCESS) >= 0
                || path.toString().indexOf(GuaguaYarnConstants.PIG_HEADER) >= 0
                || path.toString().indexOf(GuaguaYarnConstants.PIG_SCHEMA) >= 0;
    }

    private static boolean isSplitable(Configuration conf, Path file) {
        // compressed files cannot be split; a splittable codec such as indexed LZO could be added later
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
        return codec == null;
    }
}