/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.backend.hadoop.executionengine.util;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Comparator;
import java.util.Collections;
import java.util.Arrays;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.pig.FuncSpec;
import org.apache.pig.PigException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigMapReduce;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStore;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.builtin.PartitionSkewedKeys;
import org.apache.pig.impl.io.FileLocalizer;
import org.apache.pig.impl.io.FileSpec;
import org.apache.pig.impl.io.ReadToEndLoader;
import org.apache.pig.impl.plan.NodeIdGenerator;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.util.Pair;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.Utils;

/**
 * A class of utility static methods to be used in the hadoop map reduce backend
 */
public class MapRedUtil {

    private static Log log = LogFactory.getLog(MapRedUtil.class);

    public static final String FILE_SYSTEM_NAME = "fs.default.name";

    /**
     * Loads the key distribution sampler file
     *
     * @param keyDistFile the name for the distribution file
     * @param totalReducers gets set to the total number of reducers as found in the dist file
     * @param keyType Type of the key to be stored in the return map. It currently treats
     *                Tuple as a special case.
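     * @param mapConf the job configuration; filesystem and tmp-file compression
     *                settings are copied from it when reading the dist file
     * @return a map from each sampled key to a Pair of (minIndex, maxIndex - minIndex)
     *         describing the reducer range assigned to that key. As read here, the
     *         dist file holds a single tuple containing a map with
     *         {@link PartitionSkewedKeys#TOTAL_REDUCERS} and
     *         {@link PartitionSkewedKeys#PARTITION_LIST}, the latter being a bag of
     *         (keyFields..., minIndex, maxIndex) tuples.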
     */
    @SuppressWarnings("unchecked")
    public static <E> Map<E, Pair<Integer, Integer>> loadPartitionFileFromLocalCache(
            String keyDistFile, Integer[] totalReducers, byte keyType, Configuration mapConf)
            throws IOException {

        Map<E, Pair<Integer, Integer>> reducerMap = new HashMap<E, Pair<Integer, Integer>>();

        // use local file system to get the keyDistFile
        Configuration conf = new Configuration(false);

        if (mapConf.get("yarn.resourcemanager.principal") != null) {
            conf.set("yarn.resourcemanager.principal",
                    mapConf.get("yarn.resourcemanager.principal"));
        }

        if (PigMapReduce.sJobConfInternal.get().get("fs.file.impl") != null)
            conf.set("fs.file.impl", PigMapReduce.sJobConfInternal.get().get("fs.file.impl"));
        if (PigMapReduce.sJobConfInternal.get().get("fs.hdfs.impl") != null)
            conf.set("fs.hdfs.impl", PigMapReduce.sJobConfInternal.get().get("fs.hdfs.impl"));

        if (PigMapReduce.sJobConfInternal.get().getBoolean("pig.tmpfilecompression", false)) {
            conf.setBoolean("pig.tmpfilecompression", true);
            if (PigMapReduce.sJobConfInternal.get().get("pig.tmpfilecompression.codec") != null)
                conf.set("pig.tmpfilecompression.codec",
                        PigMapReduce.sJobConfInternal.get().get("pig.tmpfilecompression.codec"));
        }

        conf.set(MapRedUtil.FILE_SYSTEM_NAME, "file:///");

        ReadToEndLoader loader = new ReadToEndLoader(
                Utils.getTmpFileStorageObject(PigMapReduce.sJobConfInternal.get()),
                conf, keyDistFile, 0);
        DataBag partitionList;
        Tuple t = loader.getNext();
        if (t == null) {
            // this could happen if the input directory for sampling is empty
            log.warn("Empty dist file: " + keyDistFile);
            return reducerMap;
        }
        // The keydist file is structured as (key, min, max)
        // min, max being the index of the reducers
        Map<String, Object> distMap = (Map<String, Object>) t.get(0);
        partitionList = (DataBag) distMap.get(PartitionSkewedKeys.PARTITION_LIST);
        totalReducers[0] = Integer.valueOf("" + distMap.get(PartitionSkewedKeys.TOTAL_REDUCERS));
        Iterator<Tuple> it = partitionList.iterator();
        while (it.hasNext()) {
            Tuple idxTuple = it.next();
            Integer maxIndex = (Integer) idxTuple.get(idxTuple.size() - 1);
            Integer minIndex = (Integer) idxTuple.get(idxTuple.size() - 2);
            // if the reducer range wrapped around the total number of reducers,
            // unwrap maxIndex so that it is again >= minIndex
            if (maxIndex < minIndex) {
                maxIndex = totalReducers[0] + maxIndex;
            }
            E keyT;

            // if the join is on more than 1 key
            if (idxTuple.size() > 3) {
                // remove the last 2 fields of the tuple, i.e. minIndex and maxIndex, and
                // store the remaining fields as the key in the reducer map
                Tuple keyTuple = TupleFactory.getInstance().newTuple();
                for (int i = 0; i < idxTuple.size() - 2; i++) {
                    keyTuple.append(idxTuple.get(i));
                }
                keyT = (E) keyTuple;
            } else {
                if (keyType == DataType.TUPLE) {
                    keyT = (E) TupleFactory.getInstance().newTuple(1);
                    ((Tuple) keyT).set(0, idxTuple.get(0));
                } else {
                    keyT = (E) idxTuple.get(0);
                }
            }
            // span of reducers beyond minIndex; consumers add 1 to cover the
            // inclusive range [minIndex, maxIndex]
            Integer cnt = maxIndex - minIndex;
            reducerMap.put(keyT, new Pair(minIndex, cnt));
        }

        return reducerMap;
    }

    public static void setupUDFContext(Configuration job) throws IOException {
        UDFContext udfc = UDFContext.getUDFContext();
        udfc.addJobConf(job);
        // don't deserialize in front-end
        if (udfc.isUDFConfEmpty()) {
            udfc.deserialize();
        }
    }
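    /**
     * Ensures that the leaf of the given physical plan is a store operator.
     * If the current leaf is not a POStore, one writing to a temporary file is
     * appended to the plan; either way the FileSpec of the leaf store is returned.
     */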
    public static FileSpec checkLeafIsStore(
            PhysicalPlan plan,
            PigContext pigContext) throws ExecException {
        try {
            PhysicalOperator leaf = plan.getLeaves().get(0);
            FileSpec spec = null;
            if (!(leaf instanceof POStore)) {
                String scope = leaf.getOperatorKey().getScope();
                POStore str = new POStore(new OperatorKey(scope,
                        NodeIdGenerator.getGenerator().getNextNodeId(scope)));
                spec = new FileSpec(FileLocalizer.getTemporaryPath(pigContext).toString(),
                        new FuncSpec(Utils.getTmpFileCompressorName(pigContext)));
                str.setSFile(spec);
                plan.addAsLeaf(str);
            } else {
                spec = ((POStore) leaf).getSFile();
            }
            return spec;
        } catch (Exception e) {
            int errCode = 2045;
            String msg = "Internal error. Not able to check if the leaf node is a store operator.";
            throw new ExecException(msg, errCode, PigException.BUG, e);
        }
    }

    /**
     * Get all files recursively from the given list of files
     *
     * @param files a list of FileStatus
     * @param conf the configuration object
     * @return the list of fileStatus that contains all the files in the given
     *         list and, recursively, all the files inside the directories in
     *         the given list
     * @throws IOException
     */
    public static List<FileStatus> getAllFileRecursively(
            List<FileStatus> files, Configuration conf) throws IOException {
        List<FileStatus> result = new ArrayList<FileStatus>();
        int len = files.size();
        for (int i = 0; i < len; ++i) {
            FileStatus file = files.get(i);
            if (file.isDir()) {
                Path p = file.getPath();
                FileSystem fs = p.getFileSystem(conf);
                addInputPathRecursively(result, fs, p, hiddenFileFilter);
            } else {
                result.add(file);
            }
        }
        log.info("Total input paths to process : " + result.size());
        return result;
    }

    private static void addInputPathRecursively(List<FileStatus> result,
            FileSystem fs, Path path, PathFilter inputFilter) throws IOException {
        for (FileStatus stat : fs.listStatus(path, inputFilter)) {
            if (stat.isDir()) {
                addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
            } else {
                result.add(stat);
            }
        }
    }

    private static final PathFilter hiddenFileFilter = new PathFilter() {
        public boolean accept(Path p) {
            String name = p.getName();
            return !name.startsWith("_") && !name.startsWith(".");
        }
    };

    /* The following code is for split combination: see PIG-1518 */

    private static Comparator<Node> nodeComparator = new Comparator<Node>() {
        @Override
        public int compare(Node o1, Node o2) {
            long cmp = o1.length - o2.length;
            return cmp == 0 ? 0 : cmp < 0 ? -1 : 1;
        }
    };

    private static final class ComparableSplit implements Comparable<ComparableSplit> {
        private InputSplit rawInputSplit;
        private HashSet<Node> nodes;
        // id used as a tie-breaker when two splits are of equal size.
        private long id;

        ComparableSplit(InputSplit split, long id) {
            rawInputSplit = split;
            nodes = new HashSet<Node>();
            this.id = id;
        }

        void add(Node node) {
            nodes.add(node);
        }

        void removeFromNodes() {
            for (Node node : nodes)
                node.remove(this);
        }

        public InputSplit getSplit() {
            return rawInputSplit;
        }

        @Override
        public boolean equals(Object other) {
            if (other == null || !(other instanceof ComparableSplit))
                return false;
            return (compareTo((ComparableSplit) other) == 0);
        }

        @Override
        public int hashCode() {
            return 41;
        }

        @Override
        public int compareTo(ComparableSplit other) {
            try {
                long cmp = rawInputSplit.getLength() - other.rawInputSplit.getLength();
                // sort in descending order of length, breaking ties by id
                return cmp == 0 ? (id == other.id ? 0 : id < other.id ? -1 : 1)
                        : cmp < 0 ? 1 : -1;
            } catch (IOException e) {
                throw new RuntimeException(e);
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
        }
    }

    private static class DummySplit extends InputSplit {
        private long length;

        @Override
        public String[] getLocations() {
            return null;
        }

        @Override
        public long getLength() {
            return length;
        }

        public void setLength(long length) {
            this.length = length;
        }
    }

    private static class Node {
        private long length = 0;
        private ArrayList<ComparableSplit> splits;
        private boolean sorted;

        Node() throws IOException, InterruptedException {
            length = 0;
            splits = new ArrayList<ComparableSplit>();
            sorted = false;
        }

        void add(ComparableSplit split) throws IOException, InterruptedException {
            splits.add(split);
            length++;
        }

        void remove(ComparableSplit split) {
            if (!sorted)
                sort();
            int index = Collections.binarySearch(splits, split);
            if (index >= 0) {
                splits.remove(index);
                length--;
            }
        }

        void sort() {
            if (!sorted) {
                Collections.sort(splits);
                sorted = true;
            }
        }

        ArrayList<ComparableSplit> getSplits() {
            return splits;
        }

        public long getLength() {
            return length;
        }
    }
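    /**
     * Combines input splits smaller than maxCombinedSplitSize into larger groups
     * (see PIG-1518). Splits of at least maxCombinedSplitSize pass through as
     * singleton groups. Smaller splits are indexed by the nodes hosting them and
     * packed greedily per node, largest split first, binary-searching for the next
     * split that still fits the remaining space; a group is emitted once it grows
     * past half of maxCombinedSplitSize. Splits that never make it into such a
     * group are collected as leftovers and combined without locality preference.
     */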
    public static List<List<InputSplit>> getCombinePigSplits(List<InputSplit> oneInputSplits,
            long maxCombinedSplitSize, Configuration conf)
            throws IOException, InterruptedException {
        ArrayList<Node> nodes = new ArrayList<Node>();
        HashMap<String, Node> nodeMap = new HashMap<String, Node>();
        List<List<InputSplit>> result = new ArrayList<List<InputSplit>>();
        List<Long> resultLengths = new ArrayList<Long>();
        long comparableSplitId = 0;

        int size = 0, nSplits = oneInputSplits.size();
        InputSplit lastSplit = null;
        int emptyCnt = 0;
        for (InputSplit split : oneInputSplits) {
            if (split.getLength() == 0) {
                emptyCnt++;
                continue;
            }
            if (split.getLength() >= maxCombinedSplitSize) {
                comparableSplitId++;
                ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
                combinedSplits.add(split);
                result.add(combinedSplits);
                resultLengths.add(split.getLength());
            } else {
                ComparableSplit csplit = new ComparableSplit(split, comparableSplitId++);
                String[] locations = split.getLocations();
                // sort the locations to stabilize the number of maps: PIG-1757
                Arrays.sort(locations);
                HashSet<String> locationSeen = new HashSet<String>();
                for (String location : locations) {
                    if (!locationSeen.contains(location)) {
                        Node node = nodeMap.get(location);
                        if (node == null) {
                            node = new Node();
                            nodes.add(node);
                            nodeMap.put(location, node);
                        }
                        node.add(csplit);
                        csplit.add(node);
                        locationSeen.add(location);
                    }
                }
                lastSplit = split;
                size++;
            }
        }
        /* verification code: debug purpose
        {
            ArrayList<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>();
            HashSet<InputSplit> seen = new HashSet<InputSplit>();
            for (Node node : nodes) {
                if (node.getLength() > 0) {
                    ArrayList<ComparableSplit> splits = node.getSplits();
                    for (ComparableSplit split : splits) {
                        if (!seen.contains(split.getSplit())) {
                            // remove duplicates. The set has to be on the raw input split, not the
                            // comparable input split, as the latter overrides the compareTo method
                            // so its equality semantics is changed and not what we want here
                            seen.add(split.getSplit());
                            leftoverSplits.add(split);
                        }
                    }
                }
            }

            int combinedSplitLen = 0;
            for (PigSplit split : result)
                combinedSplitLen += split.getNumPaths();
            if (combinedSplitLen + leftoverSplits.size() != nSplits - emptyCnt) {
                throw new AssertionError("number of combined splits {" + combinedSplitLen + "+"
                        + leftoverSplits.size() + "-" + size
                        + "} does not match the number of original splits [" + nSplits + "].");
            }
        }
        */
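        // Three cases follow: every split was empty, exactly one small split was
        // seen, or several small splits need to be combined per locality node.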
        if (nSplits > 0 && emptyCnt == nSplits) {
            // if all splits are empty, add a single empty split as currently an empty directory is
            // not properly handled somewhere
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            combinedSplits.add(oneInputSplits.get(0));
            result.add(combinedSplits);
        } else if (size == 1) {
            ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
            combinedSplits.add(lastSplit);
            result.add(combinedSplits);
        } else if (size > 1) {
            // combine small splits
            Collections.sort(nodes, nodeComparator);
            DummySplit dummy = new DummySplit();
            // dummy is used to search for the next split of suitable size to be combined
            ComparableSplit dummyComparableSplit = new ComparableSplit(dummy, -1);
            for (Node node : nodes) {
                // sort the splits on this node in descending order
                node.sort();
                long totalSize = 0;
                ArrayList<ComparableSplit> splits = node.getSplits();
                int idx;
                int lenSplits;
                ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
                ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();
                while (!splits.isEmpty()) {
                    combinedSplits.add(splits.get(0).getSplit());
                    combinedComparableSplits.add(splits.get(0));
                    int startIdx = 1;
                    lenSplits = splits.size();
                    totalSize += splits.get(0).getSplit().getLength();
                    long spaceLeft = maxCombinedSplitSize - totalSize;
                    dummy.setLength(spaceLeft);
                    idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                            dummyComparableSplit);
                    idx = -idx - 1 + startIdx;
                    while (idx < lenSplits) {
                        long thisLen = splits.get(idx).getSplit().getLength();
                        combinedSplits.add(splits.get(idx).getSplit());
                        combinedComparableSplits.add(splits.get(idx));
                        totalSize += thisLen;
                        spaceLeft -= thisLen;
                        if (spaceLeft <= 0)
                            break;
                        // find the next combinable chunk
                        startIdx = idx + 1;
                        if (startIdx >= lenSplits)
                            break;
                        dummy.setLength(spaceLeft);
                        idx = Collections.binarySearch(node.getSplits().subList(startIdx, lenSplits),
                                dummyComparableSplit);
                        idx = -idx - 1 + startIdx;
                    }
                    if (totalSize > maxCombinedSplitSize / 2) {
                        result.add(combinedSplits);
                        resultLengths.add(totalSize);
                        removeSplits(combinedComparableSplits);
                        totalSize = 0;
                        combinedSplits = new ArrayList<InputSplit>();
                        combinedComparableSplits.clear();
                        splits = node.getSplits();
                    } else {
                        if (combinedSplits.size() != lenSplits)
                            throw new AssertionError("Combined split logic error!");
                        break;
                    }
                }
            }
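            // Splits still registered on the nodes at this point are those whose
            // per-node group never exceeded half of maxCombinedSplitSize. Each split
            // is registered once per hosting node, so de-duplicate them before
            // combining them below without any locality preference.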
            // handle leftovers
            ArrayList<ComparableSplit> leftoverSplits = new ArrayList<ComparableSplit>();
            HashSet<InputSplit> seen = new HashSet<InputSplit>();
            for (Node node : nodes) {
                for (ComparableSplit split : node.getSplits()) {
                    if (!seen.contains(split.getSplit())) {
                        // remove duplicates. The set has to be on the raw input split, not the
                        // comparable input split, as the latter overrides the compareTo method
                        // so its equality semantics is changed and not what we want here
                        seen.add(split.getSplit());
                        leftoverSplits.add(split);
                    }
                }
            }

            /* verification code
            int combinedSplitLen = 0;
            for (PigSplit split : result)
                combinedSplitLen += split.getNumPaths();
            if (combinedSplitLen + leftoverSplits.size() != nSplits - emptyCnt)
                throw new AssertionError("number of combined splits [" + combinedSplitLen + "+"
                        + leftoverSplits.size() + "] does not match the number of original splits ["
                        + nSplits + "].");
            */

            if (!leftoverSplits.isEmpty()) {
                long totalSize = 0;
                ArrayList<InputSplit> combinedSplits = new ArrayList<InputSplit>();
                ArrayList<ComparableSplit> combinedComparableSplits = new ArrayList<ComparableSplit>();

                int splitLen = leftoverSplits.size();
                for (int i = 0; i < splitLen; i++) {
                    ComparableSplit split = leftoverSplits.get(i);
                    long thisLen = split.getSplit().getLength();
                    if (totalSize + thisLen >= maxCombinedSplitSize) {
                        removeSplits(combinedComparableSplits);
                        result.add(combinedSplits);
                        resultLengths.add(totalSize);
                        combinedSplits = new ArrayList<InputSplit>();
                        combinedComparableSplits.clear();
                        totalSize = 0;
                    }
                    combinedSplits.add(split.getSplit());
                    combinedComparableSplits.add(split);
                    totalSize += split.getSplit().getLength();
                    if (i == splitLen - 1) {
                        // last piece: it could be very small, so try to squeeze it into an
                        // existing combined split
                        for (int j = 0; j < result.size(); j++) {
                            if (resultLengths.get(j) + totalSize <= maxCombinedSplitSize) {
                                List<InputSplit> isList = result.get(j);
                                for (InputSplit csplit : combinedSplits) {
                                    isList.add(csplit);
                                }
                                removeSplits(combinedComparableSplits);
                                combinedSplits.clear();
                                break;
                            }
                        }
                        if (!combinedSplits.isEmpty()) {
                            // the last piece cannot be squeezed in; create a new combined split for it
                            removeSplits(combinedComparableSplits);
                            result.add(combinedSplits);
                        }
                    }
                }
            }
        }

        /* verification codes
        int combinedSplitLen = 0;
        for (PigSplit split : result)
            combinedSplitLen += split.getNumPaths();
        if (combinedSplitLen != nSplits - emptyCnt)
            throw new AssertionError("number of combined splits [" + combinedSplitLen
                    + "] does not match the number of original splits [" + nSplits + "].");

        long totalLen = 0;
        for (PigSplit split : result)
            totalLen += split.getLength();

        long origTotalLen = 0;
        for (InputSplit split : oneInputSplits)
            origTotalLen += split.getLength();
        if (totalLen != origTotalLen)
            throw new AssertionError("The total length [" + totalLen
                    + "] does not match the original [" + origTotalLen + "]");
        */
        log.info("Total input paths (combined) to process : " + result.size());
        return result;
    }

    private static void removeSplits(List<ComparableSplit> splits) {
        for (ComparableSplit split : splits)
            split.removeFromNodes();
    }

    public String inputSplitToString(InputSplit[] splits) throws IOException, InterruptedException {
        // debugging purpose only
        StringBuilder st = new StringBuilder();
        st.append("Number of splits :" + splits.length + "\n");
        long len = 0;
        for (InputSplit split : splits)
            len += split.getLength();
        st.append("Total Length = " + len + "\n");
        for (int i = 0; i < splits.length; i++) {
            st.append("Input split[" + i + "]:\n Length = " + splits[i].getLength() + "\n Locations:\n");
            for (String location : splits[i].getLocations())
                st.append(" " + location + "\n");
            st.append("\n-----------------------\n");
        }
        return st.toString();
    }

    /* verification code: debug purpose only
    public String inputSplitToString(ArrayList<ComparableSplit> splits) throws IOException, InterruptedException {
        StringBuilder st = new StringBuilder();
        st.append("Number of splits :" + splits.size() + "\n");
        long len = 0;
        for (ComparableSplit split : splits)
            len += split.getSplit().getLength();
        st.append("Total Length = " + len + "\n");
        for (int i = 0; i < splits.size(); i++) {
            st.append("Input split[" + i + "]:\n Length = " + splits.get(i).getSplit().getLength()
                    + "\n Locations:\n");
            for (String location : splits.get(i).getSplit().getLocations())
                st.append(" " + location + "\n");
            st.append("\n-----------------------\n");
        }
        return st.toString();
    }
    */
}