/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.backend.hadoop.executionengine.mapReduceLayer;

import java.io.IOException;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadMetadata;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POLoad;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.util.PlanHelper;
import org.apache.pig.impl.util.UriUtil;
import org.apache.pig.impl.util.Utils;

/**
 * Class that estimates the number of reducers based on input size.
 * The number of reducers is derived from two properties:
 * <ul>
 *     <li>pig.exec.reducers.bytes.per.reducer -
 *     how many bytes of input per reducer (default is 1000*1000*1000)</li>
 *     <li>pig.exec.reducers.max -
 *     constrains the maximum number of reducer tasks (default is 999)</li>
 * </ul>
 * If a loader implements LoadMetadata, the input size it reports is used; otherwise
 * an attempt is made to determine the size from the filesystem.
 * <p>
 * e.g. given the following pig script
 * <pre>
 * a = load '/data/a';
 * b = load '/data/b';
 * c = join a by $0, b by $0;
 * store c into '/tmp';
 * </pre>
 * if the size of /data/a is 1000*1000*1000 and the size of /data/b is
 * 2*1000*1000*1000, the estimated number of reducers will be
 * (1000*1000*1000 + 2*1000*1000*1000) / (1000*1000*1000) = 3.
 */
public class InputSizeReducerEstimator implements PigReducerEstimator {
    private static final Log log = LogFactory.getLog(InputSizeReducerEstimator.class);

    /**
     * Determines the number of reducers to be used.
     *
     * @param job job instance
     * @param mapReduceOper map-reduce operator whose map-plan inputs are measured
     * @throws java.io.IOException
     */
    @Override
    public int estimateNumberOfReducers(Job job, MapReduceOper mapReduceOper) throws IOException {
        Configuration conf = job.getConfiguration();

        long bytesPerReducer = conf.getLong(BYTES_PER_REDUCER_PARAM, DEFAULT_BYTES_PER_REDUCER);
        int maxReducers = conf.getInt(MAX_REDUCER_COUNT_PARAM, DEFAULT_MAX_REDUCER_COUNT_PARAM);

        List<POLoad> poLoads = PlanHelper.getPhysicalOperators(mapReduceOper.mapPlan, POLoad.class);
        long totalInputFileSize = getTotalInputFileSize(conf, poLoads, job);

        log.info("BytesPerReducer=" + bytesPerReducer + " maxReducers="
            + maxReducers + " totalInputFileSize=" + totalInputFileSize);
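
        // The estimate is ceil(totalInputFileSize / bytesPerReducer), clamped to
        // [1, maxReducers]. E.g. (per the class Javadoc example) with the default
        // 1000*1000*1000 bytes per reducer, a 3*1000*1000*1000-byte total input
        // yields ceil(3) = 3 reducers.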

        // if totalInputFileSize == -1, we couldn't get the input size so we can't estimate.
        if (totalInputFileSize == -1) {
            return -1;
        }

        int reducers = (int) Math.ceil((double) totalInputFileSize / bytesPerReducer);
        reducers = Math.max(1, reducers);
        reducers = Math.min(maxReducers, reducers);

        return reducers;
    }

    /**
     * Gets the input size for as many inputs as possible. Inputs whose loaders do not
     * report a size, and whose size Pig cannot look up itself, are excluded from the total.
     */
    static long getTotalInputFileSize(Configuration conf,
            List<POLoad> lds, Job job) throws IOException {
        long totalInputFileSize = 0;
        boolean foundSize = false;
        for (POLoad ld : lds) {
            long size = getInputSizeFromLoader(ld, job);
            if (size > -1) {
                foundSize = true;
            }
            if (size > 0) {
                totalInputFileSize += size;
                continue;
            }
            // the input file location might be a list of comma-separated files;
            // separate them out
            for (String location : LoadFunc.getPathStrings(ld.getLFile().getFileName())) {
                if (UriUtil.isHDFSFileOrLocalOrS3N(location)) {
                    Path path = new Path(location);
                    FileSystem fs = path.getFileSystem(conf);
                    FileStatus[] status = fs.globStatus(path);
                    if (status != null) {
                        for (FileStatus s : status) {
                            totalInputFileSize += Utils.getPathLength(fs, s);
                            foundSize = true;
                        }
                    }
                }
            }
        }
        return foundSize ? totalInputFileSize : -1;
    }

    /**
     * Get the total input size in bytes by looking at statistics provided by
     * loaders that implement {@link LoadMetadata}.
     *
     * @param ld load operator whose loader is queried for statistics
     * @param job job instance
     * @return total input size in bytes, or -1 if unknown or incomplete
     * @throws IOException on error
     */
    static long getInputSizeFromLoader(POLoad ld, Job job) throws IOException {
        if (ld.getLoadFunc() == null
                || !(ld.getLoadFunc() instanceof LoadMetadata)
                || ld.getLFile() == null
                || ld.getLFile().getFileName() == null) {
            return -1;
        }

        ResourceStatistics statistics;
        try {
            statistics = ((LoadMetadata) ld.getLoadFunc())
                    .getStatistics(ld.getLFile().getFileName(), job);
        } catch (Exception e) {
            log.warn("Couldn't get statistics from LoadFunc: " + ld.getLoadFunc(), e);
            return -1;
        }

        if (statistics == null || statistics.getSizeInBytes() == null) {
            return -1;
        }

        return statistics.getSizeInBytes();
    }
}
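
/*
 * Usage sketch (illustrative only; the property names come from the class
 * Javadoc above, but the concrete values are assumptions, not defaults):
 * the estimate can be tuned from a Pig script before the job is launched, e.g.
 *
 *     SET pig.exec.reducers.bytes.per.reducer 500000000;  -- 500MB of input per reducer
 *     SET pig.exec.reducers.max 100;                      -- cap at 100 reducer tasks
 *
 * With those settings, a 3*1000*1000*1000-byte total input would be estimated
 * at ceil(3000000000 / 500000000) = 6 reducers, well under the cap of 100.
 */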