/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.backend.hadoop.executionengine.mapReduceLayer;

import static org.apache.pig.PigConfiguration.TIME_UDFS_PROP;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.pig.LoadFunc;
import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.io.FileSpec;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.pig.tools.pigstats.PigStatsUtil;
import org.apache.pig.tools.pigstats.PigStatusReporter;

/**
 * A wrapper around the actual RecordReader and LoadFunc. It is needed for two
 * reasons:
 * 1) To intercept the initialize call from Hadoop and initialize the
 *    underlying actual RecordReader with the right Context object - this is
 *    achieved by looking up the Context corresponding to the input split this
 *    reader is supposed to process.
 * 2) To give Hadoop consistent key-value types - Text and Tuple respectively.
 *    PigRecordReader calls the underlying loader's getNext() to obtain the
 *    Tuple value; the key is always null Text, since the key is not used as
 *    input to map() in Pig.
 */
public class PigRecordReader extends RecordReader<Text, Tuple> {

    private static final Log LOG = LogFactory.getLog(PigRecordReader.class);

    private static final String TIMING_COUNTER = "approx_microsecs";
    private static final int TIMING_FREQ = 100;

    private transient String counterGroup = "";
    private boolean doTiming = false;

    /**
     * the current Tuple value as returned by the underlying
     * {@link LoadFunc#getNext()}
     */
    Tuple curValue = null;

    // the current wrapped RecordReader used by the loader
    @SuppressWarnings("unchecked")
    private RecordReader curReader;

    // the loader object
    private LoadFunc loadfunc;

    // the Hadoop counter for multi-input jobs
    private transient Counter inputRecordCounter = null;

    // the Hadoop counter name
    private transient String counterName = null;

    // the wrapped InputFormat
    private InputFormat inputformat;

    // the wrapped split (it may combine several underlying splits)
    private PigSplit pigSplit;

    // index of the wrapped split currently in use
    private int idx;

    // bytes of fully processed wrapped splits so far
    private long progress;

    private TaskAttemptContext context;

    // maximum number of records to read, or -1 for no limit
    private final long limit;

    private long recordCount = 0;

    /**
     * the Configuration object with data specific to the input the underlying
     * RecordReader will process (this is obtained after a
     * {@link LoadFunc#setLocation(String, org.apache.hadoop.mapreduce.Job)}
     * call and hence can contain specific properties the underlying
     * {@link InputFormat} might have put in).
     */
    private Configuration inputSpecificConf;

    /**
     * @param inputformat the wrapped InputFormat used to create the underlying
     *        RecordReaders
     * @param pigSplit the split to read, possibly wrapping several underlying splits
     * @param loadFunc the LoadFunc that turns the underlying records into Tuples
     * @param context the task attempt context
     * @param limit maximum number of records to read, or -1 for no limit
     */
    public PigRecordReader(InputFormat inputformat, PigSplit pigSplit,
            LoadFunc loadFunc, TaskAttemptContext context, long limit)
            throws IOException, InterruptedException {
        this.inputformat = inputformat;
        this.pigSplit = pigSplit;
        this.loadfunc = loadFunc;
        this.context = context;
        this.inputSpecificConf = context.getConfiguration();
        curReader = null;
        progress = 0;
        idx = 0;
        this.limit = limit;
        initNextRecordReader();
        counterGroup = loadFunc.toString();
        doTiming = context.getConfiguration().getBoolean(TIME_UDFS_PROP, false);
    }

    @Override
    public void close() throws IOException {
        if (curReader != null) {
            curReader.close();
            curReader = null;
        }
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        // In Pig the key is not used as input to map(), so return null
        return null;
    }

    @Override
    public Tuple getCurrentValue() throws IOException, InterruptedException {
        if (inputRecordCounter == null && counterName != null) {
            PigStatusReporter reporter = PigStatusReporter.getInstance();
            if (reporter != null) {
                inputRecordCounter = reporter.getCounter(
                        PigStatsUtil.MULTI_INPUTS_COUNTER_GROUP, counterName);
                LOG.info("Created input record counter: " + counterName);
            } else {
                LOG.warn("Got null reporter for " + counterName);
            }
        }
        // Increment the multi-input record counter
        if (inputRecordCounter != null && curValue != null) {
            inputRecordCounter.increment(1);
        }
        return curValue;
    }
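
    // Progress accounting: 'progress' accumulates the byte length of wrapped
    // splits that have been fully consumed (see initNextRecordReader()), while
    // getProgress() below adds the wrapped reader's fractional progress on the
    // in-flight split, scaled by that split's length, and normalizes by the
    // total length of the PigSplit.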
    @Override
    public float getProgress() throws IOException, InterruptedException {
        long subprogress = 0;    // bytes processed in the current wrapped split
        if (null != curReader) {
            // idx is always one past the current wrapped split's true index
            subprogress = (long) (curReader.getProgress() * pigSplit.getLength(idx - 1));
        }
        return Math.min(1.0f, (progress + subprogress) / (float) pigSplit.getLength());
    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        // initialize the underlying actual RecordReader with the right Context
        // object - this is achieved by merging the Context corresponding to
        // the input split this reader is supposed to process with the context
        // passed in
        this.pigSplit = (PigSplit) split;
        this.context = context;
        ConfigurationUtil.mergeConf(context.getConfiguration(), inputSpecificConf);
        // Pass the loader signature to the LoadFunc and to the InputFormat
        // through the conf
        PigInputFormat.passLoadSignature(loadfunc, pigSplit.getInputIndex(),
                context.getConfiguration());
        // now invoke initialize() on the underlying RecordReader with the
        // "adjusted" conf
        if (null != curReader) {
            curReader.initialize(pigSplit.getWrappedSplit(), context);
            loadfunc.prepareToRead(curReader, pigSplit);
        }

        if (pigSplit.isMultiInputs() && !pigSplit.disableCounter()) {
            counterName = getMultiInputsCounterName(pigSplit, inputSpecificConf);
        }
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (limit != -1 && recordCount >= limit) {
            return false;
        }
        boolean timeThis = doTiming && ((recordCount + 1) % TIMING_FREQ == 0);
        long startNanos = 0;
        if (timeThis) {
            startNanos = System.nanoTime();
        }
        // Keep pulling from the current reader; when it is exhausted, move on
        // to the next wrapped split until one yields a tuple or none are left.
        while ((curReader == null) || (curValue = loadfunc.getNext()) == null) {
            if (!initNextRecordReader()) {
                return false;
            }
        }
        if (timeThis) {
            PigStatusReporter.getInstance().getCounter(counterGroup, TIMING_COUNTER)
                    .increment(Math.round((System.nanoTime() - startNanos) / 1000) * TIMING_FREQ);
        }
        recordCount++;
        return true;
    }

    @SuppressWarnings("unchecked")
    private static String getMultiInputsCounterName(PigSplit pigSplit,
            Configuration conf) throws IOException {
        ArrayList<FileSpec> inputs =
                (ArrayList<FileSpec>) ObjectSerializer.deserialize(
                        conf.get(PigInputFormat.PIG_INPUTS));
        String fname = inputs.get(pigSplit.getInputIndex()).getFileName();
        return PigStatsUtil.getMultiInputsCounterName(fname, pigSplit.getInputIndex());
    }

    /**
     * Get the record reader for the next wrapped split in this PigSplit
     * (analogous to the next chunk of a CombineFileSplit).
     */
    protected boolean initNextRecordReader() throws IOException, InterruptedException {
        if (curReader != null) {
            curReader.close();
            curReader = null;
            if (idx > 0) {
                progress += pigSplit.getLength(idx - 1);    // done processing so far
            }
        }

        // if all wrapped splits have been processed, nothing more to do
        if (idx == pigSplit.getNumPaths()) {
            return false;
        }

        // get a record reader for the idx-th wrapped split
        try {
            pigSplit.setCurrentIdx(idx);
            curReader = inputformat.createRecordReader(pigSplit.getWrappedSplit(), context);
            LOG.info("Current split being processed " + pigSplit.getWrappedSplit());

            if (idx > 0) {
                // initialize() for the first RecordReader is called by the map
                // task; we are responsible for initializing subsequent ones
                curReader.initialize(pigSplit.getWrappedSplit(), context);
                loadfunc.prepareToRead(curReader, pigSplit);
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        idx++;
        return true;
    }
}
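
/*
 * Illustrative sketch only (not part of the class above): PigRecordReader is
 * normally created for the map task by PigInputFormat.createRecordReader()
 * rather than instantiated directly. Assuming a LoadFunc 'loadFunc', a
 * PigSplit 'pigSplit' and a TaskAttemptContext 'context' have already been
 * set up, the reader's lifecycle looks roughly like this (a limit of -1
 * means no record limit):
 *
 *   InputFormat wrappedInputFormat = loadFunc.getInputFormat();
 *   PigRecordReader reader = new PigRecordReader(
 *           wrappedInputFormat, pigSplit, loadFunc, context, -1);
 *   reader.initialize(pigSplit, context);
 *   while (reader.nextKeyValue()) {
 *       Text key = reader.getCurrentKey();       // always null in Pig
 *       Tuple value = reader.getCurrentValue();  // Tuple from LoadFunc.getNext()
 *   }
 *   reader.close();
 */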