/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.impl.io;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.pig.Expression;
import org.apache.pig.LoadCaster;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadMetadata;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.backend.hadoop.executionengine.shims.HadoopShims;
import org.apache.pig.data.SchemaTupleBackend;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.plan.OperatorKey;

/**
 * This is a wrapper Loader which wraps a real LoadFunc underneath and allows
 * reading a file completely starting at a given split (indicated by a split
 * index which is used to index into the List<InputSplit> returned by the
 * underlying InputFormat's getSplits() method). So if the supplied split index
 * is 0, this loader will read the entire file. If it is non-zero, it will read
 * the partial file beginning from that split to the last split.
 *
 * The call sequence to use this is:
 * 1) construct an object using one of the constructors
 * 2) call getNext() in a loop till it returns null
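 *
 * A minimal usage sketch (illustrative only; the wrapped PigStorage, the
 * Configuration and the input path below are placeholder values, not part of
 * this class):
 * <pre>{@code
 * LoadFunc loader = new ReadToEndLoader(new PigStorage(),
 *         new Configuration(), "/tmp/input", 0);
 * Tuple t;
 * while ((t = loader.getNext()) != null) {
 *     // process t
 * }
 * }</pre>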
 */
public class ReadToEndLoader extends LoadFunc implements LoadMetadata {

    /**
     * the wrapped LoadFunc which will do the actual reading
     */
    private LoadFunc wrappedLoadFunc;

    /**
     * the Configuration object used to locate the input location - this will
     * be used to call {@link LoadFunc#setLocation(String, Job)} on
     * the wrappedLoadFunc
     */
    private Configuration conf;

    /**
     * the input location string (typically input file/dir name)
     */
    private String inputLocation;

    /**
     * If the splits to be read are not an increasing sequence of integers,
     * this array can be used to list them explicitly
     */
    private int[] toReadSplits = null;

    /**
     * index into toReadSplits
     */
    private int toReadSplitsIdx = 0;

    /**
     * the index of the split the loader is currently reading from
     */
    private int curSplitIndex;

    /**
     * the input splits returned by underlying {@link InputFormat#getSplits(JobContext)}
     */
    private List<InputSplit> inpSplits = null;

    /**
     * underlying RecordReader
     */
    private RecordReader reader = null;

    /**
     * underlying InputFormat
     */
    private InputFormat inputFormat = null;

    private PigContext pigContext;

    private String udfContextSignature = null;

    /**
     * @param wrappedLoadFunc the LoadFunc which will do the actual reading
     * @param conf the Configuration used to locate the input
     * @param inputLocation the input location (typically a file/dir name)
     * @param splitIndex the index of the split to start reading from
     * @throws IOException
     */
    public ReadToEndLoader(LoadFunc wrappedLoadFunc, Configuration conf,
            String inputLocation, int splitIndex) throws IOException {
        this.wrappedLoadFunc = wrappedLoadFunc;
        this.inputLocation = inputLocation;
        this.conf = conf;
        this.curSplitIndex = splitIndex;
        init();
    }

    public ReadToEndLoader(LoadFunc wrappedLoadFunc, Configuration conf,
            String inputLocation, int splitIndex, PigContext pigContext) throws IOException {
        this.wrappedLoadFunc = wrappedLoadFunc;
        this.inputLocation = inputLocation;
        this.conf = conf;
        this.curSplitIndex = splitIndex;
        this.pigContext = pigContext;
        init();
    }

    public ReadToEndLoader(LoadFunc wrappedLoadFunc, Configuration conf,
            String inputLocation, int splitIndex, String signature) throws IOException {
        this.udfContextSignature = signature;
        this.wrappedLoadFunc = wrappedLoadFunc;
        this.inputLocation = inputLocation;
        this.conf = conf;
        this.curSplitIndex = splitIndex;
        init();
    }

    /**
     * This constructor takes an array of split indexes (toReadSplitIdxs) of
     * the splits to be read.
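     *
     * For example (a sketch with illustrative values; the wrapped PigStorage,
     * Configuration and path are placeholders), reading only the third and
     * fifth splits:
     * <pre>{@code
     * LoadFunc loader = new ReadToEndLoader(new PigStorage(),
     *         new Configuration(), "/tmp/input", new int[] {2, 4});
     * }</pre>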
     * @param wrappedLoadFunc the LoadFunc which will do the actual reading
     * @param conf the Configuration used to locate the input
     * @param inputLocation the input location (typically a file/dir name)
     * @param toReadSplitIdxs the indexes of the splits to be read
     * @throws IOException
     */
    public ReadToEndLoader(LoadFunc wrappedLoadFunc, Configuration conf,
            String inputLocation, int[] toReadSplitIdxs) throws IOException {
        this.wrappedLoadFunc = wrappedLoadFunc;
        this.inputLocation = inputLocation;
        this.toReadSplits = toReadSplitIdxs;
        this.conf = conf;
        this.curSplitIndex = toReadSplitIdxs.length > 0 ?
                toReadSplitIdxs[0] : Integer.MAX_VALUE;
        init();
    }

    @SuppressWarnings("unchecked")
    private void init() throws IOException {
        if (conf != null && pigContext != null) {
            SchemaTupleBackend.initialize(conf, pigContext, true);
        }
        // make a copy so that if the underlying InputFormat writes to the
        // conf, we don't affect the caller's copy
        conf = new Configuration(conf);
        // let's initialize the wrappedLoadFunc
        Job job = new Job(conf);
        wrappedLoadFunc.setUDFContextSignature(this.udfContextSignature);
        wrappedLoadFunc.setLocation(inputLocation, job);
        // The above setLocation call could write to the conf within
        // the job - get a hold of the modified conf
        conf = job.getConfiguration();
        inputFormat = wrappedLoadFunc.getInputFormat();
        try {
            inpSplits = inputFormat.getSplits(
                    HadoopShims.createJobContext(conf, new JobID()));
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
    }

    private boolean initializeReader() throws IOException, InterruptedException {
        if (curSplitIndex > inpSplits.size() - 1) {
            // past the last split, we are done
            return false;
        }
        if (reader != null) {
            reader.close();
        }
        InputSplit curSplit = inpSplits.get(curSplitIndex);
        TaskAttemptContext tAContext = HadoopShims.createTaskAttemptContext(conf,
                new TaskAttemptID());
        reader = inputFormat.createRecordReader(curSplit, tAContext);
        reader.initialize(curSplit, tAContext);
        // create a dummy pigsplit - other than the actual split, the other
        // params are really not needed here where we are just reading the
        // input completely
        PigSplit pigSplit = new PigSplit(new InputSplit[] {curSplit}, -1,
                new ArrayList<OperatorKey>(), -1);
        wrappedLoadFunc.prepareToRead(reader, pigSplit);
        return true;
    }

    @Override
    public Tuple getNext() throws IOException {
        try {
            Tuple t = null;
            if (reader == null) {
                // first call
                return getNextHelper();
            } else {
                // we already have a reader initialized
                t = wrappedLoadFunc.getNext();
                if (t != null) {
                    return t;
                }
                // if the loadfunc returned null, we need to read the next
                // split if there is one
                updateCurSplitIndex();
                return getNextHelper();
            }
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
    }

    private Tuple getNextHelper() throws IOException, InterruptedException {
        Tuple t = null;
        while (initializeReader()) {
            t = wrappedLoadFunc.getNext();
            if (t == null) {
                // try next split
                updateCurSplitIndex();
            } else {
                return t;
            }
        }
        return null;
    }

    /**
     * Updates curSplitIndex: just increment it if toReadSplits is null,
     * else move to the next split listed in toReadSplits.
     */
    private void updateCurSplitIndex() {
        if (toReadSplits == null) {
            ++curSplitIndex;
        } else {
            ++toReadSplitsIdx;
            if (toReadSplitsIdx >= toReadSplits.length) {
                // finished all the splits in the toReadSplits array
                curSplitIndex = Integer.MAX_VALUE;
            } else {
                curSplitIndex = toReadSplits[toReadSplitsIdx];
            }
        }
    }

    @Override
    public InputFormat getInputFormat() throws IOException {
        throw new UnsupportedOperationException();
    }

    @Override
    public LoadCaster getLoadCaster() throws IOException {
        throw new UnsupportedOperationException();
    }

    @Override
    public void prepareToRead(RecordReader reader, PigSplit split) {
        throw new UnsupportedOperationException();
    }

    @Override
    public void setLocation(String location, Job job) throws IOException {
        wrappedLoadFunc.setLocation(location, job);
    }

    @Override
    public ResourceSchema getSchema(String location, Job job) throws IOException {
        if (wrappedLoadFunc instanceof LoadMetadata) {
            return ((LoadMetadata) wrappedLoadFunc).getSchema(location, job);
        } else {
            return null;
        }
    }

    @Override
    public ResourceStatistics getStatistics(String location, Job job) throws IOException {
        if (wrappedLoadFunc instanceof LoadMetadata) {
            return ((LoadMetadata) wrappedLoadFunc).getStatistics(location, job);
        } else {
            return null;
        }
    }

    @Override
    public String[] getPartitionKeys(String location, Job job) throws IOException {
        if (wrappedLoadFunc instanceof LoadMetadata) {
            return ((LoadMetadata) wrappedLoadFunc).getPartitionKeys(location, job);
        } else {
            return null;
        }
    }

    @Override
    public void setPartitionFilter(Expression partitionFilter) throws IOException {
        if (wrappedLoadFunc instanceof LoadMetadata) {
            ((LoadMetadata) wrappedLoadFunc).setPartitionFilter(partitionFilter);
        }
    }

    @Override
    public void setUDFContextSignature(String signature) {
        this.udfContextSignature = signature;
    }
}