LoadFunc.java example

Explorer
Cloud-Stenography-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig;

import java.io.IOException;
import java.net.URL;
import java.util.Map;

import org.apache.pig.backend.datastorage.DataStorage;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.io.BufferedPositionedInputStream;
import org.apache.pig.impl.logicalLayer.schema.Schema;


/**
 * Contract for functions that parse records out of a dataset.  Besides
 * reading tuples, a loader also supplies the routines that cast raw byte data
 * into the various datatypes.  Those casts are exposed here, rather than being
 * performed during load, because loaders should, whenever possible, postpone
 * datatype casting until the last possible moment; other sections of the code
 * call back into the loader when a cast is actually required.
 */
public interface LoadFunc {

    /**
     * Attaches this loader to the portion of an input stream from which it
     * will read tuples.  The starting and ending offsets are not guaranteed to
     * fall on record boundaries, so the implementor must work out the actual
     * offsets in such a way that a file sliced at arbitrary points is still
     * processed in its entirety.
     * <p>
     * A common approach for a slice that begins in the middle of a record:
     * start at the given offset and, when that offset is not zero, skip to the
     * end of the first (possibly partial) record before reading tuples.  Then
     * keep reading until a tuple has been read that ends at an offset past the
     * ending offset.
     * <p>
     * <b>Implementations should not buffer the input stream.</b>  Buffering
     * makes the offsets returned by {@code is.getPos()} unreliable.
     *
     * @param fileName name of the file to be read
     * @param is stream representing the file to be processed; it can also
     *           report its position
     * @param offset offset at which to start reading tuples
     * @param end ending offset for reading
     * @throws IOException if an I/O error occurs
     */
    void bindTo(String fileName, BufferedPositionedInputStream is, long offset, long end)
            throws IOException;

    /**
     * Fetches the next tuple to be processed.
     *
     * @return the next tuple, or null once there are no more tuples to be
     *         processed
     * @throws IOException if an I/O error occurs
     */
    Tuple getNext() throws IOException;

    /**
     * Casts raw bytes to an integer value.
     *
     * @param b the byte array to cast
     * @return the resulting Integer
     * @throws IOException if the value cannot be cast
     */
    Integer bytesToInteger(byte[] b) throws IOException;

    /**
     * Casts raw bytes to a long value.
     *
     * @param b the byte array to cast
     * @return the resulting Long
     * @throws IOException if the value cannot be cast
     */
    Long bytesToLong(byte[] b) throws IOException;

    /**
     * Casts raw bytes to a float value.
     *
     * @param b the byte array to cast
     * @return the resulting Float
     * @throws IOException if the value cannot be cast
     */
    Float bytesToFloat(byte[] b) throws IOException;

    /**
     * Casts raw bytes to a double value.
     *
     * @param b the byte array to cast
     * @return the resulting Double
     * @throws IOException if the value cannot be cast
     */
    Double bytesToDouble(byte[] b) throws IOException;

    /**
     * Casts raw bytes to a chararray value.
     *
     * @param b the byte array to cast
     * @return the resulting String
     * @throws IOException if the value cannot be cast
     */
    String bytesToCharArray(byte[] b) throws IOException;

    /**
     * Casts raw bytes to a map value.
     *
     * @param b the byte array to cast
     * @return the resulting Map
     * @throws IOException if the value cannot be cast
     */
    Map<Object, Object> bytesToMap(byte[] b) throws IOException;

    /**
     * Casts raw bytes to a tuple value.
     *
     * @param b the byte array to cast
     * @return the resulting Tuple
     * @throws IOException if the value cannot be cast
     */
    Tuple bytesToTuple(byte[] b) throws IOException;

    /**
     * Casts raw bytes to a bag value.
     *
     * @param b the byte array to cast
     * @return the resulting bag
     * @throws IOException if the value cannot be cast
     */
    DataBag bytesToBag(byte[] b) throws IOException;

    /**
     * Tells the loader which fields will be needed.  This can help loaders
     * whose data is stored in a columnar format, where naming the columns to
     * be accessed ahead of time will save scans.  A loader that cannot make
     * use of this information is free to ignore it.
     *
     * @param schema schema indicating which columns will be needed
     */
    void fieldsToRead(Schema schema);

    /**
     * Asks the loader for the schema of the data.  This is called at parse
     * time (not run time) to see whether the loader can provide a schema,
     * which may be possible when the data is self-describing (e.g. JSON).  A
     * loader that cannot determine the schema can return null.
     * <p>
     * Implementations that need to open the input "fileName" can use
     * FileLocalizer.open(String fileName, ExecType execType, DataStorage storage)
     * to obtain an InputStream with which to initialize their loader
     * implementation, and can then read the input data through it to discover
     * the schema.  Note: this works only when fileName represents a file on
     * the local file system or the Hadoop file system.
     *
     * @param fileName name of the file to be read (the same as the file name
     *                 given in the "load" statement of the script)
     * @param execType execution mode of the pig script, one of ExecType.LOCAL
     *                 or ExecType.MAPREDUCE
     * @param storage the DataStorage object corresponding to the execType
     * @return a Schema describing the data if possible, or null otherwise
     * @throws IOException if an I/O error occurs
     */
    Schema determineSchema(String fileName, ExecType execType, DataStorage storage)
            throws IOException;
}