/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig;
import java.io.IOException;
import java.net.URL;
import java.util.Map;
import org.apache.pig.backend.datastorage.DataStorage;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.io.BufferedPositionedInputStream;
import org.apache.pig.impl.logicalLayer.schema.Schema;
/**
 * Contract for functions that parse records out of a dataset.
 * <p>
 * Besides record parsing, the interface exposes functions that cast raw byte
 * data into the various datatypes. The casts live on the loader because
 * loaders should, whenever possible, postpone casting until the last possible
 * moment (i.e. not cast at load time). Exposing the casts here lets other
 * sections of the code call back into the loader to perform them.
 */
public interface LoadFunc {

    /**
     * Binds the loader to the portion of an input stream from which tuples
     * are to be read.
     * <p>
     * The starting and ending offsets may not fall on record boundaries. It is
     * the implementor's job to work out the actual starting and ending offsets
     * in such a way that a file sliced up arbitrarily is still processed in
     * its entirety.
     * <p>
     * A common way of handling a slice that begins in the middle of a record
     * is to start at the given offset and, when that offset is not zero, skip
     * to the end of the first record (which may be a partial record) before
     * reading tuples. Reading then continues until a tuple has been read that
     * ends at an offset past the ending offset.
     * <p>
     * <b>The load function should not do any buffering on the input
     * stream</b>. Buffering makes the offsets returned by {@code is.getPos()}
     * unreliable.
     *
     * @param fileName the name of the file to be read
     * @param is the stream representing the file to be processed; it can also
     *           report its position
     * @param offset the offset at which to start reading tuples
     * @param end the ending offset for reading
     * @throws IOException if the loader fails with an I/O error
     */
    void bindTo(String fileName, BufferedPositionedInputStream is, long offset, long end) throws IOException;

    /**
     * Fetches the next tuple to be processed.
     *
     * @return the next tuple to be processed, or {@code null} when there are
     *         no more tuples to be processed
     * @throws IOException if the loader fails with an I/O error
     */
    Tuple getNext() throws IOException;

    /**
     * Casts data from bytes to an integer value.
     *
     * @param b byte array to be cast
     * @return the Integer value
     * @throws IOException if the value cannot be cast
     */
    Integer bytesToInteger(byte[] b) throws IOException;

    /**
     * Casts data from bytes to a long value.
     *
     * @param b byte array to be cast
     * @return the Long value
     * @throws IOException if the value cannot be cast
     */
    Long bytesToLong(byte[] b) throws IOException;

    /**
     * Casts data from bytes to a float value.
     *
     * @param b byte array to be cast
     * @return the Float value
     * @throws IOException if the value cannot be cast
     */
    Float bytesToFloat(byte[] b) throws IOException;

    /**
     * Casts data from bytes to a double value.
     *
     * @param b byte array to be cast
     * @return the Double value
     * @throws IOException if the value cannot be cast
     */
    Double bytesToDouble(byte[] b) throws IOException;

    /**
     * Casts data from bytes to a chararray value.
     *
     * @param b byte array to be cast
     * @return the String value
     * @throws IOException if the value cannot be cast
     */
    String bytesToCharArray(byte[] b) throws IOException;

    /**
     * Casts data from bytes to a map value.
     *
     * @param b byte array to be cast
     * @return the Map value
     * @throws IOException if the value cannot be cast
     */
    Map<Object, Object> bytesToMap(byte[] b) throws IOException;

    /**
     * Casts data from bytes to a tuple value.
     *
     * @param b byte array to be cast
     * @return the Tuple value
     * @throws IOException if the value cannot be cast
     */
    Tuple bytesToTuple(byte[] b) throws IOException;

    /**
     * Casts data from bytes to a bag value.
     *
     * @param b byte array to be cast
     * @return the Bag value
     * @throws IOException if the value cannot be cast
     */
    DataBag bytesToBag(byte[] b) throws IOException;

    /**
     * Tells the loader which fields will be needed.
     * <p>
     * This can be useful for loaders that access data stored in a columnar
     * format, where indicating the columns to be accessed ahead of time will
     * save scans. A loader that cannot make use of this information is free
     * to ignore it.
     *
     * @param schema Schema indicating which columns will be needed
     */
    void fieldsToRead(Schema schema);

    /**
     * Asks the loader for the schema of the data.
     * <p>
     * This function is called at parse time (not run time) to see whether the
     * loader can provide a schema for the data. The loader may be able to do
     * so when the data is self describing (e.g. JSON). A loader that cannot
     * determine the schema can return {@code null}.
     * <p>
     * Implementations which need to open the input {@code fileName} can use
     * {@code FileLocalizer.open(String fileName, ExecType execType, DataStorage storage)}
     * to get an InputStream with which to initialize their loader
     * implementation, and can then read the input data through it to discover
     * the schema. Note: this works only when {@code fileName} represents a
     * file on the Local File System or the Hadoop file system.
     *
     * @param fileName name of the file to be read (this will be the same as
     *                 the filename in the "load" statement of the script)
     * @param execType execution mode of the pig script - one of
     *                 ExecType.LOCAL or ExecType.MAPREDUCE
     * @param storage the DataStorage object corresponding to the execType
     * @return a Schema describing the data if possible, or {@code null}
     *         otherwise
     * @throws IOException if the loader fails with an I/O error
     */
    Schema determineSchema(String fileName, ExecType execType, DataStorage storage) throws IOException;
}