/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pig; import java.io.IOException; import java.net.URI; import java.util.AbstractCollection; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.Counter; import org.apache.hadoop.mapreduce.InputFormat; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.pig.classification.InterfaceAudience; import org.apache.pig.classification.InterfaceStability; import org.apache.pig.LoadPushDown.RequiredFieldList; import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit; import org.apache.pig.builtin.Utf8StorageConverter; import org.apache.pig.data.Tuple; import org.apache.pig.impl.logicalLayer.FrontendException; import org.apache.pig.impl.util.UDFContext; import org.apache.pig.tools.pigstats.PigStatusReporter; /** * A LoadFunc loads data into Pig. It can read from an HDFS file or other source. * LoadFunc is tightly coupled to Hadoop's {@link org.apache.hadoop.mapreduce.InputFormat}. * LoadFunc's sit atop an InputFormat and translate from the keys and values of Hadoop * to Pig's tuples. * <p> * LoadFunc contains the basic features needed by the majority of load functions. For * more advanced functionality there are separate interfaces that a load function * can implement. See {@link LoadCaster}, {@link LoadMetadata}, {@link LoadPushDown}, * {@link OrderedLoadFunc}, {@link CollectableLoadFunc}, and {@link IndexableLoadFunc}. */ @InterfaceAudience.Public @InterfaceStability.Stable public abstract class LoadFunc { /** * This method is called by the Pig runtime in the front end to convert the * input location to an absolute path if the location is relative. The * loadFunc implementation is free to choose how it converts a relative * location to an absolute location since this may depend on what the location * string represent (hdfs path or some other data source) * * @param location location as provided in the "load" statement of the script * @param curDir the current working direction based on any "cd" statements * in the script before the "load" statement. If there are no "cd" statements * in the script, this would be the home directory - * <pre>/user/<username> </pre> * @return the absolute location based on the arguments passed * @throws IOException if the conversion is not possible */ public String relativeToAbsolutePath(String location, Path curDir) throws IOException { return getAbsolutePath(location, curDir); } /** * Communicate to the loader the location of the object(s) being loaded. * The location string passed to the LoadFunc here is the return value of * {@link LoadFunc#relativeToAbsolutePath(String, Path)}. Implementations * should use this method to communicate the location (and any other information) * to its underlying InputFormat through the Job object. * * This method will be called in the frontend and backend multiple times. Implementations * should bear in mind that this method is called multiple times and should * ensure there are no inconsistent side effects due to the multiple calls. * * @param location Location as returned by * {@link LoadFunc#relativeToAbsolutePath(String, Path)} * @param job the {@link Job} object * store or retrieve earlier stored information from the {@link UDFContext} * @throws IOException if the location is not valid. */ public abstract void setLocation(String location, Job job) throws IOException; /** * This will be called during planning on the front end. This is the * instance of InputFormat (rather than the class name) because the * load function may need to instantiate the InputFormat in order * to control how it is constructed. * @return the InputFormat associated with this loader. * @throws IOException if there is an exception during InputFormat * construction */ @SuppressWarnings("unchecked") public abstract InputFormat getInputFormat() throws IOException; /** * This will be called on the front end during planning and not on the back * end during execution. * @return the {@link LoadCaster} associated with this loader. Returning null * indicates that casts from byte array are not supported for this loader. * construction * @throws IOException if there is an exception during LoadCaster */ public LoadCaster getLoadCaster() throws IOException { return new Utf8StorageConverter(); } /** * Initializes LoadFunc for reading data. This will be called during execution * before any calls to getNext. The RecordReader needs to be passed here because * it has been instantiated for a particular InputSplit. * @param reader {@link RecordReader} to be used by this instance of the LoadFunc * @param split The input {@link PigSplit} to process * @throws IOException if there is an exception during initialization */ @SuppressWarnings("unchecked") public abstract void prepareToRead(RecordReader reader, PigSplit split) throws IOException; /** * Retrieves the next tuple to be processed. Implementations should NOT reuse * tuple objects (or inner member objects) they return across calls and * should return a different tuple object in each call. * @return the next tuple to be processed or null if there are no more tuples * to be processed. * @throws IOException if there is an exception while retrieving the next * tuple */ public abstract Tuple getNext() throws IOException; //------------------------------------------------------------------------ /** * Join multiple strings into a string delimited by the given delimiter. * * @param s a collection of strings * @param delimiter the delimiter * @return a 'delimiter' separated string */ public static String join(AbstractCollection<String> s, String delimiter) { if (s.isEmpty()) return ""; Iterator<String> iter = s.iterator(); StringBuffer buffer = new StringBuffer(iter.next()); while (iter.hasNext()) { buffer.append(delimiter); buffer.append(iter.next()); } return buffer.toString(); } /** * Parse comma separated path strings into a string array. This method * escapes commas in the Hadoop glob pattern of the given paths. * * This method is borrowed from * {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat}. A jira * (MAPREDUCE-1205) is opened to make the same name method there * accessible. We'll use that method directly when the jira is fixed. * * @param commaSeparatedPaths a comma separated string * @return an array of path strings */ public static String[] getPathStrings(String commaSeparatedPaths) { int length = commaSeparatedPaths.length(); int curlyOpen = 0; int pathStart = 0; boolean globPattern = false; List<String> pathStrings = new ArrayList<String>(); for (int i=0; i<length; i++) { char ch = commaSeparatedPaths.charAt(i); switch(ch) { case '{' : { curlyOpen++; if (!globPattern) { globPattern = true; } break; } case '}' : { curlyOpen--; if (curlyOpen == 0 && globPattern) { globPattern = false; } break; } case ',' : { if (!globPattern) { pathStrings.add(commaSeparatedPaths.substring(pathStart, i)); pathStart = i + 1 ; } break; } } } pathStrings.add(commaSeparatedPaths.substring(pathStart, length)); return pathStrings.toArray(new String[0]); } /** * Construct the absolute path from the file location and the current * directory. The current directory is either of the form * {code}hdfs://<nodename>:<nodeport>/<directory>{code} in Hadoop * MapReduce mode, or of the form * {code}file:///<directory>{code} in Hadoop local mode. * * @param location the location string specified in the load statement * @param curDir the current file system directory * @return the absolute path of file in the file system * @throws FrontendException if the scheme of the location is incompatible * with the scheme of the file system */ public static String getAbsolutePath(String location, Path curDir) throws FrontendException { if (location == null || curDir == null) { throw new FrontendException( "location: " + location + " curDir: " + curDir); } URI fsUri = curDir.toUri(); String fsScheme = fsUri.getScheme(); if (fsScheme == null) { throw new FrontendException("curDir: " + curDir); } fsScheme = fsScheme.toLowerCase(); String authority = fsUri.getAuthority(); if(authority == null) { authority = ""; } Path rootDir = new Path(fsScheme, authority, "/"); ArrayList<String> pathStrings = new ArrayList<String>(); String[] fnames = getPathStrings(location); for (String fname: fnames) { // remove leading/trailing whitespace(s) fname = fname.trim(); Path p = new Path(fname); URI uri = p.toUri(); // if the supplied location has a scheme (i.e. uri is absolute) or // an absolute path, just use it. if(! (uri.isAbsolute() || p.isAbsolute())) { String scheme = uri.getScheme(); if (scheme != null) { scheme = scheme.toLowerCase(); } if (scheme != null && !scheme.equals(fsScheme)) { throw new FrontendException("Incompatible file URI scheme: " + scheme + " : " + fsScheme); } String path = uri.getPath(); fname = (p.isAbsolute()) ? new Path(rootDir, path).toString() : new Path(curDir, path).toString(); } fname = fname.replaceFirst("^file:/([^/])", "file:///$1"); // remove the trailing / fname = fname.replaceFirst("/$", ""); pathStrings.add(fname); } return join(pathStrings, ","); } /** * This method will be called by Pig both in the front end and back end to * pass a unique signature to the {@link LoadFunc}. The signature can be used * to store into the {@link UDFContext} any information which the * {@link LoadFunc} needs to store between various method invocations in the * front end and back end. A use case is to store {@link RequiredFieldList} * passed to it in {@link LoadPushDown#pushProjection(RequiredFieldList)} for * use in the back end before returning tuples in {@link LoadFunc#getNext()}. * This method will be call before other methods in {@link LoadFunc} * @param signature a unique signature to identify this LoadFunc */ public void setUDFContextSignature(String signature) { // default implementation is a no-op } /** * Issue a warning. Warning messages are aggregated and reported to * the user. * @param msg String message of the warning * @param warningEnum type of warning */ public final void warn(String msg, Enum warningEnum) { Counter counter = PigStatusReporter.getInstance().getCounter(warningEnum); if (counter!=null) counter.increment(1); } }