TableInputFormat.java example

Explorer
flare-spork-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.hadoop.zebra.mapred;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.zebra.tfile.RawComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.InvalidInputException;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.zebra.io.BasicTable;
import org.apache.hadoop.zebra.io.BasicTableStatus;
import org.apache.hadoop.zebra.io.BlockDistribution;
import org.apache.hadoop.zebra.io.KeyDistribution;
import org.apache.hadoop.zebra.io.BasicTable.Reader;
import org.apache.hadoop.zebra.io.BasicTable.Reader.RangeSplit;
import org.apache.hadoop.zebra.io.BasicTable.Reader.RowSplit;
import org.apache.hadoop.zebra.mapred.TableExpr.LeafTableInfo;
import org.apache.hadoop.zebra.parser.ParseException;
import org.apache.hadoop.zebra.types.Projection;
import org.apache.hadoop.zebra.schema.Schema;
import org.apache.hadoop.zebra.types.SortInfo;
import org.apache.hadoop.zebra.tfile.TFile;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;

/**
 * {@link org.apache.hadoop.mapred.InputFormat} class for reading one or more
 * BasicTables.
 * 
 * Usage Example:
 * <p>
 * In the main program, add the following code.
 * 
 * <pre>
 * jobConf.setInputFormat(TableInputFormat.class);
 * TableInputFormat.setInputPaths(jobConf, new Path("path/to/table1"), new Path("path/to/table2"));
 * TableInputFormat.setProjection(jobConf, "Name, Salary, BonusPct");
 * </pre>
 * 
 * The above code does the following things:
 * <UL>
 * <LI>Set the input format class to TableInputFormat.
 * <LI>Set the paths to the BasicTables to be consumed by user's Mapper code.
 * <LI>Set the projection on the input tables. In this case, the Mapper code is
 * only interested in three fields: "Name", "Salary", "BonusPct"
 * (perhaps for the purpose of calculating the person's total payout). If no
 * projection is specified, then all columns from the input tables will be
 * retrieved. If input tables have different schemas, then the input contains
 * the union of all columns from all the input tables. Absent fields will be
 * left as null in the input tuple.
 * </UL>
 * The user Mapper code should look like the following:
 * 
 * <pre>
 * static class MyMapClass implements Mapper<BytesWritable, Tuple, K, V> {
 *   // keep the tuple object for reuse.
 *   // indices of various fields in the input Tuple.
 *   int idxName, idxSalary, idxBonusPct;
 * 
 *   @Override
 *   public void configure(JobConf job) {
 *     Schema projection = TableInputFormat.getProjection(job);
 *     // determine the field indices.
 *     idxName = projection.getColumnIndex("Name");
 *     idxSalary = projection.getColumnIndex("Salary");
 *     idxBonusPct = projection.getColumnIndex("BonusPct");
 *   }
 * 
 *   @Override
 *   public void map(BytesWritable key, Tuple value, OutputCollector<K, V> output,
 *       Reporter reporter) throws IOException {
 *     try {
 *       String name = (String) value.get(idxName);
 *       int salary = (Integer) value.get(idxSalary);
 *       double bonusPct = (Double) value.get(idxBonusPct);
 *       // do something with the input data
 *     } catch (ExecException e) {
 *       e.printStackTrace();
 *     }
 *   }
 * 
 *   @Override
 *   public void close() throws IOException {
 *     // no-op
 *   }
 * }
 * </pre>
 * 
 * A little bit more explanation on the PIG {@link Tuple} objects. A Tuple is an
 * ordered list of PIG datum objects. The permitted PIG datum types can be
 * categorized as Scalar types and Composite types.
 * <p>
 * Supported Scalar types include seven native Java types: Boolean, Byte,
 * Integer, Long, Float, Double, String, as well as one PIG class called
 * {@link DataByteArray} that represents type-less byte array.
 * <p>
 * Supported Composite types include:
 * <UL>
 * <LI>{@link Map} : It is the same as Java Map class, with the additional
 * restriction that the key-type must be one of the scalar types PIG recognizes,
 * and the value-type any of the scalar or composite types PIG understands.
 * <LI>{@link DataBag} : A DataBag is a collection of Tuples.
 * <LI>{@link Tuple} : Yes, Tuple itself can be a datum in another Tuple.
 * </UL>
 * 
 * @deprecated Use {@link org.apache.hadoop.zebra.mapreduce.TableInputFormat} instead
 */
@Deprecated
public class TableInputFormat implements InputFormat<BytesWritable, Tuple> {
  // Logger shared by this class and its nested helpers.
  static Log LOG = LogFactory.getLog(TableInputFormat.class);
  
  // JobConf key holding the encoded input table expression (see setInputExpr).
  public static final String INPUT_EXPR = "mapred.lib.table.input.expr";
  // JobConf key holding the normalized projection string (see setProjection).
  public static final String INPUT_PROJ = "mapred.lib.table.input.projection";
  // JobConf boolean key; true once requireSortedTable has been called.
  public static final String INPUT_SORT = "mapred.lib.table.input.sort";
  // JobConf key set to "true" by getSplits.
  // NOTE(review): presumably marks that split computation ran on the front end -- confirm.
  public static final String INPUT_FE = "mapred.lib.table.input.fe";
  // JobConf key holding the deleted column groups reported by each leaf
  // table reader, joined with DELETED_CG_SEPARATOR_PER_UNION (see getSplits).
  public static final String INPUT_DELETED_CGS = "mapred.lib.table.input.deleted_cgs";
  // Separator placed between the per-table entries of INPUT_DELETED_CGS.
  static final String DELETED_CG_SEPARATOR_PER_UNION = ";";

  /**
   * Configures the job to read from the given BasicTable paths.
   * 
   * @param conf
   *          JobConf object that receives the encoded input expression.
   * @param paths
   *          one or more paths to BasicTables. A single path is stored as a
   *          plain table expression; several paths are combined, in the
   *          order given, into a union expression so that the InputFormat
   *          produces splits on the "union" of these BasicTables.
   * @throws IllegalArgumentException
   *           if no path is supplied.
   */
  public static void setInputPaths(JobConf conf, Path... paths) {
    if (paths.length < 1) {
      throw new IllegalArgumentException("Requring at least one input path");
    }

    final TableExpr inputExpr;
    if (paths.length == 1) {
      inputExpr = new BasicTableExpr(paths[0]);
    } else {
      TableUnionExpr union = new TableUnionExpr();
      for (int i = 0; i < paths.length; i++) {
        union.add(new BasicTableExpr(paths[i]));
      }
      inputExpr = union;
    }
    setInputExpr(conf, inputExpr);
  }
  
  /**
   * Stores a table expression in the JobConf under {@link #INPUT_EXPR}.
   * 
   * @param conf
   *          JobConf object to update.
   * @param expr
   *          The input table expression; it is written in the textual form
   *          produced by its own <code>encode</code> method.
   */
  static void setInputExpr(JobConf conf, TableExpr expr) {
    StringBuilder encoded = new StringBuilder();
    expr.encode(encoded);
    String serialized = encoded.toString();
    conf.set(INPUT_EXPR, serialized);
  }

  /**
   * Reads the input table expression from the JobConf.
   * <p>
   * If no expression has been stored under {@link #INPUT_EXPR}, the method
   * falls back to the job's regular input paths and, when at least one is
   * configured, stores an expression built from them before parsing it.
   * 
   * @param conf
   *          JobConf object; may be updated with the derived expression.
   * @return the parsed table expression.
   * @throws IOException
   *           if the expression cannot be parsed.
   * @throws IllegalArgumentException
   *           if neither an input expression nor any input path is defined.
   */
  static TableExpr getInputExpr(JobConf conf) throws IOException {
    String expr = conf.get(INPUT_EXPR);
    if (expr == null) {
      // try setting from input path
      Path[] paths = FileInputFormat.getInputPaths(conf);
      // An unconfigured job yields an empty array rather than null; skip the
      // fallback in that case so the caller gets the accurate
      // "Input expression not defined." error below.
      if (paths != null && paths.length > 0) {
        setInputPaths(conf, paths);
        expr = conf.get(INPUT_EXPR);
      }
    }

    if (expr == null) {
      throw new IllegalArgumentException("Input expression not defined.");
    }
    return TableExpr.parse(new StringReader(expr));
  }
  
  /**
   * Returns the schema of the input table expression configured in the job.
   * 
   * @param conf
   *          JobConf object.
   * @return the schema reported by the input table expression.
   * @throws IOException
   *           if the input expression or its schema cannot be read.
   */
  public static Schema getSchema(JobConf conf) throws IOException {
    return getInputExpr(conf).getSchema(conf);
  }
  
  /**
   * Stores the input projection, given as a string, in the JobConf.
   * <p>
   * The string is normalized with {@link Schema#normalize(String)} before it
   * is written under {@link #INPUT_PROJ}; it is not checked against the input
   * schema.
   * 
   * @param conf
   *          JobConf object.
   * @param projection
   *          A comma separated list of column names. If we want select all
   *          columns, pass projection==null. The syntax of the projection
   *          conforms to the {@link Schema} string.
   * @throws ParseException
   *           if the projection string cannot be normalized.
   * @deprecated Use {@link #setProjection(JobConf, ZebraProjection)} instead.
   */
  public static void setProjection(JobConf conf, String projection) throws ParseException {
    String normalized = Schema.normalize(projection);
    conf.set(INPUT_PROJ, normalized);
  }
  
  /**
   * Validates a projection against the input schema and stores it in the
   * JobConf under {@link #INPUT_PROJ}.
   * 
   * @param conf
   *          JobConf object; the input expression must already be resolvable
   *          from it because the projection is checked against its schema.
   * @param projection
   *          The projection to apply. Its string form is normalized with
   *          {@link Schema#normalize(String)} before validation.
   * @throws ParseException
   *           if the projection is not valid for the input schema, or if the
   *           input schema cannot be read. The underlying failure is
   *           attached as the cause.
   */
  public static void setProjection(JobConf conf, ZebraProjection projection) throws ParseException {
    String normalizedProjectionString = Schema.normalize(projection.toString());
    try {
      /* validity check on projection: constructing a Projection parses the
       * string against the schema and throws if it does not fit. */
      Schema schema = getSchema(conf);
      new org.apache.hadoop.zebra.types.Projection(schema, normalizedProjectionString);
    } catch (ParseException e) {
      throw invalidProjection(projection, e);
    } catch (IOException e) {
      throw invalidProjection(projection, e);
    }

    conf.set(INPUT_PROJ, normalizedProjectionString);
  }

  /** Builds the "not a valid projection" error while keeping the original failure as its cause. */
  private static ParseException invalidProjection(ZebraProjection projection, Exception cause) {
    ParseException failure = new ParseException(
        "[" + projection + "] " + "is not a valid Zebra projection string " + cause.getMessage());
    failure.initCause(cause);
    return failure;
  }

  /**
   * Get the projection from the JobConf.
   * 
   * @param conf
   *          The JobConf object
   * @return The projection string stored under {@link #INPUT_PROJ} if there
   *         is one; otherwise the projection string derived from the schema
   *         of the input table expression.
   * @throws IOException
   *           if the input expression or its schema cannot be read.
   * @throws ParseException
   *           declared for callers; see the individual calls for its origin.
   */
  public static String getProjection(JobConf conf) throws IOException, ParseException {
    String configured = conf.get(INPUT_PROJ);
    // TODO: need to be revisited
    if (configured != null) {
      return configured;
    }
    TableExpr inputExpr = getInputExpr(conf);
    return (inputExpr == null) ? null : inputExpr.getSchema(conf).toProjectionString();
  }
      
  /**
   * Records in the JobConf that sorted input is required, by setting
   * {@link #INPUT_SORT} to true.
   *
   * @param conf
   *          JobConf object.
   */
  private static void setSorted(JobConf conf) {
    final boolean sortedRequired = true;
    conf.setBoolean(INPUT_SORT, sortedRequired);
  }
  
  /**
   * Get the SortInfo object regarding a Zebra table or table union.
   * <p>
   * For a single table the table's own SortInfo is returned. For a union,
   * every leaf table is opened and the number of sort columns is compared;
   * the SortInfo of the first leaf is returned when all leaves agree. An
   * unsorted leaf counts as having zero sort columns, so a union of only
   * unsorted tables yields null, and a mix of sorted and unsorted tables is
   * rejected.
   *
   * @param conf
   *          JobConf object
   * @return the zebra tables's SortInfo; null if the table is unsorted.
   * @throws IOException
   *           if a table cannot be read, or if the tables of a union do not
   *           share the same sort property.
   */
  public static SortInfo getSortInfo(JobConf conf) throws IOException {
    TableExpr expr = getInputExpr(conf);
    if (expr instanceof BasicTableExpr) {
      BasicTable.Reader reader = new BasicTable.Reader(((BasicTableExpr) expr).getPath(), conf);
      try {
        return reader.getSortInfo();
      } finally {
        // always release the reader, even when reading the sort info fails
        reader.close();
      }
    }

    SortInfo result = null;
    int sortSize = 0;
    boolean first = true;
    for (LeafTableInfo leaf : expr.getLeafTables(null)) {
      SortInfo sortInfo;
      BasicTable.Reader reader = new BasicTable.Reader(leaf.getPath(), conf);
      try {
        sortInfo = reader.getSortInfo();
      } finally {
        reader.close();
      }
      // an unsorted leaf has no SortInfo; treat it as zero sort columns
      int size = (sortInfo == null) ? 0 : sortInfo.size();
      if (first) {
        // the first leaf is the reference all other leaves are compared to
        first = false;
        sortSize = size;
        result = sortInfo;
      } else if (sortSize != size) {
        throw new IOException("Tables of the table union do not possess the same sort property.");
      }
    }
    return result;
  }

  /**
   * Requires sorted table or table union.
   * <p>
   * Every input table is opened and its sort information is checked. When
   * <code>sortInfo</code> is given, each table must match its sort columns
   * and comparator. When it is null, a single table only has to be sorted,
   * while the tables of a union must all match the sort columns and
   * comparator of the first table. On success the job is flagged as needing
   * sorted (key range) splits.
   * 
   * @param conf
   *          JobConf object.
   * @param sortInfo
   *          ZebraSortInfo object containing sorting information; may be
   *          null.
   * @throws IOException
   *           if a table cannot be read, is not sorted, or is not sorted as
   *           required.
   * @throws IllegalArgumentException
   *           if <code>sortInfo</code> is given but names no sort columns.
   */
  public static void requireSortedTable(JobConf conf, ZebraSortInfo sortInfo) throws IOException {
    TableExpr expr = getInputExpr(conf);
    String comparatorName = null;
    String[] sortcolumns = null;
    if (sortInfo != null) {
      comparatorName = TFile.COMPARATOR_JCLASS + sortInfo.getComparator();
      String sortColumnNames = sortInfo.getSortColumns();
      if (sortColumnNames != null) {
        sortcolumns = sortColumnNames.trim().split(SortInfo.SORTED_COLUMN_DELIMITER);
      }
      if (sortcolumns == null) {
        throw new IllegalArgumentException("No sort columns specified.");
      }
    }

    if (expr instanceof BasicTableExpr) {
      SortInfo mySortInfo;
      BasicTable.Reader reader = new BasicTable.Reader(((BasicTableExpr) expr).getPath(), conf);
      try {
        mySortInfo = reader.getSortInfo();
      } finally {
        // release the reader even when reading the sort info fails
        reader.close();
      }
      if (mySortInfo == null) {
        throw new IOException("The table is not sorted");
      }
      if (comparatorName == null) {
        // cheat the equals method's comparator comparison
        comparatorName = mySortInfo.getComparator();
      }
      if (sortcolumns != null && !mySortInfo.equals(sortcolumns, comparatorName)) {
        throw new IOException("The table is not properly sorted");
      }
    } else {
      for (LeafTableInfo leaf : expr.getLeafTables(null)) {
        SortInfo mySortInfo;
        BasicTable.Reader reader = new BasicTable.Reader(leaf.getPath(), conf);
        try {
          mySortInfo = reader.getSortInfo();
        } finally {
          reader.close();
        }
        if (mySortInfo == null) {
          throw new IOException("The table is not sorted");
        }
        if (comparatorName == null) {
          // use the first table's comparator as comparison base
          comparatorName = mySortInfo.getComparator();
        }
        if (sortcolumns == null) {
          // no explicit requirement: the first table defines what the
          // remaining tables of the union are compared against
          sortcolumns = mySortInfo.getSortColumnNames();
          comparatorName = mySortInfo.getComparator();
        } else if (!mySortInfo.equals(sortcolumns, comparatorName)) {
          throw new IOException("The table is not properly sorted");
        }
      }
    }
    // need key range input splits for sorted table union
    setSorted(conf);
  }
  
  /**
   * Tells whether sorted input has been required for this job.
   *
   * @param conf
   *          JobConf object.
   * @return the value of {@link #INPUT_SORT}, or false when it is not set.
   */
  private static boolean getSorted(JobConf conf) {
    final boolean sortedRequired = conf.getBoolean(INPUT_SORT, false);
    return sortedRequired;
  }

  /**
   * Creates the record reader for one split.
   * <p>
   * If no projection is stored in the JobConf, the projection derived from
   * the input schema is used and written back to the JobConf.
   * 
   * @throws IOException
   *           if the input expression is missing or unreadable, or if the
   *           projection cannot be parsed (the ParseException is kept as the
   *           cause).
   * @see InputFormat#getRecordReader(InputSplit, JobConf, Reporter)
   */
  @Override
  public RecordReader<BytesWritable, Tuple> getRecordReader(InputSplit split,
      JobConf conf, Reporter reporter) throws IOException {
    TableExpr expr = getInputExpr(conf);
    if (expr == null) {
      throw new IOException("Table expression not defined");
    }

    if (getSorted(conf)) {
      expr.setSortedSplit();
    }

    String projection = conf.get(INPUT_PROJ);
    try {
      if (projection == null) {
        // no explicit projection: read every column of the input schema
        projection = expr.getSchema(conf).toProjectionString();
        TableInputFormat.setProjection(conf, projection);
      }
    } catch (ParseException e) {
      throw new IOException("Projection parsing failed : " + e.getMessage(), e);
    }

    try {
      return new TableRecordReader(expr, projection, split, conf);
    } catch (ParseException e) {
      throw new IOException("Projection parsing faile : " + e.getMessage(), e);
    }
  }
  
  /**
   * Get a TableRecordReader on a single split.
   * 
   * @param conf
   *          JobConf object.
   * @param projection
   *          comma-separated column names in projection. null means all columns in projection
   * @return a record reader over the first split produced when a single
   *         split is requested.
   * @throws IOException
   *           if the splits or the reader cannot be created, or if the input
   *           yields no split at all.
   * @throws ParseException
   *           if the projection string cannot be normalized.
   */
  public static TableRecordReader getTableRecordReader(JobConf conf, String projection) throws IOException, ParseException
  {
    // a single split is needed
    if (projection != null) {
      setProjection(conf, projection);
    }
    TableInputFormat inputFormat = new TableInputFormat();
    InputSplit[] splits = inputFormat.getSplits(conf, 1);
    // getSplits may legitimately return no split (e.g. no readable table);
    // report that explicitly instead of failing on splits[0].
    if (splits.length == 0) {
      throw new IOException("No input split available to create a TableRecordReader");
    }
    return (TableRecordReader) inputFormat.getRecordReader(splits[0], conf, Reporter.NULL);
  }

  /**
   * Builds key-range splits over sorted tables.
   * <p>
   * The requested split count is first reconciled with the total table size
   * and the minimum split size. A count of exactly one yields a single
   * unbounded split; otherwise the key distributions of all readers are
   * merged and one split is produced per key range.
   *
   * @param conf
   *          JobConf object.
   * @param numSplits
   *          requested number of splits.
   * @param expr
   *          the input table expression.
   * @param readers
   *          one reader per leaf table; all of them must be sorted.
   * @param status
   *          one status per leaf table, used to total the input size.
   * @return the sorted splits.
   * @throws IOException
   *           if sorted splits cannot be produced for the expression or a
   *           table is not sorted.
   */
  private static InputSplit[] getSortedSplits(JobConf conf, int numSplits,
      TableExpr expr, List<BasicTable.Reader> readers,
      List<BasicTableStatus> status) throws IOException {

    if (expr.sortedSplitRequired() && !expr.sortedSplitCapable()) {
      throw new IOException("Unable to created sorted splits");
    }

    long totalBytes = 0;
    for (BasicTableStatus tableStatus : status) {
      totalBytes += tableStatus.getSize();
    }

    // 1 means "a single split"; -1 lets the key distribution decide.
    long maxSplits = totalBytes / getMinSplitSize(conf);
    if (maxSplits == 0) {
      numSplits = 1;
    } else if (numSplits > maxSplits) {
      numSplits = -1;
    }

    for (BasicTable.Reader reader : readers) {
      if (!reader.isSorted()) {
        throw new IOException("Attempting sorted split on unsorted table");
      }
    }

    if (numSplits == 1) {
      BlockDistribution total = null;
      for (BasicTable.Reader reader : readers) {
        total = BlockDistribution.sum(total, reader.getBlockDistribution((RangeSplit) null));
      }
      return new InputSplit[] { new SortedTableSplit(null, null, total, conf) };
    }

    // TODO: Does it make sense to interleave keys for all leaf tables if
    // numSplits <= 0 ?
    int nLeaves = readers.size();
    BlockDistribution lastBd = new BlockDistribution();
    KeyDistribution[] btKds = new KeyDistribution[nLeaves];
    for (int i = 0; i < nLeaves; ++i) {
      int requested = (numSplits <= 0) ? -1
          : Math.max(numSplits * 5 / nLeaves, numSplits);
      btKds[i] = readers.get(i).getKeyDistribution(requested, nLeaves, lastBd);
    }

    KeyDistribution keyDistri = KeyDistribution.merge(btKds);
    if (keyDistri == null) {
      // should never happen.
      return new InputSplit[] { new SortedTableSplit(null, null, null, conf) };
    }

    keyDistri.resize(lastBd);

    // keys.length boundaries delimit keys.length + 1 ranges; the first range
    // has no lower bound and the last one no upper bound.
    RawComparable[] keys = keyDistri.getKeys();
    ArrayList<InputSplit> splits = new ArrayList<InputSplit>();
    for (int i = 0; i <= keys.length; ++i) {
      RawComparable begin = (i == 0) ? null : keys[i - 1];
      RawComparable end = (i == keys.length) ? null : keys[i];
      BlockDistribution bd =
          (i < keys.length) ? keyDistri.getBlockDistribution(keys[i]) : lastBd;
      BytesWritable beginB = (begin == null) ? null : new BytesWritable(begin.buffer());
      BytesWritable endB = (end == null) ? null : new BytesWritable(end.buffer());
      splits.add(new SortedTableSplit(beginB, endB, bd, conf));
    }

    return splits.toArray(new InputSplit[splits.size()]);
  }
  
  /**
   * Get the minimum split size.
   * 
   * @param conf
   *          The job conf object.
   * @return the value of "table.input.split.minSize", or 1 MB when unset.
   */
  static long getMinSplitSize(JobConf conf) {
    final long defaultMinSize = 1024L * 1024L;
    return conf.getLong("table.input.split.minSize", defaultMinSize);
  }

  /**
   * Stores the minimum split size in the job configuration under
   * "table.input.split.minSize".
   * 
   * @param conf
   *          The job conf object.
   * @param minSize
   *          Minimum size.
   */
  public static void setMinSplitSize(JobConf conf, long minSize) {
    final String key = "table.input.split.minSize";
    conf.setLong(key, minSize);
  }
  
  /**
   * FileInputFormat used only to compute file splits over column group
   * directories. It never reads records; in addition to listing the input
   * files it remembers how many files each listed entry contributed.
   */
  private static class DummyFileInputFormat extends FileInputFormat<BytesWritable, Tuple> {
    /**
     * the next constant and class are copies from FileInputFormat
     */
    private static final PathFilter hiddenFileFilter = new PathFilter() {
      public boolean accept(Path p) {
        String name = p.getName();
        return !(name.startsWith("_") || name.startsWith("."));
      }
    };

    /**
     * Proxy PathFilter that accepts a path only if all filters given in the
     * constructor do. Used by the listPaths() to apply the built-in
     * hiddenFileFilter together with a user provided one (if any).
     */
    private static class MultiPathFilter implements PathFilter {
      private List<PathFilter> filters;

      public MultiPathFilter(List<PathFilter> filters) {
        this.filters = filters;
      }

      public boolean accept(Path path) {
        boolean accepted = true;
        Iterator<PathFilter> it = filters.iterator();
        // stop at the first filter that rejects the path
        while (accepted && it.hasNext()) {
          accepted = it.next().accept(path);
        }
        return accepted;
      }
    }

    /** File count per listed entry, filled in by listStatus; null before that. */
    private Integer[] fileNumbers = null;

    /** Readers, indexed in the same order as the configured input paths. */
    private List<BasicTable.Reader> readers;

    public Integer[] getFileNumbers() {
      return fileNumbers;
    }

    public DummyFileInputFormat(long minSplitSize, List<BasicTable.Reader> readers) {
      super.setMinSplitSize(minSplitSize);
      this.readers = readers;
    }

    @Override
    public RecordReader<BytesWritable, Tuple> getRecordReader(InputSplit split,
        JobConf conf, Reporter reporter) throws IOException {
      // no-op
      return null;
    }

    /** Delegates to the superclass; overridden as public so the enclosing class can call it. */
    @Override
    public long computeSplitSize(long goalSize, long minSize, long blockSize) {
      return super.computeSplitSize(goalSize, minSize, blockSize);
    }

    /**
     * copy from FileInputFormat: add assignment to table file numbers
     */
    @Override
    public FileStatus[] listStatus(JobConf job) throws IOException {
      Path[] dirs = getInputPaths(job);
      if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
      }

      // creates a MultiPathFilter with the hiddenFileFilter and the
      // user provided one (if any).
      List<PathFilter> filters = new ArrayList<PathFilter>();
      filters.add(hiddenFileFilter);
      PathFilter jobFilter = getInputPathFilter(job);
      if (jobFilter != null) {
        filters.add(jobFilter);
      }
      PathFilter inputFilter = new MultiPathFilter(filters);

      List<FileStatus> found = new ArrayList<FileStatus>();
      List<IOException> problems = new ArrayList<IOException>();
      ArrayList<Integer> counts = new ArrayList<Integer>();

      // the position of a path in dirs selects the matching reader
      for (int index = 0; index < dirs.length; index++) {
        Path p = dirs[index];
        FileSystem fs = p.getFileSystem(job);
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
          problems.add(new IOException("Input path does not exist: " + p));
          continue;
        }
        if (matches.length == 0) {
          problems.add(new IOException("Input Pattern " + p + " matches 0 files"));
          continue;
        }
        for (FileStatus globStat : matches) {
          if (!globStat.isDir()) {
            found.add(globStat);
            counts.add(1);
            continue;
          }
          FileStatus[] fileStatuses = fs.listStatus(globStat.getPath(), inputFilter);
          // reorder according to CG index
          BasicTable.Reader reader = readers.get(index);
          reader.rearrangeFileIndices(fileStatuses);
          for (FileStatus stat : fileStatuses) {
            if (stat != null) {
              found.add(stat);
            }
          }
          counts.add(fileStatuses.length);
        }
      }
      fileNumbers = counts.toArray(new Integer[counts.size()]);

      if (!problems.isEmpty()) {
        throw new InvalidInputException(problems);
      }
      LOG.info("Total input paths to process : " + found.size()); 
      return found.toArray(new FileStatus[found.size()]);
    }
  }
  
  /**
   * Builds row-based (unsorted) input splits for the given table readers.
   * <p>
   * For every reader that reports a non-negative row-split column group
   * index, the directory of that column group is added to a comma separated
   * path list. A {@link DummyFileInputFormat} computes file splits over those
   * paths; the file splits are then attributed back to their tables, grouped
   * into batches and handed to the reader's <code>rowSplit</code>, whose
   * results are wrapped in RowTableSplit objects.
   * <p>
   * Note that this method writes the column group paths and the readers' path
   * filter class into <code>conf</code> as the job's input paths and input
   * path filter.
   *
   * @param conf
   *          JobConf object; modified as described above.
   * @param numSplits
   *          requested number of splits; values below 1 are treated as 1.
   * @param expr
   *          the input table expression (not used by this method).
   * @param readers
   *          one reader per leaf table.
   * @param status
   *          one status per leaf table, used to total the input size.
   * @return the row splits; empty if no reader has a column group usable for
   *         row splitting.
   * @throws IOException
   *           if the file splits cannot be matched to the tables, or on I/O
   *           failure.
   */
  private static InputSplit[] getRowSplits(JobConf conf, int numSplits,
      TableExpr expr, List<BasicTable.Reader> readers,
      List<BasicTableStatus> status) throws IOException {
    ArrayList<InputSplit> ret = new ArrayList<InputSplit>();

    long minSplitSize = getMinSplitSize(conf);
  
    // effective minimum: the larger of the generic and the table-specific setting
    long minSize = Math.max(conf.getLong("mapred.min.split.size", 1), minSplitSize);
    long totalBytes = 0;
    for (Iterator<BasicTableStatus> it = status.iterator(); it.hasNext(); )
    {
      totalBytes += it.next().getSize();
    }
    long goalSize = totalBytes / (numSplits < 1 ? 1 : numSplits);
    StringBuilder sb = new StringBuilder();
    boolean first = true;
    PathFilter filter = null;
    // readers that can be row-split, and for each of them its index in 'readers'
    List<BasicTable.Reader> realReaders = new ArrayList<BasicTable.Reader>();
    int[] realReaderIndices = new int[readers.size()];

    for (int i = 0; i < readers.size(); ++i) {
      BasicTable.Reader reader = readers.get(i);
      /* Get the index of the column group that will be used for row-split.*/
      int splitCGIndex = reader.getRowSplitCGIndex();
      
      /* We can create input splits only if there does exist a valid column group for split.
       * Otherwise, we do not create input splits. */
      if (splitCGIndex >= 0) {        
        realReaderIndices[realReaders.size()] = i;
        realReaders.add(reader);
        if (first)
        {
          // filter is identical across tables
          filter = reader.getPathFilter(conf);
          first = false;
        } else
          sb.append(",");
        sb.append(reader.getPath().toString() + "/" + reader.getName(splitCGIndex));
      }
    }
    
    DummyFileInputFormat helper = new DummyFileInputFormat(minSplitSize, realReaders);

    if (!realReaders.isEmpty())
    {
      // let the helper compute plain file splits over the column group directories
      DummyFileInputFormat.setInputPaths(conf, sb.toString());
      DummyFileInputFormat.setInputPathFilter(conf, filter.getClass());
      InputSplit[] inputSplits = helper.getSplits(conf, (numSplits < 1 ? 1 : numSplits));

      int batchesPerSplit = inputSplits.length / (numSplits < 1 ? 1 : numSplits);
      if (batchesPerSplit <= 0)
        batchesPerSplit = 1;

      /*
       * Potential file batching optimizations include:
       * 1) sort single file inputSplits in the descending order of their sizes so
       *    that the ops of new file opens are spread to a maximum degree;
       * 2) batching the files with maximum block distribution affinities into the same input split
       */

      // inputSplitBoundaries[t] is the exclusive end index, within inputSplits,
      // of the file splits that belong to table t.
      int[] inputSplitBoundaries = new int[realReaders.size()];
      long start, prevStart = Long.MIN_VALUE;
      int tableIndex = 0, fileNumber = 0;
      Integer[] fileNumbers = helper.getFileNumbers();
      if (fileNumbers.length != realReaders.size())
        throw new IOException("Number of tables in input paths of input splits is incorrect.");
      // A start offset that does not increase is taken as the beginning of a
      // new file; once a table's file count is reached, the next table begins.
      // NOTE(review): presumably relies on the splits of one file being listed
      // consecutively in increasing offset order -- confirm.
      for (int j=0; j<inputSplits.length; j++) {
        FileSplit fileSplit = (FileSplit) inputSplits[j];
        start = fileSplit.getStart();
        if (start <= prevStart)
        {
          fileNumber++;
          if (fileNumber >= fileNumbers[tableIndex])
          {
            inputSplitBoundaries[tableIndex++] = j;
            fileNumber = 0;
          }
        }
        prevStart = start;
      }
      inputSplitBoundaries[tableIndex++] =  inputSplits.length;
      if (tableIndex != realReaders.size())
        throw new IOException("Number of tables in input splits is incorrect.");
      for (tableIndex = 0; tableIndex < realReaders.size(); tableIndex++)
      {
        // range [startSplitIndex, startSplitIndex + splitLen) of inputSplits for this table
        int startSplitIndex = (tableIndex == 0 ? 0 : inputSplitBoundaries[tableIndex - 1]);
        int splitLen = (tableIndex == 0 ? inputSplitBoundaries[0] :
            inputSplitBoundaries[tableIndex] - inputSplitBoundaries[tableIndex-1]);
        BasicTable.Reader reader = realReaders.get(tableIndex);
        /* Get the index of the column group that will be used for row-split.*/
        int splitCGIndex = reader.getRowSplitCGIndex();
        
        // per file split: start offset, length and path; 'batches' holds the
        // index at which each batch begins, followed by splitLen as end marker
        long starts[] = new long[splitLen];
        long lengths[] = new long[splitLen];
        int batches[] = new int[splitLen + 1];
        batches[0] = 0;
        int numBatches = 0;
        int batchSize = 0;
        Path paths[] = new Path [splitLen];
        long totalLen = 0;
        // tolerance factor applied to the computed split size
        final double SPLIT_SLOP = 1.1;
        int endSplitIndex = startSplitIndex + splitLen;
        for (int j=startSplitIndex; j< endSplitIndex; j++) {
          FileSplit fileSplit = (FileSplit) inputSplits[j];
          Path p = fileSplit.getPath();
          long blockSize = p.getFileSystem(conf).getBlockSize(p);
          long splitSize = (long) (helper.computeSplitSize(goalSize, minSize, blockSize) * SPLIT_SLOP);
          start = fileSplit.getStart();
          long length = fileSplit.getLength();
          int index = j - startSplitIndex;
          starts[index] = start;
          lengths[index] = length;
          totalLen += length;
          paths[index] = p;
          // close the current batch when it has grown to the target size ...
          if (totalLen >= splitSize)
          {

             for (int ii = batches[numBatches] + 1; ii < index - 1; ii++)
               starts[ii] = -1; // all intermediate files are not split
             batches[++numBatches] = index;
             batchSize = 1;
             totalLen = length;
          // ... or when it already holds the allowed number of file splits
          } else if (batchSize + 1 > batchesPerSplit) {
            for (int ii = batches[numBatches] + 1; ii < index - 1; ii++)
              starts[ii] = -1; // all intermediate files are not split
            batches[++numBatches] = index;
            batchSize = 1;
            totalLen = length;
          } else {
            batchSize++;
          }
        }
        // finish the last, still open batch
        for (int ii = batches[numBatches] + 1; ii < splitLen - 1; ii++)
          starts[ii] = -1; // all intermediate files are not split
        if (splitLen > 0)
          batches[++numBatches] = splitLen;
        
        List<RowSplit> subSplits = reader.rowSplit(starts, lengths, paths, splitCGIndex, batches, numBatches);
        // report the table by its index in the original 'readers' list
        int realTableIndex = realReaderIndices[tableIndex];
        for (Iterator<RowSplit> it = subSplits.iterator(); it.hasNext();) {
          RowSplit subSplit = it.next();
          RowTableSplit split = new RowTableSplit(reader, subSplit, realTableIndex, conf);
          ret.add(split);
        }
      }
    }

    LOG.info("getSplits : returning " + ret.size() + " row splits.");
    return ret.toArray(new InputSplit[ret.size()]);
  }

  /**
   * Computes the input splits for the configured tables.
   * <p>
   * One reader is opened per leaf table of the input expression. The method
   * records the deleted column groups of all tables in the JobConf, then
   * produces sorted (key range) splits when the expression requires them and
   * row splits otherwise. All readers are closed before returning.
   * 
   * @throws IOException
   *           if sorted splits are required but not possible, a table cannot
   *           be read, or the projection cannot be parsed (the
   *           ParseException is kept as the cause).
   * @see InputFormat#getSplits(JobConf, int)
   */
  @Override
  public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
    TableExpr expr = getInputExpr(conf);
    if (getSorted(conf)) {
      expr.setSortedSplit();
    }
    if (expr.sortedSplitRequired() && !expr.sortedSplitCapable()) {
      throw new IOException("Unable to created sorted splits");
    }

    String projection;
    try {
      projection = getProjection(conf);
    } catch (ParseException e) {
      throw new IOException("getProjection failed : " + e.getMessage(), e);
    }
    List<LeafTableInfo> leaves = expr.getLeafTables(projection);
    int nLeaves = leaves.size();
    ArrayList<BasicTable.Reader> readers =
        new ArrayList<BasicTable.Reader>(nLeaves);
    ArrayList<BasicTableStatus> status =
        new ArrayList<BasicTableStatus>(nLeaves);

    try {
      StringBuilder sb = new StringBuilder();
      boolean sorted = expr.sortedSplitRequired();
      boolean first = true;
      for (LeafTableInfo leaf : leaves) {
        BasicTable.Reader reader =
          new BasicTable.Reader(leaf.getPath(), conf);
        // register the reader right away so that the finally block closes it
        // even if one of the following calls fails
        readers.add(reader);
        reader.setProjection(leaf.getProjection());
        status.add(reader.getStatus());
        if (first) {
          first = false;
        } else {
          sb.append(TableInputFormat.DELETED_CG_SEPARATOR_PER_UNION);
        }
        sb.append(reader.getDeletedCGs());
      }

      conf.set(INPUT_FE, "true");
      conf.set(INPUT_DELETED_CGS, sb.toString());

      if (readers.isEmpty()) {
        return new InputSplit[0];
      }

      if (sorted) {
        return getSortedSplits(conf, numSplits, expr, readers, status);
      }

      return getRowSplits(conf, numSplits, expr, readers, status);
    } catch (ParseException e) {
      throw new IOException("Projection parsing failed : " + e.getMessage(), e);
    } finally {
      for (BasicTable.Reader reader : readers) {
        try {
          reader.close();
        } catch (Exception e) {
          // best effort: a failed close must not hide the result or the
          // original failure, but it should be visible in the log
          LOG.warn("Failed to close table reader", e);
        }
      }
    }
  }

  /**
   * Validates the input by opening every leaf table of the input expression
   * and applying the job's projection to it.
   * 
   * @param conf
   *          JobConf object.
   * @throws IOException
   *           if a table cannot be opened or the projection cannot be
   *           parsed (the ParseException is kept as the cause).
   */
  @Deprecated
  public synchronized void validateInput(JobConf conf) throws IOException {
    // Validating imports by opening all Tables.
    TableExpr expr = getInputExpr(conf);
    try {
      String projection = getProjection(conf);
      for (LeafTableInfo leaf : expr.getLeafTables(projection)) {
        BasicTable.Reader reader =
            new BasicTable.Reader(leaf.getPath(), conf);
        try {
          reader.setProjection(projection);
        } finally {
          // release the reader even when the projection is rejected
          reader.close();
        }
      }
    } catch (ParseException e) {
      throw new IOException("Projection parsing failed : " + e.getMessage(), e);
    }
  }
}

/**
 * Adaptor class for sorted InputSplit for table.
 *
 * A split is described by an optional begin key and an optional end key
 * (either may be null), a length, and an optional list of host names.
 *
 * Serialized layout, as produced by {@link #write(DataOutput)}: a VInt flag
 * (0 or 1) followed by the begin key when the flag is 1; the same pair for the
 * end key; the length as a VLong; the number of hosts as a VInt; then each
 * host as a string.
 */
class SortedTableSplit implements InputSplit {
  BytesWritable begin = null, end = null;

  // May be null when no block distribution was supplied.
  String[] hosts;
  long length = 1;

  public SortedTableSplit() {
    // no-op for Writable construction
  }

  /**
   * Creates a split covering the given key range.
   *
   * @param begin
   *          first key of the range, copied; null leaves the begin key unset.
   * @param end
   *          last key of the range, copied; null leaves the end key unset.
   * @param bd
   *          block distribution used to fill in length and hosts; when null
   *          the length stays at 1 and hosts stays null.
   * @param conf
   *          job configuration; "mapred.lib.table.input.nlocation" (default 5)
   *          is passed to {@code bd.getHosts}.
   */
  public SortedTableSplit(BytesWritable begin, BytesWritable end,
      BlockDistribution bd, JobConf conf) {
    if (begin != null) {
      // Copy the bytes so later changes to the caller's key do not affect us.
      this.begin = new BytesWritable();
      this.begin.set(begin.get(), 0, begin.getSize());
    }
    if (end != null) {
      this.end = new BytesWritable();
      this.end.set(end.get(), 0, end.getSize());
    }

    if (bd != null) {
      length = bd.getLength();
      hosts =
        bd.getHosts(conf.getInt("mapred.lib.table.input.nlocation", 5));
    }
  }

  @Override
  public long getLength() throws IOException {
    return length;
  }

  /**
   * @return the host list, or a one-element array holding an empty string
   *         when no hosts are set.
   */
  @Override
  public String[] getLocations() throws IOException {
    if (hosts == null) {
      return new String[] { "" };
    }
    return hosts;
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    begin = end = null;
    if (WritableUtils.readVInt(in) == 1) {
      begin = new BytesWritable();
      begin.readFields(in);
    }
    if (WritableUtils.readVInt(in) == 1) {
      end = new BytesWritable();
      end.readFields(in);
    }
    length = WritableUtils.readVLong(in);
    int size = WritableUtils.readVInt(in);
    // Always reassign so a reused instance does not keep hosts from a
    // previously deserialized split when this one has none.
    hosts = (size > 0) ? new String[size] : null;
    for (int i = 0; i < size; i++) {
      hosts[i] = WritableUtils.readString(in);
    }
  }

  @Override
  public void write(DataOutput out) throws IOException {
    if (begin == null) {
      WritableUtils.writeVInt(out, 0);
    } else {
      WritableUtils.writeVInt(out, 1);
      begin.write(out);
    }
    if (end == null) {
      WritableUtils.writeVInt(out, 0);
    } else {
      WritableUtils.writeVInt(out, 1);
      end.write(out);
    }
    WritableUtils.writeVLong(out, length);
    // Compute the count once and loop on it, so a null host list is written
    // as zero entries instead of being dereferenced.
    int size = (hosts == null) ? 0 : hosts.length;
    WritableUtils.writeVInt(out, size);
    for (int i = 0; i < size; i++) {
      WritableUtils.writeString(out, hosts[i]);
    }
  }

  public BytesWritable getBegin() {
    return begin;
  }

  public BytesWritable getEnd() {
    return end;
  }
}

/**
 * Adaptor class for unsorted InputSplit for table.
 *
 * Carries the table path, the index of the table, an optional RowSplit, the
 * split length and an optional list of host names.
 *
 * Serialized layout, as produced by {@link #write(DataOutput)}: table index
 * (VInt), path (string), a VInt flag (0 or 1) followed by the RowSplit when
 * the flag is 1, the host array, and the length (VLong).
 */
class RowTableSplit implements InputSplit {
  private static final String[] NO_HOSTS = new String[0];

  String path = null;
  int tableIndex;
  RowSplit split = null;
  // May be null when no block distribution is available for the split.
  String[] hosts = null;
  long length = 1;

  /**
   * Creates a split for one row range of a table.
   *
   * @param reader
   *          reader of the table; supplies the path and the block
   *          distribution of the split.
   * @param split
   *          the row split to wrap.
   * @param tableIndex
   *          index of the table this split belongs to.
   * @param conf
   *          job configuration; "mapred.lib.table.input.nlocation" (default 5)
   *          is passed to {@code getHosts} of the block distribution.
   * @throws IOException
   *           declared for failures while querying the reader.
   */
  public RowTableSplit(Reader reader, RowSplit split, int tableIndex, JobConf conf)
      throws IOException {
    this.path = reader.getPath();
    this.split = split;
    this.tableIndex = tableIndex;
    BlockDistribution dataDist = reader.getBlockDistribution(split);
    if (dataDist != null) {
      length = dataDist.getLength();
      hosts =
          dataDist.getHosts(conf.getInt("mapred.lib.table.input.nlocation", 5));
    }
  }

  public RowTableSplit() {
    // no-op for Writable construction
  }

  @Override
  public long getLength() throws IOException {
    return length;
  }

  /**
   * @return the host list; an empty array (never null) when no hosts are set.
   */
  @Override
  public String[] getLocations() throws IOException {
    return (hosts == null) ? NO_HOSTS : hosts;
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    tableIndex = WritableUtils.readVInt(in);
    path = WritableUtils.readString(in);
    if (WritableUtils.readVInt(in) == 1) {
      // Reuse the existing RowSplit instance when there is one.
      if (split == null) {
        split = new RowSplit();
      }
      split.readFields(in);
    } else {
      split = null;
    }
    hosts = WritableUtils.readStringArray(in);
    length = WritableUtils.readVLong(in);
  }

  @Override
  public void write(DataOutput out) throws IOException {
    WritableUtils.writeVInt(out, tableIndex);
    WritableUtils.writeString(out, path);
    if (split == null) {
      WritableUtils.writeVInt(out, 0);
    } else {
      WritableUtils.writeVInt(out, 1);
      split.write(out);
    }
    // hosts stays null when no block distribution was found; write an empty
    // array in that case rather than handing null to writeStringArray.
    WritableUtils.writeStringArray(out, (hosts == null) ? NO_HOSTS : hosts);
    WritableUtils.writeVLong(out, length);
  }

  public String getPath() {
    return path;
  }

  public RowSplit getSplit() {
    return split;
  }

  public int getTableIndex() {
    return tableIndex;
  }
}