/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with this
 * work for additional information regarding copyright ownership. The ASF
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.apache.hadoop.zebra.io;

import java.io.Closeable;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.*;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.zebra.tfile.TFile;
import org.apache.hadoop.zebra.tfile.Utils;
import org.apache.hadoop.zebra.tfile.ByteArray;
import org.apache.hadoop.zebra.tfile.RawComparable;
import org.apache.hadoop.zebra.types.CGSchema;
import org.apache.hadoop.zebra.parser.ParseException;
import org.apache.hadoop.zebra.types.Partition;
import org.apache.hadoop.zebra.types.Projection;
import org.apache.hadoop.zebra.schema.Schema;
import org.apache.hadoop.zebra.types.TypesUtils;
import org.apache.hadoop.zebra.types.TypesUtils.TupleReader;
import org.apache.hadoop.zebra.types.TypesUtils.TupleWriter;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;

/**
 * ColumnGroup is the basic unit of a persistent table. The following
 * Configuration parameters can customize the behavior of ColumnGroup.
 * <ul>
 * <li><b>table.tfile.minblock.size</b> (int) Minimum compression block size
 * for the underlying TFile (defaults to 1024*1024).
 * <li><b>table.output.tfile.compression</b> (String) Compression method, one
 * of "none", "lzo", "gz" (defaults to "gz"); see
 * {@link TFile#getSupportedCompressionAlgorithms()}.
 * <li><b>table.input.split.minSize</b> (int) Minimum split size (defaults to
 * 64*1024).
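 * </ul>
 * <p>
 * For example, a minimal configuration sketch using the key names defined by
 * the constants in this class (the values shown are the defaults):
 *
 * <pre>{@code
 * Configuration conf = new Configuration();
 * conf.set("table.output.tfile.compression", "gz");
 * conf.setInt("table.tfile.minblock.size", 1024 * 1024);
 * conf.setInt("table.input.split.minSize", 64 * 1024);
 * }</pre>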
 */
class ColumnGroup {
  static Log LOG = LogFactory.getLog(ColumnGroup.class);

  private final static String CONF_COMPRESS = "table.output.tfile.compression";
  private final static String DEFAULT_COMPRESS = "gz";
  private final static String CONF_MIN_BLOCK_SIZE = "table.tfile.minblock.size";
  private final static int DEFAULT_MIN_BLOCK_SIZE = 1024 * 1024;
  private final static String CONF_MIN_SPLIT_SIZE = "table.input.split.minSize";
  private final static int DEFAULT_MIN_SPLIT_SIZE = 64 * 1024;
  static final double SPLIT_SLOP = 1.1; // 10% slop

  // exclude files starting with the following prefix; may change to a regex
  private final static String CONF_NON_DATAFILE_PREFIX = "table.cg.nondatafile.prefix";
  private final static String SPECIAL_FILE_PREFIX = ".";

  // tmp schema file name, used as a flag of an unfinished CG
  private final static String SCHEMA_FILE = ".schema";
  // meta data TFile for the entire CG, used as a flag of a closed CG
  final static String META_FILE = ".meta";
  // sorted table key ranges for default sorted table split generation
  private final static String KEY_RANGE_FOR_DEFAULT_SORTED_SPLIT = ".keyrange";

  static final String BLOCK_NAME_INDEX = "ColumnGroup.index";

  static Path makeMetaFilePath(Path parent) {
    return new Path(parent, META_FILE);
  }

  static String getCompression(Configuration conf) {
    return conf.get(CONF_COMPRESS, DEFAULT_COMPRESS);
  }

  static int getMinBlockSize(Configuration conf) {
    return conf.getInt(CONF_MIN_BLOCK_SIZE, DEFAULT_MIN_BLOCK_SIZE);
  }

  static String getNonDataFilePrefix(Configuration conf) {
    return conf.get(CONF_NON_DATAFILE_PREFIX, SPECIAL_FILE_PREFIX);
  }

  static int getMinSplitSize(Configuration conf) {
    return conf.getInt(CONF_MIN_SPLIT_SIZE, DEFAULT_MIN_SPLIT_SIZE);
  }

  /**
   * Drop a Column Group; this deletes all files related to the Column Group
   * from the FileSystem.
   *
   * @param path
   *          the path to the ColumnGroup.
   * @param conf
   *          The configuration object.
   */
  public static void drop(Path path, Configuration conf) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    fs.delete(path, true);
    // TODO:
    // fs.close();
  }

  /**
   * Scan the file system, looking for TFiles, and build an in-memory index of
   * the column group.
   *
   * @param fs
   *          The file system
   * @param path
   *          The base path of the column group.
   * @param dirty
   *          Whether to build a dirty index or not. A dirty index is built by
   *          only looking at file-level status and not opening up individual
   *          TFiles. The flag may only be set for unsorted ColumnGroups.
   * @param conf
   *          The configuration object.
   * @return The in-memory index object.
   * @throws IOException
   */
  static CGIndex buildIndex(FileSystem fs, Path path, boolean dirty,
      Configuration conf) throws IOException {
    CGIndex ret = new CGIndex();
    CGPathFilter cgPathFilter = new CGPathFilter();
    CGPathFilter.setConf(conf);
    FileStatus[] files = fs.listStatus(path, cgPathFilter);
    Comparator<RawComparable> comparator = null;
    for (FileStatus f : files) {
      if (dirty) {
        ret.add(f.getLen(), f.getPath().getName());
      }
      else {
        FSDataInputStream dis = null;
        TFile.Reader tr = null;
        try {
          dis = fs.open(f.getPath());
          tr = new TFile.Reader(dis, f.getLen(), conf);
          if (comparator == null) {
            comparator = tr.getComparator();
          }
          if (tr.getEntryCount() > 0) {
            CGIndexEntry range =
                new CGIndexEntry(f.getPath().getName(), tr.getEntryCount(),
                    tr.getFirstKey(), tr.getLastKey());
            ret.add(f.getLen(), tr.getEntryCount(), range);
          }
        }
        catch (IOException e) {
          // TODO: log the error, ignore incorrect TFiles.
          e.printStackTrace(System.err);
        }
        finally {
          if (tr != null) {
            tr.close();
          }
          if (dis != null) {
            dis.close();
          }
        }
      }
    }
    ret.sort(comparator);
    int idx = 0;
    for (CGIndexEntry e : ret.getIndex()) {
      e.setIndex(idx++);
    }
    return ret;
  }

  /**
   * ColumnGroup reader.
   */
  public static class Reader implements Closeable {
    Path path;
    Configuration conf;
    FileSystem fs;
    CGSchema cgschema;
    Comparator<RawComparable> comparator;
    Projection projection;
    CGIndex cgindex;
    ArrayList<SplitColumn> exec;
    SplitColumn top; // directly associated with logical schema
    SplitColumn leaf; // corresponding to projection
    boolean closed;
    boolean dirty;

    /**
     * Get the Column Group physical schema without loading the full CG index.
     *
     * @param path
     *          The path to the ColumnGroup.
     * @param conf
     *          The configuration object.
     * @return The ColumnGroup schema.
     * @throws IOException
     */
    public static Schema getSchema(Path path, Configuration conf)
        throws IOException, ParseException {
      FileSystem fs = path.getFileSystem(conf);
      CGSchema cgschema = CGSchema.load(fs, path);
      return cgschema.getSchema();
    }

    /**
     * Create a ColumnGroup reader.
     *
     * @param path
     *          The directory path to the column group.
     * @param conf
     *          Optional configuration parameters.
     * @throws IOException
     */
    public Reader(Path path, Configuration conf) throws IOException,
        ParseException {
      this(path, conf, false);
    }

    public Reader(Path path, Configuration conf, boolean mapper)
        throws IOException, ParseException {
      this(path, true, conf, mapper);
    }

    Reader(Path path, boolean dirty, Configuration conf) throws IOException,
        ParseException {
      this(path, dirty, conf, false);
    }

    Reader(Path path, boolean dirty, Configuration conf, boolean mapper)
        throws IOException, ParseException {
      this.path = path;
      this.conf = conf;
      this.dirty = dirty;
      fs = path.getFileSystem(conf);
      // check existence of path
      if (!fs.exists(path)) {
        throw new IOException("Path doesn't exist: " + path);
      }
      if (!mapper && !fs.getFileStatus(path).isDir()) {
        throw new IOException("Path exists but not a directory: " + path);
      }
      cgschema = CGSchema.load(fs, path);
      if (cgschema.isSorted()) {
        comparator = TFile.makeComparator(cgschema.getComparator());
      }
      projection = new Projection(cgschema.getSchema()); // default projection to CG schema.
      Path metaFilePath = makeMetaFilePath(path);
      // if the index meta file does not exist, the CG is incomplete
      if (!fs.exists(metaFilePath)) {
        throw new FileNotFoundException("Missing meta file: " + metaFilePath);
      }
      else if (cgschema.isSorted()) {
        MetaFile.Reader metaFile = MetaFile.createReader(metaFilePath, conf);
        try {
          cgindex = new CGIndex();
          DataInputStream dis = metaFile.getMetaBlock(BLOCK_NAME_INDEX);
          try {
            cgindex.readFields(dis);
          } catch (IOException e) {
            throw new IOException("Index file read failure: " + e.getMessage());
          } finally {
            dis.close();
          }
        } finally {
          metaFile.close();
        }
      }
    }

    /**
     * Set the projection for the reader. This will affect calls to
     * getScanner(), getStatus(), and getColumnNames().
     *
     * @param projection
     *          The projection on the column group for subsequent read
     *          operations. To select all columns, pass projection == null.
     */
    public synchronized void setProjection(String projection)
        throws ParseException {
      if (projection == null) {
        this.projection = new Projection(cgschema.getSchema());
      }
      else {
        this.projection = new Projection(cgschema.getSchema(), projection);
      }
    }

    /**
     * Get the schema of columns of the table (possibly through projection).
     *
     * @return Schema of the columns of the table (possibly through projection).
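     *         <p>
     *         For instance (a sketch; the column names here are
     *         hypothetical): after calling
     *         {@code setProjection("FirstName, LastName")}, the returned
     *         schema contains only those two columns rather than the full
     *         ColumnGroup schema.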
*/ public Schema getSchema() throws ParseException { return projection.getSchema(); } /** * Get the projection * @return Projection of this Reader */ public Projection getProjection() { return projection; } public String getName() { return cgschema.getName(); } public String getSerializer() { return cgschema.getSerializer(); } public String getCompressor() { return cgschema.getCompressor(); } public CGSchema getCGSchema() { return cgschema; } public String getGroup() { return cgschema.getGroup(); } public short getPerm() { return cgschema.getPerm(); } /** * Get a scanner that reads all rows whose row keys fall in a specific * range. * * @param beginKey * The begin key of the scan range. * @param endKey * The end key of the scan range. * @param closeReader * close the underlying Reader object when we close the scanner. * Should be set to true if we have only one scanner on top of the * reader, so that we should release resources after the scanner is * closed. * @return A scanner object. * @throws IOException */ public synchronized CGScanner getScanner(BytesWritable beginKey, BytesWritable endKey, boolean closeReader) throws IOException, ParseException { if (closed) { throw new EOFException("Reader already closed"); } if (!isSorted()) { throw new IOException( "Cannot get key-bounded scanner for unsorted table"); } RawComparable begin = (beginKey != null) ? new ByteArray(beginKey.getBytes(), 0, beginKey .getLength()) : null; RawComparable end = (endKey != null) ? new ByteArray(endKey.getBytes(), 0, endKey.getLength()) : null; if (begin != null && end != null) { if (comparator.compare(begin, end) >= 0) { throw new IOException("Zero-key-range split"); } } return new CGScanner(begin, end, closeReader); } /** * Get a scanner that reads a consecutive number of rows as defined in the * CGRangeSplit object, which should be obtained from previous calls of * rangeSplit(). * * @param split * The split range. If null, get a scanner to read the complete * column group. * @param closeReader * close the underlying Reader object when we close the scanner. * Should be set to true if we have only one scanner on top of the * reader, so that we should release resources after the scanner is * closed. * @return A scanner object. * @throws IOException */ public synchronized CGScanner getScanner(CGRangeSplit split, boolean closeReader) throws IOException, ParseException { if (closed) { throw new EOFException("Reader already closed"); } if (split == null) { if (cgindex == null) cgindex = buildIndex(fs, path, dirty, conf); return getScanner(new CGRangeSplit(0, cgindex.size()), closeReader); } if (split.len < 0) { throw new IllegalArgumentException("Illegal range split"); } return new CGScanner(split, closeReader); } /** * Get a scanner that reads the rows defined by rowRange. * * @param closeReader * close the underlying Reader object when we close the scanner. * Should be set to true if we have only one scanner on top of the * reader, so that we should release resources after the scanner is * closed. * @param rowSplit specifies part index, start row, and end row. * @return A scanner object. */ public synchronized CGScanner getScanner(boolean closeReader, CGRowSplit rowSplit) throws IOException, ParseException { if (closed) { throw new EOFException("Reader already closed"); } return new CGScanner(rowSplit, closeReader); } /** * Given a split range, calculate how the file data that fall into the range * are distributed among hosts. * * @param split * The range-based split. If null, return all blocks. 
     * @return a map from host name to the amount of data (in bytes) the host
     *         owns that falls roughly into the key range.
     */
    public BlockDistribution getBlockDistribution(CGRangeSplit split)
        throws IOException {
      // Build the index before the null-split branch: that branch needs
      // cgindex.size() and would otherwise risk an NPE on a lazily-built
      // index.
      if (cgindex == null)
        cgindex = buildIndex(fs, path, dirty, conf);
      if (split == null) {
        return getBlockDistribution(new CGRangeSplit(0, cgindex.size()));
      }

      if ((split.start | split.len | (cgindex.size() - split.start - split.len)) < 0) {
        throw new IndexOutOfBoundsException("Bad split");
      }

      BlockDistribution ret = new BlockDistribution();
      for (int i = split.start; i < split.start + split.len; ++i) {
        CGIndexEntry dfkr = cgindex.get(i);
        Path tfilePath = new Path(path, dfkr.getName());
        FileStatus tfileStatus = fs.getFileStatus(tfilePath);
        BlockLocation[] locations =
            fs.getFileBlockLocations(tfileStatus, 0, tfileStatus.getLen());
        for (BlockLocation l : locations) {
          ret.add(l);
        }
      }
      return ret;
    }

    /**
     * Given a row range, calculate how the file data that fall into the range
     * are distributed among hosts.
     *
     * @param split
     *          The row-based split. May not be null.
     * @return a map from host name to the amount of data (in bytes) the host
     *         owns that falls roughly into the key range.
     */
    public BlockDistribution getBlockDistribution(CGRowSplit split)
        throws IOException {
      if (split == null) {
        throw new IOException(
            "Row-based split cannot be null for getBlockDistribution()");
      }

      BlockDistribution ret = new BlockDistribution();
      for (int i = 0; i < split.length; i++) {
        FileStatus tfileStatus = fs.getFileStatus(new Path(path, split.names[i]));
        BlockLocation[] locations = null;
        long len = 0;
        if (i == 0) {
          if (split.startByteFirst != -1) {
            len = split.numBytesFirst;
            locations = fs.getFileBlockLocations(tfileStatus, split.startByteFirst, len);
          }
        } else if (i == split.length - 1) {
          if (split.numBytesLast != -1) {
            len = split.numBytesLast;
            locations = fs.getFileBlockLocations(tfileStatus, 0, len);
          }
        }
        if (locations == null) {
          len = tfileStatus.getLen();
          locations = fs.getFileBlockLocations(tfileStatus, 0, len);
        }
        for (BlockLocation l : locations) {
          ret.add(l);
        }
      }
      return ret;
    }

    private int getStartBlockIndex(long[] startOffsets, long offset) {
      int index = Arrays.binarySearch(startOffsets, offset);
      if (index < 0)
        index = -index - 2;
      return index;
    }

    private int getEndBlockIndex(long[] startOffsets, long offset) {
      int index = Arrays.binarySearch(startOffsets, offset);
      if (index < 0)
        index = -index - 1;
      return index;
    }

    /**
     * Sets the start row and the number of rows in rowSplit, based on the
     * start offset and length of the byte range.
     *
     * It is assumed that 'startByte' and 'numBytes' in rowSplit itself are
     * not valid on entry.
     */
    void fillRowSplit(CGRowSplit rowSplit, CGRowSplit src) throws IOException {
      if (src.names == null || src.length == 0)
        return;

      boolean noSizeInIndex = false;
      long[] sizes = rowSplit.sizes;
      if (sizes == null) {
        /* Sizes are missing because the on-disk table is sorted. Later this
         * will be made unnecessary when CGIndexEntry serializes its bytes
         * field and meta file versioning is supported.
         */
        noSizeInIndex = true;
      }
      rowSplit.names = src.names;
      rowSplit.length = src.length;
      rowSplit.startByteFirst = src.startByteFirst;
      rowSplit.numBytesFirst = src.numBytesFirst;
      rowSplit.numBytesLast = src.numBytesLast;

      Path firstPath = null, lastPath;
      TFile.Reader reader = null;
      if (src.startByteFirst != -1) {
        firstPath = new Path(path, rowSplit.names[0]);
        long size;
        if (noSizeInIndex) {
          FileStatus tfile = fs.getFileStatus(firstPath);
          size = tfile.getLen();
        } else
          size = sizes[0];
        reader = new TFile.Reader(fs.open(firstPath), size, conf);
        try {
          long startRow = reader.getRecordNumNear(src.startByteFirst);
          long endRow = reader.getRecordNumNear(src.startByteFirst + src.numBytesFirst);
          if (endRow < startRow)
            endRow = startRow;
          rowSplit.startRowFirst = startRow;
          rowSplit.numRowsFirst = endRow - startRow;
        } catch (IOException e) {
          reader.close();
          throw e;
        }
      }

      if (src.numBytesLast != -1 && rowSplit.length > 1) {
        lastPath = new Path(path, rowSplit.names[rowSplit.length - 1]);
        if (reader == null || !firstPath.equals(lastPath)) {
          if (reader != null)
            reader.close();
          long size;
          if (noSizeInIndex) {
            FileStatus tfile = fs.getFileStatus(lastPath);
            size = tfile.getLen();
          } else
            size = sizes[rowSplit.length - 1];
          reader = new TFile.Reader(fs.open(lastPath), size, conf);
        }
        try {
          long endRow = reader.getRecordNumNear(src.numBytesLast);
          rowSplit.numRowsLast = endRow;
        } catch (IOException e) {
          reader.close();
          throw e;
        }
      }

      if (reader != null)
        reader.close();
    }

    /**
     * Get a sampling of keys and calculate how data are distributed among
     * key-partitioned buckets. The implementation attempts to calculate all
     * information in one shot to avoid reading the TFile index multiple
     * times. Special care is also taken so that the memory requirement is
     * not linear in the size of the total data set of the column group.
     *
     * @param n
     *          Targeted size of the sampling.
     * @param nTables
     *          Number of tables in a union.
     * @return KeyDistribution object.
     * @throws IOException
     */
    public KeyDistribution getKeyDistribution(int n, int nTables,
        BlockDistribution lastBd) throws IOException {
      // TODO: is a similar capability needed for unsorted CGs as for sorted
      // CGs?
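      // Overview: the sampling below walks each TFile in order, carves it
      // into roughly step-sized byte ranges aligned to HDFS block
      // boundaries, samples a key near the end of each range via
      // getKeyNear(), and accumulates per-host byte counts for the covered
      // blocks into a BlockDistribution bucket per sampled key.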
if (!isSorted()) { throw new IOException("Cannot get key distribution for unsorted table"); } KeyDistribution ret = new KeyDistribution(comparator); if (n < 0) { /* Path keyRangeFile = new Path(path, KEY_RANGE_FOR_DEFAULT_SORTED_SPLIT); if (fs.exists(keyRangeFile)) { try { FSDataInputStream ins = fs.open(keyRangeFile); long minStepSize = ins.readLong(); int size = ins.readInt(); for (int i = 0; i < size; i++) { BytesWritable keyIn = new BytesWritable(); keyIn.readFields(ins); ByteArray key = new ByteArray(keyIn.getBytes()); ret.add(key); } ret.setMinStepSize(minStepSize); return ret; } catch (Exception e) { // no-op } } */ n = 1; } Path[] paths = new Path[cgindex.size()]; FileStatus[] tfileStatus = new FileStatus[paths.length]; long totalBytes = 0; for (int i = 0; i < paths.length; ++i) { paths[i] = cgindex.getPath(i, path); tfileStatus[i] = fs.getFileStatus(paths[i]); totalBytes += tfileStatus[i].getLen(); } final long minSize = getMinSplitSize(conf); final long EPSILON = (long) (minSize * (SPLIT_SLOP - 1)); long goalSize = totalBytes / n; long batchSize = 0; BlockDistribution bd = new BlockDistribution();; RawComparable prevKey = null; long minStepSize = -1; FSDataInputStream nextFsdis = null; TFile.Reader nextReader = null; for (int i = 0; i < paths.length; ++i) { FileStatus fstatus = tfileStatus[i]; long blkSize = fstatus.getBlockSize(); long fileLen = fstatus.getLen(); long stepSize = Math.max(minSize, (goalSize < blkSize) ? goalSize : blkSize); if (minStepSize== -1 || minStepSize > stepSize) minStepSize = stepSize; // adjust the block size by the scaling factor blkSize /= nTables; stepSize = Math.max(minSize, (goalSize < blkSize) ? goalSize : blkSize); FSDataInputStream fsdis = null; TFile.Reader reader = null; long remainLen = fileLen; try { if (nextReader == null) { fsdis = fs.open(paths[i]); reader = new TFile.Reader(fsdis, fileLen, conf); } else { fsdis = nextFsdis; reader = nextReader; } BlockLocation[] locations = fs.getFileBlockLocations(fstatus, 0, fileLen); if (locations.length == 0) { throw new AssertionError( "getFileBlockLocations returns 0 location"); } Arrays.sort(locations, new Comparator<BlockLocation>() { @Override public int compare(BlockLocation o1, BlockLocation o2) { long diff = o1.getOffset() - o2.getOffset(); if (diff < 0) return -1; if (diff > 0) return 1; return 0; } }); long[] startOffsets = new long[locations.length]; for (int ii = 0; ii < locations.length; ii++) startOffsets[ii] = locations[ii].getOffset(); boolean done = false; while ((remainLen > 0) && !done) { long splitBytes = remainLen > stepSize ? stepSize : remainLen; long offsetBegin = fileLen - remainLen; long offsetEnd = offsetBegin + splitBytes; int indexBegin = getStartBlockIndex(startOffsets, offsetBegin); int indexEnd = getEndBlockIndex(startOffsets, offsetEnd); BlockLocation firstBlock = locations[indexBegin]; BlockLocation lastBlock = locations[indexEnd-1]; long lastBlockOffsetBegin = lastBlock.getOffset(); long lastBlockOffsetEnd = lastBlockOffsetBegin + lastBlock.getLength(); if ((firstBlock.getOffset() > offsetBegin) || (lastBlockOffsetEnd < offsetEnd)) { throw new AssertionError( "Block locations returned by getFileBlockLocations do not cover requested range"); } // Adjust offsets if ((offsetEnd > lastBlockOffsetBegin) && (offsetEnd - lastBlockOffsetBegin < EPSILON)) { // the split includes a bit of the next block, remove it. 
if (offsetEnd != fileLen) { // only if this is not the last chunk offsetEnd = lastBlockOffsetBegin; splitBytes = offsetEnd - offsetBegin; indexEnd--; } } else if ((lastBlockOffsetEnd > offsetEnd) && (lastBlockOffsetEnd - offsetEnd < EPSILON)) { // the split includes almost the whole block, fill it. offsetEnd = lastBlockOffsetEnd; splitBytes = offsetEnd - offsetBegin; } RawComparable key = reader.getKeyNear(offsetEnd); if (key == null) { offsetEnd = fileLen; splitBytes = offsetEnd - offsetBegin; if (i < paths.length-1) { nextFsdis = fs.open(paths[i+1]); nextReader = new TFile.Reader(nextFsdis, tfileStatus[i+1].getLen(), conf); key = nextReader.getFirstKey(); } done = true; // TFile index too large? Is it necessary now? } remainLen -= splitBytes; batchSize += splitBytes; if (key != null && batchSize >= stepSize) { if (batchSize - splitBytes < EPSILON || splitBytes < EPSILON) { // the last chunk or this chunk is small enough to create a new range for this key setBlockDistribution(bd, reader, locations, fstatus, startOffsets, prevKey, key); ret.add(key, bd); batchSize = 0; bd = new BlockDistribution(); } else { ret.add(prevKey, bd); batchSize = splitBytes; bd = new BlockDistribution(); if (batchSize >= stepSize) { setBlockDistribution(bd, reader, locations, fstatus, startOffsets, prevKey, key); ret.add(key, bd); batchSize = 0; bd = new BlockDistribution(); } else { setBlockDistribution(bd, reader, locations, fstatus, startOffsets, prevKey, key); } } } else { setBlockDistribution(bd, reader, locations, fstatus, startOffsets, prevKey, key); } prevKey = key; } } finally { if (reader != null) { try { reader.close(); } catch (Exception e) { // no-op; } } if (fsdis != null) { try { fsdis.close(); } catch (Exception e) { // no-op } } } } if (lastBd != null) lastBd.add(bd); ret.setMinStepSize(minStepSize); return ret; } private void setBlockDistribution(BlockDistribution bd, TFile.Reader reader, BlockLocation[] locations, FileStatus fileStatus, long[] startOffsets, RawComparable begin, RawComparable end) throws IOException { long beginOffset, endOffset = -1; if (begin == null) beginOffset = 0; else beginOffset = reader.getOffsetForKey(begin); if (end != null) { if (begin == null) begin = reader.getFirstKey(); /* Only if the key range is empty. This is needed because TFile has a 16-byte * Magic that causes getOffsetForKey to return 16 (not 0) even on the first key. */ if (comparator.compare(begin, end) != 0) endOffset = reader.getOffsetForKey(end); } int startBlockIndex = (beginOffset == 0 ? 0 : getStartBlockIndex(startOffsets, beginOffset)); BlockLocation l; int endBlockIndex = (end == null ? locations.length : endOffset == -1 ? startBlockIndex : getEndBlockIndex(startOffsets, endOffset)); for (int ii = startBlockIndex; ii < endBlockIndex; ii++) { l = locations[ii]; long blkBeginOffset = l.getOffset(); long blkEndOffset = blkBeginOffset + l.getLength(); if (blkEndOffset > blkBeginOffset) { bd.add(l, blkEndOffset - blkBeginOffset); } } return; } /** * Get the status of the ColumnGroup. */ public BasicTableStatus getStatus() throws IOException { if (cgindex == null) cgindex = buildIndex(fs, path, dirty, conf); return cgindex.status; } /** * Split the ColumnGroup by file orders. * * @param n * Targeted number of partitions. * @return A list of range-based splits, whose size may be less than or * equal to n. 
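     * <p>
     * For example (a sketch): with 10 files and {@code n == 3}, the returned
     * splits cover file indices [0, 3), [3, 6), and [6, 10), since each split
     * boundary is computed as {@code endIndex = (i + 1) * numFiles / n}.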
*/ public List<CGRangeSplit> rangeSplit(int n) throws IOException { // The output of this method must be only dependent on the cgindex and // input parameter n - so that horizontally stitched column groups will // get aligned splits. if (cgindex == null) cgindex = buildIndex(fs, path, dirty, conf); int numFiles = cgindex.size(); if ((numFiles < n) || (n < 0)) { return rangeSplit(numFiles); } List<CGRangeSplit> lst = new ArrayList<CGRangeSplit>(); int beginIndex = 0; for (int i = 0; i < n; ++i) { int endIndex = (int) ((long) (i + 1) * numFiles / n); lst.add(new CGRangeSplit(beginIndex, endIndex - beginIndex)); beginIndex = endIndex; } return lst; } /** * We already use FileInputFormat to create byte offset-based input splits. * Their information is encoded in starts, lengths and paths. This method is * to wrap this information to form CGRowSplit objects at column group level. * * @param starts array of starting byte of fileSplits. * @param lengths array of length of fileSplits. * @param paths array of path of fileSplits. * @return A list of CGRowSplit objects. * */ public List<CGRowSplit> rowSplit(long[] starts, long[] lengths, Path[] paths, int[] batches, int numBatches) throws IOException { List<CGRowSplit> lst = new ArrayList<CGRowSplit>(); CGRowSplit cgRowSplit; long startFirst, bytesFirst, bytesLast; int length; if (numBatches == 0) { cgRowSplit = new CGRowSplit(null, null, 0, -1, 0, 0); lst.add(cgRowSplit); return lst; } if (cgindex == null) cgindex = buildIndex(fs, this.path, dirty, conf); if (cgindex.size() == 0) { cgRowSplit = new CGRowSplit(null, null, 0, -1, 0, 0); lst.add(cgRowSplit); return lst; } for (int i=0; i< numBatches; i++) { int indexFirst = batches[i]; int indexLast = batches[i+1] - 1; startFirst = starts[indexFirst]; bytesFirst = lengths[indexFirst]; bytesLast = lengths[indexLast]; length = batches[i+1] - batches[i]; String[] namesInSplit = new String[length]; long[] sizesInSplit = new long[length]; for (int j = 0; j < length; j++) { namesInSplit[j] = paths[indexFirst+j].getName(); sizesInSplit[j] = cgindex.get(cgindex.getFileIndex(paths[indexFirst+j])).bytes; } cgRowSplit = new CGRowSplit(namesInSplit, sizesInSplit, length, startFirst, bytesFirst, bytesLast); lst.add(cgRowSplit); } return lst; } void rearrangeFileIndices(FileStatus[] fileStatus) throws IOException { int size = fileStatus.length; FileStatus[] result = new FileStatus[size]; if (cgindex == null) cgindex = buildIndex(fs, path, dirty, conf); if (size < cgindex.size()) throw new AssertionError("Incorrect file list size"); for (int j, i = 0; i < size; i++) { j = cgindex.getFileIndex(fileStatus[i].getPath()); if (j != -1) result[j] = fileStatus[i]; } for (int i = 0; i < size; i++) fileStatus[i] = result[i]; } /** * Is the ColumnGroup sorted? * * @return Whether the ColumnGroup is sorted. */ public boolean isSorted() { return cgschema.isSorted(); } @Override public void close() throws IOException { if (!closed) { closed = true; } } /** * A simple wrapper class over TFile.Reader.Scanner to simplify the creation * and resource management. 
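     * <p>
     * The wrapper owns the input stream, the TFile reader, and the scanner
     * it creates, and releases all three in {@link #close()}.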
*/ static class TFileScanner implements Closeable { boolean closed = true; FSDataInputStream ins; TFile.Reader reader; TFile.Reader.Scanner scanner; TupleReader tupleReader; TFileScanner(FileSystem fs, Path path, CGRowSplit rowRange, RawComparable begin, RawComparable end, boolean first, boolean last, CGSchema cgschema, Projection projection, Configuration conf) throws IOException, ParseException { try { ins = fs.open(path); /* * compressor is inside cgschema */ reader = new TFile.Reader(ins, fs.getFileStatus(path).getLen(), conf); if (rowRange != null && rowRange.startByteFirst != -1) { if (first && rowRange.startByteFirst != -1) scanner = reader.createScannerByRecordNum(rowRange.startRowFirst, rowRange.startRowFirst + rowRange.numRowsFirst); else if (last && rowRange.numBytesLast != -1) scanner = reader.createScannerByRecordNum(0, rowRange.numRowsLast); else scanner = reader.createScanner(); } else { /* TODO: more investigation is needed for the following. * using deprecated API just so that zebra can work with * hadoop jar that does not contain HADOOP-6218 (Record ids for * TFile). This is expected to be temporary. Later we should * use the undeprecated API. */ scanner = reader.createScanner(begin, end); } /* * serializer is inside cgschema: different serializer will require * different Reader: for pig, it's TupleReader */ tupleReader = new TupleReader(cgschema.getSchema(), projection); closed = false; } finally { if (closed == true) { // failed to instantiate the object. if (scanner != null) { try { scanner.close(); } catch (Exception e) { // no-op } } if (reader != null) { try { reader.close(); } catch (Exception e) { // no op } } if (ins != null) { try { ins.close(); } catch (Exception e) { // no op } } } } } void rewind() throws IOException { scanner.rewind(); } void getKey(BytesWritable key) throws IOException { scanner.entry().getKey(key); } void getValue(Tuple val) throws IOException, ParseException { DataInputStream dis = scanner.entry().getValueStream(); try { tupleReader.get(dis, val); } finally { dis.close(); } } boolean seekTo(BytesWritable key) throws IOException { return scanner.seekTo(key.getBytes(), 0, key.getLength()); } boolean advance() throws IOException { return scanner.advance(); } boolean atEnd() { return scanner.atEnd(); } void seekToEnd() throws IOException { scanner.seekToEnd(); } @Override public void close() throws IOException { if (!closed) { closed = true; try { scanner.close(); } catch (Exception e) { // no op } try { reader.close(); } catch (Exception e) { // no op } try { ins.close(); } catch (Exception e) { // no op } } } } /** * ColumnGroup scanner */ class CGScanner implements TableScanner { private Projection logicalSchema = null; private TFileScannerInfo[] scanners; private boolean closeReader; private int beginIndex, endIndex; private int current; // current scanner private boolean scannerClosed = true; private CGRowSplit rowRange; private TFileScanner scanner; private class TFileScannerInfo { boolean first, last; Path path; RawComparable begin, end; TFileScannerInfo(boolean first, boolean last, Path path, RawComparable begin, RawComparable end) { this.first = first; this.last = last; this.begin = begin; this.end = end; this.path = path; } TFileScanner getTFileScanner() throws IOException { try { return new TFileScanner(fs, path, rowRange, begin, end, first, last, cgschema, logicalSchema, conf); } catch (ParseException e) { throw new IOException(e.getMessage()); } } } CGScanner(CGRangeSplit split, boolean closeReader) throws IOException, 
ParseException { if (cgindex== null) cgindex = buildIndex(fs, path, dirty, conf); if (split == null) { beginIndex = 0; endIndex = cgindex.size(); } else { beginIndex = split.start; endIndex = split.start + split.len; } init(null, null, null, closeReader); } /** * Scanner for a range specified by the given row range. * * @param rowRange see {@link CGRowSplit} * @param closeReader */ CGScanner(CGRowSplit rowRange, boolean closeReader) throws IOException, ParseException { beginIndex = 0; endIndex = rowRange.length; init(rowRange, null, null, closeReader); } CGScanner(RawComparable beginKey, RawComparable endKey, boolean closeReader) throws IOException, ParseException { beginIndex = 0; endIndex = cgindex.size(); if (beginKey != null) { beginIndex = cgindex.lowerBound(beginKey, comparator); } if (endKey != null) { endIndex = cgindex.lowerBound(endKey, comparator); if (endIndex < cgindex.size()) { ++endIndex; } } init(null, beginKey, endKey, closeReader); } private void init(CGRowSplit rowRange, RawComparable beginKey, RawComparable endKey, boolean doClose) throws IOException, ParseException { this.rowRange = rowRange; if (beginIndex > endIndex) { throw new IllegalArgumentException("beginIndex > endIndex"); } logicalSchema = ColumnGroup.Reader.this.getProjection(); List<TFileScannerInfo> tmpScanners = new ArrayList<TFileScannerInfo>(endIndex - beginIndex); try { boolean first, last, realFirst = true; Path myPath; for (int i = beginIndex; i < endIndex; ++i) { first = (i == beginIndex); last = (i == endIndex -1); RawComparable begin = first ? beginKey : null; RawComparable end = last ? endKey : null; TFileScannerInfo scanner; if (rowRange == null) myPath = cgindex.getPath(i, path); else myPath = new Path(path, rowRange.names[i]); scanner = new TFileScannerInfo(first, last, myPath, begin, end); if (realFirst) { this.scanner = scanner.getTFileScanner(); if (this.scanner.atEnd()) { this.scanner.close(); this.scanner = null; } else { realFirst = false; tmpScanners.add(scanner); } } else { TFileScanner myScanner = scanner.getTFileScanner(); if (!myScanner.atEnd()) tmpScanners.add(scanner); myScanner.close(); } } scanners = tmpScanners.toArray(new TFileScannerInfo[tmpScanners.size()]); this.closeReader = doClose; scannerClosed = false; } finally { if (scannerClosed) { // failed to initialize the object. 
            if (scanner != null) scanner.close();
          }
        }
      }

      @Override
      public void getKey(BytesWritable key) throws IOException {
        if (atEnd()) {
          throw new EOFException("No more key-value to read");
        }
        scanner.getKey(key);
      }

      @Override
      public void getValue(Tuple row) throws IOException {
        if (atEnd()) {
          throw new EOFException("No more key-value to read");
        }
        try {
          scanner.getValue(row);
        } catch (ParseException e) {
          throw new IOException("Invalid Projection: " + e.getMessage());
        }
      }

      public void getCGKey(BytesWritable key) throws IOException {
        scanner.getKey(key);
      }

      public void getCGValue(Tuple row) throws IOException {
        try {
          scanner.getValue(row);
        } catch (ParseException e) {
          throw new IOException("Invalid Projection: " + e.getMessage());
        }
      }

      @Override
      public String getProjection() {
        return logicalSchema.toString();
      }

      public Schema getSchema() {
        return logicalSchema.getSchema();
      }

      @Override
      public boolean advance() throws IOException {
        if (atEnd()) {
          return false;
        }
        scanner.advance();
        while (true) {
          if (scanner.atEnd()) {
            scanner.close();
            scanner = null;
            ++current;
            if (!atEnd()) {
              scanner = scanners[current].getTFileScanner();
            } else {
              return false;
            }
          } else {
            return true;
          }
        }
      }

      public boolean advanceCG() throws IOException {
        scanner.advance();
        while (true) {
          if (scanner.atEnd()) {
            scanner.close();
            scanner = null;
            ++current;
            if (!atEnd()) {
              scanner = scanners[current].getTFileScanner();
            } else {
              return false;
            }
          } else {
            return true;
          }
        }
      }

      @Override
      public boolean atEnd() throws IOException {
        return (current >= scanners.length);
      }

      @Override
      public boolean seekTo(BytesWritable key) throws IOException {
        if (!isSorted()) {
          throw new IOException("Cannot seek in unsorted Column Group");
        }
        if (atEnd()) {
          return false;
        }
        int index =
            cgindex.lowerBound(new ByteArray(key.getBytes(), 0, key.getLength()),
                comparator);
        if (index >= endIndex) {
          seekToEnd();
          return false;
        }
        if (index < beginIndex) {
          // move to the beginning
          index = beginIndex;
        }
        int prevCurrent = current;
        current = index - beginIndex;
        if (current != prevCurrent) {
          if (scanner != null) {
            try {
              scanner.close();
            } catch (Exception e) {
              // no-op
            }
          }
          scanner = scanners[current].getTFileScanner();
        }
        return scanner.seekTo(key);
      }

      @Override
      public void seekToEnd() throws IOException {
        if (scanner != null) {
          try {
            scanner.close();
          } catch (Exception e) {
            // no-op
          }
        }
        scanner = null;
        current = scanners.length;
      }

      @Override
      public void close() throws IOException {
        if (!scannerClosed) {
          scannerClosed = true;
          if (scanner != null) {
            try {
              scanner.close();
              scanner = null;
            } catch (Exception e) {
              // no-op
            }
          }
          if (closeReader) {
            Reader.this.close();
          }
        }
      }
    }

    public static class CGRangeSplit implements Writable {
      int start = 0; // starting index in the list
      int len = 0;

      CGRangeSplit(int start, int len) {
        this.start = start;
        this.len = len;
      }

      public CGRangeSplit() {
        // no-op;
      }

      @Override
      public void readFields(DataInput in) throws IOException {
        start = Utils.readVInt(in);
        len = Utils.readVInt(in);
      }

      @Override
      public void write(DataOutput out) throws IOException {
        Utils.writeVInt(out, start);
        Utils.writeVInt(out, len);
      }
    }

    public static class CGRowSplit implements Writable {
      int length; // number of files in the batch
      long startByteFirst = -1;
      long numBytesFirst;
      long startRowFirst = -1;
      long numRowsFirst = -1;
      long numBytesLast = -1;
      long numRowsLast = -1;
      String[] names;
      long[] sizes = null;

      CGRowSplit(String[] names, long[] sizes, int length, long startFirst,
          long bytesFirst, long bytesLast) throws IOException {
        this.names = names;
        this.sizes = sizes;
        this.length = length;
        if (startFirst != -1) {
startByteFirst = startFirst; numBytesFirst = bytesFirst; } if (bytesLast != -1 && this.length > 1) { numBytesLast = bytesLast; } } public CGRowSplit() { // no-op; } @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append("{length = " + length + "}\n"); for (int i = 0; i < length; i++) { sb.append("{name = " + names[i] + "}\n"); sb.append("{size = " + sizes[i] + "}\n"); } sb.append("{startByteFirst = " + startByteFirst + "}\n"); sb.append("{numBytesFirst = " + numBytesFirst + "}\n"); sb.append("{startRowFirst = " + startRowFirst + "}\n"); sb.append("{numRowsFirst = " + numRowsFirst + "}\n"); sb.append("{numBytesLast = " + numBytesLast + "}\n"); sb.append("{numRowsLast = " + numRowsLast + "}\n"); return sb.toString(); } @Override public void readFields(DataInput in) throws IOException { length = Utils.readVInt(in); if (length > 0) { names = new String[length]; sizes = new long[length]; } for (int i = 0; i < length; i++) { names[i] = Utils.readString(in); sizes[i] = Utils.readVLong(in); } startByteFirst = Utils.readVLong(in); numBytesFirst = Utils.readVLong(in); startRowFirst = Utils.readVLong(in); numRowsFirst = Utils.readVLong(in); numBytesLast = Utils.readVLong(in); numRowsLast = Utils.readVLong(in); } @Override public void write(DataOutput out) throws IOException { Utils.writeVInt(out, length); for (int i = 0; i < length; i++) { Utils.writeString(out, names[i]); Utils.writeVLong(out, sizes[i]); } Utils.writeVLong(out, startByteFirst); Utils.writeVLong(out, numBytesFirst); Utils.writeVLong(out, startRowFirst); Utils.writeVLong(out, numRowsFirst); Utils.writeVLong(out, numBytesLast); Utils.writeVLong(out, numRowsLast); } } private static class SplitColumn { SplitColumn(Partition.SplitType st) { this.st = st; } SplitColumn(int fieldIndex, Partition.SplitType st) { this.fieldIndex = fieldIndex; this.st = st; } SplitColumn(int fieldIndex, String key, Partition.SplitType st) { this.fieldIndex = fieldIndex; this.key = key; this.st = st; } SplitColumn(int fieldIndex, int projIndex, SplitColumn leaf, String key, Partition.SplitType st) { this(fieldIndex, key, st); this.projIndex = projIndex; } int fieldIndex = -1; // field index to parent int projIndex = -1; // index in projection: only used by leaves SplitColumn leaf = null; String key = null; // MAP key to parent ArrayList<SplitColumn> children = null; int index = -1; // index in the logical schema Object field = null; Partition.SplitType st = Partition.SplitType.NONE; void dispatch(Object field) { this.field = field; } @SuppressWarnings("unchecked") void split() throws ExecException { int size = children.size(); if (st == Partition.SplitType.RECORD) { for (int i = 0; i < size; i++) { if (children.get(i).projIndex != -1) // a leaf: set projection // directly ((Tuple) (leaf.field)).set(projIndex, ((Tuple) field).get(children .get(i).fieldIndex)); else children.get(i).field = ((Tuple) field).get(children.get(i).fieldIndex); } } else if (st == Partition.SplitType.MAP) { for (int i = 0; i < size; i++) { if (children.get(i).projIndex != -1) // a leaf: set projection // directly ((Tuple) (leaf.field)).set(projIndex, ((Map<String, Object>) field) .get(children.get(i).key)); else children.get(i).field = ((Map<String, Object>) field).get(children.get(i).key); } } } void addChild(SplitColumn child) { if (children == null) children = new ArrayList<SplitColumn>(); children.add(child); } } } /** * Column Group writer. 
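   * <p>
   * A typical write sequence, as a hedged sketch (the path, schema, and
   * names below are hypothetical; exception handling is omitted):
   *
   * <pre>{@code
   * Configuration conf = new Configuration();
   * ColumnGroup.Writer writer = new ColumnGroup.Writer(new Path("/table/cg0"),
   *     "FirstName, LastName", false, "cg0", "pig", "gz", null, null,
   *     (short) -1, false, conf);
   * TableInserter inserter = writer.getInserter("part-00000", true);
   * // ... inserter.insert(key, row) for each row ...
   * inserter.close(); // renames the temp file; also finishes the writer
   * writer.close();   // writes the .meta index file
   * }</pre>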
   */
  public static class Writer implements Closeable {
    Path path;
    Path finalOutputPath;
    Configuration conf;
    FileSystem fs;
    CGSchema cgschema;
    private boolean finished, closed;
    CGIndex index;

    /**
     * Create a ColumnGroup writer. The semantics are as follows:
     * <ol>
     * <li>If path does not exist:
     * <ul>
     * <li>create the path directory
     * <li>write out the meta data file.
     * </ul>
     * <li>If path exists and the directory is empty: write out the meta data
     * file.
     * <li>If path exists and contains what looks like a complete Column
     * Group, a ColumnGroupExists exception will be thrown.
     * <li>If path exists and overwrite is true, remove all files under the
     * directory and resume as in Step 2.
     * <li>If path exists, the directory is not empty, and overwrite == false,
     * ColumnGroupExists will be thrown.
     * </ol>
     * This constructor never removes a valid/complete ColumnGroup.
     *
     * @param path
     *          The path to the Column Group, either non-existent or a
     *          directory.
     * @param schema
     *          The schema of the ColumnGroup. For this version of the
     *          implementation, the schema of a table is a comma-separated
     *          list of column names, such as
     *          "FirstName, LastName, Sex, Department".
     * @param sorted
     *          Whether the column group to be created is sorted or not. If
     *          set to true, the rows inserted by every inserter created from
     *          this Writer must be sorted. Additionally, there must exist an
     *          ordering of the inserters Ins-1, Ins-2, ... such that the rows
     *          created by Ins-1, followed by rows created by Ins-2, ... form
     *          a total order.
     * @param overwrite
     *          Should we overwrite the path if it already exists?
     * @param conf
     *          The optional configuration object.
     * @throws IOException
     */
    public Writer(Path path, String schema, boolean sorted, String name,
        String serializer, String compressor, String owner, String group,
        short perm, boolean overwrite, Configuration conf) throws IOException,
        ParseException {
      this(path, new Schema(schema), sorted, null, name, serializer,
          compressor, owner, group, perm, overwrite, conf);
    }

    public Writer(Path path, Schema schema, boolean sorted, String name,
        String serializer, String compressor, String owner, String group,
        short perm, boolean overwrite, Configuration conf) throws IOException,
        ParseException {
      this(path, schema, sorted, null, name, serializer, compressor, owner,
          group, perm, overwrite, conf);
    }

    public Writer(Path path, String schema, boolean sorted, String comparator,
        String name, String serializer, String compressor, String owner,
        String group, short perm, boolean overwrite, Configuration conf)
        throws IOException, ParseException {
      this(path, new Schema(schema), sorted, comparator, name, serializer,
          compressor, owner, group, perm, overwrite, conf);
    }

    public Writer(Path path, Schema schema, boolean sorted, String comparator,
        String name, String serializer, String compressor, String owner,
        String group, short perm, boolean overwrite, Configuration conf)
        throws IOException, ParseException {
      this.path = path;
      this.conf = conf;
      this.finalOutputPath = path;
      fs = path.getFileSystem(conf);
      // If the meta file already exists, the ColumnGroup is complete and
      // valid; we will not proceed.
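      // Note: inserters write ".tmp.<name>.XXXXXXXX" files and rename them
      // to their final names on close (see CGInserter.close()); the
      // "_temporary/<cg-name>" directory created below serves as the work
      // path for the reopen-style Writer(finalPath, workPath, ...) usage.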
      checkMetaFile(path);

      // if overwriting, remove everything
      if (overwrite) {
        fs.delete(path, true);
      }

      // create the final output path and a temporary output path
      checkPath(path, true);
      Path parent = path.getParent();
      Path tmpPath1 = new Path(parent, "_temporary");
      Path tmpPath2 = new Path(tmpPath1, name);
      checkPath(tmpPath2, true);

      cgschema = new CGSchema(schema, sorted, comparator, name, serializer,
          compressor, owner, group, perm);
      CGSchema sfNew = CGSchema.load(fs, path);
      if (sfNew != null) {
        // sanity check - compare the input with the on-disk schema.
        if (!sfNew.equals(cgschema)) {
          throw new IOException("Schema passed in is different from the one on disk");
        }
      } else {
        // create the schema file in the FS
        cgschema.create(fs, path);
      }
    }

    /**
     * Reopen an already created ColumnGroup for writing. It accepts a
     * temporary path for the column group where a CGInserter can write.
     * RuntimeException will be thrown if the table is already closed, or if
     * createMetaBlock() is called by some other process.
     */
    public Writer(Path finalPath, Path workPath, Configuration conf)
        throws IOException, ParseException {
      this.path = workPath;
      finalOutputPath = finalPath;
      this.conf = conf;
      fs = path.getFileSystem(conf);
      checkPath(finalOutputPath, false);
      checkPath(path, true);
      checkMetaFile(finalOutputPath);
      cgschema = CGSchema.load(fs, finalOutputPath);
    }

    /*
     * Reopen an already created ColumnGroup for writing. It takes a CGSchema
     * to set its own cgschema instead of going to disk to fetch this
     * information.
     */
    public Writer(Path finalPath, Path workPath, CGSchema cgschema,
        Configuration conf) throws IOException, ParseException {
      this.path = workPath;
      finalOutputPath = finalPath;
      this.conf = conf;
      fs = path.getFileSystem(conf);
      this.cgschema = cgschema;
    }

    /**
     * Reopen an already created ColumnGroup for writing. RuntimeException
     * will be thrown if the table is already closed, or if createMetaBlock()
     * is called by some other process.
     */
    public Writer(Path path, Configuration conf) throws IOException,
        ParseException {
      this.path = path;
      finalOutputPath = path;
      this.conf = conf;
      fs = path.getFileSystem(conf);
      checkPath(path, false);
      checkMetaFile(path);
      // read the schema file
      cgschema = CGSchema.load(fs, path);
    }

    /**
     * Release resources used by the object. Unlike close(), finish() does
     * not make the table immutable. However, if a user has already added
     * some meta data into the CG, then finish() would close the column
     * group.
     */
    public void finish() {
      if (!finished) {
        finished = true;
      }
    }

    @Override
    public void close() throws IOException {
      if (!finished) {
        finish();
      }
      if (!closed) {
        closed = true;
        createIndex();
      }
    }

    public Schema getSchema() {
      return cgschema.getSchema();
    }

    /**
     * Get an inserter with a given name.
     *
     * @param name
     *          the name of the inserter.
     * @param finishWriter
     *          finish the underlying Writer object upon the close of the
     *          Inserter. Should be set to true if there is only one inserter
     *          operating on the table, so that finish() is called after the
     *          Inserter is closed.
     *
     * @return A table inserter object.
     * @throws IOException
     */
    public TableInserter getInserter(String name, boolean finishWriter)
        throws IOException {
      return getInserter(name, finishWriter, true);
    }

    /**
     * Get an inserter with a given name.
     *
     * @param name
     *          the name of the inserter.
     * @param finishWriter
     *          finish the underlying Writer object upon the close of the
     *          Inserter. Should be set to true if there is only one inserter
     *          operating on the table, so that finish() is called after the
     *          Inserter is closed.
     * @param checkType
     *          whether or not to do a type check.
     *
     * @return A table inserter object.
     * @throws IOException
     */
    public TableInserter getInserter(String name, boolean finishWriter,
        boolean checkType) throws IOException {
      if (finished) {
        throw new IOException("ColumnGroup has been closed for insertion.");
      }
      return new CGInserter(name, finishWriter, checkType);
    }

    private void createIndex() throws IOException {
      MetaFile.Writer metaFile =
          MetaFile.createWriter(makeMetaFilePath(finalOutputPath), conf);
      index = buildIndex(fs, finalOutputPath, false, conf);
      DataOutputStream dos = metaFile.createMetaBlock(BLOCK_NAME_INDEX);
      try {
        index.write(dos);
      } finally {
        dos.close();
      }
      metaFile.close();
    }

    private void checkPath(Path p, boolean createNew) throws IOException {
      // check existence of the path
      if (!fs.exists(p)) {
        if (createNew) {
          fs.mkdirs(p);
        } else {
          throw new IOException("Path doesn't exist for appending: " + p);
        }
      }
      if (!fs.getFileStatus(p).isDir()) {
        throw new IOException("Path exists but not a directory: " + p);
      }
    }

    private void checkMetaFile(Path p) throws IOException {
      Path pathMeta = new Path(p, META_FILE);
      if (fs.exists(pathMeta)) {
        throw new IOException("Index meta file already exists: " + pathMeta);
      }
    }

    /**
     * Inserter for ColumnGroup
     */
    class CGInserter implements TableInserter {
      String name;
      String tmpName;
      boolean finishWriter;
      FSDataOutputStream out;
      TFile.Writer tfileWriter;
      TupleWriter tupleWriter;
      boolean closed = true;
      boolean checkType = true;

      private void createTempFile() throws IOException {
        int maxTrial = 10;
        String prefix = ".tmp." + name + ".";
        Random random = new Random();
        while (true) {
          /*
           * Try to set a real random seed by throwing all the runtime
           * ingredients into it.
           */
          random.setSeed(System.nanoTime() * Thread.currentThread().getId()
              * Runtime.getRuntime().freeMemory());
          try {
            tmpName = prefix + String.format("%08X", random.nextInt());
            Path tmpPath = new Path(path, tmpName);
            fs.mkdirs(path);
            if (cgschema.getOwner() != null || cgschema.getGroup() != null) {
              fs.setOwner(path, cgschema.getOwner(), cgschema.getGroup());
            }
            FsPermission permission = null;
            if (cgschema.getPerm() != -1) {
              permission = new FsPermission((short) cgschema.getPerm());
              fs.setPermission(path, permission);
            }
            out = fs.create(tmpPath, false);
            if (cgschema.getOwner() != null || cgschema.getGroup() != null) {
              fs.setOwner(tmpPath, cgschema.getOwner(), cgschema.getGroup());
            }
            if (cgschema.getPerm() != -1) {
              fs.setPermission(tmpPath, permission);
            }
            return;
          } catch (IOException e) {
            --maxTrial;
            if (maxTrial == 0) {
              throw e;
            }
            Thread.yield();
          }
        }
      }

      CGInserter(String name, boolean finishWriter, boolean checkType)
          throws IOException {
        this.name = name;
        this.finishWriter = finishWriter;
        this.tupleWriter = new TupleWriter(getSchema());
        this.checkType = checkType;
        try {
          createTempFile();
          tfileWriter = new TFile.Writer(out, getMinBlockSize(conf),
              cgschema.getCompressor(), cgschema.getComparator(), conf);
          closed = false;
        } finally {
          if (closed) {
            // failed to instantiate the object.
            if (tfileWriter != null) {
              try {
                tfileWriter.close();
              } catch (Exception e) {
                // no-op
              }
            }
            if (out != null) {
              try {
                out.close();
              } catch (Exception e) {
                // no-op
              }
            }
            if (tmpName != null) {
              try {
                fs.delete(new Path(path, tmpName), false);
              } catch (Exception e) {
                // no-op
              }
            }
          }
        }
      }

      @Override
      public Schema getSchema() {
        return ColumnGroup.Writer.this.getSchema();
      }

      @Override
      public void insert(BytesWritable key, Tuple row) throws IOException {
        /*
         * If checkType is set to true, we check the first row only - this is
         * a sanity check preventing users from messing up the output schema;
         * if checkType is set to false, we do not
do any type check. */ if (checkType == true) { TypesUtils.checkCompatible(row, getSchema()); checkType = false; } DataOutputStream outKey = tfileWriter.prepareAppendKey(key.getLength()); try { outKey.write(key.getBytes(), 0, key.getLength()); } finally { outKey.close(); } DataOutputStream outValue = tfileWriter.prepareAppendValue(-1); try { tupleWriter.put(outValue, row); } finally { outValue.close(); } } @Override public void close() throws IOException { if (closed) { return; } closed = true; try { // TODO: add schema to each TFile as a meta block? tfileWriter.close(); tfileWriter = null; out.close(); out = null; // do renaming only if all the above is successful. fs.rename(new Path(path, tmpName), new Path(finalOutputPath, name)); /* if(cgschema.getOwner() != null || cgschema.getGroup() != null) { fs.setOwner(new Path(path, name), cgschema.getOwner(), cgschema.getGroup()); } FsPermission permission = null; if(cgschema.getPerm() != -1) { permission = new FsPermission((short) cgschema.getPerm()); fs.setPermission(path, permission); } */ tmpName = null; if (finishWriter) { finish(); } } finally { if (tfileWriter != null) { try { tfileWriter.close(); } catch (Exception e) { // no-op } } if (out != null) { try { out.close(); } catch (Exception e) { // no-op } } if (tmpName != null) { try { fs.delete(new Path(path, tmpName), false); } catch (Exception e) { // no-op } } if (finishWriter) { try { finish(); } catch (Exception e) { // no-op } } } } } } /** * name, first and last key (inclusive) of a data file */ static class CGIndexEntry implements RawComparable, Writable { int index; String name; long rows, bytes; RawComparable firstKey; RawComparable lastKey; // for reading public CGIndexEntry() { // no-op } // for writing public CGIndexEntry(String name, long rows, RawComparable firstKey, RawComparable lastKey) { this.name = name; this.rows = rows; this.firstKey = firstKey; this.lastKey = lastKey; } public int getIndex() { return index; } public String getName() { return name; } public long getRows() { return rows; } public RawComparable getFirstKey() { return firstKey; } public RawComparable getLastKey() { return lastKey; } void setIndex (int idx) { this.index = idx; } @Override public byte[] buffer() { return (lastKey != null) ? lastKey.buffer() : null; } @Override public int offset() { return (lastKey != null) ? lastKey.offset() : 0; } @Override public int size() { return (lastKey != null) ? 
lastKey.size() : 0; } @Override public void readFields(DataInput in) throws IOException { name = Utils.readString(in); rows = Utils.readVLong(in); if (rows == 0) { firstKey = null; lastKey = null; } else { int firstKeyLen = Utils.readVInt(in); byte[] firstKeyBuffer = new byte[firstKeyLen]; in.readFully(firstKeyBuffer); int lastKeyLen = Utils.readVInt(in); byte[] lastKeyBuffer = new byte[lastKeyLen]; in.readFully(lastKeyBuffer); firstKey = new ByteArray(firstKeyBuffer); lastKey = new ByteArray(lastKeyBuffer); } } @Override public void write(DataOutput out) throws IOException { Utils.writeString(out, name); Utils.writeVLong(out, rows); if (rows > 0) { if ((firstKey == null) && (lastKey == null)) { throw new IOException("In-memory only entry"); } Utils.writeVInt(out, firstKey.size()); out.write(firstKey.buffer(), firstKey.offset(), firstKey.size()); Utils.writeVInt(out, lastKey.size()); out.write(lastKey.buffer(), lastKey.offset(), lastKey.size()); } } } static class CGIndex implements Writable { boolean dirty = false; boolean sorted = true; BasicTableStatus status; ArrayList<CGIndexEntry> index; CGIndex() { status = new BasicTableStatus(); index = new ArrayList<CGIndexEntry>(); } int getFileIndex(Path path) throws IOException { String filename = path.getName(); if (index.isEmpty()) return -1; for (CGIndexEntry cgie : index) { if (cgie.getName().equals(filename)) { return cgie.getIndex(); } } return -1; } int size() { return index.size(); } CGIndexEntry get(int i) { return index.get(i); } List<CGIndexEntry> getIndex() { return index; } Path getPath(int i, Path parent) { return new Path(parent, index.get(i).getName()); } void sort(final Comparator<RawComparable> comparator) throws IOException { if (dirty && comparator != null) { throw new IOException("Cannot sort dirty index"); } if (comparator != null) { // sort by keys. For empty TFiles, they are always sorted before // non-empty TFiles, and they themselves are sorted by their names. Collections.sort(index, new Comparator<CGIndexEntry>() { @Override public int compare(CGIndexEntry o1, CGIndexEntry o2) { if ((o1.getRows() == 0) && (o2.getRows() == 0)) { return o1.getName().compareTo(o2.getName()); } if (o1.getRows() == 0) return -1; if (o2.getRows() == 0) return 1; int cmprv = comparator.compare(o1.lastKey, o2.lastKey); if (cmprv == 0) { cmprv = comparator.compare(o1.firstKey, o2.firstKey); if (cmprv == 0) { cmprv = o1.getName().compareTo(o2.getName()); } } return cmprv; } }); for (int i = 0; i < index.size() - 1; ++i) { RawComparable prevLastKey = index.get(i).lastKey; RawComparable nextFirstKey = index.get(i + 1).firstKey; if (nextFirstKey == null) { continue; } if (comparator.compare(prevLastKey, nextFirstKey) > 0) { throw new IOException("Overlapping key ranges"); } } } else { // sort by name Collections.sort(index, new Comparator<CGIndexEntry>() { @Override public int compare(CGIndexEntry o1, CGIndexEntry o2) { return o1.name.compareTo(o2.name); } }); } // update status if ((!dirty) && (index.size() > 0)) { RawComparable keyFirst = index.get(0).getFirstKey(); status.beginKey = new BytesWritable(); status.beginKey.set(keyFirst.buffer(), keyFirst.offset(), keyFirst .size()); RawComparable keyLast = index.get(index.size() - 1).getLastKey(); status.endKey = new BytesWritable(); status.endKey.set(keyLast.buffer(), keyLast.offset(), keyLast.size()); } sorted = true; } // building full index. 
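    // The two add() variants below differ in how much is known per file:
    // add(bytes, rows, range) records an entry built from an opened TFile
    // (key range and row count known), while the dirty variant
    // add(bytes, name) records file-level information only.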
void add(long bytes, long rows, CGIndexEntry range) { status.size += bytes; status.rows += rows; index.add(range); sorted = false; range.bytes = bytes; } // building dirty index void add(long bytes, String name) { dirty = true; status.rows = -1; // reset rows to -1. status.size += bytes; CGIndexEntry next = new CGIndexEntry(); next.name = name; index.add(next); sorted = false; next.bytes = bytes; } int lowerBound(RawComparable key, final Comparator<RawComparable> comparator) throws IOException { if ((key == null) || (comparator == null)) { throw new IllegalArgumentException("CGIndex.lowerBound"); } if (!sorted) { sort(comparator); } // Treat null keys as the least key. return Utils.lowerBound(index, key, new Comparator<RawComparable>() { @Override public int compare(RawComparable o1, RawComparable o2) { if ((o1.buffer() == null) && (o2.buffer() == null)) { return 0; } if (o1.buffer() == null) return -1; if (o2.buffer() == null) return 1; return comparator.compare(o1, o2); } }); } @Override public void readFields(DataInput in) throws IOException { int n = Utils.readVInt(in); index.clear(); index.ensureCapacity(n); for (int i = 0; i < n; ++i) { CGIndexEntry range = new CGIndexEntry(); range.readFields(in); range.setIndex(i); index.add(range); } status.readFields(in); dirty = false; sorted = true; } @Override public void write(DataOutput out) throws IOException { if (dirty) { throw new IOException("Cannot write dirty index"); } if (!sorted) { throw new IOException("Please sort index before calling write"); } Utils.writeVInt(out, index.size()); for (int i = 0; i < index.size(); ++i) { index.get(i).write(out); } status.write(out); } } public static class CGPathFilter implements PathFilter { private static Configuration conf; public static void setConf(Configuration c) { conf = c; } public boolean accept(Path p) { return p.getName().equals(META_FILE) || p.getName().equals(SCHEMA_FILE) || p.getName().startsWith(".tmp.") || p.getName().startsWith("_") || p.getName().startsWith("ttt") || p.getName().startsWith(getNonDataFilePrefix(conf)) ? false : true; } } /** * Dump information about CG. * * @param file * Path string of the CG * @param out * PrintStream to output the information. * @param conf * The configuration object. * @throws IOException */ static public void dumpInfo(String file, PrintStream out, Configuration conf) throws IOException, Exception { // final int maxKeySampleLen = 16; dumpInfo(new Path(file), out, conf); } static public void dumpInfo(Path path, PrintStream out, Configuration conf) throws IOException, Exception { dumpInfo(path, out, conf, 0); } static public void dumpInfo(Path path, PrintStream out, Configuration conf, int indent) throws IOException, Exception { // final int maxKeySampleLen = 16; IOutils.indent(out, indent); out.println(); IOutils.indent(out, indent); out.println("Column Group : " + path); ColumnGroup.Reader reader = new ColumnGroup.Reader(path, false, conf); try { LinkedHashMap<String, String> properties = new LinkedHashMap<String, String>(); IOutils.indent(out, indent); out.println("Name: " + reader.getName()); IOutils.indent(out, indent); out.println("Serializer: " + reader.getSerializer()); IOutils.indent(out, indent); out.println("Compressor: " + reader.getCompressor()); IOutils.indent(out, indent); out.println("Group: " + reader.getGroup()); IOutils.indent(out, indent); out.println("Perm: " + reader.getPerm()); properties.put("Schema", reader.getSchema().toString()); // Now output the properties table. 
      int maxKeyLength = 0;
      Set<Map.Entry<String, String>> entrySet = properties.entrySet();
      for (Iterator<Map.Entry<String, String>> it = entrySet.iterator(); it
          .hasNext();) {
        Map.Entry<String, String> e = it.next();
        if (e.getKey().length() > maxKeyLength) {
          maxKeyLength = e.getKey().length();
        }
      }
      for (Iterator<Map.Entry<String, String>> it = entrySet.iterator(); it
          .hasNext();) {
        Map.Entry<String, String> e = it.next();
        IOutils.indent(out, indent);
        out.printf("%s : %s\n", e.getKey(), e.getValue());
      }
      out.println("TFiles within the Column Group:");
      if (reader.cgindex == null)
        reader.cgindex = buildIndex(reader.fs, reader.path, reader.dirty, conf);
      for (CGIndexEntry entry : reader.cgindex.index) {
        IOutils.indent(out, indent);
        out.printf(" *Name : %s\n", entry.name);
        IOutils.indent(out, indent);
        out.printf("  Rows : %d\n", entry.rows);
        if (entry.firstKey != null) {
          IOutils.indent(out, indent);
          out.printf("  First Key : %s\n", headToString(entry.firstKey));
        }
        if (entry.lastKey != null) {
          IOutils.indent(out, indent);
          out.printf("  Last Key : %s\n", headToString(entry.lastKey));
        }
        // dump TFile info
        // Path pathTFile = new Path(path, entry.name);
        // TFile.dumpInfo(pathTFile.toString(), out, conf);
      }
    } finally {
      try {
        reader.close();
      } catch (Exception e) {
        // no-op
      }
    }
  }

  private static String headToString(RawComparable raw) {
    return new String(raw.buffer(), raw.offset(), raw.size() > 70 ? 70
        : raw.size());
  }

  /**
   * Dump the CG information.
   *
   * @param args
   *          A list of CG paths.
   */
  public static void main(String[] args) throws Exception {
    System.out.printf("ColumnGroup Dumper\n");
    if (args.length == 0) {
      System.out
          .println("Usage: java ... org.apache.hadoop.zebra.io.ColumnGroup cg-path [cg-path ...]");
      System.exit(0);
    }
    Configuration conf = new Configuration();
    for (String file : args) {
      try {
        dumpInfo(file, System.out, conf);
      } catch (IOException e) {
        e.printStackTrace(System.err);
      }
    }
  }
}
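/*
 * A minimal read-side usage sketch (illustrative only; the path and column
 * names are hypothetical, exception handling is omitted, and
 * TypesUtils.createTuple is assumed for row allocation):
 *
 *   Configuration conf = new Configuration();
 *   ColumnGroup.Reader reader =
 *       new ColumnGroup.Reader(new Path("/table/cg0"), conf);
 *   reader.setProjection("FirstName, LastName");
 *   TableScanner scanner = reader.getScanner(null, true);
 *   BytesWritable key = new BytesWritable();
 *   Tuple row = TypesUtils.createTuple(reader.getSchema());
 *   while (!scanner.atEnd()) {
 *     scanner.getKey(key);
 *     scanner.getValue(row);
 *     scanner.advance();
 *   }
 *   scanner.close(); // also closes the reader since closeReader == true
 */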