HadoopArchives.java example

Explorer
bigpetstore-master
- hadoop-1.2.1
  - src
- src
  - integration
    - java
      - org
        bigtop
        bigpetstore
        integration
        BigPetStoreHiveIT.java
        BigPetStoreMahoutIT.java
        BigPetStorePigIT.java
        ITUtils.java
  - main
    - java
      - org
        bigtop
        bigpetstore
        clustering
        BPSRecommnder.java
        MahoutClusterTransactionsByRegion.java
        contract
        PetStoreStatistics.java
        etl
        CrunchETL.java
        HiveViewCreator.java
        LineItem.java
        PigCSVCleaner.java
        generator
        BPSGenerator.java
        GeneratePetStoreTransactionsInputFormat.java
        PetStoreTransaction.java
        PetStoreTransactionInputSplit.java
        TransactionIteratorFactory.java
        util
        BigPetStoreConstants.java
        DeveloperTools.java
        NumericalIdUtils.java
        Pair.java
        PetStoreParseFunctions.java
        StringUtils.java
  - test
    - java
      - org
        bigtop
        bigpetstore
        docs
        TestDocs.java
        generator
        TestNumericalIdUtils.java
        TestPetStoreTransactionGeneratorJob.java
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.tools;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.TreeMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.HarFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileRecordReader;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobSubmissionFiles;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


/**
 * a archive creation utility.
 * This class provides methods that can be used 
 * to create hadoop archives. For understanding of 
 * Hadoop archives look at {@link HarFileSystem}.
 */
public class HadoopArchives implements Tool {
  public static final int VERSION = 3;
  private static final Log LOG = LogFactory.getLog(HadoopArchives.class);
  
  private static final String NAME = "har"; 
  static final String SRC_LIST_LABEL = NAME + ".src.list";
  static final String DST_DIR_LABEL = NAME + ".dest.path";
  static final String TMP_DIR_LABEL = NAME + ".tmp.dir";
  static final String JOB_DIR_LABEL = NAME + ".job.dir";
  static final String SRC_COUNT_LABEL = NAME + ".src.count";
  static final String TOTAL_SIZE_LABEL = NAME + ".total.size";
  static final String DST_HAR_LABEL = NAME + ".archive.name";
  static final String SRC_PARENT_LABEL = NAME + ".parent.path";
  /** the size of the blocks that will be created when archiving **/
  static final String HAR_BLOCKSIZE_LABEL = NAME + ".block.size";
  /**the size of the part files that will be created when archiving **/
  static final String HAR_PARTSIZE_LABEL = NAME + ".partfile.size";

  /** size of each part file size **/
  long partSize = 2 * 1024 * 1024 * 1024l;
  /** size of blocks in hadoop archives **/
  long blockSize = 512 * 1024 * 1024l;

  private static final String usage = "archive"
  + " -archiveName NAME -p <parent path> <src>* <dest>" +
  "\n";
  
 
  private JobConf conf;

  public void setConf(Configuration conf) {
    if (conf instanceof JobConf) {
      this.conf = (JobConf) conf;
    } else {
      this.conf = new JobConf(conf, HadoopArchives.class);
    }
  }

  public Configuration getConf() {
    return this.conf;
  }

  public HadoopArchives(Configuration conf) {
    setConf(conf);
  }

  // check the src paths
  private static void checkPaths(Configuration conf, List<Path> paths) throws
  IOException {
    for (Path p : paths) {
      FileSystem fs = p.getFileSystem(conf);
      if (!fs.exists(p)) {
        throw new FileNotFoundException("Source " + p + " does not exist.");
      }
    }
  }

  /**
   * this assumes that there are two types of files file/dir
   * @param fs the input filesystem
   * @param fdir the filestatusdir of the path  
   * @param out the list of paths output of recursive ls
   * @throws IOException
   */
  private void recursivels(FileSystem fs, FileStatusDir fdir, List<FileStatusDir> out) 
  throws IOException {
    if (!fdir.getFileStatus().isDir()) {
      out.add(fdir);
      return;
    }
    else {
      out.add(fdir);
      FileStatus[] listStatus = fs.listStatus(fdir.getFileStatus().getPath());
      fdir.setChildren(listStatus);
      for (FileStatus stat: listStatus) {
        FileStatusDir fstatDir = new FileStatusDir(stat, null);
        recursivels(fs, fstatDir, out);
      }
    }
  }

  /** HarEntry is used in the {@link HArchivesMapper} as the input value. */
  private static class HarEntry implements Writable {
    String path;
    String[] children;

    HarEntry() {}
    
    HarEntry(String path, String[] children) {
      this.path = path;
      this.children = children;
    }

    boolean isDir() {
      return children != null;      
    }

    @Override
    public void readFields(DataInput in) throws IOException {
      path = Text.readString(in);

      if (in.readBoolean()) {
        children = new String[in.readInt()];
        for(int i = 0; i < children.length; i++) {
          children[i] = Text.readString(in);
        }
      } else {
        children = null;
      }
    }

    @Override
    public void write(DataOutput out) throws IOException {
      Text.writeString(out, path);

      final boolean dir = isDir();
      out.writeBoolean(dir);
      if (dir) {
        out.writeInt(children.length);
        for(String c : children) {
          Text.writeString(out, c);
        }
      }
    }
  }

  /**
   * Input format of a hadoop archive job responsible for 
   * generating splits of the file list
   */
  static class HArchiveInputFormat implements InputFormat<LongWritable, HarEntry> {

    //generate input splits from the src file lists
    public InputSplit[] getSplits(JobConf jconf, int numSplits)
    throws IOException {
      String srcfilelist = jconf.get(SRC_LIST_LABEL, "");
      if ("".equals(srcfilelist)) {
          throw new IOException("Unable to get the " +
              "src file for archive generation.");
      }
      long totalSize = jconf.getLong(TOTAL_SIZE_LABEL, -1);
      if (totalSize == -1) {
        throw new IOException("Invalid size of files to archive");
      }
      //we should be safe since this is set by our own code
      Path src = new Path(srcfilelist);
      FileSystem fs = src.getFileSystem(jconf);
      FileStatus fstatus = fs.getFileStatus(src);
      ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
      LongWritable key = new LongWritable();
      final HarEntry value = new HarEntry();
      SequenceFile.Reader reader = null;
      // the remaining bytes in the file split
      long remaining = fstatus.getLen();
      // the count of sizes calculated till now
      long currentCount = 0L;
      // the endposition of the split
      long lastPos = 0L;
      // the start position of the split
      long startPos = 0L;
      long targetSize = totalSize/numSplits;
      // create splits of size target size so that all the maps 
      // have equals sized data to read and write to.
      try {
        reader = new SequenceFile.Reader(fs, src, jconf);
        while(reader.next(key, value)) {
          if (currentCount + key.get() > targetSize && currentCount != 0){
            long size = lastPos - startPos;
            splits.add(new FileSplit(src, startPos, size, (String[]) null));
            remaining = remaining - size;
            startPos = lastPos;
            currentCount = 0L;
          }
          currentCount += key.get();
          lastPos = reader.getPosition();
        }
        // the remaining not equal to the target size.
        if (remaining != 0) {
          splits.add(new FileSplit(src, startPos, remaining, (String[])null));
        }
      }
      finally { 
        reader.close();
      }
      return splits.toArray(new FileSplit[splits.size()]);
    }

    @Override
    public RecordReader<LongWritable, HarEntry> getRecordReader(InputSplit split,
        JobConf job, Reporter reporter) throws IOException {
      return new SequenceFileRecordReader<LongWritable, HarEntry>(job,
                 (FileSplit)split);
    }
  }

  private boolean checkValidName(String name) {
    Path tmp = new Path(name);
    if (tmp.depth() != 1) {
      return false;
    }
    if (name.endsWith(".har")) 
      return true;
    return false;
  }
  

  private Path largestDepth(List<Path> paths) {
    Path deepest = paths.get(0);
    for (Path p: paths) {
      if (p.depth() > deepest.depth()) {
        deepest = p;
      }
    }
    return deepest;
  }
  
  /**
   * truncate the prefix root from the full path
   * @param fullPath the full path
   * @param root the prefix root to be truncated
   * @return the relative path
   */
  private Path relPathToRoot(Path fullPath, Path root) {
    // just take some effort to do it 
    // rather than just using substring 
    // so that we do not break sometime later
    final Path justRoot = new Path(Path.SEPARATOR);
    if (fullPath.depth() == root.depth()) {
      return justRoot;
    }
    else if (fullPath.depth() > root.depth()) {
      Path retPath = new Path(fullPath.getName());
      Path parent = fullPath.getParent();
      for (int i=0; i < (fullPath.depth() - root.depth() -1); i++) {
        retPath = new Path(parent.getName(), retPath);
        parent = parent.getParent();
      }
      return new Path(justRoot, retPath);
    }
    return null;
  }

  /**
   * this method writes all the valid top level directories 
   * into the srcWriter for indexing. This method is a little
   * tricky. example- 
   * for an input with parent path /home/user/ and sources 
   * as /home/user/source/dir1, /home/user/source/dir2 - this 
   * will output <source, dir, dir1, dir2> (dir means that source is a dir
   * with dir1 and dir2 as children) and <source/dir1, file, null>
   * and <source/dir2, file, null>
   * @param srcWriter the sequence file writer to write the
   * directories to
   * @param paths the source paths provided by the user. They
   * are glob free and have full path (not relative paths)
   * @param parentPath the parent path that you wnat the archives
   * to be relative to. example - /home/user/dir1 can be archived with
   * parent as /home or /home/user.
   * @throws IOException
   */
  private void writeTopLevelDirs(SequenceFile.Writer srcWriter, 
      List<Path> paths, Path parentPath) throws IOException {
    //add all the directories 
    List<Path> justDirs = new ArrayList<Path>();
    for (Path p: paths) {
      if (!p.getFileSystem(getConf()).isFile(p)) {
        justDirs.add(new Path(p.toUri().getPath()));
      }
      else {
        justDirs.add(new Path(p.getParent().toUri().getPath()));
      }
    }
    /* find all the common parents of paths that are valid archive
     * paths. The below is done so that we do not add a common path
     * twice and also we need to only add valid child of a path that
     * are specified the user.
     */
    TreeMap<String, HashSet<String>> allpaths = new TreeMap<String, 
                                                HashSet<String>>();
    /* the largest depth of paths. the max number of times
     * we need to iterate
     */
    Path deepest = largestDepth(paths);
    Path root = new Path(Path.SEPARATOR);
    for (int i = parentPath.depth(); i < deepest.depth(); i++) {
      List<Path> parents = new ArrayList<Path>();
      for (Path p: justDirs) {
        if (p.compareTo(root) == 0){
          //do nothing
        }
        else {
          Path parent = p.getParent();
          if (null != parent) {
            if (allpaths.containsKey(parent.toString())) {
              HashSet<String> children = allpaths.get(parent.toString());
              children.add(p.getName());
            } 
            else {
              HashSet<String> children = new HashSet<String>();
              children.add(p.getName());
              allpaths.put(parent.toString(), children);
            }
            parents.add(parent);
          }
        }
      }
      justDirs = parents;
    }
    Set<Map.Entry<String, HashSet<String>>> keyVals = allpaths.entrySet();
    for (Map.Entry<String, HashSet<String>> entry : keyVals) {
      final Path relPath = relPathToRoot(new Path(entry.getKey()), parentPath);
      if (relPath != null) {
        final String[] children = new String[entry.getValue().size()];
        int i = 0;
        for(String child: entry.getValue()) {
          children[i++] = child;
        }
        append(srcWriter, 0L, relPath.toString(), children);
      }
    }
  }

  private void append(SequenceFile.Writer srcWriter, long len,
      String path, String[] children) throws IOException {
    srcWriter.append(new LongWritable(len), new HarEntry(path, children));
  }
    
  /**
   * A static class that keeps
   * track of status of a path 
   * and there children if path is a dir
   */
  static class FileStatusDir {
    private FileStatus fstatus;
    private FileStatus[] children = null;
    
    /**
     * constructor for filestatusdir
     * @param fstatus the filestatus object that maps to filestatusdir
     * @param children the children list if fs is a directory
     */
    FileStatusDir(FileStatus fstatus, FileStatus[] children) {
      this.fstatus  = fstatus;
      this.children = children;
    }
    
    /**
     * set children of this object
     * @param listStatus the list of children
     */
    public void setChildren(FileStatus[] listStatus) {
      this.children = listStatus;
    }

    /**
     * the filestatus of this object
     * @return the filestatus of this object
     */
    FileStatus getFileStatus() {
      return this.fstatus;
    }
    
    /**
     * the children list of this object, null if  
     * @return the children list
     */
    FileStatus[] getChildren() {
      return this.children;
    }
  }
  
  /**archive the given source paths into
   * the dest
   * @param parentPath the parent path of all the source paths
   * @param srcPaths the src paths to be archived
   * @param dest the dest dir that will contain the archive
   */
  void archive(Path parentPath, List<Path> srcPaths, 
      String archiveName, Path dest) throws IOException {
    checkPaths(conf, srcPaths);
    int numFiles = 0;
    long totalSize = 0;
    FileSystem fs = parentPath.getFileSystem(conf);
    this.blockSize = conf.getLong(HAR_BLOCKSIZE_LABEL, blockSize);
    this.partSize = conf.getLong(HAR_PARTSIZE_LABEL, partSize);
    conf.setLong(HAR_BLOCKSIZE_LABEL, blockSize);
    conf.setLong(HAR_PARTSIZE_LABEL, partSize);
    conf.set(DST_HAR_LABEL, archiveName);
    conf.set(SRC_PARENT_LABEL, parentPath.makeQualified(fs).toString());
    Path outputPath = new Path(dest, archiveName);
    FileOutputFormat.setOutputPath(conf, outputPath);
    FileSystem outFs = outputPath.getFileSystem(conf);
    if (outFs.exists(outputPath) || outFs.isFile(dest)) {
      throw new IOException("Invalid Output: " + outputPath);
    }
    conf.set(DST_DIR_LABEL, outputPath.toString());
    JobClient jClient = new JobClient(conf);
    Path stagingArea;
    try {
      stagingArea = JobSubmissionFiles.getStagingDir(jClient, conf);
    } catch (InterruptedException ie) {
      throw new IOException(ie);
    }
    Path jobDirectory = new Path(stagingArea,
        NAME+"_"+Integer.toString(new Random().nextInt(Integer.MAX_VALUE), 36));
    FsPermission mapredSysPerms = 
      new FsPermission(JobSubmissionFiles.JOB_DIR_PERMISSION);
    FileSystem.mkdirs(jobDirectory.getFileSystem(conf), jobDirectory,
                      mapredSysPerms);
    conf.set(JOB_DIR_LABEL, jobDirectory.toString());
    //get a tmp directory for input splits
    FileSystem jobfs = jobDirectory.getFileSystem(conf);
    Path srcFiles = new Path(jobDirectory, "_har_src_files");
    conf.set(SRC_LIST_LABEL, srcFiles.toString());
    SequenceFile.Writer srcWriter = SequenceFile.createWriter(jobfs, conf,
        srcFiles, LongWritable.class, HarEntry.class, 
        SequenceFile.CompressionType.NONE);
    // get the list of files 
    // create single list of files and dirs
    try {
      // write the top level dirs in first 
      writeTopLevelDirs(srcWriter, srcPaths, parentPath);
      srcWriter.sync();
      // these are the input paths passed 
      // from the command line
      // we do a recursive ls on these paths 
      // and then write them to the input file 
      // one at a time
      for (Path src: srcPaths) {
        ArrayList<FileStatusDir> allFiles = new ArrayList<FileStatusDir>();
        FileStatus fstatus = fs.getFileStatus(src);
        FileStatusDir fdir = new FileStatusDir(fstatus, null);
        recursivels(fs, fdir, allFiles);
        for (FileStatusDir statDir: allFiles) {
          FileStatus stat = statDir.getFileStatus();
          long len = stat.isDir()? 0:stat.getLen();
          final Path path = relPathToRoot(stat.getPath(), parentPath);
          final String[] children;
          if (stat.isDir()) {
            //get the children 
            FileStatus[] list = statDir.getChildren();
            children = new String[list.length];
            for (int i = 0; i < list.length; i++) {
              children[i] = list[i].getPath().getName();
            }
          }
          else {
            children = null;
          }
          append(srcWriter, len, path.toString(), children);
          srcWriter.sync();
          numFiles++;
          totalSize += len;
        }
      }
    } finally {
      srcWriter.close();
    }
    //increase the replication of src files
    jobfs.setReplication(srcFiles, (short) 10);
    conf.setInt(SRC_COUNT_LABEL, numFiles);
    conf.setLong(TOTAL_SIZE_LABEL, totalSize);
    int numMaps = (int)(totalSize/partSize);
    //run atleast one map.
    conf.setNumMapTasks(numMaps == 0? 1:numMaps);
    conf.setNumReduceTasks(1);
    conf.setInputFormat(HArchiveInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(HArchivesMapper.class);
    conf.setReducerClass(HArchivesReducer.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.set("hadoop.job.history.user.location", "none");
    FileInputFormat.addInputPath(conf, jobDirectory);
    //make sure no speculative execution is done
    conf.setSpeculativeExecution(false);
    JobClient.runJob(conf);
    //delete the tmp job directory
    try {
      jobfs.delete(jobDirectory, true);
    } catch(IOException ie) {
      LOG.info("Unable to clean tmp directory " + jobDirectory);
    }
  }

  static class HArchivesMapper 
  implements Mapper<LongWritable, HarEntry, IntWritable, Text> {
    private JobConf conf = null;
    int partId = -1 ; 
    Path tmpOutputDir = null;
    Path tmpOutput = null;
    String partname = null;
    Path rootPath = null;
    FSDataOutputStream partStream = null;
    FileSystem destFs = null;
    byte[] buffer;
    int buf_size = 128 * 1024;
    long blockSize = 512 * 1024 * 1024l;

    // configure the mapper and create 
    // the part file.
    // use map reduce framework to write into
    // tmp files. 
    public void configure(JobConf conf) {
      this.conf = conf;

      // this is tightly tied to map reduce
      // since it does not expose an api 
      // to get the partition
      partId = conf.getInt("mapred.task.partition", -1);
      // create a file name using the partition
      // we need to write to this directory
      tmpOutputDir = FileOutputFormat.getWorkOutputPath(conf);
      blockSize = conf.getLong(HAR_BLOCKSIZE_LABEL, blockSize);
      // get the output path and write to the tmp 
      // directory 
      partname = "part-" + partId;
      tmpOutput = new Path(tmpOutputDir, partname);
      rootPath = (conf.get(SRC_PARENT_LABEL, null) == null) ? null :
                  new Path(conf.get(SRC_PARENT_LABEL));
      if (rootPath == null) {
        throw new RuntimeException("Unable to read parent " +
        		"path for har from config");
      }
      try {
        destFs = tmpOutput.getFileSystem(conf);
        //this was a stale copy
        if (destFs.exists(tmpOutput)) {
          destFs.delete(tmpOutput, false);
        } 
        partStream = destFs.create(tmpOutput, false, conf.getInt("io.file.buffer.size", 4096), 
            destFs.getDefaultReplication(), blockSize);
      } catch(IOException ie) {
        throw new RuntimeException("Unable to open output file " + tmpOutput, ie);
      }
      buffer = new byte[buf_size];
    }

    // copy raw data.
    public void copyData(Path input, FSDataInputStream fsin, 
        FSDataOutputStream fout, Reporter reporter) throws IOException {
      try {
        for (int cbread=0; (cbread = fsin.read(buffer))>= 0;) {
          fout.write(buffer, 0,cbread);
          reporter.progress();
        }
      } finally {
        fsin.close();
      }
    }
    
    /**
     * get rid of / in the beginning of path
     * @param p the path
     * @return return path without /
     */
    private Path realPath(Path p, Path parent) {
      Path rootPath = new Path(Path.SEPARATOR);
      if (rootPath.compareTo(p) == 0) {
        return parent;
      }
      return new Path(parent, new Path(p.toString().substring(1)));
    }

    private static String encodeName(String s) 
      throws UnsupportedEncodingException {
      return URLEncoder.encode(s,"UTF-8");
    }

    private static String encodeProperties( FileStatus fStatus )
      throws UnsupportedEncodingException {
      String propStr = encodeName(
          fStatus.getModificationTime() + " "
        + fStatus.getPermission().toShort() + " "
        + encodeName(fStatus.getOwner()) + " "
        + encodeName(fStatus.getGroup()));
      return propStr;
    }

    // read files from the split input 
    // and write it onto the part files.
    // also output hash(name) and string 
    // for reducer to create index 
    // and masterindex files.
    public void map(LongWritable key, HarEntry value,
        OutputCollector<IntWritable, Text> out,
        Reporter reporter) throws IOException {
      Path relPath = new Path(value.path);
      int hash = HarFileSystem.getHarHash(relPath);
      String towrite = null;
      Path srcPath = realPath(relPath, rootPath);
      long startPos = partStream.getPos();
      FileSystem srcFs = srcPath.getFileSystem(conf);
      FileStatus srcStatus = srcFs.getFileStatus(srcPath);
      String propStr = encodeProperties(srcStatus);
      if (value.isDir()) { 
        towrite = encodeName(relPath.toString())
                  + " dir " + propStr + " 0 0 ";
        StringBuffer sbuff = new StringBuffer();
        sbuff.append(towrite);
        for (String child: value.children) {
          sbuff.append(encodeName(child) + " ");
        }
        towrite = sbuff.toString();
        //reading directories is also progress
        reporter.progress();
      }
      else {
        FSDataInputStream input = srcFs.open(srcStatus.getPath());
        reporter.setStatus("Copying file " + srcStatus.getPath() + 
            " to archive.");
        copyData(srcStatus.getPath(), input, partStream, reporter);
        towrite = encodeName(relPath.toString())
                  + " file " + partname + " " + startPos
                  + " " + srcStatus.getLen() + " " + propStr + " ";
      }
      out.collect(new IntWritable(hash), new Text(towrite));
    }
    
    public void close() throws IOException {
      // close the part files.
      partStream.close();
    }
  }
  
  /** the reduce for creating the index and the master index 
   * 
   */
  static class HArchivesReducer implements Reducer<IntWritable, 
  Text, Text, Text> {
    private JobConf conf = null;
    private long startIndex = 0;
    private long endIndex = 0;
    private long startPos = 0;
    private Path masterIndex = null;
    private Path index = null;
    private FileSystem fs = null;
    private FSDataOutputStream outStream = null;
    private FSDataOutputStream indexStream = null;
    private int numIndexes = 1000;
    private Path tmpOutputDir = null;
    private int written = 0;
    private int keyVal = 0;
    
    // configure 
    public void configure(JobConf conf) {
      this.conf = conf;
      tmpOutputDir = FileOutputFormat.getWorkOutputPath(this.conf);
      masterIndex = new Path(tmpOutputDir, "_masterindex");
      index = new Path(tmpOutputDir, "_index");
      try {
        fs = masterIndex.getFileSystem(conf);
        if (fs.exists(masterIndex)) {
          fs.delete(masterIndex, false);
        }
        if (fs.exists(index)) {
          fs.delete(index, false);
        }
        indexStream = fs.create(index);
        outStream = fs.create(masterIndex);
        String version = VERSION + " \n";
        outStream.write(version.getBytes());
        
      } catch(IOException e) {
        throw new RuntimeException(e);
      }
    }
    
    // create the index and master index. The input to 
    // the reduce is already sorted by the hash of the 
    // files. SO we just need to write it to the index. 
    // We update the masterindex as soon as we update 
    // numIndex entries.
    public void reduce(IntWritable key, Iterator<Text> values,
        OutputCollector<Text, Text> out,
        Reporter reporter) throws IOException {
      keyVal = key.get();
      while(values.hasNext()) {
        Text value = values.next();
        String towrite = value.toString() + "\n";
        indexStream.write(towrite.getBytes());
        written++;
        if (written > numIndexes -1) {
          // every 1000 indexes we report status
          reporter.setStatus("Creating index for archives");
          reporter.progress();
          endIndex = keyVal;
          String masterWrite = startIndex + " " + endIndex + " " + startPos 
                              +  " " + indexStream.getPos() + " \n" ;
          outStream.write(masterWrite.getBytes());
          startPos = indexStream.getPos();
          startIndex = endIndex;
          written = 0;
        }
      }
    }
    
    public void close() throws IOException {
      //write the last part of the master index.
      if (written > 0) {
        String masterWrite = startIndex + " " + keyVal + " " + startPos  +
                             " " + indexStream.getPos() + " \n";
        outStream.write(masterWrite.getBytes());
      }
      // close the streams
      outStream.close();
      indexStream.close();
      // try increasing the replication 
      fs.setReplication(index, (short) 5);
      fs.setReplication(masterIndex, (short) 5);
    }
    
  }
  
  /** the main driver for creating the archives
   *  it takes at least three command line parameters. The parent path, 
   *  The src and the dest. It does an lsr on the source paths.
   *  The mapper created archuves and the reducer creates 
   *  the archive index.
   */

  public int run(String[] args) throws Exception {
    try {
      Path parentPath = null;
      List<Path> srcPaths = new ArrayList<Path>();
      Path destPath = null;
      String archiveName = null;
      if (args.length < 5) {
        System.out.println(usage);
        throw new IOException("Invalid usage.");
      }
      if (!"-archiveName".equals(args[0])) {
        System.out.println(usage);
        throw new IOException("Archive Name not specified.");
      }
      archiveName = args[1];
      if (!checkValidName(archiveName)) {
        System.out.println(usage);
        throw new IOException("Invalid name for archives. " + archiveName);
      }
      int i = 2;
      //check to see if relative parent has been provided or not
      //this is a required parameter. 
      if (! "-p".equals(args[i])) {
        System.out.println(usage);
        throw new IOException("Parent path not specified.");
      }
      parentPath = new Path(args[i+1]);
      i+=2;
      //read the rest of the paths
      for (; i < args.length; i++) {
        if (i == (args.length - 1)) {
          destPath = new Path(args[i]);
        }
        else {
          Path argPath = new Path(args[i]);
          if (argPath.isAbsolute()) {
            System.out.println(usage);
            throw new IOException("source path " + argPath +
                " is not relative  to "+ parentPath);
          }
          srcPaths.add(new Path(parentPath, argPath));
        }
      }
      if (srcPaths.size() == 0) {
        // assuming if the user does not specify path for sources
        // the whole parent directory needs to be archived. 
        srcPaths.add(parentPath);
      }
      // do a glob on the srcPaths and then pass it on
      List<Path> globPaths = new ArrayList<Path>();
      for (Path p: srcPaths) {
        FileSystem fs = p.getFileSystem(getConf());
        FileStatus[] statuses = fs.globStatus(p);
        if (statuses != null) {
          for (FileStatus status: statuses) {
            globPaths.add(fs.makeQualified(status.getPath()));
          }
        }
      }
      if (globPaths.isEmpty()) {
        throw new IOException("The resolved paths set is empty."
            + "  Please check whether the srcPaths exist, where srcPaths = "
            + srcPaths);
      }
      archive(parentPath, globPaths, archiveName, destPath);
    } catch(IOException ie) {
      System.err.println(ie.getLocalizedMessage());
      return -1;
    }
    return 0;
  }

  /** the main functions **/
  public static void main(String[] args) {
    JobConf job = new JobConf(HadoopArchives.class);
    HadoopArchives harchives = new HadoopArchives(job);
    int ret = 0;

    try{
      ret = ToolRunner.run(harchives, args);
    } catch(Exception e) {
      LOG.debug("Exception in archives  ", e);
      System.err.println(e.getClass().getSimpleName() + " in archives");
      final String s = e.getLocalizedMessage();
      if (s != null) {
        System.err.println(s);
      } else {
        e.printStackTrace(System.err);
      }
      System.exit(1);
    }
    System.exit(ret);
  }
}