package org.apache.hadoop.raid;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Random;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobID;
import org.apache.hadoop.mapred.JobInProgress;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileRecordReader;
import org.apache.hadoop.mapred.TaskCompletionEvent;
import org.apache.hadoop.raid.RaidNode.Statistics;
import org.apache.hadoop.raid.protocol.PolicyInfo;
import org.apache.hadoop.util.StringUtils;

public class DistRaid {

  protected static final Log LOG = LogFactory.getLog(DistRaid.class);

  static final String NAME = "distRaid";
  static final String JOB_DIR_LABEL = NAME + ".job.dir";
  static final String OP_LIST_LABEL = NAME + ".op.list";
  static final String OP_COUNT_LABEL = NAME + ".op.count";
  static final String SCHEDULER_OPTION_LABEL = NAME + ".scheduleroption";
  static final String IGNORE_FAILURES_OPTION_LABEL = NAME + ".ignore.failures";

  static final int OP_LIST_BLOCK_SIZE = 32 * 1024 * 1024; // block size of control file
  static final short OP_LIST_REPLICATION = 10; // replication factor of control file

  private static final long DEFAULT_OP_PER_MAP = 50;
  public static final String OP_PER_MAP_KEY = "hdfs.raid.op.per.map";
  private static final int DEFAULT_MAX_MAPS_PER_NODE = 20;
  public static final String MAX_MAPS_PER_NODE_KEY = "hdfs.raid.max.maps.per.node";
  public static final int DEFAULT_MAX_FAILURE_RETRY = 3;
  public static final String MAX_FAILURE_RETRY_KEY = "hdfs.raid.max.failure.retry";
  public static final String SLEEP_TIME_BETWEEN_RETRY_KEY =
      "hdfs.raid.sleep.time.between.retry";
  public static final long DEFAULT_SLEEP_TIME_BETWEEN_RETRY = 1000L;

  private static long opPerMap = DEFAULT_OP_PER_MAP;
  private static int maxMapsPerNode = DEFAULT_MAX_MAPS_PER_NODE;
  private static final int SYNC_FILE_MAX = 10;
  private static final SimpleDateFormat dateForm =
      new SimpleDateFormat("yyyy-MM-dd HH:mm");
  private static String jobName = NAME;

  public static enum Counter {
    FILES_SUCCEEDED, FILES_FAILED, PROCESSED_BLOCKS, PROCESSED_SIZE,
    META_BLOCKS, META_SIZE, SAVING_SIZE
  }
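  // Note on counter semantics (as incremented in DistRaidMapper.map below):
  // PROCESSED_BLOCKS/PROCESSED_SIZE track the source data raided,
  // META_BLOCKS/META_SIZE track the parity data written, and SAVING_SIZE is
  // computed as processedSize - remainingSize - metaSize, i.e. an estimate of
  // the bytes saved by lowering source replication, minus the parity overhead.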
  protected JobConf jobconf;

  /** {@inheritDoc} */
  public void setConf(Configuration conf) {
    if (jobconf != conf) {
      jobconf = conf instanceof JobConf ? (JobConf) conf : new JobConf(conf);
    }
  }

  /** {@inheritDoc} */
  public JobConf getConf() {
    return jobconf;
  }

  public DistRaid(Configuration conf) {
    setConf(createJobConf(conf));
    opPerMap = conf.getLong(OP_PER_MAP_KEY, DEFAULT_OP_PER_MAP);
    maxMapsPerNode = conf.getInt(MAX_MAPS_PER_NODE_KEY,
        DEFAULT_MAX_MAPS_PER_NODE);
  }

  private static final Random RANDOM = new Random();

  protected static String getRandomId() {
    return Integer.toString(RANDOM.nextInt(Integer.MAX_VALUE), 36);
  }

  /*
   * This class denotes the unit of encoding for a mapper task.
   * The key in the map function is formatted as:
   *   "startStripeId encodingId encodingUnit modificationTime path"
   * (modificationTime is omitted from the examples below for brevity).
   * Given a file /user/dhruba/1 with 13 blocks, suppose the codec's stripe
   * length is 3 and the encoding unit is 2. RaidNode.splitPaths will split
   * the raiding work into 3 tasks:
   *   "0 1 2 /user/dhruba/1": raids stripes 0 and 1 (blocks 0-5)
   *   "2 2 2 /user/dhruba/1": raids stripes 2 and 3 (blocks 6-11)
   *   "4 3 2 /user/dhruba/1": raids stripe 4 (block 12)
   * encodingId is unique for each task and is used to construct a temporary
   * directory to store the partial parity file.
   * modificationTime is the modification time of the candidate when the job
   * is submitted; it is used to check whether the candidate changed after
   * job submission.
   */
  public static class EncodingCandidate {
    public final static int DEFAULT_GET_SRC_STAT_RETRY = 5;
    public FileStatus srcStat;
    public long startStripe = 0;
    public long encodingUnit = 0;
    public String encodingId = null;
    public final static String delim = " ";
    public long modificationTime = 0L;
    public boolean isEncoded = false;
    public boolean isRenamed = false;
    public boolean isConcated = false;
    public List<List<Block>> srcStripes = null;

    EncodingCandidate(FileStatus newStat, long newStartStripe,
        String newEncodingId, long newEncodingUnit, long newModificationTime) {
      this.srcStat = newStat;
      this.startStripe = newStartStripe;
      this.encodingId = newEncodingId;
      this.encodingUnit = newEncodingUnit;
      this.modificationTime = newModificationTime;
    }

    public String toString() {
      return startStripe + delim + encodingId + delim + encodingUnit + delim
          + modificationTime + delim + this.srcStat.getPath().toString();
    }

    public static EncodingCandidate getEncodingCandidate(String key,
        Configuration jobconf) throws IOException {
      String[] keys = key.split(delim, 5);
      Path p = new Path(keys[4]);
      long startStripe = Long.parseLong(keys[0]);
      long modificationTime = Long.parseLong(keys[3]);
      FileStatus srcStat = getSrcStatus(jobconf, p);
      long encodingUnit = Long.parseLong(keys[2]);
      return new EncodingCandidate(srcStat, startStripe, keys[1],
          encodingUnit, modificationTime);
    }

    public static FileStatus getSrcStatus(Configuration jobconf, Path p)
        throws IOException {
      for (int i = 0; i < DEFAULT_GET_SRC_STAT_RETRY; i++) {
        try {
          return p.getFileSystem(jobconf).getFileStatus(p);
        } catch (FileNotFoundException fnfe) {
          return null;
        } catch (IOException ioe) {
          LOG.warn("Get exception ", ioe);
          if (i == DEFAULT_GET_SRC_STAT_RETRY - 1) {
            throw ioe;
          }
          try {
            Thread.sleep(3000);
          } catch (InterruptedException ignore) {
          }
        }
      }
      throw new IOException("couldn't getFileStatus " + p);
    }

    public void refreshFile(Configuration jobconf) throws IOException {
      srcStat = getSrcStatus(jobconf, srcStat.getPath());
    }
  }
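  // Illustrative example (hypothetical values): the EncodingCandidate key
  //   "0 752a1 2 1385000000000 /user/dhruba/1"
  // parses via getEncodingCandidate into startStripe=0, encodingId="752a1",
  // encodingUnit=2, modificationTime=1385000000000, with srcStat obtained by
  // stat-ing /user/dhruba/1 (up to DEFAULT_GET_SRC_STAT_RETRY attempts).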
  /**
   * Helper class which holds a policy and the list of paths to be raided
   * under it.
   */
  public static class RaidPolicyPathPair {
    public PolicyInfo policy;
    public List<EncodingCandidate> srcPaths;

    RaidPolicyPathPair(PolicyInfo policy, List<EncodingCandidate> srcPaths) {
      this.policy = policy;
      this.srcPaths = srcPaths;
    }
  }

  List<RaidPolicyPathPair> raidPolicyPathPairList =
      new ArrayList<RaidPolicyPathPair>();

  private JobClient jobClient;
  private RunningJob runningJob;
  private int jobEventCounter = 0;
  private String lastReport = null;
  private long startTime = System.currentTimeMillis();
  private long totalSaving;

  /** Responsible for generating splits of the src file list. */
  static class DistRaidInputFormat implements InputFormat<Text, PolicyInfo> {

    /** Do nothing. */
    public void validateInput(JobConf job) {
    }

    /**
     * Produce splits such that each is no greater than the quotient of the
     * total size and the number of splits requested.
     *
     * @param job the handle to the JobConf object
     * @param numSplits number of splits requested
     */
    public InputSplit[] getSplits(JobConf job, int numSplits)
        throws IOException {
      final int srcCount = job.getInt(OP_COUNT_LABEL, -1);
      final int targetcount = srcCount / numSplits;
      String srclist = job.get(OP_LIST_LABEL, "");
      if (srcCount < 0 || "".equals(srclist)) {
        throw new RuntimeException("Invalid metadata: #files(" + srcCount
            + ") listuri(" + srclist + ")");
      }
      Path srcs = new Path(srclist);
      FileSystem fs = srcs.getFileSystem(job);

      List<FileSplit> splits = new ArrayList<FileSplit>(numSplits);

      Text key = new Text();
      PolicyInfo value = new PolicyInfo();
      SequenceFile.Reader in = null;
      long prev = 0L;
      int count = 0; // count src
      try {
        for (in = new SequenceFile.Reader(fs, srcs, job);
             in.next(key, value);) {
          long curr = in.getPosition();
          long delta = curr - prev;
          if (++count > targetcount) {
            count = 0;
            splits.add(new FileSplit(srcs, prev, delta, (String[]) null));
            prev = curr;
          }
        }
      } finally {
        // Guard against a failed Reader construction leaving in == null.
        if (in != null) {
          in.close();
        }
      }
      long remaining = fs.getFileStatus(srcs).getLen() - prev;
      if (remaining != 0) {
        splits.add(new FileSplit(srcs, prev, remaining, (String[]) null));
      }
      LOG.info("jobname= " + jobName + " numSplits=" + numSplits
          + ", splits.size()=" + splits.size());
      return splits.toArray(new FileSplit[splits.size()]);
    }

    /** {@inheritDoc} */
    public RecordReader<Text, PolicyInfo> getRecordReader(InputSplit split,
        JobConf job, Reporter reporter) throws IOException {
      return new SequenceFileRecordReader<Text, PolicyInfo>(job,
          (FileSplit) split);
    }
  }
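  // Worked example (hypothetical numbers): with OP_COUNT_LABEL=100 operations
  // and numSplits=4, targetcount is 25, so getSplits above cuts the control
  // file at SequenceFile record boundaries every 26 records (the counter must
  // exceed targetcount before a cut), and any trailing bytes become one final
  // split; splits.size() may therefore differ from the requested numSplits.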
  /** The mapper for raiding files. */
  static class DistRaidMapper
      implements Mapper<Text, PolicyInfo, WritableComparable, Text> {

    private JobConf jobconf;
    private boolean ignoreFailures;
    private int retryNum;

    private int failcount = 0;
    private int succeedcount = 0;
    private Statistics st = null;
    private Reporter reporter = null;

    private String getCountString() {
      return "Succeeded: " + succeedcount + " Failed: " + failcount;
    }

    /** {@inheritDoc} */
    public void configure(JobConf job) {
      this.jobconf = job;
      ignoreFailures = jobconf.getBoolean(IGNORE_FAILURES_OPTION_LABEL, true);
      retryNum = jobconf.getInt(DistRaid.MAX_FAILURE_RETRY_KEY,
          DistRaid.DEFAULT_MAX_FAILURE_RETRY);
      st = new Statistics();
    }

    public static boolean doRaid(int retryNum, String key,
        Configuration jobconf, PolicyInfo policy, Statistics st,
        Reporter reporter) throws IOException {
      String s = "FAIL: " + policy + ", " + key + " ";
      long sleepTimeBetwRetry = jobconf.getLong(SLEEP_TIME_BETWEEN_RETRY_KEY,
          DEFAULT_SLEEP_TIME_BETWEEN_RETRY);
      EncodingCandidate ec = EncodingCandidate.getEncodingCandidate(key,
          jobconf);
      for (int i = 0; i < retryNum; i++) {
        LOG.info("The " + i + "th attempt: " + s);
        if (ec.srcStat == null) {
          LOG.info("Raiding Candidate doesn't exist, NO_ACTION");
          return false;
        }
        if (ec.modificationTime != ec.srcStat.getModificationTime()) {
          LOG.info("Raiding Candidate was changed, NO_ACTION");
          return false;
        }
        try {
          return RaidNode.doRaid(jobconf, policy, ec, st, reporter);
        } catch (IOException e) {
          LOG.info(s, e);
          if (i == retryNum - 1) {
            throw new IOException(s, e);
          }
          ec.refreshFile(jobconf);
          try {
            Thread.sleep(sleepTimeBetwRetry);
          } catch (InterruptedException ie) {
            throw new IOException(ie);
          }
        }
      }
      throw new IOException(s);
    }

    /** Run a FileOperation. */
    public void map(Text key, PolicyInfo policy,
        OutputCollector<WritableComparable, Text> out, Reporter reporter)
        throws IOException {
      this.reporter = reporter;
      try {
        Codec.initializeCodecs(jobconf);
        LOG.info("Raiding file=" + key.toString() + " policy=" + policy);
        boolean result = doRaid(retryNum, key.toString(), jobconf, policy,
            st, reporter);
        if (result) {
          ++succeedcount;
          reporter.incrCounter(Counter.PROCESSED_BLOCKS,
              st.numProcessedBlocks);
          reporter.incrCounter(Counter.PROCESSED_SIZE, st.processedSize);
          reporter.incrCounter(Counter.META_BLOCKS, st.numMetaBlocks);
          reporter.incrCounter(Counter.META_SIZE, st.metaSize);
          reporter.incrCounter(Counter.SAVING_SIZE,
              st.processedSize - st.remainingSize - st.metaSize);
          reporter.incrCounter(Counter.FILES_SUCCEEDED, 1);
        }
      } catch (IOException e) {
        ++failcount;
        reporter.incrCounter(Counter.FILES_FAILED, 1);
        out.collect(null, new Text(e.getMessage()));
      } finally {
        reporter.setStatus(getCountString());
      }
    }

    /** {@inheritDoc} */
    public void close() throws IOException {
      if (failcount == 0 || ignoreFailures) {
        return;
      }
      throw new IOException(getCountString());
    }
  }
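  // Behavior note: doRaid above retries a failed encoding up to
  // hdfs.raid.max.failure.retry times (default 3), sleeping
  // hdfs.raid.sleep.time.between.retry ms (default 1000) between attempts and
  // re-stat-ing the source file after each failure. It returns false without
  // raiding if the file disappeared or its modification time no longer
  // matches the one recorded at job submission.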
  /**
   * Create a new job conf based on the configuration passed.
   *
   * @param conf the base configuration
   * @return a JobConf set up for a DistRaid job
   */
  private static JobConf createJobConf(Configuration conf) {
    JobConf jobconf = new JobConf(conf, DistRaid.class);
    jobName = NAME + " " + dateForm.format(new Date(RaidNode.now()));
    jobconf.setUser(RaidNode.JOBUSER);
    jobconf.setJobName(jobName);
    jobconf.setMapSpeculativeExecution(false);
    RaidUtils.parseAndSetOptions(jobconf, SCHEDULER_OPTION_LABEL);
    jobconf.setJarByClass(DistRaid.class);
    jobconf.setInputFormat(DistRaidInputFormat.class);
    jobconf.setOutputKeyClass(Text.class);
    jobconf.setOutputValueClass(Text.class);
    jobconf.setMapperClass(DistRaidMapper.class);
    jobconf.setNumReduceTasks(0);
    return jobconf;
  }

  /** Add paths to be raided. */
  public void addRaidPaths(PolicyInfo info, List<EncodingCandidate> paths) {
    raidPolicyPathPairList.add(new RaidPolicyPathPair(info, paths));
  }

  /** Calculate how many maps to run. */
  private static int getMapCount(int srcCount) {
    int numMaps = (int) (srcCount / opPerMap);
    return Math.max(numMaps, maxMapsPerNode);
  }

  /**
   * Invokes a map-reduce job to do parallel raiding.
   *
   * @return true if the job was started, false otherwise
   */
  public boolean startDistRaid() throws IOException {
    assert(raidPolicyPathPairList.size() > 0);
    if (setup()) {
      this.jobClient = new JobClient(jobconf);
      this.runningJob = this.jobClient.submitJob(jobconf);
      LOG.info("Job Started: " + runningJob.getID());
      this.startTime = System.currentTimeMillis();
      return true;
    }
    return false;
  }

  /**
   * Get the URL of the currently running job.
   *
   * @return the tracking URL
   */
  public String getJobTrackingURL() {
    if (runningJob == null) {
      return null;
    }
    return runningJob.getTrackingURL();
  }

  /**
   * Checks if the map-reduce job has completed.
   *
   * @return true if the job completed, false otherwise
   * @throws IOException
   */
  public boolean checkComplete() throws IOException {
    JobID jobID = runningJob.getID();
    if (runningJob.isComplete()) {
      // Delete the job directory.
      final String jobdir = jobconf.get(JOB_DIR_LABEL);
      if (jobdir != null) {
        final Path jobpath = new Path(jobdir);
        jobpath.getFileSystem(jobconf).delete(jobpath, true);
      }
      if (runningJob.isSuccessful()) {
        LOG.info("Job Complete(Succeeded): " + jobID);
      } else {
        LOG.info("Job Complete(Failed): " + jobID);
      }
      raidPolicyPathPairList.clear();
      Counters ctrs = runningJob.getCounters();
      if (ctrs != null) {
        RaidNodeMetrics metrics =
            RaidNodeMetrics.getInstance(RaidNodeMetrics.DEFAULT_NAMESPACE_ID);
        if (ctrs.findCounter(Counter.FILES_FAILED) != null) {
          long filesFailed = ctrs.findCounter(Counter.FILES_FAILED).getValue();
          metrics.raidFailures.inc(filesFailed);
        }
        long slotSeconds = ctrs.findCounter(
            JobInProgress.Counter.SLOTS_MILLIS_MAPS).getValue() / 1000;
        metrics.raidSlotSeconds.inc(slotSeconds);
      }
      return true;
    } else {
      String report = (" job " + jobID
          + " map " + StringUtils.formatPercent(runningJob.mapProgress(), 0)
          + " reduce "
          + StringUtils.formatPercent(runningJob.reduceProgress(), 0));
      if (!report.equals(lastReport)) {
        LOG.info(report);
        lastReport = report;
      }
      TaskCompletionEvent[] events =
          runningJob.getTaskCompletionEvents(jobEventCounter);
      jobEventCounter += events.length;
      for (TaskCompletionEvent event : events) {
        if (event.getTaskStatus() == TaskCompletionEvent.Status.FAILED) {
          LOG.info(" Job " + jobID + " " + event.toString());
        }
      }
      return false;
    }
  }

  public void killJob() throws IOException {
    runningJob.killJob();
  }
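  // cleanUp below removes this job's temporary parity directories: one
  // subdirectory per configured codec, keyed by the job ID, under each
  // codec's tmpParityDirectory.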
  public void cleanUp() {
    for (Codec codec : Codec.getCodecs()) {
      Path tmpdir = new Path(codec.tmpParityDirectory, this.getJobID());
      try {
        FileSystem fs = tmpdir.getFileSystem(jobconf);
        if (fs.exists(tmpdir)) {
          fs.delete(tmpdir, true);
        }
      } catch (IOException ioe) {
        LOG.error("Fail to delete " + tmpdir, ioe);
      }
    }
  }

  public boolean successful() throws IOException {
    return runningJob.isSuccessful();
  }

  private void estimateSavings() {
    for (RaidPolicyPathPair p : raidPolicyPathPairList) {
      Codec codec = Codec.getCodec(p.policy.getCodecId());
      int stripeSize = codec.stripeLength;
      int paritySize = codec.parityLength;
      int targetRepl =
          Integer.parseInt(p.policy.getProperty("targetReplication"));
      int parityRepl =
          Integer.parseInt(p.policy.getProperty("metaReplication"));
      for (EncodingCandidate st : p.srcPaths) {
        long saving = RaidNode.savingFromRaidingFile(
            st, stripeSize, paritySize, targetRepl, parityRepl);
        totalSaving += saving;
      }
    }
  }

  /**
   * Set up the input file which has the list of input files.
   *
   * @return true if there is at least one operation to perform,
   *         false otherwise
   * @throws IOException
   */
  private boolean setup() throws IOException {
    estimateSavings();

    final String randomId = getRandomId();
    JobClient jClient = new JobClient(jobconf);
    Path jobdir = new Path(jClient.getSystemDir(), NAME + "_" + randomId);

    LOG.info(JOB_DIR_LABEL + "=" + jobdir);
    jobconf.set(JOB_DIR_LABEL, jobdir.toString());
    Path log = new Path(jobdir, "_logs");

    // The control file should have small size blocks. This helps
    // in spreading out the load from mappers that will be spawned.
    jobconf.setInt("dfs.block.size", OP_LIST_BLOCK_SIZE);

    FileOutputFormat.setOutputPath(jobconf, log);
    LOG.info("log=" + log);

    // create operation list
    FileSystem fs = jobdir.getFileSystem(jobconf);
    Path opList = new Path(jobdir, "_" + OP_LIST_LABEL);
    jobconf.set(OP_LIST_LABEL, opList.toString());
    int opCount = 0, synCount = 0;
    SequenceFile.Writer opWriter = null;
    try {
      opWriter = SequenceFile.createWriter(fs, jobconf, opList, Text.class,
          PolicyInfo.class, SequenceFile.CompressionType.NONE);
      for (RaidPolicyPathPair p : raidPolicyPathPairList) {
        // If a large set of files are raided for the first time, files
        // in the same directory that tend to have the same size will end up
        // with the same map. This shuffle mixes things up, allowing a better
        // mix of files.
        java.util.Collections.shuffle(p.srcPaths);
        for (EncodingCandidate ec : p.srcPaths) {
          opWriter.append(new Text(ec.toString()), p.policy);
          opCount++;
          if (++synCount > SYNC_FILE_MAX) {
            opWriter.sync();
            synCount = 0;
          }
        }
      }
    } finally {
      if (opWriter != null) {
        opWriter.close();
      }
      // increase replication for control file
      fs.setReplication(opList, OP_LIST_REPLICATION);
    }
    raidPolicyPathPairList.clear();

    jobconf.setInt(OP_COUNT_LABEL, opCount);
    LOG.info("Number of files=" + opCount);
    jobconf.setNumMapTasks(getMapCount(opCount));
    LOG.info("jobName= " + jobName + " numMapTasks="
        + jobconf.getNumMapTasks());
    return opCount != 0;
  }

  public long getStartTime() {
    return this.startTime;
  }

  public String toHtmlRow() {
    return JspUtils.tr(
        JspUtils.td(
            JspUtils.link(runningJob.getID().toString(),
                runningJob.getTrackingURL()))
        + JspUtils.td(runningJob.getJobName())
        + JspUtils.td(StringUtils.humanReadableInt(totalSaving)));
  }

  public static String htmlRowHeader() {
    return JspUtils.tr(
        JspUtils.td("Job ID")
        + JspUtils.td("Name")
        + JspUtils.td("Estimated Saving"));
  }

  public Counters getCounters() throws IOException {
    return this.runningJob.getCounters();
  }

  public String getJobID() {
    return this.runningJob.getID().toString();
  }
}
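// Typical driver flow (illustrative sketch; the method names are from this
// class, but the surrounding loop and poll interval are hypothetical):
//
//   DistRaid dr = new DistRaid(conf);
//   dr.addRaidPaths(policyInfo, encodingCandidates);
//   if (dr.startDistRaid()) {           // submits the MapReduce job
//     while (!dr.checkComplete()) {     // polls progress, logs failed tasks
//       Thread.sleep(10000);            // caller-chosen poll interval
//     }
//     boolean ok = dr.successful();
//     dr.cleanUp();                     // remove temporary parity dirs
//   }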