/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.hops.erasure_coding;

import io.hops.metadata.HdfsStorageFactory;
import io.hops.metadata.hdfs.dal.RepairJobsDataAccess;
import io.hops.metadata.hdfs.entity.RepairJob;
import io.hops.transaction.handler.HDFSOperationType;
import io.hops.transaction.handler.LightWeightRequestHandler;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.server.datanode.BlockReconstructor;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import java.io.IOException;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.UUID;

/**
 * MapReduce-based {@link BlockRepairManager} that repairs lost source or
 * parity blocks of erasure-coded files by submitting one map-only
 * reconstruction job per file. Submitted jobs are persisted so that they can
 * be recovered after a restart, and their progress is reported through
 * {@link #computeReports()}.
 */
public class MapReduceBlockRepairManager extends BlockRepairManager {

  protected static final Log LOG =
      LogFactory.getLog(MapReduceBlockRepairManager.class);

  public static final String JOBUSER = "erasure_coding";

  private static final String IN_FILE_SUFFIX = ".in";
  private static final String MAX_FIX_TIME_FOR_FILE =
      "io.hops.erasure_coding.blockfix.max.fix.time.for.file";
  private static final long DEFAULT_MAX_FIX_TIME_FOR_FILE =
      4 * 60 * 60 * 1000; // 4 hrs.

  private Worker corruptionWorker = new CorruptionWorker();

  static enum Counter {
    FILES_SUCCEEDED,
    FILES_FAILED
  }

  private static final String REPAIR_TYPE = "repair_type";
  private static final String SOURCE_PATH = "source_path";
  private static final String PARITY_PATH = "parity_path";
  private static final String CODEC_ID = "codec_id";

  private enum RepairType {
    SOURCE_FILE,
    PARITY_FILE
  }
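  /**
   * Handle for a submitted repair job. A repair is tracked either through a
   * freshly submitted {@link Job} or, after recovery from the database,
   * through a {@link RunningJob} obtained from the {@link JobClient}; exactly
   * one of the two is set. The input and output directories are kept so that
   * they can be deleted once the job has finished.
   */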
  private class ActiveRepair {
    private JobID jobId;
    private Job job;
    private RunningJob runningJob;
    private Path inDir;
    private Path outDir;
    // Not persisted right now, so it is reset when a job is recovered.
    private long startTime = System.currentTimeMillis();

    public ActiveRepair(JobID jobId, Job job, Path inDir, Path outDir) {
      this.jobId = jobId;
      this.job = job;
      this.inDir = inDir;
      this.outDir = outDir;
    }

    public ActiveRepair(JobID jobId, RunningJob job, Path inDir, Path outDir) {
      this.jobId = jobId;
      this.runningJob = job;
      this.inDir = inDir;
      this.outDir = outDir;
    }

    public JobID getJobId() {
      return jobId;
    }

    public void killJob() throws IOException {
      if (job != null) {
        job.killJob();
      } else {
        runningJob.killJob();
      }
    }

    public boolean isComplete() throws IOException {
      if (job != null) {
        return job.isComplete();
      } else {
        return runningJob.isComplete();
      }
    }

    public boolean isSuccessful() throws IOException {
      if (job != null) {
        return job.isSuccessful();
      } else {
        return runningJob.isSuccessful();
      }
    }

    public Path getInDir() {
      return inDir;
    }

    public Path getOutDir() {
      return outDir;
    }

    public long getStartTime() {
      return startTime;
    }
  }

  private Map<String, ActiveRepair> currentRepairs =
      new HashMap<String, ActiveRepair>();
  private Collection<ActiveRepair> completedJobs =
      new ArrayList<ActiveRepair>();
  private boolean initialized = false;

  public MapReduceBlockRepairManager(Configuration conf) {
    super(conf);
  }

  @Override
  public void repairSourceBlocks(String codecId, Path sourceFile,
      Path parityFile) {
    initialize();
    String uniqueName =
        sourceFile.getName() + "_" + UUID.randomUUID().toString();
    try {
      corruptionWorker.startJob(uniqueName, sourceFile, parityFile, codecId,
          RepairType.SOURCE_FILE);
    } catch (IOException e) {
      LOG.error("Exception", e);
    } catch (InterruptedException e) {
      LOG.error("Exception", e);
    } catch (ClassNotFoundException e) {
      LOG.error("Exception", e);
    }
  }

  @Override
  public void repairParityBlocks(String codecId, Path sourceFile,
      Path parityFile) {
    initialize();
    String uniqueName =
        sourceFile.getName() + "_" + UUID.randomUUID().toString();
    try {
      corruptionWorker.startJob(uniqueName, sourceFile, parityFile, codecId,
          RepairType.PARITY_FILE);
    } catch (IOException e) {
      LOG.error("Exception", e);
    } catch (InterruptedException e) {
      LOG.error("Exception", e);
    } catch (ClassNotFoundException e) {
      LOG.error("Exception", e);
    }
  }

  protected long getMaxFixTimeForFile() {
    return getConf()
        .getLong(MAX_FIX_TIME_FOR_FILE, DEFAULT_MAX_FIX_TIME_FOR_FILE);
  }
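  /**
   * Builds and submits the map-only reconstruction job for a single file:
   * writes the job's input file, configures the
   * {@link ReconstructionMapper}, persists the job for crash recovery and
   * registers it in {@code currentRepairs}.
   */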
  public abstract class Worker {
    protected final Log LOG;
    protected final Class<? extends BlockReconstructor> RECONSTRUCTOR_CLASS;
    protected final String JOB_NAME_PREFIX;

    protected Worker(Log log, Class<? extends BlockReconstructor> rClass,
        String prefix) {
      this.LOG = log;
      this.RECONSTRUCTOR_CLASS = rClass;
      this.JOB_NAME_PREFIX = prefix;
    }

    /**
     * Creates and submits a job and updates the file index and job index.
     */
    private void startJob(String jobName, Path sourceFile, Path parityFile,
        String codecId, RepairType type)
        throws IOException, InterruptedException, ClassNotFoundException {
      Path inDir = new Path(JOB_NAME_PREFIX + "/in/" + jobName);
      Path outDir = new Path(JOB_NAME_PREFIX + "/out/" + jobName);
      createInputFile(jobName, inDir, sourceFile.toUri().getPath());

      Configuration jobConf = new Configuration(getConf());
      jobConf.set(REPAIR_TYPE, type.toString());
      jobConf.set(SOURCE_PATH, sourceFile.toUri().getPath());
      jobConf.set(PARITY_PATH, parityFile.toUri().getPath());
      jobConf.set(CODEC_ID, codecId);
      Job job = new Job(jobConf, jobName);
      configureJob(job, this.RECONSTRUCTOR_CLASS);
      job.setJarByClass(getClass());
      job.setJobName(jobName);
      job.setMapperClass(ReconstructionMapper.class);
      job.setNumReduceTasks(0);
      job.setInputFormatClass(ReconstructionInputFormat.class);
      job.setOutputFormatClass(SequenceFileOutputFormat.class);
      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(Text.class);

      ReconstructionInputFormat.setInputPaths(job, inDir);
      SequenceFileOutputFormat.setOutputPath(job, outDir);

      submitJob(job);

      String path = sourceFile.toUri().getPath();
      JobClient jobClient = new JobClient(getConf());
      JobID jobId = job.getJobID();
      ActiveRepair activeRepair = new ActiveRepair(jobId, job, inDir, outDir);
      // There is a risk that we crash before persisting the job for recovery.
      // The ErasureCodingManager should restart the job in that case, as it
      // does not set the state to active before this code has succeeded.
      persistActiveJob(path, jobId, inDir.toUri().getPath(),
          outDir.toUri().getPath());
      currentRepairs.put(type == RepairType.SOURCE_FILE ?
          sourceFile.toUri().getPath() : parityFile.toUri().getPath(),
          activeRepair);
    }

    private void submitJob(Job job)
        throws IOException, InterruptedException, ClassNotFoundException {
      LOG.info("Submitting job");
      MapReduceBlockRepairManager.this.submitJob(job);
    }

    /**
     * Creates the input file (containing the name of the file to be
     * reconstructed).
     */
    private void createInputFile(String jobName, Path inDir, String lostFile)
        throws IOException {
      Path file = new Path(inDir, jobName + IN_FILE_SUFFIX);
      FileSystem fs = file.getFileSystem(getConf());
      SequenceFile.Writer fileOut = SequenceFile
          .createWriter(fs, getConf(), file, LongWritable.class, Text.class);
      fileOut.append(new LongWritable(0L), new Text(lostFile));
      fileOut.close();
    }
  }

  public class CorruptionWorker extends Worker {

    public CorruptionWorker() {
      super(LogFactory.getLog(CorruptionWorker.class),
          BlockReconstructor.class, "blockfixer");
    }
  }
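  /**
   * Sets the user under which the job runs and records the
   * {@link BlockReconstructor} implementation in the job configuration so
   * that the mapper can instantiate it via reflection.
   */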
  public void configureJob(Job job,
      Class<? extends BlockReconstructor> reconstructorClass) {
    ((JobConf) job.getConfiguration()).setUser(JOBUSER);
    ((JobConf) job.getConfiguration())
        .setClass(ReconstructionMapper.RECONSTRUCTOR_CLASS_TAG,
            reconstructorClass, BlockReconstructor.class);
  }

  private void submitJob(Job job)
      throws IOException, InterruptedException, ClassNotFoundException {
    job.submit();
    LOG.info("Job " + job.getJobID() + "(" + job.getJobName() + ") started");
  }
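  /**
   * Input format that turns the job's SequenceFile input into one
   * {@link FileSplit} per {@code filesPerTask} records. Since the input file
   * written by {@code createInputFile} contains a single record, this
   * normally yields exactly one split, and hence one map task, per repair.
   */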
  static class ReconstructionInputFormat
      extends SequenceFileInputFormat<LongWritable, Text> {

    protected static final Log LOG =
        LogFactory.getLog(ReconstructionMapper.class);

    /**
     * Splits the input files into tasks handled by a single node.
     * We have to read the input files to do this, based on the number of
     * items in a sequence.
     */
    @Override
    public List<InputSplit> getSplits(JobContext job) throws IOException {
      long filesPerTask = 1;

      Path[] inPaths = getInputPaths(job);

      List<InputSplit> splits = new ArrayList<InputSplit>();

      long fileCounter = 0;

      for (Path inPath : inPaths) {
        FileSystem fs = inPath.getFileSystem(job.getConfiguration());

        if (!fs.getFileStatus(inPath).isDir()) {
          throw new IOException(inPath.toString() + " is not a directory");
        }

        FileStatus[] inFiles = fs.listStatus(inPath);

        for (FileStatus inFileStatus : inFiles) {
          Path inFile = inFileStatus.getPath();

          if (!inFileStatus.isDir() &&
              (inFile.getName().equals(job.getJobName() + IN_FILE_SUFFIX))) {

            fileCounter++;

            SequenceFile.Reader inFileReader =
                new SequenceFile.Reader(fs, inFile, job.getConfiguration());

            long startPos = inFileReader.getPosition();
            long counter = 0;

            // create an input split every filesPerTask items in the sequence
            LongWritable key = new LongWritable();
            Text value = new Text();
            try {
              while (inFileReader.next(key, value)) {
                if (counter % filesPerTask == filesPerTask - 1L) {
                  splits.add(new FileSplit(inFile, startPos,
                      inFileReader.getPosition() - startPos, null));
                  startPos = inFileReader.getPosition();
                }
                counter++;
              }

              // create input split for remaining items if necessary
              // this includes the case where no splits were created by the loop
              if (startPos != inFileReader.getPosition()) {
                splits.add(new FileSplit(inFile, startPos,
                    inFileReader.getPosition() - startPos, null));
              }
            } finally {
              inFileReader.close();
            }
          }
        }
      }

      LOG.info("created " + splits.size() + " input splits from " +
          fileCounter + " files");

      return splits;
    }

    /**
     * Indicates that the input files can be split.
     */
    @Override
    public boolean isSplitable(JobContext job, Path file) {
      return true;
    }
  }

  /**
   * Mapper for reconstructing stripes with lost blocks.
   */
  static class ReconstructionMapper
      extends Mapper<LongWritable, Text, Text, Text> {

    protected static final Log LOG =
        LogFactory.getLog(ReconstructionMapper.class);

    public static final String RECONSTRUCTOR_CLASS_TAG =
        "hdfs.blockintegrity.reconstructor";

    private BlockReconstructor reconstructor;
    private RepairType repairType;
    private Path sourceFile;
    private Path parityFile;
    private Decoder decoder;

    @Override
    protected void setup(Context context)
        throws IOException, InterruptedException {
      super.setup(context);

      Configuration conf = context.getConfiguration();
      Codec.initializeCodecs(conf);
      repairType = RepairType.valueOf(conf.get(REPAIR_TYPE));
      sourceFile = new Path(conf.get(SOURCE_PATH));
      parityFile = new Path(conf.get(PARITY_PATH));
      decoder = new Decoder(conf, Codec.getCodec(conf.get(CODEC_ID)));

      Class<? extends BlockReconstructor> reconstructorClass =
          context.getConfiguration()
              .getClass(RECONSTRUCTOR_CLASS_TAG, null, BlockReconstructor.class);

      if (reconstructorClass == null) {
        LOG.error("No class supplied for reconstructor " +
            "(prop " + RECONSTRUCTOR_CLASS_TAG + ")");
        context.progress();
        return;
      }

      // We dynamically instantiate the helper based on the helperClass member
      try {
        Constructor<? extends BlockReconstructor> ctor =
            reconstructorClass.getConstructor(new Class[]{Configuration.class});
        reconstructor = ctor.newInstance(conf);
      } catch (Exception ex) {
        throw new IOException("Could not instantiate a block reconstructor " +
            "based on class " + reconstructorClass, ex);
      }
    }

    @Override
    public void map(LongWritable key, Text fileText, Mapper.Context context)
        throws IOException, InterruptedException {
      String fileStr = fileText.toString();
      LOG.info("reconstructing " + fileStr);
      Path file = new Path(fileStr);

      try {
        switch (repairType) {
          case SOURCE_FILE:
            reconstructor.processFile(sourceFile, parityFile, decoder);
            break;
          case PARITY_FILE:
            reconstructor.processParityFile(sourceFile, parityFile, decoder);
            break;
        }
      } catch (Exception e) {
        LOG.error("Reconstructing file " + file + " failed", e);

        // report file as failed
        context.getCounter(Counter.FILES_FAILED).increment(1L);
        String outkey = fileStr;
        String outval = "failed";
        context.write(new Text(outkey), new Text(outval));

        throw new RuntimeException(e);
      }

      context.progress();
    }
  }

  /**
   * Checks the state of all tracked repair jobs, cleans up finished, failed
   * and timed-out jobs and returns one {@link Report} per repaired file.
   */
  @Override
  public List<Report> computeReports() throws IOException {
    initialize();
    cleanRecovery();
    List<Report> reports = new ArrayList<Report>();

    for (Map.Entry<String, ActiveRepair> entry : currentRepairs.entrySet()) {
      String fileName = entry.getKey();
      ActiveRepair job = entry.getValue();
      try {
        if (job.isComplete() && job.isSuccessful()) {
          LOG.info("REPAIR COMPLETE");
          reports.add(new Report(fileName, Report.Status.FINISHED));
          cleanup(job);
        } else if (job.isComplete() && !job.isSuccessful()) {
          LOG.info("REPAIR FAILED");
          reports.add(new Report(fileName, Report.Status.FAILED));
          cleanup(job);
        } else if (job.getStartTime() > 0 && System.currentTimeMillis() -
            job.getStartTime() > getMaxFixTimeForFile()) {
          LOG.info("Timeout: " +
              (System.currentTimeMillis() - job.getStartTime()) + " " +
              job.getStartTime());
          job.killJob();
          reports.add(new Report(fileName, Report.Status.CANCELED));
          cleanup(job);
        } else {
          LOG.info("REPAIR RUNNING");
          reports.add(new Report(fileName, Report.Status.ACTIVE));
        }
      } catch (Exception e) {
        LOG.info("Exception during completeness check", e);
        try {
          job.killJob();
        } catch (Exception e1) {
          // Best effort; the repair is reported as failed either way.
        }
        reports.add(new Report(fileName, Report.Status.FAILED));
        cleanup(job);
      }
    }

    for (Report report : reports) {
      Report.Status status = report.getStatus();
      if (status == Report.Status.FINISHED || status == Report.Status.FAILED ||
          status == Report.Status.CANCELED) {
        ActiveRepair activeRepair =
            currentRepairs.remove(report.getFilePath());
        completedJobs.add(activeRepair);
      }
    }

    return reports;
  }

  @Override
  public void cancelAll() {
    initialize();
    for (ActiveRepair job : currentRepairs.values()) {
      try {
        job.killJob();
      } catch (Exception e) {
        LOG.error("Exception", e);
      }
      cleanup(job);
    }
    currentRepairs.clear();
  }

  @Override
  public void cancel(String toCancel) {
    initialize();
    ActiveRepair job = currentRepairs.get(toCancel);
    try {
      job.killJob();
    } catch (Exception e) {
      LOG.error("Exception", e);
    }
    currentRepairs.remove(toCancel);
    cleanup(job);
  }
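  /**
   * Deletes the job's input and output directories. Failures are logged but
   * otherwise ignored.
   */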
  private void cleanup(ActiveRepair job) {
    Path outDir = null;
    try {
      outDir = job.getOutDir();
      outDir.getFileSystem(getConf()).delete(outDir, true);
    } catch (IOException e) {
      LOG.warn("Could not delete output dir " + outDir, e);
    }
    Path inDir = null;
    try {
      // We only create one input directory.
      inDir = job.getInDir();
      inDir.getFileSystem(getConf()).delete(inDir, true);
    } catch (IOException e) {
      LOG.warn("Could not delete input dir " + inDir, e);
    }
  }

  /**
   * Persists a submitted repair job in the database so that it can be
   * recovered after a restart.
   */
  private void persistActiveJob(final String path, final JobID jobId,
      final String inDir, final String outDir) throws IOException {
    new LightWeightRequestHandler(HDFSOperationType.PERSIST_ENCODING_JOB) {
      @Override
      public Object performTask() throws IOException {
        RepairJobsDataAccess da = (RepairJobsDataAccess) HdfsStorageFactory
            .getDataAccess(RepairJobsDataAccess.class);
        da.add(new RepairJob(jobId.getJtIdentifier(), jobId.getId(), path,
            inDir, outDir));
        return null;
      }
    }.handle();
  }

  /**
   * Removes the database entries of repair jobs that have completed.
   */
  private void cleanRecovery() throws IOException {
    new LightWeightRequestHandler(HDFSOperationType.DELETE_ENCODING_JOBS) {
      @Override
      public Object performTask() throws IOException {
        RepairJobsDataAccess da = (RepairJobsDataAccess) HdfsStorageFactory
            .getDataAccess(RepairJobsDataAccess.class);
        Iterator<ActiveRepair> it = completedJobs.iterator();
        while (it.hasNext()) {
          ActiveRepair job = it.next();
          JobID jobId = job.getJobId();
          da.delete(new RepairJob(jobId.getJtIdentifier(), jobId.getId()));
          it.remove();
        }
        return null;
      }
    }.handle();
  }

  /**
   * Recovers persisted repair jobs after a restart. Called lazily from the
   * public methods of this class.
   */
  private void initialize() {
    if (initialized) {
      return;
    }

    try {
      JobClient jobClient = new JobClient(getConf());
      for (RepairJob job : recoverActiveJobs()) {
        org.apache.hadoop.mapred.JobID jobId =
            new org.apache.hadoop.mapred.JobID(job.getJtIdentifier(),
                job.getJobId());
        RunningJob runningJob = jobClient.getJob(jobId);
        if (runningJob == null) {
          throw new IOException("Failed to recover");
        }
        ActiveRepair recovered = new ActiveRepair(jobId, runningJob,
            new Path(job.getInDir()), new Path(job.getOutDir()));
        currentRepairs.put(job.getPath(), recovered);
      }
    } catch (IOException e) {
      LOG.error("Encoding job recovery failed", e);
      throw new RuntimeException(e);
    }

    initialized = true;
  }

  /**
   * Loads all persisted repair jobs from the database.
   */
  private Collection<RepairJob> recoverActiveJobs() throws IOException {
    LightWeightRequestHandler handler = new LightWeightRequestHandler(
        HDFSOperationType.RECOVER_ENCODING_JOBS) {
      @Override
      public Object performTask() throws IOException {
        RepairJobsDataAccess da = (RepairJobsDataAccess) HdfsStorageFactory
            .getDataAccess(RepairJobsDataAccess.class);
        return da.findAll();
      }
    };
    return (Collection<RepairJob>) handler.handle();
  }
}