/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.namenode;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import static org.mockito.Mockito.doNothing;
import static org.mockito.Mockito.spy;

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.DeleteOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.OpInstanceCache;
import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.test.PathUtils;
import org.apache.hadoop.util.StringUtils;
import org.junit.Test;

import com.google.common.collect.Sets;

/**
 * This tests data recovery mode for the NameNode.
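 *
 * Recovery mode lets an operator salvage a NameNode whose edit log is
 * corrupt, by skipping over (or stopping at) unreadable transactions.
 * Outside of tests it is normally entered from the command line; a sketch
 * of the usual invocation (exact flags may vary between releases) is:
 * <pre>
 *   hdfs namenode -recover
 * </pre>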
 */
public class TestNameNodeRecovery {
  private static final Log LOG = LogFactory.getLog(TestNameNodeRecovery.class);
  private static final StartupOption recoverStartOpt = StartupOption.RECOVER;
  private static final File TEST_DIR =
      PathUtils.getTestDir(TestNameNodeRecovery.class);

  static {
    recoverStartOpt.setForce(MetaRecoveryContext.FORCE_ALL);
    EditLogFileOutputStream.setShouldSkipFsyncForTesting(true);
  }

  static void runEditLogTest(EditLogTestSetup elts) throws IOException {
    final File TEST_LOG_NAME = new File(TEST_DIR, "test_edit_log");
    final OpInstanceCache cache = new OpInstanceCache();
    EditLogFileOutputStream elfos = null;
    EditLogFileInputStream elfis = null;
    try {
      elfos = new EditLogFileOutputStream(new Configuration(),
          TEST_LOG_NAME, 0);
      elfos.create(NameNodeLayoutVersion.CURRENT_LAYOUT_VERSION);

      elts.addTransactionsToLog(elfos, cache);
      elfos.setReadyToFlush();
      elfos.flushAndSync(true);
      elfos.close();
      elfos = null;

      elfis = new EditLogFileInputStream(TEST_LOG_NAME);
      elfis.setMaxOpSize(elts.getMaxOpSize());

      // Reading through normally should throw an exception at the first
      // invalid transaction.
      Set<Long> validTxIds = elts.getValidTxIds();
      FSEditLogOp op = null;
      long prevTxId = 0;
      try {
        while (true) {
          op = elfis.nextOp();
          if (op == null) {
            break;
          }
          LOG.debug("read txid " + op.txid);
          if (!validTxIds.contains(op.getTransactionId())) {
            fail("read txid " + op.getTransactionId() +
                ", which we did not expect to find.");
          }
          validTxIds.remove(op.getTransactionId());
          prevTxId = op.getTransactionId();
        }
        if (elts.getLastValidTxId() != -1) {
          fail("failed to throw IOException as expected");
        }
      } catch (IOException e) {
        if (elts.getLastValidTxId() == -1) {
          fail("expected all transactions to be valid, but got exception " +
              "on txid " + prevTxId);
        } else {
          assertEquals(prevTxId, elts.getLastValidTxId());
        }
      }

      if (elts.getLastValidTxId() != -1) {
        // Now skip over the bad transaction, the way recovery mode does,
        // and check that we still pick up the remaining valid ones.
        op = null;
        prevTxId = 0;
        try {
          while (true) {
            op = elfis.nextValidOp();
            if (op == null) {
              break;
            }
            prevTxId = op.getTransactionId();
            assertTrue(validTxIds.remove(op.getTransactionId()));
          }
        } catch (Throwable e) {
          fail("caught IOException while trying to skip over bad " +
              "transaction.  message was " + e.getMessage() +
              "\nstack trace\n" + StringUtils.stringifyException(e));
        }
      }
      // We should have read every valid transaction.
      assertTrue(validTxIds.isEmpty());
    } finally {
      IOUtils.cleanup(LOG, elfos, elfis);
    }
  }

  /**
   * A test scenario for the edit log.
   */
  private static abstract class EditLogTestSetup {
    /**
     * Set up the edit log.
     */
    abstract public void addTransactionsToLog(EditLogOutputStream elos,
        OpInstanceCache cache) throws IOException;

    /**
     * Get the transaction ID right before the transaction which causes the
     * normal edit log loading process to bail out, or -1 if the first
     * transaction should be bad.
     */
    abstract public long getLastValidTxId();

    /**
     * Get the transaction IDs which should exist and be valid in this
     * edit log.
     */
    abstract public Set<Long> getValidTxIds();

    /**
     * Return the maximum opcode size we will use for input.
     */
    public int getMaxOpSize() {
      return DFSConfigKeys.DFS_NAMENODE_MAX_OP_SIZE_DEFAULT;
    }
  }
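  /**
   * Pad the edit log with the given number of 0xff fill bytes, similar to
   * what stream preallocation leaves at the end of a log; the reader is
   * expected to treat trailing fill bytes as padding, not as corruption.
   */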
  static void padEditLog(EditLogOutputStream elos, int paddingLength)
      throws IOException {
    if (paddingLength <= 0) {
      return;
    }
    byte buf[] = new byte[4096];
    for (int i = 0; i < buf.length; i++) {
      buf[i] = (byte)-1;
    }
    int pad = paddingLength;
    while (pad > 0) {
      int toWrite = pad > buf.length ? buf.length : pad;
      elos.writeRaw(buf, 0, toWrite);
      pad -= toWrite;
    }
  }

  static void addDeleteOpcode(EditLogOutputStream elos,
      OpInstanceCache cache, long txId, String path) throws IOException {
    DeleteOp op = DeleteOp.getInstance(cache);
    op.setTransactionId(txId);
    op.setPath(path);
    op.setTimestamp(0);
    elos.write(op);
  }

  /**
   * Test the scenario where we have an empty edit log.
   *
   * This class is also useful in testing whether we can correctly handle
   * various amounts of padding bytes at the end of the log.  We should be
   * able to handle any amount of padding (including no padding) without
   * throwing an exception.
   */
  private static class EltsTestEmptyLog extends EditLogTestSetup {
    private final int paddingLength;

    public EltsTestEmptyLog(int paddingLength) {
      this.paddingLength = paddingLength;
    }

    @Override
    public void addTransactionsToLog(EditLogOutputStream elos,
        OpInstanceCache cache) throws IOException {
      padEditLog(elos, paddingLength);
    }

    @Override
    public long getLastValidTxId() {
      return -1;
    }

    @Override
    public Set<Long> getValidTxIds() {
      return new HashSet<Long>();
    }
  }

  /** Test an empty edit log */
  @Test(timeout=180000)
  public void testEmptyLog() throws IOException {
    runEditLogTest(new EltsTestEmptyLog(0));
  }

  /** Test an empty edit log with padding */
  @Test(timeout=180000)
  public void testEmptyPaddedLog() throws IOException {
    runEditLogTest(new EltsTestEmptyLog(
        EditLogFileOutputStream.MIN_PREALLOCATION_LENGTH));
  }

  /** Test an empty edit log with extra-long padding */
  @Test(timeout=180000)
  public void testEmptyExtraPaddedLog() throws IOException {
    runEditLogTest(new EltsTestEmptyLog(
        3 * EditLogFileOutputStream.MIN_PREALLOCATION_LENGTH));
  }
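  /*
   * The reader refuses to deserialize any single opcode larger than the
   * configured maximum op size, treating it as corruption.  The scenario
   * below shrinks the limit to 40 bytes so that an otherwise valid DeleteOp
   * with a long path trips it; the default limit exists so that a corrupt
   * length field cannot make the reader allocate an enormous buffer.
   */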
  /**
   * Test using a non-default maximum opcode length.
   */
  private static class EltsTestNonDefaultMaxOpSize extends EditLogTestSetup {
    public EltsTestNonDefaultMaxOpSize() {
    }

    @Override
    public void addTransactionsToLog(EditLogOutputStream elos,
        OpInstanceCache cache) throws IOException {
      addDeleteOpcode(elos, cache, 0, "/foo");
      addDeleteOpcode(elos, cache, 1,
          "/supercalifragalisticexpialadocius.supercalifragalisticexpialadocius");
    }

    @Override
    public long getLastValidTxId() {
      return 0;
    }

    @Override
    public Set<Long> getValidTxIds() {
      return Sets.newHashSet(0L);
    }

    @Override
    public int getMaxOpSize() {
      return 40;
    }
  }

  /** Test reading an edit log with a non-default maximum opcode size. */
  @Test(timeout=180000)
  public void testNonDefaultMaxOpSize() throws IOException {
    runEditLogTest(new EltsTestNonDefaultMaxOpSize());
  }

  /**
   * Test the scenario where an edit log contains some padding (0xff) bytes
   * followed by valid opcode data.
   *
   * These edit logs are corrupt, but all the opcodes should be recoverable
   * with recovery mode.
   */
  private static class EltsTestOpcodesAfterPadding extends EditLogTestSetup {
    private final int paddingLength;

    public EltsTestOpcodesAfterPadding(int paddingLength) {
      this.paddingLength = paddingLength;
    }

    @Override
    public void addTransactionsToLog(EditLogOutputStream elos,
        OpInstanceCache cache) throws IOException {
      padEditLog(elos, paddingLength);
      addDeleteOpcode(elos, cache, 0, "/foo");
    }

    @Override
    public long getLastValidTxId() {
      return 0;
    }

    @Override
    public Set<Long> getValidTxIds() {
      return Sets.newHashSet(0L);
    }
  }

  @Test(timeout=180000)
  public void testOpcodesAfterPadding() throws IOException {
    runEditLogTest(new EltsTestOpcodesAfterPadding(
        EditLogFileOutputStream.MIN_PREALLOCATION_LENGTH));
  }

  @Test(timeout=180000)
  public void testOpcodesAfterExtraPadding() throws IOException {
    runEditLogTest(new EltsTestOpcodesAfterPadding(
        3 * EditLogFileOutputStream.MIN_PREALLOCATION_LENGTH));
  }

  private static class EltsTestGarbageInEditLog extends EditLogTestSetup {
    private final long BAD_TXID = 4;
    private final long MAX_TXID = 10;

    @Override
    public void addTransactionsToLog(EditLogOutputStream elos,
        OpInstanceCache cache) throws IOException {
      for (long txid = 1; txid <= MAX_TXID; txid++) {
        if (txid == BAD_TXID) {
          byte garbage[] = { 0x1, 0x2, 0x3 };
          elos.writeRaw(garbage, 0, garbage.length);
        } else {
          DeleteOp op;
          op = DeleteOp.getInstance(cache);
          op.setTransactionId(txid);
          op.setPath("/foo." + txid);
          op.setTimestamp(txid);
          elos.write(op);
        }
      }
    }

    @Override
    public long getLastValidTxId() {
      return BAD_TXID - 1;
    }

    @Override
    public Set<Long> getValidTxIds() {
      return Sets.newHashSet(1L, 2L, 3L, 5L, 6L, 7L, 8L, 9L, 10L);
    }
  }

  /**
   * Test that we can successfully recover from a situation where there is
   * garbage in the middle of the edit log file output stream.
   */
  @Test(timeout=180000)
  public void testSkipEdit() throws IOException {
    runEditLogTest(new EltsTestGarbageInEditLog());
  }

  /**
   * An algorithm for corrupting an edit log.
   */
  static interface Corruptor {
    /**
     * Corrupt an edit log file.
     *
     * @param editFile   The edit log file
     */
    public void corrupt(File editFile) throws IOException;

    /**
     * Explain whether we need to read the log in recovery mode.
     *
     * @param finalized  True if the edit log in question is finalized.
     *                   We're a little more lax about reading unfinalized
     *                   logs.  We will allow a small amount of garbage at
     *                   the end.  In a finalized log, every byte must be
     *                   perfect.
     *
     * @return           Whether we need to read the log in recovery mode
     */
    public boolean needRecovery(boolean finalized);

    /**
     * Get the name of this corruptor.
     *
     * @return           The Corruptor name
     */
    public String getName();
  }

  static class TruncatingCorruptor implements Corruptor {
    @Override
    public void corrupt(File editFile) throws IOException {
      // Corrupt the last edit by truncating the file's final byte.
      long fileLen = editFile.length();
      RandomAccessFile rwf = new RandomAccessFile(editFile, "rw");
      rwf.setLength(fileLen - 1);
      rwf.close();
    }

    @Override
    public boolean needRecovery(boolean finalized) {
      // A finalized log must be byte-perfect, so truncation requires
      // recovery; an in-progress log is allowed to end abruptly.
      return finalized;
    }

    @Override
    public String getName() {
      return "truncated";
    }
  }

  static class PaddingCorruptor implements Corruptor {
    @Override
    public void corrupt(File editFile) throws IOException {
      // Add junk to the end of the file: a run of zeros followed by
      // 0xd, 0xe, 0xa, 0xd, which cannot be mistaken for padding.
      RandomAccessFile rwf = new RandomAccessFile(editFile, "rw");
      rwf.seek(editFile.length());
      for (int i = 0; i < 129; i++) {
        rwf.write((byte)0);
      }
      rwf.write(0xd);
      rwf.write(0xe);
      rwf.write(0xa);
      rwf.write(0xd);
      rwf.close();
    }

    @Override
    public boolean needRecovery(boolean finalized) {
      // With finalized edit logs, we ignore what's at the end as long as we
      // can make it to the correct transaction ID.
      // With unfinalized edit logs, the finalization process ignores garbage
      // at the end.
      return false;
    }

    @Override
    public String getName() {
      return "padFatal";
    }
  }
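  /**
   * A corruptor that appends only padding bytes (0x00 or 0xff) that the log
   * reader already recognizes as end-of-log filler, so no recovery should
   * be needed afterwards.
   */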
  static class SafePaddingCorruptor implements Corruptor {
    private final byte padByte;

    public SafePaddingCorruptor(byte padByte) {
      this.padByte = padByte;
      assert ((this.padByte == 0) || (this.padByte == -1));
    }

    @Override
    public void corrupt(File editFile) throws IOException {
      // Add junk to the end of the file
      RandomAccessFile rwf = new RandomAccessFile(editFile, "rw");
      rwf.seek(editFile.length());
      rwf.write((byte)-1);
      for (int i = 0; i < 1024; i++) {
        rwf.write(padByte);
      }
      rwf.close();
    }

    @Override
    public boolean needRecovery(boolean finalized) {
      return false;
    }

    @Override
    public String getName() {
      return "pad" + ((int)padByte);
    }
  }

  /**
   * Create a test configuration that will exercise the initializeGenericKeys
   * code path.  This is a regression test for HDFS-4279.
   */
  static void setupRecoveryTestConf(Configuration conf) throws IOException {
    conf.set(DFSConfigKeys.DFS_NAMESERVICES, "ns1");
    conf.set(DFSConfigKeys.DFS_HA_NAMENODE_ID_KEY, "nn1");
    conf.set(DFSUtil.addKeySuffixes(DFSConfigKeys.DFS_HA_NAMENODES_KEY_PREFIX,
        "ns1"), "nn1,nn2");
    String baseDir = System.getProperty(
        MiniDFSCluster.PROP_TEST_BUILD_DATA, "build/test/data") + "/dfs/";
    File nameDir = new File(baseDir, "nameR");
    File secondaryDir = new File(baseDir, "namesecondaryR");
    conf.set(DFSUtil.addKeySuffixes(DFSConfigKeys.
        DFS_NAMENODE_NAME_DIR_KEY, "ns1", "nn1"),
        nameDir.getCanonicalPath());
    conf.set(DFSUtil.addKeySuffixes(DFSConfigKeys.
        DFS_NAMENODE_CHECKPOINT_DIR_KEY, "ns1", "nn1"),
        secondaryDir.getCanonicalPath());
    conf.unset(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY);
    conf.unset(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_DIR_KEY);
    FileUtils.deleteQuietly(nameDir);
    if (!nameDir.mkdirs()) {
      throw new RuntimeException("failed to make directory " +
          nameDir.getAbsolutePath());
    }
    FileUtils.deleteQuietly(secondaryDir);
    if (!secondaryDir.mkdirs()) {
      throw new RuntimeException("failed to make directory " +
          secondaryDir.getAbsolutePath());
    }
  }
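  /**
   * Run a full corrupt-then-recover cycle: build a small namespace on a
   * MiniDFSCluster, corrupt the resulting edit log with the given corruptor,
   * verify that a normal restart fails exactly when the corruptor says it
   * should, run the NameNode in recovery mode, and finally confirm that a
   * normal restart sees the recovered namespace.
   *
   * @param corruptor  How to damage the edit log.
   * @param finalize   Whether the log segment should be finalized before
   *                   it is corrupted.
   */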
  static void testNameNodeRecoveryImpl(Corruptor corruptor, boolean finalize)
      throws IOException {
    final String TEST_PATH = "/test/path/dir";
    final String TEST_PATH2 = "/second/dir";
    final boolean needRecovery = corruptor.needRecovery(finalize);

    // start a cluster
    Configuration conf = new HdfsConfiguration();
    setupRecoveryTestConf(conf);
    MiniDFSCluster cluster = null;
    FileSystem fileSys = null;
    StorageDirectory sd = null;
    try {
      cluster = new MiniDFSCluster.Builder(conf).numDataNodes(0)
          .manageNameDfsDirs(false).build();
      cluster.waitActive();
      if (!finalize) {
        // Normally, the in-progress edit log would be finalized by
        // FSEditLog#endCurrentLogSegment.  For testing purposes, we
        // disable that here.
        FSEditLog spyLog =
            spy(cluster.getNameNode().getFSImage().getEditLog());
        doNothing().when(spyLog).endCurrentLogSegment(true);
        cluster.getNameNode().getFSImage().setEditLogForTesting(spyLog);
      }
      fileSys = cluster.getFileSystem();
      final FSNamesystem namesystem = cluster.getNamesystem();
      FSImage fsimage = namesystem.getFSImage();
      fileSys.mkdirs(new Path(TEST_PATH));
      fileSys.mkdirs(new Path(TEST_PATH2));
      sd = fsimage.getStorage().dirIterator(NameNodeDirType.EDITS).next();
    } finally {
      if (cluster != null) {
        cluster.shutdown();
      }
    }

    File editFile = FSImageTestUtil.findLatestEditsLog(sd).getFile();
    assertTrue("Should exist: " + editFile, editFile.exists());

    // Corrupt the edit log
    LOG.info("corrupting edit log file '" + editFile + "'");
    corruptor.corrupt(editFile);

    // If needRecovery == true, make sure that we can't start the
    // cluster normally before recovery
    cluster = null;
    try {
      LOG.debug("trying to start normally (this should fail)...");
      cluster = new MiniDFSCluster.Builder(conf).numDataNodes(0)
          .enableManagedDfsDirsRedundancy(false).format(false).build();
      cluster.waitActive();
      cluster.shutdown();
      if (needRecovery) {
        fail("expected the corrupted edit log to prevent normal startup");
      }
    } catch (IOException e) {
      if (!needRecovery) {
        LOG.error("Got unexpected failure with corruptor " +
            corruptor.getName(), e);
        fail("got unexpected exception " + e.getMessage());
      }
    } finally {
      if (cluster != null) {
        cluster.shutdown();
      }
    }

    // Perform NameNode recovery.
    // Even if there was nothing wrong previously (needRecovery == false),
    // this should still work fine.
    cluster = null;
    try {
      LOG.debug("running recovery...");
      cluster = new MiniDFSCluster.Builder(conf).numDataNodes(0)
          .enableManagedDfsDirsRedundancy(false).format(false)
          .startupOption(recoverStartOpt).build();
    } catch (IOException e) {
      fail("caught IOException while trying to recover. " +
          "message was " + e.getMessage() +
          "\nstack trace\n" + StringUtils.stringifyException(e));
    } finally {
      if (cluster != null) {
        cluster.shutdown();
      }
    }

    // Make sure that we can start the cluster normally after recovery
    cluster = null;
    try {
      LOG.debug("starting cluster normally after recovery...");
      cluster = new MiniDFSCluster.Builder(conf).numDataNodes(0)
          .enableManagedDfsDirsRedundancy(false).format(false).build();
      LOG.debug("successfully recovered the " + corruptor.getName() +
          " corrupted edit log");
      cluster.waitActive();
      assertTrue(cluster.getFileSystem().exists(new Path(TEST_PATH)));
    } catch (IOException e) {
      fail("failed to recover.  Error message: " + e.getMessage());
    } finally {
      if (cluster != null) {
        cluster.shutdown();
      }
    }
  }
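  /*
   * Each corruptor below is exercised twice: once against a finalized edit
   * log segment and once against an in-progress one, since the reader is
   * stricter about finalized logs (see Corruptor#needRecovery).
   */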
  /**
   * Test that we can successfully recover from a situation where the last
   * entry in the edit log has been truncated.
   */
  @Test(timeout=180000)
  public void testRecoverTruncatedEditLog() throws IOException {
    testNameNodeRecoveryImpl(new TruncatingCorruptor(), true);
    testNameNodeRecoveryImpl(new TruncatingCorruptor(), false);
  }

  /**
   * Test that we can successfully recover from a situation where the last
   * entry in the edit log has been padded with garbage.
   */
  @Test(timeout=180000)
  public void testRecoverPaddedEditLog() throws IOException {
    testNameNodeRecoveryImpl(new PaddingCorruptor(), true);
    testNameNodeRecoveryImpl(new PaddingCorruptor(), false);
  }

  /**
   * Test that we do not need to recover from a situation where the last
   * entry in the edit log has been padded with 0x00 bytes.
   */
  @Test(timeout=180000)
  public void testRecoverZeroPaddedEditLog() throws IOException {
    testNameNodeRecoveryImpl(new SafePaddingCorruptor((byte)0), true);
    testNameNodeRecoveryImpl(new SafePaddingCorruptor((byte)0), false);
  }

  /**
   * Test that we do not need to recover from a situation where the last
   * entry in the edit log has been padded with 0xff bytes.
   */
  @Test(timeout=180000)
  public void testRecoverNegativeOnePaddedEditLog() throws IOException {
    testNameNodeRecoveryImpl(new SafePaddingCorruptor((byte)-1), true);
    testNameNodeRecoveryImpl(new SafePaddingCorruptor((byte)-1), false);
  }
}