/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hdfs.qjournal.server; import java.io.Closeable; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStreamWriter; import java.net.URL; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdfs.DFSUtil; import org.apache.hadoop.hdfs.qjournal.protocol.JournalNotFormattedException; import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocol; import org.apache.hadoop.hdfs.qjournal.protocol.JournalOutOfSyncException; import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetStorageStateProto; import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.NewEpochResponseProto; import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PersistedRecoveryPaxosData; import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryResponseProto; import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto; import org.apache.hadoop.hdfs.qjournal.protocol.RequestInfo; import org.apache.hadoop.hdfs.server.common.HdfsConstants; import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption; import org.apache.hadoop.hdfs.server.common.HdfsConstants.Transition; import org.apache.hadoop.hdfs.server.common.StorageErrorReporter; import org.apache.hadoop.hdfs.server.namenode.EditLogOutputStream; import org.apache.hadoop.hdfs.server.namenode.FSEditLog; import org.apache.hadoop.hdfs.server.namenode.FSImageStorageInspector.FSImageFile; import org.apache.hadoop.hdfs.server.namenode.FileImageManager; import org.apache.hadoop.hdfs.server.namenode.FileJournalManager; import org.apache.hadoop.hdfs.server.namenode.FileJournalManager.EditLogFile; import org.apache.hadoop.hdfs.server.namenode.FSImageTransactionalStorageInspector; import org.apache.hadoop.hdfs.server.namenode.ImageSet; import org.apache.hadoop.hdfs.server.namenode.JournalManager; import org.apache.hadoop.hdfs.server.namenode.TransferFsImage; import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest; import org.apache.hadoop.hdfs.server.protocol.RemoteImage; import org.apache.hadoop.hdfs.server.protocol.RemoteImageManifest; import org.apache.hadoop.hdfs.util.AtomicFileOutputStream; import org.apache.hadoop.hdfs.util.BestEffortLongFile; import org.apache.hadoop.hdfs.util.MD5FileUtils; import org.apache.hadoop.hdfs.util.PersistentLongFile; 
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.io.ShortVoid;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.ipc.Server;

import com.google.common.base.Charsets;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Range;
import com.google.common.collect.Ranges;

/**
 * A JournalNode can manage journals and optionally images for several clusters
 * at once. Each such journal is entirely independent despite being hosted by
 * the same JVM.
 */
public class Journal implements Closeable {
  public static final Log LOG = LogFactory.getLog(Journal.class);

  private final JournalNode journalNode;

  // Current writing state
  private EditLogOutputStream curSegment;
  private long curSegmentTxId = HdfsConstants.INVALID_TXID;
  private long nextTxId = HdfsConstants.INVALID_TXID;
  private long highestWrittenTxId = 0;
  private long currentSegmentWrittenBytes;

  // minimum txid of the underlying segments (best effort)
  // updated when we purge logs
  private long minTxid = 0;

  private final String journalId;

  private final JNStorage journalStorage;
  private final JNStorage imageStorage;

  private final Map<Long, MD5Hash> checkpointImageDigests =
      new HashMap<Long, MD5Hash>();
  private final FileImageManager imageManager;
  private final FileJournalManager fjm;

  private long mostRecentCheckpointTxid = HdfsConstants.INVALID_TXID;

  /**
   * When a new writer comes along, it asks each node to promise
   * to ignore requests from any previous writer, as identified
   * by epoch number. In order to make such a promise, the epoch
   * number of that writer is stored persistently on disk.
   */
  private PersistentLongFile lastPromisedEpoch;

  /**
   * Each IPC that comes from a given client contains a serial number
   * which only increases from the client's perspective. Whenever
   * we switch epochs, we reset this back to -1. Whenever an IPC
   * comes from a client, we ensure that it is strictly higher
   * than any previous IPC. This guards against any bugs in the IPC
   * layer that would re-order IPCs or cause a stale retry from an old
   * request to resurface and confuse things.
   */
  private long currentEpochIpcSerial = -1;

  /**
   * The epoch number of the last writer to actually write a transaction.
   * This is used to differentiate log segments after a crash at the very
   * beginning of a segment. See the 'testNewerVersionOfSegmentWins'
   * test case.
   */
  private PersistentLongFile lastWriterEpoch;

  /**
   * Lower-bound on the last committed transaction ID. This is not
   * depended upon for correctness, but acts as a sanity check
   * during the recovery procedures, and as a visibility mark
   * for clients reading in-progress logs.
   */
  private BestEffortLongFile committedTxnId;
  private long lastPersistedCommittedTxId = 0;

  private static final String LAST_PROMISED_FILENAME = "last-promised-epoch";
  private static final String LAST_WRITER_EPOCH = "last-writer-epoch";
  private static final String COMMITTED_TXID_FILENAME = "committed-txid";

  private final JournalMetrics metrics;

  /**
   * Time threshold for sync calls, beyond which a warning should be logged.
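   * The threshold is in milliseconds; journal() converts its measured sync
   * time from microseconds before comparing against this value.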
   */
  private static final int WARN_SYNC_MILLIS_THRESHOLD = 1000;

  Journal(File logDir, File imageDir, String journalId,
      StorageErrorReporter errorReporter, JournalNode journalNode)
      throws IOException {
    this.journalNode = journalNode;

    // initialize storage directories
    journalStorage = new JNStorage(logDir, errorReporter, false,
        journalNode.getConf());
    imageStorage = new JNStorage(imageDir, errorReporter, true,
        journalNode.getConf());

    // initialize journal and image managers
    this.fjm = new FileJournalManager(journalStorage.getSingularStorageDir(),
        null, errorReporter);
    this.imageManager = new FileImageManager(
        imageStorage.getStorageDirectory(), imageStorage);

    this.journalId = journalId;
    this.metrics = JournalMetrics.create(this);

    refreshCachedData();

    EditLogFile latest = scanStorageForLatestEdits();
    if (latest != null) {
      highestWrittenTxId = latest.getLastTxId();
      metrics.setLastWrittenTxId(highestWrittenTxId);
    }
  }

  /**
   * Reload any data that may have been cached. This is necessary
   * when we first load the Journal, but also after any formatting
   * operation, since the cached data is no longer relevant.
   * @throws IOException
   */
  private synchronized void refreshCachedData() throws IOException {
    IOUtils.closeStream(committedTxnId);

    File currentDir = journalStorage.getSingularStorageDir().getCurrentDir();
    this.lastPromisedEpoch = new PersistentLongFile(
        new File(currentDir, LAST_PROMISED_FILENAME), 0);
    this.lastWriterEpoch = new PersistentLongFile(
        new File(currentDir, LAST_WRITER_EPOCH), 0);
    this.committedTxnId = new BestEffortLongFile(
        new File(currentDir, COMMITTED_TXID_FILENAME),
        HdfsConstants.INVALID_TXID);

    metrics.lastWriterEpoch.set(lastWriterEpoch.get());
  }

  /**
   * After an upgrade we must ensure that the current directory still holds all
   * epoch, committed txid and paxos files that it had before we did the
   * upgrade.
   */
  private synchronized void copyMetaFilesForUpgrade() throws IOException {
    Configuration conf = new Configuration();
    File currentDir = journalStorage.getSingularStorageDir().getCurrentDir();
    File prevDir = journalStorage.getSingularStorageDir().getPreviousTmp();

    FileSystem fs = FileSystem.getLocal(conf).getRaw();
    FileUtil.copy(new File(prevDir, LAST_PROMISED_FILENAME), fs, new File(
        currentDir, LAST_PROMISED_FILENAME), false, conf);
    FileUtil.copy(new File(prevDir, LAST_WRITER_EPOCH), fs, new File(
        currentDir, LAST_WRITER_EPOCH), false, conf);
    FileUtil.copy(new File(prevDir, COMMITTED_TXID_FILENAME), fs, new File(
        currentDir, COMMITTED_TXID_FILENAME), false, conf);
    FileUtil.copy(new File(prevDir, JNStorage.PAXOS_DIR), fs, new File(
        currentDir, JNStorage.PAXOS_DIR), false, conf);
  }

  /**
   * Scan the local storage directory, and return the segment containing
   * the highest transaction.
   * @return the EditLogFile with the highest transactions, or null
   * if no files exist.
   */
  private synchronized EditLogFile scanStorageForLatestEdits()
      throws IOException {
    if (!fjm.getStorageDirectory().getCurrentDir().exists()) {
      return null;
    }

    LOG.info("Scanning storage " + fjm);
    List<EditLogFile> files = fjm.getLogFiles(0);

    while (!files.isEmpty()) {
      EditLogFile latestLog = files.remove(files.size() - 1);
      latestLog.validateLog();
      LOG.info("Latest log is " + latestLog);
      if (latestLog.getLastTxId() == HdfsConstants.INVALID_TXID) {
        // the log contains no transactions
        LOG.warn("Latest log " + latestLog + " has no transactions. " +
            "moving it aside and looking for previous log");
        latestLog.moveAsideEmptyFile();
      } else {
        return latestLog;
      }
    }

    LOG.info("No files in " + fjm);
    return null;
  }

  public void transitionJournal(NamespaceInfo nsInfo, Transition transition,
      StartupOption startOpt) throws IOException {
    switch (transition) {
    case UPGRADE:
      doUpgradeJournal(nsInfo);
      break;
    case COMPLETE_UPGRADE:
      completeUpgradeJournal(nsInfo);
      break;
    case RECOVER:
      recoverJournal(startOpt);
      break;
    case FORMAT:
      formatJournal(nsInfo);
      break;
    case ROLLBACK:
      rollbackJournal(nsInfo);
      break;
    case FINALIZE:
      finalizeJournal();
      break;
    }
  }

  public void transitionImage(NamespaceInfo nsInfo, Transition transition,
      StartupOption startOpt) throws IOException {
    switch (transition) {
    case UPGRADE:
      doUpgradeImage(nsInfo);
      break;
    case COMPLETE_UPGRADE:
      completeUpgradeImage(nsInfo);
      break;
    case RECOVER:
      recoverImage(startOpt);
      break;
    case FORMAT:
      formatImage(nsInfo);
      break;
    case ROLLBACK:
      rollbackImage(nsInfo);
      break;
    case FINALIZE:
      finalizeImage();
      break;
    }
  }

  /**
   * Format the local image storage with the given namespace.
   */
  private void formatImage(NamespaceInfo nsInfo) throws IOException {
    Preconditions.checkState(nsInfo.getNamespaceID() != 0,
        "can't format with uninitialized namespace info: %s",
        nsInfo.toColonSeparatedString());
    LOG.info("Formatting image " + this.getJournalId()
        + " with namespace info: (" + nsInfo.toColonSeparatedString() + ")");
    imageStorage.backupDirs();
    imageStorage.format(nsInfo);
    // clear obsolete image digests
    checkpointImageDigests.clear();
  }

  /**
   * Finalize upgrade for the local image storage.
   */
  public void finalizeImage() throws IOException {
    LOG.info("Finalizing upgrade for image " + this.getJournalId());
    imageStorage.finalizeStorage();
  }

  /**
   * Rollback the local image storage with the given namespace.
   */
  public void rollbackImage(NamespaceInfo nsInfo) throws IOException {
    Preconditions.checkState(nsInfo.getLayoutVersion() != 0,
        "can't rollback with uninitialized layout version: %s",
        nsInfo.toColonSeparatedString());
    LOG.info("Rolling back image " + this.getJournalId()
        + " with namespace info: (" + nsInfo.toColonSeparatedString() + ")");
    imageStorage.rollback(nsInfo);
  }

  /**
   * Upgrade the local image storage with the given namespace.
   */
  private void doUpgradeImage(NamespaceInfo nsInfo) throws IOException {
    Preconditions.checkState(nsInfo.getNamespaceID() != 0,
        "can't upgrade with uninitialized namespace info: %s",
        nsInfo.toColonSeparatedString());
    LOG.info("Upgrading image " + this.getJournalId()
        + " with namespace info: (" + nsInfo.toColonSeparatedString() + ")");
    // clear the digest for the most recent image, it might change during
    // upgrade
    checkpointImageDigests.remove(mostRecentCheckpointTxid);
    imageStorage.doUpgrade(nsInfo);
  }

  /**
   * Complete the upgrade for local image storage with the given namespace.
   */
  private void completeUpgradeImage(NamespaceInfo nsInfo) throws IOException {
    Preconditions.checkState(nsInfo.getNamespaceID() != 0,
        "can't upgrade with uninitialized namespace info: %s",
        nsInfo.toColonSeparatedString());
    LOG.info("Completing upgrade for image " + this.getJournalId()
        + " with namespace info: (" + nsInfo.toColonSeparatedString() + ")");
    // Do something about checkpoint image digests.
    imageStorage.completeUpgrade(nsInfo);
  }

  /**
   * Format the local journal storage with the given namespace.
   */
  private void formatJournal(NamespaceInfo nsInfo) throws IOException {
    Preconditions.checkState(nsInfo.getNamespaceID() != 0,
        "can't format with uninitialized namespace info: %s",
        nsInfo.toColonSeparatedString());
    LOG.info("Formatting journal " + this.getJournalId()
        + " with namespace info: (" + nsInfo.toColonSeparatedString() + ")");
    journalStorage.backupDirs();
    journalStorage.format(nsInfo);
    refreshCachedData();
  }

  /**
   * Complete the upgrade for local journal storage with the given namespace.
   */
  private void completeUpgradeJournal(NamespaceInfo nsInfo) throws IOException {
    Preconditions.checkState(nsInfo.getNamespaceID() != 0,
        "can't upgrade with uninitialized namespace info: %s",
        nsInfo.toColonSeparatedString());
    LOG.info("Completing upgrade for journal " + this.getJournalId()
        + " with namespace info: (" + nsInfo.toColonSeparatedString() + ")");
    journalStorage.completeUpgrade(nsInfo);
  }

  /**
   * Finalize upgrade for the local journal storage.
   */
  public void finalizeJournal() throws IOException {
    LOG.info("Finalizing upgrade for journal " + this.getJournalId());
    journalStorage.finalizeStorage();
  }

  /**
   * Rollback the local journal storage with the given namespace.
   */
  public void rollbackJournal(NamespaceInfo nsInfo) throws IOException {
    Preconditions.checkState(nsInfo.getLayoutVersion() != 0,
        "can't rollback with uninitialized layout version: %s",
        nsInfo.toColonSeparatedString());
    LOG.info("Rolling back journal " + this.getJournalId()
        + " with namespace info: (" + nsInfo.toColonSeparatedString() + ")");
    journalStorage.rollback(nsInfo);
    refreshCachedData();
  }

  /**
   * Upgrade the local journal storage with the given namespace.
   */
  private void doUpgradeJournal(NamespaceInfo nsInfo) throws IOException {
    Preconditions.checkState(nsInfo.getNamespaceID() != 0,
        "can't upgrade with uninitialized namespace info: %s",
        nsInfo.toColonSeparatedString());
    LOG.info("Upgrading journal " + this.getJournalId()
        + " with namespace info: (" + nsInfo.toColonSeparatedString() + ")");
    journalStorage.doUpgrade(nsInfo);
    copyMetaFilesForUpgrade();
    refreshCachedData();
  }

  /**
   * Recover the local journal storage.
   */
  private void recoverJournal(StartupOption startOpt) throws IOException {
    LOG.info("Recovering journal " + this.getJournalId());
    journalStorage.recover(startOpt);
  }

  /**
   * Recover the local image storage.
   */
  private void recoverImage(StartupOption startOpt) throws IOException {
    LOG.info("Recovering image " + this.getJournalId());
    imageStorage.recover(startOpt);
  }

  /**
   * Unlock and release resources.
   */
  @Override // Closeable
  public void close() throws IOException {
    journalStorage.close();
    imageStorage.close();
    IOUtils.closeStream(committedTxnId);
  }

  public JNStorage getJournalStorage() throws IOException {
    // this function is used by servlets to store and serve edit log segments
    // we should fail immediately if the edits storage is not formatted
    checkJournalStorageFormatted();
    return journalStorage;
  }

  public JNStorage getImageStorage() throws IOException {
    // this function is used by servlets to store and serve image
    // we should fail immediately if the image storage is not formatted
    checkImageStorageFormatted();
    return imageStorage;
  }

  String getJournalId() {
    return journalId;
  }

  /**
   * @return the last epoch which this node has promised not to accept
   * any lower epoch, or 0 if no promises have been made.
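   * The promise itself is persisted in the "last-promised-epoch" file under
   * the journal storage's current/ directory (see refreshCachedData()).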
*/ synchronized long getLastPromisedEpoch() throws IOException { checkJournalStorageFormatted(); return lastPromisedEpoch.get(); } synchronized public long getLastWriterEpoch() throws IOException { checkJournalStorageFormatted(); return lastWriterEpoch.get(); } synchronized long getCommittedTxnId() throws IOException { return committedTxnId.get(); } synchronized long getCurrentLagTxns() throws IOException { long committed = committedTxnId.get(); if (committed == 0) { return 0; } return Math.max(committed - highestWrittenTxId, 0L); } public synchronized long getHighestWrittenTxId() { return highestWrittenTxId; } JournalMetrics getMetrics() { return metrics; } /** * Try to create a new epoch for this journal. * @param nsInfo the namespace, which is verified for consistency or used to * format, if the Journal has not yet been written to. * @param epoch the epoch to start * @return the status information necessary to begin recovery * @throws IOException if the node has already made a promise to another * writer with a higher epoch number, if the namespace is inconsistent, * or if a disk error occurs. */ synchronized NewEpochResponseProto newEpoch( NamespaceInfo nsInfo, long epoch) throws IOException { checkJournalStorageFormatted(); journalStorage.checkConsistentNamespace(nsInfo); // if we are storing image too, check consistency as well if (imageStorage.isFormatted()) { imageStorage.checkConsistentNamespace(nsInfo); } // Check that the new epoch being proposed is in fact newer than // any other that we've promised. if (epoch <= getLastPromisedEpoch()) { throw new IOException("Proposed epoch " + epoch + " <= last promise " + getLastPromisedEpoch()); } updateLastPromisedEpoch(epoch); abortCurSegment(); NewEpochResponseProto ret = new NewEpochResponseProto(); EditLogFile latestFile = scanStorageForLatestEdits(); if (latestFile != null) { ret.setLastSegmentTxId(latestFile.getFirstTxId()); } return ret; } private void updateLastPromisedEpoch(long newEpoch) throws IOException { LOG.info("Updating lastPromisedEpoch from " + lastPromisedEpoch.get() + " to " + newEpoch + " for client " + Server.getRemoteIp()); lastPromisedEpoch.set(newEpoch); // Since we have a new writer, reset the IPC serial - it will start // counting again from 0 for this writer. currentEpochIpcSerial = -1; } private void abortCurSegment() throws IOException { if (curSegment == null) { return; } curSegment.abort(); curSegment = null; curSegmentTxId = HdfsConstants.INVALID_TXID; currentSegmentWrittenBytes = 0L; } /** * Write a batch of edits to the journal. * {@see QJournalProtocol#journal(RequestInfo, long, long, int, byte[])} */ synchronized ShortVoid journal(RequestInfo reqInfo, long segmentTxId, long firstTxnId, int numTxns, byte[] records) throws IOException { checkJournalStorageFormatted(); checkWriteRequest(reqInfo); if (curSegment == null) { checkSync(false, "Can't write, no segment open"); } if (curSegmentTxId != segmentTxId) { // Sanity check: it is possible that the writer will fail IPCs // on both the finalize() and then the start() of the next segment. // This could cause us to continue writing to an old segment // instead of rolling to a new one, which breaks one of the // invariants in the design. If it happens, abort the segment // and throw an exception. 
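      // Illustrative scenario (hypothetical txids): the writer finalizes
      // segment [101..150] and starts segment 151, but both RPCs fail against
      // this node while succeeding on a quorum of other nodes. Its next
      // journal() call then carries segmentTxId=151 while this node still has
      // segment 101 open, so the stale segment must be aborted rather than
      // appended to.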
      JournalOutOfSyncException e = new JournalOutOfSyncException(
          "Writer out of sync: it thinks it is writing segment " + segmentTxId
          + " but current segment is " + curSegmentTxId);
      abortCurSegment();
      throw e;
    }

    if (nextTxId != firstTxnId) {
      checkSync(false, "Can't write txid " + firstTxnId
          + " expecting nextTxId=" + nextTxId);
    }

    long lastTxnId = firstTxnId + numTxns - 1;
    if (LOG.isTraceEnabled()) {
      LOG.trace("Writing txid " + firstTxnId + "-" + lastTxnId);
    }

    // If the edit has already been marked as committed, we know
    // it has been fsynced on a quorum of other nodes, and we are
    // "catching up" with the rest. Hence we do not need to fsync.
    boolean isLagging = lastTxnId <= committedTxnId.get();
    boolean shouldFsync = !isLagging;

    curSegment.writeRaw(records, 0, records.length);
    curSegment.setReadyToFlush();
    long start = System.nanoTime();
    curSegment.flush(shouldFsync);
    long time = DFSUtil.getElapsedTimeMicroSeconds(start);
    currentSegmentWrittenBytes += records.length;

    metrics.addSync(time);
    if (time / 1000 > WARN_SYNC_MILLIS_THRESHOLD) {
      LOG.warn("Sync of transaction range " + firstTxnId + "-" + lastTxnId
          + " took " + (time / 1000) + "ms");
    }

    if (isLagging) {
      // This batch of edits has already been committed on a quorum of other
      // nodes. So, we are in "catch up" mode. This gets its own metric.
      metrics.batchesWrittenWhileLagging.inc(1);
    }

    metrics.batchesWritten.inc(1);
    metrics.bytesWritten.inc(records.length);
    metrics.txnsWritten.inc(numTxns);

    highestWrittenTxId = lastTxnId;
    metrics.setLastWrittenTxId(highestWrittenTxId);
    metrics.setCurrentTxnsLag(getCurrentLagTxns());

    nextTxId = lastTxnId + 1;
    return ShortVoid.instance;
  }

  public void heartbeat(RequestInfo reqInfo) throws IOException {
    checkRequest(reqInfo);
  }

  /**
   * Returns true if the given epoch matches the lastPromisedEpoch, false
   * otherwise.
   */
  boolean checkWriterEpoch(long epoch) throws IOException {
    return epoch == lastPromisedEpoch.get();
  }

  /**
   * Ensure that the given request is coming from the correct writer and
   * in-order.
   * @param reqInfo the request info
   * @throws IOException if the request is invalid.
   */
  private synchronized void checkRequest(RequestInfo reqInfo)
      throws IOException {
    // Invariant 25 from ZAB paper
    if (reqInfo.getEpoch() < lastPromisedEpoch.get()) {
      throw new IOException("IPC's epoch " + reqInfo.getEpoch()
          + " is less than the last promised epoch "
          + lastPromisedEpoch.get());
    } else if (reqInfo.getEpoch() > lastPromisedEpoch.get()) {
      // A newer client has arrived. Fence any previous writers by updating
      // the promise.
      updateLastPromisedEpoch(reqInfo.getEpoch());
    }

    // Ensure that the IPCs are arriving in-order as expected.
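    // For example, if the highest serial seen in this epoch is 7, only a
    // request carrying serial 8 or greater passes the check below; a delayed
    // retry still carrying serial 7 (or lower) is rejected as out of sync.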
if (reqInfo.getIpcSerialNumber() <= currentEpochIpcSerial) { checkSync(false, "IPC serial %s from client %s was not higher than prior highest " + "IPC serial %s", reqInfo.getIpcSerialNumber(), Server.getRemoteIp(), currentEpochIpcSerial); } currentEpochIpcSerial = reqInfo.getIpcSerialNumber(); if (reqInfo.hasCommittedTxId()) { if (reqInfo.getCommittedTxId() < committedTxnId.get()) { throw new IllegalArgumentException( "Client trying to move committed txid backward from " + committedTxnId.get() + " to " + reqInfo.getCommittedTxId()); } // persist txid every second, as it is not needed for correctness boolean persist = (now() - lastPersistedCommittedTxId) > 1000; if (persist) { lastPersistedCommittedTxId = now(); } committedTxnId.set(reqInfo.getCommittedTxId(), persist); } } static long now() { return System.currentTimeMillis(); } private synchronized void checkWriteRequest(RequestInfo reqInfo) throws IOException { checkRequest(reqInfo); if (reqInfo.getEpoch() != lastWriterEpoch.get()) { throw new IOException("IPC's epoch " + reqInfo.getEpoch() + " is not the current writer epoch " + lastWriterEpoch.get()); } } public synchronized boolean isJournalFormatted() { return journalStorage.isFormatted(); } public synchronized boolean isImageFormatted() { return imageStorage.isFormatted(); } private void checkJournalStorageFormatted() throws JournalNotFormattedException { if (!journalStorage.isFormatted()) { String msg = "Journal edits: " + journalStorage.getSingularStorageDir() + " not formatted"; LOG.error(msg); throw new JournalNotFormattedException(msg); } } private void checkImageStorageFormatted() throws JournalNotFormattedException { if (!imageStorage.isFormatted()) { String msg = "Journal image: " + imageStorage.getSingularStorageDir() + " not formatted"; LOG.error(msg); throw new JournalNotFormattedException(msg); } } /** * Get the transaction id of the currently written segment. */ public synchronized long getCurrentSegmentTxId() { return curSegmentTxId; } /** * Get upper bound on the size of the currently written segment. */ public synchronized long getValidSizeOfCurrentSegment() { return currentSegmentWrittenBytes; } /** * @throws JournalOutOfSyncException if the given expression is not true. * The message of the exception is formatted using the 'msg' and * 'formatArgs' parameters. */ private void checkSync(boolean expression, String msg, Object... formatArgs) throws JournalOutOfSyncException { if (!expression) { throw new JournalOutOfSyncException(String.format(msg, formatArgs)); } } /** * @throws AssertionError if the given expression is not true. * The message of the exception is formatted using the 'msg' and * 'formatArgs' parameters. * * This should be used in preference to Java's built-in assert in * non-performance-critical paths, where a failure of this invariant * might cause the protocol to lose data. */ private void alwaysAssert(boolean expression, String msg, Object... formatArgs) { if (!expression) { throw new AssertionError(String.format(msg, formatArgs)); } } /** * Start a new segment at the given txid. The previous segment * must have already been finalized. */ public synchronized void startLogSegment(RequestInfo reqInfo, long txid) throws IOException { assert fjm != null; checkJournalStorageFormatted(); checkRequest(reqInfo); if (curSegment != null) { LOG.warn("Client is requesting a new log segment " + txid + " though we are already writing " + curSegment + ". 
" + "Aborting the current segment in order to begin the new one."); // The writer may have lost a connection to us and is now // re-connecting after the connection came back. // We should abort our own old segment. abortCurSegment(); } // Paranoid sanity check: we should never overwrite a finalized log file. // Additionally, if it's in-progress, it should have at most 1 transaction. // This can happen if the writer crashes exactly at the start of a segment. EditLogFile existing = fjm.getLogFile(txid); if (existing != null) { if (!existing.isInProgress()) { throw new IllegalStateException("Already have a finalized segment " + existing + " beginning at " + txid); } // If it's in-progress, it should only contain one transaction, // because the "startLogSegment" transaction is written alone at the // start of each segment. existing.validateLog(); if (existing.getLastTxId() != existing.getFirstTxId()) { throw new IllegalStateException("The log file " + existing + " seems to contain valid transactions"); } } long curLastWriterEpoch = lastWriterEpoch.get(); if (curLastWriterEpoch != reqInfo.getEpoch()) { LOG.info("Updating lastWriterEpoch from " + curLastWriterEpoch + " to " + reqInfo.getEpoch() + " for client " + Server.getRemoteIp()); lastWriterEpoch.set(reqInfo.getEpoch()); metrics.lastWriterEpoch.set(reqInfo.getEpoch()); } // The fact that we are starting a segment at this txid indicates // that any previous recovery for this same segment was aborted. // Otherwise, no writer would have started writing. So, we can // remove the record of the older segment here. purgePaxosDecision(txid); curSegment = fjm.startLogSegment(txid); curSegmentTxId = txid; nextTxId = txid; // the layout version has only been written // plus OP_INVALID currentSegmentWrittenBytes = 5L; // inform the syncer service that we might have some work to do if (journalNode != null) { journalNode.addSyncTask(this, curSegmentTxId); } } /** * Finalize the log segment at the given transaction ID. */ public synchronized void finalizeLogSegment(RequestInfo reqInfo, long startTxId, long endTxId) throws IOException { checkJournalStorageFormatted(); checkRequest(reqInfo); boolean needsValidation = true; // Finalizing the log that the writer was just writing. if (startTxId == curSegmentTxId) { if (curSegment != null) { curSegment.close(); curSegment = null; curSegmentTxId = HdfsConstants.INVALID_TXID; currentSegmentWrittenBytes = 0L; } checkSync(nextTxId == endTxId + 1, "Trying to finalize in-progress log segment %s to end at " + "txid %s but only written up to txid %s", startTxId, endTxId, nextTxId - 1); // No need to validate the edit log if the client is finalizing // the log segment that it was just writing to. 
      needsValidation = false;
    }

    FileJournalManager.EditLogFile elf = fjm.getLogFile(startTxId);
    if (elf == null) {
      throw new JournalOutOfSyncException("No log file to finalize at "
          + "transaction ID " + startTxId);
    }

    if (elf.isInProgress()) {
      if (needsValidation) {
        LOG.info("Validating log segment " + elf.getFile() + " about to be "
            + "finalized");
        elf.validateLog();

        checkSync(elf.getLastTxId() == endTxId,
            "Trying to finalize in-progress log segment %s to end at "
            + "txid %s but log %s on disk only contains up to txid %s",
            startTxId, endTxId, elf.getFile(), elf.getLastTxId());
      }
      fjm.finalizeLogSegment(startTxId, endTxId);
    } else {
      Preconditions.checkArgument(endTxId == elf.getLastTxId(),
          "Trying to re-finalize already finalized log " + elf
          + " with different endTxId " + endTxId);
    }

    // Once logs are finalized, a different length will never be decided.
    // During recovery, we treat a finalized segment the same as an accepted
    // recovery. Thus, we no longer need to keep track of the previously-
    // accepted decision. The existence of the finalized log segment is enough.
    purgePaxosDecision(elf.getFirstTxId());
  }

  /**
   * @see JournalManager#purgeLogsOlderThan(long)
   */
  public synchronized void purgeLogsOlderThan(RequestInfo reqInfo,
      long minTxIdToKeep) throws IOException {
    checkJournalStorageFormatted();
    checkRequest(reqInfo);

    journalStorage.purgeDataOlderThan(minTxIdToKeep);
    if (minTxIdToKeep == FSEditLog.PURGE_ALL_TXID) {
      // When trying to remove all the segments, reset
      // the committed transaction ID too.
      committedTxnId.set(0, true);
      minTxid = 0;
    } else {
      minTxid = minTxIdToKeep;
    }
    if (imageStorage.isFormatted()) {
      imageStorage.purgeDataOlderThan(minTxIdToKeep == 0 ? -1 : minTxIdToKeep);
    }
  }

  public synchronized long getMinTxid() {
    return minTxid;
  }

  /**
   * Remove the previously-recorded 'accepted recovery' information
   * for a given log segment, once it is no longer necessary.
   * @param segmentTxId the transaction ID to purge
   * @throws IOException if the file could not be deleted
   */
  private void purgePaxosDecision(long segmentTxId) throws IOException {
    File paxosFile = journalStorage.getPaxosFile(segmentTxId);
    if (paxosFile.exists()) {
      if (!paxosFile.delete()) {
        throw new IOException("Unable to delete paxos file " + paxosFile);
      }
    }
  }

  /**
   * Get the list of edit log files underlying this journal.
   */
  List<EditLogFile> getAllLogFiles() throws IOException {
    return fjm.getLogFiles(HdfsConstants.INVALID_TXID, false);
  }

  /**
   * @see QJournalProtocol#getEditLogManifest(String, long)
   */
  public RemoteEditLogManifest getEditLogManifest(long sinceTxId)
      throws IOException {
    // No need to checkRequest() here - anyone may ask for the list
    // of segments.
    checkJournalStorageFormatted();

    RemoteEditLogManifest manifest = fjm.getEditLogManifest(sinceTxId);
    return manifest;
  }

  /**
   * @return the current state of the given segment, or null if the
   * segment does not exist.
   */
  private SegmentStateProto getSegmentInfo(long segmentTxId)
      throws IOException {
    EditLogFile elf = fjm.getLogFile(segmentTxId);
    if (elf == null) {
      return null;
    }
    if (elf.isInProgress()) {
      elf.validateLog();
    }
    if (elf.getLastTxId() == HdfsConstants.INVALID_TXID) {
      LOG.info("Edit log file " + elf + " appears to be empty. 
" + "Moving it aside..."); elf.moveAsideEmptyFile(); return null; } SegmentStateProto ret = new SegmentStateProto(segmentTxId, elf.getLastTxId(), elf.isInProgress()); LOG.info("getSegmentInfo(" + segmentTxId + "): " + elf + " -> " + ret); return ret; } /** * @see QJournalProtocol#prepareRecovery(RequestInfo, long) */ public synchronized PrepareRecoveryResponseProto prepareRecovery( RequestInfo reqInfo, long segmentTxId) throws IOException { checkJournalStorageFormatted(); checkRequest(reqInfo); abortCurSegment(); PrepareRecoveryResponseProto ret = new PrepareRecoveryResponseProto(); PersistedRecoveryPaxosData previouslyAccepted = getPersistedPaxosData(segmentTxId); completeHalfDoneAcceptRecovery(previouslyAccepted); SegmentStateProto segInfo = getSegmentInfo(segmentTxId); boolean hasFinalizedSegment = segInfo != null && !segInfo.getIsInProgress(); if (previouslyAccepted != null && !hasFinalizedSegment) { ret.setAcceptedInEpoch(previouslyAccepted.getAcceptedInEpoch()); ret.setSegmentState(previouslyAccepted.getSegmentState()); } else { if (segInfo != null) { ret.setSegmentState(segInfo); } } ret.setLastWriterEpoch(lastWriterEpoch.get()); if (committedTxnId.get() != HdfsConstants.INVALID_TXID) { ret.setLastCommittedTxId(committedTxnId.get()); } LOG.info("Prepared recovery for segment " + segmentTxId + ": " + ret); return ret; } /** * @see QJournalProtocol#acceptRecovery(RequestInfo, SegmentStateProto, URL) */ public synchronized void acceptRecovery(RequestInfo reqInfo, SegmentStateProto segment, URL fromUrl) throws IOException { checkJournalStorageFormatted(); checkRequest(reqInfo); abortCurSegment(); long segmentTxId = segment.getStartTxId(); // Basic sanity checks that the segment is well-formed and contains // at least one transaction. Preconditions.checkArgument(segment.getEndTxId() > -1 && segment.getEndTxId() >= segmentTxId, "bad recovery state for segment %s: %s", segmentTxId, segment); PersistedRecoveryPaxosData oldData = getPersistedPaxosData(segmentTxId); PersistedRecoveryPaxosData newData = new PersistedRecoveryPaxosData(); newData.setAcceptedInEpoch(reqInfo.getEpoch()); newData.setSegmentState(segment); // If we previously acted on acceptRecovery() from a higher-numbered writer, // this call is out of sync. We should never actually trigger this, since the // checkRequest() call above should filter non-increasing epoch numbers. if (oldData != null) { alwaysAssert(oldData.getAcceptedInEpoch() <= reqInfo.getEpoch(), "Bad paxos transition, out-of-order epochs.\nOld: %s\nNew: %s\n", oldData, newData); } File syncedFile = null; SegmentStateProto currentSegment = getSegmentInfo(segmentTxId); if (currentSegment == null || currentSegment.getEndTxId() != segment.getEndTxId()) { if (currentSegment == null) { LOG.info("Synchronizing log " + segment + ": no current segment in place"); // Update the highest txid for lag metrics highestWrittenTxId = Math.max(segment.getEndTxId(), highestWrittenTxId); metrics.setLastWrittenTxId(highestWrittenTxId); } else { LOG.info("Synchronizing log " + segment + ": old segment " + currentSegment + " is not the right length"); // Paranoid sanity check: if the new log is shorter than the log we // currently have, we should not end up discarding any transactions // which are already Committed. 
if (txnRange(currentSegment).contains(committedTxnId.get()) && !txnRange(segment).contains(committedTxnId.get())) { throw new AssertionError( "Cannot replace segment " + currentSegment + " with new segment " + segment + ": would discard already-committed txn " + committedTxnId.get()); } // Another paranoid check: we should not be asked to synchronize a log // on top of a finalized segment. alwaysAssert(currentSegment.getIsInProgress(), "Should never be asked to synchronize a different log on top of an " + "already-finalized segment"); // If we're shortening the log, update our highest txid // used for lag metrics. if (txnRange(currentSegment).contains(highestWrittenTxId)) { highestWrittenTxId = segment.getEndTxId(); metrics.setLastWrittenTxId(highestWrittenTxId); } } syncedFile = syncLog(reqInfo, segment, fromUrl); } else { LOG.info("Skipping download of log " + segment + ": already have up-to-date logs"); } // This is one of the few places in the protocol where we have a single // RPC that results in two distinct actions: // // - 1) Downloads the new log segment data (above) // - 2) Records the new Paxos data about the synchronized segment (below) // // These need to be treated as a transaction from the perspective // of any external process. We do this by treating the persistPaxosData() // success as the "commit" of an atomic transaction. If we fail before // this point, the downloaded edit log will only exist at a temporary // path, and thus not change any externally visible state. If we fail // after this point, then any future prepareRecovery() call will see // the Paxos data, and by calling completeHalfDoneAcceptRecovery() will // roll forward the rename of the referenced log file. // // See also: HDFS-3955 // // The fault points here are exercised by the randomized fault injection // test case to ensure that this atomic "transaction" operates correctly. JournalFaultInjector.get().beforePersistPaxosData(); persistPaxosData(segmentTxId, newData); JournalFaultInjector.get().afterPersistPaxosData(); if (syncedFile != null) { FileUtil.replaceFile(syncedFile, journalStorage.getInProgressEditLog(segmentTxId)); } LOG.info("Accepted recovery for segment " + segmentTxId + ": " + newData); } private Range<Long> txnRange(SegmentStateProto seg) { return Ranges.closed(seg.getStartTxId(), seg.getEndTxId()); } /** * Synchronize a log segment from another JournalNode. The log is * downloaded from the provided URL into a temporary location on disk, * which is named based on the current request's epoch. * * @return the temporary location of the downloaded file */ File syncLog(RequestInfo reqInfo, final SegmentStateProto segment, final URL url) throws IOException { long startTxId = segment.getStartTxId(); long epoch = reqInfo.getEpoch(); return syncLog(epoch, segment.getStartTxId(), url, segment.toString(), journalStorage.getSyncLogTemporaryFile(startTxId, epoch)); } /** * Synchronize a log segment from another JournalNode. The log is * downloaded from the provided URL into a temporary location on disk */ File syncLog(long stamp, final long startTxId, final URL url, String name, File tmpFile) throws IOException { final File[] localPaths = new File[] { tmpFile }; // TODO add security if needed. 
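    // Note: tmpFile is keyed by both the start txid and the requester's epoch
    // (via getSyncLogTemporaryFile() in the caller), so a later
    // completeHalfDoneAcceptRecovery() can locate exactly the file that
    // belongs to the accepted recovery attempt.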
LOG.info("Synchronizing log " + name + " from " + url); boolean success = false; try { TransferFsImage.doGetUrl( url, ImageSet.convertFilesToStreams(localPaths, journalStorage, url.toString()), journalStorage, true); assert tmpFile.exists(); success = true; } finally { if (!success) { if (!tmpFile.delete()) { LOG.warn("Failed to delete temporary file " + tmpFile); } } } return tmpFile; } /** * In the case the node crashes in between downloading a log segment * and persisting the associated paxos recovery data, the log segment * will be left in its temporary location on disk. Given the paxos data, * we can check if this was indeed the case, and "roll forward" * the atomic operation. * * See the inline comments in * {@link #acceptRecovery(RequestInfo, SegmentStateProto, URL)} for more * details. * * @throws IOException if the temporary file is unable to be renamed into * place */ private void completeHalfDoneAcceptRecovery( PersistedRecoveryPaxosData paxosData) throws IOException { if (paxosData == null) { return; } long segmentId = paxosData.getSegmentState().getStartTxId(); long epoch = paxosData.getAcceptedInEpoch(); File tmp = journalStorage.getSyncLogTemporaryFile(segmentId, epoch); if (tmp.exists()) { File dst = journalStorage.getInProgressEditLog(segmentId); LOG.info("Rolling forward previously half-completed synchronization: " + tmp + " -> " + dst); FileUtil.replaceFile(tmp, dst); } } /** * Retrieve the persisted data for recovering the given segment from disk. */ private PersistedRecoveryPaxosData getPersistedPaxosData(long segmentTxId) throws IOException { File f = journalStorage.getPaxosFile(segmentTxId); if (!f.exists()) { // Default instance has no fields filled in (they're optional) return null; } InputStream in = new FileInputStream(f); try { PersistedRecoveryPaxosData ret = PersistedRecoveryPaxosData.parseDelimitedFrom(in); Preconditions.checkState(ret != null && ret.getSegmentState().getStartTxId() == segmentTxId, "Bad persisted data for segment %s: %s", segmentTxId, ret); return ret; } finally { IOUtils.closeStream(in); } } /** * Persist data for recovering the given segment from disk. */ private void persistPaxosData(long segmentTxId, PersistedRecoveryPaxosData newData) throws IOException { File f = journalStorage.getPaxosFile(segmentTxId); boolean success = false; AtomicFileOutputStream fos = new AtomicFileOutputStream(f); try { newData.writeDelimitedTo(fos); fos.write('\n'); // Write human-readable data after the protobuf. This is only // to assist in debugging -- it's not parsed at all. OutputStreamWriter writer = new OutputStreamWriter(fos, Charsets.UTF_8); writer.write(String.valueOf(newData)); writer.write('\n'); writer.flush(); fos.flush(); success = true; } finally { if (success) { IOUtils.closeStream(fos); } else { fos.abort(); } } } /** * Roll the image. 
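   * Verifies that the digest previously recorded via
   * setCheckpointImageDigest() for this txid matches the digest of the
   * uploaded data, then hands off to the image manager to rename the
   * checkpoint image into its final location.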
*/ public void saveDigestAndRenameCheckpointImage(long txid, MD5Hash digest) throws IOException { MD5Hash storedDigest = checkpointImageDigests.get(txid); if (storedDigest == null || !storedDigest.equals(digest)) { throw new IOException("Digest of data written: " + storedDigest + " does not match requested digest: " + digest + " for txid: " + txid + ", journal: " + journalId); } imageManager.saveDigestAndRenameCheckpointImage(txid, digest); checkpointImageDigests.remove(txid); } synchronized void setCheckpointImageDigest(long txid, MD5Hash imageDigest) throws IOException { if (checkpointImageDigests.containsKey(txid)) { MD5Hash existing = checkpointImageDigests.get(txid); if (!existing.equals(imageDigest)) { throw new IOException( "Trying to set checkpoint image digest for txid: " + txid + "=" + imageDigest + " existing " + existing + " for txid: " + txid + ", journal: " + journalId); } } else { checkpointImageDigests.put(txid, imageDigest); mostRecentCheckpointTxid = Math.max(mostRecentCheckpointTxid, txid); } } public synchronized void clearCheckpointImageDigests() { checkpointImageDigests.clear(); } public RemoteImageManifest getImageManifest(long fromTxid) throws IOException { checkImageStorageFormatted(); FSImageTransactionalStorageInspector inspector = new FSImageTransactionalStorageInspector(); inspector.inspectDirectory(imageStorage.getSingularStorageDir()); List<RemoteImage> results = Lists.newArrayList(); for (FSImageFile foundImage : inspector.getFoundImages()) { // skip older images if (foundImage.getCheckpointTxId() < fromTxid) { continue; } // get md5 if exists MD5Hash digest = null; try { digest = MD5FileUtils.readStoredMd5ForFile(foundImage.getFile()); } catch (IOException e) { LOG.warn("Exception when reading md5 file for image: " + foundImage, e); } // in case of failure the digest is simply "null" results.add(new RemoteImage(foundImage.getCheckpointTxId(), digest)); } // sort the images Collections.sort(results); return new RemoteImageManifest(results); } public FileJournalManager getJournalManager() { return fjm; } /** * Get storage state for the journal storage directory. */ public GetStorageStateProto analyzeJournalStorage() { return new GetStorageStateProto(journalStorage.getStorageState(), journalStorage.getStorageInfo()); } /** * Get storage state for the image storage directory. */ public GetStorageStateProto analyzeImageStorage() { return new GetStorageStateProto(imageStorage.getStorageState(), imageStorage.getStorageInfo()); } }