/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hdfs.server.namenode; import java.io.IOException; import java.io.File; import java.io.InterruptedIOException; import java.lang.Thread; import java.net.InetSocketAddress; import java.net.URI; import java.util.Date; import java.util.concurrent.CountDownLatch; import org.apache.commons.logging.Log; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.net.DNS; import org.apache.hadoop.util.InjectionHandler; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.hdfs.protocol.FSConstants; import org.apache.hadoop.hdfs.qjournal.client.QuorumJournalManager; import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol; import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory; import org.apache.hadoop.hdfs.server.namenode.CheckpointSignature; import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; import org.apache.hadoop.hdfs.server.namenode.FSImage; import org.apache.hadoop.hdfs.server.namenode.AvatarNode; import org.apache.hadoop.hdfs.server.namenode.NNStorage.StorageLocationType; import org.apache.hadoop.hdfs.server.namenode.metrics.AvatarNodeMetrics; import org.apache.hadoop.hdfs.tools.offlineImageViewer.LsImageVisitor; import org.apache.hadoop.hdfs.tools.offlineImageViewer.OfflineImageViewer; import org.apache.hadoop.hdfs.util.InjectionEvent; import org.apache.hadoop.http.HttpServer; /** * This class drives the ingest of transaciton logs from primary. * It also implements periodic checkpointing of the primary namenode. */ public class Standby implements Runnable{ // allowed states of the ingest thread enum StandbyIngestState { NOT_INGESTING, INGESTING_EDITS, CHECKPOINTING, STANDBY_QUIESCED }; public static final Log LOG = AvatarNode.LOG; private static final long CHECKPOINT_DELAY = 10000; // 10 seconds public static long CHECKPOINT_SLEEP_BEFORE_RETRY = 30000; volatile private boolean running; // standby namenode private AvatarNode avatarNode; private Configuration confg; // configuration of local standby namenode private Configuration startupConf; // original configuration of AvatarNode private FSImage fsImage; // fsImage of the current namenode. private FSNamesystem fsnamesys; // fsnamesystem of the local standby namenode // ingest for consuming transaction log private volatile Ingest ingest; // object that processes transaction logs from primary private volatile Thread ingestThread; // thread that is procesing the transaction log // primary namenode private InetSocketAddress nameNodeAddr; // remote primary namenode address private NamenodeProtocol primaryNamenode; // remote primary namenode private HttpServer infoServer; private int infoPort; // checkpointing private long checkpointPeriod; // in seconds private long checkpointTxnCount; // in txns private volatile long lastCheckpointTime; private long earlyScheduledCheckpointTime = Long.MAX_VALUE; private volatile long delayedScheduledCheckpointTime = 0L; private volatile boolean checkpointEnabled; private volatile CheckpointSignature sig; private volatile String checkpointStatus; private volatile ImageUploader imageUploader; private volatile CountDownLatch manualCheckpointLatch = new CountDownLatch(0); // for image upload private String fsName; // local namenode http name private final String machineName; // host name of name node private long sleepBetweenErrors; private volatile Thread backgroundThread; // thread for secondary namenode // journal from which we are ingesting transactions private final JournalManager remoteJournal; // lock to protect state private Object ingestStateLock = new Object(); protected volatile StandbyIngestState currentIngestState = StandbyIngestState.NOT_INGESTING; private volatile long currentSegmentTxId = -1; // this represents last transaction correctly loaded to namesystem // used for recovery, to rewind the state to last known checkpoint, but // reload transaction only from the lastCorrectlyLoadedTxId point private volatile long lastCorrectlyLoadedTxId = -1; // indicates to which txid the standby is quescing // used for quitting the ingest fast during failover volatile private long quiesceToTxid = Long.MAX_VALUE; // counts how many time standby failed to instantiate ingest private int ingestFailures = 0; private static final int MAX_INGEST_FAILURES = 10; // counts how many consecutive checkpoint failure the standby experienced private volatile int checkpointFailures = 0; static final int MAX_CHECKPOINT_FAILURES = 5; // max timeout for uploading image - 2 hours private static final long MAX_CHECKPOINT_UPLOAD_TIMEOUT = 2 * 60 * 60 * 1000; // maximum number of retires when instantiating ingest stream private final int inputStreamRetries; // image validation private final File tmpImageFileForValidation; private Object imageValidatorLock = new Object(); private ImageValidator imageValidator; // metrics private final AvatarNodeMetrics metrics; // The Standby can either be processing transaction logs // from the primary namenode or it could be doing a checkpoint to upload a merged // fsimage to the primary. // The startupConf is the original configuration that was used to start the // AvatarNode. It is used by the secondary namenode to talk to the primary. // The "conf" is the configuration of the local standby namenode. // Standby(AvatarNode avatarNode, Configuration startupConf, Configuration conf, InetSocketAddress nameNodeAddr, NamenodeProtocol primaryNamenode) throws IOException { this.running = true; this.avatarNode = avatarNode; this.metrics = avatarNode.getAvatarNodeMetrics(); this.confg = conf; this.startupConf = startupConf; this.fsImage = avatarNode.getFSImage(); this.fsnamesys = avatarNode.getNamesystem(); this.sleepBetweenErrors = startupConf.getInt("hdfs.avatarnode.sleep", 5000); this.nameNodeAddr = nameNodeAddr; this.primaryNamenode = primaryNamenode; initSecondary(startupConf); // start webserver for secondary namenode this.machineName = DNS.getDefaultIP(conf.get(FSConstants.DFS_NAMENODE_DNS_INTERFACE,"default")); LOG.info("machineName=" + machineName); InetSocketAddress addr = NameNode.getClientProtocolAddress(conf); this.tmpImageFileForValidation = new File("/tmp", "hadoop_image." + addr.getAddress().getHostAddress() + ":" + addr.getPort()); URI remoteJournalURI = avatarNode.getRemoteSharedEditsURI(conf); if (remoteJournalURI.getScheme().equals(NNStorage.LOCAL_URI_SCHEME)) { StorageDirectory remoteJournalStorage = fsImage.storage.new StorageDirectory( new File(remoteJournalURI.getPath())); remoteJournal = new FileJournalManager(remoteJournalStorage, null, null); } else if (remoteJournalURI.getScheme().equals( QuorumJournalManager.QJM_URI_SCHEME)) { // TODO for now we pass null for NameNodeMetrics // once we have shared log we will pass the actual metrics remoteJournal = new QuorumJournalManager(conf, remoteJournalURI, new NamespaceInfo(fsImage.storage), null, false); } else { remoteJournal = FSEditLog.createJournal(conf, remoteJournalURI, new NamespaceInfo(fsImage.storage), null); } // we will start ingestion from the txid of the image this.currentSegmentTxId = avatarNode.getFSImage().storage .getMostRecentCheckpointTxId() + 1; this.inputStreamRetries = confg.getInt("dfs.ingest.retries", 30); checkpointStatus("No checkpoint initiated"); } NamenodeProtocol getPrimaryNameNode(){ return primaryNamenode; } private void stopIngest() throws IOException { if (ingest != null) { ingest.stop(); try { ingestThread.join(); } catch (InterruptedException ie) { throw new IOException(ie); } ingestThread = null; ingest = null; } } public void run() { backgroundThread = Thread.currentThread(); while (running) { try { InjectionHandler.processEventIO(InjectionEvent.STANDBY_BEGIN_RUN); // if the checkpoint periodicity or the checkpoint size has // exceeded the configured parameters, then also we have to checkpoint // long now = AvatarNode.now(); checkAndRecoverState(); if (shouldScheduleCheckpoint(now)) { // schedule an early checkpoint if this current one fails. earlyScheduledCheckpointTime = now + CHECKPOINT_DELAY; doCheckpoint(); earlyScheduledCheckpointTime = Long.MAX_VALUE; lastCheckpointTime = now; InjectionHandler .processEvent(InjectionEvent.STANDBY_AFTER_DO_CHECKPOINT); } // if the checkpoint creation has switched off ingesting, then we restart the // ingestion here. if (currentIngestState == StandbyIngestState.NOT_INGESTING) { // the ingest might have finished but we need to ensure // we joined the thread. We do not quiesce the ingest here since that // incorrectly informs the ingest to move onto the next segment. stopIngest(); instantiateIngest(); } try { synchronized(backgroundThread) { backgroundThread.wait(sleepBetweenErrors); } } catch (InterruptedException e) { return; } } catch (SaveNamespaceCancelledException e) { return; } catch (FinalizeCheckpointException e) { LOG.info("Could not finalize checkpoint - will not kill ingest", e); // standby is not running any more if(!running) return; // if exception happened during finalization, ingestion completed // successfully and now we are consuming next segment // do not kill the ingest continue; } catch (IOException e) { LOG.warn("Standby: encounter exception " + StringUtils.stringifyException(e)); if(!running) // standby is quiescing return; try { synchronized(backgroundThread) { backgroundThread.wait(sleepBetweenErrors); } } catch (InterruptedException e1) { // give a change to exit this thread, if necessary } // since we had an error, we have to cleanup the ingest thread if (ingest != null) { ingest.stop(); try { ingestThread.join(); LOG.info("Standby: error cleanup Ingest thread exited."); } catch (InterruptedException em) { String msg = "Standby: error cleanup Ingest thread did not exit. " + em; LOG.info(msg); throw new RuntimeException(msg); } } } catch (Throwable e) { LOG.warn("Standby: encounter exception ", e); running = false; } } } private long getTransactionLag() { long lastCheckpoint = avatarNode.getFSImage().storage .getMostRecentCheckpointTxId(); long currentTransaction = avatarNode.getFSImage().getLastAppliedTxId(); return currentTransaction - lastCheckpoint; } protected JournalManager getRemoteJournal() { return remoteJournal; } synchronized void shutdown() { if (infoServer != null) { try { LOG.info("Shutting down secondary info server"); infoServer.stop(); infoServer = null; } catch (Exception ex) { LOG.error("Error shutting down infoServer", ex); } } if (remoteJournal != null) { try { LOG.info("Shutting down remote journal manager"); remoteJournal.close(); } catch (IOException ex) { LOG.error("Error shutting down infoServer", ex); } } } /** * Instantiate/quiesces edit log segments until the requested transaction * id has been reached. When -1 is given, it quiesces all available log segments. * Notice that there is a retry logic in ingest instantiation */ private long quiesceIngest(long lastTxId) throws IOException { return quiesceIngest(lastTxId, true); } private long quiesceIngest(long lastTxId, boolean recoverUnfinalizedSegments) throws IOException { final boolean ignoreLastTxid = lastTxId == FSEditLogLoader.TXID_IGNORE; long startSegmentId = currentSegmentTxId; LOG.info("Standby: Quiescing ingest - Consuming transactions up to: " + lastTxId); if (ignoreLastTxid && !(remoteJournal instanceof FileJournalManager) && recoverUnfinalizedSegments) { // Since we have not yet recovered unfinalized segments, we don't // quiesce the ingest thread and instead just join on it. stopIngest(); // we need to make sure to recover unclosed streams LOG.info("Standby: Recovering unclosed streams since the journal is non-file"); remoteJournal.recoverUnfinalizedSegments(); } else { // quiesce current ingest thread quiesceIngestWithReprocess(); } // instantiate ingest for needed segments and quiesce while (avatarNode.getLastWrittenTxId() < lastTxId || ignoreLastTxid) { LOG.info("Standby: Quiescing ingest up to: " + lastTxId + ", setting up ingest for txid: " + currentSegmentTxId); assertState(StandbyIngestState.NOT_INGESTING); try { instantiateIngest(); } catch (IOException e) { if (ignoreLastTxid) { LOG.warn("Cannot obtain the stream - exiting since the requested txid is " + lastTxId); break; } } quiesceIngestWithReprocess(); if (ingest.getIngestStatus() && currentSegmentTxId == startSegmentId) { // no progress, reached the end break; } startSegmentId = currentSegmentTxId; } // TODO successful roll can add 2 transactions // we need to take care of this if (lastTxId != FSEditLogLoader.TXID_IGNORE && lastTxId > avatarNode.getLastWrittenTxId()) { String msg = "Standby: Quiescing - Standby could not successfully ingest the edits up to: " + lastTxId + ", last consumed txid: " + avatarNode.getLastWrittenTxId(); LOG.fatal(msg); throw new StandbyStateException(msg); } LOG.info("Standby: Quiescing ingest - Consumed transactions up to: " + avatarNode.getLastWrittenTxId()); return currentSegmentTxId; } /** * When ingest consumes the end of segment transaction, it sets the state to * not ingesting. This function ensures that the ingest thread exited. * * @throws IOException */ private void quiesceIngestWithReprocess() throws IOException { if (ingest != null) { LOG.info("Standby: Quiescing - quiescing ongoing ingest"); quiesceIngest(); reprocessCurrentSegmentIfNeeded(ingest.getIngestStatus()); } } /** * Quiesces the currently running ingest */ private void quiesceIngest() throws IOException { InjectionHandler.processEvent(InjectionEvent.STANDBY_QUIESCE_INGEST); synchronized (ingestStateLock) { assertState(StandbyIngestState.INGESTING_EDITS, StandbyIngestState.NOT_INGESTING); ingest.quiesce(); } try { ingestThread.join(); currentIngestState = StandbyIngestState.NOT_INGESTING; LOG.info("Standby: Quiesce - Ingest thread for segment: " + ingest.toString() + " exited."); } catch (InterruptedException e) { LOG.info("Standby: Quiesce - Ingest thread interrupted."); throw new IOException(e.getMessage()); } } /** * Check the correct ingest state for instantiating new ingest * (synchronized on ingestStateLock) * @return true if the ingest for the requested txid is running * @throws IOException when the state is incorrect */ private boolean checkIngestState() throws IOException { if (currentIngestState == StandbyIngestState.INGESTING_EDITS) { String msg = ""; if (ingest == null) { msg = "Standby: Ingest instantiation failed, the state is " + currentIngestState + ", but the ingest is null"; } if (currentSegmentTxId != ingest.getStartTxId()) { msg = "Standby: Ingest instantiation failed, trying to instantiate ingest for txid: " + currentSegmentTxId + ", but there is ingest for txid: " + ingest.getStartTxId(); } if (currentIngestState == StandbyIngestState.NOT_INGESTING && ingest.isRunning()) { msg = "Standby: Ingest instantiation failed, the state is " + currentIngestState + " but the ingest is running"; } if (!msg.isEmpty()) { LOG.warn(msg); throw new IOException(msg); } return true; } return false; } private void checkAndRecoverState() throws Exception { if (ingestFailures > MAX_INGEST_FAILURES || InjectionHandler .falseCondition(InjectionEvent.STANDBY_RECOVER_STATE)) { LOG.info("Standby: Recovery - Ingest instantiation failed too many times"); long lastCorrectCheckpointTxId = avatarNode.getFSImage().storage .getMostRecentCheckpointTxId(); LOG.info("Standby: Recovery - Revert standby state to last checkpointed txid: " + lastCorrectCheckpointTxId + ", last correctly loaded txid: " + getLastCorrectTxId()); LOG.info("Standby: Recovery - cleaning up ingest thread"); clearIngestState(lastCorrectCheckpointTxId + 1); stopIngest(); currentIngestState = StandbyIngestState.NOT_INGESTING; LOG.info("Standby: Recovery - Purging all local logs"); FSEditLog editLog = avatarNode.getFSImage().getEditLog(); // ingest is down, no one should be writing to edit log if (editLog.isOpen()) { // end current log segment avatarNode.getFSImage().getEditLog().endCurrentLogSegment(true); } // remove all local logs editLog.purgeLogsOlderThan(FSEditLog.PURGE_ALL_TXID); // reset edit log state to last checkpointed txid editLog.resetTxIds(lastCorrectCheckpointTxId); setIngestFailures(0); LOG.info("Standby: Recovery - Completed"); } } private void setIngestFailures(int failures) { ingestFailures = failures; if (metrics != null) { metrics.numIngestFailures.set(failures); } } private void setCheckpointFailures(int failures) { checkpointFailures = failures; if (metrics != null) { metrics.numCheckpointFailures.set(failures); } } /** * Instantiates ingest thread for the current edits segment. */ private void instantiateIngest() throws IOException { InjectionHandler.processEvent(InjectionEvent.STANDBY_INSTANTIATE_INGEST); try { synchronized (ingestStateLock) { if (checkIngestState()) { LOG.info("Standby: Ingest for txid: " + currentSegmentTxId + " is already running"); return; } assertState(StandbyIngestState.NOT_INGESTING); ingest = new Ingest(this, fsnamesys, confg, currentSegmentTxId); ingestThread = new Thread(ingest); ingestThread.setName("Ingest_for_" + currentSegmentTxId); ingestThread.start(); currentIngestState = StandbyIngestState.INGESTING_EDITS; } LOG.info("Standby: Instatiated ingest for txid: " + currentSegmentTxId); } catch (IOException e) { setIngestFailures(ingestFailures + 1); currentIngestState = StandbyIngestState.NOT_INGESTING; throw e; } } /** * Processes previously consumed edits segment if needed * * @param status of the previous ingest */ private void reprocessCurrentSegmentIfNeeded(boolean status) throws IOException { if (status) { return; } assertState(StandbyIngestState.NOT_INGESTING); LOG.info("Standby: Quiesce - reprocessing edits segment starting at: " + currentSegmentTxId); instantiateIngest(); quiesceIngest(); // verify that the entire transaction log was truly consumed // when re-processing, if we fail here, we cannot do anything // better than fail if (!ingest.getIngestStatus()) { String emsg = "Standby: Quiesce could not successfully ingest " + "transaction log starting at " + currentSegmentTxId; LOG.warn(emsg); setIngestFailures(ingestFailures + 1); throw new IOException(emsg); } } /** * Stop the standby, read all edit log segments. * @param lastTxId - the last txid to be consumed * @throws IOException */ synchronized void quiesce(long lastTxId) throws IOException { quiesce(lastTxId, true); } synchronized void quiesce(long lastTxId, boolean recoverUnfinalizedSegments) throws IOException { if (currentIngestState == StandbyIngestState.STANDBY_QUIESCED) { LOG.info("Standby: Quiescing - already quiesced"); return; // nothing to do } // have to wait for the main thread to exit here // first stop the main thread before stopping the ingest thread LOG.info("Standby: Quiescing up to txid: " + lastTxId); running = false; // in case there run() retries to instantiate ingest for next // non-existent segment, we indicate that the standby can not // expect to find anything with txid higher than lastTxId quiesceToTxid = lastTxId; InjectionHandler.processEvent(InjectionEvent.STANDBY_QUIESCE_INITIATED); // cancel saving namespace, and image validation fsnamesys.cancelSaveNamespace("Standby: Quiescing - Cancel save namespace"); interruptImageValidation(); InjectionHandler.processEvent(InjectionEvent.STANDBY_QUIESCE_INTERRUPT); try { synchronized(backgroundThread) { backgroundThread.notifyAll(); } if (backgroundThread != null) { backgroundThread.join(); backgroundThread = null; } } catch (InterruptedException e) { LOG.info("Standby: quiesce interrupted."); throw new IOException(e.getMessage()); } finally { // We don't want to cancel further save namespaces done manually. fsnamesys.clearCancelSaveNamespace(); } try { if (infoServer != null) { infoServer.stop(); infoServer= null; } } catch (Exception e) { LOG.error(StringUtils.stringifyException(e)); } // process all log segments quiesceIngest(lastTxId, recoverUnfinalizedSegments); clearIngestState(currentSegmentTxId); // mark quiesce as completed LOG.info("Standby: Quiescing - Consumed transactions up to: " + getLastCorrectTxId() + ", requested: " + lastTxId); currentIngestState = StandbyIngestState.STANDBY_QUIESCED; } protected EditLogInputStream setupIngestStreamWithRetries(long txid) throws IOException { LOG.info("Standby: setup edit stream for txid: " + txid); for (int i = 0; i < inputStreamRetries; i++) { try { return setupCurrentEditStream(txid); } catch (IOException e) { // either the number of retires is too high // or the standby is quiescing to a lower transaction id if (i == inputStreamRetries - 1) { throwIOException("Cannot obtain edit stream for txid: " + txid, e); } if (txid > quiesceToTxid) { throwIOException("Standby: Quiesce in progress to txid: " + quiesceToTxid + ", aborting creating edit stream for: " + txid, e); } LOG.info("Error :", e); } try { Thread.sleep(1000); } catch (InterruptedException e) { throw new IOException("Standby: - received interruption"); } LOG.info("Standby: - retrying to get edit input stream for txid: " + txid + ", tried: " + (i + 1) + " times"); } return null; } private EditLogInputStream setupCurrentEditStream(long txid) throws IOException { synchronized (ingestStateLock) { EditLogInputStream currentEditLogInputStream = JournalSet.getInputStream( remoteJournal, txid); InjectionHandler.processEventIO(InjectionEvent.STANDBY_JOURNAL_GETSTREAM); currentSegmentTxId = txid; return currentEditLogInputStream; } } /** * Set current checkpoint status */ private void checkpointStatus(String st) { checkpointStatus = new Date(System.currentTimeMillis()).toString() + ": " + st; } /** * Get current checkpoint status. * Used for webui. */ protected String getCheckpointStatus() { return checkpointStatus; } /** * Upload the image to the primary namenode */ private class ImageUploader extends Thread { long txid; private volatile Exception error = null; private volatile boolean succeeded = false; private volatile boolean done = false; private ImageUploader(long txid) throws IOException { this.txid = txid; } public void run() { try { InjectionHandler.processEvent(InjectionEvent.STANDBY_UPLOAD_CREATE); putFSImage(txid); succeeded = true; } catch (Exception e) { LOG.info("Standby: Checkpointing - Image upload exception: ", e); error = e; } finally { done = true; } } } /** * Should checkpoint be triggered now. * Checkpoint is scheduled when: * - no checkpoint have been performed so far * - the checkpointPeriod time has passed since last checkpoint * - there is an early scheduled checkpoint (previous has failed) * - the number of uncheckpointed transactions is higher than configured * - manual checkpoint has been triggered * * @param now current time * @return */ private boolean shouldScheduleCheckpoint(long now) { fsnamesys.writeLock(); try { if (lastCheckpointTime == 0 || (lastCheckpointTime + 1000 * checkpointPeriod < now) || (earlyScheduledCheckpointTime < now) || getTransactionLag() > checkpointTxnCount || manualCheckpointLatch.getCount() == 2 || InjectionHandler .falseCondition(InjectionEvent.STANDBY_CHECKPOINT_TRIGGER)) { // perform another check to see if the checkpoint has not been delayed // through configuration if (delayedScheduledCheckpointTime > now) { LOG.info("Standby: Checkpointing is delayed - will do checkpoint in: " + ((delayedScheduledCheckpointTime - now) / 1000)); InjectionHandler .processEvent(InjectionEvent.STANDBY_DELAY_CHECKPOINT); return false; } countDownManualLatch(2); return true; } return false; } finally { fsnamesys.writeUnlock(); } } private void countDownManualLatch(int countDownWhenEqual) { if (manualCheckpointLatch.getCount() == countDownWhenEqual) { // 2 - checkpoint is triggered manually, we decrease to 1 // when checking the checkpoint trigger condition // 1 - checkpoint was triggered manually, we decrease to 0 // when checkpoint is done manualCheckpointLatch.countDown(); } } /** * Trigger checkpoint. If there is an ongoing scheduled checkpoint, this call * will trigger a checkpoint immediately after. The method blocks until the * checkpoint is done. */ void triggerCheckpoint(boolean uncompressed) throws IOException { String pref = "Standby: Checkpoint - "; LOG.info(pref + "triggering checkpoint manually"); // check error conditions if (uncompressed) { throwIOException(pref + " uncompressed option not supported", null); } if (manualCheckpointLatch.getCount() > 0) { throwIOException(pref + "Another manual checkpoint is in progress", null); } // set the manual checkpoint latch manualCheckpointLatch = new CountDownLatch(2); lastCheckpointTime = delayedScheduledCheckpointTime = 0; try { manualCheckpointLatch.await(); } catch (InterruptedException e) { throwIOException(pref + "interrupted when performing manual checkpoint", e); } // check if checkpoint succeeded if (checkpointFailures > 0) { throwIOException(pref + "manual checkpoint failed", null); } LOG.info(pref + "manual checkpoint done"); } /** * writes the in memory image of the local namenode to the fsimage * and tnen uploads this image to the primary namenode. The transaction * log on the primary is purged too. */ // DO NOT CHANGE THIS TO PUBLIC private void doCheckpoint() throws IOException { long start = AvatarNode.now(); try { InjectionHandler.processEvent(InjectionEvent.STANDBY_ENTER_CHECKPOINT, this.sig); // Tell the remote namenode to start logging transactions in a new edit file // Retuns a token that would be used to upload the merged image. if (!checkpointEnabled) { checkpointStatus("Disabled"); // This means the Standby is not meant to checkpoint the primary LOG.info("Standby: Checkpointing is disabled - return"); return; } CheckpointSignature sig = null; InjectionHandler.processEvent(InjectionEvent.STANDBY_BEFORE_ROLL_EDIT); try { LOG.info("Standby: Checkpointing - Roll edits logs of primary namenode " + nameNodeAddr); checkpointStatus("Edit log rolled on primary"); synchronized(ingestStateLock) { sig = (CheckpointSignature) primaryNamenode.rollEditLog(); } } catch (IOException ex) { // In this case we can return since we did not kill the Ingest thread yet // Nothing prevents us from doing the next checkpoint attempt checkpointStatus("Checkpoint failed"); LOG.warn("Standby: Checkpointing - roll Edits on the primary node failed."); InjectionHandler.processEvent( InjectionEvent.STANDBY_EXIT_CHECKPOINT_FAILED_ROLL, ex); return; } setLastRollSignature(sig); // consume all finalized log segments up to the required txid long checkpointTxId = sig.curSegmentTxId - 1; LOG.info("Standby: Checkpointing - checkpoint txid: " + checkpointTxId); checkpointStatus("Quiescing ingest"); quiesceIngest(checkpointTxId); if (currentSegmentTxId != checkpointTxId + 1) { throw new IOException( "Standby: Checkpointing txid mismatch - ingest consumed txns up to " + (currentSegmentTxId - 1) + " but should have up to " + checkpointTxId); } // if everything is fine, the edit log should be closed now LOG.info("Standby: Checkpointing - finished quitting ingest thread just before ckpt."); assertState(StandbyIngestState.NOT_INGESTING); /** * From now on Ingest thread needs to know if the checkpoint was started and never finished. * This would mean that it doesn't have to read the edits, since they were already processed * to the end as a part of a checkpoint. state = StandbyIngestState.CHECKPOINTING */ fsnamesys.writeLock(); try { InjectionHandler.processEvent(InjectionEvent.STANDBY_BEFORE_SAVE_NAMESPACE); currentIngestState = StandbyIngestState.CHECKPOINTING; // save a checkpoint of the current namespace of the local Namenode // We should ideally use fsnamesystem.saveNamespace but that works // only if namenode is not in safemode. LOG.info("Standby: Checkpointing - save fsimage on local namenode."); checkpointStatus("Saving namespace started"); // by default checkpoints are compressed if configured // manual command can override this setting for one checkpoint fsnamesys.getFSImage().saveNamespace(false); // get the new signature sig.mostRecentCheckpointTxId = fsImage.getEditLog().getLastWrittenTxId(); sig.imageDigest = fsImage.storage.getCheckpointImageDigest(sig.mostRecentCheckpointTxId); } catch (SaveNamespaceCancelledException e) { InjectionHandler.processEvent(InjectionEvent.STANDBY_CANCELLED_EXCEPTION_THROWN); LOG.info("Standby: Checkpointing - cancelled saving namespace"); fsnamesys.getFSImage().deleteCheckpoint(checkpointTxId); throw e; } catch (IOException ex) { // Standby failed to save fsimage locally. Need to reinitialize String msg = "Standby: Checkpointing - failed to checkpoint itself, so " + "no image can be uploaded to the primary. The only course of action " + "is to start from the very beginning by reinitializing AvatarNode"; LOG.error(msg, ex); fsnamesys.getFSImage().deleteCheckpoint(checkpointTxId); throw new RuntimeException(msg, ex); } finally { currentIngestState = StandbyIngestState.NOT_INGESTING; fsnamesys.writeUnlock(); } // we can start the ingest again for next segment instantiateIngest(); try { finalizeCheckpoint(sig); } catch (IOException ex) { LOG.error("Standby: Checkpointing - rolling the fsimage " + "on the Primary node failed.", ex); throw new FinalizeCheckpointException(ex.toString()); } setCheckpointFailures(0); LOG.info("Standby: Checkpointing - checkpoint completed in " + ((AvatarNode.now() - start) / 1000) + " s."); } catch (IOException e) { LOG.error("Standby: Checkpointing - failed to complete the checkpoint: " + StringUtils.stringifyException(e)); checkpointStatus("Checkpoint failed"); if (!checkpointEnabled) { // we received instruction to prepare for failover return; } handleCheckpointFailure(); InjectionHandler.processEvent(InjectionEvent.STANDBY_EXIT_CHECKPOINT_EXCEPTION, e); throw e; } finally { countDownManualLatch(1); setLastRollSignature(null); InjectionHandler.processEvent(InjectionEvent.STANDBY_EXIT_CHECKPOINT, this.sig); } } /** * If checkpoint fails continuously, we want to abort the standby. We want to * avoid the situation in which the standby continuously rolls edit log on the * primary without finalizing checkpoint. */ private void handleCheckpointFailure() { setCheckpointFailures(checkpointFailures + 1); if (checkpointFailures > MAX_CHECKPOINT_FAILURES) { LOG.fatal("Standby: Checkpointing - standby failed to checkpoint in " + checkpointFailures + " attempts. Aborting"); } else { // We want to give some time for some transition error to recover // and DNS caching to expire. This is mainly for small clusters // where checkpoint can be very fast. Doesn't hurt if we sleep // on large clusters too. // LOG.info("Sleeping " + CHECKPOINT_SLEEP_BEFORE_RETRY + " msecs before retry checkpoints..."); try { Thread.sleep(CHECKPOINT_SLEEP_BEFORE_RETRY); return; } catch (InterruptedException ie) { LOG.warn("Standby: Checkpointing - Thread interrupted" + " while sleeping before a retry.", ie); } } FSEditLog.runtime.exit(-1); } /** * Creates image upload thread. */ private void uploadImage(long txid) throws IOException { final long start = AvatarNode.now(); LOG.info("Standby: Checkpointing - Upload fsimage to remote namenode."); checkpointStatus("Image upload started"); imageUploader = new ImageUploader(txid); imageUploader.start(); // wait for the upload to complete while (running && !imageUploader.done && AvatarNode.now() - start < MAX_CHECKPOINT_UPLOAD_TIMEOUT) { try { imageUploader.join(3000); } catch (InterruptedException ie) { LOG.error("Reveived interruption when uploading image for txid: " + txid); Thread.currentThread().interrupt(); throw (IOException) new InterruptedIOException().initCause(ie); } } if (!running || !imageUploader.succeeded) { InjectionHandler.processEvent(InjectionEvent.STANDBY_UPLOAD_FAIL); throw new IOException( "Standby: Checkpointing - Image upload failed (time= " + (AvatarNode.now() - start) + " ms).", imageUploader.error); } imageUploader = null; LOG.info("Standby: Checkpointing - Upload fsimage to remote namenode DONE."); checkpointStatus("Image upload completed"); } /** * Copy the new fsimage into the NameNode */ private void putFSImage(long txid) throws IOException { TransferFsImage.uploadImageFromStorage(fsName, machineName, infoPort, fsImage.storage, txid); } private void finalizeCheckpoint(CheckpointSignature sig) throws IOException{ try { File imageFile = fsImage.storage.getFsImageName( StorageLocationType.LOCAL, sig.mostRecentCheckpointTxId); InjectionHandler.processEvent(InjectionEvent.STANDBY_BEFORE_PUT_IMAGE, imageFile); // start a thread to validate image while uploading the image to primary createImageValidation(imageFile); // copy image to primary namenode uploadImage(sig.mostRecentCheckpointTxId); // check if the image is valid checkImageValidation(); // make transaction to primary namenode to switch edit logs LOG.info("Standby: Checkpointing - Roll fsimage on primary namenode."); InjectionHandler.processEventIO(InjectionEvent.STANDBY_BEFORE_ROLL_IMAGE); assertState( StandbyIngestState.NOT_INGESTING, StandbyIngestState.INGESTING_EDITS); primaryNamenode.rollFsImage(new CheckpointSignature(fsImage)); setLastRollSignature(null); LOG.info("Standby: Checkpointing - Checkpoint done. New Image Size: " + fsImage.getFsImageName(StorageLocationType.LOCAL).length()); checkpointStatus("Completed"); } finally { interruptImageValidation(); } } ////////////////////////// IMAGE VALIDATION ////////////////////////// /** * Load the image to validate that it is not corrupted */ private class ImageValidator extends Thread { private OfflineImageViewer viewer; volatile private Throwable error = null; volatile private boolean succeeded = false; private ImageValidator(File imageFile) throws IOException { LOG.info("Validating image file " + imageFile); tmpImageFileForValidation.delete(); LsImageVisitor v = new LsImageVisitor(tmpImageFileForValidation.toString()); viewer = new OfflineImageViewer(imageFile.toString(), v, true); } public void run() { try { viewer.go(); succeeded = true; } catch (Throwable e) { LOG.info("Standby: Image validation exception: ", e); error = e; } } } /** * Checks the status of image validation during checkpoint. * @throws IOException */ private void checkImageValidation() throws IOException { try { imageValidator.join(); } catch (InterruptedException ie) { throw (IOException) new InterruptedIOException().initCause(ie); } if (!imageValidator.succeeded) { throw new IOException("Image file validation failed", imageValidator.error); } } /** * Creates image validation thread. * @param imageFile * @throws IOException on error, or when standby quiesce was invoked */ private void createImageValidation(File imageFile) throws IOException { synchronized (imageValidatorLock) { InjectionHandler.processEvent(InjectionEvent.STANDBY_VALIDATE_CREATE); if (!running) { // fails the checkpoint InjectionHandler.processEvent(InjectionEvent.STANDBY_VALIDATE_CREATE_FAIL); throw new IOException("Standby: standby is quiescing"); } imageValidator = new ImageValidator(imageFile); imageValidator.start(); } } /** * Interrupts and joins ongoing image validation. * @throws IOException */ private void interruptImageValidation() throws IOException { synchronized (imageValidatorLock) { if (imageValidator != null) { imageValidator.interrupt(); try { imageValidator.join(); } catch (InterruptedException e) { throw new InterruptedIOException("Standby: received interruption"); } } } } //////////////////////////IMAGE VALIDATION END ////////////////////////// /** * Initialize the webserver so that the primary namenode can fetch * transaction logs from standby via http. */ void initSecondary(Configuration conf) throws IOException { fsName = AvatarNode.getRemoteNamenodeHttpName(conf, avatarNode.getInstanceId()); // Initialize other scheduling parameters from the configuration checkpointEnabled = conf.getBoolean("fs.checkpoint.enabled", false); checkpointPeriod = conf.getLong("fs.checkpoint.period", 3600); checkpointTxnCount = NNStorageConfiguration.getCheckpointTxnCount(conf); delayedScheduledCheckpointTime = conf.getBoolean("fs.checkpoint.delayed", false) ? AvatarNode.now() + checkpointPeriod * 1000 : 0; // initialize the webserver for uploading files. String infoAddr = NetUtils.getServerAddress(conf, "dfs.secondary.info.bindAddress", "dfs.secondary.info.port", "dfs.secondary.http.address"); InetSocketAddress infoSocAddr = NetUtils.createSocketAddr(infoAddr); String infoBindIpAddress = infoSocAddr.getAddress().getHostAddress(); int tmpInfoPort = infoSocAddr.getPort(); infoServer = new HttpServer("secondary", infoBindIpAddress, tmpInfoPort, tmpInfoPort == 0, conf); infoServer.setAttribute("name.system.image", fsImage); this.infoServer.setAttribute("name.conf", conf); infoServer.addInternalServlet("getimage", "/getimage", GetImageServlet.class); infoServer.start(); avatarNode.httpServer.setAttribute("avatar.node", avatarNode); avatarNode.httpServer.addInternalServlet("outstandingnodes", "/outstandingnodes", OutStandingDatanodesServlet.class); // The web-server port can be ephemeral... ensure we have the correct info infoPort = infoServer.getPort(); conf.set("dfs.secondary.http.address", infoBindIpAddress + ":" +infoPort); LOG.info("Secondary Web-server up at: " + infoBindIpAddress + ":" +infoPort); LOG.warn("Checkpoint Period :" + checkpointPeriod + " secs " + "(" + checkpointPeriod/60 + " min)"); if (delayedScheduledCheckpointTime > 0) { LOG.warn("Standby: Checkpointing will be delayed by: " + checkpointPeriod + " seconds"); } LOG.warn("Log Size Trigger :" + checkpointTxnCount + " transactions."); } public void setLastRollSignature(CheckpointSignature sig) { this.sig = sig; } public CheckpointSignature getLastRollSignature() { return this.sig; } public long getLastCheckpointTime() { return lastCheckpointTime; } public boolean fellBehind() { try { switch (currentIngestState) { case INGESTING_EDITS: return ingest.isCatchingUp(); case NOT_INGESTING: case CHECKPOINTING: return true; case STANDBY_QUIESCED: return false; default: LOG.error("Standby: unknown ingest state: " + currentIngestState); return true; } } catch (Exception e) { // since this is not synchronized on ingest state lock // in the case of any exceptions (e.g., NPE) we return true return true; } } /** * Tells how far behind the standby is with consuming edits * (only in progress segments). */ public long getLagBytes() { return ingest == null ? -1 : this.ingest.getLagBytes(); } /** * Sets state for standby upon successful ingestion. * @param txid next starting txid */ protected void clearIngestState(long txid) { synchronized (ingestStateLock) { currentSegmentTxId = txid; currentIngestState = StandbyIngestState.NOT_INGESTING; } } /** * Sets state for standby upon unsuccessful ingestion. * Ingest will start from the same transaction id. */ protected void clearIngestState() { synchronized (ingestStateLock) { currentIngestState = StandbyIngestState.NOT_INGESTING; } } /** * Assert that the standby is in the expected state * * @param expectedStates expected states to be in */ private void assertState(StandbyIngestState... expectedStates) throws IOException { for (StandbyIngestState s : expectedStates) { if (currentIngestState == s) return; } throw new IOException("Standby: illegal state - current: " + currentIngestState); } /** * Ingest will keep track of the last correctly loaded transaction * for recovery. Only updated by ingest, does not need to be synchronized. * @param txid the id of the last correctly loaded transaction */ protected void setLastCorrectTxId(long txid) { if (txid > lastCorrectlyLoadedTxId) lastCorrectlyLoadedTxId = txid; } /** * Return the last known transaction that has been * successfully loaded. */ protected long getLastCorrectTxId() { return lastCorrectlyLoadedTxId; } /** * Get the number of failed checkpoints. */ int getNumCheckpointFailures() { return checkpointFailures; } private void throwIOException(String msg, Exception e) throws IOException { LOG.error(msg, e); throw new IOException(msg); } /** * No more checkpoints will be performed. */ void disableCheckpoint() { checkpointEnabled = false; } }