/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.namenode;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.LayoutVersion;
import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException;
import org.apache.hadoop.hdfs.server.common.Storage;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
import org.apache.hadoop.hdfs.server.common.Storage.StorageState;
import org.apache.hadoop.hdfs.server.common.Util;
import static org.apache.hadoop.hdfs.server.common.Util.now;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
import org.apache.hadoop.hdfs.server.namenode.FSImageStorageInspector.LoadPlan;
import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType;
import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
import org.apache.hadoop.hdfs.server.protocol.CheckpointCommand;
import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand;
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
import org.apache.hadoop.hdfs.util.MD5FileUtils;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.hdfs.DFSConfigKeys;

import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;

/**
 * FSImage handles checkpointing and logging of the namespace edits.
 */
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class FSImage implements Closeable {
  protected static final Log LOG = LogFactory.getLog(FSImage.class.getName());

  protected FSNamesystem namesystem = null;
  protected FSEditLog editLog = null;
  private boolean isUpgradeFinalized = false;

  protected NNStorage storage;

  /**
   * The last transaction ID that was either loaded from an image
   * or loaded by loading edits files.
   */
  protected long lastAppliedTxId = 0;
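  // lastAppliedTxId advances in two places in this class: loadFSImage(File,
  // MD5Hash) sets it to the txid recorded in the loaded image, and
  // loadEdits() bumps it once per replayed transaction. Checkpointing code
  // reads it through getLastAppliedTxId().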
  /**
   * URIs for importing an image from a checkpoint. In the default case,
   * URIs will represent directories.
   */
  private Collection<URI> checkpointDirs;
  private Collection<URI> checkpointEditsDirs;

  final private Configuration conf;

  private final NNStorageRetentionManager archivalManager;

  /**
   * Construct an FSImage.
   * @param conf Configuration
   * @see #FSImage(Configuration conf, FSNamesystem ns,
   *               Collection imageDirs, Collection editsDirs)
   * @throws IOException if default directories are invalid.
   */
  public FSImage(Configuration conf) throws IOException {
    this(conf, (FSNamesystem)null);
  }

  /**
   * Construct an FSImage.
   * @param conf Configuration
   * @param ns The FSNamesystem using this image.
   * @see #FSImage(Configuration conf, FSNamesystem ns,
   *               Collection imageDirs, Collection editsDirs)
   * @throws IOException if default directories are invalid.
   */
  private FSImage(Configuration conf, FSNamesystem ns) throws IOException {
    this(conf, ns,
         FSNamesystem.getNamespaceDirs(conf),
         FSNamesystem.getNamespaceEditsDirs(conf));
  }

  /**
   * Construct the FSImage. Set the default checkpoint directories.
   *
   * Set up storage and initialize the edit log.
   *
   * @param conf Configuration
   * @param ns The FSNamesystem using this image.
   * @param imageDirs Directories the image can be stored in.
   * @param editsDirs Directories the editlog can be stored in.
   * @throws IOException if directories are invalid.
   */
  protected FSImage(Configuration conf, FSNamesystem ns,
                    Collection<URI> imageDirs, Collection<URI> editsDirs)
      throws IOException {
    this.conf = conf;
    setCheckpointDirectories(FSImage.getCheckpointDirs(conf, null),
                             FSImage.getCheckpointEditsDirs(conf, null));

    storage = new NNStorage(conf, imageDirs, editsDirs);
    if (conf.getBoolean(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_RESTORE_KEY,
                        DFSConfigKeys.DFS_NAMENODE_NAME_DIR_RESTORE_DEFAULT)) {
      storage.setRestoreFailedStorage(true);
    }

    this.editLog = new FSEditLog(storage);
    setFSNamesystem(ns);

    archivalManager = new NNStorageRetentionManager(conf, storage, editLog);
  }

  protected FSNamesystem getFSNamesystem() {
    return namesystem;
  }

  void setFSNamesystem(FSNamesystem ns) {
    namesystem = ns;
    if (ns != null) {
      storage.setUpgradeManager(ns.upgradeManager);
    }
  }

  void setCheckpointDirectories(Collection<URI> dirs,
                                Collection<URI> editsDirs) {
    checkpointDirs = dirs;
    checkpointEditsDirs = editsDirs;
  }

  void format(String clusterId) throws IOException {
    storage.format(clusterId);
    saveFSImageInAllDirs(0);
  }
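  // A rough sketch of how a caller drives the methods adjacent to this
  // comment during startup (FSNamesystem does approximately this):
  //
  //   FSImage image = new FSImage(conf);        // or image.format(clusterId)
  //   if (image.recoverTransitionRead(startOpt)) {
  //     image.saveNamespace();   // loaded checkpoint was stale; re-save now
  //   }
  //   image.openEditLog();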
  /**
   * Analyze storage directories.
   * Recover from previous transitions if required.
   * Perform fs state transition if necessary depending on the namespace info.
   * Read storage info.
   *
   * @throws IOException
   * @return true if the image needs to be saved or false otherwise
   */
  boolean recoverTransitionRead(StartupOption startOpt) throws IOException {
    assert startOpt != StartupOption.FORMAT :
      "NameNode formatting should be performed before reading the image";

    Collection<URI> imageDirs = storage.getImageDirectories();
    Collection<URI> editsDirs = storage.getEditsDirectories();

    // none of the data dirs exist
    if ((imageDirs.size() == 0 || editsDirs.size() == 0)
        && startOpt != StartupOption.IMPORT)
      throw new IOException(
          "All specified directories are inaccessible or do not exist.");

    if (startOpt == StartupOption.IMPORT
        && (checkpointDirs == null || checkpointDirs.isEmpty()))
      throw new IOException("Cannot import image from a checkpoint. "
          + "\"dfs.namenode.checkpoint.dir\" is not set.");

    if (startOpt == StartupOption.IMPORT
        && (checkpointEditsDirs == null || checkpointEditsDirs.isEmpty()))
      throw new IOException("Cannot import image from a checkpoint. "
          + "\"dfs.namenode.checkpoint.edits.dir\" is not set.");

    // 1. For each data directory calculate its state and
    // check whether all is consistent before transitioning.
    Map<StorageDirectory, StorageState> dataDirStates =
        new HashMap<StorageDirectory, StorageState>();
    boolean isFormatted = recoverStorageDirs(startOpt, dataDirStates);

    if (LOG.isTraceEnabled()) {
      LOG.trace("Data dir states:\n  " +
          Joiner.on("\n  ").withKeyValueSeparator(": ")
          .join(dataDirStates));
    }

    if (!isFormatted && startOpt != StartupOption.ROLLBACK
                     && startOpt != StartupOption.IMPORT) {
      throw new IOException("NameNode is not formatted.");
    }

    int layoutVersion = storage.getLayoutVersion();
    if (layoutVersion < Storage.LAST_PRE_UPGRADE_LAYOUT_VERSION) {
      NNStorage.checkVersionUpgradable(storage.getLayoutVersion());
    }
    if (startOpt != StartupOption.UPGRADE
        && layoutVersion < Storage.LAST_PRE_UPGRADE_LAYOUT_VERSION
        && layoutVersion != HdfsConstants.LAYOUT_VERSION) {
      throw new IOException(
          "\nFile system image contains an old layout version "
          + storage.getLayoutVersion() + ".\nAn upgrade to version "
          + HdfsConstants.LAYOUT_VERSION + " is required.\n"
          + "Please restart NameNode with -upgrade option.");
    }

    storage.processStartupOptionsForUpgrade(startOpt, layoutVersion);

    // check whether distributed upgrade is required and/or should be continued
    storage.verifyDistributedUpgradeProgress(startOpt);

    // 2. Format unformatted dirs.
    for (Iterator<StorageDirectory> it = storage.dirIterator(); it.hasNext();) {
      StorageDirectory sd = it.next();
      StorageState curState = dataDirStates.get(sd);
      switch (curState) {
      case NON_EXISTENT:
        throw new IOException(StorageState.NON_EXISTENT +
                              " state cannot be here");
      case NOT_FORMATTED:
        LOG.info("Storage directory " + sd.getRoot() + " is not formatted.");
        LOG.info("Formatting ...");
        sd.clearDirectory(); // create empty current dir
        break;
      default:
        break;
      }
    }

    // 3. Do transitions
    switch (startOpt) {
    case UPGRADE:
      doUpgrade();
      return false; // upgrade saved image already
    case IMPORT:
      doImportCheckpoint();
      return false; // import checkpoint saved image already
    case ROLLBACK:
      doRollback();
      break;
    case REGULAR:
      // just load the image
    }

    return loadFSImage();
  }

  /**
   * For each storage directory, performs recovery of incomplete transitions
   * (eg. upgrade, rollback, checkpoint) and inserts the directory's storage
   * state into the dataDirStates map.
   * @param dataDirStates output of storage directory states
   * @return true if there is at least one valid formatted storage directory
   */
  private boolean recoverStorageDirs(StartupOption startOpt,
      Map<StorageDirectory, StorageState> dataDirStates) throws IOException {
    boolean isFormatted = false;
    for (Iterator<StorageDirectory> it = storage.dirIterator();
         it.hasNext();) {
      StorageDirectory sd = it.next();
      StorageState curState;
      try {
        curState = sd.analyzeStorage(startOpt, storage);
        // sd is locked but not opened
        switch (curState) {
        case NON_EXISTENT:
          // name-node fails if any of the configured storage dirs are missing
          throw new InconsistentFSStateException(sd.getRoot(),
              "storage directory does not exist or is not accessible.");
        case NOT_FORMATTED:
          break;
        case NORMAL:
          break;
        default:  // recovery is possible
          sd.doRecover(curState);
        }
        if (curState != StorageState.NOT_FORMATTED
            && startOpt != StartupOption.ROLLBACK) {
          // read and verify consistency with other directories
          storage.readProperties(sd);
          isFormatted = true;
        }
        if (startOpt == StartupOption.IMPORT && isFormatted)
          // import of a checkpoint is allowed only into empty image directories
          throw new IOException("Cannot import image from a checkpoint. "
              + " NameNode already contains an image in " + sd.getRoot());
      } catch (IOException ioe) {
        sd.unlock();
        throw ioe;
      }
      dataDirStates.put(sd, curState);
    }
    return isFormatted;
  }

  private void doUpgrade() throws IOException {
    if (storage.getDistributedUpgradeState()) {
      // only a distributed upgrade needs to continue
      // don't do version upgrade
      this.loadFSImage();
      storage.initializeDistributedUpgrade();
      return;
    }
    // Upgrade is allowed only if there are
    // no previous fs states in any of the directories
    for (Iterator<StorageDirectory> it = storage.dirIterator(); it.hasNext();) {
      StorageDirectory sd = it.next();
      if (sd.getPreviousDir().exists())
        throw new InconsistentFSStateException(sd.getRoot(),
            "previous fs state should not exist during upgrade. "
            + "Finalize or rollback first.");
    }

    // load the latest image
    this.loadFSImage();

    // Do upgrade for each directory
    long oldCTime = storage.getCTime();
    storage.cTime = now();  // generate new cTime for the state
    int oldLV = storage.getLayoutVersion();
    storage.layoutVersion = HdfsConstants.LAYOUT_VERSION;

    List<StorageDirectory> errorSDs =
        Collections.synchronizedList(new ArrayList<StorageDirectory>());
    for (Iterator<StorageDirectory> it = storage.dirIterator(); it.hasNext();) {
      StorageDirectory sd = it.next();
      LOG.info("Starting upgrade of image directory " + sd.getRoot()
               + ".\n   old LV = " + oldLV
               + "; old CTime = " + oldCTime
               + ".\n   new LV = " + storage.getLayoutVersion()
               + "; new CTime = " + storage.getCTime());
      try {
        File curDir = sd.getCurrentDir();
        File prevDir = sd.getPreviousDir();
        File tmpDir = sd.getPreviousTmp();
        assert curDir.exists() : "Current directory must exist.";
        assert !prevDir.exists() : "previous directory must not exist.";
        assert !tmpDir.exists() : "previous.tmp directory must not exist.";
        assert !editLog.isOpen() : "Edits log must not be open.";

        // rename current to tmp
        NNStorage.rename(curDir, tmpDir);

        if (!curDir.mkdir()) {
          throw new IOException("Cannot create directory " + curDir);
        }
      } catch (Exception e) {
        LOG.error("Failed to move aside pre-upgrade storage "
            + "in image directory " + sd.getRoot(), e);
        errorSDs.add(sd);
        continue;
      }
    }
    storage.reportErrorsOnDirectories(errorSDs);
    errorSDs.clear();

    saveFSImageInAllDirs(editLog.getLastWrittenTxId());

    for (Iterator<StorageDirectory> it = storage.dirIterator(); it.hasNext();) {
      StorageDirectory sd = it.next();
      try {
        // Write the version file, since saveFsImage above only makes the
        // fsimage_<txid>, and the directory is otherwise empty.
        storage.writeProperties(sd);

        File prevDir = sd.getPreviousDir();
        File tmpDir = sd.getPreviousTmp();
        // rename tmp to previous
        NNStorage.rename(tmpDir, prevDir);
      } catch (IOException ioe) {
        LOG.error("Unable to rename temp to previous for " + sd.getRoot(), ioe);
        errorSDs.add(sd);
        continue;
      }
      LOG.info("Upgrade of " + sd.getRoot() + " is complete.");
    }
    storage.reportErrorsOnDirectories(errorSDs);

    isUpgradeFinalized = false;
    if (!storage.getRemovedStorageDirs().isEmpty()) {
      // during upgrade, it's a fatal error to fail any storage directory
      throw new IOException("Upgrade failed in "
          + storage.getRemovedStorageDirs().size()
          + " storage directory(ies), previously logged.");
    }
    storage.initializeDistributedUpgrade();
  }
  private void doRollback() throws IOException {
    // Rollback is allowed only if there is
    // a previous fs state in at least one of the storage directories.
    // Directories that don't have previous state do not rollback
    boolean canRollback = false;
    FSImage prevState = new FSImage(conf, getFSNamesystem());
    prevState.getStorage().layoutVersion = HdfsConstants.LAYOUT_VERSION;
    for (Iterator<StorageDirectory> it = storage.dirIterator(); it.hasNext();) {
      StorageDirectory sd = it.next();
      File prevDir = sd.getPreviousDir();
      if (!prevDir.exists()) {  // use current directory then
        LOG.info("Storage directory " + sd.getRoot()
                 + " does not contain previous fs state.");
        // read and verify consistency with other directories
        storage.readProperties(sd);
        continue;
      }

      // read and verify consistency of the prev dir
      prevState.getStorage().readPreviousVersionProperties(sd);

      if (prevState.getLayoutVersion() != HdfsConstants.LAYOUT_VERSION) {
        throw new IOException(
            "Cannot rollback to storage version "
            + prevState.getLayoutVersion()
            + " using this version of the NameNode, which uses storage version "
            + HdfsConstants.LAYOUT_VERSION + ". "
            + "Please use the previous version of HDFS to perform the rollback.");
      }
      canRollback = true;
    }
    if (!canRollback)
      throw new IOException("Cannot rollback. None of the storage "
                            + "directories contain previous fs state.");

    // Now that we know all directories are going to be consistent
    // Do rollback for each directory containing previous state
    for (Iterator<StorageDirectory> it = storage.dirIterator(); it.hasNext();) {
      StorageDirectory sd = it.next();
      File prevDir = sd.getPreviousDir();
      if (!prevDir.exists())
        continue;

      LOG.info("Rolling back storage directory " + sd.getRoot()
               + ".\n   new LV = " + prevState.getStorage().getLayoutVersion()
               + "; new CTime = " + prevState.getStorage().getCTime());
      File tmpDir = sd.getRemovedTmp();
      assert !tmpDir.exists() : "removed.tmp directory must not exist.";
      // rename current to tmp
      File curDir = sd.getCurrentDir();
      assert curDir.exists() : "Current directory must exist.";
      NNStorage.rename(curDir, tmpDir);
      // rename previous to current
      NNStorage.rename(prevDir, curDir);

      // delete tmp dir
      NNStorage.deleteDir(tmpDir);
      LOG.info("Rollback of " + sd.getRoot() + " is complete.");
    }
    isUpgradeFinalized = true;
    // check whether name-node can start in regular mode
    storage.verifyDistributedUpgradeProgress(StartupOption.REGULAR);
  }

  private void doFinalize(StorageDirectory sd) throws IOException {
    File prevDir = sd.getPreviousDir();
    if (!prevDir.exists()) { // already discarded
      LOG.info("Directory " + prevDir + " does not exist.");
      LOG.info("Finalize upgrade for " + sd.getRoot() + " is not required.");
      return;
    }
    LOG.info("Finalizing upgrade for storage directory "
             + sd.getRoot() + "."
             + (storage.getLayoutVersion() == 0 ? "" :
                   "\n   cur LV = " + storage.getLayoutVersion()
                   + "; cur CTime = " + storage.getCTime()));
    assert sd.getCurrentDir().exists() : "Current directory must exist.";
    final File tmpDir = sd.getFinalizedTmp();
    // rename previous to tmp and remove
    NNStorage.rename(prevDir, tmpDir);
    NNStorage.deleteDir(tmpDir);
    isUpgradeFinalized = true;
    LOG.info("Finalize upgrade for " + sd.getRoot() + " is complete.");
  }
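  // Directory moves performed by doUpgrade(), doRollback() and doFinalize()
  // above, for one storage directory:
  //
  //   doUpgrade():   current/ -> previous.tmp/ ; fresh current/ written ;
  //                  previous.tmp/ -> previous/
  //   doRollback():  current/ -> removed.tmp/ ; previous/ -> current/ ;
  //                  removed.tmp/ deleted
  //   doFinalize():  previous/ -> finalized.tmp/ ; finalized.tmp/ deleted
  //
  // A crash mid-transition leaves one of the *.tmp directories behind, which
  // StorageDirectory.analyzeStorage()/doRecover() repairs on the next start.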
  /**
   * Load image from a checkpoint directory and save it into the current one.
   * @throws IOException
   */
  void doImportCheckpoint() throws IOException {
    FSNamesystem fsNamesys = getFSNamesystem();
    FSImage ckptImage = new FSImage(conf, fsNamesys,
                                    checkpointDirs, checkpointEditsDirs);
    // replace real image with the checkpoint image
    FSImage realImage = fsNamesys.getFSImage();
    assert realImage == this;
    fsNamesys.dir.fsImage = ckptImage;
    // load from the checkpoint dirs
    try {
      ckptImage.recoverTransitionRead(StartupOption.REGULAR);
    } finally {
      ckptImage.close();
    }
    // restore the real image
    realImage.getStorage().setStorageInfo(ckptImage.getStorage());
    realImage.getEditLog().setNextTxId(
        ckptImage.getEditLog().getLastWrittenTxId() + 1);

    fsNamesys.dir.fsImage = realImage;
    realImage.getStorage().setBlockPoolID(ckptImage.getBlockPoolID());
    // and save it but keep the same checkpointTime
    saveNamespace();
    getStorage().writeAll();
  }

  void finalizeUpgrade() throws IOException {
    for (Iterator<StorageDirectory> it = storage.dirIterator(); it.hasNext();) {
      StorageDirectory sd = it.next();
      doFinalize(sd);
    }
  }

  boolean isUpgradeFinalized() {
    return isUpgradeFinalized;
  }

  public FSEditLog getEditLog() {
    return editLog;
  }

  void openEditLog() throws IOException {
    assert editLog != null : "editLog must be initialized";
    Preconditions.checkState(!editLog.isOpen(),
        "edit log should not yet be open");
    editLog.open();
    storage.writeTransactionIdFileToStorage(editLog.getCurSegmentTxId());
  }

  /**
   * Toss the current image and namesystem, reloading from the specified
   * file.
   */
  void reloadFromImageFile(File file) throws IOException {
    namesystem.dir.reset();

    LOG.debug("Reloading namespace from " + file);
    loadFSImage(file);
  }
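  // Worked example for loadFSImage() below (numbers are hypothetical): if
  // the storage inspector selects fsimage_37 and the load plan supplies
  // edits covering txids 38..60, then numLoaded == 23 and, after loading,
  //
  //   lastAppliedTxId == 60
  //   editLog.setNextTxId(37 + 23 + 1), i.e. the next new log segment
  //   starts at txid 61.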
  /**
   * Choose latest image from one of the directories,
   * load it and merge with the edits from that directory.
   *
   * Saving and loading fsimage should never trigger symlink resolution.
   * The paths that are persisted do not have *intermediate* symlinks
   * because intermediate symlinks are resolved at the time files,
   * directories, and symlinks are created. All paths accessed while
   * loading or saving fsimage should therefore only see symlinks as
   * the final path component, and the functions called below do not
   * resolve symlinks that are the final path component.
   *
   * @return whether the image should be saved
   * @throws IOException
   */
  boolean loadFSImage() throws IOException {
    FSImageStorageInspector inspector = storage.readAndInspectDirs();

    isUpgradeFinalized = inspector.isUpgradeFinalized();

    boolean needToSave = inspector.needToSave();

    // Plan our load. This will throw if it's impossible to load from the
    // data that's available.
    LoadPlan loadPlan = inspector.createLoadPlan();
    LOG.debug("Planning to load image using following plan:\n" + loadPlan);

    // Recover from previous interrupted checkpoint, if any
    needToSave |= loadPlan.doRecovery();

    //
    // Load in bits
    //
    StorageDirectory sdForProperties =
        loadPlan.getStorageDirectoryForProperties();
    storage.readProperties(sdForProperties);
    File imageFile = loadPlan.getImageFile();

    try {
      if (LayoutVersion.supports(Feature.TXID_BASED_LAYOUT,
                                 getLayoutVersion())) {
        // For txid-based layout, we should have a .md5 file
        // next to the image file
        loadFSImage(imageFile);
      } else if (LayoutVersion.supports(Feature.FSIMAGE_CHECKSUM,
                                        getLayoutVersion())) {
        // In 0.22, we have the checksum stored in the VERSION file.
        String md5 = storage.getDeprecatedProperty(
            NNStorage.DEPRECATED_MESSAGE_DIGEST_PROPERTY);
        if (md5 == null) {
          throw new InconsistentFSStateException(sdForProperties.getRoot(),
              "Message digest property "
              + NNStorage.DEPRECATED_MESSAGE_DIGEST_PROPERTY
              + " not set for storage directory "
              + sdForProperties.getRoot());
        }
        loadFSImage(imageFile, new MD5Hash(md5));
      } else {
        // We don't have any record of the md5sum
        loadFSImage(imageFile, null);
      }
    } catch (IOException ioe) {
      throw new IOException("Failed to load image from "
          + loadPlan.getImageFile(), ioe);
    }

    long numLoaded = loadEdits(loadPlan.getEditsFiles());
    needToSave |= needsResaveBasedOnStaleCheckpoint(imageFile, numLoaded);

    // update the txid for the edit log
    editLog.setNextTxId(storage.getMostRecentCheckpointTxId() + numLoaded + 1);
    return needToSave;
  }

  /**
   * @param imageFile the image file that was loaded
   * @param numEditsLoaded the number of edits loaded from edits logs
   * @return true if the NameNode should automatically save the namespace
   * when it is started, due to the latest checkpoint being too old.
   */
  private boolean needsResaveBasedOnStaleCheckpoint(
      File imageFile, long numEditsLoaded) {
    final long checkpointPeriod = conf.getLong(
        DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY,
        DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT);
    final long checkpointTxnCount = conf.getLong(
        DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
        DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT);
    long checkpointAge = System.currentTimeMillis() - imageFile.lastModified();

    return (checkpointAge > checkpointPeriod * 1000)
        || (numEditsLoaded > checkpointTxnCount);
  }
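  // Worked example for the staleness test above, with illustrative (not
  // default) settings: if dfs.namenode.checkpoint.period = 3600 seconds and
  // dfs.namenode.checkpoint.txns = 40000, then an image file older than one
  // hour, or more than 40000 replayed edits, forces a namespace save during
  // startup.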
  /**
   * Load the specified list of edit files into the image.
   * @return the number of transactions loaded
   */
  protected long loadEdits(List<File> editLogs) throws IOException {
    LOG.debug("About to load edits:\n  " + Joiner.on("\n  ").join(editLogs));

    long startingTxId = getLastAppliedTxId() + 1;

    FSEditLogLoader loader = new FSEditLogLoader(namesystem);
    int numLoaded = 0;
    // Load latest edits
    for (File edits : editLogs) {
      LOG.debug("Reading " + edits + " expecting start txid #" + startingTxId);
      EditLogFileInputStream editIn = new EditLogFileInputStream(edits);
      int thisNumLoaded = loader.loadFSEdits(editIn, startingTxId);
      startingTxId += thisNumLoaded;
      numLoaded += thisNumLoaded;
      lastAppliedTxId += thisNumLoaded;
      editIn.close();
    }

    // update the counts
    getFSNamesystem().dir.updateCountForINodeWithQuota();
    return numLoaded;
  }

  /**
   * Load the image namespace from the given image file, verifying
   * it against the MD5 sum stored in its associated .md5 file.
   */
  private void loadFSImage(File imageFile) throws IOException {
    MD5Hash expectedMD5 = MD5FileUtils.readStoredMd5ForFile(imageFile);
    if (expectedMD5 == null) {
      throw new IOException("No MD5 file found corresponding to image file "
          + imageFile);
    }
    loadFSImage(imageFile, expectedMD5);
  }

  /**
   * Load in the filesystem image from file. It's a big list of
   * filenames and blocks.
   */
  private void loadFSImage(File curFile, MD5Hash expectedMd5)
      throws IOException {
    FSImageFormat.Loader loader = new FSImageFormat.Loader(
        conf, getFSNamesystem());
    loader.load(curFile);
    namesystem.setBlockPoolId(this.getBlockPoolID());

    // Check that the image digest we loaded matches up with what
    // we expected
    MD5Hash readImageMd5 = loader.getLoadedImageMd5();
    if (expectedMd5 != null &&
        !expectedMd5.equals(readImageMd5)) {
      throw new IOException("Image file " + curFile
          + " is corrupt with MD5 checksum of " + readImageMd5
          + " but expecting " + expectedMd5);
    }

    long txId = loader.getLoadedImageTxId();
    LOG.info("Loaded image for txid " + txId + " from " + curFile);
    lastAppliedTxId = txId;
    storage.setMostRecentCheckpointInfo(txId, curFile.lastModified());
  }

  /**
   * Save the contents of the FS image to the file.
   */
  void saveFSImage(StorageDirectory sd, long txid) throws IOException {
    File newFile = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE_NEW, txid);
    File dstFile = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE, txid);

    FSImageFormat.Saver saver = new FSImageFormat.Saver();
    FSImageCompression compression = FSImageCompression.createCompression(conf);
    saver.save(newFile, txid, getFSNamesystem(), compression);

    MD5FileUtils.saveMD5File(dstFile, saver.getSavedDigest());
    storage.setMostRecentCheckpointInfo(txid, Util.now());
  }

  /**
   * FSImageSaver is being run in a separate thread when saving
   * FSImage. There is one thread per each copy of the image.
   *
   * FSImageSaver assumes that it was launched from a thread that holds
   * FSNamesystem lock and waits for the execution of FSImageSaver thread
   * to finish.
   * This way we are guaranteed that the namespace is not being updated
   * while multiple instances of FSImageSaver are traversing it
   * and writing it out.
   */
  private class FSImageSaver implements Runnable {
    private StorageDirectory sd;
    private List<StorageDirectory> errorSDs;
    private final long txid;

    FSImageSaver(StorageDirectory sd, List<StorageDirectory> errorSDs,
        long txid) {
      this.sd = sd;
      this.errorSDs = errorSDs;
      this.txid = txid;
    }

    public void run() {
      try {
        saveFSImage(sd, txid);
      } catch (Throwable t) {
        LOG.error("Unable to save image for " + sd.getRoot(), t);
        errorSDs.add(sd);
      }
    }

    public String toString() {
      return "FSImageSaver for " + sd.getRoot()
             + " of type " + sd.getStorageDirType();
    }
  }

  private void waitForThreads(List<Thread> threads) {
    for (Thread thread : threads) {
      while (thread.isAlive()) {
        try {
          thread.join();
        } catch (InterruptedException iex) {
          LOG.error("Caught exception while waiting for thread " +
                    thread.getName() + " to finish. Retrying join");
        }
      }
    }
  }
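  // Two-phase save protocol: saveFSImage() above writes the new image under
  // its temporary NameNodeFile.IMAGE_NEW name, and renameCheckpoint() below
  // then renames it to the final NameNodeFile.IMAGE name for that txid, so a
  // partially written image never carries the name of a valid checkpoint.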
  /**
   * Save the contents of the FS image to a new image file in each of the
   * current storage directories.
   */
  void saveNamespace() throws IOException {
    assert editLog != null : "editLog must be initialized";
    storage.attemptRestoreRemovedStorage();

    boolean editLogWasOpen = editLog.isOpen();

    if (editLogWasOpen) {
      editLog.endCurrentLogSegment(true);
    }
    long imageTxId = editLog.getLastWrittenTxId();
    try {
      saveFSImageInAllDirs(imageTxId);
      storage.writeAll();
    } finally {
      if (editLogWasOpen) {
        editLog.startLogSegment(imageTxId + 1, true);
        // Take this opportunity to note the current transaction
        storage.writeTransactionIdFileToStorage(imageTxId + 1);
      }
    }
  }

  protected void saveFSImageInAllDirs(long txid) throws IOException {
    if (storage.getNumStorageDirs(NameNodeDirType.IMAGE) == 0) {
      throw new IOException("No image directories available!");
    }

    List<StorageDirectory> errorSDs =
        Collections.synchronizedList(new ArrayList<StorageDirectory>());

    List<Thread> saveThreads = new ArrayList<Thread>();
    // save images into current
    for (Iterator<StorageDirectory> it
           = storage.dirIterator(NameNodeDirType.IMAGE); it.hasNext();) {
      StorageDirectory sd = it.next();
      FSImageSaver saver = new FSImageSaver(sd, errorSDs, txid);
      Thread saveThread = new Thread(saver, saver.toString());
      saveThreads.add(saveThread);
      saveThread.start();
    }
    waitForThreads(saveThreads);
    saveThreads.clear();
    storage.reportErrorsOnDirectories(errorSDs);

    if (storage.getNumStorageDirs(NameNodeDirType.IMAGE) == 0) {
      throw new IOException(
          "Failed to save in any storage directories while saving namespace.");
    }

    renameCheckpoint(txid);

    // Since we now have a new checkpoint, we can clean up some
    // old edit logs and checkpoints.
    purgeOldStorage();
  }

  /**
   * Purge any files in the storage directories that are no longer
   * necessary.
   */
  public void purgeOldStorage() {
    try {
      archivalManager.purgeOldStorage();
    } catch (Exception e) {
      LOG.warn("Unable to purge old storage", e);
    }
  }

  /**
   * Rename the newly written checkpoint image for the given txid to its
   * final name in all image directories.
   */
  private void renameCheckpoint(long txid) throws IOException {
    ArrayList<StorageDirectory> al = null;

    for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.IMAGE)) {
      try {
        renameCheckpointInDir(sd, txid);
      } catch (IOException ioe) {
        LOG.warn("Unable to rename checkpoint in " + sd, ioe);
        if (al == null) {
          al = Lists.newArrayList();
        }
        al.add(sd);
      }
    }
    if (al != null) storage.reportErrorsOnDirectories(al);
  }

  private void renameCheckpointInDir(StorageDirectory sd, long txid)
      throws IOException {
    File ckpt = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE_NEW, txid);
    File curFile = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE, txid);
    // renameTo fails on Windows if the destination file
    // already exists.
    if (LOG.isDebugEnabled()) {
      LOG.debug("renaming " + ckpt.getAbsolutePath()
                + " to " + curFile.getAbsolutePath());
    }
    if (!ckpt.renameTo(curFile)) {
      if (!curFile.delete() || !ckpt.renameTo(curFile)) {
        throw new IOException("renaming " + ckpt.getAbsolutePath() + " to "
            + curFile.getAbsolutePath() + " FAILED");
      }
    }
  }

  CheckpointSignature rollEditLog() throws IOException {
    getEditLog().rollEditLog();
    // Record this log segment ID in all of the storage directories, so
    // we won't miss this log segment on a restart if the edits directories
    // go missing.
    storage.writeTransactionIdFileToStorage(getEditLog().getCurSegmentTxId());
    return new CheckpointSignature(this);
  }
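  // A rough sketch of one checkpointing round against this class, as a
  // backup node or secondary name-node would drive it (the transfer steps
  // are summarized in prose, not actual method names):
  //
  //   CheckpointCommand cmd =
  //       (CheckpointCommand) nnImage.startCheckpoint(bnReg, nnReg);
  //   // ... download the latest image and finalized edits, merge locally,
  //   // ... upload the merged image; the NameNode side then runs
  //   // ... saveDigestAndRenameCheckpointImage(newTxid, digest)
  //   nnImage.endCheckpoint(cmd.getSignature());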
  /**
   * Start checkpoint.
   * <p>
   * If backup storage contains image that is newer than or incompatible with
   * what the active name-node has, then the backup node should shutdown.<br>
   * If the backup image is older than the active one then it should
   * be discarded and downloaded from the active node.<br>
   * If the images are the same then the backup image will be used as current.
   *
   * @param bnReg the backup node registration.
   * @param nnReg this (active) name-node registration.
   * @return {@link NamenodeCommand} if backup node should shutdown or
   *         {@link CheckpointCommand} prescribing what backup node should
   *         do with its image.
   * @throws IOException
   */
  NamenodeCommand startCheckpoint(NamenodeRegistration bnReg, // backup node
                                  NamenodeRegistration nnReg) // active name-node
      throws IOException {
    String msg = null;
    // Verify that checkpoint is allowed
    if (bnReg.getNamespaceID() != storage.getNamespaceID())
      msg = "Name node " + bnReg.getAddress()
            + " has incompatible namespace id: " + bnReg.getNamespaceID()
            + " expected: " + storage.getNamespaceID();
    else if (bnReg.isRole(NamenodeRole.NAMENODE))
      msg = "Name node " + bnReg.getAddress()
            + " role " + bnReg.getRole() + ": checkpoint is not allowed.";
    else if (bnReg.getLayoutVersion() < storage.getLayoutVersion()
        || (bnReg.getLayoutVersion() == storage.getLayoutVersion()
            && bnReg.getCTime() > storage.getCTime()))
      // remote node has newer image age
      msg = "Name node " + bnReg.getAddress()
            + " has newer image layout version: LV = "
            + bnReg.getLayoutVersion()
            + " cTime = " + bnReg.getCTime()
            + ". Current version: LV = " + storage.getLayoutVersion()
            + " cTime = " + storage.getCTime();
    if (msg != null) {
      LOG.error(msg);
      return new NamenodeCommand(NamenodeProtocol.ACT_SHUTDOWN);
    }
    boolean needToReturnImg = true;
    if (storage.getNumStorageDirs(NameNodeDirType.IMAGE) == 0)
      // do not return image if there are no image directories
      needToReturnImg = false;
    CheckpointSignature sig = rollEditLog();
    return new CheckpointCommand(sig, needToReturnImg);
  }

  /**
   * End checkpoint.
   * <p>
   * Validate the current storage info with the given signature.
   *
   * @param sig to validate the current storage info against
   * @throws IOException if the checkpoint fields are inconsistent
   */
  void endCheckpoint(CheckpointSignature sig) throws IOException {
    sig.validateStorageInfo(this);
  }

  /**
   * This is called by the 2NN after having downloaded an image, and by
   * the NN after having received a new image from the 2NN. It
   * renames the image from fsimage_N.ckpt to fsimage_N and also
   * saves the related .md5 file into place.
   */
  synchronized void saveDigestAndRenameCheckpointImage(
      long txid, MD5Hash digest) throws IOException {
    renameCheckpoint(txid);
    List<StorageDirectory> badSds = Lists.newArrayList();

    for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.IMAGE)) {
      File imageFile = NNStorage.getImageFile(sd, txid);
      try {
        MD5FileUtils.saveMD5File(imageFile, digest);
      } catch (IOException ioe) {
        badSds.add(sd);
      }
    }
    storage.reportErrorsOnDirectories(badSds);

    // So long as this is the newest image available,
    // advertise it as such to other checkpointers
    // from now on
    if (txid > storage.getMostRecentCheckpointTxId()) {
      storage.setMostRecentCheckpointInfo(txid, Util.now());
    }
  }

  synchronized public void close() throws IOException {
    if (editLog != null) { // 2NN doesn't have any edit log
      getEditLog().close();
    }
    storage.close();
  }
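  // The helpers below read the checkpoint storage locations from
  // configuration. An illustrative hdfs-site.xml fragment (the paths are
  // examples, not defaults):
  //
  //   <property>
  //     <name>dfs.namenode.checkpoint.dir</name>
  //     <value>file:///data/1/dfs/namesecondary</value>
  //   </property>
  //   <property>
  //     <name>dfs.namenode.checkpoint.edits.dir</name>
  //     <value>file:///data/1/dfs/namesecondary</value>
  //   </property>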
  /**
   * Retrieve checkpoint dirs from configuration.
   *
   * @param conf the Configuration
   * @param defaultValue a default value to use when the configuration
   *        property is not set
   * @return a Collection of URIs representing the values in
   *         dfs.namenode.checkpoint.dir configuration property
   */
  static Collection<URI> getCheckpointDirs(Configuration conf,
      String defaultValue) {
    Collection<String> dirNames = conf.getStringCollection(
        DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_DIR_KEY);
    if (dirNames.size() == 0 && defaultValue != null) {
      dirNames.add(defaultValue);
    }
    return Util.stringCollectionAsURIs(dirNames);
  }

  static Collection<URI> getCheckpointEditsDirs(Configuration conf,
      String defaultName) {
    Collection<String> dirNames = conf.getStringCollection(
        DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_EDITS_DIR_KEY);
    if (dirNames.size() == 0 && defaultName != null) {
      dirNames.add(defaultName);
    }
    return Util.stringCollectionAsURIs(dirNames);
  }

  public NNStorage getStorage() {
    return storage;
  }

  public int getLayoutVersion() {
    return storage.getLayoutVersion();
  }

  public int getNamespaceID() {
    return storage.getNamespaceID();
  }

  public String getClusterID() {
    return storage.getClusterID();
  }

  public String getBlockPoolID() {
    return storage.getBlockPoolID();
  }

  public synchronized long getLastAppliedTxId() {
    return lastAppliedTxId;
  }
}