/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.namenode;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType;
import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
import org.apache.hadoop.hdfs.server.namenode.FileJournalManager.EditLogFile;

import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;

class FSImageTransactionalStorageInspector extends FSImageStorageInspector {
  public static final Log LOG = LogFactory.getLog(
      FSImageTransactionalStorageInspector.class);

  private boolean needToSave = false;
  private boolean isUpgradeFinalized = true;

  List<FSImageFile> foundImages = new ArrayList<FSImageFile>();
  List<EditLogFile> foundEditLogs = new ArrayList<EditLogFile>();
  SortedMap<Long, LogGroup> logGroups = new TreeMap<Long, LogGroup>();
  long maxSeenTxId = 0;

  private static final Pattern IMAGE_REGEX = Pattern.compile(
      NameNodeFile.IMAGE.getName() + "_(\\d+)");

  @Override
  public void inspectDirectory(StorageDirectory sd) throws IOException {
    // Was the directory just formatted?
    if (!sd.getVersionFile().exists()) {
      LOG.info("No version file in " + sd.getRoot());
      needToSave = true;
      return;
    }

    File currentDir = sd.getCurrentDir();
    File[] filesInStorage;
    try {
      filesInStorage = FileUtil.listFiles(currentDir);
    } catch (IOException ioe) {
      LOG.warn("Unable to inspect storage directory " + currentDir, ioe);
      return;
    }

    for (File f : filesInStorage) {
      LOG.debug("Checking file " + f);
      String name = f.getName();

      // Check for fsimage_*
      Matcher imageMatch = IMAGE_REGEX.matcher(name);
      if (imageMatch.matches()) {
        if (sd.getStorageDirType().isOfType(NameNodeDirType.IMAGE)) {
          try {
            long txid = Long.parseLong(imageMatch.group(1));
            foundImages.add(new FSImageFile(sd, f, txid));
          } catch (NumberFormatException nfe) {
            LOG.error("Image file " + f + " has improperly formatted " +
                "transaction ID");
            // skip
          }
        } else {
          LOG.warn("Found image file at " + f + " but storage directory is " +
              "not configured to contain images.");
        }
      }
    }

    // Check for a seen_txid file, which marks a minimum transaction ID that
    // must be included in our load plan.
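    // We take the maximum across every directory we inspect; a directory
    // whose seen_txid file cannot be read is skipped with a warning rather
    // than failing the whole inspection.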
    try {
      maxSeenTxId = Math.max(maxSeenTxId, NNStorage.readTransactionIdFile(sd));
    } catch (IOException ioe) {
      LOG.warn("Unable to determine the max transaction ID seen by " + sd,
          ioe);
    }

    List<EditLogFile> editLogs =
        FileJournalManager.matchEditLogs(filesInStorage);
    if (sd.getStorageDirType().isOfType(NameNodeDirType.EDITS)) {
      for (EditLogFile log : editLogs) {
        addEditLog(log);
      }
    } else if (!editLogs.isEmpty()) {
      LOG.warn("Found the following edit log file(s) in " + sd +
          " even though it was not configured to store edits:\n" +
          "  " + Joiner.on("\n  ").join(editLogs));
    }

    // set finalized flag
    isUpgradeFinalized = isUpgradeFinalized && !sd.getPreviousDir().exists();
  }

  private void addEditLog(EditLogFile foundEditLog) {
    foundEditLogs.add(foundEditLog);
    LogGroup group = logGroups.get(foundEditLog.getFirstTxId());
    if (group == null) {
      group = new LogGroup(foundEditLog.getFirstTxId());
      logGroups.put(foundEditLog.getFirstTxId(), group);
    }
    group.add(foundEditLog);
  }

  @Override
  public boolean isUpgradeFinalized() {
    return isUpgradeFinalized;
  }

  /**
   * @return the image that has the most recent associated transaction ID.
   * If there are multiple storage directories which contain equal images,
   * the storage directory that was inspected first will be preferred.
   *
   * Returns null if no images were found.
   */
  FSImageFile getLatestImage() {
    FSImageFile ret = null;
    for (FSImageFile img : foundImages) {
      if (ret == null || img.txId > ret.txId) {
        ret = img;
      }
    }
    return ret;
  }

  public List<FSImageFile> getFoundImages() {
    return ImmutableList.copyOf(foundImages);
  }

  public List<EditLogFile> getEditLogFiles() {
    return ImmutableList.copyOf(foundEditLogs);
  }

  @Override
  public LoadPlan createLoadPlan() throws IOException {
    if (foundImages.isEmpty()) {
      throw new FileNotFoundException("No valid image files found");
    }

    FSImageFile recoveryImage = getLatestImage();
    LogLoadPlan logPlan = createLogLoadPlan(recoveryImage.txId, Long.MAX_VALUE);

    return new TransactionalLoadPlan(recoveryImage, logPlan);
  }

  /**
   * Plan which logs to load in order to bring the namespace up-to-date.
   * Transactions will be considered in the range (sinceTxId, maxTxId]
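   * For example, when loading from an image checkpoint at txid 100,
   * sinceTxId is 100 and the first log group consulted is expected to
   * begin at txid 101.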
   *
   * @param sinceTxId the highest txid that is already loaded
   *                  (eg from the image checkpoint)
   * @param maxStartTxId ignore any log files that start after this txid
   */
  LogLoadPlan createLogLoadPlan(long sinceTxId, long maxStartTxId)
      throws IOException {
    long expectedTxId = sinceTxId + 1;

    List<EditLogFile> recoveryLogs = new ArrayList<EditLogFile>();

    SortedMap<Long, LogGroup> tailGroups = logGroups.tailMap(expectedTxId);
    if (logGroups.size() > tailGroups.size()) {
      LOG.debug("Excluded " + (logGroups.size() - tailGroups.size()) +
          " groups of logs because they start with a txid less than image " +
          "txid " + sinceTxId);
    }

    SortedMap<Long, LogGroup> usefulGroups;
    if (maxStartTxId > sinceTxId) {
      usefulGroups = tailGroups.headMap(maxStartTxId);
    } else {
      usefulGroups = new TreeMap<Long, LogGroup>();
    }

    if (tailGroups.size() > usefulGroups.size()) {
      LOG.debug("Excluded " + (tailGroups.size() - usefulGroups.size()) +
          " groups of logs because they start with a txid higher than max " +
          "txid " + maxStartTxId);
    }

    for (Map.Entry<Long, LogGroup> entry : usefulGroups.entrySet()) {
      long logStartTxId = entry.getKey();
      LogGroup logGroup = entry.getValue();

      logGroup.planRecovery();

      if (expectedTxId != HdfsConstants.INVALID_TXID &&
          logStartTxId != expectedTxId) {
        throw new IOException("Expected next log group would start at txid " +
            expectedTxId + " but starts at txid " + logStartTxId);
      }

      // We can pick any of the non-corrupt logs here
      recoveryLogs.add(logGroup.getBestNonCorruptLog());

      // If this log group was finalized, we know to expect the next
      // log group to start at the following txid (ie no gaps)
      if (logGroup.hasKnownLastTxId()) {
        expectedTxId = logGroup.getLastTxId() + 1;
      } else {
        // the log group was in-progress so we don't know what ID
        // the next group should start from.
        expectedTxId = HdfsConstants.INVALID_TXID;
      }
    }

    long lastLogGroupStartTxId = usefulGroups.isEmpty() ?
        0 : usefulGroups.lastKey();
    if (maxSeenTxId > sinceTxId && maxSeenTxId > lastLogGroupStartTxId) {
      String msg = "At least one storage directory indicated it has seen a " +
          "log segment starting at txid " + maxSeenTxId;
      if (usefulGroups.isEmpty()) {
        msg += " but there are no logs to load.";
      } else {
        msg += " but the most recent log file found starts with txid " +
            lastLogGroupStartTxId;
      }
      throw new IOException(msg);
    }

    return new LogLoadPlan(recoveryLogs,
        Lists.newArrayList(usefulGroups.values()));
  }

  @Override
  public boolean needToSave() {
    return needToSave;
  }

  /**
   * A group of logs that all start at the same txid.
   *
   * Handles determining which logs are corrupt and which should be considered
   * candidates for loading.
   */
  static class LogGroup {
    long startTxId;
    List<EditLogFile> logs = new ArrayList<EditLogFile>();
    private Set<Long> endTxIds = new TreeSet<Long>();

    private boolean hasInProgress = false;
    private boolean hasFinalized = false;

    LogGroup(long startTxId) {
      this.startTxId = startTxId;
    }

    EditLogFile getBestNonCorruptLog() {
      // First look for non-corrupt finalized logs
      for (EditLogFile log : logs) {
        if (!log.isCorrupt() && !log.isInProgress()) {
          return log;
        }
      }
      // Then look for non-corrupt in-progress logs
      for (EditLogFile log : logs) {
        if (!log.isCorrupt()) {
          return log;
        }
      }
      // We should never get here, because we don't get to the planning stage
      // without calling planRecovery first, and if we've called planRecovery,
      // we would have already thrown if there were no non-corrupt logs!
      throw new IllegalStateException(
          "No non-corrupt logs for txid " + startTxId);
    }

    /**
     * @return true if we can determine the last txid in this log group.
     */
    boolean hasKnownLastTxId() {
      for (EditLogFile log : logs) {
        if (!log.isInProgress()) {
          return true;
        }
      }
      return false;
    }

    /**
     * @return the last txid included in the logs in this group
     * @throws IllegalStateException if it is unknown -
     *                               {@link #hasKnownLastTxId()}
     */
    long getLastTxId() {
      for (EditLogFile log : logs) {
        if (!log.isInProgress()) {
          return log.getLastTxId();
        }
      }
      throw new IllegalStateException("LogGroup only has in-progress logs");
    }

    void add(EditLogFile log) {
      assert log.getFirstTxId() == startTxId;
      logs.add(log);

      if (log.isInProgress()) {
        hasInProgress = true;
      } else {
        hasFinalized = true;
        endTxIds.add(log.getLastTxId());
      }
    }

    void planRecovery() throws IOException {
      assert hasInProgress || hasFinalized;

      checkConsistentEndTxIds();

      if (hasFinalized && hasInProgress) {
        planMixedLogRecovery();
      } else if (!hasFinalized && hasInProgress) {
        planAllInProgressRecovery();
      } else if (hasFinalized && !hasInProgress) {
        LOG.debug("No recovery necessary for logs starting at txid " +
            startTxId);
      }
    }

    /**
     * Recovery case for when some logs in the group were in-progress, and
     * others were finalized. This happens when one of the storage
     * directories fails.
     *
     * The in-progress logs in this case should be considered corrupt.
     */
    private void planMixedLogRecovery() throws IOException {
      for (EditLogFile log : logs) {
        if (log.isInProgress()) {
          LOG.warn("Log at " + log.getFile() + " is in progress, but " +
              "other logs starting at the same txid " + startTxId +
              " are finalized. Moving aside.");
          log.markCorrupt();
        }
      }
    }

    /**
     * Recovery case for when all of the logs in the group were in progress.
     * This happens if the NN completely crashes and restarts. In this case
     * we check the non-zero lengths of each log file, and any logs that are
     * less than the max of these lengths are considered corrupt.
     */
    private void planAllInProgressRecovery() throws IOException {
      // We only have in-progress logs. We need to figure out which logs have
      // the latest data so we can recover them.
      LOG.warn("Logs beginning at txid " + startTxId + " were all " +
          "in-progress (probably truncated due to a previous NameNode " +
          "crash)");

      if (logs.size() == 1) {
        // Only one log, it's our only choice!
        EditLogFile log = logs.get(0);
        if (log.validateLog().numTransactions == 0) {
          // If it has no transactions, we should consider it corrupt just
          // to be conservative.
          // See comment below for similar case
          LOG.warn("Marking log at " + log.getFile() + " as corrupt since " +
              "it has no transactions in it.");
          log.markCorrupt();
        }
        return;
      }

      long maxValidTxnCount = Long.MIN_VALUE;
      for (EditLogFile log : logs) {
        long validTxnCount = log.validateLog().numTransactions;
        LOG.warn("  Log " + log.getFile() +
            " valid txns=" + validTxnCount +
            " valid len=" + log.validateLog().validLength);
        maxValidTxnCount = Math.max(maxValidTxnCount, validTxnCount);
      }

      for (EditLogFile log : logs) {
        long txns = log.validateLog().numTransactions;
        if (txns < maxValidTxnCount) {
          LOG.warn("Marking log at " + log.getFile() + " as corrupt since " +
              "it has only " + txns + " valid txns whereas another " +
              "log has " + maxValidTxnCount);
          log.markCorrupt();
        } else if (txns == 0) {
          // this can happen if the NN crashes right after rolling a log
          // but before the START_LOG_SEGMENT txn is written. Since the log
          // is empty, we can just move it aside to its corrupt name.
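          // (When every log in the group is empty, maxValidTxnCount is 0 and
          // this branch still marks each of them corrupt.)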
LOG.warn("Marking log at " + log.getFile() + " as corrupt since " + "it has no transactions in it."); log.markCorrupt(); } } } /** * Check for the case when we have multiple finalized logs and they have * different ending transaction IDs. This violates an invariant that all * log directories should roll together. We should abort in this case. */ private void checkConsistentEndTxIds() throws IOException { if (hasFinalized && endTxIds.size() > 1) { throw new IOException("More than one ending txid was found " + "for logs starting at txid " + startTxId + ". " + "Found: " + StringUtils.join(endTxIds, ',')); } } void recover() throws IOException { for (EditLogFile log : logs) { if (log.isCorrupt()) { log.moveAsideCorruptFile(); } else if (log.isInProgress()) { log.finalizeLog(); } } } } static class TransactionalLoadPlan extends LoadPlan { final FSImageFile image; final LogLoadPlan logPlan; public TransactionalLoadPlan(FSImageFile image, LogLoadPlan logPlan) { super(); this.image = image; this.logPlan = logPlan; } @Override boolean doRecovery() throws IOException { logPlan.doRecovery(); return false; } @Override File getImageFile() { return image.getFile(); } @Override List<File> getEditsFiles() { return logPlan.getEditsFiles(); } @Override StorageDirectory getStorageDirectoryForProperties() { return image.sd; } } static class LogLoadPlan { final List<EditLogFile> editLogs; final List<LogGroup> logGroupsToRecover; LogLoadPlan(List<EditLogFile> editLogs, List<LogGroup> logGroupsToRecover) { this.editLogs = editLogs; this.logGroupsToRecover = logGroupsToRecover; } public void doRecovery() throws IOException { for (LogGroup g : logGroupsToRecover) { g.recover(); } } public List<File> getEditsFiles() { List<File> ret = new ArrayList<File>(); for (EditLogFile log : editLogs) { ret.add(log.getFile()); } return ret; } } }