/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.namenode;
import static org.apache.hadoop.hdfs.server.common.Util.now;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.EnumMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.permission.PermissionStatus;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
import org.apache.hadoop.hdfs.protocol.LayoutVersion;
import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
import org.apache.hadoop.hdfs.server.common.HdfsConstants;
import org.apache.hadoop.hdfs.server.namenode.BlocksMap.BlockInfo;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.AddCloseOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.AppendOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.ClearNSQuotaOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.ConcatDeleteOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.DeleteOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.HardLinkOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.MergeOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.MkdirOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.RenameOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetGenstampOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetNSQuotaOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetOwnerOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetPermissionsOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetQuotaOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.SetReplicationOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.TimesOp;
import org.apache.hadoop.hdfs.util.Holder;
import org.apache.hadoop.raid.RaidCodec;
import com.google.common.base.Joiner;
public class FSEditLogLoader {
public static final Log LOG = LogFactory.getLog(FSEditLogLoader.class);
private final FSNamesystem fsNamesys;
public static final long TXID_IGNORE = -1;
private long lastAppliedTxId = -1;
public FSEditLogLoader(FSNamesystem fsNamesys) {
this.fsNamesys = fsNamesys;
}
  /**
   * Allocate a new inode id when replaying a legacy edit log that predates
   * inode ids, or take the inode id recorded in a newer edit log, updating
   * lastInodeId in {@link FSDirectory} as needed. Also performs a sanity
   * check against the layout version.
   *
   * @param fsDir {@link FSDirectory}
   * @param inodeIdFromOp inode id read from the edit log
   * @param logVersion current layout version
   * @return the inode id to use
   * @throws IOException if the layout version supports inode ids but the
   *           edit log supplied an invalid one
   */
private static long getAndUpdateLastInodeId(FSDirectory fsDir,
long inodeIdFromOp, int logVersion) throws IOException {
long inodeId = inodeIdFromOp;
if (inodeId == INodeId.GRANDFATHER_INODE_ID) {
      // This id came from an old edit log that predates inode ids
if (LayoutVersion.supports(Feature.ADD_INODE_ID, logVersion)) {
throw new IOException("The layout version " + logVersion
+ " supports inodeId but gave bogus inodeId");
}
inodeId = fsDir.allocateNewInodeId();
} else {
      // Move lastInodeId forward if needed. FSDirectory initially gets
      // lastInodeId from the fsimage, but the edit log may record more
      // recent inode id allocations.
if (inodeId > fsDir.getLastInodeId()) {
fsDir.resetLastInodeId(inodeId);
}
}
return inodeId;
}
  /**
   * Load an edit log, and apply the changes to the in-memory structure.
   * This is where we apply edits that we've been writing to disk all
   * along.
   */
int loadFSEdits(EditLogInputStream edits, long lastAppliedTxId)
throws IOException {
long startTime = now();
this.lastAppliedTxId = lastAppliedTxId;
int numEdits = loadFSEdits(edits, true);
FSImage.LOG.info("Edits file " + edits.toString()
+ " of size: " + edits.length() + ", # of edits: " + numEdits
+ " loaded in: " + (now()-startTime)/1000 + " seconds.");
return numEdits;
}
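  /**
   * Load edit records from the given stream, optionally closing it when
   * done.
   *
   * @param edits edit log input stream to read from
   * @param closeOnExit whether to close the stream after loading
   * @return the number of edits applied
   */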
int loadFSEdits(EditLogInputStream edits, boolean closeOnExit)
throws IOException {
int numEdits = 0;
int logVersion = edits.getVersion();
try {
numEdits = loadEditRecords(logVersion, edits, false);
} finally {
if(closeOnExit) {
edits.close();
}
}
return numEdits;
}
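  /**
   * Apply a single edit log operation to the namesystem. Records the
   * stream offset of the op (for error reporting) and increments its
   * opcode count before dispatching on the opcode.
   */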
static void loadEditRecord(int logVersion,
EditLogInputStream in,
long[] recentOpcodeOffsets,
EnumMap<FSEditLogOpCodes, Holder<Integer>> opCounts,
FSNamesystem fsNamesys,
FSDirectory fsDir,
int numEdits,
FSEditLogOp op) throws IOException{
recentOpcodeOffsets[numEdits % recentOpcodeOffsets.length] =
in.getPosition();
incrOpCount(op.opCode, opCounts);
switch (op.opCode) {
case OP_ADD:
case OP_CLOSE: {
AddCloseOp addCloseOp = (AddCloseOp)op;
      // Versions > 0 support per-file replication; get the logged
      // replication factor, adjusted for this namesystem.
final short replication = fsNamesys.adjustReplication(addCloseOp.replication);
long blockSize = addCloseOp.blockSize;
      BlockInfo[] blocks = new BlockInfo[addCloseOp.blocks.length];
for (int i = 0; i < addCloseOp.blocks.length; i++) {
        // The blocks were already read in FSEditLogOp; wrap each one in a
        // BlockInfo with the adjusted replication.
blocks[i] = new BlockInfo(addCloseOp.blocks[i], replication);
blocks[i].setChecksum(addCloseOp.blocks[i].getChecksum());
}
      // Older versions of HDFS do not store the block size in the inode.
      // If the file has more than one block, use the size of the first
      // block as the block size; otherwise use the larger of the default
      // block size and the single block's size.
if (-8 <= logVersion && blockSize == 0) {
if (blocks.length > 1) {
blockSize = blocks[0].getNumBytes();
} else {
long first = ((blocks.length == 1)? blocks[0].getNumBytes(): 0);
blockSize = Math.max(fsNamesys.getDefaultBlockSize(), first);
}
}
PermissionStatus permissions = fsNamesys.getUpgradePermission();
if (addCloseOp.permissions != null) {
permissions = addCloseOp.permissions;
}
// The open lease transaction re-creates a file if necessary.
// Delete the file if it already exists.
if (FSNamesystem.LOG.isDebugEnabled()) {
FSNamesystem.LOG.debug(op.opCode + ": " + addCloseOp.path +
" numblocks : " + blocks.length +
" clientHolder " + addCloseOp.clientName +
" clientMachine " + addCloseOp.clientMachine);
}
long inodeId = getAndUpdateLastInodeId(fsDir, addCloseOp.inodeId, logVersion);
      // updateINodefile calls unprotectedAddFile if the node doesn't exist
INodeFile node = fsDir.updateINodefile(inodeId, addCloseOp.path, permissions,
blocks, replication, addCloseOp.mtime, addCloseOp.atime, blockSize,
addCloseOp.clientName, addCloseOp.clientMachine);
if (addCloseOp.opCode == FSEditLogOpCodes.OP_ADD) {
if (!node.isUnderConstruction()) {
throw new IOException("INodeFile : " + node
+ " is not under construction");
}
INodeFileUnderConstruction cons = (INodeFileUnderConstruction) node;
if (!cons.getClientName().equals(addCloseOp.clientName)) {
fsNamesys.leaseManager.removeLease(cons.getClientName(),
addCloseOp.path);
FSNamesystem.LOG.info("Updating client name for : " + addCloseOp.path
+ " from : " + cons.getClientName() + " to : "
+ addCloseOp.clientName);
}
cons.setClientName(addCloseOp.clientName);
cons.setClientMachine(addCloseOp.clientMachine);
        // TODO: There is a chance that by clearing targets we over-count
        // blocks for safe mode. The case: this OP_ADD was triggered by a
        // commitSynchronizedBlock() (meaning only the generation stamp of
        // the last block changed). Before block synchronization, the block
        // was already reported by a datanode after it restarted, so the
        // safe block count was already incremented. Here we clear the
        // targets without decrementing the safe block count, so the block
        // will be counted again later. This needs to be fixed.
cons.clearTargets();
fsNamesys.leaseManager.addLease(cons.getClientName(),
addCloseOp.path,
cons.getModificationTime());
} else {
INodeFile newNode = node;
if (node.isUnderConstruction()) {
INodeFileUnderConstruction pendingFile = (INodeFileUnderConstruction) node;
newNode = pendingFile.convertToInodeFile();
newNode.setAccessTime(addCloseOp.atime);
fsNamesys.leaseManager.removeLease(pendingFile.getClientName(),
addCloseOp.path);
fsDir.replaceNode(addCloseOp.path, node, newNode);
pendingFile.clearTargets();
}
}
break;
}
case OP_APPEND: {
AppendOp appendOp = (AppendOp)op;
INodeFile node = fsDir.getFileINode(appendOp.path);
// node must exist and the file must be closed
if (node == null || node.isUnderConstruction()) {
logErrorAndFail("Append - can not append to "
+ (node == null ? "non-existent" : "under construction") + " file");
}
// check if all blocks match
if (node.getBlocks().length != appendOp.blocks.length) {
logErrorAndFail("Append - the transaction has incorrect block information");
}
int index = 0;
for (Block b1 : node.getBlocks()) {
Block b2 = appendOp.blocks[index++];
// blocks should be exactly the same (id, gs, size)
if (!b1.equals(b2) || b1.getNumBytes() != b2.getNumBytes()) {
LOG.error("Append: existing block: " + b1 + ", block in edit log: "
+ b2);
logErrorAndFail("Append: blocks are mismatched");
}
}
INodeFileUnderConstruction cons = new INodeFileUnderConstruction(
node.getId(),
node.getLocalNameBytes(),
node.getReplication(),
node.getModificationTime(),
node.getAccessTime(),
node.getPreferredBlockSize(),
appendOp.blocks,
node.getPermissionStatus(),
appendOp.clientName,
appendOp.clientMachine,
fsNamesys.getNode(appendOp.clientMachine));
fsNamesys.dir.replaceNode(appendOp.path, null, node, cons, true);
fsNamesys.leaseManager.addLease(cons.getClientName(),
appendOp.path,
cons.getModificationTime());
break;
}
case OP_SET_REPLICATION: {
SetReplicationOp setReplicationOp = (SetReplicationOp)op;
short replication = fsNamesys.adjustReplication(
setReplicationOp.replication);
fsDir.unprotectedSetReplication(setReplicationOp.path,
replication, null);
break;
}
case OP_CONCAT_DELETE: {
ConcatDeleteOp concatDeleteOp = (ConcatDeleteOp)op;
fsDir.unprotectedConcat(concatDeleteOp.trg, concatDeleteOp.srcs,
concatDeleteOp.timestamp);
break;
}
case OP_MERGE: {
MergeOp mergeOp = (MergeOp)op;
RaidCodec codec = RaidCodec.getCodec(mergeOp.codecId);
if (codec == null) {
LOG.error("Codec " + mergeOp.codecId + " doesn't exist");
logErrorAndFail("Merge: codec doesn't exist");
}
INode[] sourceINodes = fsDir.getExistingPathINodes(mergeOp.source);
INode[] parityINodes = fsDir.getExistingPathINodes(mergeOp.parity);
fsDir.unprotectedMerge(parityINodes, sourceINodes, mergeOp.parity, mergeOp.source,
codec, mergeOp.checksums, mergeOp.timestamp);
break;
}
case OP_HARDLINK: {
HardLinkOp hardLinkOp = (HardLinkOp)op;
fsDir.unprotectedHardLinkTo(hardLinkOp.src, hardLinkOp.dst, hardLinkOp.timestamp);
break;
}
case OP_RENAME: {
RenameOp renameOp = (RenameOp)op;
HdfsFileStatus dinfo = fsDir.getHdfsFileInfo(renameOp.dst);
fsDir.unprotectedRenameTo(renameOp.src, renameOp.dst,
renameOp.timestamp);
fsNamesys.changeLease(renameOp.src, renameOp.dst, dinfo);
break;
}
case OP_DELETE: {
DeleteOp deleteOp = (DeleteOp)op;
fsDir.unprotectedDelete(deleteOp.path, deleteOp.timestamp);
break;
}
case OP_MKDIR: {
MkdirOp mkdirOp = (MkdirOp)op;
PermissionStatus permissions = fsNamesys.getUpgradePermission();
if (mkdirOp.permissions != null) {
permissions = mkdirOp.permissions;
}
long inodeId = getAndUpdateLastInodeId(fsDir, mkdirOp.inodeId, logVersion);
fsDir.unprotectedMkdir(inodeId, mkdirOp.path, permissions,
mkdirOp.timestamp);
break;
}
case OP_SET_GENSTAMP: {
SetGenstampOp setGenstampOp = (SetGenstampOp)op;
fsNamesys.setGenerationStamp(setGenstampOp.genStamp);
break;
}
case OP_SET_PERMISSIONS: {
SetPermissionsOp setPermissionsOp = (SetPermissionsOp)op;
fsDir.unprotectedSetPermission(setPermissionsOp.src,
setPermissionsOp.permissions);
break;
}
case OP_SET_OWNER: {
SetOwnerOp setOwnerOp = (SetOwnerOp)op;
fsDir.unprotectedSetOwner(setOwnerOp.src, setOwnerOp.username,
setOwnerOp.groupname);
break;
}
case OP_SET_NS_QUOTA: {
SetNSQuotaOp setNSQuotaOp = (SetNSQuotaOp)op;
fsDir.unprotectedSetQuota(setNSQuotaOp.src,
setNSQuotaOp.nsQuota,
FSConstants.QUOTA_DONT_SET);
break;
}
case OP_CLEAR_NS_QUOTA: {
ClearNSQuotaOp clearNSQuotaOp = (ClearNSQuotaOp)op;
fsDir.unprotectedSetQuota(clearNSQuotaOp.src,
FSConstants.QUOTA_RESET,
FSConstants.QUOTA_DONT_SET);
break;
}
case OP_SET_QUOTA:
SetQuotaOp setQuotaOp = (SetQuotaOp)op;
fsDir.unprotectedSetQuota(setQuotaOp.src,
setQuotaOp.nsQuota,
setQuotaOp.dsQuota);
break;
case OP_TIMES: {
TimesOp timesOp = (TimesOp)op;
fsDir.unprotectedSetTimes(timesOp.path,
timesOp.mtime,
timesOp.atime, true);
break;
}
case OP_START_LOG_SEGMENT:
case OP_END_LOG_SEGMENT: {
// no data in here currently.
break;
}
case OP_DATANODE_ADD:
case OP_DATANODE_REMOVE:
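      // Obsolete opcodes: datanodes are no longer persisted in the edit
      // log, so these operations are ignored.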
break;
default:
throw new IOException("Invalid operation read " + op.opCode);
}
}
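  /**
   * Read operations from the stream and apply them one at a time until the
   * stream is exhausted, holding the namesystem and directory write locks
   * for the duration. Depending on how the NameNode was started, a
   * transaction that fails to load is either fatal or skipped after
   * prompting (see {@link #checkFail(String)}).
   */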
@SuppressWarnings("deprecation")
int loadEditRecords(int logVersion, EditLogInputStream in, boolean closeOnExit)
throws IOException {
FSDirectory fsDir = fsNamesys.dir;
int numEdits = 0;
EnumMap<FSEditLogOpCodes, Holder<Integer>> opCounts =
new EnumMap<FSEditLogOpCodes, Holder<Integer>>(FSEditLogOpCodes.class);
fsNamesys.writeLock();
fsDir.writeLock();
    long[] recentOpcodeOffsets = new long[2];
Arrays.fill(recentOpcodeOffsets, -1);
try {
try {
FSEditLogOp op;
        FSEditLog.LOG.info("Starting to load edits from " + in);
while (true) {
try {
op = in.readOp();
if (op == null) {
break;
}
} catch (Throwable e) {
            String errorMessage = "Failed to read transaction after txId "
                + lastAppliedTxId + ", skipping the bad section in the log";
checkFail(errorMessage);
in.resync();
FSImage.LOG.info("After resync, position is " + in.getPosition());
continue;
}
if (logVersion <= FSConstants.STORED_TXIDS) {
long diskTxid = op.txid;
if (diskTxid != (lastAppliedTxId + 1)) {
String errorMsg = "The transaction id in the edit log : "
+ diskTxid + " does not match the transaction id inferred"
+ " from FSIMAGE : " + (lastAppliedTxId + 1);
checkFail(errorMsg);
}
}
try {
loadEditRecord(logVersion,
in,
recentOpcodeOffsets,
opCounts,
fsNamesys,
fsDir,
numEdits,
op);
lastAppliedTxId = op.txid;
numEdits++;
} catch (Throwable t) {
String errorMsg = "Failed to load transaction " + op + ": " + t.getMessage();
checkFail(errorMsg);
            // Assume the transaction was applied so the txid check doesn't
            // fail again on the next operation.
lastAppliedTxId = op.txid;
}
}
} finally {
if(closeOnExit)
in.close();
}
} catch (Throwable t) {
// Catch Throwable because in the case of a truly corrupt edits log, any
// sort of error might be thrown (NumberFormat, NullPointer, EOF, etc.)
String errorMessage = getErrorMessage(recentOpcodeOffsets, in.getPosition());
FSImage.LOG.error(errorMessage);
throw new IOException(errorMessage, t);
} finally {
fsDir.writeUnlock();
fsNamesys.writeUnlock();
}
dumpOpCounts(opCounts);
return numEdits;
}
  /**
   * When an error is encountered while loading a transaction, we can prompt
   * the user and then skip the problematic transaction and continue loading.
   * This is only possible when the NameNode is started with the appropriate
   * option; otherwise the error is fatal.
   */
private void checkFail(String errorMsg) throws IOException {
if (fsNamesys.failOnTxIdMismatch()) {
FSEditLog.LOG.error(errorMsg);
throw new IOException(errorMsg);
}
MetaRecoveryContext.editLogLoaderPrompt(errorMsg);
}
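  /**
   * Build an error message describing where replay failed, including the
   * stream offsets of recently applied opcodes to help locate the
   * corruption.
   */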
public static String getErrorMessage(long[] recentOpcodeOffsets, long position) {
StringBuilder sb = new StringBuilder();
sb.append("Error replaying edit log at offset " + position);
if (recentOpcodeOffsets[0] != -1) {
Arrays.sort(recentOpcodeOffsets);
sb.append("\nRecent opcode offsets:");
for (long offset : recentOpcodeOffsets) {
if (offset != -1) {
sb.append(' ').append(offset);
}
}
}
return sb.toString();
}
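  /** @return the transaction id of the last edit applied by this loader */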
public long getLastAppliedTxId() {
return lastAppliedTxId;
}
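  /** Log a per-opcode summary of the loaded operations at debug level. */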
static void dumpOpCounts(
EnumMap<FSEditLogOpCodes, Holder<Integer>> opCounts) {
if (!FSImage.LOG.isDebugEnabled())
return;
StringBuilder sb = new StringBuilder();
sb.append("Summary of operations loaded from edit log:\n ");
Joiner.on("\n ").withKeyValueSeparator("=").appendTo(sb, opCounts);
FSImage.LOG.debug(sb.toString());
}
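  /** Increment the count for the given opcode, creating its holder on first use. */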
private static void incrOpCount(FSEditLogOpCodes opCode,
EnumMap<FSEditLogOpCodes, Holder<Integer>> opCounts) {
Holder<Integer> holder = opCounts.get(opCode);
if (holder == null) {
holder = new Holder<Integer>(1);
opCounts.put(opCode, holder);
} else {
holder.held++;
}
}
  /**
   * Determine how much of the stream contains valid transactions: the range
   * of transaction ids seen and the length of the valid prefix. If the
   * stream is truncated during the header, the result indicates that there
   * are 0 valid transactions. This reads through the stream but does not
   * close it; read errors are caught and logged rather than thrown, so the
   * result reflects only the prefix that could be read.
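   *
   * <p>Illustrative use, assuming the caller has opened an
   * {@link EditLogInputStream}:
   * <pre>{@code
   * EditLogValidation val = FSEditLogLoader.validateEditLog(stream);
   * if (val.getNumTransactions() > 0) {
   *   long lastValidTxId = val.getEndTxId();
   * }
   * }</pre>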
*/
public static EditLogValidation validateEditLog(EditLogInputStream in) {
long lastPos = 0;
long firstTxId = HdfsConstants.INVALID_TXID;
long lastTxId = HdfsConstants.INVALID_TXID;
long numValid = 0;
try {
FSEditLogOp op = null;
while (true) {
lastPos = in.getPosition();
try {
if ((op = in.readOp()) == null) {
break;
}
} catch (Throwable t) {
          FSImage.LOG.warn("Caught exception after reading " + numValid +
              " ops from " + in + " while determining its valid length. " +
              "Position was " + lastPos, t);
in.resync();
FSImage.LOG.info("After resync, position is " + in.getPosition());
continue;
}
if (firstTxId == HdfsConstants.INVALID_TXID) {
firstTxId = op.getTransactionId();
}
if (lastTxId == HdfsConstants.INVALID_TXID
|| op.txid > lastTxId) {
lastTxId = op.getTransactionId();
} else {
          FSImage.LOG.error("Out of order txid found. Found " + op.txid
              + ", expected a txid greater than " + lastTxId);
}
numValid++;
}
} catch (Throwable t) {
// Catch Throwable and not just IOE, since bad edits may generate
// NumberFormatExceptions, AssertionErrors, OutOfMemoryErrors, etc.
FSImage.LOG.debug("Caught exception after reading " + numValid +
" ops from " + in + " while determining its valid length.", t);
}
return new EditLogValidation(lastPos, firstTxId, lastTxId, false);
}
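  /** Log the message at error level and fail the load with an IOException. */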
static void logErrorAndFail(String errorMsg) throws IOException {
LOG.error(errorMsg);
throw new IOException(errorMsg);
}
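  /**
   * Result of {@link #validateEditLog(EditLogInputStream)}: the length of
   * the valid prefix of the stream and the range of transaction ids seen.
   */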
public static class EditLogValidation {
private long validLength;
private long startTxId;
private long endTxId;
private boolean hasCorruptHeader;
public EditLogValidation(long validLength, long startTxId, long endTxId,
boolean hasCorruptHeader) {
this.validLength = validLength;
this.startTxId = startTxId;
this.endTxId = endTxId;
this.hasCorruptHeader = hasCorruptHeader;
}
long getValidLength() { return validLength; }
long getStartTxId() { return startTxId; }
public long getEndTxId() { return endTxId; }
public long getNumTransactions() {
if (endTxId == HdfsConstants.INVALID_TXID
|| startTxId == HdfsConstants.INVALID_TXID) {
return 0;
}
return (endTxId - startTxId) + 1;
}
boolean hasCorruptHeader() { return hasCorruptHeader; }
}
/**
* Stream wrapper that keeps track of the current stream position.
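   *
   * <p>A minimal usage sketch (the wrapped stream and file are
   * hypothetical):
   * <pre>{@code
   * PositionTrackingInputStream in =
   *     new PositionTrackingInputStream(new FileInputStream(logFile));
   * in.read();              // consumes one byte
   * long pos = in.getPos(); // pos == 1
   * }</pre>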
*/
public static class PositionTrackingInputStream extends FilterInputStream {
private long curPos = 0;
private long markPos = -1;
public PositionTrackingInputStream(InputStream is) {
super(is);
}
public PositionTrackingInputStream(InputStream is, long position) {
super(is);
curPos = position;
}
@Override
public int read() throws IOException {
int ret = super.read();
if (ret != -1) curPos++;
return ret;
}
@Override
public int read(byte[] data) throws IOException {
int ret = super.read(data);
if (ret > 0) curPos += ret;
return ret;
}
@Override
public int read(byte[] data, int offset, int length) throws IOException {
int ret = super.read(data, offset, length);
if (ret > 0) curPos += ret;
return ret;
}
@Override
public void mark(int limit) {
super.mark(limit);
markPos = curPos;
}
@Override
public void reset() throws IOException {
if (markPos == -1) {
throw new IOException("Not marked!");
}
super.reset();
curPos = markPos;
markPos = -1;
}
public long getPos() {
return curPos;
}
@Override
public long skip(long amt) throws IOException {
long ret = super.skip(amt);
curPos += ret;
return ret;
}
}
}