/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.namenode;
import java.io.IOException;
import java.io.EOFException;
import java.lang.Thread;
import java.util.Arrays;
import java.util.EnumMap;
import java.text.SimpleDateFormat;
import org.apache.commons.logging.Log;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.protocol.LayoutVersion;
import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
import org.apache.hadoop.hdfs.server.namenode.AvatarNode;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.FSEditLog;
import org.apache.hadoop.hdfs.server.namenode.Standby.StandbyIngestState;
import org.apache.hadoop.hdfs.util.InjectionEvent;
import org.apache.hadoop.hdfs.util.Holder;
import org.apache.hadoop.util.InjectionHandler;
/**
* This class reads transaction logs from the primary's shared device
* and feeds it to the standby NameNode.
*/
/**
 * This class reads transaction logs from the primary's shared device
 * and feeds it to the standby NameNode.
 *
 * An Ingest instance consumes exactly one edit log segment, starting at
 * {@code startTxId}; {@code endTxId} remains -1 until the segment is
 * fully consumed. The Standby thread drives the lifecycle via
 * {@link #quiesce()} and {@link #stop()}.
 */
public class Ingest implements Runnable {

  public static final Log LOG = AvatarNode.LOG;
  // NOTE(review): SimpleDateFormat is not thread-safe; this shared instance
  // must not be used concurrently from multiple threads — confirm callers.
  static final SimpleDateFormat DATE_FORM =
    new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

  private Standby standby;
  private Configuration confg; // configuration of local namenode
  volatile private boolean running = true;
  // true while this ingest is assumed to be behind the primary's edit log
  volatile private boolean catchingUp = true;
  // lag (in bytes) below which the ingest is considered caught up
  volatile private long catchUpLag;
  volatile private boolean lastScan = false; // is this the last scan?
  private int logVersion;
  volatile private boolean success = false; // not successfully ingested yet
  EditLogInputStream inputEditStream = null;
  long currentPosition; // current offset in the transaction log
  final FSNamesystem fsNamesys;

  // startTxId is known when the ingest is instantiated
  long startTxId = -1;
  // endTxId is -1 until the ingest completes
  long endTxId = -1;

  /**
   * Creates an ingest for the edit log segment starting at startTxId and
   * opens the input stream (with retries) as part of construction.
   *
   * @param standby the owning Standby instance
   * @param ns the local namesystem the edits are applied to
   * @param conf configuration of the local namenode
   * @param startTxId first transaction id of the segment to consume
   * @throws IOException if the ingest input stream cannot be set up
   */
  Ingest(Standby standby, FSNamesystem ns, Configuration conf, long startTxId)
      throws IOException {
    this.fsNamesys = ns;
    this.standby = standby;
    this.confg = conf;
    this.startTxId = startTxId;
    catchUpLag = conf.getLong("avatar.catchup.lag", 2 * 1024 * 1024L);
    inputEditStream = standby.setupIngestStreamWithRetries(startTxId);
  }

  public void run() {
    if (standby.currentIngestState == StandbyIngestState.CHECKPOINTING) {
      CheckpointSignature sig = standby.getLastRollSignature();
      if (sig == null) {
        // This should never happen, since if checkpoint is in progress
        // and the Ingest thread is getting recreated there should be a sig
        throw new RuntimeException("Ingest: Weird thing has happened. The " +
            "checkpoint was in progress, but there is no signature of the " +
            "fsedits available");
      }
      /**
       * If the checkpoint is in progress it means that the edits file was
       * already read and we cannot read it again from the start.
       * Just successfully return.
       */
      success = true;
      return;
    }
    while (running) {
      try {
        // keep ingesting transactions from remote edits log.
        loadFSEdits();
      } catch (Exception e) {
        LOG.warn("Ingest: Exception while processing transactions (" +
                 running + ") ", e);
        throw new RuntimeException("Ingest: failure", e);
      } finally {
        LOG.warn("Ingest: Processing transactions has a hiccup. " + running);
      }
    }
    success = true; // successful ingest.
  }

  /**
   * Immediately Stop the ingestion of transaction logs
   */
  void stop() {
    running = false;
  }

  /** Returns true if the ingest is still catching up to the primary. */
  public boolean isCatchingUp() {
    return catchingUp;
  }

  /**
   * Checks if the ingest is catching up.
   * If the ingest is consuming finalized segment, it's assumed to be behind.
   * Otherwise, catching up is based on the position of the input stream.
   * @throws IOException
   */
  private void setCatchingUp() throws IOException {
    try {
      if (inputEditStream != null && inputEditStream.isInProgress()) {
        catchingUp = (inputEditStream.length() - inputEditStream.getPosition() > catchUpLag);
      } else {
        // finalized segment (or no stream yet): assume we are behind
        catchingUp = true;
      }
    } catch (Exception e) {
      // on any error be conservative and report that we are behind
      catchingUp = true;
    }
  }

  /**
   * Indicate that this is the last pass over the transaction log
   */
  synchronized void quiesce() {
    lastScan = true;
  }

  /**
   * Indicate whether ingest was successful or not.
   * Returns true on success, else false.
   */
  boolean getIngestStatus() {
    return success;
  }

  /** Returns true while the ingest loop has not been stopped. */
  boolean isRunning() {
    return running;
  }

  /** Returns the first transaction id of the segment being consumed. */
  long getStartTxId() {
    return startTxId;
  }

  /**
   * Returns the distance in bytes between the current position inside of the
   * edits log and the length of the edits log, or -1 when the lag cannot be
   * determined (finalized segment, no stream, or an I/O error).
   */
  public long getLagBytes() {
    try {
      if (inputEditStream != null && inputEditStream.isInProgress()) {
        // for file journals it may happen that we read a segment finalized
        // by primary, but not refreshed by the standby, so length() returns 0
        // hence we take max(-1,lag)
        return Math.max(-1,
            inputEditStream.length() - this.inputEditStream.getPosition());
      }
      return -1;
    } catch (IOException ex) {
      LOG.error("Error getting the lag", ex);
      return -1;
    }
  }

  /**
   * Load an edit log, and continue applying the changes to the in-memory
   * structure. This is where we ingest transactions into the standby.
   *
   * @return the number of edits consumed (incremented by one for an
   *         old-layout log so that an image is saved as soon as possible)
   * @throws IOException on read/sync failure or if the log layout is too old
   */
  private int loadFSEdits() throws IOException {
    FSDirectory fsDir = fsNamesys.dir;
    int numEdits = 0;
    long startTime = FSNamesystem.now();
    LOG.info("Ingest: Consuming transactions: " + this.toString());

    try {
      logVersion = inputEditStream.getVersion();
      if (!LayoutVersion.supports(Feature.TXID_BASED_LAYOUT, logVersion))
        throw new RuntimeException("Log version is too old");

      currentPosition = inputEditStream.getPosition();
      numEdits = ingestFSEdits(); // continue to ingest
    } finally {
      LOG.info("Ingest: Closing ingest for segment: " + this.toString());
      // At this time we are done reading the transaction log
      // We need to sync to have on disk status the same as in memory
      // if we saw end segment, we already synced
      if(endTxId == -1 && fsDir.fsImage.getEditLog().isOpen()) {
        fsDir.fsImage.getEditLog().logSync();
      }
      inputEditStream.close();
      standby.clearIngestState();
    }

    LOG.info("Ingest: Edits segment: " + this.toString()
        + " edits # " + numEdits
        + " loaded in " + (FSNamesystem.now()-startTime)/1000 + " seconds.");

    if (logVersion != FSConstants.LAYOUT_VERSION) // other version
      numEdits++; // save this image asap
    return numEdits;
  }

  /**
   * Read a single transaction from the input edit log
   * @param inputEditLog the log to read from
   * @return a single edit log entry, null if EOF
   * @throws IOException on error
   */
  private FSEditLogOp ingestFSEdit(EditLogInputStream inputEditLog)
      throws IOException {
    FSEditLogOp op = null;
    try {
      op = inputEditLog.readOp();
      InjectionHandler.processEventIO(InjectionEvent.INGEST_READ_OP);
    } catch (EOFException e) {
      return null; // No more transactions.
    } catch (IOException e) {
      // rethrow, it's handled in ingestFSEdits()
      throw e;
    } catch (Exception e) {
      // some other problem, maybe unchecked exception
      throw new IOException(e);
    }
    return op;
  }

  /**
   * Continue to ingest transaction logs until the currentState is
   * no longer INGEST. If lastScan is set to true, then we process
   * till the end of the file and return.
   *
   * @return the total number of transactions consumed
   * @throws IOException if the last scan ends on an incomplete transaction,
   *         or on checksum/stream failures that cannot be retried
   */
  int ingestFSEdits() throws IOException {
    FSDirectory fsDir = fsNamesys.dir;
    int numEdits = 0;

    long recentOpcodeOffsets[] = new long[2];
    Arrays.fill(recentOpcodeOffsets, -1);
    EnumMap<FSEditLogOpCodes, Holder<Integer>> opCounts =
        new EnumMap<FSEditLogOpCodes, Holder<Integer>>(FSEditLogOpCodes.class);

    boolean error = false;
    boolean reopen = false;
    boolean quitAfterScan = false;

    long sharedLogTxId = FSEditLogLoader.TXID_IGNORE;
    long localLogTxId = FSEditLogLoader.TXID_IGNORE;

    FSEditLogOp op = null;
    FSEditLog localEditLog = fsDir.fsImage.getEditLog();

    while (running && !quitAfterScan) {
      // if the application requested that we make a final pass over
      // the transaction log, then we remember it here. We close and
      // reopen the file to ensure that we can see all the data in the
      // file, one reason being that NFS has open-to-close cache
      // coherency and the edit log could be stored in NFS.
      //
      if (reopen || lastScan) {
        inputEditStream.close();
        inputEditStream = standby.setupIngestStreamWithRetries(startTxId);
        if (lastScan) {
          // QUIESCE requested by Standby thread
          LOG.info("Ingest: Starting last scan of transaction log: " + this.toString());
          quitAfterScan = true;
        }

        // discard older buffers and start a fresh one.
        inputEditStream.refresh(currentPosition, localEditLog.getLastWrittenTxId());
        setCatchingUp();
        reopen = false;
      }

      //
      // Process all existing transactions till end of file
      //
      while (running) {
        if (lastScan && !quitAfterScan) {
          // Standby thread informed the ingest to quiesce
          // we should refresh the input stream as soon as possible
          // then quitAfterScan will be true
          break;
        }

        // record the current file offset.
        currentPosition = inputEditStream.getPosition();
        InjectionHandler.processEvent(InjectionEvent.INGEST_BEFORE_LOAD_EDIT);

        fsNamesys.writeLock();
        try {
          error = false;
          op = ingestFSEdit(inputEditStream);

          /*
           * In the case of segments recovered on primary namenode startup, we
           * have segments that are finalized (by name), but not containing the
           * ending transaction. Without this check, we will keep looping until
           * the next checkpoint to discover this situation.
           */
          if (!inputEditStream.isInProgress()
              && standby.getLastCorrectTxId() == inputEditStream.getLastTxId()) {
            // this is a correct segment with no end segment transaction
            LOG.info("Ingest: Reached finalized log segment end with no end marker. "
                + this.toString());
            tearDown(localEditLog, false, true);
            break;
          }

          if (op == null) {
            FSNamesystem.LOG.debug("Ingest: Invalid opcode, reached end of log " +
                                   "Number of transactions found " +
                                   numEdits);
            break; // No more transactions.
          }
          sharedLogTxId = op.txid;
          // Verify transaction ids match.
          localLogTxId = localEditLog.getLastWrittenTxId() + 1;
          // Fatal error only when the log contains transactions from the future
          // we allow to process a transaction with smaller txid than local
          // we will simply skip it later after reading from the ingest edits
          if (localLogTxId < sharedLogTxId
              || InjectionHandler
                  .falseCondition(InjectionEvent.INGEST_TXID_CHECK)) {
            String message = "The transaction id in the edit log : "
                + sharedLogTxId
                + " does not match the transaction id inferred"
                + " from FSIMAGE : " + localLogTxId;
            LOG.fatal(message);
            throw new RuntimeException(message);
          }

          // skip previously loaded transactions
          if (!canApplyTransaction(sharedLogTxId, localLogTxId, op))
            continue;

          // for recovery, we do not want to re-load transactions,
          // but we want to populate local log with them
          if (shouldLoad(sharedLogTxId)) {
            FSEditLogLoader.loadEditRecord(
                logVersion,
                inputEditStream,
                recentOpcodeOffsets,
                opCounts,
                fsNamesys,
                fsDir,
                numEdits,
                op);
          }

          LOG.info("Ingest: " + this.toString()
                            + ", size: " + inputEditStream.length()
                            + ", processing transaction at offset: " + currentPosition
                            + ", txid: " + op.txid
                            + ", opcode: " + op.opCode);

          if (op.opCode == FSEditLogOpCodes.OP_START_LOG_SEGMENT) {
            LOG.info("Ingest: Opening log segment: " + this.toString());
            localEditLog.open();
          } else if (op.opCode == FSEditLogOpCodes.OP_END_LOG_SEGMENT) {
            InjectionHandler
                .processEventIO(InjectionEvent.INGEST_CLEAR_STANDBY_STATE);
            LOG.info("Ingest: Closing log segment: " + this.toString());
            tearDown(localEditLog, true, true);
            numEdits++;
            LOG.info("Ingest: Reached log segment end. " + this.toString());
            break;
          } else {
            localEditLog.logEdit(op);
            if (inputEditStream.getReadChecksum() != FSEditLog
                .getChecksumForWrite().getValue()) {
              throw new IOException(
                  "Ingest: mismatched r/w checksums for transaction #"
                      + numEdits);
            }
          }

          numEdits++;
          standby.setLastCorrectTxId(op.txid);
        }
        catch (ChecksumException cex) {
          LOG.info("Checksum error reading the transaction #" + numEdits +
                   " reopening the file");
          reopen = true;
          break;
        }
        catch (IOException e) {
          LOG.info("Encountered error reading transaction", e);
          error = true; // if we haven't reached eof, then error.
          break;
        } finally {
          // sync under the namesystem lock so the local log stays consistent
          if (localEditLog.isOpen()) {
            localEditLog.logSyncIfNeeded();
          }
          fsNamesys.writeUnlock();
        }
      } // end inner while(running) -- all breaks come here

      // if we failed to read the entire transaction from disk,
      // then roll back to the offset where there was a last good
      // read, sleep for sometime for new transaction to
      // appear in the file and then continue;
      //
      if (error || running) {
        // discard older buffers and start a fresh one.
        inputEditStream.refresh(currentPosition, localEditLog.getLastWrittenTxId());
        setCatchingUp();

        if (error) {
          LOG.info("Ingest: Incomplete transaction record at offset " +
                   inputEditStream.getPosition() +
                   " but the file is of size " + inputEditStream.length() +
                   ". Continuing....");
        }

        if (running && !lastScan) {
          try {
            Thread.sleep(100); // brief pause (100 ms) before re-reading the log
          } catch (InterruptedException e) {
            // restore the interrupt status so callers can observe it
            Thread.currentThread().interrupt();
          }
        }
      }
    } //end outer while(running)

    ///////////////////// FINAL ACTIONS /////////////////////

    // This was the last scan of the file but we could not read a full
    // transaction from disk. If we proceed this will corrupt the image
    if (error) {
      String errorMessage = FSEditLogLoader.getErrorMessage(recentOpcodeOffsets, currentPosition);
      LOG.error(errorMessage);
      throw new IOException("Failed to read the edits log. " +
          "Incomplete transaction at " + currentPosition);
    }

    // If the last Scan was completed, then stop the Ingest thread.
    if (lastScan && quitAfterScan) {
      LOG.info("Ingest: lastScan completed. " + this.toString());
      running = false;
      if(localEditLog.isOpen()) {
        // quiesced non-finalized segment
        LOG.info("Ingest: Reached non-finalized log segment end. "+ this.toString());
        tearDown(localEditLog, false, localLogTxId != startTxId);
      }
    }
    FSEditLogLoader.dumpOpCounts(opCounts);
    return numEdits; // total transactions consumed
  }

  /**
   * Ends the current local log segment and finalizes this ingest's state.
   *
   * @param localEditLog the local edit log whose segment is being ended
   * @param writeEndTxn whether to write an end-segment transaction
   * @param updateLastCorrectTxn whether to advance the standby's last
   *        correct txid to the end of this segment
   * @throws IOException if ending the segment fails
   */
  private void tearDown(FSEditLog localEditLog,
      boolean writeEndTxn, boolean updateLastCorrectTxn) throws IOException {
    localEditLog.endCurrentLogSegment(writeEndTxn);
    endTxId = localEditLog.getLastWrittenTxId();
    running = false;
    lastScan = true;
    if (updateLastCorrectTxn) {
      standby.setLastCorrectTxId(endTxId);
    }
    standby.clearIngestState(endTxId + 1);
  }

  public String toString() {
    String endStr = (endTxId == -1) ? "in progress" : ("" + endTxId);
    return "Log segment (" + startTxId + ", " + endStr + ")";
  }

  /**
   * Decides whether a transaction read from the shared log may be applied.
   * Transactions without txids are always applied; otherwise the shared
   * txid must exactly match the next expected local txid.
   */
  private boolean canApplyTransaction(long sharedLogTxid, long localLogTxid,
      FSEditLogOp op) {
    boolean canApply = false;
    // process all transaction if log has no tx id's
    // it is up to the avatarnode to ensure we
    // do not re-consume the same edits
    if (sharedLogTxid == FSEditLogLoader.TXID_IGNORE){
      canApply = true;
    } else {
      // otherwise we can apply the transaction if the txid match
      canApply = (sharedLogTxid == localLogTxid);
    }
    if (!canApply) {
      LOG.info("Ingest: skip loading txId: " + sharedLogTxid
          + ", local txid: " + localLogTxid
          + " txn: " + op);
    }
    return canApply;
  }

  /**
   * Used for ingest recovery, where we erase the local edit log, and
   * transactions need to be populated to the local log, but they should be
   * not loaded into the namespace.
   */
  private boolean shouldLoad(long txid) {
    boolean shouldLoad = txid > standby.getLastCorrectTxId();
    if (!shouldLoad) {
      LOG.info("Ingest: skip loading txId: " + txid
          + " to namesystem, but writing to edit log, last correct txid: "
          + standby.getLastCorrectTxId());
    }
    return shouldLoad;
  }
}