/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.namenode;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentNavigableMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.zip.Checksum;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.qjournal.client.QuorumJournalManager;
import org.apache.hadoop.hdfs.server.common.HdfsConstants;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.Transition;
import org.apache.hadoop.hdfs.server.common.Storage.FormatConfirmable;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
import org.apache.hadoop.hdfs.server.common.StorageInfo;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.*;
import org.apache.hadoop.hdfs.server.namenode.JournalSet.JournalAndStream;
import org.apache.hadoop.hdfs.server.namenode.NNStorage.StorageLocationType;
import org.apache.hadoop.hdfs.server.namenode.ValidateNamespaceDirPolicy.NNStorageLocation;
import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
import org.apache.hadoop.hdfs.util.InjectionEvent;
import org.apache.hadoop.io.*;
import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.util.InjectionHandler;
import org.apache.hadoop.util.PureJavaCrc32;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.permission.*;

import com.google.common.collect.Lists;

/**
 * FSEditLog maintains a log of the namespace modifications.
 */
public class FSEditLog {

  static final Log LOG = LogFactory.getLog(FSEditLog.class);

  public static final long PURGE_ALL_TXID = Long.MAX_VALUE;
  public static String CONF_ROLL_TIMEOUT_MSEC =
      "dfs.fsedits.timeout.roll.edits.msec";

  public static int sizeFlushBuffer = HdfsConstants.DEFAULT_EDIT_BUFFER_SIZE;
  static long preallocateSize = HdfsConstants.DEFAULT_EDIT_PREALLOCATE_SIZE;
  static long maxBufferedTransactions =
      HdfsConstants.DEFAULT_MAX_BUFFERED_TRANSACTIONS;

  private final ConcurrentSkipListMap<Long, List<Long>> delayedSyncs =
      new ConcurrentSkipListMap<Long, List<Long>>();
  private Thread syncThread;
  private SyncThread syncer;
  /**
   * State machine for edit log. The log starts in UNINITIALIZED state upon
   * construction. Once it's initialized, it is usually in IN_SEGMENT state,
   * indicating that edits may be written. In the middle of a roll, or while
   * saving the namespace, it briefly enters the BETWEEN_LOG_SEGMENTS state,
   * indicating that the previous segment has been closed, but the new one has
   * not yet been opened.
   */
  protected enum State {
    UNINITIALIZED,
    BETWEEN_LOG_SEGMENTS,
    IN_SEGMENT,
    CLOSED;
  }

  protected State state = State.UNINITIALIZED; // initialize

  private JournalSet journalSet;
  private EditLogOutputStream editLogStream = null;

  // a monotonically increasing counter that represents transactionIds.
  private long txid = -1;

  // stores the last synced transactionId.
  private long synctxid = -1;

  // the first txid of the log that's currently open for writing.
  // If this value is N, we are currently writing to edits_inprogress_N
  private long curSegmentTxId = HdfsConstants.INVALID_TXID;

  // the time of printing the statistics to the log file.
  private long lastPrintTime;

  // is a sync currently running?
  private volatile boolean isSyncRunning;

  // Used to exit in the event of a failure to sync to all journals. It's a
  // member variable so it can be swapped out for testing.
  static volatile Runtime runtime = Runtime.getRuntime();

  // these are statistics counters.
  private long numTransactions;        // number of transactions
  private long numTransactionsBatchedInSync;
  private long totalTimeTransactions;  // total time for all transactions
  private NameNodeMetrics metrics;

  private NNStorage storage;
  private Configuration conf;
  private Collection<URI> editsDirs;

  private long timeoutRollEdits;

  private static ThreadLocal<Checksum> localChecksumForRead =
      new ThreadLocal<Checksum>() {
        protected Checksum initialValue() {
          return new PureJavaCrc32();
        }
      };

  private static ThreadLocal<Checksum> localChecksumForWrite =
      new ThreadLocal<Checksum>() {
        protected Checksum initialValue() {
          return new PureJavaCrc32();
        }
      };

  /** Get a thread local checksum for read */
  static Checksum getChecksumForRead() {
    return localChecksumForRead.get();
  }

  /** Get a thread local checksum for write */
  static Checksum getChecksumForWrite() {
    return localChecksumForWrite.get();
  }

  /**
   * Sets the current transaction id of the edit log. This is used when we load
   * the FSImage and FSEdits and read the last transaction id from disk, and
   * then continue logging transactions to the edit log from that id onwards.
   *
   * @param txid
   *          the last transaction id
   */
  public void setLastWrittenTxId(long txid) {
    this.txid = txid;
  }

  public void resetTxIds(long txid) throws IOException {
    this.txid = txid;
    this.synctxid = txid;
    this.curSegmentTxId = HdfsConstants.INVALID_TXID;
    this.state = State.BETWEEN_LOG_SEGMENTS;
    // Journals need to reset their committed IDs.
    journalSet.setCommittedTxId(txid, true);
  }

  private static class TransactionId {
    public long txid;

    TransactionId(long value) {
      this.txid = value;
    }
  }

  // stores the most current transactionId of this thread.
  private static final ThreadLocal<TransactionId> myTransactionId =
      new ThreadLocal<TransactionId>() {
        protected synchronized TransactionId initialValue() {
          return new TransactionId(-1L);
        }
      };
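  // How the thread-local transaction id above is used (a descriptive note,
  // not new behavior): beginTransaction() stores each new txid into
  // myTransactionId for the calling thread, and logSync() later reads it
  // back to decide whether that thread's edits are already covered by
  // synctxid or whether a flush must be performed. Illustrative flow:
  //
  //   logEdit(op);   // beginTransaction(): myTransactionId.txid = ++txid
  //   logSync();     // syncs at least up to myTransactionId.txid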
  /**
   * Constructor for FSEditLog. Underlying journals are constructed, but no
   * streams are opened until open() is called.
   *
   * @param conf The namenode configuration
   * @param storage Storage object used by namenode
   * @param editsDirs List of journals to use
   * @param locationMap contains information about shared/local/remote locations
   */
  FSEditLog(Configuration conf, FSImage image, NNStorage storage,
      Collection<URI> imageDirs, Collection<URI> editsDirs,
      Map<URI, NNStorageLocation> locationMap) {
    init(conf, image, storage, imageDirs, editsDirs, locationMap);
    timeoutRollEdits = conf.getLong(CONF_ROLL_TIMEOUT_MSEC, 0);
  }

  private void init(Configuration conf, FSImage image, NNStorage storage,
      Collection<URI> imageDirs, Collection<URI> editsDirs,
      Map<URI, NNStorageLocation> locationMap) {
    isSyncRunning = false;
    this.conf = conf;
    this.storage = storage;
    metrics = NameNode.getNameNodeMetrics();
    lastPrintTime = FSNamesystem.now();

    // If this list is empty, an error will be thrown on first use
    // of the editlog, as no journals will exist
    this.editsDirs = new ArrayList<URI>(editsDirs);

    journalSet = new JournalSet(conf, image, storage, this.editsDirs.size(),
        metrics);
    for (URI u : this.editsDirs) {
      boolean required = NNStorageConfiguration.getRequiredNamespaceEditsDirs(
          conf).contains(u);
      boolean shared = false;
      boolean remote = false;
      if (locationMap != null && locationMap.get(u) != null) {
        shared = locationMap.get(u).type == StorageLocationType.SHARED;
        remote = locationMap.get(u).type == StorageLocationType.REMOTE;
      }
      if (u.getScheme().equals(NNStorage.LOCAL_URI_SCHEME)) {
        StorageDirectory sd = storage.getStorageDirectory(u);
        if (sd != null) {
          LOG.info("Adding local file journal: " + u + ", required: "
              + required);
          // port error reporter
          journalSet.add(new FileJournalManager(sd, metrics, null), required,
              shared, remote);
        }
      } else if (u.getScheme().equals(QuorumJournalManager.QJM_URI_SCHEME)) {
        // for now, we only allow the QJM to store images
        boolean hasImageStorage = imageDirs.contains(u);
        try {
          journalSet.add(new QuorumJournalManager(conf, u, new NamespaceInfo(
              storage), metrics, hasImageStorage), required, shared, remote);
        } catch (Exception e) {
          throw new IllegalArgumentException("Unable to construct journal, "
              + u, e);
        }
      } else {
        LOG.info("Adding journal: " + u + ", required: " + required);
        journalSet.add(createJournal(conf, u, new NamespaceInfo(storage),
            metrics), required, shared, remote);
      }
    }

    if (journalSet.isEmpty()) {
      LOG.error("No edits directories configured!");
    }
    state = State.BETWEEN_LOG_SEGMENTS;
  }

  /**
   * Get the list of URIs the editlog is using for storage
   *
   * @return collection of URIs in use by the edit log
   */
  Collection<URI> getEditURIs() {
    return editsDirs;
  }
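  // Illustrative configuration (hypothetical scheme and class name): a
  // non-local, non-QJM journal URI such as foo://host/path is resolved to
  // an implementation through the plugin key consulted by getJournalClass():
  //
  //   <property>
  //     <name>dfs.name.edits.journal-plugin.foo</name>
  //     <value>com.example.FooJournalManager</value>
  //   </property>
  //
  // "foo" and com.example.FooJournalManager are made-up names used only to
  // show the shape of the configuration.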
  /**
   * Create empty edit log files.
   * Initialize the output stream for logging.
   *
   * @throws IOException
   */
  synchronized void open() throws IOException {
    if (syncer == null) {
      syncer = new SyncThread();
      syncThread = new Thread(syncer);
      syncThread.start();
    }
    if (state != State.BETWEEN_LOG_SEGMENTS)
      throw new IOException("Bad state: " + state);

    startLogSegment(getLastWrittenTxId() + 1, true);
    if (state != State.IN_SEGMENT)
      throw new IOException("Bad state: " + state);
  }

  synchronized boolean isOpen() {
    return state == State.IN_SEGMENT;
  }

  public synchronized void close() throws IOException {
    if (state == State.CLOSED) {
      LOG.info("Closing log when already closed");
      return;
    }
    if (state == State.IN_SEGMENT) {
      assert editLogStream != null;
      waitForSyncToFinish();
      endCurrentLogSegment(true && InjectionHandler
          .trueCondition(InjectionEvent.FSEDIT_LOG_WRITE_END_LOG_SEGMENT));
    }

    if (syncThread != null) {
      syncer.stop();
      syncThread.interrupt();
    }

    try {
      journalSet.close();
    } catch (IOException ioe) {
      LOG.warn("Error closing journalSet", ioe);
    }
    state = State.CLOSED;
  }

  synchronized void transitionNonFileJournals(StorageInfo nsInfo,
      boolean checkEmpty, Transition transition, StartupOption startOpt)
      throws IOException {
    if (Transition.FORMAT == transition
        && state != State.BETWEEN_LOG_SEGMENTS) {
      throw new IOException("Bad state: " + state);
    }
    journalSet.transitionNonFileJournals(nsInfo, checkEmpty, transition,
        startOpt);
  }

  synchronized List<JournalManager> getNonFileJournalManagers() {
    return journalSet.getNonFileJournalManagers();
  }

  synchronized List<FormatConfirmable> getFormatConfirmables()
      throws IOException {
    if (state != State.BETWEEN_LOG_SEGMENTS) {
      throw new IOException("Bad state: " + state);
    }

    List<FormatConfirmable> ret = Lists.newArrayList();
    for (final JournalManager jm : journalSet.getJournalManagers()) {
      // The FJMs are confirmed separately since they are also
      // StorageDirectories
      if (!(jm instanceof FileJournalManager)) {
        ret.add(jm);
      }
    }
    return ret;
  }

  void logEdit(final FSEditLogOp op) {
    synchronized (this) {
      assert state != State.CLOSED;

      // this will increase txid
      long start = beginTransaction();
      op.setTransactionId(txid);

      try {
        if (editLogStream != null) {
          // if stream is null it will be handled in sync
          editLogStream.write(op);
        }
      } catch (IOException ex) {
        LOG.fatal("Could not write to required number of streams", ex);
        runtime.exit(1);
      }
      endTransaction(start);
      // check if it is time to schedule an automatic sync
    }
  }

  /**
   * Check if should automatically sync buffered edits to persistent store
   *
   * @return true if any of the edit stream says that it should sync
   */
  private boolean shouldForceSync() {
    // if editLogStream is null, just fast fail
    return editLogStream == null ? true : editLogStream.shouldForceSync();
  }

  private long beginTransaction() {
    assert Thread.holdsLock(this);
    // get a new transactionId
    txid++;

    //
    // record the transactionId when new data was written to the edits log
    //
    TransactionId id = myTransactionId.get();
    id.txid = txid;
    // obtain time in nanoseconds
    // endTransaction will compute time in microseconds
    return System.nanoTime();
  }

  private void endTransaction(long start) {
    assert Thread.holdsLock(this);

    // update statistics
    numTransactions++;
    long txnTime = DFSUtil.getElapsedTimeMicroSeconds(start);
    totalTimeTransactions += txnTime;
    if (metrics != null) {
      // Metrics is non-null only when used inside name node
      metrics.transactions.inc(txnTime);
      metrics.numBufferedTransactions.set((int) (txid - synctxid));
      metrics.currentTxnId.set(txid);
    }
  }
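  // Typical caller pattern (a sketch; both calls are methods of this class):
  // a namespace mutation is first buffered under the FSEditLog lock, and the
  // calling thread then forces its own edits to persistent storage:
  //
  //   editLog.logMkDir(path, inode);  // buffered, assigned a txid
  //   editLog.logSync();              // blocks until that txid is durable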
  /**
   * Blocks until all ongoing edits have been synced to disk. This differs from
   * logSync in that it waits for edits that have been written by other threads,
   * not just edits from the calling thread.
   *
   * NOTE: this should be done while holding the FSNamesystem lock, or else more
   * operations can start writing while this is in progress.
   */
  public void logSyncAll() throws IOException {
    // Record the most recent transaction ID as our own id
    synchronized (this) {
      TransactionId id = myTransactionId.get();
      id.txid = txid;
    }
    // Then make sure we're synced up to this point
    logSync();
  }

  /**
   * If there are too many transactions that are yet to be synced, sync them.
   * Otherwise, the in-memory buffer that keeps the transactions would grow
   * very big. This can happen when there are a large number of listStatus
   * calls which update the access time of files.
   */
  public void logSyncIfNeeded() {
    boolean doSync = false;
    synchronized (this) {
      if (txid > synctxid + maxBufferedTransactions) {
        FSNamesystem.LOG.info("Out of band log sync triggered because there"
            + " are " + (txid - synctxid) + " buffered transactions, which"
            + " is more than the configured limit of "
            + maxBufferedTransactions);
        doSync = true;
      }
      if (shouldForceSync()) {
        FSNamesystem.LOG.info("Log sync triggered by the output stream");
        doSync = true;
      }
    }
    if (doSync) {
      logSync();
    }
  }

  public void logSync() {
    logSync(true);
  }
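  // Example of the logSyncIfNeeded() threshold (illustrative numbers only):
  // if maxBufferedTransactions is 1000, synctxid is 5000, and txid has
  // reached 6001, the 1001 buffered transactions exceed the limit and an
  // out-of-band sync is triggered even though no caller asked for one.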
  /**
   * Sync all modifications done by this thread.
   *
   * The internal concurrency design of this class is as follows:
   *   - Log items are written synchronized into an in-memory buffer,
   *     and each assigned a transaction ID.
   *   - When a thread (client) would like to sync all of its edits, logSync()
   *     uses a ThreadLocal transaction ID to determine what edit number must
   *     be synced to.
   *   - The isSyncRunning volatile boolean tracks whether a sync is currently
   *     under progress.
   *
   * The data is double-buffered within each edit log implementation so that
   * in-memory writing can occur in parallel with the on-disk writing.
   *
   * Each sync occurs in three steps:
   *   1. synchronized, it swaps the double buffer and sets the isSyncRunning
   *      flag.
   *   2. unsynchronized, it flushes the data to storage
   *   3. synchronized, it resets the flag and notifies anyone waiting on the
   *      sync.
   *
   * The lack of synchronization on step 2 allows other threads to continue
   * to write into the memory buffer while the sync is in progress. Because
   * this step is unsynchronized, actions that need to avoid concurrency with
   * sync() should be synchronized and also call waitForSyncToFinish() before
   * assuming they are running alone.
   */
  public void logSync(boolean doWait) {
    long syncStart = 0;
    boolean thisThreadSuccess = false;
    boolean thisThreadSyncing = false;
    EditLogOutputStream logStream = null;
    try {
      synchronized (this) {
        long mytxid = myTransactionId.get().txid;
        myTransactionId.get().txid = -1L;
        if (mytxid == -1) {
          mytxid = txid;
        }
        printStatistics(false);

        // if somebody is already syncing, then wait
        while (mytxid > synctxid && isSyncRunning) {
          if (!doWait) {
            long delayedId = Server.delayResponse();
            List<Long> responses = delayedSyncs.get(mytxid);
            if (responses == null) {
              responses = new LinkedList<Long>();
              delayedSyncs.put(mytxid, responses);
            }
            responses.add(delayedId);
            return;
          }
          try {
            wait(1000);
          } catch (InterruptedException ie) {
          }
        }

        //
        // If this transaction was already flushed, then nothing to do
        //
        if (mytxid <= synctxid) {
          numTransactionsBatchedInSync++;
          if (metrics != null) {
            // Metrics is non-null only when used inside name node
            metrics.transactionsBatchedInSync.inc();
          }
          return;
        }

        // now, this thread will do the sync
        syncStart = txid;
        isSyncRunning = true;
        thisThreadSyncing = true;

        // swap buffers
        try {
          if (journalSet.isEmpty()) {
            throw new IOException(
                "No journals available to flush, journalset is empty");
          }
          if (editLogStream == null) {
            throw new IOException(
                "No journals available to flush, editlogstream is null");
          }
          editLogStream.setReadyToFlush();
        } catch (IOException e) {
          LOG.fatal("Could not sync enough journals to persistent storage. "
              + "Unsynced transactions: " + (txid - synctxid),
              new Exception(e));
          runtime.exit(1);
        }
        // editLogStream may become null,
        // so store a local variable for flush.
        logStream = editLogStream;
      }

      // do the sync
      sync(logStream, syncStart);
      thisThreadSuccess = true;
    } finally {
      synchronized (this) {
        if (thisThreadSyncing) {
          if (thisThreadSuccess) {
            // only set this if the sync succeeded
            synctxid = syncStart;
          }
          // if this thread was syncing, clear isSyncRunning
          isSyncRunning = false;
        }
        this.notifyAll();
      }
    }
    endDelay(syncStart);
  }
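  /*
   * Minimal sketch of the three-step protocol documented above (illustrative
   * only; the real double buffer lives inside the EditLogOutputStream
   * implementations, not in this class):
   *
   *   synchronized (this) {              // step 1: under the lock
   *     editLogStream.setReadyToFlush(); //   swap write/flush buffers
   *     isSyncRunning = true;
   *   }
   *   logStream.flush();                 // step 2: outside the lock
   *   synchronized (this) {              // step 3: publish the result
   *     synctxid = syncStart;
   *     isSyncRunning = false;
   *     notifyAll();                     //   wake any waiting syncers
   *   }
   */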
" + "Unsynced transactions: " + (txid - synctxid), new Exception()); runtime.exit(1); } } long elapsed = DFSUtil.getElapsedTimeMicroSeconds(start); if (metrics != null) // Metrics is non-null only when used inside name node metrics.syncs.inc(elapsed); } private void endDelay(long synced) { ConcurrentNavigableMap<Long, List<Long>> syncs = delayedSyncs.headMap( synced, true); for (Iterator<List<Long>> iter = syncs.values().iterator(); iter.hasNext();) { List<Long> responses = iter.next(); for (Long responseId : responses) { try { Server.sendDelayedResponse(responseId); } catch (IOException ex) { } } iter.remove(); } } private class SyncThread implements Runnable { private volatile boolean isRunning = true; public void stop() { isRunning = false; } @Override public void run() { try { long syncStart = 0; while (isRunning) { synchronized (FSEditLog.this) { while (isSyncRunning || (isRunning && delayedSyncs.size() == 0)) { try { FSEditLog.this.wait(); } catch (InterruptedException iex) { } } if (!isRunning) { // Shutting down the edits log return; } // There are delayed transactions waiting to be synced and // nobody to sync them syncStart = txid; isSyncRunning = true; try { if (journalSet.isEmpty()) { throw new IOException( "No journals available to flush, journalset is empty"); } if (editLogStream == null) { throw new IOException( "No journals available to flush, editlogstream is null"); } editLogStream.flush(); } catch (IOException ex) { synchronized (this) { LOG.fatal( "Could not sync enough journals to persistent storage. " + "Unsynced transactions: " + (txid - synctxid), new Exception()); runtime.exit(1); } } } sync(editLogStream, syncStart); synchronized (FSEditLog.this) { synctxid = syncStart; isSyncRunning = false; FSEditLog.this.notifyAll(); } endDelay(syncStart); } } catch (Throwable t) { FSNamesystem.LOG.fatal("SyncThread received Runtime exception: ", t); Runtime.getRuntime().exit(-1); } } public String toString() { return "SyncThread"; } } protected int checkJournals() throws IOException { return journalSet.checkJournals(""); } protected void updateNamespaceInfo(StorageInfo si) throws IOException { journalSet.updateNamespaceInfo(si); } // // print statistics every 1 minute. // private void printStatistics(boolean force) { long now = FSNamesystem.now(); if (lastPrintTime + 60000 > now && !force) { return; } lastPrintTime = now; StringBuilder buf = new StringBuilder(); buf.append("Number of transactions: "); buf.append(numTransactions); buf.append(" Number of transactions batched in Syncs: "); buf.append(numTransactionsBatchedInSync); buf.append(" Number of syncs: "); buf.append(editLogStream != null ? editLogStream.getNumSync() : "null"); buf.append(" Total time for writing transactions (us): "); buf.append(totalTimeTransactions); buf.append(" Journal sync times (us): "); buf.append(journalSet.getSyncTimes()); FSNamesystem.LOG.info(buf); } /** * Add open lease record to edit log. * Records the block locations of the last block. */ public void logOpenFile(String path, INodeFileUnderConstruction newNode) throws IOException { AddOp op = AddOp.getInstance(); op.set(newNode.getId(), path, newNode.getReplication(), newNode.getModificationTime(), newNode.getAccessTime(), newNode.getPreferredBlockSize(), newNode.getBlocks(), newNode.getPermissionStatus(), newNode.getClientName(), newNode.getClientMachine()); logEdit(op); } /** * Add close lease record to edit log. 
  /**
   * Add close lease record to edit log.
   */
  public void logCloseFile(String path, INodeFile newNode) {
    CloseOp op = CloseOp.getInstance();
    op.set(newNode.getId(), path, newNode.getReplication(),
        newNode.getModificationTime(), newNode.getAccessTime(),
        newNode.getPreferredBlockSize(), newNode.getBlocks(),
        newNode.getPermissionStatus(), null, null);
    logEdit(op);
  }

  /**
   * Add append file record to the edit log.
   */
  public void logAppendFile(String path, INodeFileUnderConstruction newNode)
      throws IOException {
    AppendOp op = AppendOp.getInstance();
    op.set(path, newNode.getBlocks(), newNode.getClientName(),
        newNode.getClientMachine());
    logEdit(op);
  }

  /**
   * Add create directory record to edit log
   */
  public void logMkDir(String path, INode newNode) {
    MkdirOp op = MkdirOp.getInstance();
    op.set(newNode.getId(), path, newNode.getModificationTime(),
        newNode.getPermissionStatus());
    logEdit(op);
  }

  /**
   * Add hardlink record to edit log
   */
  public void logHardLink(String src, String dst, long timestamp) {
    HardLinkOp op = HardLinkOp.getInstance();
    op.set(src, dst, timestamp);
    logEdit(op);
  }

  /**
   * Add rename record to edit log
   */
  public void logRename(String src, String dst, long timestamp) {
    RenameOp op = RenameOp.getInstance();
    op.set(src, dst, timestamp);
    logEdit(op);
  }

  /**
   * Add raidFile record to edit log
   */
  public void logRaidFile(String src, String codecId, short expectedSourceRepl,
      long timestamp) {
    // TODO
  }

  /**
   * Add set replication record to edit log
   */
  public void logSetReplication(String src, short replication) {
    SetReplicationOp op = SetReplicationOp.getInstance();
    op.set(src, replication);
    logEdit(op);
  }

  /**
   * Add set namespace quota record to edit log
   *
   * @param src the string representation of the path to a directory
   * @param nsQuota the namespace quota
   * @param dsQuota the diskspace quota
   */
  public void logSetQuota(String src, long nsQuota, long dsQuota) {
    SetQuotaOp op = SetQuotaOp.getInstance();
    op.set(src, nsQuota, dsQuota);
    logEdit(op);
  }

  /** Add set permissions record to edit log */
  public void logSetPermissions(String src, FsPermission permissions) {
    SetPermissionsOp op = SetPermissionsOp.getInstance();
    op.set(src, permissions);
    logEdit(op);
  }

  /** Add set owner record to edit log */
  public void logSetOwner(String src, String username, String groupname) {
    SetOwnerOp op = SetOwnerOp.getInstance();
    op.set(src, username, groupname);
    logEdit(op);
  }

  /**
   * concat(trg,src..) log
   */
  public void logConcat(String trg, String[] srcs, long timestamp) {
    ConcatDeleteOp op = ConcatDeleteOp.getInstance();
    op.set(trg, srcs, timestamp);
    logEdit(op);
  }
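  // All of the log* record methods in this class follow the same pattern:
  // obtain a pooled op instance via XxxOp.getInstance(), populate it with
  // set(...), and pass it to logEdit(), which assigns the transaction id
  // and buffers the serialized record.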
  /**
   * Merge(parity, source, ...) log.
   * Used for converting old raided files into the new format by merging
   * the parity file and the source file together into one file.
   */
  public void logMerge(String parity, String source, String codecId,
      int[] checksums, long timestamp) {
    MergeOp op = MergeOp.getInstance();
    op.set(parity, source, codecId, checksums, timestamp);
    logEdit(op);
  }

  /**
   * Add delete file record to edit log
   */
  public void logDelete(String src, long timestamp) {
    DeleteOp op = DeleteOp.getInstance();
    op.set(src, timestamp);
    logEdit(op);
  }

  /**
   * Add generation stamp record to edit log
   */
  public void logGenerationStamp(long genstamp) {
    SetGenstampOp op = SetGenstampOp.getInstance();
    op.set(genstamp);
    logEdit(op);
  }

  /**
   * Add access time record to edit log
   */
  public void logTimes(String src, long mtime, long atime) {
    TimesOp op = TimesOp.getInstance();
    op.set(src, mtime, atime);
    logEdit(op);
  }

  /**
   * Get all journal streams
   */
  public List<JournalAndStream> getJournals() {
    return journalSet.getAllJournalStreams();
  }

  /**
   * Used only by unit tests.
   */
  public static synchronized void setRuntimeForTesting(Runtime rt) {
    runtime = rt;
  }

  /**
   * Return a manifest of what finalized edit logs are available
   */
  public synchronized RemoteEditLogManifest getEditLogManifest(long fromTxId)
      throws IOException {
    return journalSet.getEditLogManifest(fromTxId);
  }

  /**
   * Finalizes the current edit log and opens a new log segment.
   *
   * @return the transaction id of the BEGIN_LOG_SEGMENT transaction
   *         in the new log.
   */
  synchronized long rollEditLog() throws IOException {
    LOG.info("Rolling edit logs.");
    long start = System.nanoTime();

    endCurrentLogSegment(true);

    long nextTxId = getLastWrittenTxId() + 1;
    startLogSegment(nextTxId, true);

    assert curSegmentTxId == nextTxId;

    long rollTime = DFSUtil.getElapsedTimeMicroSeconds(start);
    if (metrics != null) {
      metrics.rollEditLogTime.inc(rollTime);
      metrics.tsLastEditsRoll.set(System.currentTimeMillis());
    }
    return nextTxId;
  }
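  // On-disk effect of a roll (a sketch of the naming convention implied by
  // the curSegmentTxId comment above): if the open segment started at txid N
  // and the last written txid is M, endCurrentLogSegment() finalizes
  // edits_inprogress_N into a finalized segment covering txids N through M,
  // and startLogSegment() opens edits_inprogress_(M+1).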
  /**
   * Start writing to the log segment with the given txid.
   * Transitions from BETWEEN_LOG_SEGMENTS state to IN_SEGMENT state.
   */
  synchronized void startLogSegment(final long segmentTxId,
      boolean writeHeaderTxn) throws IOException {
    LOG.info("Starting log segment at " + segmentTxId);
    if (segmentTxId < 0) {
      throw new IOException("Bad txid: " + segmentTxId);
    }
    if (state != State.BETWEEN_LOG_SEGMENTS) {
      throw new IOException("Bad state: " + state);
    }
    if (segmentTxId <= curSegmentTxId) {
      throw new IOException("Cannot start writing to log segment "
          + segmentTxId + " when previous log segment started at "
          + curSegmentTxId);
    }
    if (segmentTxId != txid + 1) {
      throw new IOException("Cannot start log segment at txid " + segmentTxId
          + " when next expected " + (txid + 1));
    }

    numTransactions = totalTimeTransactions = numTransactionsBatchedInSync = 0;

    // TODO no need to link this back to storage anymore!
    // See HDFS-2174.
    storage.attemptRestoreRemovedStorage();

    try {
      editLogStream = journalSet.startLogSegment(segmentTxId);
    } catch (IOException ex) {
      throw new IOException("Unable to start log segment " + segmentTxId
          + ": no journals successfully started.");
    }
    curSegmentTxId = segmentTxId;
    state = State.IN_SEGMENT;

    if (writeHeaderTxn) {
      logEdit(LogSegmentOp.getInstance(FSEditLogOpCodes.OP_START_LOG_SEGMENT));
      logSync();
    }

    // force update of journal and image metrics
    journalSet.updateJournalMetrics();

    // If it is configured, we want to schedule an automatic edits roll
    if (timeoutRollEdits > 0) {
      FSNamesystem fsn = this.journalSet.getImage().getFSNamesystem();
      if (fsn != null) {
        // In some test cases fsn is NULL in images. Simply skip the feature.
        AutomaticEditsRoller aer = fsn.automaticEditsRoller;
        if (aer != null) {
          aer.setNextRollTime(System.currentTimeMillis() + timeoutRollEdits);
        } else {
          LOG.warn("Automatic edits roll is enabled but the roller thread "
              + "is not enabled. Should only happen in unit tests.");
        }
      } else {
        LOG.warn("FSNamesystem is NULL in FSEditLog.");
      }
    }
  }

  /**
   * Finalize the current log segment. Transitions from IN_SEGMENT state to
   * BETWEEN_LOG_SEGMENTS state.
   */
  synchronized void endCurrentLogSegment(boolean writeEndTxn)
      throws IOException {
    LOG.info("Ending log segment " + curSegmentTxId);
    if (state != State.IN_SEGMENT) {
      throw new IllegalStateException("Bad state: " + state);
    }
    waitForSyncToFinish();

    if (writeEndTxn) {
      logEdit(LogSegmentOp.getInstance(FSEditLogOpCodes.OP_END_LOG_SEGMENT));
    }
    logSyncAll();

    printStatistics(true);

    final long lastTxId = getLastWrittenTxId();

    try {
      journalSet.finalizeLogSegment(curSegmentTxId, lastTxId);
      editLogStream = null;
    } catch (IOException e) {
      // All journals have failed, it will be handled in logSync.
      FSNamesystem.LOG.info("Cannot finalize log segment: " + e.toString());
    }

    state = State.BETWEEN_LOG_SEGMENTS;
  }

  /**
   * Archive any log files that are older than the given txid.
   */
  public void purgeLogsOlderThan(final long minTxIdToKeep) {
    synchronized (this) {
      // synchronized to prevent findbugs warning about inconsistent
      // synchronization. This will be JIT-ed out if asserts are off.
      assert curSegmentTxId == HdfsConstants.INVALID_TXID // on format this is no-op
          || minTxIdToKeep <= curSegmentTxId :
          "cannot purge logs older than txid " + minTxIdToKeep
          + " when current segment starts at " + curSegmentTxId;

      try {
        journalSet.purgeLogsOlderThan(minTxIdToKeep);
      } catch (IOException ex) {
        // All journals have failed, it will be handled in logSync.
      }
    }
  }

  /**
   * The actual sync activity happens while not synchronized on this object.
   * Thus, synchronized activities that require that they are not concurrent
   * with file operations should wait for any running sync to finish.
   */
  synchronized void waitForSyncToFinish() {
    while (isSyncRunning) {
      try {
        wait(1000);
      } catch (InterruptedException ie) {
      }
    }
  }

  /**
   * Return the txid of the last synced transaction. For test use only.
   */
  synchronized long getSyncTxId() {
    return synctxid;
  }

  /**
   * Run recovery on all journals to recover any unclosed segments
   */
  void recoverUnclosedStreams() {
    try {
      journalSet.recoverUnfinalizedSegments();
    } catch (IOException ex) {
      // All journals have failed, it is handled in logSync.
    }
  }
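  // Startup ordering note: recoverUnclosedStreams() must run before
  // selectInputStreams() below, since the latter refuses to proceed while
  // any journal still reports unfinalized segments.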
  /**
   * Select a list of input streams to load.
   *
   * @param streams the collection that the selected streams are added to
   * @param fromTxId first transaction in the selected streams
   * @param toAtLeastTxId the selected streams must contain this transaction
   * @param minRedundancy the minimum number of journals that must supply
   *        each transaction
   *
   * @return true if the redundancy requirement was not met
   */
  public synchronized boolean selectInputStreams(
      Collection<EditLogInputStream> streams, long fromTxId,
      long toAtLeastTxId, int minRedundancy) throws IOException {
    // at this point we should not have any non-finalized segments
    // this function is called at startup, and must be invoked after
    // recovering all in progress segments
    if (journalSet.hasUnfinalizedSegments(fromTxId)) {
      LOG.fatal("All streams should be finalized");
      throw new IOException("All streams should be finalized at startup");
    }
    // get all finalized streams
    boolean redundancyViolated = journalSet.selectInputStreams(streams,
        fromTxId, false, false, minRedundancy);
    try {
      checkForGaps(streams, fromTxId, toAtLeastTxId, true);
    } catch (IOException e) {
      closeAllStreams(streams);
      throw e;
    }
    return redundancyViolated;
  }

  /**
   * Check for gaps in the edit log input stream list.
   * Note: we're assuming that the list is sorted and that txid ranges don't
   * overlap. This could be done better and with more generality with an
   * interval tree.
   */
  private void checkForGaps(Collection<EditLogInputStream> streams,
      long fromTxId, long toAtLeastTxId, boolean inProgressOk)
      throws IOException {
    Iterator<EditLogInputStream> iter = streams.iterator();
    long txId = fromTxId;
    while (true) {
      if (txId > toAtLeastTxId) {
        return;
      }
      if (!iter.hasNext()) {
        break;
      }
      EditLogInputStream elis = iter.next();
      if (elis.getFirstTxId() > txId) {
        break;
      }
      long next = elis.getLastTxId();
      if (next == HdfsConstants.INVALID_TXID) {
        if (!inProgressOk) {
          throw new RuntimeException("inProgressOk = false, but "
              + "selectInputStreams returned an in-progress edit "
              + "log input stream (" + elis + ")");
        }
        // We don't know where the in-progress stream ends.
        // It could certainly go all the way up to toAtLeastTxId.
        return;
      }
      txId = next + 1;
    }
    throw new IOException(String.format("Gap in transactions. Expected to "
        + "be able to read up until at least txid %d but unable to find any "
        + "edit logs containing txid %d", toAtLeastTxId, txId));
  }

  /**
   * Close all the streams in a collection
   *
   * @param streams The list of streams to close
   */
  static void closeAllStreams(Iterable<EditLogInputStream> streams) {
    for (EditLogInputStream s : streams) {
      IOUtils.closeStream(s);
    }
  }

  /**
   * Retrieve the implementation class for a Journal scheme.
   *
   * @param conf The configuration to retrieve the information from
   * @param uriScheme The uri scheme to look up.
   * @return the class of the journal implementation
   * @throws IllegalArgumentException if no class is configured for uri
   */
  static Class<? extends JournalManager> getJournalClass(Configuration conf,
      String uriScheme) {
    String key = "dfs.name.edits.journal-plugin" + "." + uriScheme;
    Class<? extends JournalManager> clazz = null;
    try {
      clazz = conf.getClass(key, null, JournalManager.class);
    } catch (RuntimeException re) {
      throw new IllegalArgumentException("Invalid class specified for "
          + uriScheme, re);
    }

    if (clazz == null) {
      LOG.warn("No class configured for " + uriScheme + ", " + key
          + " is empty");
      throw new IllegalArgumentException("No class configured for "
          + uriScheme);
    }
    return clazz;
  }
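  // Example use (hypothetical URI, matching the plugin configuration shown
  // near init()): a journal manager can be constructed reflectively via
  // createJournal() below, which requires the implementation to expose a
  // (Configuration, URI, NamespaceInfo, NameNodeMetrics) constructor:
  //
  //   JournalManager jm = FSEditLog.createJournal(
  //       conf, URI.create("foo://host/path"), nsInfo, metrics);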
  /**
   * Construct a custom journal manager.
   * The class to construct is taken from the configuration.
   *
   * @param uri Uri to construct
   * @return The constructed journal manager
   * @throws IllegalArgumentException if no class is configured for uri
   */
  public static JournalManager createJournal(Configuration conf, URI uri,
      NamespaceInfo nsInfo, NameNodeMetrics metrics) {
    Class<? extends JournalManager> clazz = getJournalClass(conf,
        uri.getScheme());

    try {
      Constructor<? extends JournalManager> cons = clazz.getConstructor(
          Configuration.class, URI.class, NamespaceInfo.class,
          NameNodeMetrics.class);
      return cons.newInstance(conf, uri, nsInfo, metrics);
    } catch (Exception e) {
      throw new IllegalArgumentException("Unable to construct journal, "
          + uri, e);
    }
  }

  // sets the initial capacity of the flush buffer.
  static void setBufferCapacity(int size) {
    sizeFlushBuffer = size;
  }

  // sets the maximum number of transactions to be buffered in memory.
  static void setMaxBufferedTransactions(int num) {
    maxBufferedTransactions = num;
  }

  // sets the preallocate trigger of the edits log.
  static void setPreallocateSize(long size) {
    preallocateSize = size;
  }

  /**
   * Return the transaction ID for the transaction that was written last.
   */
  synchronized long getLastWrittenTxId() {
    return txid;
  }

  public synchronized long getCurrentTxId() {
    return txid + 1;
  }

  synchronized long getLastSyncedTxId() {
    return synctxid;
  }

  /**
   * @return the first transaction ID in the current log segment
   */
  public synchronized long getCurSegmentTxId() {
    assert state == State.IN_SEGMENT : "Bad state: " + state;
    return curSegmentTxId;
  }

  /**
   * Get number of journals available.
   */
  public int getNumberOfAvailableJournals() throws IOException {
    return checkJournals();
  }

  /**
   * Get number of journals (enabled and disabled).
   */
  public int getNumberOfJournals() throws IOException {
    return journalSet.getNumberOfJournals();
  }

  /**
   * Check if the shared journal is available.
   */
  public boolean isSharedJournalAvailable() throws IOException {
    return journalSet.isSharedJournalAvailable();
  }

  public void setTimeoutRollEdits(long timeoutRollEdits) {
    this.timeoutRollEdits = timeoutRollEdits;
  }

  /**
   * A class to read in blocks stored in the old format. The only two
   * fields in the block were blockid and length.
   */
  static class BlockTwo implements Writable {
    long blkid;
    long len;

    static { // register a ctor
      WritableFactories.setFactory(BlockTwo.class, new WritableFactory() {
        public Writable newInstance() {
          return new BlockTwo();
        }
      });
    }

    BlockTwo() {
      blkid = 0;
      len = 0;
    }

    /////////////////////////////////////
    // Writable
    /////////////////////////////////////
    public void write(DataOutput out) throws IOException {
      out.writeLong(blkid);
      out.writeLong(len);
    }

    public void readFields(DataInput in) throws IOException {
      this.blkid = in.readLong();
      this.len = in.readLong();
    }
  }
}
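// Illustrative round trip for the legacy BlockTwo record above (a sketch;
// stream construction is elided and the variable names are hypothetical):
//
//   FSEditLog.BlockTwo b = new FSEditLog.BlockTwo();
//   b.write(dataOut);      // writes blkid then len, as two longs
//
//   FSEditLog.BlockTwo c = new FSEditLog.BlockTwo();
//   c.readFields(dataIn);  // reads them back in the same order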