/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.namenode.bookkeeper;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Lists;
import org.apache.bookkeeper.client.BKException;
import org.apache.bookkeeper.client.BookKeeper;
import org.apache.bookkeeper.client.LedgerHandle;
import org.apache.bookkeeper.conf.ClientConfiguration;
import org.apache.bookkeeper.util.ZkUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.server.common.HdfsConstants;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.Transition;
import org.apache.hadoop.hdfs.server.common.StorageInfo;
import org.apache.hadoop.hdfs.server.namenode.EditLogInputStream;
import org.apache.hadoop.hdfs.server.namenode.EditLogOutputStream;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader;
import org.apache.hadoop.hdfs.server.namenode.JournalManager;
import org.apache.hadoop.hdfs.server.namenode.RemoteStorageState;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.metadata.BookKeeperJournalMetadataManager;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.metadata.CurrentInProgressMetadata;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.metadata.EditLogLedgerMetadata;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.metadata.MaxTxId;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.metadata.Versioned;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.metadata.proto.FormatInfoWritable;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.metadata.proto.WritableUtil;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.zk.BasicZooKeeper;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.zk.ConnectionWatcher;
import org.apache.hadoop.hdfs.server.namenode.bookkeeper.zk.RecoveringZooKeeper;
import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
import org.apache.hadoop.hdfs.util.InjectionEvent;
import org.apache.hadoop.util.InjectionHandler;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.Code;
import org.apache.zookeeper.ZooDefs.Ids;
import org.apache.zookeeper.ZooKeeper;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import static org.apache.hadoop.hdfs.server.namenode.bookkeeper.BookKeeperJournalConfigKeys.*;
import static org.apache.hadoop.hdfs.server.namenode.bookkeeper.zk.ZkUtil.*;
import static org.apache.zookeeper.AsyncCallback.*;
/**
* BookKeeper-based JournalManager implementation. This is inspired by
* Apache's BookKeeperJournalManager, with several core differences:
* interaction with ZooKeeper goes through {@link RecoveringZooKeeper},
* a custom {@link BookKeeperEditLogInputStream} implementation is used that
* permits tailing in-progress edits and re-positioning within a ledger-based
* output stream, and a custom {@link BookKeeperEditLogOutputStream} is used
* that uses a double buffer, as used by the standard file journal manager
* implementation.
*/
public class BookKeeperJournalManager implements JournalManager, LedgerHandleProvider {
// Class-wide logger.
private static final Log LOG =
LogFactory.getLog(BookKeeperJournalManager.class);
// Version of the protocol used for serializing and de-serializing data in
// znodes (i.e., the Writables)
static final int PROTO_VERSION = -1;
private final Configuration conf; // Configuration
private final int quorumSize; // BookKeeper quorum size
private final int ensembleSize; // BookKeeper cluster size
private final BookKeeper bookKeeperClient; // BookKeeper client
// ZooKeeper wrapper built in the constructor with the configured retry
// count and retry interval; used for znode access by this class.
private final RecoveringZooKeeper zk;
private final String digestPw; // BookKeeper digest password
@VisibleForTesting
protected final String zkParentPath; // Parent ZNode
@VisibleForTesting
protected final String formatInfoPath; // ZNode holding format/namespace information
// Handles ledger metadata
final BookKeeperJournalMetadataManager metadataManager;
private final MaxTxId maxTxId; // stores max txid
// Tracks the znode path of the segment currently being written; refreshed
// by checkEnv(), updated by startLogSegment() and cleared by
// finalizeLogSegment().
private final CurrentInProgressMetadata currentInProgressMetadata;
// Set to true by checkEnv() once the format information has been read and
// its protocol version verified.
private boolean initialized = false;
private LedgerHandle currentInProgressLedger = null; // Current ledger
// Metadata path of the segment this instance is writing: set by
// startLogSegment(), reset to null by finalizeLogSegment(), and consulted by
// recoverUnfinalizedSegments() so the live segment is not recovered.
@VisibleForTesting
volatile String currentInProgressPath;
// Metrics object handed to every output stream created by startLogSegment().
private volatile NameNodeMetrics metrics = null;
// Highest transaction id observed by findMaxTransaction(); starts at -1.
private long maxSeenTxId = -1;
// Per-thread FormatInfoWritable instance reused by checkEnv() and
// createZkMetadataIfNotExists() when reading/writing the format znode.
private static final ThreadLocal<FormatInfoWritable>
localFormatInfoWritable = new ThreadLocal<FormatInfoWritable>() {
@Override
protected FormatInfoWritable initialValue() {
return new FormatInfoWritable();
}
};
/**
 * Connects to ZooKeeper and BookKeeper and makes sure the journal's
 * ZooKeeper metadata exists.
 *
 * The ZooKeeper connect string is taken from the authority part of
 * {@code uri} (with ';' translated to ','), and the parent znode from its
 * path. If connecting to ZooKeeper, preparing the bookie registration path
 * or creating the BookKeeper client fails, the ZooKeeper handle opened here
 * is closed before the exception is propagated so it is not leaked.
 *
 * @param conf    configuration supplying the BKJM_* settings
 * @param uri     journal URI: authority = ZooKeeper quorum, path = parent
 *                znode
 * @param nsInfo  format information written to ZooKeeper if no metadata
 *                exists yet
 * @param metrics metrics object passed on to output streams
 * @throws IOException if ZooKeeper cannot be reached within twice the
 *                     session timeout, or on any ZooKeeper/BookKeeper error
 *                     during setup
 */
public BookKeeperJournalManager(Configuration conf, URI uri,
    NamespaceInfo nsInfo, NameNodeMetrics metrics)
    throws IOException {
  this.conf = conf;
  this.metrics = metrics;
  quorumSize = conf.getInt(BKJM_BOOKKEEPER_QUORUM_SIZE,
      BKJM_BOOKKEEPER_QUORUM_SIZE_DEFAULT);
  ensembleSize = conf.getInt(BKJM_BOOKKEEPER_ENSEMBLE_SIZE,
      BKJM_BOOKKEEPER_ENSEMBLE_SIZE_DEFAULT);
  digestPw = conf.get(BKJM_BOOKKEEPER_DIGEST_PW,
      BKJM_BOOKKEEPER_DIGEST_PW_DEFAULT);
  String zkConnect = uri.getAuthority().replace(";", ",");
  zkParentPath = uri.getPath();
  String ledgersAvailablePath = conf.get(
      BKJM_ZK_LEDGERS_AVAILABLE_PATH,
      BKJM_ZK_LEDGERS_AVAILABLE_PATH_DEFAULT);
  formatInfoPath = joinPath(zkParentPath, "version");
  // Deliberately NOT named currentInProgressPath: that name belongs to the
  // volatile field tracking the live segment and must not be shadowed here.
  String currentInProgressZkPath =
      joinPath(zkParentPath, "CurrentInProgress");
  String maxTxIdPath = joinPath(zkParentPath, "maxtxid");
  int zkSessionTimeoutMs = conf.getInt(BKJM_ZK_SESSION_TIMEOUT,
      BKJM_ZK_SESSION_TIMEOUT_DEFAULT);
  int zkMaxRetries = conf.getInt(BKJM_ZK_MAX_RETRIES,
      BKJM_ZK_MAX_RETRIES_DEFAULT);
  int zkRetryIntervalMs = conf.getInt(BKJM_ZK_RETRY_INTERVAL,
      BKJM_ZK_RETRY_INTERVAL_DEFAULT);
  CountDownLatch connectLatch = new CountDownLatch(1);
  ConnectionWatcher connectionWatcher = new ConnectionWatcher(connectLatch);
  ZooKeeper zooKeeper = new ZooKeeper(zkConnect, zkSessionTimeoutMs,
      connectionWatcher);
  // Use twice session timeout as the connection timeout
  int zkConnectTimeoutMs = zkSessionTimeoutMs * 2;
  if (!connectionWatcher.await(zkConnectTimeoutMs)) {
    // Nobody else holds this handle yet; release it before giving up.
    closeZooKeeperQuietly(zooKeeper);
    throw new IOException("Timed out waiting to connect to " + zkConnect
        + " after " + zkConnectTimeoutMs + " ms.");
  }
  try {
    prepareBookKeeperEnv(ledgersAvailablePath, zooKeeper);
  } catch (IOException e) {
    closeZooKeeperQuietly(zooKeeper);
    throw e;
  }
  try {
    ClientConfiguration clientConf = new ClientConfiguration();
    clientConf.setClientTcpNoDelay(conf.getBoolean(
        BKJM_BOOKKEEPER_CLIENT_TCP_NODELAY,
        BKJM_BOOKKEEPER_CLIENT_TCP_NO_DELAY_DEFAULT));
    clientConf.setThrottleValue(conf.getInt(BKJM_BOOKKEEPER_CLIENT_THROTTLE,
        BKJM_BOOKKEEPER_CLIENT_THROTTLE_DEFAULT));
    bookKeeperClient = new BookKeeper(clientConf, zooKeeper);
  } catch (KeeperException e) {
    closeZooKeeperQuietly(zooKeeper);
    keeperException("Unrecoverable ZooKeeper creating BookKeeper client",
        e);
    throw new IllegalStateException(e); // never reached
  } catch (InterruptedException e) {
    closeZooKeeperQuietly(zooKeeper);
    interruptedException("Interrupted creating a BookKeeper client", e);
    throw new IllegalStateException(e); // never reached
  }
  zk = new RecoveringZooKeeper(new BasicZooKeeper(zooKeeper), zkMaxRetries,
      zkRetryIntervalMs);
  metadataManager = new BookKeeperJournalMetadataManager(zk, zkParentPath);
  maxTxId = new MaxTxId(zk, maxTxIdPath);
  currentInProgressMetadata = new CurrentInProgressMetadata(zk,
      currentInProgressZkPath);
  createZkMetadataIfNotExists(nsInfo);
  metadataManager.init();
}

/**
 * Closes a ZooKeeper handle during failed construction without masking the
 * original failure; an interrupt is preserved on the thread.
 */
private static void closeZooKeeperQuietly(ZooKeeper zooKeeper) {
  try {
    zooKeeper.close();
  } catch (InterruptedException e) {
    Thread.currentThread().interrupt();
    LOG.warn("Interrupted while closing ZooKeeper client after a failed "
        + "initialization", e);
  }
}
/**
 * Logs a BookKeeper failure at error level and rethrows it wrapped in an
 * {@link IOException} carrying the same message.
 *
 * @param msg description of the operation that failed
 * @param e   the BookKeeper exception, kept as the cause
 * @throws IOException always
 */
public static void bkException(String msg, BKException e) throws IOException {
  final IOException wrapped = new IOException(msg, e);
  LOG.error(msg, e);
  throw wrapped;
}
/**
 * Create parent ZNode under which available BookKeeper bookie servers will
 * register themselves. Will create parent ZNodes for that path as well.
 * A path that already exists is treated as success.
 *
 * @see ZkUtils#createFullPathOptimistic(ZooKeeper, String, byte[], List, CreateMode, StringCallback, Object)
 * @param availablePath Full ZooKeeper path for bookies to register
 *                      themselves.
 * @param zooKeeper Fully instantiated ZooKeeper instance.
 * @throws IOException If ZooKeeper reports a failure creating the path, if
 *                     the path could not be created within the ZooKeeper
 *                     session timeout, or if the calling thread is
 *                     interrupted while waiting.
 */
@VisibleForTesting
public static void prepareBookKeeperEnv(final String availablePath,
    ZooKeeper zooKeeper) throws IOException {
  final CountDownLatch availablePathLatch = new CountDownLatch(1);
  // Failure code reported by the callback (stays OK on success). It is
  // written before countDown() and only read after await() returned true,
  // so the latch guarantees visibility without extra synchronization.
  final int[] failureCode = { Code.OK.intValue() };
  StringCallback cb = new StringCallback() {
    @Override
    public void processResult(int rc, String path, Object ctx, String name) {
      if (Code.OK.intValue() == rc || Code.NODEEXISTS.intValue() == rc) {
        LOG.info("Successfully created bookie available path:" +
            availablePath);
      } else {
        failureCode[0] = rc;
        LOG.error("Failed to create available bookie path (" +
            availablePath + "), ZooKeeper returned " + Code.get(rc) +
            " (rc=" + rc + ") for " + path);
      }
      // Release the waiter on every outcome: a failure must surface
      // immediately instead of being misreported as a timeout.
      availablePathLatch.countDown();
    }
  };
  ZkUtils.createFullPathOptimistic(zooKeeper, availablePath, new byte[0],
      Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT, cb, null);
  try {
    int timeoutMs = zooKeeper.getSessionTimeout();
    if (!availablePathLatch.await(timeoutMs, TimeUnit.MILLISECONDS)) {
      throw new IOException("Couldn't create the bookie available path : " +
          availablePath + ", timed out after " + timeoutMs + " ms.");
    }
  } catch (InterruptedException e) {
    Thread.currentThread().interrupt();
    throw new IOException("Interrupted when creating the bookie available " +
        "path: " + availablePath, e);
  }
  int rc = failureCode[0];
  if (rc != Code.OK.intValue()) {
    throw new IOException("Couldn't create the bookie available path : " +
        availablePath + ", ZooKeeper returned " + Code.get(rc) +
        " (rc=" + rc + ")");
  }
}
/**
 * If environment information has not yet been read during the object's
 * life, read it and verify that it was written with the expected protocol
 * version ({@link #PROTO_VERSION}). Additionally, the call always refreshes
 * the object's current {@link CurrentInProgressMetadata} information.
 *
 * @throws IOException if the format znode cannot be read, i.e. the
 *                     environment has not been formatted
 * @throws IllegalStateException if the stored protocol version differs from
 *                               {@link #PROTO_VERSION}
 */
private synchronized void checkEnv() throws IOException {
  if (!initialized) {
    FormatInfoWritable writable = localFormatInfoWritable.get();
    if (metadataManager.readWritableFromZk(formatInfoPath, writable, null) == null) {
      LOG.error("Environment not initialized (format() not called?)");
      throw new IOException(
          "Environment not initialized (format() not called?)");
    }
    if (writable.getProtoVersion() != PROTO_VERSION) {
      // Report the expected *version*; the previous message printed the
      // digest-password configuration key here by mistake.
      throw new IllegalStateException("Wrong protocol version! Expected " +
          PROTO_VERSION + ", but read " +
          writable.getProtoVersion());
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("Namespace info read: " + writable.toColonSeparatedString());
    }
  }
  currentInProgressMetadata.init();
  initialized = true;
}
/**
 * Opens a ledger for reading without triggering BookKeeper recovery
 * (uses {@code openLedgerNoRecovery}).
 *
 * @param ledgerId id of the ledger to open
 * @return handle for the opened ledger
 * @throws IOException on a BookKeeper error or if the thread is interrupted
 */
@VisibleForTesting
public LedgerHandle openForReading(long ledgerId) throws IOException {
  LedgerHandle handle = null;
  try {
    handle = bookKeeperClient.openLedgerNoRecovery(
        ledgerId, BookKeeper.DigestType.MAC, digestPw.getBytes());
  } catch (BKException e) {
    bkException("BookKeeper error opening ledger id " + ledgerId +
        " for reading", e);
  } catch (InterruptedException e) {
    interruptedException("Interrupted while opening ledger id " + ledgerId +
        " for reading", e);
  }
  // Still null only if an error helper above returned instead of throwing,
  // which is not expected to happen.
  return handle;
}
/**
 * Applies a storage transition to this journal. Only
 * {@code Transition.FORMAT} is supported: existing metadata and ledgers are
 * removed, fresh format metadata is written and the metadata manager is
 * re-initialized.
 *
 * @throws UnsupportedOperationException for any other transition
 * @throws IOException on ZooKeeper/BookKeeper errors while formatting
 */
@Override
public void transitionJournal(StorageInfo si, Transition transition,
    StartupOption startOpt) throws IOException {
  if (transition != Transition.FORMAT) {
    throw new UnsupportedOperationException();
  }
  deleteMetadataAndLedgers();
  createZkMetadataIfNotExists(si);
  metadataManager.init();
}
/**
 * Wipes the journal: when any journal metadata is present in ZooKeeper,
 * tries to delete every listed ledger from BookKeeper (best effort) and
 * then recursively removes the parent znode. Called when the journal is
 * formatted through
 * {@link #transitionJournal(StorageInfo, Transition, StartupOption)}.
 *
 * @throws IOException If there is an error talking to BookKeeper or
 *                     ZooKeeper
 */
private void deleteMetadataAndLedgers() throws IOException {
  try {
    if (!hasSomeJournalData()) {
      return; // nothing to clear
    }
    final String ledgerParent = metadataManager.getLedgerParentPath();
    if (zkPathExists(ledgerParent)) {
      for (EditLogLedgerMetadata meta : metadataManager.listLedgers(true)) {
        try {
          bookKeeperClient.deleteLedger(meta.getLedgerId());
        } catch (InterruptedException e) {
          interruptedException("Interrupted deleting ledger " + meta, e);
        } catch (BKException e) {
          // A ledger that could not be removed is harmless: nothing will
          // read it any more and it can be cleaned up by hand later.
          LOG.warn("Unable to delete ledger " + meta + " from BookKeeper",
              e);
        }
      }
    }
    deleteRecursively(zk, zkParentPath);
  } catch (IOException e) {
    LOG.error("Error clearing out metadata under " + zkParentPath, e);
    throw e;
  }
}
/**
 * Bootstraps the journal's ZooKeeper metadata when none is present: creates
 * the parent znode and, beneath it, the format znode holding
 * {@link #PROTO_VERSION} together with the given storage information.
 * Does nothing if journal data already exists.
 *
 * @param si The format information to set
 * @throws IOException If there is an error writing to ZooKeeper
 */
private void createZkMetadataIfNotExists(StorageInfo si) throws IOException {
  try {
    if (hasSomeJournalData()) {
      return;
    }
    try {
      // Parent znode first ...
      zk.create(zkParentPath, new byte[] { '0' },
          Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
      // ... then the serialized format/namespace information below it.
      FormatInfoWritable formatInfo = localFormatInfoWritable.get();
      formatInfo.set(PROTO_VERSION, si);
      zk.create(formatInfoPath,
          WritableUtil.writableToByteArray(formatInfo),
          Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
    } catch (InterruptedException e) {
      interruptedException("Interrupted initializing " + zkParentPath +
          " in ZooKeeper", e);
    } catch (KeeperException e) {
      keeperException("Unrecoverable ZooKeeper error initializing " +
          zkParentPath, e);
    }
  } catch (IOException e) {
    LOG.error("Unable to initialize metadata", e);
    throw e;
  }
}
/**
 * Tells whether a znode is present in ZooKeeper (no watch is set).
 *
 * @param path The ZNode path to check
 * @return True if path exists, false if otherwise
 * @throws IOException If there is an error talking to ZooKeeper
 */
private boolean zkPathExists(String path) throws IOException {
  boolean present = false;
  try {
    present = (zk.exists(path, false) != null);
  } catch (InterruptedException e) {
    interruptedException("Interrupted checking if ZooKeeper path " +
        path + " exists", e);
  } catch (KeeperException e) {
    keeperException("Unrecoverable ZooKeeper error checking if " +
        path + " exists", e);
  }
  return present;
}
/**
 * Opens a new edit log segment beginning at {@code txId}: creates a fresh
 * BookKeeper ledger, records in-progress metadata for it in ZooKeeper,
 * stores {@code txId} as the max txid, and returns an output stream whose
 * header has already been written.
 *
 * @param txId first transaction id of the new segment; must be greater than
 *             the max txid currently stored
 * @return output stream writing to the new ledger
 * @throws IOException if {@code txId} was already seen, if another segment
 *                     is recorded as in progress, or on any
 *                     ZooKeeper/BookKeeper error
 */
@Override
public EditLogOutputStream startLogSegment(long txId) throws IOException {
  if (LOG.isDebugEnabled()) {
    LOG.debug("Trying to start a log segment at txId " + txId);
  }
  checkEnv();

  // Preconditions: txId must be new and no other segment may be open.
  try {
    final long highestStoredTxId = maxTxId.get();
    if (highestStoredTxId >= txId) {
      throw new IOException("Already saw up to txId " + highestStoredTxId + "!");
    }
    final String recordedInProgress = currentInProgressMetadata.read();
    final boolean segmentAlreadyOpen = recordedInProgress != null
        && metadataManager.ledgerExists(recordedInProgress);
    if (segmentAlreadyOpen) {
      throw new IOException(recordedInProgress + " already exists, cannot "
          + " start a log segment that is already in progress!");
    }
  } catch (IOException e) {
    LOG.error("Unable to start log segment for txId " + txId, e);
    throw e;
  }

  try {
    if (currentInProgressLedger != null) {
      // A handle left over from a previous stream that failed; close it
      // before replacing it.
      currentInProgressLedger.close();
    }
    currentInProgressLedger = bookKeeperClient.createLedger(ensembleSize,
        quorumSize, BookKeeper.DigestType.MAC, digestPw.getBytes());
  } catch (InterruptedException e) {
    interruptedException("Interrupted creating ledger for txId " + txId, e);
  } catch (BKException e) {
    bkException("BookKeeper error creating ledger for txId " + txId, e);
  }

  // Publish in-progress metadata (last txid = -1) for the new ledger.
  final long newLedgerId = currentInProgressLedger.getId();
  EditLogLedgerMetadata segmentMeta = new EditLogLedgerMetadata(
      FSConstants.LAYOUT_VERSION, newLedgerId, txId, -1);
  String segmentZkPath =
      metadataManager.fullyQualifiedPathForLedger(segmentMeta);
  metadataManager.writeEditLogLedgerMetadata(segmentZkPath, segmentMeta);
  maxTxId.store(txId);
  currentInProgressMetadata.update(segmentZkPath);
  // Remembered so recoverUnfinalizedSegments() leaves this segment alone.
  currentInProgressPath = segmentZkPath;

  BookKeeperEditLogOutputStream stream = new BookKeeperEditLogOutputStream(
      currentInProgressLedger, zkParentPath, metrics);
  stream.create(); // writes the ledger header and flushes it to BookKeeper
  InjectionHandler.processEvent(InjectionEvent.BKJM_STARTLOGSEGMENT,
      segmentMeta);
  return stream;
}
/**
 * Finalizes the in-progress edit log segment starting at {@code firstTxId}:
 * closes the ledger if this instance is currently writing it, writes
 * finalized metadata (with {@code lastTxId}) to ZooKeeper, stores
 * {@code lastTxId} as the max txid, clears the current-in-progress marker
 * if it points at this segment, and deletes the in-progress metadata.
 *
 * If the finalized znode already exists (e.g. from an earlier, partially
 * completed attempt), its contents must match the finalized metadata.
 *
 * @param firstTxId first transaction id of the segment
 * @param lastTxId  last transaction id of the segment
 * @throws IOException if no matching in-progress segment exists, if this
 *                     instance is writing a different ledger, if an existing
 *                     finalized znode holds different data, if the
 *                     in-progress znode was removed concurrently, or on any
 *                     ZooKeeper/BookKeeper error
 */
@Override
public void finalizeLogSegment(long firstTxId, long lastTxId)
    throws IOException {
  checkEnv();
  try {
    // First, find an in-progress ledger starting at firstTxId
    Versioned<EditLogLedgerMetadata> inProgressMetaAndVersion =
        metadataManager.findInProgressLedger(firstTxId);
    if (inProgressMetaAndVersion == null) {
      throw new IOException(
          "Cannot find metadata for an in-progress ledger with first txId "
          + firstTxId);
    }
    EditLogLedgerMetadata inProgressMeta = inProgressMetaAndVersion.getEntry();
    if (currentInProgressLedger != null) {
      long inProgressLedgerId = currentInProgressLedger.getId();
      if (inProgressMeta.getLedgerId() == inProgressLedgerId) {
        // If the segment is the one currently being written, close the
        // ledger first (this ensures every entry in the ledger is committed
        // to the BookKeeper quorum)
        try {
          currentInProgressLedger.close();
        } catch (BKException e) {
          bkException("Unexpected BookKeeper error closing ledger id " +
              inProgressLedgerId, e);
        } catch (InterruptedException e) {
          interruptedException("Interrupted closing ledger id " +
              inProgressLedgerId, e);
        }
        currentInProgressPath = null;
        currentInProgressLedger = null;
      } else { // We can not finalize a ledger that is not in-progress
        throw new IOException("Current in-progress ledger has ledger id (" +
            inProgressLedgerId + ") different from expected ledger id " +
            inProgressMeta.getLedgerId());
      }
    }
    // Set lastTxId in the metadata and persist it to ZooKeeper
    EditLogLedgerMetadata finalizedMeta =
        inProgressMeta.finalizeWithLastTxId(lastTxId);
    String finalizedPath =
        metadataManager.fullyQualifiedPathForLedger(finalizedMeta);
    if (LOG.isDebugEnabled()) {
      LOG.debug("Attempting to finalize metadata " + finalizedMeta +
          " to ZNode " + finalizedPath);
    }
    // If the finalized znode is already there, it must hold exactly the
    // finalized metadata we intended to write. (Comparing against the
    // in-progress metadata, as was done before, contradicts the error
    // message below and would reject a legitimate retry.)
    if (!metadataManager.writeEditLogLedgerMetadata(finalizedPath, finalizedMeta)
        && !metadataManager.verifyEditLogLedgerMetadata(finalizedMeta, finalizedPath)) {
      throw new IOException("Node " + finalizedPath +
          " already exists, but data doesn't match " + finalizedMeta);
    }
    maxTxId.store(lastTxId);
    // Find the ZNode path for the metadata associated with the in-progress
    // version of the ledger
    String lastInProgressPath =
        metadataManager.fullyQualifiedPathForLedger(inProgressMeta);
    String inProgressPathFromCiMeta = currentInProgressMetadata.read();
    if (lastInProgressPath.equals(inProgressPathFromCiMeta)) {
      // If the ZNode path matches the ZNode path for the current in-progress
      // metadata, then clear the current in-progress metadata
      currentInProgressMetadata.clear();
    }
    // Delete the in-progress metadata iff no one else has updated it in
    // the meantime
    if (!metadataManager.deleteLedgerMetadata(inProgressMeta,
        inProgressMetaAndVersion.getVersion())) {
      throw new IOException(
          "Unable to delete in-progress znode " + lastInProgressPath +
          " as it no longer exists (Deleted by another process?)");
    }
  } catch (IOException e) {
    LOG.error("Unable to finalized metadata for segment with firstTxId " +
        firstTxId + ", lastTxId " + lastTxId, e);
    throw e;
  }
}
/**
 * An implementation of {@link LedgerHandleProvider} that fences the
 * ledger we are reading from, allowing the ledger to be recovered by
 * BookKeeper as we validate it.
 *
 * @see BookKeeperEditLogInputStream#validateEditLog(LedgerHandleProvider, EditLogLedgerMetadata)
 */
class FencingLedgerHandleProvider implements LedgerHandleProvider {
  /**
   * Opens the ledger with {@code openLedger} (as opposed to the
   * no-recovery variant used by the enclosing class).
   *
   * @param ledgerId id of the ledger to open
   * @return handle for the opened ledger
   * @throws IOException on a BookKeeper error or if interrupted
   * @throws IllegalStateException if BookKeeper hands back a handle for a
   *                               different ledger id
   */
  @Override
  public LedgerHandle openForReading(long ledgerId) throws IOException {
    try {
      LOG.info("Opening ledger id " + ledgerId + " for recovery...");
      LedgerHandle lh = bookKeeperClient.openLedger(ledgerId,
          BookKeeper.DigestType.MAC, digestPw.getBytes());
      if (lh.getId() != ledgerId) { // Verify that correct ledger is opened
        throw new IllegalStateException("Ledger id " + lh.getId() +
            " does not match requested ledger id " + ledgerId);
      }
      LOG.info("Opened ledger id " + ledgerId + " for recovery!");
      return lh;
    } catch (BKException e) {
      bkException("BookKeeper error opening ledger id " + ledgerId +
          " for recovery", e);
    } catch (InterruptedException e) {
      // Leading space added: the message used to read "...id 5for recovery".
      interruptedException("Interrupted opening ledger id " + ledgerId +
          " for recovery", e);
    }
    return null; // Should not be reached
  }
}
/**
 * Validates a ledger without fencing it and reports its last transaction
 * id. Shorthand for {@code validateAndGetEndTxId(ledger, false)}.
 */
@VisibleForTesting
long validateAndGetEndTxId(EditLogLedgerMetadata ledger) throws IOException {
  final boolean noFencing = false;
  return validateAndGetEndTxId(ledger, noFencing);
}

/**
 * Validates the edit log stored in a ledger and reports its last
 * transaction id.
 *
 * @param ledger metadata of the ledger to validate
 * @param fence  when true the ledger is opened through a
 *               {@link FencingLedgerHandleProvider}; otherwise through this
 *               object's own {@link #openForReading(long)}
 * @return the end txid found by validation, or
 *         {@code HdfsConstants.INVALID_TXID} when validation found no
 *         transactions at all
 * @throws IOException if validation fails with an I/O error
 */
long validateAndGetEndTxId(EditLogLedgerMetadata ledger, boolean fence)
    throws IOException {
  LedgerHandleProvider handleSource =
      fence ? new FencingLedgerHandleProvider() : this;
  FSEditLogLoader.EditLogValidation validation =
      BookKeeperEditLogInputStream.validateEditLog(handleSource, ledger);
  InjectionHandler.processEvent(InjectionEvent.BKJM_VALIDATELOGSEGMENT,
      validation);
  // A ledger without a single transaction is treated as corrupt.
  return validation.getNumTransactions() == 0
      ? HdfsConstants.INVALID_TXID
      : validation.getEndTxId();
}
/**
 * Collects the metadata of every ledger (in-progress ones included) whose
 * first transaction id is at or after {@code fromTxId}.
 *
 * @param fromTxId lowest transaction id of interest
 * @return matching ledgers, in the order returned by the metadata manager
 * @throws IOException if {@code fromTxId} lies strictly inside a finalized
 *                     ledger, or on a metadata error
 */
private List<EditLogLedgerMetadata> getLedgers(long fromTxId) throws IOException {
  Collection<EditLogLedgerMetadata> known = metadataManager.listLedgers(true);
  List<EditLogLedgerMetadata> selected = new ArrayList<EditLogLedgerMetadata>();
  for (EditLogLedgerMetadata candidate : known) {
    final long first = candidate.getFirstTxId();
    final long last = candidate.getLastTxId();
    final boolean finalized = last != -1;
    if (finalized && first < fromTxId && fromTxId <= last) {
      throw new IOException("Asked for fromTxId " + fromTxId +
          " which is in the middle of " + candidate);
    }
    if (first >= fromTxId) {
      selected.add(candidate);
    }
  }
  return selected;
}
/**
 * Refreshes {@code maxSeenTxId} from the ledger metadata: finalized ledgers
 * contribute their last txid, in-progress ledgers (last txid of -1)
 * contribute their first txid.
 *
 * @return the highest transaction id seen so far by this instance
 * @throws IOException if the ledger metadata cannot be listed
 */
private long findMaxTransaction() throws IOException {
  List<EditLogLedgerMetadata> ledgers = getLedgers(0);
  synchronized (this) {
    for (EditLogLedgerMetadata ledgerMetadata : ledgers) {
      long lastTxId = ledgerMetadata.getLastTxId();
      long candidate =
          (lastTxId == -1) ? ledgerMetadata.getFirstTxId() : lastTxId;
      maxSeenTxId = Math.max(candidate, maxSeenTxId);
    }
    // Read the field under the same lock that guards its updates rather
    // than after leaving the synchronized block.
    return maxSeenTxId;
  }
}
/**
 * Removes finalized edit log segments that lie entirely below
 * {@code minTxIdToKeep}: first their ZooKeeper metadata, then the backing
 * BookKeeper ledger. In-progress segments are never considered.
 *
 * @param minTxIdToKeep the earliest txid that must be retained after purging
 *                      old logs
 * @throws IOException If there is an error talking to BookKeeper or
 *                     ZooKeeper
 */
@Override
public void purgeLogsOlderThan(long minTxIdToKeep) throws IOException {
  checkEnv();
  // false => finalized ledgers only
  for (EditLogLedgerMetadata segment : metadataManager.listLedgers(false)) {
    final boolean entirelyOld = segment.getFirstTxId() < minTxIdToKeep
        && segment.getLastTxId() < minTxIdToKeep;
    if (!entirelyOld) {
      continue;
    }
    LOG.info("Purging edit log segment: " + segment);
    if (!metadataManager.deleteLedgerMetadata(segment, -1)) {
      // Someone else may have removed the metadata already.
      LOG.warn(segment + " has already been purged!");
      continue;
    }
    try {
      // Drop the ledger itself so BookKeeper can reclaim the disk space.
      bookKeeperClient.deleteLedger(segment.getLedgerId());
    } catch (InterruptedException e) {
      interruptedException("Interrupted deleting " + segment +
          " from BookKeeper", e);
    } catch (BKException e) {
      bkException("Unrecoverable error deleting " + segment +
          " from BookKeeper", e);
    }
  }
}
/**
* No-op in this implementation: both arguments are ignored.
*/
@Override
public void setCommittedTxId(long txid, boolean force) {
}
/**
 * Recovers every in-progress segment other than the one this instance is
 * currently writing. Empty ledgers are deleted, ledgers that fail
 * validation are moved aside as corrupt, and all others are fenced,
 * validated and finalized with the end txid found during validation.
 *
 * @throws IOException on any ZooKeeper/BookKeeper error
 */
@Override
public synchronized void recoverUnfinalizedSegments() throws IOException {
  checkEnv();
  for (EditLogLedgerMetadata segment : metadataManager.listLedgers(true)) {
    if (segment.getLastTxId() != -1) {
      continue; // already finalized, nothing to recover
    }
    final String segmentPath =
        metadataManager.fullyQualifiedPathForLedger(segment);
    // equals(null) is false, so this also covers "no current segment".
    if (segmentPath.equals(currentInProgressPath)) {
      continue; // never recover the segment we are writing ourselves
    }
    final long ledgerId = segment.getLedgerId();
    // Peek at the ledger without fencing to detect zero-length ledgers,
    // which a crash may have left behind.
    LedgerHandle probe = openForReading(ledgerId);
    try {
      if (probe.getLength() == 0) {
        handleZeroLengthLedger(segment);
        continue;
      }
    } finally {
      try {
        probe.close();
      } catch (InterruptedException e) {
        interruptedException("Interrupted closing ledger id " +
            ledgerId, e);
      } catch (BKException e) {
        bkException("BookKeeper error closing ledger id " +
            ledgerId, e);
      }
    }
    // Fence the ledger; BookKeeper recovers it while we validate.
    final long recoveredEndTxId = validateAndGetEndTxId(segment, true);
    findMaxTransaction(); // refresh the max txid seen by this instance
    if (recoveredEndTxId == HdfsConstants.INVALID_TXID) {
      LOG.warn(segment + "(" + segmentPath + ")" + " cannot be recovered!");
      metadataManager.moveAsideCorruptLedger(segment);
      continue;
    }
    finalizeLogSegment(segment.getFirstTxId(), recoveredEndTxId);
  }
}
/**
 * Cleans up an in-progress segment whose ledger is empty: rolls the stored
 * max txid back by one if it equals the segment's first txid, then deletes
 * the segment's metadata and its BookKeeper ledger.
 *
 * @param ledger metadata of the empty in-progress segment
 * @throws IOException on any ZooKeeper/BookKeeper error
 */
private void handleZeroLengthLedger(EditLogLedgerMetadata ledger)
    throws IOException {
  final long emptyLedgerId = ledger.getLedgerId();
  final long segmentFirstTxId = ledger.getFirstTxId();
  LOG.warn("In-progress edit log segment " + ledger + " refers to an " +
      "empty edit log segment. This occurs when NameNode crashes after " +
      "opening a segment, but before writing OP_START_LOG_SEGMENT. Will " +
      "delete the ledger and the metadata.");
  final boolean maxPointsAtEmptySegment = maxTxId.get() == segmentFirstTxId;
  if (maxPointsAtEmptySegment) {
    LOG.warn("maxTxId is set to " + segmentFirstTxId + " which is " +
        "belongs to an empty ledger. Resetting to previous maxTxId.");
    maxTxId.set(maxTxId.get() - 1);
  }
  metadataManager.deleteLedgerMetadata(ledger, -1);
  try {
    bookKeeperClient.deleteLedger(emptyLedgerId);
  } catch (InterruptedException e) {
    interruptedException(
        "Interrupted deleting empty ledger id " +
        emptyLedgerId, e);
  } catch (BKException e) {
    bkException("BookKeeper error deleting empty ledger id " +
        emptyLedgerId, e);
  }
}
/**
 * Builds a sorted manifest of the edit log segments starting at or after
 * {@code fromTxId}. In-progress segments are validated (without fencing)
 * to determine their end txid; segments whose end txid is
 * {@code HdfsConstants.INVALID_TXID} are left out.
 *
 * @param fromTxId lowest transaction id of interest
 * @return manifest of the matching segments
 * @throws IOException if {@code fromTxId} falls inside a segment, or on a
 *                     metadata/validation error
 */
@Override
public RemoteEditLogManifest getEditLogManifest(long fromTxId)
    throws IOException {
  Collection<EditLogLedgerMetadata> ledgers =
      metadataManager.listLedgers(true);
  LOG.info("Ledgers to include in manifest: " + ledgers);
  List<RemoteEditLog> manifestEntries =
      Lists.newArrayListWithCapacity(ledgers.size());
  for (EditLogLedgerMetadata ledger : ledgers) {
    final long storedLastTxId = ledger.getLastTxId();
    final boolean inProgress = storedLastTxId == -1;
    final long lastTxId =
        inProgress ? validateAndGetEndTxId(ledger) : storedLastTxId;
    if (lastTxId == HdfsConstants.INVALID_TXID) {
      continue;
    }
    final long firstTxId = ledger.getFirstTxId();
    if (firstTxId >= fromTxId) {
      manifestEntries.add(new RemoteEditLog(firstTxId, lastTxId, inProgress));
      continue;
    }
    // Here firstTxId < fromTxId, so only the upper bound is left to check.
    if (fromTxId <= lastTxId) {
      throw new IOException("Asked for firstTxId " + fromTxId +
          " which is in the middle of ledger " + ledger);
    }
  }
  Collections.sort(manifestEntries);
  return new RemoteEditLogManifest(manifestEntries, false);
}
/**
 * Shuts down the BookKeeper client, translating its failures into
 * {@link IOException} through the shared error helpers.
 */
private void closeBk() throws IOException {
  try {
    bookKeeperClient.close();
  } catch (InterruptedException e) {
    interruptedException("Interrupted closing BookKeeper client ", e);
  } catch (BKException e) {
    bkException("Error closing BookKeeper client", e);
  }
}
/**
* Closes the ZooKeeper wrapper. An InterruptedException raised by the close
* is handed to the statically imported interruptedException(...) helper
* (presumably rethrown as an IOException - confirm against ZkUtil).
*/
private void closeZk() throws IOException {
try {
zk.close();
} catch (InterruptedException e) {
interruptedException("Interrupted closing ZooKeeper client", e);
}
}
/**
* Closes the BookKeeper client and then the ZooKeeper client. The ZooKeeper
* close runs in a finally block, so it is attempted even when closing
* BookKeeper throws - except when the current thread's interrupt flag is set
* at that point, in which case the ZooKeeper close is skipped.
*
* @throws IOException if closing either client fails
*/
@Override
public void close() throws IOException {
try {
closeBk();
} finally {
// Skip the ZooKeeper close when the thread is flagged as interrupted.
if (!Thread.currentThread().isInterrupted()) {
closeZk();
}
}
}
/**
 * Adds to {@code streams} an input stream for each candidate ledger
 * starting at or after {@code fromTxId}; the per-ledger filtering is done
 * by {@link #addStreamsToCollectionFromLedgers}.
 *
 * @param streams                    collection receiving the streams
 * @param fromTxId                   lowest transaction id of interest
 * @param inProgressOk               whether in-progress segments may be
 *                                   included
 * @param validateInProgressSegments whether in-progress segments are
 *                                   validated to find their end txid
 * @throws IOException if the ledger metadata cannot be read
 */
@Override
public void selectInputStreams(Collection<EditLogInputStream> streams,
    long fromTxId,
    boolean inProgressOk,
    boolean validateInProgressSegments) throws IOException {
  final Collection<EditLogLedgerMetadata> candidates = getLedgers(fromTxId);
  if (LOG.isDebugEnabled()) {
    final String inProgressNote =
        inProgressOk ? " (inProgress ok) " : "(excluding inProgress) ";
    LOG.debug(this + ": selecting input streams starting at " + fromTxId +
        inProgressNote +
        "from among " + candidates.size() + " candidate ledger(s).");
  }
  addStreamsToCollectionFromLedgers(candidates, streams, fromTxId,
      inProgressOk, validateInProgressSegments);
}
/**
 * Creates a {@link BookKeeperEditLogInputStream} for each suitable ledger
 * and adds it to {@code streams}.
 *
 * A ledger is passed over when it is in progress and
 * {@code inProgressOk} is false, when validating an in-progress ledger
 * throws an IOException, or when its (known) end txid is below
 * {@code fromTxId}.
 *
 * @param allLedgers                 candidate ledgers
 * @param streams                    collection receiving the streams
 * @param fromTxId                   lowest transaction id of interest
 * @param inProgressOk               whether in-progress ledgers may be
 *                                   included
 * @param validateInProgressSegments whether in-progress ledgers are
 *                                   validated to determine their end txid
 * @throws IOException if an input stream cannot be created
 */
void addStreamsToCollectionFromLedgers(
    Collection<EditLogLedgerMetadata> allLedgers,
    Collection<EditLogInputStream> streams, long fromTxId,
    boolean inProgressOk, boolean validateInProgressSegments) throws IOException {
  for (EditLogLedgerMetadata ledger : allLedgers) {
    long endTxId = ledger.getLastTxId();
    if (endTxId == -1) {
      if (!inProgressOk) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Passing over " + ledger + " because it is in progress " +
              " and we are ignoring in-progress logs.");
        }
        // Skip regardless of the log level. Previously this 'continue' sat
        // inside the isDebugEnabled() check, so in-progress ledgers were
        // only excluded when debug logging happened to be on.
        continue;
      }
      if (validateInProgressSegments) {
        try {
          endTxId = validateAndGetEndTxId(ledger);
        } catch (IOException e) {
          LOG.error("Got an IOException while trying to validate header of "
              + ledger + ". Skipping.", e);
          continue;
        }
      } else {
        LOG.info("Skipping validation of edit segment: " + ledger);
      }
    }
    if (endTxId != HdfsConstants.INVALID_TXID && endTxId < fromTxId) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Passing over " + ledger + " because it ends at " +
            endTxId + ", but we only care about transaction as new as " +
            fromTxId);
      }
      continue;
    }
    BookKeeperEditLogInputStream bkelis = new BookKeeperEditLogInputStream(
        this, ledger.getLedgerId(), 0, ledger.getFirstTxId(), endTxId,
        ledger.getLastTxId() == -1);
    bkelis.setJournalManager(this);
    streams.add(bkelis);
  }
}
/**
* Reports whether the journal's parent znode ({@code zkParentPath}) exists
* in ZooKeeper.
*
* @throws IOException if the existence check against ZooKeeper fails
*/
@Override
public boolean hasSomeJournalData() throws IOException {
return zkPathExists(zkParentPath);
}
/**
* Always returns false in this implementation.
*/
@Override
public boolean hasSomeImageData() throws IOException {
return false;
}
/**
* Returns the fixed label "BKJM journal".
*/
@Override
public String toHTMLString() {
return "BKJM journal";
}
/**
* Always returns false in this implementation.
*/
@Override
public boolean hasImageStorage() {
return false;
}
/**
* Not implemented yet: currently always returns null, so callers must be
* prepared for a null result.
*/
@Override
public RemoteStorageState analyzeJournalStorage() {
// TODO
return null;
}
}