/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.namenode;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.File;
import java.io.FileOutputStream;
import java.io.DataOutputStream;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.net.InetSocketAddress;
import java.net.URI;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Date;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Collection;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import javax.management.NotCompliantMBeanException;
import javax.management.StandardMBean;
import org.apache.hadoop.ipc.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.metrics.util.MBeanUtil;
import org.apache.hadoop.hdfs.AvatarZooKeeperClient;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.hdfs.protocol.AvatarProtocol;
import org.apache.hadoop.hdfs.protocol.AvatarConstants.Avatar;
import org.apache.hadoop.hdfs.protocol.AvatarConstants.StartupOption;
import org.apache.hadoop.hdfs.protocol.AvatarConstants.InstanceId;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.server.protocol.AvatarDatanodeCommand;
import org.apache.hadoop.hdfs.server.protocol.BlockReport;
import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.BlocksMap.BlockInfo;
import org.apache.hadoop.hdfs.server.namenode.metrics.AvatarNodeStatusMBean;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.data.Stat;
/**
* This is an implementation of the AvatarNode, a hot
* standby for the NameNode.
* This is really cool, believe me!
* The AvatarNode has two avatars.. the Standby avatar and the Active
* avatar.
*
* In the Standby avatar, the AvatarNode is consuming transaction logs
* generated by the primary (via a transaction log stored in a shared device).
* Typically, the primary Namenode is writing transactions to a NFS filesystem
* and the Standby is reading the log from the same NFS filesystem. The
* Standby is also making periodic checkpoints to the primary namenode.
*
* A manual command can switch the AvatarNode from the Standby avatar
* to the Active avatar. In the Active avatar, the AvatarNode performs precisely
* the same functionality as a real usual Namenode. The switching from
* Standby avatar to the Active avatar is fast and can typically occur
* within seconds.
*
* Typically, an administrator will require two shared mount points for
* transaction logs. They have to be set in dfs.name.dir.shared0 and
* dfs.name.dir.shared1 (similarly for edits). Then the administrator starts
* the AvatarNode on two different machines as follows:
*
* bin/hadoop org.apache.hadoop.hdfs.server.namenode.AvatarNode -zero -active
* bin/hadoop org.apache.hadoop.hdfs.server.namenode.AvatarNode -one -standby
* The first AvatarNode uses dfs.name.dir.shared0 while the second
* AvatarNode uses dfs.name.dir.shared1 to write its transaction logs.
* Also, at startup, the first instance is the primary Namenode and the
* second instance is the Standby
*
* After a while, the administrator decides to change the avatar of the
* second instance to Active. In this case, he/she has to first ensure that the
* first instance is really really dead. This code does not handle the
* split-brain scenario where there are two active namenodes in one cluster.
*
*/
public class AvatarNode extends NameNode
implements AvatarProtocol, AvatarNodeStatusMBean {
static {
Configuration.addDefaultResource("avatar-default.xml");
Configuration.addDefaultResource("avatar-site.xml");
}
// Logger used throughout the AvatarNode.
public static final Log LOG = LogFactory.getLog(AvatarNode.class.getName());
// Pause between two runs of the InvalidatesCleaner, in milliseconds.
private static final int INVALIDATES_CLEANUP_INTERVAL = 10 * 1000;
// Name of the lock file that is removed from a freshly synced directory.
private static final String STORAGE_FILE_LOCK = "in_use.lock";
// Paths below are appended to a storage directory name.
private static final String EDITSFILE = "/current/edits";
private static final String EDITSNEW = "/current/edits.new";
private static final String TIMEFILE = "/current/fstime";
private static final String IMAGENEW ="/current/fsimage.ckpt";
// Timestamp format used as the suffix of moved-aside backup directories.
// NOTE(review): SimpleDateFormat is not thread-safe and this instance is
// shared - confirm it is only used from a single thread.
static final SimpleDateFormat dateForm =
new SimpleDateFormat("yyyy-MM-dd-HH:mm:ss.SSS");
// The instanceId is assigned at startup time and does not change for
// the lifetime of the Node. The administrator has to name each instance
// of the AvatarNode with a different instanceId. The node number is used
// by the AvatarNode to determine which shared devices it should use to
// checkpoint the image.
//
private InstanceId instance;
// The time when (and if) the fsimage was sync-ed from the remote AvatarNode
volatile private long startCheckpointTime;
private Server server; // RPC server
private InetSocketAddress serverAddress; // RPC server address
private Avatar currentAvatar; // the current incarnation of this node
private Standby standby; // the standby object (only created when starting as standby)
private Configuration confg; // config for the standby namenode
private Configuration startupConf; // config for the namenode
private Thread standbyThread; // the thread running the standby object
private InvalidatesCleaner cleaner; // periodically clears the invalidates queues
private Thread cleanerThread; // the thread running the cleaner
private RunInfo runInfo; // shutdown / isRunning flags of this node
/**
 * Creates an AvatarNode on top of a NameNode built from {@code conf}, using a
 * fresh {@link RunInfo}, and starts the AvatarNode RPC server.
 *
 * @param conf configuration handed to the NameNode and to the RPC server
 * @throws IOException if the NameNode or the RPC server cannot be created
 */
AvatarNode(Configuration conf) throws IOException {
  super(conf);
  this.runInfo = new RunInfo();
  this.initialize(conf);
}
/**
* Creates an AvatarNode for the given instance, either as active or as
* standby, and starts the AvatarNode RPC server.
*
* The startup Conf is the original configuration of the AvatarNode. It is used by the
* secondary namenode to talk to the primary namenode.
* The conf is the modified configuration that is used by the standby namenode
*
* When starting as standby the underlying namenode is put into safemode and
* two threads are started: one running the {@link Standby} object and one
* running the {@link InvalidatesCleaner}.
*
* @param startupConf the original configuration of the AvatarNode
* @param conf the modified configuration, also passed to the NameNode
* @param startInfo parsed startup arguments (instance id, standby flag)
* @param runInfo run-state flags of this node
* @throws IOException if the NameNode or the RPC server cannot be started
*/
AvatarNode(Configuration startupConf, Configuration conf,
StartupInfo startInfo, RunInfo runInfo) throws IOException {
super(conf);
this.runInfo = runInfo;
this.instance = startInfo.instance;
// if we are starting as the standby then
// record the fstime of the checkpoint that we are about to sync from
if (startInfo.isStandby) {
setStartCheckpointTime(conf);
}
// creates and starts the AvatarNode RPC server
initialize(conf);
currentAvatar = startInfo.isStandby ? Avatar.STANDBY : Avatar.ACTIVE;
this.startupConf = startupConf;
this.confg = conf;
if (currentAvatar == Avatar.STANDBY) {
//
// If we are starting as a Hot Standby, then put namenode in
// safemode. This prevents this instance of the NameNode from
// doing active replication of blocks.
//
setSafeMode(SafeModeAction.SAFEMODE_ENTER);
// do not allow anybody else to directly contact the underlying
// namenode object to exit safemode.
super.namesystem.setSafeModeManualOverride(true);
// Create a standby object which does the actual work of
// processing transactions from the primary and checkpointing
standby = new Standby(this, startupConf, confg);
standbyThread = new Thread(standby);
standbyThread.start();
// Periodically clear the invalidates queues while in standby mode
cleaner = new InvalidatesCleaner();
cleanerThread = new Thread(cleaner);
cleanerThread.start();
}
}
/**
 * Waits for the Standby thread (if any) to exit. If the node is still in
 * standby mode afterwards, the encapsulated namenode, the Standby object, the
 * AvatarNode RPC server and the cleaner thread are all stopped and the method
 * returns. In every other case the method waits for the encapsulated namenode
 * to exit.
 */
void waitForRestart() {
  if (standbyThread == null) {
    // not a standby avatarnode: just wait for the encapsulated namenode
    super.join();
    return;
  }

  try {
    // this is the standby avatarnode: wait for the Standby to exit
    standbyThread.join();
  } catch (InterruptedException ie) {
    //eat it up
  }
  standbyThread = null;
  LOG.info("waitForRestart Standby thread exited.");

  if (getAvatar() != Avatar.STANDBY) {
    // we became active in the meantime: wait for the encapsulated namenode
    super.join();
    return;
  }

  // still in standby mode, that means we need to restart from scratch
  LOG.info("waitForRestart Stopping encapsulated namenode.");
  runInfo.isRunning = false;
  super.stop(); // terminate encapsulated namenode
  super.join(); // wait for encapsulated namenode to exit
  standby.shutdown();

  if (server != null) {
    // shutdown the AvatarNode rpc server
    LOG.info("waitForRestart Stopping avatarnode rpcserver.");
    server.stop();
    try {
      server.join();
    } catch (InterruptedException ie) {
      //eat it up
    }
  }

  if (cleaner != null) {
    // The cleaner thread would otherwise keep the process from shutting down
    cleaner.stop();
    cleanerThread.interrupt();
    try {
      cleanerThread.join();
    } catch (InterruptedException iex) {
      Thread.currentThread().interrupt();
    }
  }
  LOG.info("waitForRestart exiting");
}
/**
 * Registers this node with JMX as an {@link AvatarNodeStatusMBean}. A
 * non-compliant MBean is logged and otherwise ignored.
 */
public void registerMBean() {
  try {
    StandardMBean statusBean =
        new StandardMBean(this, AvatarNodeStatusMBean.class);
    MBeanUtil.registerMBean("AvatarNode", "AvatarNodeState", statusBean);
  } catch (NotCompliantMBeanException mex) {
    LOG.error("Error registering mbean with JMX", mex);
  }
}
/** Returns the string form of the instance id of this node. */
@Override
public String getInstance() {
  final InstanceId id = this.instance;
  return id.toString();
}
/** Returns the string form of the avatar this node is currently running as. */
@Override
public String getState() {
  final Avatar avatar = this.currentAvatar;
  return avatar.toString();
}
/**
 * Returns the lag reported by the Standby object, or 0 when this node has no
 * Standby object.
 */
@Override
public long getLagBytes() {
  return (this.standby == null) ? 0 : this.standby.getLagBytes();
}
/**
 * Creates the AvatarNode RPC server, records the address it actually
 * listens on, registers the JMX bean and starts the server.
 *
 * @param conf the configuration
 * @throws IOException if the RPC server cannot be created
 */
private void initialize(Configuration conf) throws IOException {
  final InetSocketAddress bindAddr = AvatarNode.getAddress(conf);
  final int handlers = conf.getInt("hdfs.avatarnode.handler.count", 3);

  // create rpc server
  this.server = RPC.getServer(
      this, bindAddr.getHostName(), bindAddr.getPort(), handlers, false, conf);

  // The configured port can be ephemeral, so ask the server where it
  // really listens.
  this.serverAddress = this.server.getListenerAddress();
  LOG.info("AvatarNode up at: " + this.serverAddress);

  this.registerMBean();
  this.server.start();
}
/**
 * Returns the AvatarProtocol version id when {@code protocol} names the
 * AvatarProtocol; every other protocol is delegated to the underlying
 * namenode.
 *
 * @param protocol class name of the requested protocol
 * @param clientVersion protocol version of the client
 */
public long getProtocolVersion(String protocol,
    long clientVersion) throws IOException {
  final boolean isAvatarProtocol =
      protocol.equals(AvatarProtocol.class.getName());
  if (isAvatarProtocol) {
    return AvatarProtocol.versionID;
  }
  return super.getProtocolVersion(protocol, clientVersion);
}
//
// methods to support Avatar Protocol
//
/**
 * Returns the avatar (active or standby) this node is currently running as.
 */
public synchronized Avatar getAvatar() {
  return this.currentAvatar;
}
/**
 * Flags this node as shut down and stops the encapsulated namenode.
 */
@Override
public void shutdownAvatar() throws IOException {
  this.runInfo.shutdown = true;
  LOG.info("Got shutdown message");
  super.stop();
}
/**
 * Stops the AvatarNode RPC server and waits for it to terminate. If the wait
 * is interrupted, the interrupt status of the calling thread is restored.
 */
public void stopRPC() throws IOException {
  final Server rpcServer = this.server;
  rpcServer.stop();
  try {
    rpcServer.join();
  } catch (InterruptedException ex) {
    Thread.currentThread().interrupt();
  }
}
/**
 * Switches this node to the requested avatar.
 *
 * Requesting the current avatar is a no-op. Switching to standby is always
 * rejected. Switching to active requires an empty primary registration in
 * ZooKeeper and a Standby that has consumed all transactions; the Standby is
 * then quiesced, the cleaner thread is stopped, the invalidates queues are
 * cleared and the namenode leaves safemode.
 *
 * @param avatar the avatar to switch to
 * @throws IOException if the switch is not allowed or cannot be verified
 */
public synchronized void setAvatar(Avatar avatar) throws IOException {
  if (avatar == currentAvatar) {
    LOG.info("Trying to change avatar to " + avatar +
        " but am already in that state.");
    return;
  }

  // ACTIVE to STANDBY is never allowed
  if (avatar == Avatar.STANDBY) {
    String refusal = "Changing state from active to standby is not allowed." +
        "If you really want to pause your primary, put it in safemode.";
    LOG.warn(refusal);
    throw new IOException(refusal);
  }

  // STANDBY to ACTIVE
  // Check to see if the primary is somehow checkpointing itself. If so, then
  // refuse to switch to active mode. This check is not foolproof but is a
  // defensive mechanism to prevent administrator errors.
  try {
    if (!zkIsEmpty()) {
      throw new IOException("Can't switch the AvatarNode to primary since " +
          "zookeeper record is not clean. Either use shutdownAvatar to kill " +
          "the current primary and clean the ZooKeeper entry, " +
          "or clear out the ZooKeeper entry if the primary is dead");
    }
  } catch (Exception ex) {
    // note: this also wraps the IOException thrown just above
    throw new IOException("Cancelling setAvatar because of Exception", ex);
  }

  if (standby.hasStaleCheckpoint()) {
    String stale = "Failed to change avatar from " + currentAvatar +
        " to " + avatar +
        " because the Standby has not yet consumed all transactions.";
    LOG.warn(stale);
    throw new IOException(stale);
  }

  standby.quiesce();

  // stop the periodic cleaner and wait for it before the final cleanup
  cleaner.stop();
  cleanerThread.interrupt();
  try {
    cleanerThread.join();
  } catch (InterruptedException iex) {
    Thread.currentThread().interrupt();
  }
  clearInvalidates();

  super.namesystem.setSafeModeManualOverride(false);
  setSafeMode(SafeModeAction.SAFEMODE_LEAVE);

  LOG.info("Changed avatar from " + currentAvatar +
      " to " + avatar);
  currentAvatar = avatar;
}
/*
 * As the AvatarNode is running in Standby mode it fills up
 * invalidates queues for each datanode with blocks it
 * assumes have to be deleted. This information is not
 * entirely accurate and fills up memory as well as leads
 * to dataloss since those queues are flushed to the datanodes
 * on failover and valid blocks may be deleted.
 *
 * To help prevent filling up the memory we clear these queues
 * periodically. And we do a final cleanup just before switching
 * to primary.
 */
private class InvalidatesCleaner implements Runnable {
  // Cleared by stop(); volatile so the cleaner thread sees the update.
  volatile boolean running = true;

  /**
   * Clears the invalidates queues every INVALIDATES_CLEANUP_INTERVAL
   * milliseconds until stop() is called or the thread is interrupted.
   */
  @Override
  public void run() {
    while (running) {
      clearInvalidates();
      try {
        Thread.sleep(INVALIDATES_CLEANUP_INTERVAL);
      } catch (InterruptedException iex) {
        // An interrupt always ends the loop. Continuing with the interrupt
        // flag set would make every following sleep() throw immediately and
        // turn this loop into a busy spin that keeps taking the write lock.
        if (running) {
          // not triggered through stop(): keep the interrupt visible
          Thread.currentThread().interrupt();
        }
        return;
      }
    }
  }

  /** Asks the cleaner to exit; callers interrupt the thread to wake it up. */
  public void stop() {
    running = false;
  }
}
/**
 * Removes the invalidates queue of every datanode known to the namenode.
 * The queues are removed while holding the namesystem write lock. A failure
 * to obtain the datanode report is logged and otherwise ignored, since the
 * cleanup is attempted again on the next run.
 */
private void clearInvalidates() {
  try {
    DatanodeInfo[] nodes = super.getDatanodeReport(DatanodeReportType.ALL);
    assert namesystem.isInSafeMode();
    super.namesystem.writeLock();
    try {
      for (DatanodeInfo node : nodes) {
        super.namesystem.removeFromInvalidates(node.getStorageID());
      }
    } finally {
      super.namesystem.writeUnlock();
    }
  } catch (IOException e) {
    // Report through the logger instead of dumping the trace to stderr.
    LOG.error("Error clearing the invalidates queues", e);
  }
}
/**
 * Tells whether datanode messages should currently be ignored: only in
 * standby mode, and only while there is no Standby object or the Standby
 * reports that it fell behind.
 */
private boolean ignoreDatanodes() {
  if (currentAvatar != Avatar.STANDBY) {
    return false;
  }
  return standby == null || standby.fellBehind();
}
/**
 * Handles a block report from a datanode.
 *
 * @return {@code null} while the node is shutting down or not running, a
 *         BACKOFF command while datanodes are being ignored, otherwise the
 *         result of the underlying namenode's block report processing
 */
public DatanodeCommand blockReportNew(DatanodeRegistration nodeReg, BlockReport rep) throws IOException {
  final boolean stopping = runInfo.shutdown || !runInfo.isRunning;
  if (stopping) {
    return null;
  }
  if (!ignoreDatanodes()) {
    return super.blockReport(nodeReg, rep);
  }
  // Do not process block reports yet as the ingest thread is catching up
  LOG.info("Standby fell behind. Telling " + nodeReg.toString() +
      " to back off");
  return AvatarDatanodeCommand.BACKOFF;
}
/**
 * Handles a blockReceived message from a datanode.
 *
 * While datanodes are being ignored all blocks are handed back unprocessed.
 * In standby mode, blocks that are unknown to the blocks map or that do not
 * belong to any file are collected and returned so the datanode can retry
 * them later; the whole batch is still passed to the underlying namenode.
 *
 * @return the blocks the datanode should report again (empty while the
 *         namenode is shutting down)
 */
public Block[] blockReceivedNew(DatanodeRegistration nodeReg,
    Block blocks[],
    String delHints[]) throws IOException {
  if (runInfo.shutdown || !runInfo.isRunning) {
    // the namenode is shutting down: do not attempt to process blocks
    return new Block[0];
  }
  if (ignoreDatanodes()) {
    return blocks;
  }

  List<Block> retryLater = new ArrayList<Block>();
  if (currentAvatar == Avatar.STANDBY) {
    for (Block block : blocks) {
      namesystem.writeLock();
      try {
        BlockInfo stored = namesystem.blocksMap.getStoredBlock(block);
        boolean orphan = (stored == null) || (stored.getINode() == null);
        if (orphan) {
          // This block does not belong to any file: record it.
          LOG.info("blockReceived request received for "
              + block + " on " + nodeReg.getName()
              + " size " + block.getNumBytes()
              + " But it does not belong to any file."
              + " Retry later.");
          retryLater.add(block);
        }
      } finally {
        namesystem.writeUnlock();
      }
    }
  }

  super.blockReceived(nodeReg, blocks, delHints);
  return retryLater.toArray(new Block[retryLater.size()]);
}
/**
 * Returns the hostname:port of the AvatarNode. The host is that of the
 * underlying namenode; the port is read from dfs.avatarnode.port and
 * defaults to the namenode port plus one.
 */
public static InetSocketAddress getAddress(Configuration conf) {
  final InetSocketAddress namenodeAddr = NameNode.getAddress(conf);
  final int defaultPort = namenodeAddr.getPort() + 1;
  final int avatarPort = conf.getInt("dfs.avatarnode.port", defaultPort);
  return new InetSocketAddress(namenodeAddr.getHostName(), avatarPort);
}
/**
 * Prints the command line usage to stderr.
 */
private static void printUsage() {
  final StartupOption[] advertised = {
    StartupOption.STANDBY,
    StartupOption.SYNC,
    StartupOption.NODEZERO,
    StartupOption.NODEONE,
    StartupOption.FORMAT,
    StartupOption.UPGRADE,
    StartupOption.ROLLBACK,
    StartupOption.FINALIZE,
    StartupOption.IMPORT
  };
  StringBuilder usage = new StringBuilder("Usage: java AvatarNode [");
  for (int i = 0; i < advertised.length; i++) {
    if (i > 0) {
      usage.append("] | [");
    }
    usage.append(advertised[i].getName());
  }
  usage.append("]");
  System.err.println(usage.toString());
}
/**
 * Validates the parsed command line arguments: sync cannot be combined with
 * format or finalize.
 *
 * @throws IOException if an invalid combination was requested
 */
static void validateStartupOptions(StartupInfo startInfo) throws IOException {
  if (!startInfo.syncAtStartup) {
    return;
  }
  final StartupOption requested = startInfo.startOpt;
  if (requested != StartupOption.FORMAT &&
      requested != StartupOption.FINALIZE) {
    return;
  }
  String msg = "Option " + StartupOption.SYNC +
      " cannot be specified along with " +
      requested;
  LOG.warn(msg);
  throw new IOException(msg);
}
/**
 * Holder for the parsed command line arguments of an AvatarNode.
 */
private static class StartupInfo {
  /** The namenode startup option (regular, format, upgrade, ...). */
  StartupOption startOpt;
  /** Which of the two instances this node is. */
  InstanceId instance;
  /** True when the node was asked to start as standby. */
  boolean isStandby;
  /** True when sync at startup was requested. */
  boolean syncAtStartup;

  public StartupInfo(StartupOption startOpt, InstanceId instance,
      boolean isStandby, boolean syncAtStartup) {
    this.instance = instance;
    this.startOpt = startOpt;
    this.syncAtStartup = syncAtStartup;
    this.isStandby = isStandby;
  }
}
/**
 * Analyzes the command line options. Matching is case-insensitive; when an
 * option is given several times the last occurrence wins.
 *
 * @return the parsed options, or {@code null} if an argument is not a known
 *         option
 */
private static StartupInfo parseArguments(String args[]) {
  // Options that select the namenode startup mode, in matching order.
  final StartupOption[] modes = {
    StartupOption.FORMAT,
    StartupOption.FORMATFORCE,
    StartupOption.REGULAR,
    StartupOption.UPGRADE,
    StartupOption.ROLLBACK,
    StartupOption.FINALIZE,
    StartupOption.IMPORT
  };

  InstanceId instance = InstanceId.NODEZERO;
  StartupOption startOpt = StartupOption.REGULAR;
  boolean isStandby = false;
  boolean syncAtStartup = false;

  if (args != null) {
    for (String cmd : args) {
      if (StartupOption.STANDBY.getName().equalsIgnoreCase(cmd)) {
        isStandby = true;
        continue;
      }
      if (StartupOption.SYNC.getName().equalsIgnoreCase(cmd)) {
        syncAtStartup = true;
        continue;
      }
      if (StartupOption.NODEZERO.getName().equalsIgnoreCase(cmd)) {
        instance = InstanceId.NODEZERO;
        continue;
      }
      if (StartupOption.NODEONE.getName().equalsIgnoreCase(cmd)) {
        instance = InstanceId.NODEONE;
        continue;
      }

      StartupOption matched = null;
      for (StartupOption mode : modes) {
        if (mode.getName().equalsIgnoreCase(cmd)) {
          matched = mode;
          break;
        }
      }
      if (matched == null) {
        // unknown argument
        return null;
      }
      startOpt = matched;
    }
  }
  return new StartupInfo(startOpt, instance, isStandby, syncAtStartup);
}
/**
 * Stores the startup option in the configuration under
 * dfs.namenode.startup.
 */
private static void setStartupOption(Configuration conf, StartupOption opt) {
  final String optionName = opt.toString();
  conf.set("dfs.namenode.startup", optionName);
}
/**
 * Creates an AvatarNode from command line arguments, using a fresh
 * {@link RunInfo}.
 *
 * @param argv command line arguments
 * @param conf configuration to start from
 */
public static AvatarNode createAvatarNode(String argv[],
    Configuration conf) throws IOException {
  final RunInfo freshRunInfo = new RunInfo();
  return createAvatarNode(argv, conf, freshRunInfo);
}
/**
 * Creates an AvatarNode from command line arguments.
 *
 * The arguments are parsed and validated, the configuration is adjusted for
 * the selected instance, and the primary registration in ZooKeeper is
 * checked: a node may only start as primary if it is the one registered, and
 * a standby may only start if a primary is registered. The fsimage is then
 * synced if needed and the startup option is processed.
 *
 * @param argv command line arguments
 * @param conf configuration to start from; a new one is created if null
 * @param runInfo run-state flags handed to the new node
 * @return the new AvatarNode, or {@code null} if the arguments are invalid
 *         (usage is printed) or FORMATFORCE was processed
 * @throws IOException if the options are inconsistent, the ZooKeeper
 *         registration does not allow the start, or startup fails
 */
public static AvatarNode createAvatarNode(String argv[],
    Configuration conf,
    RunInfo runInfo) throws IOException {
  if (conf == null) {
    conf = new Configuration();
  }
  Configuration startupConf = conf; // save configuration at startup
  StartupInfo startInfo = parseArguments(argv);
  // parseArguments returns null for an unknown argument; check before
  // touching startInfo so that bad arguments print the usage instead of
  // failing with a NullPointerException.
  if (startInfo == null || startInfo.startOpt == null) {
    printUsage();
    return null;
  }
  StartupOption startOpt = startInfo.startOpt;
  setStartupOption(conf, startOpt);

  // sync cannot be specified along with format or finalize
  validateStartupOptions(startInfo);

  conf = updateAddressConf(conf, startInfo);

  // We need to check the zookeeper so that the node starting as active
  // is the one registered with the zookeeper
  // and if the node is starting as standby there has to be a master
  // already so that the node doesn't move the log and the image
  URI fsname = FileSystem.getDefaultUri(startupConf);
  URI actualName = FileSystem.getDefaultUri(conf);
  // NOTE(review): the client is never closed here - confirm whether
  // AvatarZooKeeperClient needs an explicit shutdown.
  AvatarZooKeeperClient zk = new AvatarZooKeeperClient(conf, null);
  boolean zkRegistryMatch = true;
  boolean primaryPresent = false;
  String errorMsg = null;
  try {
    Stat stat = new Stat();
    String zkRegistry = zk.getPrimaryAvatarAddress(fsname, stat, false);
    if (zkRegistry == null) {
      // The registry is empty. Usually this means failover is in progress
      // we need to manually fix it before starting primary
      errorMsg = "A zNode that indicates the primary is empty. "
          + "AvatarNode can only be started as primary if it "
          + "is registered as primary with ZooKeeper";
      zkRegistryMatch = false;
    } else {
      primaryPresent = true;
      if (!zkRegistry.equalsIgnoreCase(actualName.getAuthority())) {
        zkRegistryMatch = false;
        errorMsg = "Registration information in ZooKeeper doesn't "
            + "match the address of this node. AvatarNode can "
            + "only be started as primary if it is registered as "
            + "primary with ZooKeeper";
      }
    }
  } catch (Exception e) {
    LOG.error("Got Exception reading primary node registration "
        + "from ZooKeeper. Aborting the start", e);
    // keep a meaningful reason for the error logged below
    errorMsg = "Unable to read the primary node registration from ZooKeeper";
    zkRegistryMatch = false;
  }
  if (!zkRegistryMatch && !startInfo.isStandby) {
    LOG.error(errorMsg);
    throw new IOException("Cannot start this AvatarNode as Primary.");
  }
  if (!primaryPresent && startInfo.isStandby) {
    throw new IOException("Cannot start Standby since the " +
        "primary is unknown");
  }

  // If sync is requested, then we copy only the fsimage
  // (and not the transaction logs) from the other node.
  // If we are NODEONE, then modify the configuration to
  // set fs.name.dir, fs.default.name and dfs.http.address.
  //
  conf = copyFsImage(conf, startInfo);

  // namenode options.
  switch (startOpt) {
    case FORMAT:
      System.exit(format(conf, true) ? 1 : 0);
      return null; // not reached: System.exit does not return
    case FORMATFORCE:
      format(conf, false);
      return null;
    case FINALIZE:
      System.exit(finalize(conf, true) ? 1 : 0);
      return null; // not reached: System.exit does not return
    default:
  }
  return new AvatarNode(startupConf, conf,
      startInfo, runInfo);
}
/**
 * Tells whether ZooKeeper currently holds no primary registration for the
 * default filesystem URI of the startup configuration.
 *
 * @throws Exception any failure reading from ZooKeeper, after logging it
 */
private boolean zkIsEmpty() throws Exception {
  final URI fsname = FileSystem.getDefaultUri(startupConf);
  final AvatarZooKeeperClient zk =
      new AvatarZooKeeperClient(this.confg, null);
  final String registeredPrimary;
  try {
    registeredPrimary = zk.getPrimaryAvatarAddress(fsname, new Stat(), false);
  } catch (Exception e) {
    LOG.error("Got Exception reading primary node registration " +
        "from ZooKeeper.", e);
    throw e;
  }
  return registeredPrimary == null;
}
/**
 * Returns the configuration that should be used by this instance of
 * AvatarNode, after copying the fsimage and edits directories from the
 * shared device of the other instance when starting as standby.
 *
 * The shared image directories are read from dfs.name.dir.shared0/1 and the
 * shared edits directories from dfs.name.edits.dir.shared0/1. None of them
 * may also be listed in dfs.name.dir / dfs.name.edits.dir. In the returned
 * configuration the shared directory of this instance is put first in
 * dfs.name.dir and dfs.name.edits.dir.
 *
 * @param conf the configuration to read from; it is not modified
 * @param startInfo parsed startup arguments (instance id, standby flag)
 * @return a copy of {@code conf} with updated name and edits directories
 * @throws IOException if the shared directories are misconfigured, the
 *         instance is unknown, or a rename, copy or delete fails
 */
static Configuration copyFsImage(Configuration conf, StartupInfo startInfo)
    throws IOException {
  String img0 = conf.get("dfs.name.dir.shared0");
  String img1 = conf.get("dfs.name.dir.shared1");
  String edit0 = conf.get("dfs.name.edits.dir.shared0");
  String edit1 = conf.get("dfs.name.edits.dir.shared1");
  Collection<String> namedirs = conf.getStringCollection("dfs.name.dir");
  Collection<String> editsdir = conf.getStringCollection("dfs.name.edits.dir");

  // all four shared directories have to be configured
  String msg = "";
  if (img0 == null || img0.isEmpty()) {
    msg += "No values specified in dfs.name.dir.shared0";
  }
  if (img1 == null || img1.isEmpty()) {
    msg += " No values specified in dfs.name.dir.shared1";
  }
  if (edit0 == null || edit0.isEmpty()) {
    msg += " No values specified in dfs.name.edits.dir.shared0";
  }
  if (edit1 == null || edit1.isEmpty()) {
    msg += " No values specified in dfs.name.edits.dir.shared1";
  }
  if (msg.length() != 0) {
    LOG.info(msg);
    throw new IOException(msg);
  }

  // verify that the shared directories are not specified as dfs.name.dir
  for (String str : namedirs) {
    if (str.equalsIgnoreCase(img0)) {
      msg += "The name specified in dfs.name.dir.shared0 " +
          img0 + " is already part of dfs.name.dir ";
    }
    if (str.equalsIgnoreCase(img1)) {
      msg += " The name specified in dfs.name.dir.shared1 " +
          img1 + " is already part of dfs.name.dir ";
    }
  }
  if (msg.length() != 0) {
    LOG.info(msg);
    throw new IOException(msg);
  }

  // verify that the shared edits directories are not specified as
  // dfs.name.edits.dir
  for (String str : editsdir) {
    if (str.equalsIgnoreCase(edit0)) {
      msg += "The name specified in dfs.name.edits.dir.shared0 " +
          edit0 + " is already part of dfs.name.edits.dir ";
    }
    if (str.equalsIgnoreCase(edit1)) {
      msg += " The name specified in dfs.name.edits.dir.shared1 " +
          edit1 + " is already part of dfs.name.edits.dir ";
    }
  }
  if (msg.length() != 0) {
    LOG.info(msg);
    throw new IOException(msg);
  }

  // fail with a clear message instead of a NullPointerException below
  final boolean isNodeOne = startInfo.instance == InstanceId.NODEONE;
  if (!isNodeOne && startInfo.instance != InstanceId.NODEZERO) {
    throw new IOException("Unknown instance " + startInfo.instance);
  }

  String mdate = dateForm.format(new Date(now()));
  FileSystem localFs = FileSystem.getLocal(conf).getRaw();

  //
  // if we are instance one then copy from primary to secondary
  // otherwise copy from secondary to primary.
  //
  File src = new File(isNodeOne ? img0 : img1);
  File dest = new File(isNodeOne ? img1 : img0);
  File srcedit = new File(isNodeOne ? edit0 : edit1);
  File destedit = new File(isNodeOne ? edit1 : edit0);

  // copy fsimage directory if needed
  if (src.exists() && startInfo.isStandby) {
    backupAndCopy(conf, localFs, src, dest, mdate);

    // Remove the lock file from the newly synced directory
    new File(dest, STORAGE_FILE_LOCK).delete();

    // Remove fsimage.ckpt if it exists.
    new File(dest.toString() + IMAGENEW).delete();

    // Now, copy from the now-updated shared directory to all other
    // local dirs specified in dfs.name.dir
    for (String str : namedirs) {
      backupAndCopy(conf, localFs, dest, new File(str), mdate);
    }
  }

  // copy edits directory if needed
  if (srcedit.exists() && startInfo.isStandby) {
    backupAndCopy(conf, localFs, srcedit, destedit, mdate);

    // Remove the lock file from the newly synced directory
    File lockfile = new File(destedit, STORAGE_FILE_LOCK);
    if (lockfile.exists() && !lockfile.delete()) {
      throw new IOException("Unable to delete lock file " + lockfile);
    }

    // Remove edits and edits.new. Create empty edits file.
    File efile = new File(destedit.toString() + EDITSFILE);
    if (efile.exists() && !efile.delete()) {
      throw new IOException("Unable to delete edits file " + efile);
    }
    new File(destedit + EDITSNEW).delete();
    createEditsFile(destedit.toString());

    // Now, copy from the now-updated shared directory to all other
    // local dirs specified in dfs.name.edits.dir
    for (String str : editsdir) {
      backupAndCopy(conf, localFs, destedit, new File(str), mdate);
    }
  }

  // allocate a new configuration and update the name and edits directories.
  // The shared device should be the first in the list.
  Configuration newconf = new Configuration(conf);
  newconf.set("dfs.name.dir",
      joinSharedFirst(isNodeOne ? img1 : img0, namedirs));
  newconf.set("dfs.name.edits.dir",
      joinSharedFirst(isNodeOne ? edit1 : edit0, editsdir));
  return newconf;
}

/**
 * Moves an existing {@code dest} aside (renamed with the {@code mdate}
 * suffix, followed by a cleanup of old backups) and copies {@code src} to
 * {@code dest}.
 */
private static void backupAndCopy(Configuration conf, FileSystem localFs,
    File src, File dest, String mdate) throws IOException {
  if (dest.exists()) {
    File tmp = new File(dest + File.pathSeparator + mdate);
    if (!dest.renameTo(tmp)) {
      throw new IOException("Unable to rename " + dest +
          " to " + tmp);
    }
    cleanupBackup(conf, dest);
    LOG.info("Moved aside " + dest + " as " + tmp);
  }
  if (!FileUtil.copy(localFs, new Path(src.toString()),
      localFs, new Path(dest.toString()),
      false, conf)) {
    String msg = "Error copying " + src + " to " + dest;
    LOG.error(msg);
    throw new IOException(msg);
  }
  LOG.info("Copied " + src + " into " + dest);
}

/** Builds a comma separated directory list with {@code shared} first. */
private static String joinSharedFirst(String shared, Collection<String> dirs) {
  StringBuilder joined = new StringBuilder(shared);
  for (String dir : dirs) {
    joined.append(",").append(dir);
  }
  return joined.toString();
}
/**
 * Deletes old backups of {@code origin}. Backups are the siblings of
 * {@code origin} whose name starts with its name and ends with a timestamp
 * in the {@code dateForm} format after the first File.pathSeparator.
 *
 * A backup is deleted only if it is not among the
 * standby.image.copies.tokeep most recent ones AND it is older than
 * standby.image.days.tokeep days. When both settings are 0 nothing is
 * deleted.
 *
 * @param conf configuration holding the retention settings
 * @param origin the directory whose backups are cleaned up
 */
static void cleanupBackup(Configuration conf, File origin) {
  int copiesToKeep = conf.getInt("standby.image.copies.tokeep", 0);
  int daysToKeep = conf.getInt("standby.image.days.tokeep", 0);
  if (copiesToKeep == 0 && daysToKeep == 0) {
    // Do not delete anything in this case
    return;
  }

  File root = origin.getParentFile();
  if (root == null) {
    LOG.warn("Cannot clean up backups of " + origin +
        " since it has no parent directory");
    return;
  }
  final String originName = origin.getName();
  String[] backups = root.list(new FilenameFilter() {
    @Override
    public boolean accept(File dir, String name) {
      return name.startsWith(originName)
          && !name.equals(originName)
          && parseBackupDate(name) != null;
    }
  });
  if (backups == null) {
    // File.list returns null if root is not a directory or on an I/O error
    LOG.warn("Cannot list " + root + " to clean up backups of " + origin);
    return;
  }

  // Sorting in reverse order, from later dates to earlier
  Arrays.sort(backups, new Comparator<String>() {
    @Override
    public int compare(String back1, String back2) {
      Date date1 = parseBackupDate(back1);
      Date date2 = parseBackupDate(back2);
      if (date1 == null || date2 == null) {
        return 0;
      }
      return date2.compareTo(date1);
    }
  });

  // computed in long: the int product overflows from 25 days on
  final long maxAgeMs = daysToKeep * 24L * 60L * 60L * 1000L;
  final long nowMs = now();
  int copies = 0;
  for (String backup : backups) {
    copies++;
    Date backupDate = parseBackupDate(backup);
    if (backupDate == null) {
      // should not happen because of the way the list is constructed
      continue;
    }
    long backupAge = nowMs - backupDate.getTime();
    if (copies > copiesToKeep && backupAge > maxAgeMs) {
      // This backup is both old and we have enough of newer backups stored -
      // delete
      File stale = new File(root, backup);
      try {
        FileUtil.fullyDelete(stale);
        LOG.info("Deleted backup " + stale);
      } catch (IOException iex) {
        LOG.error("Error deleting backup " + stale, iex);
      }
    }
  }
}

/**
 * Parses the timestamp following the first File.pathSeparator of a backup
 * name; returns {@code null} if it is not in the {@code dateForm} format.
 */
private static Date parseBackupDate(String name) {
  try {
    return dateForm.parse(
        name.substring(name.indexOf(File.pathSeparator) + 1));
  } catch (ParseException pex) {
    return null;
  }
}
/**
 * Returns a copy of {@code conf} in which fs.default.name, dfs.http.address
 * and dfs.namenode.dn-address are replaced by their instance specific
 * variants (the same key suffixed with 0 for NODEZERO or 1 for NODEONE),
 * for each variant that is set.
 *
 * @param conf the configuration to read from; it is not modified
 * @param startInfo parsed startup arguments carrying the instance id
 */
static Configuration updateAddressConf(Configuration conf,
    StartupInfo startInfo) {
  Configuration newconf = new Configuration(conf);

  // if we are starting as the other namenode, then change the
  // default URL to make the namenode attach to the appropriate URL
  final String suffix;
  if (startInfo.instance == InstanceId.NODEZERO) {
    suffix = "0";
  } else if (startInfo.instance == InstanceId.NODEONE) {
    suffix = "1";
  } else {
    return newconf;
  }

  final String[] addressKeys = {
    "fs.default.name", "dfs.http.address", "dfs.namenode.dn-address"
  };
  for (String key : addressKeys) {
    String instanceValue = conf.get(key + suffix);
    if (instanceValue != null) {
      newconf.set(key, instanceValue);
    }
  }
  return newconf;
}
/**
 * Returns the address of the remote namenode.
 *
 * The remote namenode is the one this instance is not: "fs.default.name1"
 * is consulted for NODEZERO and "fs.default.name0" for NODEONE. When that
 * key is set, the address is resolved from a copy of {@code conf} whose
 * "fs.default.name" is replaced by it; otherwise {@code conf} is used as is.
 *
 * @param conf the configuration to read; it is not modified
 * @return the address computed by {@code NameNode.getAddress}
 * @throws IOException if this instance is neither NODEZERO nor NODEONE
 */
InetSocketAddress getRemoteNamenodeAddress(Configuration conf)
    throws IOException {
  final String remoteKey;
  if (instance == InstanceId.NODEONE) {
    remoteKey = "fs.default.name0";
  } else if (instance == InstanceId.NODEZERO) {
    remoteKey = "fs.default.name1";
  } else {
    throw new IOException("Unknown instance " + instance);
  }
  String remoteFs = conf.get(remoteKey);
  if (remoteFs == null) {
    return NameNode.getAddress(conf);
  }
  Configuration remoteConf = new Configuration(conf);
  remoteConf.set("fs.default.name", remoteFs);
  return NameNode.getAddress(remoteConf);
}
/**
 * Returns the configured http address of the remote namenode.
 *
 * For NODEZERO this is the value of "dfs.http.address1", for NODEONE the
 * value of "dfs.http.address0".
 *
 * @param conf the configuration to read
 * @return the configured value, as returned by {@code conf.get}
 * @throws IOException if this instance is neither NODEZERO nor NODEONE
 */
String getRemoteNamenodeHttpName(Configuration conf)
    throws IOException {
  final boolean isNodeZero = (instance == InstanceId.NODEZERO);
  if (!isNodeZero && instance != InstanceId.NODEONE) {
    throw new IOException("Unknown instance " + instance);
  }
  String remoteKey = isNodeZero ? "dfs.http.address1" : "dfs.http.address0";
  return conf.get(remoteKey);
}
/**
 * Creates an empty edits log: a file at {@code editDir + EDITSFILE} that
 * contains only {@code FSConstants.LAYOUT_VERSION} written as an int.
 *
 * @param editDir path prefix to which {@code EDITSFILE} is appended
 * @throws IOException if the file cannot be opened or written
 */
static void createEditsFile(String editDir) throws IOException {
  File editfile = new File(editDir + EDITSFILE);
  FileOutputStream fp = new FileOutputStream(editfile);
  try {
    DataOutputBuffer buf = new DataOutputBuffer(1024);
    try {
      buf.writeInt(FSConstants.LAYOUT_VERSION);
      buf.writeTo(fp);
    } finally {
      buf.close();
    }
  } finally {
    // always release the file handle, even when writing failed
    fp.close();
  }
}
/**
 * Returns the edits file of the remote NameNode.
 *
 * The directory is read from "dfs.name.edits.dir.shared1" for NODEZERO and
 * from "dfs.name.edits.dir.shared0" for NODEONE, and {@code EDITSFILE} is
 * appended to it.
 *
 * @param conf the configuration to read
 * @return the file; its existence is not checked
 * @throws IOException if this instance is neither NODEZERO nor NODEONE
 */
File getRemoteEditsFile(Configuration conf) throws IOException {
  final boolean isNodeZero = (instance == InstanceId.NODEZERO);
  if (!isNodeZero && instance != InstanceId.NODEONE) {
    String msg = "Instance is invalid. " + instance;
    LOG.info(msg);
    throw new IOException(msg);
  }
  String sharedDir = conf.get(isNodeZero
      ? "dfs.name.edits.dir.shared1"
      : "dfs.name.edits.dir.shared0");
  return new File(sharedDir + EDITSFILE);
}
/**
 * Returns the edits.new file of the remote NameNode.
 *
 * The directory is read from "dfs.name.edits.dir.shared1" for NODEZERO and
 * from "dfs.name.edits.dir.shared0" for NODEONE, and {@code EDITSNEW} is
 * appended to it.
 *
 * @param conf the configuration to read
 * @return the file; its existence is not checked
 * @throws IOException if this instance is neither NODEZERO nor NODEONE
 */
File getRemoteEditsFileNew(Configuration conf) throws IOException {
  final String sharedDirKey;
  if (instance == InstanceId.NODEONE) {
    sharedDirKey = "dfs.name.edits.dir.shared0";
  } else if (instance == InstanceId.NODEZERO) {
    sharedDirKey = "dfs.name.edits.dir.shared1";
  } else {
    String msg = "Instance is invalid. " + instance;
    LOG.info(msg);
    throw new IOException(msg);
  }
  String sharedDir = conf.get(sharedDirKey);
  return new File(sharedDir + EDITSNEW);
}
/**
 * Returns the fstime file of the remote NameNode.
 *
 * The directory is read from "dfs.name.edits.dir.shared1" for NODEZERO and
 * from "dfs.name.edits.dir.shared0" for NODEONE, and {@code TIMEFILE} is
 * appended to it.
 *
 * @param conf the configuration to read
 * @return the file; its existence is not checked
 * @throws IOException if this instance is neither NODEZERO nor NODEONE
 */
File getRemoteTimeFile(Configuration conf) throws IOException {
  final boolean isNodeOne = (instance == InstanceId.NODEONE);
  if (!isNodeOne && instance != InstanceId.NODEZERO) {
    String msg = "Instance is invalid. " + instance;
    LOG.info(msg);
    throw new IOException(msg);
  }
  String sharedDirKey = isNodeOne
      ? "dfs.name.edits.dir.shared0"
      : "dfs.name.edits.dir.shared1";
  String sharedDir = conf.get(sharedDirKey);
  return new File(sharedDir + TIMEFILE);
}
/**
 * Reads the timestamp of the last checkpoint from the remote fstime file.
 *
 * The file is located with {@link #getRemoteTimeFile(Configuration)} and the
 * timestamp is the first long stored in it.
 *
 * @param conf the configuration used to locate the remote fstime file
 * @return the long value read from the file
 * @throws IOException if this instance is neither NODEZERO nor NODEONE, or
 *         if the file cannot be read; in the latter case the message says
 *         whether the file is missing or unreadable and the original
 *         exception is attached as the cause
 */
long readRemoteFstime(Configuration conf)
    throws IOException {
  File timeFile = getRemoteTimeFile(conf);
  DataInputStream in = null;
  try {
    in = new DataInputStream(new FileInputStream(timeFile));
    return in.readLong();
  } catch (IOException e) {
    String msg = "Error reading checkpoint time file " + timeFile;
    if (!timeFile.exists()) {
      msg += " file does not exist.";
    } else if (!timeFile.canRead()) {
      msg += " cannot read file of size " + timeFile.length() +
          " last modified " +
          dateForm.format(new Date(timeFile.lastModified()));
    }
    LOG.error(msg);
    // keep the underlying failure both in the message and as the cause
    throw new IOException(msg + ": " + e, e);
  } finally {
    if (in != null) {
      in.close();
    }
  }
}
/**
 * Returns the starting checkpoint time of this AvatarNode.
 *
 * @return the current value of {@code startCheckpointTime}
 */
long getStartCheckpointTime() {
  return this.startCheckpointTime;
}
/**
 * Sets the starting checkpoint time of this AvatarNode to the timestamp
 * read from the remote fstime file.
 *
 * @param conf the configuration used to locate the remote fstime file
 * @throws IOException if the remote fstime file cannot be read
 */
void setStartCheckpointTime(Configuration conf)
    throws IOException {
  final long remoteCheckpointTime = readRemoteFstime(conf);
  this.startCheckpointTime = remoteCheckpointTime;
}
/**
 * Indicates that the AvatarNode should restart by raising the
 * {@code doRestart} flag of its {@code runInfo}.
 */
void doRestart() {
  this.runInfo.doRestart = true;
}
/**
 * Tells whether both the edits and the edits.new file of the remote
 * namenode exist.
 *
 * @param conf the configuration used to locate the remote files
 * @return true only if both files exist
 * @throws IOException if this instance is neither NODEZERO nor NODEONE
 */
boolean twoEditsFile(Configuration conf) throws IOException{
  final File remoteEdits = getRemoteEditsFile(conf);
  final File remoteEditsNew = getRemoteEditsFileNew(conf);
  if (!remoteEdits.exists()) {
    return false;
  }
  return remoteEditsNew.exists();
}
/**
 * Returns the size of the edits file of the remote namenode.
 *
 * @param conf the configuration used to locate the remote edits file
 * @return the value of {@link File#length()} for that file
 * @throws IOException if this instance is neither NODEZERO nor NODEONE
 */
long editSize(Configuration conf) throws IOException{
  final File remoteEdits = getRemoteEditsFile(conf);
  return remoteEdits.length();
}
/**
 * Current system time, as reported by {@link System#currentTimeMillis()}.
 *
 * @return current time in msec.
 */
static long now() {
  final long currentMillis = System.currentTimeMillis();
  return currentMillis;
}
/**
 * Formats the configured namespace and edits directories.
 *
 * When {@code isConfirmationNeeded} is set, the user is asked on stderr to
 * confirm the re-format of every namespace directory that already exists;
 * any answer whose first character is not 'Y' aborts the whole operation.
 * Directories that do not exist are not asked about.
 *
 * @param conf configuration naming the directories to format
 * @param isConfirmationNeeded whether to ask interactively before formatting
 * @return true if formatting was aborted, false otherwise
 * @throws IOException if "dfs.namenode.support.allowformat" is false, or if
 *         reading the answer or formatting fails
 */
private static boolean format(Configuration conf,
                              boolean isConfirmationNeeded
                              ) throws IOException {
  boolean allowFormat = conf.getBoolean("dfs.namenode.support.allowformat",
                                        true);
  if (!allowFormat) {
    throw new IOException("The option dfs.namenode.support.allowformat is "
                          + "set to false for this filesystem, so it "
                          + "cannot be formatted. You will need to set "
                          + "dfs.namenode.support.allowformat parameter "
                          + "to true in order to format this filesystem");
  }
  Collection<File> dirsToFormat = FSNamesystem.getNamespaceDirs(conf);
  Collection<File> editDirsToFormat =
      FSNamesystem.getNamespaceEditsDirs(conf);
  for (File curDir : dirsToFormat) {
    if (!curDir.exists()) {
      continue;
    }
    if (isConfirmationNeeded) {
      System.err.print("Re-format filesystem in " + curDir +" ? (Y or N) ");
      if (System.in.read() != 'Y') {
        System.err.println("Format aborted in "+ curDir);
        return true;
      }
      // discard the rest of the line (the enter-key); also stop at end of
      // stream, where read() keeps returning -1 and would loop forever
      int ch;
      do {
        ch = System.in.read();
      } while (ch != '\n' && ch != -1);
    }
  }
  FSNamesystem nsys = new FSNamesystem(new FSImage(dirsToFormat,
                                       editDirsToFormat), conf);
  nsys.dir.fsImage.format();
  return false;
}
/**
 * Finalizes a previous upgrade of the configured namespace directories.
 *
 * A warning describing the consequences is always printed on stderr. When
 * {@code isConfirmationNeeded} is set, the user is then asked to confirm;
 * any answer whose first character is not 'Y' aborts the operation.
 *
 * @param conf configuration naming the namespace and edits directories
 * @param isConfirmationNeeded whether to ask interactively before finalizing
 * @return true if finalization was aborted, false otherwise
 * @throws IOException if reading the answer or finalizing fails
 */
private static boolean finalize(Configuration conf,
                                boolean isConfirmationNeeded
                                ) throws IOException {
  Collection<File> dirsToFormat = FSNamesystem.getNamespaceDirs(conf);
  Collection<File> editDirsToFormat =
      FSNamesystem.getNamespaceEditsDirs(conf);
  FSNamesystem nsys = new FSNamesystem(new FSImage(dirsToFormat,
                                       editDirsToFormat), conf);
  System.err.print(
      "\"finalize\" will remove the previous state of the files system.\n"
      + "Recent upgrade will become permanent.\n"
      + "Rollback option will not be available anymore.\n");
  if (isConfirmationNeeded) {
    System.err.print("Finalize filesystem state ? (Y or N) ");
    if (System.in.read() != 'Y') {
      System.err.println("Finalize aborted.");
      return true;
    }
    // discard the rest of the line (the enter-key); also stop at end of
    // stream, where read() keeps returning -1 and would loop forever
    int ch;
    do {
      ch = System.in.read();
    } while (ch != '\n' && ch != -1);
  }
  nsys.dir.fsImage.finalizeUpgrade();
  return false;
}
/**
 * Holder of the three volatile run-state flags: whether a restart was
 * requested, whether a shutdown was requested, and whether the node is
 * running.
 */
public static class RunInfo {
  volatile boolean doRestart;
  volatile boolean shutdown;
  volatile boolean isRunning;

  /** Creates a RunInfo with every flag given explicitly. */
  public RunInfo(boolean doRestart, boolean shutdown, boolean isRunning) {
    this.doRestart = doRestart;
    this.shutdown = shutdown;
    this.isRunning = isRunning;
  }

  /**
   * Creates a RunInfo in its default state: no restart or shutdown
   * requested, and running.
   */
  public RunInfo() {
    this(false, false, true);
  }
}
/**
 * Entry point: creates an AvatarNode from the command line arguments and
 * waits on it, repeating for as long as a restart is requested through the
 * shared {@link RunInfo}.
 *
 * A failure that happens while a restart is requested is only logged. Any
 * other failure ends the loop and, after the RPC server has been stopped
 * if a shutdown was requested, makes the JVM exit with status 1.
 *
 * @param argv command line arguments, passed to {@code createAvatarNode}
 */
public static void main(String argv[]) throws Exception {
  Exception exception = null;
  AvatarNode avatarnode = null;
  RunInfo runInfo = new RunInfo();
  do {
    runInfo.doRestart = false;
    runInfo.isRunning = true;
    exception = null;
    try {
      StringUtils.startupShutdownMessage(AvatarNode.class, argv, LOG);
      avatarnode = createAvatarNode(argv, null, runInfo);
      if (avatarnode != null) {
        avatarnode.waitForRestart();
      }
    } catch (Throwable e) {
      String trace = StringUtils.stringifyException(e);
      LOG.error(trace);
      if (runInfo.doRestart) {
        LOG.error("AvatarNode restarting...");
      } else {
        exception = new Exception(trace);
      }
    }
  } while (runInfo.doRestart);
  // avatarnode is still null when no node was ever created, e.g. when
  // createAvatarNode returned null or failed on the first attempt
  if (runInfo.shutdown && avatarnode != null) {
    avatarnode.stopRPC();
  }
  if (exception != null) {
    LOG.fatal("Error running avatar", exception);
    Runtime.getRuntime().exit(1);
  }
}
}