/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.namenode;

import org.apache.commons.logging.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.DFSLocatedBlocks;
import org.apache.hadoop.hdfs.DFSInputStream;
import org.apache.hadoop.hdfs.protocol.*;
import org.apache.hadoop.hdfs.server.common.Storage.*;
import org.apache.hadoop.hdfs.server.namenode.BlocksMap.BlockInfo;
import org.apache.hadoop.hdfs.server.namenode.FSImage.CheckpointStates;
import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType;
import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
import org.apache.hadoop.hdfs.server.namenode.LeaseManager.*;
import org.apache.hadoop.hdfs.server.namenode.WaitingRoom.*;
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
import org.apache.hadoop.hdfs.server.protocol.SnapshotProtocol;
import org.apache.hadoop.hdfs.server.common.HdfsConstants;
import org.apache.hadoop.hdfs.server.common.Util;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.ipc.*;
import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.hdfs.util.LightWeightLinkedSet;

import java.io.*;
import java.net.*;
import java.util.*;
import java.util.concurrent.*;

/**********************************************************
 * The SnapshotNode is responsible for taking periodic
 * snapshots of the HDFS. The current design only allows
 * one SnapshotNode per cluster.
 *
 * The SnapshotNode is a daemon that periodically wakes
 * up (determined by the schedule specified in the configuration),
 * triggers a periodic snapshot and then goes back to sleep.
 * The SnapshotNode uses the Namesystem's jetty server to
 * retrieve files.
 *
 **********************************************************/
public class SnapshotNode implements SnapshotProtocol {

  public static final Log LOG = LogFactory.getLog(SnapshotNode.class);

  public static final String CURRENT_DIR = "/current";
  public static String SSNAME = "dfs_snapshot_"; // prefix of ss files

  private Configuration conf; // conf
  private String fileServer; // jetty image server namenode listens on
  private FileSystem dfs; // file system
  private String tempDir; // temp dir to download files from namenode
  private String ssDir; // path to store snapshots in

  private Daemon purgeThread; // waiting room purger thread
  private ExecutorService leaseUpdateThreadPool;
  private int maxLeaseUpdateThreads;

  private Server server; // RPC Server
  private InetSocketAddress serverAddress = null; // RPC server address

  private NamenodeProtocol namenode;
  private InetSocketAddress nameNodeAddr;

  /**
   * Creates the node and runs {@link #init()}. An {@link IOException}
   * raised during initialization is logged (with its stack trace) and
   * whatever was already started is shut down; the exception is not
   * propagated to the caller.
   *
   * @param conf configuration used for every setting read by this node
   */
  public SnapshotNode(Configuration conf) {
    try {
      this.conf = conf;
      init();
    } catch (IOException e) {
      // Keep the cause in the log; the message alone is not diagnosable.
      LOG.error("Failed to start SnapshotNode", e);
      shutdown();
    }
  }

  /**
   * Initialize SnapshotNode: resolves the snapshot and temp directories,
   * creates the snapshot directory in the file system if missing, starts
   * the waiting room purger, connects to the namenode and starts this
   * node's RPC server.
   *
   * @throws IOException
   */
  private void init() throws IOException {
    ssDir = conf.get("fs.snapshot.dir", "/.SNAPSHOT");
    tempDir = conf.get("fs.snapshot.tempdir", "/tmp/snapshot");

    fileServer = getImageServer();
    dfs = FileSystem.get(conf);

    Path ssPath = new Path(ssDir);
    if (!dfs.exists(ssPath)) {
      dfs.mkdirs(ssPath);
    }

    maxLeaseUpdateThreads = conf.getInt("fs.snapshot.leaseupdatethreads", 100);

    // Waiting room purge thread
    purgeThread = new Daemon((new WaitingRoom(conf)).getPurger());
    purgeThread.start();

    // Get namenode rpc connection
    nameNodeAddr = NameNode.getAddress(conf);
    namenode = (NamenodeProtocol) RPC.waitForProxy(NamenodeProtocol.class,
        NamenodeProtocol.versionID, nameNodeAddr, conf);

    // Snapshot RPC Server
    InetSocketAddress socAddr = SnapshotNode.getAddress(conf);
    int handlerCount = conf.getInt("fs.snapshot.handler.count", 10);
    server = RPC.getServer(this, socAddr.getHostName(), socAddr.getPort(),
        handlerCount, false, conf);

    // The rpc-server port can be ephemeral... ensure we have the correct info
    serverAddress = server.getListenerAddress();
    LOG.info("SnapshotNode up at: " + serverAddress);

    server.start(); // start rpc server
  }

  private static InetSocketAddress getAddress(String address) {
    return NetUtils.createSocketAddr(address);
  }

  /**
   * Returns the address the snapshot RPC server should bind to, read from
   * "fs.snapshot.server.address" and defaulting to localhost:60000.
   */
  public static InetSocketAddress getAddress(Configuration conf) {
    String nodeport = conf.get("fs.snapshot.server.address");
    if (nodeport == null) {
      nodeport = "localhost:" + 60000; // DEFAULT PORT
    }
    return getAddress(nodeport);
  }

  @Override
  public long getProtocolVersion(String protocol,
                                 long clientVersion) throws IOException {
    if (protocol.equals(SnapshotProtocol.class.getName())) {
      return SnapshotProtocol.versionID;
    }
    throw new IOException("Unknown protocol to snapshot node: " + protocol);
  }

  @Override
  public ProtocolSignature getProtocolSignature(String protocol,
      long clientVersion, int clientMethodsHash) throws IOException {
    return ProtocolSignature.getProtocolSignature(
        this, protocol, clientVersion, clientMethodsHash);
  }

  /**
   * Makes sure the local temp dir and its "current" sub-directory exist
   * and removes every file left in "current" by a previous download.
   *
   * @throws IOException if either path is not a directory or the contents
   *         of the "current" directory cannot be listed
   */
  void prepareDownloadDirs() throws IOException {
    // Check if temp dir exists
    File temp = new File(tempDir);
    if (!temp.exists()) {
      temp.mkdirs();
    }
    if (!temp.isDirectory()) {
      throw new IOException("Temp Dir: " + tempDir + " is not a directory.");
    }

    // Check if current dir in temp exists
    File current = new File(tempDir + CURRENT_DIR);
    if (!current.exists()) {
      current.mkdir();
    }
    if (!current.isDirectory()) {
      throw new IOException("Current in Temp Dir: " + tempDir + CURRENT_DIR +
          " is not a directory.");
    }

    // Delete all previously downloaded files.
    // File.listFiles() returns null on an I/O error; fail clearly
    // instead of hitting a NullPointerException in the loop.
    File[] stale = current.listFiles();
    if (stale == null) {
      throw new IOException("Could not list files in " + current);
    }
    for (File f : stale) {
      if (!f.delete()) {
        LOG.warn("Could not delete previously downloaded file " + f);
      }
    }
  }

  /**
   * Shutdown snapshot node and attached daemons
   */
  public void shutdown() {
    shutdownWaitingRoomPurger();
    // namenode/server are still null when init() failed early.
    if (namenode != null) {
      RPC.stopProxy(namenode);
    }
    if (server != null) {
      server.stop();
    }
  }

  /**
   * Shutdown only the waiting room purger; the RPC server and the
   * namenode proxy are left untouched.
   */
  public void shutdownWaitingRoomPurger() {
    if (purgeThread != null) {
      WaitingRoomPurger purger = (WaitingRoomPurger) purgeThread.getRunnable();
      purger.shutdown();
    }
  }

  // SNAPSHOT PROTOCOL //

  /**
   * Lists the ids of all snapshots stored in the snapshot directory. A
   * snapshot is any non-directory entry whose name starts with
   * {@link #SSNAME}; the id is the remainder of the name.
   *
   * @return snapshot ids, possibly empty
   * @throws FileNotFoundException if the snapshot directory is missing
   * @throws IOException if the snapshot path is not a directory
   */
  @Override
  public String[] listSnapshots() throws IOException {
    Path ssPath = new Path(ssDir);
    if (!dfs.exists(ssPath)) {
      throw new FileNotFoundException("Snapshot dir doesn't exist");
    }

    FileStatus ssStatus = dfs.getFileStatus(ssPath);
    if (!ssStatus.isDir()) {
      throw new IOException("ssDir " + ssDir + " is not a directory");
    }

    FileStatus[] files = dfs.listStatus(ssPath);
    if (files == null) {
      return new String[0];
    }

    // Use the same prefix every other method builds paths with, rather
    // than a second hard-coded copy of the string and its length.
    final String prefix = SSNAME;
    List<String> ssIds = new ArrayList<String>();

    // Separate snapshot files
    for (FileStatus ss : files) {
      if (ss.isDir()) {
        continue; // skips dirs
      }
      String name = ss.getPath().getName();
      if (name.startsWith(prefix)) {
        ssIds.add(name.substring(prefix.length()));
      }
    }

    return ssIds.toArray(new String[ssIds.size()]);
  }

  @Override
  public FileStatus getSnapshotFileStatus(String id) throws IOException {
    Path ss = new Path(ssDir + "/" + SSNAME + id);
    return dfs.getFileStatus(ss);
  }

  @Override
  public boolean deleteSnapshot(String id) throws IOException {
    Path fileToDelete = new Path(ssDir + "/" + SSNAME + id);
    return dfs.delete(fileToDelete, false);
  }

  /**
   * Loads the given snapshot image and returns located blocks for the file
   * at {@code path}, or for every file below it when it is a directory.
   *
   * @param snapshotId id of the snapshot to read
   * @param path file or directory inside the snapshot
   * @return one entry per file found
   * @throws IOException if the path does not exist in the snapshot or the
   *         image cannot be read
   */
  @Override
  public LocatedBlocksWithMetaInfo[] getLocatedBlocks(String snapshotId,
      String path) throws IOException {
    FSImage fsImage = new FSImage();
    FSNamesystem namesystem = new FSNamesystem(fsImage, conf);
    Path ssPath = new Path(ssDir + "/" + SSNAME + snapshotId);
    FSDataInputStream in = dfs.open(ssPath);

    try {
      fsImage.loadFSImage(new File(ssPath.toString()), in);

      INode inode = namesystem.dir.getInode(path);
      if (inode == null) {
        throw new IOException("File/dir at " + path +
            " does not exist in snapshot " + snapshotId);
      }

      List<LocatedBlocksWithMetaInfo> blocks =
          new ArrayList<LocatedBlocksWithMetaInfo>();
      // fill blocks with LocatedBlocks for all files
      getAllLocatedBlocks(inode, blocks);

      return blocks.toArray(new LocatedBlocksWithMetaInfo[blocks.size()]);
    } finally {
      // Release the image stream and the image on every path, including
      // the "does not exist" and load-failure paths.
      try {
        in.close();
      } finally {
        fsImage.close();
      }
    }
  }

  /**
   * Creates a snapshot: downloads image and edits from the namenode,
   * merges them, optionally refreshes the lengths of files under
   * construction, and writes the result to the snapshot directory.
   *
   * @param snapshotId id to store the snapshot under
   * @param updateLeases whether to update lengths of leased files
   * @throws IOException if any step fails
   */
  @Override
  public void createSnapshot(String snapshotId, boolean updateLeases)
      throws IOException {
    // Create new SnapshotStore
    SnapshotStorage ssStore =
        new SnapshotStorage(conf, Util.stringAsURI(tempDir));

    try {
      // Download image & edit files from namenode
      downloadSnapshotFiles(ssStore);

      // Merge image and edit files
      doMerge(ssStore);

      // Update file lengths for leased files (optional)
      if (updateLeases) {
        updateLeasedFiles(ssStore);
      }

      // Save snapshot
      saveSnapshot(ssStore, snapshotId);
    } finally {
      // Close the storage even when a step above fails.
      ssStore.close();
    }
  }

  /**
   * Recursively collects located blocks for {@code inode}: directories are
   * descended into, and each file contributes one entry whose datanode
   * information is filled in by the namenode.
   */
  private void getAllLocatedBlocks(INode inode,
      List<LocatedBlocksWithMetaInfo> blocks) throws IOException {
    if (inode.isDirectory()) {
      INodeDirectory dir = (INodeDirectory) inode;
      for (INode child : dir.getChildren()) {
        getAllLocatedBlocks(child, blocks);
      }
    } else {
      INodeFile file = (INodeFile) inode;
      BlockInfo[] fileBlocks = file.getBlocks();
      List<LocatedBlock> lb = new ArrayList<LocatedBlock>();
      for (BlockInfo block : fileBlocks) {
        // DatanodeInfo is unavailable, so set as empty for now
        lb.add(new LocatedBlock(block, new DatanodeInfo[0]));
      }

      LocatedBlocks locatedBlocks = new LocatedBlocks(
          file.computeContentSummary().getLength(), // flength
          lb, // blks
          false); // isUnderConstruction

      // Update DatanodeInfo from NN
      blocks.add(namenode.updateDatanodeInfo(locatedBlocks));
    }
  }

  /**
   * Writes the merged image to a temp file in the file system and then
   * renames it into the snapshot directory.
   *
   * @throws IOException if writing fails or the rename is refused
   */
  void saveSnapshot(SnapshotStorage ssStore, String id) throws IOException {
    // Create new snapshot in temp file
    Path tmpPath = new Path("/tmp/" + SSNAME + id);
    FSDataOutputStream out = dfs.create(tmpPath);
    try {
      ssStore.saveSnapshot(tmpPath.toString(), out);
    } finally {
      // Do not leak the output stream when saving fails.
      out.close();
    }

    // Rename snapshot
    Path ssPath = new Path(ssDir + "/" + SSNAME + id);
    if (!dfs.rename(tmpPath, ssPath)) {
      throw new IOException("Could not rename temp snapshot file");
    }
  }

  /**
   * Merges the downloaded image and edit files held by {@code ssStore}.
   */
  void doMerge(SnapshotStorage ssStore) throws IOException {
    // The instance itself is not used here, but the constructor call is
    // kept: updateLeasedFiles() later calls ssStore.getFSNamesystem(),
    // which presumably relies on this association -- confirm before removing.
    new FSNamesystem(ssStore, conf);
    ssStore.doMerge();
  }

  /**
   * Create a snapshot with id equals to
   * current system time.
   */
  void createSnapshot() throws IOException {
    createSnapshot(Long.toString(System.currentTimeMillis()), true);
  }

  /**
   * Create a snapshot with the given id, updating leased file lengths.
   */
  void createSnapshot(String id) throws IOException {
    createSnapshot(id, true);
  }

  /**
   * Tries to get the most up to date lengths of files under construction.
   * Every leased path is handed to a {@link LeaseUpdateWorker}; blocks the
   * workers could not refresh are then looked up at the namenode.
   *
   * @throws IOException if the workers do not finish within 20 minutes,
   *         the calling thread is interrupted, or the namenode call fails
   */
  void updateLeasedFiles(SnapshotStorage ssStore) throws IOException {
    FSNamesystem fsNamesys = ssStore.getFSNamesystem();
    List<Block> blocksForNN = new ArrayList<Block>();

    // A ThreadPoolExecutor only grows beyond its core size when its queue
    // rejects a task, and an unbounded LinkedBlockingQueue never does. With
    // a core size of 1 the pool would therefore run a single thread. Use
    // the configured maximum as the core size and let idle threads expire.
    int poolSize = Math.max(1, maxLeaseUpdateThreads);
    ThreadPoolExecutor pool = new ThreadPoolExecutor(poolSize, poolSize,
        60, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>());
    pool.allowCoreThreadTimeOut(true);
    leaseUpdateThreadPool = pool;

    boolean finished = false;
    try {
      // Try to update lengths for leases from DN
      LightWeightLinkedSet<Lease> sortedLeases =
          fsNamesys.leaseManager.getSortedLeases();
      Iterator<Lease> itr = sortedLeases.iterator();
      while (itr.hasNext()) {
        Lease lease = itr.next();
        for (String path : lease.getPaths()) {
          // Update file lengths using worker threads to increase throughput
          pool.execute(new LeaseUpdateWorker(conf, path, fsNamesys, blocksForNN));
        }
      }

      pool.shutdown();
      // Wait till update tasks finish successfully (max 20 mins?)
      if (!pool.awaitTermination(1200, TimeUnit.SECONDS)) {
        throw new IOException("Updating lease files failed");
      }
      finished = true;
    } catch (InterruptedException e) {
      // Restore the interrupt flag for callers and keep the cause.
      Thread.currentThread().interrupt();
      throw new IOException(
          "Snapshot creation interrupted while updating leased files", e);
    } finally {
      if (!finished) {
        // Do not leave workers running after a timeout or failure.
        pool.shutdownNow();
      }
    }

    // Fetch block lengths for renamed/deleted leases from NN
    long[] blockIds = new long[blocksForNN.size()];
    for (int i = 0; i < blocksForNN.size(); ++i) {
      blockIds[i] = blocksForNN.get(i).getBlockId();
    }

    long[] lengths = namenode.getBlockLengths(blockIds);

    for (int i = 0; i < blocksForNN.size(); ++i) {
      if (lengths[i] == -1) {
        // Couldn't update block length, keep preferred length
        LOG.error("Couldn't update length for block " + blocksForNN.get(i));
      } else {
        blocksForNN.get(i).setNumBytes(lengths[i]);
      }
    }
  }

  /**
   * Download fsimage, edits and edits.new files from the name-node.
   * Files will be downloaded in CURRENT_DIR. The download is repeated
   * until the checkpoint signature read after the download has the same
   * checkpoint time as the one read before it and its state is not
   * UPLOAD_DONE.
   * @throws IOException
   */
  void downloadSnapshotFiles(SnapshotStorage ssStore) throws IOException {
    CheckpointSignature start = namenode.getCheckpointSignature();
    ssStore.storage.setStorageInfo(start);
    CheckpointSignature end = null;
    boolean success;

    do {
      // Clear temp files
      prepareDownloadDirs();

      // get fsimage
      File[] srcNames = ssStore.getImageFiles();
      assert srcNames.length == 1 : "No snapshot temporary dir.";
      TransferFsImage.downloadImageToStorage(fileServer,
          HdfsConstants.INVALID_TXID, ssStore, true, srcNames);
      LOG.info("Downloaded file " + srcNames[0].getName() + " size " +
          srcNames[0].length() + " bytes.");

      // get edits file
      srcNames = ssStore.getEditsFiles();
      assert srcNames.length == 1 : "No snapshot temporary dir.";
      TransferFsImage.downloadEditsToStorage(fileServer,
          new RemoteEditLog(), ssStore, false);
      LOG.info("Downloaded file " + srcNames[0].getName() + " size " +
          srcNames[0].length() + " bytes.");

      // get edits.new file (only if in the middle of ckpt)
      try {
        srcNames = ssStore.getEditsNewFiles();
        assert srcNames.length == 1 : "No snapshot temporary dir.";
        TransferFsImage.downloadEditsToStorage(fileServer,
            new RemoteEditLog(), ssStore, true);
        LOG.info("Downloaded file " + srcNames[0].getName() + " size " +
            srcNames[0].length() + " bytes.");
      } catch (FileNotFoundException ignored) {
        // do nothing: a missing edits.new is tolerated (see comment above)
      }

      end = namenode.getCheckpointSignature();

      // Are the downloaded files consistent?
      success = end.checkpointTime == start.checkpointTime &&
          end.checkpointState != CheckpointStates.UPLOAD_DONE;
      start = end;
    } while (!success);
  }

  /**
   * Returns the jetty image server that the Namenode is listening on.
   * @throws IOException if the default file system is not "hdfs"
   */
  private String getImageServer() throws IOException {
    URI fsName = FileSystem.getDefaultUri(conf);
    if (!"hdfs".equals(fsName.getScheme())) {
      throw new IOException("This is not a DFS");
    }
    return NetUtils.getServerAddress(conf, "dfs.info.bindAddress",
        "dfs.info.port", "dfs.http.address");
  }

  /**
   * FSImage rooted at the local temp directory; used to merge the
   * downloaded image and edits and to write the merged snapshot out.
   */
  static class SnapshotStorage extends FSImage {
    Configuration conf;
    File tempDir;
    DataOutputStream out;

    public SnapshotStorage(Configuration conf, URI tempDir) throws IOException {
      super(tempDir);
      this.conf = conf;
      this.tempDir = new File(tempDir.getPath());
    }

    /**
     * Merge image and edit log (in memory).
     * Files to merge include fsimage, edits, and possibly edits.new
     * @throws IOException
     */
    void doMerge() throws IOException {
      StorageDirectory sdTemp = null;
      Iterator<StorageDirectory> it = dirIterator(NameNodeDirType.IMAGE_AND_EDITS);
      if (it.hasNext()) {
        sdTemp = it.next();
      } else {
        throw new IOException("Could not locate snapshot temp directory.");
      }

      loadFSImage(NNStorage.getStorageFile(sdTemp, NameNodeFile.IMAGE));
      Collection<EditLogInputStream> editStreams =
          new ArrayList<EditLogInputStream>();
      EditLogInputStream is = new EditLogFileInputStream(
          NNStorage.getStorageFile(sdTemp, NameNodeFile.EDITS));
      editStreams.add(is);
      File editsNew = NNStorage.getStorageFile(sdTemp, NameNodeFile.EDITS_NEW);
      if (editsNew.exists()) {
        is = new EditLogFileInputStream(editsNew);
        editStreams.add(is);
      }
      loadEdits(editStreams);
    }

    /**
     * Writes snapshot to the OutputStream.
     * @param dest destination name handed to saveFSImage
     * @param out Stream to write snapshot to
     */
    void saveSnapshot(String dest, DataOutputStream out) throws IOException {
      saveFSImage(dest, out);
    }
  }

  /**
   * Refreshes the length of the last block of one leased file. Static
   * because it uses no state of the enclosing SnapshotNode instance.
   */
  private static class LeaseUpdateWorker implements Runnable {
    String path;
    Configuration conf;
    List<Block> blocks;
    FSNamesystem fsNamesys;

    public LeaseUpdateWorker(Configuration conf, String path,
        FSNamesystem namesystem, List<Block> blocks) {
      this.path = path;
      this.conf = conf;
      this.blocks = blocks;
      this.fsNamesys = namesystem;
    }

    @Override
    public void run() {
      DFSClient client = null;
      try {
        INodeFile node = null;
        boolean usable = false;

        try {
          client = new DFSClient(conf);

          LOG.info("Trying to update lease for file at " + path);

          // verify that path exists in namespace
          node = fsNamesys.dir.getFileINode(path);
          // Check for null first: a missing inode must not be
          // dereferenced by the under-construction test.
          usable = node != null && node.isUnderConstruction();
        } catch (IOException e) {
          LOG.error(StringUtils.stringifyException(e));
        }

        // Could not find inode in FSNamespace, quit now
        if (!usable) {
          LOG.error("Couldn't update length for leased file at " + path +
              " because file not in namespace");
          return;
        }

        BlockInfo[] blks = node.getBlocks();

        // If NN has not leased out any block, return
        if (blks.length == 0) {
          return;
        }

        int index = blks.length - 1; // index of last file block

        LOG.info("Block at index " + index + " being written for file at " +
            path);

        if (updateLastBlockFromDataNode(client, blks, index)) {
          return;
        }

        // If file was renamed/deleted, set block length to preferred size
        // and add it to list of blocks which we should try to update from NN
        LOG.info("Couldn't update block " + blks[index] + " for file " +
            "at " + path + " from DN. Setting length to preferred length " +
            "and queuing block to be checked from NN for updated length.");
        blks[index].setNumBytes(node.getPreferredBlockSize());
        synchronized (blocks) {
          blocks.add(blks[index]);
        }
      } finally {
        // The client is released on every exit path, including the
        // early returns above.
        closeClient(client);
      }
    }

    /**
     * Pessimistically update last block length from DataNode.
     * File could have been renamed, and a new file created in its place,
     * so the length is only taken over when the block ids match.
     *
     * @return true if the length of {@code blks[index]} was updated
     */
    private boolean updateLastBlockFromDataNode(DFSClient client,
        BlockInfo[] blks, int index) {
      DFSInputStream stm = null;
      try {
        stm = client.open(path);
        DFSLocatedBlocks locBlks = stm.fetchLocatedBlocks();

        if (locBlks.locatedBlockCount() >= blks.length
            && blks[index] != null
            && locBlks.get(index) != null
            && blks[index].getBlockId() ==
                locBlks.get(index).getBlock().getBlockId()) {
          blks[index].setNumBytes(locBlks.get(index).getBlock().getNumBytes());
          return true;
        }
      } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
      } finally {
        if (stm != null) {
          try {
            stm.close();
          } catch (IOException e) {
            LOG.warn(StringUtils.stringifyException(e));
          }
        }
      }
      return false;
    }

    /** Closes the dfs client, logging instead of propagating failures. */
    private void closeClient(DFSClient client) {
      if (client == null) {
        return;
      }
      try {
        client.close(); // close dfs client
      } catch (IOException e) {
        LOG.warn(StringUtils.stringifyException(e));
      }
    }
  }
}