/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hdfs.server.datanode; import java.nio.channels.FileChannel; import java.io.DataInputStream; import java.io.File; import java.io.FileDescriptor; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InterruptedIOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Random; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.locks.ReentrantReadWriteLock; import javax.management.NotCompliantMBeanException; import javax.management.ObjectName; import javax.management.StandardMBean; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.DF; import org.apache.hadoop.fs.DU; import org.apache.hadoop.fs.DU.NamespaceSliceDU; import org.apache.hadoop.fs.FileUtil; import 
org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.FSConstants; import org.apache.hadoop.hdfs.server.common.Storage; import org.apache.hadoop.hdfs.server.common.GenerationStamp; import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory; import org.apache.hadoop.hdfs.server.datanode.BlockInlineChecksumReader.GenStampAndChecksum; import org.apache.hadoop.hdfs.server.datanode.DataNode; import org.apache.hadoop.hdfs.server.datanode.DirectoryScanner.ScanDifference; import org.apache.hadoop.hdfs.server.datanode.NamespaceMap.BlockBucket; import org.apache.hadoop.hdfs.server.datanode.metrics.FSDatasetMBean; import org.apache.hadoop.hdfs.server.protocol.BlockFlags; import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryInfo; import org.apache.hadoop.hdfs.server.protocol.InterDatanodeProtocol; import org.apache.hadoop.hdfs.util.LightWeightHashSet; import org.apache.hadoop.io.nativeio.NativeIO; import org.apache.hadoop.metrics.util.MBeanUtil; import org.apache.hadoop.util.DataChecksum; import org.apache.hadoop.util.VersionInfo; import org.apache.hadoop.util.DiskChecker; import org.apache.hadoop.util.DiskChecker.DiskErrorException; import org.apache.hadoop.util.DiskChecker.DiskOutOfSpaceException; /************************************************** * FSDataset manages a set of data blocks. Each block * has a unique name and an extent on disk. 
* ***************************************************/ public class FSDataset implements FSConstants, FSDatasetInterface { public static final Log LOG = LogFactory.getLog(FSDataset.class); interface FSDatasetDeltaInterface { void addBlock(int namespaceId, Block block); void removeBlock(int namespaceId, Block block); void updateBlock(int namespaceId, Block oldBlock, Block newBlock); } static String[] getFileNames(File[] files) { String[] fileNames = new String[files.length]; for (int i = 0; i < files.length; i++) { fileNames[i] = files[i].getName(); } return fileNames; } static Block getBlockFromNames(File blockFiles[], String[] blockFilesNames, int index) throws IOException { if (Block.isSeparateChecksumBlockFilename(blockFilesNames[index])) { long genStamp = BlockWithChecksumFileReader .getGenerationStampFromSeperateChecksumFile(blockFilesNames, blockFilesNames[index]); return new Block(blockFiles[index], blockFiles[index].length(), genStamp); } else if (Block.isInlineChecksumBlockFilename(blockFilesNames[index])) { // TODO: We might want to optimize it. GenStampAndChecksum sac = BlockInlineChecksumReader .getGenStampAndChecksumFromInlineChecksumFile(blockFilesNames[index]); long blockLengh = BlockInlineChecksumReader.getBlockSizeFromFileLength( blockFiles[index].length(), sac.checksumType, sac.bytesPerChecksum); return new Block(blockFiles[index], blockLengh, sac.generationStamp); } return null; } /** * A NamespaceSlice represents a portion of a namespace stored on a volume. * Taken together, all BNamespaceSlices sharing a namespaceID across a * cluster represent a single namespace. 
*/ class NamespaceSlice { private final int namespaceId; private final FSVolume volume; // volume to which this namespaceSlice belongs to private final FSDir dataDir; // StorageDirectory/current/nsid/current/finalized private final File detachDir; // directory store Finalized replica private final File rbwDir ; // directory store RBW replica private final File tmpDir; // directory store Temporary replica private final NamespaceSliceDU dfsUsage; private volatile boolean blockCrcFileLoaded; /** * * @param namespaceId * @param volume {@link FSVolume} to which this NamespaceSlice belongs to * @param nsDir directory corresponding to the NameSpaceSlice * @param conf * @throws IOException */ NamespaceSlice(int namespaceId, FSVolume volume, File nsDir, Configuration conf, boolean supportAppends) throws IOException { this.namespaceId = namespaceId; this.volume = volume; File nsDirCur = new File(nsDir, DataStorage.STORAGE_DIR_CURRENT); File dataDirFile = new File(nsDirCur, DataStorage.STORAGE_DIR_FINALIZED); this.dataDir = new FSDir(namespaceId, dataDirFile, volume); this.detachDir = new File(nsDir, "detach"); if (detachDir.exists()) { recoverDetachedBlocks(dataDirFile, detachDir); } // Files that were being written when the datanode was last shutdown // are now moved back to the data directory. It is possible that // in the future, we might want to do some sort of datanode-local // recovery for these blocks. For example, crc validation. 
// this.tmpDir = new File(nsDir, "tmp"); if (tmpDir.exists()) { // rename tmpDir to prepare delete File toDeleteDir = new File(tmpDir.getParent(), DELETE_FILE_EXT + tmpDir.getName()); if (tmpDir.renameTo(toDeleteDir)) { // asyncly delete the renamed directory asyncDiskService.deleteAsyncFile(volume, toDeleteDir); } else { // rename failed, let's synchronously delete the directory FileUtil.fullyDelete(tmpDir); DataNode.LOG.warn("Deleted " + tmpDir.getPath()); } } this.rbwDir = new File(nsDirCur, DataStorage.STORAGE_DIR_RBW); // Files that were being written when the datanode was last shutdown // should not be deleted if append mode is enabled. if (rbwDir.exists()) { recoverBlocksBeingWritten(rbwDir); } if (!rbwDir.mkdirs()) { if (!rbwDir.isDirectory()) { throw new IOException("Mkdirs failed to create " + rbwDir.toString()); } } if (!tmpDir.mkdirs()) { if (!tmpDir.isDirectory()) { throw new IOException("Mkdirs failed to create " + tmpDir.toString()); } } if (!detachDir.mkdirs()) { if (!detachDir.isDirectory()) { throw new IOException("Mkdirs failed to create " + detachDir.toString()); } } this.dfsUsage = volume.dfsUsage.addNamespace(namespaceId, nsDir, conf); this.blockCrcFileLoaded = false; } void getBlockInfo(LightWeightHashSet<Block> blocks) throws IOException{ dataDir.getBlockInfo(blocks); } boolean isBlockCrcFileLoaded() { return blockCrcFileLoaded; } void setBlockCrcFileLoaded(boolean blockCrcFileLoaded) { this.blockCrcFileLoaded = blockCrcFileLoaded; } /** * Recover detached files on datanode restart. If a detached block * does not exist in the original directory, then it is moved to the * original directory. 
*/ private void recoverDetachedBlocks(File dataDir, File dir) throws IOException { File contents[] = dir.listFiles(); if (contents == null) { return; } for (int i = 0; i < contents.length; i++) { if (!contents[i].isFile()) { throw new IOException ("Found " + contents[i] + " in " + dir + " but it is not a file."); } // // If the original block file still exists, then no recovery // is needed. // File blk = new File(dataDir, contents[i].getName()); if (!blk.exists()) { if (!contents[i].renameTo(blk)) { throw new IOException("Unable to recover detached file " + contents[i]); } continue; } if (!contents[i].delete()) { throw new IOException("Unable to cleanup detached file " + contents[i]); } } } void getBlocksBeingWrittenInfo(LightWeightHashSet<Block> blockSet) throws IOException { if (rbwDir == null) { return; } File[] blockFiles = rbwDir.listFiles(); if (blockFiles == null) { return; } String[] blockFileNames = getFileNames(blockFiles); for (int i = 0; i < blockFiles.length; i++) { if (!blockFiles[i].isDirectory()) { // get each block in the rbwDir directory Block block = FSDataset.getBlockFromNames(blockFiles, blockFileNames, i); if (block != null) { // add this block to block set blockSet.add(block); if (DataNode.LOG.isDebugEnabled()) { DataNode.LOG.debug("recoverBlocksBeingWritten for block " + block); } } } } } /** * Recover blocks that were being written when the datanode * was earlier shut down. These blocks get re-inserted into * ongoingCreates. Also, send a blockreceived message to the NN * for each of these blocks because these are not part of a * block report. 
*/ private void recoverBlocksBeingWritten(File bbw) throws IOException { FSDir fsd = new FSDir(namespaceId, bbw, this.volume); LightWeightHashSet<BlockAndFile> blockSet = new LightWeightHashSet<BlockAndFile>(); fsd.getBlockAndFileInfo(blockSet); for (BlockAndFile b : blockSet) { File f = b.pathfile; // full path name of block file lock.writeLock().lock(); try { boolean isInlineChecksum = Block.isInlineChecksumBlockFilename(f .getName()); int checksumType = DataChecksum.CHECKSUM_UNKNOWN; int bytesPerChecksum = -1; if (isInlineChecksum) { GenStampAndChecksum sac = BlockInlineChecksumReader .getGenStampAndChecksumFromInlineChecksumFile(f.getName()); checksumType = sac.checksumType; bytesPerChecksum = sac.bytesPerChecksum; } DatanodeBlockInfo binfo = new DatanodeBlockInfo(volume, f, DatanodeBlockInfo.UNFINALIZED, true, isInlineChecksum, checksumType, bytesPerChecksum, false, 0); volumeMap.add(namespaceId, b.block, binfo); volumeMap.addOngoingCreates(namespaceId, b.block, new ActiveFile( binfo, true, ActiveFile.UNKNOWN_SIZE, false)); } finally { lock.writeLock().unlock(); } if (DataNode.LOG.isDebugEnabled()) { DataNode.LOG.debug("recoverBlocksBeingWritten for block " + b.block + "namespaceId: "+namespaceId); } } } File getDirectory() { return dataDir.getDirectory().getParentFile(); } File getCurrentDir() { return dataDir.getDirectory(); } File getRbwDir() { return rbwDir; } void decDfsUsed(long value) { dfsUsage.decDfsUsed(value); } long getDfsUsed() throws IOException { return dfsUsage.getUsed(); } /** * Temporary files. They get moved to the finalized block directory when * the block is finalized. 
*/ File createTmpFile(Block b) throws IOException { File f = new File(tmpDir, b.getBlockName()); return FSDataset.createTmpFile(b, f); } File createDetachFile(Block b) throws IOException { File f = new File(detachDir, b.getBlockName()); return FSDataset.createTmpFile(b, f); } File getTmpFile(Block b) throws IOException { File f = new File(tmpDir, b.getBlockName()); return f; } /** * Temporary files. They get moved to the finalized block directory when * the block is finalized. */ File createTmpFile(Block b, boolean replicationRequest, boolean inlineChecksum, int checksumType, int bytesPerChecksum) throws IOException { File f= null; String fileName; if (inlineChecksum) { fileName = BlockInlineChecksumWriter.getInlineChecksumFileName(b, checksumType, bytesPerChecksum); } else { fileName = b.getBlockName(); } if (!replicationRequest) { f = new File(rbwDir, fileName); } else { f = new File(tmpDir, fileName); } return FSDataset.createTmpFile(b, f); } /** * RBW files. They get moved to the finalized block directory when * the block is finalized. 
*/ File createRbwFile(Block b) throws IOException { File f = new File(rbwDir, b.getBlockName()); return FSDataset.createTmpFile(b, f); } File addBlock(Block b, File f, boolean inlineChecksum, int checksumType, int bytesPerChecksum) throws IOException { File blockFile = dataDir.addBlock(namespaceId, b, f, inlineChecksum, checksumType, bytesPerChecksum); long spaceAdded; if (!inlineChecksum) { File metaFile = BlockWithChecksumFileWriter.getMetaFile(blockFile , b); spaceAdded = b.getNumBytes() + metaFile.length(); } else { spaceAdded = blockFile.length(); } dfsUsage.incDfsUsed(spaceAdded); return blockFile; } void checkDirs() throws DiskErrorException { dataDir.checkDirTree(); DiskChecker.checkDir(tmpDir); DiskChecker.checkDir(detachDir); DiskChecker.checkDir(rbwDir); } void clearPath(File f) { dataDir.clearPath(f); } public String toString() { return dataDir.getDirectory().getAbsolutePath(); } public void shutdown() { volume.dfsUsage.removeNamespace(namespaceId); } } /** * A data structure than encapsulates a Block along with the full pathname * of the block file */ static class BlockAndFile implements Comparable<BlockAndFile> { final Block block; final File pathfile; BlockAndFile(File fullpathname, Block block) { this.pathfile = fullpathname; this.block = block; } public int compareTo(BlockAndFile o) { return this.block.compareTo(o.block); } } /** * A node type that can be built into a tree reflecting the * hierarchy of blocks on the local disk. 
*/ class FSDir { File dir; int numBlocks = 0; volatile FSDir childrenDirs[]; int lastChildIdx = 0; File getDirectory(){ return dir; } FSDir[] getChildren() { return childrenDirs; } public FSDir() { } public FSDir(int namespaceId, File dir) throws IOException{ this(namespaceId, dir, null); } public FSDir(int namespaceId, File dir, FSVolume volume) throws IOException { this.dir = dir; this.childrenDirs = null; if (!dir.exists()) { if (!dir.mkdirs()) { throw new IOException("Mkdirs failed to create " + dir.toString()); } } else { File[] files = dir.listFiles(); String[] filesNames = getFileNames(files); int numChildren = 0; for (int i = 0; i < files.length; i++) { File file = files[i]; String fileName = filesNames[i]; if (isPendingDeleteFilename(fileName)){ // Should not cause throwing an exception. // Obsolete files are not included in the block report. asyncDiskService.deleteAsyncFile(volume, file); } else if (file.isDirectory()) { numChildren++; } else if (Block.isSeparateChecksumBlockFilename(fileName)) { numBlocks++; if (volume != null) { long blkSize = file.length(); long genStamp = BlockWithChecksumFileReader .getGenerationStampFromSeperateChecksumFile(filesNames, fileName); volumeMap.add(namespaceId, new Block(file, blkSize, genStamp), new DatanodeBlockInfo(volume, file, blkSize, true, false, DataChecksum.CHECKSUM_UNKNOWN, -1, false, 0)); } } else if (Block.isInlineChecksumBlockFilename(fileName)) { numBlocks++; if (volume != null) { GenStampAndChecksum sac = BlockInlineChecksumReader .getGenStampAndChecksumFromInlineChecksumFile(fileName); long blkSize = BlockInlineChecksumReader .getBlockSizeFromFileLength(file.length(), sac.checksumType, sac.bytesPerChecksum); volumeMap.add(namespaceId, new Block(file, blkSize, sac.generationStamp), new DatanodeBlockInfo(volume, file, blkSize, true, true, sac.checksumType, sac.bytesPerChecksum, false, 0)); } } } if (numChildren > 0) { FSDir[] newChildren = new FSDir[numChildren]; int curdir = 0; for (int idx = 0; idx < 
files.length; idx++) { String fileName = files[idx].getName(); if (files[idx].isDirectory() && !isPendingDeleteFilename(fileName)) { newChildren[curdir] = new FSDir(namespaceId, files[idx], volume); curdir++; } } childrenDirs = newChildren; } } } public File addBlock(int namespaceId, Block b, File src, boolean inlineChecksum, int checksumType, int bytesPerChecksum) throws IOException { //First try without creating subdirectories File file = addBlock(namespaceId, b, src, false, false, inlineChecksum, checksumType, bytesPerChecksum); return (file != null) ? file : addBlock(namespaceId, b, src, true, true, inlineChecksum, checksumType, bytesPerChecksum); } private File addBlock(int namespaceId, Block b, File src, boolean createOk, boolean resetIdx, boolean inlineChecksum, int checksumType, int bytesPerChecksum) throws IOException { if (numBlocks < maxBlocksPerDir) { File dest; if (!inlineChecksum) { dest = new File(dir, b.getBlockName()); File metaData = BlockWithChecksumFileWriter.getMetaFile( src, b ); File newmeta = BlockWithChecksumFileWriter.getMetaFile(dest, b); if ( ! metaData.renameTo( newmeta )) { throw new IOException("could not move file " + metaData.getAbsolutePath() + " to " + newmeta.getAbsolutePath()); } if (DataNode.LOG.isDebugEnabled()) { DataNode.LOG.debug("addBlock: Moved " + metaData + " to " + newmeta); } } else { dest = new File(dir, BlockInlineChecksumWriter.getInlineChecksumFileName(b, checksumType, bytesPerChecksum)); } if (! src.renameTo( dest ) ) { throw new IOException( "could not move files for " + b + " from tmp to " + dest.getAbsolutePath() ); } // fsyncIfPossible parent directory to persist rename. 
if (datanode.syncOnClose) { NativeIO.fsyncIfPossible(dest.getParent()); } if (DataNode.LOG.isDebugEnabled()) { DataNode.LOG.debug("addBlock: Moved " + src + " to " + dest); } numBlocks += 1; return dest; } FSDir[] children = this.getChildren(); if (lastChildIdx < 0 && resetIdx) { //reset so that all children will be checked lastChildIdx = random.nextInt(children.length); } if (lastChildIdx >= 0 && children != null) { //Check if any child-tree has room for a block. for (int i=0; i < children.length; i++) { int idx = (lastChildIdx + i)%children.length; File file = children[idx].addBlock(namespaceId, b, src, false, resetIdx, inlineChecksum, checksumType, bytesPerChecksum); if (file != null) { lastChildIdx = idx; return file; } } lastChildIdx = -1; } if (!createOk) { return null; } if (children == null || children.length == 0) { // make sure children is immutable once initialized. FSDir[] newChildren = new FSDir[maxBlocksPerDir]; for (int idx = 0; idx < maxBlocksPerDir; idx++) { newChildren[idx] = new FSDir(namespaceId, new File(dir, DataStorage.BLOCK_SUBDIR_PREFIX + idx)); } childrenDirs = children = newChildren; } //now pick a child randomly for creating a new set of subdirs. lastChildIdx = random.nextInt(children.length); return children[lastChildIdx].addBlock(namespaceId, b, src, true, false, inlineChecksum, checksumType, bytesPerChecksum); } /** * Populate the given blockSet with any child blocks * found at this node. 
* @throws IOException */ public void getBlockInfo(LightWeightHashSet<Block> blockSet) throws IOException { FSDir[] children = this.getChildren(); if (children != null) { for (int i = 0; i < children.length; i++) { children[i].getBlockInfo(blockSet); } } File blockFiles[] = dir.listFiles(); String[] blockFilesNames = getFileNames(blockFiles); for (int i = 0; i < blockFiles.length; i++) { Block block = getBlockFromNames(blockFiles, blockFilesNames, i); if (block != null) { blockSet.add(block); } } } /** * Populate the given blockSet with any child blocks * found at this node. With each block, return the full path * of the block file. * @throws IOException */ void getBlockAndFileInfo(LightWeightHashSet<BlockAndFile> blockSet) throws IOException { FSDir[] children = this.getChildren(); if (children != null) { for (int i = 0; i < children.length; i++) { children[i].getBlockAndFileInfo(blockSet); } } File blockFiles[] = dir.listFiles(); String[] blockFilesNames = getFileNames(blockFiles); for (int i = 0; i < blockFiles.length; i++) { Block block = getBlockFromNames(blockFiles, blockFilesNames, i); if (block != null) { blockSet.add(new BlockAndFile(blockFiles[i].getAbsoluteFile(), block)); } } } /** * check if a data directory is healthy * @throws DiskErrorException */ public void checkDirTree() throws DiskErrorException { DiskChecker.checkDir(dir); FSDir[] children = this.getChildren(); if (children != null) { for (int i = 0; i < children.length; i++) { children[i].checkDirTree(); } } } void clearPath(File f) { String root = dir.getAbsolutePath(); String dir = f.getAbsolutePath(); if (dir.startsWith(root)) { String[] dirNames = dir.substring(root.length()). split(File.separator + "subdir"); if (clearPath(f, dirNames, 1)) return; } clearPath(f, null, -1); } /* * dirNames is an array of string integers derived from * usual directory structure data/subdirN/subdirXY/subdirM ... * If dirName array is non-null, we only check the child at * the children[dirNames[idx]]. 
This avoids iterating over * children in common case. If directory structure changes * in later versions, we need to revisit this. */ private boolean clearPath(File f, String[] dirNames, int idx) { if ((dirNames == null || idx == dirNames.length) && dir.compareTo(f) == 0) { numBlocks--; return true; } FSDir[] children = this.getChildren(); if (dirNames != null) { //guess the child index from the directory name if (idx > (dirNames.length - 1) || children == null) { return false; } int childIdx; try { childIdx = Integer.parseInt(dirNames[idx]); } catch (NumberFormatException ignored) { // layout changed? we could print a warning. return false; } return (childIdx >= 0 && childIdx < children.length) ? children[childIdx].clearPath(f, dirNames, idx+1) : false; } //guesses failed. back to blind iteration. if (children != null) { for(int i=0; i < children.length; i++) { if (children[i].clearPath(f, null, -1)){ return true; } } } return false; } public String toString() { FSDir[] children = this.getChildren(); return "FSDir{" + "dir=" + dir + ", children=" + (children == null ? null : Arrays.asList(children)) + "}"; } } /** * A map from namespace ID to NamespaceSlice object * * Only three operations are supported: add a namespace, remove a namespace * and get a snapshot of the list of the namespace map, which is an immutable * object. * * No extra locking is allowed in this object */ class NamespaceMap { /** * Any object referred here needs to be immutable. Every time this map is * updated, a new map is created and the reference here is changed to the * new map. */ private Map<Integer, NamespaceSlice> namespaceMap = new HashMap<Integer, NamespaceSlice>();; /** * It is the only method a caller is supposed to access namespaceMap. This * method will return a immutable map. It is a snapshot. 
* * @return */ private synchronized Map<Integer, NamespaceSlice> getNamespaceMapSnapshot() { return namespaceMap; } public synchronized void addNamespace(int namespaceId, NamespaceSlice ns) throws IOException { // add a new name-space by copying all the entries to a new map. Map<Integer, NamespaceSlice> newMap = new HashMap<Integer, NamespaceSlice>( namespaceMap); newMap.put(namespaceId, ns); namespaceMap = newMap; } public synchronized void removeNamespace(int namespaceId) { Map<Integer, NamespaceSlice> newMap = new HashMap<Integer, NamespaceSlice>( namespaceMap); newMap.remove(namespaceId); namespaceMap = newMap; } } public class FSVolume { private final NamespaceMap namespaceMap; private final File currentDir; // <StorageDirectory>/current private final DF usage; private final long reserved; private final FSDataset dataset; private DU dfsUsage; private final ExecutorService nativeIOExecutor; FSVolume(FSDataset dataset, File currentDir, Configuration conf) throws IOException { this.currentDir = currentDir; File parent = currentDir.getParentFile(); this.usage = new DF(parent, conf); this.reserved = usage.getReserved(); this.dataset = dataset; this.namespaceMap = new NamespaceMap(); this.dfsUsage = new DU(currentDir, conf); this.dfsUsage.start(); this.nativeIOExecutor = Executors.newSingleThreadExecutor(); } public Future<?> submitNativeIOTask(Runnable task) { return nativeIOExecutor.submit(task); } /** * It is the only method a caller is supposed to access namespaceMap. * This method will return a immutable map. It is a snapshot. 
* @return */ private Map<Integer, NamespaceSlice> getNamespaceMapSnapshot() { return namespaceMap.getNamespaceMapSnapshot(); } NamespaceSlice getNamespaceSlice(int namespaceId){ return getNamespaceMapSnapshot().get(namespaceId); } /** Return storage directory corresponding to the volume */ public File getDir() { return currentDir.getParentFile(); } public File getBlockCrcFile(int namespaceId) { NamespaceSlice ns = getNamespaceSlice(namespaceId); if (ns == null) { return null; } return new File(ns.getDirectory(), Storage.STORAGE_BLOCK_CRC); } public File getBlockCrcTmpFile(int namespaceId) { NamespaceSlice ns = getNamespaceSlice(namespaceId); if (ns == null) { return null; } return new File(ns.getDirectory(), Storage.STORAGE_TMP_BLOCK_CRC); } public File getCurrentDir() { return currentDir; } public File getRbwDir(int namespaceId) throws IOException { NamespaceSlice ns = getNamespaceSlice(namespaceId); return ns.getRbwDir(); } void setNamespaceBlockCrcLoaded(int namespaceId, boolean loaded) { NamespaceSlice ns = getNamespaceSlice(namespaceId); if (ns != null) { ns.setBlockCrcFileLoaded(loaded); } } boolean isNamespaceBlockCrcLoaded(int namespaceId) { NamespaceSlice ns = getNamespaceSlice(namespaceId); if (ns != null) { return ns.isBlockCrcFileLoaded(); } else { // if the namespace is not added return false; } } void decDfsUsed(int namespaceId, long value) { // this lock is put in FSVolume since it is called only ReplicaFileDeleteWork NamespaceSlice ns = getNamespaceSlice(namespaceId); if (ns != null) { ns.decDfsUsed(value); } } long getDfsUsed() throws IOException { long dfsUsed = 0; for (NamespaceSlice ns : getNamespaceMapSnapshot().values()) { dfsUsed += ns.getDfsUsed(); } return dfsUsed; } long getNSUsed(int namespaceId) throws IOException { return getNamespaceMapSnapshot().get(namespaceId).getDfsUsed(); } long getCapacity() throws IOException { if (reserved > usage.getCapacity()) { return 0; } return usage.getCapacity()-reserved; } long getAvailable() throws 
IOException { long remaining = getCapacity()-getDfsUsed(); long available = usage.getAvailable(); if (remaining>available) { remaining = available; } return (remaining > 0) ? remaining : 0; } long getReserved() { return this.reserved; } String getMount() throws IOException { return usage.getMount(); } String getFileSystem() throws IOException { return usage.getFilesystem(); } File addBlock(int namespaceId, Block b, File f, boolean inlineChecksum, int checksumType, int bytesPerChecksum) throws IOException { NamespaceSlice ns = getNamespaceSlice(namespaceId); return ns.addBlock(b, f, inlineChecksum, checksumType, bytesPerChecksum); } void checkDirs() throws DiskErrorException { for (NamespaceSlice ns : getNamespaceMapSnapshot().values()) { ns.checkDirs(); } } /** * Temporary files. They get moved to the finalized block directory when * the block is finalized. */ File createTmpFile(int namespaceId, Block b) throws IOException { NamespaceSlice ns = getNamespaceSlice(namespaceId); return ns.createTmpFile(b); } File getTmpFile(int namespaceId, Block b) throws IOException { NamespaceSlice ns = getNamespaceSlice(namespaceId); return ns.getTmpFile(b); } /** * Temporary files. They get moved to the finalized block directory when * the block is finalized. */ File createTmpFile(int namespaceId, Block b, boolean replicationRequest, boolean inlineChecksum, int checksumType, int bytesPerChecksum) throws IOException { NamespaceSlice ns = getNamespaceSlice(namespaceId); return ns.createTmpFile(b, replicationRequest, inlineChecksum, checksumType, bytesPerChecksum); } /** * Files used for copy-on-write. They need recovery when datanode * restarts. 
*/ File createDetachFile(int namespaceId, Block b, String filename) throws IOException { NamespaceSlice ns = getNamespaceSlice(namespaceId); return ns.createDetachFile(b); } public void addNamespace(int namespaceId, String nsDir, Configuration conf, boolean supportAppends) throws IOException { File nsdir = new File(currentDir, nsDir); NamespaceSlice ns = new NamespaceSlice(namespaceId, this, nsdir, conf, supportAppends); namespaceMap.addNamespace(namespaceId, ns); } void getBlocksBeingWrittenInfo(int namespaceId, LightWeightHashSet<Block> blockSet) throws IOException { NamespaceSlice ns = getNamespaceSlice(namespaceId); if (ns == null) { return; } ns.getBlocksBeingWrittenInfo(blockSet); return; } public void shutdownNamespace(int namespaceId) { NamespaceSlice ns = getNamespaceSlice(namespaceId); if (ns != null) { this.namespaceMap.removeNamespace(namespaceId); ns.shutdown(); } } void getBlockInfo(int namespaceId, LightWeightHashSet<Block> blockSet) throws IOException { NamespaceSlice ns = getNamespaceSlice(namespaceId); ns.getBlockInfo(blockSet); return; } public void shutdown() { for (NamespaceSlice ns : getNamespaceMapSnapshot().values()) { ns.shutdown(); } dfsUsage.shutdown(); nativeIOExecutor.shutdownNow(); } void clearPath(int namespaceId, File f) throws IOException{ NamespaceSlice ns = getNamespaceSlice(namespaceId); ns.clearPath(f); return; } public String toString() { return currentDir.getAbsolutePath(); } } /** * This class maintain a list of FSVolume objects. * Only three operations are supported: add volumes, remove volumes, * and get a snapshot of the list of the volumes, which is an immutable * object. 
*/ static class FSVolumeList { volatile FSVolume[] fsVolumes = null; public FSVolumeList(FSVolume[] volumes) { fsVolumes = volumes; } public synchronized void addVolumes(FSVolume[] volArray) { if (volArray == null || volArray.length == 0) { return; } int size = fsVolumes.length + volArray.length; FSVolume fsvs[] = new FSVolume[size]; int idx = 0; for (; idx < fsVolumes.length; idx++) { fsvs[idx] = fsVolumes[idx]; } for (; idx < size; idx++) { fsvs[idx] = volArray[idx - fsVolumes.length]; } fsVolumes = fsvs; } public synchronized void removeVolumes(List<FSVolume> removed_vols) { // repair array - copy non null elements int removed_size = (removed_vols == null) ? 0 : removed_vols.size(); if (removed_size > 0) { FSVolume fsvs[] = new FSVolume[fsVolumes.length - removed_size]; for (int idx = 0, idy = 0; idx < fsVolumes.length; idx++) { if (!removed_vols.contains(fsVolumes[idx])) { fsvs[idy] = fsVolumes[idx]; idy++; } } fsVolumes = fsvs; // replace array of volumes } } public FSVolume[] getVolumeListSnapshot() { return fsVolumes; } } static class FSVolumeSet { final FSVolumeList volumeList; int curVolume = 0; ExecutorService scannersExecutor; boolean supportAppends; private FSVolumeSet(FSVolume[] volumes, int threads, boolean supportAppends) { this.volumeList = new FSVolumeList(volumes); this.supportAppends = supportAppends; if (threads > 1) { scannersExecutor = Executors.newFixedThreadPool(threads); } } public boolean isValidDir(File currentDir) { FSVolume[] volumes = this.getVolumes(); for (int idx = 0; idx < volumes.length; idx++) { if (volumes[idx].getCurrentDir().equals(currentDir)) { return true; } } return false; } protected void addVolumes(FSVolume[] volArray) { volumeList.addVolumes(volArray); } protected int numberOfVolumes() { return getVolumes().length; } public FSVolume[] getVolumes() { return volumeList.getVolumeListSnapshot(); } boolean isValid(FSVolume volume) { for (FSVolume vol : volumeList.getVolumeListSnapshot()) { if (vol == volume) { return true; } 
} return false; } private FSVolume getNextVolume(long blockSize) throws IOException { FSVolume[] volumes = this.getVolumes(); if(volumes.length < 1) { throw new DiskOutOfSpaceException("No more available volumes"); } // since volumes could've been removed because of the failure // make sure we are not out of bounds if (curVolume >= volumes.length) { curVolume = 0; } int startVolume = curVolume; while (true) { FSVolume volume = volumes[curVolume]; curVolume = (curVolume + 1) % volumes.length; if (volume.getAvailable() > blockSize) { return volume; } if (curVolume == startVolume) { throw new DiskOutOfSpaceException( "Insufficient space for an additional block"); } } } private long getDfsUsed() throws IOException { long dfsUsed = 0L; FSVolume[] volumes = this.getVolumes(); for (int idx = 0; idx < volumes.length; idx++) { dfsUsed += volumes[idx].getDfsUsed(); } return dfsUsed; } private long getNSUsed(int namespaceId) throws IOException { long dfsUsed = 0L; FSVolume[] volumes = this.getVolumes(); for (int idx = 0; idx < volumes.length; idx++) { dfsUsed += volumes[idx].getNSUsed(namespaceId); } return dfsUsed; } private long getCapacity() throws IOException { long capacity = 0L; FSVolume[] volumes = this.getVolumes(); for (int idx = 0; idx < volumes.length; idx++) { capacity += volumes[idx].getCapacity(); } return capacity; } private long getRemaining() throws IOException { long remaining = 0L; FSVolume[] volumes = this.getVolumes(); for (int idx = 0; idx < volumes.length; idx++) { remaining += volumes[idx].getAvailable(); } return remaining; } private void getBlocksBeingWrittenInfo(int namespaceId, LightWeightHashSet<Block> blockSet) throws IOException { long startTime = System.currentTimeMillis(); FSVolume[] volumes = this.getVolumes(); if (scannersExecutor != null) { synchronized(scannersExecutor) { List<Future<LightWeightHashSet<Block>>> builders = new ArrayList<Future<LightWeightHashSet<Block>>>(); for (int idx = 0; idx < volumes.length; idx++) { 
builders.add(scannersExecutor .submit(new BlocksBeingWrittenInfoBuilder(volumes[idx], namespaceId))); } for (Future<LightWeightHashSet<Block>> future : builders) { try { blockSet.addAll(future.get()); } catch (ExecutionException ex) { DataNode.LOG.error( "Error generating block being written info from volumes ", ex.getCause()); throw new IOException(ex); } catch (InterruptedException iex) { DataNode.LOG.error( "Error waiting for generating block being written info", iex); throw new IOException(iex); } } } } else { for (int idx = 0; idx < volumes.length; idx++) { volumes[idx].getBlocksBeingWrittenInfo(namespaceId, blockSet); } } long scanTime = (System.currentTimeMillis() - startTime)/1000; DataNode.LOG.info("Finished generating blocks being written report for " + volumes.length + " volumes in " + scanTime + " seconds"); } private void getBlockInfo(int namespaceId, LightWeightHashSet<Block> blockSet) { long startTime = System.currentTimeMillis(); FSVolume[] volumes = this.getVolumes(); if (scannersExecutor != null) { synchronized (scannersExecutor) { List<Future<LightWeightHashSet<Block>>> builders = new ArrayList<Future<LightWeightHashSet<Block>>>(); for (int idx = 0; idx < volumes.length; idx++) { builders.add(scannersExecutor.submit(new BlockInfoBuilder( volumes[idx], namespaceId))); } for (Future<LightWeightHashSet<Block>> future : builders) { try { blockSet.addAll(future.get()); } catch (ExecutionException ex) { DataNode.LOG.error("Error scanning volumes ", ex.getCause()); } catch (InterruptedException iex) { DataNode.LOG.error("Error waiting for scan", iex); } } } } else { for (int idx = 0; idx < volumes.length; idx++) { try{ volumes[idx].getBlockInfo(namespaceId, blockSet); } catch (IOException e) { DataNode.LOG.error("Error scanning volumes ", e.getCause()); } } } long scanTime = (System.currentTimeMillis() - startTime)/1000; DataNode.LOG.info("Finished generating block report for " + volumes.length + " volumes in " + scanTime + " seconds"); } /** * goes 
over all the volumes and checkDir eachone of them * if one throws DiskErrorException - removes from the list of active * volumes. * @return list of all the removed volumes */ private List<FSVolume> checkDirs() { List<FSVolume> removed_vols = null; FSVolume[] fsVolumes = this.getVolumes(); for (int idx = 0; idx < fsVolumes.length; idx++) { FSVolume fsv = fsVolumes[idx]; try { fsv.checkDirs(); } catch (DiskErrorException e) { DataNode.LOG.warn("Removing failed volume " + fsv + ": ", e); if (removed_vols == null) { removed_vols = new ArrayList<FSVolume>(); removed_vols.add(fsVolumes[idx]); } } } if (removed_vols != null && removed_vols.size() > 0) { volumeList.removeVolumes(removed_vols); DataNode.LOG.info("Completed FSVolumeSet.checkDirs. Removed=" + removed_vols.size() + "volumes. List of current volumes: " + toString()); } return removed_vols; } private List<FSVolume> removeBVolumes(List<File> directories) { ArrayList<FSVolume> removed_vols = new ArrayList<FSVolume>(); if (directories != null && directories.size() > 0) { FSVolume[] fsVolumes = this.getVolumes(); for(int idx = 0; idx < fsVolumes.length; idx++) { FSVolume fsv = fsVolumes[idx]; if(directories.contains(fsv.getDir())) { removed_vols.add(fsv); } } volumeList.removeVolumes(removed_vols); DataNode.LOG.info("Completed FSVolumeSet.removeVolumes. Removed=" + removed_vols.size() + "volumes. 
List of current volumes: " + toString()); } return removed_vols; } private void addNamespace(int namespaceId, String nsDir, Configuration conf) throws IOException { FSVolume[] volumes = this.getVolumes(); for (FSVolume v : volumes) { v.addNamespace(namespaceId, nsDir, conf, supportAppends); } } private void removeNamespace(int namespaceId) { FSVolume[] volumes = this.getVolumes(); for (FSVolume v : volumes) { v.shutdownNamespace(namespaceId); } } public String toString() { StringBuffer sb = new StringBuffer(); FSVolume[] volumes = this.getVolumes(); for (int idx = 0; idx < volumes.length; idx++) { sb.append(volumes[idx].toString()); if (idx != volumes.length - 1) { sb.append(","); } } return sb.toString(); } } private static class BlockInfoBuilder implements Callable<LightWeightHashSet<Block>> { FSVolume volume; int namespaceId; public BlockInfoBuilder(FSVolume volume, int namespaceId) { this.volume = volume; this.namespaceId = namespaceId; } @Override public LightWeightHashSet<Block> call() throws Exception { LightWeightHashSet<Block> result = new LightWeightHashSet<Block>(); volume.getBlockInfo(namespaceId, result); return result; } } private static class BlocksBeingWrittenInfoBuilder implements Callable<LightWeightHashSet<Block>> { FSVolume volume; int namespaceId; public BlocksBeingWrittenInfoBuilder(FSVolume volume, int namespaceId) { this.volume = volume; this.namespaceId = namespaceId; } @Override public LightWeightHashSet<Block> call() throws Exception { LightWeightHashSet<Block> result = new LightWeightHashSet<Block>(); volume.getBlocksBeingWrittenInfo(namespaceId, result); return result; } } ////////////////////////////////////////////////////// // // FSDataSet // ////////////////////////////////////////////////////// //Find better place? 
public static final String METADATA_EXTENSION = ".meta"; public static final short FORMAT_VERSION_NON_INLINECHECKSUM = 1; public static final short FORMAT_VERSION_INLINECHECKSUM = 2; public static final String DELETE_FILE_EXT = "toDelete."; static class ActiveFile implements ReplicaToRead, ReplicaBeingWritten, Cloneable { static final long UNKNOWN_SIZE = -1; DatanodeBlockInfo datanodeBlockInfo; final List<Thread> threads = new ArrayList<Thread>(2); private volatile long bytesReceived; private volatile long bytesAcked; private volatile long bytesOnDisk; private volatile boolean finalized; private volatile BlockCrcUpdater crcUpdater; /** * Set to true if this file was recovered during datanode startup. * This may indicate that the file has been truncated (eg during * underlying filesystem journal replay) */ final boolean wasRecoveredOnStartup; ActiveFile(DatanodeBlockInfo datanodeBlockInfo, List<Thread> list, long expectedSize, boolean enable) throws IOException { this(datanodeBlockInfo, false, expectedSize, enable); if (list != null) { threads.addAll(list); } threads.add(Thread.currentThread()); } /** * Create an ActiveFile from a file on disk during DataNode startup. * This factory method is just to make it clear when the purpose * of this constructor is. 
* @throws IOException */ private ActiveFile(DatanodeBlockInfo datanodeBlockInfo, boolean recovery, long expectedSize, boolean enable) throws IOException { this.datanodeBlockInfo = datanodeBlockInfo; long sizeFromDisk; if (!isInlineChecksum()) { sizeFromDisk = getDataFile().length(); } else { GenStampAndChecksum sac = BlockInlineChecksumReader .getGenStampAndChecksumFromInlineChecksumFile(getDataFile() .getName()); sizeFromDisk = BlockInlineChecksumReader.getBlockSizeFromFileLength( getDataFile().length(), sac.checksumType, sac.bytesPerChecksum); } if (expectedSize != UNKNOWN_SIZE && sizeFromDisk != expectedSize) { throw new IOException("File " + getDataFile() + " on disk size " + sizeFromDisk + " doesn't match expected size " + expectedSize); } bytesReceived = bytesAcked = bytesOnDisk = sizeFromDisk; crcUpdater = new BlockCrcUpdater(this.getBytesPerChecksum(), enable && bytesReceived == 0); wasRecoveredOnStartup = recovery; finalized = false; } @Override public long getBytesVisible() { return bytesAcked; } public void setBytesAcked(long value) { bytesAcked = value; } @Override public long getBytesWritten() { return bytesOnDisk; } public void setBytesOnDisk(long value) { bytesOnDisk = value; } public long getBytesReceived() { return bytesReceived; } public void setBytesReceived(long length) { bytesReceived = length; } @Override public File getDataFileToRead() { return datanodeBlockInfo.getDataFileToRead(); } private File getDataFile() { return datanodeBlockInfo.getBlockDataFile().getFile(); } public String toString() { return getClass().getSimpleName() + "(file=" + getDataFile() + ", threads=" + threads + ")"; } public ActiveFile getClone() throws CloneNotSupportedException { return (ActiveFile) super.clone(); } @Override public boolean isInlineChecksum() { return datanodeBlockInfo.isInlineChecksum(); } @Override public int getChecksumType() { return datanodeBlockInfo.getChecksumType(); } @Override public int getBytesPerChecksum() { return 
datanodeBlockInfo.getBytesPerChecksum(); } @Override public InputStream getBlockInputStream(DataNode datanode, long offset) throws IOException { return datanodeBlockInfo.getBlockInputStream(datanode, offset); } @Override public boolean isFinalized() { return finalized; } protected void blockFinalize() { this.finalized = true; } @Override public int getBlockCrc() throws IOException{ throw new IOException("Block not finalized."); } @Override public void updateBlockCrc(long offset, int length, int crc) { crcUpdater.updateBlockCrc(offset, length, crc); } @Override public boolean hasBlockCrcInfo() { return false; } BlockCrcUpdater getCrcUpdater() { return crcUpdater; } @Override public BlockDataFile getBlockDataFile() throws IOException { return datanodeBlockInfo.getBlockDataFile(); } } /** * Check if a file is scheduled for deletion * name should be obtained by File.getName() */ static boolean isPendingDeleteFilename(String name) { return name.startsWith(DELETE_FILE_EXT); } public Block getStoredBlock(int namespaceId, long blkid) throws IOException { return getStoredBlock(namespaceId, blkid, false); } /** {@inheritDoc} */ public Block getStoredBlock(int namespaceId, long blkid, boolean useOnDiskLength) throws IOException { lock.readLock().lock(); try { ReplicaToRead replica = getReplicaToRead(namespaceId, new Block( blkid)); if (replica == null) { return null; } File blockfile = replica.getDataFileToRead(); if (blockfile == null) { return null; } File metafile = null; if (!replica.isInlineChecksum()) { metafile = BlockWithChecksumFileWriter.findMetaFile(blockfile, true); if (metafile == null) { return null; } } Block block = new Block(blkid); if (useOnDiskLength) { block.setNumBytes(replica.getBytesWritten()); } else { block.setNumBytes(replica.getBytesVisible()); } if (replica.isInlineChecksum()) { block.setGenerationStamp(BlockInlineChecksumReader .getGenerationStampFromInlineChecksumFile(blockfile.getName())); } else { 
block.setGenerationStamp(BlockWithChecksumFileReader .parseGenerationStampInMetaFile(blockfile, metafile)); } return block; } finally { lock.readLock().unlock(); } } FSVolumeSet volumes; private DataNode datanode; private Configuration conf; private int maxBlocksPerDir = 0; private boolean initialized = false; VolumeMap volumeMap; BlockCrcMapFlusher blockCrcMapFlusher; Thread blockCrcMapFlusherThread = null; static Random random = new Random(); FSDatasetAsyncDiskService asyncDiskService; ReentrantReadWriteLock lock = new ReentrantReadWriteLock(true); private boolean shouldHardLinkBlockCopy; private int validVolsRequired; //this constructor is used to create PersistedSimulatedFSDataset public FSDataset() { } /** * An FSDataset has a directory where it loads its data files. */ public FSDataset(DataNode datanode, Configuration conf, int numNamespaces){ this.datanode = datanode; this.conf = conf; this.maxBlocksPerDir = conf.getInt("dfs.datanode.numblocks", 64); volumeMap = new VolumeMap(numNamespaces); } void setDatasetDelta(FSDatasetDeltaInterface stateChangeCallback) { volumeMap.setDatasetDelta(stateChangeCallback); } @Override public void initialize(DataStorage storage) throws IOException{ lock.writeLock().lock(); try{ if(initialized){ return; } // The number of volumes required for operation is the total number // of volumes configured minus the number of failed volumes we can // tolerate. String[] dataDirs = DataNode.getListOfDataDirs(conf); int volsConfigured = (dataDirs == null) ? 
0 : dataDirs.length; final int volFailuresTolerated = conf.getInt("dfs.datanode.failed.volumes.tolerated", volsConfigured-1); this.validVolsRequired = volsConfigured - volFailuresTolerated; if (validVolsRequired < 1 || validVolsRequired > storage.getNumStorageDirs()) { throw new DiskErrorException("Too many failed volumes - " + "current valid volumes: " + storage.getNumStorageDirs() + ", volumes configured: " + volsConfigured + ", volume failures tolerated: " + volFailuresTolerated ); } File[] roots = new File[storage.getNumStorageDirs()]; for (int idx = 0; idx < storage.getNumStorageDirs(); idx++) { roots[idx] = storage.getStorageDir(idx).getCurrentDir(); } asyncDiskService = new FSDatasetAsyncDiskService(roots, conf); FSVolume[] volArray = new FSVolume[storage.getNumStorageDirs()]; for (int idx = 0; idx < storage.getNumStorageDirs(); idx++) { volArray[idx] = new FSVolume(this, storage.getStorageDir(idx).getCurrentDir(), conf); DataNode.LOG.info("FSDataset added volume - " + storage.getStorageDir(idx).getCurrentDir()); } int threads = conf.getInt("dfs.datanode.blockscanner.threads", 1); volumes = new FSVolumeSet(volArray, threads, datanode.isSupportAppends()); registerMBean(storage.getStorageID()); blockCrcMapFlusher = new BlockCrcMapFlusher(datanode, volumeMap, volumes, conf.getLong("dfs.block.crc.flush.interval", 600000)); blockCrcMapFlusherThread = new Thread(blockCrcMapFlusher, "Block Crc Flusher"); blockCrcMapFlusherThread.start(); initialized = true; } finally { lock.writeLock().unlock(); } shouldHardLinkBlockCopy = conf.getBoolean("dfs.datanode.blkcopy.hardlink", true); } private class VolumeThread extends Thread { private Configuration conf; private FSVolume volume; private boolean hasError = false; private Map<Integer, String> namespaceIdDir; private boolean supportAppends; private VolumeThread(FSVolume volume, Configuration conf, Map<Integer, String> namespaceIdDir, boolean supportAppends) { this.namespaceIdDir = namespaceIdDir; this.volume = volume; 
this.conf = conf; this.supportAppends = supportAppends; } public void run() { DataNode.LOG.info("Start building volume: " + volume); try { for (Integer namespaceId : namespaceIdDir.keySet()) { volume.addNamespace(namespaceId, namespaceIdDir.get(namespaceId), conf, supportAppends); } } catch (IOException ioe) { DataNode.LOG.error("Error building volume : " + volume, ioe); hasError = true; } DataNode.LOG.info("Finish building volume for " + volume); } } private void createVolumes(FSVolumeSet volumes, DataStorage storage, Configuration conf, VolumeMap volumeMap, Map<Integer, String> namespaceIdDir) throws IOException { FSVolume[] myVolumes = volumes.getVolumes(); ArrayList<VolumeThread> scanners = new ArrayList<VolumeThread>( myVolumes.length); for(FSVolume volume : myVolumes){ scanners.add(new VolumeThread(volume, conf, namespaceIdDir, volumes.supportAppends)); } for(VolumeThread vt : scanners){ vt.start(); } boolean hasError = false; for (VolumeThread vt : scanners) { try { vt.join(); } catch (InterruptedException e) { throw (InterruptedIOException)new InterruptedIOException().initCause(e); } if (!hasError && vt.hasError) { hasError = true; } } if (hasError) { throw new IOException("Error creating volumes"); } } /** * Return the total space used by dfs datanode */ public long getDfsUsed() throws IOException { return volumes.getDfsUsed(); } /** * Return the total space used by one namespace in dfs datanode */ public long getNSUsed(int namespaceId) throws IOException { return volumes.getNSUsed(namespaceId); } /** * Return true - if there are still valid volumes * on the DataNode */ public boolean hasEnoughResource(){ return volumes.numberOfVolumes() >= this.validVolsRequired; } /** * Return total capacity, used and unused */ public long getCapacity() throws IOException { return volumes.getCapacity(); } /** * Return how many bytes can still be stored in the FSDataset */ public long getRemaining() throws IOException { return volumes.getRemaining(); } /** * Find the 
block's on-disk length */
  public long getFinalizedBlockLength(int namespaceId, Block b) throws IOException {
    final DatanodeBlockInfo blockInfo = volumeMap.get(namespaceId, b);
    if (blockInfo != null) {
      return blockInfo.getFinalizedSize();
    }
    throw new IOException("Can't find block " + b + " in volumeMap");
  }

  /**
   * Returns the bytes-written count of the readable replica of the block.
   *
   * @throws IOException if no replica is found for the block
   */
  @Override
  public long getOnDiskLength(int namespaceId, Block b) throws IOException {
    final ReplicaToRead replica = this.getReplicaToRead(namespaceId, b);
    if (replica != null) {
      return replica.getBytesWritten();
    }
    throw new IOException("Can't find block " + b + " in volumeMap");
  }

  /**
   * Looks up, under the read lock, the ongoing-create entry of the block.
   */
  @Override
  public ReplicaBeingWritten getReplicaBeingWritten(
      int namespaceId, Block b) throws IOException {
    lock.readLock().lock();
    try {
      ReplicaBeingWritten ongoing = volumeMap.getOngoingCreates(namespaceId, b);
      return ongoing;
    } finally {
      lock.readLock().unlock();
    }
  }

  /**
   * Get the validated data file for a given block.
   *
   * @throws IOException if validation yields no file for the block
   */
  public File getBlockFile(int namespaceId, Block b) throws IOException {
    final File blockFile = validateBlockFile(namespaceId, b);
    if (blockFile != null) {
      return blockFile;
    }
    if (InterDatanodeProtocol.LOG.isDebugEnabled()) {
      InterDatanodeProtocol.LOG
          .debug("b=" + b + ", volumeMap=" + volumeMap);
    }
    throw new IOException("Block " + b + ", namespace= " + namespaceId
        + " is not valid.");
  }

  /**
   * Make a copy of the block if this block is linked to an existing
   * snapshot. This ensures that modifying this block does not modify
   * data in any existing snapshots.
* @param block Block * @param numLinks Detach if the number of links exceed this value * @throws IOException * @return - true if the specified block was detached */ public boolean detachBlock(int namespaceId, Block block, int numLinks) throws IOException { DatanodeBlockInfo info = null; lock.readLock().lock(); try { info = volumeMap.get(namespaceId, block); } finally { lock.readLock().unlock(); } return info.detachBlock(namespaceId, block, numLinks); } /** {@inheritDoc} */ public void updateBlock(int namespaceId, Block oldblock, Block newblock) throws IOException { if (oldblock.getBlockId() != newblock.getBlockId()) { throw new IOException("Cannot update oldblock (=" + oldblock + ") to newblock (=" + newblock + ")."); } // Protect against a straggler updateblock call moving a block backwards // in time. boolean isValidUpdate = (newblock.getGenerationStamp() > oldblock.getGenerationStamp()) || (newblock.getGenerationStamp() == oldblock.getGenerationStamp() && newblock.getNumBytes() == oldblock.getNumBytes()); if (!isValidUpdate) { throw new IOException( "Cannot update oldblock=" + oldblock + " to newblock=" + newblock + " since generation stamps must " + "increase, or else length must not change."); } for(;;) { final List<Thread> threads = tryUpdateBlock(namespaceId, oldblock, newblock); if (threads == null) { DataNode.LOG.info("Updated Block: namespaceid: " + namespaceId + " oldBlock: " + oldblock + " newBlock: " + newblock); return; } DataNode.LOG.info("Waiting other threads to update block: namespaceid: " + namespaceId + " oldBlock: " + oldblock + " newBlock: " + newblock); interruptAndJoinThreads(threads); } } /** * Try to interrupt all of the given threads, and join on them. * If interrupted, returns false, indicating some threads may * still be running. 
*/ private boolean interruptAndJoinThreads(List<Thread> threads) { // interrupt and wait for all ongoing create threads for(Thread t : threads) { t.interrupt(); } for(Thread t : threads) { try { t.join(); } catch (InterruptedException e) { DataNode.LOG.warn("interruptOngoingCreates: t=" + t, e); return false; } } return true; } /** * Return a list of active writer threads for the given block. * @return null if there are no such threads or the file is * not being created */ private ArrayList<Thread> getActiveThreads(int namespaceId, Block block) { lock.writeLock().lock(); try { //check ongoing create threads final ActiveFile activefile = volumeMap.getOngoingCreates(namespaceId, block); if (activefile != null && !activefile.threads.isEmpty()) { //remove dead threads for(Iterator<Thread> i = activefile.threads.iterator(); i.hasNext(); ) { final Thread t = i.next(); if (!t.isAlive()) { i.remove(); } } //return living threads if (!activefile.threads.isEmpty()) { return new ArrayList<Thread>(activefile.threads); } } } finally { lock.writeLock().unlock(); } return null; } private void setDataFileForBlock(int namespaceId, Block block, File newDataFile) { DatanodeBlockInfo info = volumeMap.get(namespaceId, block); if (info != null) { info.getBlockDataFile().setFile(newDataFile); } } /** * Try to update an old block to a new block. * If there are ongoing create threads running for the old block, * the threads will be returned without updating the block. * * @return ongoing create threads if there is any. Otherwise, return null. 
* @throws IOException if the old block is not in the volume map, the new
   *         generation stamp or length fails validation, or a rename fails
   */
  private List<Thread> tryUpdateBlock(int namespaceId,
      Block oldblock, Block newblock) throws IOException {
    // The whole check-and-update sequence runs under the write lock.
    lock.writeLock().lock();
    try {
      //check ongoing create threads
      ArrayList<Thread> activeThreads = getActiveThreads(namespaceId, oldblock);
      if (activeThreads != null) {
        // Writers are still alive: hand them back without touching the block.
        return activeThreads;
      }

      DatanodeBlockInfo binfo = volumeMap.get(namespaceId, oldblock);
      if (binfo == null) {
        throw new IOException("Block " + oldblock
            + " doesn't exist or has been recovered to a new generation ");
      }

      File blockFile = binfo.getBlockDataFile().getFile();

      // Read the current generation stamp: from the data file name for
      // inline-checksum blocks, from the separate meta file otherwise.
      long oldgs;
      File oldMetaFile = null;
      if (binfo.isInlineChecksum()) {
        oldgs = BlockInlineChecksumReader
            .getGenerationStampFromInlineChecksumFile(blockFile.getName());
      } else {
        oldMetaFile = BlockWithChecksumFileWriter.findMetaFile(blockFile);
        oldgs = BlockWithChecksumFileReader.parseGenerationStampInMetaFile(
            blockFile, oldMetaFile);
      }

      // First validate the update

      //update generation stamp
      if (oldgs > newblock.getGenerationStamp()) {
        throw new IOException("Cannot update block (id=" + newblock.getBlockId()
            + ") generation stamp from " + oldgs
            + " to " + newblock.getGenerationStamp());
      }

      //update length
      if (newblock.getNumBytes() > oldblock.getNumBytes()) {
        throw new IOException("Cannot update block file (=" + blockFile
            + ") length from " + oldblock.getNumBytes() + " to " + newblock.getNumBytes());
      }

      // Although we've waited for the active threads all dead before updating
      // the map so there should be no data race there, we still create new
      // ActiveFile object to make sure in case another thread holds it,
      // it won't cause any problem for us.
      //
      try {
        volumeMap.copyOngoingCreates(namespaceId, oldblock);
      } catch (CloneNotSupportedException e) {
        // It should never happen.
        throw new IOException("Cannot clone ActiveFile object", e);
      }

      // Now perform the update

      File tmpMetaFile = null;
      if (!binfo.isInlineChecksum()) {
        // rename meta file to a tmp file
        tmpMetaFile = new File(oldMetaFile.getParent(),
            oldMetaFile.getName() + "_tmp" + newblock.getGenerationStamp());
        if (!oldMetaFile.renameTo(tmpMetaFile)) {
          throw new IOException("Cannot rename block meta file to " + tmpMetaFile);
        }
      }

      // Current block length: the raw file length, or for inline-checksum
      // files the length computed from the file length and checksum parameters.
      long oldBlockLength;
      if (!binfo.isInlineChecksum()) {
        oldBlockLength = blockFile.length();
      } else {
        oldBlockLength = BlockInlineChecksumReader.getBlockSizeFromFileLength(
            blockFile.length(), binfo.getChecksumType(),
            binfo.getBytesPerChecksum());
      }
      ActiveFile file = null;
      // Truncate only when the new block is shorter than what is on disk.
      if (newblock.getNumBytes() < oldBlockLength) {
        if (!binfo.isInlineChecksum()) {
          new BlockWithChecksumFileWriter(binfo.getBlockDataFile(), tmpMetaFile)
              .truncateBlock(oldBlockLength, newblock.getNumBytes());
        } else {
          new BlockInlineChecksumWriter(binfo.getBlockDataFile(),
              binfo.getChecksumType(), binfo.getBytesPerChecksum(),
              datanode.writePacketSize)
              .truncateBlock(newblock.getNumBytes());
        }
        // Bring the in-memory counters in line with the truncated length.
        file = volumeMap.getOngoingCreates(namespaceId, oldblock);
        if (file != null) {
          file.setBytesAcked(newblock.getNumBytes());
          file.setBytesOnDisk(newblock.getNumBytes());
          file.setBytesReceived(newblock.getNumBytes());
        } else {
          // This should never happen unless called from unit tests.
          binfo.syncInMemorySize();
        }
      }

      String newDataFileName;
      if (!binfo.isInlineChecksum()) {
        //rename the tmp file to the new meta file (with new generation stamp)
        File newMetaFile = BlockWithChecksumFileWriter.getMetaFile(blockFile, newblock);
        if (!tmpMetaFile.renameTo(newMetaFile)) {
          throw new IOException("Cannot rename tmp meta file to " + newMetaFile);
        }
      } else {
        // Inline-checksum blocks have no meta file; the data file itself is
        // renamed to the name derived from the new block.
        newDataFileName = BlockInlineChecksumWriter.getInlineChecksumFileName(
            newblock, binfo.getChecksumType(), binfo.getBytesPerChecksum());
        File newDataFile = new File(blockFile.getParent(), newDataFileName);
        if (!blockFile.renameTo(newDataFile)) {
          throw new IOException("Cannot rename data file to " + newDataFileName);
        }
        // fsyncIfPossible parent directory to persist rename.
        if (datanode.syncOnClose) {
          NativeIO.fsyncIfPossible(newDataFile.getParent());
        }
        setDataFileForBlock(namespaceId, oldblock, newDataFile);
      }

      // Re-key any ongoing-create entry from the old block to the new one.
      if(volumeMap.getOngoingCreates(namespaceId, oldblock) != null){
        ActiveFile af = volumeMap.removeOngoingCreates(namespaceId, oldblock);
        volumeMap.addOngoingCreates(namespaceId, newblock, af);
      }
      volumeMap.update(namespaceId, oldblock, newblock);

      // paranoia! verify that the contents of the stored block
      // matches the block file on disk.
      validateBlockMetadata(namespaceId, newblock);
      return null;
    } finally {
      lock.writeLock().unlock();
    }
  }

  // Message prefix that getCauseIfDiskError() looks for.
  private final static String DISK_ERROR = "Possible disk error on file creation: ";

  /** Get the cause of an I/O exception if caused by a possible disk error
   * @param ioe an I/O exception
   * @return cause if the I/O exception is caused by a possible disk error;
   *         null otherwise.
*/
  static IOException getCauseIfDiskError(IOException ioe) {
    // A disk error is recognised purely by the DISK_ERROR message prefix.
    // NOTE(review): the cast assumes the cause is itself an IOException - confirm
    // against the code that creates DISK_ERROR exceptions.
    if (ioe.getMessage()!=null && ioe.getMessage().startsWith(DISK_ERROR)) {
      return (IOException)ioe.getCause();
    } else {
      return null;
    }
  }

  /**
   * Start writing to a block file
   * If isRecovery is true and the block pre-exists, then we kill all
   * other threads that might be writing to this block, and then reopen the file.
   * If replicationRequest is true, then this operation is part of a block
   * replication request.
   */
  public DatanodeBlockWriter writeToBlock(int namespaceId, Block b, Block newBlock,
      boolean isRecovery, boolean replicationRequest, int checksumType, int bytesPerChecksum)
      throws IOException {
    //
    // Make sure the block isn't a valid one - we're still creating it!
    //
    if (isValidBlock(namespaceId, b, false)) {
      if (!isRecovery) {
        throw new BlockAlreadyExistsException("Block " + b + " is valid, and cannot be written to.");
      }
      // If the block was successfully finalized because all packets
      // were successfully processed at the Datanode but the ack for
      // some of the packets were not received by the client. The client
      // re-opens the connection and retries sending those packets.
      // The other reason is that an "append" is occurring to this block.
      detachBlock(namespaceId, b, 1);
    }
    long blockSize = b.getNumBytes();

    //
    // Serialize access to /tmp, and check if file already there.
    //
    File f = null;
    List<Thread> threads = null;
    long expectedFileSize = ActiveFile.UNKNOWN_SIZE;
    boolean inlineChecksum = datanode.useInlineChecksum;
    DatanodeBlockInfo binfo;
    FSVolume v = null;
    // The block written to is newBlock when one is given and it is a
    // different object than b; otherwise it is b itself.
    Block targetBlock = b;
    if (newBlock != null && newBlock != b) {
      targetBlock = newBlock;
    }

    lock.writeLock().lock();
    try {
      //
      // Is it already in the create process?
// ActiveFile activeFile = volumeMap.getOngoingCreates(namespaceId, b); if (activeFile != null) { f = activeFile.getDataFile(); threads = activeFile.threads; expectedFileSize = activeFile.getBytesWritten(); inlineChecksum = activeFile.isInlineChecksum(); if (!isRecovery) { throw new BlockAlreadyExistsException("Block " + b + " has already been started (though not completed), and thus cannot be created."); } else { for (Thread thread:threads) { thread.interrupt(); } } volumeMap.removeOngoingCreates(namespaceId, b); } if (!isRecovery) { if (newBlock != null && b != newBlock) { throw new IOException("newBlock is not allowed except append case. "); } v = volumes.getNextVolume(blockSize); // create temporary file to hold block in the designated volume f = createTmpFile(namespaceId, v, b, replicationRequest, inlineChecksum, checksumType, bytesPerChecksum); } else if (f != null) { DataNode.LOG.info("Reopen already-open Block for append " + b); if (newBlock != null && b != newBlock) { throw new IOException("newBlock is not allowed except append case. "); } // create or reuse temporary file to hold block in the designated volume DatanodeBlockInfo oldBinfo = volumeMap.get(namespaceId, b); inlineChecksum = oldBinfo.isInlineChecksum(); v = oldBinfo.getBlockDataFile().getVolume(); volumeMap.add(namespaceId, b, new DatanodeBlockInfo(v, f, DatanodeBlockInfo.UNFINALIZED, true, inlineChecksum, checksumType, bytesPerChecksum, false, 0)); } else { // reopening block for appending to it. DataNode.LOG.info("Reopen Block for append " + b); if (newBlock == null) { throw new IOException( "newBlock is required for append af file to write. 
"); } DatanodeBlockInfo oldBinfo = volumeMap.get(namespaceId, b); inlineChecksum = oldBinfo.isInlineChecksum(); v = oldBinfo.getBlockDataFile().getVolume(); f = createTmpFile(namespaceId, v, newBlock, replicationRequest, inlineChecksum, checksumType, bytesPerChecksum); File blkfile = getBlockFile(namespaceId, b); if (!inlineChecksum) { File oldmeta = BlockWithChecksumFileReader.getMetaFile(this, namespaceId, b); File newmeta = BlockWithChecksumFileWriter.getMetaFile(f, newBlock); // rename meta file to tmp directory DataNode.LOG.debug("Renaming " + oldmeta + " to " + newmeta); if (!oldmeta.renameTo(newmeta)) { throw new IOException("Block " + b + " reopen failed. " + " Unable to move meta file " + oldmeta + " to tmp dir " + newmeta); } } // rename block file to tmp directory DataNode.LOG.debug("Renaming " + blkfile + " to " + f); if (!blkfile.renameTo(f)) { if (!f.delete()) { throw new IOException("Block " + b + " reopen failed. " + " Unable to remove file " + f); } if (!blkfile.renameTo(f)) { throw new IOException("Block " + b + " reopen failed. " + " Unable to move block file " + blkfile + " to tmp dir " + f); } } // fsyncIfPossible parent directory to persist rename. if (datanode.syncOnClose) { NativeIO.fsyncIfPossible(blkfile.getParent()); } } if (f == null) { DataNode.LOG.warn("Block " + b + " reopen failed " + " Unable to locate tmp file."); throw new IOException("Block " + b + " reopen failed " + " Unable to locate tmp file."); } // If this is a replication request, then this is not a permanent // block yet, it could get removed if the datanode restarts. If this // is a write or append request, then it is a valid block. 
if (replicationRequest) { binfo = new DatanodeBlockInfo(v, f, DatanodeBlockInfo.UNFINALIZED, false, inlineChecksum, checksumType, bytesPerChecksum, false, 0); } else { binfo = new DatanodeBlockInfo(v, f, DatanodeBlockInfo.UNFINALIZED, true, inlineChecksum, checksumType, bytesPerChecksum, false, 0); } if (newBlock != null && newBlock != b) { volumeMap.remove(namespaceId, b); } volumeMap.add(namespaceId, targetBlock, binfo); volumeMap.addOngoingCreates(namespaceId, targetBlock, new ActiveFile(binfo, threads, expectedFileSize, datanode.updateBlockCrcWhenWrite)); } finally { lock.writeLock().unlock(); } try { if (threads != null) { for (Thread thread:threads) { thread.join(); } } } catch (InterruptedException e) { throw new IOException("Recovery waiting for thread interrupted."); } // // Finally, allow a writer to the block file // REMIND - mjc - make this a filter stream that enforces a max // block size, so clients can't go crazy // if (DataNode.LOG.isDebugEnabled()) { DataNode.LOG.debug("writeTo blockfile is " + f + " of size " + f.length()); } if (inlineChecksum) { return new BlockInlineChecksumWriter(binfo.getBlockDataFile(), checksumType, bytesPerChecksum, datanode.writePacketSize); } else { File metafile = BlockWithChecksumFileWriter.getMetaFile(f, targetBlock); if (DataNode.LOG.isDebugEnabled()) { DataNode.LOG.debug("writeTo metafile is " + metafile + " of size " + metafile.length()); } return new BlockWithChecksumFileWriter(binfo.getBlockDataFile(), metafile); } } File createTmpFile(int namespaceId, FSVolume vol, Block blk, boolean replicationRequest, boolean inlineChecksum, int checksumType, int bytePerChecksum) throws IOException { lock.writeLock().lock(); try { if ( vol == null ) { vol = volumeMap.get(namespaceId, blk).getBlockDataFile().getVolume(); if ( vol == null ) { throw new IOException("Could not find volume for block " + blk); } } return vol.createTmpFile(namespaceId, blk, replicationRequest, inlineChecksum, checksumType, bytePerChecksum); } finally 
{ lock.writeLock().unlock(); } } // // REMIND - mjc - eventually we should have a timeout system // in place to clean up block files left by abandoned clients. // We should have some timer in place, so that if a blockfile // is created but non-valid, and has been idle for >48 hours, // we can GC it safely. // /** * Complete the block write! */ @Override // FSDatasetInterface public void finalizeBlock(int namespaceId, Block b) throws IOException { finalizeBlockInternal(namespaceId, b, true); } @Override public void finalizeBlockIfNeeded(int namespaceId, Block b) throws IOException { finalizeBlockInternal(namespaceId, b, true); } /** * Complete the block write! */ public void finalizeBlockInternal(int namespaceId, Block b, boolean reFinalizeOk) throws IOException { lock.writeLock().lock(); DatanodeBlockInfo binfo = volumeMap.get(namespaceId, b); try { ActiveFile activeFile = volumeMap.getOngoingCreates(namespaceId, b); if (activeFile == null) { if (reFinalizeOk) { return; } else { throw new IOException("Block " + b + " is already finalized."); } } File f = activeFile.getDataFile(); if (f == null || !f.exists()) { throw new IOException("No temporary file " + f + " for block " + b); } FSVolume v = binfo.getBlockDataFile().getVolume(); if (v == null) { throw new IOException("No volume for temporary file " + f + " for block " + b); } File dest = null; dest = v.addBlock(namespaceId, b, f, activeFile.isInlineChecksum(), binfo.getChecksumType(), binfo.getBytesPerChecksum()); volumeMap.add( namespaceId, b, new DatanodeBlockInfo(v, dest, activeFile.getBytesWritten(), true, activeFile.isInlineChecksum(), binfo.getChecksumType(), binfo.getBytesPerChecksum(), activeFile.getCrcUpdater().isCrcValid(activeFile.getBytesWritten()), activeFile.getCrcUpdater().getBlockCrc())); ActiveFile af = volumeMap.removeOngoingCreates(namespaceId, b); af.blockFinalize(); } finally { lock.writeLock().unlock(); } } private boolean isBlockFinalizedInternal(int namespaceId, Block b, boolean validate) 
{ DatanodeBlockInfo blockInfo = volumeMap.get(namespaceId, b); // We skip the check for validate case to avoid redundant codes // but keep old codes' behavior. Though it looks like a bug, but we // would fix it in a separate patch. // if (!validate && blockInfo == null) { return false; // block is not finalized } FSVolume v = blockInfo.getBlockDataFile().getVolume(); if (v == null) { DataNode.LOG.warn("No volume for block " + b); return false; // block is not finalized } ActiveFile activeFile = volumeMap.getOngoingCreates(namespaceId, b); if (activeFile != null) { if (validate) { File f = activeFile.getDataFile(); if (f == null || !f.exists()) { // we should never get into this position. DataNode.LOG.warn("No temporary file " + f + " for block " + b); } } return false; // block is not finalized } return true; // block is finalized } /** * is this block finalized? Returns true if the block is already * finalized, otherwise returns false. */ public boolean isBlockFinalized(int namespaceId, Block b) { return isBlockFinalizedInternal(namespaceId, b, false); } /** * is this block finalized? Returns true if the block is already * finalized, otherwise returns false. 
*/ private boolean isBlockFinalizedWithLock(int namespaceId, Block b) { lock.readLock().lock(); try { return isBlockFinalizedInternal(namespaceId, b, true); } finally { lock.readLock().unlock(); } } /** * Remove the temporary block file (if any) */ public void unfinalizeBlock(int namespaceId, Block b) throws IOException { lock.writeLock().lock(); try { // remove the block from in-memory data structure ActiveFile activefile = volumeMap.removeOngoingCreates(namespaceId, b); if (activefile == null) { return; } volumeMap.remove(namespaceId, b); // delete the on-disk temp file File metaFile = null; if (!activefile.isInlineChecksum()) { metaFile = BlockWithChecksumFileWriter.getMetaFile( activefile.getDataFileToRead(), b); } if (delBlockFromDisk(activefile.getDataFileToRead(), metaFile, b)) { DataNode.LOG.warn("Block " + b + " unfinalized and removed. " ); } } finally { lock.writeLock().unlock(); } } /** * Remove a block from disk * @param blockFile block file * @param metaFile block meta file * @param b a block * @return true if on-disk files are deleted; false otherwise */ private boolean delBlockFromDisk(File blockFile, File metaFile, Block b) { if (blockFile == null) { DataNode.LOG.warn("No file exists for block: " + b); return true; } if (!blockFile.delete()) { DataNode.LOG.warn("Not able to delete the block file: " + blockFile); return false; } else { // remove the meta file if (metaFile != null && !metaFile.delete()) { DataNode.LOG.warn( "Not able to delete the meta block file: " + metaFile); return false; } } return true; } /** * Return a table of blocks being written data * @throws IOException */ public Block[] getBlocksBeingWrittenReport(int namespaceId) throws IOException { LightWeightHashSet<Block> blockSet = new LightWeightHashSet<Block>(); volumes.getBlocksBeingWrittenInfo(namespaceId, blockSet); Block blockTable[] = new Block[blockSet.size()]; int i = 0; for (Iterator<Block> it = blockSet.iterator(); it.hasNext(); i++) { blockTable[i] = it.next(); } return 
blockTable; } /** * Get the list of finalized blocks from in-memory blockmap for a block pool. */ public Block[] getBlockReport(int namespaceId) throws IOException { ArrayList<Block> ret = new ArrayList<Block>(); org.apache.hadoop.hdfs.server.datanode.NamespaceMap nm = volumeMap .getNamespaceMap(namespaceId); if (nm == null) { return new Block[0]; } int n = nm.getNumBucket(); for (int i = 0; i < n; i++) { BlockBucket bb = nm.getBucket(i); bb.getBlockReport(ret); } return ret.toArray(new Block[ret.size()]); } /** * Check whether the given block is a valid one. */ public boolean isValidBlock(int namespaceId, Block b, boolean checkSize) throws IOException { File f = null; ; try { f = getValidateBlockFile(namespaceId, b, checkSize); } catch (IOException e) { DataNode.LOG.warn("Block " + b + " is not valid:", e); } return ((f != null) ? isBlockFinalizedWithLock(namespaceId, b) : false); } public boolean isValidVolume(File currentDir) throws IOException { return volumes.isValidDir(currentDir); } /** * Find the file corresponding to the block and return it if it exists. */ File validateBlockFile(int namespaceId, Block b) throws IOException { return getValidateBlockFile(namespaceId, b, false); } /** * Find the file corresponding to the block and return it if it exists. */ File getValidateBlockFile(int namespaceId, Block b, boolean checkSize) throws IOException { //Should we check for metadata file too? DatanodeBlockInfo blockInfo = this.getDatanodeBlockInfo(namespaceId, b); File f = null; if (blockInfo != null) { if (checkSize) { blockInfo.verifyFinalizedSize(); } f = blockInfo.getBlockDataFile().getFile(); assert f != null; if(f.exists()) { return f; } // if file is not null, but doesn't exist - possibly disk failed datanode.checkDiskError(); } if (InterDatanodeProtocol.LOG.isDebugEnabled()) { InterDatanodeProtocol.LOG.debug("b=" + b + ", f=" + ((f == null) ? 
"null" : f)); } return null; } /** {@inheritDoc} */ public void validateBlockMetadata(int namespaceId, Block b) throws IOException { DatanodeBlockInfo info; lock.readLock().lock(); try { info = volumeMap.get(namespaceId, b); } finally { lock.readLock().unlock(); } if (info == null) { throw new IOException("Block " + b + " does not exist in volumeMap."); } File f = info.getDataFileToRead(); // Try to find out block size long localBlockSize; if (f == null) { f = info.getBlockDataFile().getTmpFile(namespaceId, b); if (f == null) { throw new IOException("Block " + b + " does not exist on disk."); } if (!f.exists()) { throw new IOException("Block " + b + " block file " + f + " does not exist on disk."); } if (info.isInlineChecksum()) { // TODO: do we want to do it? localBlockSize = BlockInlineChecksumReader.getBlockSizeFromFileLength( f.length(), info.getChecksumType(), info.getBytesPerChecksum()); } else { localBlockSize = f.length(); } } else { if (info.isFinalized()) { info.verifyFinalizedSize(); localBlockSize = info.getFinalizedSize(); } else { if (info.isInlineChecksum()) { // TODO: do we want to do it? 
localBlockSize = BlockInlineChecksumReader .getBlockSizeFromFileLength(f.length(), info.getChecksumType(), info.getBytesPerChecksum()); } else { localBlockSize = f.length(); } } } if (b.getNumBytes() > localBlockSize) { throw new IOException("Block " + b + " length is " + b.getNumBytes() + " does not match block file length " + f.length()); } long stamp; DataChecksum dcs; if (!info.isInlineChecksum()) { File meta = BlockWithChecksumFileWriter.getMetaFile(f, b); if (meta == null) { throw new IOException("Block " + b + " metafile does not exist."); } if (!meta.exists()) { throw new IOException("Block " + b + " metafile " + meta + " does not exist on disk."); } long metaFileSize = meta.length(); if (metaFileSize == 0 && localBlockSize > 0) { throw new IOException("Block " + b + " metafile " + meta + " is empty."); } stamp = BlockWithChecksumFileReader.parseGenerationStampInMetaFile(f, meta); if (metaFileSize == 0) { // no need to check metadata size for 0 size file return; } dcs = BlockMetadataHeader.readHeader(meta).getChecksum(); // verify that checksum file has an integral number of checkum values. 
int checksumsize = dcs.getChecksumSize(); long actual = metaFileSize - BlockMetadataHeader.getHeaderSize(); long numChunksInMeta = actual/checksumsize; if (actual % checksumsize != 0) { throw new IOException("Block " + b + " has a checksum file of size " + metaFileSize + " but it does not align with checksum size of " + checksumsize); } int bpc = dcs.getBytesPerChecksum(); long minDataSize = (numChunksInMeta - 1) * bpc; long maxDataSize = numChunksInMeta * bpc; if (localBlockSize > maxDataSize || localBlockSize <= minDataSize) { throw new IOException("Block " + b + " is of size " + f.length() + " but has " + (numChunksInMeta + 1) + " checksums and each checksum size is " + checksumsize + " bytes."); } } else { stamp = BlockInlineChecksumReader .getGenerationStampFromInlineChecksumFile(f.getName()); if (localBlockSize == 0) { // no need to check metadata size for 0 size file return; } // TODO: What verification we can do here? } if (stamp != b.getGenerationStamp()) { throw new IOException("Block " + b + " genstamp is " + b.getGenerationStamp() + " does not match meta file stamp " + stamp); } // We could crc-check the entire block here, but it will be a costly // operation. Instead we rely on the above check (file length mismatch) // to detect corrupt blocks. } /** * We're informed that a block is no longer valid. We * could lazily garbage-collect the block, but why bother? * just get rid of it. */ public void invalidate(int namespaceId, Block invalidBlks[]) throws IOException { boolean error = false; for (int i = 0; i < invalidBlks.length; i++) { File f = null; FSVolume v; boolean inlineChecksum; DatanodeBlockInfo dinfo = null; lock.writeLock().lock(); try { dinfo = volumeMap.get(namespaceId, invalidBlks[i]); if (dinfo == null) { // It is possible that after block reports, Datanodes receive // duplicate invalidate requests from name-node. We just skip // the block. In the end of the function, we don't throw an exception, // since no need for a disk check. 
// DataNode.LOG.info("Unexpected error trying to delete block " + invalidBlks[i] + ". BlockInfo not found in volumeMap."); continue; } inlineChecksum = dinfo.isInlineChecksum(); f = dinfo.getDataFileToRead(); v = dinfo.getBlockDataFile().getVolume(); if (f == null) { DataNode.LOG.warn("Unexpected error trying to delete block " + invalidBlks[i] + ". Block not found in blockMap." + ((v == null) ? " " : " Block found in volumeMap.")); error = true; continue; } if (v == null) { DataNode.LOG.warn("Unexpected error trying to delete block " + invalidBlks[i] + ". No volume for this block." + " Block found in blockMap. " + f + "."); error = true; continue; } File parent = f.getParentFile(); if (parent == null) { DataNode.LOG.warn("Unexpected error trying to delete block " + invalidBlks[i] + ". Parent not found for file " + f + "."); error = true; continue; } //TODO ??? v.clearPath(namespaceId, parent); volumeMap.remove(namespaceId, invalidBlks[i]); } finally { lock.writeLock().unlock(); } // close the File Channel dinfo.getBlockDataFile().closeFileChannel(); //rename the files to be deleted //for safety we add prefix instead of suffix, //so the valid block files still start with "blk_" File blockFileRenamed = new File(f.getParent() + File.separator + DELETE_FILE_EXT + f.getName()); File metaFile = null; File metaFileRenamed = null; if (!inlineChecksum) { metaFile = BlockWithChecksumFileWriter.getMetaFile( f, invalidBlks[i]); metaFileRenamed = new File(metaFile.getParent() + File.separator + DELETE_FILE_EXT + metaFile.getName()); } if((!f.renameTo(blockFileRenamed)) || (!inlineChecksum && !metaFile.renameTo(metaFileRenamed))) { DataNode.LOG.warn("Unexpected error trying to delete block " + invalidBlks[i] + ". 
Cannot rename files for deletion."); error = true; continue; } if(invalidBlks[i].getNumBytes() != BlockFlags.NO_ACK){ datanode.notifyNamenodeDeletedBlock(namespaceId, invalidBlks[i]); } // Delete the block asynchronously to make sure we can do it fast enough asyncDiskService.deleteAsync(v, blockFileRenamed, metaFileRenamed, invalidBlks[i].toString(), namespaceId); } if (error) { throw new IOException("Error in deleting blocks."); } } /** * Turn the block identifier into a filename. */ public File getFile(int namespaceId, Block b) { lock.readLock().lock(); try { DatanodeBlockInfo info = volumeMap.get(namespaceId, b); if (info != null) { return info.getDataFileToRead(); } return null; } finally { lock.readLock().unlock(); } } @Override public DatanodeBlockInfo getDatanodeBlockInfo(int namespaceId, Block b) { return volumeMap.get(namespaceId, b); } @Override public ReplicaToRead getReplicaToRead(int namespaceId, Block block) { lock.readLock().lock(); try { ActiveFile activefile = volumeMap.getOngoingCreates(namespaceId, block); if (activefile != null) { return activefile; } DatanodeBlockInfo info = volumeMap.get(namespaceId, block); if (info == null) { if (DataNode.LOG.isDebugEnabled()) { DataNode.LOG.debug("volumeMap=" + volumeMap); } } return info; } finally { lock.readLock().unlock(); } } /** * check if a data directory is healthy * if some volumes failed - make sure to remove all the blocks that belong * to these volumes * @throws DiskErrorException */ public void checkDataDir() throws DiskErrorException { long total_blocks=0, removed_blocks=0; List<FSVolume> failed_vols = null; failed_vols = volumes.checkDirs(); //if there no failed volumes return if(failed_vols == null) return; // else // remove related blocks long mlsec = System.currentTimeMillis(); lock.writeLock().lock(); try { volumeMap.removeUnhealthyVolumes(failed_vols); } finally { lock.writeLock().unlock(); } mlsec = System.currentTimeMillis() - mlsec; DataNode.LOG.warn(">>>>>>>>>>>>Removed " + 
removed_blocks + " out of " + total_blocks + "(took " + mlsec + " millisecs)"); // report the error StringBuilder sb = new StringBuilder(); for(FSVolume fv : failed_vols) { sb.append(fv.toString() + ";"); } throw new DiskErrorException("DataNode failed volumes:" + sb); } /** * remove directories that are given from the list of volumes to use. * This function also makes sure to remove all the blocks that belong to * these volumes. */ public void removeVolumes(Configuration conf, List<File> directories) throws Exception { if (directories == null || directories.isEmpty()) { DataNode.LOG.warn("There were no directories to remove. Exiting "); return; } List<FSVolume> volArray = null; lock.readLock().lock(); try { volArray = volumes.removeBVolumes(directories); } finally { lock.readLock().unlock(); } // remove related blocks long mlsec = System.currentTimeMillis(); lock.writeLock().lock(); try { volumeMap.removeUnhealthyVolumes(volArray); } finally { lock.writeLock().unlock(); } mlsec = System.currentTimeMillis() - mlsec; DataNode.LOG.warn(">>>>>>>>>Removing these blocks took " + mlsec + " millisecs in refresh<<<<<<<<<<<<<<< "); StringBuilder sb = new StringBuilder(); for(FSVolume fv : volArray) { sb.append(fv.toString() + ";"); } throw new DiskErrorException("These volumes were removed: " + sb); } public void addVolumes(Configuration conf, int namespaceId, String nsDir, Collection<StorageDirectory> dirs) throws Exception { if (dirs == null || dirs.isEmpty()) { return; } FSVolume[] volArray = new FSVolume[dirs.size()]; File[] dirArray = new File[dirs.size()]; int idx = 0; for (Iterator<StorageDirectory> iter = dirs.iterator() ; iter.hasNext(); idx++) { dirArray[idx] = iter.next().getCurrentDir(); volArray[idx] = new FSVolume(this, dirArray[idx], conf); } lock.writeLock().lock(); try { volumes.addVolumes(volArray); for (FSVolume vol : volArray) { vol.addNamespace(namespaceId, nsDir, conf, datanode.isSupportAppends()); } } finally { lock.writeLock().unlock(); } 
asyncDiskService.insertDisk(dirArray, conf);
  }

  @Override
  public String toString() {
    return "FSDataset{dirpath='"+volumes+"'}";
  }

  ObjectName mbeanName;
  ObjectName versionBeanName;
  Random rand = new Random();

  /**
   * Register the FSDataset MBean using the name
   *        "hadoop:service=DataNode,name=FSDatasetState-<storageid>"
   *
   * @param storageId storage id used in the bean name; when null or empty a
   *          random placeholder name is used instead
   */
  void registerMBean(final String storageId) {
    // We wrap to bypass standard mbean naming convetion.
    // This wraping can be removed in java 6 as it is more flexible in
    // package naming for mbeans and their impl.
    StandardMBean bean;
    String storageName;
    if (storageId == null || storageId.equals("")) {
      // Temp fix for the uninitialized storage
      storageName = "UndefinedStorageId" + rand.nextInt();
    } else {
      storageName = storageId;
    }
    try {
      bean = new StandardMBean(this,FSDatasetMBean.class);
      mbeanName = MBeanUtil.registerMBean("DataNode", "FSDatasetState-" + storageName, bean);
      versionBeanName = VersionInfo.registerJMX("DataNode");
      // Only report success when registration did not fail.
      DataNode.LOG.info("Registered FSDatasetStatusMBean");
    } catch (NotCompliantMBeanException e) {
      // Route the failure through the datanode log instead of stderr.
      DataNode.LOG.warn("Failed to register FSDatasetState-" + storageName
          + " MBean", e);
    }
  }

  /**
   * Stops the block CRC flusher, unregisters the MBeans, shuts down the
   * async disk service and finally shuts down every volume.
   */
  public void shutdown() {
    if (blockCrcMapFlusher != null) {
      blockCrcMapFlusher.setClose();
    }
    if (blockCrcMapFlusherThread != null) {
      blockCrcMapFlusherThread.interrupt();
      try {
        this.blockCrcMapFlusherThread.join();
        this.blockCrcMapFlusherThread = null;
      } catch (InterruptedException ie) {
        // Interrupted while waiting for the flusher thread; the remaining
        // shutdown steps below are still carried out.
      }
    }

    if (mbeanName != null) {
      MBeanUtil.unregisterMBean(mbeanName);
    }
    if (versionBeanName != null) {
      MBeanUtil.unregisterMBean(versionBeanName);
    }
    if (asyncDiskService != null) {
      asyncDiskService.shutdown();
    }

    if (volumes != null) {
      lock.writeLock().lock();
      try {
        if (volumes.scannersExecutor != null) {
          volumes.scannersExecutor.shutdown();
        }

        for (FSVolume volume : volumes.getVolumes()) {
          if (volume != null) {
            volume.shutdown();
          }
        }
      } finally {
        lock.writeLock().unlock();
      }
    }
  }

  public void addNamespace(int namespaceId, String nsDir, Configuration conf)
      throws IOException {
    DataNode.LOG.info("Adding namespace " + namespaceId);
    lock.writeLock().lock();
try{ volumeMap.initNamespace(namespaceId); volumes.addNamespace(namespaceId, nsDir, conf); } finally { lock.writeLock().unlock(); } // Load block CRCs file files int numBuckets = volumeMap.getNumBuckets(namespaceId); for (FSVolume volume : volumes.getVolumes()) { try { File blockCrcFile = volume.getBlockCrcFile(namespaceId); if (blockCrcFile == null || !blockCrcFile.exists()) { continue; } int numUpdated = 0; FileInputStream fis = new FileInputStream(blockCrcFile); try { BlockCrcFileReader reader = new BlockCrcFileReader( new DataInputStream(fis)); reader.readHeader(); if (reader.getNumBuckets() != numBuckets) { // TODO: support it if needed. Now it's not clear whether we will // ever need it. DataNode.LOG .warn("Do not yet support loading block CRCs if bucket size changes: bucket size on disk: " + reader.getNumBuckets()); } else { numUpdated += volumeMap.updateBlockCrc(namespaceId, reader); } } finally { fis.close(); } DataNode.LOG.info("Finish loading Block CRC file for namespace " + namespaceId + " volume " + volume + " " + numUpdated + " blocks' CRC updated."); } catch (IOException ioe) { DataNode.LOG.warn("IOException when try to load block CRC fle from volume" + volume.getDir(), ioe); } finally { volume.setNamespaceBlockCrcLoaded(namespaceId, true); } } } public void removeNamespace(int namespaceId){ DataNode.LOG.info("Removing namespace " + namespaceId); lock.writeLock().lock(); try{ if (volumeMap != null) { volumeMap.removeNamespace(namespaceId); } if (volumes != null) { volumes.removeNamespace(namespaceId); } } finally { lock.writeLock().unlock(); } } public String getStorageInfo() { return toString(); } @Override public BlockRecoveryInfo startBlockRecovery(int namespaceId, long blockId) throws IOException { Block stored = getStoredBlock(namespaceId, blockId, true); if (stored == null) { return null; } // It's important that this loop not be synchronized - otherwise // this will deadlock against the thread it's joining against! 
while (true) { DataNode.LOG.debug( "Interrupting active writer threads for block " + stored); List<Thread> activeThreads = getActiveThreads(namespaceId, stored); if (activeThreads == null) break; if (interruptAndJoinThreads(activeThreads)) break; } lock.readLock().lock(); try { // now that writers are stopped, re-fetch the block's meta info stored = getStoredBlock(namespaceId, blockId, true); if (stored == null) { return null; } ActiveFile activeFile = volumeMap.getOngoingCreates(namespaceId, stored); boolean isRecovery = (activeFile != null) && activeFile.wasRecoveredOnStartup; BlockRecoveryInfo info = new BlockRecoveryInfo(stored, isRecovery); if (DataNode.LOG.isDebugEnabled()) { DataNode.LOG.debug("getBlockMetaDataInfo successful block=" + stored + " length " + stored.getNumBytes() + " genstamp " + stored.getGenerationStamp()); } // paranoia! verify that the contents of the stored block // matches the block file on disk. validateBlockMetadata(namespaceId, stored); return info; } finally { lock.readLock().unlock(); } } /** * Copies a file as fast as possible. Tries to do a hardlink instead of a copy * if the hardlink parameter is specified. * * @param src * the source file for copying * @param dst * the destination file for copying * @param hardlink * whether or not to attempt a hardlink * @throws IOException */ public void copyFile(File src, File dst, boolean hardlink) throws IOException { if (src == null || dst == null) { throw new IOException("src/dst file is null"); } try { if (hardlink && shouldHardLinkBlockCopy) { // Remove destination before hard linking, since this file might already // exist and a hardlink would fail as a result. 
if (dst.exists()) { if(!dst.delete()) { throw new IOException("Deletion of file : " + dst + " failed"); } } NativeIO.link(src, dst); DataNode.LOG.info("Hard Link Created from : " + src + " to " + dst); return; } } catch (IOException e) { DataNode.LOG.warn("Hard link failed from : " + src + " to " + dst + " continuing with regular file copy"); } FileChannel input = null; FileChannel output = null; try { // This improves copying performance a lot, it uses native buffers // for copying. input = new FileInputStream(src).getChannel(); output = new FileOutputStream(dst).getChannel(); if (input == null || output == null) { throw new IOException("Could not create file channels for src : " + src + " dst : " + dst); } long bytesLeft = input.size(); long position = 0; while (bytesLeft > 0) { long bytesWritten = output.transferFrom(input, position, bytesLeft); bytesLeft -= bytesWritten; position += bytesWritten; } if (datanode.syncOnClose) { output.force(true); } } finally { if (input != null) { input.close(); } if (output != null) { output.close(); } } } /** * Find a volume on the datanode for the destination block to be placed on. * It tries to place the destination block on the same volume as the source * block since hardlinks can be performed only between two files on the same * disk * * @param srcFileSystem * the file system for srcBlockFile * @param srcNamespaceId * the namespace id for srcBlock * @param srcBlock * the source block which needs to be hardlinked * @param srcBlockFile * the block file for srcBlock * @return the FSVolume on which we should put the dstBlock, null if we can't * find such a volume. 
* @throws IOException */ private FSVolume findVolumeForHardLink(String srcFileSystem, int srcNamespaceId, Block srcBlock, File srcBlockFile) throws IOException { FSVolume dstVol = null; if (srcBlockFile == null || !srcBlockFile.exists()) { throw new IOException("File " + srcBlockFile + " is not valid or does not have" + " a valid block file"); } // The source file might not necessarily be a part of the FSVolumeSet of // this datanode, it could be part of a FSVolumeSet of another datanode on // the same host. DatanodeBlockInfo blockInfo = volumeMap.get(srcNamespaceId, srcBlock); if (blockInfo != null) { dstVol = blockInfo.getBlockDataFile().getVolume(); } else { for(FSVolume volume : volumes.getVolumes()) { String volFileSystem = volume.getFileSystem(); if (volFileSystem.equals(srcFileSystem)) { dstVol = volume; break; } } } return dstVol; } /** * Finds a volume for the dstBlock and adds the new block to the FSDataset * data structures to indicate we are going to start writing to the block. * * @param srcFileSystem * the file system for srcBlockFile * @param srcBlockFile * the block file for the srcBlock * @param srcNamespaceId * the namespace id for source block * @param srcBlock * the source block that needs to be copied over * @param dstNamespaceId * the namespace id for destination block * @param dstBlock * the new destination block that needs to be created for copying * @return returns whether or not a hardlink is possible, if hardlink was not * requested this is always false. 
* @throws IOException */
  private boolean copyBlockLocalAdd(String srcFileSystem, File srcBlockFile,
      int srcNamespaceId, Block srcBlock, int dstNamespaceId, Block dstBlock)
      throws IOException {
    boolean hardlink = true;
    lock.writeLock().lock();
    try {
      // Refuse to overwrite a block that is already finalized or is
      // currently being written.
      if (isValidBlock(dstNamespaceId, dstBlock, false)
          || volumeMap.getOngoingCreates(dstNamespaceId, dstBlock) != null) {
        throw new BlockAlreadyExistsException("Block " + dstBlock
            + " already exists");
      }

      if (srcBlockFile == null || !srcBlockFile.exists()) {
        throw new IOException("Block " + srcBlock.getBlockName()
            + " is not valid or does not have a valid block file");
      }

      // The on-disk layout of the copy follows the layout of the source file.
      boolean inlineChecksum = Block.isInlineChecksumBlockFilename(srcBlockFile
          .getName());

      FSVolume dstVol = null;
      if (shouldHardLinkBlockCopy) {
        dstVol = findVolumeForHardLink(
            srcFileSystem, srcNamespaceId, srcBlock, srcBlockFile);
      }

      // Could not find a volume for a hard link, fall back to regular file
      // copy.
      if (dstVol == null) {
        dstVol = volumes.getNextVolume(srcBlock.getNumBytes());
        hardlink = false;
      }

      int checksumType = DataChecksum.CHECKSUM_UNKNOWN;
      int bytesPerChecksum = -1;
      if (inlineChecksum) {
        GenStampAndChecksum sac = BlockInlineChecksumReader
            .getGenStampAndChecksumFromInlineChecksumFile(srcBlockFile
                .getName());
        checksumType = sac.checksumType;
        bytesPerChecksum = sac.bytesPerChecksum;
      }

      // Kept as a typed local so the ActiveFile constructor call below
      // resolves exactly as before.
      List<Thread> threads = null;
      // We do not want to create a BBW, hence treat this as a replication
      // request.
      File dstBlockFile = createTmpFile(dstNamespaceId, dstVol, dstBlock, true,
          inlineChecksum, checksumType, bytesPerChecksum);
      // Validate the allocation BEFORE registering the block, so a failed
      // allocation never leaves an entry with a null file in volumeMap or in
      // the ongoing-creates table.
      if (dstBlockFile == null) {
        throw new IOException("Could not allocate block file for : "
            + dstBlock.getBlockName());
      }

      DatanodeBlockInfo binfo = new DatanodeBlockInfo(dstVol, dstBlockFile,
          DatanodeBlockInfo.UNFINALIZED, true, inlineChecksum, checksumType,
          bytesPerChecksum, false, 0);
      volumeMap.add(dstNamespaceId, dstBlock, binfo);
      volumeMap.addOngoingCreates(dstNamespaceId, dstBlock, new ActiveFile(
          binfo, threads, ActiveFile.UNKNOWN_SIZE, false));
    } finally {
      lock.writeLock().unlock();
    }
    return hardlink;
  }

  /**
   * Finalize the block in FSDataset: move the copied file into its volume via
   * {@code addBlock}, replace the volumeMap entry with one that carries the
   * computed block size, and drop the block from the ongoing-creates table.
   * All of this happens while holding the write lock.
   *
   * @param dstNamespaceId
   *          the namespace id for dstBlock
   * @param dstBlock
   *          the block that needs to be finalized
   * @param dstBlockFile
   *          the block file for the block that has to be finalized
   * @throws IOException
   *           if volumeMap has no entry for dstBlock, or if a callee fails
   */
  private void copyBlockLocalFinalize(int dstNamespaceId,
      Block dstBlock, File dstBlockFile)
      throws IOException {
    boolean inlineChecksum = Block.isInlineChecksumBlockFilename(dstBlockFile
        .getName());
    long blkSize = 0;
    long fileSize = dstBlockFile.length();
    lock.writeLock().lock();
    try {
      DatanodeBlockInfo info = volumeMap.get(dstNamespaceId, dstBlock);
      if (info == null) {
        throw new IOException("Could not find information for " + dstBlock);
      }
      // For an inline-checksum file the block size is derived from the file
      // length and checksum parameters; otherwise it is the file length.
      if (inlineChecksum) {
        blkSize = BlockInlineChecksumReader.getBlockSizeFromFileLength(fileSize,
            info.getChecksumType(), info.getBytesPerChecksum());
      } else {
        blkSize = fileSize;
      }

      FSVolume dstVol = info.getBlockDataFile().getVolume();
      // Finalize block on disk.
      File dest = dstVol.addBlock(dstNamespaceId, dstBlock, dstBlockFile,
          info.isInlineChecksum(), info.getChecksumType(),
          info.getBytesPerChecksum());
      volumeMap.add(dstNamespaceId, dstBlock,
          new DatanodeBlockInfo(dstVol, dest, blkSize, true, inlineChecksum,
              info.getChecksumType(), info.getBytesPerChecksum(), false, 0));
      volumeMap.removeOngoingCreates(dstNamespaceId, dstBlock);
    } finally {
      lock.writeLock().unlock();
    }
  }

  /** {@inheritDoc} */
  @Override
  public void copyBlockLocal(String srcFileSystem, File srcBlockFile,
      int srcNamespaceId, Block srcBlock, int dstNamespaceId, Block dstBlock)
      throws IOException {
    try {
      // Step 1: register the destination block and learn whether a hard link
      // can be used instead of a byte copy.
      boolean hardlink = copyBlockLocalAdd(srcFileSystem, srcBlockFile,
          srcNamespaceId, srcBlock, dstNamespaceId, dstBlock);

      DatanodeBlockInfo binfo = volumeMap.get(dstNamespaceId, dstBlock);
      // The lookup happens outside the write lock, so the entry may have been
      // removed in the meantime. Fail with an IOException (which triggers the
      // cleanup below) rather than with a NullPointerException.
      if (binfo == null) {
        throw new IOException("Could not find information for " + dstBlock);
      }
      File dstBlockFile = binfo.getDataFileToRead();

      // Step 2: copy (or hard link) the data file.
      copyFile(srcBlockFile, dstBlockFile, hardlink);

      // Step 3: copy the separate meta file when the block does not use the
      // inline-checksum layout.
      if (!binfo.isInlineChecksum()) {
        File metaFileSrc = BlockWithChecksumFileWriter.getMetaFile(srcBlockFile,
            srcBlock);
        File metaFileDst = BlockWithChecksumFileWriter.getMetaFile(dstBlockFile,
            dstBlock);
        copyFile(metaFileSrc, metaFileDst, hardlink);
      }

      // Step 4: finalize block.
      copyBlockLocalFinalize(dstNamespaceId, dstBlock, dstBlockFile);
    } catch (BlockAlreadyExistsException be) {
      // The destination block is not ours to clean up.
      throw be;
    } catch (IOException e) {
      // Best-effort cleanup. A failure here is logged and must not replace
      // the exception that describes why the copy failed.
      try {
        unfinalizeBlock(dstNamespaceId, dstBlock);
      } catch (IOException cleanupFailure) {
        LOG.warn("Failed to clean up block " + dstBlock
            + " after a failed local copy", cleanupFailure);
      }
      throw e;
    }
  }

  /** {@inheritDoc} */
  @Override
  public String getFileSystemForBlock(int namespaceId, Block block)
      throws IOException {
    if (!isValidBlock(namespaceId, block, false)) {
      throw new IOException("Invalid block");
    }
    // The block can disappear from volumeMap between the validity check and
    // this lookup; report that the same way as an invalid block.
    DatanodeBlockInfo info = volumeMap.get(namespaceId, block);
    if (info == null) {
      throw new IOException("Invalid block");
    }
    return info.getBlockDataFile().getVolume().getFileSystem();
  }

  /**
   * Creates the zero-length temporary file {@code f} for block {@code b}.
   *
   * @param b
   *          the block the file is created for (used in error messages only)
   * @param f
   *          the file to create; it must not exist yet
   * @return {@code f}, once it has been created
   * @throws IOException
   *           if {@code f} already exists, or if creating it fails (the
   *           underlying exception is attached as the cause)
   */
  static File createTmpFile(Block b, File f) throws IOException {
    if (f.exists()) {
      throw new IOException("Unexpected problem in creating temporary file for "+
                            b + ". File " + f + " should not be present, but is.");
    }
    // Create the zero-length temp file
    //
    boolean fileCreated = false;
    try {
      fileCreated = f.createNewFile();
    } catch (IOException ioe) {
      throw new IOException(DISK_ERROR + f, ioe);
    }
    // createNewFile() returns false when the file appeared after the
    // exists() check above.
    if (!fileCreated) {
      throw new IOException("Unexpected problem in creating temporary file for "+
                            b + ". File " + f + " should be creatable, but is already present.");
    }
    return f;
  }

  /**
   * Returns the size volumeMap reports for the given namespace, or -1 when
   * the lookup throws any exception.
   */
  @Override
  public long size(int namespaceId) {
    try {
      return volumeMap.size(namespaceId);
    } catch (Exception e) {
      // Deliberate best-effort: callers receive -1 instead of an exception.
      return -1;
    }
  }

  /**
   * Reconcile the difference between blocks on the disk and blocks in
   * volumeMap.
   *
   * Check the given block for inconsistencies. Nothing is done when the block
   * has an entry in {@code delta}, or when its in-memory entry is not
   * finalized. Otherwise the reported state is reconciled as follows:
   * <ul>
   * <li>{@code DISK_FILES_MISSING}: if the block is present in volumeMap,
   * remove it from volumeMap and from the block scanner (if any)</li>
   * <li>{@code MEMORY_BLOCK_MISSING}: if a block file exists, add the block
   * to volumeMap and to the block scanner (if any); otherwise schedule the
   * reported meta file for asynchronous deletion</li>
   * <li>{@code OUT_OF_SYNC}: if there is no block file, remove the block from
   * volumeMap and from the block scanner (if any) and schedule the reported
   * meta file for asynchronous deletion; otherwise overwrite the in-memory
   * block length and generation stamp with the reported values</li>
   * </ul>
   * The whole method runs under the write lock, with delta recording
   * suspended so that the reconciliation itself is not recorded.
   *
   * @param nsid namespace id of the block
   * @param delta record of dataset changes; consulted to skip blocks that
   *          already have an entry in it
   * @param info the difference reported for one block (block id, state,
   *          files, volume, length and generation stamp)
   * @throws IOException if a callee fails
   */
  public void checkAndUpdate(Integer nsid, FSDatasetDelta delta,
      ScanDifference info) throws IOException {
    long blockId = info.getBlockId();

    lock.writeLock().lock();
    try {
      // we don't want delta to record changes we do during reconciliation
      delta.stopRecordingDelta();

      if (delta.get(nsid, blockId) != null) {
        // FIXME Presence of the block in delta means that it was changed
        // somehow during the interval of time right after the difference
        // computation in directory scanner and before acquiring of writeLock
        // in this method.
        // We can probably go through different operations that could happen
        // with the block and write some logic for each of them, but this adds
        // lots of complexity. Instead we just skip reconciliation for the
        // block at this time. If it has problems we're likely to solve them
        // next time
        return;
      }

      Block memBlock = new Block(blockId, 0, GenerationStamp.WILDCARD_STAMP);
      DatanodeBlockInfo memBlockInfo = volumeMap.get(nsid, memBlock);
      if (memBlockInfo != null && !memBlockInfo.isFinalized()) {
        // Block is not finalized - ignore the difference
        return;
      }

      // We don't have any files for this block on disk
      if (info.getState() == ScanDifference.DISK_FILES_MISSING) {
        if (memBlockInfo == null) {
          return;
        }
        volumeMap.remove(nsid, memBlock);
        LOG.info("checkAndUpdate: removing block: " + memBlock
            + " for namespace: " + nsid);
        if (datanode.blockScanner != null) {
          datanode.blockScanner.deleteBlock(nsid, memBlock);
        }
        return;
      }

      // We don't have block in memory, but have some of its files on disk
      if (info.getState() == ScanDifference.MEMORY_BLOCK_MISSING) {
        // if there's a block file, then add it to volumeMap, otherwise
        // remove metaFile if any
        if (info.getBlockFile() != null) {
          Block newBlock = new Block(blockId, info.getLength(),
              info.getGenStamp());
          boolean isInlineChecksum = info.isInlineChecksum();
          DatanodeBlockInfo diskBlockInfo = null;
          if (isInlineChecksum) {
            GenStampAndChecksum sac = BlockInlineChecksumReader
                .getGenStampAndChecksumFromInlineChecksumFile(info
                    .getBlockFile().getName());
            diskBlockInfo = new DatanodeBlockInfo(info.getVolume(),
                info.getBlockFile(), info.getLength(), true, true,
                sac.checksumType, sac.bytesPerChecksum, false, 0);
          } else {
            diskBlockInfo = new DatanodeBlockInfo(info.getVolume(),
                info.getBlockFile(), info.getLength(), true, false,
                DataChecksum.CHECKSUM_UNKNOWN, -1, false, 0);
          }
          volumeMap.add(nsid, newBlock, diskBlockInfo);
          LOG.info("checkAndUpdate: adding block: " + newBlock
              + " for namespace: " + nsid + " size: "
              + diskBlockInfo.getBytesVisible());
          if (datanode.blockScanner != null) {
            datanode.blockScanner.addBlock(nsid, newBlock);
          }
        } else {
          // scheduling a file for deletion
          asyncDiskService
              .deleteAsyncFile(info.getVolume(), info.getMetaFile());
        }
        return;
      }

      // We have this block in memory and some of its files on disk
      if (info.getState() == ScanDifference.OUT_OF_SYNC) {
        if (info.getBlockFile() == null) {
          volumeMap.remove(nsid, memBlock);
          LOG.info("checkAndUpdate: removing block: " + memBlock
              + " for namespace: " + nsid);
          if (datanode.blockScanner != null) {
            datanode.blockScanner.deleteBlock(nsid, memBlock);
          }
          // scheduling a file for deletion
          asyncDiskService
              .deleteAsyncFile(info.getVolume(), info.getMetaFile());
        } else {
          if (memBlockInfo == null) {
            return;
          }
          // Bring the in-memory block in line with what was found on disk.
          memBlockInfo.getBlock().setNumBytes(info.getLength());
          memBlockInfo.getBlock().setGenerationStamp(info.getGenStamp());
          LOG.info("checkAndUpdate: updating block: " + memBlockInfo
              + " for namespace: " + nsid);
        }
        return;
      }
    } finally {
      // Resume delta recording first; the nested finally guarantees the lock
      // is released even if that call throws.
      try {
        delta.startRecordingDelta();
      } finally {
        lock.writeLock().unlock();
      }
    }
  }
}