/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.datanode;
import java.nio.channels.FileChannel;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileDescriptor;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InterruptedIOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import javax.management.NotCompliantMBeanException;
import javax.management.ObjectName;
import javax.management.StandardMBean;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.DF;
import org.apache.hadoop.fs.DU;
import org.apache.hadoop.fs.DU.NamespaceSliceDU;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.server.common.Storage;
import org.apache.hadoop.hdfs.server.common.GenerationStamp;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
import org.apache.hadoop.hdfs.server.datanode.BlockInlineChecksumReader.GenStampAndChecksum;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.DirectoryScanner.ScanDifference;
import org.apache.hadoop.hdfs.server.datanode.NamespaceMap.BlockBucket;
import org.apache.hadoop.hdfs.server.datanode.metrics.FSDatasetMBean;
import org.apache.hadoop.hdfs.server.protocol.BlockFlags;
import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryInfo;
import org.apache.hadoop.hdfs.server.protocol.InterDatanodeProtocol;
import org.apache.hadoop.hdfs.util.LightWeightHashSet;
import org.apache.hadoop.io.nativeio.NativeIO;
import org.apache.hadoop.metrics.util.MBeanUtil;
import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.util.VersionInfo;
import org.apache.hadoop.util.DiskChecker;
import org.apache.hadoop.util.DiskChecker.DiskErrorException;
import org.apache.hadoop.util.DiskChecker.DiskOutOfSpaceException;
/**************************************************
* FSDataset manages a set of data blocks. Each block
* has a unique name and an extent on disk.
*
***************************************************/
public class FSDataset implements FSConstants, FSDatasetInterface {
public static final Log LOG = LogFactory.getLog(FSDataset.class);
interface FSDatasetDeltaInterface {
void addBlock(int namespaceId, Block block);
void removeBlock(int namespaceId, Block block);
void updateBlock(int namespaceId, Block oldBlock, Block newBlock);
}
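/** Extract the plain file names from an array of Files, so callers can
* parse names during a directory scan without repeated getName() calls. */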
static String[] getFileNames(File[] files) {
String[] fileNames = new String[files.length];
for (int i = 0; i < files.length; i++) {
fileNames[i] = files[i].getName();
}
return fileNames;
}
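/**
* Reconstruct a Block from the on-disk file at the given index, handling
* both layouts: separate-checksum block files (generation stamp read from
* the companion checksum file) and inline-checksum files (generation stamp
* and checksum parameters encoded in the file name itself).
* @return the Block, or null if the file is not a block file
*/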
static Block getBlockFromNames(File blockFiles[], String[] blockFilesNames, int index)
throws IOException {
if (Block.isSeparateChecksumBlockFilename(blockFilesNames[index])) {
long genStamp = BlockWithChecksumFileReader
.getGenerationStampFromSeperateChecksumFile(blockFilesNames,
blockFilesNames[index]);
return new Block(blockFiles[index], blockFiles[index].length(),
genStamp);
} else if (Block.isInlineChecksumBlockFilename(blockFilesNames[index])) {
// TODO: We might want to optimize this.
GenStampAndChecksum sac = BlockInlineChecksumReader
.getGenStampAndChecksumFromInlineChecksumFile(blockFilesNames[index]);
long blockLength = BlockInlineChecksumReader.getBlockSizeFromFileLength(
blockFiles[index].length(), sac.checksumType, sac.bytesPerChecksum);
return new Block(blockFiles[index], blockLength, sac.generationStamp);
}
return null;
}
/**
* A NamespaceSlice represents a portion of a namespace stored on a volume.
* Taken together, all NamespaceSlices sharing a namespaceId across a
* cluster represent a single namespace.
*/
class NamespaceSlice {
private final int namespaceId;
private final FSVolume volume; // volume to which this NamespaceSlice belongs
private final FSDir dataDir; // StorageDirectory/current/nsid/current/finalized
private final File detachDir; // directory for copy-on-write (detached) replicas
private final File rbwDir ; // directory for replicas being written (RBW)
private final File tmpDir; // directory for temporary replicas
private final NamespaceSliceDU dfsUsage;
private volatile boolean blockCrcFileLoaded;
/**
* @param namespaceId id of the namespace this slice belongs to
* @param volume {@link FSVolume} to which this NamespaceSlice belongs
* @param nsDir directory corresponding to the NamespaceSlice
* @param conf configuration to use
* @throws IOException
*/
NamespaceSlice(int namespaceId, FSVolume volume, File nsDir, Configuration conf, boolean supportAppends)
throws IOException {
this.namespaceId = namespaceId;
this.volume = volume;
File nsDirCur = new File(nsDir, DataStorage.STORAGE_DIR_CURRENT);
File dataDirFile = new File(nsDirCur, DataStorage.STORAGE_DIR_FINALIZED);
this.dataDir = new FSDir(namespaceId, dataDirFile, volume);
this.detachDir = new File(nsDir, "detach");
if (detachDir.exists()) {
recoverDetachedBlocks(dataDirFile, detachDir);
}
// Files that were being written when the datanode was last shut down
// are deleted (the tmp directory is renamed and then removed). It is
// possible that in the future, we might want to do some sort of
// datanode-local recovery for these blocks. For example, crc validation.
//
this.tmpDir = new File(nsDir, "tmp");
if (tmpDir.exists()) {
// rename tmpDir to prepare delete
File toDeleteDir = new File(tmpDir.getParent(),
DELETE_FILE_EXT + tmpDir.getName());
if (tmpDir.renameTo(toDeleteDir)) {
// asynchronously delete the renamed directory
asyncDiskService.deleteAsyncFile(volume, toDeleteDir);
} else {
// rename failed, let's synchronously delete the directory
FileUtil.fullyDelete(tmpDir);
DataNode.LOG.warn("Deleted " + tmpDir.getPath());
}
}
this.rbwDir = new File(nsDirCur, DataStorage.STORAGE_DIR_RBW);
// Files that were being written when the datanode was last shut down
// should not be deleted if append mode is enabled.
if (rbwDir.exists()) {
recoverBlocksBeingWritten(rbwDir);
}
if (!rbwDir.mkdirs()) {
if (!rbwDir.isDirectory()) {
throw new IOException("Mkdirs failed to create " + rbwDir.toString());
}
}
if (!tmpDir.mkdirs()) {
if (!tmpDir.isDirectory()) {
throw new IOException("Mkdirs failed to create " + tmpDir.toString());
}
}
if (!detachDir.mkdirs()) {
if (!detachDir.isDirectory()) {
throw new IOException("Mkdirs failed to create " + detachDir.toString());
}
}
this.dfsUsage = volume.dfsUsage.addNamespace(namespaceId, nsDir, conf);
this.blockCrcFileLoaded = false;
}
void getBlockInfo(LightWeightHashSet<Block> blocks) throws IOException{
dataDir.getBlockInfo(blocks);
}
boolean isBlockCrcFileLoaded() {
return blockCrcFileLoaded;
}
void setBlockCrcFileLoaded(boolean blockCrcFileLoaded) {
this.blockCrcFileLoaded = blockCrcFileLoaded;
}
/**
* Recover detached files on datanode restart. If a detached block
* does not exist in the original directory, then it is moved to the
* original directory.
*/
private void recoverDetachedBlocks(File dataDir, File dir)
throws IOException {
File contents[] = dir.listFiles();
if (contents == null) {
return;
}
for (int i = 0; i < contents.length; i++) {
if (!contents[i].isFile()) {
throw new IOException ("Found " + contents[i] + " in " + dir +
" but it is not a file.");
}
//
// If the original block file still exists, then no recovery
// is needed.
//
File blk = new File(dataDir, contents[i].getName());
if (!blk.exists()) {
if (!contents[i].renameTo(blk)) {
throw new IOException("Unable to recover detached file " +
contents[i]);
}
continue;
}
if (!contents[i].delete()) {
throw new IOException("Unable to cleanup detached file " +
contents[i]);
}
}
}
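/**
* Collect a Block for every replica file found directly under rbwDir
* and add it to the given set.
*/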
void getBlocksBeingWrittenInfo(LightWeightHashSet<Block> blockSet) throws IOException {
if (rbwDir == null) {
return;
}
File[] blockFiles = rbwDir.listFiles();
if (blockFiles == null) {
return;
}
String[] blockFileNames = getFileNames(blockFiles);
for (int i = 0; i < blockFiles.length; i++) {
if (!blockFiles[i].isDirectory()) {
// get each block in the rbwDir directory
Block block = FSDataset.getBlockFromNames(blockFiles, blockFileNames, i);
if (block != null) {
// add this block to block set
blockSet.add(block);
if (DataNode.LOG.isDebugEnabled()) {
DataNode.LOG.debug("recoverBlocksBeingWritten for block " + block);
}
}
}
}
}
/**
* Recover blocks that were being written when the datanode
* was earlier shut down. These blocks get re-inserted into
* ongoingCreates. Also, send a blockreceived message to the NN
* for each of these blocks because these are not part of a
* block report.
*/
private void recoverBlocksBeingWritten(File bbw) throws IOException {
FSDir fsd = new FSDir(namespaceId, bbw, this.volume);
LightWeightHashSet<BlockAndFile> blockSet = new LightWeightHashSet<BlockAndFile>();
fsd.getBlockAndFileInfo(blockSet);
for (BlockAndFile b : blockSet) {
File f = b.pathfile; // full path name of block file
lock.writeLock().lock();
try {
boolean isInlineChecksum = Block.isInlineChecksumBlockFilename(f
.getName());
int checksumType = DataChecksum.CHECKSUM_UNKNOWN;
int bytesPerChecksum = -1;
if (isInlineChecksum) {
GenStampAndChecksum sac = BlockInlineChecksumReader
.getGenStampAndChecksumFromInlineChecksumFile(f.getName());
checksumType = sac.checksumType;
bytesPerChecksum = sac.bytesPerChecksum;
}
DatanodeBlockInfo binfo = new DatanodeBlockInfo(volume, f,
DatanodeBlockInfo.UNFINALIZED, true, isInlineChecksum,
checksumType, bytesPerChecksum, false, 0);
volumeMap.add(namespaceId, b.block, binfo);
volumeMap.addOngoingCreates(namespaceId, b.block, new ActiveFile(
binfo, true, ActiveFile.UNKNOWN_SIZE, false));
} finally {
lock.writeLock().unlock();
}
if (DataNode.LOG.isDebugEnabled()) {
DataNode.LOG.debug("recoverBlocksBeingWritten for block " + b.block + " namespaceId: " + namespaceId);
}
}
}
File getDirectory() {
return dataDir.getDirectory().getParentFile();
}
File getCurrentDir() {
return dataDir.getDirectory();
}
File getRbwDir() {
return rbwDir;
}
void decDfsUsed(long value) {
dfsUsage.decDfsUsed(value);
}
long getDfsUsed() throws IOException {
return dfsUsage.getUsed();
}
/**
* Temporary files. They get moved to the finalized block directory when
* the block is finalized.
*/
File createTmpFile(Block b) throws IOException {
File f = new File(tmpDir, b.getBlockName());
return FSDataset.createTmpFile(b, f);
}
File createDetachFile(Block b) throws IOException {
File f = new File(detachDir, b.getBlockName());
return FSDataset.createTmpFile(b, f);
}
File getTmpFile(Block b) throws IOException {
File f = new File(tmpDir, b.getBlockName());
return f;
}
/**
* Temporary files. They get moved to the finalized block directory when
* the block is finalized.
*/
File createTmpFile(Block b, boolean replicationRequest,
boolean inlineChecksum, int checksumType, int bytesPerChecksum)
throws IOException {
File f= null;
String fileName;
if (inlineChecksum) {
fileName = BlockInlineChecksumWriter.getInlineChecksumFileName(b,
checksumType, bytesPerChecksum);
} else {
fileName = b.getBlockName();
}
if (!replicationRequest) {
f = new File(rbwDir, fileName);
} else {
f = new File(tmpDir, fileName);
}
return FSDataset.createTmpFile(b, f);
}
/**
* RBW files. They get moved to the finalized block directory when
* the block is finalized.
*/
File createRbwFile(Block b) throws IOException {
File f = new File(rbwDir, b.getBlockName());
return FSDataset.createTmpFile(b, f);
}
File addBlock(Block b, File f, boolean inlineChecksum, int checksumType,
int bytesPerChecksum) throws IOException {
File blockFile = dataDir.addBlock(namespaceId, b, f, inlineChecksum,
checksumType, bytesPerChecksum);
long spaceAdded;
if (!inlineChecksum) {
File metaFile = BlockWithChecksumFileWriter.getMetaFile(blockFile , b);
spaceAdded = b.getNumBytes() + metaFile.length();
} else {
spaceAdded = blockFile.length();
}
dfsUsage.incDfsUsed(spaceAdded);
return blockFile;
}
void checkDirs() throws DiskErrorException {
dataDir.checkDirTree();
DiskChecker.checkDir(tmpDir);
DiskChecker.checkDir(detachDir);
DiskChecker.checkDir(rbwDir);
}
void clearPath(File f) {
dataDir.clearPath(f);
}
public String toString() {
return dataDir.getDirectory().getAbsolutePath();
}
public void shutdown() {
volume.dfsUsage.removeNamespace(namespaceId);
}
}
/**
* A data structure that encapsulates a Block along with the full pathname
* of the block file
*/
static class BlockAndFile implements Comparable<BlockAndFile> {
final Block block;
final File pathfile;
BlockAndFile(File fullpathname, Block block) {
this.pathfile = fullpathname;
this.block = block;
}
public int compareTo(BlockAndFile o)
{
return this.block.compareTo(o.block);
}
}
/**
* A node type that can be built into a tree reflecting the
* hierarchy of blocks on the local disk.
*/
class FSDir {
File dir;
int numBlocks = 0;
volatile FSDir childrenDirs[];
int lastChildIdx = 0;
File getDirectory(){
return dir;
}
FSDir[] getChildren() {
return childrenDirs;
}
public FSDir() {
}
public FSDir(int namespaceId, File dir) throws IOException{
this(namespaceId, dir, null);
}
public FSDir(int namespaceId, File dir, FSVolume volume) throws IOException {
this.dir = dir;
this.childrenDirs = null;
if (!dir.exists()) {
if (!dir.mkdirs()) {
throw new IOException("Mkdirs failed to create " +
dir.toString());
}
} else {
File[] files = dir.listFiles();
String[] filesNames = getFileNames(files);
int numChildren = 0;
for (int i = 0; i < files.length; i++) {
File file = files[i];
String fileName = filesNames[i];
if (isPendingDeleteFilename(fileName)){
// This should not throw an exception; obsolete files are simply
// scheduled for deletion and are not included in the block report.
asyncDiskService.deleteAsyncFile(volume, file);
} else if (file.isDirectory()) {
numChildren++;
} else if (Block.isSeparateChecksumBlockFilename(fileName)) {
numBlocks++;
if (volume != null) {
long blkSize = file.length();
long genStamp = BlockWithChecksumFileReader
.getGenerationStampFromSeperateChecksumFile(filesNames,
fileName);
volumeMap.add(namespaceId, new Block(file, blkSize, genStamp),
new DatanodeBlockInfo(volume, file, blkSize, true, false,
DataChecksum.CHECKSUM_UNKNOWN, -1, false, 0));
}
} else if (Block.isInlineChecksumBlockFilename(fileName)) {
numBlocks++;
if (volume != null) {
GenStampAndChecksum sac = BlockInlineChecksumReader
.getGenStampAndChecksumFromInlineChecksumFile(fileName);
long blkSize = BlockInlineChecksumReader
.getBlockSizeFromFileLength(file.length(), sac.checksumType,
sac.bytesPerChecksum);
volumeMap.add(namespaceId, new Block(file, blkSize,
sac.generationStamp), new DatanodeBlockInfo(volume, file,
blkSize, true, true, sac.checksumType, sac.bytesPerChecksum,
false, 0));
}
}
}
if (numChildren > 0) {
FSDir[] newChildren = new FSDir[numChildren];
int curdir = 0;
for (int idx = 0; idx < files.length; idx++) {
String fileName = files[idx].getName();
if (files[idx].isDirectory() && !isPendingDeleteFilename(fileName)) {
newChildren[curdir] = new FSDir(namespaceId, files[idx], volume);
curdir++;
}
}
childrenDirs = newChildren;
}
}
}
public File addBlock(int namespaceId, Block b, File src,
boolean inlineChecksum, int checksumType, int bytesPerChecksum)
throws IOException {
//First try without creating subdirectories
File file = addBlock(namespaceId, b, src, false, false, inlineChecksum,
checksumType, bytesPerChecksum);
return (file != null) ? file : addBlock(namespaceId, b, src, true, true,
inlineChecksum, checksumType, bytesPerChecksum);
}
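/**
* Place a block file in this subtree: use this directory while it holds
* fewer than maxBlocksPerDir blocks, otherwise try the children in
* round-robin order starting from lastChildIdx, and only create a new
* level of subdirectories when createOk is set.
*/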
private File addBlock(int namespaceId, Block b, File src, boolean createOk,
boolean resetIdx, boolean inlineChecksum, int checksumType,
int bytesPerChecksum) throws IOException {
if (numBlocks < maxBlocksPerDir) {
File dest;
if (!inlineChecksum) {
dest = new File(dir, b.getBlockName());
File metaData = BlockWithChecksumFileWriter.getMetaFile( src, b );
File newmeta = BlockWithChecksumFileWriter.getMetaFile(dest, b);
if ( ! metaData.renameTo( newmeta )) {
throw new IOException("could not move file "
+ metaData.getAbsolutePath() + " to "
+ newmeta.getAbsolutePath());
}
if (DataNode.LOG.isDebugEnabled()) {
DataNode.LOG.debug("addBlock: Moved " + metaData + " to " + newmeta);
}
} else {
dest = new File(dir,
BlockInlineChecksumWriter.getInlineChecksumFileName(b,
checksumType, bytesPerChecksum));
}
if (! src.renameTo( dest ) ) {
throw new IOException( "could not move files for " + b +
" from tmp to " +
dest.getAbsolutePath() );
}
// fsyncIfPossible parent directory to persist rename.
if (datanode.syncOnClose) {
NativeIO.fsyncIfPossible(dest.getParent());
}
if (DataNode.LOG.isDebugEnabled()) {
DataNode.LOG.debug("addBlock: Moved " + src + " to " + dest);
}
numBlocks += 1;
return dest;
}
FSDir[] children = this.getChildren();
if (children != null && lastChildIdx < 0 && resetIdx) {
//reset so that all children will be checked
lastChildIdx = random.nextInt(children.length);
}
if (lastChildIdx >= 0 && children != null) {
//Check if any child-tree has room for a block.
for (int i=0; i < children.length; i++) {
int idx = (lastChildIdx + i)%children.length;
File file = children[idx].addBlock(namespaceId, b, src, false,
resetIdx, inlineChecksum, checksumType, bytesPerChecksum);
if (file != null) {
lastChildIdx = idx;
return file;
}
}
lastChildIdx = -1;
}
if (!createOk) {
return null;
}
if (children == null || children.length == 0) {
// make sure children is immutable once initialized.
FSDir[] newChildren = new FSDir[maxBlocksPerDir];
for (int idx = 0; idx < maxBlocksPerDir; idx++) {
newChildren[idx] = new FSDir(namespaceId, new File(dir,
DataStorage.BLOCK_SUBDIR_PREFIX + idx));
}
childrenDirs = children = newChildren;
}
//now pick a child randomly for creating a new set of subdirs.
lastChildIdx = random.nextInt(children.length);
return children[lastChildIdx].addBlock(namespaceId, b, src, true, false,
inlineChecksum, checksumType, bytesPerChecksum);
}
/**
* Populate the given blockSet with any child blocks
* found at this node.
* @throws IOException
*/
public void getBlockInfo(LightWeightHashSet<Block> blockSet) throws IOException {
FSDir[] children = this.getChildren();
if (children != null) {
for (int i = 0; i < children.length; i++) {
children[i].getBlockInfo(blockSet);
}
}
File blockFiles[] = dir.listFiles();
String[] blockFilesNames = getFileNames(blockFiles);
for (int i = 0; i < blockFiles.length; i++) {
Block block = getBlockFromNames(blockFiles, blockFilesNames, i);
if (block != null) {
blockSet.add(block);
}
}
}
/**
* Populate the given blockSet with any child blocks
* found at this node. With each block, return the full path
* of the block file.
* @throws IOException
*/
void getBlockAndFileInfo(LightWeightHashSet<BlockAndFile> blockSet) throws IOException {
FSDir[] children = this.getChildren();
if (children != null) {
for (int i = 0; i < children.length; i++) {
children[i].getBlockAndFileInfo(blockSet);
}
}
File blockFiles[] = dir.listFiles();
String[] blockFilesNames = getFileNames(blockFiles);
for (int i = 0; i < blockFiles.length; i++) {
Block block = getBlockFromNames(blockFiles, blockFilesNames, i);
if (block != null) {
blockSet.add(new BlockAndFile(blockFiles[i].getAbsoluteFile(), block));
}
}
}
/**
* check if a data directory is healthy
* @throws DiskErrorException
*/
public void checkDirTree() throws DiskErrorException {
DiskChecker.checkDir(dir);
FSDir[] children = this.getChildren();
if (children != null) {
for (int i = 0; i < children.length; i++) {
children[i].checkDirTree();
}
}
}
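/**
* Decrement the block count along the chain of FSDirs that contained the
* given block file, parsing the "subdir" components out of its path to
* walk straight to the right child where possible.
*/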
void clearPath(File f) {
String root = dir.getAbsolutePath();
String path = f.getAbsolutePath();
if (path.startsWith(root)) {
String[] dirNames = path.substring(root.length()).
split(File.separator + "subdir");
if (clearPath(f, dirNames, 1))
return;
}
clearPath(f, null, -1);
}
/*
* dirNames is an array of string integers derived from
* usual directory structure data/subdirN/subdirXY/subdirM ...
* If dirName array is non-null, we only check the child at
* the children[dirNames[idx]]. This avoids iterating over
* children in common case. If directory structure changes
* in later versions, we need to revisit this.
*/
private boolean clearPath(File f, String[] dirNames, int idx) {
if ((dirNames == null || idx == dirNames.length) &&
dir.compareTo(f) == 0) {
numBlocks--;
return true;
}
FSDir[] children = this.getChildren();
if (dirNames != null) {
//guess the child index from the directory name
if (idx > (dirNames.length - 1) || children == null) {
return false;
}
int childIdx;
try {
childIdx = Integer.parseInt(dirNames[idx]);
} catch (NumberFormatException ignored) {
// layout changed? we could print a warning.
return false;
}
return (childIdx >= 0 && childIdx < children.length) ?
children[childIdx].clearPath(f, dirNames, idx+1) : false;
}
//guesses failed. back to blind iteration.
if (children != null) {
for(int i=0; i < children.length; i++) {
if (children[i].clearPath(f, null, -1)){
return true;
}
}
}
return false;
}
public String toString() {
FSDir[] children = this.getChildren();
return "FSDir{" +
"dir=" + dir +
", children=" + (children == null ? null : Arrays.asList(children)) +
"}";
}
}
/**
* A map from namespace ID to NamespaceSlice object
*
* Only three operations are supported: add a namespace, remove a namespace,
* and get a snapshot of the namespace map; the returned snapshot is an
* immutable object.
*
* No extra locking is allowed in this object
*/
class NamespaceMap {
/**
* Any object referenced here needs to be immutable. Every time this map is
* updated, a new map is created and the reference here is changed to the
* new map.
*/
private Map<Integer, NamespaceSlice> namespaceMap = new HashMap<Integer, NamespaceSlice>();
/**
* This is the only method through which callers should access namespaceMap.
* It returns an immutable snapshot of the map.
*
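* A sketch of the intended usage (hypothetical caller):
* <pre>
*   NamespaceSlice ns = getNamespaceMapSnapshot().get(namespaceId);
*   // no locking needed afterwards: the snapshot is never mutated
* </pre>
*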
* @return an immutable snapshot of the namespace map
*/
private synchronized Map<Integer, NamespaceSlice> getNamespaceMapSnapshot() {
return namespaceMap;
}
public synchronized void addNamespace(int namespaceId, NamespaceSlice ns)
throws IOException {
// add a new name-space by copying all the entries to a new map.
Map<Integer, NamespaceSlice> newMap = new HashMap<Integer, NamespaceSlice>(
namespaceMap);
newMap.put(namespaceId, ns);
namespaceMap = newMap;
}
public synchronized void removeNamespace(int namespaceId) {
Map<Integer, NamespaceSlice> newMap = new HashMap<Integer, NamespaceSlice>(
namespaceMap);
newMap.remove(namespaceId);
namespaceMap = newMap;
}
}
public class FSVolume {
private final NamespaceMap namespaceMap;
private final File currentDir; // <StorageDirectory>/current
private final DF usage;
private final long reserved;
private final FSDataset dataset;
private DU dfsUsage;
private final ExecutorService nativeIOExecutor;
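// DF reports capacity/free space of the underlying partition; DU tracks
// the bytes actually consumed under this volume's current directory.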
FSVolume(FSDataset dataset, File currentDir, Configuration conf) throws IOException {
this.currentDir = currentDir;
File parent = currentDir.getParentFile();
this.usage = new DF(parent, conf);
this.reserved = usage.getReserved();
this.dataset = dataset;
this.namespaceMap = new NamespaceMap();
this.dfsUsage = new DU(currentDir, conf);
this.dfsUsage.start();
this.nativeIOExecutor = Executors.newSingleThreadExecutor();
}
public Future<?> submitNativeIOTask(Runnable task) {
return nativeIOExecutor.submit(task);
}
/**
* This is the only method through which callers should access namespaceMap.
* It returns an immutable snapshot of the map.
* @return a snapshot of the namespace map
*/
private Map<Integer, NamespaceSlice> getNamespaceMapSnapshot() {
return namespaceMap.getNamespaceMapSnapshot();
}
NamespaceSlice getNamespaceSlice(int namespaceId){
return getNamespaceMapSnapshot().get(namespaceId);
}
/** Return storage directory corresponding to the volume */
public File getDir() {
return currentDir.getParentFile();
}
public File getBlockCrcFile(int namespaceId) {
NamespaceSlice ns = getNamespaceSlice(namespaceId);
if (ns == null) {
return null;
}
return new File(ns.getDirectory(), Storage.STORAGE_BLOCK_CRC);
}
public File getBlockCrcTmpFile(int namespaceId) {
NamespaceSlice ns = getNamespaceSlice(namespaceId);
if (ns == null) {
return null;
}
return new File(ns.getDirectory(), Storage.STORAGE_TMP_BLOCK_CRC);
}
public File getCurrentDir() {
return currentDir;
}
public File getRbwDir(int namespaceId) throws IOException {
NamespaceSlice ns = getNamespaceSlice(namespaceId);
return ns.getRbwDir();
}
void setNamespaceBlockCrcLoaded(int namespaceId, boolean loaded) {
NamespaceSlice ns = getNamespaceSlice(namespaceId);
if (ns != null) {
ns.setBlockCrcFileLoaded(loaded);
}
}
boolean isNamespaceBlockCrcLoaded(int namespaceId) {
NamespaceSlice ns = getNamespaceSlice(namespaceId);
if (ns != null) {
return ns.isBlockCrcFileLoaded();
} else {
// if the namespace is not added
return false;
}
}
void decDfsUsed(int namespaceId, long value) {
// this is put in FSVolume since it is called only by ReplicaFileDeleteWork
NamespaceSlice ns = getNamespaceSlice(namespaceId);
if (ns != null) {
ns.decDfsUsed(value);
}
}
long getDfsUsed() throws IOException {
long dfsUsed = 0;
for (NamespaceSlice ns : getNamespaceMapSnapshot().values()) {
dfsUsed += ns.getDfsUsed();
}
return dfsUsed;
}
long getNSUsed(int namespaceId) throws IOException {
return getNamespaceMapSnapshot().get(namespaceId).getDfsUsed();
}
long getCapacity() throws IOException {
if (reserved > usage.getCapacity()) {
return 0;
}
return usage.getCapacity()-reserved;
}
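/**
* Space still available for new blocks: the smaller of
* (capacity - dfsUsed) and the free space reported by the underlying
* filesystem, floored at zero.
*/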
long getAvailable() throws IOException {
long remaining = getCapacity()-getDfsUsed();
long available = usage.getAvailable();
if (remaining>available) {
remaining = available;
}
return (remaining > 0) ? remaining : 0;
}
long getReserved() {
return this.reserved;
}
String getMount() throws IOException {
return usage.getMount();
}
String getFileSystem() throws IOException {
return usage.getFilesystem();
}
File addBlock(int namespaceId, Block b, File f, boolean inlineChecksum,
int checksumType, int bytesPerChecksum) throws IOException {
NamespaceSlice ns = getNamespaceSlice(namespaceId);
return ns.addBlock(b, f, inlineChecksum, checksumType, bytesPerChecksum);
}
void checkDirs() throws DiskErrorException {
for (NamespaceSlice ns : getNamespaceMapSnapshot().values()) {
ns.checkDirs();
}
}
/**
* Temporary files. They get moved to the finalized block directory when
* the block is finalized.
*/
File createTmpFile(int namespaceId, Block b) throws IOException {
NamespaceSlice ns = getNamespaceSlice(namespaceId);
return ns.createTmpFile(b);
}
File getTmpFile(int namespaceId, Block b) throws IOException {
NamespaceSlice ns = getNamespaceSlice(namespaceId);
return ns.getTmpFile(b);
}
/**
* Temporary files. They get moved to the finalized block directory when
* the block is finalized.
*/
File createTmpFile(int namespaceId, Block b, boolean replicationRequest,
boolean inlineChecksum, int checksumType, int bytesPerChecksum)
throws IOException {
NamespaceSlice ns = getNamespaceSlice(namespaceId);
return ns.createTmpFile(b, replicationRequest, inlineChecksum,
checksumType, bytesPerChecksum);
}
/**
* Files used for copy-on-write. They need recovery when datanode
* restarts.
*/
File createDetachFile(int namespaceId, Block b, String filename) throws IOException {
NamespaceSlice ns = getNamespaceSlice(namespaceId);
return ns.createDetachFile(b);
}
public void addNamespace(int namespaceId, String nsDir, Configuration conf, boolean supportAppends)
throws IOException {
File nsdir = new File(currentDir, nsDir);
NamespaceSlice ns = new NamespaceSlice(namespaceId, this, nsdir, conf, supportAppends);
namespaceMap.addNamespace(namespaceId, ns);
}
void getBlocksBeingWrittenInfo(int namespaceId,
LightWeightHashSet<Block> blockSet) throws IOException {
NamespaceSlice ns = getNamespaceSlice(namespaceId);
if (ns == null) {
return;
}
ns.getBlocksBeingWrittenInfo(blockSet);
return;
}
public void shutdownNamespace(int namespaceId) {
NamespaceSlice ns = getNamespaceSlice(namespaceId);
if (ns != null) {
this.namespaceMap.removeNamespace(namespaceId);
ns.shutdown();
}
}
void getBlockInfo(int namespaceId, LightWeightHashSet<Block> blockSet)
throws IOException {
NamespaceSlice ns = getNamespaceSlice(namespaceId);
ns.getBlockInfo(blockSet);
return;
}
public void shutdown() {
for (NamespaceSlice ns : getNamespaceMapSnapshot().values()) {
ns.shutdown();
}
dfsUsage.shutdown();
nativeIOExecutor.shutdownNow();
}
void clearPath(int namespaceId, File f) throws IOException{
NamespaceSlice ns = getNamespaceSlice(namespaceId);
ns.clearPath(f);
return;
}
public String toString() {
return currentDir.getAbsolutePath();
}
}
/**
* This class maintains a list of FSVolume objects.
* Only three operations are supported: add volumes, remove volumes,
* and get a snapshot of the volume list, which is an immutable
* object.
*/
static class FSVolumeList {
volatile FSVolume[] fsVolumes = null;
public FSVolumeList(FSVolume[] volumes) {
fsVolumes = volumes;
}
public synchronized void addVolumes(FSVolume[] volArray) {
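// copy-on-write: build a new array and publish it through the volatile
// field, so readers of the snapshot never see a half-built list.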
if (volArray == null || volArray.length == 0) {
return;
}
int size = fsVolumes.length + volArray.length;
FSVolume fsvs[] = new FSVolume[size];
int idx = 0;
for (; idx < fsVolumes.length; idx++) {
fsvs[idx] = fsVolumes[idx];
}
for (; idx < size; idx++) {
fsvs[idx] = volArray[idx - fsVolumes.length];
}
fsVolumes = fsvs;
}
public synchronized void removeVolumes(List<FSVolume> removed_vols) {
// rebuild the array, copying the volumes that were not removed
int removed_size = (removed_vols == null) ? 0 : removed_vols.size();
if (removed_size > 0) {
FSVolume fsvs[] = new FSVolume[fsVolumes.length - removed_size];
for (int idx = 0, idy = 0; idx < fsVolumes.length; idx++) {
if (!removed_vols.contains(fsVolumes[idx])) {
fsvs[idy] = fsVolumes[idx];
idy++;
}
}
fsVolumes = fsvs; // replace array of volumes
}
}
public FSVolume[] getVolumeListSnapshot() {
return fsVolumes;
}
}
static class FSVolumeSet {
final FSVolumeList volumeList;
int curVolume = 0;
ExecutorService scannersExecutor;
boolean supportAppends;
private FSVolumeSet(FSVolume[] volumes, int threads, boolean supportAppends) {
this.volumeList = new FSVolumeList(volumes);
this.supportAppends = supportAppends;
if (threads > 1) {
scannersExecutor = Executors.newFixedThreadPool(threads);
}
}
public boolean isValidDir(File currentDir) {
FSVolume[] volumes = this.getVolumes();
for (int idx = 0; idx < volumes.length; idx++) {
if (volumes[idx].getCurrentDir().equals(currentDir)) {
return true;
}
}
return false;
}
protected void addVolumes(FSVolume[] volArray) {
volumeList.addVolumes(volArray);
}
protected int numberOfVolumes() {
return getVolumes().length;
}
public FSVolume[] getVolumes() {
return volumeList.getVolumeListSnapshot();
}
boolean isValid(FSVolume volume) {
for (FSVolume vol : volumeList.getVolumeListSnapshot()) {
if (vol == volume) {
return true;
}
}
return false;
}
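/**
* Round-robin block placement: starting from the volume after the last
* one used, return the first volume with more than blockSize bytes
* available; throw DiskOutOfSpaceException after one full pass with no
* fit.
*/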
private FSVolume getNextVolume(long blockSize) throws IOException {
FSVolume[] volumes = this.getVolumes();
if(volumes.length < 1) {
throw new DiskOutOfSpaceException("No more available volumes");
}
// since volumes could've been removed because of the failure
// make sure we are not out of bounds
if (curVolume >= volumes.length) {
curVolume = 0;
}
int startVolume = curVolume;
while (true) {
FSVolume volume = volumes[curVolume];
curVolume = (curVolume + 1) % volumes.length;
if (volume.getAvailable() > blockSize) {
return volume;
}
if (curVolume == startVolume) {
throw new DiskOutOfSpaceException(
"Insufficient space for an additional block");
}
}
}
private long getDfsUsed() throws IOException {
long dfsUsed = 0L;
FSVolume[] volumes = this.getVolumes();
for (int idx = 0; idx < volumes.length; idx++) {
dfsUsed += volumes[idx].getDfsUsed();
}
return dfsUsed;
}
private long getNSUsed(int namespaceId) throws IOException {
long dfsUsed = 0L;
FSVolume[] volumes = this.getVolumes();
for (int idx = 0; idx < volumes.length; idx++) {
dfsUsed += volumes[idx].getNSUsed(namespaceId);
}
return dfsUsed;
}
private long getCapacity() throws IOException {
long capacity = 0L;
FSVolume[] volumes = this.getVolumes();
for (int idx = 0; idx < volumes.length; idx++) {
capacity += volumes[idx].getCapacity();
}
return capacity;
}
private long getRemaining() throws IOException {
long remaining = 0L;
FSVolume[] volumes = this.getVolumes();
for (int idx = 0; idx < volumes.length; idx++) {
remaining += volumes[idx].getAvailable();
}
return remaining;
}
private void getBlocksBeingWrittenInfo(int namespaceId,
LightWeightHashSet<Block> blockSet) throws IOException {
long startTime = System.currentTimeMillis();
FSVolume[] volumes = this.getVolumes();
if (scannersExecutor != null) {
synchronized(scannersExecutor) {
List<Future<LightWeightHashSet<Block>>> builders =
new ArrayList<Future<LightWeightHashSet<Block>>>();
for (int idx = 0; idx < volumes.length; idx++) {
builders.add(scannersExecutor
.submit(new BlocksBeingWrittenInfoBuilder(volumes[idx],
namespaceId)));
}
for (Future<LightWeightHashSet<Block>> future : builders) {
try {
blockSet.addAll(future.get());
} catch (ExecutionException ex) {
DataNode.LOG.error(
"Error generating block being written info from volumes ",
ex.getCause());
throw new IOException(ex);
} catch (InterruptedException iex) {
DataNode.LOG.error(
"Error waiting for generating block being written info", iex);
throw new IOException(iex);
}
}
}
} else {
for (int idx = 0; idx < volumes.length; idx++) {
volumes[idx].getBlocksBeingWrittenInfo(namespaceId, blockSet);
}
}
long scanTime = (System.currentTimeMillis() - startTime)/1000;
DataNode.LOG.info("Finished generating blocks being written report for " +
volumes.length + " volumes in " + scanTime + " seconds");
}
private void getBlockInfo(int namespaceId, LightWeightHashSet<Block> blockSet) {
long startTime = System.currentTimeMillis();
FSVolume[] volumes = this.getVolumes();
if (scannersExecutor != null) {
synchronized (scannersExecutor) {
List<Future<LightWeightHashSet<Block>>> builders =
new ArrayList<Future<LightWeightHashSet<Block>>>();
for (int idx = 0; idx < volumes.length; idx++) {
builders.add(scannersExecutor.submit(new BlockInfoBuilder(
volumes[idx], namespaceId)));
}
for (Future<LightWeightHashSet<Block>> future : builders) {
try {
blockSet.addAll(future.get());
} catch (ExecutionException ex) {
DataNode.LOG.error("Error scanning volumes ", ex.getCause());
} catch (InterruptedException iex) {
DataNode.LOG.error("Error waiting for scan", iex);
}
}
}
} else {
for (int idx = 0; idx < volumes.length; idx++) {
try{
volumes[idx].getBlockInfo(namespaceId, blockSet);
} catch (IOException e) {
DataNode.LOG.error("Error scanning volumes ", e);
}
}
}
long scanTime = (System.currentTimeMillis() - startTime)/1000;
DataNode.LOG.info("Finished generating block report for " +
volumes.length + " volumes in " + scanTime + " seconds");
}
/**
* Goes over all the volumes and calls checkDirs() on each of them;
* if one throws DiskErrorException, it is removed from the list of
* active volumes.
* @return list of all the removed volumes
*/
private List<FSVolume> checkDirs() {
List<FSVolume> removed_vols = null;
FSVolume[] fsVolumes = this.getVolumes();
for (int idx = 0; idx < fsVolumes.length; idx++) {
FSVolume fsv = fsVolumes[idx];
try {
fsv.checkDirs();
} catch (DiskErrorException e) {
DataNode.LOG.warn("Removing failed volume " + fsv + ": ", e);
if (removed_vols == null) {
removed_vols = new ArrayList<FSVolume>();
}
removed_vols.add(fsVolumes[idx]);
}
}
if (removed_vols != null && removed_vols.size() > 0) {
volumeList.removeVolumes(removed_vols);
DataNode.LOG.info("Completed FSVolumeSet.checkDirs. Removed="
+ removed_vols.size() + " volumes. List of current volumes: "
+ toString());
}
return removed_vols;
}
private List<FSVolume> removeBVolumes(List<File> directories) {
ArrayList<FSVolume> removed_vols = new ArrayList<FSVolume>();
if (directories != null && directories.size() > 0) {
FSVolume[] fsVolumes = this.getVolumes();
for(int idx = 0; idx < fsVolumes.length; idx++) {
FSVolume fsv = fsVolumes[idx];
if(directories.contains(fsv.getDir())) {
removed_vols.add(fsv);
}
}
volumeList.removeVolumes(removed_vols);
DataNode.LOG.info("Completed FSVolumeSet.removeVolumes. Removed="
+ removed_vols.size() + " volumes. List of current volumes: "
+ toString());
}
return removed_vols;
}
private void addNamespace(int namespaceId, String nsDir, Configuration conf)
throws IOException {
FSVolume[] volumes = this.getVolumes();
for (FSVolume v : volumes) {
v.addNamespace(namespaceId, nsDir, conf, supportAppends);
}
}
private void removeNamespace(int namespaceId) {
FSVolume[] volumes = this.getVolumes();
for (FSVolume v : volumes) {
v.shutdownNamespace(namespaceId);
}
}
public String toString() {
StringBuffer sb = new StringBuffer();
FSVolume[] volumes = this.getVolumes();
for (int idx = 0; idx < volumes.length; idx++) {
sb.append(volumes[idx].toString());
if (idx != volumes.length - 1) { sb.append(","); }
}
return sb.toString();
}
}
private static class BlockInfoBuilder implements Callable<LightWeightHashSet<Block>> {
FSVolume volume;
int namespaceId;
public BlockInfoBuilder(FSVolume volume, int namespaceId) {
this.volume = volume;
this.namespaceId = namespaceId;
}
@Override
public LightWeightHashSet<Block> call() throws Exception {
LightWeightHashSet<Block> result = new LightWeightHashSet<Block>();
volume.getBlockInfo(namespaceId, result);
return result;
}
}
private static class BlocksBeingWrittenInfoBuilder implements
Callable<LightWeightHashSet<Block>> {
FSVolume volume;
int namespaceId;
public BlocksBeingWrittenInfoBuilder(FSVolume volume, int namespaceId) {
this.volume = volume;
this.namespaceId = namespaceId;
}
@Override
public LightWeightHashSet<Block> call() throws Exception {
LightWeightHashSet<Block> result = new LightWeightHashSet<Block>();
volume.getBlocksBeingWrittenInfo(namespaceId, result);
return result;
}
}
//////////////////////////////////////////////////////
//
// FSDataSet
//
//////////////////////////////////////////////////////
//Find better place?
public static final String METADATA_EXTENSION = ".meta";
public static final short FORMAT_VERSION_NON_INLINECHECKSUM = 1;
public static final short FORMAT_VERSION_INLINECHECKSUM = 2;
public static final String DELETE_FILE_EXT = "toDelete.";
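/**
* Tracks a replica that is open for writing: the writer threads attached
* to it, plus three counters that can differ while a write is in flight.
* bytesReceived counts bytes received from the client, bytesOnDisk bytes
* written to storage, and bytesAcked bytes acknowledged and therefore
* visible to readers.
*/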
static class ActiveFile implements ReplicaToRead, ReplicaBeingWritten,
Cloneable {
static final long UNKNOWN_SIZE = -1;
DatanodeBlockInfo datanodeBlockInfo;
final List<Thread> threads = new ArrayList<Thread>(2);
private volatile long bytesReceived;
private volatile long bytesAcked;
private volatile long bytesOnDisk;
private volatile boolean finalized;
private volatile BlockCrcUpdater crcUpdater;
/**
* Set to true if this file was recovered during datanode startup.
* This may indicate that the file has been truncated (e.g. during
* underlying filesystem journal replay)
*/
final boolean wasRecoveredOnStartup;
ActiveFile(DatanodeBlockInfo datanodeBlockInfo, List<Thread> list,
long expectedSize, boolean enable) throws IOException {
this(datanodeBlockInfo, false, expectedSize, enable);
if (list != null) {
threads.addAll(list);
}
threads.add(Thread.currentThread());
}
/**
* Create an ActiveFile from a file on disk during DataNode startup.
* The recovery flag records that the replica was recovered on startup
* rather than created by a writer.
* @throws IOException
*/
private ActiveFile(DatanodeBlockInfo datanodeBlockInfo, boolean recovery,
long expectedSize, boolean enable)
throws IOException {
this.datanodeBlockInfo = datanodeBlockInfo;
long sizeFromDisk;
if (!isInlineChecksum()) {
sizeFromDisk = getDataFile().length();
} else {
GenStampAndChecksum sac = BlockInlineChecksumReader
.getGenStampAndChecksumFromInlineChecksumFile(getDataFile()
.getName());
sizeFromDisk = BlockInlineChecksumReader.getBlockSizeFromFileLength(
getDataFile().length(), sac.checksumType, sac.bytesPerChecksum);
}
if (expectedSize != UNKNOWN_SIZE && sizeFromDisk != expectedSize) {
throw new IOException("File " + getDataFile()
+ " on disk size " + sizeFromDisk + " doesn't match expected size "
+ expectedSize);
}
bytesReceived = bytesAcked = bytesOnDisk = sizeFromDisk;
crcUpdater = new BlockCrcUpdater(this.getBytesPerChecksum(),
enable && bytesReceived == 0);
wasRecoveredOnStartup = recovery;
finalized = false;
}
@Override
public long getBytesVisible() {
return bytesAcked;
}
public void setBytesAcked(long value) {
bytesAcked = value;
}
@Override
public long getBytesWritten() {
return bytesOnDisk;
}
public void setBytesOnDisk(long value) {
bytesOnDisk = value;
}
public long getBytesReceived() {
return bytesReceived;
}
public void setBytesReceived(long length) {
bytesReceived = length;
}
@Override
public File getDataFileToRead() {
return datanodeBlockInfo.getDataFileToRead();
}
private File getDataFile() {
return datanodeBlockInfo.getBlockDataFile().getFile();
}
public String toString() {
return getClass().getSimpleName() + "(file=" + getDataFile()
+ ", threads=" + threads + ")";
}
public ActiveFile getClone() throws CloneNotSupportedException {
return (ActiveFile) super.clone();
}
@Override
public boolean isInlineChecksum() {
return datanodeBlockInfo.isInlineChecksum();
}
@Override
public int getChecksumType() {
return datanodeBlockInfo.getChecksumType();
}
@Override
public int getBytesPerChecksum() {
return datanodeBlockInfo.getBytesPerChecksum();
}
@Override
public InputStream getBlockInputStream(DataNode datanode, long offset)
throws IOException {
return datanodeBlockInfo.getBlockInputStream(datanode, offset);
}
@Override
public boolean isFinalized() {
return finalized;
}
protected void blockFinalize() {
this.finalized = true;
}
@Override
public int getBlockCrc() throws IOException{
throw new IOException("Block not finalized.");
}
@Override
public void updateBlockCrc(long offset, int length,
int crc) {
crcUpdater.updateBlockCrc(offset, length, crc);
}
@Override
public boolean hasBlockCrcInfo() {
return false;
}
BlockCrcUpdater getCrcUpdater() {
return crcUpdater;
}
@Override
public BlockDataFile getBlockDataFile() throws IOException {
return datanodeBlockInfo.getBlockDataFile();
}
}
/**
* Check if a file is scheduled for deletion.
* The name should be obtained from File.getName().
*/
static boolean isPendingDeleteFilename(String name) {
return name.startsWith(DELETE_FILE_EXT);
}
public Block getStoredBlock(int namespaceId, long blkid) throws IOException {
return getStoredBlock(namespaceId, blkid, false);
}
/** {@inheritDoc} */
public Block getStoredBlock(int namespaceId, long blkid,
boolean useOnDiskLength) throws IOException {
lock.readLock().lock();
try {
ReplicaToRead replica = getReplicaToRead(namespaceId, new Block(
blkid));
if (replica == null) {
return null;
}
File blockfile = replica.getDataFileToRead();
if (blockfile == null) {
return null;
}
File metafile = null;
if (!replica.isInlineChecksum()) {
metafile = BlockWithChecksumFileWriter.findMetaFile(blockfile, true);
if (metafile == null) {
return null;
}
}
Block block = new Block(blkid);
if (useOnDiskLength) {
block.setNumBytes(replica.getBytesWritten());
} else {
block.setNumBytes(replica.getBytesVisible());
}
if (replica.isInlineChecksum()) {
block.setGenerationStamp(BlockInlineChecksumReader
.getGenerationStampFromInlineChecksumFile(blockfile.getName()));
} else {
block.setGenerationStamp(BlockWithChecksumFileReader
.parseGenerationStampInMetaFile(blockfile, metafile));
}
return block;
} finally {
lock.readLock().unlock();
}
}
FSVolumeSet volumes;
private DataNode datanode;
private Configuration conf;
private int maxBlocksPerDir = 0;
private boolean initialized = false;
VolumeMap volumeMap;
BlockCrcMapFlusher blockCrcMapFlusher;
Thread blockCrcMapFlusherThread = null;
static Random random = new Random();
FSDatasetAsyncDiskService asyncDiskService;
ReentrantReadWriteLock lock = new ReentrantReadWriteLock(true);
private boolean shouldHardLinkBlockCopy;
private int validVolsRequired;
//this constructor is used to create PersistedSimulatedFSDataset
public FSDataset() {
}
/**
* An FSDataset has a directory where it loads its data files.
*/
public FSDataset(DataNode datanode, Configuration conf, int numNamespaces){
this.datanode = datanode;
this.conf = conf;
this.maxBlocksPerDir = conf.getInt("dfs.datanode.numblocks", 64);
volumeMap = new VolumeMap(numNamespaces);
}
void setDatasetDelta(FSDatasetDeltaInterface stateChangeCallback) {
volumeMap.setDatasetDelta(stateChangeCallback);
}
@Override
public void initialize(DataStorage storage) throws IOException{
lock.writeLock().lock();
try{
if(initialized){
return;
}
// The number of volumes required for operation is the total number
// of volumes configured minus the number of failed volumes we can
// tolerate.
String[] dataDirs = DataNode.getListOfDataDirs(conf);
int volsConfigured = (dataDirs == null) ? 0 : dataDirs.length;
final int volFailuresTolerated =
conf.getInt("dfs.datanode.failed.volumes.tolerated", volsConfigured-1);
this.validVolsRequired = volsConfigured - volFailuresTolerated;
if (validVolsRequired < 1 || validVolsRequired > storage.getNumStorageDirs()) {
throw new DiskErrorException("Too many failed volumes - "
+ "current valid volumes: " + storage.getNumStorageDirs()
+ ", volumes configured: " + volsConfigured
+ ", volume failures tolerated: " + volFailuresTolerated );
}
File[] roots = new File[storage.getNumStorageDirs()];
for (int idx = 0; idx < storage.getNumStorageDirs(); idx++) {
roots[idx] = storage.getStorageDir(idx).getCurrentDir();
}
asyncDiskService = new FSDatasetAsyncDiskService(roots, conf);
FSVolume[] volArray = new FSVolume[storage.getNumStorageDirs()];
for (int idx = 0; idx < storage.getNumStorageDirs(); idx++) {
volArray[idx] = new FSVolume(this, storage.getStorageDir(idx).getCurrentDir(),
conf);
DataNode.LOG.info("FSDataset added volume - "
+ storage.getStorageDir(idx).getCurrentDir());
}
int threads = conf.getInt("dfs.datanode.blockscanner.threads", 1);
volumes = new FSVolumeSet(volArray, threads, datanode.isSupportAppends());
registerMBean(storage.getStorageID());
blockCrcMapFlusher = new BlockCrcMapFlusher(datanode, volumeMap,
volumes, conf.getLong("dfs.block.crc.flush.interval", 600000));
blockCrcMapFlusherThread = new Thread(blockCrcMapFlusher,
"Block Crc Flusher");
blockCrcMapFlusherThread.start();
initialized = true;
} finally {
lock.writeLock().unlock();
}
shouldHardLinkBlockCopy = conf.getBoolean("dfs.datanode.blkcopy.hardlink",
true);
}
private class VolumeThread extends Thread {
private Configuration conf;
private FSVolume volume;
private boolean hasError = false;
private Map<Integer, String> namespaceIdDir;
private boolean supportAppends;
private VolumeThread(FSVolume volume,
Configuration conf,
Map<Integer, String> namespaceIdDir, boolean supportAppends) {
this.namespaceIdDir = namespaceIdDir;
this.volume = volume;
this.conf = conf;
this.supportAppends = supportAppends;
}
public void run() {
DataNode.LOG.info("Start building volume: " + volume);
try {
for (Integer namespaceId : namespaceIdDir.keySet()) {
volume.addNamespace(namespaceId, namespaceIdDir.get(namespaceId),
conf, supportAppends);
}
} catch (IOException ioe) {
DataNode.LOG.error("Error building volume : " + volume, ioe);
hasError = true;
}
DataNode.LOG.info("Finish building volume for " + volume);
}
}
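/**
* Build the namespace slices for every configured namespace on every
* volume in parallel, one VolumeThread per volume, and fail if any of
* the threads hits an IOException.
*/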
private void createVolumes(FSVolumeSet volumes, DataStorage storage,
Configuration conf, VolumeMap volumeMap,
Map<Integer, String> namespaceIdDir) throws IOException {
FSVolume[] myVolumes = volumes.getVolumes();
ArrayList<VolumeThread> scanners = new ArrayList<VolumeThread>(
myVolumes.length);
for(FSVolume volume : myVolumes){
scanners.add(new VolumeThread(volume, conf,
namespaceIdDir, volumes.supportAppends));
}
for(VolumeThread vt : scanners){
vt.start();
}
boolean hasError = false;
for (VolumeThread vt : scanners) {
try {
vt.join();
} catch (InterruptedException e) {
throw (InterruptedIOException)new InterruptedIOException().initCause(e);
}
if (!hasError && vt.hasError) {
hasError = true;
}
}
if (hasError) {
throw new IOException("Error creating volumes");
}
}
/**
* Return the total space used by the dfs datanode
*/
public long getDfsUsed() throws IOException {
return volumes.getDfsUsed();
}
/**
* Return the total space used by one namespace in dfs datanode
*/
public long getNSUsed(int namespaceId) throws IOException {
return volumes.getNSUsed(namespaceId);
}
/**
* Return true if the DataNode still has at least the required number
* of valid volumes
*/
public boolean hasEnoughResource(){
return volumes.numberOfVolumes() >= this.validVolsRequired;
}
/**
* Return total capacity, used and unused
*/
public long getCapacity() throws IOException {
return volumes.getCapacity();
}
/**
* Return how many bytes can still be stored in the FSDataset
*/
public long getRemaining() throws IOException {
return volumes.getRemaining();
}
/**
* Find the block's on-disk length
*/
public long getFinalizedBlockLength(int namespaceId, Block b) throws IOException {
DatanodeBlockInfo info = volumeMap.get(namespaceId, b);
if (info == null) {
throw new IOException("Can't find block " + b + " in volumeMap");
}
return info.getFinalizedSize();
}
@Override
public long getOnDiskLength(int namespaceId, Block b) throws IOException {
ReplicaToRead rtr = this.getReplicaToRead(namespaceId, b);
if (rtr == null) {
throw new IOException("Can't find block " + b + " in volumeMap");
}
return rtr.getBytesWritten();
}
@Override
public ReplicaBeingWritten getReplicaBeingWritten(
int namespaceId, Block b) throws IOException {
lock.readLock().lock();
try {
return volumeMap.getOngoingCreates(namespaceId, b);
} finally {
lock.readLock().unlock();
}
}
/**
* Get File name for a given block.
*/
public File getBlockFile(int namespaceId, Block b) throws IOException {
File f = validateBlockFile(namespaceId, b);
if (f == null) {
if (InterDatanodeProtocol.LOG.isDebugEnabled()) {
InterDatanodeProtocol.LOG
.debug("b=" + b + ", volumeMap=" + volumeMap);
}
throw new IOException("Block " + b + ", namespace= " + namespaceId
+ " is not valid.");
}
return f;
}
/**
* Make a copy of the block if this block is linked to an existing
* snapshot. This ensures that modifying this block does not modify
* data in any existing snapshots.
* @param block Block
* @param numLinks Detach if the number of links exceed this value
* @throws IOException
* @return - true if the specified block was detached
*/
public boolean detachBlock(int namespaceId, Block block, int numLinks) throws IOException {
DatanodeBlockInfo info = null;
lock.readLock().lock();
try {
info = volumeMap.get(namespaceId, block);
} finally {
lock.readLock().unlock();
}
return info.detachBlock(namespaceId, block, numLinks);
}
/** {@inheritDoc} */
public void updateBlock(int namespaceId, Block oldblock, Block newblock) throws IOException {
if (oldblock.getBlockId() != newblock.getBlockId()) {
throw new IOException("Cannot update oldblock (=" + oldblock
+ ") to newblock (=" + newblock + ").");
}
// Protect against a straggler updateblock call moving a block backwards
// in time.
boolean isValidUpdate =
(newblock.getGenerationStamp() > oldblock.getGenerationStamp()) ||
(newblock.getGenerationStamp() == oldblock.getGenerationStamp() &&
newblock.getNumBytes() == oldblock.getNumBytes());
if (!isValidUpdate) {
throw new IOException(
"Cannot update oldblock=" + oldblock +
" to newblock=" + newblock + " since generation stamps must " +
"increase, or else length must not change.");
}
for(;;) {
final List<Thread> threads = tryUpdateBlock(namespaceId, oldblock, newblock);
if (threads == null) {
DataNode.LOG.info("Updated Block: namespaceid: " + namespaceId + " oldBlock: "
+ oldblock + " newBlock: " + newblock);
return;
}
DataNode.LOG.info("Waiting other threads to update block: namespaceid: "
+ namespaceId + " oldBlock: " + oldblock + " newBlock: " + newblock);
interruptAndJoinThreads(threads);
}
}
/**
* Try to interrupt all of the given threads, and join on them.
* If interrupted, returns false, indicating some threads may
* still be running.
*/
private boolean interruptAndJoinThreads(List<Thread> threads) {
// interrupt and wait for all ongoing create threads
for(Thread t : threads) {
t.interrupt();
}
for(Thread t : threads) {
try {
t.join();
} catch (InterruptedException e) {
DataNode.LOG.warn("interruptOngoingCreates: t=" + t, e);
return false;
}
}
return true;
}
/**
* Return a list of active writer threads for the given block.
* @return null if there are no such threads or the file is
* not being created
*/
private ArrayList<Thread> getActiveThreads(int namespaceId, Block block) {
lock.writeLock().lock();
try {
//check ongoing create threads
final ActiveFile activefile = volumeMap.getOngoingCreates(namespaceId, block);
if (activefile != null && !activefile.threads.isEmpty()) {
//remove dead threads
for(Iterator<Thread> i = activefile.threads.iterator(); i.hasNext(); ) {
final Thread t = i.next();
if (!t.isAlive()) {
i.remove();
}
}
//return living threads
if (!activefile.threads.isEmpty()) {
return new ArrayList<Thread>(activefile.threads);
}
}
} finally {
lock.writeLock().unlock();
}
return null;
}
private void setDataFileForBlock(int namespaceId, Block block, File newDataFile) {
DatanodeBlockInfo info = volumeMap.get(namespaceId, block);
if (info != null) {
info.getBlockDataFile().setFile(newDataFile);
}
}
/**
* Try to update an old block to a new block.
* If there are ongoing create threads running for the old block,
* the threads will be returned without updating the block.
*
* @return ongoing create threads if there is any. Otherwise, return null.
*/
private List<Thread> tryUpdateBlock(int namespaceId,
Block oldblock, Block newblock) throws IOException {
lock.writeLock().lock();
try {
//check ongoing create threads
ArrayList<Thread> activeThreads = getActiveThreads(namespaceId, oldblock);
if (activeThreads != null) {
return activeThreads;
}
DatanodeBlockInfo binfo = volumeMap.get(namespaceId, oldblock);
if (binfo == null) {
throw new IOException("Block " + oldblock
+ " doesn't exist or has been recovered to a new generation ");
}
File blockFile = binfo.getBlockDataFile().getFile();
long oldgs;
File oldMetaFile = null;
if (binfo.isInlineChecksum()) {
oldgs = BlockInlineChecksumReader
.getGenerationStampFromInlineChecksumFile(blockFile.getName());
} else {
oldMetaFile = BlockWithChecksumFileWriter.findMetaFile(blockFile);
oldgs = BlockWithChecksumFileReader.parseGenerationStampInMetaFile(
blockFile, oldMetaFile);
}
// First validate the update
//update generation stamp
if (oldgs > newblock.getGenerationStamp()) {
throw new IOException("Cannot update block (id=" + newblock.getBlockId()
+ ") generation stamp from " + oldgs
+ " to " + newblock.getGenerationStamp());
}
//update length
if (newblock.getNumBytes() > oldblock.getNumBytes()) {
throw new IOException("Cannot update block file (=" + blockFile
+ ") length from " + oldblock.getNumBytes() + " to " + newblock.getNumBytes());
}
// Although we've waited for all active threads to die before updating
// the map, so there should be no data race, we still create a new
// ActiveFile object to make sure that if another thread still holds a
// reference to the old one, it won't cause any problem for us.
//
try {
volumeMap.copyOngoingCreates(namespaceId, oldblock);
} catch (CloneNotSupportedException e) {
// It should never happen.
throw new IOException("Cannot clone ActiveFile object", e);
}
// Now perform the update
File tmpMetaFile = null;
if (!binfo.isInlineChecksum()) {
// rename meta file to a tmp file
tmpMetaFile = new File(oldMetaFile.getParent(),
oldMetaFile.getName() + "_tmp" + newblock.getGenerationStamp());
if (!oldMetaFile.renameTo(tmpMetaFile)) {
throw new IOException("Cannot rename block meta file to " + tmpMetaFile);
}
}
long oldBlockLength;
if (!binfo.isInlineChecksum()) {
oldBlockLength = blockFile.length();
} else {
oldBlockLength = BlockInlineChecksumReader.getBlockSizeFromFileLength(
blockFile.length(), binfo.getChecksumType(),
binfo.getBytesPerChecksum());
}
ActiveFile file = null;
if (newblock.getNumBytes() < oldBlockLength) {
if (!binfo.isInlineChecksum()) {
new BlockWithChecksumFileWriter(binfo.getBlockDataFile(), tmpMetaFile)
.truncateBlock(oldBlockLength, newblock.getNumBytes());
} else {
new BlockInlineChecksumWriter(binfo.getBlockDataFile(), binfo.getChecksumType(),
binfo.getBytesPerChecksum(), datanode.writePacketSize)
.truncateBlock(newblock.getNumBytes());
}
file = volumeMap.getOngoingCreates(namespaceId, oldblock);
if (file != null) {
file.setBytesAcked(newblock.getNumBytes());
file.setBytesOnDisk(newblock.getNumBytes());
file.setBytesReceived(newblock.getNumBytes());
} else {
// This should never happen unless called from unit tests.
binfo.syncInMemorySize();
}
}
if (!binfo.isInlineChecksum()) {
//rename the tmp file to the new meta file (with new generation stamp)
File newMetaFile = BlockWithChecksumFileWriter.getMetaFile(blockFile, newblock);
if (!tmpMetaFile.renameTo(newMetaFile)) {
throw new IOException("Cannot rename tmp meta file to " + newMetaFile);
}
} else {
String newDataFileName = BlockInlineChecksumWriter.getInlineChecksumFileName(
newblock, binfo.getChecksumType(), binfo.getBytesPerChecksum());
File newDataFile = new File(blockFile.getParent(), newDataFileName);
if (!blockFile.renameTo(newDataFile)) {
throw new IOException("Cannot rename data file to " + newDataFileName);
}
// fsyncIfPossible parent directory to persist rename.
if (datanode.syncOnClose) {
NativeIO.fsyncIfPossible(newDataFile.getParent());
}
setDataFileForBlock(namespaceId, oldblock, newDataFile);
}
if(volumeMap.getOngoingCreates(namespaceId, oldblock) != null){
ActiveFile af = volumeMap.removeOngoingCreates(namespaceId, oldblock);
volumeMap.addOngoingCreates(namespaceId, newblock, af);
}
volumeMap.update(namespaceId, oldblock, newblock);
// paranoia! verify that the contents of the stored block
// matches the block file on disk.
validateBlockMetadata(namespaceId, newblock);
return null;
} finally {
lock.writeLock().unlock();
}
}
private final static String DISK_ERROR = "Possible disk error on file creation: ";
/** Get the cause of an I/O exception if caused by a possible disk error
* @param ioe an I/O exception
* @return cause if the I/O exception is caused by a possible disk error;
* null otherwise.
*/
static IOException getCauseIfDiskError(IOException ioe) {
if (ioe.getMessage()!=null && ioe.getMessage().startsWith(DISK_ERROR)) {
return (IOException)ioe.getCause();
} else {
return null;
}
}
/**
* Start writing to a block file
* If isRecovery is true and the block pre-exists, then we kill all
* other threads that might be writing to this block, and then reopen the file.
* If replicationRequest is true, then this operation is part of a block
* replication request.
*/
public DatanodeBlockWriter writeToBlock(int namespaceId, Block b, Block newBlock,
boolean isRecovery, boolean replicationRequest, int checksumType, int bytesPerChecksum)
throws IOException {
//
// Make sure the block isn't a valid one - we're still creating it!
//
if (isValidBlock(namespaceId, b, false)) {
if (!isRecovery) {
throw new BlockAlreadyExistsException("Block " + b + " is valid, and cannot be written to.");
}
// The block may have been successfully finalized because all packets
// were successfully processed at the Datanode, but the acks for some
// of the packets were not received by the client; the client then
// re-opens the connection and retries sending those packets.
// The other reason is that an "append" is occurring to this block.
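// Break any hard links to the finalized replica (copy-on-write) so that
// reopening and rewriting it cannot affect a shared snapshot copy.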
detachBlock(namespaceId, b, 1);
}
long blockSize = b.getNumBytes();
//
// Serialize access to /tmp, and check if file already there.
//
File f = null;
List<Thread> threads = null;
long expectedFileSize = ActiveFile.UNKNOWN_SIZE;
boolean inlineChecksum = datanode.useInlineChecksum;
DatanodeBlockInfo binfo;
FSVolume v = null;
Block targetBlock = b;
if (newBlock != null && newBlock != b) {
targetBlock = newBlock;
}
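// For an append, the data is written under the new block's identity (new
// generation stamp); the old entry is swapped out of volumeMap below.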
lock.writeLock().lock();
try {
//
// Is it already in the create process?
//
ActiveFile activeFile = volumeMap.getOngoingCreates(namespaceId, b);
if (activeFile != null) {
f = activeFile.getDataFile();
threads = activeFile.threads;
expectedFileSize = activeFile.getBytesWritten();
inlineChecksum = activeFile.isInlineChecksum();
if (!isRecovery) {
throw new BlockAlreadyExistsException("Block " + b +
" has already been started (though not completed), and thus cannot be created.");
} else {
for (Thread thread:threads) {
thread.interrupt();
}
}
volumeMap.removeOngoingCreates(namespaceId, b);
}
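// The stale ActiveFile entry is dropped here; a fresh one (reusing the
// surviving writer threads, if any) is added back near the end.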
if (!isRecovery) {
if (newBlock != null && b != newBlock) {
throw new IOException("newBlock is not allowed except append case. ");
}
v = volumes.getNextVolume(blockSize);
// create temporary file to hold block in the designated volume
f = createTmpFile(namespaceId, v, b, replicationRequest,
inlineChecksum, checksumType, bytesPerChecksum);
} else if (f != null) {
DataNode.LOG.info("Reopen already-open Block for append " + b);
if (newBlock != null && b != newBlock) {
throw new IOException("newBlock is not allowed except append case. ");
}
// create or reuse temporary file to hold block in the designated volume
DatanodeBlockInfo oldBinfo = volumeMap.get(namespaceId, b);
inlineChecksum = oldBinfo.isInlineChecksum();
v = oldBinfo.getBlockDataFile().getVolume();
volumeMap.add(namespaceId, b, new DatanodeBlockInfo(v, f,
DatanodeBlockInfo.UNFINALIZED, true, inlineChecksum, checksumType,
bytesPerChecksum, false, 0));
} else {
// reopening block for appending to it.
DataNode.LOG.info("Reopen Block for append " + b);
if (newBlock == null) {
throw new IOException(
"newBlock is required when appending to a file.");
}
DatanodeBlockInfo oldBinfo = volumeMap.get(namespaceId, b);
inlineChecksum = oldBinfo.isInlineChecksum();
v = oldBinfo.getBlockDataFile().getVolume();
f = createTmpFile(namespaceId, v, newBlock, replicationRequest,
inlineChecksum, checksumType, bytesPerChecksum);
File blkfile = getBlockFile(namespaceId, b);
if (!inlineChecksum) {
File oldmeta = BlockWithChecksumFileReader.getMetaFile(this, namespaceId, b);
File newmeta = BlockWithChecksumFileWriter.getMetaFile(f, newBlock);
// rename meta file to tmp directory
DataNode.LOG.debug("Renaming " + oldmeta + " to " + newmeta);
if (!oldmeta.renameTo(newmeta)) {
throw new IOException("Block " + b + " reopen failed. "
+ " Unable to move meta file " + oldmeta + " to tmp dir "
+ newmeta);
}
}
// rename block file to tmp directory
DataNode.LOG.debug("Renaming " + blkfile + " to " + f);
if (!blkfile.renameTo(f)) {
if (!f.delete()) {
throw new IOException("Block " + b + " reopen failed. " +
" Unable to remove file " + f);
}
if (!blkfile.renameTo(f)) {
throw new IOException("Block " + b + " reopen failed. " +
" Unable to move block file " + blkfile +
" to tmp dir " + f);
}
}
// fsyncIfPossible parent directory to persist rename.
if (datanode.syncOnClose) {
NativeIO.fsyncIfPossible(blkfile.getParent());
}
}
if (f == null) {
DataNode.LOG.warn("Block " + b + " reopen failed " +
" Unable to locate tmp file.");
throw new IOException("Block " + b + " reopen failed " +
" Unable to locate tmp file.");
}
// If this is a replication request, then this is not a permanent
// block yet, it could get removed if the datanode restarts. If this
// is a write or append request, then it is a valid block.
if (replicationRequest) {
binfo = new DatanodeBlockInfo(v, f, DatanodeBlockInfo.UNFINALIZED,
false, inlineChecksum, checksumType, bytesPerChecksum, false, 0);
} else {
binfo = new DatanodeBlockInfo(v, f, DatanodeBlockInfo.UNFINALIZED,
true, inlineChecksum, checksumType, bytesPerChecksum, false, 0);
}
if (newBlock != null && newBlock != b) {
volumeMap.remove(namespaceId, b);
}
volumeMap.add(namespaceId, targetBlock, binfo);
volumeMap.addOngoingCreates(namespaceId, targetBlock, new ActiveFile(binfo,
threads, expectedFileSize, datanode.updateBlockCrcWhenWrite));
} finally {
lock.writeLock().unlock();
}
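// Join the old writer threads outside the lock; joining while holding the
// write lock could deadlock with writers that need the lock to finish.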
try {
if (threads != null) {
for (Thread thread:threads) {
thread.join();
}
}
} catch (InterruptedException e) {
throw new IOException("Recovery waiting for thread interrupted.");
}
//
// Finally, allow a writer to the block file
// REMIND - mjc - make this a filter stream that enforces a max
// block size, so clients can't go crazy
//
if (DataNode.LOG.isDebugEnabled()) {
DataNode.LOG.debug("writeTo blockfile is " + f + " of size " + f.length());
}
if (inlineChecksum) {
return new BlockInlineChecksumWriter(binfo.getBlockDataFile(),
checksumType, bytesPerChecksum, datanode.writePacketSize);
} else {
File metafile = BlockWithChecksumFileWriter.getMetaFile(f, targetBlock);
if (DataNode.LOG.isDebugEnabled()) {
DataNode.LOG.debug("writeTo metafile is " + metafile + " of size "
+ metafile.length());
}
return new BlockWithChecksumFileWriter(binfo.getBlockDataFile(), metafile);
}
}
File createTmpFile(int namespaceId, FSVolume vol, Block blk,
boolean replicationRequest, boolean inlineChecksum, int checksumType,
int bytePerChecksum) throws IOException {
lock.writeLock().lock();
try {
if ( vol == null ) {
vol = volumeMap.get(namespaceId, blk).getBlockDataFile().getVolume();
if ( vol == null ) {
throw new IOException("Could not find volume for block " + blk);
}
}
return vol.createTmpFile(namespaceId, blk, replicationRequest,
inlineChecksum, checksumType, bytePerChecksum);
} finally {
lock.writeLock().unlock();
}
}
//
// REMIND - mjc - eventually we should have a timeout system
// in place to clean up block files left by abandoned clients.
// We should have some timer in place, so that if a blockfile
// is created but non-valid, and has been idle for >48 hours,
// we can GC it safely.
//
/**
* Complete the block write!
*/
@Override // FSDatasetInterface
public void finalizeBlock(int namespaceId, Block b) throws IOException {
finalizeBlockInternal(namespaceId, b, false);
}
@Override
public void finalizeBlockIfNeeded(int namespaceId, Block b) throws IOException {
finalizeBlockInternal(namespaceId, b, true);
}
/**
* Complete the block write!
* @param reFinalizeOk if true, silently return when the block is already
* finalized; otherwise throw an IOException.
*/
public void finalizeBlockInternal(int namespaceId, Block b, boolean reFinalizeOk)
throws IOException {
lock.writeLock().lock();
DatanodeBlockInfo binfo = volumeMap.get(namespaceId, b);
try {
ActiveFile activeFile = volumeMap.getOngoingCreates(namespaceId, b);
if (activeFile == null) {
if (reFinalizeOk) {
return;
} else {
throw new IOException("Block " + b + " is already finalized.");
}
}
File f = activeFile.getDataFile();
if (f == null || !f.exists()) {
throw new IOException("No temporary file " + f + " for block " + b);
}
FSVolume v = binfo.getBlockDataFile().getVolume();
if (v == null) {
throw new IOException("No volume for temporary file " + f +
" for block " + b);
}
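// addBlock moves the temporary file into the volume's finalized block
// directory for this namespace and returns the file's final location.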
File dest = v.addBlock(namespaceId, b, f, activeFile.isInlineChecksum(),
binfo.getChecksumType(), binfo.getBytesPerChecksum());
volumeMap.add(
namespaceId,
b,
new DatanodeBlockInfo(v, dest, activeFile.getBytesWritten(), true,
activeFile.isInlineChecksum(), binfo.getChecksumType(),
binfo.getBytesPerChecksum(),
activeFile.getCrcUpdater().isCrcValid(activeFile.getBytesWritten()),
activeFile.getCrcUpdater().getBlockCrc()));
ActiveFile af = volumeMap.removeOngoingCreates(namespaceId, b);
af.blockFinalize();
} finally {
lock.writeLock().unlock();
}
}
private boolean isBlockFinalizedInternal(int namespaceId, Block b,
boolean validate) {
DatanodeBlockInfo blockInfo = volumeMap.get(namespaceId, b);
// We skip the null check in the validate case to avoid redundant code
// but keep the old code's behavior. Though it looks like a bug, it
// will be fixed in a separate patch.
//
if (!validate && blockInfo == null) {
return false; // block is not finalized
}
FSVolume v = blockInfo.getBlockDataFile().getVolume();
if (v == null) {
DataNode.LOG.warn("No volume for block " + b);
return false; // block is not finalized
}
ActiveFile activeFile = volumeMap.getOngoingCreates(namespaceId, b);
if (activeFile != null) {
if (validate) {
File f = activeFile.getDataFile();
if (f == null || !f.exists()) {
// we should never get into this position.
DataNode.LOG.warn("No temporary file " + f + " for block " + b);
}
}
return false; // block is not finalized
}
return true; // block is finalized
}
/**
* is this block finalized? Returns true if the block is already
* finalized, otherwise returns false.
*/
public boolean isBlockFinalized(int namespaceId, Block b) {
return isBlockFinalizedInternal(namespaceId, b, false);
}
/**
* is this block finalized? Returns true if the block is already
* finalized, otherwise returns false.
*/
private boolean isBlockFinalizedWithLock(int namespaceId, Block b) {
lock.readLock().lock();
try {
return isBlockFinalizedInternal(namespaceId, b, true);
} finally {
lock.readLock().unlock();
}
}
/**
* Remove the temporary block file (if any)
*/
public void unfinalizeBlock(int namespaceId, Block b) throws IOException {
lock.writeLock().lock();
try {
// remove the block from in-memory data structure
ActiveFile activefile = volumeMap.removeOngoingCreates(namespaceId, b);
if (activefile == null) {
return;
}
volumeMap.remove(namespaceId, b);
// delete the on-disk temp file
File metaFile = null;
if (!activefile.isInlineChecksum()) {
metaFile = BlockWithChecksumFileWriter.getMetaFile(
activefile.getDataFileToRead(), b);
}
if (delBlockFromDisk(activefile.getDataFileToRead(), metaFile, b)) {
DataNode.LOG.warn("Block " + b + " unfinalized and removed. " );
}
} finally {
lock.writeLock().unlock();
}
}
/**
* Remove a block from disk
* @param blockFile block file
* @param metaFile block meta file
* @param b a block
* @return true if on-disk files are deleted; false otherwise
*/
private boolean delBlockFromDisk(File blockFile, File metaFile, Block b) {
if (blockFile == null) {
DataNode.LOG.warn("No file exists for block: " + b);
return true;
}
if (!blockFile.delete()) {
DataNode.LOG.warn("Not able to delete the block file: " + blockFile);
return false;
} else { // remove the meta file
if (metaFile != null && !metaFile.delete()) {
DataNode.LOG.warn(
"Not able to delete the meta block file: " + metaFile);
return false;
}
}
return true;
}
/**
* Return a table of blocks that are being written to
* @throws IOException
*/
public Block[] getBlocksBeingWrittenReport(int namespaceId) throws IOException {
LightWeightHashSet<Block> blockSet = new LightWeightHashSet<Block>();
volumes.getBlocksBeingWrittenInfo(namespaceId, blockSet);
Block blockTable[] = new Block[blockSet.size()];
int i = 0;
for (Iterator<Block> it = blockSet.iterator(); it.hasNext(); i++) {
blockTable[i] = it.next();
}
return blockTable;
}
/**
* Get the list of finalized blocks from in-memory blockmap for a block pool.
*/
public Block[] getBlockReport(int namespaceId) throws IOException {
ArrayList<Block> ret = new ArrayList<Block>();
org.apache.hadoop.hdfs.server.datanode.NamespaceMap nm = volumeMap
.getNamespaceMap(namespaceId);
if (nm == null) {
return new Block[0];
}
int n = nm.getNumBucket();
for (int i = 0; i < n; i++) {
BlockBucket bb = nm.getBucket(i);
bb.getBlockReport(ret);
}
return ret.toArray(new Block[ret.size()]);
}
/**
* Check whether the given block is a valid one.
*/
public boolean isValidBlock(int namespaceId, Block b, boolean checkSize)
throws IOException {
File f = null;
try {
f = getValidateBlockFile(namespaceId, b, checkSize);
} catch (IOException e) {
DataNode.LOG.warn("Block " + b + " is not valid:", e);
}
return (f != null) && isBlockFinalizedWithLock(namespaceId, b);
}
public boolean isValidVolume(File currentDir) throws IOException {
return volumes.isValidDir(currentDir);
}
/**
* Find the file corresponding to the block and return it if it exists.
*/
File validateBlockFile(int namespaceId, Block b) throws IOException {
return getValidateBlockFile(namespaceId, b, false);
}
/**
* Find the file corresponding to the block and return it if it exists.
*/
File getValidateBlockFile(int namespaceId, Block b, boolean checkSize)
throws IOException {
//Should we check for metadata file too?
DatanodeBlockInfo blockInfo = this.getDatanodeBlockInfo(namespaceId, b);
File f = null;
if (blockInfo != null) {
if (checkSize) {
blockInfo.verifyFinalizedSize();
}
f = blockInfo.getBlockDataFile().getFile();
assert f != null;
if(f.exists()) {
return f;
}
// if file is not null, but doesn't exist - possibly disk failed
datanode.checkDiskError();
}
if (InterDatanodeProtocol.LOG.isDebugEnabled()) {
InterDatanodeProtocol.LOG.debug("b=" + b + ", f=" + ((f == null) ? "null"
: f));
}
return null;
}
/** {@inheritDoc} */
public void validateBlockMetadata(int namespaceId, Block b) throws IOException {
DatanodeBlockInfo info;
lock.readLock().lock();
try {
info = volumeMap.get(namespaceId, b);
} finally {
lock.readLock().unlock();
}
if (info == null) {
throw new IOException("Block " + b + " does not exist in volumeMap.");
}
File f = info.getDataFileToRead();
// Try to find out block size
long localBlockSize;
if (f == null) {
f = info.getBlockDataFile().getTmpFile(namespaceId, b);
if (f == null) {
throw new IOException("Block " + b + " does not exist on disk.");
}
if (!f.exists()) {
throw new IOException("Block " + b +
" block file " + f +
" does not exist on disk.");
}
if (info.isInlineChecksum()) {
// TODO: do we want to do it?
localBlockSize = BlockInlineChecksumReader.getBlockSizeFromFileLength(
f.length(), info.getChecksumType(), info.getBytesPerChecksum());
} else {
localBlockSize = f.length();
}
} else {
if (info.isFinalized()) {
info.verifyFinalizedSize();
localBlockSize = info.getFinalizedSize();
} else {
if (info.isInlineChecksum()) {
// TODO: do we want to do it?
localBlockSize = BlockInlineChecksumReader
.getBlockSizeFromFileLength(f.length(), info.getChecksumType(),
info.getBytesPerChecksum());
} else {
localBlockSize = f.length();
}
}
}
if (b.getNumBytes() > localBlockSize) {
throw new IOException("Block " + b +
" length is " + b.getNumBytes() +
" does not match block file length " +
f.length());
}
long stamp;
DataChecksum dcs;
if (!info.isInlineChecksum()) {
File meta = BlockWithChecksumFileWriter.getMetaFile(f, b);
if (meta == null) {
throw new IOException("Block " + b +
" metafile does not exist.");
}
if (!meta.exists()) {
throw new IOException("Block " + b +
" metafile " + meta +
" does not exist on disk.");
}
long metaFileSize = meta.length();
if (metaFileSize == 0 && localBlockSize > 0) {
throw new IOException("Block " + b + " metafile " + meta + " is empty.");
}
stamp = BlockWithChecksumFileReader.parseGenerationStampInMetaFile(f,
meta);
if (metaFileSize == 0) {
// no need to check metadata size for 0 size file
return;
}
dcs = BlockMetadataHeader.readHeader(meta).getChecksum();
// verify that the checksum file has an integral number of checksum values.
int checksumsize = dcs.getChecksumSize();
long actual = metaFileSize - BlockMetadataHeader.getHeaderSize();
long numChunksInMeta = actual/checksumsize;
if (actual % checksumsize != 0) {
throw new IOException("Block " + b +
" has a checksum file of size " + metaFileSize +
" but it does not align with checksum size of " +
checksumsize);
}
int bpc = dcs.getBytesPerChecksum();
long minDataSize = (numChunksInMeta - 1) * bpc;
long maxDataSize = numChunksInMeta * bpc;
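// The last chunk may be partial: a meta file with n checksums covers more
// than (n-1)*bpc and at most n*bpc bytes of block data.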
if (localBlockSize > maxDataSize || localBlockSize <= minDataSize) {
throw new IOException("Block " + b +
" is of size " + f.length() +
" but has " + (numChunksInMeta + 1) +
" checksums and each checksum size is " +
checksumsize + " bytes.");
}
} else {
stamp = BlockInlineChecksumReader
.getGenerationStampFromInlineChecksumFile(f.getName());
if (localBlockSize == 0) {
// no need to check metadata size for 0 size file
return;
}
// TODO: What verification can we do here?
}
if (stamp != b.getGenerationStamp()) {
throw new IOException("Block " + b +
" genstamp is " + b.getGenerationStamp() +
" does not match meta file stamp " +
stamp);
}
// We could crc-check the entire block here, but it will be a costly
// operation. Instead we rely on the above check (file length mismatch)
// to detect corrupt blocks.
}
/**
* We're informed that a block is no longer valid. We
* could lazily garbage-collect the block, but why bother?
* just get rid of it.
*/
public void invalidate(int namespaceId, Block invalidBlks[]) throws IOException {
boolean error = false;
for (int i = 0; i < invalidBlks.length; i++) {
File f = null;
FSVolume v;
boolean inlineChecksum;
DatanodeBlockInfo dinfo = null;
lock.writeLock().lock();
try {
dinfo = volumeMap.get(namespaceId, invalidBlks[i]);
if (dinfo == null) {
// It is possible that, after block reports, Datanodes receive
// duplicate invalidate requests from the name-node. We just skip
// the block. At the end of the function we don't throw an exception
// for this case, since no disk check is needed.
//
DataNode.LOG.info("Unexpected error trying to delete block "
+ invalidBlks[i] +
". BlockInfo not found in volumeMap.");
continue;
}
inlineChecksum = dinfo.isInlineChecksum();
f = dinfo.getDataFileToRead();
v = dinfo.getBlockDataFile().getVolume();
if (f == null) {
DataNode.LOG.warn("Unexpected error trying to delete block "
+ invalidBlks[i] +
". Block not found in blockMap." +
((v == null) ? " " : " Block found in volumeMap."));
error = true;
continue;
}
if (v == null) {
DataNode.LOG.warn("Unexpected error trying to delete block "
+ invalidBlks[i] +
". No volume for this block." +
" Block found in blockMap. " + f + ".");
error = true;
continue;
}
File parent = f.getParentFile();
if (parent == null) {
DataNode.LOG.warn("Unexpected error trying to delete block "
+ invalidBlks[i] +
". Parent not found for file " + f + ".");
error = true;
continue;
}
//TODO ???
v.clearPath(namespaceId, parent);
volumeMap.remove(namespaceId, invalidBlks[i]);
} finally {
lock.writeLock().unlock();
}
// close the File Channel
dinfo.getBlockDataFile().closeFileChannel();
//rename the files to be deleted
//for safety we add prefix instead of suffix,
//so the valid block files still start with "blk_"
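// e.g. blk_<id> becomes <DELETE_FILE_EXT>blk_<id> in the same directory.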
File blockFileRenamed = new File(f.getParent()
+ File.separator + DELETE_FILE_EXT + f.getName());
File metaFile = null;
File metaFileRenamed = null;
if (!inlineChecksum) {
metaFile = BlockWithChecksumFileWriter.getMetaFile( f, invalidBlks[i]);
metaFileRenamed = new File(metaFile.getParent()
+ File.separator + DELETE_FILE_EXT + metaFile.getName());
}
if((!f.renameTo(blockFileRenamed)) ||
(!inlineChecksum && !metaFile.renameTo(metaFileRenamed))) {
DataNode.LOG.warn("Unexpected error trying to delete block "
+ invalidBlks[i] +
". Cannot rename files for deletion.");
error = true;
continue;
}
if(invalidBlks[i].getNumBytes() != BlockFlags.NO_ACK){
datanode.notifyNamenodeDeletedBlock(namespaceId, invalidBlks[i]);
}
// Delete the block asynchronously to make sure we can do it fast enough
asyncDiskService.deleteAsync(v, blockFileRenamed, metaFileRenamed,
invalidBlks[i].toString(), namespaceId);
}
if (error) {
throw new IOException("Error in deleting blocks.");
}
}
/**
* Turn the block identifier into a filename.
*/
public File getFile(int namespaceId, Block b) {
lock.readLock().lock();
try {
DatanodeBlockInfo info = volumeMap.get(namespaceId, b);
if (info != null) {
return info.getDataFileToRead();
}
return null;
} finally {
lock.readLock().unlock();
}
}
@Override
public DatanodeBlockInfo getDatanodeBlockInfo(int namespaceId, Block b) {
return volumeMap.get(namespaceId, b);
}
@Override
public ReplicaToRead getReplicaToRead(int namespaceId, Block block) {
lock.readLock().lock();
try {
ActiveFile activefile = volumeMap.getOngoingCreates(namespaceId, block);
if (activefile != null) {
return activefile;
}
DatanodeBlockInfo info = volumeMap.get(namespaceId, block);
if (info == null) {
if (DataNode.LOG.isDebugEnabled()) {
DataNode.LOG.debug("volumeMap=" + volumeMap);
}
}
return info;
} finally {
lock.readLock().unlock();
}
}
/**
* Check if a data directory is healthy.
* If some volumes failed, make sure to remove all the blocks that belong
* to these volumes.
* @throws DiskErrorException
*/
public void checkDataDir() throws DiskErrorException {
long total_blocks=0, removed_blocks=0;
List<FSVolume> failed_vols = null;
failed_vols = volumes.checkDirs();
// if there are no failed volumes, return
if(failed_vols == null)
return;
// otherwise, remove the blocks that belong to the failed volumes
long mlsec = System.currentTimeMillis();
lock.writeLock().lock();
try {
volumeMap.removeUnhealthyVolumes(failed_vols);
} finally {
lock.writeLock().unlock();
}
mlsec = System.currentTimeMillis() - mlsec;
DataNode.LOG.warn(">>>>>>>>>>>>Removed " + removed_blocks + " out of " + total_blocks +
"(took " + mlsec + " millisecs)");
// report the error
StringBuilder sb = new StringBuilder();
for(FSVolume fv : failed_vols) {
sb.append(fv.toString() + ";");
}
throw new DiskErrorException("DataNode failed volumes:" + sb);
}
/**
* remove directories that are given from the list of volumes to use.
* This function also makes sure to remove all the blocks that belong to
* these volumes.
*/
public void removeVolumes(Configuration conf, List<File> directories)
throws Exception {
if (directories == null || directories.isEmpty()) {
DataNode.LOG.warn("There were no directories to remove. Exiting ");
return;
}
List<FSVolume> volArray = null;
lock.readLock().lock();
try {
volArray = volumes.removeBVolumes(directories);
} finally {
lock.readLock().unlock();
}
// remove related blocks
long mlsec = System.currentTimeMillis();
lock.writeLock().lock();
try {
volumeMap.removeUnhealthyVolumes(volArray);
} finally {
lock.writeLock().unlock();
}
mlsec = System.currentTimeMillis() - mlsec;
DataNode.LOG.warn(">>>>>>>>>Removing these blocks took " + mlsec +
" millisecs in refresh<<<<<<<<<<<<<<< ");
StringBuilder sb = new StringBuilder();
for(FSVolume fv : volArray) {
sb.append(fv.toString() + ";");
}
throw new DiskErrorException("These volumes were removed: " + sb);
}
public void addVolumes(Configuration conf, int namespaceId, String nsDir,
Collection<StorageDirectory> dirs) throws Exception {
if (dirs == null || dirs.isEmpty()) {
return;
}
FSVolume[] volArray = new FSVolume[dirs.size()];
File[] dirArray = new File[dirs.size()];
int idx = 0;
for (Iterator<StorageDirectory> iter = dirs.iterator() ; iter.hasNext(); idx++) {
dirArray[idx] = iter.next().getCurrentDir();
volArray[idx] = new FSVolume(this, dirArray[idx], conf);
}
lock.writeLock().lock();
try {
volumes.addVolumes(volArray);
for (FSVolume vol : volArray) {
vol.addNamespace(namespaceId, nsDir, conf, datanode.isSupportAppends());
}
} finally {
lock.writeLock().unlock();
}
asyncDiskService.insertDisk(dirArray, conf);
}
public String toString() {
return "FSDataset{dirpath='"+volumes+"'}";
}
ObjectName mbeanName;
ObjectName versionBeanName;
Random rand = new Random();
/**
* Register the FSDataset MBean using the name
* "hadoop:service=DataNode,name=FSDatasetState-<storageid>"
*/
void registerMBean(final String storageId) {
// We wrap to bypass the standard MBean naming convention.
// This wrapping can be removed in Java 6 as it is more flexible in
// package naming for MBeans and their implementations.
StandardMBean bean;
String storageName;
if (storageId == null || storageId.equals("")) {// Temp fix for the uninitialized storage
storageName = "UndefinedStorageId" + rand.nextInt();
} else {
storageName = storageId;
}
try {
bean = new StandardMBean(this,FSDatasetMBean.class);
mbeanName = MBeanUtil.registerMBean("DataNode", "FSDatasetState-" + storageName, bean);
versionBeanName = VersionInfo.registerJMX("DataNode");
} catch (NotCompliantMBeanException e) {
e.printStackTrace();
}
DataNode.LOG.info("Registered FSDatasetStatusMBean");
}
public void shutdown() {
if (blockCrcMapFlusher != null) {
blockCrcMapFlusher.setClose();
}
if (blockCrcMapFlusherThread != null) {
blockCrcMapFlusherThread.interrupt();
try {
this.blockCrcMapFlusherThread.join();
this.blockCrcMapFlusherThread = null;
} catch (InterruptedException ie) {
}
}
if (mbeanName != null)
MBeanUtil.unregisterMBean(mbeanName);
if (versionBeanName != null) {
MBeanUtil.unregisterMBean(versionBeanName);
}
if (asyncDiskService != null) {
asyncDiskService.shutdown();
}
if(volumes != null) {
lock.writeLock().lock();
try {
if (volumes.scannersExecutor != null) {
volumes.scannersExecutor.shutdown();
}
for (FSVolume volume : volumes.getVolumes()) {
if(volume != null) {
volume.shutdown();
}
}
} finally {
lock.writeLock().unlock();
}
}
}
public void addNamespace(int namespaceId, String nsDir,
Configuration conf) throws IOException {
DataNode.LOG.info("Adding namespace " + namespaceId);
lock.writeLock().lock();
try{
volumeMap.initNamespace(namespaceId);
volumes.addNamespace(namespaceId, nsDir, conf);
} finally {
lock.writeLock().unlock();
}
// Load block CRC files
int numBuckets = volumeMap.getNumBuckets(namespaceId);
for (FSVolume volume : volumes.getVolumes()) {
try {
File blockCrcFile = volume.getBlockCrcFile(namespaceId);
if (blockCrcFile == null || !blockCrcFile.exists()) {
continue;
}
int numUpdated = 0;
FileInputStream fis = new FileInputStream(blockCrcFile);
try {
BlockCrcFileReader reader = new BlockCrcFileReader(
new DataInputStream(fis));
reader.readHeader();
if (reader.getNumBuckets() != numBuckets) {
// TODO: support it if needed. Now it's not clear whether we will
// ever need it.
DataNode.LOG
.warn("Loading block CRCs is not yet supported when the number of buckets changes;"
+ " number of buckets on disk: " + reader.getNumBuckets());
} else {
numUpdated += volumeMap.updateBlockCrc(namespaceId, reader);
}
} finally {
fis.close();
}
DataNode.LOG.info("Finish loading Block CRC file for namespace "
+ namespaceId + " volume " + volume + " " + numUpdated
+ " blocks' CRC updated.");
} catch (IOException ioe) {
DataNode.LOG.warn("IOException when try to load block CRC fle from volume"
+ volume.getDir(), ioe);
} finally {
volume.setNamespaceBlockCrcLoaded(namespaceId, true);
}
}
}
public void removeNamespace(int namespaceId){
DataNode.LOG.info("Removing namespace " + namespaceId);
lock.writeLock().lock();
try{
if (volumeMap != null) {
volumeMap.removeNamespace(namespaceId);
}
if (volumes != null) {
volumes.removeNamespace(namespaceId);
}
} finally {
lock.writeLock().unlock();
}
}
public String getStorageInfo() {
return toString();
}
@Override
public BlockRecoveryInfo startBlockRecovery(int namespaceId, long blockId)
throws IOException {
Block stored = getStoredBlock(namespaceId, blockId, true);
if (stored == null) {
return null;
}
// It's important that this loop not be synchronized - otherwise
// this will deadlock against the thread it's joining against!
while (true) {
DataNode.LOG.debug(
"Interrupting active writer threads for block " + stored);
List<Thread> activeThreads = getActiveThreads(namespaceId, stored);
if (activeThreads == null) break;
if (interruptAndJoinThreads(activeThreads))
break;
}
lock.readLock().lock();
try {
// now that writers are stopped, re-fetch the block's meta info
stored = getStoredBlock(namespaceId, blockId, true);
if (stored == null) {
return null;
}
ActiveFile activeFile = volumeMap.getOngoingCreates(namespaceId, stored);
boolean isRecovery = (activeFile != null) && activeFile.wasRecoveredOnStartup;
BlockRecoveryInfo info = new BlockRecoveryInfo(stored, isRecovery);
if (DataNode.LOG.isDebugEnabled()) {
DataNode.LOG.debug("getBlockMetaDataInfo successful block=" + stored +
" length " + stored.getNumBytes() +
" genstamp " + stored.getGenerationStamp());
}
// paranoia! verify that the contents of the stored block
// matches the block file on disk.
validateBlockMetadata(namespaceId, stored);
return info;
} finally {
lock.readLock().unlock();
}
}
/**
* Copies a file as fast as possible. Tries to do a hardlink instead of a copy
* if the hardlink parameter is specified.
*
* @param src
* the source file for copying
* @param dst
* the destination file for copying
* @param hardlink
* whether or not to attempt a hardlink
* @throws IOException
*/
public void copyFile(File src, File dst, boolean hardlink) throws IOException {
if (src == null || dst == null) {
throw new IOException("src/dst file is null");
}
try {
if (hardlink && shouldHardLinkBlockCopy) {
// Remove destination before hard linking, since this file might already
// exist and a hardlink would fail as a result.
if (dst.exists()) {
if(!dst.delete()) {
throw new IOException("Deletion of file : " + dst + " failed");
}
}
NativeIO.link(src, dst);
DataNode.LOG.info("Hard Link Created from : " + src + " to " + dst);
return;
}
} catch (IOException e) {
DataNode.LOG.warn("Hard link failed from : " + src + " to " + dst
+ " continuing with regular file copy");
}
FileChannel input = null;
FileChannel output = null;
try {
// This improves copying performance a lot, it uses native buffers
// for copying.
input = new FileInputStream(src).getChannel();
output = new FileOutputStream(dst).getChannel();
if (input == null || output == null) {
throw new IOException("Could not create file channels for src : " + src
+ " dst : " + dst);
}
long bytesLeft = input.size();
long position = 0;
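// transferFrom() may transfer fewer bytes than requested in a single
// call, so loop until the whole file has been copied.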
while (bytesLeft > 0) {
long bytesWritten = output.transferFrom(input, position, bytesLeft);
bytesLeft -= bytesWritten;
position += bytesWritten;
}
if (datanode.syncOnClose) {
output.force(true);
}
} finally {
if (input != null) {
input.close();
}
if (output != null) {
output.close();
}
}
}
/**
* Find a volume on the datanode for the destination block to be placed on.
* It tries to place the destination block on the same volume as the source
* block since hardlinks can be performed only between two files on the same
* disk
*
* @param srcFileSystem
* the file system for srcBlockFile
* @param srcNamespaceId
* the namespace id for srcBlock
* @param srcBlock
* the source block which needs to be hardlinked
* @param srcBlockFile
* the block file for srcBlock
* @return the FSVolume on which we should put the dstBlock, null if we can't
* find such a volume.
* @throws IOException
*/
private FSVolume findVolumeForHardLink(String srcFileSystem,
int srcNamespaceId, Block srcBlock, File srcBlockFile)
throws IOException {
FSVolume dstVol = null;
if (srcBlockFile == null || !srcBlockFile.exists()) {
throw new IOException("File " + srcBlockFile
+ " is not valid or does not have"
+ " a valid block file");
}
// The source file might not necessarily be a part of the FSVolumeSet of
// this datanode, it could be part of a FSVolumeSet of another datanode on
// the same host.
DatanodeBlockInfo blockInfo = volumeMap.get(srcNamespaceId, srcBlock);
if (blockInfo != null) {
dstVol = blockInfo.getBlockDataFile().getVolume();
} else {
for(FSVolume volume : volumes.getVolumes()) {
String volFileSystem = volume.getFileSystem();
if (volFileSystem.equals(srcFileSystem)) {
dstVol = volume;
break;
}
}
}
return dstVol;
}
/**
* Finds a volume for the dstBlock and adds the new block to the FSDataset
* data structures to indicate we are going to start writing to the block.
*
* @param srcFileSystem
* the file system for srcBlockFile
* @param srcBlockFile
* the block file for the srcBlock
* @param srcNamespaceId
* the namespace id for source block
* @param srcBlock
* the source block that needs to be copied over
* @param dstNamespaceId
* the namespace id for destination block
* @param dstBlock
* the new destination block that needs to be created for copying
* @return returns whether or not a hardlink is possible, if hardlink was not
* requested this is always false.
* @throws IOException
*/
private boolean copyBlockLocalAdd(String srcFileSystem, File srcBlockFile,
int srcNamespaceId, Block srcBlock, int dstNamespaceId, Block dstBlock)
throws IOException {
boolean hardlink = true;
File dstBlockFile = null;
lock.writeLock().lock();
try {
if (isValidBlock(dstNamespaceId, dstBlock, false) ||
volumeMap.getOngoingCreates(dstNamespaceId, dstBlock) != null) {
throw new BlockAlreadyExistsException("Block " + dstBlock
+ " already exists");
}
if (srcBlockFile == null || !srcBlockFile.exists()) {
throw new IOException("Block " + srcBlock.getBlockName()
+ " is not valid or does not have a valid block file");
}
boolean inlineChecksum = Block.isInlineChecksumBlockFilename(srcBlockFile
.getName());
FSVolume dstVol = null;
if (shouldHardLinkBlockCopy) {
dstVol = findVolumeForHardLink(
srcFileSystem, srcNamespaceId, srcBlock, srcBlockFile);
}
// Could not find a volume for a hard link, fall back to regular file
// copy.
if (dstVol == null) {
dstVol = volumes.getNextVolume(srcBlock.getNumBytes());
hardlink = false;
}
int checksumType = DataChecksum.CHECKSUM_UNKNOWN;
int bytesPerChecksum = -1;
if (inlineChecksum) {
GenStampAndChecksum sac = BlockInlineChecksumReader
.getGenStampAndChecksumFromInlineChecksumFile(srcBlockFile
.getName());
checksumType = sac.checksumType;
bytesPerChecksum = sac.bytesPerChecksum;
}
List<Thread> threads = null;
// We do not want to create a BBW, hence treat this as a replication
// request.
dstBlockFile = createTmpFile(dstNamespaceId, dstVol, dstBlock, true,
inlineChecksum, checksumType, bytesPerChecksum);
DatanodeBlockInfo binfo = new DatanodeBlockInfo(dstVol, dstBlockFile,
DatanodeBlockInfo.UNFINALIZED, true, inlineChecksum, checksumType,
bytesPerChecksum, false, 0);
volumeMap.add(dstNamespaceId, dstBlock, binfo);
volumeMap.addOngoingCreates(dstNamespaceId, dstBlock, new ActiveFile(
binfo, threads, ActiveFile.UNKNOWN_SIZE, false));
} finally {
lock.writeLock().unlock();
}
if (dstBlockFile == null) {
throw new IOException("Could not allocate block file for : " +
dstBlock.getBlockName());
}
return hardlink;
}
/**
* Finalize the block in FSDataset.
*
* @param dstNamespaceId
* the namespace id for dstBlock
* @param dstBlock
* the block that needs to be finalized
* @param dstBlockFile
* the block file for the block that has to be finalized
* @throws IOException
*/
private void copyBlockLocalFinalize(int dstNamespaceId,
Block dstBlock, File dstBlockFile)
throws IOException {
boolean inlineChecksum = Block.isInlineChecksumBlockFilename(dstBlockFile
.getName());
long blkSize = 0;
long fileSize = dstBlockFile.length();
lock.writeLock().lock();
try {
DatanodeBlockInfo info = volumeMap.get(dstNamespaceId, dstBlock);
if (info == null) {
throw new IOException("Could not find information for " + dstBlock);
}
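// For inline-checksum files the on-disk length includes checksum bytes,
// so convert it back to the logical block size.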
if (inlineChecksum) {
blkSize = BlockInlineChecksumReader.getBlockSizeFromFileLength(fileSize,
info.getChecksumType(), info.getBytesPerChecksum());
} else {
blkSize = fileSize;
}
FSVolume dstVol = info.getBlockDataFile().getVolume();
// Finalize block on disk.
File dest = dstVol.addBlock(dstNamespaceId, dstBlock, dstBlockFile,
info.isInlineChecksum(), info.getChecksumType(),
info.getBytesPerChecksum());
volumeMap.add(dstNamespaceId, dstBlock,
new DatanodeBlockInfo(dstVol, dest, blkSize, true, inlineChecksum,
info.getChecksumType(), info.getBytesPerChecksum(), false, 0));
volumeMap.removeOngoingCreates(dstNamespaceId, dstBlock);
} finally {
lock.writeLock().unlock();
}
}
/** {@inheritDoc} */
@Override
public void copyBlockLocal(String srcFileSystem, File srcBlockFile,
int srcNamespaceId, Block srcBlock, int dstNamespaceId, Block dstBlock)
throws IOException {
File dstBlockFile = null;
try {
boolean hardlink = copyBlockLocalAdd(srcFileSystem, srcBlockFile,
srcNamespaceId, srcBlock, dstNamespaceId, dstBlock);
DatanodeBlockInfo binfo = volumeMap.get(dstNamespaceId, dstBlock);
dstBlockFile = binfo.getDataFileToRead();
// Copy files.
copyFile(srcBlockFile, dstBlockFile, hardlink);
// Copy metafile.
if (!binfo.isInlineChecksum()) {
File metaFileSrc = BlockWithChecksumFileWriter.getMetaFile(srcBlockFile, srcBlock);
File metaFileDst = BlockWithChecksumFileWriter.getMetaFile(dstBlockFile, dstBlock);
copyFile(metaFileSrc, metaFileDst, hardlink);
}
// Finalize block
copyBlockLocalFinalize(dstNamespaceId, dstBlock, dstBlockFile);
} catch (BlockAlreadyExistsException be) {
throw be;
} catch (IOException e) {
unfinalizeBlock(dstNamespaceId, dstBlock);
throw e;
}
}
/** {@inheritDoc} */
@Override
public String getFileSystemForBlock(int namespaceId, Block block) throws IOException {
if (!isValidBlock(namespaceId, block, false)) {
throw new IOException("Invalid block");
}
return volumeMap.get(namespaceId, block).getBlockDataFile().getVolume()
.getFileSystem();
}
static File createTmpFile(Block b, File f) throws IOException {
if (f.exists()) {
throw new IOException("Unexpected problem in creating temporary file for "+
b + ". File " + f + " should not be present, but is.");
}
// Create the zero-length temp file
//
boolean fileCreated = false;
try {
fileCreated = f.createNewFile();
} catch (IOException ioe) {
throw (IOException)new IOException(DISK_ERROR +f).initCause(ioe);
}
if (!fileCreated) {
throw new IOException("Unexpected problem in creating temporary file for "+
b + ". File " + f + " should be creatable, but is already present.");
}
return f;
}
@Override
public long size(int namespaceId) {
try {
return volumeMap.size(namespaceId);
} catch (Exception e) {
return -1;
}
}
/**
* Reconcile the difference between blocks on the disk and blocks in
* volumeMap
*
* Check the given block for inconsistencies. Look at the
* current state of the block and reconcile the differences as follows:
* <ul>
* <li>If the block file is missing, delete the block from volumeMap</li>
* <li>If the block file exists and the block is missing in volumeMap,
* add the block to volumeMap <li>
* <li>If generation stamp does not match, then update the block with right
* generation stamp</li>
* <li>If the block length in memory does not match the actual block file length
* then mark the block as corrupt and update the block length in memory</li>
* <li>If the file in {@link ReplicaInfo} does not match the file on
* the disk, update {@link ReplicaInfo} with the correct file</li>
* </ul>
*
* @param blockId Block that differs
* @param diskFile Block file on the disk
* @param diskMetaFile Metadata file from on the disk
* @param vol Volume of the block file
*/
public void checkAndUpdate(Integer nsid, FSDatasetDelta delta,
ScanDifference info) throws IOException {
long blockId = info.getBlockId();
lock.writeLock().lock();
try {
// we don't want delta to record changes we do during reconciliation
delta.stopRecordingDelta();
if (delta.get(nsid, blockId) != null) {
// FIXME: Presence of the block in delta means that it was changed somehow
// during the interval of time right after the difference computation in
// the directory scanner and before acquiring the writeLock in this method.
// We could enumerate the different operations that might happen to the
// block and write some logic for each of them, but this adds a lot of
// complexity. Instead we just skip reconciliation for the block at this
// time. If it still has problems, we are likely to solve them on the
// next scan.
return;
}
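// Look up the block by id: WILDCARD_STAMP is used so the in-memory entry
// is found regardless of its recorded generation stamp (assuming block
// keys in the map compare by id).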
Block memBlock = new Block(blockId, 0, GenerationStamp.WILDCARD_STAMP);
DatanodeBlockInfo memBlockInfo = volumeMap.get(nsid, memBlock);
if (memBlockInfo != null && !memBlockInfo.isFinalized()) {
// Block is not finalized - ignore the difference
return;
}
// We don't have any files for this block on disk
if (info.getState() == ScanDifference.DISK_FILES_MISSING) {
if (memBlockInfo == null) {
return;
}
volumeMap.remove(nsid, memBlock);
LOG.info("checkAndUpdate: removing block: " + memBlock
+ " for namespace: " + nsid);
if (datanode.blockScanner != null) {
datanode.blockScanner.deleteBlock(nsid, memBlock);
}
return;
}
// We don't have the block in memory, but some of its files are on disk
if (info.getState() == ScanDifference.MEMORY_BLOCK_MISSING) {
// if there's a block file, then add it to volumeMap, otherwise
// remove metaFile if any
if (info.getBlockFile() != null) {
Block newBlock = new Block(blockId, info.getLength(),
info.getGenStamp());
boolean isInlineChecksum = info.isInlineChecksum();
DatanodeBlockInfo diskBlockInfo = null;
if (isInlineChecksum) {
GenStampAndChecksum sac = BlockInlineChecksumReader
.getGenStampAndChecksumFromInlineChecksumFile(info
.getBlockFile().getName());
diskBlockInfo = new DatanodeBlockInfo(info.getVolume(),
info.getBlockFile(), info.getLength(), true, true,
sac.checksumType, sac.bytesPerChecksum, false, 0);
} else {
diskBlockInfo = new DatanodeBlockInfo(info.getVolume(),
info.getBlockFile(), info.getLength(), true, false,
DataChecksum.CHECKSUM_UNKNOWN, -1, false, 0);
}
volumeMap.add(nsid, newBlock, diskBlockInfo);
LOG.info("checkAndUpdate: adding block: " + newBlock
+ " for namespace: " + nsid + " size: "
+ diskBlockInfo.getBytesVisible());
if (datanode.blockScanner != null) {
datanode.blockScanner.addBlock(nsid, newBlock);
}
} else {
// scheduling a file for deletion
asyncDiskService
.deleteAsyncFile(info.getVolume(), info.getMetaFile());
}
return;
}
// We have this block in memory and some of its files on disk
if (info.getState() == ScanDifference.OUT_OF_SYNC) {
if (info.getBlockFile() == null) {
volumeMap.remove(nsid, memBlock);
LOG.info("checkAndUpdate: removing block: " + memBlock
+ " for namespace: " + nsid);
if (datanode.blockScanner != null) {
datanode.blockScanner.deleteBlock(nsid, memBlock);
}
// scheduling a file for deletion
asyncDiskService
.deleteAsyncFile(info.getVolume(), info.getMetaFile());
} else {
if (memBlockInfo == null) {
return;
}
memBlockInfo.getBlock().setNumBytes(info.getLength());
memBlockInfo.getBlock().setGenerationStamp(info.getGenStamp());
LOG.info("checkAndUpdate: updating block: " + memBlockInfo
+ " for namespace: " + nsid);
}
return;
}
} finally {
try {
delta.startRecordingDelta();
} finally {
lock.writeLock().unlock();
}
}
}
}