package org.apache.hadoop.raid;
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.nio.channels.SocketChannel;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;
import java.util.zip.CRC32;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FilterFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DataTransferProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlockWithMetaInfo;
import org.apache.hadoop.hdfs.protocol.LocatedBlocksWithMetaInfo;
import org.apache.hadoop.hdfs.protocol.VersionAndOpcode;
import org.apache.hadoop.hdfs.protocol.VersionedLocatedBlocks;
import org.apache.hadoop.hdfs.protocol.WriteBlockHeader;
import org.apache.hadoop.hdfs.protocol.FSConstants.DatanodeReportType;
import org.apache.hadoop.hdfs.server.common.HdfsConstants;
import org.apache.hadoop.hdfs.server.datanode.BlockDataFile;
import org.apache.hadoop.hdfs.server.datanode.BlockSender;
import org.apache.hadoop.hdfs.server.datanode.BlockWithChecksumFileReader;
import org.apache.hadoop.hdfs.server.datanode.FSDataset;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.raid.StripeStore.StripeInfo;
import org.apache.hadoop.raid.LogUtils.LOGRESULTS;
import org.apache.hadoop.raid.LogUtils.LOGTYPES;
import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.util.Progressable;
/**
* This class implements the actual reconstruction functionality.
* It is kept in a separate class so that the distributed block fixer
* can reuse it.
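*
* <p>A minimal usage sketch (the path and mapper context are illustrative;
* real callers such as the distributed block fixer supply them):
* <pre>{@code
* BlockReconstructor fixer =
*     new BlockReconstructor.CorruptBlockReconstructor(conf);
* boolean fixed = fixer.reconstructFile(new Path("/user/foo/part-0"), context);
* }</pre>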
*/
abstract class BlockReconstructor extends Configured {
public static final Log LOG = LogFactory.getLog(BlockReconstructor.class);
public static final int SEND_BLOCK_MAX_RETRIES = 3;
BlockReconstructor(Configuration conf) throws IOException {
super(conf);
}
/**
* Is the path a parity file of a given Codec?
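* (A parity path is one that starts with the codec's parity prefix, e.g.
* {@code /raidrs/user/foo/f} for a codec with parity prefix {@code /raidrs};
* paths inside parity HAR archives never match. The prefix shown here is
* illustrative.)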
*/
boolean isParityFile(Path p, Codec c) {
return isParityFile(p.toUri().getPath(), c);
}
boolean isParityFile(String pathStr, Codec c) {
if (pathStr.contains(RaidNode.HAR_SUFFIX)) {
return false;
}
return pathStr.startsWith(c.getParityPrefix());
}
/**
* Fix a file, reporting progress.
*
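* <p>Reconstruction is attempted in order: parity HAR part file, parity
* file, source file via its parity pair, and finally (for directory-raid
* codecs) source file through the stripe store.
*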
* @return true if file was reconstructed, false if no reconstruction
* was necessary or possible.
*/
boolean reconstructFile(Path srcPath, Context context)
throws IOException, InterruptedException {
Progressable progress = context;
if (progress == null) {
progress = RaidUtils.NULL_PROGRESSABLE;
}
FileSystem fs = srcPath.getFileSystem(getConf());
FileStatus srcStat = null;
try {
srcStat = fs.getFileStatus(srcPath);
} catch (FileNotFoundException ex) {
return false;
}
if (RaidNode.isParityHarPartFile(srcPath)) {
return processParityHarPartFile(srcPath, progress);
}
// Reconstruct parity file
for (Codec codec : Codec.getCodecs()) {
if (isParityFile(srcPath, codec)) {
Decoder decoder = new Decoder(getConf(), codec);
decoder.connectToStore(srcPath);
return processParityFile(srcPath,
decoder, context);
}
}
// Reconstruct source file without connecting to stripe store
for (Codec codec : Codec.getCodecs()) {
ParityFilePair ppair = ParityFilePair.getParityFile(
codec, srcStat, getConf());
if (ppair != null) {
Decoder decoder = new Decoder(getConf(), codec);
decoder.connectToStore(srcPath);
return processFile(srcPath, ppair, decoder, false, context);
}
}
// Reconstruct source file through stripe store
for (Codec codec : Codec.getCodecs()) {
if (!codec.isDirRaid) {
continue;
}
try {
// try to fix through the stripe store.
Decoder decoder = new Decoder(getConf(), codec);
decoder.connectToStore(srcPath);
if (processFile(srcPath, null, decoder, true, context)) {
return true;
}
} catch (Exception ex) {
LogUtils.logRaidReconstructionMetrics(LOGRESULTS.FAILURE, 0,
codec, srcPath, -1, LOGTYPES.OFFLINE_RECONSTRUCTION_USE_STRIPE,
fs, ex, context);
}
}
return false;
}
/**
* Sorts source files ahead of parity files.
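* (Illustrative ordering: given {@code ["/raidrs/a", "/a"]} where
* {@code /raidrs/a} is a parity path, the sort yields
* {@code ["/a", "/raidrs/a"]}; among parity files, higher-priority codecs
* sort first.)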
*/
void sortLostFiles(List<String> files) {
// TODO: We should first fix the files that lose more blocks
Comparator<String> comp = new Comparator<String>() {
public int compare(String p1, String p2) {
Codec c1 = null;
Codec c2 = null;
for (Codec codec : Codec.getCodecs()) {
// Check both paths independently: an else-if would leave c2 unset
// when p1 and p2 are parity files of the same codec.
if (isParityFile(p1, codec)) {
c1 = codec;
}
if (isParityFile(p2, codec)) {
c2 = codec;
}
}
if (c1 == null && c2 == null) {
return 0; // both are source files
}
if (c1 == null && c2 != null) {
return -1; // only p1 is a source file
}
if (c2 == null && c1 != null) {
return 1; // only p2 is a source file
}
return c2.priority - c1.priority; // descending order
}
};
Collections.sort(files, comp);
}
/**
* Returns the DistributedFileSystem hosting the supplied path, unwrapping
* a FilterFileSystem if needed; returns null if the path is not on HDFS.
*/
protected DistributedFileSystem getDFS(Path p) throws IOException {
FileSystem fs = p.getFileSystem(getConf());
DistributedFileSystem dfs = null;
if (fs instanceof DistributedFileSystem) {
dfs = (DistributedFileSystem) fs;
} else if (fs instanceof FilterFileSystem) {
FilterFileSystem ffs = (FilterFileSystem) fs;
if (ffs.getRawFileSystem() instanceof DistributedFileSystem) {
dfs = (DistributedFileSystem) ffs.getRawFileSystem();
}
}
return dfs;
}
/**
* Throws an exception when blocks have lost checksums or stripes.
*/
void checkLostBlocks(List<Block> blocksLostChecksum,
List<Block> blocksLostStripe, Path p, Codec codec)
throws IOException {
StringBuilder message = new StringBuilder();
if (blocksLostChecksum.size() > 0) {
message.append("Lost " + blocksLostChecksum.size() +
" checksums in blocks:");
for (Block blk : blocksLostChecksum) {
message.append(" ");
message.append(blk.toString());
}
}
if (blocksLostStripe.size() > 0) {
if (message.length() > 0) {
message.append(" ");
}
message.append("Lost " + blocksLostStripe.size() +
" stripes in blocks:");
for (Block blk : blocksLostStripe) {
message.append(" ");
message.append(blk.toString());
}
}
if (message.length() == 0) {
return;
}
message.append(" in file " + p);
throw new IOException(message.toString());
}
private boolean abortReconstruction(Long oldCRC, Decoder decoder) {
// A missing checksum aborts reconstruction when a checksum store is
// configured, unless the codec is simulated file-level raid (we assume
// only XOR and RS) that does not require checksum verification; only
// such codecs are allowed to lose checksums.
return oldCRC == null && decoder.checksumStore != null &&
(decoder.codec.isDirRaid ||
!decoder.codec.simulateBlockFix ||
decoder.requiredChecksumVerification);
}
/**
* Reads through a source file reconstructing lost blocks on the way.
* @param srcPath Path identifying the lost file.
* @return true if file was reconstructed, false if no reconstruction
* was necessary or possible.
* @throws IOException
*/
public boolean processFile(Path srcPath, ParityFilePair parityPair,
Decoder decoder, boolean fromStripeStore, Context context)
throws IOException, InterruptedException {
LOG.info("Processing file " + srcPath);
Progressable progress = context;
if (progress == null) {
progress = RaidUtils.NULL_PROGRESSABLE;
}
DistributedFileSystem srcFs = getDFS(srcPath);
FileStatus srcStat = srcFs.getFileStatus(srcPath);
long blockSize = srcStat.getBlockSize();
long srcFileSize = srcStat.getLen();
String uriPath = srcPath.toUri().getPath();
int numBlocksReconstructed = 0;
List<LocatedBlockWithMetaInfo> lostBlocks = lostBlocksInFile(srcFs,
uriPath, srcStat);
if (lostBlocks.size() == 0) {
LOG.warn("Couldn't find any lost blocks in file " + srcPath +
", ignoring...");
return false;
}
List<Block> blocksLostChecksum = new ArrayList<Block>();
List<Block> blocksLostStripe = new ArrayList<Block>();
for (LocatedBlockWithMetaInfo lb: lostBlocks) {
Block lostBlock = lb.getBlock();
long lostBlockOffset = lb.getStartOffset();
LOG.info("Found lost block " + lostBlock +
", offset " + lostBlockOffset);
Long oldCRC = decoder.retrieveChecksum(lostBlock,
srcPath, lostBlockOffset, srcFs, context);
if (abortReconstruction(oldCRC, decoder)) {
blocksLostChecksum.add(lostBlock);
continue;
}
StripeInfo si = decoder.retrieveStripe(lostBlock,
srcPath, lostBlockOffset, srcFs, context, false);
if (si == null && decoder.stripeStore != null) {
blocksLostStripe.add(lostBlock);
continue;
}
final long blockContentsSize =
Math.min(blockSize, srcFileSize - lostBlockOffset);
File localBlockFile =
File.createTempFile(lostBlock.getBlockName(), ".tmp");
localBlockFile.deleteOnExit();
try {
CRC32 crc = null;
if (fromStripeStore) {
crc = decoder.recoverBlockToFileFromStripeInfo(srcFs, srcPath,
lostBlock, localBlockFile, blockSize,
lostBlockOffset, blockContentsSize, si, context);
} else {
crc = decoder.recoverBlockToFile(srcFs, srcStat,
parityPair.getFileSystem(),
parityPair.getPath(), blockSize,
lostBlockOffset, localBlockFile,
blockContentsSize, si, context);
}
LOG.info("Recovered crc: " + ((crc == null)?null: crc.getValue()) +
" expected crc:" + oldCRC);
if (crc != null && oldCRC != null &&
crc.getValue() != oldCRC) {
// checksum doesn't match, it's dangerous to send it
IOException ioe = new IOException("Block " + lostBlock.toString() +
" new checksum " + crc.getValue() +
" doesn't match the old one " + oldCRC);
LogUtils.logRaidReconstructionMetrics(LOGRESULTS.FAILURE, 0,
decoder.codec, srcPath, lostBlockOffset,
LOGTYPES.OFFLINE_RECONSTRUCTION_CHECKSUM_VERIFICATION,
srcFs, ioe, context);
throw ioe;
}
// Now that we have recovered the file block locally, send it.
computeMetadataAndSendReconstructedBlock(localBlockFile,
lostBlock, blockContentsSize,
lb.getLocations(),
lb.getDataProtocolVersion(),
lb.getNamespaceID(),
progress);
numBlocksReconstructed++;
} finally {
localBlockFile.delete();
}
progress.progress();
}
LOG.info("Reconstructed " + numBlocksReconstructed + " blocks in " + srcPath);
checkLostBlocks(blocksLostChecksum, blocksLostStripe, srcPath, decoder.codec);
return true;
}
/**
* Reads through a parity file, reconstructing lost blocks on the way.
* This function uses the corresponding source file to regenerate parity
* file blocks.
* @return true if file was reconstructed, false if no reconstruction
* was necessary or possible.
*/
boolean processParityFile(Path parityPath, Decoder decoder,
Context context)
throws IOException, InterruptedException {
LOG.info("Processing parity file " + parityPath);
Progressable progress = context;
if (progress == null) {
progress = RaidUtils.NULL_PROGRESSABLE;
}
DistributedFileSystem parityFs = getDFS(parityPath);
Path srcPath = RaidUtils.sourcePathFromParityPath(parityPath, parityFs);
if (srcPath == null) {
LOG.warn("Could not get regular file corresponding to parity file " +
parityPath + ", ignoring...");
return false;
}
DistributedFileSystem srcFs = getDFS(srcPath);
FileStatus parityStat = parityFs.getFileStatus(parityPath);
long blockSize = parityStat.getBlockSize();
FileStatus srcStat = srcFs.getFileStatus(srcPath);
// Check timestamp.
if (srcStat.getModificationTime() != parityStat.getModificationTime()) {
LOG.warn("Mismatching timestamp for " + srcPath + " and " + parityPath +
", ignoring...");
return false;
}
String uriPath = parityPath.toUri().getPath();
int numBlocksReconstructed = 0;
List<LocatedBlockWithMetaInfo> lostBlocks =
lostBlocksInFile(parityFs, uriPath, parityStat);
if (lostBlocks.size() == 0) {
LOG.warn("Couldn't find any lost blocks in parity file " + parityPath +
", ignoring...");
return false;
}
List<Block> blocksLostChecksum = new ArrayList<Block>();
List<Block> blocksLostStripe = new ArrayList<Block>();
for (LocatedBlockWithMetaInfo lb: lostBlocks) {
Block lostBlock = lb.getBlock();
long lostBlockOffset = lb.getStartOffset();
LOG.info("Found lost block " + lostBlock +
", offset " + lostBlockOffset);
Long oldCRC = decoder.retrieveChecksum(lostBlock,
parityPath, lostBlockOffset, parityFs, context);
if (abortReconstruction(oldCRC, decoder)) {
blocksLostChecksum.add(lostBlock);
continue;
}
StripeInfo si = decoder.retrieveStripe(lostBlock,
srcPath, lostBlockOffset, srcFs, context, false);
if (si == null && decoder.stripeStore != null) {
blocksLostStripe.add(lostBlock);
continue;
}
File localBlockFile =
File.createTempFile(lostBlock.getBlockName(), ".tmp");
localBlockFile.deleteOnExit();
try {
CRC32 crc = decoder.recoverParityBlockToFile(srcFs, srcStat, parityFs,
parityPath, blockSize, lostBlockOffset, localBlockFile, si,
context);
LOG.info("Recovered crc: " + ((crc == null)?null: crc.getValue()) +
" expected crc:" + oldCRC);
if (crc != null && oldCRC != null &&
crc.getValue() != oldCRC) {
// checksum doesn't match, it's dangerous to send it
IOException ioe = new IOException("Block " + lostBlock.toString()
+ " new checksum " + crc.getValue()
+ " doesn't match the old one " + oldCRC);
LogUtils.logRaidReconstructionMetrics(LOGRESULTS.FAILURE, 0,
decoder.codec, parityPath, lostBlockOffset,
LOGTYPES.OFFLINE_RECONSTRUCTION_CHECKSUM_VERIFICATION,
parityFs, ioe, context);
throw ioe;
}
// Now that we have recovered the parity file block locally, send it.
computeMetadataAndSendReconstructedBlock(
localBlockFile,
lostBlock, blockSize, lb.getLocations(),
lb.getDataProtocolVersion(), lb.getNamespaceID(),
progress);
numBlocksReconstructed++;
} finally {
localBlockFile.delete();
}
progress.progress();
}
LOG.info("Reconstructed " + numBlocksReconstructed + " blocks in " + parityPath);
checkLostBlocks(blocksLostChecksum, blocksLostStripe, parityPath, decoder.codec);
return true;
}
/**
* Reads through a parity HAR part file, reconstructing lost blocks on the way.
* A HAR block can contain many file blocks, as long as the HAR part file
* block size is a multiple of the file block size.
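* (For example, a 1 GB part-file block with a 256 MB parity-file block
* size covers four parity blocks, each recovered in sequence from its
* source file; the sizes are illustrative.)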
* @return true if file was reconstructed, false if no reconstruction
* was necessary or possible.
*/
boolean processParityHarPartFile(Path partFile,
Progressable progress)
throws IOException {
LOG.info("Processing parity HAR file " + partFile);
// Get some basic information.
DistributedFileSystem dfs = getDFS(partFile);
FileStatus partFileStat = dfs.getFileStatus(partFile);
long partFileBlockSize = partFileStat.getBlockSize();
LOG.info(partFile + " has block size " + partFileBlockSize);
// Find the path to the index file.
// Parity file HARs are only one level deep, so the index file is at the
// same level as the part file.
// Parse the HAR index file.
HarIndex harIndex = HarIndex.getHarIndex(dfs, partFile);
String uriPath = partFile.toUri().getPath();
int numBlocksReconstructed = 0;
List<LocatedBlockWithMetaInfo> lostBlocks = lostBlocksInFile(dfs, uriPath,
partFileStat);
if (lostBlocks.size() == 0) {
LOG.warn("Couldn't find any lost blocks in HAR file " + partFile +
", ignoring...");
return false;
}
for (LocatedBlockWithMetaInfo lb: lostBlocks) {
Block lostBlock = lb.getBlock();
long lostBlockOffset = lb.getStartOffset();
File localBlockFile =
File.createTempFile(lostBlock.getBlockName(), ".tmp");
localBlockFile.deleteOnExit();
try {
processParityHarPartBlock(dfs, partFile,
lostBlockOffset, partFileStat, harIndex,
localBlockFile, progress);
// Now that we have recovered the part file block locally, send it.
computeMetadataAndSendReconstructedBlock(localBlockFile,
lostBlock,
localBlockFile.length(),
lb.getLocations(),
lb.getDataProtocolVersion(), lb.getNamespaceID(),
progress);
numBlocksReconstructed++;
} finally {
localBlockFile.delete();
}
progress.progress();
}
LOG.info("Reconstructed " + numBlocksReconstructed + " blocks in " + partFile);
return true;
}
/**
* This reconstructs a single part file block by recovering in sequence each
* parity block in the part file block.
*/
private void processParityHarPartBlock(FileSystem dfs, Path partFile,
long blockOffset,
FileStatus partFileStat,
HarIndex harIndex,
File localBlockFile,
Progressable progress)
throws IOException {
String partName = partFile.toUri().getPath(); // Full path; trimmed below.
partName = partName.substring(1 + partName.lastIndexOf(Path.SEPARATOR));
OutputStream out = new FileOutputStream(localBlockFile);
try {
// A HAR part file block could map to several parity files. We need to
// use all of them to recover this block.
final long blockEnd = Math.min(blockOffset +
partFileStat.getBlockSize(),
partFileStat.getLen());
for (long offset = blockOffset; offset < blockEnd; ) {
HarIndex.IndexEntry entry = harIndex.findEntry(partName, offset);
if (entry == null) {
String msg = "Lost index file has no matching index entry for " +
partName + ":" + offset;
LOG.warn(msg);
throw new IOException(msg);
}
Path parityFile = new Path(entry.fileName);
Encoder encoder = null;
for (Codec codec : Codec.getCodecs()) {
if (isParityFile(parityFile, codec)) {
encoder = new Encoder(getConf(), codec);
}
}
if (encoder == null) {
String msg = "Could not figure out codec correctly for " + parityFile;
LOG.warn(msg);
throw new IOException(msg);
}
Path srcFile = RaidUtils.sourcePathFromParityPath(parityFile, dfs);
if (null == srcFile) {
String msg = "Can not find the source path for parity file: " +
parityFile;
LOG.warn(msg);
throw new IOException(msg);
}
FileStatus srcStat = dfs.getFileStatus(srcFile);
if (srcStat.getModificationTime() != entry.mtime) {
String msg = "Modification times of " + parityFile + " and " +
srcFile + " do not match.";
LOG.warn(msg);
throw new IOException(msg);
}
long lostOffsetInParity = offset - entry.startOffset;
LOG.info(partFile + ":" + offset + " maps to " +
parityFile + ":" + lostOffsetInParity +
" and will be recovered from " + srcFile);
encoder.recoverParityBlockToStream(dfs, srcStat,
srcStat.getBlockSize(), parityFile,
lostOffsetInParity, out, progress);
// Finished recovery of one parity block. Since a parity block has the
// same size as a source block, we can move offset by source block
// size.
offset += srcStat.getBlockSize();
LOG.info("Recovered " + srcStat.getBlockSize() + " part file bytes ");
if (offset > blockEnd) {
String msg =
"Recovered block spills across part file blocks. Cannot continue";
throw new IOException(msg);
}
progress.progress();
}
} finally {
out.close();
}
}
/**
* Choose a datanode (hostname:port). The datanode is chosen at
* random from the live datanodes.
* @param locationsToAvoid locations to avoid.
* @return A chosen datanode.
* @throws IOException
*/
private DatanodeInfo chooseDatanode(DatanodeInfo[] locationsToAvoid,
DatanodeInfo[] live)
throws IOException {
LOG.info("Choosing a datanode from " + live.length +
" live nodes while avoiding " + locationsToAvoid.length);
Random rand = new Random();
DatanodeInfo chosen = null;
int maxAttempts = 1000;
for (int i = 0; i < maxAttempts && chosen == null; i++) {
int idx = rand.nextInt(live.length);
chosen = live[idx];
for (DatanodeInfo avoid: locationsToAvoid) {
if (chosen.equals(avoid)) {
LOG.info("Avoiding " + avoid.name);
chosen = null;
break;
}
}
}
if (chosen == null) {
throw new IOException("Could not choose datanode");
}
LOG.info("Choosing datanode " + chosen.name);
return chosen;
}
private DatanodeInfo chooseDatanode(DatanodeInfo[] locationsToAvoid)
throws IOException {
DistributedFileSystem dfs = getDFS(new Path("/"));
DatanodeInfo[] live =
dfs.getClient().datanodeReport(DatanodeReportType.LIVE);
return chooseDatanode(locationsToAvoid, live);
}
/**
* Reads data from the data stream provided and computes metadata.
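* <p>The returned stream holds the on-disk metadata layout: a two-byte
* format version, the {@link DataChecksum} header, and one 4-byte CRC32
* value per {@code io.bytes.per.checksum} chunk. For example, with the
* default of 512 bytes per checksum, a 1000-byte block yields two CRC32
* entries, the second covering the 488-byte trailing partial chunk.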
*/
DataInputStream computeMetadata(Configuration conf, InputStream dataStream)
throws IOException {
ByteArrayOutputStream mdOutBase = new ByteArrayOutputStream(1024*1024);
DataOutputStream mdOut = new DataOutputStream(mdOutBase);
// First, write out the version.
mdOut.writeShort(FSDataset.FORMAT_VERSION_NON_INLINECHECKSUM);
// Create a summer and write out its header.
int bytesPerChecksum = conf.getInt("io.bytes.per.checksum", 512);
DataChecksum sum =
DataChecksum.newDataChecksum(DataChecksum.CHECKSUM_CRC32,
bytesPerChecksum);
sum.writeHeader(mdOut);
// Buffer to read in a chunk of data.
byte[] buf = new byte[bytesPerChecksum];
// Buffer to store the checksum bytes.
byte[] chk = new byte[sum.getChecksumSize()];
// Read data till we reach the end of the input stream.
int bytesSinceFlush = 0;
while (true) {
// Read some bytes.
int bytesRead = dataStream.read(buf, bytesSinceFlush,
bytesPerChecksum - bytesSinceFlush);
if (bytesRead == -1) {
if (bytesSinceFlush > 0) {
boolean reset = true;
sum.writeValue(chk, 0, reset); // This also resets the sum.
// Write the checksum to the stream.
mdOut.write(chk, 0, chk.length);
bytesSinceFlush = 0;
}
break;
}
// Update the checksum.
sum.update(buf, bytesSinceFlush, bytesRead);
bytesSinceFlush += bytesRead;
// Flush the checksum if necessary.
if (bytesSinceFlush == bytesPerChecksum) {
boolean reset = true;
sum.writeValue(chk, 0, reset); // This also resets the sum.
// Write the checksum to the stream.
mdOut.write(chk, 0, chk.length);
bytesSinceFlush = 0;
}
}
byte[] mdBytes = mdOutBase.toByteArray();
return new DataInputStream(new ByteArrayInputStream(mdBytes));
}
private void computeMetadataAndSendReconstructedBlock(
File localBlockFile,
Block block, long blockSize,
DatanodeInfo[] locations,
int dataTransferVersion,
int namespaceId,
Progressable progress)
throws IOException {
LOG.info("Computing metdata");
FileInputStream blockContents = null;
DataInputStream blockMetadata = null;
try {
blockContents = new FileInputStream(localBlockFile);
blockMetadata = computeMetadata(getConf(), blockContents);
blockContents.close();
progress.progress();
DatanodeInfo datanode = null;
DistributedFileSystem dfs = getDFS(new Path("/"));
DatanodeInfo[] live =
dfs.getClient().datanodeReport(DatanodeReportType.LIVE);
for (int retry = 0; retry < SEND_BLOCK_MAX_RETRIES; ++retry) {
try {
datanode = chooseDatanode(locations, live);
// Reopen the block contents and rewind the metadata stream so a retry
// never reuses a half-consumed stream.
blockContents = new FileInputStream(localBlockFile);
blockMetadata.reset();
sendReconstructedBlock(datanode.name, blockContents, blockMetadata, block,
blockSize, dataTransferVersion, namespaceId, progress);
return;
} catch (IOException ex) {
if (retry == SEND_BLOCK_MAX_RETRIES - 1) {
// last retry, rethrow the exception
throw ex;
}
// Log a warning and retry.
LOG.warn("Got exception when sending the reconstructed block to datanode " +
datanode + ", retried " + retry + " times.", ex);
// Add the failed node to the locations to avoid.
DatanodeInfo[] newLocations = new DatanodeInfo[locations.length + 1];
System.arraycopy(locations, 0, newLocations, 0, locations.length);
newLocations[locations.length] = datanode;
locations = newLocations;
}
}
} finally {
if (blockContents != null) {
blockContents.close();
blockContents = null;
}
if (blockMetadata != null) {
blockMetadata.close();
blockMetadata = null;
}
}
}
/**
* Send a generated block to a datanode.
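* <p>The transfer mimics a regular client write: the stream carries the
* data-transfer version and {@code OP_WRITE_BLOCK} opcode, then the
* {@link WriteBlockHeader}, then the block data and checksums streamed by
* {@link BlockSender}.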
* @param datanode Chosen datanode name in host:port form.
* @param blockContents Stream with the block contents.
* @param metadataIn Stream with the block metadata (version and checksums).
* @param block Block object identifying the block to be sent.
* @param blockSize size of the block.
* @param dataTransferVersion the data transfer version
* @param namespaceId namespace id the block belongs to
* @throws IOException
*/
private void sendReconstructedBlock(String datanode,
final FileInputStream blockContents,
final DataInputStream metadataIn,
Block block, long blockSize,
int dataTransferVersion, int namespaceId,
Progressable progress)
throws IOException {
InetSocketAddress target = NetUtils.createSocketAddr(datanode);
Socket sock = SocketChannel.open().socket();
int readTimeout =
getConf().getInt(BlockIntegrityMonitor.BLOCKFIX_READ_TIMEOUT,
HdfsConstants.READ_TIMEOUT);
NetUtils.connect(sock, target, readTimeout);
sock.setSoTimeout(readTimeout);
int writeTimeout = getConf().getInt(BlockIntegrityMonitor.BLOCKFIX_WRITE_TIMEOUT,
HdfsConstants.WRITE_TIMEOUT);
OutputStream baseStream = NetUtils.getOutputStream(sock, writeTimeout);
DataOutputStream out =
new DataOutputStream(new BufferedOutputStream(baseStream,
FSConstants.
SMALL_BUFFER_SIZE));
boolean corruptChecksumOk = false;
boolean chunkOffsetOK = false;
boolean verifyChecksum = true;
boolean transferToAllowed = false;
try {
LOG.info("Sending block " + block +
" from " + sock.getLocalSocketAddress().toString() +
" to " + sock.getRemoteSocketAddress().toString());
BlockSender blockSender =
new BlockSender(namespaceId, block, blockSize, 0, blockSize,
corruptChecksumOk, chunkOffsetOK, verifyChecksum,
transferToAllowed, dataTransferVersion >= DataTransferProtocol.PACKET_INCLUDE_VERSION_VERSION,
new BlockWithChecksumFileReader.InputStreamWithChecksumFactory() {
@Override
public InputStream createStream(long offset) throws IOException {
// we are passing 0 as the offset above,
// so we can safely ignore
// the offset passed
return blockContents;
}
@Override
public DataInputStream getChecksumStream() throws IOException {
return metadataIn;
}
@Override
public BlockDataFile.Reader getBlockDataFileReader()
throws IOException {
return BlockDataFile.getDummyDataFileFromFileChannel(
blockContents.getChannel()).getReader(null);
}
});
WriteBlockHeader header = new WriteBlockHeader(new VersionAndOpcode(
dataTransferVersion, DataTransferProtocol.OP_WRITE_BLOCK));
header.set(namespaceId, block.getBlockId(), block.getGenerationStamp(),
0, false, true, new DatanodeInfo(), 0, null, "");
header.writeVersionAndOpCode(out);
header.write(out);
blockSender.sendBlock(out, baseStream, null, progress);
LOG.info("Sent block " + block + " to " + datanode);
} finally {
// Close the buffered stream before the socket so pending data is flushed.
out.close();
sock.close();
}
}
/**
* Returns the lost blocks in a file.
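* Subclasses define what "lost" means: {@link CorruptBlockReconstructor}
* selects corrupt or location-less blocks, while
* {@link DecommissioningBlockReconstructor} selects blocks whose replicas
* all reside on decommissioning nodes.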
*/
abstract List<LocatedBlockWithMetaInfo> lostBlocksInFile(
DistributedFileSystem fs,
String uriPath, FileStatus stat)
throws IOException;
/**
* This class implements corrupt block fixing functionality.
*/
public static class CorruptBlockReconstructor extends BlockReconstructor {
public CorruptBlockReconstructor(Configuration conf) throws IOException {
super(conf);
}
List<LocatedBlockWithMetaInfo> lostBlocksInFile(DistributedFileSystem fs,
String uriPath,
FileStatus stat)
throws IOException {
List<LocatedBlockWithMetaInfo> corrupt =
new LinkedList<LocatedBlockWithMetaInfo>();
VersionedLocatedBlocks locatedBlocks;
int namespaceId = 0;
int methodFingerprint = 0;
if (DFSClient.isMetaInfoSuppoted(fs.getClient().namenodeProtocolProxy)) {
LocatedBlocksWithMetaInfo lbksm = fs.getClient().namenode.
openAndFetchMetaInfo(uriPath, 0, stat.getLen());
namespaceId = lbksm.getNamespaceID();
locatedBlocks = lbksm;
methodFingerprint = lbksm.getMethodFingerPrint();
fs.getClient().getNewNameNodeIfNeeded(methodFingerprint);
} else {
locatedBlocks = fs.getClient().namenode.open(uriPath, 0, stat.getLen());
}
final int dataTransferVersion = locatedBlocks.getDataProtocolVersion();
for (LocatedBlock b: locatedBlocks.getLocatedBlocks()) {
if (b.isCorrupt() ||
(b.getLocations().length == 0 && b.getBlockSize() > 0)) {
corrupt.add(new LocatedBlockWithMetaInfo(b.getBlock(),
b.getLocations(), b.getStartOffset(),
dataTransferVersion, namespaceId, methodFingerprint));
}
}
return corrupt;
}
}
/**
* This class implements decommissioning block copying functionality.
*/
public static class DecommissioningBlockReconstructor extends BlockReconstructor {
public DecommissioningBlockReconstructor(Configuration conf) throws IOException {
super(conf);
}
List<LocatedBlockWithMetaInfo> lostBlocksInFile(DistributedFileSystem fs,
String uriPath,
FileStatus stat)
throws IOException {
List<LocatedBlockWithMetaInfo> decommissioning =
new LinkedList<LocatedBlockWithMetaInfo>();
VersionedLocatedBlocks locatedBlocks;
int namespaceId = 0;
int methodFingerprint = 0;
if (DFSClient.isMetaInfoSuppoted(fs.getClient().namenodeProtocolProxy)) {
LocatedBlocksWithMetaInfo lbksm = fs.getClient().namenode.
openAndFetchMetaInfo(uriPath, 0, stat.getLen());
namespaceId = lbksm.getNamespaceID();
locatedBlocks = lbksm;
methodFingerprint = lbksm.getMethodFingerPrint();
fs.getClient().getNewNameNodeIfNeeded(methodFingerprint);
} else {
locatedBlocks = fs.getClient().namenode.open(uriPath, 0, stat.getLen());
}
final int dataTransferVersion = locatedBlocks.getDataProtocolVersion();
for (LocatedBlock b : locatedBlocks.getLocatedBlocks()) {
if (b.isCorrupt() ||
(b.getLocations().length == 0 && b.getBlockSize() > 0)) {
// If corrupt, this block is the responsibility of the CorruptBlockReconstructor
continue;
}
// Copy this block iff all good copies are being decommissioned
boolean allDecommissioning = true;
for (DatanodeInfo i : b.getLocations()) {
allDecommissioning &= i.isDecommissionInProgress();
}
if (allDecommissioning) {
decommissioning.add(new LocatedBlockWithMetaInfo(b.getBlock(),
b.getLocations(), b.getStartOffset(),
dataTransferVersion, namespaceId, methodFingerprint));
}
}
return decommissioning;
}
}
}