package org.apache.hadoop.hdfs; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import org.apache.commons.logging.Log; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.DatanodeID; import org.apache.hadoop.hdfs.protocol.LocatedBlock; import org.apache.hadoop.hdfs.server.datanode.BlockRecord; import org.apache.hadoop.hdfs.server.datanode.DataNode; import org.apache.hadoop.hdfs.server.datanode.DataNode.BlockRecoveryTimeoutException; import org.apache.hadoop.hdfs.server.datanode.SyncBlock; import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryInfo; import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration; import org.apache.hadoop.hdfs.server.protocol.InterDatanodeProtocol; public class BlockRecoveryCoordinator { final private Log LOG; final private Configuration conf; final private int socketTimeout; final private DataNode dn; final private SyncBlock blockSyncer; final private DatanodeRegistration dnr; public BlockRecoveryCoordinator(Log lOG, Configuration conf, int socketTimeout, DataNode dn, SyncBlock blockSyncer, DatanodeRegistration dnr) { LOG = lOG; this.conf = conf; this.socketTimeout = socketTimeout; this.dn = dn; this.blockSyncer = blockSyncer; this.dnr = dnr; } /** * Recover a block * * @param keepLength * if true, will only recover replicas that have the same length as * the block passed in. Otherwise, will calculate the minimum length * of the replicas and truncate the rest to that length. **/ public LocatedBlock recoverBlock(int namespaceId, Block block, boolean keepLength, DatanodeID[] datanodeids, boolean closeFile, long deadline) throws IOException { int errorCount = 0; // Number of "replicasBeingWritten" in 0.21 parlance - these are replicas // on DNs that are still alive from when the write was happening int rbwCount = 0; // Number of "replicasWaitingRecovery" in 0.21 parlance - these replicas // have survived a DN restart, and thus might be truncated (eg if the // DN died because of a machine power failure, and when the ext3 journal // replayed, it truncated the file int rwrCount = 0; List<BlockRecord> blockRecords = new ArrayList<BlockRecord>(); List<InterDatanodeProtocol> datanodeProxies = new ArrayList<InterDatanodeProtocol>(); // check generation stamps for (DatanodeID id : datanodeids) { try { InterDatanodeProtocol datanode; if (dnr!= null && dnr.equals(id)) { LOG.info("Skipping IDNPP creation for local id " + id + " when recovering " + block); datanode = dn; } else { LOG.info("Creating IDNPP for non-local id " + id + " (dnReg=" + dnr + ") when recovering " + block); datanode = DataNode.createInterDataNodeProtocolProxy(id, conf, socketTimeout); datanodeProxies.add(datanode); } BlockSyncer.throwIfAfterTime(deadline); BlockRecoveryInfo info = datanode .startBlockRecovery(namespaceId, block); if (info == null) { LOG.info("No block metadata found for block " + block + " on datanode " + id); continue; } if (info.getBlock().getGenerationStamp() < block.getGenerationStamp()) { LOG.info("Only old generation stamp " + info.getBlock().getGenerationStamp() + " found on datanode " + id + " (needed block=" + block + ")"); continue; } blockRecords.add(new BlockRecord(id, datanode, info)); if (info.wasRecoveredOnStartup()) { rwrCount++; } else { rbwCount++; } } catch (BlockRecoveryTimeoutException e) { throw e; } catch (IOException e) { ++errorCount; InterDatanodeProtocol.LOG.warn( "Failed to getBlockMetaDataInfo for block (=" + block + ") from datanode (=" + id + ")", e); } } // If we *only* have replicas from post-DN-restart, then we should // include them in determining length. Otherwise they might cause us // to truncate too short. boolean shouldRecoverRwrs = (rbwCount == 0); List<BlockRecord> syncList = new ArrayList<BlockRecord>(); long minlength = Long.MAX_VALUE; for (BlockRecord record : blockRecords) { BlockRecoveryInfo info = record.info; assert (info != null && info.getBlock().getGenerationStamp() >= block .getGenerationStamp()); if (!shouldRecoverRwrs && info.wasRecoveredOnStartup()) { LOG.info("Not recovering replica " + record + " since it was recovered on " + "startup and we have better replicas"); continue; } if (keepLength) { if (info.getBlock().getNumBytes() == block.getNumBytes()) { syncList.add(record); } } else { syncList.add(record); if (info.getBlock().getNumBytes() < minlength) { minlength = info.getBlock().getNumBytes(); } } } if (syncList.isEmpty() && errorCount > 0) { DataNode.stopAllProxies(datanodeProxies); throw new IOException("All datanodes failed: block=" + block + ", datanodeids=" + Arrays.asList(datanodeids)); } if (!keepLength) { block.setNumBytes(minlength); } return blockSyncer.syncBlock(block, syncList, closeFile, datanodeProxies, deadline); } }