/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hdfs.server.namenode; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Random; import java.util.TreeMap; import java.util.TreeSet; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.BlockListAsLongs; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.LocatedBlock; import org.apache.hadoop.hdfs.server.common.HdfsConstants.BlockUCState; import org.apache.hadoop.hdfs.server.common.HdfsConstants.ReplicaState; import org.apache.hadoop.hdfs.server.namenode.FSNamesystem.NumberReplicas; import org.apache.hadoop.hdfs.server.namenode.UnderReplicatedBlocks.BlockIterator; import org.apache.hadoop.hdfs.DFSConfigKeys; /** * Keeps information related to the blocks stored in the Hadoop cluster. * This class is a helper class for {@link FSNamesystem} and requires several * methods to be called with lock held on {@link FSNamesystem}. */ @InterfaceAudience.Private public class BlockManager { // Default initial capacity and load factor of map public static final int DEFAULT_INITIAL_MAP_CAPACITY = 16; public static final float DEFAULT_MAP_LOAD_FACTOR = 0.75f; public static final int DEFAULT_MAX_CORRUPT_FILES_RETURNED = 500; private final FSNamesystem namesystem; volatile long pendingReplicationBlocksCount = 0L; volatile long corruptReplicaBlocksCount = 0L; volatile long underReplicatedBlocksCount = 0L; volatile long scheduledReplicationBlocksCount = 0L; volatile long excessBlocksCount = 0L; volatile long pendingDeletionBlocksCount = 0L; // // Mapping: Block -> { INode, datanodes, self ref } // Updated only in response to client-sent information. // final BlocksMap blocksMap; // // Store blocks-->datanodedescriptor(s) map of corrupt replicas // CorruptReplicasMap corruptReplicas = new CorruptReplicasMap(); // // Keeps a Collection for every named machine containing // blocks that have recently been invalidated and are thought to live // on the machine in question. // Mapping: StorageID -> ArrayList<Block> // Map<String, Collection<Block>> recentInvalidateSets = new TreeMap<String, Collection<Block>>(); // // Keeps a TreeSet for every named node. Each treeset contains // a list of the blocks that are "extra" at that location. We'll // eventually remove these extras. 
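  // Entries are added by addToExcessReplicate() and cleared again in removeStoredBlock().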
// Mapping: StorageID -> TreeSet<Block> // Map<String, Collection<Block>> excessReplicateMap = new TreeMap<String, Collection<Block>>(); // // Store set of Blocks that need to be replicated 1 or more times. // We also store pending replication-orders. // UnderReplicatedBlocks neededReplications = new UnderReplicatedBlocks(); private PendingReplicationBlocks pendingReplications; // The maximum number of replicas allowed for a block int maxReplication; // How many outgoing replication streams a given node should have at one time int maxReplicationStreams; // Minimum copies needed or else write is disallowed int minReplication; // Default number of replicas int defaultReplication; // How many entries are returned by getCorruptInodes() int maxCorruptFilesReturned; // variable to enable check for enough racks boolean shouldCheckForEnoughRacks = true; /** * Last block index used for replication work. */ private int replIndex = 0; private long missingBlocksInCurIter = 0; private long missingBlocksInPrevIter = 0; Random r = new Random(); // for block replicas placement BlockPlacementPolicy replicator; BlockManager(FSNamesystem fsn, Configuration conf) throws IOException { this(fsn, conf, DEFAULT_INITIAL_MAP_CAPACITY); } BlockManager(FSNamesystem fsn, Configuration conf, int capacity) throws IOException { namesystem = fsn; pendingReplications = new PendingReplicationBlocks( conf.getInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_PENDING_TIMEOUT_SEC_KEY, DFSConfigKeys.DFS_NAMENODE_REPLICATION_PENDING_TIMEOUT_SEC_DEFAULT) * 1000L); setConfigurationParameters(conf); blocksMap = new BlocksMap(capacity, DEFAULT_MAP_LOAD_FACTOR); } void setConfigurationParameters(Configuration conf) throws IOException { this.replicator = BlockPlacementPolicy.getInstance( conf, namesystem, namesystem.clusterMap); this.maxCorruptFilesReturned = conf.getInt("dfs.corruptfilesreturned.max", DEFAULT_MAX_CORRUPT_FILES_RETURNED); this.defaultReplication = conf.getInt("dfs.replication", 3); this.maxReplication = conf.getInt("dfs.replication.max", 512); this.minReplication = conf.getInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_KEY, DFSConfigKeys.DFS_NAMENODE_REPLICATION_MIN_DEFAULT); if (minReplication <= 0) throw new IOException( "Unexpected configuration parameters: dfs.namenode.replication.min = " + minReplication + " must be greater than 0"); if (maxReplication >= (int)Short.MAX_VALUE) throw new IOException( "Unexpected configuration parameters: dfs.replication.max = " + maxReplication + " must be less than " + (Short.MAX_VALUE)); if (maxReplication < minReplication) throw new IOException( "Unexpected configuration parameters: dfs.namenode.replication.min = " + minReplication + " must be less than dfs.replication.max = " + maxReplication); this.maxReplicationStreams = conf.getInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_KEY, DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_DEFAULT); this.shouldCheckForEnoughRacks = conf.get(DFSConfigKeys.NET_TOPOLOGY_SCRIPT_FILE_NAME_KEY) == null ? 
false : true; FSNamesystem.LOG.info("defaultReplication = " + defaultReplication); FSNamesystem.LOG.info("maxReplication = " + maxReplication); FSNamesystem.LOG.info("minReplication = " + minReplication); FSNamesystem.LOG.info("maxReplicationStreams = " + maxReplicationStreams); FSNamesystem.LOG.info("shouldCheckForEnoughRacks = " + shouldCheckForEnoughRacks); } void activate() { pendingReplications.start(); } void close() { if (pendingReplications != null) pendingReplications.stop(); } void metaSave(PrintWriter out) { // // Dump contents of neededReplication // synchronized (neededReplications) { out.println("Metasave: Blocks waiting for replication: " + neededReplications.size()); for (Block block : neededReplications) { List<DatanodeDescriptor> containingNodes = new ArrayList<DatanodeDescriptor>(); NumberReplicas numReplicas = new NumberReplicas(); // source node returned is not used chooseSourceDatanode(block, containingNodes, numReplicas); int usableReplicas = numReplicas.liveReplicas() + numReplicas.decommissionedReplicas(); if (block instanceof BlockInfo) { String fileName = ((BlockInfo)block).getINode().getFullPathName(); out.print(fileName + ": "); } // l: == live:, d: == decommissioned c: == corrupt e: == excess out.print(block + ((usableReplicas > 0)? "" : " MISSING") + " (replicas:" + " l: " + numReplicas.liveReplicas() + " d: " + numReplicas.decommissionedReplicas() + " c: " + numReplicas.corruptReplicas() + " e: " + numReplicas.excessReplicas() + ") "); Collection<DatanodeDescriptor> corruptNodes = corruptReplicas.getNodes(block); for (Iterator<DatanodeDescriptor> jt = blocksMap.nodeIterator(block); jt.hasNext();) { DatanodeDescriptor node = jt.next(); String state = ""; if (corruptNodes != null && corruptNodes.contains(node)) { state = "(corrupt)"; } else if (node.isDecommissioned() || node.isDecommissionInProgress()) { state = "(decommissioned)"; } out.print(" " + node + state + " : "); } out.println(""); } } // // Dump blocks from pendingReplication // pendingReplications.metaSave(out); // // Dump blocks that are waiting to be deleted // dumpRecentInvalidateSets(out); } /** * @param block * @return true if the block has minimum replicas */ boolean checkMinReplication(Block block) { return (countNodes(block).liveReplicas() >= minReplication); } /** * Commit a block of a file * * @param fileINode file inode * @param block block to be committed * @param commitBlock - contains client reported block length and generation * @throws IOException if the block does not have at least a minimal number * of replicas reported from data-nodes. */ private void commitBlock(INodeFileUnderConstruction fileINode, BlockInfoUnderConstruction block, Block commitBlock) throws IOException { if (block.getBlockUCState() == BlockUCState.COMMITTED) return; assert block.getNumBytes() <= commitBlock.getNumBytes() : "commitBlock length is less than the stored one " + commitBlock.getNumBytes() + " vs. 
" + block.getNumBytes(); block.commitBlock(commitBlock); // Adjust disk space consumption if required long diff = fileINode.getPreferredBlockSize() - commitBlock.getNumBytes(); if (diff > 0) { try { String path = /* For finding parents */ namesystem.leaseManager.findPath(fileINode); namesystem.dir.updateSpaceConsumed(path, 0, -diff * fileINode.getReplication()); } catch (IOException e) { FSNamesystem.LOG .warn("Unexpected exception while updating disk space : " + e.getMessage()); } } } /** * Commit the last block of the file and mark it as complete if it has * meets the minimum replication requirement * * @param fileINode file inode * @param commitBlock - contains client reported block length and generation * @throws IOException if the block does not have at least a minimal number * of replicas reported from data-nodes. */ void commitOrCompleteLastBlock(INodeFileUnderConstruction fileINode, Block commitBlock) throws IOException { if(commitBlock == null) return; // not committing, this is a block allocation retry BlockInfo lastBlock = fileINode.getLastBlock(); if(lastBlock == null) return; // no blocks in file yet if(lastBlock.isComplete()) return; // already completed (e.g. by syncBlock) commitBlock(fileINode, (BlockInfoUnderConstruction)lastBlock, commitBlock); if(countNodes(lastBlock).liveReplicas() >= minReplication) completeBlock(fileINode,fileINode.numBlocks()-1); } /** * Convert a specified block of the file to a complete block. * @param fileINode file * @param blkIndex block index in the file * @throws IOException if the block does not have at least a minimal number * of replicas reported from data-nodes. */ BlockInfo completeBlock(INodeFile fileINode, int blkIndex) throws IOException { if(blkIndex < 0) return null; BlockInfo curBlock = fileINode.getBlocks()[blkIndex]; if(curBlock.isComplete()) return curBlock; BlockInfoUnderConstruction ucBlock = (BlockInfoUnderConstruction)curBlock; if(ucBlock.numNodes() < minReplication) throw new IOException("Cannot complete block: " + "block does not satisfy minimal replication requirement."); BlockInfo completeBlock = ucBlock.convertToCompleteBlock(); // replace penultimate block in file fileINode.setBlock(blkIndex, completeBlock); // replace block in the blocksMap return blocksMap.replaceBlock(completeBlock); } BlockInfo completeBlock(INodeFile fileINode, BlockInfo block) throws IOException { BlockInfo[] fileBlocks = fileINode.getBlocks(); for(int idx = 0; idx < fileBlocks.length; idx++) if(fileBlocks[idx] == block) { return completeBlock(fileINode, idx); } return block; } /** * Convert the last block of the file to an under construction block. 
* @param fileINode file * @param targets data-nodes that will form the pipeline for this block */ void convertLastBlockToUnderConstruction( INodeFileUnderConstruction fileINode, DatanodeDescriptor[] targets) throws IOException { BlockInfo oldBlock = fileINode.getLastBlock(); if(oldBlock == null) return; BlockInfoUnderConstruction ucBlock = fileINode.setLastBlock(oldBlock, targets); blocksMap.replaceBlock(ucBlock); } /** * Get all valid locations of the block */ ArrayList<String> getValidLocations(Block block) { ArrayList<String> machineSet = new ArrayList<String>(blocksMap.numNodes(block)); for(Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(block); it.hasNext();) { String storageID = it.next().getStorageID(); // filter invalidate replicas Collection<Block> blocks = recentInvalidateSets.get(storageID); if(blocks==null || !blocks.contains(block)) { machineSet.add(storageID); } } return machineSet; } List<LocatedBlock> getBlockLocations(BlockInfo[] blocks, long offset, long length, int nrBlocksToReturn) throws IOException { int curBlk = 0; long curPos = 0, blkSize = 0; int nrBlocks = (blocks[0].getNumBytes() == 0) ? 0 : blocks.length; for (curBlk = 0; curBlk < nrBlocks; curBlk++) { blkSize = blocks[curBlk].getNumBytes(); assert blkSize > 0 : "Block of size 0"; if (curPos + blkSize > offset) { break; } curPos += blkSize; } if (nrBlocks > 0 && curBlk == nrBlocks) // offset >= end of file return Collections.<LocatedBlock>emptyList(); long endOff = offset + length; List<LocatedBlock> results = new ArrayList<LocatedBlock>(blocks.length); do { results.add(getBlockLocation(blocks[curBlk], curPos)); curPos += blocks[curBlk].getNumBytes(); curBlk++; } while (curPos < endOff && curBlk < blocks.length && results.size() < nrBlocksToReturn); return results; } /** @return a LocatedBlock for the given block */ LocatedBlock getBlockLocation(final BlockInfo blk, final long pos ) throws IOException { if (!blk.isComplete()) { final BlockInfoUnderConstruction uc = (BlockInfoUnderConstruction)blk; final DatanodeDescriptor[] locations = uc.getExpectedLocations(); return namesystem.createLocatedBlock(uc, locations, pos, false); } // get block locations final int numCorruptNodes = countNodes(blk).corruptReplicas(); final int numCorruptReplicas = corruptReplicas.numCorruptReplicas(blk); if (numCorruptNodes != numCorruptReplicas) { FSNamesystem.LOG.warn("Inconsistent number of corrupt replicas for " + blk + " blockMap has " + numCorruptNodes + " but corrupt replicas map has " + numCorruptReplicas); } final int numNodes = blocksMap.numNodes(blk); final boolean isCorrupt = numCorruptNodes == numNodes; final int numMachines = isCorrupt ? numNodes: numNodes - numCorruptNodes; final DatanodeDescriptor[] machines = new DatanodeDescriptor[numMachines]; if (numMachines > 0) { int j = 0; for(Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(blk); it.hasNext();) { final DatanodeDescriptor d = it.next(); final boolean replicaCorrupt = corruptReplicas.isReplicaCorrupt(blk, d); if (isCorrupt || (!isCorrupt && !replicaCorrupt)) machines[j++] = d; } } return namesystem.createLocatedBlock(blk, machines, pos, isCorrupt); } /** * Check whether the replication parameter is within the range * determined by system configuration. */ void verifyReplication(String src, short replication, String clientName) throws IOException { if (replication >= minReplication && replication <= maxReplication) { //common case. avoid building 'text' return; } String text = "file " + src + ((clientName != null) ? 
" on client " + clientName : "") + ".\n" + "Requested replication " + replication; if (replication > maxReplication) throw new IOException(text + " exceeds maximum " + maxReplication); if (replication < minReplication) throw new IOException(text + " is less than the required minimum " + minReplication); } void removeFromInvalidates(String datanodeId, Block block) { Collection<Block> v = recentInvalidateSets.get(datanodeId); if (v != null && v.remove(block)) { pendingDeletionBlocksCount--; if (v.isEmpty()) { recentInvalidateSets.remove(datanodeId); } } } /** * Adds block to list of blocks which will be invalidated on specified * datanode * * @param b block * @param dn datanode * @param log true to create an entry in the log */ void addToInvalidates(Block b, DatanodeInfo dn, boolean log) { Collection<Block> invalidateSet = recentInvalidateSets .get(dn.getStorageID()); if (invalidateSet == null) { invalidateSet = new HashSet<Block>(); recentInvalidateSets.put(dn.getStorageID(), invalidateSet); } if (invalidateSet.add(b)) { pendingDeletionBlocksCount++; if (log) { NameNode.stateChangeLog.info("BLOCK* NameSystem.addToInvalidates: " + b + " to " + dn.getName()); } } } /** * Adds block to list of blocks which will be invalidated on specified * datanode and log the operation * * @param b block * @param dn datanode */ void addToInvalidates(Block b, DatanodeInfo dn) { addToInvalidates(b, dn, true); } /** * Adds block to list of blocks which will be invalidated on all its * datanodes. */ private void addToInvalidates(Block b) { StringBuilder datanodes = new StringBuilder(); for (Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(b); it .hasNext();) { DatanodeDescriptor node = it.next(); addToInvalidates(b, node, false); datanodes.append(node.getName()).append(" "); } if (datanodes.length() != 0) { NameNode.stateChangeLog.info("BLOCK* NameSystem.addToInvalidates: " + b + " to " + datanodes.toString()); } } /** * dumps the contents of recentInvalidateSets */ private void dumpRecentInvalidateSets(PrintWriter out) { int size = recentInvalidateSets.values().size(); out.println("Metasave: Blocks " + pendingDeletionBlocksCount + " waiting deletion from " + size + " datanodes."); if (size == 0) { return; } for(Map.Entry<String,Collection<Block>> entry : recentInvalidateSets.entrySet()) { Collection<Block> blocks = entry.getValue(); if (blocks.size() > 0) { out.println(namesystem.getDatanode(entry.getKey()).getName() + blocks); } } } void findAndMarkBlockAsCorrupt(Block blk, DatanodeInfo dn) throws IOException { BlockInfo storedBlock = getStoredBlock(blk); if (storedBlock == null) { // Check if the replica is in the blockMap, if not // ignore the request for now. This could happen when BlockScanner // thread of Datanode reports bad block before Block reports are sent // by the Datanode on startup NameNode.stateChangeLog.info("BLOCK* NameSystem.markBlockAsCorrupt: " + "block " + blk + " could not be marked as " + "corrupt as it does not exist in blocksMap"); return; } markBlockAsCorrupt(storedBlock, dn); } private void markBlockAsCorrupt(BlockInfo storedBlock, DatanodeInfo dn) throws IOException { assert storedBlock != null : "storedBlock should not be null"; DatanodeDescriptor node = namesystem.getDatanode(dn); if (node == null) { throw new IOException("Cannot mark block " + storedBlock.getBlockName() + " as corrupt because datanode " + dn.getName() + " does not exist. 
"); } INodeFile inode = storedBlock.getINode(); if (inode == null) { NameNode.stateChangeLog.info("BLOCK NameSystem.markBlockAsCorrupt: " + "block " + storedBlock + " could not be marked as corrupt as it" + " does not belong to any file"); addToInvalidates(storedBlock, node); return; } // Add replica to the data-node if it is not already there node.addBlock(storedBlock); // Add this replica to corruptReplicas Map corruptReplicas.addToCorruptReplicasMap(storedBlock, node); if (countNodes(storedBlock).liveReplicas() > inode.getReplication()) { // the block is over-replicated so invalidate the replicas immediately invalidateBlock(storedBlock, node); } else { // add the block to neededReplication updateNeededReplications(storedBlock, -1, 0); } } /** * Invalidates the given block on the given datanode. */ private void invalidateBlock(Block blk, DatanodeInfo dn) throws IOException { NameNode.stateChangeLog.info("DIR* NameSystem.invalidateBlock: " + blk + " on " + dn.getName()); DatanodeDescriptor node = namesystem.getDatanode(dn); if (node == null) { throw new IOException("Cannot invalidate block " + blk + " because datanode " + dn.getName() + " does not exist."); } // Check how many copies we have of the block. If we have at least one // copy on a live node, then we can delete it. int count = countNodes(blk).liveReplicas(); if (count > 1) { addToInvalidates(blk, dn); removeStoredBlock(blk, node); NameNode.stateChangeLog.debug("BLOCK* NameSystem.invalidateBlocks: " + blk + " on " + dn.getName() + " listed for deletion."); } else { NameNode.stateChangeLog.info("BLOCK* NameSystem.invalidateBlocks: " + blk + " on " + dn.getName() + " is the only copy and was not deleted."); } } void updateState() { pendingReplicationBlocksCount = pendingReplications.size(); underReplicatedBlocksCount = neededReplications.size(); corruptReplicaBlocksCount = corruptReplicas.size(); } /** * Schedule blocks for deletion at datanodes * @param nodesToProcess number of datanodes to schedule deletion work * @return total number of block for deletion */ int computeInvalidateWork(int nodesToProcess) { int numOfNodes = recentInvalidateSets.size(); nodesToProcess = Math.min(numOfNodes, nodesToProcess); // TODO should using recentInvalidateSets be synchronized? // get an array of the keys ArrayList<String> keyArray = new ArrayList<String>(recentInvalidateSets.keySet()); // randomly pick up <i>nodesToProcess</i> nodes // and put them at [0, nodesToProcess) int remainingNodes = numOfNodes - nodesToProcess; if (nodesToProcess < remainingNodes) { for(int i=0; i<nodesToProcess; i++) { int keyIndex = r.nextInt(numOfNodes-i)+i; Collections.swap(keyArray, keyIndex, i); // swap to front } } else { for(int i=0; i<remainingNodes; i++) { int keyIndex = r.nextInt(numOfNodes-i); Collections.swap(keyArray, keyIndex, numOfNodes-i-1); // swap to end } } int blockCnt = 0; for(int nodeCnt = 0; nodeCnt < nodesToProcess; nodeCnt++ ) { blockCnt += invalidateWorkForOneNode(keyArray.get(nodeCnt)); } return blockCnt; } /** * Scan blocks in {@link #neededReplications} and assign replication * work to data-nodes they belong to. * * The number of process blocks equals either twice the number of live * data-nodes or the number of under-replicated blocks whichever is less. * * @return number of blocks scheduled for replication during this iteration. 
   */
  int computeReplicationWork(int blocksToProcess) throws IOException {
    // Choose the blocks to be replicated
    List<List<Block>> blocksToReplicate =
      chooseUnderReplicatedBlocks(blocksToProcess);

    // replicate blocks
    int scheduledReplicationCount = 0;
    for (int i=0; i<blocksToReplicate.size(); i++) {
      for(Block block : blocksToReplicate.get(i)) {
        if (computeReplicationWorkForBlock(block, i)) {
          scheduledReplicationCount++;
        }
      }
    }
    return scheduledReplicationCount;
  }

  /**
   * Get a list of block lists to be replicated. The index of a block list
   * in the returned list represents its replication priority.
   *
   * @param blocksToProcess the maximum number of blocks to choose
   * @return a list of block lists to be replicated, indexed by replication
   *         priority
   */
  private List<List<Block>> chooseUnderReplicatedBlocks(int blocksToProcess) {
    // initialize data structure for the return value
    List<List<Block>> blocksToReplicate = new ArrayList<List<Block>>(
        UnderReplicatedBlocks.LEVEL);
    for (int i = 0; i < UnderReplicatedBlocks.LEVEL; i++) {
      blocksToReplicate.add(new ArrayList<Block>());
    }
    synchronized (namesystem) {
      synchronized (neededReplications) {
        if (neededReplications.size() == 0) {
          missingBlocksInCurIter = 0;
          missingBlocksInPrevIter = 0;
          return blocksToReplicate;
        }

        // Go through all blocks that need replications.
        BlockIterator neededReplicationsIterator = neededReplications
            .iterator();
        // skip to the first unprocessed block, which is at replIndex
        for (int i = 0; i < replIndex && neededReplicationsIterator.hasNext(); i++) {
          neededReplicationsIterator.next();
        }
        // # of blocks to process equals either twice the number of live
        // data-nodes or the number of under-replicated blocks whichever is less
        blocksToProcess = Math.min(blocksToProcess, neededReplications.size());

        for (int blkCnt = 0; blkCnt < blocksToProcess; blkCnt++, replIndex++) {
          if (!neededReplicationsIterator.hasNext()) {
            // start from the beginning
            replIndex = 0;
            missingBlocksInPrevIter = missingBlocksInCurIter;
            missingBlocksInCurIter = 0;
            blocksToProcess = Math.min(blocksToProcess, neededReplications
                .size());
            if (blkCnt >= blocksToProcess)
              break;
            neededReplicationsIterator = neededReplications.iterator();
            assert neededReplicationsIterator.hasNext() :
              "neededReplications should not be empty.";
          }

          Block block = neededReplicationsIterator.next();
          int priority = neededReplicationsIterator.getPriority();
          if (priority < 0 || priority >= blocksToReplicate.size()) {
            FSNamesystem.LOG.warn("Unexpected replication priority: "
                + priority + " " + block);
          } else {
            blocksToReplicate.get(priority).add(block);
          }
        } // end for
      } // end synchronized neededReplications
    } // end synchronized namesystem
    return blocksToReplicate;
  }

  /** Replicate a block
   *
   * @param block block to be replicated
   * @param priority a hint of its priority in the neededReplications queue
   * @return whether the block was scheduled for replication
   */
  private boolean computeReplicationWorkForBlock(Block block, int priority) {
    int requiredReplication, numEffectiveReplicas;
    List<DatanodeDescriptor> containingNodes;
    DatanodeDescriptor srcNode;
    INodeFile fileINode = null;
    int additionalReplRequired;

    synchronized (namesystem) {
      synchronized (neededReplications) {
        // block should belong to a file
        fileINode = blocksMap.getINode(block);
        // abandoned block or block reopened for append
        if(fileINode == null || fileINode.isUnderConstruction()) {
          neededReplications.remove(block, priority); // remove from neededReplications
          replIndex--;
          return false;
        }
        requiredReplication = fileINode.getReplication();

        // get a source data-node
        containingNodes = new
ArrayList<DatanodeDescriptor>(); NumberReplicas numReplicas = new NumberReplicas(); srcNode = chooseSourceDatanode(block, containingNodes, numReplicas); if ((numReplicas.liveReplicas() + numReplicas.decommissionedReplicas()) <= 0) { missingBlocksInCurIter++; } if(srcNode == null) // block can not be replicated from any node return false; // do not schedule more if enough replicas is already pending numEffectiveReplicas = numReplicas.liveReplicas() + pendingReplications.getNumReplicas(block); if (numEffectiveReplicas >= requiredReplication) { if ( (pendingReplications.getNumReplicas(block) > 0) || (blockHasEnoughRacks(block)) ) { neededReplications.remove(block, priority); // remove from neededReplications replIndex--; NameNode.stateChangeLog.info("BLOCK* " + "Removing block " + block + " from neededReplications as it has enough replicas."); return false; } } if (numReplicas.liveReplicas() < requiredReplication) { additionalReplRequired = requiredReplication - numEffectiveReplicas; } else { additionalReplRequired = 1; //Needed on a new rack } } } // choose replication targets: NOT HOLDING THE GLOBAL LOCK // It is costly to extract the filename for which chooseTargets is called, // so for now we pass in the Inode itself. DatanodeDescriptor targets[] = replicator.chooseTarget(fileINode, additionalReplRequired, srcNode, containingNodes, block.getNumBytes()); if(targets.length == 0) return false; synchronized (namesystem) { synchronized (neededReplications) { // Recheck since global lock was released // block should belong to a file fileINode = blocksMap.getINode(block); // abandoned block or block reopened for append if(fileINode == null || fileINode.isUnderConstruction()) { neededReplications.remove(block, priority); // remove from neededReplications replIndex--; return false; } requiredReplication = fileINode.getReplication(); // do not schedule more if enough replicas is already pending NumberReplicas numReplicas = countNodes(block); numEffectiveReplicas = numReplicas.liveReplicas() + pendingReplications.getNumReplicas(block); if (numEffectiveReplicas >= requiredReplication) { if ( (pendingReplications.getNumReplicas(block) > 0) || (blockHasEnoughRacks(block)) ) { neededReplications.remove(block, priority); // remove from neededReplications replIndex--; NameNode.stateChangeLog.info("BLOCK* " + "Removing block " + block + " from neededReplications as it has enough replicas."); return false; } } if ( (numReplicas.liveReplicas() >= requiredReplication) && (!blockHasEnoughRacks(block)) ) { if (srcNode.getNetworkLocation().equals(targets[0].getNetworkLocation())) { //No use continuing, unless a new rack in this case return false; } } // Add block to the to be replicated list srcNode.addBlockToBeReplicated(block, targets); for (DatanodeDescriptor dn : targets) { dn.incBlocksScheduled(); } // Move the block-replication into a "pending" state. // The reason we use 'pending' is so we can retry // replications that fail after an appropriate amount of time. 
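        // Timed-out requests are re-queued into neededReplications by processPendingReplications().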
pendingReplications.add(block, targets.length); NameNode.stateChangeLog.debug( "BLOCK* block " + block + " is moved from neededReplications to pendingReplications"); // remove from neededReplications if(numEffectiveReplicas + targets.length >= requiredReplication) { neededReplications.remove(block, priority); // remove from neededReplications replIndex--; } if (NameNode.stateChangeLog.isInfoEnabled()) { StringBuilder targetList = new StringBuilder("datanode(s)"); for (int k = 0; k < targets.length; k++) { targetList.append(' '); targetList.append(targets[k].getName()); } NameNode.stateChangeLog.info( "BLOCK* ask " + srcNode.getName() + " to replicate " + block + " to " + targetList); NameNode.stateChangeLog.debug( "BLOCK* neededReplications = " + neededReplications.size() + " pendingReplications = " + pendingReplications.size()); } } } return true; } /** * Parse the data-nodes the block belongs to and choose one, * which will be the replication source. * * We prefer nodes that are in DECOMMISSION_INPROGRESS state to other nodes * since the former do not have write traffic and hence are less busy. * We do not use already decommissioned nodes as a source. * Otherwise we choose a random node among those that did not reach their * replication limit. * * In addition form a list of all nodes containing the block * and calculate its replication numbers. */ private DatanodeDescriptor chooseSourceDatanode( Block block, List<DatanodeDescriptor> containingNodes, NumberReplicas numReplicas) { containingNodes.clear(); DatanodeDescriptor srcNode = null; int live = 0; int decommissioned = 0; int corrupt = 0; int excess = 0; Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(block); Collection<DatanodeDescriptor> nodesCorrupt = corruptReplicas.getNodes(block); while(it.hasNext()) { DatanodeDescriptor node = it.next(); Collection<Block> excessBlocks = excessReplicateMap.get(node.getStorageID()); if ((nodesCorrupt != null) && (nodesCorrupt.contains(node))) corrupt++; else if (node.isDecommissionInProgress() || node.isDecommissioned()) decommissioned++; else if (excessBlocks != null && excessBlocks.contains(block)) { excess++; } else { live++; } containingNodes.add(node); // Check if this replica is corrupt // If so, do not select the node as src node if ((nodesCorrupt != null) && nodesCorrupt.contains(node)) continue; if(node.getNumberOfBlocksToBeReplicated() >= maxReplicationStreams) continue; // already reached replication limit // the block must not be scheduled for removal on srcNode if(excessBlocks != null && excessBlocks.contains(block)) continue; // never use already decommissioned nodes if(node.isDecommissioned()) continue; // we prefer nodes that are in DECOMMISSION_INPROGRESS state if(node.isDecommissionInProgress() || srcNode == null) { srcNode = node; continue; } if(srcNode.isDecommissionInProgress()) continue; // switch to a different node randomly // this to prevent from deterministically selecting the same node even // if the node failed to replicate the block on previous iterations if(r.nextBoolean()) srcNode = node; } if(numReplicas != null) numReplicas.initialize(live, decommissioned, corrupt, excess); return srcNode; } /** * If there were any replication requests that timed out, reap them * and put them back into the neededReplication queue */ void processPendingReplications() { Block[] timedOutItems = pendingReplications.getTimedOutBlocks(); if (timedOutItems != null) { synchronized (namesystem) { for (int i = 0; i < timedOutItems.length; i++) { NumberReplicas num = 
countNodes(timedOutItems[i]); if (isNeededReplication(timedOutItems[i], getReplication(timedOutItems[i]), num.liveReplicas())) { neededReplications.add(timedOutItems[i], num.liveReplicas(), num.decommissionedReplicas(), getReplication(timedOutItems[i])); } } } /* If we know the target datanodes where the replication timedout, * we could invoke decBlocksScheduled() on it. Its ok for now. */ } } /** * The given node is reporting all its blocks. Use this info to * update the (machine-->blocklist) and (block-->machinelist) tables. */ public void processReport(DatanodeDescriptor node, BlockListAsLongs report) throws IOException { // // Modify the (block-->datanode) map, according to the difference // between the old and new block report. // Collection<Block> toAdd = new LinkedList<Block>(); Collection<Block> toRemove = new LinkedList<Block>(); Collection<Block> toInvalidate = new LinkedList<Block>(); Collection<BlockInfo> toCorrupt = new LinkedList<BlockInfo>(); node.reportDiff(this, report, toAdd, toRemove, toInvalidate, toCorrupt); for (Block b : toRemove) { removeStoredBlock(b, node); } for (Block b : toAdd) { addStoredBlock(b, node, null); } for (Block b : toInvalidate) { NameNode.stateChangeLog.info("BLOCK* NameSystem.processReport: block " + b + " on " + node.getName() + " size " + b.getNumBytes() + " does not belong to any file."); addToInvalidates(b, node); } for (BlockInfo b : toCorrupt) { markBlockAsCorrupt(b, node); } } /** * Modify (block-->datanode) map. Remove block from set of * needed replications if this takes care of the problem. * @return the block that is stored in blockMap. */ private Block addStoredBlock(final Block block, DatanodeDescriptor node, DatanodeDescriptor delNodeHint) throws IOException { BlockInfo storedBlock = blocksMap.getStoredBlock(block); if (storedBlock == null || storedBlock.getINode() == null) { // If this block does not belong to anyfile, then we are done. NameNode.stateChangeLog.info("BLOCK* NameSystem.addStoredBlock: " + "addStoredBlock request received for " + block + " on " + node.getName() + " size " + block.getNumBytes() + " But it does not belong to any file."); // we could add this block to invalidate set of this datanode. // it will happen in next block report otherwise. return block; } assert storedBlock != null : "Block must be stored by now"; INodeFile fileINode = storedBlock.getINode(); assert fileINode != null : "Block must belong to a file"; // add block to the data-node boolean added = node.addBlock(storedBlock); int curReplicaDelta = 0; if (added) { curReplicaDelta = 1; // // At startup time, because too many new blocks come in // they take up lots of space in the log file. // So, we log only when namenode is out of safemode. // if (!namesystem.isInSafeMode()) { NameNode.stateChangeLog.info("BLOCK* NameSystem.addStoredBlock: " + "blockMap updated: " + node.getName() + " is added to " + storedBlock + " size " + storedBlock.getNumBytes()); } } else { NameNode.stateChangeLog.warn("BLOCK* NameSystem.addStoredBlock: " + "Redundant addStoredBlock request received for " + storedBlock + " on " + node.getName() + " size " + storedBlock.getNumBytes()); } // filter out containingNodes that are marked for decommission. 
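    // countNodes() separates corrupt, excess and decommissioned replicas out of the live count.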
    NumberReplicas num = countNodes(storedBlock);
    int numLiveReplicas = num.liveReplicas();
    int numCurrentReplica = numLiveReplicas
      + pendingReplications.getNumReplicas(storedBlock);

    if(storedBlock.getBlockUCState() == BlockUCState.COMMITTED &&
        numLiveReplicas >= minReplication)
      storedBlock = completeBlock(fileINode, storedBlock);

    // check whether safe replication is reached for the block
    // only complete blocks are counted towards that
    if(storedBlock.isComplete())
      namesystem.incrementSafeBlockCount(numCurrentReplica);

    // if file is under construction, then check whether the block
    // can be completed
    if (fileINode.isUnderConstruction()) {
      return storedBlock;
    }

    // do not handle mis-replicated blocks during startup
    if (namesystem.isInSafeMode())
      return storedBlock;

    // handle underReplication/overReplication
    short fileReplication = fileINode.getReplication();
    if (!isNeededReplication(storedBlock, fileReplication, numCurrentReplica)) {
      neededReplications.remove(storedBlock, numCurrentReplica,
          num.decommissionedReplicas, fileReplication);
    } else {
      updateNeededReplications(storedBlock, curReplicaDelta, 0);
    }
    if (numCurrentReplica > fileReplication) {
      processOverReplicatedBlock(storedBlock, fileReplication, node, delNodeHint);
    }

    // If the file replication has reached desired value
    // we can remove any corrupt replicas the block may have
    int corruptReplicasCount = corruptReplicas.numCorruptReplicas(storedBlock);
    int numCorruptNodes = num.corruptReplicas();
    if (numCorruptNodes != corruptReplicasCount) {
      FSNamesystem.LOG.warn("Inconsistent number of corrupt replicas for " +
          storedBlock + " blockMap has " + numCorruptNodes +
          " but corrupt replicas map has " + corruptReplicasCount);
    }
    if ((corruptReplicasCount > 0) && (numLiveReplicas >= fileReplication))
      invalidateCorruptReplicas(storedBlock);
    return storedBlock;
  }

  /**
   * Invalidate corrupt replicas.
   * <p>
   * This will remove the replicas from the block's location list,
   * add them to {@link #recentInvalidateSets} so that they could be further
   * deleted from the respective data-nodes,
   * and remove the block from corruptReplicasMap.
   * <p>
   * This method should be called when the block has sufficient
   * number of live replicas.
   *
   * @param blk Block whose corrupt replicas need to be invalidated
   */
  private void invalidateCorruptReplicas(Block blk) {
    Collection<DatanodeDescriptor> nodes = corruptReplicas.getNodes(blk);
    boolean gotException = false;
    if (nodes == null)
      return;
    // make a copy of the array of nodes in order to avoid
    // ConcurrentModificationException, when the block is removed from the node
    DatanodeDescriptor[] nodesCopy = nodes.toArray(new DatanodeDescriptor[0]);
    for (DatanodeDescriptor node : nodesCopy) {
      try {
        invalidateBlock(blk, node);
      } catch (IOException e) {
        NameNode.stateChangeLog.info("NameNode.invalidateCorruptReplicas " +
                                     "error in deleting bad block " + blk +
                                     " on " + node + e);
        gotException = true;
      }
    }
    // Remove the block from corruptReplicasMap
    if (!gotException)
      corruptReplicas.removeFromCorruptReplicasMap(blk);
  }

  /**
   * For each block in the name-node verify whether it belongs to any file
   * and whether it is over- or under-replicated. Place it into the
   * respective queue.
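   * Blocks that do not belong to any file are added to the invalidates list.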
*/ void processMisReplicatedBlocks() { long nrInvalid = 0, nrOverReplicated = 0, nrUnderReplicated = 0; synchronized (namesystem) { neededReplications.clear(); for (BlockInfo block : blocksMap.getBlocks()) { INodeFile fileINode = block.getINode(); if (fileINode == null) { // block does not belong to any file nrInvalid++; addToInvalidates(block); continue; } // calculate current replication short expectedReplication = fileINode.getReplication(); NumberReplicas num = countNodes(block); int numCurrentReplica = num.liveReplicas(); // add to under-replicated queue if need to be if (isNeededReplication(block, expectedReplication, numCurrentReplica)) { if (neededReplications.add(block, numCurrentReplica, num .decommissionedReplicas(), expectedReplication)) { nrUnderReplicated++; } } if (numCurrentReplica > expectedReplication) { // over-replicated block nrOverReplicated++; processOverReplicatedBlock(block, expectedReplication, null, null); } } } FSNamesystem.LOG.info("Total number of blocks = " + blocksMap.size()); FSNamesystem.LOG.info("Number of invalid blocks = " + nrInvalid); FSNamesystem.LOG.info("Number of under-replicated blocks = " + nrUnderReplicated); FSNamesystem.LOG.info("Number of over-replicated blocks = " + nrOverReplicated); } /** * Find how many of the containing nodes are "extra", if any. * If there are any extras, call chooseExcessReplicates() to * mark them in the excessReplicateMap. */ void processOverReplicatedBlock(Block block, short replication, DatanodeDescriptor addedNode, DatanodeDescriptor delNodeHint) { if (addedNode == delNodeHint) { delNodeHint = null; } Collection<DatanodeDescriptor> nonExcess = new ArrayList<DatanodeDescriptor>(); Collection<DatanodeDescriptor> corruptNodes = corruptReplicas .getNodes(block); for (Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(block); it.hasNext();) { DatanodeDescriptor cur = it.next(); Collection<Block> excessBlocks = excessReplicateMap.get(cur .getStorageID()); if (excessBlocks == null || !excessBlocks.contains(block)) { if (!cur.isDecommissionInProgress() && !cur.isDecommissioned()) { // exclude corrupt replicas if (corruptNodes == null || !corruptNodes.contains(cur)) { nonExcess.add(cur); } } } } namesystem.chooseExcessReplicates(nonExcess, block, replication, addedNode, delNodeHint, replicator); } void addToExcessReplicate(DatanodeInfo dn, Block block) { Collection<Block> excessBlocks = excessReplicateMap.get(dn.getStorageID()); if (excessBlocks == null) { excessBlocks = new TreeSet<Block>(); excessReplicateMap.put(dn.getStorageID(), excessBlocks); } if (excessBlocks.add(block)) { excessBlocksCount++; NameNode.stateChangeLog.debug("BLOCK* NameSystem.chooseExcessReplicates:" + " (" + dn.getName() + ", " + block + ") is added to excessReplicateMap"); } } /** * Modify (block-->datanode) map. Possibly generate replication tasks, if the * removed block is still valid. */ void removeStoredBlock(Block block, DatanodeDescriptor node) { NameNode.stateChangeLog.debug("BLOCK* NameSystem.removeStoredBlock: " + block + " from " + node.getName()); synchronized (namesystem) { if (!blocksMap.removeNode(block, node)) { NameNode.stateChangeLog.debug("BLOCK* NameSystem.removeStoredBlock: " + block + " has already been removed from node " + node); return; } // // It's possible that the block was removed because of a datanode // failure. If the block is still valid, check if replication is // necessary. In that case, put block on a possibly-will- // be-replicated list. 
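      // updateNeededReplications() below re-queues the block if it is still under-replicated.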
// INode fileINode = blocksMap.getINode(block); if (fileINode != null) { namesystem.decrementSafeBlockCount(block); updateNeededReplications(block, -1, 0); } // // We've removed a block from a node, so it's definitely no longer // in "excess" there. // Collection<Block> excessBlocks = excessReplicateMap.get(node .getStorageID()); if (excessBlocks != null) { if (excessBlocks.remove(block)) { excessBlocksCount--; NameNode.stateChangeLog.debug("BLOCK* NameSystem.removeStoredBlock: " + block + " is removed from excessBlocks"); if (excessBlocks.size() == 0) { excessReplicateMap.remove(node.getStorageID()); } } } // Remove the replica from corruptReplicas corruptReplicas.removeFromCorruptReplicasMap(block, node); } } /** * The given node is reporting that it received a certain block. */ void addBlock(DatanodeDescriptor node, Block block, String delHint) throws IOException { // decrement number of blocks scheduled to this datanode. node.decBlocksScheduled(); // get the deletion hint node DatanodeDescriptor delHintNode = null; if (delHint != null && delHint.length() != 0) { delHintNode = namesystem.getDatanode(delHint); if (delHintNode == null) { NameNode.stateChangeLog.warn("BLOCK* NameSystem.blockReceived: " + block + " is expected to be removed from an unrecorded node " + delHint); } } // // Modify the blocks->datanode map and node's map. // pendingReplications.remove(block); // blockReceived reports a finalized block Collection<Block> toAdd = new LinkedList<Block>(); Collection<Block> toInvalidate = new LinkedList<Block>(); Collection<BlockInfo> toCorrupt = new LinkedList<BlockInfo>(); node.processReportedBlock(this, block, ReplicaState.FINALIZED, toAdd, toInvalidate, toCorrupt); // the block is only in one of the lists // if it is in none then data-node already has it assert toAdd.size() + toInvalidate.size() <= 1 : "The block should be only in one of the lists."; for (Block b : toAdd) { addStoredBlock(b, node, delHintNode); } for (Block b : toInvalidate) { NameNode.stateChangeLog.info("BLOCK* NameSystem.addBlock: block " + b + " on " + node.getName() + " size " + b.getNumBytes() + " does not belong to any file."); addToInvalidates(b, node); } for (BlockInfo b : toCorrupt) { markBlockAsCorrupt(b, node); } } /** * Return the number of nodes that are live and decommissioned. 
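   * Corrupt and excess replicas are counted separately and excluded from the live count.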
*/ NumberReplicas countNodes(Block b) { int count = 0; int live = 0; int corrupt = 0; int excess = 0; Iterator<DatanodeDescriptor> nodeIter = blocksMap.nodeIterator(b); Collection<DatanodeDescriptor> nodesCorrupt = corruptReplicas.getNodes(b); while (nodeIter.hasNext()) { DatanodeDescriptor node = nodeIter.next(); if ((nodesCorrupt != null) && (nodesCorrupt.contains(node))) { corrupt++; } else if (node.isDecommissionInProgress() || node.isDecommissioned()) { count++; } else { Collection<Block> blocksExcess = excessReplicateMap.get(node.getStorageID()); if (blocksExcess != null && blocksExcess.contains(b)) { excess++; } else { live++; } } } return new NumberReplicas(live, count, corrupt, excess); } private void logBlockReplicationInfo(Block block, DatanodeDescriptor srcNode, NumberReplicas num) { int curReplicas = num.liveReplicas(); int curExpectedReplicas = getReplication(block); INode fileINode = blocksMap.getINode(block); Iterator<DatanodeDescriptor> nodeIter = blocksMap.nodeIterator(block); StringBuilder nodeList = new StringBuilder(); while (nodeIter.hasNext()) { DatanodeDescriptor node = nodeIter.next(); nodeList.append(node.name); nodeList.append(" "); } FSNamesystem.LOG.info("Block: " + block + ", Expected Replicas: " + curExpectedReplicas + ", live replicas: " + curReplicas + ", corrupt replicas: " + num.corruptReplicas() + ", decommissioned replicas: " + num.decommissionedReplicas() + ", excess replicas: " + num.excessReplicas() + ", Is Open File: " + fileINode.isUnderConstruction() + ", Datanodes having this block: " + nodeList + ", Current Datanode: " + srcNode.name + ", Is current datanode decommissioning: " + srcNode.isDecommissionInProgress()); } /** * Return true if there are any blocks on this node that have not * yet reached their replication factor. Otherwise returns false. */ boolean isReplicationInProgress(DatanodeDescriptor srcNode) { boolean status = false; int underReplicatedBlocks = 0; int decommissionOnlyReplicas = 0; int underReplicatedInOpenFiles = 0; final Iterator<? extends Block> it = srcNode.getBlockIterator(); while(it.hasNext()) { final Block block = it.next(); INode fileINode = blocksMap.getINode(block); if (fileINode != null) { NumberReplicas num = countNodes(block); int curReplicas = num.liveReplicas(); int curExpectedReplicas = getReplication(block); if (isNeededReplication(block, curExpectedReplicas, curReplicas)) { if (curExpectedReplicas > curReplicas) { //Log info about one block for this node which needs replication if (!status) { status = true; logBlockReplicationInfo(block, srcNode, num); } underReplicatedBlocks++; if ((curReplicas == 0) && (num.decommissionedReplicas() > 0)) { decommissionOnlyReplicas++; } if (fileINode.isUnderConstruction()) { underReplicatedInOpenFiles++; } } if (!neededReplications.contains(block) && pendingReplications.getNumReplicas(block) == 0) { // // These blocks have been reported from the datanode // after the startDecommission method has been executed. These // blocks were in flight when the decommissioning was started. 
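            // Queue them here so that they get scheduled for re-replication.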
// neededReplications.add(block, curReplicas, num.decommissionedReplicas(), curExpectedReplicas); } } } } srcNode.decommissioningStatus.set(underReplicatedBlocks, decommissionOnlyReplicas, underReplicatedInOpenFiles); return status; } int getActiveBlockCount() { return blocksMap.size() - (int)pendingDeletionBlocksCount; } DatanodeDescriptor[] getNodes(BlockInfo block) { DatanodeDescriptor[] nodes = new DatanodeDescriptor[block.numNodes()]; Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(block); for (int i = 0; it != null && it.hasNext(); i++) { nodes[i] = it.next(); } return nodes; } int getTotalBlocks() { return blocksMap.size(); } void removeBlock(Block block) { addToInvalidates(block); corruptReplicas.removeFromCorruptReplicasMap(block); blocksMap.removeBlock(block); } BlockInfo getStoredBlock(Block block) { return blocksMap.getStoredBlock(block); } /* updates a block in under replication queue */ void updateNeededReplications(Block block, int curReplicasDelta, int expectedReplicasDelta) { synchronized (namesystem) { NumberReplicas repl = countNodes(block); int curExpectedReplicas = getReplication(block); if (isNeededReplication(block, curExpectedReplicas, repl.liveReplicas())) { neededReplications.update(block, repl.liveReplicas(), repl .decommissionedReplicas(), curExpectedReplicas, curReplicasDelta, expectedReplicasDelta); } else { int oldReplicas = repl.liveReplicas()-curReplicasDelta; int oldExpectedReplicas = curExpectedReplicas-expectedReplicasDelta; neededReplications.remove(block, oldReplicas, repl.decommissionedReplicas(), oldExpectedReplicas); } } } void checkReplication(Block block, int numExpectedReplicas) { // filter out containingNodes that are marked for decommission. NumberReplicas number = countNodes(block); if (isNeededReplication(block, numExpectedReplicas, number.liveReplicas())) { neededReplications.add(block, number.liveReplicas(), number.decommissionedReplicas, numExpectedReplicas); } } /* get replication factor of a block */ private int getReplication(Block block) { INodeFile fileINode = blocksMap.getINode(block); if (fileINode == null) { // block does not belong to any file return 0; } assert !fileINode.isDirectory() : "Block cannot belong to a directory."; return fileINode.getReplication(); } /** * Remove a datanode from the invalidatesSet * @param n datanode */ void removeFromInvalidates(String storageID) { Collection<Block> blocks = recentInvalidateSets.remove(storageID); if (blocks != null) { pendingDeletionBlocksCount -= blocks.size(); } } /** * Get blocks to invalidate for <i>nodeId</i> * in {@link #recentInvalidateSets}. * * @return number of blocks scheduled for removal during this iteration. 
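   * At most namesystem.blockInvalidateLimit blocks are scheduled per call.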
*/ private int invalidateWorkForOneNode(String nodeId) { synchronized (namesystem) { // blocks should not be replicated or removed if safe mode is on if (namesystem.isInSafeMode()) return 0; // get blocks to invalidate for the nodeId assert nodeId != null; DatanodeDescriptor dn = namesystem.getDatanode(nodeId); if (dn == null) { removeFromInvalidates(nodeId); return 0; } Collection<Block> invalidateSet = recentInvalidateSets.get(nodeId); if (invalidateSet == null) return 0; ArrayList<Block> blocksToInvalidate = new ArrayList<Block>( namesystem.blockInvalidateLimit); // # blocks that can be sent in one message is limited Iterator<Block> it = invalidateSet.iterator(); for (int blkCount = 0; blkCount < namesystem.blockInvalidateLimit && it.hasNext(); blkCount++) { blocksToInvalidate.add(it.next()); it.remove(); } // If we send everything in this message, remove this node entry if (!it.hasNext()) { removeFromInvalidates(nodeId); } dn.addBlocksToBeInvalidated(blocksToInvalidate); if (NameNode.stateChangeLog.isInfoEnabled()) { StringBuilder blockList = new StringBuilder(); for (Block blk : blocksToInvalidate) { blockList.append(' '); blockList.append(blk); } NameNode.stateChangeLog.info("BLOCK* ask " + dn.getName() + " to delete " + blockList); } pendingDeletionBlocksCount -= blocksToInvalidate.size(); return blocksToInvalidate.size(); } } //Returns the number of racks over which a given block is replicated //decommissioning/decommissioned nodes are not counted. corrupt replicas //are also ignored int getNumberOfRacks(Block b) { HashSet<String> rackSet = new HashSet<String>(0); Collection<DatanodeDescriptor> corruptNodes = corruptReplicas.getNodes(b); for (Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(b); it.hasNext();) { DatanodeDescriptor cur = it.next(); if (!cur.isDecommissionInProgress() && !cur.isDecommissioned()) { if ((corruptNodes == null ) || !corruptNodes.contains(cur)) { String rackName = cur.getNetworkLocation(); if (!rackSet.contains(rackName)) { rackSet.add(rackName); } } } } return rackSet.size(); } boolean blockHasEnoughRacks(Block b) { if (!this.shouldCheckForEnoughRacks) { return true; } boolean enoughRacks = false;; Collection<DatanodeDescriptor> corruptNodes = corruptReplicas.getNodes(b); int numExpectedReplicas = getReplication(b); String rackName = null; for (Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(b); it.hasNext();) { DatanodeDescriptor cur = it.next(); if (!cur.isDecommissionInProgress() && !cur.isDecommissioned()) { if ((corruptNodes == null ) || !corruptNodes.contains(cur)) { if (numExpectedReplicas == 1) { enoughRacks = true; break; } String rackNameNew = cur.getNetworkLocation(); if (rackName == null) { rackName = rackNameNew; } else if (!rackName.equals(rackNameNew)) { enoughRacks = true; break; } } } } return enoughRacks; } boolean isNeededReplication(Block b, int expectedReplication, int curReplicas) { if ((curReplicas >= expectedReplication) && (blockHasEnoughRacks(b))) { return false; } else { return true; } } long getMissingBlocksCount() { // not locking return Math.max(missingBlocksInPrevIter, missingBlocksInCurIter); } BlockInfo addINode(BlockInfo block, INodeFile iNode) { return blocksMap.addINode(block, iNode); } INodeFile getINode(Block b) { return blocksMap.getINode(b); } void removeFromCorruptReplicasMap(Block block) { corruptReplicas.removeFromCorruptReplicasMap(block); } int numCorruptReplicas(Block block) { return corruptReplicas.numCorruptReplicas(block); } void removeBlockFromMap(Block block) { 
blocksMap.removeBlock(block); } int getCapacity() { synchronized(namesystem) { return blocksMap.getCapacity(); } } float getLoadFactor() { return blocksMap.getLoadFactor(); } /** * Return a range of corrupt replica block ids. Up to numExpectedBlocks * blocks starting at the next block after startingBlockId are returned * (fewer if numExpectedBlocks blocks are unavailable). If startingBlockId * is null, up to numExpectedBlocks blocks are returned from the beginning. * If startingBlockId cannot be found, null is returned. * * @param numExpectedBlocks Number of block ids to return. * 0 <= numExpectedBlocks <= 100 * @param startingBlockId Block id from which to start. If null, start at * beginning. * @return Up to numExpectedBlocks blocks from startingBlockId if it exists * */ long[] getCorruptReplicaBlockIds(int numExpectedBlocks, Long startingBlockId) { return corruptReplicas.getCorruptReplicaBlockIds(numExpectedBlocks, startingBlockId); } /** * @return inodes of files with corrupt blocks, with a maximum of * MAX_CORRUPT_FILES_RETURNED inodes listed in total */ INode[] getCorruptInodes() { LinkedHashSet<INode> set = new LinkedHashSet<INode>(); for (Block blk : neededReplications.getQueue( UnderReplicatedBlocks.QUEUE_WITH_CORRUPT_BLOCKS)){ INode inode = blocksMap.getINode(blk); if (inode != null && countNodes(blk).liveReplicas() == 0) { set.add(inode); if (set.size() >= this.maxCorruptFilesReturned) { break; } } } return set.toArray(new INode[set.size()]); } }