/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hdfs.server.namenode; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.util.GSet; import org.apache.hadoop.hdfs.util.LightWeightGSet; /** * Manage node decommissioning. */ class DecommissionManager { static final Log LOG = LogFactory.getLog(DecommissionManager.class); private final FSNamesystem fsnamesystem; DecommissionManager(FSNamesystem namesystem) { this.fsnamesystem = namesystem; } /** Periodically check decommission status. */ class Monitor implements Runnable { /** recheckInterval is how often namenode checks * if a node has finished decommission */ private final long recheckInterval; /** The number of decommission nodes to check for each interval */ private final int numNodesPerCheck; // datanodes that just started decomission, // which has higher priority to be checked next private final LinkedList<DatanodeDescriptor> newlyStarted = new LinkedList<DatanodeDescriptor>(); // datanodes that needs to be checked next private LinkedList<DatanodeDescriptor> toBeChecked = new LinkedList<DatanodeDescriptor>(); // datanodes that just finished check private LinkedList<DatanodeDescriptor> checked = new LinkedList<DatanodeDescriptor>(); // the node is under check private volatile DatanodeDescriptor nodeBeingCheck; // if there was an attempt to stop nodeBeingCheck from decommission private volatile boolean pendingToStopDecommission = false; Monitor(int recheckIntervalInSecond, int numNodesPerCheck) { this.recheckInterval = recheckIntervalInSecond * 1000L; this.numNodesPerCheck = numNodesPerCheck; } /** * Add a datanode that is just marked to start decommission * @param datanode a newly marked decommissioned node * @return true if the node is added */ synchronized boolean startDecommision(DatanodeDescriptor datanode) { if (datanode == null) { throw new IllegalArgumentException( "datanode to be decomissioned can not be null"); } if (nodeBeingCheck == datanode) { pendingToStopDecommission = false; return false; } if (!newlyStarted.contains(datanode) && !toBeChecked.contains(datanode) && !checked.contains(datanode)) { newlyStarted.offer(datanode); notifyAll(); return true; } return false; } /** * Stop a node from decommission by removing it from the queue * @param datanode a datanode * @return true if decommission is stopped; false if it is pending */ synchronized boolean stopDecommission(DatanodeDescriptor datanode) throws IOException { if (datanode == null) { throw new IllegalArgumentException( "datanode to be removed can not be null"); } if (datanode == nodeBeingCheck) { // the node to be stopped decommission is under check // so waiting for it to be done pendingToStopDecommission = true; return false; } if (newlyStarted.remove(datanode) || toBeChecked.remove(datanode)) { checked.remove(datanode); } return true; } /** * Return a list of unchecked blocks on srcNode * * @param srcNode a datanode * @param checkedBlocks all blocks that have been checked * @param numBlocks maximum number of blocks to return * @return a list of blocks to be checked */ private List<Block> fetchBlocks( GSet<Block, Block> checkedBlocks, int numBlocks) { final List<Block> blocksToCheck = new ArrayList<Block>(numBlocks); fsnamesystem.readLock(); try { final Iterator<Block> it = nodeBeingCheck.getBlockIterator(); while (blocksToCheck.size()<numBlocks && it.hasNext()) { final Block block = it.next(); if (!checkedBlocks.contains(block)) { // the block has not been checked blocksToCheck.add(block); } } } finally { fsnamesystem.readUnlock(); } return blocksToCheck; } synchronized private void handlePendingStopDecommission() { if (pendingToStopDecommission) { LOG.info("Stop (delayed) Decommissioning node " + nodeBeingCheck.getName()); nodeBeingCheck.stopDecommission(); pendingToStopDecommission = false; } } /** * Change, if appropriate, the admin state of a datanode to * decommission completed. Return true if decommission is complete. */ private boolean checkDecommissionStateInternal() { fsnamesystem.writeLock(); int numOfBlocks; try { if (!nodeBeingCheck.isDecommissionInProgress()) { return true; } // initialize decominssioning status nodeBeingCheck.decommissioningStatus.set(0, 0, 0); numOfBlocks = nodeBeingCheck.numBlocks(); } finally { fsnamesystem.writeUnlock(); } // // Check to see if all blocks in this decommissioned // node has reached their target replication factor. // // limit the number of scans final int BLOCKS_PER_ITER = 1000; final int numOfBlocksToFetch = Math.max(BLOCKS_PER_ITER, numOfBlocks/5); GSet<Block, Block> checkedBlocks = new LightWeightGSet<Block, Block>(numOfBlocks); List<Block> blocksToCheck; int numBlocksToCheck; do { // get a batch of unchecked blocks blocksToCheck = fetchBlocks(checkedBlocks, numOfBlocksToFetch); numBlocksToCheck = blocksToCheck.size(); for (int i=0; i<numBlocksToCheck; ) { fsnamesystem.writeLock(); try { for (int j=0; j<BLOCKS_PER_ITER && i<numBlocksToCheck; j++, i++) { // check if each block reaches its replication factor or not Block blk = blocksToCheck.get(i); fsnamesystem.isReplicationInProgress(nodeBeingCheck, blk); checkedBlocks.put(blk); } } finally { fsnamesystem.writeUnlock(); } } } while (numBlocksToCheck != 0); fsnamesystem.writeLock(); try { handlePendingStopDecommission(); if (!nodeBeingCheck.isDecommissionInProgress()) { return true; } if (nodeBeingCheck.decommissioningStatus. getUnderReplicatedBlocks() == 0) { nodeBeingCheck.setDecommissioned(); LOG.info("Decommission complete for node " + nodeBeingCheck.getName()); return true; } } finally { fsnamesystem.writeUnlock(); } return false; } /** * Wait for more work to do * @return true if more work to do; false if gets interrupted */ synchronized private boolean waitForWork() { try { if (newlyStarted.isEmpty() && toBeChecked.isEmpty()) { do { wait(); } while (newlyStarted.isEmpty() && toBeChecked.isEmpty()); } else { Thread.sleep(recheckInterval); } return true; } catch (InterruptedException ie) { LOG.info("Interrupted " + this.getClass().getSimpleName(), ie); return false; } } /** * Check decommission status of numNodesPerCheck nodes; * sleep if there is no decommission node; * otherwise wakeup for every recheckInterval milliseconds. */ public void run() { while (fsnamesystem.isRunning() && !Thread.interrupted()) { try { if (waitForWork()) { check(); } else { break; } } catch (Exception e) { LOG.warn("DecommissionManager encounters an error: ", e); } } } /** * Get the next datanode that's decommission in progress */ synchronized private void getDecommissionInProgressNode() { nodeBeingCheck = newlyStarted.poll(); if (nodeBeingCheck == null) { nodeBeingCheck = toBeChecked.poll(); } if (nodeBeingCheck == null) { // all datanodes have been checked; preparing for the next iteration LinkedList<DatanodeDescriptor> tmp = toBeChecked; toBeChecked = checked; checked = tmp; } } /** * Mark the given datanode as just checked * @param datanode */ synchronized private void doneCheck(final boolean isDecommissioned) { if (!isDecommissioned) { // put to checked for next iteration of check checked.add(nodeBeingCheck); } nodeBeingCheck = null; } /** * Check up to numNodesPerCheck decommissioning in progress datanodes to * see if all their blocks are replicated. */ private void check() { for (int i=0; i<numNodesPerCheck; i++) { getDecommissionInProgressNode(); if (nodeBeingCheck == null) { break; } try { boolean isDecommissioned = checkDecommissionStateInternal(); doneCheck(isDecommissioned); } catch(Exception e) { LOG.warn("entry=" + nodeBeingCheck, e); } } } } }