/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hdfs.server.namenode; import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.server.namenode.BlocksMap.BlockInfo; import org.apache.hadoop.hdfs.server.namenode.metrics.FSNamesystemMetrics; import org.apache.hadoop.util.*; import java.io.*; import java.util.*; import java.util.Map.Entry; import java.sql.Time; /*************************************************** * PendingReplicationBlocks does the bookkeeping of all * blocks that are getting replicated. * * It does the following: * 1) record blocks that are getting replicated at this instant. * 2) a coarse grain timer to track age of replication request * 3) a thread that periodically identifies replication-requests * that never made it. * ***************************************************/ class PendingReplicationBlocks { // order the map in insertion order private LinkedHashMap<BlockInfo, PendingBlockInfo> pendingReplications; private ArrayList<BlockInfo> timedOutItems; Daemon timerThread = null; private volatile boolean fsRunning = true; private FSNamesystemMetrics fsnamesystemMetrics; // // It might take anywhere between 5 to 10 minutes before // a request is timed out. // private long timeout = 5 * 60 * 1000; private long defaultRecheckInterval = 5 * 60 * 1000; private int maxBlocksToCheck = 5000; PendingReplicationBlocks(long timeoutPeriod) { this(timeoutPeriod, null); } PendingReplicationBlocks(long timeoutPeriod, FSNamesystemMetrics metrics) { this(timeoutPeriod, 0, null); } PendingReplicationBlocks(long timeoutPeriod, int maxBlocksToCheck, FSNamesystemMetrics metrics) { if ( timeoutPeriod > 0 ) { this.timeout = timeoutPeriod; } if ( maxBlocksToCheck > 0 ) { this.maxBlocksToCheck = maxBlocksToCheck; } fsnamesystemMetrics = metrics; init(); } PendingReplicationBlocks() { init(); } void init() { pendingReplications = new LinkedHashMap<BlockInfo, PendingBlockInfo>(); timedOutItems = new ArrayList<BlockInfo>(); this.timerThread = new Daemon(new PendingReplicationMonitor()); timerThread.start(); } /** * Add a block to the list of pending Replications */ void add(BlockInfo block, int numReplicas) { synchronized (pendingReplications) { PendingBlockInfo found = pendingReplications.get(block); if (found == null) { pendingReplications.put(block, new PendingBlockInfo(numReplicas)); } else { found.incrementReplicas(numReplicas); found.setTimeStamp(); } } } /** * One replication request for this block has finished. * Decrement the number of pending replication requests * for this block. */ void remove(Block block) { synchronized (pendingReplications) { PendingBlockInfo found = pendingReplications.get(block); if (found != null) { if (FSNamesystem.LOG.isDebugEnabled()) { FSNamesystem.LOG.debug( "Removing pending replication for block" + block); } found.decrementReplicas(); if (found.getNumReplicas() <= 0) { pendingReplications.remove(block); } } } } /** * The total number of blocks that are undergoing replication */ int size() { return pendingReplications.size(); } /** * How many copies of this block is pending replication? */ int getNumReplicas(Block block) { synchronized (pendingReplications) { PendingBlockInfo found = pendingReplications.get(block); if (found != null) { return found.getNumReplicas(); } } return 0; } /** * Returns a list of blocks that have timed out their * replication requests. Returns null if no blocks have * timed out. */ BlockInfo[] getTimedOutBlocks() { synchronized (timedOutItems) { if (timedOutItems.size() <= 0) { return null; } BlockInfo[] blockList = timedOutItems.toArray( new BlockInfo[timedOutItems.size()]); timedOutItems.clear(); return blockList; } } /** * An object that contains information about a block that * is being replicated. It records the timestamp when the * system started replicating the most recent copy of this * block. It also records the number of replication * requests that are in progress. */ static class PendingBlockInfo { private long timeStamp; private int numReplicasInProgress; PendingBlockInfo(int numReplicas) { this.timeStamp = FSNamesystem.now(); this.numReplicasInProgress = numReplicas; } long getTimeStamp() { return timeStamp; } void setTimeStamp() { timeStamp = FSNamesystem.now(); } void incrementReplicas(int increment) { numReplicasInProgress += increment; } void decrementReplicas() { numReplicasInProgress--; assert(numReplicasInProgress >= 0); } int getNumReplicas() { return numReplicasInProgress; } } /* * A periodic thread that scans for blocks that never finished * their replication request. */ class PendingReplicationMonitor implements Runnable { public void run() { while (fsRunning) { long period = Math.min(defaultRecheckInterval, timeout); try { pendingReplicationCheck(); Thread.sleep(period); } catch (InterruptedException ie) { FSNamesystem.LOG.info( "PendingReplicationMonitor thread received exception. " + ie); return; } } } /** * Iterate through all items and detect timed-out items */ void pendingReplicationCheck() throws InterruptedException { FSNamesystem.LOG.info("PendingReplicationMonitor checking Q"); int totalPendingBlocks = 0; synchronized (pendingReplications){ totalPendingBlocks = pendingReplications.size(); } List<Map.Entry<BlockInfo, PendingBlockInfo>> blocksToCheck = new LinkedList<Map.Entry<BlockInfo, PendingBlockInfo>>(); while (totalPendingBlocks > 0) { synchronized (pendingReplications) { // At most maxBlocksToCheck items per iteration int numBlocksToCheck = Math.min( maxBlocksToCheck, pendingReplications.size()); if (numBlocksToCheck == 0) { break; } numBlocksToCheck = Math.min(numBlocksToCheck, totalPendingBlocks); totalPendingBlocks -= numBlocksToCheck; // remove the number of blocks from pendingReplications Iterator<Entry<BlockInfo, PendingBlockInfo>> iter = pendingReplications.entrySet().iterator(); while (numBlocksToCheck-- > 0) { blocksToCheck.add(iter.next()); iter.remove(); } // adjust the pending block count in case it gets shorter totalPendingBlocks = Math.min( totalPendingBlocks, pendingReplications.size()); // Check if timeout long now = FSNamesystem.now(); for (iter = blocksToCheck.iterator(); iter.hasNext(); ) { Entry<BlockInfo, PendingBlockInfo> entry = iter.next(); PendingBlockInfo pendingBlock = entry.getValue(); BlockInfo block = entry.getKey(); if (now < pendingBlock.getTimeStamp() + timeout) { // not timeout; reinsert into end of pendingReplications // so it will be checked in future batches iter.remove(); pendingReplications.put(block, pendingBlock); } } } if (blocksToCheck.isEmpty()) { continue; } // blocksToCheck has timeout blocks; handle timeout blocks if (fsnamesystemMetrics != null) { fsnamesystemMetrics.numTimedoutReplications.inc(blocksToCheck.size()); } StringBuilder logMsg = new StringBuilder( "PendingReplicationMonitor timed out blocks"); while (!blocksToCheck.isEmpty()) { BlockInfo timeoutBlock = blocksToCheck.remove(0).getKey(); logMsg.append(" ").append(timeoutBlock); synchronized (timedOutItems) { timedOutItems.add(timeoutBlock); } } FSNamesystem.LOG.warn(logMsg); // sleep 1 second in between two iterations Thread.sleep(1000); } } } /* * Shuts down the pending replication monitor thread. * Waits for the thread to exit. */ void stop() { fsRunning = false; timerThread.interrupt(); try { timerThread.join(3000); } catch (InterruptedException ie) { } } /** * Iterate through all items and print them. */ void metaSave(PrintWriter out) { synchronized (pendingReplications) { out.println("Metasave: Blocks being replicated: " + pendingReplications.size()); Iterator<Map.Entry<BlockInfo, PendingBlockInfo>> iter = pendingReplications.entrySet().iterator(); while (iter.hasNext()) { Map.Entry<BlockInfo, PendingBlockInfo> entry = iter.next(); PendingBlockInfo pendingBlock = entry.getValue(); BlockInfo block = entry.getKey(); out.println(block + " StartTime: " + new Time(pendingBlock.timeStamp) + " NumReplicaInProgress: " + pendingBlock.numReplicasInProgress); } } } }