/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.raid;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.util.StringUtils;

/**
 * Monitors and potentially fixes placement of blocks in RAIDed files.
 *
 * <p>For every stripe (a group of source blocks plus the parity blocks at the
 * same stripe index) the monitor counts how many of the stripe's block
 * replicas live on each datanode, records that in a per-code histogram and
 * hands co-located blocks to the {@link BlockMover}.
 *
 * <p>A histogram maps "number of neighbor blocks" (blocks of the stripe on a
 * node, minus one) to the number of times that situation was observed.
 * {@link #clearAndReport()} publishes the histograms of the traversal that
 * just finished and starts a fresh set.
 */
public class PlacementMonitor {
  public static final Log LOG = LogFactory.getLog(PlacementMonitor.class);

  Configuration conf;
  /** Histograms being filled by the traversal currently in progress. */
  Map<ErasureCodeType, Map<Integer, Long>> blockHistograms;
  /** Histograms of the last completed traversal; null until the first report. */
  private volatile Map<ErasureCodeType, Map<Integer, Long>> lastBlockHistograms;
  private volatile long lastUpdateStartTime = 0L;
  private volatile long lastUpdateFinishTime = 0L;
  private volatile long lastUpdateUsedTime = 0L;
  RaidNodeMetrics metrics;
  BlockMover blockMover;

  final static String NUM_MOVING_THREADS_KEY = "hdfs.raid.block.move.threads";
  final static String SIMULATE_KEY = "hdfs.raid.block.move.simulate";
  final static String BLOCK_MOVE_QUEUE_LENGTH_KEY = "hdfs.raid.block.move.queue.length";
  final static int DEFAULT_NUM_MOVING_THREADS = 10;
  final static int DEFAULT_BLOCK_MOVE_QUEUE_LENGTH = 30000;

  /** Added to the priority of RS moves; equals (stripe length - 1). */
  final int rsCodePriorityOffset;

  /**
   * Creates the monitor and its {@link BlockMover}.
   *
   * <p>The number of mover threads, the mover queue length and the simulate
   * flag are read from {@code conf}; the simulate flag defaults to
   * {@code true}.
   *
   * @param conf configuration to read the settings and stripe length from
   * @throws IOException declared by the original signature
   */
  PlacementMonitor(Configuration conf) throws IOException {
    this.conf = conf;
    this.blockHistograms = createEmptyHistograms();
    int numMovingThreads = conf.getInt(
        NUM_MOVING_THREADS_KEY, DEFAULT_NUM_MOVING_THREADS);
    int maxMovingQueueSize = conf.getInt(
        BLOCK_MOVE_QUEUE_LENGTH_KEY, DEFAULT_BLOCK_MOVE_QUEUE_LENGTH);
    boolean simulate = conf.getBoolean(SIMULATE_KEY, true);
    this.rsCodePriorityOffset = RaidNode.getStripeLength(conf) - 1;

    // For RS file with 3 blocks on same node or more and
    // XOR file with (stripeLength + 2) on same node, always submit the move
    int alwaysSubmitPriorityLevel = rsCodePriorityOffset + 3;
    blockMover = new BlockMover(
        numMovingThreads, maxMovingQueueSize, simulate,
        alwaysSubmitPriorityLevel, conf);
    this.metrics = RaidNodeMetrics.getInstance();
  }

  /**
   * Builds a map holding one empty histogram per erasure code type.
   *
   * @return a new map with an empty, mutable histogram for every code type
   */
  private Map<ErasureCodeType, Map<Integer, Long>> createEmptyHistograms() {
    // Construct the EnumMap from the key class directly instead of copying a
    // temporary HashMap into it.
    Map<ErasureCodeType, Map<Integer, Long>> histo =
        new EnumMap<ErasureCodeType, Map<Integer, Long>>(ErasureCodeType.class);
    for (ErasureCodeType type : ErasureCodeType.values()) {
      histo.put(type, new HashMap<Integer, Long>());
    }
    return histo;
  }

  /** Starts the underlying {@link BlockMover}. */
  public void start() {
    blockMover.start();
  }

  /** Stops the underlying {@link BlockMover}. */
  public void stop() {
    blockMover.stop();
  }

  /** Records the start time of a traversal; used to compute its duration. */
  public void startCheckingFiles() {
    lastUpdateStartTime = RaidNode.now();
  }

  /**
   * @return the queue size reported by the {@link BlockMover}
   */
  public int getMovingQueueSize() {
    return blockMover.getQueueSize();
  }

  /**
   * Checks the placement of a source file whose parity data is stored inside
   * a part file, at the range described by a HAR index entry.
   *
   * <p>Nothing is checked when source and parity live on file systems with
   * different URIs.
   *
   * @param srcFs file system of the source file
   * @param srcFile status of the source file
   * @param parityFs file system of the part file
   * @param partFile part file containing the parity data
   * @param entry index entry giving the offset and length inside the part file
   * @param code erasure code of the file
   * @throws IOException if the block locations cannot be fetched
   */
  public void checkFile(FileSystem srcFs, FileStatus srcFile,
      FileSystem parityFs, Path partFile, HarIndex.IndexEntry entry,
      ErasureCodeType code) throws IOException {
    if (onSameFileSystem(srcFs, parityFs)) {
      checkLocatedBlocks(
          getLocatedBlocks(srcFs, srcFile),
          getLocatedBlocks(parityFs, partFile, entry.startOffset, entry.length),
          code);
    } else {
      // TODO: Move blocks in two clusters separately
      LOG.warn("Source and parity are in different file system. " +
          " source:" + srcFs.getUri() +
          " parity:" + parityFs.getUri() +
          ". Skip.");
    }
  }

  /**
   * Checks the placement of a source file against its parity file.
   *
   * <p>Nothing is checked when source and parity live on file systems with
   * different URIs.
   *
   * @param srcFs file system of the source file
   * @param srcFile status of the source file
   * @param parityFs file system of the parity file
   * @param parityFile status of the parity file
   * @param code erasure code of the file
   * @throws IOException if the block locations cannot be fetched
   */
  public void checkFile(FileSystem srcFs, FileStatus srcFile,
      FileSystem parityFs, FileStatus parityFile,
      ErasureCodeType code) throws IOException {
    // Compare by URI, like the HAR overload, rather than with
    // FileSystem.equals, so that both overloads agree on what
    // "same file system" means.
    if (onSameFileSystem(srcFs, parityFs)) {
      checkLocatedBlocks(
          getLocatedBlocks(srcFs, srcFile),
          getLocatedBlocks(parityFs, parityFile),
          code);
    } else {
      // TODO: Move blocks in two clusters separately
      LOG.warn("Source and parity are in different file systems. Skip");
    }
  }

  /** Tells whether the two file systems report the same URI. */
  private static boolean onSameFileSystem(FileSystem srcFs, FileSystem parityFs) {
    return srcFs.getUri().equals(parityFs.getUri());
  }

  /**
   * Fetches the block locations of a whole file.
   *
   * @return the located blocks, or null if {@code fs} is not a
   *         {@link DistributedFileSystem}
   */
  private LocatedBlocks getLocatedBlocks(
      FileSystem fs, FileStatus stat) throws IOException {
    return getLocatedBlocks(
        fs, stat.getPath(), 0, stat.getLen());
  }

  /**
   * Fetches the block locations of a byte range of a file from the namenode.
   *
   * @return the located blocks, or null if {@code fs} is not a
   *         {@link DistributedFileSystem}
   */
  private LocatedBlocks getLocatedBlocks(
      FileSystem fs, Path path, long start, long length)
      throws IOException {
    if (!(fs instanceof DistributedFileSystem)) {
      return null;
    }
    DistributedFileSystem dfs = (DistributedFileSystem) fs;
    return dfs.getClient().namenode.getBlockLocations(
        path.toUri().getPath(), start, length);
  }

  /**
   * Walks over all stripes of a file, updating the histogram of
   * {@code code} and submitting block moves for co-located blocks.
   *
   * <p>Returns without doing anything when either argument is null.
   *
   * @param srcBlocks located blocks of the source file, may be null
   * @param parityBlocks located blocks of the parity data, may be null
   * @param code erasure code of the file
   * @throws IOException declared by the original signature
   */
  void checkLocatedBlocks(LocatedBlocks srcBlocks,
      LocatedBlocks parityBlocks, ErasureCodeType code) throws IOException {
    if (srcBlocks == null || parityBlocks == null) {
      return;
    }
    int stripeLength = RaidNode.getStripeLength(conf);
    // XOR produces a single parity block per stripe.
    int parityLength = code == ErasureCodeType.XOR ?
        1 : RaidNode.rsParityLength(conf);
    int numStripes = (int) Math.ceil(
        (float) (srcBlocks.getLocatedBlocks().size()) / stripeLength);

    // Both collections are reused across stripes; countBlocksOnEachNode
    // clears them before refilling.
    Map<DatanodeInfo, Integer> nodeToNumBlocks =
        new HashMap<DatanodeInfo, Integer>();
    Set<DatanodeInfo> nodesInThisStripe = new HashSet<DatanodeInfo>();

    for (int stripeIndex = 0; stripeIndex < numStripes; ++stripeIndex) {
      List<LocatedBlock> stripeBlocks = getStripeBlocks(
          stripeIndex, srcBlocks, stripeLength, parityBlocks, parityLength);

      countBlocksOnEachNode(stripeBlocks, nodeToNumBlocks, nodesInThisStripe);

      updateBlockPlacementHistogram(nodeToNumBlocks, blockHistograms.get(code));

      submitBlockMoves(nodeToNumBlocks, stripeBlocks, nodesInThisStripe, code);
    }
  }

  /**
   * Collects the source blocks and parity blocks that belong to one stripe.
   *
   * <p>Ranges are clamped to the available blocks, so the last stripe may
   * contain fewer blocks than the others.
   *
   * @return the source blocks of the stripe followed by its parity blocks
   */
  private static List<LocatedBlock> getStripeBlocks(int stripeIndex,
      LocatedBlocks srcBlocks, int stripeLength,
      LocatedBlocks parityBlocks, int parityLength) {
    List<LocatedBlock> stripeBlocks = new LinkedList<LocatedBlock>();

    // Adding source blocks
    int stripeStart = stripeLength * stripeIndex;
    int stripeEnd = Math.min(
        stripeStart + stripeLength, srcBlocks.getLocatedBlocks().size());
    if (stripeStart < stripeEnd) {
      stripeBlocks.addAll(
          srcBlocks.getLocatedBlocks().subList(stripeStart, stripeEnd));
    }

    // Adding parity blocks
    stripeStart = parityLength * stripeIndex;
    stripeEnd = Math.min(
        stripeStart + parityLength, parityBlocks.getLocatedBlocks().size());
    if (stripeStart < stripeEnd) {
      stripeBlocks.addAll(
          parityBlocks.getLocatedBlocks().subList(stripeStart, stripeEnd));
    }
    return stripeBlocks;
  }

  /**
   * Counts, for every datanode listed as a location of a stripe block, how
   * many of the stripe's blocks it holds.
   *
   * <p>Both output collections are cleared first.
   *
   * @param stripeBlocks blocks of the stripe
   * @param nodeToNumBlocks output: node to number of stripe blocks it holds
   * @param nodesInThisStripe output: every node holding a stripe block
   */
  private static void countBlocksOnEachNode(List<LocatedBlock> stripeBlocks,
      Map<DatanodeInfo, Integer> nodeToNumBlocks,
      Set<DatanodeInfo> nodesInThisStripe) {
    nodeToNumBlocks.clear();
    nodesInThisStripe.clear();
    for (LocatedBlock block : stripeBlocks) {
      for (DatanodeInfo node : block.getLocations()) {
        Integer n = nodeToNumBlocks.get(node);
        if (n == null) {
          n = 0;
        }
        nodeToNumBlocks.put(node, n + 1);
        nodesInThisStripe.add(node);
      }
    }
  }

  /**
   * Adds the per-node counts of one stripe to a histogram.
   *
   * <p>For every node the bucket keyed by (blocks on that node - 1), i.e. the
   * number of neighbor blocks, is incremented by one.
   */
  private static void updateBlockPlacementHistogram(
      Map<DatanodeInfo, Integer> nodeToNumBlocks,
      Map<Integer, Long> blockHistogram) {
    for (Integer numBlocks : nodeToNumBlocks.values()) {
      Long n = blockHistogram.get(numBlocks - 1);
      if (n == null) {
        n = 0L;
      }
      // Number of neighbor blocks to number of blocks
      blockHistogram.put(numBlocks - 1, n + 1);
    }
  }

  /**
   * Submits moves for the blocks of a stripe that share a datanode.
   *
   * @param nodeToNumBlocks node to number of stripe blocks it holds
   * @param stripeBlocks blocks of the stripe
   * @param excludedNodes nodes passed to the mover as excluded targets
   * @param code erasure code of the file, used for the move priority
   */
  private void submitBlockMoves(Map<DatanodeInfo, Integer> nodeToNumBlocks,
      List<LocatedBlock> stripeBlocks, Set<DatanodeInfo> excludedNodes,
      ErasureCodeType code) {
    // For every node that holds two or more blocks of the stripe, move blocks
    // away so that only one block is left on this node.
    for (Entry<DatanodeInfo, Integer> nodeAndCount : nodeToNumBlocks.entrySet()) {
      int colocatedBlocks = nodeAndCount.getValue();
      if (colocatedBlocks <= 1) {
        continue;
      }
      DatanodeInfo node = nodeAndCount.getKey();
      // The priority only depends on the node's count, not on the block.
      int priority = calculatePriority(colocatedBlocks, code);
      boolean keptFirstBlock = false;
      for (LocatedBlock block : stripeBlocks) {
        if (!isLocatedOn(block, node)) {
          continue;
        }
        if (!keptFirstBlock) {
          // leave the first block where it is
          keptFirstBlock = true;
          continue;
        }
        blockMover.move(block, node, excludedNodes, priority);
      }
    }
  }

  /** Tells whether {@code node} is one of the locations of {@code block}. */
  private static boolean isLocatedOn(LocatedBlock block, DatanodeInfo node) {
    for (DatanodeInfo location : block.getLocations()) {
      if (node.equals(location)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Computes the move priority: the number of co-located blocks, plus
   * {@link #rsCodePriorityOffset} for RS files.
   */
  private int calculatePriority(int colocatedBlocks, ErasureCodeType code) {
    int priority = colocatedBlocks;
    if (code == ErasureCodeType.RS) {
      priority += rsCodePriorityOffset;
    }
    return priority;
  }

  /**
   * Report the placement histogram to {@link RaidNodeMetrics}. This should only
   * be called right after a complete parity file traversal is done.
   *
   * <p>Every metric slot is rewritten, so buckets that are empty in this
   * traversal are reported as 0 instead of keeping the value of an earlier
   * traversal. The last slot receives the sum of all buckets that do not have
   * a slot of their own. Afterwards the histograms are kept as the "last"
   * histograms and a new, empty set is started.
   */
  public void clearAndReport() {
    synchronized (metrics) {
      long[] rsCounts = bucketCounts(
          blockHistograms.get(ErasureCodeType.RS), metrics.misplacedRs.length);
      for (int i = 0; i < rsCounts.length; ++i) {
        metrics.misplacedRs[i].set(rsCounts[i]);
      }
      long[] xorCounts = bucketCounts(
          blockHistograms.get(ErasureCodeType.XOR), metrics.misplacedXor.length);
      for (int i = 0; i < xorCounts.length; ++i) {
        metrics.misplacedXor[i].set(xorCounts[i]);
      }
    }
    lastBlockHistograms = blockHistograms;
    lastUpdateFinishTime = RaidNode.now();
    lastUpdateUsedTime = lastUpdateFinishTime - lastUpdateStartTime;
    LOG.info("Reporting metrices:\n" + toString());
    blockHistograms = createEmptyHistograms();
  }

  /**
   * Flattens a histogram into a fixed number of slots.
   *
   * <p>Slot {@code i} (for {@code i < numSlots - 1}) holds the count of bucket
   * {@code i}, or 0 when the bucket is absent. The last slot holds the sum of
   * all buckets with a key of {@code numSlots - 1} or more. Sums are kept as
   * {@code long} so large counts are not truncated.
   *
   * @param histogram bucket key to count
   * @param numSlots number of slots to produce
   * @return an array of length {@code numSlots}
   */
  private static long[] bucketCounts(Map<Integer, Long> histogram,
      int numSlots) {
    long[] counts = new long[numSlots];
    if (numSlots == 0) {
      return counts;
    }
    int lastSlot = numSlots - 1;
    for (Entry<Integer, Long> bucket : histogram.entrySet()) {
      // Keys without their own slot are folded into the last slot.
      counts[Math.min(bucket.getKey(), lastSlot)] += bucket.getValue();
    }
    return counts;
  }

  /**
   * Renders the histograms of the last completed traversal as text, one
   * section per code type with the buckets in ascending order.
   *
   * @return the text, or "Not available" before the first report
   */
  @Override
  public String toString() {
    // Read the volatile field once so the whole report uses one traversal.
    Map<ErasureCodeType, Map<Integer, Long>> histograms = lastBlockHistograms;
    if (histograms == null) {
      return "Not available";
    }
    StringBuilder result = new StringBuilder();
    for (ErasureCodeType code : ErasureCodeType.values()) {
      Map<Integer, Long> histo = histograms.get(code);
      result.append(code).append(" Blocks\n");
      List<Integer> neighbors = new ArrayList<Integer>(histo.keySet());
      Collections.sort(neighbors);
      for (Integer i : neighbors) {
        Long numBlocks = histo.get(i);
        result.append(i).append(" co-localted blocks:")
            .append(numBlocks).append("\n");
      }
    }
    return result.toString();
  }

  /**
   * Renders the histograms of the last completed traversal as an HTML table
   * with one row per code type and one column per bucket from 0 up to the
   * largest bucket seen.
   *
   * @return the table markup, or "Not available" before the first report
   */
  public String htmlTable() {
    // Read the volatile field once so header and rows use one traversal.
    Map<ErasureCodeType, Map<Integer, Long>> histograms = lastBlockHistograms;
    if (histograms == null) {
      return "Not available";
    }
    int max = computeMaxColocatedBlocks(histograms);
    StringBuilder head = new StringBuilder().append(JspUtils.td("CODE"));
    for (int i = 0; i <= max; ++i) {
      head.append(JspUtils.td(i + ""));
    }
    StringBuilder result =
        new StringBuilder().append(JspUtils.tr(head.toString()));
    for (ErasureCodeType code : ErasureCodeType.values()) {
      StringBuilder row =
          new StringBuilder().append(JspUtils.td(code.toString()));
      Map<Integer, Long> histo = histograms.get(code);
      for (int i = 0; i <= max; ++i) {
        Long numBlocks = histo.get(i);
        // Buckets that were never observed are shown as 0.
        long count = numBlocks == null ? 0L : numBlocks;
        row.append(JspUtils.td(StringUtils.humanReadableInt(count)));
      }
      result.append(JspUtils.tr(row.toString()));
    }
    return JspUtils.table(result.toString());
  }

  /**
   * @return the finish time of the last completed traversal, 0 if none
   */
  public long lastUpdateTime() {
    return lastUpdateFinishTime;
  }

  /**
   * @return the duration of the last completed traversal, 0 if none
   */
  public long lastUpdateUsedTime() {
    return lastUpdateUsedTime;
  }

  /**
   * Finds the largest bucket key over the histograms of all code types.
   *
   * @param histograms histograms to scan, one per code type
   * @return the largest key, or 0 when all histograms are empty
   */
  private int computeMaxColocatedBlocks(
      Map<ErasureCodeType, Map<Integer, Long>> histograms) {
    int max = 0;
    for (ErasureCodeType code : ErasureCodeType.values()) {
      Map<Integer, Long> histo = histograms.get(code);
      for (Integer i : histo.keySet()) {
        max = Math.max(i, max);
      }
    }
    return max;
  }
}