package org.apache.hadoop.raid; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.EOFException; import java.io.IOException; import java.net.Socket; import java.util.Comparator; import java.util.Random; import java.util.Set; import java.util.concurrent.BlockingQueue; import java.util.concurrent.ExecutorService; import java.util.concurrent.PriorityBlockingQueue; import java.util.concurrent.ThreadFactory; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.hdfs.DFSUtil; import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.protocol.DataTransferProtocol; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.FSConstants; import org.apache.hadoop.hdfs.protocol.LocatedBlock; import org.apache.hadoop.hdfs.protocol.ReplaceBlockHeader; import org.apache.hadoop.hdfs.protocol.VersionAndOpcode; import org.apache.hadoop.hdfs.server.common.HdfsConstants; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.net.NetworkTopology; class BlockMover { public static final Log LOG = LogFactory.getLog(BlockMover.class); final private BlockingQueue<Runnable> movingQueue; final private int maxQueueSize; final private RaidNodeMetrics metrics; final private boolean simulate; final private Random rand; final private Configuration conf; final private int alwaysSubmitPriorityLevel; final ClusterInfo cluster; final Thread clusterUpdater; ExecutorService executor; final int chooseNodeMaxRetryTimes; // this is for test only. final boolean treatNodesOnDifferentRack; final static String RAID_CHOOSE_NODE_RETRY_TIMES_KEY = "raid.block.mover.choose.node.retry"; final static int RAID_CHOOSE_NODE_RETRY_TIMES_DEFAULT = 100; final static String RAID_TEST_TREAT_NODES_ON_DEFAULT_RACK_KEY = "raid.test.treat.nodes.on.different.rack"; BlockMover(int numMovingThreads, int maxQueueSize, boolean simulate, int alwaysSubmitPriorityLevel, Configuration conf) throws IOException { this.movingQueue = new PriorityBlockingQueue<Runnable>( 1000, new BlockMoveActionComparator()); ThreadFactory factory = new ThreadFactory() { final AtomicInteger numThreads = new AtomicInteger(); public Thread newThread(Runnable r) { Thread t = new Thread(r); t.setName("BLockMoveExecutor-" + numThreads.getAndIncrement()); return t; } }; this.executor = new ThreadPoolExecutor(numMovingThreads, numMovingThreads, 0L, TimeUnit.MILLISECONDS, movingQueue, factory); this.maxQueueSize = maxQueueSize; this.metrics = RaidNodeMetrics.getInstance(RaidNodeMetrics.DEFAULT_NAMESPACE_ID); this.cluster = new ClusterInfo(); this.clusterUpdater = new Thread(cluster); this.simulate = simulate; this.rand = new Random(); this.conf = conf; this.alwaysSubmitPriorityLevel = alwaysSubmitPriorityLevel; this.chooseNodeMaxRetryTimes = conf.getInt(RAID_CHOOSE_NODE_RETRY_TIMES_KEY, RAID_CHOOSE_NODE_RETRY_TIMES_DEFAULT); this.treatNodesOnDifferentRack = conf.getBoolean(RAID_TEST_TREAT_NODES_ON_DEFAULT_RACK_KEY, false); } public void start() { clusterUpdater.setDaemon(true); clusterUpdater.start(); } public void stop() { cluster.stop(); clusterUpdater.interrupt(); executor.shutdown(); } public int getQueueSize() { return movingQueue.size(); } public boolean isOnSameRack(DatanodeInfo n1, DatanodeInfo n2) { if (treatNodesOnDifferentRack) { // this is for test only if (n1 == null) { return false; } else if (n1.equals(n2)) { return true; } else { return false; } } return cluster.isOnSameRack(n1, n2); } public void move(LocatedBlock block, DatanodeInfo node, DatanodeInfo target, Set<DatanodeInfo> excludedNodes, int priority, int dataTransferProtocolVersion, int namespaceId) { BlockMoveAction action = new BlockMoveAction( block, node, target, excludedNodes, priority, dataTransferProtocolVersion, namespaceId); if (LOG.isDebugEnabled()) { LOG.debug("Bad block placement: " + action); } int movingQueueSize = movingQueue.size(); //For high-pri moves, the queue limit is 2*maxQueueSize if (movingQueueSize < maxQueueSize || movingQueueSize < 2 * maxQueueSize && action.priority >= alwaysSubmitPriorityLevel) { executor.execute(action); metrics.blockMoveScheduled.inc(); } else { if (LOG.isDebugEnabled()) { LOG.debug("Block move queue is full. Skip the action." + " size:" + movingQueueSize + " maxSize:" + maxQueueSize); } metrics.blockMoveSkipped.inc(); } } /** * Sort BlockMoveAction based on the priority in descending order */ static class BlockMoveActionComparator implements Comparator<Runnable> { @Override public int compare(Runnable o1, Runnable o2) { BlockMoveAction a1 = (BlockMoveAction) o1; BlockMoveAction a2 = (BlockMoveAction) o2; if (a1.priority > a2.priority) { return -1; } if (a1.priority < a2.priority) { return 1; } // if tie, sort based on the time in ascending order return a1.createTime > a2.createTime ? 1 : -1; } } /** * explicitly choose the target nodes. * @throws IOException */ public DatanodeInfo chooseTargetNodes(Set<DatanodeInfo> excludedNodes) throws IOException { DatanodeInfo target = cluster.getNodeOnDifferentRack(excludedNodes); if (target == null) { throw new IOException ("Error choose datanode"); } return target; } /** * Create one more replication of the block */ class BlockMoveAction implements Runnable { final LocatedBlock block; final Set<DatanodeInfo> excludedNodes; final DatanodeInfo source; // The datanode where this block will be removed DatanodeInfo target; // The destination for this block DatanodeInfo proxySource; // The datanode that copies this block to target final int priority; final long createTime; final int dataTransferProtocolVersion; // data transfer protocol supported by HDFS cluster final int namespaceId; // name space that the block belongs to BlockMoveAction(LocatedBlock block, DatanodeInfo source, Set<DatanodeInfo> excludedNodes, int priority, int dataTransferProtocol, int namespaceId) { this(block, source, null, excludedNodes, priority, dataTransferProtocol, namespaceId); } BlockMoveAction(LocatedBlock block, DatanodeInfo source, DatanodeInfo target, Set<DatanodeInfo> excludedNodes, int priority, int dataTransferProtocol, int namespaceId) { this.block = block; this.excludedNodes = excludedNodes; for (DatanodeInfo d : block.getLocations()) { // Also exclude the original locations excludedNodes.add(d); } this.source = source; this.target = target; this.createTime = System.currentTimeMillis(); this.priority = priority; this.dataTransferProtocolVersion = dataTransferProtocol; this.namespaceId = namespaceId; } /** * Choose target, source and proxySource for the move * @throws IOException */ void chooseNodes() throws IOException { if (target == null) { target = cluster.getNodeOnDifferentRack(excludedNodes); if (target == null) { throw new IOException("Error choose datanode"); } } for (DatanodeInfo n : block.getLocations()) { if (cluster.isOnSameRack(target, n)) { proxySource = n; return; } } proxySource = block.getLocations()[rand.nextInt(block.getLocations().length)]; } @Override public void run() { Socket sock = null; DataOutputStream out = null; DataInputStream in = null; String threadName = "[" + Thread.currentThread().getName() + "] "; try { chooseNodes(); if (simulate) { LOG.debug("Simulate mode. Skip move target:" + target + " source:" + source + " proxySource:" + proxySource); metrics.blockMove.inc(); return; } sock = new Socket(); sock.connect(NetUtils.createSocketAddr( target.getName()), HdfsConstants.READ_TIMEOUT); sock.setKeepAlive(true); sock.setSoTimeout(3600000); // set the timeout to be 1 hour out = new DataOutputStream( new BufferedOutputStream( sock.getOutputStream(), FSConstants.BUFFER_SIZE)); if (LOG.isDebugEnabled()) { LOG.debug( "Start moving block " + block.getBlock().getBlockId() + " from "+ source.getName() + " to " + target.getName() + " through " + proxySource.getName()); } sendRequest(out); in = new DataInputStream( new BufferedInputStream( sock.getInputStream(), FSConstants.BUFFER_SIZE)); receiveResponse(in); metrics.blockMove.inc(); LOG.info(threadName + "Moving block " + block.getBlock().getBlockId()); LOG.info(threadName + "priority " + priority); LOG.info(threadName + "from "+ source.getName()); LOG.info(threadName + "to " + target.getName()); LOG.info(threadName + "through " + proxySource.getName() + " succeed."); } catch (Exception e) { try { LOG.warn(threadName, e); LOG.warn(threadName + "Error moving block " + block.getBlock().getBlockId()); LOG.warn(threadName + "from " + source.getName() + " to "); LOG.warn(threadName + target.getName() + " through " + proxySource.getName()); if (e instanceof EOFException) { LOG.warn(threadName + "Moving block " + block.getBlock().getBlockId() + " was cancelled because the time exceeded the limit"); } } catch (Exception newE) { LOG.warn(threadName + "New error ", newE); } } finally { IOUtils.closeStream(out); IOUtils.closeStream(in); IOUtils.closeSocket(sock); } } @Override public String toString() { StringBuilder ret = new StringBuilder(); ret.append("block:").append(block.getBlock()).append("\t"); ret.append("locations:"); boolean first = true; for (DatanodeInfo n : block.getLocations()) { if (first) { ret.append(n.getHostName()); first = false; continue; } ret.append(",").append(n.getHostName()); } ret.append("\t"); ret.append("priority:"); ret.append(priority); ret.append("\t"); ret.append("source:"); ret.append(source); ret.append("\t"); ret.append("target:"); ret.append(target); ret.append("\t"); ret.append("createTime:"); ret.append(createTime); ret.append("\t"); ret.append("excludeNodes:"); ret.append(excludedNodes.size()); return ret.toString(); } /** * Send a block replace request to the output stream */ private void sendRequest(DataOutputStream out) throws IOException { ReplaceBlockHeader header = new ReplaceBlockHeader(new VersionAndOpcode( dataTransferProtocolVersion, DataTransferProtocol.OP_REPLACE_BLOCK)); header.set(namespaceId, block.getBlock().getBlockId(), block.getBlock() .getGenerationStamp(), source.getStorageID(), proxySource); header.writeVersionAndOpCode(out); header.write(out); out.flush(); } /** * Receive a block copy response from the input stream */ private void receiveResponse(DataInputStream in) throws IOException { short status = in.readShort(); if (status != DataTransferProtocol.OP_STATUS_SUCCESS) { throw new IOException("block move is failed"); } } } /** * Periodically obtain node information from the cluster */ class ClusterInfo implements Runnable { NetworkTopology topology = new NetworkTopology(); DatanodeInfo liveNodes[]; static final long UPDATE_PERIOD = 60000L; volatile boolean running = true; long lastUpdate = -1L; @Override public void run() { DistributedFileSystem dfs = null; do { try { dfs = DFSUtil.convertToDFS(FileSystem.get(conf)); } catch (IOException e) { LOG.warn("Failed to init file system", e); try { Thread.sleep(500); // sleep for half second } catch (InterruptedException ie) { LOG.info("Got interrupted", ie); return; } } } while (dfs == null); // Update the information about the datanodes in the cluster while (running) { try { long now = System.currentTimeMillis(); if (now - lastUpdate > UPDATE_PERIOD) { lastUpdate = now; synchronized (this) { // This obtain the datanodes from the HDFS cluster in config file. // If we need to support parity file in a different cluster, this // has to change. liveNodes = dfs.getLiveDataNodeStats(); for (DatanodeInfo n : liveNodes) { topology.add(n); } } } Thread.sleep(UPDATE_PERIOD / 10); } catch (InterruptedException e) { LOG.warn("Error update datanodes ", e); } catch (IOException e) { LOG.warn("Error update datanodes ", e); } } } public void stop() { running = false; } public synchronized DatanodeInfo getRandomNode(Set<DatanodeInfo> excluded) { if (liveNodes == null || liveNodes.length == 0) { return null; } if (liveNodes.length <= excluded.size()) { return liveNodes[rand.nextInt(liveNodes.length)]; } for (;;) { DatanodeInfo target = liveNodes[rand.nextInt(liveNodes.length)]; if (!excluded.contains(target)) { return target; } } } /** * Choose a node on different rack */ public synchronized DatanodeInfo getNodeOnDifferentRack( Set<DatanodeInfo> excluded) { if (liveNodes == null || liveNodes.length == 0) { return null; } if (liveNodes.length <= excluded.size()) { return liveNodes[rand.nextInt(liveNodes.length)]; } int retry = 0; for (;;) { retry ++; DatanodeInfo target = liveNodes[rand.nextInt(liveNodes.length)]; if (!excluded.contains(target)) { if (retry >= chooseNodeMaxRetryTimes) { return target; } if (topology.getNumOfRacks() <= 1) { return target; } else { boolean sameRack = false; for (DatanodeInfo node : excluded) { if (isOnSameRack(node, target)) { sameRack = true; break; } } if (!sameRack) { return target; } } } } } public synchronized boolean isOnSameRack(DatanodeInfo n1, DatanodeInfo n2) { topology.add(n1); topology.add(n2); return topology.isOnSameRack(n1, n2); } } }