/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.raid; import java.io.IOException; import java.net.InetSocketAddress; import java.util.LinkedList; import java.util.List; import javax.security.auth.login.LoginException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hdfs.DFSUtil; import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.raid.protocol.RaidProtocol; import org.apache.hadoop.security.UnixUserGroupInformation; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.mapreduce.Counters; /** * This class fixes source file blocks using the parity file, * and parity file blocks using the source file. * It periodically fetches the list of corrupt files from the namenode, * and figures out the location of the bad block by reading through * the corrupt file. */ public class LocalBlockIntegrityMonitor extends BlockIntegrityMonitor implements Runnable { public static final Log LOG = LogFactory.getLog(LocalBlockIntegrityMonitor.class); private BlockReconstructor.CorruptBlockReconstructor helper; public RaidProtocol raidnode; private UnixUserGroupInformation ugi; RaidProtocol rpcRaidnode; void initializeRpc(Configuration conf, InetSocketAddress address) throws IOException { try { this.ugi = UnixUserGroupInformation.login(conf, true); } catch (LoginException e) { throw (IOException)(new IOException().initCause(e)); } this.rpcRaidnode = RaidShell.createRPCRaidnode(address, conf, ugi); this.raidnode = RaidShell.createRaidnode(rpcRaidnode); } public LocalBlockIntegrityMonitor(Configuration conf) throws Exception { this(conf, true); } public LocalBlockIntegrityMonitor(Configuration conf, boolean initializeRPC) throws Exception { super(conf); helper = new BlockReconstructor.CorruptBlockReconstructor(getConf()); if (initializeRPC) { for (int i = 0; i < 3; i++) { try { initializeRpc(conf, RaidNode.getAddress(conf)); } catch (Exception e) { LOG.warn("Fail to initialize RPC", e); if (i == 2) { throw e; } Thread.sleep(2000); } } } } public void run() { try { while (running) { try { LOG.info("LocalBlockFixer continuing to run..."); doFix(); } catch (Exception e) { LOG.error(StringUtils.stringifyException(e)); } catch (Error err) { LOG.error("Exiting after encountering " + StringUtils.stringifyException(err)); throw err; } } } finally { RPC.stopProxy(rpcRaidnode); } } void doFix() throws InterruptedException, IOException { while (running) { // Sleep before proceeding to fix files. Thread.sleep(blockCheckInterval); List<String> corruptFiles = getCorruptFiles(); FileSystem parityFs = new Path("/").getFileSystem(getConf()); filterUnreconstructableSourceFiles(parityFs, corruptFiles.iterator()); RaidNodeMetrics.getInstance(RaidNodeMetrics.DEFAULT_NAMESPACE_ID). numFilesToFix.set(corruptFiles.size()); approximateNumRecoverableFiles = corruptFiles.size(); if (corruptFiles.isEmpty()) { // If there are no corrupt files, retry after some time. continue; } LOG.info("Found " + corruptFiles.size() + " corrupt files."); long detectionTime = System.currentTimeMillis(); helper.sortLostFiles(corruptFiles); for (String srcPathStr: corruptFiles) { if (!running) break; long recoveryTime = -1; Path srcPath = new Path(srcPathStr); try { boolean fixed = helper.reconstructFile(srcPath, null); if (fixed) { incrFilesFixed(); recoveryTime = System.currentTimeMillis() - detectionTime; lastSuccessfulFixTime = System.currentTimeMillis(); } } catch (IOException ie) { incrFileFixFailures(); LOG.error("Hit error while processing " + srcPath + ": " + StringUtils.stringifyException(ie)); // Do nothing, move on to the next file. recoveryTime = Integer.MAX_VALUE; } finally { if (recoveryTime > 0) { try { raidnode.sendRecoveryTime(srcPathStr, recoveryTime, null); } catch (Exception e) { LOG.error("Failed to send recovery time ", e); } } } } } } /** * @return A list of corrupt files as obtained from the namenode */ List<String> getCorruptFiles() throws IOException { DistributedFileSystem dfs = helper.getDFS(new Path("/")); String[] files = DFSUtil.getCorruptFiles(dfs); List<String> corruptFiles = new LinkedList<String>(); for (String f: files) { corruptFiles.add(f); } RaidUtils.filterTrash(getConf(), corruptFiles); return corruptFiles; } @Override public BlockIntegrityMonitor.Status getAggregateStatus() { throw new UnsupportedOperationException(LocalBlockIntegrityMonitor.class + " doesn't do getAggregateStatus()"); } @Override public Runnable getCorruptionMonitor() { return this; } @Override public Runnable getDecommissioningMonitor() { // This class does not monitor decommissioning files. return null; } @Override public Runnable getCorruptFileCounter() { return null; } }