/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.raid;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.util.StringUtils;

/**
 * This class fixes source file blocks using the parity file,
 * and parity file blocks using the source file.
 * It periodically fetches the list of corrupt files from the namenode,
 * and figures out the location of the bad block by reading through
 * the corrupt file.
 */
public class LocalBlockIntegrityMonitor extends BlockIntegrityMonitor
    implements Runnable {
  public static final Log LOG =
      LogFactory.getLog(LocalBlockIntegrityMonitor.class);

  private BlockReconstructor.CorruptBlockReconstructor helper;

  public LocalBlockIntegrityMonitor(Configuration conf) throws IOException {
    super(conf);
    helper = new BlockReconstructor.CorruptBlockReconstructor(getConf());
  }

  /**
   * Main loop: keeps invoking {@link #doFix()} while the monitor is running.
   * Exceptions are logged and the loop continues; fatal errors are logged
   * and rethrown.
   */
  public void run() {
    while (running) {
      try {
        LOG.info("LocalBlockFixer continuing to run...");
        doFix();
      } catch (Exception e) {
        LOG.error(StringUtils.stringifyException(e));
      } catch (Error err) {
        LOG.error("Exiting after encountering " +
            StringUtils.stringifyException(err));
        throw err;
      }
    }
  }

  /**
   * Periodically fetches the list of corrupt files from the namenode,
   * filters out files that cannot be reconstructed, and attempts to
   * reconstruct each remaining file, updating metrics as it goes.
   */
  void doFix() throws InterruptedException, IOException {
    while (running) {
      // Sleep before proceeding to fix files.
      Thread.sleep(blockCheckInterval);

      List<String> corruptFiles = getCorruptFiles();
      FileSystem parityFs = new Path("/").getFileSystem(getConf());
      filterUnreconstructableSourceFiles(parityFs, corruptFiles.iterator());
      RaidNodeMetrics.getInstance(RaidNodeMetrics.DEFAULT_NAMESPACE_ID).
          numFilesToFix.set(corruptFiles.size());

      if (corruptFiles.isEmpty()) {
        // If there are no corrupt files, retry after some time.
        continue;
      }
      LOG.info("Found " + corruptFiles.size() + " corrupt files.");

      helper.sortLostFiles(corruptFiles);

      for (String srcPath: corruptFiles) {
        if (!running) break;
        try {
          boolean fixed = helper.reconstructFile(new Path(srcPath), null);
          if (fixed) {
            incrFilesFixed();
          }
        } catch (IOException ie) {
          incrFileFixFailures();
          LOG.error("Hit error while processing " + srcPath +
              ": " + StringUtils.stringifyException(ie));
          // Do nothing, move on to the next file.
        }
      }
    }
  }

  /**
   * @return A list of corrupt files as obtained from the namenode
   */
  List<String> getCorruptFiles() throws IOException {
    DistributedFileSystem dfs = helper.getDFS(new Path("/"));

    String[] files = DFSUtil.getCorruptFiles(dfs);
    List<String> corruptFiles = new LinkedList<String>();
    for (String f: files) {
      corruptFiles.add(f);
    }
    RaidUtils.filterTrash(getConf(), corruptFiles);

    return corruptFiles;
  }

  @Override
  public BlockIntegrityMonitor.Status getAggregateStatus() {
    throw new UnsupportedOperationException(
        LocalBlockIntegrityMonitor.class + " doesn't do getAggregateStatus()");
  }

  @Override
  public Runnable getCorruptionMonitor() {
    return this;
  }

  @Override
  public Runnable getDecommissioningMonitor() {
    // This class does not monitor decommissioning files.
    return null;
  }

  @Override
  public Runnable getCorruptFileCounter() {
    return null;
  }
}
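
// A minimal usage sketch (an editorial addition, not part of the original
// file): assuming a standard Hadoop Configuration, the monitor is driven on
// its own thread via the Runnable returned by getCorruptionMonitor(). The
// check interval (blockCheckInterval) is read from the configuration by the
// BlockIntegrityMonitor superclass. Direct construction as shown here is a
// simplification; in the real daemon the RaidNode chooses between local and
// distributed block fixing.
//
//   Configuration conf = new Configuration();
//   LocalBlockIntegrityMonitor monitor = new LocalBlockIntegrityMonitor(conf);
//   Thread fixer = new Thread(monitor.getCorruptionMonitor(), "blockfixer");
//   fixer.setDaemon(true);
//   fixer.start();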