/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.util.hbck; import java.io.FileNotFoundException; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentSkipListSet; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; import java.util.concurrent.atomic.AtomicInteger; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.io.hfile.CacheConfig; import org.apache.hadoop.hbase.io.hfile.CorruptHFileException; import org.apache.hadoop.hbase.io.hfile.HFile; import org.apache.hadoop.hbase.util.FSUtils.FamilyDirFilter; import org.apache.hadoop.hbase.util.FSUtils.HFileFilter; import org.apache.hadoop.hbase.util.FSUtils.RegionDirFilter; import org.apache.hadoop.hbase.util.HBaseFsck; /** * This class marches through all of the region's hfiles and verifies that * they are all valid files. One just needs to instantiate the class, use * checkTables(List<Path>) and then retrieve the corrupted hfiles (and * quarantined files if in quarantining mode) * * The implementation currently parallelizes at the regionDir level. */ @InterfaceAudience.Private public class HFileCorruptionChecker { private static final Log LOG = LogFactory.getLog(HFileCorruptionChecker.class); final Configuration conf; final FileSystem fs; final CacheConfig cacheConf; final ExecutorService executor; final Set<Path> corrupted = new ConcurrentSkipListSet<Path>(); final Set<Path> failures = new ConcurrentSkipListSet<Path>(); final Set<Path> quarantined = new ConcurrentSkipListSet<Path>(); final Set<Path> missing = new ConcurrentSkipListSet<Path>(); final boolean inQuarantineMode; final AtomicInteger hfilesChecked = new AtomicInteger(); public HFileCorruptionChecker(Configuration conf, ExecutorService executor, boolean quarantine) throws IOException { this.conf = conf; this.fs = FileSystem.get(conf); this.cacheConf = new CacheConfig(conf); this.executor = executor; this.inQuarantineMode = quarantine; } /** * Checks a path to see if it is a valid hfile. * * @param p * full Path to an HFile * @throws IOException * This is a connectivity related exception */ protected void checkHFile(Path p) throws IOException { HFile.Reader r = null; try { r = HFile.createReader(fs, p, cacheConf); } catch (CorruptHFileException che) { LOG.warn("Found corrupt HFile " + p, che); corrupted.add(p); if (inQuarantineMode) { Path dest = createQuarantinePath(p); LOG.warn("Quarantining corrupt HFile " + p + " into " + dest); boolean success = fs.mkdirs(dest.getParent()); success = success ? fs.rename(p, dest): false; if (!success) { failures.add(p); } else { quarantined.add(dest); } } return; } catch (FileNotFoundException fnfe) { LOG.warn("HFile " + p + " was missing. Likely removed due to compaction/split?"); missing.add(p); } finally { hfilesChecked.addAndGet(1); if (r != null) { r.close(true); } } } /** * Given a path, generates a new path to where we move a corrupted hfile (bad * trailer, no trailer). * * @param hFile * Path to a corrupt hfile (assumes that it is HBASE_DIR/ table * /region/cf/file) * @return path to where corrupted files are stored. This should be * HBASE_DIR/.corrupt/table/region/cf/file. */ Path createQuarantinePath(Path hFile) { // extract the normal dirs structure Path cfDir = hFile.getParent(); Path regionDir = cfDir.getParent(); Path tableDir = regionDir.getParent(); // build up the corrupted dirs strcture Path corruptBaseDir = new Path(conf.get(HConstants.HBASE_DIR), conf.get( "hbase.hfile.quarantine.dir", HConstants.CORRUPT_DIR_NAME)); Path corruptTableDir = new Path(corruptBaseDir, tableDir.getName()); Path corruptRegionDir = new Path(corruptTableDir, regionDir.getName()); Path corruptFamilyDir = new Path(corruptRegionDir, cfDir.getName()); Path corruptHfile = new Path(corruptFamilyDir, hFile.getName()); return corruptHfile; } /** * Check all files in a column family dir. * * @param cfDir * column family directory * @throws IOException */ protected void checkColFamDir(Path cfDir) throws IOException { FileStatus[] hfs = null; try { hfs = fs.listStatus(cfDir, new HFileFilter(fs)); // use same filter as scanner. } catch (FileNotFoundException fnfe) { // Hadoop 0.23+ listStatus semantics throws an exception if the path does not exist. LOG.warn("Colfam Directory " + cfDir + " does not exist. Likely due to concurrent split/compaction. Skipping."); missing.add(cfDir); return; } // Hadoop 1.0 listStatus does not throw an exception if the path does not exist. if (hfs.length == 0 && !fs.exists(cfDir)) { LOG.warn("Colfam Directory " + cfDir + " does not exist. Likely due to concurrent split/compaction. Skipping."); missing.add(cfDir); return; } for (FileStatus hfFs : hfs) { Path hf = hfFs.getPath(); checkHFile(hf); } } /** * Check all column families in a region dir. * * @param regionDir * region directory * @throws IOException */ protected void checkRegionDir(Path regionDir) throws IOException { FileStatus[] cfs = null; try { cfs = fs.listStatus(regionDir, new FamilyDirFilter(fs)); } catch (FileNotFoundException fnfe) { // Hadoop 0.23+ listStatus semantics throws an exception if the path does not exist. LOG.warn("Region Directory " + regionDir + " does not exist. Likely due to concurrent split/compaction. Skipping."); missing.add(regionDir); return; } // Hadoop 1.0 listStatus does not throw an exception if the path does not exist. if (cfs.length == 0 && !fs.exists(regionDir)) { LOG.warn("Region Directory " + regionDir + " does not exist. Likely due to concurrent split/compaction. Skipping."); missing.add(regionDir); return; } for (FileStatus cfFs : cfs) { Path cfDir = cfFs.getPath(); checkColFamDir(cfDir); } } /** * Check all the regiondirs in the specified tableDir * * @param tableDir * path to a table * @throws IOException */ void checkTableDir(Path tableDir) throws IOException { FileStatus[] rds = fs.listStatus(tableDir, new RegionDirFilter(fs)); if (rds.length == 0 && !fs.exists(tableDir)) { // interestingly listStatus does not throw an exception if the path does not exist. LOG.warn("Table Directory " + tableDir + " does not exist. Likely due to concurrent delete. Skipping."); missing.add(tableDir); return; } // Parallelize check at the region dir level List<RegionDirChecker> rdcs = new ArrayList<RegionDirChecker>(); List<Future<Void>> rdFutures; for (FileStatus rdFs : rds) { Path rdDir = rdFs.getPath(); RegionDirChecker work = new RegionDirChecker(rdDir); rdcs.add(work); } // Submit and wait for completion try { rdFutures = executor.invokeAll(rdcs); } catch (InterruptedException ie) { Thread.currentThread().interrupt(); LOG.warn("Region dirs checking interrupted!", ie); return; } for (int i = 0; i < rdFutures.size(); i++) { Future<Void> f = rdFutures.get(i); try { f.get(); } catch (ExecutionException e) { LOG.warn("Failed to quaratine an HFile in regiondir " + rdcs.get(i).regionDir, e.getCause()); // rethrow IOExceptions if (e.getCause() instanceof IOException) { throw (IOException) e.getCause(); } // rethrow RuntimeExceptions if (e.getCause() instanceof RuntimeException) { throw (RuntimeException) e.getCause(); } // this should never happen LOG.error("Unexpected exception encountered", e); return; // bailing out. } catch (InterruptedException ie) { Thread.currentThread().interrupt(); LOG.warn("Region dirs check interrupted!", ie); // bailing out return; } } } /** * An individual work item for parallelized regiondir processing. This is * intentionally an inner class so it can use the shared error sets and fs. */ private class RegionDirChecker implements Callable<Void> { final Path regionDir; RegionDirChecker(Path regionDir) { this.regionDir = regionDir; } @Override public Void call() throws IOException { checkRegionDir(regionDir); return null; } } /** * Check the specified table dirs for bad hfiles. */ public void checkTables(Collection<Path> tables) throws IOException { for (Path t : tables) { checkTableDir(t); } } /** * @return the set of check failure file paths after checkTables is called. */ public Collection<Path> getFailures() { return new HashSet<Path>(failures); } /** * @return the set of corrupted file paths after checkTables is called. */ public Collection<Path> getCorrupted() { return new HashSet<Path>(corrupted); } /** * @return number of hfiles checked in the last HfileCorruptionChecker run */ public int getHFilesChecked() { return hfilesChecked.get(); } /** * @return the set of successfully quarantined paths after checkTables is called. */ public Collection<Path> getQuarantined() { return new HashSet<Path>(quarantined); } /** * @return the set of paths that were missing. Likely due to deletion/moves from * compaction or flushes. */ public Collection<Path> getMissing() { return new HashSet<Path>(missing); } /** * Print a human readable summary of hfile quarantining operations. * @param out */ public void report(PrintWriter out) { out.println("Checked " + hfilesChecked.get() + " hfile for corruption"); out.println(" HFiles corrupted: " + corrupted.size()); if (inQuarantineMode) { out.println(" HFiles successfully quarantined: " + quarantined.size()); for (Path sq : quarantined) { out.println(" " + sq); } out.println(" HFiles failed quarantine: " + failures.size()); for (Path fq : failures) { out.println(" " + fq); } } out.println(" HFiles moved while checking: " + missing.size()); for (Path mq : missing) { out.println(" " + mq); } String initialState = (corrupted.size() == 0) ? "OK" : "CORRUPTED"; String fixedState = (corrupted.size() == quarantined.size()) ? "OK" : "CORRUPTED"; if (inQuarantineMode) { out.println("Summary: " + initialState + " => " + fixedState); } else { out.println("Summary: " + initialState); } } }