/**
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.util;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.ScheduledThreadPoolExecutor;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.ClusterStatus;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ClusterConnection;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.coprocessor.MasterCoprocessorEnvironment;
import org.apache.hadoop.hbase.coprocessor.MasterObserver;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.master.RegionStates;
import org.apache.hadoop.hbase.mob.MobFileName;
import org.apache.hadoop.hbase.mob.MobUtils;
import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos;
import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter;
import org.apache.hadoop.hbase.util.HBaseFsck.HbckInfo;
import org.apache.hadoop.hbase.util.HBaseFsck.TableInfo;
import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
import org.apache.zookeeper.KeeperException;
import org.junit.rules.TestName;

/**
 * This is the base class for HBaseFsck's ability to detect reasons for inconsistent tables.
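 * It holds the shared fixtures used by the subclasses (the mini cluster configuration and
 * connection, the table and fsck executor services) plus helpers for creating, corrupting,
 * and cleaning up test tables.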
 *
 * Actual tests are in:
 *   TestHBaseFsckTwoRS
 *   TestHBaseFsckOneRS
 *   TestHBaseFsckMOB
 *   TestHBaseFsckReplicas
 */
public class BaseTestHBaseFsck {
  static final int POOL_SIZE = 7;
  protected static final Log LOG = LogFactory.getLog(BaseTestHBaseFsck.class);
  protected final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
  protected final static Configuration conf = TEST_UTIL.getConfiguration();
  protected final static String FAM_STR = "fam";
  protected final static byte[] FAM = Bytes.toBytes(FAM_STR);
  protected final static int REGION_ONLINE_TIMEOUT = 800;
  protected static RegionStates regionStates;
  protected static ExecutorService tableExecutorService;
  protected static ScheduledThreadPoolExecutor hbfsckExecutorService;
  protected static ClusterConnection connection;
  protected static Admin admin;

  // for the instance, reset every test run
  protected Table tbl;
  protected final static byte[][] SPLITS =
      new byte[][] { Bytes.toBytes("A"), Bytes.toBytes("B"), Bytes.toBytes("C") };
  // two rows per region.
  protected final static byte[][] ROWKEYS = new byte[][] {
      Bytes.toBytes("00"), Bytes.toBytes("50"), Bytes.toBytes("A0"), Bytes.toBytes("A5"),
      Bytes.toBytes("B0"), Bytes.toBytes("B5"), Bytes.toBytes("C0"), Bytes.toBytes("C5") };

  /**
   * Create a new region in META.
   */
  protected HRegionInfo createRegion(final HTableDescriptor htd, byte[] startKey, byte[] endKey)
      throws IOException {
    Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
    HRegionInfo hri = new HRegionInfo(htd.getTableName(), startKey, endKey);
    MetaTableAccessor.addRegionToMeta(meta, hri);
    meta.close();
    return hri;
  }

  /**
   * Debugging method to dump the contents of meta.
   */
  protected void dumpMeta(TableName tableName) throws IOException {
    List<byte[]> metaRows = TEST_UTIL.getMetaTableRows(tableName);
    for (byte[] row : metaRows) {
      LOG.info(Bytes.toString(row));
    }
  }

  /**
   * This method is used to undeploy a region -- close it and attempt to remove its state from
   * the Master.
   */
  protected void undeployRegion(Connection conn, ServerName sn, HRegionInfo hri)
      throws IOException, InterruptedException {
    try {
      HBaseFsckRepair.closeRegionSilentlyAndWait(conn, sn, hri);
      if (!hri.isMetaTable()) {
        admin.offline(hri.getRegionName());
      }
    } catch (IOException ioe) {
      LOG.warn("Got exception when attempting to offline region "
          + Bytes.toString(hri.getRegionName()), ioe);
    }
  }

  /**
   * Delete a region from assignments, meta, or completely from hdfs.
   * @param unassign if true unassign region if assigned
   * @param metaRow if true remove region's row from META
   * @param hdfs if true remove region's dir in HDFS
   */
  protected void deleteRegion(Configuration conf, final HTableDescriptor htd, byte[] startKey,
      byte[] endKey, boolean unassign, boolean metaRow, boolean hdfs)
      throws IOException, InterruptedException {
    deleteRegion(conf, htd, startKey, endKey, unassign, metaRow, hdfs, false,
        HRegionInfo.DEFAULT_REPLICA_ID);
  }
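  // A sketch of a typical call from a subclass (flag combinations vary by test; "desc" here
  // stands for the HTableDescriptor of the table under test):
  //   deleteRegion(conf, desc, Bytes.toBytes("B"), Bytes.toBytes("C"),
  //       true /* unassign */, true /* remove META row */, false /* keep HDFS dir */);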
  /**
   * Delete a region from assignments, meta, or completely from hdfs.
   * @param unassign if true unassign region if assigned
   * @param metaRow if true remove region's row from META
   * @param hdfs if true remove region's dir in HDFS
   * @param regionInfoOnly if true remove a region dir's .regioninfo file
   * @param replicaId replica id
   */
  protected void deleteRegion(Configuration conf, final HTableDescriptor htd, byte[] startKey,
      byte[] endKey, boolean unassign, boolean metaRow, boolean hdfs, boolean regionInfoOnly,
      int replicaId) throws IOException, InterruptedException {
    LOG.info("** Before delete:");
    dumpMeta(htd.getTableName());

    List<HRegionLocation> locations;
    try (RegionLocator rl = connection.getRegionLocator(tbl.getName())) {
      locations = rl.getAllRegionLocations();
    }

    for (HRegionLocation location : locations) {
      HRegionInfo hri = location.getRegionInfo();
      ServerName hsa = location.getServerName();
      if (Bytes.compareTo(hri.getStartKey(), startKey) == 0
          && Bytes.compareTo(hri.getEndKey(), endKey) == 0
          && hri.getReplicaId() == replicaId) {

        LOG.info("RegionName: " + hri.getRegionNameAsString());
        byte[] deleteRow = hri.getRegionName();

        if (unassign) {
          LOG.info("Undeploying region " + hri + " from server " + hsa);
          undeployRegion(connection, hsa, hri);
        }

        if (regionInfoOnly) {
          LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
          Path rootDir = FSUtils.getRootDir(conf);
          FileSystem fs = rootDir.getFileSystem(conf);
          Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
              hri.getEncodedName());
          Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
          fs.delete(hriPath, true);
        }

        if (hdfs) {
          LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
          Path rootDir = FSUtils.getRootDir(conf);
          FileSystem fs = rootDir.getFileSystem(conf);
          Path p = new Path(FSUtils.getTableDir(rootDir, htd.getTableName()),
              hri.getEncodedName());
          HBaseFsck.debugLsr(conf, p);
          boolean success = fs.delete(p, true);
          LOG.info("Deleted " + p + " successfully? " + success);
          HBaseFsck.debugLsr(conf, p);
        }

        if (metaRow) {
          try (Table meta = connection.getTable(TableName.META_TABLE_NAME,
              tableExecutorService)) {
            Delete delete = new Delete(deleteRow);
            meta.delete(delete);
          }
        }
      }
      LOG.info(hri.toString() + hsa.toString());
    }

    TEST_UTIL.getMetaTableRows(htd.getTableName());
    LOG.info("*** After delete:");
    dumpMeta(htd.getTableName());
  }

  /**
   * Set up a clean table before we start mucking with it.
   *
   * It will set {@link #tbl}, which needs to be closed after the test.
   * @throws IOException
   * @throws InterruptedException
   * @throws KeeperException
   */
  void setupTable(TableName tablename) throws Exception {
    setupTableWithRegionReplica(tablename, 1);
  }

  /**
   * Set up a clean table with a certain region_replica count.
   *
   * It will set {@link #tbl}, which needs to be closed after the test.
   * @throws Exception
   */
  void setupTableWithRegionReplica(TableName tablename, int replicaCount) throws Exception {
    HTableDescriptor desc = new HTableDescriptor(tablename);
    desc.setRegionReplication(replicaCount);
    HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
    desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
    createTable(TEST_UTIL, desc, SPLITS);

    tbl = connection.getTable(tablename, tableExecutorService);
    List<Put> puts = new ArrayList<>(ROWKEYS.length);
    for (byte[] row : ROWKEYS) {
      Put p = new Put(row);
      p.addColumn(FAM, Bytes.toBytes("val"), row);
      puts.add(p);
    }
    tbl.put(puts);
  }
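  // A sketch of the setup/cleanup lifecycle these helpers assume (not taken from any one
  // subclass; the inconsistency introduced and the assertions vary per test):
  //   TableName table = TableName.valueOf(name.getMethodName());
  //   setupTable(table);
  //   try {
  //     assertEquals(ROWKEYS.length, countRows());
  //     // ... introduce an inconsistency, run HBaseFsck, assert it is detected/repaired ...
  //   } finally {
  //     cleanupTable(table);
  //   }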
  /**
   * Set up a clean table with a mob-enabled column.
   * @param tablename The name of a table to be created.
   * @throws Exception
   */
  void setupMobTable(TableName tablename) throws Exception {
    HTableDescriptor desc = new HTableDescriptor(tablename);
    HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
    hcd.setMobEnabled(true);
    hcd.setMobThreshold(0);
    desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
    createTable(TEST_UTIL, desc, SPLITS);

    tbl = connection.getTable(tablename, tableExecutorService);
    List<Put> puts = new ArrayList<>(ROWKEYS.length);
    for (byte[] row : ROWKEYS) {
      Put p = new Put(row);
      p.addColumn(FAM, Bytes.toBytes("val"), row);
      puts.add(p);
    }
    tbl.put(puts);
  }

  /**
   * Counts the number of rows to verify data loss or non-data loss.
   */
  int countRows() throws IOException {
    return TEST_UTIL.countRows(tbl);
  }

  /**
   * Counts the number of rows in the given key range to verify data loss or non-data loss.
   */
  int countRows(byte[] start, byte[] end) throws IOException {
    return TEST_UTIL.countRows(tbl, new Scan(start, end));
  }

  /**
   * Delete the table in preparation for the next test.
   * @param tablename
   * @throws IOException
   */
  void cleanupTable(TableName tablename) throws Exception {
    if (tbl != null) {
      tbl.close();
      tbl = null;
    }
    ((ClusterConnection) connection).clearRegionCache();
    deleteTable(TEST_UTIL, tablename);
  }

  /**
   * Get region info from local cluster.
   */
  Map<ServerName, List<String>> getDeployedHRIs(final Admin admin) throws IOException {
    ClusterStatus status = admin.getClusterStatus();
    Collection<ServerName> regionServers = status.getServers();
    Map<ServerName, List<String>> mm = new HashMap<>();
    for (ServerName hsi : regionServers) {
      AdminProtos.AdminService.BlockingInterface server = connection.getAdmin(hsi);

      // list all online regions from this region server
      List<HRegionInfo> regions = ProtobufUtil.getOnlineRegions(server);
      List<String> regionNames = new ArrayList<>(regions.size());
      for (HRegionInfo hri : regions) {
        regionNames.add(hri.getRegionNameAsString());
      }
      mm.put(hsi, regionNames);
    }
    return mm;
  }

  /**
   * Returns the server name that the given region info is deployed on, or null if not found.
   */
  ServerName findDeployedHSI(Map<ServerName, List<String>> mm, HRegionInfo hri) {
    for (Map.Entry<ServerName, List<String>> e : mm.entrySet()) {
      if (e.getValue().contains(hri.getRegionNameAsString())) {
        return e.getKey();
      }
    }
    return null;
  }

  public void deleteTableDir(TableName table) throws IOException {
    Path rootDir = FSUtils.getRootDir(conf);
    FileSystem fs = rootDir.getFileSystem(conf);
    Path p = FSUtils.getTableDir(rootDir, table);
    HBaseFsck.debugLsr(conf, p);
    boolean success = fs.delete(p, true);
    LOG.info("Deleted " + p + " successfully? " + success);
  }
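  // NOTE: The two helpers below poll the filesystem until a flushed file shows up; they can
  // loop indefinitely if the flush never completes, so callers should run under a test timeout.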
  /**
   * We don't have an easy way to verify that a flush completed, so we loop until we find a
   * legitimate hfile and return it.
   * @param fs
   * @param table
   * @return Path of a flushed hfile.
   * @throws IOException
   */
  Path getFlushedHFile(FileSystem fs, TableName table) throws IOException {
    Path tableDir = FSUtils.getTableDir(FSUtils.getRootDir(conf), table);
    Path regionDir = FSUtils.getRegionDirs(fs, tableDir).get(0);
    Path famDir = new Path(regionDir, FAM_STR);

    // keep doing this until we get a legit hfile
    while (true) {
      FileStatus[] hfFss = fs.listStatus(famDir);
      if (hfFss.length == 0) {
        continue;
      }
      for (FileStatus hfs : hfFss) {
        if (!hfs.isDirectory()) {
          return hfs.getPath();
        }
      }
    }
  }

  /**
   * Gets a flushed mob file.
   * @param fs The current file system.
   * @param table The current table name.
   * @return Path of a flushed mob file.
   * @throws IOException
   */
  Path getFlushedMobFile(FileSystem fs, TableName table) throws IOException {
    Path famDir = MobUtils.getMobFamilyPath(conf, table, FAM_STR);

    // keep doing this until we get a legit hfile
    while (true) {
      FileStatus[] hfFss = fs.listStatus(famDir);
      if (hfFss.length == 0) {
        continue;
      }
      for (FileStatus hfs : hfFss) {
        if (!hfs.isDirectory()) {
          return hfs.getPath();
        }
      }
    }
  }

  /**
   * Creates a new mob file name from the old one, keeping the start key and date and
   * replacing the UUID part.
   * @param oldFileName The old mob file name.
   * @return The new mob file name.
   */
  String createMobFileName(String oldFileName) {
    MobFileName mobFileName = MobFileName.create(oldFileName);
    String startKey = mobFileName.getStartKey();
    String date = mobFileName.getDate();
    return MobFileName.create(startKey, date, UUID.randomUUID().toString().replaceAll("-", ""))
        .getFileName();
  }

  /**
   * Tests that use this should have a timeout, because this method could potentially wait
   * forever.
   */
  protected void doQuarantineTest(TableName table, HBaseFsck hbck, int check, int corrupt,
      int fail, int quar, int missing) throws Exception {
    try {
      setupTable(table);
      assertEquals(ROWKEYS.length, countRows());
      admin.flush(table); // flush is async.

      // Mess it up by leaving a hole in the assignment, meta, and hdfs data
      admin.disableTable(table);

      String[] args = {"-sidelineCorruptHFiles", "-repairHoles", "-ignorePreCheckPermission",
          table.getNameAsString()};
      HBaseFsck res = hbck.exec(hbfsckExecutorService, args);

      HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
      assertEquals(hfcc.getHFilesChecked(), check);
      assertEquals(hfcc.getCorrupted().size(), corrupt);
      assertEquals(hfcc.getFailures().size(), fail);
      assertEquals(hfcc.getQuarantined().size(), quar);
      assertEquals(hfcc.getMissing().size(), missing);

      // it's been fixed, verify that we can enable
      admin.enableTableAsync(table);
      while (!admin.isTableEnabled(table)) {
        try {
          Thread.sleep(250);
        } catch (InterruptedException e) {
          e.printStackTrace();
          fail("Interrupted when trying to enable table " + table);
        }
      }
    } finally {
      cleanupTable(table);
    }
  }

  /**
   * An ErrorReporter stub that simply counts invocations of its callbacks.
   */
  static class MockErrorReporter implements ErrorReporter {
    static int calledCount = 0;

    @Override
    public void clear() {
      calledCount++;
    }

    @Override
    public void report(String message) {
      calledCount++;
    }

    @Override
    public void reportError(String message) {
      calledCount++;
    }

    @Override
    public void reportError(ERROR_CODE errorCode, String message) {
      calledCount++;
    }

    @Override
    public void reportError(ERROR_CODE errorCode, String message, TableInfo table) {
      calledCount++;
    }

    @Override
    public void reportError(ERROR_CODE errorCode, String message, TableInfo table,
        HbckInfo info) {
      calledCount++;
    }

    @Override
    public void reportError(ERROR_CODE errorCode, String message, TableInfo table,
        HbckInfo info1, HbckInfo info2) {
      calledCount++;
    }

    @Override
    public int summarize() {
      return ++calledCount;
    }

    @Override
    public void detail(String details) {
      calledCount++;
    }

    @Override
    public ArrayList<ERROR_CODE> getErrorList() {
      calledCount++;
      return new ArrayList<>();
    }

    @Override
    public void progress() {
      calledCount++;
    }

    @Override
    public void print(String message) {
      calledCount++;
    }

    @Override
    public void resetErrors() {
      calledCount++;
    }

    @Override
    public boolean tableHasErrors(TableInfo table) {
      calledCount++;
      return false;
    }
  }
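  /**
   * Deletes state for the hbase:meta region, analogous to {@link #deleteRegion}: optionally
   * undeploys the region, removes its .regioninfo file, and/or deletes its directory in HDFS.
   */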
  protected void deleteMetaRegion(Configuration conf, boolean unassign, boolean hdfs,
      boolean regionInfoOnly) throws IOException, InterruptedException {
    HRegionLocation metaLocation = connection.getRegionLocator(TableName.META_TABLE_NAME)
        .getRegionLocation(HConstants.EMPTY_START_ROW);
    ServerName hsa = metaLocation.getServerName();
    HRegionInfo hri = metaLocation.getRegionInfo();
    if (unassign) {
      LOG.info("Undeploying meta region " + hri + " from server " + hsa);
      try (Connection unmanagedConnection = ConnectionFactory.createConnection(conf)) {
        undeployRegion(unmanagedConnection, hsa, hri);
      }
    }

    if (regionInfoOnly) {
      LOG.info("deleting hdfs .regioninfo data: " + hri.toString() + hsa.toString());
      Path rootDir = FSUtils.getRootDir(conf);
      FileSystem fs = rootDir.getFileSystem(conf);
      Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
          hri.getEncodedName());
      Path hriPath = new Path(p, HRegionFileSystem.REGION_INFO_FILE);
      fs.delete(hriPath, true);
    }

    if (hdfs) {
      LOG.info("deleting hdfs data: " + hri.toString() + hsa.toString());
      Path rootDir = FSUtils.getRootDir(conf);
      FileSystem fs = rootDir.getFileSystem(conf);
      Path p = new Path(rootDir + "/" + TableName.META_TABLE_NAME.getNameAsString(),
          hri.getEncodedName());
      HBaseFsck.debugLsr(conf, p);
      boolean success = fs.delete(p, true);
      LOG.info("Deleted " + p + " successfully? " + success);
      HBaseFsck.debugLsr(conf, p);
    }
  }

  @org.junit.Rule
  public TestName name = new TestName();

  public static class MasterSyncObserver implements MasterObserver {
    volatile CountDownLatch tableCreationLatch = null;
    volatile CountDownLatch tableDeletionLatch = null;

    @Override
    public void postCompletedCreateTableAction(
        final ObserverContext<MasterCoprocessorEnvironment> ctx,
        final HTableDescriptor desc, final HRegionInfo[] regions) throws IOException {
      // the AccessController test sometimes directly calls only
      // postCompletedCreateTableAction()
      if (tableCreationLatch != null) {
        tableCreationLatch.countDown();
      }
    }

    @Override
    public void postCompletedDeleteTableAction(
        final ObserverContext<MasterCoprocessorEnvironment> ctx,
        final TableName tableName) throws IOException {
      // the AccessController test sometimes directly calls only
      // postCompletedDeleteTableAction()
      if (tableDeletionLatch != null) {
        tableDeletionLatch.countDown();
      }
    }
  }

  public static void createTable(HBaseTestingUtility testUtil, HTableDescriptor htd,
      byte[][] splitKeys) throws Exception {
    // NOTE: We need a latch because the admin operation is not synchronous,
    // so the postOp coprocessor method may be called after the admin operation has returned.
    MasterSyncObserver observer = (MasterSyncObserver) testUtil.getHBaseCluster().getMaster()
        .getMasterCoprocessorHost().findCoprocessor(MasterSyncObserver.class.getName());
    observer.tableCreationLatch = new CountDownLatch(1);
    if (splitKeys != null) {
      admin.createTable(htd, splitKeys);
    } else {
      admin.createTable(htd);
    }
    observer.tableCreationLatch.await();
    observer.tableCreationLatch = null;
    testUtil.waitUntilAllRegionsAssigned(htd.getTableName());
  }

  public static void deleteTable(HBaseTestingUtility testUtil, TableName tableName)
      throws Exception {
    // NOTE: We need a latch because the admin operation is not synchronous,
    // so the postOp coprocessor method may be called after the admin operation has returned.
    MasterSyncObserver observer = (MasterSyncObserver) testUtil.getHBaseCluster().getMaster()
        .getMasterCoprocessorHost().findCoprocessor(MasterSyncObserver.class.getName());
    observer.tableDeletionLatch = new CountDownLatch(1);
    try {
      admin.disableTable(tableName);
    } catch (Exception e) {
      LOG.debug("Table: " + tableName + " already disabled, so just deleting it.");
    }
    admin.deleteTable(tableName);
    observer.tableDeletionLatch.await();
    observer.tableDeletionLatch = null;
  }
}