/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.coordination; import static org.apache.hadoop.hbase.util.CollectionUtils.*; import static org.apache.hadoop.hbase.master.SplitLogManager.ResubmitDirective.CHECK; import static org.apache.hadoop.hbase.master.SplitLogManager.ResubmitDirective.FORCE; import static org.apache.hadoop.hbase.master.SplitLogManager.TerminationStatus.DELETED; import static org.apache.hadoop.hbase.master.SplitLogManager.TerminationStatus.FAILURE; import static org.apache.hadoop.hbase.master.SplitLogManager.TerminationStatus.IN_PROGRESS; import static org.apache.hadoop.hbase.master.SplitLogManager.TerminationStatus.SUCCESS; import java.io.IOException; import java.io.InterruptedIOException; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Set; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.CoordinatedStateManager; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.SplitLogCounters; import org.apache.hadoop.hbase.SplitLogTask; import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.hbase.coordination.ZKSplitLogManagerCoordination.TaskFinisher.Status; import org.apache.hadoop.hbase.exceptions.DeserializationException; import org.apache.hadoop.hbase.master.SplitLogManager.ResubmitDirective; import org.apache.hadoop.hbase.master.SplitLogManager.Task; import org.apache.hadoop.hbase.master.SplitLogManager.TerminationStatus; import org.apache.hadoop.hbase.shaded.protobuf.generated.ZooKeeperProtos.SplitLogTask.RecoveryMode; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; import org.apache.hadoop.hbase.wal.WALSplitter; import org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper; import org.apache.hadoop.hbase.zookeeper.ZKSplitLog; import org.apache.hadoop.hbase.zookeeper.ZKUtil; import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener; import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; import org.apache.hadoop.util.StringUtils; import org.apache.zookeeper.AsyncCallback; import org.apache.zookeeper.CreateMode; import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.KeeperException.NoNodeException; import org.apache.zookeeper.ZooDefs.Ids; import org.apache.zookeeper.data.Stat; /** * ZooKeeper based implementation of * {@link SplitLogManagerCoordination} */ @InterfaceAudience.Private public class ZKSplitLogManagerCoordination extends ZooKeeperListener implements SplitLogManagerCoordination { public 
static final int DEFAULT_TIMEOUT = 120000; public static final int DEFAULT_ZK_RETRIES = 3; public static final int DEFAULT_MAX_RESUBMIT = 3; private static final Log LOG = LogFactory.getLog(SplitLogManagerCoordination.class); private final TaskFinisher taskFinisher; private final Configuration conf; private long zkretries; private long resubmitThreshold; private long timeout; SplitLogManagerDetails details; // When lastRecoveringNodeCreationTime is older than the following threshold, we'll check // whether to GC stale recovering znodes private volatile long lastRecoveringNodeCreationTime = 0; public boolean ignoreZKDeleteForTesting = false; private RecoveryMode recoveryMode; private boolean isDrainingDone = false; public ZKSplitLogManagerCoordination(final CoordinatedStateManager manager, ZooKeeperWatcher watcher) { super(watcher); taskFinisher = new TaskFinisher() { @Override public Status finish(ServerName workerName, String logfile) { try { WALSplitter.finishSplitLogFile(logfile, manager.getServer().getConfiguration()); } catch (IOException e) { LOG.warn("Could not finish splitting of log file " + logfile, e); return Status.ERR; } return Status.DONE; } }; this.conf = manager.getServer().getConfiguration(); } @Override public void init() throws IOException { this.zkretries = conf.getLong("hbase.splitlog.zk.retries", DEFAULT_ZK_RETRIES); this.resubmitThreshold = conf.getLong("hbase.splitlog.max.resubmit", DEFAULT_MAX_RESUBMIT); this.timeout = conf.getInt(HConstants.HBASE_SPLITLOG_MANAGER_TIMEOUT, DEFAULT_TIMEOUT); setRecoveryMode(true); if (this.watcher != null) { this.watcher.registerListener(this); lookForOrphans(); } } @Override public String prepareTask(String taskname) { return ZKSplitLog.getEncodedNodeName(watcher, taskname); } @Override public int remainingTasksInCoordination() { int count = 0; try { List<String> tasks = ZKUtil.listChildrenNoWatch(watcher, watcher.znodePaths.splitLogZNode); if (tasks != null) { int listSize = tasks.size(); for (int i = 0; i < listSize; i++) { if (!ZKSplitLog.isRescanNode(tasks.get(i))) { count++; } } } } catch (KeeperException ke) { LOG.warn("Failed to check remaining tasks", ke); count = -1; } return count; } /** * It is possible for a task to stay in UNASSIGNED state indefinitely - say SplitLogManager wants * to resubmit a task. It forces the task to UNASSIGNED state but it dies before it could create * the RESCAN task node to signal the SplitLogWorkers to pick up the task. To prevent this * scenario the SplitLogManager resubmits all orphan and UNASSIGNED tasks at startup. * @param path */ private void handleUnassignedTask(String path) { if (ZKSplitLog.isRescanNode(watcher, path)) { return; } Task task = findOrCreateOrphanTask(path); if (task.isOrphan() && (task.incarnation.get() == 0)) { LOG.info("resubmitting unassigned orphan task " + path); // ignore failure to resubmit. The timeout-monitor will handle it later // albeit in a more crude fashion resubmitTask(path, task, FORCE); } } @Override public void deleteTask(String path) { deleteNode(path, zkretries); } @Override public boolean resubmitTask(String path, Task task, ResubmitDirective directive) { // its ok if this thread misses the update to task.deleted. It will fail later if (task.status != IN_PROGRESS) { return false; } int version; if (directive != FORCE) { // We're going to resubmit: // 1) immediately if the worker server is now marked as dead // 2) after a configurable timeout if the server is not marked as dead but has still not // finished the task. 
      //    This allows us to continue if the worker cannot actually handle it,
      //    for any reason.
      final long time = EnvironmentEdgeManager.currentTime() - task.last_update;
      final boolean alive =
          details.getMaster().getServerManager() != null ? details.getMaster().getServerManager()
              .isServerOnline(task.cur_worker_name) : true;
      if (alive && time < timeout) {
        LOG.trace("Skipping the resubmit of " + task.toString() + " because the server "
            + task.cur_worker_name + " is not marked as dead, we waited for " + time
            + " while the timeout is " + timeout);
        return false;
      }
      if (task.unforcedResubmits.get() >= resubmitThreshold) {
        if (!task.resubmitThresholdReached) {
          task.resubmitThresholdReached = true;
          SplitLogCounters.tot_mgr_resubmit_threshold_reached.incrementAndGet();
          LOG.info("Skipping resubmissions of task " + path + " because threshold "
              + resubmitThreshold + " reached");
        }
        return false;
      }
      // race with heartbeat() that might be changing last_version
      version = task.last_version;
    } else {
      SplitLogCounters.tot_mgr_resubmit_force.incrementAndGet();
      version = -1;
    }
    LOG.info("resubmitting task " + path);
    task.incarnation.incrementAndGet();
    boolean result = resubmit(this.details.getServerName(), path, version);
    if (!result) {
      task.heartbeatNoDetails(EnvironmentEdgeManager.currentTime());
      return false;
    }
    // don't count forced resubmits
    if (directive != FORCE) {
      task.unforcedResubmits.incrementAndGet();
    }
    task.setUnassigned();
    rescan(Long.MAX_VALUE);
    SplitLogCounters.tot_mgr_resubmit.incrementAndGet();
    return true;
  }

  @Override
  public void checkTasks() {
    rescan(Long.MAX_VALUE);
  }

  /**
   * Signal the workers that a task was resubmitted by creating the RESCAN node.
   */
  private void rescan(long retries) {
    // The RESCAN node will be deleted almost immediately by the
    // SplitLogManager as soon as it is created because it is being
    // created in the DONE state. This behavior prevents a buildup
    // of RESCAN nodes. But there is also a chance that a SplitLogWorker
    // might miss the watch-trigger that creation of RESCAN node provides.
    // Since the TimeoutMonitor will keep resubmitting UNASSIGNED tasks,
    // this behavior is safe.
    SplitLogTask slt = new SplitLogTask.Done(this.details.getServerName(), getRecoveryMode());
    this.watcher
        .getRecoverableZooKeeper()
        .getZooKeeper()
        .create(ZKSplitLog.getRescanNode(watcher), slt.toByteArray(), Ids.OPEN_ACL_UNSAFE,
          CreateMode.EPHEMERAL_SEQUENTIAL, new CreateRescanAsyncCallback(), Long.valueOf(retries));
  }

  @Override
  public void submitTask(String path) {
    createNode(path, zkretries);
  }

  @Override
  public void checkTaskStillAvailable(String path) {
    // A negative retry count will lead to ignoring all error processing.
this.watcher .getRecoverableZooKeeper() .getZooKeeper() .getData(path, this.watcher, new GetDataAsyncCallback(), Long.valueOf(-1) /* retry count */); SplitLogCounters.tot_mgr_get_data_queued.incrementAndGet(); } /** * It removes recovering regions under /hbase/recovering-regions/[encoded region name] so that the * region server hosting the region can allow reads to the recovered region * @param recoveredServerNameSet servers which are just recovered * @param isMetaRecovery whether current recovery is for the meta region on * <code>serverNames</code> */ @Override public void removeRecoveringRegions(final Set<String> recoveredServerNameSet, Boolean isMetaRecovery) throws IOException { final String metaEncodeRegionName = HRegionInfo.FIRST_META_REGIONINFO.getEncodedName(); int count = 0; try { List<String> tasks = ZKUtil.listChildrenNoWatch(watcher, watcher.znodePaths.splitLogZNode); if (tasks != null) { int listSize = tasks.size(); for (int i = 0; i < listSize; i++) { if (!ZKSplitLog.isRescanNode(tasks.get(i))) { count++; } } } if (count == 0 && this.details.getMaster().isInitialized() && !this.details.getMaster().getServerManager().areDeadServersInProgress()) { // No splitting work items left ZKSplitLog.deleteRecoveringRegionZNodes(watcher, null); // reset lastRecoveringNodeCreationTime because we cleared all recovering znodes at // this point. lastRecoveringNodeCreationTime = Long.MAX_VALUE; } else if (!recoveredServerNameSet.isEmpty()) { // Remove recovering regions which don't have any RS associated with it List<String> regions = ZKUtil.listChildrenNoWatch(watcher, watcher.znodePaths.recoveringRegionsZNode); if (regions != null) { int listSize = regions.size(); if (LOG.isDebugEnabled()) { LOG.debug("Processing recovering " + regions + " and servers " + recoveredServerNameSet + ", isMetaRecovery=" + isMetaRecovery); } for (int i = 0; i < listSize; i++) { String region = regions.get(i); if (isMetaRecovery != null) { if ((isMetaRecovery && !region.equalsIgnoreCase(metaEncodeRegionName)) || (!isMetaRecovery && region.equalsIgnoreCase(metaEncodeRegionName))) { // skip non-meta regions when recovering the meta region or // skip the meta region when recovering user regions continue; } } String nodePath = ZKUtil.joinZNode(watcher.znodePaths.recoveringRegionsZNode, region); List<String> failedServers = ZKUtil.listChildrenNoWatch(watcher, nodePath); if (failedServers == null || failedServers.isEmpty()) { ZKUtil.deleteNode(watcher, nodePath); continue; } if (recoveredServerNameSet.containsAll(failedServers)) { ZKUtil.deleteNodeRecursively(watcher, nodePath); } else { int tmpFailedServerSize = failedServers.size(); for (int j = 0; j < tmpFailedServerSize; j++) { String failedServer = failedServers.get(j); if (recoveredServerNameSet.contains(failedServer)) { String tmpPath = ZKUtil.joinZNode(nodePath, failedServer); ZKUtil.deleteNode(watcher, tmpPath); } } } } } } } catch (KeeperException ke) { LOG.warn("removeRecoveringRegionsFromZK got zookeeper exception. Will retry", ke); throw new IOException(ke); } } private void deleteNode(String path, Long retries) { SplitLogCounters.tot_mgr_node_delete_queued.incrementAndGet(); // Once a task znode is ready for delete, that is it is in the TASK_DONE // state, then no one should be writing to it anymore. That is no one // will be updating the znode version any more. 
this.watcher.getRecoverableZooKeeper().getZooKeeper() .delete(path, -1, new DeleteAsyncCallback(), retries); } private void deleteNodeSuccess(String path) { if (ignoreZKDeleteForTesting) { return; } Task task; task = details.getTasks().remove(path); if (task == null) { if (ZKSplitLog.isRescanNode(watcher, path)) { SplitLogCounters.tot_mgr_rescan_deleted.incrementAndGet(); } SplitLogCounters.tot_mgr_missing_state_in_delete.incrementAndGet(); LOG.debug("deleted task without in memory state " + path); return; } synchronized (task) { task.status = DELETED; task.notify(); } SplitLogCounters.tot_mgr_task_deleted.incrementAndGet(); } private void deleteNodeFailure(String path) { LOG.info("Failed to delete node " + path + " and will retry soon."); return; } private void createRescanSuccess(String path) { SplitLogCounters.tot_mgr_rescan.incrementAndGet(); getDataSetWatch(path, zkretries); } private void createRescanFailure() { LOG.fatal("logic failure, rescan failure must not happen"); } /** * Helper function to check whether to abandon retries in ZooKeeper AsyncCallback functions * @param statusCode integer value of a ZooKeeper exception code * @param action description message about the retried action * @return true when need to abandon retries otherwise false */ private boolean needAbandonRetries(int statusCode, String action) { if (statusCode == KeeperException.Code.SESSIONEXPIRED.intValue()) { LOG.error("ZK session expired. Master is expected to shut down. Abandoning retries for " + "action=" + action); return true; } return false; } private void createNode(String path, Long retry_count) { SplitLogTask slt = new SplitLogTask.Unassigned(details.getServerName(), getRecoveryMode()); ZKUtil.asyncCreate(this.watcher, path, slt.toByteArray(), new CreateAsyncCallback(), retry_count); SplitLogCounters.tot_mgr_node_create_queued.incrementAndGet(); return; } private void createNodeSuccess(String path) { LOG.debug("put up splitlog task at znode " + path); getDataSetWatch(path, zkretries); } private void createNodeFailure(String path) { // TODO the Manager should split the log locally instead of giving up LOG.warn("failed to create task node" + path); setDone(path, FAILURE); } private void getDataSetWatch(String path, Long retry_count) { this.watcher.getRecoverableZooKeeper().getZooKeeper() .getData(path, this.watcher, new GetDataAsyncCallback(), retry_count); SplitLogCounters.tot_mgr_get_data_queued.incrementAndGet(); } private void getDataSetWatchSuccess(String path, byte[] data, int version) throws DeserializationException { if (data == null) { if (version == Integer.MIN_VALUE) { // assume all done. The task znode suddenly disappeared. 
setDone(path, SUCCESS); return; } SplitLogCounters.tot_mgr_null_data.incrementAndGet(); LOG.fatal("logic error - got null data " + path); setDone(path, FAILURE); return; } data = RecoverableZooKeeper.removeMetaData(data); SplitLogTask slt = SplitLogTask.parseFrom(data); if (slt.isUnassigned()) { LOG.debug("task not yet acquired " + path + " ver = " + version); handleUnassignedTask(path); } else if (slt.isOwned()) { heartbeat(path, version, slt.getServerName()); } else if (slt.isResigned()) { LOG.info("task " + path + " entered state: " + slt.toString()); resubmitOrFail(path, FORCE); } else if (slt.isDone()) { LOG.info("task " + path + " entered state: " + slt.toString()); if (taskFinisher != null && !ZKSplitLog.isRescanNode(watcher, path)) { if (taskFinisher.finish(slt.getServerName(), ZKSplitLog.getFileName(path)) == Status.DONE) { setDone(path, SUCCESS); } else { resubmitOrFail(path, CHECK); } } else { setDone(path, SUCCESS); } } else if (slt.isErr()) { LOG.info("task " + path + " entered state: " + slt.toString()); resubmitOrFail(path, CHECK); } else { LOG.fatal("logic error - unexpected zk state for path = " + path + " data = " + slt.toString()); setDone(path, FAILURE); } } private void resubmitOrFail(String path, ResubmitDirective directive) { if (resubmitTask(path, findOrCreateOrphanTask(path), directive) == false) { setDone(path, FAILURE); } } private void getDataSetWatchFailure(String path) { LOG.warn("failed to set data watch " + path); setDone(path, FAILURE); } private void setDone(String path, TerminationStatus status) { Task task = details.getTasks().get(path); if (task == null) { if (!ZKSplitLog.isRescanNode(watcher, path)) { SplitLogCounters.tot_mgr_unacquired_orphan_done.incrementAndGet(); LOG.debug("unacquired orphan task is done " + path); } } else { synchronized (task) { if (task.status == IN_PROGRESS) { if (status == SUCCESS) { SplitLogCounters.tot_mgr_log_split_success.incrementAndGet(); LOG.info("Done splitting " + path); } else { SplitLogCounters.tot_mgr_log_split_err.incrementAndGet(); LOG.warn("Error splitting " + path); } task.status = status; if (task.batch != null) { synchronized (task.batch) { if (status == SUCCESS) { task.batch.done++; } else { task.batch.error++; } task.batch.notify(); } } } } } // delete the task node in zk. It's an async // call and no one is blocked waiting for this node to be deleted. All // task names are unique (log.<timestamp>) there is no risk of deleting // a future task. // if a deletion fails, TimeoutMonitor will retry the same deletion later deleteNode(path, zkretries); return; } private Task findOrCreateOrphanTask(String path) { return computeIfAbsent(details.getTasks(), path, Task::new, () -> { LOG.info("creating orphan task " + path); SplitLogCounters.tot_mgr_orphan_task_acquired.incrementAndGet(); }); } private void heartbeat(String path, int new_version, ServerName workerName) { Task task = findOrCreateOrphanTask(path); if (new_version != task.last_version) { if (task.isUnassigned()) { LOG.info("task " + path + " acquired by " + workerName); } task.heartbeat(EnvironmentEdgeManager.currentTime(), new_version, workerName); SplitLogCounters.tot_mgr_heartbeat.incrementAndGet(); } else { // duplicate heartbeats - heartbeats w/o zk node version // changing - are possible. 
      // The timeout thread does
      // getDataSetWatch() just to check whether a node still
      // exists or not
    }
    return;
  }

  private void lookForOrphans() {
    List<String> orphans;
    try {
      orphans = ZKUtil.listChildrenNoWatch(this.watcher, this.watcher.znodePaths.splitLogZNode);
      if (orphans == null) {
        LOG.warn("could not get children of " + this.watcher.znodePaths.splitLogZNode);
        return;
      }
    } catch (KeeperException e) {
      LOG.warn("could not get children of " + this.watcher.znodePaths.splitLogZNode + " "
          + StringUtils.stringifyException(e));
      return;
    }
    int rescan_nodes = 0;
    int listSize = orphans.size();
    for (int i = 0; i < listSize; i++) {
      String path = orphans.get(i);
      String nodepath = ZKUtil.joinZNode(watcher.znodePaths.splitLogZNode, path);
      if (ZKSplitLog.isRescanNode(watcher, nodepath)) {
        rescan_nodes++;
        LOG.debug("found orphan rescan node " + path);
      } else {
        LOG.info("found orphan task " + path);
      }
      getDataSetWatch(nodepath, zkretries);
    }
    LOG.info("Found " + (orphans.size() - rescan_nodes) + " orphan tasks and " + rescan_nodes
        + " rescan nodes");
  }

  /**
   * Create znodes /hbase/recovering-regions/[region_ids...]/[failed region server names ...] for
   * all regions of the passed-in region server
   * @param serverName the name of a region server
   * @param userRegions user regions assigned on the region server
   */
  @Override
  public void markRegionsRecovering(final ServerName serverName, Set<HRegionInfo> userRegions)
      throws IOException, InterruptedIOException {
    this.lastRecoveringNodeCreationTime = EnvironmentEdgeManager.currentTime();
    for (HRegionInfo region : userRegions) {
      String regionEncodeName = region.getEncodedName();
      long retries = this.zkretries;
      do {
        String nodePath =
            ZKUtil.joinZNode(watcher.znodePaths.recoveringRegionsZNode, regionEncodeName);
        long lastRecordedFlushedSequenceId = -1;
        try {
          long lastSequenceId = this.details.getMaster().getServerManager()
              .getLastFlushedSequenceId(regionEncodeName.getBytes()).getLastFlushedSequenceId();
          /*
           * znode layout: .../region_id[last known flushed sequence id]/failed server[last known
           * flushed sequence id for the server]
           */
          byte[] data = ZKUtil.getData(this.watcher, nodePath);
          if (data == null) {
            ZKUtil
                .createSetData(this.watcher, nodePath, ZKUtil.positionToByteArray(lastSequenceId));
          } else {
            lastRecordedFlushedSequenceId = ZKSplitLog.parseLastFlushedSequenceIdFrom(data);
            if (lastRecordedFlushedSequenceId < lastSequenceId) {
              // update last flushed sequence id in the region level
              ZKUtil.setData(this.watcher, nodePath, ZKUtil.positionToByteArray(lastSequenceId));
            }
          }
          // go one level deeper with server name
          nodePath = ZKUtil.joinZNode(nodePath, serverName.getServerName());
          if (lastSequenceId <= lastRecordedFlushedSequenceId) {
            // the newly assigned RS failed even before any flush to the region
            lastSequenceId = lastRecordedFlushedSequenceId;
          }
          ZKUtil.createSetData(this.watcher, nodePath,
            ZKUtil.regionSequenceIdsToByteArray(lastSequenceId, null));
          if (LOG.isDebugEnabled()) {
            LOG.debug("Marked " + regionEncodeName + " recovering from " + serverName + ": "
                + nodePath);
          }
          // break retry loop
          break;
        } catch (KeeperException e) {
          // ignore ZooKeeper exceptions inside retry loop
          if (retries <= 1) {
            throw new IOException(e);
          }
          // wait a little bit for retry
          try {
            Thread.sleep(20);
          } catch (InterruptedException e1) {
            throw new InterruptedIOException();
          }
        } catch (InterruptedException e) {
          throw new InterruptedIOException();
        }
      } while ((--retries) > 0);
    }
  }

  @Override
  public void nodeDataChanged(String path) {
    Task task;
    task = details.getTasks().get(path);
    if (task != null ||
ZKSplitLog.isRescanNode(watcher, path)) { if (task != null) { task.heartbeatNoDetails(EnvironmentEdgeManager.currentTime()); } getDataSetWatch(path, zkretries); } } /** * ZooKeeper implementation of * {@link SplitLogManagerCoordination#removeStaleRecoveringRegions(Set)} */ @Override public void removeStaleRecoveringRegions(final Set<String> knownFailedServers) throws IOException, InterruptedIOException { try { List<String> tasks = ZKUtil.listChildrenNoWatch(watcher, watcher.znodePaths.splitLogZNode); if (tasks != null) { int listSize = tasks.size(); for (int i = 0; i < listSize; i++) { String t = tasks.get(i); byte[] data; try { data = ZKUtil.getData(this.watcher, ZKUtil.joinZNode(watcher.znodePaths.splitLogZNode, t)); } catch (InterruptedException e) { throw new InterruptedIOException(); } if (data != null) { SplitLogTask slt = null; try { slt = SplitLogTask.parseFrom(data); } catch (DeserializationException e) { LOG.warn("Failed parse data for znode " + t, e); } if (slt != null && slt.isDone()) { continue; } } // decode the file name t = ZKSplitLog.getFileName(t); ServerName serverName = AbstractFSWALProvider .getServerNameFromWALDirectoryName(new Path(t)); if (serverName != null) { knownFailedServers.add(serverName.getServerName()); } else { LOG.warn("Found invalid WAL log file name:" + t); } } } // remove recovering regions which doesn't have any RS associated with it List<String> regions = ZKUtil.listChildrenNoWatch(watcher, watcher.znodePaths.recoveringRegionsZNode); if (regions != null) { int listSize = regions.size(); for (int i = 0; i < listSize; i++) { String nodePath = ZKUtil.joinZNode(watcher.znodePaths.recoveringRegionsZNode, regions.get(i)); List<String> regionFailedServers = ZKUtil.listChildrenNoWatch(watcher, nodePath); if (regionFailedServers == null || regionFailedServers.isEmpty()) { ZKUtil.deleteNode(watcher, nodePath); continue; } boolean needMoreRecovery = false; int tmpFailedServerSize = regionFailedServers.size(); for (int j = 0; j < tmpFailedServerSize; j++) { if (knownFailedServers.contains(regionFailedServers.get(j))) { needMoreRecovery = true; break; } } if (!needMoreRecovery) { ZKUtil.deleteNodeRecursively(watcher, nodePath); } } } } catch (KeeperException e) { throw new IOException(e); } } @Override public synchronized boolean isReplaying() { return this.recoveryMode == RecoveryMode.LOG_REPLAY; } @Override public synchronized boolean isSplitting() { return this.recoveryMode == RecoveryMode.LOG_SPLITTING; } private List<String> listSplitLogTasks() throws KeeperException { List<String> taskOrRescanList = ZKUtil.listChildrenNoWatch(watcher, watcher.znodePaths.splitLogZNode); if (taskOrRescanList == null || taskOrRescanList.isEmpty()) { return Collections.<String> emptyList(); } List<String> taskList = new ArrayList<>(); for (String taskOrRescan : taskOrRescanList) { // Remove rescan nodes if (!ZKSplitLog.isRescanNode(taskOrRescan)) { taskList.add(taskOrRescan); } } return taskList; } /** * This function is to set recovery mode from outstanding split log tasks from before or current * configuration setting * @param isForInitialization * @throws IOException */ @Override public void setRecoveryMode(boolean isForInitialization) throws IOException { synchronized(this) { if (this.isDrainingDone) { // when there is no outstanding splitlogtask after master start up, we already have up to // date recovery mode return; } } if (this.watcher == null) { // when watcher is null(testing code) and recovery mode can only be LOG_SPLITTING synchronized(this) { this.isDrainingDone = 
true; this.recoveryMode = RecoveryMode.LOG_SPLITTING; } return; } boolean hasSplitLogTask = false; boolean hasRecoveringRegions = false; RecoveryMode previousRecoveryMode = RecoveryMode.UNKNOWN; RecoveryMode recoveryModeInConfig = (isDistributedLogReplay(conf)) ? RecoveryMode.LOG_REPLAY : RecoveryMode.LOG_SPLITTING; // Firstly check if there are outstanding recovering regions try { List<String> regions = ZKUtil.listChildrenNoWatch(watcher, watcher.znodePaths.recoveringRegionsZNode); if (regions != null && !regions.isEmpty()) { hasRecoveringRegions = true; previousRecoveryMode = RecoveryMode.LOG_REPLAY; } if (previousRecoveryMode == RecoveryMode.UNKNOWN) { // Secondly check if there are outstanding split log task List<String> tasks = listSplitLogTasks(); if (!tasks.isEmpty()) { hasSplitLogTask = true; if (isForInitialization) { // during initialization, try to get recovery mode from splitlogtask int listSize = tasks.size(); for (int i = 0; i < listSize; i++) { String task = tasks.get(i); try { byte[] data = ZKUtil.getData(this.watcher, ZKUtil.joinZNode(watcher.znodePaths.splitLogZNode, task)); if (data == null) continue; SplitLogTask slt = SplitLogTask.parseFrom(data); previousRecoveryMode = slt.getMode(); if (previousRecoveryMode == RecoveryMode.UNKNOWN) { // created by old code base where we don't set recovery mode in splitlogtask // we can safely set to LOG_SPLITTING because we're in master initialization code // before SSH is enabled & there is no outstanding recovering regions previousRecoveryMode = RecoveryMode.LOG_SPLITTING; } break; } catch (DeserializationException e) { LOG.warn("Failed parse data for znode " + task, e); } catch (InterruptedException e) { throw new InterruptedIOException(); } } } } } } catch (KeeperException e) { throw new IOException(e); } synchronized (this) { if (this.isDrainingDone) { return; } if (!hasSplitLogTask && !hasRecoveringRegions) { this.isDrainingDone = true; this.recoveryMode = recoveryModeInConfig; return; } else if (!isForInitialization) { // splitlogtask hasn't drained yet, keep existing recovery mode return; } if (previousRecoveryMode != RecoveryMode.UNKNOWN) { this.isDrainingDone = (previousRecoveryMode == recoveryModeInConfig); this.recoveryMode = previousRecoveryMode; } else { this.recoveryMode = recoveryModeInConfig; } } } /** * Returns if distributed log replay is turned on or not * @param conf * @return true when distributed log replay is turned on */ private boolean isDistributedLogReplay(Configuration conf) { return false; } private boolean resubmit(ServerName serverName, String path, int version) { try { // blocking zk call but this is done from the timeout thread SplitLogTask slt = new SplitLogTask.Unassigned(this.details.getServerName(), getRecoveryMode()); if (ZKUtil.setData(this.watcher, path, slt.toByteArray(), version) == false) { LOG.debug("failed to resubmit task " + path + " version changed"); return false; } } catch (NoNodeException e) { LOG.warn("failed to resubmit because znode doesn't exist " + path + " task done (or forced done by removing the znode)"); try { getDataSetWatchSuccess(path, null, Integer.MIN_VALUE); } catch (DeserializationException e1) { LOG.debug("Failed to re-resubmit task " + path + " because of deserialization issue", e1); return false; } return false; } catch (KeeperException.BadVersionException e) { LOG.debug("failed to resubmit task " + path + " version changed"); return false; } catch (KeeperException e) { SplitLogCounters.tot_mgr_resubmit_failed.incrementAndGet(); LOG.warn("failed to resubmit " + 
        path, e);
      return false;
    }
    return true;
  }

  /**
   * {@link org.apache.hadoop.hbase.master.SplitLogManager} can use objects implementing this
   * interface to finish off a partially done task by
   * {@link org.apache.hadoop.hbase.regionserver.SplitLogWorker}. This provides a
   * serialization point at the end of the task processing. Must be restartable and idempotent.
   */
  public interface TaskFinisher {
    /**
     * status that can be returned by finish()
     */
    enum Status {
      /**
       * task completed successfully
       */
      DONE(),
      /**
       * task completed with error
       */
      ERR();
    }

    /**
     * finish the partially done task. workerName provides a clue to where the partial results of
     * the partially done tasks are present. taskname is the name of the task that was put up in
     * ZooKeeper.
     * <p>
     * @param workerName
     * @param taskname
     * @return DONE if task completed successfully, ERR otherwise
     */
    Status finish(ServerName workerName, String taskname);
  }

  /**
   * Asynchronous handler for zk create node results. Retries on failures.
   */
  public class CreateAsyncCallback implements AsyncCallback.StringCallback {
    private final Log LOG = LogFactory.getLog(CreateAsyncCallback.class);

    @Override
    public void processResult(int rc, String path, Object ctx, String name) {
      SplitLogCounters.tot_mgr_node_create_result.incrementAndGet();
      if (rc != 0) {
        if (needAbandonRetries(rc, "Create znode " + path)) {
          createNodeFailure(path);
          return;
        }
        if (rc == KeeperException.Code.NODEEXISTS.intValue()) {
          // What if there is a delete pending against this pre-existing
          // znode? Then this soon-to-be-deleted task znode must be in TASK_DONE
          // state. Only operations that will be carried out on this node by
          // this manager are get-znode-data, task-finisher and delete-znode.
          // And all code pieces correctly handle the case of suddenly
          // disappearing task-znode.
          LOG.debug("found pre-existing znode " + path);
          SplitLogCounters.tot_mgr_node_already_exists.incrementAndGet();
        } else {
          Long retry_count = (Long) ctx;
          LOG.warn("create rc =" + KeeperException.Code.get(rc) + " for " + path
              + " remaining retries=" + retry_count);
          if (retry_count == 0) {
            SplitLogCounters.tot_mgr_node_create_err.incrementAndGet();
            createNodeFailure(path);
          } else {
            SplitLogCounters.tot_mgr_node_create_retry.incrementAndGet();
            createNode(path, retry_count - 1);
          }
          return;
        }
      }
      createNodeSuccess(path);
    }
  }

  /**
   * Asynchronous handler for zk get-data-set-watch on node results. Retries on failures.
   */
  public class GetDataAsyncCallback implements AsyncCallback.DataCallback {
    private final Log LOG = LogFactory.getLog(GetDataAsyncCallback.class);

    @Override
    public void processResult(int rc, String path, Object ctx, byte[] data, Stat stat) {
      SplitLogCounters.tot_mgr_get_data_result.incrementAndGet();
      if (rc != 0) {
        if (needAbandonRetries(rc, "GetData from znode " + path)) {
          return;
        }
        if (rc == KeeperException.Code.NONODE.intValue()) {
          SplitLogCounters.tot_mgr_get_data_nonode.incrementAndGet();
          LOG.warn("task znode " + path + " vanished or not created yet.");
          // ignore since we should not end up in a case where there is in-memory task,
          // but no znode. The only case is between the time task is created in-memory
          // and the znode is created. See HBASE-11217.
          return;
        }
        Long retry_count = (Long) ctx;
        if (retry_count < 0) {
          LOG.warn("getdata rc = " + KeeperException.Code.get(rc) + " " + path
              + ". Ignoring error. No error handling. No retrying.");
          return;
        }
        LOG.warn("getdata rc = " + KeeperException.Code.get(rc) + " " + path
            + " remaining retries=" + retry_count);
        if (retry_count == 0) {
          SplitLogCounters.tot_mgr_get_data_err.incrementAndGet();
          getDataSetWatchFailure(path);
        } else {
          SplitLogCounters.tot_mgr_get_data_retry.incrementAndGet();
          getDataSetWatch(path, retry_count - 1);
        }
        return;
      }
      try {
        getDataSetWatchSuccess(path, data, stat.getVersion());
      } catch (DeserializationException e) {
        LOG.warn("Deserialization problem", e);
      }
      return;
    }
  }

  /**
   * Asynchronous handler for zk delete node results. Retries on failures.
   */
  public class DeleteAsyncCallback implements AsyncCallback.VoidCallback {
    private final Log LOG = LogFactory.getLog(DeleteAsyncCallback.class);

    @Override
    public void processResult(int rc, String path, Object ctx) {
      SplitLogCounters.tot_mgr_node_delete_result.incrementAndGet();
      if (rc != 0) {
        if (needAbandonRetries(rc, "Delete znode " + path)) {
          details.getFailedDeletions().add(path);
          return;
        }
        if (rc != KeeperException.Code.NONODE.intValue()) {
          SplitLogCounters.tot_mgr_node_delete_err.incrementAndGet();
          Long retry_count = (Long) ctx;
          LOG.warn("delete rc=" + KeeperException.Code.get(rc) + " for " + path
              + " remaining retries=" + retry_count);
          if (retry_count == 0) {
            LOG.warn("delete failed " + path);
            details.getFailedDeletions().add(path);
            deleteNodeFailure(path);
          } else {
            deleteNode(path, retry_count - 1);
          }
          return;
        } else {
          LOG.info(path + " does not exist. Either was created but deleted behind our"
              + " back by another pending delete OR was deleted"
              + " in earlier retry rounds. zkretries = " + ctx);
        }
      } else {
        LOG.debug("deleted " + path);
      }
      deleteNodeSuccess(path);
    }
  }

  /**
   * Asynchronous handler for zk create RESCAN-node results. Retries on failures.
   * <p>
   * A RESCAN node is created using the EPHEMERAL_SEQUENTIAL flag. It is a signal for all the
   * {@link org.apache.hadoop.hbase.regionserver.SplitLogWorker}s to rescan for new tasks.
   */
  public class CreateRescanAsyncCallback implements AsyncCallback.StringCallback {
    private final Log LOG = LogFactory.getLog(CreateRescanAsyncCallback.class);

    @Override
    public void processResult(int rc, String path, Object ctx, String name) {
      if (rc != 0) {
        if (needAbandonRetries(rc, "CreateRescan znode " + path)) {
          return;
        }
        Long retry_count = (Long) ctx;
        LOG.warn("rc=" + KeeperException.Code.get(rc) + " for " + path + " remaining retries="
            + retry_count);
        if (retry_count == 0) {
          createRescanFailure();
        } else {
          rescan(retry_count - 1);
        }
        return;
      }
      // path is the original arg, name is the actual name that was created
      createRescanSuccess(name);
    }
  }

  @Override
  public void setDetails(SplitLogManagerDetails details) {
    this.details = details;
  }

  @Override
  public SplitLogManagerDetails getDetails() {
    return details;
  }

  @Override
  public synchronized RecoveryMode getRecoveryMode() {
    return recoveryMode;
  }

  @Override
  public long getLastRecoveryTime() {
    return lastRecoveringNodeCreationTime;
  }

  /**
   * Temporary function that is used by unit tests only
   */
  public void setIgnoreDeleteForTesting(boolean b) {
    ignoreZKDeleteForTesting = b;
  }
}