/**
* Copyright 2010 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.master;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.EOFException;
import java.net.ConnectException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Chore;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HServerAddress;
import org.apache.hadoop.hbase.HServerInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.NotServingRegionException;
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.Stoppable;
import org.apache.hadoop.hbase.catalog.CatalogTracker;
import org.apache.hadoop.hbase.catalog.MetaReader;
import org.apache.hadoop.hbase.catalog.RootLocationEditor;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.executor.ExecutorService;
import org.apache.hadoop.hbase.executor.RegionTransitionData;
import org.apache.hadoop.hbase.executor.EventHandler.EventType;
import org.apache.hadoop.hbase.master.LoadBalancer.RegionPlan;
import org.apache.hadoop.hbase.master.handler.ClosedRegionHandler;
import org.apache.hadoop.hbase.master.handler.OpenedRegionHandler;
import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
import org.apache.hadoop.hbase.zookeeper.ZKTable;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.hbase.zookeeper.ZKUtil.NodeAndData;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.zookeeper.AsyncCallback;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.NoNodeException;
import org.apache.zookeeper.data.Stat;
/**
* Manages and performs region assignment.
* <p>
* Monitors ZooKeeper for events related to regions in transition.
* <p>
* Handles existing regions in transition during master failover.
*/
public class AssignmentManager extends ZooKeeperListener {
private static final Log LOG = LogFactory.getLog(AssignmentManager.class);
protected Server master;
private ServerManager serverManager;
private CatalogTracker catalogTracker;
private TimeoutMonitor timeoutMonitor;
/*
* Maximum number of times we retry an assignment. See the retry loop in
* {@link #assign(RegionState, boolean, boolean)}.
*/
private final int maximumAssignmentAttempts;
/**
* Regions currently in transition. Map of encoded region names to the master
* in-memory state for that region.
*/
final ConcurrentSkipListMap<String, RegionState> regionsInTransition =
new ConcurrentSkipListMap<String, RegionState>();
/** Plans for region movement. Key is the encoded version of a region name*/
// TODO: When do plans get cleaned out? Ever? In server open and in server
// shutdown processing -- St.Ack
// All access to this Map must be synchronized.
final NavigableMap<String, RegionPlan> regionPlans =
new TreeMap<String, RegionPlan>();
private final ZKTable zkTable;
/**
* Server to regions assignment map.
* Contains the set of regions currently assigned to a given server.
* This Map and {@link #regions} are tied. Always update this in tandem
* with the other under a lock on {@link #regions}
* @see #regions
*/
private final NavigableMap<HServerInfo, List<HRegionInfo>> servers =
new TreeMap<HServerInfo, List<HRegionInfo>>();
/**
* Region to server assignment map.
* Contains the server a given region is currently assigned to.
* This Map and {@link #servers} are tied. Always update this in tandem
* with the other under a lock on {@link #regions}
* @see #servers
*/
private final SortedMap<HRegionInfo,HServerInfo> regions =
new TreeMap<HRegionInfo,HServerInfo>();
private final ExecutorService executorService;
/**
* Constructs a new assignment manager.
*
* @param master
* @param serverManager
* @param catalogTracker
* @param service
* @throws KeeperException
*/
public AssignmentManager(Server master, ServerManager serverManager,
CatalogTracker catalogTracker, final ExecutorService service)
throws KeeperException {
super(master.getZooKeeper());
this.master = master;
this.serverManager = serverManager;
this.catalogTracker = catalogTracker;
this.executorService = service;
Configuration conf = master.getConfiguration();
this.timeoutMonitor = new TimeoutMonitor(
conf.getInt("hbase.master.assignment.timeoutmonitor.period", 10000),
master,
conf.getInt("hbase.master.assignment.timeoutmonitor.timeout", 30000));
Threads.setDaemonThreadRunning(timeoutMonitor,
master.getServerName() + ".timeoutMonitor");
this.zkTable = new ZKTable(this.master.getZooKeeper());
this.maximumAssignmentAttempts =
this.master.getConfiguration().getInt("hbase.assignment.maximum.attempts", 10);
}
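// The timeout monitor knobs read above are plain Configuration keys. A
// hedged sketch of overriding the defaults (values are illustrative only,
// not recommendations):
//
//   Configuration conf = HBaseConfiguration.create();
//   conf.setInt("hbase.master.assignment.timeoutmonitor.period", 5000);
//   conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 60000);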
/**
* @return Instance of ZKTable.
*/
public ZKTable getZKTable() {
// These are 'expensive' to create, involving a trip to the zk ensemble,
// so allow sharing of the one instance.
return this.zkTable;
}
/**
* Reset all unassigned znodes. Called on startup of master.
* Call {@link #assignAllUserRegions()} after root and meta have been assigned.
* @throws IOException
* @throws KeeperException
*/
void cleanoutUnassigned() throws IOException, KeeperException {
// Cleanup any existing ZK nodes and start watching
ZKAssign.deleteAllNodes(watcher);
ZKUtil.listChildrenAndWatchForNewChildren(this.watcher,
this.watcher.assignmentZNode);
}
/**
* Handle failover. Restore state from META and ZK. Handle any regions in
* transition. Presumes <code>.META.</code> and <code>-ROOT-</code> deployed.
* @throws KeeperException
* @throws IOException
*/
void processFailover() throws KeeperException, IOException {
// Concurrency note: In the below the accesses on regionsInTransition are
// outside of a synchronization block where usually all accesses to RIT are
// synchronized. The presumption is that in this case it is safe since this
// method is being run by a single thread on startup.
// TODO: Check list of user regions and their assignments against regionservers.
// TODO: Regions that have a null location and are not in regionsInTransitions
// need to be handled.
// Scan META to build list of existing regions, servers, and assignment
// Returns servers who have not checked in (assumed dead) and their regions
Map<HServerInfo,List<Pair<HRegionInfo,Result>>> deadServers =
rebuildUserRegions();
// Process list of dead servers
processDeadServers(deadServers);
// Check existing regions in transition
List<String> nodes = ZKUtil.listChildrenAndWatchForNewChildren(watcher,
watcher.assignmentZNode);
if (nodes.isEmpty()) {
LOG.info("No regions in transition in ZK to process on failover");
return;
}
LOG.info("Failed-over master needs to process " + nodes.size() +
" regions in transition");
for (String encodedRegionName: nodes) {
processRegionInTransition(encodedRegionName, null);
}
}
/**
* If region is up in zk in transition, then do fixup and block and wait until
* the region is assigned and out of transition. Used on startup for
* catalog regions.
* @param hri Region to look for.
* @return True if we processed a region in transition else false if region
* was not up in zk in transition.
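* <p>
* An illustrative usage sketch (hypothetical caller, assuming an
* AssignmentManager <code>am</code>):
* <pre>
*   // Replay any pending transition for ROOT, blocking until it settles;
*   // returns false right away if nothing was up in zk.
*   if (!am.processRegionInTransitionAndBlockUntilAssigned(
*       HRegionInfo.ROOT_REGIONINFO)) {
*     // ROOT was not in transition; assign it through the normal path.
*   }
* </pre>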
* @throws InterruptedException
* @throws KeeperException
* @throws IOException
*/
boolean processRegionInTransitionAndBlockUntilAssigned(final HRegionInfo hri)
throws InterruptedException, KeeperException, IOException {
boolean inTransition = processRegionInTransition(hri.getEncodedName(), hri);
if (!inTransition) return inTransition;
synchronized(this.regionsInTransition) {
while (!this.master.isStopped() &&
this.regionsInTransition.containsKey(hri.getEncodedName())) {
this.regionsInTransition.wait();
}
}
return inTransition;
}
/**
* Process failover of <code>encodedRegionName</code>. Looks in zk for the
* region's transition data and replays the pending transition.
* @param encodedRegionName Region to process failover for.
* @param regionInfo RegionInfo. If null we'll go get it from the meta table.
* @return True if the region was in transition up in zk, false otherwise.
* @throws KeeperException
* @throws IOException
*/
boolean processRegionInTransition(final String encodedRegionName,
final HRegionInfo regionInfo)
throws KeeperException, IOException {
RegionTransitionData data = ZKAssign.getData(watcher, encodedRegionName);
if (data == null) return false;
HRegionInfo hri = (regionInfo != null)? regionInfo:
MetaReader.getRegion(catalogTracker, data.getRegionName()).getFirst();
processRegionsInTransition(data, hri);
return true;
}
void processRegionsInTransition(final RegionTransitionData data,
final HRegionInfo regionInfo)
throws KeeperException {
String encodedRegionName = regionInfo.getEncodedName();
LOG.info("Processing region " + regionInfo.getRegionNameAsString() +
" in state " + data.getEventType());
synchronized (regionsInTransition) {
switch (data.getEventType()) {
case RS_ZK_REGION_CLOSING:
// Just insert region into RIT.
// If this never updates the timeout will trigger new assignment
regionsInTransition.put(encodedRegionName, new RegionState(
regionInfo, RegionState.State.CLOSING, data.getStamp()));
break;
case RS_ZK_REGION_CLOSED:
// Region is closed, insert into RIT and handle it
regionsInTransition.put(encodedRegionName, new RegionState(
regionInfo, RegionState.State.CLOSED, data.getStamp()));
new ClosedRegionHandler(master, this, regionInfo).process();
break;
case M_ZK_REGION_OFFLINE:
// Region is offline, insert into RIT and handle it like a closed
regionsInTransition.put(encodedRegionName, new RegionState(
regionInfo, RegionState.State.OFFLINE, data.getStamp()));
new ClosedRegionHandler(master, this, regionInfo).process();
break;
case RS_ZK_REGION_OPENING:
// Just insert region into RIT
// If this never updates the timeout will trigger new assignment
regionsInTransition.put(encodedRegionName, new RegionState(
regionInfo, RegionState.State.OPENING, data.getStamp()));
break;
case RS_ZK_REGION_OPENED:
// Region is opened, insert into RIT and handle it
regionsInTransition.put(encodedRegionName, new RegionState(
regionInfo, RegionState.State.OPENING, data.getStamp()));
HServerInfo hsi = serverManager.getServerInfo(data.getServerName());
// hsi could be null if this server is no longer online. If that's
// the case, just let this RIT time out; it'll be assigned to a new
// server then.
if (hsi == null) {
LOG.warn("Region in transition " + regionInfo.getEncodedName() +
" references a server no longer up " + data.getServerName() +
"; letting RIT timeout so will be assigned elsewhere");
break;
}
new OpenedRegionHandler(master, this, regionInfo, hsi).process();
break;
}
}
}
/**
* Handles various states an unassigned node can be in.
* <p>
* Method is called when a state change is suspected for an unassigned node.
* <p>
* This deals with skipped transitions (we got a CLOSED but didn't see CLOSING
* yet).
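* <p>
* For reference, the happy-path sequences the switch below expects are
* (in-memory state + zk event -> new in-memory state):
* <pre>
*   PENDING_OPEN  + RS_ZK_REGION_OPENING -> OPENING
*   OPENING       + RS_ZK_REGION_OPENED  -> OPEN
*   PENDING_CLOSE + RS_ZK_REGION_CLOSING -> CLOSING
*   CLOSING       + RS_ZK_REGION_CLOSED  -> CLOSED
* </pre>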
* @param data
*/
private void handleRegion(final RegionTransitionData data) {
synchronized(regionsInTransition) {
if (data == null || data.getServerName() == null) {
LOG.warn("Unexpected NULL input " + data);
return;
}
// Check if this is a special HBCK transition
if (data.getServerName().equals(HConstants.HBCK_CODE_NAME)) {
handleHBCK(data);
return;
}
// Verify this is a known server
if (!serverManager.isServerOnline(data.getServerName()) &&
!this.master.getServerName().equals(data.getServerName())) {
LOG.warn("Attempted to handle region transition for server but " +
"server is not online: " + data.getRegionName());
return;
}
String encodedName = HRegionInfo.encodeRegionName(data.getRegionName());
String prettyPrintedRegionName = HRegionInfo.prettyPrint(encodedName);
LOG.debug("Handling transition=" + data.getEventType() +
", server=" + data.getServerName() + ", region=" + prettyPrintedRegionName);
RegionState regionState = regionsInTransition.get(encodedName);
switch (data.getEventType()) {
case M_ZK_REGION_OFFLINE:
// Nothing to do.
break;
case RS_ZK_REGION_CLOSING:
// Should see CLOSING after we have asked it to CLOSE or additional
// times after already being in state of CLOSING
if (regionState == null ||
(!regionState.isPendingClose() && !regionState.isClosing())) {
LOG.warn("Received CLOSING for region " + prettyPrintedRegionName +
" from server " + data.getServerName() + " but region was in " +
" the state " + regionState + " and not " +
"in expected PENDING_CLOSE or CLOSING states");
return;
}
// Transition to CLOSING (or update stamp if already CLOSING)
regionState.update(RegionState.State.CLOSING, data.getStamp());
break;
case RS_ZK_REGION_CLOSED:
// Should see CLOSED after CLOSING but possible after PENDING_CLOSE
if (regionState == null ||
(!regionState.isPendingClose() && !regionState.isClosing())) {
LOG.warn("Received CLOSED for region " + prettyPrintedRegionName +
" from server " + data.getServerName() + " but region was in " +
" the state " + regionState + " and not " +
"in expected PENDING_CLOSE or CLOSING states");
return;
}
// Handle CLOSED by assigning elsewhere or stopping if a disable
// If we got here all is good. Need to update RegionState -- else
// what follows will fail because not in expected state.
regionState.update(RegionState.State.CLOSED, data.getStamp());
this.executorService.submit(new ClosedRegionHandler(master,
this, regionState.getRegion()));
break;
case RS_ZK_REGION_OPENING:
// Should see OPENING after we have asked it to OPEN or additional
// times after already being in state of OPENING
if(regionState == null ||
(!regionState.isPendingOpen() && !regionState.isOpening())) {
LOG.warn("Received OPENING for region " +
prettyPrintedRegionName +
" from server " + data.getServerName() + " but region was in " +
" the state " + regionState + " and not " +
"in expected PENDING_OPEN or OPENING states");
return;
}
// Transition to OPENING (or update stamp if already OPENING)
regionState.update(RegionState.State.OPENING, data.getStamp());
break;
case RS_ZK_REGION_OPENED:
// Should see OPENED after OPENING but possible after PENDING_OPEN
if(regionState == null ||
(!regionState.isPendingOpen() && !regionState.isOpening())) {
LOG.warn("Received OPENED for region " +
prettyPrintedRegionName +
" from server " + data.getServerName() + " but region was in " +
" the state " + regionState + " and not " +
"in expected PENDING_OPEN or OPENING states");
return;
}
// Handle OPENED by removing from transition and deleting the zk node
regionState.update(RegionState.State.OPEN, data.getStamp());
this.executorService.submit(
new OpenedRegionHandler(master, this, regionState.getRegion(),
this.serverManager.getServerInfo(data.getServerName())));
break;
}
}
}
/**
* Handle a ZK unassigned node transition triggered by HBCK repair tool.
* <p>
* This is handled in a separate code path because it breaks the normal rules.
* @param data
*/
private void handleHBCK(RegionTransitionData data) {
String encodedName = HRegionInfo.encodeRegionName(data.getRegionName());
LOG.info("Handling HBCK triggered transition=" + data.getEventType() +
", server=" + data.getServerName() + ", region=" +
HRegionInfo.prettyPrint(encodedName));
RegionState regionState = regionsInTransition.get(encodedName);
switch (data.getEventType()) {
case M_ZK_REGION_OFFLINE:
HRegionInfo regionInfo = null;
if (regionState != null) {
regionInfo = regionState.getRegion();
} else {
try {
regionInfo = MetaReader.getRegion(catalogTracker,
data.getRegionName()).getFirst();
} catch (IOException e) {
LOG.info("Exception reading META doing HBCK repair operation", e);
return;
}
}
LOG.info("HBCK repair is triggering assignment of region=" +
regionInfo.getRegionNameAsString());
// trigger assign, node is already in OFFLINE so don't need to update ZK
assign(regionInfo, false);
break;
default:
LOG.warn("Received unexpected region state from HBCK (" +
data.getEventType() + ")");
break;
}
}
// ZooKeeper events
/**
* New unassigned node has been created.
*
* <p>This happens when an RS begins the OPENING or CLOSING of a region by
* creating an unassigned node.
*
* <p>When this happens we must:
* <ol>
* <li>Watch the node for further events</li>
* <li>Read and handle the state in the node</li>
* </ol>
*/
@Override
public void nodeCreated(String path) {
if(path.startsWith(watcher.assignmentZNode)) {
synchronized(regionsInTransition) {
try {
RegionTransitionData data = ZKAssign.getData(watcher, path);
if(data == null) {
return;
}
handleRegion(data);
} catch (KeeperException e) {
master.abort("Unexpected ZK exception reading unassigned node data", e);
}
}
}
}
/**
* Existing unassigned node has had data changed.
*
* <p>This happens when an RS transitions from OFFLINE to OPENING, or between
* OPENING/OPENED and CLOSING/CLOSED.
*
* <p>When this happens we must:
* <ol>
* <li>Watch the node for further events</li>
* <li>Read and handle the state in the node</li>
* </ol>
*/
@Override
public void nodeDataChanged(String path) {
if(path.startsWith(watcher.assignmentZNode)) {
synchronized(regionsInTransition) {
try {
RegionTransitionData data = ZKAssign.getData(watcher, path);
if(data == null) {
return;
}
handleRegion(data);
} catch (KeeperException e) {
master.abort("Unexpected ZK exception reading unassigned node data", e);
}
}
}
}
/**
* New unassigned node(s) have been created under the assignment znode.
*
* <p>This happens when an RS begins the OPENING or CLOSING of a region by
* creating an unassigned node.
*
* <p>When this happens we must:
* <ol>
* <li>Watch the node for further children changed events</li>
* <li>Watch all new children for changed events</li>
* <li>Read all children and handle them</li>
* </ol>
*/
@Override
public void nodeChildrenChanged(String path) {
if(path.equals(watcher.assignmentZNode)) {
synchronized(regionsInTransition) {
try {
List<NodeAndData> newNodes = ZKUtil.watchAndGetNewChildren(watcher,
watcher.assignmentZNode);
for(NodeAndData newNode : newNodes) {
LOG.debug("Handling new unassigned node: " + newNode);
handleRegion(RegionTransitionData.fromBytes(newNode.getData()));
}
} catch(KeeperException e) {
master.abort("Unexpected ZK exception reading unassigned children", e);
}
}
}
}
/**
* Marks the region as online. Removes it from regions in transition and
* updates the in-memory assignment information.
* <p>
* Used when a region has been successfully opened on a region server.
* @param regionInfo
* @param serverInfo
*/
public void regionOnline(HRegionInfo regionInfo, HServerInfo serverInfo) {
synchronized (this.regionsInTransition) {
RegionState rs =
this.regionsInTransition.remove(regionInfo.getEncodedName());
if (rs != null) {
this.regionsInTransition.notifyAll();
}
}
synchronized (this.regions) {
// Add check
HServerInfo hsi = this.regions.get(regionInfo);
if (hsi != null) LOG.warn("Overwriting " + regionInfo.getEncodedName() +
" on " + hsi);
this.regions.put(regionInfo, serverInfo);
addToServers(serverInfo, regionInfo);
// Wake any callers blocked in waitForAssignment waiting on this.regions.
this.regions.notifyAll();
}
// Remove plan if one.
clearRegionPlan(regionInfo);
// Update timers for all regions in transition going against this server.
updateTimers(serverInfo);
}
/**
* Touch timers for all regions in transition that have the passed
* <code>hsi</code> in common.
* Call this method whenever a server checks in. Doing so helps the case where
* a new regionserver has joined the cluster and it's been given 1k regions to
* open. If this method is tickled every time a region reports in a
* successful open then the 1k-th region won't be timed out just because it's
* sitting behind the open of 999 other regions. This method is NOT used
* as part of bulk assign -- there we have a different mechanism for extending
* the regions in transition timer (we turn it off temporarily because
* there is no regionplan involved when bulk assigning).
* @param hsi
*/
private void updateTimers(final HServerInfo hsi) {
// This loop could be expensive.
// First make a copy of the current regionPlans rather than hold the sync
// while looping, because holding the sync can cause deadlock. It's ok in
// this loop if the Map we're going against is a little stale.
Map<String, RegionPlan> copy = new HashMap<String, RegionPlan>();
synchronized(this.regionPlans) {
copy.putAll(this.regionPlans);
}
for (Map.Entry<String, RegionPlan> e: copy.entrySet()) {
if (!e.getValue().getDestination().equals(hsi)) continue;
RegionState rs = null;
synchronized (this.regionsInTransition) {
rs = this.regionsInTransition.get(e.getKey());
}
if (rs == null) continue;
synchronized (rs) {
rs.update(rs.getState());
}
}
}
/**
* Marks the region as offline. Removes it from regions in transition and
* removes in-memory assignment information.
* <p>
* Used when a region has been closed and should remain closed.
* @param regionInfo
*/
public void regionOffline(final HRegionInfo regionInfo) {
synchronized(this.regionsInTransition) {
if (this.regionsInTransition.remove(regionInfo.getEncodedName()) != null) {
this.regionsInTransition.notifyAll();
}
}
// remove the region plan as well just in case.
clearRegionPlan(regionInfo);
setOffline(regionInfo);
}
/**
* Sets the region as offline by removing in-memory assignment information but
* retaining transition information.
* <p>
* Used when a region has been closed but should be reassigned.
* @param regionInfo
*/
public void setOffline(HRegionInfo regionInfo) {
synchronized (this.regions) {
HServerInfo serverInfo = this.regions.remove(regionInfo);
if (serverInfo == null) return;
List<HRegionInfo> serverRegions = this.servers.get(serverInfo);
if (!serverRegions.remove(regionInfo)) {
LOG.warn("No " + regionInfo + " on " + serverInfo);
}
}
}
public void offlineDisabledRegion(HRegionInfo regionInfo) {
// Disabling so should not be reassigned, just delete the CLOSED node
LOG.debug("Table being disabled so deleting ZK node and removing from " +
"regions in transition, skipping assignment of region " +
regionInfo.getRegionNameAsString());
try {
if (!ZKAssign.deleteClosedNode(watcher, regionInfo.getEncodedName())) {
// Could also be in OFFLINE mode
ZKAssign.deleteOfflineNode(watcher, regionInfo.getEncodedName());
}
} catch (KeeperException.NoNodeException nne) {
LOG.debug("Tried to delete closed node for " + regionInfo + " but it " +
"does not exist so just offlining");
} catch (KeeperException e) {
this.master.abort("Error deleting CLOSED node in ZK", e);
}
regionOffline(regionInfo);
}
// Assignment methods
/**
* Assigns the specified region.
* <p>
* If a RegionPlan is available with a valid destination then it will be used
* to determine what server region is assigned to. If no RegionPlan is
* available, region will be assigned to a random available server.
* <p>
* Updates the RegionState and sends the OPEN RPC.
* <p>
* This will only succeed if the region is in transition and in a CLOSED or
* OFFLINE state or not in transition (in-memory not zk), and of course, the
* chosen server is up and running (It may have just crashed!). If the
* in-memory checks pass, the zk node is forced to OFFLINE before assigning.
*
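* An illustrative usage sketch (caller code assumed, not prescribed by
* this class):
* <pre>
*   // Force the region's znode OFFLINE, then assign per an existing or
*   // freshly generated plan.
*   assignmentManager.assign(hri, true);
* </pre>
*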
* @param region region to be assigned
* @param setOfflineInZK whether ZK node should be created/transitioned to an
* OFFLINE state before assigning the region
*/
public void assign(HRegionInfo region, boolean setOfflineInZK) {
assign(region, setOfflineInZK, false);
}
public void assign(HRegionInfo region, boolean setOfflineInZK,
boolean forceNewPlan) {
String tableName = region.getTableDesc().getNameAsString();
boolean disabled = this.zkTable.isDisabledTable(tableName);
if (disabled || this.zkTable.isDisablingTable(tableName)) {
LOG.info("Table " + tableName + (disabled? " disabled;": " disabling;") +
" skipping assign of " + region.getRegionNameAsString());
offlineDisabledRegion(region);
return;
}
if (this.serverManager.isClusterShutdown()) {
LOG.info("Cluster shutdown is set; skipping assign of " +
region.getRegionNameAsString());
return;
}
RegionState state = addToRegionsInTransition(region);
synchronized (state) {
assign(state, setOfflineInZK, forceNewPlan);
}
}
/**
* Bulk assign regions to <code>destination</code>. If we fail in any way,
* we'll abort the server.
* @param destination
* @param regions Regions to assign.
*/
void assign(final HServerInfo destination,
final List<HRegionInfo> regions) {
LOG.debug("Bulk assigning " + regions.size() + " region(s) to " +
destination.getServerName());
List<RegionState> states = new ArrayList<RegionState>(regions.size());
synchronized (this.regionsInTransition) {
for (HRegionInfo region: regions) {
states.add(forceRegionStateToOffline(region));
}
}
// Presumption is that only this thread will be updating the state at this
// time; i.e. handlers on backend won't be trying to set it to OPEN, etc.
AtomicInteger counter = new AtomicInteger(0);
CreateUnassignedAsyncCallback cb =
new CreateUnassignedAsyncCallback(this.watcher, destination, counter);
for (RegionState state: states) {
if (!asyncSetOfflineInZooKeeper(state, cb, state)) {
return;
}
}
// Wait until all unassigned nodes have been put up and watchers set.
int total = regions.size();
for (int oldCounter = 0; true;) {
int count = counter.get();
if (oldCounter != count) {
LOG.info(destination.getServerName() + " unassigned znodes=" + count +
" of total=" + total);
oldCounter = count;
}
if (count == total) break;
Threads.sleep(1);
}
// Move on to open regions.
try {
// Send OPEN RPC. This can fail if the server on the other end is not up.
this.serverManager.sendRegionOpen(destination, regions);
} catch (Throwable t) {
this.master.abort("Failed assignment of regions to " + destination, t);
return;
}
LOG.debug("Bulk assigning done for " + destination.getServerName());
}
/**
* Callback handler for create unassigned znodes used during bulk assign.
*/
static class CreateUnassignedAsyncCallback implements AsyncCallback.StringCallback {
private final Log LOG = LogFactory.getLog(CreateUnassignedAsyncCallback.class);
private final ZooKeeperWatcher zkw;
private final HServerInfo destination;
private final AtomicInteger counter;
CreateUnassignedAsyncCallback(final ZooKeeperWatcher zkw,
final HServerInfo destination, final AtomicInteger counter) {
this.zkw = zkw;
this.destination = destination;
this.counter = counter;
}
@Override
public void processResult(int rc, String path, Object ctx, String name) {
if (rc != 0) {
// This is the result code. If non-zero, we need to resubmit.
LOG.warn("rc != 0 for " + path + " -- retryable connectionloss -- " +
"FIX see http://wiki.apache.org/hadoop/ZooKeeper/FAQ#A2");
this.zkw.abort("Connection loss writing unassigned at " + path +
", rc=" + rc, null);
return;
}
LOG.debug("rs=" + (RegionState)ctx + ", server=" + this.destination.getServerName());
// Async exists to set a watcher so we'll get triggered when
// unassigned node changes.
this.zkw.getZooKeeper().exists(path, this.zkw,
new ExistsUnassignedAsyncCallback(this.counter), ctx);
}
}
/**
* Callback handler for the exists call that sets watcher on unassigned znodes.
* Used during bulk assign on startup.
*/
static class ExistsUnassignedAsyncCallback implements AsyncCallback.StatCallback {
private final Log LOG = LogFactory.getLog(ExistsUnassignedAsyncCallback.class);
private final AtomicInteger counter;
ExistsUnassignedAsyncCallback(final AtomicInteger counter) {
this.counter = counter;
}
@Override
public void processResult(int rc, String path, Object ctx, Stat stat) {
if (rc != 0) {
// This is the result code. If non-zero, we need to resubmit.
LOG.warn("rc != 0 for " + path + " -- retryable connectionloss -- " +
"FIX see http://wiki.apache.org/hadoop/ZooKeeper/FAQ#A2");
return;
}
RegionState state = (RegionState)ctx;
LOG.debug("rs=" + state);
// Transition RegionState to PENDING_OPEN here in master; means we've
// sent the open. We're a little ahead of ourselves here since we've not
// yet sent out the actual open but putting this state change after the
// call to open risks our writing PENDING_OPEN after state has been moved
// to OPENING by the regionserver.
state.update(RegionState.State.PENDING_OPEN);
this.counter.addAndGet(1);
}
}
/**
* @param region Region to insert into regions-in-transition, forced OFFLINE.
* @return The region's current RegionState, updated to OFFLINE.
*/
private RegionState addToRegionsInTransition(final HRegionInfo region) {
synchronized (regionsInTransition) {
return forceRegionStateToOffline(region);
}
}
/**
* Sets the region's {@link RegionState} to {@link RegionState.State#OFFLINE}.
* Caller must hold lock on this.regionsInTransition.
* @param region
* @return Amended RegionState.
*/
private RegionState forceRegionStateToOffline(final HRegionInfo region) {
String encodedName = region.getEncodedName();
RegionState state = this.regionsInTransition.get(encodedName);
if (state == null) {
state = new RegionState(region, RegionState.State.OFFLINE);
this.regionsInTransition.put(encodedName, state);
} else {
LOG.debug("Forcing OFFLINE; was=" + state);
state.update(RegionState.State.OFFLINE);
}
return state;
}
/**
* Caller must hold lock on the passed <code>state</code> object.
* @param state
* @param setOfflineInZK
* @param forceNewPlan
*/
private void assign(final RegionState state, final boolean setOfflineInZK,
final boolean forceNewPlan) {
for (int i = 0; i < this.maximumAssignmentAttempts; i++) {
if (setOfflineInZK && !setOfflineInZooKeeper(state)) return;
if (this.master.isStopped()) {
LOG.debug("Server stopped; skipping assign of " + state);
return;
}
RegionPlan plan = getRegionPlan(state, forceNewPlan);
if (plan == null) return; // Should get reassigned later when RIT times out.
try {
LOG.debug("Assigning region " + state.getRegion().getRegionNameAsString() +
" to " + plan.getDestination().getServerName());
// Transition RegionState to PENDING_OPEN
state.update(RegionState.State.PENDING_OPEN);
// Send OPEN RPC. This can fail if the server on the other end is not up.
serverManager.sendRegionOpen(plan.getDestination(), state.getRegion());
break;
} catch (Throwable t) {
LOG.warn("Failed assignment of " +
state.getRegion().getRegionNameAsString() + " to " +
plan.getDestination() + ", trying to assign elsewhere instead; " +
"retry=" + i, t);
// Clean out the plan we failed to execute; it doesn't look like it'll
// succeed anyway. We need a new plan!
// Transition back to OFFLINE
state.update(RegionState.State.OFFLINE);
// Force a new plan and reassign. Will return null if no servers.
if (getRegionPlan(state, plan.getDestination(), true) == null) {
LOG.warn("Unable to find a viable location to assign region " +
state.getRegion().getRegionNameAsString());
return;
}
}
}
}
/**
* Set region as OFFLINED up in zookeeper
* @param state
* @return True if we succeeded, false otherwise (State was incorrect or failed
* updating zk).
*/
boolean setOfflineInZooKeeper(final RegionState state) {
if (!state.isClosed() && !state.isOffline()) {
this.master.abort("Unexpected state trying to OFFLINE; " + state,
new IllegalStateException("Unexpected state trying to OFFLINE; " + state));
return false;
}
state.update(RegionState.State.OFFLINE);
try {
if(!ZKAssign.createOrForceNodeOffline(master.getZooKeeper(),
state.getRegion(), master.getServerName())) {
LOG.warn("Attempted to create/force node into OFFLINE state before " +
"completing assignment but failed to do so for " + state);
return false;
}
} catch (KeeperException e) {
master.abort("Unexpected ZK exception creating/setting node OFFLINE", e);
return false;
}
return true;
}
/**
* Set region as OFFLINED up in zookeeper asynchronously.
* @param state
* @return True if we succeeded, false otherwise (State was incorrect or failed
* updating zk).
*/
boolean asyncSetOfflineInZooKeeper(final RegionState state,
final AsyncCallback.StringCallback cb, final Object ctx) {
if (!state.isClosed() && !state.isOffline()) {
this.master.abort("Unexpected state trying to OFFLINE; " + state,
new IllegalStateException("Unexpected state trying to OFFLINE; " + state));
return false;
}
state.update(RegionState.State.OFFLINE);
try {
ZKAssign.asyncCreateNodeOffline(master.getZooKeeper(), state.getRegion(),
master.getServerName(), cb, ctx);
} catch (KeeperException e) {
master.abort("Unexpected ZK exception creating/setting node OFFLINE", e);
return false;
}
return true;
}
/**
* @param state
* @return Plan for passed <code>state</code> (If none currently, it creates one or
* if no servers to assign, it returns null).
*/
RegionPlan getRegionPlan(final RegionState state,
final boolean forceNewPlan) {
return getRegionPlan(state, null, forceNewPlan);
}
/**
* @param state
* @param serverToExclude Server to exclude (we know its bad). Pass null if
* all servers are thought to be assignable.
* @param forceNewPlan If true, then if an existing plan exists, a new plan
* will be generated.
* @return Plan for passed <code>state</code> (If none currently, it creates one or
* if no servers to assign, it returns null).
*/
RegionPlan getRegionPlan(final RegionState state,
final HServerInfo serverToExclude, final boolean forceNewPlan) {
// Pickup existing plan or make a new one
String encodedName = state.getRegion().getEncodedName();
List<HServerInfo> servers = this.serverManager.getOnlineServersList();
// The remove below hinges on the fact that the call to
// serverManager.getOnlineServersList() returns a copy
if (serverToExclude != null) servers.remove(serverToExclude);
if (servers.isEmpty()) return null;
RegionPlan randomPlan = new RegionPlan(state.getRegion(), null,
LoadBalancer.randomAssignment(servers));
boolean newPlan = false;
RegionPlan existingPlan = null;
synchronized (this.regionPlans) {
existingPlan = this.regionPlans.get(encodedName);
if (existingPlan == null || forceNewPlan ||
existingPlan.getDestination().equals(serverToExclude)) {
newPlan = true;
this.regionPlans.put(encodedName, randomPlan);
}
}
if (newPlan) {
LOG.debug("No previous transition plan was found (or we are ignoring " +
"an existing plan) for " + state.getRegion().getRegionNameAsString() +
" so generated a random one; " + randomPlan + "; " +
serverManager.countOfRegionServers() +
" (online=" + serverManager.getOnlineServers().size() +
", exclude=" + serverToExclude + ") available servers");
return randomPlan;
}
LOG.debug("Using pre-existing plan for region " +
state.getRegion().getRegionNameAsString() + "; plan=" + existingPlan);
return existingPlan;
}
/**
* Unassigns the specified region.
* <p>
* Updates the RegionState and sends the CLOSE RPC.
* <p>
* If a RegionPlan is already set, it will remain.
*
* @param region region to be unassigned
*/
public void unassign(HRegionInfo region) {
unassign(region, false);
}
/**
* Unassigns the specified region.
* <p>
* Updates the RegionState and sends the CLOSE RPC.
* <p>
* If a RegionPlan is already set, it will remain.
*
* @param region region to be unassigned
* @param force if region should be closed even if already closing
*/
public void unassign(HRegionInfo region, boolean force) {
LOG.debug("Starting unassignment of region " +
region.getRegionNameAsString() + " (offlining)");
synchronized (this.regions) {
// Check if this region is currently assigned
if (!regions.containsKey(region)) {
LOG.debug("Attempted to unassign region " +
region.getRegionNameAsString() + " but it is not " +
"currently assigned anywhere");
return;
}
}
String encodedName = region.getEncodedName();
// Grab the state of this region and synchronize on it
RegionState state;
synchronized (regionsInTransition) {
state = regionsInTransition.get(encodedName);
if (state == null) {
state = new RegionState(region, RegionState.State.PENDING_CLOSE);
regionsInTransition.put(encodedName, state);
} else if (force && state.isPendingClose()) {
LOG.debug("Attempting to unassign region " +
region.getRegionNameAsString() + " which is already pending close "
+ "but forcing an additional close");
state.update(RegionState.State.PENDING_CLOSE);
} else {
LOG.debug("Attempting to unassign region " +
region.getRegionNameAsString() + " but it is " +
"already in transition (" + state.getState() + ")");
return;
}
}
// Send CLOSE RPC
HServerInfo server = null;
synchronized (this.regions) {
server = regions.get(region);
}
try {
// TODO: We should consider making this look more like it does for the
// region open where we catch all throwables and never abort
if (serverManager.sendRegionClose(server, state.getRegion())) {
LOG.debug("Sent CLOSE to " + server + " for region " +
region.getRegionNameAsString());
return;
}
// This never happens. Currently regionserver close always returns true.
LOG.debug("Server " + server + " region CLOSE RPC returned false for " +
region.getEncodedName());
} catch (NotServingRegionException nsre) {
LOG.info("Server " + server + " returned " + nsre + " for " +
region.getEncodedName());
// Presume that master has stale data. Presume remote side just split.
// Presume that the split message when it comes in will fix up the master's
// in memory cluster state.
return;
} catch (ConnectException e) {
LOG.info("Failed connect to " + server + ", message=" + e.getMessage() +
", region=" + region.getEncodedName());
// Presume that regionserver just failed and we haven't got expired
// server from zk yet. Let expired server deal with clean up.
} catch (java.net.SocketTimeoutException e) {
LOG.info("Server " + server + " returned " + e.getMessage() + " for " +
region.getEncodedName());
// Presume retry or server will expire.
} catch (EOFException e) {
LOG.info("Server " + server + " returned " + e.getMessage() + " for " +
region.getEncodedName());
// Presume retry or server will expire.
} catch (RemoteException re) {
IOException ioe = re.unwrapRemoteException();
if (ioe instanceof NotServingRegionException) {
// Failed to close, so pass through and reassign
LOG.debug("Server " + server + " returned " + ioe + " for " +
region.getEncodedName());
} else {
this.master.abort("Remote unexpected exception", ioe);
}
} catch (Throwable t) {
// For now call abort if unexpected exception -- radical, but will get
// fellas attention. St.Ack 20101012
this.master.abort("Remote unexpected exception", t);
}
}
/**
* Waits until the specified region has completed assignment.
* <p>
* If the region is already assigned, returns immediately. Otherwise, method
* blocks until the region is assigned.
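* <p>
* A hedged sketch of the startup pattern this supports (hypothetical
* caller; see {@link #assignMeta()}):
* <pre>
*   am.assignMeta();
*   am.waitForAssignment(HRegionInfo.FIRST_META_REGIONINFO);
* </pre>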
* @param regionInfo region to wait on assignment for
* @throws InterruptedException
*/
public void waitForAssignment(HRegionInfo regionInfo)
throws InterruptedException {
synchronized(regions) {
while(!regions.containsKey(regionInfo)) {
regions.wait();
}
}
}
/**
* Assigns the ROOT region.
* <p>
* Assumes that ROOT is currently closed and is not being actively served by
* any RegionServer.
* <p>
* Forcibly unsets the current root region location in ZooKeeper and assigns
* ROOT to a random RegionServer.
* @throws KeeperException
*/
public void assignRoot() throws KeeperException {
RootLocationEditor.deleteRootLocation(this.master.getZooKeeper());
assign(HRegionInfo.ROOT_REGIONINFO, true);
}
/**
* Assigns the META region.
* <p>
* Assumes that META is currently closed and is not being actively served by
* any RegionServer.
* <p>
* Forcibly assigns META to a random RegionServer.
*/
public void assignMeta() {
// Force assignment to a random server
assign(HRegionInfo.FIRST_META_REGIONINFO, true);
}
/**
* Assigns all user regions, if any exist. Used during cluster startup.
* <p>
* This is a synchronous call and will return once every region has been
* assigned. If anything fails, an exception is thrown and the cluster
* should be shutdown.
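* <p>
* Whether existing META assignments are reused is controlled by the
* <code>hbase.master.startup.retainassign</code> configuration key read
* below; it defaults to true.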
* @throws InterruptedException
* @throws IOException
*/
public void assignAllUserRegions() throws IOException, InterruptedException {
// Get all available servers
List<HServerInfo> servers = serverManager.getOnlineServersList();
// Scan META for all user regions, skipping any disabled tables
Map<HRegionInfo,HServerAddress> allRegions =
MetaReader.fullScan(catalogTracker, this.zkTable.getDisabledTables(), true);
if (allRegions == null || allRegions.isEmpty()) return;
// Determine what type of assignment to do on startup
boolean retainAssignment = master.getConfiguration().
getBoolean("hbase.master.startup.retainassign", true);
Map<HServerInfo, List<HRegionInfo>> bulkPlan = null;
if (retainAssignment) {
// Reuse existing assignment info
bulkPlan = LoadBalancer.retainAssignment(allRegions, servers);
} else {
// Generate a round-robin bulk assignment plan
bulkPlan = LoadBalancer.roundRobinAssignment(
new ArrayList<HRegionInfo>(allRegions.keySet()), servers);
}
LOG.info("Bulk assigning " + allRegions.size() + " region(s) across " +
servers.size() + " server(s), retainAssignment=" + retainAssignment);
// Use fixed count thread pool assigning.
BulkAssigner ba = new BulkStartupAssigner(this.master, bulkPlan, this);
ba.bulkAssign();
LOG.info("Bulk assigning done");
}
/**
* Run bulk assign on startup.
*/
static class BulkStartupAssigner extends BulkAssigner {
private final Map<HServerInfo, List<HRegionInfo>> bulkPlan;
private final AssignmentManager assignmentManager;
BulkStartupAssigner(final Server server,
final Map<HServerInfo, List<HRegionInfo>> bulkPlan,
final AssignmentManager am) {
super(server);
this.bulkPlan = bulkPlan;
this.assignmentManager = am;
}
@Override
public boolean bulkAssign() throws InterruptedException {
// Disable timing out regions in transition up in zk while bulk assigning.
this.assignmentManager.timeoutMonitor.bulkAssign(true);
try {
return super.bulkAssign();
} finally {
// Reenable timing out regions in transition up in zk.
this.assignmentManager.timeoutMonitor.bulkAssign(false);
}
}
@Override
protected String getThreadNamePrefix() {
return super.getThreadNamePrefix() + "-startup";
}
@Override
protected void populatePool(java.util.concurrent.ExecutorService pool) {
for (Map.Entry<HServerInfo, List<HRegionInfo>> e: this.bulkPlan.entrySet()) {
pool.execute(new SingleServerBulkAssigner(e.getKey(), e.getValue(),
this.assignmentManager));
}
}
@Override
protected boolean waitUntilDone(final long timeout)
throws InterruptedException {
return this.assignmentManager.waitUntilNoRegionsInTransition(timeout);
}
}
/**
* Manage bulk assigning to a server.
*/
static class SingleServerBulkAssigner implements Runnable {
private final HServerInfo regionserver;
private final List<HRegionInfo> regions;
private final AssignmentManager assignmentManager;
SingleServerBulkAssigner(final HServerInfo regionserver,
final List<HRegionInfo> regions, final AssignmentManager am) {
this.regionserver = regionserver;
this.regions = regions;
this.assignmentManager = am;
}
@Override
public void run() {
this.assignmentManager.assign(this.regionserver, this.regions);
}
}
/**
* Wait until no regions in transition.
* @param timeout How long to wait.
* @return True if nothing in regions in transition.
* @throws InterruptedException
*/
boolean waitUntilNoRegionsInTransition(final long timeout)
throws InterruptedException {
// Blocks until there are no regions in transition. It is possible that
// there are regions in transition immediately after this returns but
// guarantees that if it returns without an exception that there was a
// period of time with no regions in transition from the point-of-view of
// the in-memory state of the Master.
long startTime = System.currentTimeMillis();
long remaining = timeout;
synchronized (regionsInTransition) {
while (regionsInTransition.size() > 0 && !this.master.isStopped()
&& remaining > 0) {
regionsInTransition.wait(remaining);
remaining = timeout - (System.currentTimeMillis() - startTime);
}
}
return regionsInTransition.isEmpty();
}
/**
* Rebuild the list of user regions and assignment information.
* <p>
* Returns a map of servers that are not found to be online and the regions
* they were hosting.
* @return map of servers not online to their assigned regions, as stored
* in META
* @throws IOException
*/
private Map<HServerInfo,List<Pair<HRegionInfo,Result>>> rebuildUserRegions()
throws IOException {
// Region assignment from META
List<Result> results = MetaReader.fullScanOfResults(catalogTracker);
// Map of offline servers and their regions to be returned
Map<HServerInfo,List<Pair<HRegionInfo,Result>>> offlineServers =
new TreeMap<HServerInfo,List<Pair<HRegionInfo,Result>>>();
// Iterate regions in META
for (Result result : results) {
Pair<HRegionInfo,HServerInfo> region =
MetaReader.metaRowToRegionPairWithInfo(result);
if (region == null) continue;
HServerInfo regionLocation = region.getSecond();
HRegionInfo regionInfo = region.getFirst();
if (regionLocation == null) {
// Region not being served, add to region map with no assignment
// If this needs to be assigned out, it will also be in ZK as RIT
this.regions.put(regionInfo, null);
} else if (!serverManager.isServerOnline(
regionLocation.getServerName())) {
// Region is located on a server that isn't online
List<Pair<HRegionInfo,Result>> offlineRegions =
offlineServers.get(regionLocation);
if (offlineRegions == null) {
offlineRegions = new ArrayList<Pair<HRegionInfo,Result>>(1);
offlineServers.put(regionLocation, offlineRegions);
}
offlineRegions.add(new Pair<HRegionInfo,Result>(regionInfo, result));
} else {
// Region is being served and on an active server
regions.put(regionInfo, regionLocation);
addToServers(regionLocation, regionInfo);
}
}
return offlineServers;
}
/**
* Processes list of dead servers from result of META scan.
* <p>
* This is used as part of failover to handle RegionServers which failed
* while there was no active master.
* <p>
* Method stubs in-memory data to be as expected by the normal server shutdown
* handler.
*
* @param deadServers
* @throws IOException
* @throws KeeperException
*/
private void processDeadServers(
Map<HServerInfo, List<Pair<HRegionInfo, Result>>> deadServers)
throws IOException, KeeperException {
for (Map.Entry<HServerInfo, List<Pair<HRegionInfo,Result>>> deadServer :
deadServers.entrySet()) {
List<Pair<HRegionInfo,Result>> regions = deadServer.getValue();
for (Pair<HRegionInfo,Result> region : regions) {
HRegionInfo regionInfo = region.getFirst();
Result result = region.getSecond();
// If region was in transition (was in zk) force it offline for reassign
try {
ZKAssign.createOrForceNodeOffline(watcher, regionInfo,
master.getServerName());
} catch (KeeperException.NoNodeException nne) {
// This is fine
}
// Process with existing RS shutdown code
ServerShutdownHandler.processDeadRegion(regionInfo, result, this,
this.catalogTracker);
}
}
}
/*
* Presumes the caller has taken care of necessary locking when modifying
* the servers Map.
* @param hsi
* @param hri
*/
private void addToServers(final HServerInfo hsi, final HRegionInfo hri) {
List<HRegionInfo> hris = servers.get(hsi);
if (hris == null) {
hris = new ArrayList<HRegionInfo>();
servers.put(hsi, hris);
}
hris.add(hri);
}
/**
* @return A copy of the Map of regions currently in transition.
*/
public NavigableMap<String, RegionState> getRegionsInTransition() {
synchronized (this.regionsInTransition) {
return new TreeMap<String, RegionState>(this.regionsInTransition);
}
}
/**
* @return True if regions in transition.
*/
public boolean isRegionsInTransition() {
synchronized (this.regionsInTransition) {
return !this.regionsInTransition.isEmpty();
}
}
/**
* @param hri Region to check.
* @return Returns null if passed region is not in transition else the current
* RegionState
*/
public RegionState isRegionInTransition(final HRegionInfo hri) {
synchronized (this.regionsInTransition) {
return this.regionsInTransition.get(hri.getEncodedName());
}
}
/**
* Clears the specified region from being in transition.
* <p>
* Used only by HBCK tool.
* @param hri
*/
public void clearRegionFromTransition(HRegionInfo hri) {
synchronized (this.regionsInTransition) {
this.regionsInTransition.remove(hri.getEncodedName());
}
synchronized (this.regions) {
this.regions.remove(hri);
for (List<HRegionInfo> regions : this.servers.values()) {
for (int i = 0; i < regions.size(); i++) {
if (regions.get(i).equals(hri)) {
regions.remove(i);
break;
}
}
}
}
clearRegionPlan(hri);
}
/**
* @param region Region whose plan we are to clear.
*/
void clearRegionPlan(final HRegionInfo region) {
synchronized (this.regionPlans) {
this.regionPlans.remove(region.getEncodedName());
}
}
/**
* Wait on region to clear regions-in-transition.
* @param hri Region to wait on.
* @throws IOException
*/
public void waitOnRegionToClearRegionsInTransition(final HRegionInfo hri)
throws IOException {
if (isRegionInTransition(hri) == null) return;
RegionState rs = null;
// There is already a timeout monitor on regions in transition so I
// should not have to have one here too?
while(!this.master.isStopped() && (rs = isRegionInTransition(hri)) != null) {
Threads.sleep(1000);
LOG.info("Waiting on " + rs + " to clear regions-in-transition");
}
if (this.master.isStopped()) {
LOG.info("Giving up wait on regions in " +
"transition because stoppable.isStopped is set");
}
}
/**
* Gets the online regions of the specified table.
* This method looks at the in-memory state. It does not go to <code>.META.</code>.
* Only returns <em>online</em> regions. If a region on this table has been
* closed during a disable, etc., it will not be included in the returned list.
* So, the returned list may not necessarily be ALL regions in this table, it's
* all the ONLINE regions in the table.
* @param tableName
* @return Online regions from <code>tableName</code>
*/
public List<HRegionInfo> getRegionsOfTable(byte[] tableName) {
List<HRegionInfo> tableRegions = new ArrayList<HRegionInfo>();
HRegionInfo boundary =
new HRegionInfo(new HTableDescriptor(tableName), null, null);
synchronized (this.regions) {
for (HRegionInfo regionInfo: this.regions.tailMap(boundary).keySet()) {
if(Bytes.equals(regionInfo.getTableDesc().getName(), tableName)) {
tableRegions.add(regionInfo);
} else {
break;
}
}
}
return tableRegions;
}
/**
* Monitor to check for time outs on region transition operations
*/
public class TimeoutMonitor extends Chore {
private final int timeout;
private boolean bulkAssign = false;
/**
* Creates a periodic monitor to check for time outs on region transition
* operations. This will deal with retries if for some reason something
* doesn't happen within the specified timeout.
* @param period
* @param stopper When {@link Stoppable#isStopped()} is true, this thread will
* cleanup and exit cleanly.
* @param timeout
*/
public TimeoutMonitor(final int period, final Stoppable stopper,
final int timeout) {
super("AssignmentTimeoutMonitor", period, stopper);
this.timeout = timeout;
}
/**
* @param bulkAssign If true, we'll suspend checking regions in transition
* up in zookeeper. If false, will reenable check.
* @return Old setting for bulkAssign.
*/
public boolean bulkAssign(final boolean bulkAssign) {
boolean result = this.bulkAssign;
this.bulkAssign = bulkAssign;
return result;
}
@Override
protected void chore() {
// If bulkAssign in progress, suspend checks
if (this.bulkAssign) return;
synchronized (regionsInTransition) {
// Iterate all regions in transition checking for time outs
long now = System.currentTimeMillis();
for (RegionState regionState : regionsInTransition.values()) {
if(regionState.getStamp() + timeout <= now) {
HRegionInfo regionInfo = regionState.getRegion();
LOG.info("Regions in transition timed out: " + regionState);
// Expired! Do a retry.
switch (regionState.getState()) {
case CLOSED:
LOG.info("Region has been CLOSED for too long, " +
"retriggering ClosedRegionHandler");
AssignmentManager.this.executorService.submit(
new ClosedRegionHandler(master, AssignmentManager.this,
regionState.getRegion()));
break;
case OFFLINE:
LOG.info("Region has been OFFLINE for too long, " +
"reassigning " + regionInfo.getRegionNameAsString() +
" to a random server");
assign(regionState.getRegion(), false);
break;
case PENDING_OPEN:
LOG.info("Region has been PENDING_OPEN for too " +
"long, reassigning region=" +
regionInfo.getRegionNameAsString());
assign(regionState.getRegion(), false, true);
break;
case OPENING:
LOG.info("Region has been OPENING for too " +
"long, reassigning region=" +
regionInfo.getRegionNameAsString());
// Should have a ZK node in OPENING state
try {
String node = ZKAssign.getNodeName(watcher,
regionInfo.getEncodedName());
Stat stat = new Stat();
RegionTransitionData data = ZKAssign.getDataNoWatch(watcher,
node, stat);
if (data.getEventType() == EventType.RS_ZK_REGION_OPENED) {
LOG.debug("Region has transitioned to OPENED, allowing " +
"watched event handlers to process");
break;
} else if (data.getEventType() !=
EventType.RS_ZK_REGION_OPENING) {
LOG.warn("While timing out a region in state OPENING, " +
"found ZK node in unexpected state: " +
data.getEventType());
break;
}
// Attempt to transition node into OFFLINE
try {
data = new RegionTransitionData(
EventType.M_ZK_REGION_OFFLINE, regionInfo.getRegionName(),
master.getServerName());
if (ZKUtil.setData(watcher, node, data.getBytes(),
stat.getVersion())) {
// Node is now OFFLINE, let's trigger another assignment
ZKUtil.getDataAndWatch(watcher, node); // re-set the watch
LOG.info("Successfully transitioned region=" +
regionInfo.getRegionNameAsString() + " into OFFLINE" +
" and forcing a new assignment");
assign(regionState, false, true);
}
} catch (KeeperException.NoNodeException nne) {
// Node did not exist, can't time this out
}
} catch (KeeperException ke) {
LOG.error("Unexpected ZK exception timing out CLOSING region",
ke);
break;
}
break;
case OPEN:
LOG.error("Region has been OPEN for too long, " +
"we don't know where region was opened so can't do anything");
break;
case PENDING_CLOSE:
LOG.info("Region has been PENDING_CLOSE for too " +
"long, running forced unassign again on region=" +
regionInfo.getRegionNameAsString());
try {
// If the server got the RPC, it will transition the node
// to CLOSING, so only do something here if no node exists
if (!ZKUtil.watchAndCheckExists(watcher,
ZKAssign.getNodeName(watcher,
regionInfo.getEncodedName()))) {
unassign(regionInfo, true);
}
} catch (NoNodeException e) {
LOG.debug("Node no longer existed so not forcing another " +
"unassignment");
} catch (KeeperException e) {
LOG.warn("Unexpected ZK exception timing out a region " +
"close", e);
}
break;
case CLOSING:
LOG.info("Region has been CLOSING for too " +
"long, this should eventually complete or the server will " +
"expire, doing nothing");
break;
}
}
}
}
}
}
/**
* Process shutdown server removing any assignments.
* @param hsi Server that went down.
* @return list of regions in transition on this server
*/
public List<RegionState> processServerShutdown(final HServerInfo hsi) {
// Clean out any existing assignment plans for this server
synchronized (this.regionPlans) {
for (Iterator <Map.Entry<String, RegionPlan>> i =
this.regionPlans.entrySet().iterator(); i.hasNext();) {
Map.Entry<String, RegionPlan> e = i.next();
if (e.getValue().getDestination().equals(hsi)) {
// Use iterator's remove else we'll get CME
i.remove();
}
}
}
// TODO: Do we want to sync on RIT here?
// Remove this server from map of servers to regions, and remove all regions
// of this server from online map of regions.
Set<HRegionInfo> deadRegions = null;
List<RegionState> rits = new ArrayList<RegionState>();
synchronized (this.regions) {
List<HRegionInfo> assignedRegions = this.servers.remove(hsi);
if (assignedRegions == null || assignedRegions.isEmpty()) {
// No regions on this server, we are done, return empty list of RITs
return rits;
}
deadRegions = new TreeSet<HRegionInfo>(assignedRegions);
for (HRegionInfo region : deadRegions) {
this.regions.remove(region);
}
}
// See if any of the regions that were online on this server were in RIT
// If they are, normal timeouts will deal with them appropriately so
// let's skip a manual re-assignment.
synchronized (regionsInTransition) {
for (RegionState region : this.regionsInTransition.values()) {
if (deadRegions.remove(region.getRegion())) {
rits.add(region);
}
}
}
return rits;
}
/**
* Update inmemory structures.
* @param hsi Server that reported the split
* @param parent Parent region that was split
* @param a Daughter region A
* @param b Daughter region B
*/
public void handleSplitReport(final HServerInfo hsi, final HRegionInfo parent,
final HRegionInfo a, final HRegionInfo b) {
regionOffline(parent);
// Remove any CLOSING node, if exists, due to race between master & rs
// for close & split. Not putting into regionOffline method because it is
// called from various locations.
try {
RegionTransitionData node = ZKAssign.getDataNoWatch(this.watcher,
parent.getEncodedName(), null);
if (node != null) {
if (node.getEventType().equals(EventType.RS_ZK_REGION_CLOSING)) {
ZKAssign.deleteClosingNode(this.watcher, parent);
} else {
LOG.warn("Split report has RIT node (shouldnt have one): " +
parent + " node: " + node);
}
}
} catch (KeeperException e) {
LOG.warn("Exception while validating RIT during split report", e);
}
regionOnline(a, hsi);
regionOnline(b, hsi);
// There's a possibility that the region was splitting while a user asked
// the master to disable, we need to make sure we close those regions in
// that case. This is not racing with the region server itself since RS
// report is done after the split transaction completed.
if (this.zkTable.isDisablingOrDisabledTable(
parent.getTableDesc().getNameAsString())) {
unassign(a);
unassign(b);
}
}
/**
* @return A clone of current assignments. Note, this is assignments only.
* If a new server has come in and it has no regions, it will not be included
* in the returned Map.
*/
Map<HServerInfo, List<HRegionInfo>> getAssignments() {
// This is an EXPENSIVE clone. Cloning though is the safest thing to do.
// Can't let out original since it can change and at least the loadbalancer
// wants to iterate this exported list. We need to synchronize on regions
// since all access to this.servers is under a lock on this.regions.
Map<HServerInfo, List<HRegionInfo>> result = null;
synchronized (this.regions) {
result = new HashMap<HServerInfo, List<HRegionInfo>>(this.servers.size());
for (Map.Entry<HServerInfo, List<HRegionInfo>> e: this.servers.entrySet()) {
List<HRegionInfo> shallowCopy = new ArrayList<HRegionInfo>(e.getValue());
HServerInfo clone = new HServerInfo(e.getKey());
// Set into server load the number of regions this server is carrying
// The load balancer calculation needs it, at least, and it's handy.
clone.getLoad().setNumberOfRegions(e.getValue().size());
result.put(clone, shallowCopy);
}
}
return result;
}
/**
* @param encodedRegionName Region encoded name.
* @return Null or a {@link Pair} instance that holds the full {@link HRegionInfo}
* and the hosting server's {@link HServerInfo}.
*/
Pair<HRegionInfo, HServerInfo> getAssignment(final byte [] encodedRegionName) {
String name = Bytes.toString(encodedRegionName);
synchronized(this.regions) {
for (Map.Entry<HRegionInfo, HServerInfo> e: this.regions.entrySet()) {
if (e.getKey().getEncodedName().equals(name)) {
return new Pair<HRegionInfo, HServerInfo>(e.getKey(), e.getValue());
}
}
}
return null;
}
/**
* @param plan Plan to execute.
*/
void balance(final RegionPlan plan) {
synchronized (this.regionPlans) {
this.regionPlans.put(plan.getRegionName(), plan);
}
unassign(plan.getRegionInfo());
}
/**
* State of a Region while undergoing transitions.
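* <p>
* RegionState is a Hadoop {@link Writable}; a minimal round-trip sketch
* using standard java.io streams (illustrative only):
* <pre>
*   ByteArrayOutputStream bos = new ByteArrayOutputStream();
*   state.write(new DataOutputStream(bos));
*   RegionState copy = new RegionState();
*   copy.readFields(new DataInputStream(
*       new ByteArrayInputStream(bos.toByteArray())));
* </pre>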
*/
public static class RegionState implements Writable {
private HRegionInfo region;
public enum State {
OFFLINE, // region is in an offline state
PENDING_OPEN, // sent rpc to server to open but has not begun
OPENING, // server has begun to open but not yet done
OPEN, // server opened region and updated meta
PENDING_CLOSE, // sent rpc to server to close but has not begun
CLOSING, // server has begun to close but not yet done
CLOSED // server closed region and updated meta
}
private State state;
private long stamp;
public RegionState() {}
RegionState(HRegionInfo region, State state) {
this(region, state, System.currentTimeMillis());
}
RegionState(HRegionInfo region, State state, long stamp) {
this.region = region;
this.state = state;
this.stamp = stamp;
}
public void update(State state, long stamp) {
this.state = state;
this.stamp = stamp;
}
public void update(State state) {
this.state = state;
this.stamp = System.currentTimeMillis();
}
public State getState() {
return state;
}
public long getStamp() {
return stamp;
}
public HRegionInfo getRegion() {
return region;
}
public boolean isClosing() {
return state == State.CLOSING;
}
public boolean isClosed() {
return state == State.CLOSED;
}
public boolean isPendingClose() {
return state == State.PENDING_CLOSE;
}
public boolean isOpening() {
return state == State.OPENING;
}
public boolean isOpened() {
return state == State.OPEN;
}
public boolean isPendingOpen() {
return state == State.PENDING_OPEN;
}
public boolean isOffline() {
return state == State.OFFLINE;
}
@Override
public String toString() {
return region.getRegionNameAsString() + " state=" + state +
", ts=" + stamp;
}
@Override
public void readFields(DataInput in) throws IOException {
region = new HRegionInfo();
region.readFields(in);
state = State.valueOf(in.readUTF());
stamp = in.readLong();
}
@Override
public void write(DataOutput out) throws IOException {
region.write(out);
out.writeUTF(state.name());
out.writeLong(stamp);
}
}
public void stop() {
this.timeoutMonitor.interrupt();
}
}