/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.chaos.actions;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import org.apache.commons.lang.math.RandomUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
/**
* Restarts a ratio of the regionservers in a rolling fashion. At each step, either kills a
* server, or starts one, sleeping randomly (0-sleepTime) in between steps. The parameter maxDeadServers
* limits the maximum number of servers that can be down at the same time during rolling restarts.
*/
public class RollingBatchRestartRsAction extends BatchRestartRsAction {
private static final Log LOG = LogFactory.getLog(RollingBatchRestartRsAction.class);
protected int maxDeadServers; // number of maximum dead servers at any given time. Defaults to 5
public RollingBatchRestartRsAction(long sleepTime, float ratio) {
this(sleepTime, ratio, 5);
}
public RollingBatchRestartRsAction(long sleepTime, float ratio, int maxDeadServers) {
super(sleepTime, ratio);
this.maxDeadServers = maxDeadServers;
}
enum KillOrStart {
KILL,
START
}
@Override
public void perform() throws Exception {
LOG.info(String.format("Performing action: Rolling batch restarting %d%% of region servers",
(int)(ratio * 100)));
List<ServerName> selectedServers = selectServers();
Queue<ServerName> serversToBeKilled = new LinkedList<>(selectedServers);
Queue<ServerName> deadServers = new LinkedList<>();
// loop while there are servers to be killed or dead servers to be restarted
while ((!serversToBeKilled.isEmpty() || !deadServers.isEmpty()) && !context.isStopping()) {
KillOrStart action = KillOrStart.KILL;
if (serversToBeKilled.isEmpty()) { // no more servers to kill
action = KillOrStart.START;
} else if (deadServers.isEmpty()) {
action = KillOrStart.KILL; // no more servers to start
} else if (deadServers.size() >= maxDeadServers) {
// we have too many dead servers. Don't kill any more
action = KillOrStart.START;
} else {
// do a coin toss
action = RandomUtils.nextBoolean() ? KillOrStart.KILL : KillOrStart.START;
}
ServerName server;
switch (action) {
case KILL:
server = serversToBeKilled.remove();
try {
killRs(server);
} catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
// We've seen this in test runs where we timeout but the kill went through. HBASE-9743
// So, add to deadServers even if exception so the start gets called.
LOG.info("Problem killing but presume successful; code=" + e.getExitCode(), e);
}
deadServers.add(server);
break;
case START:
try {
server = deadServers.remove();
startRs(server);
} catch (org.apache.hadoop.util.Shell.ExitCodeException e) {
// The start may fail but better to just keep going though we may lose server.
//
LOG.info("Problem starting, will retry; code=" + e.getExitCode(), e);
}
break;
}
sleep(RandomUtils.nextInt((int)sleepTime));
}
}
protected List<ServerName> selectServers() throws IOException {
return PolicyBasedChaosMonkey.selectRandomItems(getCurrentServers(), ratio);
}
/**
* Small test to ensure the class basically works.
* @param args
* @throws Exception
*/
public static void main(final String[] args) throws Exception {
RollingBatchRestartRsAction action = new RollingBatchRestartRsAction(1, 1.0f) {
private int invocations = 0;
@Override
protected ServerName[] getCurrentServers() throws IOException {
final int count = 4;
List<ServerName> serverNames = new ArrayList<>(count);
for (int i = 0; i < 4; i++) {
serverNames.add(ServerName.valueOf(i + ".example.org", i, i));
}
return serverNames.toArray(new ServerName[serverNames.size()]);
}
@Override
protected void killRs(ServerName server) throws IOException {
LOG.info("Killed " + server);
if (this.invocations++ % 3 == 0) {
throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
}
}
@Override
protected void startRs(ServerName server) throws IOException {
LOG.info("Started " + server);
if (this.invocations++ % 3 == 0) {
throw new org.apache.hadoop.util.Shell.ExitCodeException(-1, "Failed");
}
}
};
action.perform();
}
}