/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.cloud;
import java.io.Closeable;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.CoreAdminRequest.Create;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.ClusterStateUtil;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.core.CloudConfig;
import org.apache.solr.update.UpdateShardHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.MDC;
// TODO: how to tmp exclude nodes?
// TODO: more fine grained failover rules?
// TODO: test with lots of collections
// TODO: add config for only failover if replicas is < N
// TODO: general support for non shared filesystems
// this is specialized for a shared file system, but it should
// not be much work to generalize
// NOTE: using replication can slow down failover if a whole
// shard is lost.
/**
*
* In this simple initial implementation we are limited in how quickly we detect
* a failure by a worst case of roughly zk session timeout + WAIT_AFTER_EXPIRATION_SECONDS + WORK_LOOP_DELAY_MS
* and best case of roughly zk session timeout + WAIT_AFTER_EXPIRATION_SECONDS. Also, consider the time to
* create the SolrCore, do any recovery necessary, and warm up the readers.
*
* NOTE: this will only work with collections created via the collections api because they will have defined
* replicationFactor and maxShardsPerNode.
*
* @lucene.experimental
*/
public class OverseerAutoReplicaFailoverThread implements Runnable, Closeable {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private Integer lastClusterStateVersion;
private final ExecutorService updateExecutor;
private volatile boolean isClosed;
private ZkStateReader zkStateReader;
private final Cache<String,Long> baseUrlForBadNodes;
private Set<String> liveNodes = Collections.EMPTY_SET;
private final int workLoopDelay;
private final int waitAfterExpiration;
private volatile Thread thread;
public OverseerAutoReplicaFailoverThread(CloudConfig config, ZkStateReader zkStateReader,
UpdateShardHandler updateShardHandler) {
this.zkStateReader = zkStateReader;
this.workLoopDelay = config.getAutoReplicaFailoverWorkLoopDelay();
this.waitAfterExpiration = config.getAutoReplicaFailoverWaitAfterExpiration();
int badNodeExpiration = config.getAutoReplicaFailoverBadNodeExpiration();
log.debug(
"Starting "
+ this.getClass().getSimpleName()
+ " autoReplicaFailoverWorkLoopDelay={} autoReplicaFailoverWaitAfterExpiration={} autoReplicaFailoverBadNodeExpiration={}",
workLoopDelay, waitAfterExpiration, badNodeExpiration);
baseUrlForBadNodes = CacheBuilder.newBuilder()
.concurrencyLevel(1).expireAfterWrite(badNodeExpiration, TimeUnit.MILLISECONDS).build();
// TODO: Speed up our work loop when live_nodes changes??
updateExecutor = updateShardHandler.getUpdateExecutor();
// TODO: perhaps do a health ping periodically to each node (scaryish)
// And/OR work on JIRA issue around self health checks (SOLR-5805)
}
@Override
public void run() {
this.thread = Thread.currentThread();
while (!this.isClosed) {
// work loop
log.debug("do " + this.getClass().getSimpleName() + " work loop");
// every n, look at state and make add / remove calls
try {
doWork();
} catch (Exception e) {
SolrException.log(log, this.getClass().getSimpleName()
+ " had an error in its thread work loop.", e);
}
if (!this.isClosed) {
try {
Thread.sleep(workLoopDelay);
} catch (InterruptedException e) {
return;
}
}
}
}
private void doWork() {
// TODO: extract to configurable strategy class ??
ClusterState clusterState = zkStateReader.getClusterState();
//check if we have disabled autoAddReplicas cluster wide
String autoAddReplicas = zkStateReader.getClusterProperty(ZkStateReader.AUTO_ADD_REPLICAS, (String) null);
if (autoAddReplicas != null && autoAddReplicas.equals("false")) {
return;
}
if (clusterState != null) {
if (clusterState.getZkClusterStateVersion() != null &&
clusterState.getZkClusterStateVersion().equals(lastClusterStateVersion) && baseUrlForBadNodes.size() == 0 &&
liveNodes.equals(clusterState.getLiveNodes())) {
// nothing has changed, no work to do
return;
}
liveNodes = clusterState.getLiveNodes();
lastClusterStateVersion = clusterState.getZkClusterStateVersion();
Map<String, DocCollection> collections = clusterState.getCollectionsMap();
for (Map.Entry<String, DocCollection> entry : collections.entrySet()) {
log.debug("look at collection={}", entry.getKey());
DocCollection docCollection = entry.getValue();
if (!docCollection.getAutoAddReplicas()) {
log.debug("Collection {} is not setup to use autoAddReplicas, skipping..", docCollection.getName());
continue;
}
if (docCollection.getReplicationFactor() == null) {
log.debug("Skipping collection because it has no defined replicationFactor, name={}", docCollection.getName());
continue;
}
log.debug("Found collection, name={} replicationFactor={}", entry.getKey(), docCollection.getReplicationFactor());
Collection<Slice> slices = docCollection.getSlices();
for (Slice slice : slices) {
if (slice.getState() == Slice.State.ACTIVE) {
final Collection<DownReplica> downReplicas = new ArrayList<DownReplica>();
int goodReplicas = findDownReplicasInSlice(clusterState, docCollection, slice, downReplicas);
log.debug("collection={} replicationFactor={} goodReplicaCount={}", docCollection.getName(), docCollection.getReplicationFactor(), goodReplicas);
if (downReplicas.size() > 0 && goodReplicas < docCollection.getReplicationFactor()) {
// badReplicaMap.put(collection, badReplicas);
processBadReplicas(entry.getKey(), downReplicas);
} else if (goodReplicas > docCollection.getReplicationFactor()) {
log.debug("There are too many replicas");
}
}
}
}
}
}
private void processBadReplicas(final String collection, final Collection<DownReplica> badReplicas) {
for (DownReplica badReplica : badReplicas) {
log.debug("process down replica={} from collection={}", badReplica.replica.getName(), collection);
String baseUrl = badReplica.replica.getStr(ZkStateReader.BASE_URL_PROP);
Long wentBadAtNS = baseUrlForBadNodes.getIfPresent(baseUrl);
if (wentBadAtNS == null) {
log.warn("Replica {} may need to failover.",
badReplica.replica.getName());
baseUrlForBadNodes.put(baseUrl, System.nanoTime());
} else {
long elasped = System.nanoTime() - wentBadAtNS;
if (elasped < TimeUnit.NANOSECONDS.convert(waitAfterExpiration, TimeUnit.MILLISECONDS)) {
// protect against ZK 'flapping', startup and shutdown
log.debug("Looks troublesome...continue. Elapsed={}", elasped + "ns");
} else {
log.debug("We need to add a replica. Elapsed={}", elasped + "ns");
if (addReplica(collection, badReplica)) {
baseUrlForBadNodes.invalidate(baseUrl);
}
}
}
}
}
private boolean addReplica(final String collection, DownReplica badReplica) {
// first find best home - first strategy, sort by number of cores
// hosted where maxCoresPerNode is not violated
final Integer maxCoreCount = zkStateReader.getClusterProperty(ZkStateReader.MAX_CORES_PER_NODE, (Integer) null);
final String createUrl = getBestCreateUrl(zkStateReader, badReplica, maxCoreCount);
if (createUrl == null) {
log.warn("Could not find a node to create new replica on.");
return false;
}
// NOTE: we send the absolute path, which will slightly change
// behavior of these cores as they won't respond to changes
// in the solr.hdfs.home sys prop as they would have.
final String dataDir = badReplica.replica.getStr("dataDir");
final String ulogDir = badReplica.replica.getStr("ulogDir");
final String coreNodeName = badReplica.replica.getName();
final String shardId = badReplica.slice.getName();
if (dataDir != null) {
// need an async request - full shard goes down leader election
final String coreName = badReplica.replica.getStr(ZkStateReader.CORE_NAME_PROP);
log.debug("submit call to {}", createUrl);
MDC.put("OverseerAutoReplicaFailoverThread.createUrl", createUrl);
try {
updateExecutor.submit(() -> createSolrCore(collection, createUrl, dataDir, ulogDir, coreNodeName, coreName, shardId));
} finally {
MDC.remove("OverseerAutoReplicaFailoverThread.createUrl");
}
// wait to see state for core we just created
boolean success = ClusterStateUtil.waitToSeeLiveReplica(zkStateReader, collection, coreNodeName, createUrl, 30000);
if (!success) {
log.error("Creating new replica appears to have failed, timed out waiting to see created SolrCore register in the clusterstate.");
return false;
}
return true;
}
log.warn("Could not find dataDir or ulogDir in cluster state.");
return false;
}
private static int findDownReplicasInSlice(ClusterState clusterState, DocCollection collection, Slice slice, final Collection<DownReplica> badReplicas) {
int goodReplicas = 0;
Collection<Replica> replicas = slice.getReplicas();
if (replicas != null) {
for (Replica replica : replicas) {
// on a live node?
boolean live = clusterState.liveNodesContain(replica.getNodeName());
final Replica.State state = replica.getState();
final boolean okayState = state == Replica.State.DOWN
|| state == Replica.State.RECOVERING
|| state == Replica.State.ACTIVE;
log.debug("Process replica name={} live={} state={}", replica.getName(), live, state.toString());
if (live && okayState) {
goodReplicas++;
} else {
DownReplica badReplica = new DownReplica();
badReplica.replica = replica;
badReplica.slice = slice;
badReplica.collection = collection;
badReplicas.add(badReplica);
}
}
}
log.debug("bad replicas for slice {}", badReplicas);
return goodReplicas;
}
/**
*
* @return the best node to replace the badReplica on or null if there is no
* such node
*/
static String getBestCreateUrl(ZkStateReader zkStateReader, DownReplica badReplica, Integer maxCoreCount) {
assert badReplica != null;
assert badReplica.collection != null;
assert badReplica.slice != null;
log.debug("getBestCreateUrl for " + badReplica.replica);
Map<String,Counts> counts = new HashMap<>();
Set<String> unsuitableHosts = new HashSet<>();
Set<String> liveNodes = new HashSet<>(zkStateReader.getClusterState().getLiveNodes());
Map<String, Integer> coresPerNode = new HashMap<>();
ClusterState clusterState = zkStateReader.getClusterState();
if (clusterState != null) {
Map<String, DocCollection> collections = clusterState.getCollectionsMap();
for (Map.Entry<String, DocCollection> entry : collections.entrySet()) {
String collection = entry.getKey();
log.debug("look at collection {} as possible create candidate", collection);
DocCollection docCollection = entry.getValue();
// TODO - only operate on collections with sharedfs failover = true ??
Collection<Slice> slices = docCollection.getSlices();
for (Slice slice : slices) {
// only look at active shards
if (slice.getState() == Slice.State.ACTIVE) {
log.debug("look at slice {} for collection {} as possible create candidate", slice.getName(), collection);
Collection<Replica> replicas = slice.getReplicas();
for (Replica replica : replicas) {
liveNodes.remove(replica.getNodeName());
String baseUrl = replica.getStr(ZkStateReader.BASE_URL_PROP);
if (coresPerNode.containsKey(baseUrl)) {
Integer nodeCount = coresPerNode.get(baseUrl);
coresPerNode.put(baseUrl, nodeCount++);
} else {
coresPerNode.put(baseUrl, 1);
}
if (baseUrl.equals(badReplica.replica.getStr(ZkStateReader.BASE_URL_PROP))) {
continue;
}
// on a live node?
log.debug("collection={} nodename={} livenodes={}", collection, replica.getNodeName(), clusterState.getLiveNodes());
boolean live = clusterState.liveNodesContain(replica.getNodeName());
log.debug("collection={} look at replica {} as possible create candidate, live={}", collection, replica.getName(), live);
if (live) {
Counts cnt = counts.get(baseUrl);
if (cnt == null) {
cnt = new Counts();
}
if (badReplica.collection.getName().equals(collection)) {
cnt.negRankingWeight += 3;
cnt.collectionShardsOnNode += 1;
} else {
cnt.negRankingWeight += 1;
}
if (badReplica.collection.getName().equals(collection) && badReplica.slice.getName().equals(slice.getName())) {
cnt.ourReplicas++;
}
Integer maxShardsPerNode = badReplica.collection.getMaxShardsPerNode();
if (maxShardsPerNode == null) {
log.warn("maxShardsPerNode is not defined for collection, name=" + badReplica.collection.getName());
maxShardsPerNode = Integer.MAX_VALUE;
}
log.debug("collection={} node={} maxShardsPerNode={} maxCoresPerNode={} potential hosts={}",
collection, baseUrl, maxShardsPerNode, maxCoreCount, cnt);
Collection<Replica> badSliceReplicas = null;
DocCollection c = clusterState.getCollection(badReplica.collection.getName());
if (c != null) {
Slice s = c.getSlice(badReplica.slice.getName());
if (s != null) {
badSliceReplicas = s.getReplicas();
}
}
boolean alreadyExistsOnNode = replicaAlreadyExistsOnNode(zkStateReader.getClusterState(), badSliceReplicas, badReplica, baseUrl);
if (unsuitableHosts.contains(baseUrl) || alreadyExistsOnNode || cnt.collectionShardsOnNode >= maxShardsPerNode
|| (maxCoreCount != null && coresPerNode.get(baseUrl) >= maxCoreCount) ) {
counts.remove(baseUrl);
unsuitableHosts.add(baseUrl);
log.debug("not a candidate node, collection={} node={} max shards per node={} good replicas={}", collection, baseUrl, maxShardsPerNode, cnt);
} else {
counts.put(baseUrl, cnt);
log.debug("is a candidate node, collection={} node={} max shards per node={} good replicas={}", collection, baseUrl, maxShardsPerNode, cnt);
}
}
}
}
}
}
}
for (String node : liveNodes) {
counts.put(zkStateReader.getBaseUrlForNodeName(node), new Counts(0, 0));
}
if (counts.size() == 0) {
log.debug("no suitable hosts found for getBestCreateUrl for collection={}", badReplica.collection.getName());
return null;
}
ValueComparator vc = new ValueComparator(counts);
Map<String,Counts> sortedCounts = new TreeMap<String, Counts>(vc);
sortedCounts.putAll(counts);
log.debug("empty nodes={} for collection={}", liveNodes, badReplica.collection.getName());
log.debug("sorted hosts={} for collection={}", sortedCounts, badReplica.collection.getName());
log.debug("unsuitable hosts={} for collection={}", unsuitableHosts, badReplica.collection.getName());
return sortedCounts.keySet().iterator().next();
}
private static boolean replicaAlreadyExistsOnNode(ClusterState clusterState, Collection<Replica> replicas, DownReplica badReplica, String baseUrl) {
if (replicas != null) {
log.debug("collection={} check if replica already exists on node using replicas {}", badReplica.collection.getName(), getNames(replicas));
for (Replica replica : replicas) {
final Replica.State state = replica.getState();
if (!replica.getName().equals(badReplica.replica.getName()) && replica.getStr(ZkStateReader.BASE_URL_PROP).equals(baseUrl)
&& clusterState.liveNodesContain(replica.getNodeName())
&& (state == Replica.State.ACTIVE || state == Replica.State.DOWN || state == Replica.State.RECOVERING)) {
log.debug("collection={} replica already exists on node, bad replica={}, existing replica={}, node name={}", badReplica.collection.getName(), badReplica.replica.getName(), replica.getName(), replica.getNodeName());
return true;
}
}
}
log.debug("collection={} replica does not yet exist on node: {}", badReplica.collection.getName(), baseUrl);
return false;
}
private static Object getNames(Collection<Replica> replicas) {
Set<String> names = new HashSet<>(replicas.size());
for (Replica replica : replicas) {
names.add(replica.getName());
}
return names;
}
private boolean createSolrCore(final String collection,
final String createUrl, final String dataDir, final String ulogDir,
final String coreNodeName, final String coreName, final String shardId) {
try (HttpSolrClient client = new HttpSolrClient.Builder(createUrl).build()) {
log.debug("create url={}", createUrl);
client.setConnectionTimeout(30000);
client.setSoTimeout(60000);
Create createCmd = new Create();
createCmd.setCollection(collection);
createCmd.setCoreNodeName(coreNodeName);
// TODO: how do we ensure unique coreName
// for now, the collections API will use unique names
createCmd.setShardId(shardId);
createCmd.setCoreName(coreName);
createCmd.setDataDir(dataDir);
createCmd.setUlogDir(ulogDir.substring(0, ulogDir.length() - "/tlog".length()));
client.request(createCmd);
} catch (Exception e) {
SolrException.log(log, "Exception trying to create new replica on " + createUrl, e);
return false;
}
return true;
}
private static class ValueComparator implements Comparator<String> {
Map<String,Counts> map;
public ValueComparator(Map<String,Counts> map) {
this.map = map;
}
public int compare(String a, String b) {
if (map.get(a).negRankingWeight >= map.get(b).negRankingWeight) {
return 1;
} else {
return -1;
}
}
}
@Override
public void close() {
isClosed = true;
Thread lThread = thread;
if (lThread != null) {
lThread.interrupt();
}
}
public boolean isClosed() {
return isClosed;
}
private static class Counts {
int collectionShardsOnNode = 0;
int negRankingWeight = 0;
int ourReplicas = 0;
private Counts() {
}
private Counts(int totalReplicas, int ourReplicas) {
this.negRankingWeight = totalReplicas;
this.ourReplicas = ourReplicas;
}
@Override
public String toString() {
return "Counts [negRankingWeight=" + negRankingWeight + ", sameSliceCount="
+ ourReplicas + ", collectionShardsOnNode=" + collectionShardsOnNode + "]";
}
}
static class DownReplica {
Replica replica;
Slice slice;
DocCollection collection;
@Override
public String toString() {
return "DownReplica [replica=" + replica.getName() + ", slice="
+ slice.getName() + ", collection=" + collection.getName() + "]";
}
}
}