/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.cloud;

import java.io.Closeable;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.CoreAdminRequest.Create;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.ClusterStateUtil;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.core.CloudConfig;
import org.apache.solr.update.UpdateShardHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.MDC;

// TODO: how to tmp exclude nodes?
// TODO: more fine-grained failover rules?
// TODO: test with lots of collections
// TODO: add config for only failover if replicas is < N

// TODO: general support for non-shared filesystems
// this is specialized for a shared file system, but it should
// not be much work to generalize

// NOTE: using replication can slow down failover if a whole
// shard is lost.

/**
 * In this simple initial implementation, we are limited in how quickly we detect
 * a failure: a worst case of roughly zk session timeout + WAIT_AFTER_EXPIRATION_SECONDS + WORK_LOOP_DELAY_MS,
 * and a best case of roughly zk session timeout + WAIT_AFTER_EXPIRATION_SECONDS. Also consider the time to
 * create the SolrCore, do any necessary recovery, and warm up the readers.
 * <p>
 * NOTE: this will only work with collections created via the Collections API, because those have a defined
 * replicationFactor and maxShardsPerNode.
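 * <p>
 * For a concrete sense of scale (illustrative numbers only, not fixed defaults): with a ZooKeeper session
 * timeout of 30 seconds, a waitAfterExpiration of 30 seconds, and a workLoopDelay of 10 seconds, a lost node
 * would be noticed after roughly 60 seconds at best and roughly 70 seconds at worst, before core creation
 * and recovery time are added on top.
 * <p>
 * A collection opts in at creation time via the autoAddReplicas flag, for example (hypothetical collection
 * name and sizing):
 * <pre>
 * /admin/collections?action=CREATE&amp;name=myCollection&amp;numShards=2&amp;replicationFactor=2&amp;autoAddReplicas=true
 * </pre>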
 *
 * @lucene.experimental
 */
public class OverseerAutoReplicaFailoverThread implements Runnable, Closeable {

  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  private Integer lastClusterStateVersion;

  private final ExecutorService updateExecutor;
  private volatile boolean isClosed;
  private ZkStateReader zkStateReader;
  private final Cache<String,Long> baseUrlForBadNodes;
  private Set<String> liveNodes = Collections.emptySet();

  private final int workLoopDelay;
  private final int waitAfterExpiration;

  private volatile Thread thread;

  public OverseerAutoReplicaFailoverThread(CloudConfig config, ZkStateReader zkStateReader,
      UpdateShardHandler updateShardHandler) {
    this.zkStateReader = zkStateReader;

    this.workLoopDelay = config.getAutoReplicaFailoverWorkLoopDelay();
    this.waitAfterExpiration = config.getAutoReplicaFailoverWaitAfterExpiration();
    int badNodeExpiration = config.getAutoReplicaFailoverBadNodeExpiration();

    log.debug("Starting {} autoReplicaFailoverWorkLoopDelay={} autoReplicaFailoverWaitAfterExpiration={} autoReplicaFailoverBadNodeExpiration={}",
        this.getClass().getSimpleName(), workLoopDelay, waitAfterExpiration, badNodeExpiration);

    baseUrlForBadNodes = CacheBuilder.newBuilder()
        .concurrencyLevel(1).expireAfterWrite(badNodeExpiration, TimeUnit.MILLISECONDS).build();

    // TODO: Speed up our work loop when live_nodes changes??

    updateExecutor = updateShardHandler.getUpdateExecutor();

    // TODO: perhaps do a health ping periodically to each node (scaryish)
    // And/OR work on JIRA issue around self health checks (SOLR-5805)
  }

  @Override
  public void run() {
    this.thread = Thread.currentThread();
    while (!this.isClosed) {
      // work loop
      log.debug("do {} work loop", this.getClass().getSimpleName());

      // every n, look at state and make add / remove calls
      try {
        doWork();
      } catch (Exception e) {
        SolrException.log(log, this.getClass().getSimpleName() + " had an error in its thread work loop.", e);
      }

      if (!this.isClosed) {
        try {
          Thread.sleep(workLoopDelay);
        } catch (InterruptedException e) {
          return;
        }
      }
    }
  }

  private void doWork() {
    // TODO: extract to configurable strategy class ??
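    // The autoAddReplicas cluster property checked below is a cluster-wide kill switch. For reference, it
    // can be toggled through the Collections API, e.g. (illustrative request, not issued by this class):
    //   /admin/collections?action=CLUSTERPROP&name=autoAddReplicas&val=false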
    ClusterState clusterState = zkStateReader.getClusterState();

    // check if we have disabled autoAddReplicas cluster wide
    String autoAddReplicas = zkStateReader.getClusterProperty(ZkStateReader.AUTO_ADD_REPLICAS, (String) null);
    if (autoAddReplicas != null && autoAddReplicas.equals("false")) {
      return;
    }

    if (clusterState != null) {
      if (clusterState.getZkClusterStateVersion() != null
          && clusterState.getZkClusterStateVersion().equals(lastClusterStateVersion)
          && baseUrlForBadNodes.size() == 0
          && liveNodes.equals(clusterState.getLiveNodes())) {
        // nothing has changed, no work to do
        return;
      }

      liveNodes = clusterState.getLiveNodes();
      lastClusterStateVersion = clusterState.getZkClusterStateVersion();

      Map<String,DocCollection> collections = clusterState.getCollectionsMap();
      for (Map.Entry<String,DocCollection> entry : collections.entrySet()) {
        log.debug("look at collection={}", entry.getKey());
        DocCollection docCollection = entry.getValue();
        if (!docCollection.getAutoAddReplicas()) {
          log.debug("Collection {} is not set up to use autoAddReplicas, skipping..", docCollection.getName());
          continue;
        }
        if (docCollection.getReplicationFactor() == null) {
          log.debug("Skipping collection because it has no defined replicationFactor, name={}", docCollection.getName());
          continue;
        }
        log.debug("Found collection, name={} replicationFactor={}", entry.getKey(), docCollection.getReplicationFactor());

        Collection<Slice> slices = docCollection.getSlices();
        for (Slice slice : slices) {
          if (slice.getState() == Slice.State.ACTIVE) {

            final Collection<DownReplica> downReplicas = new ArrayList<DownReplica>();
            int goodReplicas = findDownReplicasInSlice(clusterState, docCollection, slice, downReplicas);

            log.debug("collection={} replicationFactor={} goodReplicaCount={}", docCollection.getName(),
                docCollection.getReplicationFactor(), goodReplicas);

            if (downReplicas.size() > 0 && goodReplicas < docCollection.getReplicationFactor()) {
              // badReplicaMap.put(collection, badReplicas);
              processBadReplicas(entry.getKey(), downReplicas);
            } else if (goodReplicas > docCollection.getReplicationFactor()) {
              log.debug("There are too many replicas");
            }
          }
        }
      }
    }
  }

  private void processBadReplicas(final String collection, final Collection<DownReplica> badReplicas) {
    for (DownReplica badReplica : badReplicas) {
      log.debug("process down replica={} from collection={}", badReplica.replica.getName(), collection);
      String baseUrl = badReplica.replica.getStr(ZkStateReader.BASE_URL_PROP);
      Long wentBadAtNS = baseUrlForBadNodes.getIfPresent(baseUrl);
      if (wentBadAtNS == null) {
        log.warn("Replica {} may need to failover.", badReplica.replica.getName());
        baseUrlForBadNodes.put(baseUrl, System.nanoTime());
      } else {
        long elapsed = System.nanoTime() - wentBadAtNS;
        if (elapsed < TimeUnit.NANOSECONDS.convert(waitAfterExpiration, TimeUnit.MILLISECONDS)) {
          // protect against ZK 'flapping', startup and shutdown
          log.debug("Looks troublesome... continuing. Elapsed={}ns", elapsed);
        } else {
          log.debug("We need to add a replica. Elapsed={}ns", elapsed);

          if (addReplica(collection, badReplica)) {
            baseUrlForBadNodes.invalidate(baseUrl);
          }
        }
      }
    }
  }

  private boolean addReplica(final String collection, DownReplica badReplica) {
    // first find best home - first strategy, sort by number of cores
    // hosted where maxCoresPerNode is not violated
    final Integer maxCoreCount = zkStateReader.getClusterProperty(ZkStateReader.MAX_CORES_PER_NODE, (Integer) null);
    final String createUrl = getBestCreateUrl(zkStateReader, badReplica, maxCoreCount);
    if (createUrl == null) {
      log.warn("Could not find a node to create new replica on.");
      return false;
    }

    // NOTE: we send the absolute path, which will slightly change
    // behavior of these cores as they won't respond to changes
    // in the solr.hdfs.home sys prop as they would have.
    final String dataDir = badReplica.replica.getStr("dataDir");
    final String ulogDir = badReplica.replica.getStr("ulogDir");
    final String coreNodeName = badReplica.replica.getName();
    final String shardId = badReplica.slice.getName();
    if (dataDir != null) {
      // need an async request - full shard goes down leader election
      final String coreName = badReplica.replica.getStr(ZkStateReader.CORE_NAME_PROP);
      log.debug("submit call to {}", createUrl);
      MDC.put("OverseerAutoReplicaFailoverThread.createUrl", createUrl);
      try {
        updateExecutor.submit(() -> createSolrCore(collection, createUrl, dataDir, ulogDir, coreNodeName, coreName, shardId));
      } finally {
        MDC.remove("OverseerAutoReplicaFailoverThread.createUrl");
      }

      // wait to see state for core we just created
      boolean success = ClusterStateUtil.waitToSeeLiveReplica(zkStateReader, collection, coreNodeName, createUrl, 30000);
      if (!success) {
        log.error("Creating new replica appears to have failed, timed out waiting to see created SolrCore register in the clusterstate.");
        return false;
      }
      return true;
    }

    log.warn("Could not find dataDir or ulogDir in cluster state.");

    return false;
  }
  private static int findDownReplicasInSlice(ClusterState clusterState, DocCollection collection, Slice slice,
      final Collection<DownReplica> badReplicas) {
    int goodReplicas = 0;
    Collection<Replica> replicas = slice.getReplicas();
    if (replicas != null) {
      for (Replica replica : replicas) {
        // on a live node?
        boolean live = clusterState.liveNodesContain(replica.getNodeName());

        final Replica.State state = replica.getState();
        final boolean okayState = state == Replica.State.DOWN || state == Replica.State.RECOVERING
            || state == Replica.State.ACTIVE;

        log.debug("Process replica name={} live={} state={}", replica.getName(), live, state.toString());

        if (live && okayState) {
          goodReplicas++;
        } else {
          DownReplica badReplica = new DownReplica();
          badReplica.replica = replica;
          badReplica.slice = slice;
          badReplica.collection = collection;
          badReplicas.add(badReplica);
        }
      }
    }
    log.debug("bad replicas for slice {}", badReplicas);
    return goodReplicas;
  }

  /**
   * @return the best node to replace the badReplica on, or null if there is no
   *         such node
   */
  static String getBestCreateUrl(ZkStateReader zkStateReader, DownReplica badReplica, Integer maxCoreCount) {
    assert badReplica != null;
    assert badReplica.collection != null;
    assert badReplica.slice != null;
    log.debug("getBestCreateUrl for {}", badReplica.replica);
    Map<String,Counts> counts = new HashMap<>();
    Set<String> unsuitableHosts = new HashSet<>();

    Set<String> liveNodes = new HashSet<>(zkStateReader.getClusterState().getLiveNodes());
    Map<String,Integer> coresPerNode = new HashMap<>();

    ClusterState clusterState = zkStateReader.getClusterState();
    if (clusterState != null) {
      Map<String,DocCollection> collections = clusterState.getCollectionsMap();
      for (Map.Entry<String,DocCollection> entry : collections.entrySet()) {
        String collection = entry.getKey();
        log.debug("look at collection {} as possible create candidate", collection);
        DocCollection docCollection = entry.getValue();
        // TODO - only operate on collections with sharedfs failover = true ??
        Collection<Slice> slices = docCollection.getSlices();
        for (Slice slice : slices) {
          // only look at active shards
          if (slice.getState() == Slice.State.ACTIVE) {
            log.debug("look at slice {} for collection {} as possible create candidate", slice.getName(), collection);
            Collection<Replica> replicas = slice.getReplicas();

            for (Replica replica : replicas) {
              liveNodes.remove(replica.getNodeName());
              String baseUrl = replica.getStr(ZkStateReader.BASE_URL_PROP);
              if (coresPerNode.containsKey(baseUrl)) {
                Integer nodeCount = coresPerNode.get(baseUrl);
                coresPerNode.put(baseUrl, nodeCount + 1);
              } else {
                coresPerNode.put(baseUrl, 1);
              }
              if (baseUrl.equals(badReplica.replica.getStr(ZkStateReader.BASE_URL_PROP))) {
                continue;
              }
              // on a live node?
log.debug("collection={} nodename={} livenodes={}", collection, replica.getNodeName(), clusterState.getLiveNodes()); boolean live = clusterState.liveNodesContain(replica.getNodeName()); log.debug("collection={} look at replica {} as possible create candidate, live={}", collection, replica.getName(), live); if (live) { Counts cnt = counts.get(baseUrl); if (cnt == null) { cnt = new Counts(); } if (badReplica.collection.getName().equals(collection)) { cnt.negRankingWeight += 3; cnt.collectionShardsOnNode += 1; } else { cnt.negRankingWeight += 1; } if (badReplica.collection.getName().equals(collection) && badReplica.slice.getName().equals(slice.getName())) { cnt.ourReplicas++; } Integer maxShardsPerNode = badReplica.collection.getMaxShardsPerNode(); if (maxShardsPerNode == null) { log.warn("maxShardsPerNode is not defined for collection, name=" + badReplica.collection.getName()); maxShardsPerNode = Integer.MAX_VALUE; } log.debug("collection={} node={} maxShardsPerNode={} maxCoresPerNode={} potential hosts={}", collection, baseUrl, maxShardsPerNode, maxCoreCount, cnt); Collection<Replica> badSliceReplicas = null; DocCollection c = clusterState.getCollection(badReplica.collection.getName()); if (c != null) { Slice s = c.getSlice(badReplica.slice.getName()); if (s != null) { badSliceReplicas = s.getReplicas(); } } boolean alreadyExistsOnNode = replicaAlreadyExistsOnNode(zkStateReader.getClusterState(), badSliceReplicas, badReplica, baseUrl); if (unsuitableHosts.contains(baseUrl) || alreadyExistsOnNode || cnt.collectionShardsOnNode >= maxShardsPerNode || (maxCoreCount != null && coresPerNode.get(baseUrl) >= maxCoreCount) ) { counts.remove(baseUrl); unsuitableHosts.add(baseUrl); log.debug("not a candidate node, collection={} node={} max shards per node={} good replicas={}", collection, baseUrl, maxShardsPerNode, cnt); } else { counts.put(baseUrl, cnt); log.debug("is a candidate node, collection={} node={} max shards per node={} good replicas={}", collection, baseUrl, maxShardsPerNode, cnt); } } } } } } } for (String node : liveNodes) { counts.put(zkStateReader.getBaseUrlForNodeName(node), new Counts(0, 0)); } if (counts.size() == 0) { log.debug("no suitable hosts found for getBestCreateUrl for collection={}", badReplica.collection.getName()); return null; } ValueComparator vc = new ValueComparator(counts); Map<String,Counts> sortedCounts = new TreeMap<String, Counts>(vc); sortedCounts.putAll(counts); log.debug("empty nodes={} for collection={}", liveNodes, badReplica.collection.getName()); log.debug("sorted hosts={} for collection={}", sortedCounts, badReplica.collection.getName()); log.debug("unsuitable hosts={} for collection={}", unsuitableHosts, badReplica.collection.getName()); return sortedCounts.keySet().iterator().next(); } private static boolean replicaAlreadyExistsOnNode(ClusterState clusterState, Collection<Replica> replicas, DownReplica badReplica, String baseUrl) { if (replicas != null) { log.debug("collection={} check if replica already exists on node using replicas {}", badReplica.collection.getName(), getNames(replicas)); for (Replica replica : replicas) { final Replica.State state = replica.getState(); if (!replica.getName().equals(badReplica.replica.getName()) && replica.getStr(ZkStateReader.BASE_URL_PROP).equals(baseUrl) && clusterState.liveNodesContain(replica.getNodeName()) && (state == Replica.State.ACTIVE || state == Replica.State.DOWN || state == Replica.State.RECOVERING)) { log.debug("collection={} replica already exists on node, bad replica={}, existing replica={}, node 
name={}", badReplica.collection.getName(), badReplica.replica.getName(), replica.getName(), replica.getNodeName()); return true; } } } log.debug("collection={} replica does not yet exist on node: {}", badReplica.collection.getName(), baseUrl); return false; } private static Object getNames(Collection<Replica> replicas) { Set<String> names = new HashSet<>(replicas.size()); for (Replica replica : replicas) { names.add(replica.getName()); } return names; } private boolean createSolrCore(final String collection, final String createUrl, final String dataDir, final String ulogDir, final String coreNodeName, final String coreName, final String shardId) { try (HttpSolrClient client = new HttpSolrClient.Builder(createUrl).build()) { log.debug("create url={}", createUrl); client.setConnectionTimeout(30000); client.setSoTimeout(60000); Create createCmd = new Create(); createCmd.setCollection(collection); createCmd.setCoreNodeName(coreNodeName); // TODO: how do we ensure unique coreName // for now, the collections API will use unique names createCmd.setShardId(shardId); createCmd.setCoreName(coreName); createCmd.setDataDir(dataDir); createCmd.setUlogDir(ulogDir.substring(0, ulogDir.length() - "/tlog".length())); client.request(createCmd); } catch (Exception e) { SolrException.log(log, "Exception trying to create new replica on " + createUrl, e); return false; } return true; } private static class ValueComparator implements Comparator<String> { Map<String,Counts> map; public ValueComparator(Map<String,Counts> map) { this.map = map; } public int compare(String a, String b) { if (map.get(a).negRankingWeight >= map.get(b).negRankingWeight) { return 1; } else { return -1; } } } @Override public void close() { isClosed = true; Thread lThread = thread; if (lThread != null) { lThread.interrupt(); } } public boolean isClosed() { return isClosed; } private static class Counts { int collectionShardsOnNode = 0; int negRankingWeight = 0; int ourReplicas = 0; private Counts() { } private Counts(int totalReplicas, int ourReplicas) { this.negRankingWeight = totalReplicas; this.ourReplicas = ourReplicas; } @Override public String toString() { return "Counts [negRankingWeight=" + negRankingWeight + ", sameSliceCount=" + ourReplicas + ", collectionShardsOnNode=" + collectionShardsOnNode + "]"; } } static class DownReplica { Replica replica; Slice slice; DocCollection collection; @Override public String toString() { return "DownReplica [replica=" + replica.getName() + ", slice=" + slice.getName() + ", collection=" + collection.getName() + "]"; } } }