LeaderInitiatedRecoveryThread.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.cloud;

import org.apache.http.NoHttpResponseException;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.CoreAdminRequest.RequestRecovery;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.ZkCoreNodeProps;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.params.CoreAdminParams.CoreAdminAction;
import org.apache.solr.common.util.Utils;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.CoreDescriptor;
import org.apache.zookeeper.KeeperException;
import org.apache.solr.util.RTimer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.lang.invoke.MethodHandles;
import java.net.ConnectException;
import java.net.SocketException;
import java.util.List;

/**
 * Background daemon thread that tries to send the REQUESTRECOVERY to a downed
 * replica; used by a shard leader to nag a replica into recovering after the
 * leader experiences an error trying to send an update request to the replica.
 */
public class LeaderInitiatedRecoveryThread extends Thread {

  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  protected ZkController zkController;
  protected CoreContainer coreContainer;
  protected String collection;
  protected String shardId;
  protected ZkCoreNodeProps nodeProps;
  protected int maxTries;
  private CoreDescriptor leaderCd;
  
  public LeaderInitiatedRecoveryThread(ZkController zkController, 
                                       CoreContainer cc, 
                                       String collection, 
                                       String shardId, 
                                       ZkCoreNodeProps nodeProps,
                                       int maxTries,
                                       CoreDescriptor leaderCd)
  {
    super("LeaderInitiatedRecoveryThread-"+nodeProps.getCoreName());
    this.zkController = zkController;
    this.coreContainer = cc;
    this.collection = collection;
    this.shardId = shardId;    
    this.nodeProps = nodeProps;
    this.maxTries = maxTries;
    this.leaderCd = leaderCd;
    setDaemon(true);
  }
  
  public void run() {
    RTimer timer = new RTimer();

    String replicaCoreName = nodeProps.getCoreName();
    String replicaCoreNodeName = ((Replica) nodeProps.getNodeProps()).getName();
    String replicaNodeName = nodeProps.getNodeName();
    final String replicaUrl = nodeProps.getCoreUrl();

    if (!zkController.isReplicaInRecoveryHandling(replicaUrl)) {
      throw new SolrException(ErrorCode.INVALID_STATE, "Replica: " + replicaUrl + " should have been marked under leader initiated recovery in ZkController but wasn't.");
    }

    boolean sendRecoveryCommand = publishDownState(replicaCoreName, replicaCoreNodeName, replicaNodeName, replicaUrl, false);

    if (sendRecoveryCommand)  {
      try {
        sendRecoveryCommandWithRetry();
      } catch (Exception exc) {
        log.error(getName()+" failed due to: "+exc, exc);
        if (exc instanceof SolrException) {
          throw (SolrException)exc;
        } else {
          throw new SolrException(ErrorCode.SERVER_ERROR, exc);
        }
      } finally {
        zkController.removeReplicaFromLeaderInitiatedRecoveryHandling(replicaUrl);
      }
    } else  {
      // replica is no longer in recovery on this node (may be handled on another node)
      zkController.removeReplicaFromLeaderInitiatedRecoveryHandling(replicaUrl);
    }
    log.info("{} completed successfully after running for {}ms", getName(), timer.getTime());
  }

  public boolean publishDownState(String replicaCoreName, String replicaCoreNodeName, String replicaNodeName, String replicaUrl, boolean forcePublishState) {
    boolean sendRecoveryCommand = true;
    boolean publishDownState = false;

    if (zkController.getZkStateReader().getClusterState().liveNodesContain(replicaNodeName)) {
      try {
        // create a znode that requires the replica needs to "ack" to verify it knows it was out-of-sync
        updateLIRState(replicaCoreNodeName);

        log.info("Put replica core={} coreNodeName={} on " +
            replicaNodeName + " into leader-initiated recovery.", replicaCoreName, replicaCoreNodeName);
        publishDownState = true;
      } catch (Exception e) {
        Throwable setLirZnodeFailedCause = SolrException.getRootCause(e);
        log.error("Leader failed to set replica " +
            nodeProps.getCoreUrl() + " state to DOWN due to: " + setLirZnodeFailedCause, setLirZnodeFailedCause);
        if (setLirZnodeFailedCause instanceof KeeperException.SessionExpiredException
            || setLirZnodeFailedCause instanceof KeeperException.ConnectionLossException
            || setLirZnodeFailedCause instanceof ZkController.NotLeaderException) {
          // our session is expired, which means our state is suspect, so don't go
          // putting other replicas in recovery (see SOLR-6511)
          sendRecoveryCommand = false;
          forcePublishState = false; // no need to force publish any state in this case
        } // else will go ahead and try to send the recovery command once after this error
      }
    } else  {
      log.info("Node " + replicaNodeName +
              " is not live, so skipping leader-initiated recovery for replica: core={} coreNodeName={}",
          replicaCoreName, replicaCoreNodeName);
      // publishDownState will be false to avoid publishing the "down" state too many times
      // as many errors can occur together and will each call into this method (SOLR-6189)
      forcePublishState = false; // no need to force publish the state because replica is not live
      sendRecoveryCommand = false; // no need to send recovery messages as well
    }

    try {
      if (publishDownState || forcePublishState) {
        ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, "state",
            ZkStateReader.STATE_PROP, Replica.State.DOWN.toString(),
            ZkStateReader.BASE_URL_PROP, nodeProps.getBaseUrl(),
            ZkStateReader.CORE_NAME_PROP, nodeProps.getCoreName(),
            ZkStateReader.NODE_NAME_PROP, nodeProps.getNodeName(),
            ZkStateReader.SHARD_ID_PROP, shardId,
            ZkStateReader.COLLECTION_PROP, collection);
        log.warn("Leader is publishing core={} coreNodeName ={} state={} on behalf of un-reachable replica {}",
            replicaCoreName, replicaCoreNodeName, Replica.State.DOWN.toString(), replicaUrl);
        zkController.getOverseerJobQueue().offer(Utils.toJSON(m));
      }
    } catch (Exception e) {
      log.error("Could not publish 'down' state for replicaUrl: {}", replicaUrl, e);
    }

    return sendRecoveryCommand;
  }

  /*
  protected scope for testing purposes
   */
  protected void updateLIRState(String replicaCoreNodeName) {
    zkController.updateLeaderInitiatedRecoveryState(collection,
        shardId,
        replicaCoreNodeName, Replica.State.DOWN, leaderCd, true);
  }

  protected void sendRecoveryCommandWithRetry() throws Exception {    
    int tries = 0;
    long waitBetweenTriesMs = 5000L;
    boolean continueTrying = true;

    String replicaCoreName = nodeProps.getCoreName();
    String recoveryUrl = nodeProps.getBaseUrl();
    String replicaNodeName = nodeProps.getNodeName();
    String coreNeedingRecovery = nodeProps.getCoreName();
    String replicaCoreNodeName = ((Replica) nodeProps.getNodeProps()).getName();
    String replicaUrl = nodeProps.getCoreUrl();
    
    log.info(getName()+" started running to send REQUESTRECOVERY command to "+replicaUrl+
        "; will try for a max of "+(maxTries * (waitBetweenTriesMs/1000))+" secs");

    RequestRecovery recoverRequestCmd = new RequestRecovery();
    recoverRequestCmd.setAction(CoreAdminAction.REQUESTRECOVERY);
    recoverRequestCmd.setCoreName(coreNeedingRecovery);
    
    while (continueTrying && ++tries <= maxTries) {
      if (tries > 1) {
        log.warn("Asking core={} coreNodeName={} on " + recoveryUrl +
            " to recover; unsuccessful after "+tries+" of "+maxTries+" attempts so far ...", coreNeedingRecovery, replicaCoreNodeName);
      } else {
        log.info("Asking core={} coreNodeName={} on " + recoveryUrl + " to recover", coreNeedingRecovery, replicaCoreNodeName);
      }

      try (HttpSolrClient client = new HttpSolrClient.Builder(recoveryUrl).build()) {
        client.setSoTimeout(60000);
        client.setConnectionTimeout(15000);
        try {
          client.request(recoverRequestCmd);
          
          log.info("Successfully sent " + CoreAdminAction.REQUESTRECOVERY +
              " command to core={} coreNodeName={} on " + recoveryUrl, coreNeedingRecovery, replicaCoreNodeName);
          
          continueTrying = false; // succeeded, so stop looping
        } catch (Exception t) {
          Throwable rootCause = SolrException.getRootCause(t);
          boolean wasCommError =
              (rootCause instanceof ConnectException ||
                  rootCause instanceof ConnectTimeoutException ||
                  rootCause instanceof NoHttpResponseException ||
                  rootCause instanceof SocketException);

          SolrException.log(log, recoveryUrl + ": Could not tell a replica to recover", t);
          
          if (!wasCommError) {
            continueTrying = false;
          }                                                
        }
      }
      
      // wait a few seconds
      if (continueTrying) {
        try {
          Thread.sleep(waitBetweenTriesMs);
        } catch (InterruptedException ignoreMe) {
          Thread.currentThread().interrupt();          
        }
        
        if (coreContainer.isShutDown()) {
          log.warn("Stop trying to send recovery command to downed replica core={} coreNodeName={} on "
              + replicaNodeName + " because my core container is closed.", coreNeedingRecovery, replicaCoreNodeName);
          continueTrying = false;
          break;
        }
        
        // see if the replica's node is still live, if not, no need to keep doing this loop
        ZkStateReader zkStateReader = zkController.getZkStateReader();
        if (!zkStateReader.getClusterState().liveNodesContain(replicaNodeName)) {
          log.warn("Node "+replicaNodeName+" hosting core "+coreNeedingRecovery+
              " is no longer live. No need to keep trying to tell it to recover!");
          continueTrying = false;
          break;
        }

        String leaderCoreNodeName = leaderCd.getCloudDescriptor().getCoreNodeName();
        // stop trying if I'm no longer the leader
        if (leaderCoreNodeName != null && collection != null) {
          String leaderCoreNodeNameFromZk = null;
          try {
            leaderCoreNodeNameFromZk = zkController.getZkStateReader().getLeaderRetry(collection, shardId, 1000).getName();
          } catch (Exception exc) {
            log.error("Failed to determine if " + leaderCoreNodeName + " is still the leader for " + collection +
                " " + shardId + " before starting leader-initiated recovery thread for " + replicaUrl + " due to: " + exc);
          }
          if (!leaderCoreNodeName.equals(leaderCoreNodeNameFromZk)) {
            log.warn("Stop trying to send recovery command to downed replica core=" + coreNeedingRecovery +
                ",coreNodeName=" + replicaCoreNodeName + " on " + replicaNodeName + " because " +
                leaderCoreNodeName + " is no longer the leader! New leader is " + leaderCoreNodeNameFromZk);
            continueTrying = false;
            break;
          }
          if (!leaderCd.getCloudDescriptor().isLeader()) {
            log.warn("Stop trying to send recovery command to downed replica core=" + coreNeedingRecovery +
                ",coreNodeName=" + replicaCoreNodeName + " on " + replicaNodeName + " because " +
                leaderCoreNodeName + " is no longer the leader!");
            continueTrying = false;
            break;
          }
        }

        // additional safeguard against the replica trying to be in the active state
        // before acknowledging the leader initiated recovery command
        if (collection != null && shardId != null) {
          try {
            // call out to ZooKeeper to get the leader-initiated recovery state
            final Replica.State lirState = zkController.getLeaderInitiatedRecoveryState(collection, shardId, replicaCoreNodeName);
            
            if (lirState == null) {
              log.warn("Stop trying to send recovery command to downed replica core="+coreNeedingRecovery+
                  ",coreNodeName=" + replicaCoreNodeName + " on "+replicaNodeName+" because the znode no longer exists.");
              continueTrying = false;
              break;              
            }
            
            if (lirState == Replica.State.RECOVERING) {
              // replica has ack'd leader initiated recovery and entered the recovering state
              // so we don't need to keep looping to send the command
              continueTrying = false;  
              log.info("Replica "+coreNeedingRecovery+
                  " on node "+replicaNodeName+" ack'd the leader initiated recovery state, "
                      + "no need to keep trying to send recovery command");
            } else {
              String lcnn = zkStateReader.getLeaderRetry(collection, shardId, 5000).getName();
              List<ZkCoreNodeProps> replicaProps = 
                  zkStateReader.getReplicaProps(collection, shardId, lcnn);
              if (replicaProps != null && replicaProps.size() > 0) {
                for (ZkCoreNodeProps prop : replicaProps) {
                  final Replica replica = (Replica) prop.getNodeProps();
                  if (replicaCoreNodeName.equals(replica.getName())) {
                    if (replica.getState() == Replica.State.ACTIVE) {
                      // replica published its state as "active",
                      // which is bad if lirState is still "down"
                      if (lirState == Replica.State.DOWN) {
                        // OK, so the replica thinks it is active, but it never ack'd the leader initiated recovery
                        // so its state cannot be trusted and it needs to be told to recover again ... and we keep looping here
                        log.warn("Replica core={} coreNodeName={} set to active but the leader thinks it should be in recovery;"
                            + " forcing it back to down state to re-run the leader-initiated recovery process; props: " + replicaProps.get(0), coreNeedingRecovery, replicaCoreNodeName);
                        publishDownState(replicaCoreName, replicaCoreNodeName, replicaNodeName, replicaUrl, true);
                      }
                    }
                    break;
                  }
                }
              }
            }                  
          } catch (Exception ignoreMe) {
            log.warn("Failed to determine state of core={} coreNodeName={} due to: "+ignoreMe, coreNeedingRecovery, replicaCoreNodeName);
            // eventually this loop will exhaust max tries and stop so we can just log this for now
          }                
        }
      }
    }
    
    // replica is no longer in recovery on this node (may be handled on another node)
    zkController.removeReplicaFromLeaderInitiatedRecoveryHandling(replicaUrl);
    
    if (continueTrying) {
      // ugh! this means the loop timed out before the recovery command could be delivered
      // how exotic do we want to get here?
      log.error("Timed out after waiting for "+(tries * (waitBetweenTriesMs/1000))+
          " secs to send the recovery request to: "+replicaUrl+"; not much more we can do here?");
      
      // TODO: need to raise a JMX event to allow monitoring tools to take over from here
      
    }    
  }
}