RecoveryStrategy.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.cloud;

import java.io.Closeable;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.SocketTimeoutException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;

import org.apache.http.client.methods.HttpUriRequest;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.store.Directory;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient.HttpUriRequestResponse;
import org.apache.solr.client.solrj.request.AbstractUpdateRequest;
import org.apache.solr.client.solrj.request.CoreAdminRequest.WaitForState;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.ZkCoreNodeProps;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.cloud.ZooKeeperException;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.CoreDescriptor;
import org.apache.solr.core.DirectoryFactory.DirContext;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.ReplicationHandler;
import org.apache.solr.logging.MDCLoggingContext;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrRequestHandler;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.update.CommitUpdateCommand;
import org.apache.solr.update.PeerSync;
import org.apache.solr.update.UpdateLog;
import org.apache.solr.update.UpdateLog.RecoveryInfo;
import org.apache.solr.update.processor.DistributedUpdateProcessor;
import org.apache.solr.util.RefCounted;
import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.plugin.NamedListInitializedPlugin;
import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This class may change in future and customisations are not supported
 * between versions in terms of API or back compat behaviour.
 * @lucene.experimental
 */
public class RecoveryStrategy implements Runnable, Closeable {

  public static class Builder implements NamedListInitializedPlugin {
    private NamedList args;
    @Override
    public void init(NamedList args) {
      this.args = args;
    }
    // this should only be used from SolrCoreState
    public RecoveryStrategy create(CoreContainer cc, CoreDescriptor cd,
        RecoveryStrategy.RecoveryListener recoveryListener) {
      final RecoveryStrategy recoveryStrategy = newRecoveryStrategy(cc, cd, recoveryListener);
      SolrPluginUtils.invokeSetters(recoveryStrategy, args);
      return recoveryStrategy;
    }
    protected RecoveryStrategy newRecoveryStrategy(CoreContainer cc, CoreDescriptor cd,
        RecoveryStrategy.RecoveryListener recoveryListener) {
      return new RecoveryStrategy(cc, cd, recoveryListener);
    }
  }

  private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  private int waitForUpdatesWithStaleStatePauseMilliSeconds = Integer.getInteger("solr.cloud.wait-for-updates-with-stale-state-pause", 2500);
  private int maxRetries = 500;
  private int startingRecoveryDelayMilliSeconds = 5000;

  public static interface RecoveryListener {
    public void recovered();
    public void failed();
  }
  
  private volatile boolean close = false;

  private RecoveryListener recoveryListener;
  private ZkController zkController;
  private String baseUrl;
  private String coreZkNodeName;
  private ZkStateReader zkStateReader;
  private volatile String coreName;
  private int retries;
  private boolean recoveringAfterStartup;
  private CoreContainer cc;
  private volatile HttpUriRequest prevSendPreRecoveryHttpUriRequest;
  private boolean onlyLeaderIndexes;

  protected RecoveryStrategy(CoreContainer cc, CoreDescriptor cd, RecoveryListener recoveryListener) {
    this.cc = cc;
    this.coreName = cd.getName();
    this.recoveryListener = recoveryListener;
    zkController = cc.getZkController();
    zkStateReader = zkController.getZkStateReader();
    baseUrl = zkController.getBaseUrl();
    coreZkNodeName = cd.getCloudDescriptor().getCoreNodeName();
    String collection = cd.getCloudDescriptor().getCollectionName();
    onlyLeaderIndexes = zkStateReader.getClusterState().getCollection(collection).getRealtimeReplicas() == 1;
  }

  final public int getWaitForUpdatesWithStaleStatePauseMilliSeconds() {
    return waitForUpdatesWithStaleStatePauseMilliSeconds;
  }

  final public void setWaitForUpdatesWithStaleStatePauseMilliSeconds(int waitForUpdatesWithStaleStatePauseMilliSeconds) {
    this.waitForUpdatesWithStaleStatePauseMilliSeconds = waitForUpdatesWithStaleStatePauseMilliSeconds;
  }

  final public int getMaxRetries() {
    return maxRetries;
  }

  final public void setMaxRetries(int maxRetries) {
    this.maxRetries = maxRetries;
  }

  final public int getStartingRecoveryDelayMilliSeconds() {
    return startingRecoveryDelayMilliSeconds;
  }

  final public void setStartingRecoveryDelayMilliSeconds(int startingRecoveryDelayMilliSeconds) {
    this.startingRecoveryDelayMilliSeconds = startingRecoveryDelayMilliSeconds;
  }

  final public boolean getRecoveringAfterStartup() {
    return recoveringAfterStartup;
  }

  final public void setRecoveringAfterStartup(boolean recoveringAfterStartup) {
    this.recoveringAfterStartup = recoveringAfterStartup;
  }

  // make sure any threads stop retrying
  @Override
  final public void close() {
    close = true;
    if (prevSendPreRecoveryHttpUriRequest != null) {
      prevSendPreRecoveryHttpUriRequest.abort();
    }
    LOG.warn("Stopping recovery for core=[{}] coreNodeName=[{}]", coreName, coreZkNodeName);
  }

  final private void recoveryFailed(final SolrCore core,
      final ZkController zkController, final String baseUrl,
      final String shardZkNodeName, final CoreDescriptor cd) throws KeeperException, InterruptedException {
    SolrException.log(LOG, "Recovery failed - I give up.");
    try {
      zkController.publish(cd, Replica.State.RECOVERY_FAILED);
    } finally {
      close();
      recoveryListener.failed();
    }
  }
  
  /**
   * This method may change in future and customisations are not supported
   * between versions in terms of API or back compat behaviour.
   * @lucene.experimental
   */
  protected String getReplicateLeaderUrl(ZkNodeProps leaderprops) {
    return new ZkCoreNodeProps(leaderprops).getCoreUrl();
  }

  final private void replicate(String nodeName, SolrCore core, ZkNodeProps leaderprops)
      throws SolrServerException, IOException {

    final String leaderUrl = getReplicateLeaderUrl(leaderprops);
    
    LOG.info("Attempting to replicate from [{}].", leaderUrl);
    
    // send commit
    commitOnLeader(leaderUrl);
    
    // use rep handler directly, so we can do this sync rather than async
    SolrRequestHandler handler = core.getRequestHandler(ReplicationHandler.PATH);
    ReplicationHandler replicationHandler = (ReplicationHandler) handler;
    
    if (replicationHandler == null) {
      throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE,
          "Skipping recovery, no " + ReplicationHandler.PATH + " handler found");
    }
    
    ModifiableSolrParams solrParams = new ModifiableSolrParams();
    solrParams.set(ReplicationHandler.MASTER_URL, leaderUrl);
    
    if (isClosed()) return; // we check closed on return
    boolean success = replicationHandler.doFetch(solrParams, false).getSuccessful();
    
    if (!success) {
      throw new SolrException(ErrorCode.SERVER_ERROR, "Replication for recovery failed.");
    }
    
    // solrcloud_debug
    if (LOG.isDebugEnabled()) {
      try {
        RefCounted<SolrIndexSearcher> searchHolder = core
            .getNewestSearcher(false);
        SolrIndexSearcher searcher = searchHolder.get();
        Directory dir = core.getDirectoryFactory().get(core.getIndexDir(), DirContext.META_DATA, null);
        try {
          LOG.debug(core.getCoreContainer()
              .getZkController().getNodeName()
              + " replicated "
              + searcher.search(new MatchAllDocsQuery(), 1).totalHits
              + " from "
              + leaderUrl
              + " gen:"
              + (core.getDeletionPolicy().getLatestCommit() != null ? "null" : core.getDeletionPolicy().getLatestCommit().getGeneration())
              + " data:" + core.getDataDir()
              + " index:" + core.getIndexDir()
              + " newIndex:" + core.getNewIndexDir()
              + " files:" + Arrays.asList(dir.listAll()));
        } finally {
          core.getDirectoryFactory().release(dir);
          searchHolder.decref();
        }
      } catch (Exception e) {
        LOG.debug("Error in solrcloud_debug block", e);
      }
    }

  }

  final private void commitOnLeader(String leaderUrl) throws SolrServerException,
      IOException {
    try (HttpSolrClient client = new HttpSolrClient.Builder(leaderUrl).build()) {
      client.setConnectionTimeout(30000);
      UpdateRequest ureq = new UpdateRequest();
      ureq.setParams(new ModifiableSolrParams());
      ureq.getParams().set(DistributedUpdateProcessor.COMMIT_END_POINT, true);
      ureq.getParams().set(UpdateParams.OPEN_SEARCHER, onlyLeaderIndexes);
      ureq.setAction(AbstractUpdateRequest.ACTION.COMMIT, false, true).process(
          client);
    }
  }

  @Override
  final public void run() {

    // set request info for logging
    try (SolrCore core = cc.getCore(coreName)) {

      if (core == null) {
        SolrException.log(LOG, "SolrCore not found - cannot recover:" + coreName);
        return;
      }
      MDCLoggingContext.setCore(core);

      LOG.info("Starting recovery process. recoveringAfterStartup=" + recoveringAfterStartup);

      try {
        doRecovery(core);
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        SolrException.log(LOG, "", e);
        throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
      } catch (Exception e) {
        LOG.error("", e);
        throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
      }
    } finally {
      MDCLoggingContext.clear();
    }
  }

  // TODO: perhaps make this grab a new core each time through the loop to handle core reloads?
  final public void doRecovery(SolrCore core) throws KeeperException, InterruptedException {
    boolean replayed = false;
    boolean successfulRecovery = false;

    UpdateLog ulog;
    ulog = core.getUpdateHandler().getUpdateLog();
    if (ulog == null) {
      SolrException.log(LOG, "No UpdateLog found - cannot recover.");
      recoveryFailed(core, zkController, baseUrl, coreZkNodeName,
          core.getCoreDescriptor());
      return;
    }

    // we temporary ignore peersync for realtimeReplicas mode
    boolean firstTime = !onlyLeaderIndexes;

    List<Long> recentVersions;
    try (UpdateLog.RecentUpdates recentUpdates = ulog.getRecentUpdates()) {
      recentVersions = recentUpdates.getVersions(ulog.getNumRecordsToKeep());
    } catch (Exception e) {
      SolrException.log(LOG, "Corrupt tlog - ignoring.", e);
      recentVersions = new ArrayList<>(0);
    }

    List<Long> startingVersions = ulog.getStartingVersions();

    if (startingVersions != null && recoveringAfterStartup) {
      try {
        int oldIdx = 0; // index of the start of the old list in the current list
        long firstStartingVersion = startingVersions.size() > 0 ? startingVersions.get(0) : 0;
        
        for (; oldIdx < recentVersions.size(); oldIdx++) {
          if (recentVersions.get(oldIdx) == firstStartingVersion) break;
        }
        
        if (oldIdx > 0) {
          LOG.info("####### Found new versions added after startup: num=[{}]", oldIdx);
          LOG.info("###### currentVersions=[{}]",recentVersions);
        }
        
        LOG.info("###### startupVersions=[{}]", startingVersions);
      } catch (Exception e) {
        SolrException.log(LOG, "Error getting recent versions.", e);
        recentVersions = new ArrayList<>(0);
      }
    }

    if (recoveringAfterStartup) {
      // if we're recovering after startup (i.e. we have been down), then we need to know what the last versions were
      // when we went down.  We may have received updates since then.
      recentVersions = startingVersions;
      try {
        if ((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0) {
          // last operation at the time of startup had the GAP flag set...
          // this means we were previously doing a full index replication
          // that probably didn't complete and buffering updates in the
          // meantime.
          LOG.info("Looks like a previous replication recovery did not complete - skipping peer sync.");
          firstTime = false; // skip peersync
        }
      } catch (Exception e) {
        SolrException.log(LOG, "Error trying to get ulog starting operation.", e);
        firstTime = false; // skip peersync
      }
    }

    if (onlyLeaderIndexes) {
      zkController.stopReplicationFromLeader(coreName);
    }

    Future<RecoveryInfo> replayFuture = null;
    while (!successfulRecovery && !Thread.currentThread().isInterrupted() && !isClosed()) { // don't use interruption or it will close channels though
      try {
        CloudDescriptor cloudDesc = core.getCoreDescriptor().getCloudDescriptor();
        ZkNodeProps leaderprops = zkStateReader.getLeaderRetry(
            cloudDesc.getCollectionName(), cloudDesc.getShardId());
      
        final String leaderBaseUrl = leaderprops.getStr(ZkStateReader.BASE_URL_PROP);
        final String leaderCoreName = leaderprops.getStr(ZkStateReader.CORE_NAME_PROP);

        String leaderUrl = ZkCoreNodeProps.getCoreUrl(leaderBaseUrl, leaderCoreName);

        String ourUrl = ZkCoreNodeProps.getCoreUrl(baseUrl, coreName);

        boolean isLeader = leaderUrl.equals(ourUrl);
        if (isLeader && !cloudDesc.isLeader()) {
          throw new SolrException(ErrorCode.SERVER_ERROR, "Cloud state still says we are leader.");
        }
        if (cloudDesc.isLeader()) {
          // we are now the leader - no one else must have been suitable
          LOG.warn("We have not yet recovered - but we are now the leader!");
          LOG.info("Finished recovery process.");
          zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
          return;
        }
        
        LOG.info("Begin buffering updates. core=[{}]", coreName);
        ulog.bufferUpdates();
        replayed = false;
        
        LOG.info("Publishing state of core [{}] as recovering, leader is [{}] and I am [{}]", core.getName(), leaderUrl,
            ourUrl);
        zkController.publish(core.getCoreDescriptor(), Replica.State.RECOVERING);
        
        
        final Slice slice = zkStateReader.getClusterState().getSlice(cloudDesc.getCollectionName(),
            cloudDesc.getShardId());
            
        try {
          prevSendPreRecoveryHttpUriRequest.abort();
        } catch (NullPointerException e) {
          // okay
        }
        
        if (isClosed()) {
          LOG.info("RecoveryStrategy has been closed");
          break;
        }

        sendPrepRecoveryCmd(leaderBaseUrl, leaderCoreName, slice);
        
        if (isClosed()) {
          LOG.info("RecoveryStrategy has been closed");
          break;
        }
        
        // we wait a bit so that any updates on the leader
        // that started before they saw recovering state 
        // are sure to have finished (see SOLR-7141 for
        // discussion around current value)
        try {
          Thread.sleep(waitForUpdatesWithStaleStatePauseMilliSeconds);
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
        }

        // first thing we just try to sync
        if (firstTime) {
          firstTime = false; // only try sync the first time through the loop
          LOG.info("Attempting to PeerSync from [{}] - recoveringAfterStartup=[{}]", leaderUrl, recoveringAfterStartup);
          // System.out.println("Attempting to PeerSync from " + leaderUrl
          // + " i am:" + zkController.getNodeName());
          PeerSync peerSync = new PeerSync(core,
              Collections.singletonList(leaderUrl), ulog.getNumRecordsToKeep(), false, false);
          peerSync.setStartingVersions(recentVersions);
          boolean syncSuccess = peerSync.sync().isSuccess();
          if (syncSuccess) {
            SolrQueryRequest req = new LocalSolrQueryRequest(core,
                new ModifiableSolrParams());
            // force open a new searcher
            core.getUpdateHandler().commit(new CommitUpdateCommand(req, false));
            LOG.info("PeerSync stage of recovery was successful.");

            // solrcloud_debug
            cloudDebugLog(core, "synced");
            
            LOG.info("Replaying updates buffered during PeerSync.");
            replay(core);
            replayed = true;
            
            // sync success
            successfulRecovery = true;
            return;
          }

          LOG.info("PeerSync Recovery was not successful - trying replication.");
        }

        if (isClosed()) {
          LOG.info("RecoveryStrategy has been closed");
          break;
        }
        
        LOG.info("Starting Replication Recovery.");

        try {

          replicate(zkController.getNodeName(), core, leaderprops);

          if (isClosed()) {
            LOG.info("RecoveryStrategy has been closed");
            break;
          }

          replayFuture = replay(core);
          replayed = true;
          
          if (isClosed()) {
            LOG.info("RecoveryStrategy has been closed");
            break;
          }

          LOG.info("Replication Recovery was successful.");
          successfulRecovery = true;
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
          LOG.warn("Recovery was interrupted", e);
          close = true;
        } catch (Exception e) {
          SolrException.log(LOG, "Error while trying to recover", e);
        }

      } catch (Exception e) {
        SolrException.log(LOG, "Error while trying to recover. core=" + coreName, e);
      } finally {
        if (!replayed) {
          // dropBufferedUpdate()s currently only supports returning to ACTIVE state, which risks additional updates
          // being added w/o UpdateLog.FLAG_GAP, hence losing the info on restart that we are not up-to-date.
          // For now, ulog will simply remain in BUFFERING state, and an additional call to bufferUpdates() will
          // reset our starting point for playback.
          LOG.info("Replay not started, or was not successful... still buffering updates.");

          /** this prev code is retained in case we want to switch strategies.
          try {
            ulog.dropBufferedUpdates();
          } catch (Exception e) {
            SolrException.log(log, "", e);
          }
          **/
        }
        if (successfulRecovery) {
          LOG.info("Registering as Active after recovery.");
          try {
            if (onlyLeaderIndexes) {
              zkController.startReplicationFromLeader(coreName);
            }
            zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
          } catch (Exception e) {
            LOG.error("Could not publish as ACTIVE after succesful recovery", e);
            successfulRecovery = false;
          }
          
          if (successfulRecovery) {
            close = true;
            recoveryListener.recovered();
          }
        }
      }

      if (!successfulRecovery) {
        // lets pause for a moment and we need to try again...
        // TODO: we don't want to retry for some problems?
        // Or do a fall off retry...
        try {

          if (isClosed()) {
            LOG.info("RecoveryStrategy has been closed");
            break;
          }
          
          LOG.error("Recovery failed - trying again... (" + retries + ")");
          
          retries++;
          if (retries >= maxRetries) {
            SolrException.log(LOG, "Recovery failed - max retries exceeded (" + retries + ").");
            try {
              recoveryFailed(core, zkController, baseUrl, coreZkNodeName, core.getCoreDescriptor());
            } catch (Exception e) {
              SolrException.log(LOG, "Could not publish that recovery failed", e);
            }
            break;
          }
        } catch (Exception e) {
          SolrException.log(LOG, "An error has occurred during recovery", e);
        }

        try {
          // Wait an exponential interval between retries, start at 5 seconds and work up to a minute.
          // If we're at attempt >= 4, there's no point computing pow(2, retries) because the result 
          // will always be the minimum of the two (12). Since we sleep at 5 seconds sub-intervals in
          // order to check if we were closed, 12 is chosen as the maximum loopCount (5s * 12 = 1m).
          double loopCount = retries < 4 ? Math.min(Math.pow(2, retries), 12) : 12;
          LOG.info("Wait [{}] seconds before trying to recover again (attempt={})", loopCount, retries);
          for (int i = 0; i < loopCount; i++) {
            if (isClosed()) {
              LOG.info("RecoveryStrategy has been closed");
              break; // check if someone closed us
            }
            Thread.sleep(startingRecoveryDelayMilliSeconds);
          }
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
          LOG.warn("Recovery was interrupted.", e);
          close = true;
        }
      }

    }

    // if replay was skipped (possibly to due pulling a full index from the leader),
    // then we still need to update version bucket seeds after recovery
    if (successfulRecovery && replayFuture == null) {
      LOG.info("Updating version bucket highest from index after successful recovery.");
      core.seedVersionBuckets();
    }

    LOG.info("Finished recovery process, successful=[{}]", Boolean.toString(successfulRecovery));
  }

  public static Runnable testing_beforeReplayBufferingUpdates;

  final private Future<RecoveryInfo> replay(SolrCore core)
      throws InterruptedException, ExecutionException {
    if (testing_beforeReplayBufferingUpdates != null) {
      testing_beforeReplayBufferingUpdates.run();
    }
    if (onlyLeaderIndexes) {
      // roll over all updates during buffering to new tlog, make RTG available
      SolrQueryRequest req = new LocalSolrQueryRequest(core,
          new ModifiableSolrParams());
      core.getUpdateHandler().getUpdateLog().copyOverBufferingUpdates(new CommitUpdateCommand(req, false));
      return null;
    }
    Future<RecoveryInfo> future = core.getUpdateHandler().getUpdateLog().applyBufferedUpdates();
    if (future == null) {
      // no replay needed\
      LOG.info("No replay needed.");
    } else {
      LOG.info("Replaying buffered documents.");
      // wait for replay
      RecoveryInfo report = future.get();
      if (report.failed) {
        SolrException.log(LOG, "Replay failed");
        throw new SolrException(ErrorCode.SERVER_ERROR, "Replay failed");
      }
    }
    
    // solrcloud_debug
    cloudDebugLog(core, "replayed");
    
    return future;
  }
  
  final private void cloudDebugLog(SolrCore core, String op) {
    if (!LOG.isDebugEnabled()) {
      return;
    }
    try {
      RefCounted<SolrIndexSearcher> searchHolder = core.getNewestSearcher(false);
      SolrIndexSearcher searcher = searchHolder.get();
      try {
        final int totalHits = searcher.search(new MatchAllDocsQuery(), 1).totalHits;
        final String nodeName = core.getCoreContainer().getZkController().getNodeName();
        LOG.debug("[{}] {} [{} total hits]", nodeName, op, totalHits);
      } finally {
        searchHolder.decref();
      }
    } catch (Exception e) {
      LOG.debug("Error in solrcloud_debug block", e);
    }
  }

  final public boolean isClosed() {
    return close;
  }
  
  final private void sendPrepRecoveryCmd(String leaderBaseUrl, String leaderCoreName, Slice slice)
      throws SolrServerException, IOException, InterruptedException, ExecutionException {

    WaitForState prepCmd = new WaitForState();
    prepCmd.setCoreName(leaderCoreName);
    prepCmd.setNodeName(zkController.getNodeName());
    prepCmd.setCoreNodeName(coreZkNodeName);
    prepCmd.setState(Replica.State.RECOVERING);
    prepCmd.setCheckLive(true);
    prepCmd.setOnlyIfLeader(true);
    final Slice.State state = slice.getState();
    if (state != Slice.State.CONSTRUCTION && state != Slice.State.RECOVERY && state != Slice.State.RECOVERY_FAILED) {
      prepCmd.setOnlyIfLeaderActive(true);
    }

    final int maxTries = 30;
    for (int numTries = 0; numTries < maxTries; numTries++) {
      try {
        sendPrepRecoveryCmd(leaderBaseUrl, prepCmd);
        break;
      } catch (ExecutionException e) {
        if (e.getCause() instanceof SolrServerException) {
          SolrServerException solrException = (SolrServerException) e.getCause();
          if (solrException.getRootCause() instanceof SocketTimeoutException && numTries < maxTries) {
            LOG.warn("Socket timeout on send prep recovery cmd, retrying.. ");
            continue;
          }
        }
        throw  e;
      }
    }
  }

  final private void sendPrepRecoveryCmd(String leaderBaseUrl, WaitForState prepCmd)
      throws SolrServerException, IOException, InterruptedException, ExecutionException {
    try (HttpSolrClient client = new HttpSolrClient.Builder(leaderBaseUrl).build()) {
      client.setConnectionTimeout(10000);
      client.setSoTimeout(10000);
      HttpUriRequestResponse mrr = client.httpUriRequest(prepCmd);
      prevSendPreRecoveryHttpUriRequest = mrr.httpUriRequest;

      LOG.info("Sending prep recovery command to [{}]; [{}]", leaderBaseUrl, prepCmd.toString());

      mrr.future.get();
    }
  }

}