TestStressCloudBlindAtomicUpdates.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.cloud;

import java.lang.invoke.MethodHandles;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.lucene.util.TestUtil;
import org.apache.solr.SolrTestCaseJ4.SuppressSSL;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.embedded.JettySolrRunner;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.client.solrj.request.schema.SchemaRequest.Field;
import org.apache.solr.client.solrj.request.schema.SchemaRequest.FieldType;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.client.solrj.response.schema.SchemaResponse.FieldResponse;
import org.apache.solr.client.solrj.response.schema.SchemaResponse.FieldTypeResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.ExecutorUtil;
import org.apache.solr.util.DefaultSolrThreadFactory;
import org.apache.solr.util.TestInjection;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Stress test of Atomic Updates in a MinCloud Cluster.
 * 
 * Focus of test is parallel threads hammering updates on diff docs using random clients/nodes, 
 * Optimistic Concurrency is not used here because of SOLR-8733, instead we just throw lots of 
 * "inc" operations at a numeric field and check that the math works out at the end.
 */
@Slow
@SuppressSSL(bugUrl="SSL overhead seems to cause OutOfMemory when stress testing")
public class TestStressCloudBlindAtomicUpdates extends SolrCloudTestCase {

  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  private static final String DEBUG_LABEL = MethodHandles.lookup().lookupClass().getName();
  private static final String COLLECTION_NAME = "test_col";
  
  /** A basic client for operations at the cloud level, default collection will be set */
  private static CloudSolrClient CLOUD_CLIENT;
  /** One client per node */
  private static ArrayList<HttpSolrClient> CLIENTS = new ArrayList<>(5);

  /** Service to execute all parallel work 
   * @see #NUM_THREADS
   */
  private static ExecutorService EXEC_SERVICE;

  /** num parallel threads in use by {@link #EXEC_SERVICE} */
  private static int NUM_THREADS;

  /** 
   * Used as an increment and multiplier when deciding how many docs should be in
   * the test index.  1 means every doc in the index is a candidate for updates, bigger numbers mean a
   * larger index is used (so tested docs are more likeely to be spread out in multiple segments)
   */
  private static int DOC_ID_INCR;

  /**
   * The TestInjection configuration to be used for the current test method.
   *
   * Value is set by {@link #clearCloudCollection}, and used by {@link #startTestInjection} -- but only once 
   * initial index seeding has finished (we're focusing on testing atomic updates, not basic indexing).
   */
  private String testInjection = null;
  
  @BeforeClass
  private static void createMiniSolrCloudCluster() throws Exception {
    // NOTE: numDocsToCheck uses atLeast, so nightly & multiplier are alreayd a factor in index size
    // no need to redundently factor them in here as well
    DOC_ID_INCR = TestUtil.nextInt(random(), 1, 7);
    
    NUM_THREADS = atLeast(3);
    EXEC_SERVICE = ExecutorUtil.newMDCAwareFixedThreadPool
      (NUM_THREADS, new DefaultSolrThreadFactory(DEBUG_LABEL));
    
    // at least 2, but don't go crazy on nightly/test.multiplier with "atLeast()"
    final int numShards = TEST_NIGHTLY ? 5 : 2; 
    final int repFactor = 2; 
    final int numNodes = numShards * repFactor;
   
    final String configName = DEBUG_LABEL + "_config-set";
    final Path configDir = Paths.get(TEST_HOME(), "collection1", "conf");
    
    configureCluster(numNodes).addConfig(configName, configDir).configure();

    CLOUD_CLIENT = cluster.getSolrClient();
    CLOUD_CLIENT.setDefaultCollection(COLLECTION_NAME);

    CollectionAdminRequest.createCollection(COLLECTION_NAME, configName, numShards, repFactor)
        .withProperty("config", "solrconfig-tlog.xml")
        .withProperty("schema", "schema-minimal-atomic-stress.xml")
        .process(CLOUD_CLIENT);

    waitForRecoveriesToFinish(CLOUD_CLIENT);

    for (JettySolrRunner jetty : cluster.getJettySolrRunners()) {
      CLIENTS.add(getHttpSolrClient(jetty.getBaseUrl() + "/" + COLLECTION_NAME + "/"));
    }

    // sanity check no one broke the assumptions we make about our schema
    checkExpectedSchemaType( map("name","long",
                                 "class","solr.TrieLongField",
                                 "multiValued",Boolean.FALSE,
                                 "indexed",Boolean.FALSE,
                                 "stored",Boolean.FALSE,
                                 "docValues",Boolean.FALSE) );
  }
  
  @AfterClass
  private static void afterClass() throws Exception {
    TestInjection.reset();
    ExecutorUtil.shutdownAndAwaitTermination(EXEC_SERVICE);
    EXEC_SERVICE = null;
    CLOUD_CLIENT.close(); CLOUD_CLIENT = null;
    for (HttpSolrClient client : CLIENTS) {
      client.close();
    }
    CLIENTS = null;
  }
  
  @Before
  private void clearCloudCollection() throws Exception {
    assertEquals(0, CLOUD_CLIENT.deleteByQuery("*:*").getStatus());
    assertEquals(0, CLOUD_CLIENT.optimize().getStatus());

    TestInjection.reset();
    
    final int injectionPercentage = (int)Math.ceil(atLeast(1) / 2);
    testInjection = usually() ? "false:0" : ("true:" + injectionPercentage);
  }

  /**
   * Assigns {@link #testInjection} to various TestInjection variables.  Calling this 
   * method multiple times in the same method should always result in the same setting being applied 
   * (even if {@link TestInjection#reset} was called in between.
   */
  private void startTestInjection() {
    log.info("TestInjection: fail replica, update pause, tlog pauses: " + testInjection);
    TestInjection.failReplicaRequests = testInjection;
    TestInjection.updateLogReplayRandomPause = testInjection;
    TestInjection.updateRandomPause = testInjection;
  }


  public void test_dv() throws Exception {
    String field = "long_dv";
    checkExpectedSchemaField(map("name", field,
                                 "type","long",
                                 "stored",Boolean.FALSE,
                                 "indexed",Boolean.FALSE,
                                 "docValues",Boolean.TRUE));
    
    checkField(field);
  }
  public void test_dv_stored() throws Exception {
    String field = "long_dv_stored";
    checkExpectedSchemaField(map("name", field,
                                 "type","long",
                                 "stored",Boolean.TRUE,
                                 "indexed",Boolean.FALSE,
                                 "docValues",Boolean.TRUE));
    
    checkField(field);

  }
  public void test_dv_stored_idx() throws Exception {
    String field = "long_dv_stored_idx";
    checkExpectedSchemaField(map("name", field,
                                 "type","long",
                                 "stored",Boolean.TRUE,
                                 "indexed",Boolean.TRUE,
                                 "docValues",Boolean.TRUE));
    
    checkField(field);
  }
  public void test_dv_idx() throws Exception {
    String field = "long_dv_idx";
    checkExpectedSchemaField(map("name", field,
                                 "type","long",
                                 "stored",Boolean.FALSE,
                                 "indexed",Boolean.TRUE,
                                 "docValues",Boolean.TRUE));
    
    checkField(field);
  }
  public void test_stored_idx() throws Exception {
    String field = "long_stored_idx";
    checkExpectedSchemaField(map("name", field,
                                 "type","long",
                                 "stored",Boolean.TRUE,
                                 "indexed",Boolean.TRUE,
                                 "docValues",Boolean.FALSE));
    
    checkField(field);
  }
  
  public void checkField(final String numericFieldName) throws Exception {

    final CountDownLatch abortLatch = new CountDownLatch(1);

    final int numDocsToCheck = atLeast(37);
    final int numDocsInIndex = (numDocsToCheck * DOC_ID_INCR);
    final AtomicLong[] expected = new AtomicLong[numDocsToCheck];

    log.info("Testing " + numericFieldName + ": numDocsToCheck=" + numDocsToCheck + ", numDocsInIndex=" + numDocsInIndex + ", incr=" + DOC_ID_INCR);
    
    // seed the index & keep track of what docs exist and with what values
    for (int id = 0; id < numDocsInIndex; id++) {
      // NOTE: the field we're mutating is a long, but we seed with a random int,
      // and we will inc/dec by random smaller ints, to ensure we never over/under flow
      final int initValue = random().nextInt();
      SolrInputDocument doc = doc(f("id",""+id), f(numericFieldName, initValue));
      UpdateResponse rsp = update(doc).process(CLOUD_CLIENT);
      assertEquals(doc.toString() + " => " + rsp.toString(), 0, rsp.getStatus());
      if (0 == id % DOC_ID_INCR) {
        expected[(int)(id / DOC_ID_INCR)] = new AtomicLong(initValue);
      }
    }
    assertNotNull("Sanity Check no off-by-one in expected init: ", expected[expected.length-1]);
    
    
    // sanity check index contents
    waitForRecoveriesToFinish(CLOUD_CLIENT);
    assertEquals(0, CLOUD_CLIENT.commit().getStatus());
    assertEquals(numDocsInIndex,
                 CLOUD_CLIENT.query(params("q", "*:*")).getResults().getNumFound());

    startTestInjection();
    
    // spin up parallel workers to hammer updates
    List<Future<Worker>> results = new ArrayList<Future<Worker>>(NUM_THREADS);
    for (int workerId = 0; workerId < NUM_THREADS; workerId++) {
      Worker worker = new Worker(workerId, expected, abortLatch, new Random(random().nextLong()),
                                 numericFieldName);
      // ask for the Worker to be returned in the Future so we can inspect it
      results.add(EXEC_SERVICE.submit(worker, worker));
    }
    // check the results of all our workers
    for (Future<Worker> r : results) {
      try {
        Worker w = r.get();
        if (! w.getFinishedOk() ) {
          // quick and dirty sanity check if any workers didn't succeed, but didn't throw an exception either
          abortLatch.countDown();
          log.error("worker={} didn't finish ok, but didn't throw exception?", w.workerId);
        }
      } catch (ExecutionException ee) {
        Throwable rootCause = ee.getCause();
        if (rootCause instanceof Error) {
          // low level error, or test assertion failure - either way don't leave it wrapped
          log.error("Worker exec Error, throwing root cause", ee);
          throw (Error) rootCause;
        } else { 
          log.error("Worker ExecutionException, re-throwing", ee);
          throw ee;
        }
      }
    }

    assertEquals("Abort latch has changed, why didn't we get an exception from a worker?",
                 1L, abortLatch.getCount());
    
    TestInjection.reset();
    waitForRecoveriesToFinish(CLOUD_CLIENT);

    // check all the final index contents match our expectations
    int incorrectDocs = 0;
    for (int id = 0; id < numDocsInIndex; id += DOC_ID_INCR) {
      assert 0 == id % DOC_ID_INCR : "WTF? " + id;
      
      final long expect = expected[(int)(id / DOC_ID_INCR)].longValue();
      
      final String docId = "" + id;
      
      // sometimes include an fq on the expected value to ensure the updated values
      // are "visible" for searching
      final SolrParams p = (0 != TestUtil.nextInt(random(), 0,15))
        ? params() : params("fq",numericFieldName + ":\"" + expect + "\"");
      SolrDocument doc = getRandClient(random()).getById(docId, p);
      
      final boolean foundWithFilter = (null != doc);
      if (! foundWithFilter) {
        // try again w/o fq to see what it does have
        doc = getRandClient(random()).getById(docId);
      }
      
      Long actual = (null == doc) ? null : (Long) doc.getFirstValue(numericFieldName);
      if (actual == null || expect != actual.longValue() || ! foundWithFilter) {
        log.error("docId={}, foundWithFilter={}, expected={}, actual={}",
                  docId, foundWithFilter, expect, actual);
        incorrectDocs++;
      }
      
    }
    assertEquals("Some docs had errors -- check logs", 0, incorrectDocs);
  }

  
  public static final class Worker implements Runnable {
    public final int workerId;
    final AtomicLong[] expected;
    final CountDownLatch abortLatch;
    final Random rand;
    final String updateField; 
    final int numDocsToUpdate;
    boolean ok = false; // set to true only on successful completion
    public Worker(int workerId, AtomicLong[] expected, CountDownLatch abortLatch, Random rand,
                  String updateField) {
      this.workerId = workerId;
      this.expected = expected;
      this.abortLatch = abortLatch;
      this.rand = rand;
      this.updateField = updateField;
      this.numDocsToUpdate = atLeast(rand, 25);
    }
    public boolean getFinishedOk() {
      return ok;
    }
    private void doRandomAtomicUpdate(int docId) throws Exception {
      assert 0 == docId % DOC_ID_INCR : "WTF? " + docId;
      
      final int delta = TestUtil.nextInt(rand, -1000, 1000);
      log.info("worker={}, docId={}, delta={}", workerId, docId, delta);

      SolrClient client = getRandClient(rand);
      SolrInputDocument doc = doc(f("id",""+docId),
                                  f(updateField,Collections.singletonMap("inc",delta)));
      UpdateResponse rsp = update(doc).process(client);
      assertEquals(doc + " => " + rsp, 0, rsp.getStatus());
      
      AtomicLong counter = expected[(int)(docId / DOC_ID_INCR)];
      assertNotNull("null counter for " + docId + "/" + DOC_ID_INCR, counter);
      counter.getAndAdd(delta);

    }
    
    public void run() {
      final String origThreadName = Thread.currentThread().getName();
      try {
        Thread.currentThread().setName(origThreadName + "-w" + workerId);
        final int maxDocMultiplier = expected.length-1;
        for (int docIter = 0; docIter < numDocsToUpdate; docIter++) {

          final int docId = DOC_ID_INCR * TestUtil.nextInt(rand, 0, maxDocMultiplier);

          // tweak our thread name to keep track of what we're up to
          Thread.currentThread().setName(origThreadName + "-w" + workerId + "-d" + docId);

          // no matter how random the doc selection may be per thread, ensure
          // every doc that is selected by *a* thread gets at least a couple rapid fire updates
          final int itersPerDoc = atLeast(rand, 2);
          
          for (int updateIter = 0; updateIter < itersPerDoc; updateIter++) {
            if (0 == abortLatch.getCount()) {
              return;
            }
            doRandomAtomicUpdate(docId);
          }
          if (rand.nextBoolean()) { Thread.yield(); }
        }
        
      } catch (Error err) {
        log.error(Thread.currentThread().getName(), err);
        abortLatch.countDown();
        throw err;
      } catch (Exception ex) {
        log.error(Thread.currentThread().getName(), ex);
        abortLatch.countDown();
        throw new RuntimeException(ex.getMessage(), ex);
      } finally {
        Thread.currentThread().setName(origThreadName);
      }
      ok = true;
    }
  }
  
  
  public static UpdateRequest update(SolrInputDocument... docs) {
    return update(null, docs);
  }
  public static UpdateRequest update(SolrParams params, SolrInputDocument... docs) {
    UpdateRequest r = new UpdateRequest();
    if (null != params) {
      r.setParams(new ModifiableSolrParams(params));
    }
    r.add(Arrays.asList(docs));
    return r;
  }
  
  public static SolrInputDocument doc(SolrInputField... fields) {
    SolrInputDocument doc = new SolrInputDocument();
    for (SolrInputField f : fields) {
      doc.put(f.getName(), f);
    }
    return doc;
  }
  
  public static SolrInputField f(String fieldName, Object... values) {
    SolrInputField f = new SolrInputField(fieldName);
    f.setValue(values);
    // TODO: soooooooooo stupid (but currently neccessary because atomic updates freak out
    // if the Map with the "inc" operation is inside of a collection - even if it's the only "value") ...
    if (1 == values.length) {
      f.setValue(values[0]);
    } else {
      f.setValue(values);
    }
    return f;
  }
  
  public static SolrClient getRandClient(Random rand) {
    int numClients = CLIENTS.size();
    int idx = TestUtil.nextInt(rand, 0, numClients);
    return (idx == numClients) ? CLOUD_CLIENT : CLIENTS.get(idx);
  }

  public static void waitForRecoveriesToFinish(CloudSolrClient client) throws Exception {
    assert null != client.getDefaultCollection();
    AbstractDistribZkTestBase.waitForRecoveriesToFinish(client.getDefaultCollection(),
                                                        client.getZkStateReader(),
                                                        true, true, 330);
  }

  /**
   * Use the schema API to verify that the specified expected Field exists with those exact attributes. 
   * @see #CLOUD_CLIENT
   */
  public static void checkExpectedSchemaField(Map<String,Object> expected) throws Exception {
    String fieldName = (String) expected.get("name");
    assertNotNull("expected contains no name: " + expected, fieldName);
    FieldResponse rsp = new Field(fieldName).process(CLOUD_CLIENT);
    assertNotNull("Field Null Response: " + fieldName, rsp);
    assertEquals("Field Status: " + fieldName + " => " + rsp.toString(), 0, rsp.getStatus());
    assertEquals("Field: " + fieldName, expected, rsp.getField());
  }
  
  /**
   * Use the schema API to verify that the specified expected FieldType exists with those exact attributes. 
   * @see #CLOUD_CLIENT
   */
  public static void checkExpectedSchemaType(Map<String,Object> expected) throws Exception {
    
    String typeName = (String) expected.get("name");
    assertNotNull("expected contains no type: " + expected, typeName);
    FieldTypeResponse rsp = new FieldType(typeName).process(CLOUD_CLIENT);
    assertNotNull("FieldType Null Response: " + typeName, rsp);
    assertEquals("FieldType Status: " + typeName + " => " + rsp.toString(), 0, rsp.getStatus());
    assertEquals("FieldType: " + typeName, expected, rsp.getFieldType().getAttributes());
    
  }
}