/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.replicator.nrt; import java.io.BufferedReader; import java.io.Closeable; import java.io.IOException; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.io.Writer; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardOpenOption; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import java.util.regex.Pattern; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.Term; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.MockDirectoryWrapper; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LineFileDocs; import org.apache.lucene.util.LuceneTestCase.SuppressCodecs; import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.SuppressForbidden; import org.apache.lucene.util.TestUtil; import org.apache.lucene.util.ThreadInterruptedException; import com.carrotsearch.randomizedtesting.SeedUtils; /* TODO - fangs - sometimes have one replica be really slow at copying / have random pauses (fake GC) / etc. - test should not print scary exceptions and then succeed! - are the pre-copied-completed-merged files not being cleared in primary? - hmm the logic isn't right today? a replica may skip pulling a given copy state, that recorded the finished merged segments? - later - since all nodes are local, we could have a different test only impl that just does local file copies instead of via tcp... - get all other "single shard" functions working too: this cluster should "act like" a single shard - SLM - controlled nrt reopen thread / returning long gen on write - live field values - add indexes - replica should also track maxSegmentName its seen, and tap into inflateGens if it's later promoted to primary? - if we named index files using segment's ID we wouldn't have file name conflicts after primary crash / rollback? - back pressure on indexing if replicas can't keep up? - get xlog working on top? needs to be checkpointed, so we can correlate IW ops to NRT reader version and prune xlog based on commit quorum - maybe fix IW to return "gen" or "seq id" or "segment name" or something? - replica can copy files from other replicas too / use multicast / rsync / something - each replica could also pre-open a SegmentReader after pre-copy when warming a merge - we can pre-copy newly flushed files too, for cases where reopen rate is low vs IW's flushing because RAM buffer is full - opto: pre-copy files as they are written; if they will become CFS, we can build CFS on the replica? - what about multiple commit points? - fix primary to init directly from an open replica, instead of having to commit/close the replica first */ // Tricky cases: // - we are pre-copying a merge, then replica starts up part way through, so it misses that pre-copy and must do it on next nrt point // - a down replica starts up, but it's "from the future" vs the current primary, and must copy over file names with different contents // but referenced by its latest commit point, so it must fully remove that commit ... which is a hazardous window // - replica comes up just as the primary is crashing / moving // - electing a new primary when a replica is just finishing its nrt sync: we need to wait for it so we are sure to get the "most up to // date" replica // - replica comes up after merged segment finished so it doesn't copy over the merged segment "promptly" (i.e. only sees it on NRT refresh) /** * Test case showing how to implement NRT replication. This test spawns a sub-process per-node, running TestNRTReplicationChild. * * One node is primary, and segments are periodically flushed there, then concurrently the N replica nodes copy the new files over and open new readers, while * primary also opens a new reader. * * Nodes randomly crash and are restarted. If the primary crashes, a replica is promoted. * * Merges are currently first finished on the primary and then pre-copied out to replicas with a merged segment warmer so they don't block * ongoing NRT reopens. Probably replicas could do their own merging instead, but this is more complex and may not be better overall * (merging takes a lot of IO resources). * * Slow network is simulated with a RateLimiter. */ // MockRandom's .sd file has no index header/footer: @SuppressCodecs({"MockRandom", "Memory", "Direct", "SimpleText"}) @SuppressSysoutChecks(bugUrl = "Stuff gets printed, important stuff for debugging a failure") public class TestStressNRTReplication extends LuceneTestCase { // Test evilness controls: /** Randomly crash the current primary (losing data!) and promote the "next best" replica. */ static final boolean DO_CRASH_PRIMARY = true; /** Randomly crash (JVM core dumps) a replica; it will later randomly be restarted and sync itself. */ static final boolean DO_CRASH_REPLICA = true; /** Randomly gracefully close a replica; it will later be restarted and sync itself. */ static final boolean DO_CLOSE_REPLICA = true; /** Randomly gracefully close the primary; it will later be restarted and sync itself. */ static final boolean DO_CLOSE_PRIMARY = true; /** If false, all child + parent output is interleaved into single stdout/err */ static final boolean SEPARATE_CHILD_OUTPUT = false; /** Randomly crash whole cluster and then restart it */ static final boolean DO_FULL_CLUSTER_CRASH = true; /** True if we randomly flip a bit while copying files out */ static final boolean DO_BIT_FLIPS_DURING_COPY = true; /** Set to a non-null value to force exactly that many nodes; else, it's random. */ static final Integer NUM_NODES = null; final AtomicBoolean failed = new AtomicBoolean(); final AtomicBoolean stop = new AtomicBoolean(); /** cwd where we start each child (server) node */ private Path childTempDir; long primaryGen; volatile long lastPrimaryVersion; volatile NodeProcess primary; volatile NodeProcess[] nodes; volatile long[] nodeTimeStamps; volatile boolean[] starting; Path[] indexPaths; Path transLogPath; SimpleTransLog transLog; final AtomicInteger markerUpto = new AtomicInteger(); final AtomicInteger markerID = new AtomicInteger(); /** Maps searcher version to how many hits the query body:the matched. */ final Map<Long,Integer> hitCounts = new ConcurrentHashMap<>(); /** Maps searcher version to how many marker documents matched. This should only ever grow (we never delete marker documents). */ final Map<Long,Integer> versionToMarker = new ConcurrentHashMap<>(); /** Maps searcher version to xlog location when refresh of this version started. */ final Map<Long,Long> versionToTransLogLocation = new ConcurrentHashMap<>(); final AtomicLong nodeStartCounter = new AtomicLong(); final Set<Integer> crashingNodes = Collections.synchronizedSet(new HashSet<>()); @Nightly public void test() throws Exception { Node.globalStartNS = System.nanoTime(); message("change thread name from " + Thread.currentThread().getName()); Thread.currentThread().setName("main"); childTempDir = createTempDir("child"); // We are parent process: // Silly bootstrapping: versionToTransLogLocation.put(0L, 0L); versionToMarker.put(0L, 0); int numNodes; if (NUM_NODES == null) { numNodes = TestUtil.nextInt(random(), 2, 10); } else { numNodes = NUM_NODES.intValue(); } System.out.println("TEST: using " + numNodes + " nodes"); transLogPath = createTempDir("NRTReplication").resolve("translog"); transLog = new SimpleTransLog(transLogPath); //state.rateLimiters = new RateLimiter[numNodes]; indexPaths = new Path[numNodes]; nodes = new NodeProcess[numNodes]; nodeTimeStamps = new long[numNodes]; Arrays.fill(nodeTimeStamps, Node.globalStartNS); starting = new boolean[numNodes]; for(int i=0;i<numNodes;i++) { indexPaths[i] = createTempDir("index" + i); } Thread[] indexers = new Thread[TestUtil.nextInt(random(), 1, 3)]; System.out.println("TEST: launch " + indexers.length + " indexer threads"); for(int i=0;i<indexers.length;i++) { indexers[i] = new IndexThread(); indexers[i].setName("indexer" + i); indexers[i].setDaemon(true); indexers[i].start(); } Thread[] searchers = new Thread[TestUtil.nextInt(random(), 1, 3)]; System.out.println("TEST: launch " + searchers.length + " searcher threads"); for(int i=0;i<searchers.length;i++) { searchers[i] = new SearchThread(); searchers[i].setName("searcher" + i); searchers[i].setDaemon(true); searchers[i].start(); } Thread restarter = new RestartThread(); restarter.setName("restarter"); restarter.setDaemon(true); restarter.start(); int runTimeSec; if (TEST_NIGHTLY) { runTimeSec = RANDOM_MULTIPLIER * TestUtil.nextInt(random(), 120, 240); } else { runTimeSec = RANDOM_MULTIPLIER * TestUtil.nextInt(random(), 45, 120); } System.out.println("TEST: will run for " + runTimeSec + " sec"); long endTime = System.nanoTime() + runTimeSec*1000000000L; sendReplicasToPrimary(); while (failed.get() == false && System.nanoTime() < endTime) { // Wait a bit: Thread.sleep(TestUtil.nextInt(random(), Math.min(runTimeSec*4, 200), runTimeSec*4)); if (primary != null && random().nextBoolean()) { NodeProcess curPrimary = primary; if (curPrimary != null) { // Save these before we start flush: long nextTransLogLoc = transLog.getNextLocation(); int markerUptoSav = markerUpto.get(); message("top: now flush primary; at least marker count=" + markerUptoSav); long result; try { result = primary.flush(markerUptoSav); } catch (Throwable t) { message("top: flush failed; skipping: " + t.getMessage()); result = -1; } if (result > 0) { // There were changes message("top: flush finished with changed; new primary version=" + result); lastPrimaryVersion = result; addTransLogLoc(lastPrimaryVersion, nextTransLogLoc); addVersionMarker(lastPrimaryVersion, markerUptoSav); } } } StringBuilder sb = new StringBuilder(); int liveCount = 0; for(int i=0;i<nodes.length;i++) { NodeProcess node = nodes[i]; if (node != null) { if (sb.length() != 0) { sb.append(" "); } liveCount++; if (node.isPrimary) { sb.append('P'); } else { sb.append('R'); } sb.append(i); } } message("PG=" + (primary == null ? "X" : primaryGen) + " " + liveCount + " (of " + nodes.length + ") nodes running: " + sb); // Commit a random node, primary or replica if (random().nextInt(10) == 1) { NodeProcess node = nodes[random().nextInt(nodes.length)]; if (node != null && node.nodeIsClosing.get() == false) { // TODO: if this node is primary, it means we committed an unpublished version (not exposed as an NRT point)... not sure it matters. // maybe we somehow allow IW to commit a specific sis (the one we just flushed)? message("top: now commit node=" + node); try { node.commitAsync(); } catch (Throwable t) { message("top: hit exception during commit with R" + node.id + "; skipping"); t.printStackTrace(System.out); } } } } message("TEST: top: test done, now close"); stop.set(true); for(Thread thread : indexers) { thread.join(); } for(Thread thread : searchers) { thread.join(); } restarter.join(); // Close replicas before primary so we cancel any in-progres replications: System.out.println("TEST: top: now close replicas"); List<Closeable> toClose = new ArrayList<>(); for(NodeProcess node : nodes) { if (node != primary && node != null) { toClose.add(node); } } IOUtils.close(toClose); IOUtils.close(primary); IOUtils.close(transLog); if (failed.get() == false) { message("TEST: top: now checkIndex"); for(Path path : indexPaths) { message("TEST: check " + path); MockDirectoryWrapper dir = newMockFSDirectory(path); // Just too slow otherwise dir.setCrossCheckTermVectorsOnClose(false); dir.close(); } } else { message("TEST: failed; skip checkIndex"); } } private boolean anyNodesStarting() { for(int id=0;id<nodes.length;id++) { if (starting[id]) { return true; } } return false; } /** Picks a replica and promotes it as new primary. */ private void promoteReplica() throws IOException { message("top: primary crashed; now pick replica to promote"); long maxSearchingVersion = -1; NodeProcess replicaToPromote = null; // We must promote the most current replica, because otherwise file name reuse can cause a replication to fail when it needs to copy // over a file currently held open for searching. This also minimizes recovery work since the most current replica means less xlog // replay to catch up: for (NodeProcess node : nodes) { if (node != null) { message("ask " + node + " for its current searching version"); long searchingVersion; try { searchingVersion = node.getSearchingVersion(); } catch (Throwable t) { message("top: hit SocketException during getSearchingVersion with R" + node.id + "; skipping"); t.printStackTrace(System.out); continue; } message(node + " has searchingVersion=" + searchingVersion); if (searchingVersion > maxSearchingVersion) { maxSearchingVersion = searchingVersion; replicaToPromote = node; } } } if (replicaToPromote == null) { message("top: no replicas running; skipping primary promotion"); return; } message("top: promote " + replicaToPromote + " version=" + maxSearchingVersion + "; now commit"); try { replicaToPromote.commit(); } catch (Throwable t) { // Something wrong with this replica; skip it: message("top: hit exception during commit with R" + replicaToPromote.id + "; skipping"); t.printStackTrace(System.out); return; } message("top: now shutdown " + replicaToPromote); if (replicaToPromote.shutdown() == false) { message("top: shutdown failed for R" + replicaToPromote.id + "; skipping primary promotion"); return; } int id = replicaToPromote.id; message("top: now startPrimary " + replicaToPromote); startPrimary(replicaToPromote.id); } void startPrimary(int id) throws IOException { message(id + ": top: startPrimary lastPrimaryVersion=" + lastPrimaryVersion); assert nodes[id] == null; // Force version of new primary to advance beyond where old primary was, so we never re-use versions. It may have // already advanced beyond newVersion, e.g. if it flushed new segments while during xlog replay: // First start node as primary (it opens an IndexWriter) but do not publish it for searching until we replay xlog: NodeProcess newPrimary = startNode(id, indexPaths[id], true, lastPrimaryVersion+1); if (newPrimary == null) { message("top: newPrimary failed to start; abort"); return; } // Get xlog location that this node was guaranteed to already have indexed through; this may replay some ops already indexed but it's OK // because the ops are idempotent: we updateDocument (by docid) on replay even for original addDocument: Long startTransLogLoc; Integer markerCount; if (newPrimary.initCommitVersion == 0) { startTransLogLoc = 0L; markerCount = 0; } else { startTransLogLoc = versionToTransLogLocation.get(newPrimary.initCommitVersion); markerCount = versionToMarker.get(newPrimary.initCommitVersion); } assert startTransLogLoc != null: "newPrimary.initCommitVersion=" + newPrimary.initCommitVersion + " is missing from versionToTransLogLocation: keys=" + versionToTransLogLocation.keySet(); assert markerCount != null: "newPrimary.initCommitVersion=" + newPrimary.initCommitVersion + " is missing from versionToMarker: keys=" + versionToMarker.keySet(); // When the primary starts, the userData in its latest commit point tells us which version it had indexed up to, so we know where to // replay from in the xlog. However, we forcefuly advance the version, and then IW on init (or maybe getReader) also adds 1 to it. // Since we publish the primary in this state (before xlog replay is done), a replica can start up at this point and pull this version, // and possibly later be chosen as a primary, causing problems if the version is known recorded in the translog map. So we record it // here: addTransLogLoc(newPrimary.initInfosVersion, startTransLogLoc); addVersionMarker(newPrimary.initInfosVersion, markerCount); assert newPrimary.initInfosVersion >= lastPrimaryVersion; message("top: now change lastPrimaryVersion from " + lastPrimaryVersion + " to " + newPrimary.initInfosVersion + "; startup marker count " + markerCount); lastPrimaryVersion = newPrimary.initInfosVersion; long nextTransLogLoc = transLog.getNextLocation(); long t0 = System.nanoTime(); message("top: start translog replay " + startTransLogLoc + " (version=" + newPrimary.initCommitVersion + ") to " + nextTransLogLoc + " (translog end)"); try { transLog.replay(newPrimary, startTransLogLoc, nextTransLogLoc); } catch (IOException ioe) { message("top: replay xlog failed; shutdown new primary"); ioe.printStackTrace(System.out); newPrimary.shutdown(); return; } long t1 = System.nanoTime(); message("top: done translog replay; took " + ((t1 - t0)/1000000.0) + " msec; now publish primary"); // Publish new primary only after translog has succeeded in replaying; this is important, for this test anyway, so we keep a "linear" // history so enforcing marker counts is correct. E.g., if we publish first and replay translog concurrently with incoming ops, then // a primary commit that happens while translog is still replaying will incorrectly record the translog loc into the commit user data // when in fact that commit did NOT reflect all prior ops. So if we crash and start up again from that commit point, we are missing // ops. nodes[id] = newPrimary; primary = newPrimary; sendReplicasToPrimary(); } /** Launches a child "server" (separate JVM), which is either primary or replica node */ @SuppressForbidden(reason = "ProcessBuilder requires java.io.File for CWD") NodeProcess startNode(final int id, Path indexPath, boolean isPrimary, long forcePrimaryVersion) throws IOException { nodeTimeStamps[id] = System.nanoTime(); List<String> cmd = new ArrayList<>(); NodeProcess curPrimary = primary; cmd.add(System.getProperty("java.home") + System.getProperty("file.separator") + "bin" + System.getProperty("file.separator") + "java"); cmd.add("-Xmx512m"); if (curPrimary != null) { cmd.add("-Dtests.nrtreplication.primaryTCPPort=" + curPrimary.tcpPort); } else if (isPrimary == false) { // We cannot start a replica when there is no primary: return null; } // This is very costly (takes more time to check than it did to index); we do this ourselves in the end instead of each time a replica // is restarted: // cmd.add("-Dtests.nrtreplication.checkonclose=true"); cmd.add("-Dtests.nrtreplication.node=true"); cmd.add("-Dtests.nrtreplication.nodeid=" + id); cmd.add("-Dtests.nrtreplication.startNS=" + Node.globalStartNS); cmd.add("-Dtests.nrtreplication.indexpath=" + indexPath); if (isPrimary) { cmd.add("-Dtests.nrtreplication.isPrimary=true"); cmd.add("-Dtests.nrtreplication.forcePrimaryVersion=" + forcePrimaryVersion); if (DO_CRASH_PRIMARY) { cmd.add("-Dtests.nrtreplication.doRandomCrash=true"); } if (DO_CLOSE_PRIMARY) { cmd.add("-Dtests.nrtreplication.doRandomClose=true"); } } else { if (DO_CRASH_REPLICA) { cmd.add("-Dtests.nrtreplication.doRandomCrash=true"); } if (DO_CLOSE_REPLICA) { cmd.add("-Dtests.nrtreplication.doRandomClose=true"); } } if (DO_BIT_FLIPS_DURING_COPY) { cmd.add("-Dtests.nrtreplication.doFlipBitsDuringCopy=true"); } long myPrimaryGen = primaryGen; cmd.add("-Dtests.nrtreplication.primaryGen=" + myPrimaryGen); // Mixin our own counter because this is called from a fresh thread which means the seed otherwise isn't changing each time we spawn a // new node: long seed = random().nextLong() * nodeStartCounter.incrementAndGet(); cmd.add("-Dtests.seed=" + SeedUtils.formatSeed(seed)); cmd.add("-ea"); cmd.add("-cp"); cmd.add(System.getProperty("java.class.path")); cmd.add("org.junit.runner.JUnitCore"); cmd.add(getClass().getName().replace(getClass().getSimpleName(), "SimpleServer")); Writer childLog; if (SEPARATE_CHILD_OUTPUT) { Path childOut = childTempDir.resolve(id + ".log"); message("logging to " + childOut); childLog = Files.newBufferedWriter(childOut, StandardCharsets.UTF_8, StandardOpenOption.APPEND, StandardOpenOption.CREATE); childLog.write("\n\nSTART NEW CHILD:\n"); } else { childLog = null; } //message("child process command: " + cmd); ProcessBuilder pb = new ProcessBuilder(cmd); pb.redirectErrorStream(true); // Important, so that the scary looking hs_err_<pid>.log appear under our test temp dir: pb.directory(childTempDir.toFile()); Process p = pb.start(); BufferedReader r; try { r = new BufferedReader(new InputStreamReader(p.getInputStream(), IOUtils.UTF_8)); } catch (UnsupportedEncodingException uee) { throw new RuntimeException(uee); } int tcpPort = -1; long initCommitVersion = -1; long initInfosVersion = -1; Pattern logTimeStart = Pattern.compile("^[0-9\\.]+s .*"); boolean willCrash = false; while (true) { String l = r.readLine(); if (l == null) { message("top: node=" + id + " failed to start"); try { p.waitFor(); } catch (InterruptedException ie) { throw new RuntimeException(ie); } message("exit value=" + p.exitValue()); if (p.exitValue() == 0) { message("zero exit status; assuming failed to remove segments_N; skipping"); return null; } // Hackity hack, in case primary crashed/closed and we haven't noticed (reaped the process) yet: if (isPrimary == false) { for(int i=0;i<100;i++) { NodeProcess primary2 = primary; if (primaryGen != myPrimaryGen || primary2 == null || primary2.nodeIsClosing.get()) { // OK: primary crashed while we were trying to start, so it's expected/allowed that we could not start the replica: message("primary crashed/closed while replica R" + id + " tried to start; skipping"); return null; } else { try { Thread.sleep(10); } catch (InterruptedException ie) { throw new ThreadInterruptedException(ie); } } } } // Should fail the test: message("top: now fail test replica R" + id + " failed to start"); failed.set(true); throw new RuntimeException("replica R" + id + " failed to start"); } if (childLog != null) { childLog.write(l); childLog.write("\n"); childLog.flush(); } else if (logTimeStart.matcher(l).matches()) { // Already a well-formed log output: System.out.println(l); } else { message(l); } if (l.startsWith("PORT: ")) { tcpPort = Integer.parseInt(l.substring(6).trim()); } else if (l.startsWith("COMMIT VERSION: ")) { initCommitVersion = Integer.parseInt(l.substring(16).trim()); } else if (l.startsWith("INFOS VERSION: ")) { initInfosVersion = Integer.parseInt(l.substring(15).trim()); } else if (l.contains("will crash after")) { willCrash = true; } else if (l.startsWith("NODE STARTED")) { break; } } final boolean finalWillCrash = willCrash; final AtomicBoolean nodeIsClosing = new AtomicBoolean(); // Baby sits the child process, pulling its stdout and printing to our stdout, calling nodeClosed once it exits: Thread pumper = ThreadPumper.start( new Runnable() { @Override public void run() { message("now wait for process " + p); try { p.waitFor(); } catch (Throwable t) { throw new RuntimeException(t); } message("done wait for process " + p); int exitValue = p.exitValue(); message("exit value=" + exitValue + " willCrash=" + finalWillCrash); if (childLog != null) { try { childLog.write("process done; exitValue=" + exitValue + "\n"); childLog.close(); } catch (IOException ioe) { throw new RuntimeException(ioe); } } if (exitValue != 0 && finalWillCrash == false && crashingNodes.remove(id) == false) { // should fail test failed.set(true); if (childLog != null) { throw new RuntimeException("node " + id + " process had unexpected non-zero exit status=" + exitValue + "; see " + childLog + " for details"); } else { throw new RuntimeException("node " + id + " process had unexpected non-zero exit status=" + exitValue); } } nodeClosed(id); } }, r, System.out, childLog, nodeIsClosing); pumper.setName("pump" + id); message("top: node=" + id + " started at tcpPort=" + tcpPort + " initCommitVersion=" + initCommitVersion + " initInfosVersion=" + initInfosVersion); return new NodeProcess(p, id, tcpPort, pumper, isPrimary, initCommitVersion, initInfosVersion, nodeIsClosing); } private void nodeClosed(int id) { NodeProcess oldNode = nodes[id]; if (primary != null && oldNode == primary) { message("top: " + primary + ": primary process finished"); primary = null; primaryGen++; } else { message("top: " + oldNode + ": replica process finished"); } if (oldNode != null) { oldNode.isOpen = false; } nodes[id] = null; nodeTimeStamps[id] = System.nanoTime(); sendReplicasToPrimary(); } /** Sends currently alive replicas to primary, which uses this to know who to notify when it does a refresh */ private void sendReplicasToPrimary() { NodeProcess curPrimary = primary; if (curPrimary != null) { List<NodeProcess> replicas = new ArrayList<>(); for (NodeProcess node : nodes) { if (node != null && node.isPrimary == false) { replicas.add(node); } } message("top: send " + replicas.size() + " replicas to primary"); try (Connection c = new Connection(curPrimary.tcpPort)) { c.out.writeByte(SimplePrimaryNode.CMD_SET_REPLICAS); c.out.writeVInt(replicas.size()); for(NodeProcess replica : replicas) { c.out.writeVInt(replica.id); c.out.writeVInt(replica.tcpPort); } c.flush(); c.in.readByte(); } catch (Throwable t) { message("top: ignore exc sending replicas to primary P" + curPrimary.id + " at tcpPort=" + curPrimary.tcpPort); t.printStackTrace(System.out); } } } void addVersionMarker(long version, int count) { //System.out.println("ADD VERSION MARKER version=" + version + " count=" + count); if (versionToMarker.containsKey(version)) { int curCount = versionToMarker.get(version); if (curCount != count) { message("top: wrong marker count version=" + version + " count=" + count + " curCount=" + curCount); throw new IllegalStateException("version=" + version + " count=" + count + " curCount=" + curCount); } } else { message("top: record marker count: version=" + version + " count=" + count); versionToMarker.put(version, count); } } void addTransLogLoc(long version, long loc) { message("top: record transLogLoc: version=" + version + " loc=" + loc); versionToTransLogLocation.put(version, loc); } // Periodically wakes up and starts up any down nodes: private class RestartThread extends Thread { @Override public void run() { List<Thread> startupThreads = Collections.synchronizedList(new ArrayList<>()); try { while (stop.get() == false) { Thread.sleep(TestUtil.nextInt(random(), 50, 500)); //message("top: restarter cycle"); // Randomly crash full cluster: if (DO_FULL_CLUSTER_CRASH && random().nextInt(500) == 17) { message("top: full cluster crash"); for(int i=0;i<nodes.length;i++) { if (starting[i]) { message("N" + i + ": top: wait for startup so we can crash..."); while (starting[i]) { Thread.sleep(10); } message("N" + i + ": top: done wait for startup"); } NodeProcess node = nodes[i]; if (node != null) { crashingNodes.add(i); message("top: N" + node.id + ": top: now crash node"); node.crash(); message("top: N" + node.id + ": top: done crash node"); } } } List<Integer> downNodes = new ArrayList<>(); StringBuilder b = new StringBuilder(); long nowNS = System.nanoTime(); for(int i=0;i<nodes.length;i++) { b.append(' '); double sec = (nowNS - nodeTimeStamps[i])/1000000000.0; String prefix; if (nodes[i] == null) { downNodes.add(i); if (starting[i]) { prefix = "s"; } else { prefix = "x"; } } else { prefix = ""; } if (primary != null && nodes[i] == primary) { prefix += "p"; } b.append(String.format(Locale.ROOT, "%s%d(%.1fs)", prefix, i, sec)); } message("node status" + b.toString()); message("downNodes=" + downNodes); // If primary is down, promote a replica: if (primary == null) { if (anyNodesStarting()) { message("top: skip promote replica: nodes are still starting"); continue; } promoteReplica(); } // Randomly start up a down a replica: // Stop or start a replica if (downNodes.isEmpty() == false) { int idx = downNodes.get(random().nextInt(downNodes.size())); if (starting[idx] == false) { if (primary == null) { if (downNodes.size() == nodes.length) { // Cold start: entire cluster is down, start this node up as the new primary message("N" + idx + ": top: cold start as primary"); startPrimary(idx); } } else if (random().nextDouble() < ((double) downNodes.size())/nodes.length) { // Start up replica: starting[idx] = true; message("N" + idx + ": top: start up: launch thread"); Thread t = new Thread() { @Override public void run() { try { message("N" + idx + ": top: start up thread"); nodes[idx] = startNode(idx, indexPaths[idx], false, -1); sendReplicasToPrimary(); } catch (Throwable t) { failed.set(true); stop.set(true); throw new RuntimeException(t); } finally { starting[idx] = false; startupThreads.remove(Thread.currentThread()); } } }; t.setName("start R" + idx); t.start(); startupThreads.add(t); } } else { message("node " + idx + " still starting"); } } } System.out.println("Restarter: now stop: join " + startupThreads.size() + " startup threads"); while (startupThreads.size() > 0) { Thread.sleep(10); } } catch (Throwable t) { failed.set(true); stop.set(true); throw new RuntimeException(t); } } } /** Randomly picks a node and runs a search against it */ private class SearchThread extends Thread { @Override public void run() { // Maps version to number of hits for silly 'the' TermQuery: Query theQuery = new TermQuery(new Term("body", "the")); // Persists connections Map<Integer,Connection> connections = new HashMap<>(); while (stop.get() == false) { NodeProcess node = nodes[random().nextInt(nodes.length)]; if (node == null || node.isOpen == false) { continue; } if (node.lock.tryLock() == false) { // Node is in the process of closing or crashing or something continue; } boolean nodeIsPrimary = node == primary; try { Thread.currentThread().setName("Searcher node=" + node); //System.out.println("S: cycle; conns=" + connections); Connection c = connections.get(node.id); long version; try { if (c == null) { //System.out.println("S: new connection " + node.id + " " + Thread.currentThread().getName()); c = new Connection(node.tcpPort); connections.put(node.id, c); } else { //System.out.println("S: reuse connection " + node.id + " " + Thread.currentThread().getName()); } c.out.writeByte(SimplePrimaryNode.CMD_SEARCH); c.flush(); while (c.sockIn.available() == 0) { if (stop.get()) { break; } if (node.isOpen == false) { throw new IOException("node closed"); } Thread.sleep(1); } version = c.in.readVLong(); while (c.sockIn.available() == 0) { if (stop.get()) { break; } if (node.isOpen == false) { throw new IOException("node closed"); } Thread.sleep(1); } int hitCount = c.in.readVInt(); Integer oldHitCount = hitCounts.get(version); // TODO: we never prune this map... if (oldHitCount == null) { hitCounts.put(version, hitCount); message("top: searcher: record search hitCount version=" + version + " hitCount=" + hitCount + " node=" + node); if (nodeIsPrimary && version > lastPrimaryVersion) { // It's possible a search request sees a new primary version because it's in the process of flushing, but then the primary // crashes. In this case we need to ensure new primary forces its version beyond this: message("top: searcher: set lastPrimaryVersion=" + lastPrimaryVersion + " vs " + version); lastPrimaryVersion = version; } } else { // Just ensure that all nodes show the same hit count for // the same version, i.e. they really are replicas of one another: if (oldHitCount.intValue() != hitCount) { failed.set(true); stop.set(true); message("top: searcher: wrong version hitCount: version=" + version + " oldHitCount=" + oldHitCount.intValue() + " hitCount=" + hitCount); fail("version=" + version + " oldHitCount=" + oldHitCount.intValue() + " hitCount=" + hitCount); } } } catch (IOException ioe) { //message("top: searcher: ignore exc talking to node " + node + ": " + ioe); //ioe.printStackTrace(System.out); IOUtils.closeWhileHandlingException(c); connections.remove(node.id); continue; } // This can be null if primary is flushing, has already refreshed its searcher, but is e.g. still notifying replicas and hasn't // yet returned the version to us, in which case this searcher thread can see the version before the main thread has added it to // versionToMarker: Integer expectedAtLeastHitCount = versionToMarker.get(version); if (expectedAtLeastHitCount != null && expectedAtLeastHitCount > 0 && random().nextInt(10) == 7) { try { c.out.writeByte(SimplePrimaryNode.CMD_MARKER_SEARCH); c.out.writeVInt(expectedAtLeastHitCount); c.flush(); while (c.sockIn.available() == 0) { if (stop.get()) { break; } if (node.isOpen == false) { throw new IOException("node died"); } Thread.sleep(1); } version = c.in.readVLong(); while (c.sockIn.available() == 0) { if (stop.get()) { break; } if (node.isOpen == false) { throw new IOException("node died"); } Thread.sleep(1); } int hitCount = c.in.readVInt(); // Look for data loss: make sure all marker docs are visible: if (hitCount < expectedAtLeastHitCount) { String failMessage = "node=" + node + ": documents were lost version=" + version + " hitCount=" + hitCount + " vs expectedAtLeastHitCount=" + expectedAtLeastHitCount; message(failMessage); failed.set(true); stop.set(true); fail(failMessage); } } catch (IOException ioe) { //message("top: searcher: ignore exc talking to node " + node + ": " + ioe); //throw new RuntimeException(ioe); //ioe.printStackTrace(System.out); IOUtils.closeWhileHandlingException(c); connections.remove(node.id); continue; } } Thread.sleep(10); } catch (Throwable t) { failed.set(true); stop.set(true); throw new RuntimeException(t); } finally { node.lock.unlock(); } } System.out.println("Searcher: now stop"); IOUtils.closeWhileHandlingException(connections.values()); } } private class IndexThread extends Thread { @Override public void run() { try { LineFileDocs docs = new LineFileDocs(random()); int docCount = 0; // How often we do an update/delete vs add: double updatePct = random().nextDouble(); // Varies how many docs/sec we index: int sleepChance = TestUtil.nextInt(random(), 4, 100); message("top: indexer: updatePct=" + updatePct + " sleepChance=" + sleepChance); long lastTransLogLoc = transLog.getNextLocation(); NodeProcess curPrimary = null; Connection c = null; while (stop.get() == false) { try { while (stop.get() == false && curPrimary == null) { Thread.sleep(10); curPrimary = primary; if (curPrimary != null) { c = new Connection(curPrimary.tcpPort); c.out.writeByte(SimplePrimaryNode.CMD_INDEXING); break; } } if (stop.get()) { break; } Thread.currentThread().setName("indexer p" + curPrimary.id); if (random().nextInt(10) == 7) { // We use the marker docs to check for data loss in search thread: Document doc = new Document(); int id = markerID.getAndIncrement(); String idString = "m"+id; doc.add(newStringField("docid", idString, Field.Store.YES)); doc.add(newStringField("marker", "marker", Field.Store.YES)); curPrimary.addOrUpdateDocument(c, doc, false); transLog.addDocument(idString, doc); // Only increment after primary replies: markerUpto.getAndIncrement(); //message("index marker=" + idString + "; translog is " + Node.bytesToString(Files.size(transLogPath))); } if (docCount > 0 && random().nextDouble() < updatePct) { int randomID = random().nextInt(docCount); String randomIDString = Integer.toString(randomID); if (random().nextBoolean()) { // Replace previous doc Document doc = docs.nextDoc(); ((Field) doc.getField("docid")).setStringValue(randomIDString); curPrimary.addOrUpdateDocument(c, doc, true); transLog.updateDocument(randomIDString, doc); } else { // Delete previous doc curPrimary.deleteDocument(c, randomIDString); transLog.deleteDocuments(randomIDString); } } else { // Add new doc: Document doc = docs.nextDoc(); String idString = Integer.toString(docCount++); ((Field) doc.getField("docid")).setStringValue(idString); curPrimary.addOrUpdateDocument(c, doc, false); transLog.addDocument(idString, doc); } } catch (IOException se) { // Assume primary crashed if (c != null) { message("top: indexer lost connection to primary"); } try { c.close(); } catch (Throwable t) { } curPrimary = null; c = null; } if (random().nextInt(sleepChance) == 0) { Thread.sleep(10); } if (random().nextInt(100) == 17) { int pauseMS = TestUtil.nextInt(random(), 500, 2000); System.out.println("Indexer: now pause for " + pauseMS + " msec..."); Thread.sleep(pauseMS); System.out.println("Indexer: done pause for a bit..."); } } if (curPrimary != null) { try { c.out.writeByte(SimplePrimaryNode.CMD_INDEXING_DONE); c.flush(); c.in.readByte(); } catch (IOException se) { // Assume primary crashed message("top: indexer lost connection to primary"); try { c.close(); } catch (Throwable t) { } curPrimary = null; c = null; } } System.out.println("Indexer: now stop"); } catch (Throwable t) { failed.set(true); stop.set(true); throw new RuntimeException(t); } } } static void message(String message) { long now = System.nanoTime(); System.out.println(String.format(Locale.ROOT, "%5.3fs : parent [%11s] %s", (now-Node.globalStartNS)/1000000000., Thread.currentThread().getName(), message)); } static void message(String message, long localStartNS) { long now = System.nanoTime(); System.out.println(String.format(Locale.ROOT, "%5.3fs %5.1fs: parent [%11s] %s", (now-Node.globalStartNS)/1000000000., (now-localStartNS)/1000000000., Thread.currentThread().getName(), message)); } }