/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.jctools.jmh.latency.spsc; import org.jctools.queues.alt.ConcurrentQueue; import org.jctools.queues.alt.ConcurrentQueueByTypeFactory; import org.jctools.queues.alt.ConcurrentQueueConsumer; import org.jctools.queues.alt.ConcurrentQueueProducer; import org.openjdk.jmh.annotations.*; import org.openjdk.jmh.infra.Control; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; /** * Measure the Round Trip Time between 2 or more threads communicating via chained queues. This is a * performance edge case for queues as there is no scope for batching. The size of the batch will hopefully * expose near empty performance boundary cases. * <p> * Parameters must obey the following rules:<br> * <li>1 thread is the source(running ping), the rest are links</li> * <li>The chain length must be equal to the total number of threads</li> * <li>Burst size must not exceed the overall capacity of the ring</li> * <li>Only a single group of this benchmark can be executed!</li> * <li>Iterations must be synchronized(which is the default)</li> * <p> * To launch this benchmark with chain length 3, run:<br> * export JVM_ARGS=-server -XX:+UseCondCardMark -XX:CompileThreshold=100000<br> * export QUEUE_ARGS=-Dq.type=3 -Dsparse.shift=0 -Dpow2.capacity=15<br> * java $JVM_ARGS $QUEUE_ARGS <b>-Dburst.size=1 -Dchain.length=3</b> -jar target/microbenchmarks.jar <b>-tg * 2,1</b> -f 1 ".*.RingBurstRoundTripWithGroups.*"<br> * <p> * Note that when running this sort of benchmark thread affinity can effect performance significantly. It is * therefore recommended you pin the benchmark run to a uniform set of core such that all threads are the same * distance from each other. For example, when the chain.length is 2 we can run either pinned to 2 thread * running on same core (taskset -c 0,1), or across 2 separate cores (taskset -c 2,4). When running larger * chains it is recommended you avoid hyper threaded cores for stability. For example when running with * chain.length=4 we can pin the 4 threads to 4 separate cores (taskset -c 1,3,5,7). * <p> * The instability in the results stems from the different possible layouts chains can have. Consider the * case of chain.length=4 using 4 threads on 2 cores the ring layout could be: * <li> 0 -> 1 -> 2 -> 3 -> 0: leading to 2 same core transitions + 2 cross core transitions * <li> 0 -> 2 -> 1 -> 3 -> 0: leading to 4 cross core transitions * * @author nitsanw * */ @State(Scope.Group) @BenchmarkMode(Mode.AverageTime) @OutputTimeUnit(TimeUnit.NANOSECONDS) @Warmup(iterations = 10, time = 1, timeUnit = TimeUnit.SECONDS) @Measurement(iterations = 10, time = 1, timeUnit = TimeUnit.SECONDS) public class RingCqBurstRoundTripWithGroups { private static final int CHAIN_LENGTH = Integer.getInteger("chain.length", 2); private static final int BURST_SIZE = Integer.getInteger("burst.size", 1); private static final Integer DUMMY_MESSAGE = 1; @SuppressWarnings("unchecked") private final static ConcurrentQueue<Integer>[] chain = new ConcurrentQueue[CHAIN_LENGTH]; /** * This is a bit annoying, I need the threads to keep their queues, so each thread needs an index. The id * is used to pick the in/out queues. */ private final static AtomicInteger idx = new AtomicInteger(); private final static ThreadLocal<Integer> tlIndex = new ThreadLocal<Integer>() { protected Integer initialValue() { return idx.getAndIncrement(); } }; /** * Link in the chain passes events from chain[threadIndex] to chain[(id + 1) % CHAIN_LENGTH]. * <p> * Note that while the state is per thread, the thread can change per iteration. We use the above thread * id to maintain the same queues are selected per thread. */ @State(Scope.Thread) public static class Link { final ConcurrentQueueConsumer<Integer> in; final ConcurrentQueueProducer<Integer> out; public Link() { int id = tlIndex.get(); // the old in out, in out this.in = chain[id % CHAIN_LENGTH].consumer(); this.out = chain[(id + 1) % CHAIN_LENGTH].producer(); } public void link() { // we could use the control here, but there's no reason as it is use externally and we only // really want to measure the ping method Integer e = in.poll(); if (e != null) { out.offer(e); } } /** * We want to always start with an empty inbound. Iteration tear downs are synchronized. */ @TearDown(Level.Iteration) public void clear() { // SPSC -> consumer must clear the queue in.clear(); } } /** * The source of events in the ring. Sends a burst of events into chain[(id + 1) % CHAIN_LENGTH] and waits * until the burst makes it through the ring back to chain[id]. * <p> * Note that while the state is per thread, the thread can change per iteration. We use the above thread * id to maintain the same queues are selected per thread. */ @State(Scope.Thread) public static class Source { final ConcurrentQueueProducer<Integer> start; final ConcurrentQueueConsumer<Integer> end; public Source() { int id = tlIndex.get(); // the source ties the knot in our ring this.end = chain[id % CHAIN_LENGTH].consumer(); this.start = chain[(id + 1) % CHAIN_LENGTH].producer(); } public void ping(Control ctl) { for (int i = 0; i < BURST_SIZE; i++) { start.offer(DUMMY_MESSAGE); } for (int i = 0; i < BURST_SIZE; i++) { while (!ctl.stopMeasurement && end.poll() == null) { } } } /** * We want to always start with an empty inbound. Iteration tear downs are synchronized. */ @TearDown(Level.Iteration) public void clear() { // SPSC -> consumer must clear the queue end.clear(); } } @Setup(Level.Trial) public void prepareChain() { // can't have group threads set to zero on a method, so can't handle the length of 1 case if (CHAIN_LENGTH < 2) { throw new IllegalArgumentException("Chain length must be 2 or more"); } // This is an estimate, but for bounded queues if the burst size is more than actual ring capacity // the benchmark will hang/ if (BURST_SIZE > ConcurrentQueueByTypeFactory.QUEUE_CAPACITY * CHAIN_LENGTH >> 1) { throw new IllegalArgumentException("Batch size exceeds estimated capacity"); } // initialize the chain for (int i = 0; i < CHAIN_LENGTH; i++) { chain[i] = ConcurrentQueueByTypeFactory.createQueue(); } } @Benchmark @Group("ring") @GroupThreads(1) public void ping(Control ctl, Source s) { s.ping(ctl); } /** * @param ctl required here to make the benchmark generate code correctly(JMH 0.2 issue, fixed on main) */ @Benchmark @Group("ring") @GroupThreads(1) public void loop(Control ctl, Link l) { l.link(); } }