RingCqBurstRoundTripWithGroups.java example

Explorer
JCTools-master
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.jctools.jmh.latency.spsc;

import org.jctools.queues.alt.ConcurrentQueue;
import org.jctools.queues.alt.ConcurrentQueueByTypeFactory;
import org.jctools.queues.alt.ConcurrentQueueConsumer;
import org.jctools.queues.alt.ConcurrentQueueProducer;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.Control;

import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Measure the Round Trip Time between 2 or more threads communicating via chained queues. This is a
 * performance edge case for queues as there is no scope for batching. The size of the batch will hopefully
 * expose near empty performance boundary cases.
 * <p>
 * Parameters must obey the following rules:<br>
 * <li>1 thread is the source(running ping), the rest are links</li>
 * <li>The chain length must be equal to the total number of threads</li>
 * <li>Burst size must not exceed the overall capacity of the ring</li>
 * <li>Only a single group of this benchmark can be executed!</li>
 * <li>Iterations must be synchronized(which is the default)</li>
 * <p>
 * To launch this benchmark with chain length 3, run:<br>
 * export JVM_ARGS=-server -XX:+UseCondCardMark -XX:CompileThreshold=100000<br>
 * export QUEUE_ARGS=-Dq.type=3 -Dsparse.shift=0 -Dpow2.capacity=15<br>
 * java $JVM_ARGS $QUEUE_ARGS <b>-Dburst.size=1 -Dchain.length=3</b> -jar target/microbenchmarks.jar <b>-tg
 * 2,1</b> -f 1 ".*.RingCqBurstRoundTripWithGroups.*"<br>
 * <p>
 * Note that when running this sort of benchmark thread affinity can affect performance significantly. It is
 * therefore recommended you pin the benchmark run to a uniform set of cores such that all threads are the same
 * distance from each other. For example, when the chain.length is 2 we can run either pinned to 2 threads
 * running on the same core (taskset -c 0,1), or across 2 separate cores (taskset -c 2,4). When running larger
 * chains it is recommended you avoid hyper threaded cores for stability. For example when running with
 * chain.length=4 we can pin the 4 threads to 4 separate cores (taskset -c 1,3,5,7).
 * <p>
 * The instability in the results stems from the different possible layouts chains can have. Consider the
 * case of chain.length=4 using 4 threads on 2 cores the ring layout could be:
 * <li> 0 -> 1 -> 2 -> 3 -> 0: leading to 2 same core transitions + 2 cross core transitions
 * <li> 0 -> 2 -> 1 -> 3 -> 0: leading to 4 cross core transitions
 * 
 * @author nitsanw
 * 
 */
@State(Scope.Group)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 10, time = 1, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 10, time = 1, timeUnit = TimeUnit.SECONDS)
public class RingCqBurstRoundTripWithGroups {
    /** Number of queues in the ring, read from {@code -Dchain.length} (defaults to 2). */
    private static final int CHAIN_LENGTH = Integer.getInteger("chain.length", 2);
    /** Number of messages the source sends per ping, read from {@code -Dburst.size} (defaults to 1). */
    private static final int BURST_SIZE = Integer.getInteger("burst.size", 1);
    /** The single message instance that is passed around the ring. */
    private static final Integer DUMMY_MESSAGE = 1;
    /** The ring of queues; slots are populated by {@link #prepareChain()}. */
    @SuppressWarnings("unchecked")
    private static final ConcurrentQueue<Integer>[] chain = new ConcurrentQueue[CHAIN_LENGTH];
    /**
     * This is a bit annoying, I need the threads to keep their queues, so each thread needs an index. The id
     * is used to pick the in/out queues.
     */
    private static final AtomicInteger idx = new AtomicInteger();
    /** Hands each thread the next free index from {@link #idx} the first time the thread asks for it. */
    private static final ThreadLocal<Integer> tlIndex = new ThreadLocal<Integer>() {
        @Override
        protected Integer initialValue() {
            return idx.getAndIncrement();
        }
    };

    /**
     * Link in the chain passes events from chain[id % CHAIN_LENGTH] to chain[(id + 1) % CHAIN_LENGTH].
     * <p>
     * Note that while the state is per thread, the thread can change per iteration. We use the above thread
     * id to ensure the same queues are selected per thread.
     */
    @State(Scope.Thread)
    public static class Link {
        /** Consumer side of the queue this link reads from. */
        final ConcurrentQueueConsumer<Integer> in;
        /** Producer side of the next queue in the ring, which this link writes to. */
        final ConcurrentQueueProducer<Integer> out;

        /**
         * Binds this link to its inbound and outbound queues based on the calling thread's index.
         */
        public Link() {
            int id = tlIndex.get();
            // the old in out, in out
            this.in = chain[id % CHAIN_LENGTH].consumer();
            this.out = chain[(id + 1) % CHAIN_LENGTH].producer();
        }

        /**
         * Polls the inbound queue once and, if a message was available, offers it to the outbound queue.
         */
        public void link() {
            // we could use the control here, but there's no reason as it is used externally and we only
            // really want to measure the ping method
            Integer e = in.poll();
            if (e != null) {
                out.offer(e);
            }
        }

        /**
         * We want to always start with an empty inbound. Iteration tear downs are synchronized.
         */
        @TearDown(Level.Iteration)
        public void clear() {
            // SPSC -> consumer must clear the queue
            in.clear();
        }
    }

    /**
     * The source of events in the ring. Sends a burst of events into chain[(id + 1) % CHAIN_LENGTH] and waits
     * until the burst makes it through the ring back to chain[id % CHAIN_LENGTH].
     * <p>
     * Note that while the state is per thread, the thread can change per iteration. We use the above thread
     * id to ensure the same queues are selected per thread.
     */
    @State(Scope.Thread)
    public static class Source {
        /** Producer side of the queue the burst is sent into. */
        final ConcurrentQueueProducer<Integer> start;
        /** Consumer side of the queue the burst is expected to come back on. */
        final ConcurrentQueueConsumer<Integer> end;

        /**
         * Binds this source to its outbound and inbound queues based on the calling thread's index.
         */
        public Source() {
            int id = tlIndex.get();
            // the source ties the knot in our ring
            this.end = chain[id % CHAIN_LENGTH].consumer();
            this.start = chain[(id + 1) % CHAIN_LENGTH].producer();
        }

        /**
         * Offers {@code BURST_SIZE} messages to the start queue, then spins polling the end queue until the
         * same number of messages has been received or the measurement is stopped.
         *
         * @param ctl JMH control; its {@code stopMeasurement} flag breaks the spin so the thread cannot hang
         *            once the iteration is over
         */
        public void ping(Control ctl) {
            for (int i = 0; i < BURST_SIZE; i++) {
                start.offer(DUMMY_MESSAGE);
            }
            for (int i = 0; i < BURST_SIZE; i++) {
                while (!ctl.stopMeasurement && end.poll() == null) {
                }
            }
        }

        /**
         * We want to always start with an empty inbound. Iteration tear downs are synchronized.
         */
        @TearDown(Level.Iteration)
        public void clear() {
            // SPSC -> consumer must clear the queue
            end.clear();
        }
    }

    /**
     * Validates the chain length and burst size, then creates one queue per slot of the ring.
     *
     * @throws IllegalArgumentException if the chain length is below 2 or the burst size exceeds the
     *                                  estimated capacity of the ring
     */
    @Setup(Level.Trial)
    public void prepareChain() {
        // can't have group threads set to zero on a method, so can't handle the length of 1 case
        if (CHAIN_LENGTH < 2) {
            throw new IllegalArgumentException("Chain length must be 2 or more");
        }
        // This is an estimate, but for bounded queues if the burst size is more than actual ring capacity
        // the benchmark will hang.
        // Widen to long before multiplying: with a large queue capacity the int product can wrap to zero
        // or a negative value, which would wrongly reject valid burst sizes.
        final long estimatedCapacity =
                ((long) ConcurrentQueueByTypeFactory.QUEUE_CAPACITY * CHAIN_LENGTH) >> 1;
        if (BURST_SIZE > estimatedCapacity) {
            throw new IllegalArgumentException("Batch size exceeds estimated capacity");
        }
        // initialize the chain
        for (int i = 0; i < CHAIN_LENGTH; i++) {
            chain[i] = ConcurrentQueueByTypeFactory.createQueue();
        }
    }

    /**
     * Benchmark method for the source thread: one invocation is one burst round trip.
     *
     * @param ctl JMH control, passed through so the source can stop spinning when measurement ends
     * @param s   the per-thread source state
     */
    @Benchmark
    @Group("ring")
    @GroupThreads(1)
    public void ping(Control ctl, Source s) {
        s.ping(ctl);
    }

    /**
     * Benchmark method for the link threads: one invocation is one attempt to forward a message.
     *
     * @param ctl required here to make the benchmark generate code correctly(JMH 0.2 issue, fixed on main)
     * @param l   the per-thread link state
     */
    @Benchmark
    @Group("ring")
    @GroupThreads(1)
    public void loop(Control ctl, Link l) {
        l.link();
    }
}