/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.jena.riot.lang; import java.util.Iterator; import java.util.NoSuchElementException; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; import java.util.concurrent.CancellationException; import java.util.concurrent.TimeUnit; import org.apache.jena.atlas.lib.Closeable; import org.apache.jena.riot.RiotException; import org.apache.jena.riot.system.PrefixMap; import org.apache.jena.riot.system.PrefixMapFactory; /** * <p> * A {@code PipedRDFIterator} should be connected to a {@link PipedRDFStream} * implementation; the piped iterator then provides whatever RDF primitives are * written to the {@code PipedRDFStream} * </p> * <p> * Typically, data is read from a {@code PipedRDFIterator} by one thread (the * consumer) and data is written to the corresponding {@code PipedRDFStream} by * some other thread (the producer). Attempting to use both objects from a * single thread is not recommended, as it may deadlock the thread. The * {@code PipedRDFIterator} contains a buffer, decoupling read operations from * write operations, within limits. * </p> * <p> * Inspired by Java's {@link java.io.PipedInputStream} and * {@link java.io.PipedOutputStream} * </p> * * @param <T> * The type of the RDF primitive, should be one of {@code Triple}, * {@code Quad}, or {@code Tuple<Node>} * * @see PipedTriplesStream * @see PipedQuadsStream * @see PipedTuplesStream */ public class PipedRDFIterator<T> implements Iterator<T>, Closeable { /** * Constant for default buffer size */ public static final int DEFAULT_BUFFER_SIZE = 10000; /** * Constant for default poll timeout in milliseconds, used to stop the * consumer deadlocking in certain circumstances */ public static final int DEFAULT_POLL_TIMEOUT = 1000; // one second /** * Constant for max number of failed poll attempts before the producer will * be declared as dead */ public static final int DEFAULT_MAX_POLLS = 10; private final BlockingQueue<T> queue; @SuppressWarnings("unchecked") private final T endMarker = (T) new Object(); private volatile boolean closedByConsumer = false; private volatile boolean closedByProducer = false; private volatile boolean finished = false; private volatile boolean threadReused = false; private volatile Thread consumerThread; private volatile Thread producerThread; private boolean connected = false; private int pollTimeout = DEFAULT_POLL_TIMEOUT; private int maxPolls = DEFAULT_MAX_POLLS; private T slot; private final Object lock = new Object(); // protects baseIri and prefixes private String baseIri; private final PrefixMap prefixes = PrefixMapFactory.createForInput(); /** * Creates a new piped RDF iterator with the default buffer size of * {@code DEFAULT_BUFFER_SIZE}. * <p> * Buffer size must be chosen carefully in order to avoid performance * problems, if you set the buffer size too low you will experience a lot of * blocked calls so it will take longer to consume the data from the * iterator. For best performance the buffer size should be at least 10% of * the expected input size though you may need to tune this depending on how * fast your consumer thread is. * </p> */ public PipedRDFIterator() { this(DEFAULT_BUFFER_SIZE); } /** * Creates a new piped RDF iterator * <p> * Buffer size must be chosen carefully in order to avoid performance * problems, if you set the buffer size too low you will experience a lot of * blocked calls so it will take longer to consume the data from the * iterator. For best performance the buffer size should be roughly 10% of * the expected input size though you may need to tune this depending on how * fast your consumer thread is. * </p> * * @param bufferSize * Buffer size */ public PipedRDFIterator(int bufferSize) { this(bufferSize, false, DEFAULT_POLL_TIMEOUT, DEFAULT_MAX_POLLS); } /** * Creates a new piped RDF iterator * <p> * Buffer size must be chosen carefully in order to avoid performance * problems, if you set the buffer size too low you will experience a lot of * blocked calls so it will take longer to consume the data from the * iterator. For best performance the buffer size should be roughly 10% of * the expected input size though you may need to tune this depending on how * fast your consumer thread is. * </p> * <p> * The fair parameter controls whether the locking policy used for the * buffer is fair. When enabled this reduces throughput but also reduces the * chance of thread starvation. This likely need only be set to {@code true} * if there will be multiple consumers. * </p> * * @param bufferSize * Buffer size * @param fair * Whether the buffer should use a fair locking policy */ public PipedRDFIterator(int bufferSize, boolean fair) { this(bufferSize, fair, DEFAULT_POLL_TIMEOUT, DEFAULT_MAX_POLLS); } /** * Creates a new piped RDF iterator * <p> * Buffer size must be chosen carefully in order to avoid performance * problems, if you set the buffer size too low you will experience a lot of * blocked calls so it will take longer to consume the data from the * iterator. For best performance the buffer size should be roughly 10% of * the expected input size though you may need to tune this depending on how * fast your consumer thread is. * </p> * <p> * The {@code fair} parameter controls whether the locking policy used for * the buffer is fair. When enabled this reduces throughput but also reduces * the chance of thread starvation. This likely need only be set to * {@code true} if there will be multiple consumers. * </p> * <p> * The {@code pollTimeout} parameter controls how long each poll attempt * waits for data to be produced. This prevents the consumer thread from * blocking indefinitely and allows it to detect various potential deadlock * conditions e.g. dead producer thread, another consumer closed the * iterator etc. and errors out accordingly. It is unlikely that you will * ever need to adjust this from the default value provided by * {@link #DEFAULT_POLL_TIMEOUT}. * </p> * <p> * The {@code maxPolls} parameter controls how many poll attempts will be * made by a single consumer thread within the context of a single call to * {@link #hasNext()} before the iterator declares the producer to be dead * and errors out accordingly. You may need to adjust this if you have a * slow producer thread or many consumer threads. * </p> * * @param bufferSize * Buffer size * @param fair * Whether the buffer should use a fair locking policy * @param pollTimeout * Poll timeout in milliseconds * @param maxPolls * Max poll attempts */ public PipedRDFIterator(int bufferSize, boolean fair, int pollTimeout, int maxPolls) { if (pollTimeout <= 0) throw new IllegalArgumentException("Poll Timeout must be > 0"); if (maxPolls <= 0) throw new IllegalArgumentException("Max Poll attempts must be > 0"); this.queue = new ArrayBlockingQueue<>(bufferSize, fair); this.pollTimeout = pollTimeout; this.maxPolls = maxPolls; } @Override public boolean hasNext() { if (!connected) throw new IllegalStateException("Pipe not connected"); if (closedByConsumer) throw new RiotException("Pipe closed"); if (finished) return false; consumerThread = Thread.currentThread(); // Depending on how code and/or the JVM schedules the threads involved // there is a scenario that exists where a producer can finish/die // before theconsumer is started and the consumer is scheduled onto the // same thread thus resulting in a deadlock on the consumer because it // will never be able to detect that the producer died // In this scenario we need to set a special flag to indicate the // possibility if (producerThread != null && producerThread == consumerThread) threadReused = true; if (slot != null) return true; int attempts = 0; while (true) { attempts++; try { slot = queue.poll(this.pollTimeout, TimeUnit.MILLISECONDS); } catch (InterruptedException e) { throw new CancellationException(); } if (null != slot) break; // If the producer thread died and did not call finish() then // declare this pipe to be "broken" // Since check is after the break, we will drain as much as possible // out of the queue before throwing this exception if (threadReused || (producerThread != null && !producerThread.isAlive() && !closedByProducer)) { closedByConsumer = true; throw new RiotException("Producer dead"); } // Need to check this inside the loop as otherwise outside code that // attempts to break the deadlock by causing close() on the iterator // cannot do so if (closedByConsumer) throw new RiotException("Pipe closed"); // Need to check whether polling attempts have been exceeded // If so declare the producer dead and exit if (attempts >= this.maxPolls) { closedByConsumer = true; if (producerThread != null) { throw new RiotException( "Producer failed to produce any data within the specified number of polling attempts, declaring producer dead"); } else { throw new RiotException("Producer failed to ever call start(), declaring producer dead"); } } } // When the end marker is seen set slot to null if (slot == endMarker) { finished = true; slot = null; return false; } return true; } @Override public T next() { if (!hasNext()) throw new NoSuchElementException(); T item = slot; slot = null; return item; } @Override public void remove() { throw new UnsupportedOperationException(); } private void checkStateForReceive() { if (closedByProducer || closedByConsumer) { throw new RiotException("Pipe closed"); } else if (consumerThread != null && !consumerThread.isAlive()) { throw new RiotException("Consumer dead"); } } protected void connect() { this.connected = true; } protected void receive(T t) { checkStateForReceive(); producerThread = Thread.currentThread(); try { queue.put(t); } catch (InterruptedException e) { throw new CancellationException(); } } protected void base(String base) { synchronized (lock) { this.baseIri = base; } } /** * Gets the most recently seen Base IRI * * @return Base IRI */ public String getBaseIri() { synchronized (lock) { return baseIri; } } protected void prefix(String prefix, String iri) { synchronized (lock) { prefixes.add(prefix, iri); } } /** * Gets the prefix map which contains the prefixes seen so far in the stream * * @return Prefix Map */ public PrefixMap getPrefixes() { synchronized (lock) { // Need to return a copy since PrefixMap is not concurrent return PrefixMapFactory.create(this.prefixes); } } /** * Should be called by the producer when it begins writing to the iterator. * If the producer fails to call this for whatever reason and never produces * any output or calls {@code finish()} consumers may be blocked for a short * period before they detect this state and error out. */ protected void start() { // Track the producer thread in case it never delivers us anything and // dies before calling finish producerThread = Thread.currentThread(); } /** * Should be called by the producer when it has finished writing to the * iterator. If the producer fails to call this for whatever reason * consumers may be blocked for a short period before they detect this state * and error out. */ protected void finish() { if ( closedByProducer ) return ; receive(endMarker); closedByProducer = true; } /** * May be called by the consumer when it is finished reading from the * iterator, if the producer thread has not finished it will receive an * error the next time it tries to write to the iterator */ @Override public void close() { closedByConsumer = true; } }