// PipedRDFIterator.java example
//
// Explorer
// jena-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.jena.riot.lang;

import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.CancellationException;
import java.util.concurrent.TimeUnit;

import org.apache.jena.atlas.lib.Closeable;
import org.apache.jena.riot.RiotException;
import org.apache.jena.riot.system.PrefixMap;
import org.apache.jena.riot.system.PrefixMapFactory;

/**
 * <p>
 * A {@code PipedRDFIterator} should be connected to a {@link PipedRDFStream}
 * implementation; the piped iterator then provides whatever RDF primitives are
 * written to the {@code PipedRDFStream}
 * </p>
 * <p>
 * Typically, data is read from a {@code PipedRDFIterator} by one thread (the
 * consumer) and data is written to the corresponding {@code PipedRDFStream} by
 * some other thread (the producer). Attempting to use both objects from a
 * single thread is not recommended, as it may deadlock the thread. The
 * {@code PipedRDFIterator} contains a buffer, decoupling read operations from
 * write operations, within limits.
 * </p>
 * <p>
 * Inspired by Java's {@link java.io.PipedInputStream} and
 * {@link java.io.PipedOutputStream}
 * </p>
 * 
 * @param <T>
 *            The type of the RDF primitive, should be one of {@code Triple},
 *            {@code Quad}, or {@code Tuple<Node>}
 * 
 * @see PipedTriplesStream
 * @see PipedQuadsStream
 * @see PipedTuplesStream
 */
public class PipedRDFIterator<T> implements Iterator<T>, Closeable {
    /**
     * Constant for default buffer size
     */
    public static final int DEFAULT_BUFFER_SIZE = 10000;

    /**
     * Constant for default poll timeout in milliseconds, used to stop the
     * consumer deadlocking in certain circumstances
     */
    public static final int DEFAULT_POLL_TIMEOUT = 1000; // one second
    /**
     * Constant for max number of failed poll attempts before the producer will
     * be declared as dead
     */
    public static final int DEFAULT_MAX_POLLS = 10;

    // Bounded buffer that decouples the producer (receive/finish) from the
    // consumer (hasNext/next)
    private final BlockingQueue<T> queue;

    // Sentinel placed on the queue by finish(). The unchecked cast is safe in
    // practice because T is erased at runtime and the marker is only ever
    // compared by identity in hasNext(); it is never handed out by next().
    @SuppressWarnings("unchecked")
    private final T endMarker = (T) new Object();

    // Cross-thread state flags, volatile so writes made by one side of the
    // pipe are visible to the other
    private volatile boolean closedByConsumer = false;
    private volatile boolean closedByProducer = false;
    private volatile boolean finished = false;
    private volatile boolean threadReused = false;
    private volatile Thread consumerThread;
    private volatile Thread producerThread;

    // Written by connect() and read by hasNext(); volatile like the other
    // flags since the two calls may come from different threads
    // (NOTE(review): connect() is presumably invoked by the PipedRDFStream -
    // confirm against the stream implementation)
    private volatile boolean connected = false;
    private final int pollTimeout;
    private final int maxPolls;

    // Single element look-ahead filled by hasNext() and handed out by next().
    // This field is not synchronised by this class.
    private T slot;

    private final Object lock = new Object(); // protects baseIri and prefixes
    private String baseIri;
    private final PrefixMap prefixes = PrefixMapFactory.createForInput();

    /**
     * Creates a new piped RDF iterator with the default buffer size of
     * {@code DEFAULT_BUFFER_SIZE}.
     * <p>
     * Buffer size must be chosen carefully in order to avoid performance
     * problems, if you set the buffer size too low you will experience a lot of
     * blocked calls so it will take longer to consume the data from the
     * iterator. For best performance the buffer size should be roughly 10% of
     * the expected input size though you may need to tune this depending on how
     * fast your consumer thread is.
     * </p>
     */
    public PipedRDFIterator() {
        this(DEFAULT_BUFFER_SIZE);
    }

    /**
     * Creates a new piped RDF iterator
     * <p>
     * Buffer size must be chosen carefully in order to avoid performance
     * problems, if you set the buffer size too low you will experience a lot of
     * blocked calls so it will take longer to consume the data from the
     * iterator. For best performance the buffer size should be roughly 10% of
     * the expected input size though you may need to tune this depending on how
     * fast your consumer thread is.
     * </p>
     * 
     * @param bufferSize
     *            Buffer size
     */
    public PipedRDFIterator(int bufferSize) {
        this(bufferSize, false, DEFAULT_POLL_TIMEOUT, DEFAULT_MAX_POLLS);
    }

    /**
     * Creates a new piped RDF iterator
     * <p>
     * Buffer size must be chosen carefully in order to avoid performance
     * problems, if you set the buffer size too low you will experience a lot of
     * blocked calls so it will take longer to consume the data from the
     * iterator. For best performance the buffer size should be roughly 10% of
     * the expected input size though you may need to tune this depending on how
     * fast your consumer thread is.
     * </p>
     * <p>
     * The fair parameter controls whether the locking policy used for the
     * buffer is fair. When enabled this reduces throughput but also reduces the
     * chance of thread starvation. This likely need only be set to {@code true}
     * if there will be multiple consumers.
     * </p>
     * 
     * @param bufferSize
     *            Buffer size
     * @param fair
     *            Whether the buffer should use a fair locking policy
     */
    public PipedRDFIterator(int bufferSize, boolean fair) {
        this(bufferSize, fair, DEFAULT_POLL_TIMEOUT, DEFAULT_MAX_POLLS);
    }

    /**
     * Creates a new piped RDF iterator
     * <p>
     * Buffer size must be chosen carefully in order to avoid performance
     * problems, if you set the buffer size too low you will experience a lot of
     * blocked calls so it will take longer to consume the data from the
     * iterator. For best performance the buffer size should be roughly 10% of
     * the expected input size though you may need to tune this depending on how
     * fast your consumer thread is.
     * </p>
     * <p>
     * The {@code fair} parameter controls whether the locking policy used for
     * the buffer is fair. When enabled this reduces throughput but also reduces
     * the chance of thread starvation. This likely need only be set to
     * {@code true} if there will be multiple consumers.
     * </p>
     * <p>
     * The {@code pollTimeout} parameter controls how long each poll attempt
     * waits for data to be produced. This prevents the consumer thread from
     * blocking indefinitely and allows it to detect various potential deadlock
     * conditions e.g. dead producer thread, another consumer closed the
     * iterator etc. and errors out accordingly. It is unlikely that you will
     * ever need to adjust this from the default value provided by
     * {@link #DEFAULT_POLL_TIMEOUT}.
     * </p>
     * <p>
     * The {@code maxPolls} parameter controls how many poll attempts will be
     * made by a single consumer thread within the context of a single call to
     * {@link #hasNext()} before the iterator declares the producer to be dead
     * and errors out accordingly. You may need to adjust this if you have a
     * slow producer thread or many consumer threads.
     * </p>
     * 
     * @param bufferSize
     *            Buffer size, must be greater than zero
     * @param fair
     *            Whether the buffer should use a fair locking policy
     * @param pollTimeout
     *            Poll timeout in milliseconds, must be greater than zero
     * @param maxPolls
     *            Max poll attempts, must be greater than zero
     * @throws IllegalArgumentException
     *             If {@code bufferSize}, {@code pollTimeout} or
     *             {@code maxPolls} is not greater than zero
     */
    public PipedRDFIterator(int bufferSize, boolean fair, int pollTimeout, int maxPolls) {
        if (pollTimeout <= 0)
            throw new IllegalArgumentException("Poll Timeout must be > 0");
        if (maxPolls <= 0)
            throw new IllegalArgumentException("Max Poll attempts must be > 0");
        // ArrayBlockingQueue rejects a non-positive capacity with the same
        // exception type; checking here just gives a descriptive message
        if (bufferSize <= 0)
            throw new IllegalArgumentException("Buffer Size must be > 0");
        this.queue = new ArrayBlockingQueue<>(bufferSize, fair);
        this.pollTimeout = pollTimeout;
        this.maxPolls = maxPolls;
    }

    /**
     * Determines whether another item is available, polling the buffer up to
     * {@code maxPolls} times (each poll waiting at most {@code pollTimeout}
     * milliseconds) for the producer to deliver one.
     * 
     * @return {@code true} if an item is ready to be returned by
     *         {@link #next()}, {@code false} once the end of the stream has
     *         been reached
     * @throws IllegalStateException
     *             If the pipe has not been connected
     * @throws RiotException
     *             If the pipe has been closed by the consumer, the producer is
     *             detected as dead, or no data arrived within the permitted
     *             number of polling attempts
     * @throws CancellationException
     *             If the calling thread is interrupted while waiting; the
     *             thread's interrupt status is restored before throwing
     */
    @Override
    public boolean hasNext() {
        if (!connected)
            throw new IllegalStateException("Pipe not connected");

        if (closedByConsumer)
            throw new RiotException("Pipe closed");

        if (finished)
            return false;

        consumerThread = Thread.currentThread();

        // Depending on how code and/or the JVM schedules the threads involved
        // there is a scenario that exists where a producer can finish/die
        // before the consumer is started and the consumer is scheduled onto the
        // same thread thus resulting in a deadlock on the consumer because it
        // will never be able to detect that the producer died
        // In this scenario we need to set a special flag to indicate the
        // possibility
        if (producerThread != null && producerThread == consumerThread)
            threadReused = true;

        // An item fetched by an earlier hasNext() call has not been consumed yet
        if (slot != null)
            return true;

        int attempts = 0;
        while (true) {
            attempts++;
            try {
                slot = queue.poll(this.pollTimeout, TimeUnit.MILLISECONDS);
            } catch (InterruptedException e) {
                throw cancelled(e);
            }

            if (null != slot)
                break;

            // If the producer thread died and did not call finish() then
            // declare this pipe to be "broken"
            // Since check is after the break, we will drain as much as possible
            // out of the queue before throwing this exception
            if (threadReused || (producerThread != null && !producerThread.isAlive() && !closedByProducer)) {
                closedByConsumer = true;
                throw new RiotException("Producer dead");
            }

            // Need to check this inside the loop as otherwise outside code that
            // attempts to break the deadlock by causing close() on the iterator
            // cannot do so
            if (closedByConsumer)
                throw new RiotException("Pipe closed");

            // Need to check whether polling attempts have been exceeded
            // If so declare the producer dead and exit
            if (attempts >= this.maxPolls) {
                closedByConsumer = true;
                if (producerThread != null) {
                    throw new RiotException(
                            "Producer failed to produce any data within the specified number of polling attempts, declaring producer dead");
                } else {
                    throw new RiotException("Producer failed to ever call start(), declaring producer dead");
                }
            }
        }

        // When the end marker is seen set slot to null
        if (slot == endMarker) {
            finished = true;
            slot = null;
            return false;
        }
        return true;
    }

    /**
     * Returns the next item, waiting for it as described on
     * {@link #hasNext()} if necessary.
     * 
     * @return The next item
     * @throws NoSuchElementException
     *             If the end of the stream has been reached
     */
    @Override
    public T next() {
        if (!hasNext())
            throw new NoSuchElementException();
        T item = slot;
        slot = null;
        return item;
    }

    /**
     * Not supported by this iterator.
     * 
     * @throws UnsupportedOperationException
     *             Always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

    /**
     * Verifies the pipe can still accept items from the producer.
     * 
     * @throws RiotException
     *             If either side has closed the pipe, or a consumer thread has
     *             been recorded and is no longer alive
     */
    private void checkStateForReceive() {
        if (closedByProducer || closedByConsumer) {
            throw new RiotException("Pipe closed");
        } else if (consumerThread != null && !consumerThread.isAlive()) {
            throw new RiotException("Consumer dead");
        }
    }

    /**
     * Restores the current thread's interrupt status and builds the
     * {@link CancellationException} used to report the interruption, keeping
     * the original {@link InterruptedException} as its cause.
     */
    private static CancellationException cancelled(InterruptedException e) {
        // Catching InterruptedException clears the interrupt flag; set it again
        // so code further up the stack can still observe the interruption
        Thread.currentThread().interrupt();
        CancellationException cancellation = new CancellationException();
        cancellation.initCause(e);
        return cancellation;
    }

    /**
     * Marks this iterator as connected; {@link #hasNext()} throws an
     * {@link IllegalStateException} until this has been called.
     */
    protected void connect() {
        this.connected = true;
    }

    /**
     * Adds an item to the buffer, blocking while the buffer is full. The
     * calling thread is recorded as the producer thread.
     * 
     * @param t
     *            Item to add
     * @throws RiotException
     *             If the pipe is closed or the consumer thread is dead
     * @throws CancellationException
     *             If the calling thread is interrupted while waiting for buffer
     *             space; the thread's interrupt status is restored before
     *             throwing
     */
    protected void receive(T t) {
        checkStateForReceive();
        producerThread = Thread.currentThread();

        try {
            queue.put(t);
        } catch (InterruptedException e) {
            throw cancelled(e);
        }
    }

    /**
     * Records the given Base IRI as the most recently seen one.
     * 
     * @param base
     *            Base IRI
     */
    protected void base(String base) {
        synchronized (lock) {
            this.baseIri = base;
        }
    }

    /**
     * Gets the most recently seen Base IRI
     * 
     * @return Base IRI
     */
    public String getBaseIri() {
        synchronized (lock) {
            return baseIri;
        }
    }

    /**
     * Records a prefix declaration in the prefix map.
     * 
     * @param prefix
     *            Prefix
     * @param iri
     *            IRI the prefix maps to
     */
    protected void prefix(String prefix, String iri) {
        synchronized (lock) {
            prefixes.add(prefix, iri);
        }
    }

    /**
     * Gets the prefix map which contains the prefixes seen so far in the stream
     * 
     * @return Prefix Map
     */
    public PrefixMap getPrefixes() {
        synchronized (lock) {
            // Need to return a copy since PrefixMap is not concurrent
            return PrefixMapFactory.create(this.prefixes);
        }
    }

    /**
     * Should be called by the producer when it begins writing to the iterator.
     * If the producer fails to call this for whatever reason and never produces
     * any output or calls {@code finish()} consumers may be blocked for a short
     * period before they detect this state and error out.
     */
    protected void start() {
        // Track the producer thread in case it never delivers us anything and
        // dies before calling finish
        producerThread = Thread.currentThread();
    }

    /**
     * Should be called by the producer when it has finished writing to the
     * iterator. If the producer fails to call this for whatever reason
     * consumers may be blocked for a short period before they detect this state
     * and error out.
     * <p>
     * Calling this again after it has completed successfully has no effect.
     * </p>
     */
    protected void finish() {
        if ( closedByProducer )
            return ;
        // Queue the end marker first; receive() refuses items once
        // closedByProducer is set, so the flag must be raised afterwards
        receive(endMarker);
        closedByProducer = true;
    }

    /**
     * May be called by the consumer when it is finished reading from the
     * iterator, if the producer thread has not finished it will receive an
     * error the next time it tries to write to the iterator
     */
    @Override
    public void close() {
        closedByConsumer = true;
    }
}