/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Jun 25, 2009
*/
package com.bigdata.rdf.rio;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.AbstractExecutorService;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipInputStream;
import org.apache.log4j.Logger;
import org.openrdf.model.BNode;
import org.openrdf.model.Resource;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.rio.RDFFormat;
import org.openrdf.rio.RDFParser;
import com.bigdata.btree.AsynchronousIndexWriteConfiguration;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.KVO;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.btree.proc.IAsyncResultHandler;
import com.bigdata.btree.proc.LongAggregator;
import com.bigdata.counters.CounterSet;
import com.bigdata.counters.Instrument;
import com.bigdata.counters.OneShotInstrument;
import com.bigdata.io.ByteArrayBuffer;
import com.bigdata.io.DataOutputBuffer;
import com.bigdata.journal.AbstractTask;
import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.internal.VTE;
import com.bigdata.rdf.internal.impl.BlobIV;
import com.bigdata.rdf.lexicon.AssignTermId;
import com.bigdata.rdf.lexicon.BigdataValueCentricFullTextIndex;
import com.bigdata.rdf.lexicon.BlobsIndexHelper;
import com.bigdata.rdf.lexicon.BlobsWriteProc;
import com.bigdata.rdf.lexicon.BlobsWriteProc.BlobsWriteProcConstructor;
import com.bigdata.rdf.lexicon.Id2TermWriteProc.Id2TermWriteProcConstructor;
import com.bigdata.rdf.lexicon.LexiconKeyBuilder;
import com.bigdata.rdf.lexicon.LexiconKeyOrder;
import com.bigdata.rdf.lexicon.LexiconRelation;
import com.bigdata.rdf.lexicon.Term2IdTupleSerializer;
import com.bigdata.rdf.lexicon.Term2IdWriteProc;
import com.bigdata.rdf.lexicon.Term2IdWriteProc.Term2IdWriteProcConstructor;
import com.bigdata.rdf.model.BigdataBNode;
import com.bigdata.rdf.model.BigdataBNodeImpl;
import com.bigdata.rdf.model.BigdataLiteral;
import com.bigdata.rdf.model.BigdataResource;
import com.bigdata.rdf.model.BigdataStatement;
import com.bigdata.rdf.model.BigdataURI;
import com.bigdata.rdf.model.BigdataValue;
import com.bigdata.rdf.model.BigdataValueFactory;
import com.bigdata.rdf.model.BigdataValueImpl;
import com.bigdata.rdf.model.BigdataValueSerializer;
import com.bigdata.rdf.model.StatementEnum;
import com.bigdata.rdf.spo.ISPO;
import com.bigdata.rdf.spo.SPOIndexWriteProc;
import com.bigdata.rdf.spo.SPOIndexWriter;
import com.bigdata.rdf.spo.SPOKeyOrder;
import com.bigdata.rdf.spo.SPORelation;
import com.bigdata.rdf.spo.SPOTupleSerializer;
import com.bigdata.rdf.store.AbstractTripleStore;
import com.bigdata.rdf.store.ScaleOutTripleStore;
import com.bigdata.relation.accesspath.BlockingBuffer;
import com.bigdata.relation.accesspath.IBuffer;
import com.bigdata.relation.accesspath.IRunnableBuffer;
import com.bigdata.relation.accesspath.UnsynchronizedUnboundedChunkBuffer;
import com.bigdata.search.TextIndexWriteProc;
import com.bigdata.service.AbstractFederation;
import com.bigdata.service.Split;
import com.bigdata.service.ndx.IScaleOutClientIndex;
import com.bigdata.service.ndx.pipeline.DefaultDuplicateRemover;
import com.bigdata.service.ndx.pipeline.KVOC;
import com.bigdata.service.ndx.pipeline.KVOLatch;
import com.bigdata.service.ndx.pipeline.KVOList;
import com.bigdata.striterator.ChunkedWrappedIterator;
import com.bigdata.striterator.IChunkedIterator;
import com.bigdata.striterator.IChunkedOrderedIterator;
import com.bigdata.striterator.IKeyOrder;
import com.bigdata.util.Bytes;
import com.bigdata.util.DaemonThreadFactory;
import com.bigdata.util.concurrent.Latch;
import com.bigdata.util.concurrent.ShutdownHelper;
import com.bigdata.util.concurrent.ThreadPoolExecutorBaseStatisticsTask;
import cutthecrap.utils.striterators.Filter;
import cutthecrap.utils.striterators.Striterator;
/**
* Factory object for high-volume RDF data load.
* <p>
* The asynchronous statement buffer w/o SIDs is much simpler than w/. If we
* require that the document is fully buffered in memory, then we can simplify
* this to just:
*
* <pre>
*
* Given:
*
* value[] - RDF Values observed in the S,P,O, or C positions.
*
* statement[] - RDF Statements reported by the parser.
*
* Do:
*
* value[] => TERM2ID (Sync RPC, assigning TIDs)
* value[] => BLOBS (Sync RPC, assigning TIDs)
*
* value[] => ID2TERM (Async)
*
* value[] => Text (Async, iff enabled)
*
* statement[] => (SPO,POS,OSP) (Async)
* </pre>
*
* Note: This DOES NOT support truth maintenance. Truth maintenance requires
* that the term identifiers are resolved against the database's lexicon while
* the statements are written onto a local (and temporary) triple store. There
* is no (or at least less) reason to use asynchronous writes against a local
* store. However, TM could use this to copy the data from the temporary triple
* store to the database. This should be plugged in transparently in the
* copyStatements() API for the tripleStore.
* <p>
* Note: This DOES NOT support SIDS.
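* <p>
* A minimal usage sketch (illustrative parameter values only; the triple
* store and parser options are assumed to be configured elsewhere - see the
* constructor javadoc for the full parameter list):
*
* <pre>
* final AsynchronousStatementBufferFactory&lt;BigdataStatement, File&gt; factory =
*     new AsynchronousStatementBufferFactory&lt;BigdataStatement, File&gt;(
*         tripleStore,      // the target ScaleOutTripleStore
*         10000,            // producerChunkSize
*         10000,            // valuesInitialCapacity
*         16,               // bnodesInitialCapacity
*         RDFFormat.RDFXML, // defaultFormat
*         null,             // defaultGraph
*         parserOptions,    // RDFParserOptions
*         false,            // deleteAfter
*         4,                // parserPoolSize
*         20,               // parserQueueCapacity
*         4,                // term2IdWriterPoolSize
*         4,                // otherWriterPoolSize
*         1,                // notifyPoolSize
*         100000L           // pauseParsedPoolStatementThreshold
*         );
*
* factory.submitAll(new File("data"), null, 100L); // retry every 100ms when the parser queue is full.
*
* factory.awaitAll(); // closes the buffers and awaits the asynchronous index writes.
* </pre>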
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*
* @param <S>
* The generic type of the statement objects.
* @param <R>
* The generic type of the resource identifier (File, URL, etc).
*
* FIXME Modify to support SIDs. We basically need to loop in the
* {@link #workflowLatch_bufferTids} workflow state until all SIDs have been
* assigned. However, the termination conditions will be a little more complex.
* During termination, if we have the TIDs but not yet the SIDs then we need to
* flush the SID requests rather than allowing them to timeout. Since SID
* processing is cyclic, we may have to do this one or more times.
*
* <pre>
* AsynchronousStatementBufferWithSids:
*
* When SIDs are enabled, we must identify the minimum set of statements
* whose SIDs are referenced by blank nodes in the S, P, O positions of
* other statements. Since we can not make that determination until we
* reach the end of the document, all statements which use blank nodes
* are placed into the deferredStatements container.
*
* Further, and unlike the synchronous StatementBuffer, we must defer
* writes of grounded statements until we know whether or not their SID
* appears in a blank node reference by another statement. We MUST use
* synchronous RPC to obtain the SIDs for those statements. This means
* that the entire document MUST be parsed into memory. Since we must
* buffer the entire document in memory when SIDs are enabled (when using
* asynchronous writes), distinct implementations of the asynchronous
* statement buffer are used depending on whether or not SIDs are
* enabled. [Actually, we fully buffer anyway so we can use the same
* implementation class.]
*
* Once the end of the document has been reached, we iteratively divide
* the parsed statements into three collections. This process ends once
* all three collections are empty.
*
* 1. groundedStatements : These are statements which are not
* referenced by other statements using their SID and which do not
* contain references to the SIDs of other statements. The
* groundedStatements are written asynchronously since there is no
* dependency on their SIDs.
*
* 2. referencedStatements : These are statements whose SID has not
* been assigned yet and which do not reference other statements
* but which are themselves referenced by other statements using a
* blank node. These statements are written using synchronous RPC
* so that we may obtain their SIDs and thereby convert one or more
* deferredStatements to either groundedStatements or
* referencedStatements.
*
* 3. deferredStatements : These are statements using a blank node to
* reference another statement whose SID has not been assigned yet.
* These statements MAY also be referenced by other deferred
* statements. However, those references MAY NOT form a cycle.
* Deferred statements are moved to either the groundedStatements
* or the referencedStatements collection once their blank node
* references have been assigned SIDs.
*
* Given:
*
* value[] - RDF Values observed in the S, P, O, and C positions.
*
* unresolvedRefs[] - RDF blank nodes observed in the C position are
* entered into this collection. They are removed
* from the collection as they are resolved.
*
* statement[] - RDF Statements reported by the parser.
*
* Do:
*
* // remove blank nodes serving as SIDs from the value[].
* value[] := value[] - unresolvedRef[];
*
* value[] => TERM2ID (Sync RPC, assigning TIDs)
*
* value[] => ID2TERM (Async)
*
* value[] => Text (Async, iff enabled)
*
* // initially, all statements are deferred.
* deferredStatements := statements;
*
* while(!groundedStatements.isEmpty() || !referencedStatements.isEmpty()
* || !deferredStatements.isEmpty()) {
*
* groundedStatement[] => TERM2ID (async)
*
* groundedStatement[] := []; // empty.
*
* referencedStatement[] => TERM2ID (Sync RPC, assigning SIDs)
*
* foreach spo : referencedStatements {
*
* unresolvedRefs.remove( spo.c );
*
* }
*
* referencedStatement[] := []; // empty.
*
* foreach spo : deferredStatement[i] {
*
* if(spo.isGrounded) {
*
* // true iff S.tid, P.tid, and O.tid are bound, implying that
* // this statement does not have any unresolved references to
* // other statements.
*
* if(unresolvedReferences.contains(spo.c)) {
*
* // will be written synchronously.
* referencedStatements.add( spo );
*
* } else {
*
* // will be written asynchronously.
* groundedStatement.add( spo );
*
* }
*
* }
*
* }
*
* }
* </pre>
*
* @todo evaluate this approach for writing on a local triple store. if there is
* a performance benefit then refactor accordingly (requires asynchronous
* write API for BTree and friends).
*/
public class AsynchronousStatementBufferFactory<S extends BigdataStatement, R>
implements IAsynchronousWriteStatementBufferFactory<S> {
final private transient static Logger log = Logger
.getLogger(AsynchronousStatementBufferFactory.class);
/**
* The database into which the statements will be written.
*/
private final ScaleOutTripleStore tripleStore;
/**
* The lexicon.
*/
private final LexiconRelation lexiconRelation;
/**
* The triples.
*/
private final SPORelation spoRelation;
/**
* The initial capacity of the canonicalizing mapping for RDF
* {@link Value}.
*/
private final int valuesInitialCapacity;
/**
* The initial capacity of the canonicalizing mapping for RDF
* {@link BNode}s.
*/
private final int bnodesInitialCapacity;
/**
* The chunk size used by the producer to break the terms and statements
* into chunks before writing them onto the {@link BlockingBuffer} for
* the master.
*/
private final int producerChunkSize;
/**
* The default {@link RDFFormat}.
*/
private final RDFFormat defaultFormat;
/**
* The value that will be used for the graph/context co-ordinate when
* loading data represented in a triple format into a quad store.
*/
private final String defaultGraph;
/**
* Options for the {@link RDFParser}.
*/
private final RDFParserOptions parserOptions;
/**
* Delete files after they have been successfully loaded when
* <code>true</code>.
*/
private final boolean deleteAfter;
/**
* Delete files after they have been successfully loaded when
* <code>true</code>.
*/
protected boolean isDeleteAfter() {
return deleteAfter;
}
/**
* The default RDF interchange format that will be used when the format can
* not be determined.
*/
protected RDFFormat getDefaultRDFFormat() {
return defaultFormat;
}
/**
* When <code>true</code> and the full text index is enabled, then also
* index datatype literals.
*/
private final boolean indexDatatypeLiterals;
/*
* Asynchronous index write buffers.
*/
final private IRunnableBuffer<KVO<BigdataValue>[]> buffer_t2id;
final private IRunnableBuffer<KVO<BigdataValue>[]> buffer_id2t;
final private IRunnableBuffer<KVO<BigdataValue>[]> buffer_blobs;
final private IRunnableBuffer<KVO<BigdataValue>[]> buffer_text;
/**
* A map containing an entry for each statement index on which this
* class will write.
*/
private final Map<SPOKeyOrder, IRunnableBuffer<KVO<ISPO>[]>> buffer_stmts;
/**
* Counts statements written on the database (applied only to the SPO index
* so we do not double count).
*/
private final LongAggregator statementResultHandler = new LongAggregator();
/**
* Counts tuples written on the full text index.
*/
private final LongAggregator textResultHandler = new LongAggregator();
/**
* The timestamp set when {@link #notifyStart()} is invoked. This is done
* when the first document is accepted for processing.
*/
private volatile long startTime;
/**
* The timestamp set when {@link #notifyEnd()} is invoked. This is done when
* the factory is {@link #close()}d or when execution is
* {@link #cancelAll(boolean) cancelled}.
*/
private long endTime;
/**
* Notify that the factory will begin running tasks. This sets the
* {@link #startTime} used by {@link #getElapsedMillis()} to report the run
* time of the tasks.
*/
protected void notifyStart() {
/*
* Note: uses the lock to make this atomic since we do this when we
* accept each document and we already own the lock at that point.
*/
if (!lock.isHeldByCurrentThread())
throw new IllegalMonitorStateException();
if (startTime == 0L) {
endTime = 0L;
startTime = System.currentTimeMillis();
}
}
/**
* Notify that the factory is done running tasks (for now). This places a
* cap on the time reported by {@link #getElapsedMillis()}.
*/
protected void notifyEnd() {
endTime = System.currentTimeMillis();
parserService.shutdownNow();
tidsWriterService.shutdownNow();
otherWriterService.shutdownNow();
notifyService.shutdownNow();
if (serviceStatisticsTask != null) {
serviceStatisticsTask.cancel();
}
}
/**
* The elapsed milliseconds, counting only the time between
* {@link #notifyStart()} and {@link #notifyEnd()}.
*/
public long getElapsedMillis() {
if (startTime == 0L)
return 0L;
if (endTime == 0L) {
return System.currentTimeMillis() - startTime;
}
return endTime - startTime;
}
/*
* Cumulative counters. These do not need to be protected by the lock as
* they do not guard transitions between workflow states.
*/
/**
* The #of documents that have been parsed (cumulative total).
*/
private final AtomicLong documentsParsedCount = new AtomicLong(0L);
/**
* The #of documents whose TIDs have been assigned (cumulative total).
*/
private final AtomicLong documentTIDsReadyCount = new AtomicLong(0L);
/**
* The #of documents that are waiting on their TIDs (current value). The
* counter is incremented when a document begins to buffer writes on the
* TERM2ID/BLOBS indices. The counter is decremented as soon as those writes are
* restart safe.
* <p>
* Note: The {@link #workflowLatch_bufferTids} is only decremented when
* the document begins to write on the other indices, so
* {@link #documentTIDsWaitingCount} will be decremented before
* {@link #workflowLatch_bufferTids}. The two counters will track very
* closely unless the {@link #otherWriterService} has a backlog.
* <p>
* Note: The {@link #workflowLatch_bufferOther} is decremented as soon as
* the writes on the other indices are restart safe since there is no
* transition to another workflow state. This is why there is no counter for
* "documentOtherWaitingCount".
*/
private final AtomicLong documentTIDsWaitingCount = new AtomicLong(0L);
/**
* The #of told triples parsed from documents using this factory and
* made restart safe on the database. Each time a document is made
* restart safe, this is incremented by the #of distinct told triples
* parsed from that document.
* <p>
* Note: The same triple can occur in more than one document, and
* documents having duplicate triples may be loaded by distributed
* clients. The actual #of triples on the database is only available by
* querying the database.
*/
private final AtomicLong toldTriplesRestartSafeCount = new AtomicLong();
/**
* The #of documents which have been fully processed and are
* restart-safe on the database (cumulative total).
*/
private final AtomicLong documentRestartSafeCount = new AtomicLong();
/**
* The #of documents for which the {@link BufferOtherWritesTask} failed.
*/
private final AtomicLong documentErrorCount = new AtomicLong();
/*
* Latches. The latches guard transitions between workflow states and must
* be protected by the lock.
*/
/**
* The {@link #lock} is used to make the observable state changes for the
* factory atomic and guards the termination conditions in {@link #close()}.
* You MUST own the {@link #lock} when incrementing or decrementing any of
* the {@link Latch}es. The {@link Latch} transitions must be accomplished
* while you are holding the lock. For example, the transition between
* <i>parsing</i> and <i>buffering TERM2ID/BLOBS writes</i> requires that we
* decrement one latch and increment the other while holding the {@link #lock}.
* <p>
* The counter associated with each {@link Latch} indicates the total #of
* documents associated with that workflow state but does not differentiate
* between documents waiting on the work queue for the corresponding thread
* pool (e.g., the {@link #parserService}), documents assigned to a worker
* thread and running in the thread pool, and documents waiting for some
* state change (e.g., the return from an asynchronous write) before they
* can be transferred to the next workflow state. However, you can gain
* additional information about the various thread pools from their
* counters, including their work queue size, the #of active tasks, etc.
*/
private final ReentrantLock lock = new ReentrantLock();
/**
* A global {@link Latch} guarding all documents which have been accepted for
* processing and have not yet reached an absorbing state (either an error
* state or been made restart safe).
*/
private final Latch workflowLatch_document = new Latch("document",lock);
/**
* A {@link Latch} guarding documents which have been accepted for parsing
* but have not been transferred to the {@link #workflowLatch_bufferTids}.
*
* @todo We could add a resolver latch for network IO required to buffer the
* document locally. E.g., a read from a DFS or a web page.
*/
private final Latch workflowLatch_parser = new Latch("parser", lock);
/**
* A {@link Latch} guarding documents that have begun buffering their
* writes on the TERM2ID/BLOBS indices but have not been transferred to the
* {@link #workflowLatch_bufferOther}.
*/
private final Latch workflowLatch_bufferTids = new Latch(
"bufferTids", lock);
/**
* A {@link Latch} guarding documents that have begun to buffer their writes
* on the other indices but have not yet completed their processing.
*/
private final Latch workflowLatch_bufferOther = new Latch("bufferOther",
lock);
/*
* Latches used to guard tasks buffering writes. There is one such latch for
* TERM2ID/BLOBS and one for the rest of the buffers. During close() we will close
* the buffers to flush their writes as soon as these latches hit zero.
*
* Note: These latches allow us to close the buffers in a timely manner. The
* other latches guard the workflow state transitions. However, if we do not
* close the buffers in a timely manner then close() will hang until a chunk
* or idle timeout (if any) causes the buffers to be flushed!
*/
/**
* {@link Latch} guarding tasks until they have buffered their writes on the
* TERM2ID/BLOBS indices. This latch is decremented as soon as the writes for a given
* document have been buffered. This is used to close the TERM2ID/BLOBS buffers in
* a timely manner in {@link #close()}.
*/
private final Latch guardLatch_term2Id = new Latch("guard_term2Id", lock);
/**
* {@link Latch} guarding tasks until they have buffered their writes on the
* remaining index buffers. This latch is decremented as soon as the writes
* for a given document have been buffered. This is used to close the other
* buffers in a timely manner in {@link #close()}.
*/
private final Latch guardLatch_other = new Latch("guard_other", lock);
/**
* {@link Latch} guarding the notify service until all notices have been
* delivered.
*/
private final Latch guardLatch_notify = new Latch("guard_notify", lock);
/*
* Parser service pause/resume.
*/
/**
* New parser tasks submitted to the {@link #parserService} will block when
* the {@link #unbufferedStatementCount} is GT this value. This is used to
* control the RAM demand of the parsed (but not yet buffered) statements.
* The RAM demand of the buffered statements is controlled by the capacity
* of the master and sink queues on which those statements are buffered.
*
* @todo it is possible that the buffered writes on term2id/blobs could limit
* throughput when the parser pool is paused since the decision to
* pause the parser pool is based on the #of unbuffered statements
* overall not just those staged for the term2id/blobs or the other indices.
*/
private final long pauseParserPoolStatementThreshold;
/**
* The #of statements which have been parsed but not yet written onto the
* asynchronous index write buffers. This is incremented by the #of distinct
* statements in a document once all statements for that document have been
* parsed. This is decremented when all statements for that document
* have been placed onto the asynchronous index write buffers, or if
* processing fails for that document. This is used to prevent new parser
* threads from overrunning the database when the parsers are faster than
* the database.
*/
private final AtomicLong unbufferedStatementCount = new AtomicLong();
/**
* The #of RDF {@link Statement}s that have been parsed but which are not
* yet restart safe on the database. This is incremented by the #of distinct
* statements in a document once all statements for that document have been
* parsed. This is decremented when all writes for that document have
* been made restart safe on the database, or if processing fails for that
* document. This may be used as a proxy for the amount of data which is
* unavailable for garbage collection and thus for the size of the heap
* entailed by processing.
*/
private final AtomicLong outstandingStatementCount = new AtomicLong();
/**
* In order to prevent runaway demand on RAM, new parser tasks must await
* this {@link Condition} if the #of parsed but not yet buffered statements
* is GTE the configured {@link #pauseParserPoolStatementThreshold}
* threshold.
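* <p>
* In outline, the guarded wait follows the usual lock/condition idiom (a
* sketch only; the actual check is performed by the parser thread pool):
*
* <pre>
* lock.lock();
* try {
*     while (unbufferedStatementCount.get() &gt;= pauseParserPoolStatementThreshold)
*         unpaused.await(); // signalled when enough statements have been buffered.
*         // InterruptedException handling omitted in this sketch.
* } finally {
*     lock.unlock();
* }
* </pre>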
*/
private Condition unpaused = lock.newCondition();
/**
* The #of threads which are currently paused awaiting the {@link #unpaused}
* {@link Condition}.
*/
private AtomicLong pausedThreadCount = new AtomicLong();
/**
* The #of times the {@link #parserService} has been paused.
*/
private AtomicLong poolPausedCount = new AtomicLong();
/**
* Verify counters for latches which must sum atomically to the
* {@link #workflowLatch_document}.
*/
private void assertSumOfLatchs() {
if(!lock.isHeldByCurrentThread())
throw new IllegalMonitorStateException();
/*
* Sum the latches for the distinct workflow states for a document
* across all documents.
*/
final long n1 = workflowLatch_parser.get()//
+ workflowLatch_bufferTids.get()//
+ workflowLatch_bufferOther.get()//
;
final long n2 = workflowLatch_document.get();
if (n1 != n2) {
throw new AssertionError("Sum of Latches=" + n1
+ ", but unfinished=" + n2 + " : "
+ getCounters().toString());
}
}
/**
* Bounded thread pool using a bounded work queue to run the parser tasks.
* If a backlog develops, then the thread pool is <em>paused</em>, and new
* tasks will not start until the backlog is cleared. This will cause the
* work queue to fill up, and the threads feeding that work queue to block.
* This is done to place bounds on the memory demands of the total pipeline.
*/
private final ParserThreadPoolExecutor parserService;
/**
* Bounded thread pool using an unbounded work queue to buffer writes for
* the TERM2ID/BLOBS indices (these are the indices which assign tids).
* Tasks are added to the work queue by the parser task in
* {@link AsynchronousStatementBufferImpl#flush()}.
*/
private final ThreadPoolExecutor tidsWriterService;
/**
* Bounded thread pool using an unbounded work queue to run
* {@link BufferOtherWritesTask}s. Tasks are added to the work queue by
* the "TIDs Ready" {@link KVOLatch}. Once the index writes have been
* buffered, the statement buffer is placed onto the
* {@link #docsWaitingQueue}. This {@link ExecutorService} MUST be
* unbounded since tasks will be assigned by {@link KVOLatch#signal()}
* and that method MUST NOT block.
*/
private final ThreadPoolExecutor otherWriterService;
/**
* Bounded thread pool with an unbounded work queue used to process per-file
* success or failure notices.
*/
private final ThreadPoolExecutor notifyService;
/**
* A {@link Runnable} that collects performance counters for the services
* used by the factory.
*/
private final ServiceStatisticsTask serviceStatisticsTask;
/**
* Return an estimate of the #of statements written on the indices.
* <p>
* This value is aggregated across any {@link IStatementBuffer} obtained
* from {@link #newStatementBuffer()} for this instance.
* <p>
* This value actually reports the #of statements written on the SPO index
* for the database. Statements are written asynchronously in chunks and the
* writes MAY proceed at different rates for each of the statement indices.
* The counter value will be stable once the {@link #awaitAll()} returns
* normally.
*
* @see SPOIndexWriteProc
*/
public long getStatementCount() {
return statementResultHandler.getResult().longValue();
}
/**
* The #of documents submitted to the factory which could not be processed
* due to some error.
*/
public long getDocumentErrorCount() {
return documentErrorCount.get();
}
/**
* The #of documents submitted to the factory which have been processed
* successfully.
*/
public long getDocumentDoneCount() {
return documentRestartSafeCount.get();
}
/**
* Note: do not invoke this directly. It does not know how to set the
* resource identifier on the statement buffer impl.
*/
public IStatementBuffer<S> newStatementBuffer() {
return newStatementBuffer(null/* resource */);
}
protected AsynchronousStatementBufferImpl newStatementBuffer(
final R resource) {
return new AsynchronousStatementBufferImpl(resource);
}
/**
* Submit a resource for processing.
*
* @param resource
* The resource (file or URL, but not a directory).
*
* @throws Exception
* if there is a problem creating the parser task.
* @throws RejectedExecutionException
* if the work queue for the parser service is full.
*/
public void submitOne(final R resource) throws Exception {
lock.lock();
try {
// Note: the parser task will obtain the lock when it runs.
final Callable<?> task = newParserTask(resource);
submitOne(resource, task);
} finally {
lock.unlock();
}
}
/**
* Inner method that allows the caller to allocate the task once when the
* caller will retry on a {@link RejectedExecutionException}.
*
* @param resource
* The resource (file or URL, but not a directory).
* @param task
* The parser task to run.
*
* @throws Exception
* if there is a problem creating the parser task.
* @throws RejectedExecutionException
* if the work queue for the parser service is full.
*/
private void submitOne(final R resource, final Callable<?> task)
throws Exception {
if (resource == null)
throw new IllegalArgumentException();
if (task == null)
throw new IllegalArgumentException();
lock.lock();
try {
assertSumOfLatchs();
notifyStart();
/*
* Note: The total processing of the documents will not terminate
* until this latch has been decremented back to zero.
*/
workflowLatch_document.inc();
workflowLatch_parser.inc();
assertSumOfLatchs();
try {
/*
* Submit resource for parsing.
*
* @todo it would be nice to return a Future here that tracked the
* document through the workflow.
*/
parserService.submit(task);
} catch (RejectedExecutionException ex) {
/*
* Back out the document since the task was not accepted for
* execution.
*/
// lock.lock();
// try {
assertSumOfLatchs();
workflowLatch_document.dec();
workflowLatch_parser.dec();
assertSumOfLatchs();
// } finally {
// lock.unlock();
// }
throw ex;
}
} finally {
lock.unlock();
}
}
/**
* Submit a resource for processing.
*
* @param resource
* The resource (file or URL, but not a directory).
* @param retryMillis
* The number of milliseconds to wait between retries when the
* parser service work queue is full. When ZERO (0L), a
* {@link RejectedExecutionException} will be thrown instead.
*
* @throws Exception
* if there is a problem creating the parser task.
* @throws RejectedExecutionException
* if the service is shut down -or- the retryMillis is ZERO (0L).
*/
public void submitOne(final R resource, final long retryMillis)
throws InterruptedException, Exception {
if (resource == null)
throw new IllegalArgumentException();
if (retryMillis < 0)
throw new IllegalArgumentException();
int retryCount = 0;
final long begin = System.currentTimeMillis();
long lastLogTime = begin;
// Note: the parser task will obtain the lock when it runs.
final Callable<?> task = newParserTask(resource);
while (true) {
try {
// submit resource for processing.
submitOne(resource, task);
return;
} catch (RejectedExecutionException ex) {
if(parserService.isShutdown()) {
// Do not retry since service is closed.
throw ex;
}
if (retryMillis == 0L) {
// Do not retry since the retry interval is 0L.
throw ex;
}
// sleep for the retry interval.
Thread.sleep(retryMillis);
retryCount++;
if (log.isInfoEnabled()) {
final long now = System.currentTimeMillis();
final long elapsedSinceLastLogTime = now - lastLogTime;
if (elapsedSinceLastLogTime > 5000) {
final long elapsed = now - begin;
lastLogTime = now;
log.info("Parser pool blocking: retryCount="
+ retryCount + ", elapsed=" + elapsed
+ "ms, resource=" + resource);
// log.info(getCounters().toString());
}
}
// retry
continue;
} catch (InterruptedException ex) {
throw ex;
} catch (Exception ex) {
log.error(resource, ex);
}
}
}
/**
* Submit all files in a directory for processing via
* {@link #submitOne(Object, long)}.
*
* @param fileOrDir
* The file or directory.
* @param filter
* An optional filter. Only the files selected by the filter will
* be processed.
* @param retryMillis
* The number of milliseconds to wait between retries when the
* parser service work queue is full. When ZERO (0L), a
* {@link RejectedExecutionException} will be thrown instead.
*
* @return The #of files that were submitted for processing.
*
* @throws Exception
*/
public int submitAll(final File fileOrDir, final FilenameFilter filter,
final long retryMillis)
throws Exception {
return new RunnableFileSystemLoader(fileOrDir, filter, retryMillis)
.call();
}
/**
* Open an input stream reading from the resource. If the resource
* ends with <code>.gz</code> or <code>.zip</code> then the appropriate
* decompression will be applied.
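* <p>
* A subclass may override this method to handle other resource types. A
* minimal sketch (the classpath-based resource type here is hypothetical):
*
* <pre>
* protected InputStream getInputStream(final R resource) throws IOException {
*     if (resource instanceof String) {
*         // hypothetical: resolve a String identifier against the classpath.
*         return getClass().getResourceAsStream((String) resource);
*     }
*     return super.getInputStream(resource);
* }
* </pre>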
*
* @param resource
* The resource identifier.
*
* @todo This will only read the first entry from a ZIP file. Archives need
* to be recognized as such by the driver and expanded into a sequence
* of parser calls with the input stream. That will require a
* different entry point since we can't close the
* {@link ZipInputStream} until we have read all the entries in that
* file. The {@link ZipInputStream} is likely not thread safe so the
* same parser thread would have to consume each of the entries even
* though they must also be dealt with as distinct documents. Given
* all that, reading more than the first entry might not be worth it.
*/
protected InputStream getInputStream(final R resource) throws IOException {
InputStream is;
if (resource instanceof File) {
is = new FileInputStream((File) resource);
final String name = ((File) resource).getName();
if (name.endsWith(".gz")) {
is = new GZIPInputStream(is);
} else if(name.endsWith(".zip")) {
is = new ZipInputStream(is);
}
} else if (resource instanceof URL) {
is = ((URL) resource).openStream();
} else {
throw new UnsupportedOperationException();
}
return is;
}
/**
* Return a task to parse the document. The task should allocate an
* {@link AsynchronousStatementBufferImpl} for the document. When
* that buffer is flushed, the document will be queued for further
* processing.
*
* @param resource
* The resource to be parsed.
* @return The task to execute.
*
* @throws Exception
*/
protected Callable<?> newParserTask(final R resource) throws Exception {
final String resourceStr = resource.toString();
if (log.isInfoEnabled())
log.info("resource=" + resourceStr);
final RDFFormat defaultFormat = getDefaultRDFFormat();
/* @todo This might be ignorant of .gz and .zip extensions.
* @todo When the resource is a URL, also use the reported MIME type.
*/
final RDFFormat rdfFormat = (defaultFormat == null //
? RDFFormat.forFileName(resourceStr) //
: RDFFormat.forFileName(resourceStr, defaultFormat)//
);
if (rdfFormat == null) {
final String msg = "Could not determine interchange syntax - skipping : file="
+ resource;
log.error(msg);
throw new RuntimeException(msg);
}
// Convert the resource identifier to a URL.
final String baseURI;
if (getClass().getResource(resourceStr) != null) {
baseURI = getClass().getResource(resourceStr).toURI()
.toString();
} else {
baseURI = new File(resourceStr).toURI().toString();
}
return new ParserTask(resource, baseURI, rdfFormat);
}
/**
* Task that either loads an RDF resource or verifies that the told triples found
* in that resource are present in the database. The difference between data
* load and data verify is just the behavior of the {@link IStatementBuffer}.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
*/
protected class ParserTask implements Callable<Void> {
/**
* The resource to be loaded.
*/
private final R resource;
/**
* The base URL for that resource.
*/
private final String baseURL;
/**
* The RDF interchange syntax that the file uses.
*/
private final RDFFormat rdfFormat;
/**
*
* @param resource
* The resource to be loaded (a plain file or URL, but not a
* directory).
* @param baseURL
* The base URL for that resource.
* @param rdfFormat
* The RDF interchange syntax that the file uses.
*/
public ParserTask(final R resource, final String baseURL,
final RDFFormat rdfFormat) {
if (resource == null)
throw new IllegalArgumentException();
if (baseURL == null)
throw new IllegalArgumentException();
this.resource = resource;
this.baseURL = baseURL;
this.rdfFormat = rdfFormat;
}
public Void call() throws Exception {
// Note: buffer will be passed along from queue to queue.
final AsynchronousStatementBufferImpl buffer = AsynchronousStatementBufferFactory.this
.newStatementBuffer(resource);
try {
// open reader on the file.
final InputStream rdfStream = getInputStream(resource);
try {
// Obtain a buffered reader on the input stream.
final Reader reader = new BufferedReader(new InputStreamReader(
rdfStream));
try {
// run the parser.
new PresortRioLoader(buffer).loadRdf(reader, baseURL,
rdfFormat, defaultGraph == null ? baseURL
: defaultGraph, parserOptions);
} finally {
reader.close();
}
} finally {
rdfStream.close();
}
lock.lock();
try {
// done parsing this document.
documentsParsedCount.incrementAndGet();
// new workflow state (code lifted from BufferTerm2IdWrites).
// lock.lock();
// try {
guardLatch_term2Id.inc();
// if(ENABLE_BLOBS)
// guardLatch_term2Id.inc();
workflowLatch_parser.dec();
workflowLatch_bufferTids.inc();
documentTIDsWaitingCount.incrementAndGet();
assertSumOfLatchs();
// } finally {
// lock.unlock();
// }
// queue tasks to buffer writes on TERM2ID/BLOBS indices.
tidsWriterService
.submit(new BufferTidWrites(buffer));
// if(ENABLE_BLOBS)
// tidsWriterService
// .submit(new BufferBlobsWrites(buffer));
// increment #of outstanding statements (parsed but not restart safe).
outstandingStatementCount.addAndGet(buffer.statementCount);
// increment #of unbuffered statements.
unbufferedStatementCount.addAndGet(buffer.statementCount);
} finally {
lock.unlock();
}
} catch (Throwable ex) {
// error state.
lock.lock();
try {
workflowLatch_parser.dec();
documentError(resource, ex);
throw new Exception(ex);
} finally {
lock.unlock();
}
}
// done.
if (log.isInfoEnabled())
log.info("resource=" + resource + " : " + this);
return null;
}
} // ParserTask
public String toString() {
return super.toString() + "::" + getCounters();
}
/**
*
* @param tripleStore
* @param producerChunkSize
* The chunk size used when writing chunks onto the master for
* the asynchronous index write API. If this value is on the
* order of the #of terms or statements in the parsed documents,
* then all terms / statements will be written onto the master in
* one chunk. The master will split the chunk based on the
* separator keys for the index partitions and write splits onto
* the sink for each index partition. The master and sink
* configuration is specified via the {@link IndexMetadata} when
* the triple store indices are created.
* @param valuesInitialCapacity
* The initial capacity of the map of the distinct RDF
* {@link Value}s parsed from a single document.
* @param bnodesInitialCapacity
* The initial capacity of the map of the distinct RDF
* {@link BNode}s parsed from a single document.
* @param defaultFormat
* The default {@link RDFFormat} which will be assumed.
* @param defaultGraph
* The value that will be used for the graph/context co-ordinate when
* loading data represented in a triple format into a quad store. If
* not given, then the context will be the resource identifier for
* the resource being parsed.
* @param parserOptions
* Options for the {@link RDFParser}.
* @param deleteAfter
* <code>true</code> if the resource should be deleted once the
* statements from that resource are restart safe on the target
* database.
* @param parserPoolSize
* The #of worker threads in the thread pool for parsing RDF
* documents.
* @param parserQueueCapacity
* The capacity of the bounded work queue for the service running
* the parser tasks.
* @param term2IdWriterPoolSize
* The #of worker threads in the thread pool for buffering
* asynchronous writes on the TERM2ID/BLOBS indices.
* @param otherWriterPoolSize
* The #of worker threads in the thread pool for buffering
* asynchronous index writes on the other indices.
* @param notifyPoolSize
* The #of worker threads in the thread pool for handling
* document success and document error notices.
* @param pauseParsedPoolStatementThreshold
* The maximum #of statements which can be parsed but not yet
* buffered before requests for new parser tasks are paused [0:
* {@link Long#MAX_VALUE}]. This allows you to place a constraint
* on the RAM of the parsers. The RAM demand of the asynchronous
* index write buffers is controlled by their master and sink
* queue capacity and chunk size.
*/
public AsynchronousStatementBufferFactory(//
final ScaleOutTripleStore tripleStore,//
final int producerChunkSize, //
final int valuesInitialCapacity,//
final int bnodesInitialCapacity, //
final RDFFormat defaultFormat,//
final String defaultGraph,//
final RDFParserOptions parserOptions,//
final boolean deleteAfter,//
final int parserPoolSize,//
final int parserQueueCapacity,//
final int term2IdWriterPoolSize,//
final int otherWriterPoolSize,//
final int notifyPoolSize,//
final long pauseParsedPoolStatementThreshold//
) {
if (tripleStore == null)
throw new IllegalArgumentException();
if (parserOptions == null)
throw new IllegalArgumentException();
if (producerChunkSize <= 0)
throw new IllegalArgumentException();
if (valuesInitialCapacity <= 0)
throw new IllegalArgumentException();
if (bnodesInitialCapacity <= 0)
throw new IllegalArgumentException();
if (pauseParsedPoolStatementThreshold < 0)
throw new IllegalArgumentException();
this.tripleStore = tripleStore;
this.lexiconRelation = tripleStore.getLexiconRelation();
this.spoRelation = tripleStore.getSPORelation();
this.producerChunkSize = producerChunkSize;
this.valuesInitialCapacity = valuesInitialCapacity;
this.bnodesInitialCapacity = bnodesInitialCapacity;
this.defaultFormat = defaultFormat;
this.defaultGraph = defaultGraph;
this.parserOptions = parserOptions;
this.deleteAfter = deleteAfter;
this.pauseParserPoolStatementThreshold = pauseParsedPoolStatementThreshold;
if (tripleStore.isStatementIdentifiers()) {
throw new UnsupportedOperationException("SIDs not supported");
}
/*
* Open the necessary buffers.
*
* Note: Lock is required by reopenBuffer_xxx() methods.
*/
lock.lock();
try {
// TERM2ID/ID2TERM
{
final AsynchronousIndexWriteConfiguration config = tripleStore
.getLexiconRelation().getTerm2IdIndex()
.getIndexMetadata()
.getAsynchronousIndexWriteConfiguration();
assertLiveness(lexiconRelation.getTerm2IdIndex()
.getIndexMetadata().getName(), config);
buffer_t2id = ((IScaleOutClientIndex) lexiconRelation
.getTerm2IdIndex())
.newWriteBuffer(
new Term2IdWriteProcAsyncResultHandler(false/* readOnly */),
new DefaultDuplicateRemover<BigdataValue>(true/* testRefs */),
new Term2IdWriteProcConstructor(
false/* readOnly */, lexiconRelation
.isStoreBlankNodes(),
lexiconRelation
.getTermIdBitsToReverse()));
buffer_id2t = ((IScaleOutClientIndex) lexiconRelation
.getId2TermIndex())
.newWriteBuffer(
null/* resultHandler */,
new DefaultDuplicateRemover<BigdataValue>(true/* testRefs */),
Id2TermWriteProcConstructor.INSTANCE);
}
// BLOBS
{
final AsynchronousIndexWriteConfiguration config = tripleStore
.getLexiconRelation().getBlobsIndex()
.getIndexMetadata()
.getAsynchronousIndexWriteConfiguration();
assertLiveness(lexiconRelation.getBlobsIndex()
.getIndexMetadata().getName(), config);
buffer_blobs = ((IScaleOutClientIndex) lexiconRelation
.getBlobsIndex())
.newWriteBuffer(
new BlobsWriteProcAsyncResultHandler(false/* readOnly */),
new DefaultDuplicateRemover<BigdataValue>(true/* testRefs */),
new BlobsWriteProcConstructor(
false/* readOnly */, lexiconRelation
.isStoreBlankNodes()));
}
// TEXT
{
if (lexiconRelation.isTextIndex()) {
/*
* FIXME Must hook in once the tids are available so we can
* tokenize the RDF Literals (Note: only the literals, and
* only those literals that will be indexed) and write out
* the tuples on the text index.
*
* TODO Unit tests. Must enable the full text index and must
* verify that both TermIds and BlobIVs were indexed. Inline
* Unicode IVs also need to be indexed (they are small,
* unless we change to [s] centric full text indexing).
*/
final BigdataValueCentricFullTextIndex tmp = (BigdataValueCentricFullTextIndex) lexiconRelation
.getSearchEngine();
buffer_text = ((IScaleOutClientIndex) tmp.getIndex()).newWriteBuffer(
textResultHandler,// counts tuples written on index
new DefaultDuplicateRemover<BigdataValue>(true/* testRefs */),
TextIndexWriteProc.IndexWriteProcConstructor.NO_OVERWRITE);
indexDatatypeLiterals = Boolean
.parseBoolean(lexiconRelation.getProperties().getProperty(
AbstractTripleStore.Options.TEXT_INDEX_DATATYPE_LITERALS,
AbstractTripleStore.Options.DEFAULT_TEXT_INDEX_DATATYPE_LITERALS));
} else {
buffer_text = null;
indexDatatypeLiterals = false;
}
}
/*
* STATEMENT INDICES
*
* Allocate and populate map with the SPOKeyOrders that we will be
* using.
*/
{
buffer_stmts = new LinkedHashMap<SPOKeyOrder, IRunnableBuffer<KVO<ISPO>[]>>(
tripleStore.isQuads() ? 6 : 3);
final Iterator<SPOKeyOrder> itr = tripleStore.getSPORelation()
.statementKeyOrderIterator();
while (itr.hasNext()) {
final SPOKeyOrder keyOrder = itr.next();
final IRunnableBuffer<KVO<ISPO>[]> buffer = ((IScaleOutClientIndex) spoRelation
.getIndex(keyOrder))
.newWriteBuffer(
keyOrder.isPrimaryIndex() ? statementResultHandler
: null,
new DefaultDuplicateRemover<ISPO>(true/* testRefs */),
SPOIndexWriteProc.IndexWriteProcConstructor.INSTANCE);
buffer_stmts.put(keyOrder, buffer);
}
}
} finally {
lock.unlock();
}
/*
* Set iff this is a federation based triple store. The various queue
* statistics are reported only for this case.
*/
final AbstractFederation<?> fed;
if (tripleStore.getIndexManager() instanceof AbstractFederation) {
fed = (AbstractFederation<?>) tripleStore.getIndexManager();
} else {
fed = null;
}
/*
* Note: This service must not reject tasks as long as the statement
* buffer factory is open. It is configured with a bounded workQueue and
* a bounded thread pool. The #of threads in the pool should build up to
* the maximumPoolSize and idle threads will be retired, but only after
* several minutes.
*/
parserService = new ParserThreadPoolExecutor(//
1, // corePoolSize
parserPoolSize, // maximumPoolSize
1, // keepAliveTime
TimeUnit.MINUTES, // keepAlive units.
new LinkedBlockingQueue<Runnable>(parserQueueCapacity),// workQueue
new DaemonThreadFactory(getClass().getName()+"_parserService") // threadFactory
);
/*
* Note: This service MUST NOT block or reject tasks as long as the
* statement buffer factory is open. It is configured with an
* unbounded workQueue and a bounded thread pool. The #of threads in
* the pool should build up to the maximumPoolSize and idle threads
* will be retired, but only after several minutes.
*
* Note: Since we are using an unbounded queue, at most corePoolSize
* threads will be created. Therefore we interpret the caller's argument
* as both the corePoolSize and the maximumPoolSize.
*/
tidsWriterService = new ThreadPoolExecutor(//
term2IdWriterPoolSize, // corePoolSize
term2IdWriterPoolSize, // maximumPoolSize
1, // keepAliveTime
TimeUnit.MINUTES, // keepAlive units.
new LinkedBlockingQueue<Runnable>(/* unbounded */),// workQueue
new DaemonThreadFactory(getClass().getName()+"_term2IdWriteService") // threadFactory
);
/*
* Note: This service MUST NOT block or reject tasks as long as the
* statement buffer factory is open. It is configured with an unbounded
* workQueue and a bounded thread pool. The #of threads in the pool
* should build up to the maximumPoolSize and idle threads will be
* retired, but only after several minutes.
*
* Note: Since we are using an unbounded queue, at most corePoolSize
* threads will be created. Therefore we interpret the caller's argument
* as both the corePoolSize and the maximumPoolSize.
*/
otherWriterService = new ThreadPoolExecutor(//
otherWriterPoolSize, // corePoolSize
otherWriterPoolSize, // maximumPoolSize
1, // keepAliveTime
TimeUnit.MINUTES, // keepAlive units.
new LinkedBlockingQueue<Runnable>(/* unbounded */),// workQueue
new DaemonThreadFactory(getClass().getName()+"_otherWriteService") // threadFactory
);
/*
* Note: This service MUST NOT block or reject tasks as long as the
* statement buffer factory is open. It is configured with an unbounded
* workQueue and a bounded thread pool. The #of threads in the pool
* should build up to the maximumPoolSize and idle threads will be
* retired, but only after several minutes.
*
* Note: Since we are using an unbounded queue, at most corePoolSize
* threads will be created. Therefore we interpret the caller's argument
* as both the corePoolSize and the maximumPoolSize.
*/
notifyService = new ThreadPoolExecutor(//
notifyPoolSize, // corePoolSize
notifyPoolSize, // maximumPoolSize
1, // keepAliveTime
TimeUnit.MINUTES, // keepAlive units.
new LinkedBlockingQueue<Runnable>(/* unbounded */),// workQueue
new DaemonThreadFactory(getClass().getName()+"_notifyService") // threadFactory
);
/*
* @todo If sampling should be done for non-federation cases then we
* need to pass in the ScheduledExecutorService, expose a method to
* start sampling on the caller's service, or create a
* ScheduledExecutorService within this factory class.
*/
serviceStatisticsTask = (fed == null ? null
: new ServiceStatisticsTask(fed.getScheduledExecutorService()));
} // ctor
/**
* Note: If there is a large sink idle timeout on the TERM2ID index then the
* sink will not flush itself automatically once its master is no longer
* pushing data. This situation can occur any time the parser pool is
* paused. A low sink idle timeout is required for the TERM2ID sink to flush
* its writes to the database, so the TIDs will be assigned, statements for
* the parsed documents will be buffered, and new parser threads can begin.
*
* @todo This should probably be automatically overridden for this use case.
* However, the asynchronous index configuration is not currently
* passed through with the requests but is instead global (on the
* IndexMetadata object for the index on the MDS).
*/
private static void assertLiveness(
final String name,
final AsynchronousIndexWriteConfiguration config) {
if (config.getSinkIdleTimeoutNanos() > TimeUnit.SECONDS.toNanos(60)) {
log.error("Large idle timeout will not preserve liveness: index="
+ name + ", config=" + config);
}
}
/**
* A {@link Runnable} that samples the services and provides reporting via
* {@link #getCounters()}.
*/
private class ServiceStatisticsTask implements Runnable {
private final Map<String, ThreadPoolExecutorBaseStatisticsTask> tasks = new LinkedHashMap<String, ThreadPoolExecutorBaseStatisticsTask>();
private final ScheduledFuture<?> serviceStatisticsFuture;
public ServiceStatisticsTask(final ScheduledExecutorService scheduledService) {
/*
* Add scheduled tasks to report the moving average of the queue
* length, active count, etc. for the various services used by this
* factory.
*/
tasks.put("parserService",
new ThreadPoolExecutorBaseStatisticsTask(parserService));
tasks.put("term2IdWriterService",
new ThreadPoolExecutorBaseStatisticsTask(
tidsWriterService));
tasks.put("otherWriterService",
new ThreadPoolExecutorBaseStatisticsTask(
otherWriterService));
tasks.put("notifyService",
new ThreadPoolExecutorBaseStatisticsTask(notifyService));
// schedule this task to sample performance counters.
serviceStatisticsFuture = scheduledService.scheduleWithFixedDelay(
this, 0/* initialDelay */, 1000/* delay */,
TimeUnit.MILLISECONDS);
}
protected void finalize() throws Exception {
cancel();
}
public void cancel() {
serviceStatisticsFuture.cancel(true/* mayInterruptIfRunning */);
}
public void run() {
for(Runnable r : tasks.values()) {
try {
r.run();
} catch(Throwable t) {
log.error(r,t);
}
}
}
public CounterSet getCounters() {
final CounterSet counterSet = new CounterSet();
for (Map.Entry<String, ThreadPoolExecutorBaseStatisticsTask> e : tasks
.entrySet()) {
counterSet.makePath(e.getKey()).attach(
e.getValue().getCounters());
}
return counterSet;
}
}
public boolean isAnyDone() {
/*
* Note: lock is required to make this test atomic with respect to
* re-opening of buffers.
*/
lock.lock();
try {
if (buffer_blobs != null)
if (buffer_blobs.getFuture().isDone())
return true;
if (buffer_t2id != null)
if (buffer_t2id.getFuture().isDone())
return true;
if (buffer_id2t.getFuture().isDone())
return true;
if (buffer_text != null)
if (buffer_text.getFuture().isDone())
return true;
for (Map.Entry<SPOKeyOrder, IRunnableBuffer<KVO<ISPO>[]>> e : buffer_stmts
.entrySet()) {
final IRunnableBuffer<KVO<ISPO>[]> buffer = e.getValue();
if (buffer != null && buffer.getFuture().isDone())
return true;
}
if (parserService.isTerminated())
return true;
if (tidsWriterService.isTerminated())
return true;
if (otherWriterService.isTerminated())
return true;
if (notifyService != null && notifyService.isTerminated())
return true;
return false;
} finally {
lock.unlock();
}
}
public void cancelAll(final boolean mayInterruptIfRunning) {
if(log.isInfoEnabled())
log.info("Cancelling futures.");
if (buffer_blobs != null)
buffer_blobs.getFuture().cancel(mayInterruptIfRunning);
if (buffer_t2id != null)
buffer_t2id.getFuture().cancel(mayInterruptIfRunning);
buffer_id2t.getFuture().cancel(mayInterruptIfRunning);
if (buffer_text != null)
buffer_text.getFuture().cancel(mayInterruptIfRunning);
for (Map.Entry<SPOKeyOrder, IRunnableBuffer<KVO<ISPO>[]>> e : buffer_stmts
.entrySet()) {
final IRunnableBuffer<KVO<ISPO>[]> buffer = e.getValue();
if (buffer != null)
buffer.getFuture().cancel(mayInterruptIfRunning);
}
notifyEnd();
}
/**
* Awaits a signal that all documents which have queued writes are
* finished and then closes the remaining buffers.
*/
public void close() {
log.info("");
try {
lock.lockInterruptibly();
try {
assertSumOfLatchs();
// not decremented until doc fails parse or is doing TERM2ID writes.
workflowLatch_parser.await();
assertSumOfLatchs();
/*
* No more tasks will request TIDs, so close the TERM2ID and
* BLOBS masters. They will flush their writes.
*/
guardLatch_term2Id.await();
{
if (buffer_t2id != null) {
if (log.isInfoEnabled()) {
log.info("Closing TERM2ID buffer.");
}
buffer_t2id.close();
}
if (buffer_blobs != null) {
if (log.isInfoEnabled()) {
log.info("Closing BLOBS buffer.");
}
buffer_blobs.close();
}
workflowLatch_bufferTids.await();
tidsWriterService.shutdown();
new ShutdownHelper(tidsWriterService, 10L,
TimeUnit.SECONDS) {
protected void logTimeout() {
log.warn("Waiting for term2Id write service shutdown.");
}
};
assertSumOfLatchs();
}
/*
* No new index write tasks may start (and all should have
* terminated by now).
*/
guardLatch_other.await();
{
if (log.isInfoEnabled())
log.info("Closing remaining buffers.");
buffer_id2t.close();
if (buffer_text != null)
buffer_text.close();
for (Map.Entry<SPOKeyOrder, IRunnableBuffer<KVO<ISPO>[]>> e : buffer_stmts
.entrySet()) {
final IRunnableBuffer<KVO<ISPO>[]> buffer = e.getValue();
if(buffer!=null)
buffer.close();
}
workflowLatch_bufferOther.await();
otherWriterService.shutdown();
new ShutdownHelper(otherWriterService, 10L, TimeUnit.SECONDS) {
protected void logTimeout() {
log.warn("Waiting for other write service shutdown.");
}
};
assertSumOfLatchs();
}
// wait for the global latch.
workflowLatch_document.await();
assertSumOfLatchs();
if (notifyService != null) {
// wait until no notifications are pending.
guardLatch_notify.await();
// note: shutdown should be immediate since nothing should
// be pending.
notifyService.shutdown();
new ShutdownHelper(notifyService, 10L, TimeUnit.SECONDS) {
protected void logTimeout() {
log.warn("Waiting for delete service shutdown.");
}
};
}
} finally {
lock.unlock();
notifyEnd();
}
} catch (InterruptedException ex) {
// @todo should declare this exception in the API.
throw new RuntimeException(ex);
}
}
public void awaitAll() throws InterruptedException, ExecutionException {
if(log.isInfoEnabled())
log.info("Start");
// Close the asynchronous write buffers.
close();
// Await futures for the asynchronous write buffers.
if(log.isInfoEnabled())
log.info("Awaiting futures.");
if (buffer_blobs != null)
buffer_blobs.getFuture().get();
if (buffer_t2id != null)
buffer_t2id.getFuture().get();
buffer_id2t.getFuture().get();
if (buffer_text != null)
buffer_text.getFuture().get();
for (Map.Entry<SPOKeyOrder, IRunnableBuffer<KVO<ISPO>[]>> e : buffer_stmts
.entrySet()) {
final IRunnableBuffer<KVO<ISPO>[]> buffer = e.getValue();
if (buffer != null) {
buffer.getFuture().get();
}
}
if(log.isInfoEnabled())
log.info("Done.");
}
/**
* Invoked after a document has become restart safe. If
* {@link #newSuccessTask(Object)} returns a {@link Runnable} then that will
* be executed on the {@link #notifyService}.
*
* @param resource
* The document identifier.
*/
final protected void documentDone(final R resource) {
if (!lock.isHeldByCurrentThread())
throw new IllegalMonitorStateException();
try {
final Runnable task = newSuccessTask(resource);
if (task != null) {
// increment before we submit the task.
// lock.lock();
// try {
guardLatch_notify.inc();
// } finally {
// lock.unlock();
// }
try {
// queue up success notice.
notifyService.submit(new Runnable() {
public void run() {
try {
task.run();
} finally {
lock.lock(); // acquire latch w/in task.
try {
// decrement after the task is done.
guardLatch_notify.dec();
} finally {
lock.unlock();
}
}
}
});
} catch (RejectedExecutionException ex) {
// decrement latch since tasks did not run.
// lock.lock();
// try {
guardLatch_notify.dec();
// } finally {
// lock.unlock();
// }
// rethrow exception (will be logged below).
throw ex;
}
}
} catch (Throwable t) {
// Log @ ERROR and ignore.
log.error(t, t);
}
}
/**
* Invoked after a document has failed. If
* {@link #newFailureTask(Object, Throwable)} returns a {@link Runnable}
* then that will be executed on the {@link #notifyService}.
*
* @param resource
* The document identifier.
* @param t
* The exception.
*/
final protected void documentError(final R resource, final Throwable t) {
if (!lock.isHeldByCurrentThread()) throw new IllegalMonitorStateException();
documentErrorCount.incrementAndGet();
/*
* Note: this is responsible for decrementing the #of documents whose
* processing is not yet complete. This must be done for each task whose
* future is not watched. However, we MUST NOT do this twice for any
* given document since that would mess with the counter. That counter
* is critical as it forms part of the termination condition for the
* total data load operation.
*/
workflowLatch_document.dec();
try {
final Runnable task = newFailureTask(resource, t);
if (task != null) {
// increment before we submit the task.
guardLatch_notify.inc();
try {
// queue up failure notice.
notifyService.submit(new Runnable() {
public void run() {
try {
task.run();
} finally {
lock.lock(); // acquire latch w/in task.
try {
// decrement after the task is done.
guardLatch_notify.dec();
} finally {
lock.unlock();
}
}
}
});
} catch (RejectedExecutionException ex) {
// decrement latch since tasks did not run.
// lock.lock();
// try {
guardLatch_notify.dec();
// } finally {
// lock.unlock();
// }
// rethrow exception (will be logged below).
throw ex;
}
}
} catch (Throwable ex) {
log.error(ex, ex);
}
}
/**
* Return the optional task to be executed for a resource which has been
* successfully processed and whose assertions are now restart safe on the
* database. The task, if any, will be run on the {@link #notifyService}.
* <p>
* The default implementation runs a {@link DeleteTask} IFF
* <i>deleteAfter</i> was specified as <code>true</code> to the ctor and
* otherwise returns <code>null</code>. The event is logged @ INFO.
*
* @param resource
* The resource.
*
* @return The task to run -or- <code>null</code> if no task should be run.
*/
protected Runnable newSuccessTask(final R resource) {
if (log.isInfoEnabled())
log.info("resource=" + resource);
if (deleteAfter) {
return new DeleteTask(resource);
}
return null;
}
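/*
* Illustrative sketch (hypothetical, not shipped behavior): a subclass
* working with File resources could override newSuccessTask() to archive
* a successfully loaded file rather than delete it. The "archiveDir"
* field below is assumed.
*
*   protected Runnable newSuccessTask(final File resource) {
*       return new Runnable() {
*           public void run() {
*               if (!resource.renameTo(new File(archiveDir, resource.getName())))
*                   log.warn("Could not archive: " + resource);
*           }
*       };
*   }
*/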
/**
* Return the optional task to be executed for a resource for which
* processing has failed. The task, if any, will be run on the
* {@link #notifyService}.
* <p>
* The default implementation logs a message @ ERROR.
*
* @param resource
* The resource.
* @param cause
* The cause.
*
* @return The task to run -or- <code>null</code> if no task should be
* run.
*/
protected Runnable newFailureTask(final R resource, final Throwable cause) {
return new Runnable() {
public void run() {
log.error(resource, cause);
}
};
}
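/*
* Illustrative sketch (hypothetical): a subclass could override
* newFailureTask() to queue the resource for a later retry in addition
* to logging the error. The "retryQueue" collection is assumed.
*
*   protected Runnable newFailureTask(final File resource, final Throwable cause) {
*       return new Runnable() {
*           public void run() {
*               log.error(resource, cause);
*               retryQueue.add(resource);
*           }
*       };
*   }
*/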
/**
* Task deletes a resource from the local file system.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
*/
protected class DeleteTask implements Runnable {
private final R resource;
public DeleteTask(final R resource) {
if (resource == null)
throw new IllegalArgumentException();
this.resource = resource;
}
public void run() {
deleteResource(resource);
}
}
/**
* Delete a file from the local file system once its data have been made
* restart safe on the database (this must be overridden to handle
* resources which are not {@link File}s).
*
* @param resource
* The resource.
*/
protected void deleteResource(final R resource) {
if(resource instanceof File) {
if (!((File) resource).delete()) {
log.warn("Could not delete: " + resource);
}
}
}
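/**
* Return a {@link CounterSet} reporting on the progress of this factory,
* including the elapsed time, the document workflow counters, the
* per-latch state, the parser pause counters, and the statistics for the
* internal services.
*/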
public CounterSet getCounters() {
final CounterSet counterSet = new CounterSet();
/**
* The elapsed milliseconds.
*/
counterSet.addCounter("elapsedMillis", new Instrument<Long>() {
@Override
protected void sample() {
setValue(getElapsedMillis());
}
});
/**
* The #of documents that have been parsed.
*/
counterSet.addCounter("documentsParsedCount", new Instrument<Long>() {
@Override
protected void sample() {
setValue(documentsParsedCount.get());
}
});
/**
* The #of documents whose TERM2ID/BLOBS writes have begun to be
* buffered but are not yet restart-safe on the database.
*/
counterSet.addCounter("documentTIDsWaitingCount", new Instrument<Long>() {
@Override
protected void sample() {
setValue(documentTIDsWaitingCount.get());
}
});
/**
* The #of documents whose TERM2ID/BLOBS writes are restart-safe on the
* database.
*/
counterSet.addCounter("documentTIDsReadyCount", new Instrument<Long>() {
@Override
protected void sample() {
setValue(documentTIDsReadyCount.get());
}
});
/**
* The #of tuples written on the full text index (this does not count
* triples that were already present on the index).
*/
counterSet.addCounter("fullTextTupleWriteCount", new Instrument<Long>() {
@Override
protected void sample() {
setValue(textResultHandler.getResult().longValue());
}
});
/**
* The #of triples written on the SPO index (this does not count triples
* that were already present on the index).
*/
counterSet.addCounter("toldTriplesWriteCount", new Instrument<Long>() {
@Override
protected void sample() {
setValue(getStatementCount());
}
});
/**
* The #of told triples parsed from documents using this factory and
* made restart safe on the database. This is incremented each time a
* document has been made restart safe by the #of distinct told triples
* parsed from that document.
* <p>
* Note: The same triple can occur in more than one document, and
* documents having duplicate triples may be loaded by distributed
* clients. The actual #of triples on the database is only available by
* querying the database.
*/
counterSet.addCounter("toldTriplesRestartSafeCount",
new Instrument<Long>() {
@Override
protected void sample() {
setValue(toldTriplesRestartSafeCount.get());
}
});
/**
* The rate (told triples per second) at which triples have been made
* restart safe by this factory object. When loading with multiple
* clients, the total rate is the aggregate across all of those instances.
*/
counterSet.addCounter("toldTriplesRestartSafePerSec",
new Instrument<Long>() {
@Override
protected void sample() {
final long elapsed = getElapsedMillis();
// Guard against division by zero immediately after startup.
final long tps = elapsed == 0 ? 0L
: (long) (((double) toldTriplesRestartSafeCount.get())
/ ((double) elapsed) * 1000d);
setValue(tps);
}
});
/**
* The #of documents which have been processed by this client and whose
* writes are now restart safe on the database.
*/
counterSet.addCounter("documentRestartSafeCount",
new Instrument<Long>() {
@Override
protected void sample() {
setValue(documentRestartSafeCount.get());
}
});
/**
* The #of documents for which the buffered index write task failed.
*/
counterSet.addCounter("documentErrorCount", new Instrument<Long>() {
@Override
protected void sample() {
setValue(documentErrorCount.get());
}
});
/*
* The latches are used to guard the termination conditions for the
* factory. If they are non-zero the factory can not terminate normally.
*/
{
final CounterSet workflowLatchSet = counterSet.makePath("workflowLatch");
workflowLatchSet.addCounter("parser", new Instrument<Long>() {
@Override
protected void sample() {
setValue(workflowLatch_parser.get());
}
});
workflowLatchSet.addCounter("bufferTids", new Instrument<Long>() {
@Override
protected void sample() {
setValue(workflowLatch_bufferTids.get());
}
});
workflowLatchSet.addCounter("bufferOther", new Instrument<Long>() {
@Override
protected void sample() {
setValue(workflowLatch_bufferOther.get());
}
});
// latch over the total life cycle for a document.
workflowLatchSet.addCounter("document", new Instrument<Long>() {
@Override
protected void sample() {
setValue(workflowLatch_document.get());
}
});
} // latches
/**
* Latches used to guard the buffers and close them in a timely manner.
*/
{
final CounterSet bufferGuardSet = counterSet.makePath("bufferGuard");
bufferGuardSet.addCounter("guardTerm2Id", new Instrument<Long>() {
@Override
protected void sample() {
setValue(guardLatch_term2Id.get());
}
});
bufferGuardSet.addCounter("guardOther", new Instrument<Long>() {
@Override
protected void sample() {
setValue(guardLatch_other.get());
}
});
bufferGuardSet.addCounter("guardNotify", new Instrument<Long>() {
@Override
protected void sample() {
setValue(guardLatch_notify.get());
}
});
}
/*
* Counters pertaining to the logic which suspends new parser task
* requests if too many statements are currently buffered.
*/
{
final CounterSet pauseSet = counterSet.makePath("pause");
/*
* The #of parsed or buffered RDF Statements not yet restart safe
* (current value).
*/
pauseSet.addCounter("outstandingStatementCount",
new Instrument<Long>() {
@Override
protected void sample() {
setValue(outstandingStatementCount.get());
}
});
/*
* The #of parsed but not yet buffered RDF Statements (current
* value).
*/
pauseSet.addCounter("unbufferedStatementCount",
new Instrument<Long>() {
@Override
protected void sample() {
setValue(unbufferedStatementCount.get());
}
});
/*
* The maximum #of statements parsed but not yet buffered before we
* suspend new parse requests.
*/
pauseSet.addCounter("pauseParserPoolStatementThreshold",
new OneShotInstrument<Long>(
pauseParserPoolStatementThreshold));
// The #of suspended parse request threads (current value).
pauseSet.addCounter("pausedThreadCount", new Instrument<Long>() {
@Override
protected void sample() {
setValue(pausedThreadCount.get());
}
});
// The #of suspended parse request threads (cumulative).
pauseSet.addCounter("poolPausedCount", new Instrument<Long>() {
@Override
protected void sample() {
setValue(poolPausedCount.get());
}
});
}
// services
{
counterSet.makePath("services").attach(
serviceStatisticsTask.getCounters());
}
// if(log.isInfoEnabled())
// { // @todo this is just for debugging problems with parser blocking.
//
// final String fqn = tripleStore.getLexiconRelation().getFQN(
// LexiconKeyOrder.TERM2ID);
//
// counterSet.makePath("TERM2ID").attach(
// ((AbstractFederation) tripleStore.getIndexManager())
// .getIndexCounters(fqn).getCounters());
//
// }
return counterSet;
}
/**
* {@link Callable} applies the factory to either a single file or to
* all files within a directory.
*/
private class RunnableFileSystemLoader implements Callable<Integer> {
// volatile boolean done = false;
private int count = 0;
// private long retryCount = 0L;
final File fileOrDir;
final FilenameFilter filter;
final long retryMillis;
/**
*
* @param fileOrDir
* The file or directory to be loaded.
* @param filter
* An optional filter on files that will be accepted when
* processing a directory.
* @param retryMillis
* The number of milliseconds to wait between retries when the
* parser service work queue is full. When ZERO (0L), a
* {@link RejectedExecutionException} will be thrown
* instead.
*/
public RunnableFileSystemLoader(final File fileOrDir,
final FilenameFilter filter, final long retryMillis) {
if (fileOrDir == null)
throw new IllegalArgumentException();
if (retryMillis < 0)
throw new IllegalArgumentException();
this.fileOrDir = fileOrDir;
this.filter = filter; // MAY be null.
this.retryMillis = retryMillis;
}
/**
* Creates a task using the {@link #taskFactory}, submits it to the
* {@link #loader} and waits for the task to complete. Errors are
* logged, but not thrown.
*
* @throws RuntimeException
* if interrupted.
*/
public Integer call() throws Exception {
process2(fileOrDir);
return count;
}
/**
* Scans file(s) recursively starting with the named file, and, for each
* file that passes the filter, submits the task.
*
* @param file
* Either a plain file or a directory containing files to be
* processed.
*
* @throws InterruptedException
* if the thread is interrupted while queuing tasks.
*/
private void process2(final File file) throws InterruptedException {
if (file.isHidden()) {
// ignore hidden files.
return;
}
if (file.isDirectory()) {
if (log.isInfoEnabled())
log.info("Scanning directory: " + file);
// filter is optional.
final File[] files = filter == null ? file.listFiles() : file
.listFiles(filter);
if (files == null) {
// listFiles() returns null if an I/O error occurs.
log.warn("Could not list directory: " + file);
return;
}
for (final File f : files) {
process2(f);
}
} else {
/*
* Processing a standard file.
*/
if(log.isInfoEnabled())
log.info("Will load: "+file);
try {
submitOne((R) file, retryMillis);
count++;
return;
} catch (InterruptedException ex) {
throw ex;
} catch (Exception ex) {
log.error(file, ex);
}
}
}
}
/**
* Class applies the term identifiers assigned by the
* {@link Term2IdWriteProc} to the {@link BigdataValue} references in the
* {@link KVO} correlated with each {@link Split} of data processed by that
* procedure.
* <p>
* Note: Of necessity, this requires access to the {@link BigdataValue}s
* whose term identifiers are being resolved. This implementation presumes
* that the array specified to the ctor and the array returned for each
* chunk that is processed have correlated indices and that the offset into
* the array is given by {@link Split#fromIndex}.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
*/
static private class Term2IdWriteProcAsyncResultHandler
implements
IAsyncResultHandler<Term2IdWriteProc.Result, Void, BigdataValue, KVO<BigdataValue>> {
private final boolean readOnly;
/**
*
* @param readOnly
* if readOnly was specified for the {@link Term2IdWriteProc}
* .
*/
public Term2IdWriteProcAsyncResultHandler(final boolean readOnly) {
this.readOnly = readOnly;
}
/**
* NOP
*
* @see #aggregateAsync(KVO[],
* com.bigdata.rdf.lexicon.Term2IdWriteProc.Result, Split)
*/
public void aggregate(final Term2IdWriteProc.Result result,
final Split split) {
}
/**
* Copy the assigned / discovered term identifiers onto the
* corresponding elements of the terms[].
*/
public void aggregateAsync(final KVO<BigdataValue>[] chunk,
final Term2IdWriteProc.Result result, final Split split) {
for (int i = 0; i < chunk.length; i++) {
@SuppressWarnings("rawtypes")
final IV iv = result.ivs[i];
if (iv == null) {
if (!readOnly)
throw new AssertionError();
} else {
// assign the term identifier.
chunk[i].obj.setIV(iv);
if (chunk[i] instanceof KVOList) {
final KVOList<BigdataValue> tmp = (KVOList<BigdataValue>) chunk[i];
if (!tmp.isDuplicateListEmpty()) {
// assign the term identifier to the duplicates.
tmp.map(new AssignTermId(iv));
}
}
if (log.isDebugEnabled()) {
log.debug("termId=" + iv + ", term=" + chunk[i].obj);
}
}
}
}
public Void getResult() {
return null;
}
}
/**
* Class applies the term identifiers assigned by the
* {@link BlobsWriteProc} to the {@link BigdataValue} references in the
* {@link KVO} correlated with each {@link Split} of data processed by that
* procedure.
* <p>
* Note: Of necessity, this requires access to the {@link BigdataValue}s
* whose term identifiers are being resolved. This implementation presumes
* that the array specified to the ctor and the array returned for each
* chunk that is processed have correlated indices and that the offset into
* the array is given by {@link Split#fromIndex}.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
*/
static private class BlobsWriteProcAsyncResultHandler
implements
IAsyncResultHandler<BlobsWriteProc.Result, Void, BigdataValue, KVO<BigdataValue>> {
private final boolean readOnly;
/**
*
* @param readOnly
* if readOnly was specified for the {@link BlobsWriteProc}
* .
*/
public BlobsWriteProcAsyncResultHandler(final boolean readOnly) {
this.readOnly = readOnly;
}
/**
* NOP
*
* @see #aggregateAsync(KVO[],
* com.bigdata.rdf.lexicon.BlobsWriteProc.Result, Split)
*/
public void aggregate(final BlobsWriteProc.Result result,
final Split split) {
}
/**
* Copy the assigned / discovered term identifiers onto the
* corresponding elements of the terms[].
*/
public void aggregateAsync(final KVO<BigdataValue>[] chunk,
final BlobsWriteProc.Result result, final Split split) {
for (int i = 0; i < chunk.length; i++) {
final int counter = result.counters[i];
if (counter == BlobsIndexHelper.NOT_FOUND) {
if (!readOnly)
throw new AssertionError();
} else {
// The value whose IV we have discovered/asserted.
final BigdataValue value = chunk[i].obj;
// Rebuild the IV.
@SuppressWarnings("rawtypes")
final BlobIV<?> iv = new BlobIV(VTE.valueOf(value), value
.hashCode(), (short) counter);
// assign the term identifier.
value.setIV(iv);
if (chunk[i] instanceof KVOList) {
final KVOList<BigdataValue> tmp = (KVOList<BigdataValue>) chunk[i];
if (!tmp.isDuplicateListEmpty()) {
// assign the term identifier to the duplicates.
tmp.map(new AssignTermId(iv));
}
}
if (log.isDebugEnabled()) {
log.debug("termId=" + iv + ", term=" + chunk[i].obj);
}
}
}
}
public Void getResult() {
return null;
}
}
/**
* Wrap a {@link BigdataValue}[] with a chunked iterator.
* <p>
* Note: This resolves inline {@link IV}s and filters them out of the
* visited {@link BigdataValue}s as a side-effect.
*/
@SuppressWarnings({ "unchecked", "rawtypes" })
static <V extends BigdataValue> IChunkedIterator<V> newValuesIterator(
final LexiconRelation r,
final Iterator<V> itr, final int chunkSize) {
return new ChunkedWrappedIterator(new Striterator(itr)
.addFilter(new Filter() {
private static final long serialVersionUID = 1L;
@Override
public boolean isValid(final Object obj) {
/*
* Assigns the IV as a side effect iff the RDF Value can
* be inlined according to the governing lexicon
* configuration and returns true iff the value CAN NOT
* be inlined. Thus, inlining is done as a side effect
* while the caller sees only those Values which need to
* be written onto the TERM2ID/BLOBS index.
*/
return r.getInlineIV((Value) obj) == null;
}
}), chunkSize, BigdataValue.class);
}
/**
* Wrap a {@link BigdataValue}[] with a chunked iterator which filters out
* blank nodes and blobs (neither of which is written onto the reverse
* index).
*/
@SuppressWarnings({ "unchecked", "rawtypes" })
private static <V extends BigdataValue> IChunkedIterator<V> newId2TIterator(
final LexiconRelation r, final Iterator<V> itr, final int chunkSize) {
return new ChunkedWrappedIterator(new Striterator(itr)
.addFilter(new Filter() {
private static final long serialVersionUID = 1L;
/*
* Filter hides blank nodes since we do not write them onto
* the reverse index.
*
* Filter does not visit blobs since we do not want to write
* those onto the reverse index either.
*/
@Override
public boolean isValid(final Object obj) {
final BigdataValue v = (BigdataValue) obj;
if (v instanceof BNode)
return false;
if (r.isBlob(v))
return false;
return true;
}
}), chunkSize, BigdataValue.class);
}
/**
* Return iterator visiting only the {@link BigdataLiteral}s that we want
* to write on the full text index.
* @param r
* @param itr
* @param chunkSize
* @return
*/
@SuppressWarnings({ "unchecked", "rawtypes" })
private static <V extends BigdataValue> IChunkedIterator<V> newTextIterator(
final LexiconRelation r, final Iterator<V> itr,
final int chunkSize, final boolean indexDatatypeLiterals) {
return new ChunkedWrappedIterator(
new Striterator(itr).addFilter(new Filter() {
private static final long serialVersionUID = 1L;
/*
* Filter hides blank nodes since we do not write them onto
* the TEXT index.
*/
@Override
public boolean isValid(final Object obj) {
if (!(obj instanceof BigdataLiteral)) {
// Only index Literals.
return false;
}
final BigdataLiteral lit = (BigdataLiteral) obj;
if (!indexDatatypeLiterals && lit.getDatatype() != null) {
// Ignore datatype literals.
return false;
}
return true;
}
}), chunkSize, BigdataValue.class);
}
/**
* Asynchronous writes on the TERM2ID and BLOBS indices.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
* Thompson</a>
*
* @todo something similar for the SIDs
*/
static class AsyncTerm2IdIndexWriteTask implements Callable<Void> {
final protected transient static Logger log = Logger
.getLogger(AsyncTerm2IdIndexWriteTask.class);
private final KVOLatch latch;
private final IChunkedIterator<BigdataValue> src;
private final LexiconRelation lexiconRelation;
private final Term2IdTupleSerializer tupleSerTerm2Id;
// private final BlobsTupleSerializer tupleSerBlobs;
private final IRunnableBuffer<KVO<BigdataValue>[]> bufferTerm2Id;
private final IRunnableBuffer<KVO<BigdataValue>[]> bufferBlobs;
/**
*
* @param latch
* @param r
* @param src
* An iterator visiting chunks of distinct {@link Value}s.
* @param bufferTerm2Id
* @param bufferBlobs
*/
public AsyncTerm2IdIndexWriteTask(final KVOLatch latch,
final LexiconRelation r,
final IChunkedIterator<BigdataValue> src,
final IRunnableBuffer<KVO<BigdataValue>[]> bufferTerm2Id,
final IRunnableBuffer<KVO<BigdataValue>[]> bufferBlobs) {
if (latch == null)
throw new IllegalArgumentException();
if (r == null)
throw new IllegalArgumentException();
if (src == null)
throw new IllegalArgumentException();
if (bufferTerm2Id == null && bufferBlobs == null)
throw new IllegalArgumentException();
this.latch = latch;
this.lexiconRelation = r;
this.tupleSerTerm2Id = bufferTerm2Id == null ? null
: (Term2IdTupleSerializer) r
.getIndex(LexiconKeyOrder.TERM2ID)
.getIndexMetadata().getTupleSerializer();
this.src = src;
this.bufferTerm2Id = bufferTerm2Id;
this.bufferBlobs = bufferBlobs;
}
/**
* Return <code>true</code> if the {@link BigdataValue} will be stored
* against the BLOBS index.
*/
private boolean isBlob(final BigdataValue v) {
return lexiconRelation.isBlob(v);
}
// /**
// * Return <code>true</code> iff the {@link BigdataValue} is fully inline
// * (in which case the {@link IV} is set as a side-effect on the
// * {@link BigdataValue}).
// */
// private boolean isInline(final BigdataValue v) {
//
// return lexiconRelation.getInlineIV(v) != null;
//
// }
/**
* Reshapes the {@link #src} into {@link KVOC}[]s a chunk at a time and
* submits each chunk to the write buffer for the TERM2ID index.
*/
public Void call() throws Exception {
/*
* This is a thread-local instance, which is why we defer obtaining
* this object until call() is executing.
*/
final LexiconKeyBuilder keyBuilderTerm2Id = bufferTerm2Id == null ? null
: tupleSerTerm2Id.getLexiconKeyBuilder();
// BLOBS stuff.
final BigdataValueSerializer<BigdataValue> valSer = lexiconRelation
.getValueFactory().getValueSerializer();
final BlobsIndexHelper h = new BlobsIndexHelper();
final IKeyBuilder keyBuilder = h.newKeyBuilder();
final DataOutputBuffer out = new DataOutputBuffer(512);
final ByteArrayBuffer tmp = new ByteArrayBuffer(512);
latch.inc();
try {
List<KVOC<BigdataValue>> terms = null;
List<KVOC<BigdataValue>> blobs = null;
while (src.hasNext()) {
final BigdataValue[] chunkIn = src.nextChunk();
for (BigdataValue v : chunkIn) {
/*
* Note: The iterator we are visiting has already had
* the IVs for fully inline Values resolved and set as a
* side-effect and the inline Values have been filtered
* out. We will only see non-inline values here, but
* they may wind up as TermIds or BlobIVs.
*/
// if (isInline(v)) {
// // Immediately resolve the IV via a side-effect.
// System.err.println("inline: "+v);
// continue;
// }
if (bufferBlobs != null && isBlob(v)) {
final byte[] key = h.makePrefixKey(keyBuilder.reset(), v);
final byte[] val = valSer.serialize(v, out.reset(), tmp);
if (blobs == null) {
// Lazily allocate.
blobs = new ArrayList<KVOC<BigdataValue>>();
}
// Assign a sort key to each Value.
blobs.add(new KVOC<BigdataValue>(key, val, v, latch));
// System.err.println("blob : "+v);
} else {
if (terms == null) {
// Lazily allocate to chunkSize.
terms = new ArrayList<KVOC<BigdataValue>>(
chunkIn.length);
}
// Assign a sort key to each Value.
terms.add(new KVOC<BigdataValue>(keyBuilderTerm2Id
.value2Key(v), null/* val */, v, latch));
// System.err.println("term : "+v);
}
}
if (terms != null && !terms.isEmpty()) {
@SuppressWarnings("unchecked")
final KVOC<BigdataValue>[] a = terms
.toArray(new KVOC[terms.size()]);
// Place in KVO sorted order (by the byte[] keys).
Arrays.sort(a);
if (log.isInfoEnabled())
log.info("Adding chunk to TERM2ID master: chunkSize="
+ a.length);
// add chunk to async write buffer
bufferTerm2Id.add(a);
// Clear list.
terms.clear();
}
if (blobs != null && !blobs.isEmpty()) {
@SuppressWarnings("unchecked")
final KVOC<BigdataValue>[] a = blobs
.toArray(new KVOC[blobs.size()]);
// Place in KVO sorted order (by the byte[] keys).
Arrays.sort(a);
if (log.isInfoEnabled())
log.info("Adding chunk to BLOBS master: chunkSize="
+ a.length);
// add chunk to async write buffer
bufferBlobs.add(a);
// Clear list.
blobs.clear();
}
}
} finally {
latch.dec();
}
// Done.
return null;
}
}
/**
* Asynchronous writes on the ID2TERM index.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
* Thompson</a>
*/
static class AsyncId2TermIndexWriteTask implements Callable<Void> {
final protected transient static Logger log = Logger
.getLogger(AsyncId2TermIndexWriteTask.class);
private final KVOLatch latch;
private final BigdataValueFactory valueFactory;
private final IChunkedIterator<BigdataValue> src;
private final IRunnableBuffer<KVO<BigdataValue>[]> buffer;
/**
*
* @param src
* An iterator visiting chunks of distinct {@link Value}s with
* their TIDs assigned. Blank nodes will automatically be
* filtered out.
*/
public AsyncId2TermIndexWriteTask(final KVOLatch latch,
final BigdataValueFactory valueFactory,
final IChunkedIterator<BigdataValue> src,
final IRunnableBuffer<KVO<BigdataValue>[]> buffer) {
if (latch == null)
throw new IllegalArgumentException();
if (valueFactory == null)
throw new IllegalArgumentException();
if (src == null)
throw new IllegalArgumentException();
if (buffer == null)
throw new IllegalArgumentException();
this.latch = latch;
this.valueFactory = valueFactory;
this.src = src;
this.buffer = buffer;
}
public Void call() throws Exception {
// used to serialize the Values for the BTree.
final BigdataValueSerializer<BigdataValue> ser = valueFactory
.getValueSerializer();
// thread-local key builder removes single-threaded constraint.
final IKeyBuilder tmp = KeyBuilder.newInstance(Bytes.SIZEOF_LONG);
// buffer is reused for each serialized term.
final DataOutputBuffer out = new DataOutputBuffer();
final ByteArrayBuffer tbuf = new ByteArrayBuffer();
latch.inc();
try {
while (src.hasNext()) {
final BigdataValue[] chunkIn = src.nextChunk();
@SuppressWarnings("unchecked")
final KVOC<BigdataValue>[] chunkOut = new KVOC[chunkIn.length];
int i = 0;
for (BigdataValue v : chunkIn) {
assert v != null;
if (v instanceof BNode) {
// Do not write blank nodes on the reverse index.
continue;
}
if (v.getIV() == null) {
throw new RuntimeException("No TID: " + v);
}
if (v.getIV().isInline()) {
// Do not write inline values on the reverse index.
continue;
}
final byte[] key = v.getIV().encode(tmp.reset())
.getKey();
// Serialize the term.
final byte[] val = ser.serialize((BigdataValueImpl)v, out.reset(), tbuf);
/*
* Note: The BigdataValue instance is NOT supplied to
* the KVO since we do not want it to be retained and
* since there is no side-effect on the BigdataValue for
* writes on ID2TERM (unlike the writes on TERM2ID).
*/
chunkOut[i++] = new KVOC<BigdataValue>(key, val,
null/* v */, latch);
}
// make dense.
final KVO<BigdataValue>[] dense = KVO.dense(chunkOut, i);
/*
* Put into key order in preparation for writing on the
* reverse index.
*/
Arrays.sort(dense);
// add chunk to asynchronous write buffer
buffer.add(dense);
}
} finally {
latch.dec();
}
// Done.
return null;
}
}
/**
* Asynchronous writes on the TEXT index.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
* Thompson</a>
*/
static class AsyncTextIndexWriteTask implements Callable<Void> {
final protected transient static Logger log = Logger
.getLogger(AsyncTextIndexWriteTask.class);
private final KVOLatch latch;
private final BigdataValueCentricFullTextIndex textIndex;
private final IChunkedIterator<BigdataValue> src;
private final IRunnableBuffer<KVO<BigdataValue>[]> buffer;
/**
*
* @param src
* An iterator visiting chunks of distinct {@link BigdataLiteral}s
* with their TIDs assigned. Anything which should not be indexed
* has already been filtered out.
*/
public AsyncTextIndexWriteTask(final KVOLatch latch,
final BigdataValueCentricFullTextIndex textIndex,
final IChunkedIterator<BigdataValue> src,
final IRunnableBuffer<KVO<BigdataValue>[]> buffer) {
if (latch == null)
throw new IllegalArgumentException();
if (textIndex == null)
throw new IllegalArgumentException();
if (src == null)
throw new IllegalArgumentException();
if (buffer == null)
throw new IllegalArgumentException();
this.latch = latch;
this.textIndex = textIndex;
this.src = src;
this.buffer = buffer;
}
/**
* FIXME This will write on the full text index using the
* {@link BigdataValueCentricFullTextIndex} class. That class will wind up doing
* gathered batch inserts in chunks of up to the capacity set inline in
* the method below. However, it will use Sync RPC rather than the ASYNC
* [buffer_text] index write pipeline. While this should be enough to
* write unit tests for the full text indexing feature, it is not going
* to scale well.
*
* @see BigdataValueCentricFullTextIndex
*/
public Void call() throws Exception {
latch.inc();
try {
/*
* TODO capacity for the full text index writes.
*/
final int capacity = 100000;
textIndex.index(capacity, src);
} finally {
latch.dec();
}
// Done.
return null;
}
}
/**
* Writes the statement chunks onto the specified statement index using the
* asynchronous write API.
* <p>
* Note: This is similar to the {@link SPOIndexWriter}, but the latter uses
* synchronous RPC.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
* Thompson</a>
*/
static class AsyncSPOIndexWriteTask implements Callable<Void> {
final protected transient static Logger log = Logger
.getLogger(AsyncSPOIndexWriteTask.class);
private final KVOLatch latch;
private final IKeyOrder<ISPO> keyOrder;
/* Note: problem with java 1.6.0_07 and _12 on linux when typed. */
@SuppressWarnings("rawtypes")
private final IChunkedOrderedIterator/* <ISPO> */src;
private final IRunnableBuffer<KVO<ISPO>[]> writeBuffer;
private final SPOTupleSerializer tupleSer;
@SuppressWarnings("rawtypes")
public AsyncSPOIndexWriteTask(final KVOLatch latch,
final IKeyOrder<ISPO> keyOrder, final SPORelation spoRelation,
/* Note: problem with java 1.6.0_07 and _12 on linux when typed. */
final IChunkedOrderedIterator/* <ISPO> */src,
final IRunnableBuffer<KVO<ISPO>[]> writeBuffer) {
if (latch == null)
throw new IllegalArgumentException();
if (keyOrder == null)
throw new IllegalArgumentException();
if (writeBuffer == null)
throw new IllegalArgumentException();
this.latch = latch;
this.keyOrder = keyOrder;
this.src = src;
this.writeBuffer = writeBuffer;
// the tuple serializer for this access path.
this.tupleSer = (SPOTupleSerializer) spoRelation.getIndex(keyOrder)
.getIndexMetadata().getTupleSerializer();
}
public Void call() throws Exception {
long chunksOut = 0;
long elementsOut = 0;
latch.inc();
try {
while (src.hasNext()) {
// next chunk, in the specified order.
@SuppressWarnings("unchecked")
final ISPO[] chunk = (ISPO[]) src.nextChunk(keyOrder);
// note: a[] will be dense since nothing is filtered.
@SuppressWarnings("unchecked")
final KVOC<ISPO>[] a = new KVOC[chunk.length];
for (int i = 0; i < chunk.length; i++) {
final ISPO spo = chunk[i];
if (spo == null)
throw new IllegalArgumentException();
if (!spo.isFullyBound())
throw new IllegalArgumentException(
"Not fully bound: " + spo.toString());
// generate key for the index.
final byte[] key = tupleSer.serializeKey(spo);
// generate value for the index.
final byte[] val = tupleSer.serializeVal(spo);
/*
* Note: The SPO is deliberately not provided to the KVO
* instance since it is not required (there is nothing
* being passed back from the write via a side-effect on
* the BigdataStatementImpl) and since it otherwise will
* force the retention of the RDF Value objects in its
* s/p/o/c positions.
*/
a[i] = new KVOC<ISPO>(key, val, null/* spo */, latch);
}
// put chunk into sorted order based on assigned keys.
Arrays.sort(a);
// write chunk on the buffer.
writeBuffer.add(a);
chunksOut++;
elementsOut += a.length;
if (log.isDebugEnabled())
log.debug("Wrote chunk: index=" + keyOrder
+ ", chunksOut=" + chunksOut + ", elementsOut="
+ elementsOut + ", chunkSize=" + a.length);
if (log.isTraceEnabled())
log.trace("Wrote: index=" + keyOrder + ", chunk="
+ Arrays.toString(a));
}
} finally {
latch.dec();
}
if (log.isDebugEnabled())
log.debug("Done: index=" + keyOrder + ", chunksOut="
+ chunksOut + ", elementsOut=" + elementsOut);
// done.
return null;
}
}
/**
* Inner class provides the statement buffer.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
* Thompson</a>
* @param <S>
* @param <F>
*/
protected class AsynchronousStatementBufferImpl implements
IStatementBuffer<S> {
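/*
* Document workflow (as implemented below and in the outer factory): the
* parser adds statements into this buffer; BufferTidWrites then queues
* the TERM2ID/BLOBS writes via bufferTidWrites(); when the tidsLatch is
* triggered, BufferOtherWritesTask queues the remaining index writes via
* bufferOtherWrites(); and the documentRestartSafeLatch finally reports
* the document as restart safe via documentDone().
*/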
/** The document identifier. */
private final R resource;
private final AbstractTripleStore database;
private final BigdataValueFactory valueFactory;
/**
* A canonicalizing map for RDF {@link Value}s. The use of this map
* provides a ~40% performance gain.
*/
private LinkedHashMap<Value, BigdataValue> values;
/**
* A canonicalizing map for blank nodes. This map MUST be cleared before
* you begin to add statements to the buffer from a new "source"
* otherwise it will co-reference blank nodes from distinct sources. The
* life cycle of the map is the life cycle of the document being loaded,
* so if you are loading a large document with a lot of blank nodes the
* map will also become large.
*/
private final AtomicReference<Map<String, BigdataBNode>> bnodes = new AtomicReference<Map<String, BigdataBNode>>();
/**
* The total #of parsed statements so far.
*
* @see IBuffer
*/
private int statementCount;
/**
* Buffer used to accumulate chunks of statements.
*/
private UnsynchronizedUnboundedChunkBuffer<S> statements;
public final AbstractTripleStore getDatabase() {
return database;
}
/**
* Returns <code>null</code>.
* <p>
* Note: This implementation does not support the concept of a
* focusStore so it can not be used for truth maintenance.
*/
public AbstractTripleStore getStatementStore() {
return null;
}
public boolean isEmpty() {
return statementCount == 0;
}
public int size() {
return statementCount;
}
/**
* Return the identifier for the document.
*/
public R getDocumentIdentifier() {
return resource;
}
/**
* @param resource
* The document identifier.
*/
protected AsynchronousStatementBufferImpl(final R resource) {
this.resource = resource;
this.database = AsynchronousStatementBufferFactory.this.tripleStore;
this.valueFactory = database.getValueFactory();
}
/**
* Note: this implementation always returns ZERO (0).
*
* @see ParserTask
*/
public long flush() {
return 0L;
}
/**
* Clears all buffered data, including the canonicalizing mapping for
* blank nodes and deferred provenance statements.
*/
public void reset() {
if (log.isInfoEnabled())
log.info("resource=" + getDocumentIdentifier());
/*
* Note: clear the reference NOT the contents of the map! This makes
* it possible for the caller to reuse the same map across multiple
* StatementBuffer instances.
*/
bnodes.set(null);
values = null;
statements = null;
statementCount = 0;
}
public void setBNodeMap(final Map<String, BigdataBNode> bnodes) {
if (bnodes == null)
throw new IllegalArgumentException();
if (!this.bnodes
.compareAndSet(null/* expect */, bnodes/* update */)) {
throw new IllegalStateException();
}
}
/**
* Add an "explicit" statement to the buffer (flushes on overflow, no
* context).
*
* @param s
* @param p
* @param o
*/
public void add(Resource s, URI p, Value o) {
add(s, p, o, null, StatementEnum.Explicit);
}
/**
* Add an "explicit" statement to the buffer (flushes on overflow).
*
* @param s
* @param p
* @param o
* @param c
*/
public void add(Resource s, URI p, Value o, Resource c) {
add(s, p, o, c, StatementEnum.Explicit);
}
/**
* Add a statement to the buffer (core impl).
*/
public void add(final Resource s, final URI p, final Value o,
final Resource c, final StatementEnum type) {
// add to the buffer.
handleStatement(s, p, o, c, type);
}
public void add(final S e) {
add(e.getSubject(), e.getPredicate(), e.getObject(),
e.getContext(),
(e instanceof BigdataStatement ? ((BigdataStatement) e)
.getStatementType() : null));
}
/**
* Canonicalizing mapping for blank nodes.
* <p>
* Note: This map MUST stay in effect while reading from a given source
* and MUST be cleared (or set to null) before reading from another
* source.
*/
private BigdataBNode getCanonicalBNode(final BigdataBNodeImpl bnode) {
// the BNode's ID.
final String id = bnode.getID();
Map<String, BigdataBNode> bnodes = this.bnodes.get();
if (bnodes == null) {
/*
* Allocate a canonicalizing map for blank nodes. Since this
* will be a private map it does not need to be thread-safe.
*/
setBNodeMap(new HashMap<String, BigdataBNode>(
bnodesInitialCapacity));
// fall through.
bnodes = this.bnodes.get();
if (bnodes == null)
throw new AssertionError();
}
/*
* Specialized for a concurrent hash map.
*/
if (bnodes instanceof ConcurrentHashMap) {
final BigdataBNode tmp = ((ConcurrentHashMap<String, BigdataBNode>) bnodes)
.putIfAbsent(id, bnode);
if (tmp != null) {
// already exists in the map.
return tmp;
}
if (log.isTraceEnabled())
log.trace("added: " + bnode);
// was inserted into the map.
return bnode;
}
/*
* Synchronized on the map to make the conditional insert atomic.
*/
synchronized (bnodes) {
final BigdataBNode tmp = bnodes.get(id);
if (tmp != null) {
// already exists in the map.
return tmp;
}
// insert this blank node into the map.
bnodes.put(id, bnode);
if (log.isTraceEnabled())
log.trace("added: " + bnode);
// was inserted into the map.
return bnode;
} // synchronized
}
/**
* Canonicalizing mapping for a term.
* <p>
* Note: Blank nodes are made canonical with the scope of the source
* from which the data are being read. See {@link #bnodes}. All other
* kinds of terms are made canonical within the scope of the buffer's
* current contents in order to keep down the demand on the heap with
* reading either very large documents or a series of small documents.
*
* @param term
* A term.
*
* @return Either the term or the pre-existing term in the buffer with
* the same data.
*/
private BigdataValue getCanonicalValue(final BigdataValue term0) {
if (term0 == null) {
// Note: This handles an empty context position.
return term0;
}
final BigdataValue term;
if (term0 instanceof BNode) {
// impose canonicalizing mapping for blank nodes.
term = getCanonicalBNode((BigdataBNodeImpl) term0);
/*
* Fall through.
*
* Note: This also records the blank node in the values map so
* that we can process the values map without having to consider
* the blank nodes as well.
*/
} else {
// not a blank node.
term = term0;
}
if (values == null) {
/*
* Create a private (non-thread safe) canonicalizing mapping for
* RDF Values.
*
* Note: A linked hash map is used to make the iterator faster.
*/
values = new LinkedHashMap<Value, BigdataValue>(
valuesInitialCapacity);
}
/*
* Impose a canonicalizing mapping on the term.
*/
final BigdataValue tmp = values.get(term);
if (tmp != null) {
// already exists.
return tmp;
}
// add to the map.
if (values.put(term, term) != null) {
throw new AssertionError();
}
if (log.isTraceEnabled())
log.trace("n=" + values.size() + ", added: " + term);
// return the new term.
return term;
}
/**
* Adds the values and the statement into the buffer.
*
* @param s
* The subject.
* @param p
* The predicate.
* @param o
* The object.
* @param c
* The context (may be null).
* @param type
* The statement type.
*
* @throws IndexOutOfBoundsException
* if the buffer capacity is exceeded.
*
* @see #nearCapacity()
*/
private void handleStatement(final Resource s, final URI p,
final Value o, final Resource c, final StatementEnum type) {
_handleStatement(
(Resource) getCanonicalValue((BigdataResource) valueFactory
.asValue(s)),//
(URI) getCanonicalValue((BigdataURI) valueFactory
.asValue(p)),//
(Value) getCanonicalValue((BigdataValue) valueFactory
.asValue(o)),//
(Resource) getCanonicalValue((BigdataResource) valueFactory
.asValue(c)), //
type);
}
/**
* Form the BigdataStatement object using the valueFactory now that we
* bindings which were (a) allocated by the valueFactory and (b) are
* canonical for the scope of this document.
*/
@SuppressWarnings("unchecked")
private void _handleStatement(final Resource s, final URI p,
final Value o, final Resource c, final StatementEnum type) {
final BigdataStatement stmt = valueFactory.createStatement(
(BigdataResource) s, (BigdataURI) p, (BigdataValue) o,
(BigdataResource) c, type);
if (statements == null) {
statements = new UnsynchronizedUnboundedChunkBuffer<S>(
producerChunkSize,
(Class<? extends S>) BigdataStatement.class);
}
statements.add((S) stmt);
// total #of statements accepted.
statementCount++;
if (log.isTraceEnabled())
log.trace("n=" + statementCount + ", added: " + stmt);
}
/**
* Buffers the asynchronous writes on the TERM2ID and BLOBS indices.
* Those indices will assign tids. If {@link BigdataValue} is fully
* inline, then its {@link IV} is resolved immediately. If the
* {@link BigdataValue} will be stored as a BLOB, then it is written
* onto the buffer for the BLOBS index. Otherwise it is written onto the
* buffer for the TERM2ID index.
*/
private void bufferTidWrites() throws Exception {
if (log.isInfoEnabled()) {
final Map<String, BigdataBNode> bnodes = this.bnodes.get();
final int bnodeCount = (bnodes == null ? 0 : bnodes.size());
log.info("bnodeCount=" + bnodeCount + ", values="
+ values.size() + ", statementCount=" + statementCount);
}
if (isAnyDone()) {
throw new RuntimeException("Factory closed?");
}
/*
* Run task which will queue BigdataValue[] chunks onto the TERM2ID
* async write buffers.
*
* Note: This is responsible for assigning the TIDs (term
* identifiers) to the {@link BigdataValue}s. We CAN NOT write on
* the other indices until we have those TIDs.
*
* Note: If there is not enough load being placed on the async index
* write then it can wait up to its idle/chunk timeout. Normally we
* want to use an infinite chunk timeout so that all chunks written
* on the index partitions are as full as possible. Therefore, the
* TERM2ID async writer should use a shorter idle timeout or it can
* live lock. Ideally, there should be some explicit notice when we
* are done queuing writes on TERM2ID across all source documents.
* Even then we can live lock if the input queue is not large
* enough.
*/
/*
* Latch notifies us when all writes for _this_ document on TERM2ID
* are complete such that we have the assigned term identifiers for
* all BigdataValues appearing in the document. This event is used
* to transfer the document to another queue.
*/
final KVOLatch tidsLatch = new KVOLatch() {
public String toString() {
return super.toString() + " : tidsLatch";
}
@Override
protected void signal() throws InterruptedException {
super.signal();
/*
* Note: There is no requirement for an atomic state
* transition for these two counters so there is no reason
* to take the lock here.
*/
// lock.lock();
// try {
documentTIDsWaitingCount.decrementAndGet();
documentTIDsReadyCount.incrementAndGet();
// } finally {
//
// lock.unlock();
//
// }
// Note: otherWriterService MUST have unbounded queue.
otherWriterService.submit(new BufferOtherWritesTask(
AsynchronousStatementBufferImpl.this));
}
};
// pre-increment to avoid notice on transient zeros.
tidsLatch.inc(); // Note: decremented in the finally{} clause.
try {
final Callable<Void> task1 = new AsyncTerm2IdIndexWriteTask(
tidsLatch, lexiconRelation, newValuesIterator(//
lexiconRelation,//
values.values().iterator(),//
producerChunkSize),
buffer_t2id,
buffer_blobs);
// queue chunks onto the write buffer.
task1.call();
} finally {
/*
* Decrement now that all chunks have been queued for
* asynchronous writes.
*/
tidsLatch.dec();
}
/*
* Note: At this point the writes on TERM2ID indices have been buffered.
*/
}
// /**
// * Buffers the asynchronous writes on the BLOBS index.
// *
// * @throws Exception
// */
// private void bufferBlobsWrites() throws Exception {
//
// if (log.isInfoEnabled()) {
// final Map<String, BigdataBNode> bnodes = this.bnodes.get();
// final int bnodeCount = (bnodes == null ? 0 : bnodes.size());
// log.info("bnodeCount=" + bnodeCount + ", values="
// + values.size() + ", statementCount=" + statementCount);
// }
//
// if (isAnyDone()) {
//
// throw new RuntimeException("Factory closed?");
//
// }
//
// /*
// * Run task which will queue BigdataValue[] chunks onto the TERMS
// * async write buffer.
// *
// * Note: This is responsible for assigning the TIDs (term
// * identifiers) to the {@link BigdataValue}s. We CAN NOT write on
// * the other indices until we have those TIDs.
// *
// * Note: If there is not enough load being placed the async index
// * write then it can wait up to its idle/chunk timeout. Normally we
// * want to use an infinite chunk timeout so that all chunks written
// * on the index partitions are as full as possible. Therefore, the
// * TERMS async writer should use a shorter idle timeout or it can
// * live lock. Ideally, there should be some explicit notice when we
// * are done queuing writes on TERMS across all source documents.
// * Even then we can live lock if the input queue is not large
// * enough.
// */
//
// /*
// * Latch notifies us when all writes for _this_ document on TERMS
// * are complete such that we have the assigned term identifiers for
// * all BigdataValues appearing in the document. This event is used
// * to transfer the document to another queue.
// */
// final KVOLatch tidsLatch = new KVOLatch() {
//
// public String toString() {
//
// return super.toString() + " : tidsLatch";
//
// }
//
// @Override
// protected void signal() throws InterruptedException {
//
// super.signal();
// /*
// * Note: There is no requirement for an atomic state
// * transition for these two counters so there is no reason
// * to take the lock here.
// */
//// lock.lock();
//// try {
//
// documentTIDsWaitingCount.decrementAndGet();
//
// documentTIDsReadyCount.incrementAndGet();
//
//// } finally {
////
//// lock.unlock();
////
//// }
//
// // Note: otherWriterService MUST have unbounded queue.
// otherWriterService.submit(new BufferOtherWritesTask(
// AsynchronousStatementBufferImpl.this));
//
// }
//
// };
//
// // pre-increment to avoid notice on transient zeros.
// tidsLatch.inc();
//
// try {
//
// final Callable<Void> task = new AsyncBlobsIndexWriteTask(
// tidsLatch, lexiconRelation, newValuesIterator(//
// lexiconRelation,//
// values.values().iterator(),//
// producerChunkSize),
// buffer_blobs);
//
// // queue chunks onto the write buffer.
// task.call();
//
// } finally {
//
// /*
// * Decrement now that all chunks have been queued for
// * asynchronous writes.
// */
//
// tidsLatch.dec();
//
// }
//
// /*
// * Note: At this point the writes on TERMS have been buffered.
// */
//
// }
/**
* Buffers write requests for the remaining indices (everything except
* TERM2ID/BLOBS indices).
*
* @throws InterruptedException
* @throws ExecutionException
*/
private void bufferOtherWrites() throws InterruptedException,
ExecutionException {
if (log.isDebugEnabled()) {
log.debug("Writing on remaining indices.");
}
/*
* Setup tasks which can run asynchronously. These tasks have no
* dependencies. They can each proceed at their own rate. However,
* we can not return from within this method until they are all
* done.
*
* Note: Each task runs in parallel.
*
* Note: Each task uses the asynchronous write API. When the Future
* for that task is complete all it means is that the data are now
* buffered on the asynchronous write buffer for the appropriate
* index. It DOES NOT mean that those writes are complete. However,
* the [documentStableLatch] DOES indicate when the data is restart
* safe.
*
* Note: These tasks all process iterators. This approach was chosen
* to isolate the tasks (which queue data for asynchronous writes)
* from the data structures in this IStatementBuffer implementation.
* An example of something which WOULD NOT work is if these tasks
* were inner classes accessing the instance fields on this class
* since reset() would clear those fields which might cause
* spontaneous failures within ongoing processing.
*/
final List<Callable> tasks = new LinkedList<Callable>();
/*
* The #of triples parsed from this document. This is added to the
* total #of restart safe told triples loaded by this client when
* the latch is triggered. Of course, the actual #of triples on the
* database is only available by querying the database since the
* same triple can occur in more than one document, and documents
* are loaded by distributed clients so there is no way to correct
* for such duplicate told triples short of querying the database.
*/
final int toldTriplesThisDocument = statementCount;
/*
* Latch is signaled when all data buffered for this document is
* RESTART SAFE on the database.
*
* Note: In order for the latch to have those semantics we have to
* include it on each KVO object buffered for all remaining indices.
* The semantics are valid in the presence of duplicate removes IFF
* they obey the contract for KVOList and link together the
* duplicates such that the latch is decremented for each distinct
* KVOC instance, including those which were eliminated as
* duplicates.
*/
final KVOLatch documentRestartSafeLatch = new KVOLatch() {
public String toString() {
return super.toString() + " : documentRestartSafeLatch";
}
@Override
protected void signal() throws InterruptedException {
super.signal();
lock.lock();
try {
workflowLatch_bufferOther.dec();
workflowLatch_document.dec();
assertSumOfLatchs();
documentRestartSafeCount.incrementAndGet();
toldTriplesRestartSafeCount
.addAndGet(toldTriplesThisDocument);
outstandingStatementCount
.addAndGet(-toldTriplesThisDocument);
// notify that the document is done.
documentDone(getDocumentIdentifier());
} finally {
lock.unlock();
}
}
};
tasks.add(new AsyncId2TermIndexWriteTask(documentRestartSafeLatch,
valueFactory, newId2TIterator(lexiconRelation, values
.values().iterator(), producerChunkSize),
buffer_id2t));
if (buffer_text != null) {
tasks.add(new AsyncTextIndexWriteTask(documentRestartSafeLatch,
(BigdataValueCentricFullTextIndex) lexiconRelation
.getSearchEngine(), newTextIterator(
lexiconRelation, values.values().iterator(),
producerChunkSize, indexDatatypeLiterals),
buffer_text));
}
for (Map.Entry<SPOKeyOrder, IRunnableBuffer<KVO<ISPO>[]>> e : buffer_stmts
.entrySet()) {
final SPOKeyOrder keyOrder = e.getKey();
final IRunnableBuffer<KVO<ISPO>[]> buffer = e.getValue();
tasks.add(new AsyncSPOIndexWriteTask(documentRestartSafeLatch,
keyOrder, spoRelation,
// (IChunkedOrderedIterator<ISPO>)
statements.iterator(), buffer));
}
/*
* Submit all tasks. They will run in parallel. If they complete
* successfully then all we know is that the data has been buffered
* for asynchronous writes on the various indices.
*
* Note: java 1.6.0_07/12 build problems under linux when typed as
* <Future> or any other combination that I have tried.
*/
final List futures;
/*
* This latch is incremented _before_ buffering writes, and within
* each routine that buffers writes, to avoid false triggering. This
* is done to ensure that the latch will be positive until we exit
* the try / finally block. We do this around the submit of the
* tasks and do not decrement the latch until the futures are
* available so we known that all data is buffered.
*/
documentRestartSafeLatch.inc();
try {
futures = tripleStore.getExecutorService().invokeAll(
(List) tasks);
} finally {
// decrement so that the latch can be triggered.
documentRestartSafeLatch.dec();
}
try {
/*
* Make sure that no errors were reported by those tasks.
*/
for (Object f : futures) {
((Future) f).get();
}
} finally {
/*
* At this point all writes have been buffered. We now discard
* the buffered data (RDF Values and statements) since it will
* no longer be used.
*/
reset();
lock.lock();
try {
if (unbufferedStatementCount
.addAndGet(-toldTriplesThisDocument) <= pauseParserPoolStatementThreshold) {
unpaused.signalAll();
}
} finally {
lock.unlock();
}
}
}
}// StatementBuffer impl.
/**
* Task buffers the asynchronous writes on the TERM2ID and BLOBS indices.
*/
private class BufferTidWrites implements Callable<Void> {
private final AsynchronousStatementBufferImpl buffer;
public BufferTidWrites(final AsynchronousStatementBufferImpl buffer) {
if (buffer == null)
throw new IllegalArgumentException();
this.buffer = buffer;
}
public Void call() throws Exception {
// // new workflow state.
// lock.lock();
// try {
// guardLatch_term2Id.inc();
// workflowLatch_parser.dec();
// workflowLatch_bufferTerm2Id.inc();
// documentTIDsWaitingCount.incrementAndGet();
// assertSumOfLatchs();
// } finally {
// lock.unlock();
// }
try {
buffer.bufferTidWrites();
lock.lock();
try {
guardLatch_term2Id.dec();
} finally {
lock.unlock();
}
return null;
} catch (Throwable t) {
lock.lock();
try {
guardLatch_term2Id.dec();
workflowLatch_bufferTids.dec();
documentTIDsWaitingCount.decrementAndGet();
documentError(buffer.getDocumentIdentifier(), t);
outstandingStatementCount.addAndGet(-buffer.statementCount);
if (unbufferedStatementCount
.addAndGet(-buffer.statementCount) <= pauseParserPoolStatementThreshold) {
unpaused.signalAll();
}
throw new Exception(t);
} finally {
lock.unlock();
}
}
}
}
// /**
// * Task buffers the asynchronous writes on the BLOBS index.
// */
// private class BufferBlobsWrites implements Callable<Void> {
//
// private final AsynchronousStatementBufferImpl buffer;
//
// public BufferBlobsWrites(final AsynchronousStatementBufferImpl buffer) {
//
// if (buffer == null)
// throw new IllegalArgumentException();
//
// this.buffer = buffer;
//
// }
//
// public Void call() throws Exception {
//
//// // new workflow state.
//// lock.lock();
//// try {
//// guardLatch_term2Id.inc();
//// workflowLatch_parser.dec();
//// workflowLatch_bufferTerm2Id.inc();
//// documentTIDsWaitingCount.incrementAndGet();
//// assertSumOfLatchs();
//// } finally {
//// lock.unlock();
//// }
//
// try {
//
// buffer.bufferBlobsWrites();
//
// lock.lock();
// try {
// guardLatch_term2Id.dec();
// } finally {
// lock.unlock();
// }
//
// return null;
//
// } catch (Throwable t) {
//
// lock.lock();
// try {
// guardLatch_term2Id.dec();
// workflowLatch_bufferTerm2Id.dec();
// documentTIDsWaitingCount.decrementAndGet();
// documentError(buffer.getDocumentIdentifier(), t);
// outstandingStatementCount.addAndGet(-buffer.statementCount);
// if (unbufferedStatementCount
// .addAndGet(-buffer.statementCount) <= pauseParserPoolStatementThreshold) {
// unpaused.signalAll();
// }
// throw new Exception(t);
// } finally {
// lock.unlock();
// }
//
// }
//
// }
//
// }
/**
* Task which buffers index writes for the remaining indices (everything
* other than the TERM2ID/BLOBS indices).
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
* Thompson</a>
*/
private class BufferOtherWritesTask implements Callable<Void> {
private final AsynchronousStatementBufferImpl buffer;
public BufferOtherWritesTask(final AsynchronousStatementBufferImpl buffer) {
if (buffer == null)
throw new IllegalArgumentException();
this.buffer = buffer;
}
public Void call() throws Exception {
// new workflow state.
lock.lock();
try {
guardLatch_other.inc();
workflowLatch_bufferTids.dec();
workflowLatch_bufferOther.inc();
assertSumOfLatchs();
} finally {
lock.unlock();
}
try {
buffer.bufferOtherWrites();
lock.lock();
try {
guardLatch_other.dec();
} finally {
lock.unlock();
}
return null;
} catch (Throwable t) {
lock.lock();
try {
guardLatch_other.dec();
workflowLatch_bufferOther.dec();
documentError(buffer.getDocumentIdentifier(), t);
outstandingStatementCount.addAndGet(-buffer.statementCount);
if (unbufferedStatementCount
.addAndGet(-buffer.statementCount) <= pauseParserPoolStatementThreshold) {
unpaused.signalAll();
}
throw new Exception(t);
} finally {
lock.unlock();
}
}
}
}
/**
* Thread pool with pause/resume semantics based on the amount of buffered
* state for the outer class.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
* Thompson</a>
*/
private class ParserThreadPoolExecutor extends ThreadPoolExecutor {
/**
* @param corePoolSize
* @param maximumPoolSize
* @param keepAliveTime
* @param unit
* @param workQueue
* @param threadFactory
*/
public ParserThreadPoolExecutor(final int corePoolSize,
final int maximumPoolSize, final long keepAliveTime,
final TimeUnit unit, final BlockingQueue<Runnable> workQueue,
final ThreadFactory threadFactory) {
super(corePoolSize, maximumPoolSize, keepAliveTime, unit,
workQueue, threadFactory);
}
/**
* <code>true</code> if worker tasks must wait in
* {@link #beforeExecute(Thread, Runnable)}
*/
private boolean isPaused() {
return unbufferedStatementCount.get() > pauseParserPoolStatementThreshold;
}
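/*
* Note: The corresponding resume path is driven by the producers. When
* unbufferedStatementCount drops back to or below the threshold (see
* bufferOtherWrites() and the error handlers in the buffer tasks),
* unpaused.signalAll() is invoked and any threads blocked in
* beforeExecute() continue.
*/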
/**
* Overridden to have worker threads pause if {@link #isPaused()}
* returns true.
*
* @param t
* The thread that will run the task.
* @param r
* The {@link Runnable} wrapping the {@link AbstractTask} -
* this is actually a {@link FutureTask}. See
* {@link AbstractExecutorService}.
*/
protected void beforeExecute(final Thread t, final Runnable r) {
// Note: [r] is the FutureTask.
lock.lock();
try {
if(isPaused()) {
pausedThreadCount.incrementAndGet();
poolPausedCount.incrementAndGet();
if (log.isInfoEnabled())
log.info("PAUSE : "
+ AsynchronousStatementBufferFactory.this
.toString());
while (isPaused()) {
unpaused.await();
}
// if (!unpaused.await(60, TimeUnit.SECONDS)) {
//
// /*
// * Note: This was a trial workaround for a liveness
// * problem. Unfortunately, it did not fix the
// * problem. [The issue was a deadlock in the global
// * LRU, which has been fixed.]
// */
//
// log.error("Flushing TERM2ID buffer: "
// + AbstractStatisticsCollector.fullyQualifiedHostName);
//
// reopenBuffer_term2Id();
//
// // fall through : while(isPaused()) will retest.
//
// }
//
// }
pausedThreadCount.decrementAndGet();
if (log.isInfoEnabled())
log.info("RESUME: "
+ AsynchronousStatementBufferFactory.this
.toString());
}
} catch (InterruptedException ie) {
t.interrupt();
} finally {
lock.unlock();
}
super.beforeExecute(t, r);
}
}
} // StatementBufferFactory impl.