package com.bigdata.rdf.internal; import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.ByteBuffer; import java.security.NoSuchAlgorithmException; import java.util.Arrays; import java.util.Collections; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Properties; import java.util.Set; import java.util.TimeZone; import java.util.UUID; import java.util.concurrent.BlockingQueue; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.FutureTask; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.RejectedExecutionException; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.locks.ReentrantLock; import java.util.zip.Deflater; import java.util.zip.GZIPInputStream; import org.apache.log4j.Logger; import org.openrdf.model.BNode; import org.openrdf.model.Literal; import org.openrdf.model.Statement; import org.openrdf.model.URI; import org.openrdf.model.Value; import org.openrdf.rio.RDFFormat; import org.openrdf.rio.RDFHandlerException; import org.openrdf.rio.RDFParseException; import org.openrdf.rio.RDFParser; import org.openrdf.rio.RDFParserFactory; import org.openrdf.rio.RDFParserRegistry; import org.openrdf.rio.helpers.RDFHandlerBase; import com.bigdata.Banner; import com.bigdata.btree.BTree; import com.bigdata.btree.DefaultTupleSerializer; import com.bigdata.btree.IRangeQuery; import com.bigdata.btree.ITuple; import com.bigdata.btree.ITupleIterator; import com.bigdata.btree.IndexMetadata; import com.bigdata.btree.keys.DefaultKeyBuilderFactory; import com.bigdata.btree.keys.IKeyBuilder; import com.bigdata.btree.keys.KV; import com.bigdata.btree.keys.KeyBuilder; import com.bigdata.btree.keys.SuccessorUtil; import com.bigdata.btree.raba.codec.CanonicalHuffmanRabaCoder; import com.bigdata.btree.raba.codec.FrontCodedRabaCoder; import com.bigdata.io.ByteArrayBuffer; import com.bigdata.io.DataOutputBuffer; import com.bigdata.io.DirectBufferPool; import com.bigdata.io.compression.RecordCompressor; import com.bigdata.journal.BufferMode; import com.bigdata.journal.Journal; import com.bigdata.rdf.internal.impl.BlobIV; import com.bigdata.rdf.internal.impl.literal.PartlyInlineTypedLiteralIV; import com.bigdata.rdf.internal.impl.uri.PartlyInlineURIIV; import com.bigdata.rdf.model.BigdataValue; import com.bigdata.rdf.model.BigdataValueFactory; import com.bigdata.rdf.model.BigdataValueFactoryImpl; import com.bigdata.rdf.model.BigdataValueSerializer; import com.bigdata.rdf.store.AbstractTripleStore; import com.bigdata.rdf.vocab.BaseVocabulary; import com.bigdata.rwstore.sector.IMemoryManager; import com.bigdata.rwstore.sector.MemoryManager; import com.bigdata.util.Bytes; import com.bigdata.util.BytesUtil; import com.bigdata.util.concurrent.Latch; /** * Utility class to parse some RDF resource(s) and count hash collisions using a * variety of hash codes. 
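 * <p>
 * The dictionary key for a non-inline {@link Value} is its int32 hash code followed by a one byte collision counter (see {@link IndexValueBufferTask}). A minimal sketch of that key construction, reusing the same helpers this class already uses (illustrative only - the authoritative logic is in IndexValueBufferTask's addValue(); here [value] is the RDF Value being indexed and [termsIndex] is the TERMS B+Tree):
 *
 * <pre>
 * final IKeyBuilder keyBuilder = KeyBuilder.newInstance();
 * // The int32 hash code of the RDF Value is the key prefix.
 * final byte[] baseKey = keyBuilder.reset().append(value.hashCode()).getKey();
 * // The #of entries already under that prefix is the collision count.
 * final long ncoll = termsIndex.rangeCount(baseKey,
 *         SuccessorUtil.successor(baseKey.clone()));
 * // Append a one byte counter to break the tie.
 * final byte[] key = keyBuilder.reset().append(baseKey)
 *         .appendSigned((byte) ncoll).getKey();
 * </pre>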
* * TODO Various data sets: * * <pre> * /nas/data/lubm/U1/data/University0/ * /nas/data/bsbm/bsbm_2785/dataset.nt.gz * /nas/data/bsbm/bsbm_566496/dataset.nt.gz * /data/bsbm3_200m_1MSplits * * 8B triple bioinformatics data set. * * BTC data (some very large literals) * * </pre> * * TODO order preserving hash codes could be interesting here. Look at 32 and 64 * bit variants of the math and at generalized order preserving hash codes. With * order preserving hash codes, it makes sense to insert all Unicode terms into * TERM2ID such that we have a total order there. * * TODO benchmark the load time with different hash codes. the cost of the hash * computation and the randomness of the distribution will both play a role. The * B+Tree will need to be setup with a sufficient [writeRetentionQueue] and we * will need to specify [-server -Xmx1G]. * * SHA-256 - no collisions on BSBM 200M. 30G file. time? * * 32-bit hash codes. #collisions=1544132 Elapsed: 16656445ms Journal size: * 23841341440 bytes (23G) * * Now limiting the size of values in a leaf and also increasing the branching * factor to 512 (was 32). [The current run is scanning after the initial * insert, which involves a little wasted effort. It was also without the * -server -Xmx2g, and write retention queue parameters. Finally, it was * serializing BigdataValue objects, including their IV, rather than RDF Value * objects. The code has since been modified to serialize just the BigdataValue * Also, I've since raised the initial extent from 10M to 200M]. * maxCollisions=3, Elapsed: 22579073ms Journal size: 35270950912 bytes * * Now buffering 100k values at a time: 2x faster. * * <pre> * U1: * Elapsed: 23379ms * NumStatements: 1000313 * NumDistinctVals: 291259 * TotalKeyBytes: 1747554 * TotalValBytes: 60824514 * MaxCollisions: 1 * TotalCollisions: 6 * Journal size: 209715200 bytes * name m height nnodes nleaves nodeBytes leafBytes totalBytes avgNodeBytes avgLeafBytes minNodeBytes maxNodeBytes minLeafBytes maxLeafBytes * lex 1024 1 1 474 7913 3662623 3670536 7913 7727 7913 7913 5786 13784 * </pre> * * With only a byte (versus short) counter in the key. Oddly, this has no impact * on the average leaf size. That suggests that the keys in the leaves are very * sparse in terms of the hash code space such that prefix compression is not * really doing that much for us. * * <pre> * Elapsed: 23235ms * NumStatements: 1000313 * NumDistinctVals: 291259 * TotalKeyBytes: 1456295 * TotalValBytes: 60824514 * MaxCollisions: 1 * TotalCollisions: 6 * Journal size: 209715200 bytes * name m height nnodes nleaves nodeBytes leafBytes totalBytes avgNodeBytes avgLeafBytes minNodeBytes maxNodeBytes minLeafBytes maxLeafBytes * lex 1024 1 1 474 7913 3371370 3379283 7913 7112 7913 7913 5274 12774 * </pre> * * BSBM 200M: This is the best time and space so far. using a byte counter * rather than a short. * * <pre> * Elapsed: 16338357ms * NumStatements: 198808848 * NumDistinctVals: 45647082 * TotalKeyBytes: 228235410 * TotalValBytes: 11292849582 * MaxCollisions: 3 * TotalCollisions: 244042 * Journal size: 16591683584 bytes * </pre> * * BSBM 200M: Note: I restarted this run after terminating yourkit so the * results should be valid (right?). The main changes are to use stringValue() * to test for dateTime, to use the canonical huffman coder for the leaf keys. 
* * <pre> * Elapsed: 20148506ms * NumStatements: 198808848 * NumDistinctVals: 45647082 * TotalKeyBytes: 228235410 * TotalValBytes: 11292849582 * MaxCollisions: 3 * TotalCollisions: 244042 * Journal size: 16591683584 bytes * </pre> * * BSBM 200M: raw records are compress if they are over 64 bytes long. * * <pre> * Elapsed: 18757003ms * NumStatements: 198808848 * NumDistinctVals: 45647082 * TotalKeyBytes: 228235410 * TotalValBytes: 7910596818 * MaxCollisions: 3 * TotalCollisions: 244042 * Journal size: 12270108672 bytes * </pre> * * BSBM 200M: literals LT 64 byte labels are assumed inlined into statement * indices (except datatype URIs). * * <pre> * Elapsed: 16193915ms * NumStatements: 198808848 * NumDistinctVals: 43273381 * NumShortLiterals: 2723662 * TotalKeyBytes: 216366905 * TotalValBytes: 7807037644 * MaxCollisions: 3 * TotalCollisions: 219542 * Journal size: 11083186176 bytes * </pre> * * BSBM 200M: uris LT 64 byte localNames are assumed inlined into statement * indices (plus datatype literals LT 64 bytes). * * <pre> * Elapsed: 5699248ms * NumStatements: 198808848 * NumDistinctVals: 12198222 * NumShortLiterals: 32779032 * NumShortURIs: 493520581 * TotalKeyBytes: 60991110 * TotalValBytes: 4944223808 * MaxCollisions: 2 * TotalCollisions: 17264 * Journal size: 7320764416 bytes * </pre> * * BSBM 200M: one parser thread and one indexer thread. * * <pre> * Elapsed: 3724415ms * NumStatements: 198808848 * NumDistinctVals: 12198222 * NumShortLiterals: 32779032 * NumShortBNodes: 0 * NumShortURIs: 493520581 * TotalKeyBytes: 60991110 * TotalValBytes: 4944223808 * MaxCollisions: 2 * TotalCollisions: 17264 * Journal size: 7320764416 bytes * </pre> * * GC OH problem trying to run multiple parsers against BSBM 200M when split * into 200 files. * * <pre> * valBufSize := 10000 * valQueueCapacity = 100 * maxDrain := 50 * nparserThreads := 2 * parserWorkQueue := 1000 * </pre> * * BSBM 200M - this is 3x longer. This run did not have the GC OH problem, but * GC had frequent 10% spikes, which is a lot in comparison to our best run. * * <pre> * valBufSize := 1000 * valQueueCapacity = 10 * maxDrain := 5 * nparserThreads := 4 * parserWorkQueue := 100 * * Elapsed: 9589520ms * NumStatements: 198805837 * NumDistinctVals: 12202052 * NumShortLiterals: 32776100 * NumShortBNodes: 0 * NumShortURIs: 493514954 * TotalKeyBytes: 61010260 * TotalValBytes: 4945278396 * MaxCollisions: 2 * TotalCollisions: 17260 * Journal size: 7320764416 bytes * </pre> * * BSBM 200M: split in 200 files. 69m versus best time so far of 62m. There is * only one thread in the pool, but the caller runs policy means that we are * actually running two parsers. So, this is not really the same as the best * run, which was one parser running in the main thread with the indexer running * in another thread. * * <pre> * valBufSize := 10000 * valQueueCapacity = 10 * maxDrain := 5 * nparserThreads := 1 * parserWorkQueue := 100 * * Elapsed: 4119775ms * NumStatements: 198805837 * NumDistinctVals: 12202052 * NumShortLiterals: 32776100 * NumShortBNodes: 0 * NumShortURIs: 493514954 * TotalKeyBytes: 61010260 * TotalValBytes: 4945278396 * MaxCollisions: 2 * TotalCollisions: 17260 * Journal size: 7320764416 bytes * </pre> * * BSBM 200M with 1M statement splits using the memory manager to buffer the * data on the native heap. This is the best score so far (compare with * 3724415ms with one parser and one indexer thread). For some reason, the #of * distinct values and literals is slightly different for these two runs. 
One * other change in this run is that we always gzip the record since we can not * deserialize the record unless we know in advance whether or not it is * compressed. Previous runs had conditionally compressed based on the original * byte[] value length and stored the compressed record iff it was shorter. * However, we can only conditionally compress if we use a header or bit flag to * indicate that the record is compressed. Peak memory manager use was 262M. * * <pre> * Elapsed: 2863898ms * NumStatements: 198805837 * NumDistinctVals: 12,202,052 * NumShortLiterals: 61,100,900 * NumShortBNodes: 0 * NumShortURIs: 493514954 * TotalKeyBytes: 61010260 * TotalValBytes: 4945376779 * MaxCollisions: 2 * TotalCollisions: 17260 * Journal size: 7320764416 bytes * </pre> * * BSBM 200M using memory manager (high tide of 351M) and 5 parser threads (plus * the main thread). Heap usage is pretty controlled. * * <pre> * Elapsed: 2803451ms * NumStatements: 198805837 * NumDistinctVals: 12202052 * NumShortLiterals: 61100900 * NumShortBNodes: 0 * NumShortURIs: 493514954 * TotalKeyBytes: 61010260 * TotalValBytes: 4945376779 * MaxCollisions: 2 * TotalCollisions: 17260 * Journal size: 7320764416 bytes * </pre> * * BSBM 200M. Using memory manager and only one parser thread. This does run * significantly slower (55m versus 47m with two parser threads). It might not * be slower if we also ran against the single source file (this ran against the * split files) since each chunk placed onto the queue would then be full, but I * doubt that this will make that much difference. * * <pre> * Elapsed: 3300871ms * NumStatements: 198805837 * NumDistinctVals: 12049125 * NumShortLiterals: 61100900 * NumShortBNodes: 0 * NumShortURIs: 493514954 * TotalKeyBytes: 60245625 * TotalValBytes: 4877760110 * MaxCollisions: 2 * TotalCollisions: 16840 * Journal size: 7320764416 bytes * </pre> * * BSBM 200M. Using memory manager, one parser thread (the caller), and a single * source file. The question is whether we do better with a statement handler * that is only flushed incrementally (when full) compared to using 2 parsers * and flushing each time we reach the end of a 1M statement source file. Nope. * This was 77 minutes. (This was a fair comparison since the source files for * the split sources are compressed. So we really do better with two parsers and * split files) * * <pre> * /allocationCount=0 * /bufferCapacity=1000 * /bufferCount=232 * /extent=243269632 * /slotBytes=0 * /userBytes=0 * Elapsed: 4605950ms * NumStatements: 198808848 * NumDistinctVals: 12198222 * NumShortLiterals: 61103832 * NumShortBNodes: 0 * NumShortURIs: 493520581 * TotalKeyBytes: 60991110 * TotalValBytes: 4944322031 * MaxCollisions: 2 * TotalCollisions: 17264 * Journal size: 7320764416 bytes * </pre> * * TODO Try with only N bytes worth of the SHA hash code, leaving some bits left * over for partitioning URIs, Literals, and BNodes (for told bnode mode) and * for a counter to break ties when there is a hash collision. We should wind up * with an 8-12 byte termId which is collision proof and very well distributed. * * TODO Add bit flags at the front for {BLOB, URI, Literal, BNode} (BLOB being * the odd one out). If we move BLOBs out of the key range of other plain * literals, or literals of a given language code or datatype, then we can not * do an ordered scan of the literals anymore which is inclusive of the blobs. * There is a similar consequence of moving small literals into the statement * index. 
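 * <p>
 * For reference, such a flagged key could be formed by prefixing the hash code with a one byte type tag. This is a sketch only - the tag values are hypothetical and this utility does not currently write such keys:
 *
 * <pre>
 * final byte tag = 0x01; // e.g. 0x00=URI, 0x01=Literal, 0x02=BNode, 0x03=BLOB (assumed codes)
 * final byte[] key = keyBuilder.reset().appendSigned(tag)
 *         .append(value.hashCode()).appendSigned(counter).getKey();
 * </pre>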
* <p> * If we inline small unicode values (<32 bytes) and reserve the TERM2ID index * for large(r) values then we can approach a situation in which it serves * solely for blobs but with a tradeoff in size (of the statement indices) * versus indirection. * <p> * Large value promotion does not really let us handle large blobs * (multi-megabytes) in s/o as a 50 4M blobs would fill up a shard. There, I * think that we need to give the control over to the application and require it * to write on a shared resource (shared file system, S3, etc). The value * inserted into the index would then be just the pathname in the shared file * system or the URL of the S3 resource. This breaks the ACID decision boundary * though as the application has no means available to atomically decide that * the resource does not exist and hence create it. Even using a conditional * E-Tag on S3 would not work since it would have to have an index over the S3 * entities to detect a write-write conflict for the same data under different * URLs. * * @author thompsonbry */ public class HashCollisionUtility { private final static Logger log = Logger .getLogger(HashCollisionUtility.class); /** * An index mapping <code>hashCode(Value)+counter : Value</code>. This * provides a dictionary for RDF {@link Value}s encountered when loading * {@link Statement}s into the database. The counter provides a simple * mechanism for reconciling hash collisions. */ private final BTree termsIndex; private final LexiconConfiguration<BigdataValue> conf; private final BigdataValueFactory vf; /** * Counters for things that we track. */ private static class Counters { /** * #of statements visited. */ private final AtomicLong nstmts = new AtomicLong(); /** * The #of {@link URI}s whose <code>localName</code> was short enough * that we decided to inline them into the statement indices instead. */ private final AtomicLong nshortURIs = new AtomicLong(); /** * The #of {@link BNode}s whose <code>ID</code> was short enough that we * decided to inline them into the statement indices instead (this also * counts blank nodes which are inlined because they have integer or * UUID IDs). */ private final AtomicLong nshortBNodes = new AtomicLong(); /** * The #of {@link Literal}s which were short enough that we decided to * inline them into the statement indices instead. */ private final AtomicLong nshortLiterals = new AtomicLong(); // private final ConcurrentWeakValueCacheWithBatchedUpdates<Value, // BigdataValue> valueCache; /** * The size of the hash collision set for the RDF Value with the most * hash collisions observed to date. */ private final AtomicLong maxCollisions = new AtomicLong(); /** * The total #of hash collisions. */ private final AtomicLong totalCollisions = new AtomicLong(); // /** // * The #of RDF {@link Value}s which were found in the {@link // #valueCache}, // * thereby avoiding a lookup against the index. // */ // private final AtomicLong ncached = new AtomicLong(); /** * The #of distinct RDF {@link Value}s inserted into the index. */ private final AtomicLong ninserted = new AtomicLong(); /** The total #of bytes in the generated B+Tree keys (leaves only). */ private final AtomicLong totalKeyBytes = new AtomicLong(); /** The total #of bytes in the serialized RDF Values. 
*/ private final AtomicLong totalValBytes = new AtomicLong(); } // class Counters //// private interface IHashCode { //// void hashCode(IKeyBuilder keyBuilder,Object o); //// } // // private static class Int32HashCode { //implements IHashCode { // // public void hashCode(IKeyBuilder keyBuilder, Object o) { // // keyBuilder.append(o.hashCode()); // // } // // } // // private static class MessageDigestHashCode { //implements IHashCode { // // final MessageDigest d; // // MessageDigestHashCode() throws NoSuchAlgorithmException { // // d = MessageDigest.getInstance("SHA-256"); // 256 bits (32 bytes) // // } // // public void hashCode(IKeyBuilder keyBuilder, final byte[] b) { // // d.reset(); // d.digest(b); // keyBuilder.append(d.digest()); // // } // // } /** * Lock used to coordinate {@link #shutdown()} and the {@link #valueQueue}. */ private final ReentrantLock lock = new ReentrantLock(); /** * Latch which is incremented as we accept files to parse and decremented * once a parser begins to parse that file. */ private final Latch parserQueueLatch = new Latch(lock); /** * Latch which is incremented once we begin to parse a file and decremented * as the parser task completes. */ private final Latch parserRunLatch = new Latch(lock); /** * Thread pool used to run the parser. */ private final ExecutorService parserService; /** * Thread pool used to run the parser and indexer. */ private final ExecutorService indexerService; /** * Class hooks the runnable to provide reporting on the outcome of the * {@link FutureTask}. */ private class ReportingFutureTask<V> extends FutureTask<V> { public final File file; public ReportingFutureTask(final File file, Callable<V> callable) { super(callable); this.file = file; parserQueueLatch.inc(); } public void run() { try { parserRunLatch.inc(); parserQueueLatch.dec(); super.run(); parserRunLatch.dec(); } finally { report(this); } } /** * Callback is invoked when a {@link ParseFileTask} completes. * * @param task * The future for that task. */ protected void report(final ReportingFutureTask<?> task) { try { task.get(); if (log.isDebugEnabled()) log.debug("Finished parsing: " + task.file + ", queueLatch=" + parserQueueLatch + ", runLatch=" + parserRunLatch); } catch (ExecutionException ex) { log.error(ex, ex); } catch (InterruptedException e) { // propagate the interrupt. Thread.currentThread().interrupt(); } } } /** * A {@link Bucket} has an <code>unsigned byte[]</code> key and an unordered * list of <code>long</code> addrs for <code>byte[]</code> values. * {@link Bucket} implements {@link Comparable} can can be used to place an * array of {@link Bucket}s into ascending key order. * * TODO This is space efficient for large {@link Value}s, but it would not * be efficient for storing binding sets which hash to the same key. In the * case of binding sets, the binding sets are normally small. An extensible * hash table would conserve space by dynamically determining the #of hash * bits in the address, and hence mapping the records onto a smaller #of * pages. */ static private class Bucket implements Comparable<Bucket> { /** * The <code>unsigned byte[]</code> key. */ public final byte[] key; /** * The list of addresses for this bucket. * * TODO Collisions in a bucket are very rare given an int32 hash code, * so this should be optimized for the common case with a single * address. 
*/ public final List<Long> addrs = new LinkedList<Long>(); public Bucket(final byte[] key) { if(key == null) throw new IllegalArgumentException(); this.key = key; } public Bucket(final byte[] key,final long addr) { this(key); addrs.add(addr); } /** * Add an address to this bucket. * * @param addr * The address. */ public void add(final long addr) { addrs.add(addr); } /** * Order {@link Bucket}s into ascending <code>unsigned byte[]</code> key * order. */ public int compareTo(final Bucket o) { return BytesUtil.compareBytes(key, o.key); } } /** * A chunk of RDF {@link Value}s from the parser which are ready to be * inserted into the TERMS index. */ static private class ValueBuffer { /** * The allocation contexts which can be released once these data have * been processed. */ private final Set<IMemoryManager> contexts = new LinkedHashSet<IMemoryManager>(); /** * The #of distinct records in the addrMap (this is more than the map * size if there are hash collisions since some buckets will have more * than one entry). */ private final int nvalues; /** * A map from the <code>unsigned byte[]</code> keys to the collision * bucket containing the address of each record for a given * <code>unsigned byte[]</code> key. */ private final Map<byte[]/* key */, Bucket> addrMap; /** * * @param contexts * The allocation contexts for the records in the addrMap. * @param nvalues * The #of distinct records in the addrMap (this is more than * the map size if there are hash collisions since some * buckets will have more than one entry). * @param addrMap * A map from the <code>unsigned byte[]</code> keys to the * collision bucket containing the address of each record for * a given <code>unsigned byte[]</code> key. */ public ValueBuffer(final List<IMemoryManager> contexts, final int nvalues, final Map<byte[], Bucket> addrMap) { if (contexts == null) throw new IllegalArgumentException(); if (addrMap == null) throw new IllegalArgumentException(); this.contexts.addAll(contexts); this.nvalues = nvalues; this.addrMap = addrMap; } /** * Clear the address map and the {@link IMemoryManager} allocation * contexts against which the data were stored. */ public void clear() { addrMap.clear(); for(IMemoryManager context : contexts) { context.clear(); } } public long getUserBytes() { long nbytes = 0L; for(IMemoryManager context : contexts) { nbytes += context.getUserBytes(); } return nbytes; } } // class ValueBuffer /** * Queue used to hand off {@link ValueBuffer}s from the parser to the * indexer. */ private BlockingQueue<ValueBuffer> valueQueue; /** * Counters for things that we track. */ private final Counters c = new Counters(); /** * The upper bound on the size of a {@link ValueBuffer} chunk (currently in * slotBytes for the allocations against the {@link MemoryManager}). * <p> * The size of the chunks, the capacity of the queue, and the number of * chunks that may be combined into a single chunk for the indexer may all * be used to adjust the parallelism and efficiency of the parsing and * indexing. You have to be careful not to let too much data onto the Java * heap, but the indexer will do better when it is given a bigger chunk * since it can order the data and be more efficient in the index updates. */ final int valBufSize = Bytes.megabyte32 * 10;// 100000; /** Capacity of the {@link #valueQueue}. */ final int valQueueCapacity = 10; /** * Maximum #of chunks to drain from the {@link #valueQueue} in one go. This * bounds the largest chunk that we will index at one go. 
You can remove the * limit by specifying {@link Integer#MAX_VALUE}. */ final int maxDrain = 5; /** * The size of the read buffer when reading a file. */ final int fileBufSize = 1024 * 8;// default 8k /** * How many parser threads to use. There can be only one parser per file, * but you can parse more than one file at a time. */ final int nparserThreads = 1; /** * The size of the work queue for the {@link #parserService}. * <p> * Note: This should be large enough that we will not wait around forever if * the caller is forced to parse a file rather than scan the file system for * the next file to be parsed. This hack is introduced by the need to handle * a {@link RejectedExecutionException} from the {@link #parserService}. We * do that by forcing the parse task to run in the caller's thread. Another * choice would be for the caller to catch the * {@link RejectedExecutionException}, wait a bit, and then retry. */ final int parserWorkQueueCapacity = 100; /** * A direct memory heap used to buffer RDF {@link Value}s which will be * inserted into the TERMS index. A distinct child {@link IMemoryManager} * context is created by the {@link StatementHandler} each time it needs to * buffer data. The {@link StatementHandler} monitors the size of the * allocation context to decide when it is "big enough" to be transferred * onto the {@link #valueQueue}. The indexer eventually obtains the context * from the {@link #valueQueue}. Once the indexer is done with a context, it * {@link IMemoryManager#clear() clears} the context. The total memory * across the allocation contexts is released back to the * {@link DirectBufferPool} in {@link #shutdown()} and * {@link #shutdownNow()} and no later than when the {@link #mmgr} is * finalized. */ final MemoryManager mmgr; /** * The #of buffers to give to the {@link MemoryManager}. */ private final int nbuffers = 1000; private HashCollisionUtility(final Journal jnl) { this.termsIndex = getTermsIndex(jnl); /* * Set up the parser thread pool. If there is an attempt to run more * parse tasks than the pool allows, the rejected task is run in the * caller's thread (see the CallerRunsPolicy below). * * Note: The pool size is one less than the total #of specified threads * since the caller will wind up running tasks rejected by the pool. If * the pool would be empty then it is [null] and the caller will run * the parser in its own thread. * * Note: The work queue is bounded so that we do not read too far ahead * in the file system. The #of threads is bounded so that we do not run too * many parsers at once. However, running multiple parsers can increase * throughput as the parser itself caps out at ~ 68k tps. */ if (nparserThreads > 1) { // this.parserService = // Executors.newFixedThreadPool(nparserThreads); final int corePoolSize = nparserThreads-1; final int maximumPoolSize = nparserThreads-1; final long keepAliveTime = 60; final TimeUnit unit = TimeUnit.SECONDS; final BlockingQueue<Runnable> workQueue = new LinkedBlockingQueue<Runnable>( parserWorkQueueCapacity); // final BlockingQueue<Runnable> workQueue = new SynchronousQueue<Runnable>(); this.parserService = new ThreadPoolExecutor(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, new ThreadPoolExecutor.CallerRunsPolicy() ); } else { /* * The caller must run the parser in its own thread. */ this.parserService = null; } // But they all feed the same indexer.
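/*
 * The handoff from the parser(s) to that single indexer is a bounded
 * producer/consumer pattern built on the [valueQueue]. A minimal sketch of
 * its shape (illustrative only; [chunk] is a ValueBuffer produced by a
 * parser, the real producer is StatementHandler#flush() and the real
 * consumer is IndexerMainTask#call()):
 *
 *   final BlockingQueue<ValueBuffer> q = new LinkedBlockingQueue<ValueBuffer>(valQueueCapacity);
 *   // Producer (parser thread): blocks when the indexer falls behind.
 *   q.put(chunk);
 *   // Consumer (indexer thread): blocking take of the first chunk, then a
 *   // non-blocking drain of up to [maxDrain] more chunks for a bigger batch.
 *   final LinkedList<ValueBuffer> batch = new LinkedList<ValueBuffer>();
 *   batch.add(q.take());
 *   q.drainTo(batch, maxDrain);
 */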
this.indexerService = Executors.newSingleThreadExecutor(); // *blocking* queue of ValueBuffers to be indexed this.valueQueue = new LinkedBlockingQueue<ValueBuffer>( valQueueCapacity);// lock); vf = BigdataValueFactoryImpl.getInstance("test"); final BaseVocabulary vocab; try { vocab = (BaseVocabulary) Class.forName( AbstractTripleStore.Options.DEFAULT_VOCABULARY_CLASS) .getDeclaredConstructor(String.class) .newInstance(vf.getNamespace()); vocab.init(); } catch (Exception e) { throw new RuntimeException(e); } // factory does not support any extensions. final IExtensionFactory xFactory = new IExtensionFactory() { @Override public void init(final IDatatypeURIResolver resolver, final ILexiconConfiguration<BigdataValue> config) { // NOP } @Override @SuppressWarnings("rawtypes") public Iterator<IExtension<? extends BigdataValue>> getExtensions() { return Collections.emptyIterator(); } }; final InlineURIFactory uriFactory = new InlineURIFactory(); uriFactory.init(vocab); /* * Note: This inlines everything *except* xsd:dateTime, which * substantially reduces the data we will put into the index. * * @todo Do a special IExtension implementation to handle xsd:dateTime * since the DateTimeExtension uses the LexiconRelation to do its work. */ conf = new LexiconConfiguration<BigdataValue>( 256, // blobsThreshold true, // inlineXSDDatatypeLiterals true, // inlineTextLiterals 64, // maxInlineStringLength true, // inlineBNodes false, // inlineDateTimes TimeZone.getDefault(), // inlineDateTimesTimeZone false, // rejectInvalidXSDValues xFactory, // extension factory vocab, // predefined vocabulary vf, uriFactory, false, // GeoSpatial support null // GeoSpatial config ); // valueCache = new ConcurrentWeakValueCacheWithBatchedUpdates<Value, BigdataValue>( // 50000 // hard reference queue capacity // ); mmgr = new MemoryManager(DirectBufferPool.INSTANCE, nbuffers); } /** * Start the task which will index data as it is parsed. */ public void start() { lock.lock(); try { if (indexerTask != null) throw new IllegalStateException(); // start indexer. indexerTask = new FutureTask<Void>(new IndexerMainTask()); indexerService.submit(indexerTask); // allow parsers to run. parsing.set(true); } finally { lock.unlock(); } } /** * Future for the task which drains the {@link #valueQueue} and indexes * the {@link ValueBuffer}s drained from that queue. */ private FutureTask<Void> indexerTask; /** Flag is <code>true</code> while parsers are still running. */ private final AtomicBoolean parsing = new AtomicBoolean(false); /** * Poison pill used to indicate that no more objects will be placed onto the * {@link #valueQueue}. */ private final ValueBuffer poisonPill = new ValueBuffer( new LinkedList<IMemoryManager>(), 0, new LinkedHashMap<byte[], Bucket>()); /** * Normal shutdown. Running parsers will complete and their data will be * indexed, but new parsers will not start. This method will block until * all data has been indexed. * * @throws Exception */ public void shutdown() throws Exception { log.debug("shutting down..."); lock.lock(); try { if (log.isDebugEnabled()) log.debug("Waiting on parserQueueLatch: " + parserQueueLatch); parserQueueLatch.await(); if (parserService != null) { // no new parsers may start parserService.shutdown(); } if (log.isDebugEnabled()) log.debug("Waiting on parserRunLatch: " + parserRunLatch); parserRunLatch.await(); // no parsers should be running. parsing.set(false); // drop a poison pill on the queue. 
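/*
 * Sketch of the consumer side of this poison pill protocol (the actual loop
 * is in IndexerMainTask#call(), which drains a batch and then removes the
 * pill from the drained collection):
 *
 *   final ValueBuffer b = valueQueue.take();
 *   if (b == poisonPill) {
 *       // No more chunks will ever arrive - index what we have and exit.
 *   }
 */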
log.debug("Inserting poison pill."); valueQueue.put(poisonPill); if (indexerTask != null) { // wait for the indexer to finish. indexerTask.get(); } if (indexerService != null) indexerService.shutdown(); if (mmgr != null) { if (log.isInfoEnabled()) log.info(mmgr.getCounters().toString()); mmgr.clear(); } } finally { lock.unlock(); } log.debug("all done."); } /** * Immediate shutdown. Running tasks will be canceled. * * @throws Exception */ public void shutdownNow() throws Exception { log.debug("shutdownNow"); parsing.set(false); if (parserService != null) parserService.shutdownNow(); if (indexerService != null) indexerService.shutdownNow(); if (indexerTask != null) { indexerTask.cancel(true/* mayInterruptIfRunning */); } if (mmgr != null) { mmgr.clear(); } } /** * Task drains the valueQueue and runs an {@link IndexerTask} each time * something is drained from that queue. * * @author thompsonbry */ private class IndexerMainTask implements Callable<Void> { public Void call() throws Exception { boolean done = false; while (!done) { try { // Blocking take so we know that there is something ready. final ValueBuffer first = valueQueue.take(); // Drain queue, but keep an eye out for that poison pill. final LinkedList<ValueBuffer> coll = new LinkedList<ValueBuffer>(); // The element we already took from the queue. coll.add(first); // Drain (non-blocking). final int ndrained = valueQueue.drainTo(coll, maxDrain) + 1; if (log.isInfoEnabled()) log.info("Drained " + ndrained + " chunks with " + valueQueue.size() + " remaining in the queue."); // look for and remove poison pill, noting if found. if (coll.remove(poisonPill)) { if (log.isDebugEnabled()) log.debug("Found poison pill."); done = true; // fall through and index what we already have. } if (!coll.isEmpty()) { // combine the buffers into a single chunk. final ValueBuffer b = combineChunks(coll); if (log.isDebugEnabled()) log.debug("Will index " + coll.size() + " chunks having " + b.nvalues + " values in " + b.getUserBytes() + " bytes"); // Now index that chunk. new IndexValueBufferTask(mmgr, b, termsIndex, vf, c) .call(); } } catch (Throwable t) { log.error(t, t); HashCollisionUtility.this.shutdownNow(); throw new RuntimeException(t); } } // while(!done) log.debug("done."); return (Void) null; } /** * Combine chunks from the queue into a single chunk. */ private ValueBuffer combineChunks(final LinkedList<ValueBuffer> coll) { final ValueBuffer b; if (coll.size() == 1) { // There is only one chunk. b = coll.getFirst(); } else { // Combine together into a single chunk. int nvalues = 0; for (ValueBuffer t : coll) nvalues += t.nvalues; final List<IMemoryManager> contexts = new LinkedList<IMemoryManager>(); final LinkedHashMap<byte[], Bucket> addrMap = new LinkedHashMap<byte[], Bucket>(); // int off = 0; for (ValueBuffer t : coll) { contexts.addAll(t.contexts); nvalues += t.nvalues; for(Bucket bucket : t.addrMap.values()) { final Bucket tmp = addrMap.get(bucket.key); if(tmp == null) { // copy bucket. addrMap.put(bucket.key, bucket); } else { // merge bucket. tmp.addrs.addAll(bucket.addrs); } } // System // .arraycopy(t.keys/* src */, 0/* srcPos */, // keys/* dest */, off/* destPos */, // t.nvalues/* length */); // // System // .arraycopy(t.addrs/* src */, 0/* srcPos */, // addrs/* dest */, off/* destPos */, // t.nvalues/* length */); // // off += t.nvalues; } b = new ValueBuffer(contexts, nvalues, addrMap); } return b; } } // class IndexerMainTask /** * Return the index in which we store RDF {@link Value}s. * * @param jnl * The index manager. 
* * @return The index. */ /* * TODO CanonicalHuffmanRabaCoder for U1 drops the average leaf size * * @ m=512 from 24k to 16k. Experiment with performance tradeoff * when compared with gzip of the record. * * No apparent impact for U1 on the leaves or nodes for 32 versus 8 * on the front-coded raba. * * Dropping maxRecLen from 256 to 64 reduces the leaves from 16k to * 10k. Dropping it to ZERO (0) reduces the leaves to 5k. This * suggests that we could to much better if we keep all RDF Values * out of the index. In standalone, we can give people a TermId * which is the raw record address. However, in scale-out it needs * to be the key (to locate the shard) and we will resolve the RDF * Value using the index on the shard. * * Suffix compression would allow us to generalize the counter and * avoid index space costs when collisions are rare while being able * to tolerate more collisions (short versus byte). U1: m=800, q=8000, ratio=8, maxRecLen=0, Elapsed: 41340ms NumStatements: 1000313 NumDistinctVals: 291259 TotalKeyBytes: 1747554 TotalValBytes: 60824514 MaxCollisions: 1 TotalCollisions: 6 Journal size: 209715200 bytes Average node: 9813 Average leaf: 6543 U1: m=800, q=8000, ratio=32, maxRecLen=0, Elapsed: 40971ms NumStatements: 1000313 NumDistinctVals: 291259 TotalKeyBytes: 1747554 TotalValBytes: 60824514 MaxCollisions: 1 TotalCollisions: 6 Journal size: 209715200 bytes Average node: 9821 Average leaf: 6478 U1: m=800, q=8000, ratio=64, maxRecLen=0, Elapsed: 41629ms NumStatements: 1000313 NumDistinctVals: 291259 TotalKeyBytes: 1747554 TotalValBytes: 60824514 MaxCollisions: 1 TotalCollisions: 6 Journal size: 209715200 bytes Average node: 9822 Average leaf: 6467 U1: m=512, q=8000, ratio=32, maxRecLen=0, Elapsed: 44722ms NumStatements: 1000313 NumDistinctVals: 291259 TotalKeyBytes: 1747554 TotalValBytes: 60824514 MaxCollisions: 1 TotalCollisions: 6 Journal size: 209715200 bytes Average node/leaf: 3969 4149 U1: m=512, q=8000, ratio=32, maxRecLen=0, Elapsed: 40519ms NumStatements: 1000313 NumDistinctVals: 291259 TotalKeyBytes: 1747554 TotalValBytes: 60824514 MaxCollisions: 1 TotalCollisions: 6 Journal size: 209715200 bytes Average node/leaf, node(min/max), leaf(min/max): 7583 8326 7583 7583 5755 14660 It would be great if we tracked the node/leaf data live on the RWStore for these counters so it could all be reported periodically (via http) or at the end in a summary. TODO The front compression of the keys is not helping out much since the keys are so sparse in the hash code space. It is a Good Thing that the keys are so sparse, but this suggests that we should try a different coder for the leaf keys. */ private BTree getTermsIndex(final Journal jnl) { final String name = "TERMS"; BTree ndx = jnl.getIndex(name); final int m = 1024; final int q = 8000; final int ratio = 32; final int maxRecLen = 0; if(ndx == null) { final IndexMetadata md = new IndexMetadata(name, UUID.randomUUID()); md.setNodeKeySerializer(new FrontCodedRabaCoder(ratio)); final DefaultTupleSerializer tupleSer = new DefaultTupleSerializer( new DefaultKeyBuilderFactory(new Properties()),// /* * leaf keys */ // DefaultFrontCodedRabaCoder.INSTANCE,// new FrontCodedRabaCoder(ratio),// // CanonicalHuffmanRabaCoder.INSTANCE, /* * leaf values */ CanonicalHuffmanRabaCoder.INSTANCE // new SimpleRabaCoder()// ); md.setTupleSerializer(tupleSer); // enable raw record support. md.setRawRecords(true); // set the maximum length of a byte[] value in a leaf. 
md.setMaxRecLen(maxRecLen); /* * increase the branching factor since leaf size is smaller w/o * large records. */ md.setBranchingFactor(m); // Note: You need to give sufficient heap for this option! md.setWriteRetentionQueueCapacity(q); ndx = jnl.registerIndex(name, md); } return ndx; } private void parseFileOrDirectory(final File fileOrDir, final RDFFormat fallback) throws Exception { if (fileOrDir.isDirectory()) { final File[] files = fileOrDir.listFiles(); for (int i = 0; i < files.length; i++) { final File f = files[i]; parseFileOrDirectory(f, fallback); } return; } final File f = fileOrDir; final String n = f.getName(); RDFFormat fmt = RDFFormat.forFileName(n, fallback); if (fmt == null && n.endsWith(".zip")) { fmt = RDFFormat.forFileName(n.substring(0, n.length() - 4), fallback); } if (fmt == null && n.endsWith(".gz")) { fmt = RDFFormat.forFileName(n.substring(0, n.length() - 3), fallback); } if (fmt == null) { log.warn("Ignoring: " + f); return; } final StatementHandler stmtHandler = new StatementHandler(valBufSize, c, conf, vf, mmgr, valueQueue, parsing); final FutureTask<Void> ft = new ReportingFutureTask<Void>( f, new ParseFileTask(f, fallback, fileBufSize, vf, stmtHandler) ); if (parserService != null) { // run on the thread pool. parserService.submit(ft); } else { // Run in the caller's thread. ft.run(); // Test the Future. ft.get(); } } /** * Task parses a single file. * * @author thompsonbry */ private static class ParseFileTask implements Callable<Void> { private final File file; private final RDFFormat fallback; private final int fileBufSize; private final BigdataValueFactory vf; private final StatementHandler stmtHandler; public ParseFileTask(final File file, final RDFFormat fallback, final int fileBufSize, final BigdataValueFactory vf, final StatementHandler stmtHandler) { if (file == null) throw new IllegalArgumentException(); if (stmtHandler == null) throw new IllegalArgumentException(); this.file = file; this.fallback = fallback; this.fileBufSize = fileBufSize; this.vf = vf; this.stmtHandler = stmtHandler; } public Void call() throws Exception { parseFile(file); return (Void) null; } private void parseFile(final File file) throws IOException, RDFParseException, RDFHandlerException, NoSuchAlgorithmException, InterruptedException { if (!file.exists()) throw new RuntimeException("Not found: " + file); final RDFFormat format = RDFFormat.forFileName(file.getName(),fallback); if (format == null) throw new RuntimeException("Unknown format: " + file); if (log.isTraceEnabled()) log.trace("RDFFormat=" + format); final RDFParserFactory rdfParserFactory = RDFParserRegistry .getInstance().get(format); if (rdfParserFactory == null) throw new RuntimeException("No parser for format: " + format); final RDFParser rdfParser = rdfParserFactory.getParser(); rdfParser.setValueFactory(vf); rdfParser.setVerifyData(false); rdfParser.setStopAtFirstError(false); rdfParser.setDatatypeHandling(RDFParser.DatatypeHandling.IGNORE); rdfParser.setRDFHandler(stmtHandler); /* * Run the parser, which will cause statements to be inserted. */ if (log.isDebugEnabled()) log.debug("Parsing: " + file); InputStream is = new FileInputStream(file); try { is = new BufferedInputStream(is, fileBufSize); final boolean gzip = file.getName().endsWith(".gz"); if (gzip) is = new GZIPInputStream(is); final String baseURI = file.toURI().toString(); // parse the file rdfParser.parse(is, baseURI); } finally { is.close(); } } } /** * Helper class adds statements to the sail as they are visited by a parser. 
*/ static private class StatementHandler extends RDFHandlerBase { // private static final transient Logger log = HashCollisionUtility.log; /** * Various counters that we track. */ private final Counters c; /** The lexicon configuration. */ private final LexiconConfiguration<BigdataValue> conf; /** * Blocking queue to which we add {@link ValueBuffer} instances as they * are generated by the parser. */ final BlockingQueue<ValueBuffer> valueQueue; /** * <code>true</code> iff the parser is permitted to run and * <code>false</code> if the parser should terminate. */ final AtomicBoolean parsing; /** * Used to build the keys (just a hash code). */ private final IKeyBuilder keyBuilder = KeyBuilder.newInstance(); /** Used to serialize RDF Values as byte[]s. */ private final DataOutputBuffer out = new DataOutputBuffer(); /** Used to serialize RDF Values as byte[]s. */ private final ByteArrayBuffer tbuf = new ByteArrayBuffer(); /** Used to serialize RDF Values as byte[]s. */ private final BigdataValueSerializer<BigdataValue> valSer; /** * Used to (de-)compress the raw values. * <p> * Note: This is not thread-safe, even for decompression. You need a * pool or thread-local instance to support concurrent reads against the * TERMS index. */ private final RecordCompressor compressor = new RecordCompressor( Deflater.BEST_SPEED); // /** Buffer for (serialized) RDF Values. */ // private KV[] values; /** #of buffered values. */ private int nvalues = 0; /** The memory manager. */ private final IMemoryManager memoryManager; /** The current allocation context. */ private IMemoryManager context = null; /** * Map of distinct values in the buffer. * * TODO In addition to enforcing DISTINCT over the Values in the * ValueBuffer, an LRU/LIRS cache would be nice here so we can reuse the * frequently resolved (BigdataValue => IV) mappings across buffer * instances. * * FIXME We need to provide a canonicalizing mapping for blank nodes. * * TODO The key should also include the URI,Literal,BNode, etc. prefix * bits (or is this necessary any more?). */ private Map<byte[]/*key*/,Bucket> addrMap; /** The size of the {@link #values} buffer when it is allocated. */ private final int valueBufSize; public StatementHandler(// final int valueBufSize, final Counters c, final LexiconConfiguration<BigdataValue> conf, final BigdataValueFactory vf, final IMemoryManager memoryManager, final BlockingQueue<ValueBuffer> valueQueue, final AtomicBoolean parsing) { this.valueBufSize = valueBufSize; this.c = c; this.conf = conf; this.memoryManager = memoryManager; this.valueQueue = valueQueue; this.parsing = parsing; this.valSer = vf.getValueSerializer(); } public void endRDF() { if(log.isTraceEnabled()) log.trace("End of source."); try { flush(); } catch (InterruptedException e) { throw new RuntimeException(e); } } public void handleStatement(final Statement stmt) throws RDFHandlerException { if (!parsing.get()) { // Either shutdown or never started. throw new IllegalStateException(); } try { bufferValue((BigdataValue) stmt.getSubject()); bufferValue((BigdataValue) stmt.getPredicate()); bufferValue((BigdataValue) stmt.getObject()); if (stmt.getContext() != null) { bufferValue((BigdataValue) stmt.getContext()); } } catch (InterruptedException ex) { // Interrupted while blocked on the valueQueue throw new RDFHandlerException(ex); } c.nstmts.incrementAndGet(); } /** * If the RDF {@link Value} can not be represented inline within the * statement indices, then buffer the value for batch resolution against * the TERMS index. 
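 * <p>
 * Illustration of that decision (a sketch; it assumes the BigdataValueFactory
 * [vf] and LexiconConfiguration [conf] created in the enclosing class, which
 * inlines text literals up to 64 characters, and a hypothetical
 * [veryLongString] over that limit):
 *
 * <pre>
 * final BigdataValue shortLit = vf.createLiteral("abc");
 * conf.createInlineIV(shortLit);  // returns an inline IV, also set on the value.
 * final BigdataValue longLit = vf.createLiteral(veryLongString);
 * conf.createInlineIV(longLit);   // returns null - must be resolved via TERMS.
 * </pre>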
* * @param value * The RDF {@link Value}. * * @return A {@link Value}. If the caller's {@link Value} could be * represented as an inline {@link IV}, then the returned value * will be a {@link BigdataValue} and the inline {@link IV} will * be available from {@link BigdataValue#getIV()}. Otherwise the * caller's {@link Value} is returned and the {@link Value} must * be resolved against the TERMS index in order to obtain its * {@link IV}. * * @throws InterruptedException * * FIXME Handle {@link BlobIV}, {@link PartlyInlineURIIV}, and * {@link PartlyInlineTypedLiteralIV}. These are three kinds of * "non-inline" values. They will have to be queued for * insertion into the TERMS index and Statement which depend * on those non-inline values will have to be deferred until * we have resolved those non-inline values. This is * basically the same logic that we already have for * StatementBuffer, except that an asynchronous queue is * being used (by this class) to do the resolution of the IV * for large values. * <p> * Other kinds of {@link IV}s which could be handled here * would be references to large values stored in the file * system, in S3, etc. */ private void bufferValue(final BigdataValue value) throws InterruptedException { // Not expecting the IV to already be cached. assert value.getIV() == null; // Attempt to inline this value. final IV<?, ?> iv = conf.createInlineIV(value); if (iv != null) { // This is being inlined. switch (iv.getVTE()) { case URI: c.nshortURIs.incrementAndGet(); break; case BNODE: c.nshortBNodes.incrementAndGet(); break; case LITERAL: c.nshortLiterals.incrementAndGet(); break; default: throw new AssertionError(); } // Verify IV is cached on that Value. assert value.getIV() == iv; return; } if (context != null && context.getSlotBytes() >= valueBufSize) { // Incremental flush of large values to the TERMS index. flush(); } if (context == null) { // Lazy allocation of the buffer. context = memoryManager.createAllocationContext(); addrMap = new LinkedHashMap<byte[], Bucket>(); } /* * Generate a key (hash code) and value (serialized and compressed) * from the BigdataValue. */ final KV t = makeKV(value); /* * Lookup the list of addresses for RDF Values which hash to the * same key. */ Bucket bucket = addrMap.get(t.key); if (bucket == null) { /* * No match on that hash code key. */ // lay the record down on the memory manager. final long addr = context.allocate(ByteBuffer.wrap(t.val)); // add new bucket to the map. addrMap.put(t.key, bucket = new Bucket(t.key, addr)); nvalues++; } else { /* * Either a hash collision or the value is already stored at * a known address. */ { for (Long addr : bucket.addrs) { if (context.allocationSize(addr) != t.val.length) { // Non-match based on the allocated record size. continue; } /* * TODO It would be more efficient to compare the data * using the zero-copy get(addr) method. */ final byte[] tmp = context.read(addr); if (BytesUtil.bytesEqual(t.val, tmp)) { // We've already seen this Value. if (log.isDebugEnabled()) log.debug("Duplicate value in chunk: " + Arrays.toString(t.val)); /* * FIXME This pattern does not really work out for * building statements since we lack a reference to * the Value which is being inserted into the TERMS * index. The StatementBuffer handles this. It keeps * the Values in a map and inserts all values into * the database. [It should only keep the distinct * non-inline values but it currently keeps all * distinct values without regard to inlining.] */ return; } } // Fall through - there is no such record on the store. 
} // lay the record down on the memory manager. bucket.add(context.allocate(ByteBuffer.wrap(t.val))); nvalues++; } return; } // bufferValue() /** * Transfer a non-empty buffer to the {@link #valueQueue}. * * @throws InterruptedException */ void flush() throws InterruptedException { if (nvalues == 0) return; if (!parsing.get()) { // Either shutdown or never started. throw new IllegalStateException(); } if (log.isInfoEnabled()) log.info("Adding chunk with " + nvalues + " values and " + context.getUserBytes() + " bytes to queue."); /* * Create an object which encapsulates the allocation context (to be * cleared when the data have been consumed) and the address map. */ final List<IMemoryManager> contexts = new LinkedList<IMemoryManager>(); contexts.add(context); // put the buffer on the queue (blocking operation). valueQueue.put(new ValueBuffer(contexts, nvalues, addrMap)); // clear reference since we just handed off the data. context = null; addrMap = null; nvalues = 0; // clear distinct value set so it does not build for ever. // distinctValues.clear(); // addrMap.clear(); } private KV makeKV(final BigdataValue r) { byte[] val = valSer.serialize(r, out.reset(), tbuf); /* * FIXME In order support conditional compression we will have to * mark the record with a header to indicate whether or not * it is compressed. Without that header we can not * deserialize a record resolved via its TermId since we * will not know whether or not it is compressed (actually, * that could be part of the termId....) */ if (compressor != null) {//&& val.length > 64) { // compress, reusing [out]. out.reset(); compressor.compress(val, out); } // if (out.pos() < val.length) // TODO Use compressed version iff smaller. { val = out.toByteArray(); } /* * Note: This is an exclusive lower bound (it does not include the * counter). * * TODO We could format the counter in here as a ZERO (0) since it * is a fixed length value and then patch it up later. That would * involve less copying. */ final byte[] key = buildKey(r, val).getKey(); return new KV(key, val); } // makeKV() private IKeyBuilder buildKey(final Value r, final byte[] val) { // if (true) { /* * Simple 32-bit hash code based on the byte[] representation of * the RDF Value. */ final int hashCode = r.hashCode(); return keyBuilder.reset().append(hashCode); // } else { // // /* // * Message digest of the serialized representation of the RDF // * Value. // * // * TODO There are methods to copy out the digest (hash code) // * without memory allocations. getDigestLength() and // * getDigest(out,start,len). // */ // private final MessageDigest d; // // try { // // d = MessageDigest.getInstance("SHA-256"); // 256 bits (32 bytes) // // } catch (NoSuchAlgorithmException e) { // // throw new RuntimeException(e); // // } // // // final byte[] hashCode = d.digest(val); // // return keyBuilder.reset().append(hashCode); // // } } // buildKey } // class StatementHandler /** * Index a {@link ValueBuffer}. */ private static class IndexValueBufferTask implements Callable<Void> { /** * The {@link MemoryManager} against which the allocations were made. */ private final MemoryManager mmgr; /** * The data to be indexed. */ private final ValueBuffer vbuf; /** * The index to write on. */ private final BTree termsIndex; /** Counters for things that we track. */ private final Counters c; /** * Used to build the keys. */ private final IKeyBuilder keyBuilder = KeyBuilder.newInstance(); // /** Used to serialize RDF Values as byte[]s. 
*/ // private final DataOutputBuffer out = new DataOutputBuffer(); /** Used to de-serialize RDF Values (debugging only). */ private final BigdataValueSerializer<BigdataValue> valSer; /** * Used to de-compress the raw values (debugging only). * <p> * Note: This is not thread-safe, even for decompression. You need a * pool or thread-local instance to support concurrent reads against the * TERMS index. */ private final RecordCompressor compressor; public IndexValueBufferTask(final MemoryManager mmgr, final ValueBuffer vbuf, final BTree termsIndex, final BigdataValueFactory vf, final Counters c) { if(mmgr == null) throw new IllegalArgumentException(); if(vbuf == null) throw new IllegalArgumentException(); if(termsIndex== null) throw new IllegalArgumentException(); if(vf == null) throw new IllegalArgumentException(); if(c == null) throw new IllegalArgumentException(); this.mmgr = mmgr; this.vbuf = vbuf; this.termsIndex = termsIndex; this.c = c; /* * Note: debugging only. */ this.valSer = vf.getValueSerializer(); this.compressor = new RecordCompressor(Deflater.BEST_SPEED); } public Void call() throws Exception { final long begin = System.currentTimeMillis(); if (log.isInfoEnabled()) log.info("Indexing " + vbuf.nvalues + " values occupying " + vbuf.getUserBytes() + " bytes"); /* * Place into sorted order by the keys. * * The Bucket implements Comparable. We extract the buckets, sort * them, and then process them. */ final Bucket[] a = vbuf.addrMap.values().toArray(new Bucket[0]); Arrays.sort(a); // Index the values. for (int i = 0; i <a.length; i++) { final Bucket b = a[i]; // The key for that bucket. final byte[] baseKey = keyBuilder.reset().append(b.key).getKey(); // All records for that bucket. for (long addr : b.addrs) { // Materialize the byte[] from the memory manager. final byte[] val = mmgr.read(addr); addValue(baseKey, val); } } if (log.isInfoEnabled()) { final long elapsed = System.currentTimeMillis() - begin; log.info("Indexed " + vbuf.nvalues + " values occupying " + vbuf.getUserBytes() + " bytes in " + elapsed + "ms"); } // release the address map and backing allocation context. vbuf.clear(); return (Void) null; } /** * Insert a record into the TERMS index. * * @param baseKey * The base key for the hash code (without the counter * suffix). * * @param val * The (serialized and compressed) RDF Value. */ private void addValue(final byte[] baseKey, final byte[] val) { /* * This is the fixed length hash code prefix. When a collision * exists we can either append a counter -or- use more bits from the * prefix. An extensible hash index works by progressively * increasing the #of bits from the hash code which are used to * create a distinction in the index. Records with identical hash * values are stored in an (unordered, and possibly chained) bucket. * We can approximate this by using N-bits of the hash code for the * key and then increasing the #of bits in the key when there is a * hash collision. Unless a hash function is used which has * sufficient bits available to ensure that there are no collisions, * we may be forced eventually to append a counter to impose a * distinction among records which are hash identical but whose * values differ. * * In the case of a hash collision, we can determine the records * which have already collided using the fast range count between * the hash code key and the fixed length successor of that key. We * can create a guaranteed distinct key by creating a BigInteger * whose values is (#collisions+1) and appending it to the key. 
This * approach will give us keys whose byte length increases slowly as * the #of collisions grows (though these might not be the minimum * length keys - depending on how we are encoding the BigInteger in * the key.) * * When we have a hash collision, we first need to scan all of the * collision records and make sure that none of those records has * the same value as the given record. This is done using the fixed * length successor of the hash code key as the exclusive upper * bound of a key range scan. Each record associated with a tuple in * that key range must be compared for equality with the given * record to decide whether or not the given record already exists * in the index. * * The fromKey is strictly LT any full key for the hash code of this * val but strictly GT any key having a hash code LT the hash code of * this val. * * TODO The [fromKey] and [toKey] could reuse a pair of buffers to * reduce heap churn, especially since they are FIXED length keys. * The fromKey would have to be formed more intelligently as we do * not have a version of SuccessorUtil#successor() which works with * a byte offset and length. */ final byte[] fromKey = baseKey; // key strictly LT any successor of the hash code of this val. final byte[] toKey = SuccessorUtil.successor(fromKey.clone()); // fast range count. this tells us how many collisions there are. // this is an exact collision count since we are not deleting tuples // from the TERMS index. final long rangeCount = termsIndex.rangeCount(fromKey, toKey); if (rangeCount >= Byte.MAX_VALUE) { /* * Impose a hard limit on the #of hash collisions we will accept * in this utility. * * @todo We do not need to have a hard limit if we use * BigInteger for the counter, but the performance will go * through the floor if we have to scan 32k entries on a hash * collision! */ throw new RuntimeException("Too many hash collisions: ncoll=" + rangeCount); } // force range count into (signed) byte final byte counter = (byte) rangeCount; if (rangeCount == 0) { /* * This is the first time we have observed a Value which * generates this hash code, so append a [byte] ZERO (0) to * generate the actual key and then insert the Value into the * index. Since there is nothing in the index for this hash * code, no collision is possible and we do not need to test the * index for the value before inserting the value into the * index. */ final byte[] key = keyBuilder.reset().append(fromKey).appendSigned( counter).getKey(); if (termsIndex.insert(key, val) != null) { throw new AssertionError(); } c.ninserted.incrementAndGet(); c.totalKeyBytes.addAndGet(key.length); c.totalValBytes.addAndGet(val.length); return; } /* * Iterate over that key range. * * TODO Filter for the value of interest so we can optimize the scan * by comparing with the value without causing it to be * materialized; in particular, we should be able to efficiently reject * tuples where the byte[] value length is known to differ from * a given length, including when the value is stored as a raw * record at which point we are doing a fast rejection based on * comparing the byteCount(addr) for the raw record with the target * byte count for the value that we are seeking in the index. * * We can visit something iff the desired tuple already exists (same * length, and possibly the same data). If we visit nothing then we * know that we have to insert a tuple and we know the counter value * from the collision count.
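 * * Worked example (sketch): suppose value.hashCode() == 0x2A and two Values * with that hash code are already in the index. Assuming KeyBuilder's usual * flip-the-sign-bit encoding of signed values, the keys look like: * * fromKey = [80 00 00 2A] (int32 hash code prefix) * toKey = [80 00 00 2B] (fixed length successor of fromKey) * rangeCount = 2 (existing entries under that prefix) * key = [80 00 00 2A 82] (prefix + one byte counter = 2) * * (The exact byte images depend on the IKeyBuilder encoding; only the layout * matters here.)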
*/ final ITupleIterator<?> itr = termsIndex.rangeIterator(fromKey, toKey, 0/* capacity */, IRangeQuery.VALS, null/* filter */); boolean found = false; while(itr.hasNext()) { final ITuple<?> tuple = itr.next(); // raw bytes final byte[] tmp = tuple.getValue(); if (false) System.out.println(getValue(tmp)); // Note: Compares the compressed values ;-) if(BytesUtil.bytesEqual(val, tmp)) { found = true; break; } } if(found) { // Already in the index. return; } /* * Hash collision. */ if (rangeCount > c.maxCollisions.get()) { // Raise the maximum collision count. c.maxCollisions.set(rangeCount); log.warn("MAX COLLISIONS NOW: " + c.maxCollisions.get()); } final byte[] key = keyBuilder.reset().append(fromKey).appendSigned( counter).getKey(); // Insert into the index. if (termsIndex.insert(key, val) != null) { throw new AssertionError(); } c.ninserted.incrementAndGet(); c.totalKeyBytes.addAndGet(key.length); c.totalValBytes.addAndGet(val.length); c.totalCollisions.incrementAndGet(); if (rangeCount > 128) { // arbitrary limit to log @ WARN. log.warn("Collision: hashCode=" + BytesUtil.toString(key) + ", nstmts="+c.nstmts + ", nshortLiterals=" + c.nshortLiterals + ", nshortURIs=" + c.nshortURIs + ", ninserted=" + c.ninserted + ", totalCollisions=" + c.totalCollisions + ", maxCollisions=" + c.maxCollisions + ", ncollThisTerm=" + rangeCount + ", resource=" + getValue(val)); } else if (log.isDebugEnabled()) log.debug("Collision: hashCode=" + BytesUtil.toString(key) + ", nstmts="+c.nstmts + ", nshortLiterals=" + c.nshortLiterals + ", nshortURIs=" + c.nshortURIs + ", ninserted=" + c.ninserted + ", totalCollisions=" + c.totalCollisions + ", maxCollisions=" + c.maxCollisions + ", ncollThisTerm=" + rangeCount + ", resource=" + getValue(val)); } /** * Decompress and deserialize a {@link Value}. * * @param tmp * The serialized and compressed value. * * @return The {@link Value}. */ private Value getValue(final byte[] tmp) { // decompress final ByteBuffer b = compressor.decompress(tmp); final byte[] c = new byte[b.limit()]; b.get(c); // deserialize. return valSer.deserialize(c); } } // class IndexValueBufferTask /** * Parse files, inserting {@link Value}s into indices and counting hash * collisions. * * @param args * filename(s) * * @throws IOException * @throws RDFHandlerException * @throws RDFParseException * @throws NoSuchAlgorithmException */ public static void main(final String[] args) throws Exception { Banner.banner(); // check args. { for (String filename : args) { final File file = new File(filename); if (!file.exists()) throw new RuntimeException("Not found: " + file); } } final long begin = System.currentTimeMillis(); final Properties properties = new Properties(); properties.setProperty(Journal.Options.BUFFER_MODE, BufferMode.DiskRW .toString()); properties.setProperty(Journal.Options.INITIAL_EXTENT, "" + (Bytes.megabyte * 200)); // properties.setProperty(Journal.Options.COLLECT_PLATFORM_STATISTICS,"true"); // properties.setProperty(Journal.Options.COLLECT_QUEUE_STATISTICS,"true"); properties.setProperty(Journal.Options.HTTPD_PORT,"8081"); // The caller MUST specify the filename using -D on the command line. 
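/*
 * Example invocation (a sketch; the journal path and data file below are
 * hypothetical, and -D<Journal.Options.FILE> stands for the actual name of
 * that property):
 *
 *   java -server -Xmx2g -D<Journal.Options.FILE>=/tmp/terms.jnl \
 *        com.bigdata.rdf.internal.HashCollisionUtility /data/dataset.nt.gz
 */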
final String journalFile = System.getProperty(Journal.Options.FILE); if (journalFile == null) { System.err.println("Journal file must be specified: -D" + Journal.Options.FILE); System.exit(1); } properties.setProperty(Journal.Options.FILE, journalFile); if (new File(journalFile).exists()) { System.err.println("Removing old journal: " + journalFile); new File(journalFile).delete(); } final Journal jnl = new Journal(properties); final RDFFormat fallback = RDFFormat.N3; HashCollisionUtility u = null; try { u = new HashCollisionUtility(jnl); u.start(); for (String filename : args) { u.parseFileOrDirectory(new File(filename), fallback); } // // flush anything left in the buffer. // u.stmtHandler.flush(); // shutdown and block until all data is indexed. u.shutdown(); jnl.commit(); } catch (Throwable t) { if (u != null) u.shutdownNow(); throw new RuntimeException(t); } finally { jnl.close(); final long elapsed = System.currentTimeMillis() - begin; System.out.println("Elapsed: " + elapsed + "ms"); if (u != null) { System.out.println("NumStatements: " + u.c.nstmts); System.out.println("NumDistinctVals: " + u.c.ninserted); System.out.println("NumShortLiterals: " + u.c.nshortLiterals); System.out.println("NumShortBNodes: " + u.c.nshortBNodes); System.out.println("NumShortURIs: " + u.c.nshortURIs); // System.out.println("NumCacheHit: " + u.ncached); System.out.println("TotalKeyBytes: " + u.c.totalKeyBytes); System.out.println("TotalValBytes: " + u.c.totalValBytes); System.out.println("MaxCollisions: " + u.c.maxCollisions); System.out.println("TotalCollisions: " + u.c.totalCollisions); } if (new File(journalFile).exists()) { System.out.println("Journal size: " + new File(journalFile).length() + " bytes"); } } } }