/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Feb 15, 2012
 */

package com.bigdata.rdf.internal.encoder;

import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicReference;

import org.openrdf.model.Value;

import com.bigdata.bop.BOp;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IConstant;
import com.bigdata.bop.IVariable;
import com.bigdata.bop.IndexAnnotations;
import com.bigdata.bop.ap.Predicate;
import com.bigdata.btree.BTree;
import com.bigdata.btree.BloomFilterFactory;
import com.bigdata.btree.Checkpoint;
import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.Tuple;
import com.bigdata.btree.keys.ASCIIKeyBuilderFactory;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.btree.raba.codec.FrontCodedRabaCoder;
import com.bigdata.btree.raba.codec.SimpleRabaCoder;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.internal.IVCache;
import com.bigdata.rdf.internal.IVUtility;
import com.bigdata.rdf.internal.impl.BlobIV;
import com.bigdata.rdf.internal.impl.TermId;
import com.bigdata.rdf.internal.impl.literal.LiteralExtensionIV;
import com.bigdata.rdf.lexicon.BlobsIndexHelper;
import com.bigdata.rdf.lexicon.BlobsTupleSerializer;
import com.bigdata.rdf.lexicon.Id2TermTupleSerializer;
import com.bigdata.rdf.lexicon.LexiconRelation;
import com.bigdata.rdf.model.BigdataValue;
import com.bigdata.rdf.model.BigdataValueFactory;
import com.bigdata.rdf.model.BigdataValueFactoryImpl;
import com.bigdata.rdf.sparql.ast.eval.AST2BOpUtility;
import com.bigdata.util.Bytes;

/**
 * A concrete implementation using scalable {@link BTree}s to store the mapping
 * from an {@link IV} to the cached RDF {@link Value}. This approach is useful
 * when you will be encoding a LOT of data and you need to get the cached RDF
 * {@link Value} objects off of the JVM heap.
 * <p>
 * Note: Two different {@link BTree} instances are used. One for
 * {@link TermId}s and another for {@link BlobIV}s. These indices use exactly
 * the same schema as the ID2TERM and BLOBS indices.
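 * <p>
 * A minimal usage sketch (illustrative only, not taken from a real call site:
 * the <code>store</code>, <code>op</code>, and <code>bset</code> variables
 * are assumed to be supplied by the caller, and <code>op</code> must carry
 * the {@link Predicate.Annotations#RELATION_NAME} annotation):
 *
 * <pre>
 * final IVBindingSetEncoderWithIVCache encoder = new IVBindingSetEncoderWithIVCache(
 *         store, false, op); // filter := false, so IVCache associations are kept.
 * try {
 *     // Encode a solution, buffering any cached Values found on its IVs.
 *     final byte[] key = encoder.encodeSolution(bset);
 *     // Vectored update of the cache indices plus a checkpoint which makes
 *     // them safe for concurrent readers (saveSolutionSet() calls flush()).
 *     encoder.saveSolutionSet();
 *     // Decode, then re-attach the cached Values to the decoded IVs.
 *     final IBindingSet decoded = encoder.decodeSolution(key, 0, key.length);
 *     encoder.resolveCachedValues(decoded);
 * } finally {
 *     encoder.release();
 * }
 * </pre>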
 *
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 *
 * @deprecated This class is out of use. As part of
 *             https://jira.blazegraph.com/browse/BLZG-1899, we carried out
 *             experiments illustrating that caching the IVs (i.e., preserving
 *             them in HTree hash joins) is far slower than just
 *             re-materializing them. We decided to retire the class and,
 *             instead, remove variables from the doneSet within the
 *             {@link AST2BOpUtility} update process whenever we set up an
 *             analytic hash join.
 */
@Deprecated
public class IVBindingSetEncoderWithIVCache extends IVBindingSetEncoder {

    /**
     * The namespace of the {@link LexiconRelation} IFF we need to maintain
     * the {@link #ivCache}.
     */
    private final String namespace;

    /**
     * The {@link BigdataValueFactory} for the {@link LexiconRelation} IFF we
     * need to maintain the {@link #ivCache}.
     */
    private final BigdataValueFactory valueFactory;

    /**
     * The set of variables for which materialized {@link IV}s have been
     * observed.
     */
    protected final LinkedHashSet<IVariable<?>> ivCacheSchema;

    /**
     * A cache mapping from non-inline {@link IV}s ({@link TermId}s and
     * {@link BlobIV}s) whose {@link IVCache} association was set to the
     * corresponding {@link BigdataValue}. Used to batch updates into the
     * ID2TERM and BLOBS indices.
     */
    final Map<IV<?, ?>, BigdataValue> cache;

    /**
     * The {@link IV}:{@link BigdataValue} mapping for non-{@link BlobIV}s.
     * This captures any cached BigdataValue references encountered on
     * {@link IV}s. This map does not store duplicate entries for the same
     * {@link IV}.
     * <p>
     * Note: This is precisely the same mapping we use for the ID2TERM index.
     */
    private final AtomicReference<BTree> ivCache = new AtomicReference<BTree>();

    /**
     * The {@link IV}:{@link BigdataValue} mapping for {@link BlobIV}s with
     * cached {@link BigdataValue}s. This captures any cached BigdataValue
     * references encountered on {@link BlobIV}s. This map does not store
     * duplicate entries for the same {@link IV}.
     * <p>
     * Note: This is precisely the same mapping we use for the BLOBS index.
     */
    private final AtomicReference<BTree> blobsCache = new AtomicReference<BTree>();

    /**
     * The {@link IV}:{@link BigdataValue} mapping for
     * {@link LiteralExtensionIV}s with cached {@link BigdataValue}s. This
     * captures any cached BigdataValue references encountered on
     * {@link LiteralExtensionIV}s. This map does not store duplicate entries
     * for the same {@link IV}.
     */
    private final AtomicReference<BTree> literalExtensionIVCache = new AtomicReference<BTree>();

    @Override
    public String toString() {

        /*
         * Note: schema and ivCacheSchema are not thread-safe, so it is not
         * safe to show their state here. A concurrent modification exception
         * could easily be thrown. Not synchronizing on these things is
         * currently valued more than observing their contents outside of a
         * debugger.
         */
        final StringBuilder sb = new StringBuilder();
        sb.append(getClass().getSimpleName());
        sb.append("{namespace=" + namespace);
        if (ivCache.get() != null)
            sb.append(",ivCacheSize=" + getIVCacheSize());
        if (literalExtensionIVCache.get() != null)
            sb.append(",literalExtensionIVCacheSize="
                    + getLiteralExtensionIVCacheSize());
        if (blobsCache.get() != null)
            sb.append(",blobCacheSize=" + getBlobsCacheSize());
        sb.append("}");
        return sb.toString();

    }

    private long getIVCacheSize() {

        final BTree ndx = ivCache.get();

        if (ndx != null) {
            return ndx.getEntryCount();
        }

        return 0L;

    }

    private long getLiteralExtensionIVCacheSize() {

        final BTree ndx = literalExtensionIVCache.get();

        if (ndx != null) {
            return ndx.getEntryCount();
        }

        return 0L;

    }

    private long getBlobsCacheSize() {

        final BTree ndx = blobsCache.get();

        if (ndx != null) {
            return ndx.getEntryCount();
        }

        return 0L;

    }

    /**
     * Setup the {@link IndexMetadata} for {@link #ivCache}.
     * <p>
     * Note: This is basically the same setup as the ID2TERM index.
     */
    private IndexMetadata getIVCacheIndexMetadata(final BOp op) {

        final IndexMetadata metadata = new IndexMetadata(UUID.randomUUID());

        final int branchingFactor = 256; // TODO Config/tune.
        final int ratio = 32; // TODO Config/tune.
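
        /*
         * Note: branchingFactor is the B+Tree fan-out, while ratio is handed
         * to the FrontCodedRabaCoder below: it determines how many leaf keys
         * are front-coded (prefix-compressed) against each fully recorded
         * key, trading decode CPU for space.
         */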
        metadata.setBranchingFactor(branchingFactor);

        metadata.setWriteRetentionQueueCapacity(op.getProperty(
                IndexAnnotations.WRITE_RETENTION_QUEUE_CAPACITY,
                IndexAnnotations.DEFAULT_WRITE_RETENTION_QUEUE_CAPACITY));

        metadata.setTupleSerializer(new Id2TermTupleSerializer(namespace,
                valueFactory, new ASCIIKeyBuilderFactory(Bytes.SIZEOF_LONG),
                new FrontCodedRabaCoder(ratio), SimpleRabaCoder.INSTANCE));

        // A bloom filter should help avoid lookups when IVs do not have
        // cached values.
        metadata.setBloomFilterFactory(BloomFilterFactory.DEFAULT);

        if (true) {

            // Enable raw record support.
            metadata.setRawRecords(true);

            /*
             * Very small RDF values can be inlined into the index, but after
             * that threshold we want to have the values out of line on the
             * backing store.
             *
             * TODO Tune this and the threshold at which we use the BLOBS
             * index instead.
             */
            metadata.setMaxRecLen(16);

        }

        return metadata;

    }

    /**
     * Setup the {@link IndexMetadata} for {@link #literalExtensionIVCache}.
     * <p>
     * Note: This is basically the same setup as the ID2TERM index.
     */
    private IndexMetadata getLiteralExtensionIVCacheIndexMetadata(final BOp op) {

        final IndexMetadata metadata = new IndexMetadata(UUID.randomUUID());

        final int branchingFactor = 256; // TODO Config/tune.
        final int ratio = 32; // TODO Config/tune.

        metadata.setBranchingFactor(branchingFactor);

        metadata.setWriteRetentionQueueCapacity(op.getProperty(
                IndexAnnotations.WRITE_RETENTION_QUEUE_CAPACITY,
                IndexAnnotations.DEFAULT_WRITE_RETENTION_QUEUE_CAPACITY));

        metadata.setTupleSerializer(new Id2TermTupleSerializer(namespace,
                valueFactory, new ASCIIKeyBuilderFactory(Bytes.SIZEOF_LONG),
                new FrontCodedRabaCoder(ratio), SimpleRabaCoder.INSTANCE));

        // A bloom filter should help avoid lookups when IVs do not have
        // cached values.
        metadata.setBloomFilterFactory(BloomFilterFactory.DEFAULT);

        if (true) {

            // Enable raw record support.
            metadata.setRawRecords(true);

            /*
             * Very small RDF values can be inlined into the index, but after
             * that threshold we want to have the values out of line on the
             * backing store.
             *
             * TODO Tune this and the threshold at which we use the BLOBS
             * index instead.
             */
            metadata.setMaxRecLen(16);

        }

        return metadata;

    }

    /**
     * Setup the {@link IndexMetadata} for {@link #blobsCache}.
     * <p>
     * Note: This is basically the same setup as the BLOBS index.
     */
    private IndexMetadata getBlobsCacheIndexMetadata(final BOp op) {

        final IndexMetadata metadata = new IndexMetadata(UUID.randomUUID());

        metadata.setTupleSerializer(new BlobsTupleSerializer(namespace,
                valueFactory));

        // Enable raw record support.
        metadata.setRawRecords(true);

        /*
         * The presumption is that we are storing large literals (blobs) in
         * this index so we always want to write them on raw records rather
         * than have them be inline in the leaves of the index.
         */
        metadata.setMaxRecLen(0);

        /*
         * TODO The default branching factor for this index should probably be
         * pretty big. All of the values are on raw records, so it is just the
         * keys in the index and they have a fixed width (8 bytes).
         */
        final int branchingFactor = 256;
        metadata.setBranchingFactor(branchingFactor);

        metadata.setWriteRetentionQueueCapacity(op.getProperty(
                IndexAnnotations.WRITE_RETENTION_QUEUE_CAPACITY,
                IndexAnnotations.DEFAULT_WRITE_RETENTION_QUEUE_CAPACITY));

        // A bloom filter should help avoid lookups when IVs do not have
        // cached values.
        metadata.setBloomFilterFactory(BloomFilterFactory.DEFAULT);

        return metadata;

    }

    /**
     * @param store
     *            The backing {@link IRawStore} for the {@link IV} to
     *            {@link BigdataValue} cache.
     * @param filter
     *            <code>true</code> iff this is in support of a DISTINCT
     *            filter.
     *            <p>
     *            Note: We do not maintain the {@link #ivCacheSchema} for a
     *            DISTINCT filter since the original solutions flow through
     *            the filter.
     * @param op
     *            The operator whose annotations are used to parameterize the
     *            creation of the backing indices for the {@link IV} to
     *            {@link BigdataValue} cache.
     */
    public IVBindingSetEncoderWithIVCache(final IRawStore store,
            final boolean filter, final BOp op) {

        super(BigdataValueFactoryImpl.getInstance(((String[]) op
                .getRequiredProperty(Predicate.Annotations.RELATION_NAME))[0]),
                filter);

        if (!filter) {

            /*
             * Setup the IV => BigdataValue mapping. This captures any cached
             * BigdataValue references encountered on IVs. This map does not
             * store duplicate entries for the same IV.
             */
            namespace = ((String[]) op
                    .getRequiredProperty(Predicate.Annotations.RELATION_NAME))[0];

            valueFactory = BigdataValueFactoryImpl.getInstance(namespace);

            ivCache.set(BTree.create(store, getIVCacheIndexMetadata(op)));

            literalExtensionIVCache.set(BTree.create(store,
                    getLiteralExtensionIVCacheIndexMetadata(op)));

            blobsCache.set(BTree.create(store, getBlobsCacheIndexMetadata(op)));

            ivCacheSchema = new LinkedHashSet<IVariable<?>>();

            cache = new HashMap<IV<?, ?>, BigdataValue>();

        } else {

            namespace = null;
            valueFactory = null;
            ivCacheSchema = null;
            cache = null;

        }

    }

    /**
     * {@inheritDoc}
     * <p>
     * This maintains the {@link IVCache} associations UNLESS
     * <code>filter:=true</code> was specified to the constructor.
     */
    @Override
    public boolean isValueCache() {

        return !filter;

    }

    /**
     * Checkpoint the {@link BTree} instance(s) used to buffer the cached
     * {@link IV} to RDF {@link Value} mappings and then re-load them in a
     * read-only mode from their checkpoint(s). This exposes a view of the
     * {@link BTree} instance(s) which is safe for concurrent readers.
     */
    public void saveSolutionSet() {

        flush();

        checkpointBTree(ivCache);
        checkpointBTree(literalExtensionIVCache);
        checkpointBTree(blobsCache);

    }

    private void checkpointBTree(final AtomicReference<BTree> ref) {

        final BTree tmp = ref.get();

        if (tmp != null) {

            // Checkpoint the BTree.
            final Checkpoint checkpoint = tmp.writeCheckpoint2();

            // Get a read-only view of the BTree.
            final BTree readOnly = BTree.load(tmp.getStore(),
                    checkpoint.getCheckpointAddr(), true/* readOnly */);

            if (!ref.compareAndSet(tmp/* expect */, readOnly)) {

                throw new IllegalStateException();

            }

        }

    }

    @Override
    public void release() {

        BTree tmp2 = ivCache.getAndSet(null/* newValue */);

        if (tmp2 != null) {
            tmp2.close();
        }

        tmp2 = blobsCache.getAndSet(null/* newValue */);

        if (tmp2 != null) {
            tmp2.close();
        }

        tmp2 = literalExtensionIVCache.getAndSet(null/* newValue */);

        if (tmp2 != null) {
            tmp2.close();
        }

        if (ivCacheSchema != null) {
            ivCacheSchema.clear();
        }

        super.release();

    }

    /**
     * {@inheritDoc}
     * <p>
     * Vectored update of the internal ivCache.
     */
    @Override
    public void flush() {

        if (filter) {
            super.flush();
            return;
        }

        final BTree ivCache = this.ivCache.get();
        final BTree blobsCache = this.blobsCache.get();
        final BTree literalExtensionIVCache = this.literalExtensionIVCache.get();

        // Lazily resolved.
        BlobsIndexHelper h = null;

        // Used to serialize RDF Values.
        final Id2TermTupleSerializer tupSer = (Id2TermTupleSerializer) ivCache
                .getIndexMetadata().getTupleSerializer();

        for (Map.Entry<IV<?, ?>, BigdataValue> e : cache.entrySet()) {

            final IV<?, ?> iv = e.getKey();

            final BigdataValue value = e.getValue();

            if (iv instanceof BlobIV<?>) {

                if (h == null) {
                    // Lazily resolved.
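                    /*
                     * BlobsIndexHelper encapsulates the BLOBS index key
                     * format (flags + hash code of the Value + collision
                     * counter), so the same keys are generated here as by
                     * the real BLOBS index on the LexiconRelation.
                     */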
                    h = new BlobsIndexHelper();
                }

                /*
                 * Note: The insert logic for the BLOBS index here is
                 * different (and much simpler) because we already have the
                 * exact BlobIV and we want to ensure that there is an entry
                 * under that key in the [blobsCache].
                 *
                 * Normally, you use the helper class to do a conditional
                 * resolveOrAddValue() which assigns a collision counter for
                 * the blob, but here we already know the collision counter
                 * which was assigned by the real BLOBS index on the
                 * LexiconRelation.
                 */
                final IKeyBuilder keyBuilder = h.newKeyBuilder();

                final byte[] key = iv.encode(keyBuilder.reset()).getKey();

                final byte[] val = valueFactory.getValueSerializer().serialize(
                        value);

                if (!blobsCache.contains(key)) {

                    blobsCache.insert(key, val);

                }

            } else if (iv instanceof LiteralExtensionIV) {

                /*
                 * BLZG-1899: We also need to encode literal extension IVs.
                 * These IVs require materialization although they are
                 * inlined, since their interpretation depends on the
                 * LexiconConfiguration.
                 */
                final IKeyBuilder keyBuilder = new KeyBuilder();

                final byte[] key = iv.encode(keyBuilder.reset()).getKey();

                final byte[] val = valueFactory.getValueSerializer()
                        .serialize(value);

                if (!literalExtensionIVCache.contains(key)) {

                    literalExtensionIVCache.insert(key, val);

                }

            } else {

                final byte[] key = tupSer.serializeKey(iv);

                if (!ivCache.contains(key)) {

                    ivCache.insert(key, tupSer.serializeVal(value));

                }

            }

        }

        if (cache != null)
            cache.clear();

        super.flush();

    }

    /**
     * {@inheritDoc}
     *
     * TODO If we vectored this operation it would substantially reduce its
     * costs. We would have to collect up a bunch of solutions which needed
     * resolution, then collect up the IVs which do not have cached values for
     * variables which might have values in the ivCache. We would then sort
     * the IVs and do a vectored resolution against the ivCache. Finally, the
     * solutions could be output in a chunk with their resolved Values.
     * <p>
     * If the operator is not vectored, then we should just fold it into
     * {@link #decodeSolution(byte[], int, int)}.
     */
    @SuppressWarnings("rawtypes")
    public void resolveCachedValues(final IBindingSet bset) {

        final BTree ivCache = this.ivCache.get();
        final BTree literalExtensionIVCache = this.literalExtensionIVCache.get();
        final BTree blobsCache = this.blobsCache.get();

        if ((ivCache == null || ivCache.getEntryCount() == 0L)
                && (literalExtensionIVCache == null || literalExtensionIVCache
                        .getEntryCount() == 0L)
                && (blobsCache == null || blobsCache.getEntryCount() == 0L)) {
            // Nothing materialized.
            return;
        }

        final Id2TermTupleSerializer tupSer = (Id2TermTupleSerializer) ivCache
                .getIndexMetadata().getTupleSerializer();

        final IKeyBuilder keyBuilder = tupSer.getKeyBuilder();

        final Tuple ivCacheTuple = new Tuple(ivCache, IRangeQuery.KEYS
                | IRangeQuery.VALS);

        // Lazily initialized.
        BlobsIndexHelper h = null;

        final Iterator<Map.Entry<IVariable, IConstant>> itr = bset.iterator();

        while (itr.hasNext()) {

            final Map.Entry<IVariable, IConstant> e = itr.next();

            final IVariable<?> v = e.getKey();

            if (!ivCacheSchema.contains(v)) {
                // Nothing observed for that variable.
                continue;
            }

            final IV iv = (IV) e.getValue().get();

            if (iv.hasValue()) {
                // Already cached.
                continue;
            }

            keyBuilder.reset();

            if (iv instanceof BlobIV<?>) {

                final BlobIV<?> blobIV = (BlobIV<?>) iv;

                if (h == null) {
                    h = new BlobsIndexHelper();
                }

                final byte[] val = h.lookup(blobsCache, blobIV, keyBuilder);

                if (val == null) {
                    continue;
                }

                /*
                 * TODO Factor out the buffers used to do the de-serialization
                 * when we vector the resolution of IVs.
                 */
                final BigdataValue value = valueFactory.getValueSerializer()
                        .deserialize(val);

                iv.setValue(value);

            } else if (iv instanceof LiteralExtensionIV) {

                final LiteralExtensionIV<?> literalExtensionIv = (LiteralExtensionIV<?>) iv;

                IVUtility.encode(keyBuilder, literalExtensionIv);

                final byte[] key = keyBuilder.getKey();

                if (literalExtensionIVCache.lookup(key, ivCacheTuple) == null) {
                    continue;
                }

                final BigdataValue value = tupSer.deserialize(ivCacheTuple);

                iv.setValue(value);

            } else {

                IVUtility.encode(keyBuilder, iv);

                final byte[] key = keyBuilder.getKey();

                if (ivCache.lookup(key, ivCacheTuple) == null) {
                    continue;
                }

                final BigdataValue value = tupSer.deserialize(ivCacheTuple);

                iv.setValue(value);

            }

        }

    }

    @Override
    void cacheSchemaAndValue(final IVariable<?> v, final IV<?, ?> iv,
            final boolean updateCache) {

        /*
         * BLZG-1899: We need to materialize all IVs that require
         * materialization. Before, this condition was !iv.isInline(), which
         * did not consider cases such as LiteralExtensionIVs that are inline
         * but nevertheless need to be materialized.
         */
        if (iv.needsMaterialization() && iv.hasValue() && !filter) {

            ivCacheSchema.add(v);

            if (updateCache && cache != null)
                cache.put(iv, iv.getValue());

        }

    }

}