/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Feb 15, 2012
*/
package com.bigdata.rdf.internal.encoder;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicReference;
import org.openrdf.model.Value;
import com.bigdata.bop.BOp;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IConstant;
import com.bigdata.bop.IVariable;
import com.bigdata.bop.IndexAnnotations;
import com.bigdata.bop.ap.Predicate;
import com.bigdata.btree.BTree;
import com.bigdata.btree.BloomFilterFactory;
import com.bigdata.btree.Checkpoint;
import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.Tuple;
import com.bigdata.btree.keys.ASCIIKeyBuilderFactory;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.btree.raba.codec.FrontCodedRabaCoder;
import com.bigdata.btree.raba.codec.SimpleRabaCoder;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.internal.IVCache;
import com.bigdata.rdf.internal.IVUtility;
import com.bigdata.rdf.internal.impl.BlobIV;
import com.bigdata.rdf.internal.impl.TermId;
import com.bigdata.rdf.internal.impl.literal.LiteralExtensionIV;
import com.bigdata.rdf.lexicon.BlobsIndexHelper;
import com.bigdata.rdf.lexicon.BlobsTupleSerializer;
import com.bigdata.rdf.lexicon.Id2TermTupleSerializer;
import com.bigdata.rdf.lexicon.LexiconRelation;
import com.bigdata.rdf.model.BigdataValue;
import com.bigdata.rdf.model.BigdataValueFactory;
import com.bigdata.rdf.model.BigdataValueFactoryImpl;
import com.bigdata.rdf.sparql.ast.eval.AST2BOpUtility;
import com.bigdata.util.Bytes;
/**
* A concrete implementation using scalable {@link BTree}s to store the mapping
* from an {@link IV} to the cached RDF {@link Value}. This approach is useful
* when you will be encoding a LOT of data and you need to get the cached RDF
* {@link Value} objects off of the JVM heap.
* <p>
* Note: Two different {@link BTree} instances are used. One for {@link TermId}s
* and another for {@link BlobIV}s. These indices use exactly the same schema as
* the ID2TERM and BLOBS indices.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*
* This class is out of use. As part of https://jira.blazegraph.com/browse/BLZG-1899,
* we carried out some experiments illustrating that caching the IVs (i.e., preserving
* them in HTree hash joins) is far slower than just re-materializing them. We decided
* to retire the class and, instead, do remove variables from the doneSet within the
* {@link AST2BOpUtility} update process whenever we set up an analytic hash join.
*/
@Deprecated
public class IVBindingSetEncoderWithIVCache extends IVBindingSetEncoder {
/**
* The namespace of the {@link LexiconRelation} IFF we need to maintain
* the {@link #ivCache}.
*/
private final String namespace;
/**
* The {@link BigdataValueFactory} for the {@link LexiconRelation} IFF we
* need to maintain the {@link #ivCache}.
*/
private final BigdataValueFactory valueFactory;
/**
* The set of variables for which materialized {@link IV}s have been
* observed.
*/
final protected LinkedHashSet<IVariable<?>> ivCacheSchema;
/**
* A cache mapping from non-inline {@link IV}s ({@link TermId}s and
* {@link BlobIV}s) whose {@link IVCache} association was set to the
* corresponding {@link BigdataValue}. Used to batch updates into
* the ID2TERM and BLOBS indices.
*/
final Map<IV<?, ?>, BigdataValue> cache;
/**
* The {@link IV}:{@link BigdataValue} mapping for non-{@link BlobIV}s. This
* captures any cached BigdataValue references encountered on {@link IV}s.
* This map does not store duplicate entries for the same {@link IV}.
* <p>
* Note: This is precisely the same mapping we use for the ID2TERM index.
*/
private final AtomicReference<BTree> ivCache = new AtomicReference<BTree>();
/**
* The {@link IV}:{@link BigdataValue} mapping for {@link BlobIV}s with
* cached {@link BigdataValue}s. This captures any cached BigdataValue
* references encountered on {@link BlobIV}s. This map does not store
* duplicate entries for the same {@link IV}.
* <p>
* Note: This is precisely the same mapping we use for the BLOBS index.
*/
private final AtomicReference<BTree> blobsCache = new AtomicReference<BTree>();
/**
* The {@link IV}:{@link BigdataValue} mapping for {@link LiteralExtensionIV}s
* with cached {@link BigdataValue}s. This captures any cached BigdataValue
* references encountered on {@link LiteralExtensionIV}s. This map does not
* store duplicate entries for the same {@link IV}.
*/
private final AtomicReference<BTree> literalExtensionIVCache = new AtomicReference<BTree>();
public String toString() {
/*
* Note: schema and ivCacheSchema are not thread-safe, so it is not safe
* to show their state here. A concurrent modification exception could
* easily be thrown. Not synchronizing on these things is currently
* valued more than observing their contents outside of a debugger.
*/
final StringBuilder sb = new StringBuilder();
sb.append(getClass().getSimpleName());
sb.append("{namespace=" + namespace);
if (ivCache.get() != null)
sb.append(",ivCacheSize=" + getIVCacheSize());
if (blobsCache.get() != null)
sb.append(",blobCacheSize=" + getBlobsCacheSize());
sb.append("}");
return sb.toString();
}
private long getIVCacheSize() {
final BTree ndx = ivCache.get();
if (ndx != null) {
return ndx.getEntryCount();
}
return 0L;
}
private long getLiteralExtensionIVCacheSize() {
final BTree ndx = literalExtensionIVCache.get();
if (ndx != null) {
return ndx.getEntryCount();
}
return 0L;
}
private long getBlobsCacheSize() {
final BTree ndx = blobsCache.get();
if (ndx != null) {
return ndx.getEntryCount();
}
return 0L;
}
/**
* Setup the {@link IndexMetadata} for {@link #ivCache}.
* <p>
* Note: This is basically the same setup as the ID2TERM index.
*/
private IndexMetadata getIVCacheIndexMetadata(final BOp op) {
final IndexMetadata metadata = new IndexMetadata(UUID.randomUUID());
final int branchingFactor = 256;// TODO Config/tune.
final int ratio = 32; // TODO Config/tune.
metadata.setBranchingFactor(branchingFactor);
metadata.setWriteRetentionQueueCapacity(op.getProperty(
IndexAnnotations.WRITE_RETENTION_QUEUE_CAPACITY,
IndexAnnotations.DEFAULT_WRITE_RETENTION_QUEUE_CAPACITY));
metadata.setTupleSerializer(new Id2TermTupleSerializer(namespace,
valueFactory, new ASCIIKeyBuilderFactory(Bytes.SIZEOF_LONG),//
new FrontCodedRabaCoder(ratio), SimpleRabaCoder.INSTANCE));
// a bloom filter should help avoid lookups when IVs do not have cached
// values.
metadata.setBloomFilterFactory(BloomFilterFactory.DEFAULT);
if (true) {
// enable raw record support.
metadata.setRawRecords(true);
/*
* Very small RDF values can be inlined into the index, but after
* that threshold we want to have the values out of line on the
* backing store.
*
* TODO Tune this and the threshold at which we use the BLOBS index
* instead.
*/
metadata.setMaxRecLen(16);
}
return metadata;
}
/**
* Setup the {@link IndexMetadata} for {@link #literalExtensionIVCache}.
* <p>
* Note: This is basically the same setup as the ID2TERM index.
*/
private IndexMetadata getLiteralExtensionIVCacheIndexMetadata(final BOp op) {
final IndexMetadata metadata = new IndexMetadata(UUID.randomUUID());
final int branchingFactor = 256;// TODO Config/tune.
final int ratio = 32; // TODO Config/tune.
metadata.setBranchingFactor(branchingFactor);
metadata.setWriteRetentionQueueCapacity(op.getProperty(
IndexAnnotations.WRITE_RETENTION_QUEUE_CAPACITY,
IndexAnnotations.DEFAULT_WRITE_RETENTION_QUEUE_CAPACITY));
metadata.setTupleSerializer(new Id2TermTupleSerializer(namespace,
valueFactory, new ASCIIKeyBuilderFactory(Bytes.SIZEOF_LONG),//
new FrontCodedRabaCoder(ratio), SimpleRabaCoder.INSTANCE));
// a bloom filter should help avoid lookups when IVs do not have cached
// values.
metadata.setBloomFilterFactory(BloomFilterFactory.DEFAULT);
if (true) {
// enable raw record support.
metadata.setRawRecords(true);
/*
* Very small RDF values can be inlined into the index, but after
* that threshold we want to have the values out of line on the
* backing store.
*
* TODO Tune this and the threshold at which we use the BLOBS index
* instead.
*/
metadata.setMaxRecLen(16);
}
return metadata;
}
/**
* Setup the {@link IndexMetadata} for {@link #blobsCache}.
* <p>
* Note: This is basically the same setup as the BLOBS index.
*/
private IndexMetadata getBlobsCacheIndexMetadata(final BOp op) {
final IndexMetadata metadata = new IndexMetadata(UUID.randomUUID());
metadata.setTupleSerializer(new BlobsTupleSerializer(namespace,
valueFactory));
// enable raw record support.
metadata.setRawRecords(true);
/*
* The presumption is that we are storing large literals (blobs) in this
* index so we always want to write them on raw records rather than have
* them be inline in the leaves of the index.
*/
metadata.setMaxRecLen(0);
/*
* TODO The default branching factor for this index should probably be
* pretty big. All of the values are on raw records, so it is just the
* keys in the index and the have a fixed width (8 bytes).
*/
final int branchingFactor = 256;
metadata.setBranchingFactor(branchingFactor);
metadata.setWriteRetentionQueueCapacity(op.getProperty(
IndexAnnotations.WRITE_RETENTION_QUEUE_CAPACITY,
IndexAnnotations.DEFAULT_WRITE_RETENTION_QUEUE_CAPACITY));
// a bloom filter should help avoid lookups when IVs do not have cached
// values.
metadata.setBloomFilterFactory(BloomFilterFactory.DEFAULT);
return metadata;
}
/**
* @param store
* The backing {@link IRawStore} for the {@link IV} to
* {@link BigdataValue} cache.
* @param filter
* <code>true</code> iff this is in support of a DISTINCT filter.
* <p>
* Note: we do not maintain the {@link #ivCacheSchema} for a
* DISTINCT filter since the original solutions flow through the
* filter.
* @param op
* The operator whose annotations are used to parameterize the
* creation of the backing indices for the {@link IV} to
* {@link BigdataValue} cache.
*/
public IVBindingSetEncoderWithIVCache(final IRawStore store,
final boolean filter, final BOp op) {
super(BigdataValueFactoryImpl.getInstance(((String[]) op
.getRequiredProperty(Predicate.Annotations.RELATION_NAME))[0]), filter);
if (!filter) {
/*
* Setup the IV => BigdataValue mapping. This captures any cached
* BigdataValue references encountered on IVs. This map does not store
* duplicate entries for the same IV.
*/
namespace = ((String[]) op
.getRequiredProperty(Predicate.Annotations.RELATION_NAME))[0];
valueFactory = BigdataValueFactoryImpl.getInstance(namespace);
ivCache.set(BTree.create(store, getIVCacheIndexMetadata(op)));
literalExtensionIVCache.set(BTree.create(store, getLiteralExtensionIVCacheIndexMetadata(op)));
blobsCache.set(BTree.create(store, getBlobsCacheIndexMetadata(op)));
ivCacheSchema = new LinkedHashSet<IVariable<?>>();
cache = new HashMap<IV<?, ?>, BigdataValue>();
} else {
namespace = null;
valueFactory = null;
ivCacheSchema = null;
cache = null;
}
}
/**
* {@inheritDoc}
* <p>
* This maintains the {@link IVCache} associations UNLESS
* <i>filter:=true</code> was specified to the constructor.
*/
@Override
public boolean isValueCache() {
return !filter;
}
/**
* Checkpoint the {@link BTree} instance(s) used to buffer the cached
* {@link IV} to RDF {@link Value} mappings and then re-load the them in a
* read-only mode from their checkpoint(s). This exposes a view of the
* {@link BTree} instance(s) which is safe for concurrent readers.
*/
public void saveSolutionSet() {
flush();
checkpointBTree(ivCache);
checkpointBTree(literalExtensionIVCache);
checkpointBTree(blobsCache);
}
private void checkpointBTree(final AtomicReference<BTree> ref) {
final BTree tmp = ref.get();
if (tmp != null) {
// Checkpoint the HTree.
final Checkpoint checkpoint = tmp.writeCheckpoint2();
final BTree readOnly = BTree.load(tmp.getStore(),
checkpoint.getCheckpointAddr(), true/* readOnly */);
// Get a read-only view of the HTree.
if (!ref.compareAndSet(tmp/* expect */, readOnly)) {
throw new IllegalStateException();
}
}
}
@Override
public void release() {
BTree tmp2 = ivCache.getAndSet(null/* newValue */);
if (tmp2 != null) {
tmp2.close();
}
tmp2 = blobsCache.getAndSet(null/* newValue */);
if (tmp2 != null) {
tmp2.close();
}
tmp2 = literalExtensionIVCache.getAndSet(null/* newValue */);
if (tmp2 != null) {
tmp2.close();
}
if (ivCacheSchema != null) {
ivCacheSchema.clear();
}
super.release();
}
/**
* {@inheritDoc}
* <p>
* Vectored update of the internal ivCache.
*/
@Override
public void flush() {
if (filter) {
super.flush();
return;
}
final BTree ivCache = this.ivCache.get();
final BTree blobsCache = this.blobsCache.get();
final BTree literalExtensionIVCache = this.literalExtensionIVCache.get();
// Lazily resolved.
BlobsIndexHelper h = null;
// Used to serialize RDF {@link Value}s.
final Id2TermTupleSerializer tupSer = (Id2TermTupleSerializer) ivCache
.getIndexMetadata().getTupleSerializer();
for (Map.Entry<IV<?, ?>, BigdataValue> e : cache.entrySet()) {
final IV<?, ?> iv = e.getKey();
final BigdataValue value = e.getValue();
if (iv instanceof BlobIV<?>) {
if (h == null) {
// Lazily resolved.
h = new BlobsIndexHelper();
}
/*
* Note: The insert logic for the BLOBS index here is different
* (and much simpler) because we already have the exact BlobIV
* and we want to ensure that there is an entry under that key
* in the [blobsCache].
*
* Normally, you use the helper class to do a conditional
* resolveOrAddValue() which assigns a collision counter for the
* blob, but here we already know the collision counter which
* was assigned by the real BLOBS index on the LexiconRelation.
*/
final IKeyBuilder keyBuilder = h.newKeyBuilder();
final byte[] key = iv.encode(keyBuilder.reset()).getKey();
final byte[] val = valueFactory.getValueSerializer().serialize(
value);
if (!blobsCache.contains(key)) {
blobsCache.insert(key, val);
}
/**
* BLZG-1899: we also need to encode literal extension IVs: these IVs
* require materialization although they are inlined, since
* their interpretation depends on the LexiconConfiguration
*/
} else if (iv instanceof LiteralExtensionIV) {
final IKeyBuilder keyBuilder = new KeyBuilder();
final byte[] key = iv.encode(keyBuilder.reset()).getKey();
final byte[] val = valueFactory.getValueSerializer().serialize(value);
if (!literalExtensionIVCache.contains(key)) {
literalExtensionIVCache.insert(key, val);
}
} else {
final byte[] key = tupSer.serializeKey(iv);
if (!ivCache.contains(key)) {
ivCache.insert(key, tupSer.serializeVal(value));
}
}
}
if (cache != null)
cache.clear();
super.flush();
}
/**
* {@inheritDoc}
*
* TODO If we vectored this operation it would substantially reduce its
* costs. We would have to collect up a bunch of solutions which needed
* resolution, then collect up the IVs which do not have cached values for
* variables which might have values in the ivCache. We would then sort the
* IVs and do a vectored resolution against the ivCache. Finally, the
* solutions could be output in a chunk with their resolved Values.
* <p>
* If the operator is not vectored, then we should just fold it into
* {@link #decodeSolution(byte[], int, int)}.
*/
@SuppressWarnings("rawtypes")
public void resolveCachedValues(final IBindingSet bset) {
final BTree ivCache = this.ivCache.get();
final BTree literalExtensionIVCache = this.literalExtensionIVCache.get();
final BTree blobsCache = this.blobsCache.get();
if ((ivCache == null || ivCache.getEntryCount() == 0L)
&& (literalExtensionIVCache == null || literalExtensionIVCache.getEntryCount() == 0L)
&& (blobsCache == null || blobsCache.getEntryCount() == 0L)) {
// Nothing materialized.
return;
}
final Id2TermTupleSerializer tupSer = (Id2TermTupleSerializer) ivCache
.getIndexMetadata().getTupleSerializer();
final IKeyBuilder keyBuilder = tupSer.getKeyBuilder();
final Tuple ivCacheTuple = new Tuple(ivCache, IRangeQuery.KEYS
| IRangeQuery.VALS);
// Lazily initialized.
BlobsIndexHelper h = null;
final Iterator<Map.Entry<IVariable, IConstant>> itr = bset.iterator();
while (itr.hasNext()) {
final Map.Entry<IVariable, IConstant> e = itr.next();
final IVariable<?> v = e.getKey();
if (!ivCacheSchema.contains(v)) {
// Nothing observed for that variable.
continue;
}
final IV iv = (IV) e.getValue().get();
if (iv.hasValue()) {
// Already cached.
continue;
}
keyBuilder.reset();
if (iv instanceof BlobIV<?>) {
final BlobIV<?> blobIV = (BlobIV<?>) iv;
if(h == null) {
h = new BlobsIndexHelper();
}
final byte[] val = h.lookup(blobsCache, blobIV, keyBuilder);
if (val == null) {
continue;
}
/*
* TODO Factor out the buffers used to do the de-serialization
* when we vector the resolution of IVs.
*/
final BigdataValue value = valueFactory.getValueSerializer()
.deserialize(val);
iv.setValue(value);
} else if (iv instanceof LiteralExtensionIV) {
final LiteralExtensionIV<?> literalExtensionIv = (LiteralExtensionIV<?>) iv;
IVUtility.encode(keyBuilder, literalExtensionIv);
final byte[] key = keyBuilder.getKey();
if (literalExtensionIVCache.lookup(key, ivCacheTuple) == null) {
continue;
}
final BigdataValue value = tupSer.deserialize(ivCacheTuple);
iv.setValue(value);
} else {
IVUtility.encode(keyBuilder, iv);
final byte[] key = keyBuilder.getKey();
if (ivCache.lookup(key, ivCacheTuple) == null) {
continue;
}
final BigdataValue value = tupSer.deserialize(ivCacheTuple);
iv.setValue(value);
}
}
}
@Override
void cacheSchemaAndValue(final IVariable<?> v, final IV<?,?> iv, final boolean updateCache) {
/**
* BLZG-1899: we need to materialize all IVs that require materialization;
* before, this condition was !iv.isInline(), which did not consider
* cases such as LiteralExtensionIVs that are inline but nevertheless
* need to be materialized
*/
if (iv.needsMaterialization() && iv.hasValue() && !filter) {
ivCacheSchema.add(v);
if (updateCache && cache != null)
cache.put(iv, iv.getValue());
}
}
}