IVSolutionSetEncoder.java example

Explorer
blazegraph-master
- database-master
/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Feb 15, 2012
 */

package com.bigdata.rdf.internal.encoder;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.log4j.Logger;
import org.openrdf.model.Value;

import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IConstant;
import com.bigdata.bop.IVariable;
import com.bigdata.bop.bindingSet.ListBindingSet;
import com.bigdata.btree.keys.ASCIIKeyBuilderFactory;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.htree.HTree;
import com.bigdata.io.ByteArrayBuffer;
import com.bigdata.io.DataOutputBuffer;
import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.internal.IVCache;
import com.bigdata.rdf.internal.IVUtility;
import com.bigdata.rdf.internal.impl.TermId;
import com.bigdata.rdf.model.BigdataValue;
import com.bigdata.rdf.model.BigdataValueFactory;
import com.bigdata.rdf.model.BigdataValueFactoryImpl;
import com.bigdata.rdf.model.BigdataValueSerializer;
import com.bigdata.util.BytesUtil;

/**
 * This class provides fast, efficient serialization for solution sets. Each
 * solution must be an {@link IBindingSet} whose bound values are {@link IV}s
 * and their cached {@link BigdataValue}s. The {@link IV}s and the cached
 * {@link BigdataValue}s are efficiently and compactly represented in a format
 * suitable for chunked messages or streaming. Decode is a fast online process.
 * Both encode and decode require the maintenance of a map from the {@link IV}
 * having cached {@link BigdataValue}s to those cached values.
 * 
 * <h2>Record Format</h2>
 * 
 * The format is as follows:
 * 
 * <pre>
 * nbound
 * nvars
 * ncached 
 * (namespace)
 * var[0]...var[nvars-1]
 * bitmap-for-bound-variables
 * bitmap-for-IV-with-cached-Values
 * IV[0] ... IV[nbound-1]
 * Value[0] ... Value[ncached-1]
 * </pre>
 * 
 * where <code>nbound</code> is the #of bindings in the binding set. When zero,
 * the rest of the record is omitted.
 * <p>
 * where <code>nvars</code> is the #of new variables in this binding set. The
 * "schema" used to encode the bindings is based on the ordered set of variables
 * for which bindings are observed. The encoder writes this information out
 * incrementally. The decoder builds up this information as it decodes
 * solutions.
 * <p>
 * where <code>ncached</code> is the #of bindings in the binding set for which
 * there is a cached {@link BigdataValue} which has not already been written
 * into a previous record. Even if the {@link IV} has a cached
 * {@link BigdataValue}, if the {@link IV} has been previously written into a
 * record then the {@link IV} is NOT recorded in this record with a cached Value.
 * Further, if the {@link IV} appears more than once in a given record, the
 * cached value is only marked in the bitmap for the first such occurrence and
 * the cached value is only written into the record once.
 * <p>
 * where <code>namespace</code> is the namespace of the lexicon relation. This
 * is written out for the first solution having an {@link IVCache} association.
 * It is assumed that all {@link Value}s are {@link BigdataValue} for the same
 * lexicon relation. If no solutions have an {@link IVCache} association, then
 * the namespace will never be written into the encoded output.
 * <p>
 * where <code>var</code> is the name of a variable for which a binding was
 * first observed for the current solution. The names of the variables are
 * written in the order in which they are first observed. This forms the
 * implicit "schema" required to decode the {@link IV}[].
 * <p>
 * where <code>bitmap-for-bound-variables</code> is zero or more bytes providing
 * a bit map indicating those variables which are bound in this solution out of
 * the total set of variables which have been observed in the solutions
 * presented to this encoder.
 * <p>
 * where <code>bitmap-for-IV-with-cached-Values</code> is zero or more bytes
 * providing a bit map indicating which IVs are associated with cached values
 * written into the record. Whether or not an IV has a cached value must be
 * decided by the caller after processing the record and consulting an
 * (IV,Value) cache which they maintain over the set of records processed to
 * date. Cached values are written out (and the bit set) only the first time a
 * given IV with a cached Value is observed.
 * <p>
 * where <code>IV[n]</code> is an {@link IV} as encoded by {@link IVUtility}.
 * <p>
 * where {@link BigdataValue} is an RDF Value serialized using the
 * {@link BigdataValueSerializer} for the namespace of the lexicon.
 * 
 * <h2>Decode</h2>
 * 
 * The namespace of the lexicon is required in order to obtain the
 * {@link BigdataValueFactory} and {@link BigdataValueSerializer} used to decode
 * and materialize the cached {@link BigdataValue}s. This information can be
 * sent before the records if it is not known to the caller.
 * <p>
 * The decoder materializes the cached values into a map (either a HashMap or
 * HTree, as appropriate for the data scale) as the records are processed. Only
 * one solution needs to be decoded at a time, but the decoder must maintain the
 * (IV,Value) cache across all decoded records. There is no need to indicate the
 * #of records, but IChunkMessage#getSolutionCount() in fact reports exactly
 * that information.
 * <p>
 * Each solution can be turned into an {@link IBindingSet} at the time that it
 * is decoded. If we use a standard {@link ListBindingSet}, then we need to
 * resolve each {@link IV} against the {@link IV} cache, setting its RDF Value
 * as a side effect before returning the IBindingSet to the caller. If we do a
 * custom {@link IBindingSet} implementation, then the cached
 * {@link BigdataValue} could be lazily materialized by hooking
 * {@link IVCache#getValue()}. Either way, the life cycle of the materialized
 * objects will be very short unless they are propagated into new solutions.
 * Short life cycle objects entail very little heap burden.
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id: IVSolutionSetEncoder.java 6032 2012-02-16 12:48:04Z thompsonbry
 *          $
 * 
 * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/475">Optimize
 *      serialization for query messages on cluster </a>
 * 
 *      TODO There could be a completely different encoding when only a single
 *      variable is bound (column projection), especially if there are likely
 *      to be duplicate IVs. However, we still have to pass through the cached
 *      Value associations, which this does pretty efficiently.
 */
public class IVSolutionSetEncoder implements IBindingSetEncoder {

    private static final Logger log = Logger
            .getLogger(IVSolutionSetEncoder.class);

    /**
     * The ordered set of variables for which bindings have been observed, in
     * the order in which they were first observed. This is the implicit
     * "schema" which fixes the order in which the {@link IV}s of a solution
     * are written. Variables which are not bound in a given solution are
     * flagged in the per-record bit map rather than being written as a
     * {@link TermId#NullIV}.
     * <p>
     * Note: New variables may be discovered at any point, so the #of
     * {@link IV}s per record is variable. Anything which stores the encoded
     * solutions (e.g., an {@link HTree}) must allow for variable length
     * {@link IV}[]s.
     */
    private final LinkedHashSet<IVariable<?>> schema;

    /**
     * Used to store the {@link IVCache} associations. This allows us to elide
     * {@link BigdataValue}s which have already been written by this encoder
     * instance.
     */
    private final Map<IV<?, ?>, BigdataValue> cache;

    /**
     * Used to encode the {@link IV}s (reset for each solution).
     */
    private final IKeyBuilder keyBuilder;

    /**
     * Used to format the encoded records (reset each time).
     */
    private final DataOutputBuffer out;

    /**
     * Temporary buffer used by {@link BigdataValueSerializer}.
     */
    private final ByteArrayBuffer tmp;

    /**
     * The initial version.
     */
    static final int VERSION0 = 0x0;

    /**
     * The version used by this encoder. It is written once, in front of the
     * first record.
     */
    private final int version = VERSION0;

    /**
     * The #of solutions encoded to date.
     */
    private int nsolutions = 0;

    /*
     * The following fields are set when the first solution having an IVCache
     * association is processed.
     */

    /**
     * The namespace of the lexicon relation. This is discovered from the first
     * {@link IVCache} association and written out into the stream at that
     * point. If there are no {@link IVCache} associations then it is never set.
     */
    private String namespace;

    /**
     * Used to serialize the {@link BigdataValue}s for {@link IVCache}
     * associations. Set together with {@link #namespace}.
     */
    private BigdataValueSerializer<BigdataValue> valueSer;

    /**
     * Reports the namespace, the schema, the size of the {@link IVCache}
     * association cache and the #of solutions encoded so far. The state is read
     * without any synchronization.
     */
    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder();
        sb.append(super.toString());
        sb.append("{namespace=").append(namespace);
        sb.append(",schema=").append(schema); // Not thread-safe.
        sb.append(",cacheSize=").append(cache.size()); // Not thread-safe.
        sb.append(",nsolutions=").append(nsolutions);
        sb.append("}");
        return sb.toString();
    }

    public IVSolutionSetEncoder() {

        // The ordered set of variables for which bindings have been observed.
        this.schema = new LinkedHashSet<IVariable<?>>();

        // The IV -> BigdataValue cache
        this.cache = new HashMap<IV<?, ?>, BigdataValue>();

        // Used to encode the IVs.
        this.keyBuilder = new ASCIIKeyBuilderFactory(128).getKeyBuilder();

        // Used to format the encoded records (reset each time).
        this.out = new DataOutputBuffer();

        // Temp buffer used by the BigdataValueSerializer
        this.tmp = new ByteArrayBuffer();

    }

    /**
     * Encode the solution and append the encoded record to the given stream.
     * 
     * @param out
     *            The stream.
     * @param bset
     *            The solution.
     */
    public void encodeSolution(final DataOutputBuffer out,
            final IBindingSet bset) {

        out.append(encodeSolution(bset));

    }

    @Override
    public byte[] encodeSolution(final IBindingSet bset) {

        return encodeSolution(bset, true/* updateCache */);

    }

    /**
     * {@inheritDoc}
     * <p>
     * The record is laid out as: <code>[version]</code> (first record only),
     * <code>nbound</code>, and then, unless <code>nbound</code> is zero,
     * <code>nvars</code>, <code>ncached</code>, <code>[namespace]</code> (only
     * in the record where it is discovered), the names of the newly observed
     * variables, the bit map of the bound variables, the bit map of the
     * bindings having a newly written cached Value (only when
     * <code>ncached</code> is GT zero), the {@link IV}[] and finally the newly
     * cached {@link BigdataValue}s.
     * 
     * @param bset
     *            The solution (required).
     * @param updateCacheIsIgnored
     *            Ignored. The {@link IVCache} association cache is always
     *            updated.
     * 
     * @return A new byte[] containing the encoded record.
     * 
     * @throws IllegalArgumentException
     *             if the solution is <code>null</code>.
     * 
     *             TODO We typically use a {@link ListBindingSet}. If the
     *             {@link IBindingSet} is large enough, then it could be more
     *             efficient to create an {@link IVariable} to {@link IV} map
     *             within this method since we look up the binding of every
     *             variable in the schema.
     */
    @Override
    public byte[] encodeSolution(final IBindingSet bset,
            final boolean updateCacheIsIgnored) {

        if (bset == null)
            throw new IllegalArgumentException("bset is null");

        final boolean trace = log.isTraceEnabled();

        // Reset internal buffers.
        keyBuilder.reset();
        out.reset();

        if (0 == nsolutions++) {

            // Write out the version number before the first solution.
            out.packLong(version);

        }

        /*
         * Before we can encode the binding set, we need to update the schema
         * such that it captures any variables used in the binding set (plus any
         * variables which have been observed in previous binding sets). We also
         * note which variables are present for the first time in this solution.
         * We will need to write those onto the wire.
         */
        final List<IVariable<?>> newVars = extendSchema(bset);

        /*
         * Encode the binding set. Bindings will appear in the same order that
         * they were added to the schema.
         */
        // Ordered set of new IV -> Value associations. List entry is [null] if
        // no association for IV at that ordinal position in the solution.
        final List<BigdataValue> values = new ArrayList<BigdataValue>();
        // bound[i] is true iff the i-th variable of the schema is bound. This
        // is recorded from the same lookup which produces the IV so the bit
        // map written below always agrees with the IV[].
        final boolean[] bound = new boolean[schema.size()];
        // #of bindings with non-null IVs.
        int numBindings = 0;
        // #of new IVCache associations.
        int newCached = 0;
        boolean discoveredNamespace = false;
        {
            // For each variable in the [schema] order.
            int ordinal = 0;
            for (IVariable<?> v : schema) {
                final int index = ordinal++;
                // Lookup binding for that variable in the solution.
                @SuppressWarnings("unchecked")
                final IConstant<IV<?, ?>> c = bset.get(v);
                if (c == null)
                    continue;
                // Variable is bound in this solution.
                bound[index] = true;
                final IV<?, ?> iv = c.get();
                // Encode binding into buffer.
                IVUtility.encode(keyBuilder, iv);
                if (iv.hasValue() && (iv.isNullIV() || !cache.containsKey(iv))) {
                    // New IV => Value association (all NullIVs are "new").
                    final BigdataValue value = iv.getValue();
                    if (namespace == null) {
                        // Note: The namespace is discovered here!!!
                        namespace = value.getValueFactory().getNamespace();
                        valueSer = BigdataValueFactoryImpl.getInstance(
                                namespace).getValueSerializer();
                        discoveredNamespace = true;
                    }
                    if (!iv.isNullIV()) {
                        /*
                         * We can not lookup Null IVs in the cache on the
                         * decoder side since ties are broken by comparing the
                         * IVCache association, which is what we are trying to
                         * resolve. Therefore we always inline the IVCache
                         * association if TermId.isNull() is true.
                         */
                        cache.put(iv, value);
                    }
                    values.add(value);
                    newCached++;
                } else {
                    values.add(null/* no-value */);
                }
                numBindings++;
            }
        }

        out.packLong(numBindings);
        if (numBindings == 0) {
            // Nothing is bound: the rest of the record is omitted.
            return out.toByteArray();
        }
        out.packLong(newVars.size());
        out.packLong(newCached);
        if (discoveredNamespace) {
            out.writeUTF2(namespace);
        }
        if (trace) {
            log.trace("schemaSize=" + schema.size() + ", cacheSize="
                    + cache.size() + ", namespace=" + namespace);
            log.trace("newVars=" + newVars.size() + ", numBindings="
                    + numBindings + ", newCached=" + newCached);
        }

        // write newly declared variable names.
        for (IVariable<?> var : newVars) {
            out.writeUTF2(var.getName());
        }

        /*
         * Write out a bit map for the variables which are bound in this
         * solution (one bit per variable declared to date).
         * 
         * Note: This is more compact than using TermId.NULL to indicate an
         * unbound variable. TermId.NULL is 9 bytes. 9 bytes is enough for a bit
         * map for 9*8=72 variables, so this is a win if there is even one
         * unbound variable.
         */
        {
            final int nbytes = BytesUtil.bitFlagByteLength(schema.size());
            final int firstBit = appendZeroedBitmap(nbytes);
            if (trace)
                log.trace("varbitmap: beginBitOffset=" + firstBit + ", nbytes="
                        + nbytes);
            for (int i = 0; i < bound.length; i++) {
                if (bound[i]) {
                    BytesUtil.setBit(out.array(), firstBit + i, true);
                }
            }
        }

        /*
         * Write out bit map for IVs for which we have newly observed a cached
         * Value (one bit per binding, in the order of the IV[]).
         */
        if (newCached > 0) {
            final int nbytes = BytesUtil.bitFlagByteLength(numBindings);
            final int firstBit = appendZeroedBitmap(nbytes);
            if (trace)
                log.trace("cachebitmap: beginBitOffset=" + firstBit
                        + ", nbytes=" + nbytes);
            int bitIndex = firstBit;
            for (BigdataValue value : values) {
                if (value != null) {
                    BytesUtil.setBit(out.array(), bitIndex, true);
                }
                bitIndex++;
            }
        }

        // write IV[].
        if (trace)
            log.trace("IV[]: off=" + out.pos() + ", numBindings=" + numBindings
                    + ", byteLength=" + keyBuilder.len());
        out.append(keyBuilder.array(), 0/* off */, keyBuilder.len());

        /*
         * Write Value[]. Each Value is written directly into [out].
         */
        if (newCached > 0) {
            if (trace)
                log.trace("cache[]: off=" + out.pos() + ", newCached="
                        + newCached);
            for (BigdataValue value : values) {
                if (value != null) {
                    valueSer.serialize2(value, out, tmp);
                }
            }
        }
        if (trace)
            log.trace("done: off=" + out.pos());

        // Return formatted record.
        return out.toByteArray();

    }

    /**
     * Add every variable of the solution to the {@link #schema} and report
     * those which were not yet part of the schema, in the order visited.
     * <p>
     * Note: This visits <code>bset.iterator()</code> rather than
     * <code>bset.vars()</code> based on a hot spot in the profiler.
     * 
     * @param bset
     *            The solution.
     * 
     * @return The variables which were added to the schema by this call.
     */
    private List<IVariable<?>> extendSchema(final IBindingSet bset) {

        final List<IVariable<?>> newVars = new ArrayList<IVariable<?>>();

        @SuppressWarnings("rawtypes")
        final Iterator<Map.Entry<IVariable, IConstant>> itr = bset
                .iterator();

        while (itr.hasNext()) {
            final IVariable<?> v = itr.next().getKey();
            if (schema.add(v)) {
                newVars.add(v);
            }
        }

        return newVars;

    }

    /**
     * Append <code>nbytes</code> zero bytes to {@link #out} in order to reserve
     * room for a bit map.
     * 
     * @param nbytes
     *            The #of bytes to reserve.
     * 
     * @return The position at which the reserved region begins, expressed as a
     *         bit index into the backing array of {@link #out}.
     */
    private int appendZeroedBitmap(final int nbytes) {

        // current buffer position as bit index.
        final int firstBit = out.pos() << 3;

        // pre-extend the buffer, zeroing the bitmap.
        out.ensureFree(nbytes);
        for (int i = 0; i < nbytes; i++) {
            out.append((byte) 0);
        }

        return firstBit;

    }

    /**
     * {@inheritDoc}
     * <p>
     * This returns the encoder to its initial state: the schema, the
     * {@link IVCache} association cache, the buffers, the solution counter and
     * the discovered namespace are all discarded. The next record written
     * therefore starts a new stream: the version, the variable names and, once
     * an {@link IVCache} association is observed, the namespace are written
     * again.
     */
    @Override
    public void release() {

        cache.clear();

        schema.clear();

        out.clear();

        tmp.clear();

        nsolutions = 0;

        /*
         * The namespace must be forgotten together with the schema and the
         * solution counter. Otherwise it would never be written into the
         * restarted stream and the reader of that stream could not materialize
         * the cached Values.
         */
        namespace = null;

        valueSer = null;

    }

    @Override
    public void flush() {

        // NOP

    }

    /**
     * {@inheritDoc}
     * <p>
     * Always returns <code>true</code>.
     */
    @Override
    public boolean isValueCache() {

        return true;

    }

}