/** Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. Contact: SYSTAP, LLC DBA Blazegraph 2501 Calvert ST NW #106 Washington, DC 20008 licenses@blazegraph.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* * Created on Aug 28, 2012 */ package com.bigdata.rdf.sparql.ast.eval; import info.aduna.iteration.CloseableIteration; import java.util.LinkedHashSet; import java.util.Map; import java.util.Set; import org.apache.log4j.Logger; import org.openrdf.query.BindingSet; import org.openrdf.query.QueryEvaluationException; import org.openrdf.query.algebra.evaluation.iterator.CollectionIteration; import com.bigdata.bop.IBindingSet; import com.bigdata.bop.bindingSet.ListBindingSet; import com.bigdata.rdf.internal.IV; import com.bigdata.rdf.model.BigdataBNode; import com.bigdata.rdf.model.BigdataStatement; import com.bigdata.rdf.model.BigdataValue; import com.bigdata.rdf.sparql.ast.ASTContainer; import com.bigdata.rdf.sparql.ast.AssignmentNode; import com.bigdata.rdf.sparql.ast.ConstantNode; import com.bigdata.rdf.sparql.ast.DescribeModeEnum; import com.bigdata.rdf.sparql.ast.ProjectionNode; import com.bigdata.rdf.sparql.ast.QueryRoot; import com.bigdata.rdf.sparql.ast.QueryType; import com.bigdata.rdf.sparql.ast.VarNode; import com.bigdata.rdf.sparql.ast.optimizers.ASTDescribeOptimizer; import com.bigdata.rdf.store.AbstractTripleStore; /** * Utility class for computing the Concise 
Bounded Description.
 *
 * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/578"> Concise
 *      Bounded Description </a>
 * @see <a href="http://www.w3.org/Submission/CBD/"> CBD - Concise Bounded
 *      Description </a>
 * @see ASTDescribeOptimizer
 * @see ASTConstructIterator
 *
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 *
 *         FIXME Watch a timeout on the top-level query (if present)
 */
public class CBD {

    private static final Logger log = Logger.getLogger(CBD.class);

    /** The {@link AbstractTripleStore} against which the DESCRIBE queries run. */
    private final AbstractTripleStore store;

    /**
     * The {@link DescribeModeEnum} specifying how to evaluate the top-level
     * DESCRIBE query.
     */
    private final DescribeModeEnum describeMode;

    /**
     * The limit on the #of iterations (iff the statement limit is also
     * reached) -or- ZERO (0) for no limit.
     */
    private final int describeIterationLimit;

    /**
     * The limit on the #of statements (iff the iteration limit is also
     * reached) -or- ZERO (0) for no limit.
     */
    private final int describeStatementLimit;

    /**
     * The {@link DescribeModeEnum} specifying how to evaluate each expansion
     * round of the DESCRIBE query. Derived from {@link #describeMode} in the
     * constructor.
     */
    private final DescribeModeEnum describeExpansionMode;

    /**
     * A mapping that is used to preserve a consistent assignment from blank
     * node IDs to {@link BigdataBNode}s scoped to the subgraph reported by the
     * top-level DESCRIBE query.
     */
    private final Map<String, BigdataBNode> bnodes;

    /**
     * @param store
     *            The {@link AbstractTripleStore}.
     * @param describeMode
     *            The {@link DescribeModeEnum} specifying how to evaluate the
     *            DESCRIBE query.
     * @param describeIterationLimit
     *            The limit on the #of iterations (iff the statement limit is
     *            also reached) -or- ZERO (0) for no limit.
     * @param describeStatementLimit
     *            The limit on the #of statements (iff the iteration limit is
     *            also reached) -or- ZERO (0) for no limit.
* @param bnodes * A mapping that is used to preserve a consistent assignment * from blank node IDs to {@link BigdataBNode}s scoped to the * subgraph reported by the top-level DESCRIBE query. */ public CBD(final AbstractTripleStore store, final DescribeModeEnum describeMode, final int describeIterationLimit, final int describeStatementLimit, final Map<String, BigdataBNode> bnodes) { if (store == null) throw new IllegalArgumentException(); if (describeMode == null) throw new IllegalArgumentException(); if (describeIterationLimit < 0) throw new IllegalArgumentException(); if (describeStatementLimit < 0) throw new IllegalArgumentException(); if (bnodes == null) throw new IllegalArgumentException(); this.store = store; this.describeMode = describeMode; this.describeIterationLimit = describeIterationLimit; this.describeStatementLimit = describeStatementLimit; this.bnodes = bnodes; switch(describeMode) { case CBD: // case CBDNR: // Expansion only explores the forward links. describeExpansionMode = DescribeModeEnum.ForwardOneStep; break; case SCBD: // case SCBDNR: // Expansion explores both forward and reverse links. describeExpansionMode = DescribeModeEnum.SymmetricOneStep; break; case ForwardOneStep: case SymmetricOneStep: // There are no expansion steps for these modes. throw new UnsupportedOperationException(); default: // Unknown describe mode. throw new AssertionError(); } } /** * The description of the original resource(s) is expanded for each blank * node encountered in the constructed statements until no new blank nodes * are encountered. The resulting set of statements is then reported as the * description for the resources identified either as constants in the * projection of the original query or as variables in the projection of the * original query that became bound in the WHERE clause of that original * query. * * @param src * An iterator from which we can drain the solutions to the * top-level DESCRIBE query. This is the input into the CBD * expansion. 
     *
     * @return An iterator from which the concise bounded description may be
     *         drained.
     *
     * @throws QueryEvaluationException
     */
    CloseableIteration<BigdataStatement, QueryEvaluationException> computeClosure(
            CloseableIteration<BigdataStatement, QueryEvaluationException> src)
            throws QueryEvaluationException {

        // Round ZERO (0) is the top-level describe.
        int nrounds = 0;

        // The blank nodes identified in the previous round.
        final Set<IV<?, ?>> bnodes_tm1 = new LinkedHashSet<IV<?, ?>>();

        // The statements identified so far.
        final Set<BigdataStatement> stmts = new LinkedHashSet<BigdataStatement>();

        while (true) {

            // CBD expansion begins at round ONE (1).
            nrounds++;

            // #of statements on entry to this round.
            final int nstmts = stmts.size();

            if (cutoffQuery(nrounds - 1, nstmts)) {
                // Both limits exceeded: abort rather than continue expanding.
                src.close();
                throw new QueryEvaluationException("CBD cutoff: nrounds="
                        + nrounds + ", nstatements=" + nstmts + ".");
            }

            /*
             * Build a collection of all distinct statements and all distinct
             * blank node IVs encountered in the source statements. Any of the
             * (s,o,c) positions can be blank nodes.
             */

            // The blank nodes IVs identified in this round that were NOT known
            // in the previous round(s). Note: consumeStatements() closes [src].
            final Set<IV<?, ?>> newBnodes = consumeStatements(src, stmts,
                    bnodes_tm1);

            if (newBnodes.isEmpty()) {
                // All done: fixed point reached, no new blank nodes to expand.
                break;
            }

            /*
             * We will have to do another CBD round since there is at least one
             * new blank node IV that needs to be described.
             */

            if (log.isInfoEnabled()) {
                log.info("#rounds=" + nrounds + ", describeMode="
                        + describeMode + ", #stmts(in)=" + stmts.size()
                        + ", #bnodes(in)=" + bnodes_tm1.size()
                        + ", #bnodes(new)=" + newBnodes.size() + " : "
                        + newBnodes);
                // Conditional logging.
                logState(stmts, bnodes_tm1, newBnodes);
            }

            // Evaluate one expansion round; yields a fresh source iterator.
            src = doRound(newBnodes);

            // All of these blank nodes have been resolved.
            bnodes_tm1.addAll(newBnodes);

        }

        // Done.
        if (log.isInfoEnabled()) {
            log.info("#rounds=" + nrounds + " (done), describeMode="
                    + describeMode + ", #stmts(in)=" + stmts.size()
                    + ", #bnodes(in)=" + bnodes_tm1.size());
            // Conditional logging.
            logState(stmts, bnodes_tm1, null/* newBNodes */);
        }

        /*
         * Stream out the fixed point collection of statements that are the
         * Concise Bounded Description of the resources identified in/by the
         * top-level DESCRIBE query.
         */

        return new CollectionIteration<BigdataStatement, QueryEvaluationException>(
                stmts);

    }

    /**
     * Return <code>true</code> iff the DESCRIBE query should be cutoff because
     * the limits have been exceeded. Note that BOTH limits must be reached
     * before evaluation is cutoff (a limit of ZERO means "no limit" for that
     * dimension).
     *
     * @param nrounds
     *            The #of evaluation rounds that have already been computed and
     *            ZERO (0) if this is the first round.
     * @param nstmts
     *            The #of statements at the start of this round.
     *
     * @return <code>true</code> iff evaluation should be cutoff.
     */
    private boolean cutoffQuery(int nrounds, int nstmts) {

        // ZERO implies MAX_INT
        final int describeIterationLimit = this.describeIterationLimit == 0 ? Integer.MAX_VALUE
                : this.describeIterationLimit;

        final int describeStatementLimit = this.describeStatementLimit == 0 ? Integer.MAX_VALUE
                : this.describeStatementLimit;

        final boolean cutoffRounds = nrounds >= describeIterationLimit;

        final boolean cutoffStatements = nstmts >= describeStatementLimit;

        // Cutoff iff BOTH limits have been reached.
        return cutoffRounds && cutoffStatements;

    }

    /**
     * Log the statements and bnode {@link IV}s @ DEBUG.
     *
     * @param stmts
     *            The statements.
     * @param bnodes_tm1
     *            The bnode {@link IV}s from the last round (initially empty).
     * @param newBnodes
     *            The bnode {@link IV}s (optional and <code>null</code> if we
     *            are done).
     */
    private void logState(final Set<BigdataStatement> stmts,
            final Set<IV<?, ?>> bnodes_tm1, final Set<IV<?, ?>> newBnodes) {

        if (!log.isDebugEnabled())
            return;

        // Buffer is reused for each section; pre-sized ~100 chars/statement.
        final StringBuilder sb = new StringBuilder(stmts.size() * 100);

        {
            sb.append("Statements: (" + stmts.size() + ")\n");
            for (BigdataStatement st : stmts) {
                sb.append(st.toString());
                sb.append("\n");
            }
            log.debug(sb.toString());
        }

        {
            sb.setLength(0);// truncate.
            sb.append("BNodes(t-1): (" + bnodes_tm1.size() + ")\n");
            for (IV<?, ?> iv : bnodes_tm1) {
                sb.append(iv.toString());
                sb.append("\n");
            }
            log.debug(sb.toString());
        }

        if (newBnodes != null) {
            sb.setLength(0);// truncate.
            sb.append("BNodes(new): (" + newBnodes.size() + ")\n");
            for (IV<?, ?> iv : newBnodes) {
                sb.append(iv.toString());
                sb.append("\n");
            }
            log.debug(sb.toString());
        }

    }

    /**
     * Consume statements from the source iterator, adding new statements into a
     * collection and adding new blank node {@link IV}s into another collection.
     * The source iterator is closed before this method returns.
     *
     * @param src
     *            The statements to be consumed.
     * @param stmts
     *            The set of statements in the description of the resources from
     *            the previous round(s) (if any). New statements are added to
     *            this set as a side effect.
     * @param bnodes_tm1
     *            The blank node {@link IV}s already known on entry to the
     *            current round. This is empty on entry to the first expansion
     *            round.
     *
     * @return The set of blank node {@link IV}s not previously encountered in
     *         the CBD expansion.
     *
     * @throws QueryEvaluationException
     */
    private static Set<IV<?, ?>> consumeStatements(
            final CloseableIteration<BigdataStatement, QueryEvaluationException> src,
            final Set<BigdataStatement> stmts, final Set<IV<?, ?>> bnodes_tm1)
            throws QueryEvaluationException {

        final Set<IV<?, ?>> newBnodes = new LinkedHashSet<IV<?, ?>>();

        try {

            while (src.hasNext()) {

                final BigdataStatement stmt = src.next();

//                /*
//                 * A statement of the form
//                 *
//                 * ?stmtN rdf:subject <term>
//                 *
//                 * where <term> is a blank node.
//                 */
//                final boolean foo = stmt.getPredicate().equals(RDF.SUBJECT)
//                        && bnodes_tm1.contains(stmt.getObject());

                if (stmts.add(stmt)) {

                    /*
                     * New blank node IVs can only be encountered for new
                     * statements.
                     *
                     * TODO Consider using an ISPO => BigdataStatement map for
                     * the statements so we can avoid duplicate entries for
                     * BigdataStatements having different blank nodes but the
                     * same IVs for those blank nodes.
                     */

                    // Any of the (s,o,c) positions may be a blank node.
                    collectBNodeIVs(bnodes_tm1, newBnodes,
                            getBNodeIV(stmt.getSubject()));

                    collectBNodeIVs(bnodes_tm1, newBnodes,
                            getBNodeIV(stmt.getObject()));

                    collectBNodeIVs(bnodes_tm1, newBnodes,
                            getBNodeIV(stmt.getContext()));

                }

            }

            return newBnodes;

        } finally {

            // Always release the source iterator, even on error.
            src.close();

        }

    }

    /**
     * Create a new DESCRIBE query to describe each new blank node identifier in
     * the previous round. We need to tunnel the evaluation of the DESCRIBE
     * query in order to: (a) ensure that the blank node {@link IV}s are
     * attached to the blank nodes in the DESCRIBE clause; and (b) avoid the
     * describe cache materialization logic since rounds GT ZERO (0) are not
     * top-level DESCRIBE queries and do not describe top-level resources.
     *
     * @param bnodeIVs
     *            The blank nodes that need to be described.
     *
     * @return An iterator from which the description of those blank nodes may
     *         be read.
     *
     * @throws QueryEvaluationException
     */
    private CloseableIteration<BigdataStatement, QueryEvaluationException> doRound(
            final Set<IV<?, ?>> bnodeIVs) throws QueryEvaluationException {

        /*
         * Create the DESCRIBE query for the blank node IVs.
         */
        final ASTContainer astContainer = getDescribeQuery(bnodeIVs);

        final AST2BOpContext context = new AST2BOpContext(astContainer, store);

        // Clear the optimized AST.
        astContainer.clearOptimizedAST();

        // Batch resolve Values to IVs and convert to bigdata binding set.
        final IBindingSet[] bindingSets = new IBindingSet[] { new ListBindingSet() };

        // Convert the query (generates an optimized AST as a side-effect).
        AST2BOpUtility.convert(context, bindingSets);

        // The optimized AST.
        final QueryRoot optimizedQuery = astContainer.getOptimizedAST();

        if (log.isDebugEnabled()) {
            log.debug("describeMode=" + describeMode + ", expansionMode="
                    + describeExpansionMode);
            log.debug("OriginalAST: " + astContainer.getOriginalAST());
            log.debug("OptimizedAST: " + optimizedQuery);
        }

        // Materialization in the query is only possible without a SLICE.
        final boolean materializeProjectionInQuery = context.materializeProjectionInQuery
                && !optimizedQuery.hasSlice();

        // Solutions to the WHERE clause (as projected).
        final CloseableIteration<BindingSet, QueryEvaluationException> solutions = ASTEvalHelper
                .evaluateQuery(astContainer, context,
                        materializeProjectionInQuery//
                        , optimizedQuery.getProjection().getProjectionVars()//
                );

        // Constructed Statements.
        final CloseableIteration<BigdataStatement, QueryEvaluationException> src = new ASTConstructIterator(
                context, store, //
                optimizedQuery.getConstruct(), //
                optimizedQuery.getWhereClause(),//
                bnodes,//
                solutions//
        );

        return src;

    }

    /**
     * Generate a DESCRIBE query for one of the expansion rounds.
     *
     * @param bnodeIVs
     *            The blank nodes that need to be described.
     *
     * @return The {@link ASTContainer} wrapping that DESCRIBE query.
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    private ASTContainer getDescribeQuery(final Set<IV<?, ?>> bnodeIVs) {

        /*
         * Ensure that the bnode IVs are resolved to the corresponding
         * BigdataBlankNode objects and that the valueCache relation is set on
         * those bnode IVs.
         */
        final Map<IV<?, ?>, BigdataValue> terms = store.getLexiconRelation()
                .getTerms(bnodeIVs);

        for (Map.Entry<IV<?, ?>, BigdataValue> e : terms.entrySet()) {

            // Attach the materialized value to the IV (the valueCache).
            ((IV) e.getKey()).setValue(e.getValue());

        }

        final QueryRoot queryRoot = new QueryRoot(QueryType.DESCRIBE);

        {

            final ProjectionNode projection = new ProjectionNode();

            queryRoot.setProjection(projection);

            /*
             * Specify the describe mode appropriate for the expansion given the
             * top-level describe algorithm that we are evaluating.
             */
            projection.setDescribeMode(describeExpansionMode);

            int i = 1;

            for (IV<?, ?> iv : bnodeIVs) {

                if (!iv.hasValue())
                    throw new AssertionError("valueCache not set : " + iv);

                // Bind each blank node IV on a distinct anonymous variable.
                final VarNode anonvar = new VarNode("-cbd-bnode-" + i++);

                anonvar.setAnonymous(true);

                projection.addProjectionExpression(new AssignmentNode(anonvar,
                        new ConstantNode(iv)));

            }

        }

        return new ASTContainer(queryRoot);

    }

    /**
     * Collect blank nodes {@link IV}s not already declared in a previous round.
     *
     * @param bnodes_tm1
     *            The blank node {@link IV}s already declared in the previous
     *            round(s).
     * @param newBnodes
     *            The set of blank node {@link IV}s that were discovered in this
     *            round. Updated as a side effect.
     * @param bNodeIV
     *            A blank node {@link IV} (may be <code>null</code>).
     */
    private static void collectBNodeIVs(final Set<IV<?, ?>> bnodes_tm1,
            final Set<IV<?, ?>> newBnodes, final IV<?, ?> bNodeIV) {

        if (bNodeIV == null) {

            /* The corresponding position in the statement was not a blank node. */
            return;

        }

        if (bnodes_tm1.contains(bNodeIV)) {

            /* This blank node was already declared in the previous round. */
            return;

        }

        newBnodes.add(bNodeIV);

    }

    /**
     * If the value is a blank node, then return the IV for that blank node and
     * otherwise return <code>null</code>.
     *
     * @param v
     *            The value (may be <code>null</code>).
     *
     * @return The {@link IV} for that blank node and <code>null</code> iff the
     *         value is not a blank node.
     */
    static private IV<?, ?> getBNodeIV(final BigdataValue v) {

        if (v == null) {

            /*
             * Note: This case is allowed for the context position of the
             * statement.
             */
            return null;

        }

        final BigdataBNode bnode = (BigdataBNode) ((v instanceof BigdataBNode) ? v
                : null);

        if (bnode == null) {

            // Not a blank node.
            return null;

        }

        final IV<?, ?> iv = bnode.getIV();

        /*
         * No. What we really need to do is ensure that the blank node variables
         * (a) have their valueCache set when we create the DESCRIBE query for
         * the expansion rounds; and (b) are projected out of the CONSTRUCT
         * queries that are evaluated in the expansion rounds. That will ensure
         * that the blank nodes are materialized.
         */
//        /*
//         * We need to set the valueCache relation on the IV. The
//         * ASTConstructIterator depends on this in makeStatement().
//         */
//
//        ((IV) iv).setValue(bnode);

        return iv;

    }

}