// ASTSimpleGroupByAndCountOptimizer.java example (blazegraph-master / database-master)
/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
/*
 * Created on Jan 9, 2015
 */

package com.bigdata.rdf.sparql.ast.optimizers;

import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.log4j.Logger;

import com.bigdata.bop.BOp;
import com.bigdata.bop.BOpUtility;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.aggregate.AggregateBase;
import com.bigdata.rdf.sparql.ast.AssignmentNode;
import com.bigdata.rdf.sparql.ast.DatasetNode;
import com.bigdata.rdf.sparql.ast.FunctionNode;
import com.bigdata.rdf.sparql.ast.FunctionRegistry;
import com.bigdata.rdf.sparql.ast.GraphPatternGroup;
import com.bigdata.rdf.sparql.ast.GroupByNode;
import com.bigdata.rdf.sparql.ast.IGroupMemberNode;
import com.bigdata.rdf.sparql.ast.IQueryNode;
import com.bigdata.rdf.sparql.ast.IValueExpressionNode;
import com.bigdata.rdf.sparql.ast.JoinGroupNode;
import com.bigdata.rdf.sparql.ast.NamedSubqueriesNode;
import com.bigdata.rdf.sparql.ast.NamedSubqueryRoot;
import com.bigdata.rdf.sparql.ast.ProjectionNode;
import com.bigdata.rdf.sparql.ast.QueryBase;
import com.bigdata.rdf.sparql.ast.QueryHints;
import com.bigdata.rdf.sparql.ast.QueryNodeWithBindingSet;
import com.bigdata.rdf.sparql.ast.QueryRoot;
import com.bigdata.rdf.sparql.ast.QueryType;
import com.bigdata.rdf.sparql.ast.StatementPatternNode;
import com.bigdata.rdf.sparql.ast.StaticAnalysis;
import com.bigdata.rdf.sparql.ast.SubqueryRoot;
import com.bigdata.rdf.sparql.ast.VarNode;
import com.bigdata.rdf.sparql.ast.eval.AST2BOpContext;

/**
 * Optimizes queries of the form
 * 
 * <pre>
 * SELECT (COUNT(*) as ?count) ?z WHERE {  ?x rdf:type ?z  } GROUP BY ?z
 * </pre>
 * 
 * and similar patterns using an O(N) algorithm, where N is the number of
 * distinct solutions.
 * <p>
 * The optimizer aims at establishing an execution plan that applies a
 * combination of distinct term scan pattern (to efficiently compute the
 * distinct values for the group variable) and fast range count pattern to
 * efficiently calculate the COUNT, without materialization of the variables on
 * which the COUNT operation is performed.
 * <p>
 * The basic idea is to
 * <ol>
 * <li>replace the GROUP BY pattern through a SELECT DISTINCT subquery to
 * calculate the distinct bindings for variable ?z first, and</li>
 * <li>use a fast range count operator to efficiently calculate the COUNT.</li>
 * </ol>
 * Note that the subquery in step (i) may (where possible) be optimized by the
 * {@link ASTDistinctTermScanOptimizer}, i.e. if possible the subquery producing
 * the ?z bindings will be replaced by a distinct term scan in a later
 * optimization step.
 * <p>
 * The rewrite is attempted for every named subquery and for the top-level
 * query. The AST is modified in place; the query node handed back by
 * {@link #optimize(AST2BOpContext, QueryNodeWithBindingSet)} is always the
 * same instance that was passed in.
 * 
 * @see <a href="http://trac.blazegraph.com/ticket/1059"> GROUP BY optimization
 *      using distinct-term-scan and fast-range-count</a>
 * 
 * @author <a href="mailto:ms@metaphacts.com">Michael Schmidt</a>
 */
public class ASTSimpleGroupByAndCountOptimizer implements IASTOptimizer {

   private static final Logger log = Logger
         .getLogger(ASTSimpleGroupByAndCountOptimizer.class);

   public ASTSimpleGroupByAndCountOptimizer() {
   }

   /**
    * Applies the GROUP BY + COUNT rewrite to all named subqueries and to the
    * top-level query, where the preconditions checked by
    * {@link #doSelectQuery(AST2BOpContext, StaticAnalysis, QueryRoot, QueryBase)}
    * hold.
    * 
    * @param context
    *           The evaluation context; used to inspect the triple store
    *           configuration (delete markers, quads mode, history mode).
    * @param input
    *           The query node together with its binding sets.
    * 
    * @return A new wrapper around the (possibly modified in place) query node
    *         and the unchanged binding sets.
    */
   @Override
   public QueryNodeWithBindingSet optimize(
         final AST2BOpContext context, final QueryNodeWithBindingSet input) {

      final IQueryNode queryNode = input.getQueryNode();
      final IBindingSet[] bindingSets = input.getBindingSets();

      // Only a top-level query can be processed here; anything else is
      // handed back untouched rather than failing on the cast below.
      if (!(queryNode instanceof QueryRoot)) {
         return new QueryNodeWithBindingSet(queryNode, bindingSets);
      }

      if (context.getAbstractTripleStore().getSPORelation()
            .indicesHaveDeleteMarkers()) {
         /*
          * Disallow for optimization when using delete markers.
          * 
          * The presence of deleteMarkers means that the fast-range count will
          * be turned into a key-range scan, which is not desired.
          * 
          * While AccessPath.rangeCountExact(true) method will do the right
          * thing even if the index has delete markers (it will convert to a
          * scan for either delete markers or if there is a FILTER attached
          * to the index), converting to a scan defeats the purpose of the
          * ASTFastRangeCountOptimizer. In this case, the cost would have
          * been the same if we had not rewritten the AST. Hence we do not
          * rewrite the query.
          */
         return new QueryNodeWithBindingSet(queryNode, bindingSets);
      }

      final QueryRoot queryRoot = (QueryRoot) queryNode;

      final StaticAnalysis sa = new StaticAnalysis(queryRoot, context);

      if (context.getAbstractTripleStore().isQuads()) {
         // In quads mode the rewrite is only attempted when the dataset is
         // "all graphs", i.e. no named graphs have been declared.
         final DatasetNode dataset = queryRoot.getDataset();
         if (dataset != null && dataset.getNamedGraphs() != null) {
            return new QueryNodeWithBindingSet(queryNode, bindingSets);
         }
      }

      // First, process any pre-existing named subqueries.
      final NamedSubqueriesNode namedSubqueries = queryRoot
            .getNamedSubqueries();

      if (namedSubqueries != null) {

         // Note: works around concurrent modification error.
         final List<NamedSubqueryRoot> list = BOpUtility.toList(
               namedSubqueries, NamedSubqueryRoot.class);

         for (NamedSubqueryRoot namedSubquery : list) {
            // Rewrite the named sub-select
            doSelectQuery(context, sa, queryRoot, namedSubquery);
         }
      }

      // Then rewrite the top-level select.
      doSelectQuery(context, sa, queryRoot, queryRoot);

      return new QueryNodeWithBindingSet(queryNode, bindingSets);
   }

   /**
    * Attempt to rewrite the SELECT. The query is modified in place if, and
    * only if, all preconditions listed in the method body are satisfied;
    * otherwise the method returns without touching the AST.
    * 
    * @param context
    *           The evaluation context (consulted for history mode).
    * @param sa
    *           Static analysis helper (currently not consulted).
    * @param queryRoot
    *           The top-level of the query (currently not consulted).
    * @param queryBase
    *           Either a top-level query or a sub-query.
    */
   private void doSelectQuery(final AST2BOpContext context,
         final StaticAnalysis sa, final QueryRoot queryRoot,
         final QueryBase queryBase) {

      /*
       * The prerequisites for the optimizer, which we check in the following,
       * are as follows:
       * 
       * (C0) Query must not have a HAVING clause
       * 
       * (C1) Query must be a SELECT query
       * 
       * (C2) Single triple pattern in body
       * 
       * (C3) Single GROUP BY variable that is part of the triple pattern
       * 
       * (C4) Projection for the GROUP BY variable plus a COUNT operation, which
       *      operates over star or any other variable in the triple pattern,
       *      plus the COUNT must not be DISTINCT
       */

      /*
       * Check for condition (C0)
       * 
       * https://jira.blazegraph.com/browse/BLZG-618, failure of
       * http://www.w3.org/2009/sparql/docs/tests/data-sparql11/aggregates/manifest#agg06
       * mentioned in one of the comments: we are not allowed to do the fast
       * range count optimization if the query exhibits an additional having
       * clause (which might apply any [possibly other] kind of aggregation on
       * the query body). The check lives here so that it covers named
       * subqueries as well as the top-level query.
       */
      if (queryBase.getHaving() != null) {
         return;
      }

      // Check for condition (C3), part 1: exactly one GROUP BY entry, which
      // must be an assignment between two plain variables.
      final GroupByNode groupByNode = queryBase.getGroupBy();
      if (groupByNode == null || groupByNode.arity() != 1
            || !(groupByNode.get(0) instanceof AssignmentNode)) {
         return;
      }

      final AssignmentNode groupByAssignment =
         (AssignmentNode) groupByNode.get(0);
      if (groupByAssignment.arity() != 2
            || !(groupByAssignment.get(0) instanceof VarNode)
            || !(groupByAssignment.get(1) instanceof VarNode)) {
         return; // something's wrong here
      }

      // the variable which is grouped
      final VarNode groupingVar = (VarNode) groupByAssignment.get(1);

      // the variable the group key is assigned to
      final VarNode groupingVarRenamed = (VarNode) groupByAssignment.get(0);

      /*
       * The rewrite below only ever binds groupingVar (via the SELECT
       * DISTINCT subquery) and drops the GROUP BY assignment. If the group
       * key is assigned to a different variable, nothing in the rewritten
       * query would bind that variable any more, so we do not rewrite.
       */
      if (!groupingVar.equals(groupingVarRenamed)) {
         return;
      }

      // Check for condition (C1)
      final QueryType queryType = queryBase.getQueryType();
      if (!QueryType.SELECT.equals(queryType)) {
         return; // optimization not applicable
      }

      // Check for condition (C2)
      final GraphPatternGroup<IGroupMemberNode> graphPattern =
         queryBase.getGraphPattern(); // surrounding graph pattern
      if (graphPattern == null || graphPattern.args().size() != 1) {
         return;
      }

      final BOp potentialStmtPattern = graphPattern.get(0);
      if (!(potentialStmtPattern instanceof StatementPatternNode)) {
         return;
      }

      // the inner statement pattern
      final StatementPatternNode stmtPattern =
         (StatementPatternNode) potentialStmtPattern;

      /*
       * When in history mode, we can't do this optimization because neither
       * a distinct term scan nor a fast range count is possible, except if
       * the StatementPatternNode has been marked to read history.
       */
      if (context.getAbstractTripleStore().isRDRHistory()) {

         if (!stmtPattern.getQueryHintAsBoolean(QueryHints.HISTORY, false)) {
            if (log.isDebugEnabled()) {
               log.debug("nope");
            }
            // Can not rewrite.
            return;
         }

      }
      if (log.isDebugEnabled()) {
         log.debug("yep");
      }

      // Collect the variables of the statement pattern; the variable at
      // argument index 3 (if any) is remembered as the graph variable.
      final Set<VarNode> varNodesInStmtPattern = new HashSet<VarNode>();
      VarNode graphVarNode = null;
      for (int i = 0; i < stmtPattern.arity(); i++) {
         final BOp arg = stmtPattern.get(i);
         if (arg instanceof VarNode) {
            varNodesInStmtPattern.add((VarNode) arg);
            if (i == 3) {
               graphVarNode = (VarNode) arg;
            }
         }
      }

      // Check for condition (C3), part 2
      if (!varNodesInStmtPattern.contains(groupingVar)) {
         return; // illegal grouping, optimization not applicable
      }

      // Check for condition (C4)
      final ProjectionNode projectionNode = queryBase.getProjection();
      if (projectionNode == null || projectionNode.size() != 2) {
         return;
      }

      int indexOfCountNode = -1; // index of COUNT node in projectionNode
      boolean groupingVarProjected = false; // plain group var seen?
      VarNode countNodeVar = null; // VarNode bound through COUNT operation

      for (int i = 0; i < 2; i++) {
         final AssignmentNode curNode = projectionNode.getExpr(i);

         final IValueExpressionNode valNode =
            curNode.getValueExpressionNode();

         if (valNode instanceof FunctionNode && indexOfCountNode < 0) {

            final FunctionNode fNode = (FunctionNode) valNode;

            if (!FunctionRegistry.COUNT.equals(fNode.getFunctionURI())) {
               return; // NOT COUNT
            }

            // check for problematic COUNT(DISTINCT ...)) pattern; the null
            // guard is defensive in case no scalar values were attached
            final Map<String, Object> scalarVals = fNode.getScalarValues();
            if (scalarVals != null && Boolean.TRUE.equals(
                  scalarVals.get(AggregateBase.Annotations.DISTINCT))) {
               return; // COUNT (DISTINCT ...) cannot be optimized
            }

            if (fNode.args().size() != 1) {
               return;
            }

            final BOp inner = fNode.args().get(0);
            if (!(inner instanceof VarNode)) {
               return;
            }
            final VarNode innerVarNode = (VarNode) inner;

            // the count operation must be performed on a variable of the
            // statement pattern that is distinct from the named graph
            // variable, if any (as the latter is not necessarily bound);
            // alternatively, a COUNT(*) is also ok
            final boolean countsPatternVar =
               varNodesInStmtPattern.contains(innerVarNode)
                  && !innerVarNode.equals(graphVarNode);

            if (!countsPatternVar && !innerVarNode.isWildcard()) {
               return; // optimization in overall not applicable
            }

            if (!(curNode.get(0) instanceof VarNode)) {
               return; // first exp of assignment node is assigned var
            }

            countNodeVar = (VarNode) curNode.get(0);
            indexOfCountNode = i;

         } else if (valNode instanceof VarNode && !groupingVarProjected) {

            if (!((VarNode) valNode).equals(groupingVarRenamed)) {
               return; // optimization not applicable
            }
            groupingVarProjected = true;

         } else {
            return; // invalid
         }
      }

      /*
       * Once we reach this point, we're sure that the optimization is
       * applicable (both loop iterations succeeded, so the COUNT entry and
       * the group variable entry have each been seen exactly once); we now
       * rewrite the query plan accordingly. The following needs to be done:
       * 
       * (O1) Wrap the existing statement pattern into a SELECT DISTINCT
       * subquery, which could (potentially) be optimized to a distinct term
       * scan by the ASTDistinctTermScanOptimizer
       * 
       * (O2) Duplicate the statement pattern and append a fast range count
       * annotation, binding the count var of the SELECT clause
       * 
       * (O3) Transform the SELECT clause: the COUNT expression is replaced by
       * a projection for the variable introduced in O2
       * 
       * (O4) Eliminate the group by clause
       */

      // apply optimization step (O1)
      final ProjectionNode distinctProjection = new ProjectionNode();
      distinctProjection.addArg(new AssignmentNode(groupingVar, groupingVar));
      distinctProjection.setDistinct(true);

      final JoinGroupNode distinctWhere = new JoinGroupNode();
      distinctWhere.addArg(stmtPattern);

      final SubqueryRoot selectDistinct = new SubqueryRoot(QueryType.SELECT);
      selectDistinct.setProjection(distinctProjection);
      selectDistinct.setWhereClause(distinctWhere);
      graphPattern.setArg(0, selectDistinct);

      // apply optimization step (O2)
      final StatementPatternNode stmtPatternClone =
         new StatementPatternNode(stmtPattern);
      stmtPatternClone.setFastRangeCount(countNodeVar);
      graphPattern.addArg(stmtPatternClone);

      // apply optimization step (O3)
      projectionNode.setArg(indexOfCountNode, new AssignmentNode(
            new VarNode(countNodeVar), new VarNode(countNodeVar)));

      // apply optimization step (O4)
      queryBase.setGroupBy(null);
   }
}