/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
 * Created on Sep 14, 2011
 */

package com.bigdata.rdf.sparql.ast.optimizers;

import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

import org.apache.log4j.Logger;

import com.bigdata.bop.BOp;
import com.bigdata.bop.BOpUtility;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IConstant;
import com.bigdata.bop.IPredicate;
import com.bigdata.bop.IVariable;
import com.bigdata.bop.IVariableOrConstant;
import com.bigdata.bop.Var;
import com.bigdata.rdf.sparql.ast.AssignmentNode;
import com.bigdata.rdf.sparql.ast.DatasetNode;
import com.bigdata.rdf.sparql.ast.GraphPatternGroup;
import com.bigdata.rdf.sparql.ast.IGroupMemberNode;
import com.bigdata.rdf.sparql.ast.IQueryNode;
import com.bigdata.rdf.sparql.ast.NamedSubqueriesNode;
import com.bigdata.rdf.sparql.ast.NamedSubqueryRoot;
import com.bigdata.rdf.sparql.ast.ProjectionNode;
import com.bigdata.rdf.sparql.ast.QueryBase;
import com.bigdata.rdf.sparql.ast.QueryHints;
import com.bigdata.rdf.sparql.ast.QueryNodeWithBindingSet;
import com.bigdata.rdf.sparql.ast.QueryRoot;
import com.bigdata.rdf.sparql.ast.QueryType;
import com.bigdata.rdf.sparql.ast.StatementPatternNode;
import com.bigdata.rdf.sparql.ast.StaticAnalysis;
import com.bigdata.rdf.sparql.ast.SubqueryBase;
import com.bigdata.rdf.sparql.ast.SubqueryRoot;
import com.bigdata.rdf.sparql.ast.VarNode;
import com.bigdata.rdf.sparql.ast.eval.AST2BOpBase;
import com.bigdata.rdf.sparql.ast.eval.AST2BOpContext;
import com.bigdata.rdf.sparql.ast.service.ServiceNode;
import com.bigdata.rdf.spo.ISPO;
import com.bigdata.rdf.spo.SPOKeyOrder;
import com.bigdata.striterator.IKeyOrder;

/**
 * Optimizes
 * <code>SELECT (DISTINCT|REDUCED) ?property WHERE { ?x ?property ?y . }</code>
 * and similar patterns using an O(N) algorithm, where N is the number of
 * distinct solutions.
 * <p>
 * The main advantage here is to turn an access path that is fully unbound into
 * a distinct-term scan. If the access path would be evaluated as-bound with at
 * least one variable bound, then the distinct term scan might not have any
 * advantage over the pipeline join (and the semantics of DISTINCT would be
 * violated by multiple as-bound evaluations of the distinct-term-scan without
 * a hash index to impose the DISTINCT constraint).
 * 
 * TODO We are doing something very similar for <code>GRAPH ?g {}</code>. It
 * would be worthwhile to look at that code in the light of this optimizer.
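 * <p>
 * For illustration (a sketch of the intended effect, not literal optimizer
 * output), a query such as
 * 
 * <pre>
 * SELECT DISTINCT ?property WHERE { ?x ?property ?y . }
 * </pre>
 * 
 * is rewritten so that the lone statement pattern carries a
 * distinct-term-scan annotation for <code>?property</code> and the DISTINCT
 * modifier is dropped from the projection: the scan reads an index whose key
 * order leads with the predicate position (e.g. POS) and advances to the next
 * distinct <code>?property</code> value after each match, so it cannot
 * produce duplicates.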
 * 
 * @see <a href="http://trac.blazegraph.com/ticket/1035" > DISTINCT PREDICATEs
 *      query is slow </a>
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 */
public class ASTDistinctTermScanOptimizer implements IASTOptimizer {

    /**
     * 
     */
    public ASTDistinctTermScanOptimizer() {
    }

    @Override
    public QueryNodeWithBindingSet optimize(
            final AST2BOpContext context, final QueryNodeWithBindingSet input) {

        final IQueryNode queryNode = input.getQueryNode();
        final IBindingSet[] bindingSets = input.getBindingSets();

        final QueryRoot queryRoot = (QueryRoot) queryNode;

        final StaticAnalysis sa = new StaticAnalysis(queryRoot, context);

        final DatasetNode dataset = queryRoot.getDataset();

        if (context.getAbstractTripleStore().isQuads()) {

            boolean ok = false;

            if (dataset == null || dataset.getNamedGraphs() == null) {

                /*
                 * The dataset is all graphs.
                 */
                ok = true;

            }

            if (!ok) {

                return new QueryNodeWithBindingSet(queryNode, bindingSets);

            }

        }

        // First, process any pre-existing named subqueries.
        {

            final NamedSubqueriesNode namedSubqueries = queryRoot
                    .getNamedSubqueries();

            if (namedSubqueries != null) {

                // Note: works around concurrent modification error.
                final List<NamedSubqueryRoot> list = BOpUtility.toList(
                        namedSubqueries, NamedSubqueryRoot.class);

                for (NamedSubqueryRoot namedSubquery : list) {

                    // Rewrite the named sub-select
                    doSelectQuery(context, sa, (QueryRoot) queryNode,
                            namedSubquery);

                }

            }

        }

        // rewrite the top-level select
        doSelectQuery(context, sa, (QueryRoot) queryNode, (QueryBase) queryNode);

        return new QueryNodeWithBindingSet(queryNode, bindingSets);

    }

    private void doRecursiveRewrite(final AST2BOpContext context,
            final StaticAnalysis sa, final QueryRoot queryRoot,
            final GraphPatternGroup<IGroupMemberNode> group) {

        final int arity = group.arity();

        for (int i = 0; i < arity; i++) {

            final BOp child = (BOp) group.get(i);

            if (child instanceof GraphPatternGroup<?>) {

                // Recursion into groups.
                doRecursiveRewrite(context, sa, queryRoot,
                        ((GraphPatternGroup<IGroupMemberNode>) child));

            } else if (child instanceof SubqueryRoot) {

                // Recursion into subqueries.
                final SubqueryRoot subqueryRoot = (SubqueryRoot) child;

                doRecursiveRewrite(context, sa, queryRoot,
                        subqueryRoot.getWhereClause());

                // rewrite the sub-select
                doSelectQuery(context, sa, queryRoot, (SubqueryBase) child);

            } else if (child instanceof ServiceNode) {

                // Do not rewrite things inside of a SERVICE node.
                continue;

            }

        }

    }

    /**
     * Attempt to rewrite the SELECT.
     * 
     * @param context
     * @param sa
     * @param queryRoot
     *            The top-level of the query.
     * @param queryBase
     *            Either a top-level query or a sub-query.
     */
    private void doSelectQuery(final AST2BOpContext context,
            final StaticAnalysis sa, final QueryRoot queryRoot,
            final QueryBase queryBase) {

        // recursion first.
        doRecursiveRewrite(context, sa, queryRoot, queryBase.getWhereClause());

        if (queryBase.getQueryType() != QueryType.SELECT) {

            return;

        }

        /*
         * Looking for SELECT ?var { triple-or-quads-pattern }
         * 
         * where ?var is one of the variables in that triple or quads pattern.
         */

        final ProjectionNode projection = queryBase.getProjection();

        if (!projection.isDistinct() && !projection.isReduced()) {

            /*
             * The distinct term scan automatically eliminates duplicates.
             * Therefore it is only allowable with SELECT DISTINCT or SELECT
             * REDUCED.
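             * 
             * (A plain SELECT must preserve duplicate solutions, which a
             * distinct-term scan would silently drop, so the rewrite is not
             * applied here.)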
             */

            return;

        }

        if (projection.isEmpty())
            return;

        if (projection.arity() > 1)
            return;

        final AssignmentNode assignmentNode = projection.getExpr(0);

        if (!(assignmentNode.getValueExpressionNode() instanceof VarNode)) {

            /*
             * The projection needs to be a single, simple variable.
             */
            return;

        }

        final IVariable<?> projectedVar = assignmentNode.getVar();

        /**
         * Looking for a single triple or quad pattern in the WHERE clause.
         */

        final GraphPatternGroup<IGroupMemberNode> whereClause = queryBase
                .getWhereClause();

        if (whereClause == null || whereClause.arity() != 1) {

            // Not a simple triple pattern.
            return;

        }

        if (!(whereClause.get(0) instanceof StatementPatternNode)) {

            // Not a simple triple pattern.
            return;

        }

        // The single triple pattern.
        final StatementPatternNode sp = (StatementPatternNode) whereClause
                .get(0);

        /*
         * When in history mode, we can't do a distinct term scan unless the
         * StatementPatternNode has been marked to read history. The distinct
         * term scan will visit terms that might have been deleted.
         */
        if (context.getAbstractTripleStore().isRDRHistory()) {

            if (!sp.getQueryHintAsBoolean(QueryHints.HISTORY, false)) {

                // Can not rewrite.
                return;

            }

        }

        final IKeyOrder<ISPO> keyOrder = getApplicableKeyOrderIfExists(sp,
                projectedVar, context);

        if (keyOrder == null) {
            return;
        }

        /*
         * Make sure that there are no correlated variables in the SP, i.e. no
         * variable may appear in more than one position.
         */
        {

            final Set<VarNode> vars = new LinkedHashSet<VarNode>();

            for (VarNode varNode : BOpUtility.toList(sp, VarNode.class)) {

                if (!vars.add(varNode)) {

                    // This variable appears more than once.
                    return;

                }

            }

        }

        final Set<IVariable<?>> producedBindings = sp.getProducedBindings();

        if (!producedBindings.contains(projectedVar)) {

            /*
             * The projected variable is not any of the variables used in the
             * triple pattern.
             * 
             * Note: This rewrite is only advantageous when a single variable
             * bound by the triple pattern is projected out of the query (or
             * perhaps when 2 variables are bound by a quad pattern and
             * projected out of the query). The benefit of this rewrite is that
             * we can avoid binding the variables that are NOT projected out of
             * the query.
             * 
             * TODO Does this already handle named graph APs?
             */
            return;

        }

        if (queryBase instanceof SubqueryRoot) {

            /*
             * The pattern was detected in a sub-select.
             * 
             * Mark the SELECT as "run-once". This will cause it to be lifted
             * out as a named subquery. This is how we enforce bottom-up
             * evaluation. In this case it is REQUIRED since the semantics of
             * DISTINCT / REDUCED would not be enforced across multiple
             * as-bound invocations of the rewritten sub-SELECT.
             * 
             * FIXME Make sure that we have a correctness test for the case of
             * an embedded sub-select where the DISTINCT semantics would
             * otherwise break. (We are REQUIRED to use bottom-up evaluation in
             * order to have the semantics of DISTINCT/REDUCED across the
             * SELECT if this clause is appearing as a sub-SELECT. Otherwise
             * the sub-SELECT could be evaluated for multiple source bindings,
             * leading to multiple applications of the distinct-term-scan, and
             * that would break the DISTINCT/REDUCED semantics of the
             * operator.)
             */
            ((SubqueryRoot) queryBase).setRunOnce(true/* runOnce */);

        }

        /*
         * Disable DISTINCT/REDUCED. The distinct-term-scan will automatically
         * enforce this.
         */
        projection.setDistinct(false);
        projection.setReduced(false);

        /*
         * Setup the distinct-term-scan annotation with the variable that will
         * be projected out of the SELECT.
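         * 
         * Together with the KEY_ORDER query hint set just below, this
         * annotation is what the downstream evaluation relies on to produce a
         * distinct-term scan rather than a regular pipeline join for this
         * statement pattern.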
         */
        final VarNode distinctTermScanVar = new VarNode(projectedVar.getName());

        sp.setDistinctTermScanVar(distinctTermScanVar);

        sp.setQueryHint(IPredicate.Annotations.KEY_ORDER, keyOrder.toString());

        /**
         * Change the estimated cardinality.
         * 
         * The new cardinality is:
         * 
         * <pre>
         * newCard = oldCard * 1.0 / arity(context, sp)
         * </pre>
         * 
         * where arity() is 3 for triples and 4 for quads.
         */
        final Long oldCard = (Long) sp
                .getProperty(AST2BOpBase.Annotations.ESTIMATED_CARDINALITY);

        if (oldCard == null) {
            throw new AssertionError(
                    "Expecting estimated-cardinality to be bound: sp=" + sp);
        }

        final int arity = context.isQuads() ? 4 : 3;

        final long newCard = (long) (oldCard * 1.0 / arity);

        sp.setProperty(AST2BOpBase.Annotations.ESTIMATED_CARDINALITY, newCard);

    }

    /**
     * Computes an applicable key order for performing a distinct range term
     * scan, if one exists. Such a key order must be formed out of a prefix
     * [ConstList + DistinctVar], where ConstList is the list of constants in
     * the triple pattern.
     * 
     * @param sp
     * @param termScanVar
     * @param context
     * @return the matching key order if one exists, otherwise
     *         <code>null</code> (indicating failure)
     */
    private IKeyOrder<ISPO> getApplicableKeyOrderIfExists(
            StatementPatternNode sp, IVariable<?> termScanVar,
            AST2BOpContext context) {

        boolean isQuads = context.getAbstractTripleStore().isQuads();

        // first, construct a predicate for index probing
        IVariableOrConstant[] args = new IVariableOrConstant[isQuads ? 4 : 3];
        args[0] = sp.s().getValueExpression();
        args[1] = sp.p().getValueExpression();
        args[2] = sp.o().getValueExpression();
        if (isQuads) {
            args[3] = sp.c() == null ? Var.var("--anon-" + context.nextId())
                    : sp.c().getValueExpression();
        }

        final Set<SPOKeyOrder> candidateKeyOrders = getCandidateKeyOrders(sp,
                termScanVar, context, isQuads);

        if (candidateKeyOrders.isEmpty()) {
            return null;
        } else {
            return candidateKeyOrders.iterator().next();
        }

    }

    /**
     * Return all candidate key orders. Candidate key orders must satisfy the
     * condition that the constants in the triple pattern form a prefix,
     * followed by the term scan variable in the position right behind the
     * constant positions.
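     * <p>
     * For example (triples mode; an illustrative sketch): for the pattern
     * <code>:s ?p ?o</code> with term scan variable <code>?p</code> the
     * constant prefix is <code>S</code> and the only candidate is
     * <code>SPO</code>, while for the fully unbound pattern
     * <code>?s ?p ?o</code> the generated names are <code>PSO</code> and
     * <code>POS</code>, of which only <code>POS</code> corresponds to an
     * existing key order and is retained.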
     * 
     * @param sp
     * @param termScanVar
     * @param context
     * @param isQuads
     * @return the set of candidate key orders (possibly empty)
     */
    private Set<SPOKeyOrder> getCandidateKeyOrders(StatementPatternNode sp,
            IVariable<?> termScanVar, AST2BOpContext context, boolean isQuads) {

        /**
         * Constraints on the positions are as follows:
         * 
         * 2 - constant
         * 1 - the distinct term scan var
         * 0 - unconstrained
         */
        final StringBuffer constantPosBuf = new StringBuffer();
        Character distinctTermScanPos = null;
        final StringBuffer unconstrainedPosBuf = new StringBuffer();

        final int pcS = getPositionConstraint(sp.s().getValueExpression(),
                termScanVar);
        final int pcP = getPositionConstraint(sp.p().getValueExpression(),
                termScanVar);
        final int pcO = getPositionConstraint(sp.o().getValueExpression(),
                termScanVar);

        if (pcS == 2) constantPosBuf.append("S");
        if (pcP == 2) constantPosBuf.append("P");
        if (pcO == 2) constantPosBuf.append("O");

        if (pcS == 1) distinctTermScanPos = 'S';
        if (pcP == 1) distinctTermScanPos = 'P';
        if (pcO == 1) distinctTermScanPos = 'O';

        if (pcS == 0) unconstrainedPosBuf.append("S");
        if (pcP == 0) unconstrainedPosBuf.append("P");
        if (pcO == 0) unconstrainedPosBuf.append("O");

        if (isQuads) {

            if (sp.c() == null || sp.c().getValueExpression() == null) {

                unconstrainedPosBuf.append("C");

            } else {

                int pcC = getPositionConstraint(sp.c().getValueExpression(),
                        termScanVar);

                if (pcC == 2) constantPosBuf.append("C");
                if (pcC == 1) distinctTermScanPos = 'C';
                if (pcC == 0) unconstrainedPosBuf.append("C");

            }

        }

        /**
         * There is a known bug when all but one of the positions are
         * constrained; we do *not* want to apply the optimization in that case
         * (though it should be possible). For now, the fix is to not optimize;
         * in the future we may want to address the root cause of the issue.
         * 
         * See https://jira.blazegraph.com/browse/BLZG-1346.
         */
        final int maxLength = isQuads ? 2 : 1;
        if (constantPosBuf.length() > maxLength) {
            return new LinkedHashSet<SPOKeyOrder>();
        }

        final String prefix = constantPosBuf.toString();
        final Set<String> allPossibleConstPrefixes = new LinkedHashSet<String>();
        getPermutations(prefix, allPossibleConstPrefixes);
        if (allPossibleConstPrefixes.isEmpty())
            allPossibleConstPrefixes.add(""); // neutral element

        final String suffix = unconstrainedPosBuf.toString();
        final Set<String> allPossibleConstSuffixes = new LinkedHashSet<String>();
        getPermutations(suffix, allPossibleConstSuffixes);
        if (allPossibleConstSuffixes.isEmpty())
            allPossibleConstSuffixes.add(""); // neutral element

        // calculate set of all key order candidates
        final Set<SPOKeyOrder> allCandidateKeyOrders = new LinkedHashSet<SPOKeyOrder>();
        for (String constPrefix : allPossibleConstPrefixes) {
            for (String constSuffix : allPossibleConstSuffixes) {

                final String index = constPrefix + distinctTermScanPos
                        + constSuffix;

                try {
                    allCandidateKeyOrders.add(SPOKeyOrder.fromString(index));
                } catch (IllegalArgumentException e) {
                    // key order does not exist, ignoring
                }

            }
        }

        return allCandidateKeyOrders;

    }

    /**
     * Returns a constraint ID defined as follows:
     * 
     * 2 - constant
     * 1 - the distinct term scan var
     * 0 - unconstrained
     */
    private int getPositionConstraint(IVariableOrConstant val,
            IVariable<?> termScanVar) {

        if (val instanceof IConstant) {
            return 2;
        } else if (val instanceof IVariable) {
            return val.equals(termScanVar) ?
                    1 : 0;
        } else {
            return 0; // val == null
        }

    }

    /**
     * Collect all permutations of the characters of <code>str</code> into the
     * given set.
     */
    private void getPermutations(String str, Set<String> collector) {
        getPermutations("", str, collector);
    }

    /**
     * Recursive helper: moves each character of <code>str</code> into the
     * prefix in turn and recurses on the remaining characters.
     */
    private void getPermutations(
            String prefix, String str, Set<String> collector) {

        final int n = str.length();

        if (n == 0)
            collector.add(prefix);
        else {
            for (int i = 0; i < n; i++)
                getPermutations(prefix + str.charAt(i),
                        str.substring(0, i) + str.substring(i + 1, n),
                        collector);
        }

    }

}