package com.bigdata.rdf.sparql.ast.eval; import java.util.Arrays; import java.util.Collections; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.Set; import org.openrdf.model.URI; import org.openrdf.query.Dataset; import com.bigdata.bop.BOpContextBase; import com.bigdata.bop.Constant; import com.bigdata.bop.IVariable; import com.bigdata.bop.ap.Predicate; import com.bigdata.bop.cost.SubqueryCostReport; import com.bigdata.bop.fed.FederatedQueryEngine; import com.bigdata.rdf.internal.IV; import com.bigdata.rdf.lexicon.LexiconRelation; import com.bigdata.rdf.model.BigdataURI; import com.bigdata.rdf.store.IRawTripleStore; import com.bigdata.relation.IRelation; import com.bigdata.relation.accesspath.AccessPath; import com.bigdata.service.ResourceService; /** * Helper class summarizes the named graphs or default graph mode for a quads * query. */ @SuppressWarnings("rawtypes") public class DataSetSummary { public static Set<IV> toInternalValues(final Set<URI> graphs) { /* * Note: Per DAWG tests graph-02 and graph-04, a query against an empty * default graph collection or an empty named graph collection should * be constrained to NO graphs. This is different from the case where * the dataset is simply not specified, which is interpreted as having * no constraint on the visited graphs. If you uncomment the next two * lines, both graph-02 and graph-04 in the TCK will fail. */ // if(graphs.isEmpty()) // return null; final Set<IV> s = new LinkedHashSet<IV>(); for (URI uri : graphs) { IV iv = null; if (uri != null && uri instanceof BigdataURI) { final BigdataURI bURI = (BigdataURI) uri; iv = bURI.getIV(); } s.add(iv); } return s; } /** * The set of graphs. The {@link URI}s MUST have been resolved against the * appropriate {@link LexiconRelation} such that their term identifiers * (when the exist) are known. If any term identifier is * {@link IRawTripleStore#NULL}, then the corresponding graph does not exist * and no access path will be queried for that graph. However, a non- * {@link IRawTripleStore#NULL} term identifier may also identify a graph * which does not exist, in which case an access path will be created for * that {@link URI}s but will not visit any data. */ // public final Iterable<? extends URI> graphs; public final Set<IV> graphs; /** * The #of graphs in {@link #graphs} whose term identifier is known. While * this is not proof that there is data in the quad store for a graph having * the corresponding {@link URI}, it does allow the possibility that a graph * could exist for that {@link URI}. */ public final int nknown; /** * The #of graphs in {@link #graphs} whose term identifier is not known. For * QUERY, this is proof that there is no data in the quad store for a graph * having the corresponding {@link URI}. However, for UPDATE it is possible * that a graph could be created for that {@link URI} during an UPDATE * operation. */ public final int nunknown; /** * The {@link IV} for the first graph having a known {@link IV} and * {@link IRawTripleStore#NULL} if no graphs were specified having a known * {@link IV}. */ public final IV firstContext; /** * * @param graphs * The set of named graphs in the SPARQL DATASET (optional). A * runtime exception will be thrown during evaluation of the if * the {@link URI}s are not {@link BigdataURI}s. If * <code>graphs := null</code>, then the set of named graphs is * understood to be ALL graphs in the quad store. * @param update * When <code>true</code>, unknown {@link IV}s WILL NOT be pruned * from the {@link DataSetSummary}. This is because those graphs * might be implicitly created during an UPDATE operation. */ public DataSetSummary(final Set<IV> graphs, final boolean update) { IV firstContext = null; if (graphs == null) { nknown = Integer.MAX_VALUE; nunknown = Integer.MAX_VALUE; } else { final Iterator<IV> itr = graphs.iterator(); int nknown = 0; int nunknown = 0; while (itr.hasNext()) { final IV iv = itr.next(); if (iv == null) continue; if (iv.isNullIV()) { nunknown++; } else { if (++nknown == 1) { firstContext = iv; } } } // while this.nknown = nknown; this.nunknown = nunknown; } this.firstContext = firstContext; /* * Note: Includes unknown IVs iff [update:=true]. */ final IV[] a = new IV[nknown + (update ? nunknown : 0)]; final Iterator<IV> itr = graphs.iterator(); int nknown = 0; while (itr.hasNext()) { final IV iv = itr.next(); if (iv == null) continue; if (iv.isNullIV() && !update) { // Drop unknown IVs unless [update:=true]. continue; } a[nknown++] = iv; } // while /* * Put the graphs into termId order. Since the individual access paths * will be formed by binding [c] to each graphId in turn, evaluating * those access paths in graphId order will make better use of the * B+Tree cache as the reads will tend to be more clustered. */ Arrays.sort(a); // Populate hash set which will maintain the sorted order. this.graphs = new LinkedHashSet<IV>(nknown); for (int i = 0; i < nknown; i++) { this.graphs.add(a[i]); } } /** * Return the distinct {@link IV}s for the graphs known to the database. * * @return An ordered set of the distinct {@link IV}s. */ public Set<IV> getGraphs() { return graphs != null ? graphs : Collections.<IV> emptySet(); } /** * Estimate cost of SUBQUERY with C bound (sampling). * * @param context * @param limit * The maximum #of samples to take. * @param pred * The predicate. * * @return The estimated cost report. This is adjusted based on the sample * size and the #of graphs against which the query was issued and * represents the total expected cost of the subqueries against all * of the graphs in the {@link Dataset}. * * @todo Subquery will be less efficient than a scan when the access path is * remote since there will be remote requests. This model does not * capture that additional overhead. We need to measure the overhead * using appropriate data sets and queries and then build it into the * model. The overhead itself could be changed dramatically by * optimizations in the {@link FederatedQueryEngine} and the * {@link ResourceService}. * * @todo This should randomly sample in case there is bias. */ @SuppressWarnings({ "unchecked" }) public SubqueryCostReport estimateSubqueryCost( final BOpContextBase context, final int limit, final Predicate pred) { final IRelation r = context.getRelation(pred); double subqueryCost = 0d; long rangeCount = 0L; int nsamples = 0; // for (URI uri : graphs) { for (IV graph : graphs) { if (nsamples == limit) break; // final IV graph = ((BigdataURI) uri).getIV(); // // if (graph == null) // continue; final Predicate tmp = pred.asBound((IVariable) pred.get(3), new Constant(graph)); final AccessPath ap = (AccessPath) context.getAccessPath(r, tmp); subqueryCost += ap.estimateCost().cost; rangeCount += context.getAccessPath(context.getRelation(tmp), tmp) .rangeCount(false/* exact */); nsamples++; } subqueryCost = (subqueryCost * nknown) / nsamples; rangeCount = (rangeCount * nknown) / nsamples; return new SubqueryCostReport(nknown, limit, nsamples, rangeCount, subqueryCost); } @Override public String toString() { return "DataSetSummary{ngraphs=" + graphs.size() + ", nknown=" + nknown + ", nunknown=" + nunknown + ", graphs=" + graphs + "}" ; } @Override public boolean equals(final Object o) { if (this == o) return true; if (!(o instanceof DataSetSummary)) return false; final DataSetSummary t = (DataSetSummary) o; if (!graphs.equals(t.graphs)) return false; return true; } }