/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.search.join;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.TreeSet;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
import org.apache.solr.search.BitDocSet;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.Filter;
import org.apache.solr.search.SolrIndexSearcher;

/**
 * GraphQuery - search for nodes and traverse edges in an index.
 *
 * Params:
 * fromField = the field that contains the node id
 * toField = the field that contains the edge ids
 * traversalFilter = a query that can be applied at each hop in the graph.
 * maxDepth = the max depth to traverse (the start nodes are at depth 1).
 * onlyLeafNodes = only return documents that have no edge id values.
 * returnRoot = if false, the documents matching the initial query will not be returned.
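 *
 * <p>A minimal usage sketch (the field names {@code node_id} and {@code edge_ids} are
 * illustrative only, not part of this API):
 * <pre>
 *   Query root = new TermQuery(new Term("node_id", "a"));   // seed the traversal at node "a"
 *   GraphQuery gq = new GraphQuery(root, "node_id", "edge_ids");
 *   gq.setMaxDepth(2);   // follow edges for at most two hops; -1 (the default) means unlimited
 * </pre>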
 *
 * @lucene.experimental
 */
public class GraphQuery extends Query {

  /** The initial node matching query */
  private Query q;
  /** the field with the node id */
  private String fromField;
  /** the field containing the edge ids */
  private String toField;
  /** A query to apply while traversing the graph to filter out edges */
  private Query traversalFilter;
  /** The max depth to traverse the graph, -1 means no limit. */
  private int maxDepth = -1;
  /** Use automaton compilation for graph query traversal (experimental + expert use only) */
  private boolean useAutn = true;
  /** If this is true, the graph traversal result will only return documents that
   *  do not have a value in the edge field. (Only leaf nodes returned from the graph) */
  private boolean onlyLeafNodes = false;
  /** If false, documents matching the start query for the graph are excluded from the final result set. */
  private boolean returnRoot = true;

  /**
   * Create a graph query.
   *
   * @param q the starting node query
   * @param fromField the field containing the node id
   * @param toField the field containing the edge ids
   */
  public GraphQuery(Query q, String fromField, String toField) {
    this(q, fromField, toField, null);
  }

  /**
   * Create a graph query with a traversal filter applied while traversing the frontier.
   *
   * @param q the starting node query
   * @param fromField the field containing the node id
   * @param toField the field containing the edge ids
   * @param traversalFilter the filter to be applied on each iteration of the frontier
   */
  public GraphQuery(Query q, String fromField, String toField, Query traversalFilter) {
    this.q = q;
    this.fromField = fromField;
    this.toField = toField;
    this.traversalFilter = traversalFilter;
  }

  @Override
  public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
    return new GraphQueryWeight((SolrIndexSearcher) searcher, boost);
  }

  @Override
  public String toString(String field) {
    StringBuilder sb = new StringBuilder();
    sb.append("[[" + q.toString() + "]," + fromField + "=" + toField + "]");
    if (traversalFilter != null) {
      sb.append(" [TraversalFilter: " + traversalFilter.toString() + "]");
    }
    sb.append("[maxDepth=" + maxDepth + "]");
    sb.append("[returnRoot=" + returnRoot + "]");
    sb.append("[onlyLeafNodes=" + onlyLeafNodes + "]");
    sb.append("[useAutn=" + useAutn + "]");
    return sb.toString();
  }

  protected class GraphQueryWeight extends Weight {

    final SolrIndexSearcher fromSearcher;
    private final float boost;
    private int frontierSize = 0;
    private int currentDepth = -1;
    private Filter filter;
    private DocSet resultSet;

    public GraphQueryWeight(SolrIndexSearcher searcher, float boost) {
      // Grab the searcher so we can run additional searches.
      super(null);
      this.fromSearcher = searcher;
      this.boost = boost;
    }

    @Override
    public Explanation explain(LeafReaderContext context, int doc) throws IOException {
      // currently no ranking for graph queries
      final Scorer cs = scorer(context);
      final boolean exists = (cs != null && cs.iterator().advance(doc) == doc);
      if (exists) {
        return Explanation.match(1.0F, "Graph Match", new ArrayList<Explanation>());
      } else {
        return Explanation.noMatch("No Graph Match.", new ArrayList<Explanation>());
      }
    }

    /**
     * Computes the matching doc set for a given graph query.
     *
     * @return DocSet representing the documents in the graph
     * @throws IOException if a sub-search fails... maybe other cases too!
     */
    private DocSet getDocSet() throws IOException {
      // Size that the bit set needs to be.
      int capacity = fromSearcher.getRawReader().maxDoc();
      // The bit set to contain the results that match the query.
      FixedBitSet resultBits = new FixedBitSet(capacity);
      // this holds the result at each level
      BitDocSet fromSet = null;
      // the root docs, captured only when returnRoot is false
      FixedBitSet rootBits = null;
      // the initial query is the frontier for the first iteration
      Query frontierQuery = q;
      // Find all documents in this graph that are leaf nodes, to speed traversal
      DocSet leafNodes = resolveLeafNodes(toField);
      // Start the breadth first graph traversal.
      do {
        // Increment how deep we have gone in the frontier.
        currentDepth++;
        // if we are at the max depth we don't need the graph terms collector.
        // TODO validate that the join case works properly.
        if (maxDepth != -1 && currentDepth >= maxDepth) {
          // if we've reached the max depth, don't worry about collecting edges.
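          // A plain doc-set lookup is sufficient here: nodes at the depth limit are
          // added to the result below, but their outgoing edges are never expanded
          // into a new frontier.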
          fromSet = fromSearcher.getDocSetBits(frontierQuery);
          // the frontier is explicitly empty now, so the loop can exit
          frontierSize = 0;
        } else {
          // when we're not at the max depth level, we need to collect edges.
          // Create the graph result collector for this level.
          GraphTermsCollector graphResultCollector = new GraphTermsCollector(toField, capacity, resultBits, leafNodes);
          fromSearcher.search(frontierQuery, graphResultCollector);
          fromSet = graphResultCollector.getDocSet();
          // All edge ids on the frontier.
          BytesRefHash collectorTerms = graphResultCollector.getCollectorTerms();
          frontierSize = collectorTerms.size();
          // Build the query representing the next hop of the frontier.
          FrontierQuery fq = buildFrontierQuery(collectorTerms, frontierSize);
          if (fq == null) {
            // in case we get null back, make sure we know we're done at this level
            frontierSize = 0;
          } else {
            frontierQuery = fq.getQuery();
            frontierSize = fq.getFrontierSize();
          }
        }
        if (currentDepth == 0 && !returnRoot) {
          // grab a copy of the root bits, but only if we need them
          rootBits = fromSet.getBits();
        }
        // Add the bits from this level to the result set.
        resultBits.or(fromSet.getBits());
        // if we've reached the max depth, stop here; otherwise keep looping
        // while the frontier still discovers new edges.
        if (maxDepth != -1 && currentDepth >= maxDepth) {
          break;
        }
      } while (frontierSize > 0);
      // if the root docs should not be returned, remove them from the final result set
      if (!returnRoot) {
        resultBits.andNot(rootBits);
      }
      // this is the final resulting filter
      BitDocSet resultSet = new BitDocSet(resultBits);
      // If we only want to return leaf nodes, intersect with them here.
      if (onlyLeafNodes) {
        return resultSet.intersection(leafNodes);
      } else {
        return resultSet;
      }
    }

    private DocSet resolveLeafNodes(String field) throws IOException {
      BooleanQuery.Builder leafNodeQuery = new BooleanQuery.Builder();
      WildcardQuery edgeQuery = new WildcardQuery(new Term(field, "*"));
      leafNodeQuery.add(edgeQuery, Occur.MUST_NOT);
      DocSet leafNodes = fromSearcher.getDocSet(leafNodeQuery.build());
      return leafNodes;
    }

    /** Build an automaton to represent the frontier query */
    private Automaton buildAutomaton(BytesRefHash termBytesHash) {
      // need to pass a sorted set of terms to the automaton builder (maybe a better way to avoid this?)
      final TreeSet<BytesRef> terms = new TreeSet<BytesRef>();
      for (int i = 0; i < termBytesHash.size(); i++) {
        BytesRef ref = new BytesRef();
        termBytesHash.get(i, ref);
        terms.add(ref);
      }
      return DaciukMihovAutomatonBuilder.build(terms);
    }

    /**
     * Returns a query that represents the documents matching the next hop in the graph.
     *
     * @param collectorTerms the terms that represent the edge ids for the current frontier
     * @param frontierSize the size of the frontier query (number of unique edges)
     */
    public FrontierQuery buildFrontierQuery(BytesRefHash collectorTerms, Integer frontierSize) {
      if (collectorTerms == null || collectorTerms.size() == 0) {
        // return null if there are no terms (edges) to traverse
        return null;
      } else {
        // Create a query
        Query q = null;
        // TODO: see if we should dynamically select this based on the frontier size.
        if (useAutn) {
          // build an automaton based query for the frontier
          Automaton autn = buildAutomaton(collectorTerms);
          AutomatonQuery autnQuery = new AutomatonQuery(new Term(fromField), autn);
          q = autnQuery;
        } else {
          List<BytesRef> termList = new ArrayList<>(collectorTerms.size());
          for (int i = 0; i < collectorTerms.size(); i++) {
            BytesRef ref = new BytesRef();
            collectorTerms.get(i, ref);
            termList.add(ref);
          }
          q = new TermInSetQuery(fromField, termList);
        }
        // If there is a filter to be used while crawling the graph, add it.
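        // For example, a hypothetical traversalFilter of type:person would restrict each
        // hop so that only "person" nodes get expanded; the filter is ANDed with the
        // frontier query below.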
        if (traversalFilter != null) {
          BooleanQuery.Builder builder = new BooleanQuery.Builder();
          builder.add(q, Occur.MUST);
          builder.add(traversalFilter, Occur.MUST);
          q = builder.build();
        }
        // return the new query
        return new FrontierQuery(q, frontierSize);
      }
    }

    @Override
    public Scorer scorer(LeafReaderContext context) throws IOException {
      if (filter == null) {
        resultSet = getDocSet();
        filter = resultSet.getTopFilter();
      }
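      // Note: the graph traversal above runs only once per query; each per-segment
      // scorer below is just a slice of the cached top-level filter.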
      DocIdSet readerSet = filter.getDocIdSet(context, context.reader().getLiveDocs());
      // create a scorer on the result set; if the result set is empty, use an empty iterator
      return new GraphScorer(this, readerSet == null ? DocIdSetIterator.empty() : readerSet.iterator(), 1);
    }

    @Override
    public void extractTerms(Set<Term> terms) {
      // No-op for now; not used / supported.
    }
  }

  private static class GraphScorer extends Scorer {

    final DocIdSetIterator iter;
    final float score;

    // graph query scorer constructor with iterator
    public GraphScorer(Weight w, DocIdSetIterator iter, float score) throws IOException {
      super(w);
      this.iter = (iter == null) ? DocIdSet.EMPTY.iterator() : iter;
      this.score = score;
    }

    @Override
    public float score() throws IOException {
      // no dynamic scoring for now
      return score;
    }

    @Override
    public DocIdSetIterator iterator() {
      return iter;
    }

    @Override
    public int docID() {
      // current position of the doc iterator
      return iter.docID();
    }

    @Override
    public int freq() throws IOException {
      return 1;
    }
  }

  /**
   * @return The query to be used as a filter for each hop in the graph.
   */
  public Query getTraversalFilter() {
    return traversalFilter;
  }

  public void setTraversalFilter(Query traversalFilter) {
    this.traversalFilter = traversalFilter;
  }

  public Query getQ() {
    return q;
  }

  public void setQ(Query q) {
    this.q = q;
  }

  /**
   * @return The field that contains the node id
   */
  public String getFromField() {
    return fromField;
  }

  public void setFromField(String fromField) {
    this.fromField = fromField;
  }

  /**
   * @return the field that contains the edge id(s)
   */
  public String getToField() {
    return toField;
  }

  public void setToField(String toField) {
    this.toField = toField;
  }

  /**
   * @return the max depth for traversal, -1 for no limit
   */
  public int getMaxDepth() {
    return maxDepth;
  }

  public void setMaxDepth(int maxDepth) {
    this.maxDepth = maxDepth;
  }

  /**
   * @return If true, an automaton query will be compiled for each new frontier traversal;
   *         this helps to avoid max boolean clause errors.
   */
  public boolean isUseAutn() {
    return useAutn;
  }

  public void setUseAutn(boolean useAutn) {
    this.useAutn = useAutn;
  }

  /**
   * @return if true, only documents that do not have a value in the edge id field will be returned
   */
  public boolean isOnlyLeafNodes() {
    return onlyLeafNodes;
  }

  public void setOnlyLeafNodes(boolean onlyLeafNodes) {
    this.onlyLeafNodes = onlyLeafNodes;
  }

  /**
   * @return if true, the documents that matched the root node query will be returned;
   *         otherwise they will be removed from the result set
   */
  public boolean isReturnRoot() {
    return returnRoot;
  }

  public void setReturnRoot(boolean returnRoot) {
    this.returnRoot = returnRoot;
  }

  @Override
  public int hashCode() {
    final int prime = 31;
    int result = classHash();
    result = prime * result + Objects.hashCode(fromField);
    result = prime * result + maxDepth;
    result = prime * result + (onlyLeafNodes ? 1231 : 1237);
    result = prime * result + Objects.hashCode(q);
    result = prime * result + (returnRoot ? 1231 : 1237);
    result = prime * result + Objects.hashCode(toField);
    result = prime * result + Objects.hashCode(traversalFilter);
    result = prime * result + (useAutn ? 1231 : 1237);
    return result;
  }

  @Override
  public boolean equals(Object other) {
    return sameClassAs(other) &&
           equalsTo(getClass().cast(other));
  }

  private boolean equalsTo(GraphQuery other) {
    return Objects.equals(fromField, other.fromField) &&
           maxDepth == other.maxDepth &&
           onlyLeafNodes == other.onlyLeafNodes &&
           returnRoot == other.returnRoot &&
           useAutn == other.useAutn &&
           Objects.equals(q, other.q) &&
           Objects.equals(toField, other.toField) &&
           Objects.equals(traversalFilter, other.traversalFilter);
  }
}