/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.spatial.prefix;

import com.spatial4j.core.shape.Shape;
import com.spatial4j.core.shape.SpatialRelation;
import org.apache.lucene.index.*;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.spatial.prefix.tree.Node;
import org.apache.lucene.spatial.prefix.tree.SpatialPrefixTree;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.StringHelper;

import java.io.IOException;
import java.util.LinkedList;

/**
 * Performs a spatial intersection filter between a query shape and a field
 * indexed with {@link SpatialPrefixTree}, a Trie. SPT yields terms (grids) at
 * length 1 (aka "Level 1") and at greater lengths corresponding to greater
 * precisions. This filter recursively traverses each grid length and uses
 * methods on {@link Shape} to efficiently determine whether all points under a
 * given prefix lie within the query shape, either short-circuiting unnecessary
 * traversals or loading all enclosed points at once. If no indexed data lies in
 * a portion of the query shape, then that portion is quickly passed over
 * without decomposing the shape unnecessarily.
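 * <p>
 * A rough usage sketch (illustrative only; the {@code GeohashPrefixTree} setup, the {@code ctx}
 * spatial context, the field name and the level choices below are example assumptions, not
 * requirements of this class):
 * <pre>
 *   SpatialPrefixTree grid = new GeohashPrefixTree(ctx, 11);  //ctx: some spatial4j SpatialContext
 *   Shape queryShape = ctx.makeCircle(-80.0, 33.0, 1.0);      //any Shape from the same context
 *   Filter filter = new RecursivePrefixTreeFilter("geo", grid, queryShape,
 *       grid.getMaxLevels() - 4,  //prefixGridScanLevel: at this level and deeper, scan terms instead of recursing
 *       grid.getMaxLevels());     //detailLevel: the index's full precision
 * </pre>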
 *
 * @lucene.internal
 */
public class RecursivePrefixTreeFilter extends Filter {

  /* TODOs for future:

     Can a polygon query shape be optimized / made-simpler at recursive depths
     (e.g. intersection of shape + cell box)

     RE "scan" threshold:
     // IF configured to do so, we could use term.freq() as an estimate on the number of places at this depth.
     //  OR, perhaps make estimates based on the total known term count at this level?
     if (!scan) {
       //Make some estimations on how many points there are at this level and how few there would need to be to set
       // !scan to false.
       long termsThreshold = (long) estimateNumberIndexedTerms(cell.length(), queryShape.getDocFreqExpenseThreshold(cell));
       long thisOrd = termsEnum.ord();
       scan = (termsEnum.seek(thisOrd + termsThreshold + 1) == TermsEnum.SeekStatus.END
           || !cell.contains(termsEnum.term()));
       termsEnum.seek(thisOrd);//return to last position
     }
   */

  private final String fieldName;
  private final SpatialPrefixTree grid;
  private final Shape queryShape;
  private final int prefixGridScanLevel;//at least one less than grid.getMaxLevels()
  private final int detailLevel;

  public RecursivePrefixTreeFilter(String fieldName, SpatialPrefixTree grid, Shape queryShape,
                                   int prefixGridScanLevel, int detailLevel) {
    this.fieldName = fieldName;
    this.grid = grid;
    this.queryShape = queryShape;
    this.prefixGridScanLevel = Math.max(1, Math.min(prefixGridScanLevel, grid.getMaxLevels() - 1));
    this.detailLevel = detailLevel;
    assert detailLevel <= grid.getMaxLevels();
  }

  @Override
  public DocIdSet getDocIdSet(AtomicReaderContext ctx, Bits acceptDocs) throws IOException {
    AtomicReader reader = ctx.reader();
    OpenBitSet bits = new OpenBitSet(reader.maxDoc());
    Terms terms = reader.terms(fieldName);
    if (terms == null)
      return null;
    TermsEnum termsEnum = terms.iterator(null);
    DocsEnum docsEnum = null;//cached for termsEnum.docs() calls
    Node scanCell = null;

    //cells is treated like a stack. LinkedList conveniently has bulk add to beginning. It's in sorted order so that we
    // always advance forward through the termsEnum index.
    LinkedList<Node> cells = new LinkedList<Node>(
        grid.getWorldNode().getSubCells(queryShape));

    //This is a recursive algorithm that starts with one or more "big" cells, and then recursively dives down into the
    // first such cell that intersects with the query shape. It's a depth first traversal because we don't move onto
    // the next big cell (breadth) until we're completely done considering all smaller cells beneath it. For a given
    // cell, if it's *within* the query shape then we can conveniently short-circuit the depth traversal and
    // grab all documents assigned to this cell/term.  For an intersection of the cell and query shape, we either
    // recursively step down another grid level or we decide heuristically (via prefixGridScanLevel) that there aren't
    // that many points, and so we scan through all terms within this cell (i.e. the term starts with the cell's term),
    // seeing which ones are within the query shape.
    while (!cells.isEmpty()) {
      final Node cell = cells.removeFirst();
      final BytesRef cellTerm = new BytesRef(cell.getTokenBytes());
      TermsEnum.SeekStatus seekStat = termsEnum.seekCeil(cellTerm);
      if (seekStat == TermsEnum.SeekStatus.END)
        break;
      if (seekStat == TermsEnum.SeekStatus.NOT_FOUND)
        continue;
      if (cell.getLevel() == detailLevel || cell.isLeaf()) {
        docsEnum = termsEnum.docs(acceptDocs, docsEnum, 0);
        addDocs(docsEnum, bits);
      } else {//any other intersection
        //If the next indexed term is the leaf marker, then add all of them
        BytesRef nextCellTerm = termsEnum.next();
        assert StringHelper.startsWith(nextCellTerm, cellTerm);
        scanCell = grid.getNode(nextCellTerm.bytes, nextCellTerm.offset, nextCellTerm.length, scanCell);
        if (scanCell.isLeaf()) {
          docsEnum = termsEnum.docs(acceptDocs, docsEnum, 0);
          addDocs(docsEnum, bits);
          termsEnum.next();//move pointer to avoid potential redundant addDocs() below
        }

        //Decide whether to continue to divide & conquer, or whether it's time to scan through terms beneath this cell.
        // Scanning is a performance optimization trade-off.
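        // (Illustrative: with prefixGridScanLevel = 7, for example, an intersecting cell at level 7 or deeper is
        // resolved by scanning the terms beneath it, while shallower cells keep subdividing into sub-cells.)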
        boolean scan = cell.getLevel() >= prefixGridScanLevel;//simple heuristic

        if (!scan) {
          //Divide & conquer
          cells.addAll(0, cell.getSubCells(queryShape));//add to beginning
        } else {
          //Scan through all terms within this cell to see if they are within the queryShape. No seek()s.
          for (BytesRef term = termsEnum.term(); term != null && StringHelper.startsWith(term, cellTerm); term = termsEnum.next()) {
            scanCell = grid.getNode(term.bytes, term.offset, term.length, scanCell);
            int termLevel = scanCell.getLevel();
            if (termLevel > detailLevel)
              continue;
            if (termLevel == detailLevel || scanCell.isLeaf()) {
              //TODO should put more thought into implications of box vs point
              Shape cShape = termLevel == grid.getMaxLevels() ? scanCell.getCenter() : scanCell.getShape();
              if (queryShape.relate(cShape) == SpatialRelation.DISJOINT)
                continue;

              docsEnum = termsEnum.docs(acceptDocs, docsEnum, 0);
              addDocs(docsEnum, bits);
            }
          }//term loop
        }
      }
    }//cell loop

    return bits;
  }

  private void addDocs(DocsEnum docsEnum, OpenBitSet bits) throws IOException {
    int docid;
    while ((docid = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      bits.fastSet(docid);
    }
  }

  @Override
  public String toString() {
    return getClass().getSimpleName() + "{fieldName='" + fieldName + '\'' + ", shape=" + queryShape + '}';
  }

  @Override
  public boolean equals(Object o) {
    if (this == o) return true;
    if (o == null || getClass() != o.getClass()) return false;

    RecursivePrefixTreeFilter that = (RecursivePrefixTreeFilter) o;

    if (!fieldName.equals(that.fieldName)) return false;
    //note that we don't need to look at grid since for the same field it should be the same
    if (prefixGridScanLevel != that.prefixGridScanLevel) return false;
    if (detailLevel != that.detailLevel) return false;
    if (!queryShape.equals(that.queryShape)) return false;

    return true;
  }

  @Override
  public int hashCode() {
    int result = fieldName.hashCode();
    result = 31 * result + queryShape.hashCode();
    result = 31 * result + detailLevel;
    return result;
  }
}