SimpleTextBKDReader.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.simpletext;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.lucene.codecs.simpletext.SimpleTextUtil;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.bkd.BKDReader;

import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_COUNT;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_DOC_ID;
import static org.apache.lucene.codecs.simpletext.SimpleTextPointsWriter.BLOCK_VALUE;

/** Forked from {@link BKDReader} and simplified/specialized for SimpleText's usage */

final class SimpleTextBKDReader extends PointValues implements Accountable {
  // Packed array of byte[] holding all split values in the full binary tree:
  final private byte[] splitPackedValues; 
  final long[] leafBlockFPs;
  final private int leafNodeOffset;
  final int numDims;
  final int bytesPerDim;
  final int bytesPerIndexEntry;
  final IndexInput in;
  final int maxPointsInLeafNode;
  final byte[] minPackedValue;
  final byte[] maxPackedValue;
  final long pointCount;
  final int docCount;
  final int version;
  protected final int packedBytesLength;

  public SimpleTextBKDReader(IndexInput in, int numDims, int maxPointsInLeafNode, int bytesPerDim, long[] leafBlockFPs, byte[] splitPackedValues,
                             byte[] minPackedValue, byte[] maxPackedValue, long pointCount, int docCount) throws IOException {
    this.in = in;
    this.numDims = numDims;
    this.maxPointsInLeafNode = maxPointsInLeafNode;
    this.bytesPerDim = bytesPerDim;
    // no version check here because callers of this API (SimpleText) have no back compat:
    bytesPerIndexEntry = numDims == 1 ? bytesPerDim : bytesPerDim + 1;
    packedBytesLength = numDims * bytesPerDim;
    this.leafNodeOffset = leafBlockFPs.length;
    this.leafBlockFPs = leafBlockFPs;
    this.splitPackedValues = splitPackedValues;
    this.minPackedValue = minPackedValue;
    this.maxPackedValue = maxPackedValue;
    this.pointCount = pointCount;
    this.docCount = docCount;
    this.version = SimpleTextBKDWriter.VERSION_CURRENT;
    assert minPackedValue.length == packedBytesLength;
    assert maxPackedValue.length == packedBytesLength;
  }

  /** Used to track all state for a single call to {@link #intersect}. */
  public static final class IntersectState {
    final IndexInput in;
    final int[] scratchDocIDs;
    final byte[] scratchPackedValue;
    final int[] commonPrefixLengths;

    final IntersectVisitor visitor;

    public IntersectState(IndexInput in, int numDims,
                          int packedBytesLength,
                          int maxPointsInLeafNode,
                          IntersectVisitor visitor) {
      this.in = in;
      this.visitor = visitor;
      this.commonPrefixLengths = new int[numDims];
      this.scratchDocIDs = new int[maxPointsInLeafNode];
      this.scratchPackedValue = new byte[packedBytesLength];
    }
  }

  public void intersect(IntersectVisitor visitor) throws IOException {
    intersect(getIntersectState(visitor), 1, minPackedValue, maxPackedValue);
  }

  /** Fast path: this is called when the query box fully encompasses all cells under this node. */
  private void addAll(IntersectState state, int nodeID) throws IOException {
    //System.out.println("R: addAll nodeID=" + nodeID);

    if (nodeID >= leafNodeOffset) {
      //System.out.println("ADDALL");
      visitDocIDs(state.in, leafBlockFPs[nodeID-leafNodeOffset], state.visitor);
      // TODO: we can assert that the first value here in fact matches what the index claimed?
    } else {
      addAll(state, 2*nodeID);
      addAll(state, 2*nodeID+1);
    }
  }

  /** Create a new {@link IntersectState} */
  public IntersectState getIntersectState(IntersectVisitor visitor) {
    return new IntersectState(in.clone(), numDims,
                              packedBytesLength,
                              maxPointsInLeafNode,
                              visitor);
  }

  /** Visits all docIDs and packed values in a single leaf block */
  public void visitLeafBlockValues(int nodeID, IntersectState state) throws IOException {
    int leafID = nodeID - leafNodeOffset;

    // Leaf node; scan and filter all points in this block:
    int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs);

    // Again, this time reading values and checking with the visitor
    visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
  }

  void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException {
    BytesRefBuilder scratch = new BytesRefBuilder();
    in.seek(blockFP);
    readLine(in, scratch);
    int count = parseInt(scratch, BLOCK_COUNT);
    visitor.grow(count);
    for(int i=0;i<count;i++) {
      readLine(in, scratch);
      visitor.visit(parseInt(scratch, BLOCK_DOC_ID));
    }
  }

  int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException {
    BytesRefBuilder scratch = new BytesRefBuilder();
    in.seek(blockFP);
    readLine(in, scratch);
    int count = parseInt(scratch, BLOCK_COUNT);
    for(int i=0;i<count;i++) {
      readLine(in, scratch);
      docIDs[i] = parseInt(scratch, BLOCK_DOC_ID);
    }
    return count;
  }

  void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
    visitor.grow(count);
    // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
    assert scratchPackedValue.length == packedBytesLength;
    BytesRefBuilder scratch = new BytesRefBuilder();
    for(int i=0;i<count;i++) {
      readLine(in, scratch);
      assert startsWith(scratch, BLOCK_VALUE);
      BytesRef br = SimpleTextUtil.fromBytesRefString(stripPrefix(scratch, BLOCK_VALUE));
      assert br.length == packedBytesLength;
      System.arraycopy(br.bytes, br.offset, scratchPackedValue, 0, packedBytesLength);
      visitor.visit(docIDs[i], scratchPackedValue);
    }
  }

  private void visitCompressedDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor, int compressedDim) throws IOException {
    // the byte at `compressedByteOffset` is compressed using run-length compression,
    // other suffix bytes are stored verbatim
    final int compressedByteOffset = compressedDim * bytesPerDim + commonPrefixLengths[compressedDim];
    commonPrefixLengths[compressedDim]++;
    int i;
    for (i = 0; i < count; ) {
      scratchPackedValue[compressedByteOffset] = in.readByte();
      final int runLen = Byte.toUnsignedInt(in.readByte());
      for (int j = 0; j < runLen; ++j) {
        for(int dim=0;dim<numDims;dim++) {
          int prefix = commonPrefixLengths[dim];
          in.readBytes(scratchPackedValue, dim*bytesPerDim + prefix, bytesPerDim - prefix);
        }
        visitor.visit(docIDs[i+j], scratchPackedValue);
      }
      i += runLen;
    }
    if (i != count) {
      throw new CorruptIndexException("Sub blocks do not add up to the expected count: " + count + " != " + i, in);
    }
  }

  private int readCompressedDim(IndexInput in) throws IOException {
    int compressedDim = in.readByte();
    if (compressedDim < -1 || compressedDim >= numDims) {
      throw new CorruptIndexException("Got compressedDim="+compressedDim, in);
    }
    return compressedDim;
  }

  private void readCommonPrefixes(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in) throws IOException {
    for(int dim=0;dim<numDims;dim++) {
      int prefix = in.readVInt();
      commonPrefixLengths[dim] = prefix;
      if (prefix > 0) {
        in.readBytes(scratchPackedValue, dim*bytesPerDim, prefix);
      }
      //System.out.println("R: " + dim + " of " + numDims + " prefix=" + prefix);
    }
  }

  private void intersect(IntersectState state,
                         int nodeID,
                         byte[] cellMinPacked, byte[] cellMaxPacked)
    throws IOException {

    /*
    System.out.println("\nR: intersect nodeID=" + nodeID);
    for(int dim=0;dim<numDims;dim++) {
      System.out.println("  dim=" + dim + "\n    cellMin=" + new BytesRef(cellMinPacked, dim*bytesPerDim, bytesPerDim) + "\n    cellMax=" + new BytesRef(cellMaxPacked, dim*bytesPerDim, bytesPerDim));
    }
    */

    Relation r = state.visitor.compare(cellMinPacked, cellMaxPacked);

    if (r == Relation.CELL_OUTSIDE_QUERY) {
      // This cell is fully outside of the query shape: stop recursing
      return;
    } else if (r == Relation.CELL_INSIDE_QUERY) {
      // This cell is fully inside of the query shape: recursively add all points in this cell without filtering
      addAll(state, nodeID);
      return;
    } else {
      // The cell crosses the shape boundary, or the cell fully contains the query, so we fall through and do full filtering
    }

    if (nodeID >= leafNodeOffset) {
      // TODO: we can assert that the first value here in fact matches what the index claimed?

      int leafID = nodeID - leafNodeOffset;
      
      // In the unbalanced case it's possible the left most node only has one child:
      if (leafID < leafBlockFPs.length) {
        // Leaf node; scan and filter all points in this block:
        int count = readDocIDs(state.in, leafBlockFPs[leafID], state.scratchDocIDs);

        // Again, this time reading values and checking with the visitor
        visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
      }

    } else {
      
      // Non-leaf node: recurse on the split left and right nodes

      int address = nodeID * bytesPerIndexEntry;
      int splitDim;
      if (numDims == 1) {
        splitDim = 0;
      } else {
        splitDim = splitPackedValues[address++] & 0xff;
      }
      
      assert splitDim < numDims;

      // TODO: can we alloc & reuse this up front?

      byte[] splitPackedValue = new byte[packedBytesLength];

      // Recurse on left sub-tree:
      System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedBytesLength);
      System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
      intersect(state,
                2*nodeID,
                cellMinPacked, splitPackedValue);

      // Recurse on right sub-tree:
      System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedBytesLength);
      System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
      intersect(state,
                2*nodeID+1,
                splitPackedValue, cellMaxPacked);
    }
  }

  @Override
  public long estimatePointCount(IntersectVisitor visitor) {
    return estimatePointCount(getIntersectState(visitor), 1, minPackedValue, maxPackedValue);
  }

  private long estimatePointCount(IntersectState state,
      int nodeID, byte[] cellMinPacked, byte[] cellMaxPacked) {
    Relation r = state.visitor.compare(cellMinPacked, cellMaxPacked);

    if (r == Relation.CELL_OUTSIDE_QUERY) {
      // This cell is fully outside of the query shape: stop recursing
      return 0L;
    } else if (nodeID >= leafNodeOffset) {
      // Assume all points match and there are no dups
      return maxPointsInLeafNode;
    } else {
      
      // Non-leaf node: recurse on the split left and right nodes

      int address = nodeID * bytesPerIndexEntry;
      int splitDim;
      if (numDims == 1) {
        splitDim = 0;
      } else {
        splitDim = splitPackedValues[address++] & 0xff;
      }
      
      assert splitDim < numDims;

      // TODO: can we alloc & reuse this up front?

      byte[] splitPackedValue = new byte[packedBytesLength];

      // Recurse on left sub-tree:
      System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedBytesLength);
      System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
      final long leftCost = estimatePointCount(state,
                2*nodeID,
                cellMinPacked, splitPackedValue);

      // Recurse on right sub-tree:
      System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedBytesLength);
      System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
      final long rightCost = estimatePointCount(state,
                2*nodeID+1,
                splitPackedValue, cellMaxPacked);
      return leftCost + rightCost;
    }
  }

  /** Copies the split value for this node into the provided byte array */
  public void copySplitValue(int nodeID, byte[] splitPackedValue) {
    int address = nodeID * bytesPerIndexEntry;
    int splitDim;
    if (numDims == 1) {
      splitDim = 0;
    } else {
      splitDim = splitPackedValues[address++] & 0xff;
    }
    
    assert splitDim < numDims;
    System.arraycopy(splitPackedValues, address, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
  }

  @Override
  public long ramBytesUsed() {
    return RamUsageEstimator.sizeOf(splitPackedValues) +
        RamUsageEstimator.sizeOf(leafBlockFPs);
  }

  @Override
  public byte[] getMinPackedValue() {
    return minPackedValue.clone();
  }

  @Override
  public byte[] getMaxPackedValue() {
    return maxPackedValue.clone();
  }

  @Override
  public int getNumDimensions() {
    return numDims;
  }

  @Override
  public int getBytesPerDimension() {
    return bytesPerDim;
  }

  @Override
  public long size() {
    return pointCount;
  }

  @Override
  public int getDocCount() {
    return docCount;
  }

  public boolean isLeafNode(int nodeID) {
    return nodeID >= leafNodeOffset;
  }

  private int parseInt(BytesRefBuilder scratch, BytesRef prefix) {
    assert startsWith(scratch, prefix);
    return Integer.parseInt(stripPrefix(scratch, prefix));
  }

  private String stripPrefix(BytesRefBuilder scratch, BytesRef prefix) {
    return new String(scratch.bytes(), prefix.length, scratch.length() - prefix.length, StandardCharsets.UTF_8);
  }

  private boolean startsWith(BytesRefBuilder scratch, BytesRef prefix) {
    return StringHelper.startsWith(scratch.get(), prefix);
  }

  private void readLine(IndexInput in, BytesRefBuilder scratch) throws IOException {
    SimpleTextUtil.readLine(in, scratch);
  }
}