package org.apache.lucene.index.codecs.standard;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.packed.PackedInts;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Collection;
import java.util.Comparator;
import java.io.IOException;

/**
 * Uses a simplistic format to record terms dict index
 * information.  Limitations:
 *
 *   - Index for all fields is loaded entirely into RAM up
 *     front
 *   - Index is stored in RAM using shared byte[] that
 *     wastefully expands every term.  Using an FST to share
 *     common prefix & suffix would save RAM.
 *   - Index is taken at regular term intervals (every 128
 *     terms by default); it might be better to do it by
 *     "net docFreqs" encountered, so that for spans of
 *     low-freq terms we take the index less often.
 *
 * A better approach might be something similar to how
 * postings are encoded, w/ multi-level skips.  Ie, load all
 * terms index data into memory, as a single large compactly
 * encoded stream (eg delta bytes + delta offset).  Index
 * that w/ a multi-level skipper.  Then looking up a term
 * becomes the equivalent binary search, using the skipper
 * instead, while the data remains compressed in memory.
 *
 * @lucene.experimental
 */
public class SimpleStandardTermsIndexReader extends StandardTermsIndexReader {

  // NOTE: long is overkill here, since this number is 128
  // by default and only indexDivisor * 128 if you change
  // the indexDivisor at search time.  But, we use this in a
  // number of places to multiply out the actual ord, and we
  // will overflow int during those multiplies.  So to avoid
  // having to upgrade each multiply to long in multiple
  // places (error prone), we use long here:
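  // For example (a worked sketch, not part of the file format spec): with the
  // default indexInterval = 128 and indexDivisor = 2, totalIndexInterval is
  // 256, so roughly one out of every 256 terms has an index entry in RAM.  In
  // fillResult below, result.position = idx * totalIndexInterval; for a very
  // large segment that product can exceed Integer.MAX_VALUE, which is why a
  // long is used here.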
  private long totalIndexInterval;

  private int indexDivisor;
  final private int indexInterval;

  // Closed if indexLoaded is true:
  final private IndexInput in;
  private volatile boolean indexLoaded;

  private final Comparator<BytesRef> termComp;

  private final static int PAGED_BYTES_BITS = 15;

  // all fields share this single logical byte[]
  private final PagedBytes termBytes = new PagedBytes(PAGED_BYTES_BITS);
  private PagedBytes.Reader termBytesReader;

  final HashMap<FieldInfo,FieldIndexReader> fields = new HashMap<FieldInfo,FieldIndexReader>();

  // start of the field info data
  protected long dirOffset;

  public SimpleStandardTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, int indexDivisor, Comparator<BytesRef> termComp)
    throws IOException {

    this.termComp = termComp;

    IndexInput in = dir.openInput(IndexFileNames.segmentFileName(segment, "", StandardCodec.TERMS_INDEX_EXTENSION));

    boolean success = false;

    try {
      readHeader(in);
      indexInterval = in.readInt();
      this.indexDivisor = indexDivisor;

      if (indexDivisor < 0) {
        totalIndexInterval = indexInterval;
      } else {
        // In case the terms index is loaded, later, on demand
        totalIndexInterval = indexInterval * indexDivisor;
      }
      assert totalIndexInterval > 0;

      seekDir(in, dirOffset);

      // Read directory
      final int numFields = in.readInt();

      for(int i=0;i<numFields;i++) {
        final int field = in.readInt();
        final int numIndexTerms = in.readInt();
        final long termsStart = in.readLong();
        final long indexStart = in.readLong();
        final long packedIndexStart = in.readLong();
        final long packedOffsetsStart = in.readLong();
        assert packedIndexStart >= indexStart: "packedStart=" + packedIndexStart + " indexStart=" + indexStart + " numIndexTerms=" + numIndexTerms + " seg=" + segment;
        if (numIndexTerms > 0) {
          final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
          fields.put(fieldInfo, new FieldIndexReader(in, fieldInfo, numIndexTerms, indexStart, termsStart, packedIndexStart, packedOffsetsStart));
        }
      }
      success = true;
    } finally {
      if (indexDivisor > 0) {
        in.close();
        this.in = null;
        if (success) {
          indexLoaded = true;
        }
        termBytesReader = termBytes.freeze(true);
      } else {
        this.in = in;
      }
    }
  }

  protected void readHeader(IndexInput input) throws IOException {
    CodecUtil.checkHeader(input, SimpleStandardTermsIndexWriter.CODEC_NAME,
                          SimpleStandardTermsIndexWriter.VERSION_START, SimpleStandardTermsIndexWriter.VERSION_START);
    dirOffset = input.readLong();
  }

  private final class FieldIndexReader extends FieldReader {

    final private FieldInfo fieldInfo;

    private volatile CoreFieldIndex coreIndex;

    private final IndexInput in;

    private final long indexStart;
    private final long termsStart;
    private final long packedIndexStart;
    private final long packedOffsetsStart;

    private final int numIndexTerms;

    public FieldIndexReader(IndexInput in, FieldInfo fieldInfo, int numIndexTerms, long indexStart, long termsStart,
                            long packedIndexStart, long packedOffsetsStart) throws IOException {

      this.fieldInfo = fieldInfo;
      this.in = in;
      this.termsStart = termsStart;
      this.indexStart = indexStart;
      this.packedIndexStart = packedIndexStart;
      this.packedOffsetsStart = packedOffsetsStart;
      this.numIndexTerms = numIndexTerms;

      // We still create the indexReader when indexDivisor
      // is -1, so that StandardTermsDictReader can call
      // isIndexTerm for each field:
      if (indexDivisor > 0) {
        coreIndex = new CoreFieldIndex(indexStart,
                                       termsStart,
                                       packedIndexStart,
                                       packedOffsetsStart,
                                       numIndexTerms);
      }
    }
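    // A reader opened with indexDivisor < 0 defers building coreIndex; the
    // outer loadTermsIndex(int) calls this once the index is actually needed: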
    public void loadTermsIndex() throws IOException {
      if (coreIndex == null) {
        coreIndex = new CoreFieldIndex(indexStart, termsStart, packedIndexStart, packedOffsetsStart, numIndexTerms);
      }
    }

    @Override
    public boolean isIndexTerm(long ord, int docFreq, boolean onlyLoaded) {
      if (onlyLoaded) {
        return ord % totalIndexInterval == 0;
      } else {
        return ord % indexInterval == 0;
      }
    }

    @Override
    public boolean nextIndexTerm(long ord, TermsIndexResult result) throws IOException {
      if (coreIndex == null) {
        throw new IllegalStateException("terms index was not loaded");
      } else {
        return coreIndex.nextIndexTerm(ord, result);
      }
    }

    @Override
    public void getIndexOffset(BytesRef term, TermsIndexResult result) throws IOException {
      // You must call loadTermsIndex if you had specified -1 for indexDivisor
      if (coreIndex == null) {
        throw new IllegalStateException("terms index was not loaded");
      }
      coreIndex.getIndexOffset(term, result);
    }

    @Override
    public void getIndexOffset(long ord, TermsIndexResult result) throws IOException {
      // You must call loadTermsIndex if you had specified
      // indexDivisor < 0 to the ctor
      if (coreIndex == null) {
        throw new IllegalStateException("terms index was not loaded");
      }
      coreIndex.getIndexOffset(ord, result);
    }

    private final class CoreFieldIndex {

      // offset into index termBytes
      final private long termBytesStart;

      final PackedInts.Reader termOffsets;

      // index pointers into main terms dict
      final PackedInts.Reader termsDictOffsets;

      final int numIndexTerms;

      final long termsStart;

      public CoreFieldIndex(long indexStart, long termsStart, long packedIndexStart, long packedOffsetsStart, int numIndexTerms) throws IOException {

        this.termsStart = termsStart;
        termBytesStart = termBytes.getPointer();

        IndexInput clone = (IndexInput) in.clone();
        clone.seek(indexStart);

        // -1 is passed to mean "don't load term index", but
        // if the index is later loaded on demand this value is
        // overwritten with a real one
        assert indexDivisor > 0;

        this.numIndexTerms = 1+(numIndexTerms-1) / indexDivisor;

        assert this.numIndexTerms > 0: "numIndexTerms=" + numIndexTerms + " indexDivisor=" + indexDivisor;

        if (indexDivisor == 1) {
          // Default (load all index terms) is fast -- slurp in the images from disk:

          try {
            final long numTermBytes = packedIndexStart - indexStart;
            termBytes.copy(clone, numTermBytes);

            // records offsets into main terms dict file
            termsDictOffsets = PackedInts.getReader(clone);
            assert termsDictOffsets.size() == numIndexTerms;

            // records offsets into byte[] term data
            termOffsets = PackedInts.getReader(clone);
            assert termOffsets.size() == 1+numIndexTerms;
          } finally {
            clone.close();
          }
        } else {
          // Get packed iterators
          final IndexInput clone1 = (IndexInput) in.clone();
          final IndexInput clone2 = (IndexInput) in.clone();

          try {
            // Subsample the index terms
            clone1.seek(packedIndexStart);
            final PackedInts.ReaderIterator termsDictOffsetsIter = PackedInts.getReaderIterator(clone1);

            clone2.seek(packedOffsetsStart);
            final PackedInts.ReaderIterator termOffsetsIter = PackedInts.getReaderIterator(clone2);

            // TODO: often we can get by w/ fewer bits per
            // value, below... but this'd be more complex:
            // we'd have to try @ fewer bits and then grow
            // if we overflowed it.
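            // Keep every indexDivisor'th index entry: each pass of the loop
            // below copies one entry's terms-dict offset and its term bytes,
            // then advances both iterators past the next indexDivisor-1
            // entries.  E.g. with indexDivisor=4, on-disk entries 0, 4, 8, ...
            // survive in RAM.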
            PackedInts.Mutable termsDictOffsetsM = PackedInts.getMutable(this.numIndexTerms, termsDictOffsetsIter.getBitsPerValue());
            PackedInts.Mutable termOffsetsM = PackedInts.getMutable(this.numIndexTerms+1, termOffsetsIter.getBitsPerValue());

            termsDictOffsets = termsDictOffsetsM;
            termOffsets = termOffsetsM;

            int upto = 0;
            long termOffsetUpto = 0;

            while(upto < this.numIndexTerms) {
              // main file offset copies straight over
              termsDictOffsetsM.set(upto, termsDictOffsetsIter.next());

              termOffsetsM.set(upto, termOffsetUpto);
              upto++;

              long termOffset = termOffsetsIter.next();
              long nextTermOffset = termOffsetsIter.next();
              final int numTermBytes = (int) (nextTermOffset - termOffset);

              clone.seek(indexStart + termOffset);
              assert indexStart + termOffset < clone.length() : "indexStart=" + indexStart + " termOffset=" + termOffset + " len=" + clone.length();
              assert indexStart + termOffset + numTermBytes < clone.length();

              termBytes.copy(clone, numTermBytes);
              termOffsetUpto += numTermBytes;

              // skip terms:
              termsDictOffsetsIter.next();
              for(int i=0;i<indexDivisor-2;i++) {
                termOffsetsIter.next();
                termsDictOffsetsIter.next();
              }
            }
            termOffsetsM.set(upto, termOffsetUpto);

          } finally {
            clone1.close();
            clone2.close();
            clone.close();
          }
        }
      }

      public boolean nextIndexTerm(long ord, TermsIndexResult result) throws IOException {
        int idx = 1 + (int) (ord / totalIndexInterval);
        if (idx < numIndexTerms) {
          fillResult(idx, result);
          return true;
        } else {
          return false;
        }
      }

      private void fillResult(int idx, TermsIndexResult result) {
        final long offset = termOffsets.get(idx);
        final int length = (int) (termOffsets.get(1+idx) - offset);
        termBytesReader.fill(result.term, termBytesStart + offset, length);
        result.position = idx * totalIndexInterval;
        result.offset = termsStart + termsDictOffsets.get(idx);
      }

      public void getIndexOffset(BytesRef term, TermsIndexResult result) throws IOException {

        // binary search
        int lo = 0;
        int hi = numIndexTerms - 1;
        assert totalIndexInterval > 0 : "totalIndexInterval=" + totalIndexInterval;

        while (hi >= lo) {
          int mid = (lo + hi) >>> 1;

          final long offset = termOffsets.get(mid);
          final int length = (int) (termOffsets.get(1+mid) - offset);
          termBytesReader.fill(result.term, termBytesStart + offset, length);

          int delta = termComp.compare(term, result.term);
          if (delta < 0) {
            hi = mid - 1;
          } else if (delta > 0) {
            lo = mid + 1;
          } else {
            assert mid >= 0;
            result.position = mid*totalIndexInterval;
            result.offset = termsStart + termsDictOffsets.get(mid);
            return;
          }
        }
        if (hi < 0) {
          assert hi == -1;
          hi = 0;
        }

        final long offset = termOffsets.get(hi);
        final int length = (int) (termOffsets.get(1+hi) - offset);
        termBytesReader.fill(result.term, termBytesStart + offset, length);

        result.position = hi*totalIndexInterval;
        result.offset = termsStart + termsDictOffsets.get(hi);
      }

      public void getIndexOffset(long ord, TermsIndexResult result) throws IOException {
        int idx = (int) (ord / totalIndexInterval);
        // caller must ensure ord is in bounds
        assert idx < numIndexTerms;
        fillResult(idx, result);
      }
    }
  }

  @Override
  public void loadTermsIndex(int indexDivisor) throws IOException {
    if (!indexLoaded) {

      if (indexDivisor < 0) {
        this.indexDivisor = -indexDivisor;
      } else {
        this.indexDivisor = indexDivisor;
      }
      this.totalIndexInterval = indexInterval * this.indexDivisor;

      Iterator<FieldIndexReader> it = fields.values().iterator();
      while(it.hasNext()) {
        it.next().loadTermsIndex();
      }

      indexLoaded = true;
      in.close();
      termBytesReader = termBytes.freeze(true);
    }
  }

  @Override
  public FieldReader getField(FieldInfo fieldInfo) {
    return fields.get(fieldInfo);
  }
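  // Typical lifecycle (a sketch; the variable names below are hypothetical,
  // not part of this API): open with indexDivisor = -1 to skip loading the
  // index up front, then load it lazily with a real divisor, e.g.
  //
  //   SimpleStandardTermsIndexReader indexReader =
  //     new SimpleStandardTermsIndexReader(dir, fieldInfos, segment, -1, termComp);
  //   ...
  //   indexReader.loadTermsIndex(2);   // keep every 2nd index term in RAM
  //
  // With a positive divisor passed to the constructor, the index is loaded
  // immediately and the underlying terms-index file is closed right away.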
  public static void files(Directory dir, SegmentInfo info, Collection<String> files) {
    files.add(IndexFileNames.segmentFileName(info.name, "", StandardCodec.TERMS_INDEX_EXTENSION));
  }

  public static void getIndexExtensions(Collection<String> extensions) {
    extensions.add(StandardCodec.TERMS_INDEX_EXTENSION);
  }

  @Override
  public void getExtensions(Collection<String> extensions) {
    getIndexExtensions(extensions);
  }

  @Override
  public void close() throws IOException {
    if (in != null && !indexLoaded) {
      in.close();
    }
    if (termBytesReader != null) {
      termBytesReader.close();
    }
  }

  protected void seekDir(IndexInput input, long dirOffset) throws IOException {
    input.seek(dirOffset);
  }
}