package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;

import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;

final class TermsHashPerField extends InvertedDocConsumerPerField {

  final TermsHashConsumerPerField consumer;

  final TermsHashPerField nextPerField;
  final TermsHashPerThread perThread;
  final DocumentsWriter.DocState docState;
  final FieldInvertState fieldState;
  TermToBytesRefAttribute termAtt;

  // Copied from our perThread
  final IntBlockPool intPool;
  final ByteBlockPool bytePool;
  final ByteBlockPool termBytePool;

  final int streamCount;
  final int numPostingInt;

  final FieldInfo fieldInfo;

  boolean postingsCompacted;
  int numPostings;
  private int postingsHashSize = 4;
  private int postingsHashHalfSize = postingsHashSize/2;
  private int postingsHashMask = postingsHashSize-1;
  private int[] postingsHash;

  ParallelPostingsArray postingsArray;
  private final BytesRef utf8;
  private Comparator<BytesRef> termComp;

  public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) {
    this.perThread = perThread;
    intPool = perThread.intPool;
    bytePool = perThread.bytePool;
    termBytePool = perThread.termBytePool;
    docState = perThread.docState;

    postingsHash = new int[postingsHashSize];
    Arrays.fill(postingsHash, -1);
    bytesUsed(postingsHashSize * RamUsageEstimator.NUM_BYTES_INT);

    fieldState = docInverterPerField.fieldState;
    this.consumer = perThread.consumer.addField(this, fieldInfo);
    initPostingsArray();

    streamCount = consumer.getStreamCount();
    numPostingInt = 2*streamCount;
    utf8 = perThread.utf8;
    this.fieldInfo = fieldInfo;
    if (nextPerThread != null)
      nextPerField = (TermsHashPerField) nextPerThread.addField(docInverterPerField, fieldInfo);
    else
      nextPerField = null;
  }

  private void initPostingsArray() {
    postingsArray = consumer.createPostingsArray(2);
    bytesUsed(postingsArray.size * postingsArray.bytesPerPosting());
  }

  // sugar: just forwards to DW
  private void bytesUsed(long size) {
    if (perThread.termsHash.trackAllocations) {
      perThread.termsHash.docWriter.bytesUsed(size);
    }
  }

  void shrinkHash(int targetSize) {
    assert postingsCompacted || numPostings == 0;

    final int newSize = 4;
    if (newSize != postingsHash.length) {
      final long previousSize = postingsHash.length;
      postingsHash = new int[newSize];
      bytesUsed((newSize-previousSize)*RamUsageEstimator.NUM_BYTES_INT);
      Arrays.fill(postingsHash, -1);
      postingsHashSize = newSize;
      postingsHashHalfSize = newSize/2;
      postingsHashMask = newSize-1;
    }

    // Fully free the postings array on each flush:
    if (postingsArray != null) {
      bytesUsed(-postingsArray.bytesPerPosting() * postingsArray.size);
      postingsArray = null;
    }
  }
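
  /*
   * Illustrative note on the sizing logic above: postingsHashSize is always a
   * power of two, so postingsHashMask = postingsHashSize - 1 lets the add()
   * methods reduce a hash code with "code & postingsHashMask" instead of a
   * modulo.  With the initial size of 4 the mask is 3 and the half size is 2,
   * so the table is doubled via rehashPostings(8) as soon as two distinct
   * terms are live, keeping occupancy at or below 50%.  bytesUsed() is called
   * with signed deltas (positive on growth, negative here in shrinkHash), so
   * the DocumentsWriter RAM accounting stays in step with the allocations.
   */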

  public void reset() {
    if (!postingsCompacted)
      compactPostings();
    assert numPostings <= postingsHash.length;
    if (numPostings > 0) {
      Arrays.fill(postingsHash, 0, numPostings, -1);
      numPostings = 0;
    }
    postingsCompacted = false;
    if (nextPerField != null)
      nextPerField.reset();
  }

  @Override
  synchronized public void abort() {
    reset();
    if (nextPerField != null)
      nextPerField.abort();
  }

  private final void growParallelPostingsArray() {
    int oldSize = postingsArray.size;
    this.postingsArray = this.postingsArray.grow();
    bytesUsed(postingsArray.bytesPerPosting() * (postingsArray.size - oldSize));
  }

  public void initReader(ByteSliceReader reader, int termID, int stream) {
    assert stream < streamCount;
    int intStart = postingsArray.intStarts[termID];
    final int[] ints = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
    final int upto = intStart & DocumentsWriter.INT_BLOCK_MASK;
    reader.init(bytePool,
                postingsArray.byteStarts[termID]+stream*ByteBlockPool.FIRST_LEVEL_SIZE,
                ints[upto+stream]);
  }

  private synchronized void compactPostings() {
    int upto = 0;
    for(int i=0;i<postingsHashSize;i++) {
      if (postingsHash[i] != -1) {
        if (upto < i) {
          postingsHash[upto] = postingsHash[i];
          postingsHash[i] = -1;
        }
        upto++;
      }
    }

    assert upto == numPostings;
    postingsCompacted = true;
  }

  /** Collapse the hash table & sort in-place. */
  public int[] sortPostings(Comparator<BytesRef> termComp) {
    this.termComp = termComp;
    compactPostings();
    quickSort(postingsHash, 0, numPostings-1);
    return postingsHash;
  }

  void quickSort(int[] termIDs, int lo, int hi) {
    if (lo >= hi)
      return;
    else if (hi == 1+lo) {
      if (comparePostings(termIDs[lo], termIDs[hi]) > 0) {
        final int tmp = termIDs[lo];
        termIDs[lo] = termIDs[hi];
        termIDs[hi] = tmp;
      }
      return;
    }

    int mid = (lo + hi) >>> 1;

    if (comparePostings(termIDs[lo], termIDs[mid]) > 0) {
      int tmp = termIDs[lo];
      termIDs[lo] = termIDs[mid];
      termIDs[mid] = tmp;
    }

    if (comparePostings(termIDs[mid], termIDs[hi]) > 0) {
      int tmp = termIDs[mid];
      termIDs[mid] = termIDs[hi];
      termIDs[hi] = tmp;

      if (comparePostings(termIDs[lo], termIDs[mid]) > 0) {
        int tmp2 = termIDs[lo];
        termIDs[lo] = termIDs[mid];
        termIDs[mid] = tmp2;
      }
    }

    int left = lo + 1;
    int right = hi - 1;

    if (left >= right)
      return;

    int partition = termIDs[mid];

    for (; ;) {
      while (comparePostings(termIDs[right], partition) > 0)
        --right;

      while (left < right && comparePostings(termIDs[left], partition) <= 0)
        ++left;

      if (left < right) {
        int tmp = termIDs[left];
        termIDs[left] = termIDs[right];
        termIDs[right] = tmp;
        --right;
      } else {
        break;
      }
    }

    quickSort(termIDs, lo, left);
    quickSort(termIDs, left + 1, hi);
  }

  /** Compares term text for two Posting instances and
   *  returns -1 if p1 < p2; 1 if p1 > p2; else 0. */
  int comparePostings(int term1, int term2) {
    if (term1 == term2) {
      // Our quicksort does this, eg during partition
      return 0;
    }
    termBytePool.setBytesRef(perThread.tr1, postingsArray.textStarts[term1]);
    termBytePool.setBytesRef(perThread.tr2, postingsArray.textStarts[term2]);
    return termComp.compare(perThread.tr1, perThread.tr2);
  }
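
  /*
   * Worked example of the term layout that comparePostings() above (via
   * termBytePool.setBytesRef) and postingEquals() below both decode:
   * postingsArray.textStarts[termID] points at a one- or two-byte length
   * prefix followed by the term's UTF-8 bytes.  Lengths below 128 use a single
   * byte with the high bit clear; longer terms use two bytes, low seven bits
   * first with the high bit set.  For a term of length 300:
   *
   *   encode:  byte0 = (byte) (0x80 | (300 & 0x7f)) = (byte) 0xAC
   *            byte1 = (byte) ((300 >> 7) & 0xff)   = (byte) 0x02
   *   decode:  (0xAC & 0x7f) + ((0x02 & 0xff) << 7) = 44 + 256 = 300
   */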

  /** Test whether the term text for the given termID equals
   *  the current token bytes in utf8. */
  private boolean postingEquals(final int termID) {
    final int textStart = postingsArray.textStarts[termID];
    final byte[] text = termBytePool.buffers[textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
    assert text != null;
    int pos = textStart & DocumentsWriter.BYTE_BLOCK_MASK;

    final int len;
    if ((text[pos] & 0x80) == 0) {
      // length is 1 byte
      len = text[pos];
      pos += 1;
    } else {
      // length is 2 bytes
      len = (text[pos]&0x7f) + ((text[pos+1]&0xff)<<7);
      pos += 2;
    }

    if (len == utf8.length) {
      final byte[] utf8Bytes = utf8.bytes;
      for(int tokenPos=0;tokenPos<utf8.length;pos++,tokenPos++) {
        if (utf8Bytes[tokenPos] != text[pos]) {
          return false;
        }
      }
      return true;
    } else {
      return false;
    }
  }

  private boolean doCall;
  private boolean doNextCall;

  @Override
  void start(Fieldable f) {
    termAtt = fieldState.attributeSource.getAttribute(TermToBytesRefAttribute.class);
    consumer.start(f);
    if (nextPerField != null) {
      nextPerField.start(f);
    }
  }

  @Override
  boolean start(Fieldable[] fields, int count) throws IOException {
    doCall = consumer.start(fields, count);
    if (postingsArray == null) {
      initPostingsArray();
    }
    if (nextPerField != null)
      doNextCall = nextPerField.start(fields, count);
    return doCall || doNextCall;
  }

  // Secondary entry point (for 2nd & subsequent TermsHash),
  // because token text has already been "interned" into
  // textStart, so we hash by textStart
  public void add(int textStart) throws IOException {
    int code = textStart;

    int hashPos = code & postingsHashMask;

    assert !postingsCompacted;

    // Locate RawPostingList in hash
    int termID = postingsHash[hashPos];

    if (termID != -1 && postingsArray.textStarts[termID] != textStart) {
      // Conflict: keep searching different locations in
      // the hash table.
      final int inc = ((code>>8)+code)|1;
      do {
        code += inc;
        hashPos = code & postingsHashMask;
        termID = postingsHash[hashPos];
      } while (termID != -1 && postingsArray.textStarts[termID] != textStart);
    }

    if (termID == -1) {

      // First time we are seeing this token since we last
      // flushed the hash.

      // New posting
      termID = numPostings++;
      if (termID >= postingsArray.size) {
        growParallelPostingsArray();
      }

      assert termID >= 0;

      postingsArray.textStarts[termID] = textStart;

      assert postingsHash[hashPos] == -1;

      postingsHash[hashPos] = termID;

      if (numPostings == postingsHashHalfSize)
        rehashPostings(2*postingsHashSize);

      // Init stream slices
      if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
        intPool.nextBuffer();

      if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
        bytePool.nextBuffer();

      intUptos = intPool.buffer;
      intUptoStart = intPool.intUpto;
      intPool.intUpto += streamCount;

      postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;

      for(int i=0;i<streamCount;i++) {
        final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
        intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
      }
      postingsArray.byteStarts[termID] = intUptos[intUptoStart];

      consumer.newTerm(termID);

    } else {
      int intStart = postingsArray.intStarts[termID];
      intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
      intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
      consumer.addTerm(termID);
    }
  }
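
  /*
   * Illustrative note on the collision handling shared by both add() variants:
   * because the table size is a power of two, the probe increment
   * inc = ((code>>8)+code)|1 is forced odd and therefore coprime with the
   * size, so repeating "code += inc; hashPos = code & postingsHashMask" visits
   * every slot before any slot repeats.  For example, with postingsHashSize = 8
   * and code = 12: hashPos = 12 & 7 = 4, inc = ((12>>8)+12)|1 = 13, and the
   * following probes land on 25 & 7 = 1, 38 & 7 = 6, 51 & 7 = 3, and so on
   * through all eight slots.
   */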

  // Primary entry point (for first TermsHash)
  @Override
  void add() throws IOException {

    assert !postingsCompacted;

    // We are first in the chain so we must "intern" the
    // term text into textStart address

    // Get the text & hash of this term.
    int code = termAtt.toBytesRef(utf8);

    int hashPos = code & postingsHashMask;

    // Locate RawPostingList in hash
    int termID = postingsHash[hashPos];

    if (termID != -1 && !postingEquals(termID)) {
      // Conflict: keep searching different locations in
      // the hash table.
      final int inc = ((code>>8)+code)|1;
      do {
        code += inc;
        hashPos = code & postingsHashMask;
        termID = postingsHash[hashPos];
      } while (termID != -1 && !postingEquals(termID));
    }

    if (termID == -1) {

      // First time we are seeing this token since we last
      // flushed the hash.
      final int textLen2 = 2+utf8.length;
      if (textLen2 + bytePool.byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE) {
        // Not enough room in current block

        if (utf8.length > DocumentsWriter.MAX_TERM_LENGTH_UTF8) {
          // Just skip this term, to remain as robust as
          // possible during indexing.  A TokenFilter
          // can be inserted into the analyzer chain if
          // other behavior is wanted (pruning the term
          // to a prefix, throwing an exception, etc).
          if (docState.maxTermPrefix == null) {
            final int saved = utf8.length;
            try {
              utf8.length = Math.min(30, DocumentsWriter.MAX_TERM_LENGTH_UTF8);
              docState.maxTermPrefix = utf8.toString();
            } finally {
              utf8.length = saved;
            }
          }
          consumer.skippingLongTerm();
          return;
        }
        bytePool.nextBuffer();
      }

      // New posting
      termID = numPostings++;
      if (termID >= postingsArray.size) {
        growParallelPostingsArray();
      }

      assert termID != -1;
      assert postingsHash[hashPos] == -1;

      postingsHash[hashPos] = termID;

      final byte[] text = bytePool.buffer;
      final int textUpto = bytePool.byteUpto;
      postingsArray.textStarts[termID] = textUpto + bytePool.byteOffset;

      // We first encode the length, followed by the UTF8
      // bytes.  Length is encoded as vInt, but will consume
      // 1 or 2 bytes at most (we reject too-long terms,
      // above).

      // encode length @ start of bytes
      if (utf8.length < 128) {
        // 1 byte to store length
        text[textUpto] = (byte) utf8.length;
        bytePool.byteUpto += utf8.length + 1;
        System.arraycopy(utf8.bytes, 0, text, textUpto+1, utf8.length);
      } else {
        // 2 bytes to store length
        text[textUpto] = (byte) (0x80 | (utf8.length & 0x7f));
        text[textUpto+1] = (byte) ((utf8.length>>7) & 0xff);
        bytePool.byteUpto += utf8.length + 2;
        System.arraycopy(utf8.bytes, 0, text, textUpto+2, utf8.length);
      }

      if (numPostings == postingsHashHalfSize) {
        rehashPostings(2*postingsHashSize);
        bytesUsed(2*numPostings * RamUsageEstimator.NUM_BYTES_INT);
      }

      // Init stream slices
      if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE) {
        intPool.nextBuffer();
      }

      if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) {
        bytePool.nextBuffer();
      }

      intUptos = intPool.buffer;
      intUptoStart = intPool.intUpto;
      intPool.intUpto += streamCount;

      postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;

      for(int i=0;i<streamCount;i++) {
        final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
        intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
      }
      postingsArray.byteStarts[termID] = intUptos[intUptoStart];

      consumer.newTerm(termID);

    } else {
      final int intStart = postingsArray.intStarts[termID];
      intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
      intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
      consumer.addTerm(termID);
    }

    if (doNextCall)
      nextPerField.add(postingsArray.textStarts[termID]);
  }

  int[] intUptos;
  int intUptoStart;
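
  /*
   * Illustrative note on the slice writes below: intUptos[intUptoStart+stream]
   * holds the absolute write position of a stream inside the byte pool, so
   * writeByte() recovers the buffer with "upto >> DocumentsWriter.BYTE_BLOCK_SHIFT"
   * and the in-buffer offset with "upto & DocumentsWriter.BYTE_BLOCK_MASK"; for
   * instance, assuming the usual 32768-byte blocks, upto = 70000 maps to buffer
   * 2, offset 4464.  Unwritten positions inside a slice are zero, so a non-zero
   * byte at the write position can only be the end-of-slice marker, which is
   * when writeByte() asks the pool for the next, larger slice via allocSlice().
   */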

  void writeByte(int stream, byte b) {
    int upto = intUptos[intUptoStart+stream];
    byte[] bytes = bytePool.buffers[upto >> DocumentsWriter.BYTE_BLOCK_SHIFT];
    assert bytes != null;
    int offset = upto & DocumentsWriter.BYTE_BLOCK_MASK;
    if (bytes[offset] != 0) {
      // End of slice; allocate a new one
      offset = bytePool.allocSlice(bytes, offset);
      bytes = bytePool.buffer;
      intUptos[intUptoStart+stream] = offset + bytePool.byteOffset;
    }
    bytes[offset] = b;
    (intUptos[intUptoStart+stream])++;
  }

  public void writeBytes(int stream, byte[] b, int offset, int len) {
    // TODO: optimize
    final int end = offset + len;
    for(int i=offset;i<end;i++)
      writeByte(stream, b[i]);
  }

  void writeVInt(int stream, int i) {
    assert stream < streamCount;
    while ((i & ~0x7F) != 0) {
      writeByte(stream, (byte)((i & 0x7f) | 0x80));
      i >>>= 7;
    }
    writeByte(stream, (byte) i);
  }

  @Override
  void finish() throws IOException {
    consumer.finish();
    if (nextPerField != null)
      nextPerField.finish();
  }

  /** Called when postings hash is too small (> 50%
   *  occupied) or too large (< 20% occupied). */
  void rehashPostings(final int newSize) {

    final int newMask = newSize-1;

    int[] newHash = new int[newSize];
    Arrays.fill(newHash, -1);
    for(int i=0;i<postingsHashSize;i++) {
      int termID = postingsHash[i];
      if (termID != -1) {
        int code;
        if (perThread.primary) {
          final int textStart = postingsArray.textStarts[termID];
          final int start = textStart & DocumentsWriter.BYTE_BLOCK_MASK;
          final byte[] text = bytePool.buffers[textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
          code = 0;

          final int len;
          int pos;
          if ((text[start] & 0x80) == 0) {
            // length is 1 byte
            len = text[start];
            pos = start+1;
          } else {
            len = (text[start]&0x7f) + ((text[start+1]&0xff)<<7);
            pos = start+2;
          }

          final int endPos = pos+len;
          while(pos < endPos) {
            code = (code*31) + text[pos++];
          }
        } else {
          code = postingsArray.textStarts[termID];
        }

        int hashPos = code & newMask;
        assert hashPos >= 0;
        if (newHash[hashPos] != -1) {
          final int inc = ((code>>8)+code)|1;
          do {
            code += inc;
            hashPos = code & newMask;
          } while (newHash[hashPos] != -1);
        }
        newHash[hashPos] = termID;
      }
    }

    postingsHashMask = newMask;
    postingsHash = newHash;
    postingsHashSize = newSize;
    postingsHashHalfSize = newSize >> 1;
  }
}