package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.SorterTemplate;

final class TermsHashPerField extends InvertedDocConsumerPerField {

  final TermsHashConsumerPerField consumer;
  final TermsHashPerField nextPerField;
  final TermsHashPerThread perThread;
  final DocumentsWriter.DocState docState;
  final FieldInvertState fieldState;
  CharTermAttribute termAtt;

  // Copied from our perThread
  final CharBlockPool charPool;
  final IntBlockPool intPool;
  final ByteBlockPool bytePool;

  final int streamCount;
  final int numPostingInt;

  final FieldInfo fieldInfo;

  boolean postingsCompacted;
  int numPostings;
  private int postingsHashSize = 4;
  private int postingsHashHalfSize = postingsHashSize/2;
  private int postingsHashMask = postingsHashSize-1;
  private int[] postingsHash;

  ParallelPostingsArray postingsArray;

  public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread,
                           final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) {
    this.perThread = perThread;
    intPool = perThread.intPool;
    charPool = perThread.charPool;
    bytePool = perThread.bytePool;
    docState = perThread.docState;

    postingsHash = new int[postingsHashSize];
    Arrays.fill(postingsHash, -1);
    bytesUsed(postingsHashSize * RamUsageEstimator.NUM_BYTES_INT);

    fieldState = docInverterPerField.fieldState;
    this.consumer = perThread.consumer.addField(this, fieldInfo);
    initPostingsArray();

    streamCount = consumer.getStreamCount();
    numPostingInt = 2*streamCount;
    this.fieldInfo = fieldInfo;
    if (nextPerThread != null)
      nextPerField = (TermsHashPerField) nextPerThread.addField(docInverterPerField, fieldInfo);
    else
      nextPerField = null;
  }

  private void initPostingsArray() {
    postingsArray = consumer.createPostingsArray(2);
    bytesUsed(postingsArray.size * postingsArray.bytesPerPosting());
  }

  // sugar: just forwards to DW
  private void bytesUsed(long size) {
    if (perThread.termsHash.trackAllocations) {
      perThread.termsHash.docWriter.bytesUsed(size);
    }
  }

  void shrinkHash(int targetSize) {
    assert postingsCompacted || numPostings == 0;

    final int newSize = 4;
    if (newSize != postingsHash.length) {
      final long previousSize = postingsHash.length;
      postingsHash = new int[newSize];
      bytesUsed((newSize-previousSize)*RamUsageEstimator.NUM_BYTES_INT);
      Arrays.fill(postingsHash, -1);
      postingsHashSize = newSize;
      postingsHashHalfSize = newSize/2;
      postingsHashMask = newSize-1;
    }

    // Fully free the postings array on each flush:
    if (postingsArray != null) {
      bytesUsed(-postingsArray.bytesPerPosting() * postingsArray.size);
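      // The parallel array's RAM was reported to DocumentsWriter when it was
      // allocated; subtracting it here keeps the global accounting balanced.
      // The array is re-created lazily via initPostingsArray() in
      // start(Fieldable[], int) the next time this field is used.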
      postingsArray = null;
    }
  }

  public void reset() {
    if (!postingsCompacted)
      compactPostings();
    assert numPostings <= postingsHash.length;
    if (numPostings > 0) {
      Arrays.fill(postingsHash, 0, numPostings, -1);
      numPostings = 0;
    }
    postingsCompacted = false;
    if (nextPerField != null)
      nextPerField.reset();
  }

  @Override
  synchronized public void abort() {
    reset();
    if (nextPerField != null)
      nextPerField.abort();
  }

  private final void growParallelPostingsArray() {
    int oldSize = postingsArray.size;
    this.postingsArray = this.postingsArray.grow();
    bytesUsed(postingsArray.bytesPerPosting() * (postingsArray.size - oldSize));
  }

  public void initReader(ByteSliceReader reader, int termID, int stream) {
    assert stream < streamCount;
    int intStart = postingsArray.intStarts[termID];
    final int[] ints = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
    final int upto = intStart & DocumentsWriter.INT_BLOCK_MASK;
    reader.init(bytePool,
                postingsArray.byteStarts[termID]+stream*ByteBlockPool.FIRST_LEVEL_SIZE,
                ints[upto+stream]);
  }

  private void compactPostings() {
    int upto = 0;
    for(int i=0;i<postingsHashSize;i++) {
      if (postingsHash[i] != -1) {
        if (upto < i) {
          postingsHash[upto] = postingsHash[i];
          postingsHash[i] = -1;
        }
        upto++;
      }
    }

    assert upto == numPostings: "upto=" + upto + " numPostings=" + numPostings;
    postingsCompacted = true;
  }

  /** Collapse the hash table & sort in-place. */
  public int[] sortPostings() {
    compactPostings();
    final int[] postingsHash = this.postingsHash;
    new SorterTemplate() {
      @Override
      protected void swap(int i, int j) {
        final int o = postingsHash[i];
        postingsHash[i] = postingsHash[j];
        postingsHash[j] = o;
      }

      @Override
      protected int compare(int i, int j) {
        final int term1 = postingsHash[i], term2 = postingsHash[j];
        if (term1 == term2)
          return 0;
        final int textStart1 = postingsArray.textStarts[term1], textStart2 = postingsArray.textStarts[term2];
        final char[] text1 = charPool.buffers[textStart1 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
        final int pos1 = textStart1 & DocumentsWriter.CHAR_BLOCK_MASK;
        final char[] text2 = charPool.buffers[textStart2 >> DocumentsWriter.CHAR_BLOCK_SHIFT];
        final int pos2 = textStart2 & DocumentsWriter.CHAR_BLOCK_MASK;
        return comparePostings(text1, pos1, text2, pos2);
      }

      @Override
      protected void setPivot(int i) {
        pivotTerm = postingsHash[i];
        final int textStart = postingsArray.textStarts[pivotTerm];
        pivotBuf = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
        pivotBufPos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
      }

      @Override
      protected int comparePivot(int j) {
        final int term = postingsHash[j];
        if (pivotTerm == term)
          return 0;
        final int textStart = postingsArray.textStarts[term];
        final char[] text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
        final int pos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
        return comparePostings(pivotBuf, pivotBufPos, text, pos);
      }

      private int pivotTerm, pivotBufPos;
      private char[] pivotBuf;

      /** Compares term text for two Posting instances and
       *  returns -1 if p1 < p2; 1 if p1 > p2; else 0.
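       *  Terms end with a 0xffff sentinel, so a term that is a prefix of
       *  another compares as smaller even though 0xffff is the largest char value.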
       */
      private int comparePostings(final char[] text1, int pos1, final char[] text2, int pos2) {
        assert text1 != text2 || pos1 != pos2;

        while(true) {
          final char c1 = text1[pos1++];
          final char c2 = text2[pos2++];
          if (c1 != c2) {
            if (0xffff == c2)
              return 1;
            else if (0xffff == c1)
              return -1;
            else
              return c1-c2;
          } else
            // This method should never compare equal postings
            // unless p1==p2
            assert c1 != 0xffff;
        }
      }
    }.quickSort(0, numPostings-1);
    return postingsHash;
  }

  /** Test whether the text for current RawPostingList p equals
   *  current tokenText. */
  private boolean postingEquals(final int termID, final char[] tokenText, final int tokenTextLen) {
    final int textStart = postingsArray.textStarts[termID];
    final char[] text = perThread.charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
    assert text != null;
    int pos = textStart & DocumentsWriter.CHAR_BLOCK_MASK;

    int tokenPos = 0;
    for(;tokenPos<tokenTextLen;pos++,tokenPos++)
      if (tokenText[tokenPos] != text[pos])
        return false;
    return 0xffff == text[pos];
  }

  private boolean doCall;
  private boolean doNextCall;

  @Override
  void start(Fieldable f) {
    termAtt = fieldState.attributeSource.addAttribute(CharTermAttribute.class);
    consumer.start(f);
    if (nextPerField != null) {
      nextPerField.start(f);
    }
  }

  @Override
  boolean start(Fieldable[] fields, int count) throws IOException {
    doCall = consumer.start(fields, count);
    if (postingsArray == null) {
      initPostingsArray();
    }
    if (nextPerField != null)
      doNextCall = nextPerField.start(fields, count);
    return doCall || doNextCall;
  }

  // Secondary entry point (for 2nd & subsequent TermsHash),
  // because token text has already been "interned" into
  // textStart, so we hash by textStart
  public void add(int textStart) throws IOException {
    int code = textStart;

    int hashPos = code & postingsHashMask;

    assert !postingsCompacted;

    // Locate RawPostingList in hash
    int termID = postingsHash[hashPos];

    if (termID != -1 && postingsArray.textStarts[termID] != textStart) {
      // Conflict: keep searching different locations in
      // the hash table.
      final int inc = ((code>>8)+code)|1;
      do {
        code += inc;
        hashPos = code & postingsHashMask;
        termID = postingsHash[hashPos];
      } while (termID != -1 && postingsArray.textStarts[termID] != textStart);
    }

    if (termID == -1) {

      // First time we are seeing this token since we last
      // flushed the hash.
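      // The text was already interned by the primary TermsHash, so only the
      // termID bookkeeping and the per-stream slices need to be set up here.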
      // New posting
      termID = numPostings++;
      if (termID >= postingsArray.size) {
        growParallelPostingsArray();
      }

      assert termID >= 0;

      postingsArray.textStarts[termID] = textStart;

      assert postingsHash[hashPos] == -1;

      postingsHash[hashPos] = termID;

      if (numPostings == postingsHashHalfSize)
        rehashPostings(2*postingsHashSize);

      // Init stream slices
      if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
        intPool.nextBuffer();

      if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
        bytePool.nextBuffer();

      intUptos = intPool.buffer;
      intUptoStart = intPool.intUpto;
      intPool.intUpto += streamCount;

      postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;

      for(int i=0;i<streamCount;i++) {
        final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
        intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
      }
      postingsArray.byteStarts[termID] = intUptos[intUptoStart];

      consumer.newTerm(termID);

    } else {
      int intStart = postingsArray.intStarts[termID];
      intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
      intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
      consumer.addTerm(termID);
    }
  }

  // Primary entry point (for first TermsHash)
  @Override
  void add() throws IOException {

    assert !postingsCompacted;

    // We are first in the chain so we must "intern" the
    // term text into textStart address

    // Get the text of this term.
    final char[] tokenText = termAtt.buffer();
    final int tokenTextLen = termAtt.length();

    // Compute hashcode & replace any invalid UTF16 sequences
    int downto = tokenTextLen;
    int code = 0;
    while (downto > 0) {
      char ch = tokenText[--downto];

      if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END) {
        if (0 == downto) {
          // Unpaired
          ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
        } else {
          final char ch2 = tokenText[downto-1];
          if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END) {
            // OK: high followed by low.  This is a valid
            // surrogate pair.
            code = ((code*31) + ch)*31+ch2;
            downto--;
            continue;
          } else {
            // Unpaired
            ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
          }
        }
      } else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && (ch <= UnicodeUtil.UNI_SUR_HIGH_END || ch == 0xffff)) {
        // Unpaired or 0xffff
        ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR;
      }

      code = (code*31) + ch;
    }

    int hashPos = code & postingsHashMask;

    // Locate RawPostingList in hash
    int termID = postingsHash[hashPos];

    if (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen)) {
      // Conflict: keep searching different locations in
      // the hash table.
      final int inc = ((code>>8)+code)|1;
      do {
        code += inc;
        hashPos = code & postingsHashMask;
        termID = postingsHash[hashPos];
      } while (termID != -1 && !postingEquals(termID, tokenText, tokenTextLen));
    }

    if (termID == -1) {

      // First time we are seeing this token since we last
      // flushed the hash.
      final int textLen1 = 1+tokenTextLen;
      if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE) {
        if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE) {
          // Just skip this term, to remain as robust as
          // possible during indexing.  A TokenFilter
          // can be inserted into the analyzer chain if
          // other behavior is wanted (pruning the term
          // to a prefix, throwing an exception, etc).
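          // Remember a prefix of the first oversize term seen so the problem
          // can be reported later; every oversize term is still skipped.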
          if (docState.maxTermPrefix == null)
            docState.maxTermPrefix = new String(tokenText, 0, 30);

          consumer.skippingLongTerm();
          return;
        }
        charPool.nextBuffer();
      }

      // New posting
      termID = numPostings++;
      if (termID >= postingsArray.size) {
        growParallelPostingsArray();
      }

      assert termID != -1;

      final char[] text = charPool.buffer;
      final int textUpto = charPool.charUpto;
      postingsArray.textStarts[termID] = textUpto + charPool.charOffset;
      charPool.charUpto += textLen1;
      System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen);
      text[textUpto+tokenTextLen] = 0xffff;

      assert postingsHash[hashPos] == -1;

      postingsHash[hashPos] = termID;

      if (numPostings == postingsHashHalfSize) {
        rehashPostings(2*postingsHashSize);
        bytesUsed(2*numPostings * RamUsageEstimator.NUM_BYTES_INT);
      }

      // Init stream slices
      if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
        intPool.nextBuffer();

      if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
        bytePool.nextBuffer();

      intUptos = intPool.buffer;
      intUptoStart = intPool.intUpto;
      intPool.intUpto += streamCount;

      postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;

      for(int i=0;i<streamCount;i++) {
        final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
        intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
      }
      postingsArray.byteStarts[termID] = intUptos[intUptoStart];

      consumer.newTerm(termID);

    } else {
      final int intStart = postingsArray.intStarts[termID];
      intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
      intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
      consumer.addTerm(termID);
    }

    if (doNextCall)
      nextPerField.add(postingsArray.textStarts[termID]);
  }

  int[] intUptos;
  int intUptoStart;

  void writeByte(int stream, byte b) {
    int upto = intUptos[intUptoStart+stream];
    byte[] bytes = bytePool.buffers[upto >> DocumentsWriter.BYTE_BLOCK_SHIFT];
    assert bytes != null;
    int offset = upto & DocumentsWriter.BYTE_BLOCK_MASK;
    if (bytes[offset] != 0) {
      // End of slice; allocate a new one
      offset = bytePool.allocSlice(bytes, offset);
      bytes = bytePool.buffer;
      intUptos[intUptoStart+stream] = offset + bytePool.byteOffset;
    }
    bytes[offset] = b;
    (intUptos[intUptoStart+stream])++;
  }

  public void writeBytes(int stream, byte[] b, int offset, int len) {
    // TODO: optimize
    final int end = offset + len;
    for(int i=offset;i<end;i++)
      writeByte(stream, b[i]);
  }

  void writeVInt(int stream, int i) {
    assert stream < streamCount;
    while ((i & ~0x7F) != 0) {
      writeByte(stream, (byte)((i & 0x7f) | 0x80));
      i >>>= 7;
    }
    writeByte(stream, (byte) i);
  }

  @Override
  void finish() throws IOException {
    try {
      consumer.finish();
    } finally {
      if (nextPerField != null) {
        nextPerField.finish();
      }
    }
  }

  /** Called when postings hash is too small (> 50%
   *  occupied) or too large (< 20% occupied).
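   *  The table is rebuilt at the given power-of-two size; the primary
   *  TermsHash re-hashes from the interned term text, secondary ones from
   *  the textStart address.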
   */
  void rehashPostings(final int newSize) {

    final int newMask = newSize-1;

    int[] newHash = new int[newSize];
    Arrays.fill(newHash, -1);
    for(int i=0;i<postingsHashSize;i++) {
      int termID = postingsHash[i];
      if (termID != -1) {
        int code;
        if (perThread.primary) {
          final int textStart = postingsArray.textStarts[termID];
          final int start = textStart & DocumentsWriter.CHAR_BLOCK_MASK;
          final char[] text = charPool.buffers[textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
          int pos = start;
          while(text[pos] != 0xffff)
            pos++;
          code = 0;
          while (pos > start)
            code = (code*31) + text[--pos];
        } else
          code = postingsArray.textStarts[termID];

        int hashPos = code & newMask;
        assert hashPos >= 0;
        if (newHash[hashPos] != -1) {
          final int inc = ((code>>8)+code)|1;
          do {
            code += inc;
            hashPos = code & newMask;
          } while (newHash[hashPos] != -1);
        }
        newHash[hashPos] = termID;
      }
    }

    postingsHashMask = newMask;
    postingsHash = newHash;
    postingsHashSize = newSize;
    postingsHashHalfSize = newSize >> 1;
  }
}