package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
final class TermsHashPerField extends InvertedDocConsumerPerField {
// Per-field consumer returned by perThread.consumer.addField(this, fieldInfo)
// in the constructor; receives newTerm/addTerm/start/finish callbacks.
final TermsHashConsumerPerField consumer;
// Next per-field hash in the chain, or null when the constructor was given
// no nextPerThread.
final TermsHashPerField nextPerField;
final TermsHashPerThread perThread;
// Taken from perThread.docState.
final DocumentsWriter.DocState docState;
// Taken from docInverterPerField.fieldState.
final FieldInvertState fieldState;
// Looked up from fieldState.attributeSource on every start(Fieldable) call.
TermToBytesRefAttribute termAtt;
// Copied from our perThread
final IntBlockPool intPool;
final ByteBlockPool bytePool;
final ByteBlockPool termBytePool;
// Value of consumer.getStreamCount(): number of byte streams kept per term.
final int streamCount;
// Always 2*streamCount; used by add()/add(int) when checking whether the
// int and byte pools have room before starting a new term.
final int numPostingInt;
final FieldInfo fieldInfo;
// Set by compactPostings(), cleared by reset(). The add methods assert it
// is false.
boolean postingsCompacted;
// Number of terms currently in the hash; also the next termID handed out.
int numPostings;
// Hash table geometry. The table starts at 4 slots, is doubled by the add
// methods once numPostings reaches the half size, and is put back to 4 by
// shrinkHash().
private int postingsHashSize = 4;
private int postingsHashHalfSize = postingsHashSize/2;
private int postingsHashMask = postingsHashSize-1;
// Open-addressed table of termIDs; -1 marks an empty slot.
private int[] postingsHash;
// Per-term parallel arrays (textStarts, intStarts, byteStarts) indexed by
// termID. Set to null by shrinkHash() and re-created by
// start(Fieldable[], int).
ParallelPostingsArray postingsArray;
// Scratch term bytes taken from perThread.utf8; filled by add() through
// termAtt.toBytesRef and compared against in postingEquals().
private final BytesRef utf8;
// Comparator installed by sortPostings() and used by comparePostings().
private Comparator<BytesRef> termComp;
/**
 * Builds the per-field hash: copies the shared pools and state from
 * {@code perThread}, allocates the initial hash table (all slots -1),
 * registers this field with the consumer and, when {@code nextPerThread}
 * is non-null, creates the chained per-field instance.
 *
 * Statement order is deliberate: {@code this} is handed to
 * {@code perThread.consumer.addField} part-way through construction.
 */
public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) {
  this.perThread = perThread;
  this.intPool = perThread.intPool;
  this.bytePool = perThread.bytePool;
  this.termBytePool = perThread.termBytePool;
  this.docState = perThread.docState;

  // Empty hash table: every slot holds -1.
  this.postingsHash = new int[postingsHashSize];
  Arrays.fill(this.postingsHash, -1);
  bytesUsed(postingsHashSize * RamUsageEstimator.NUM_BYTES_INT);

  this.fieldState = docInverterPerField.fieldState;
  this.consumer = perThread.consumer.addField(this, fieldInfo);
  initPostingsArray();

  this.streamCount = consumer.getStreamCount();
  this.numPostingInt = 2*streamCount;
  this.utf8 = perThread.utf8;
  this.fieldInfo = fieldInfo;

  this.nextPerField = (nextPerThread == null)
      ? null
      : (TermsHashPerField) nextPerThread.addField(docInverterPerField, fieldInfo);
}
/**
 * Asks the consumer for a fresh postings array (requested size 2),
 * installs it and reports its footprint via {@link #bytesUsed(long)}.
 */
private void initPostingsArray() {
  final ParallelPostingsArray fresh = consumer.createPostingsArray(2);
  postingsArray = fresh;
  bytesUsed(fresh.size * fresh.bytesPerPosting());
}
/**
 * Convenience forwarder: reports a memory delta to the DocumentsWriter,
 * but only when this terms hash tracks allocations.
 *
 * @param size number of bytes allocated (positive) or released (negative)
 */
private void bytesUsed(long size) {
  if (!perThread.termsHash.trackAllocations) {
    return;
  }
  perThread.termsHash.docWriter.bytesUsed(size);
}
/**
 * Returns the hash table to its minimal size of 4 slots and drops the
 * postings array, reporting both changes through {@link #bytesUsed(long)}.
 *
 * Must only be called once the postings were compacted or when the hash
 * holds no postings (asserted). When the table already has 4 slots it is
 * left untouched (and not refilled).
 *
 * @param targetSize currently ignored; the table always shrinks to 4
 */
void shrinkHash(int targetSize) {
  assert postingsCompacted || numPostings == 0;

  final int minimalSize = 4;
  // long on purpose so the byte delta below is computed in long arithmetic
  final long oldLength = postingsHash.length;
  if (oldLength != minimalSize) {
    postingsHash = new int[minimalSize];
    bytesUsed((minimalSize - oldLength) * RamUsageEstimator.NUM_BYTES_INT);
    Arrays.fill(postingsHash, -1);
    postingsHashSize = minimalSize;
    postingsHashHalfSize = minimalSize / 2;
    postingsHashMask = minimalSize - 1;
  }

  // Fully free the postings array on each flush:
  if (postingsArray != null) {
    bytesUsed(-postingsArray.bytesPerPosting() * postingsArray.size);
    postingsArray = null;
  }
}
/**
 * Empties the hash so it can be reused: compacts the table if that has not
 * happened yet, clears the first {@code numPostings} slots back to -1,
 * zeroes the posting count, and then resets the next field in the chain
 * (if any).
 */
public void reset() {
  if (!postingsCompacted) {
    compactPostings();
  }
  assert numPostings <= postingsHash.length;

  // After compaction all live entries sit in [0, numPostings).
  final int live = numPostings;
  if (live > 0) {
    Arrays.fill(postingsHash, 0, live, -1);
    numPostings = 0;
  }
  postingsCompacted = false;

  if (nextPerField != null) {
    nextPerField.reset();
  }
}
/**
 * Discards the current in-memory state via {@link #reset()} and then
 * aborts the next field in the chain, if one exists.
 */
@Override
public synchronized void abort() {
  reset();
  if (nextPerField == null) {
    return;
  }
  nextPerField.abort();
}
/**
 * Replaces the postings array with the result of its own {@code grow()}
 * and reports the additional bytes for the newly added entries.
 */
private final void growParallelPostingsArray() {
  final int sizeBefore = postingsArray.size;
  final ParallelPostingsArray grown = postingsArray.grow();
  postingsArray = grown;
  bytesUsed(grown.bytesPerPosting() * (grown.size - sizeBefore));
}
/**
 * Positions {@code reader} on one byte stream of a term.
 *
 * @param reader reader to initialise
 * @param termID term whose stream should be read
 * @param stream stream index; must be below {@code streamCount} (asserted)
 */
public void initReader(ByteSliceReader reader, int termID, int stream) {
  assert stream < streamCount;

  // Resolve the term's int-pool address into (buffer, offset).
  final int intStart = postingsArray.intStarts[termID];
  final int[] ints = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
  final int upto = intStart & DocumentsWriter.INT_BLOCK_MASK;

  // byteStarts holds the address recorded for stream 0 when the term was
  // created; the requested stream is addressed FIRST_LEVEL_SIZE bytes per
  // stream further on. ints[upto+stream] is that stream's current write
  // address as maintained by writeByte().
  final int streamStart = postingsArray.byteStarts[termID] + stream*ByteBlockPool.FIRST_LEVEL_SIZE;
  final int streamUpto = ints[upto+stream];
  reader.init(bytePool, streamStart, streamUpto);
}
/**
 * Packs all occupied hash slots to the front of {@code postingsHash},
 * preserving their relative order, and marks the table as compacted.
 * After this call the table can no longer be used for lookups until it
 * is reset.
 */
private synchronized void compactPostings() {
  int packed = 0;
  for (int slot = 0; slot < postingsHashSize; slot++) {
    final int termID = postingsHash[slot];
    if (termID == -1) {
      continue;
    }
    if (packed < slot) {
      // Move the entry down and free its old slot.
      postingsHash[packed] = termID;
      postingsHash[slot] = -1;
    }
    packed++;
  }

  assert packed == numPostings;
  postingsCompacted = true;
}
/**
 * Compacts the hash table and sorts the live termIDs in place by term
 * bytes.
 *
 * @param termComp comparator used to order the term bytes; it is stored
 *        on this instance for use by {@link #comparePostings(int, int)}
 * @return the internal hash array; its first {@code numPostings} entries
 *         are the sorted termIDs
 */
public int[] sortPostings(Comparator<BytesRef> termComp) {
  this.termComp = termComp;
  compactPostings();
  final int[] termIDs = postingsHash;
  quickSort(termIDs, 0, numPostings - 1);
  return termIDs;
}
/**
 * Recursively sorts {@code termIDs[lo..hi]} (both inclusive) using
 * {@link #comparePostings(int, int)} as the ordering.
 *
 * The pivot is chosen by ordering the entries at lo, mid and hi first and
 * then taking the one that ends up at mid.
 *
 * @param termIDs array of termIDs to sort in place
 * @param lo first index of the range
 * @param hi last index of the range
 */
void quickSort(int[] termIDs, int lo, int hi) {
  if (lo >= hi) {
    return;
  }
  if (hi == lo + 1) {
    // Two elements: a single compare-and-swap is enough.
    if (comparePostings(termIDs[lo], termIDs[hi]) > 0) {
      swapTermIDs(termIDs, lo, hi);
    }
    return;
  }

  final int mid = (lo + hi) >>> 1;

  // Order lo, mid, hi among themselves.
  if (comparePostings(termIDs[lo], termIDs[mid]) > 0) {
    swapTermIDs(termIDs, lo, mid);
  }
  if (comparePostings(termIDs[mid], termIDs[hi]) > 0) {
    swapTermIDs(termIDs, mid, hi);
    if (comparePostings(termIDs[lo], termIDs[mid]) > 0) {
      swapTermIDs(termIDs, lo, mid);
    }
  }

  int left = lo + 1;
  int right = hi - 1;
  if (left >= right) {
    return;
  }

  final int partition = termIDs[mid];

  while (true) {
    while (comparePostings(termIDs[right], partition) > 0) {
      --right;
    }
    while (left < right && comparePostings(termIDs[left], partition) <= 0) {
      ++left;
    }
    if (left >= right) {
      break;
    }
    swapTermIDs(termIDs, left, right);
    --right;
  }

  quickSort(termIDs, lo, left);
  quickSort(termIDs, left + 1, hi);
}

/** Exchanges the entries at positions {@code a} and {@code b}. */
private static void swapTermIDs(int[] termIDs, int a, int b) {
  final int tmp = termIDs[a];
  termIDs[a] = termIDs[b];
  termIDs[b] = tmp;
}
/**
 * Compares the term bytes of two termIDs using the comparator installed by
 * {@link #sortPostings(Comparator)}.
 *
 * @return 0 when both IDs are identical, otherwise whatever the comparator
 *         returns for the two terms (negative, zero or positive)
 */
int comparePostings(int term1, int term2) {
  if (term1 != term2) {
    // Point the two per-thread scratch refs at the stored term bytes.
    final int[] textStarts = postingsArray.textStarts;
    termBytePool.setBytesRef(perThread.tr1, textStarts[term1]);
    termBytePool.setBytesRef(perThread.tr2, textStarts[term2]);
    return termComp.compare(perThread.tr1, perThread.tr2);
  }
  // Our quicksort does this, eg during partition
  return 0;
}
/**
 * Tests whether the term bytes stored for {@code termID} are identical to
 * the current token bytes held in {@code utf8}.
 *
 * The stored form is a 1- or 2-byte length prefix followed by the term
 * bytes; a set high bit on the first byte signals the 2-byte form.
 */
private boolean postingEquals(final int termID) {
  final int textStart = postingsArray.textStarts[termID];
  final byte[] text = termBytePool.buffers[textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
  assert text != null;

  int pos = textStart & DocumentsWriter.BYTE_BLOCK_MASK;
  final byte first = text[pos];
  final int len;
  if ((first & 0x80) != 0) {
    // length is 2 bytes: low 7 bits first, remaining bits in the next byte
    len = (first & 0x7f) + ((text[pos+1] & 0xff) << 7);
    pos += 2;
  } else {
    // length is 1 byte
    len = first;
    pos += 1;
  }

  if (len != utf8.length) {
    return false;
  }

  final byte[] utf8Bytes = utf8.bytes;
  for (int tokenPos = 0; tokenPos < utf8.length; tokenPos++, pos++) {
    if (utf8Bytes[tokenPos] != text[pos]) {
      return false;
    }
  }
  return true;
}
// Result of consumer.start(fields, count) from the latest
// start(Fieldable[], int) call.
private boolean doCall;
// Result of nextPerField.start(fields, count) from the latest
// start(Fieldable[], int) call; never assigned (so it stays false) when
// nextPerField is null. add() checks it before forwarding the term to
// nextPerField.
private boolean doNextCall;
/**
 * Prepares for inverting one field instance: fetches the term attribute
 * from the current attribute source, then notifies the consumer and the
 * next field in the chain (if any).
 */
@Override
void start(Fieldable f) {
  termAtt = fieldState.attributeSource.getAttribute(TermToBytesRefAttribute.class);
  consumer.start(f);
  if (nextPerField == null) {
    return;
  }
  nextPerField.start(f);
}
/**
 * Starts processing of all instances of this field in a document.
 *
 * Re-creates the postings array if it was released by
 * {@link #shrinkHash(int)}, and records whether the consumer and the next
 * field in the chain want to see the terms.
 *
 * @return true when the consumer or the next field returned true
 */
@Override
boolean start(Fieldable[] fields, int count) throws IOException {
  doCall = consumer.start(fields, count);

  if (postingsArray == null) {
    initPostingsArray();
  }

  if (nextPerField != null) {
    doNextCall = nextPerField.start(fields, count);
  }

  if (doCall) {
    return true;
  }
  return doNextCall;
}
/**
 * Secondary entry point (for 2nd & subsequent TermsHash): the token text
 * has already been "interned" into {@code textStart} by the first hash, so
 * the term is hashed and compared by {@code textStart} alone.
 *
 * For a term not yet in the hash a new termID is assigned, one first-level
 * byte slice per stream is allocated and {@code consumer.newTerm} is
 * called; otherwise the existing term's stream pointers are loaded and
 * {@code consumer.addTerm} is called.
 *
 * @param textStart address of the interned term bytes, used as the key
 * @throws IOException declared by the signature; nothing in this method
 *         throws it directly (NOTE(review): presumably from the consumer
 *         callbacks — confirm)
 */
public void add(int textStart) throws IOException {

  int code = textStart;

  int hashPos = code & postingsHashMask;

  assert !postingsCompacted;

  // Locate the termID in the hash
  int termID = postingsHash[hashPos];

  if (termID != -1 && postingsArray.textStarts[termID] != textStart) {
    // Conflict: keep searching different locations in
    // the hash table. The increment is forced odd.
    final int inc = ((code>>8)+code)|1;
    do {
      code += inc;
      hashPos = code & postingsHashMask;
      termID = postingsHash[hashPos];
    } while (termID != -1 && postingsArray.textStarts[termID] != textStart);
  }

  if (termID == -1) {

    // First time we are seeing this token since we last
    // flushed the hash.

    // New posting
    termID = numPostings++;
    if (termID >= postingsArray.size) {
      growParallelPostingsArray();
    }

    assert termID >= 0;

    postingsArray.textStarts[termID] = textStart;

    assert postingsHash[hashPos] == -1;
    postingsHash[hashPos] = termID;

    if (numPostings == postingsHashHalfSize) {
      rehashPostings(2*postingsHashSize);
      // Report the table growth, exactly as the primary add() does. The
      // table just doubled, i.e. it gained 2*numPostings int slots.
      // Without this the growth goes unreported while shrinkHash() still
      // reports the matching release.
      bytesUsed(2*numPostings * RamUsageEstimator.NUM_BYTES_INT);
    }

    // Init stream slices
    if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE) {
      intPool.nextBuffer();
    }

    if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) {
      bytePool.nextBuffer();
    }

    // Reserve one int per stream; each holds that stream's write address.
    intUptos = intPool.buffer;
    intUptoStart = intPool.intUpto;
    intPool.intUpto += streamCount;

    postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;

    for(int i=0;i<streamCount;i++) {
      final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
      intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
    }
    // Remember where stream 0 begins so initReader() can find the data.
    postingsArray.byteStarts[termID] = intUptos[intUptoStart];

    consumer.newTerm(termID);

  } else {
    // Known term: point intUptos/intUptoStart at its stream pointers so
    // that writeByte() appends to the right streams.
    int intStart = postingsArray.intStarts[termID];
    intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
    intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
    consumer.addTerm(termID);
  }
}
// Primary entry point (for first TermsHash)
//
// Looks up the current token (as produced by termAtt.toBytesRef) in the
// hash. For a new term the bytes are copied into bytePool with a 1- or
// 2-byte length prefix, a termID is assigned, one first-level slice per
// stream is allocated and consumer.newTerm is called. For a known term the
// stream pointers are loaded and consumer.addTerm is called. In both cases
// the term's textStart is forwarded to nextPerField when doNextCall is set.
@Override
void add() throws IOException {
assert !postingsCompacted;
// We are first in the chain so we must "intern" the
// term text into textStart address
// Get the text & hash of this term.
// toBytesRef fills utf8 and returns the hash code used below.
int code = termAtt.toBytesRef(utf8);
int hashPos = code & postingsHashMask;
// Locate RawPostingList in hash
int termID = postingsHash[hashPos];
if (termID != -1 && !postingEquals(termID)) {
// Conflict: keep searching different locations in
// the hash table.
// The increment is forced odd by the |1.
final int inc = ((code>>8)+code)|1;
do {
code += inc;
hashPos = code & postingsHashMask;
termID = postingsHash[hashPos];
} while (termID != -1 && !postingEquals(termID));
}
if (termID == -1) {
// First time we are seeing this token since we last
// flushed the hash.
// Worst case space needed: 2 length bytes plus the term bytes.
final int textLen2 = 2+utf8.length;
if (textLen2 + bytePool.byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE) {
// Not enough room in current block
// Note the too-long check is only reached when the term does not fit
// in the current block.
// NOTE(review): presumably MAX_TERM_LENGTH_UTF8 is large enough that a
// too-long term can never fit in a block — confirm.
if (utf8.length > DocumentsWriter.MAX_TERM_LENGTH_UTF8) {
// Just skip this term, to remain as robust as
// possible during indexing. A TokenFilter
// can be inserted into the analyzer chain if
// other behavior is wanted (pruning the term
// to a prefix, throwing an exception, etc).
// Record a short prefix of the first skipped term only; utf8.length
// is temporarily truncated for toString() and restored in finally.
if (docState.maxTermPrefix == null) {
final int saved = utf8.length;
try {
utf8.length = Math.min(30, DocumentsWriter.MAX_TERM_LENGTH_UTF8);
docState.maxTermPrefix = utf8.toString();
} finally {
utf8.length = saved;
}
}
consumer.skippingLongTerm();
return;
}
bytePool.nextBuffer();
}
// New posting
termID = numPostings++;
if (termID >= postingsArray.size) {
growParallelPostingsArray();
}
assert termID != -1;
assert postingsHash[hashPos] == -1;
postingsHash[hashPos] = termID;
final byte[] text = bytePool.buffer;
final int textUpto = bytePool.byteUpto;
// textStart is the pool-wide address of the length prefix.
postingsArray.textStarts[termID] = textUpto + bytePool.byteOffset;
// We first encode the length, followed by the UTF8
// bytes. Length is encoded as vInt, but will consume
// 1 or 2 bytes at most (we reject too-long terms,
// above).
// encode length @ start of bytes
if (utf8.length < 128) {
// 1 byte to store length
text[textUpto] = (byte) utf8.length;
bytePool.byteUpto += utf8.length + 1;
System.arraycopy(utf8.bytes, 0, text, textUpto+1, utf8.length);
} else {
// 2 byte to store length
// Low 7 bits with the high bit set, then the remaining bits;
// postingEquals() and rehashPostings() decode this same layout.
text[textUpto] = (byte) (0x80 | (utf8.length & 0x7f));
text[textUpto+1] = (byte) ((utf8.length>>7) & 0xff);
bytePool.byteUpto += utf8.length + 2;
System.arraycopy(utf8.bytes, 0, text, textUpto+2, utf8.length);
}
// Grow the table once it is half full and report the added slots: the
// table doubled, so it gained 2*numPostings ints.
if (numPostings == postingsHashHalfSize) {
rehashPostings(2*postingsHashSize);
bytesUsed(2*numPostings * RamUsageEstimator.NUM_BYTES_INT);
}
// Init stream slices
if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE) {
intPool.nextBuffer();
}
if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) {
bytePool.nextBuffer();
}
// Reserve one int per stream; each holds that stream's write address.
intUptos = intPool.buffer;
intUptoStart = intPool.intUpto;
intPool.intUpto += streamCount;
postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;
for(int i=0;i<streamCount;i++) {
final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
}
// Remember where stream 0 begins; initReader() reads from here.
postingsArray.byteStarts[termID] = intUptos[intUptoStart];
consumer.newTerm(termID);
} else {
// Known term: load its stream pointers so writeByte() appends to the
// right streams.
final int intStart = postingsArray.intStarts[termID];
intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
consumer.addTerm(termID);
}
// Forward the interned term to the next hash in the chain, which keys
// on textStart (see add(int)).
if (doNextCall)
nextPerField.add(postingsArray.textStarts[termID]);
}
// Int-pool buffer holding the per-stream write addresses of the term most
// recently passed through add()/add(int); read and advanced by writeByte().
int[] intUptos;
// Offset within intUptos of that term's stream 0 entry; stream s lives at
// intUptos[intUptoStart+s].
int intUptoStart;
/**
 * Appends one byte to the given stream of the current term (the term most
 * recently passed through one of the add methods).
 *
 * A non-zero byte at the write position marks the end of the current
 * slice; in that case a new slice is allocated and the stream pointer is
 * redirected before the byte is stored.
 */
void writeByte(int stream, byte b) {
  final int slot = intUptoStart + stream;
  final int upto = intUptos[slot];

  byte[] bytes = bytePool.buffers[upto >> DocumentsWriter.BYTE_BLOCK_SHIFT];
  assert bytes != null;
  int offset = upto & DocumentsWriter.BYTE_BLOCK_MASK;

  if (bytes[offset] != 0) {
    // End of slice; allocate a new one
    offset = bytePool.allocSlice(bytes, offset);
    bytes = bytePool.buffer;
    intUptos[slot] = offset + bytePool.byteOffset;
  }

  bytes[offset] = b;
  intUptos[slot]++;
}
/**
 * Appends {@code len} bytes of {@code b}, starting at {@code offset}, to
 * the given stream, one byte at a time via {@link #writeByte(int, byte)}.
 */
public void writeBytes(int stream, byte[] b, int offset, int len) {
  // TODO: optimize
  final int limit = offset + len;
  int i = offset;
  while (i < limit) {
    writeByte(stream, b[i]);
    i++;
  }
}
/**
 * Writes {@code i} to the given stream in variable-length form: 7 bits per
 * byte, low-order group first, with the high bit set on every byte except
 * the last.
 *
 * @param stream stream index; must be below {@code streamCount} (asserted)
 * @param i value to encode
 */
void writeVInt(int stream, int i) {
  assert stream < streamCount;
  int remaining = i;
  for (; (remaining & ~0x7F) != 0; remaining >>>= 7) {
    writeByte(stream, (byte) ((remaining & 0x7f) | 0x80));
  }
  writeByte(stream, (byte) remaining);
}
/**
 * Signals the end of the field to the consumer and then to the next field
 * in the chain, if one exists.
 */
@Override
void finish() throws IOException {
  consumer.finish();
  if (nextPerField == null) {
    return;
  }
  nextPerField.finish();
}
/**
 * Rebuilds the postings hash with {@code newSize} slots and re-inserts
 * every live termID. Within this class it is invoked by the add methods,
 * with twice the current size, once the table becomes half full.
 *
 * For the primary hash the hash code is recomputed from the stored term
 * bytes (31-multiplier over the signed bytes); for a secondary hash the
 * term's textStart is the hash code.
 *
 * @param newSize slot count of the new table; the mask {@code newSize-1}
 *        is used for slot selection
 */
void rehashPostings(final int newSize) {

  final int mask = newSize - 1;

  final int[] rehashed = new int[newSize];
  Arrays.fill(rehashed, -1);

  for (int slot = 0; slot < postingsHashSize; slot++) {
    final int termID = postingsHash[slot];
    if (termID == -1) {
      continue;
    }

    int code;
    if (!perThread.primary) {
      code = postingsArray.textStarts[termID];
    } else {
      final int textStart = postingsArray.textStarts[termID];
      final int start = textStart & DocumentsWriter.BYTE_BLOCK_MASK;
      final byte[] text = bytePool.buffers[textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];

      // Decode the 1- or 2-byte length prefix written by add().
      final int len;
      int pos;
      if ((text[start] & 0x80) != 0) {
        len = (text[start] & 0x7f) + ((text[start+1] & 0xff) << 7);
        pos = start + 2;
      } else {
        // length is 1 byte
        len = text[start];
        pos = start + 1;
      }

      final int endPos = pos + len;
      code = 0;
      for (; pos < endPos; pos++) {
        code = (code*31) + text[pos];
      }
    }

    int hashPos = code & mask;
    assert hashPos >= 0;

    // Same probe sequence as the add methods: odd increment derived from
    // the hash code.
    final int inc = ((code>>8)+code)|1;
    while (rehashed[hashPos] != -1) {
      code += inc;
      hashPos = code & mask;
    }
    rehashed[hashPos] = termID;
  }

  postingsHashMask = mask;
  postingsHash = rehashed;
  postingsHashSize = newSize;
  postingsHashHalfSize = newSize >> 1;
}
}