package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
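/** Collects the terms, frequencies and (optionally) positions and offsets for
 *  one field of the current document, and writes them into the per-document
 *  tvf buffer that TermVectorsTermsWriter later flushes to the real term
 *  vectors files in the Directory.
 *
 *  <p>This consumer only sees fields whose term vectors are enabled, e.g. a
 *  field added roughly like this (illustrative sketch using the
 *  Field.TermVector option):
 *
 *  <pre>
 *  doc.add(new Field("body", text, Field.Store.NO, Field.Index.ANALYZED,
 *                    Field.TermVector.WITH_POSITIONS_OFFSETS));
 *  </pre>
 */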
final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
final TermVectorsTermsWriterPerThread perThread;
final TermsHashPerField termsHashPerField;
final TermVectorsTermsWriter termsWriter;
final FieldInfo fieldInfo;
final DocumentsWriter.DocState docState;
final FieldInvertState fieldState;
boolean doVectors;
boolean doVectorPositions;
boolean doVectorOffsets;
int maxNumPostings;
OffsetAttribute offsetAttribute = null;
public TermVectorsTermsWriterPerField(TermsHashPerField termsHashPerField, TermVectorsTermsWriterPerThread perThread, FieldInfo fieldInfo) {
this.termsHashPerField = termsHashPerField;
this.perThread = perThread;
this.termsWriter = perThread.termsWriter;
this.fieldInfo = fieldInfo;
docState = termsHashPerField.docState;
fieldState = termsHashPerField.fieldState;
}
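/** Two int streams are written per term: stream 0 holds positions,
 *  stream 1 holds offsets. */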
@Override
int getStreamCount() {
return 2;
}
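/** Scans all instances of this field in the document to decide whether term
 *  vectors (and positions/offsets) should be written, and lazily allocates
 *  the per-document state the first time vectors are enabled for this doc. */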
@Override
boolean start(Fieldable[] fields, int count) {
doVectors = false;
doVectorPositions = false;
doVectorOffsets = false;
for(int i=0;i<count;i++) {
Fieldable field = fields[i];
if (field.isIndexed() && field.isTermVectorStored()) {
doVectors = true;
doVectorPositions |= field.isStorePositionWithTermVector();
doVectorOffsets |= field.isStoreOffsetWithTermVector();
}
}
if (doVectors) {
if (perThread.doc == null) {
perThread.doc = termsWriter.getPerDoc();
perThread.doc.docID = docState.docID;
assert perThread.doc.numVectorFields == 0;
assert 0 == perThread.doc.perDocTvf.length();
assert 0 == perThread.doc.perDocTvf.getFilePointer();
} else {
assert perThread.doc.docID == docState.docID;
if (termsHashPerField.numPostings != 0)
// Only necessary if previous doc hit a
// non-aborting exception while writing vectors in
// this field:
termsHashPerField.reset();
}
}
// TODO: only if needed for performance
//perThread.postingsCount = 0;
return doVectors;
}
public void abort() {}
/** Called once per field per document if term vectors
* are enabled, to write the vectors to
* a RAMOutputStream, which is then quickly flushed to
* the real term vectors files in the Directory. */
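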
@Override
void finish() throws IOException {
assert docState.testPoint("TermVectorsTermsWriterPerField.finish start");
final int numPostings = termsHashPerField.numPostings;
final BytesRef flushTerm = perThread.flushTerm;
assert numPostings >= 0;
if (!doVectors || numPostings == 0)
return;
if (numPostings > maxNumPostings)
maxNumPostings = numPostings;
final IndexOutput tvf = perThread.doc.perDocTvf;
// This is called once, after inverting all occurrences
// of a given field in the doc. At this point we flush
// our hash into the DocWriter.
assert fieldInfo.storeTermVector;
assert perThread.vectorFieldsInOrder(fieldInfo);
perThread.doc.addField(termsHashPerField.fieldInfo.number);
TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
// TODO: we may want to make this sort in same order
// as Codec's terms dict?
final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUnicodeComparator());
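// Write the header for this field's vectors: the unique term count, then a
// flags byte recording whether positions and/or offsets follow each term.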
tvf.writeVInt(numPostings);
byte bits = 0x0;
if (doVectorPositions)
bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
if (doVectorOffsets)
bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
tvf.writeByte(bits);
int lastLen = 0;
byte[] lastBytes = null;
int lastStart = 0;
final ByteSliceReader reader = perThread.vectorSliceReader;
final ByteBlockPool termBytePool = perThread.termsHashPerThread.termBytePool;
for(int j=0;j<numPostings;j++) {
final int termID = termIDs[j];
final int freq = postings.freqs[termID];
// Get BytesRef
termBytePool.setBytesRef(flushTerm, postings.textStarts[termID]);
// Compute common byte prefix between last term and
// this term
int prefix = 0;
if (j > 0) {
while(prefix < lastLen && prefix < flushTerm.length) {
if (lastBytes[lastStart+prefix] != flushTerm.bytes[flushTerm.offset+prefix]) {
break;
}
prefix++;
}
}
lastLen = flushTerm.length;
lastBytes = flushTerm.bytes;
lastStart = flushTerm.offset;
final int suffix = flushTerm.length - prefix;
tvf.writeVInt(prefix);
tvf.writeVInt(suffix);
tvf.writeBytes(flushTerm.bytes, lastStart+prefix, suffix);
tvf.writeVInt(freq);
if (doVectorPositions) {
termsHashPerField.initReader(reader, termID, 0);
reader.writeTo(tvf);
}
if (doVectorOffsets) {
termsHashPerField.initReader(reader, termID, 1);
reader.writeTo(tvf);
}
}
termsHashPerField.reset();
perThread.termsHashPerThread.reset(false);
}
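/** Trims the postings hash back down after flushing, sized by the largest
 *  number of unique terms seen for this field. */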
void shrinkHash() {
termsHashPerField.shrinkHash(maxNumPostings);
maxNumPostings = 0;
}
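/** Called per field instance before its tokens are inverted; grabs the
 *  OffsetAttribute from the attribute source when offsets are stored. */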
@Override
void start(Fieldable f) {
if (doVectorOffsets) {
offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
} else {
offsetAttribute = null;
}
}
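/** First occurrence of a term in the current doc: record freq=1, write the
 *  absolute start offset and length to stream 1, and the absolute position
 *  to stream 0. */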
@Override
void newTerm(final int termID) {
assert docState.testPoint("TermVectorsTermsWriterPerField.newTerm start");
TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
postings.freqs[termID] = 1;
if (doVectorOffsets) {
int startOffset = fieldState.offset + offsetAttribute.startOffset();
int endOffset = fieldState.offset + offsetAttribute.endOffset();
termsHashPerField.writeVInt(1, startOffset);
termsHashPerField.writeVInt(1, endOffset - startOffset);
postings.lastOffsets[termID] = endOffset;
}
if (doVectorPositions) {
termsHashPerField.writeVInt(0, fieldState.position);
postings.lastPositions[termID] = fieldState.position;
}
}
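/** Subsequent occurrence of a term: bump its freq and write offsets and
 *  positions delta-coded against the previous occurrence. */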
@Override
void addTerm(final int termID) {
assert docState.testPoint("TermVectorsTermsWriterPerField.addTerm start");
TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
postings.freqs[termID]++;
if (doVectorOffsets) {
int startOffset = fieldState.offset + offsetAttribute.startOffset();
int endOffset = fieldState.offset + offsetAttribute.endOffset();
termsHashPerField.writeVInt(1, startOffset - postings.lastOffsets[termID]);
termsHashPerField.writeVInt(1, endOffset - startOffset);
postings.lastOffsets[termID] = endOffset;
}
if (doVectorPositions) {
termsHashPerField.writeVInt(0, fieldState.position - postings.lastPositions[termID]);
postings.lastPositions[termID] = fieldState.position;
}
}
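/** Nothing extra to do when an over-long term is skipped. */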
@Override
void skippingLongTerm() {}
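/** Term vectors track freq plus the last offset/position per term, so use
 *  the specialized parallel array below. */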
@Override
ParallelPostingsArray createPostingsArray(int size) {
return new TermVectorsPostingsArray(size);
}
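/** Parallel per-term arrays used while building term vectors: the in-doc
 *  frequency and the last offset/position written, needed for delta coding. */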
static final class TermVectorsPostingsArray extends ParallelPostingsArray {
public TermVectorsPostingsArray(int size) {
super(size);
freqs = new int[size];
lastOffsets = new int[size];
lastPositions = new int[size];
}
int[] freqs; // How many times this term occurred in the current doc
int[] lastOffsets; // Last offset we saw
int[] lastPositions; // Last position where this term occurred
@Override
ParallelPostingsArray newInstance(int size) {
return new TermVectorsPostingsArray(size);
}
@Override
void copyTo(ParallelPostingsArray toArray, int numToCopy) {
assert toArray instanceof TermVectorsPostingsArray;
TermVectorsPostingsArray to = (TermVectorsPostingsArray) toArray;
super.copyTo(toArray, numToCopy);
System.arraycopy(freqs, 0, to.freqs, 0, numToCopy);
System.arraycopy(lastOffsets, 0, to.lastOffsets, 0, numToCopy);
System.arraycopy(lastPositions, 0, to.lastPositions, 0, numToCopy);
}
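/** Each posting needs three extra ints beyond the base arrays: freq, last
 *  offset, last position. */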
@Override
int bytesPerPosting() {
return super.bytesPerPosting() + 3 * DocumentsWriter.INT_NUM_BYTE;
}
}
}