package org.apache.lucene.index.codecs.pulsing;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.index.codecs.standard.StandardPostingsWriter;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
// TODO: we now pulse entirely according to docFreq of the
// term; it might be better to eg pulse by "net bytes used"
// so that a term that has only 1 doc but zillions of
// positions would not be inlined. Though this is
// presumably rare in practice...
/**
 * Postings writer that buffers the first docs of every term in memory.
 *
 * <p>If a term finishes with no more docs than the buffer holds
 * ({@code maxPulsingDocFreq}, given to the constructor), its postings are
 * written inline into the terms dict output by {@link #finishTerm}.  As
 * soon as one more doc arrives than the buffer can hold, the buffered docs
 * are replayed into the wrapped {@link StandardPostingsWriter} and every
 * further call for that term is forwarded to it.
 *
 * @lucene.experimental
 */
public final class PulsingPostingsWriterImpl extends StandardPostingsWriter {

  final static String CODEC = "PulsedPostings";

  // To add a new version, increment from the last one, and
  // change VERSION_CURRENT to point to your new version:
  final static int VERSION_START = 0;

  final static int VERSION_CURRENT = VERSION_START;

  // Terms dict output; assigned in start() and written to by
  // finishTerm() for terms whose postings are inlined
  IndexOutput termsOut;

  // Settings of the current field; refreshed by setField()
  boolean omitTF;
  boolean storePayloads;
  FieldInfo fieldInfo;

  /**
   * One buffered doc of the current term, together with its buffered
   * positions.  Instances are allocated once and re-used across terms.
   *
   * @lucene.experimental
   */
  public static class Document {
    int docID;
    int termDocFreq;
    // Number of entries of positions[] filled in for this doc so far
    int numPositions;
    // Every slot is always non-null so that addPosition can re-use entries
    Position[] positions;

    Document() {
      positions = new Position[1];
      positions[0] = new Position();
    }

    /** Returns a deep copy: every {@link Position} is cloned as well. */
    @Override
    public Object clone() {
      Document doc = new Document();
      doc.docID = docID;
      doc.termDocFreq = termDocFreq;
      doc.numPositions = numPositions;
      doc.positions = new Position[positions.length];
      for (int i = 0; i < positions.length; i++) {
        doc.positions[i] = (Position) positions[i].clone();
      }
      return doc;
    }

    /**
     * Grows {@link #positions} to hold at least {@code minSize} entries,
     * keeping the existing entries and filling the new slots with fresh
     * {@link Position} instances.
     */
    void reallocPositions(int minSize) {
      final Position[] newArray = new Position[ArrayUtil.oversize(minSize, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
      System.arraycopy(positions, 0, newArray, 0, positions.length);
      for (int i = positions.length; i < newArray.length; i++) {
        newArray[i] = new Position();
      }
      positions = newArray;
    }
  }

  // Buffer for the docs of the current term; its length is the
  // maxPulsingDocFreq cutoff passed to the constructor
  final Document[] pendingDocs;

  // Number of entries of pendingDocs in use for the current term
  int pendingDocCount = 0;

  // Buffered doc currently receiving positions; only assigned while the
  // term is still being buffered (pulsed == false)
  Document currentDoc;

  // true once the current term has received more than pendingDocs.length
  // docs and is therefore being written through wrappedPostingsWriter;
  // false while its docs are still buffered for inlining
  boolean pulsed;

  /** One buffered position, plus its payload if any. */
  static class Position {
    // null until a non-empty payload is first buffered into this slot;
    // afterwards re-used, with length == 0 meaning "no payload"
    BytesRef payload;
    int pos;

    /** Returns a copy; a non-null payload is copied via {@code new BytesRef(payload)}. */
    @Override
    public Object clone() {
      Position position = new Position();
      position.pos = pos;
      if (payload != null) {
        position.payload = new BytesRef(payload);
      }
      return position;
    }
  }

  // TODO: -- lazy init this?  ie, if every single term
  // was pulsed then we never need to use this fallback?
  // Fallback writer for non-pulsed terms:
  final StandardPostingsWriter wrappedPostingsWriter;

  /**
   * If docFreq <= maxPulsingDocFreq, its postings are
   * inlined into terms dict.
   *
   * @param maxPulsingDocFreq number of docs buffered per term; must be >= 0
   * @param wrappedPostingsWriter writer that receives every term with more
   *        than {@code maxPulsingDocFreq} docs
   * @throws IllegalArgumentException if {@code maxPulsingDocFreq} is negative
   */
  public PulsingPostingsWriterImpl(int maxPulsingDocFreq, StandardPostingsWriter wrappedPostingsWriter) throws IOException {
    super();
    if (maxPulsingDocFreq < 0) {
      throw new IllegalArgumentException("maxPulsingDocFreq must be >= 0; got " + maxPulsingDocFreq);
    }
    pendingDocs = new Document[maxPulsingDocFreq];
    for (int i = 0; i < maxPulsingDocFreq; i++) {
      pendingDocs[i] = new Document();
    }

    // We simply wrap another postings writer, but only call
    // on it when doc freq is higher than our cutoff
    this.wrappedPostingsWriter = wrappedPostingsWriter;
  }

  /**
   * Remembers {@code termsOut}, writes the codec header followed by the
   * cutoff ({@code pendingDocs.length}) as a vInt, then starts the wrapped
   * writer on the same output.
   */
  @Override
  public void start(IndexOutput termsOut) throws IOException {
    this.termsOut = termsOut;
    CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
    termsOut.writeVInt(pendingDocs.length);
    wrappedPostingsWriter.start(termsOut);
  }

  /** Starts a new term; every term begins in buffering mode. */
  @Override
  public void startTerm() {
    assert pendingDocCount == 0;
    pulsed = false;
  }

  // TODO: -- should we NOT reuse across fields?  would
  // be cleaner

  // Currently, this instance is re-used across fields, so
  // our parent calls setField whenever the field changes
  @Override
  public void setField(FieldInfo fieldInfo) {
    this.fieldInfo = fieldInfo;
    omitTF = fieldInfo.omitTermFreqAndPositions;
    storePayloads = fieldInfo.storePayloads;
    wrappedPostingsWriter.setField(fieldInfo);
  }

  /**
   * Starts a new doc for the current term.  The doc is buffered while
   * there is room in {@link #pendingDocs}; when the buffer is already full
   * the buffered docs are first replayed into the wrapped writer, and this
   * doc (like all later ones of the term) is forwarded to it.
   */
  @Override
  public void startDoc(int docID, int termDocFreq) throws IOException {
    assert docID >= 0: "got docID=" + docID;

    if (!pulsed && pendingDocCount == pendingDocs.length) {
      // OK we just crossed the threshold, this term should
      // now be written with our wrapped codec:
      flushPendingDocs();
    }

    if (pulsed) {
      // We've already seen too many docs for this term --
      // just forward to our fallback writer
      wrappedPostingsWriter.startDoc(docID, termDocFreq);
    } else {
      currentDoc = pendingDocs[pendingDocCount++];
      currentDoc.docID = docID;
      // TODO: -- need not store in doc?  only used for alloc & assert
      currentDoc.termDocFreq = termDocFreq;
      if (termDocFreq > currentDoc.positions.length) {
        currentDoc.reallocPositions(termDocFreq);
      }
      currentDoc.numPositions = 0;
    }
  }

  /**
   * Starts the term on the wrapped writer, replays every buffered doc
   * (and, unless omitTF, its positions) into it, then empties the buffer
   * and switches this term to forwarding mode.
   */
  private void flushPendingDocs() throws IOException {
    wrappedPostingsWriter.startTerm();

    // Flush all buffered docs
    for (int i = 0; i < pendingDocCount; i++) {
      final Document doc = pendingDocs[i];
      wrappedPostingsWriter.startDoc(doc.docID, doc.termDocFreq);

      if (!omitTF) {
        assert doc.termDocFreq == doc.numPositions;
        for (int j = 0; j < doc.termDocFreq; j++) {
          final Position pos = doc.positions[j];
          if (pos.payload != null && pos.payload.length > 0) {
            assert storePayloads;
            wrappedPostingsWriter.addPosition(pos.pos, pos.payload);
          } else {
            // A re-used slot may carry an empty payload; pass it on as "none"
            wrappedPostingsWriter.addPosition(pos.pos, null);
          }
        }
        wrappedPostingsWriter.finishDoc();
      }
    }

    pendingDocCount = 0;
    pulsed = true;
  }

  /**
   * Adds one position to the current doc: forwarded to the wrapped writer
   * when the term is already being forwarded, otherwise copied into the
   * next free slot of the buffered doc.
   */
  @Override
  public void addPosition(int position, BytesRef payload) throws IOException {
    if (pulsed) {
      wrappedPostingsWriter.addPosition(position, payload);
    } else {
      // just buffer up
      Position pos = currentDoc.positions[currentDoc.numPositions++];
      pos.pos = position;
      if (payload != null && payload.length > 0) {
        if (pos.payload == null) {
          pos.payload = new BytesRef(payload);
        } else {
          pos.payload.copy(payload);
        }
      } else if (pos.payload != null) {
        // Slot is re-used: clear whatever payload an earlier position left
        pos.payload.length = 0;
      }
    }
  }

  /**
   * Called when the current doc is complete.  While buffering, checks that
   * all announced positions were received.
   */
  @Override
  public void finishDoc() {
    // currentDoc is only maintained while buffering; once the term is
    // forwarded it is stale, or null when the buffer has zero capacity,
    // so it must not be inspected in that case.
    // NOTE(review): this call is not forwarded to wrappedPostingsWriter
    // for docs written after the cutoff, although flushPendingDocs() does
    // call finishDoc() on it -- confirm the wrapped writer does not rely
    // on it.
    assert pulsed || omitTF || currentDoc.numPositions == currentDoc.termDocFreq;
  }

  // Accumulates isIndexTerm over finished terms until a term is passed to
  // the wrapped writer, at which point it is handed over and reset
  boolean pendingIsIndexTerm;

  // Statistics: terms forwarded to the wrapped writer (pulsedCount) vs.
  // terms written inline into the terms dict (nonPulsedCount)
  int pulsedCount;
  int nonPulsedCount;

  /** Called when we are done adding docs to this term */
  @Override
  public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {
    assert docCount > 0;

    pendingIsIndexTerm |= isIndexTerm;

    if (pulsed) {
      wrappedPostingsWriter.finishTerm(docCount, pendingIsIndexTerm);
      pendingIsIndexTerm = false;
      pulsedCount++;
    } else {
      nonPulsedCount++;
      // OK, there were few enough occurrences for this
      // term, so we fully inline our postings data into
      // terms dict, now:
      writePendingDocsInline();
    }

    pendingDocCount = 0;
  }

  /**
   * Writes all buffered docs of the current term to {@link #termsOut}.
   *
   * <p>With omitTF each doc is a single vInt docID delta.  Otherwise the
   * delta is shifted left by one with the low bit set when the doc has
   * exactly one position; when the bit is clear the position count follows
   * as its own vInt.  Positions are delta-coded; when payloads are stored
   * the position delta is shifted as well, its low bit signalling that a
   * new payload length follows.
   */
  private void writePendingDocsInline() throws IOException {
    int lastDocID = 0;
    for (int i = 0; i < pendingDocCount; i++) {
      final Document doc = pendingDocs[i];
      final int delta = doc.docID - lastDocID;
      lastDocID = doc.docID;

      if (omitTF) {
        termsOut.writeVInt(delta);
        continue;
      }

      assert doc.numPositions == doc.termDocFreq;
      if (doc.numPositions == 1) {
        termsOut.writeVInt((delta<<1)|1);
      } else {
        termsOut.writeVInt(delta<<1);
        termsOut.writeVInt(doc.numPositions);
      }

      // TODO: we could do better in encoding
      // payloadLength, eg, if it's always the same
      // across all terms
      int lastPosition = 0;
      // -1 forces the first position of each doc to write its payload length
      int lastPayloadLength = -1;
      for (int j = 0; j < doc.numPositions; j++) {
        final Position pos = doc.positions[j];
        final int delta2 = pos.pos - lastPosition;
        lastPosition = pos.pos;

        if (!storePayloads) {
          termsOut.writeVInt(delta2);
          continue;
        }

        final int payloadLength = pos.payload == null ? 0 : pos.payload.length;
        if (payloadLength != lastPayloadLength) {
          termsOut.writeVInt((delta2 << 1)|1);
          termsOut.writeVInt(payloadLength);
          lastPayloadLength = payloadLength;
        } else {
          termsOut.writeVInt(delta2 << 1);
        }

        if (payloadLength > 0) {
          termsOut.writeBytes(pos.payload.bytes, 0, pos.payload.length);
        }
      }
    }
  }

  /** Closes the wrapped writer. */
  @Override
  public void close() throws IOException {
    wrappedPostingsWriter.close();
  }
}