package org.apache.lucene.index.codecs.pulsing;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.codecs.PostingsWriterBase;
import org.apache.lucene.index.codecs.TermStats;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
// TODO: we now inline based on total TF of the term,
// but it might be better to inline by "net bytes used"
// so that a term that has only 1 posting but a huge
// payload would not be inlined. Though this is
// presumably rare in practice...
/**
 * Postings writer that buffers the first postings of each term in memory and,
 * if the term finishes before the buffer overflows, encodes them inline into
 * the terms output instead of handing them to the wrapped writer. Once the
 * buffer overflows, everything buffered so far is replayed into the wrapped
 * writer and all further calls for that term are forwarded to it.
 *
 * @lucene.experimental
 */
public final class PulsingPostingsWriterImpl extends PostingsWriterBase {

  static final String CODEC = "PulsedPostings";

  // To add a new version, increment from the last one, and
  // change VERSION_CURRENT to point to your new version:
  static final int VERSION_START = 0;
  static final int VERSION_CURRENT = VERSION_START;

  /** Value held by {@link #pendingCount} once the current term was pushed to the wrapped writer. */
  private static final int PUSHED = -1;

  /** One buffered entry: a position, or a whole doc when term freqs/positions are omitted. */
  private static final class Position {
    BytesRef payload;
    int termFreq;   // only meaningful on the first entry of a given doc
    int pos;
    int docID;
  }

  private IndexOutput termsOut;
  private boolean omitTF;
  private boolean storePayloads;

  // Pre-allocated, reused entries; its length is the inlining cutoff.
  private final Position[] pending;

  // Number of filled entries in pending, or PUSHED after the term overflowed.
  private int pendingCount = 0;

  // First entry of the doc currently being buffered.
  private Position currentDoc;

  // Holds the encoded bytes of the single term being finished.
  private final RAMOutputStream termBuffer = new RAMOutputStream();

  // Accumulates length-prefixed inlined terms until flushTermsBlock().
  private final RAMOutputStream blockBuffer = new RAMOutputStream();

  // TODO: -- lazy init this?  ie, if every single term
  // was inlined (eg for a "primary key" field) then we
  // never need to use this fallback?  Fallback writer for
  // non-inlined terms:
  final PostingsWriterBase wrappedPostingsWriter;

  /**
   * Creates a pulsing writer around another postings writer.
   *
   * @param maxPositions number of buffered entries (positions summed across all
   *        docs of a term, or docs when term freqs/positions are omitted) up to
   *        which a term is inlined; a term is inlined when its total is
   *        &lt;= maxPositions
   * @param wrappedPostingsWriter writer that receives every term exceeding the cutoff
   */
  public PulsingPostingsWriterImpl(int maxPositions, PostingsWriterBase wrappedPostingsWriter) throws IOException {
    // NOTE(review): with maxPositions == 0 the first startDoc() pushes an empty
    // buffer and calls finishDoc() on the wrapped writer before any doc was
    // started -- confirm wrapped writers tolerate that.
    pending = new Position[maxPositions];
    for (int slot = 0; slot < pending.length; slot++) {
      pending[slot] = new Position();
    }

    // The wrapped writer is only called once a term needs more entries than
    // the cutoff allows.
    this.wrappedPostingsWriter = wrappedPostingsWriter;
  }

  /**
   * Remembers the terms output, writes this codec's header followed by the
   * cutoff (so it is encoded in the header), then starts the wrapped writer on
   * the same output.
   */
  @Override
  public void start(IndexOutput termsOut) throws IOException {
    this.termsOut = termsOut;
    CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
    termsOut.writeVInt(pending.length);
    wrappedPostingsWriter.start(termsOut);
  }

  /** Begins a new term; the buffer must have been reset by the previous {@link #finishTerm}. */
  @Override
  public void startTerm() {
    assert pendingCount == 0;
  }

  // TODO: -- should we NOT reuse across fields?  would
  // be cleaner

  /**
   * Captures the per-field settings this writer needs and forwards the field
   * to the wrapped writer. This instance is re-used across fields, so the
   * parent calls this whenever the field changes.
   */
  @Override
  public void setField(FieldInfo fieldInfo) {
    storePayloads = fieldInfo.storePayloads;
    omitTF = fieldInfo.omitTermFreqAndPositions;
    wrappedPostingsWriter.setField(fieldInfo);
  }

  /**
   * Starts a doc for the current term. If the buffer is already full, the
   * buffered postings are first replayed into the wrapped writer and the last
   * replayed doc is finished there.
   */
  @Override
  public void startDoc(int docID, int termDocFreq) throws IOException {
    assert docID >= 0: "got docID=" + docID;

    if (pendingCount == pending.length) {
      push();
      wrappedPostingsWriter.finishDoc();
    }

    if (pendingCount == PUSHED) {
      // This term already overflowed the buffer -- the wrapped writer owns it now.
      wrappedPostingsWriter.startDoc(docID, termDocFreq);
      return;
    }

    assert pendingCount < pending.length;
    currentDoc = pending[pendingCount];
    currentDoc.docID = docID;
    if (omitTF) {
      // No positions will follow, so the doc itself consumes the entry.
      pendingCount++;
    } else {
      // The entry is consumed by the first addPosition() call for this doc.
      currentDoc.termFreq = termDocFreq;
    }
  }

  /**
   * Buffers one position (and a private copy of its payload) for the current
   * doc, or forwards it to the wrapped writer once the term has overflowed.
   */
  @Override
  public void addPosition(int position, BytesRef payload) throws IOException {
    if (pendingCount == pending.length) {
      push();
    }

    if (pendingCount == PUSHED) {
      // This term already overflowed the buffer -- the wrapped writer owns it now.
      wrappedPostingsWriter.addPosition(position, payload);
      return;
    }

    final Position slot = pending[pendingCount++];
    slot.pos = position;
    slot.docID = currentDoc.docID;

    final boolean hasPayload = payload != null && payload.length > 0;
    if (!hasPayload) {
      // Entries are reused across terms: blank out any payload left over from before.
      if (slot.payload != null) {
        slot.payload.length = 0;
      }
    } else if (slot.payload == null) {
      slot.payload = new BytesRef(payload);
    } else {
      slot.payload.copy(payload);
    }
  }

  /** Finishes the current doc; only the wrapped writer needs to hear about it. */
  @Override
  public void finishDoc() throws IOException {
    if (pendingCount == PUSHED) {
      wrappedPostingsWriter.finishDoc();
    }
  }

  /**
   * Called when we are done adding docs to this term. A term that overflowed is
   * finished on the wrapped writer; otherwise its buffered postings are encoded
   * and appended, prefixed by their byte length, to the pending terms block.
   */
  @Override
  public void finishTerm(TermStats stats) throws IOException {
    assert pendingCount > 0 || pendingCount == PUSHED;

    if (pendingCount == PUSHED) {
      wrappedPostingsWriter.finishTerm(stats);
    } else {
      // There were few enough total occurrences for this term, so its postings
      // are fully inlined into the terms dict.

      // TODO: it'd be better to share this encoding logic
      // in some inner codec that knows how to write a
      // single doc / single position, etc.  This way if a
      // given codec wants to store other interesting
      // stuff, it could use this pulsing codec to do so
      if (omitTF) {
        encodePendingDocsOnly();
      } else {
        encodePendingDocsAndPositions();
      }

      blockBuffer.writeVInt((int) termBuffer.getFilePointer());
      termBuffer.writeTo(blockBuffer);
      termBuffer.reset();
    }

    pendingCount = 0;
  }

  // Encodes buffered docs, freqs, positions and (optionally) payloads into termBuffer.
  private void encodePendingDocsAndPositions() throws IOException {
    int prevDocID = 0;
    int next = 0;
    while (next < pendingCount) {
      final Position doc = pending[next];
      final int docDelta = doc.docID - prevDocID;
      prevDocID = doc.docID;

      // Low bit of the doc delta set means termFreq == 1; otherwise the freq follows.
      if (doc.termFreq != 1) {
        termBuffer.writeVInt(docDelta << 1);
        termBuffer.writeVInt(doc.termFreq);
      } else {
        termBuffer.writeVInt((docDelta << 1) | 1);
      }

      int prevPos = 0;
      // -1 forces the payload length to be written for the first position of every doc.
      int prevPayloadLength = -1;
      for (int remaining = doc.termFreq; remaining > 0; remaining--) {
        final Position entry = pending[next++];
        assert entry.docID == doc.docID;
        final int posDelta = entry.pos - prevPos;
        prevPos = entry.pos;

        if (!storePayloads) {
          termBuffer.writeVInt(posDelta);
          continue;
        }

        // Low bit of the position delta set means a new payload length follows.
        final int payloadLength = entry.payload == null ? 0 : entry.payload.length;
        if (payloadLength == prevPayloadLength) {
          termBuffer.writeVInt(posDelta << 1);
        } else {
          termBuffer.writeVInt((posDelta << 1) | 1);
          termBuffer.writeVInt(payloadLength);
          prevPayloadLength = payloadLength;
        }
        if (payloadLength > 0) {
          // Reads from bytes[0]: assumes the copies made in addPosition() start
          // at offset 0 -- TODO confirm against BytesRef.
          termBuffer.writeBytes(entry.payload.bytes, 0, entry.payload.length);
        }
      }
    }
  }

  // Encodes buffered docs as plain docID deltas into termBuffer (freqs/positions omitted).
  private void encodePendingDocsOnly() throws IOException {
    int prevDocID = 0;
    for (int i = 0; i < pendingCount; i++) {
      final int docID = pending[i].docID;
      termBuffer.writeVInt(docID - prevDocID);
      prevDocID = docID;
    }
  }

  /** Closes the wrapped writer. */
  @Override
  public void close() throws IOException {
    wrappedPostingsWriter.close();
  }

  /**
   * Writes the byte length and contents of all inlined terms collected since the
   * previous flush to the terms output, then lets the wrapped writer flush its
   * own block.
   */
  @Override
  public void flushTermsBlock() throws IOException {
    termsOut.writeVInt((int) blockBuffer.getFilePointer());
    blockBuffer.writeTo(termsOut);
    blockBuffer.reset();

    // TODO: can we avoid calling this if all terms
    // were inlined...?  Eg for a "primary key" field, the
    // wrapped codec is never invoked...
    wrappedPostingsWriter.flushTermsBlock();
  }

  // Pushes pending positions to the wrapped codec and marks the term as pushed.
  private void push() throws IOException {
    assert pendingCount == pending.length;

    wrappedPostingsWriter.startTerm();

    if (omitTF) {
      // NOTE(review): docs are started back to back with no finishDoc() in
      // between; confirm the wrapped writers tolerate that.
      for (int i = 0; i < pending.length; i++) {
        wrappedPostingsWriter.startDoc(pending[i].docID, 0);
      }
    } else {
      Position open = null;   // first entry of the doc currently being replayed
      for (int i = 0; i < pending.length; i++) {
        final Position entry = pending[i];
        final boolean startsNewDoc = open == null || open.docID != entry.docID;
        if (startsNewDoc) {
          if (open != null) {
            assert entry.docID > open.docID;
            wrappedPostingsWriter.finishDoc();
          }
          open = entry;
          wrappedPostingsWriter.startDoc(open.docID, open.termFreq);
        }
        wrappedPostingsWriter.addPosition(entry.pos, entry.payload);
      }
      // The last replayed doc is deliberately left open: startDoc() finishes it
      // right after push(), while addPosition() keeps adding to it and leaves
      // the finish to finishDoc().
    }

    pendingCount = PUSHED;
  }
}