package org.apache.lucene.codecs.pulsing;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
// TODO: we now inline based on total TF of the term,
// but it might be better to inline by "net bytes used"
// so that a term that has only 1 posting but a huge
// payload would not be inlined. Though this is
// presumably rare in practice...
/**
* Writer for the pulsing format.
* <p>
 * Wraps another postings implementation and decides,
 * based on the total number of occurrences, whether a term's
 * postings should be inlined into the term dictionary
 * or passed through to the wrapped writer.
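 * <p>
 * A minimal usage sketch, assuming the Lucene41 postings writer as the
 * wrapped implementation and a {@code SegmentWriteState writeState} in
 * scope (any other {@link PostingsWriterBase} can be wrapped the same way):
 * <pre>
 * PostingsWriterBase wrapped = new Lucene41PostingsWriter(writeState);
 * // Inline terms whose total position count is at most 1:
 * PostingsWriterBase pulsing = new PulsingPostingsWriter(writeState, 1, wrapped);
 * </pre>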
*
* @lucene.experimental */
public final class PulsingPostingsWriter extends PostingsWriterBase {
final static String CODEC = "PulsedPostingsWriter";
  // Extension of the file that records the per-field summary:
final static String SUMMARY_EXTENSION = "smy";
// To add a new version, increment from the last one, and
// change VERSION_CURRENT to point to your new version:
final static int VERSION_START = 0;
final static int VERSION_META_ARRAY = 1;
final static int VERSION_CURRENT = VERSION_META_ARRAY;
private SegmentWriteState segmentState;
private List<FieldMetaData> fields;
// Reused by writeTerm:
private DocsEnum docsEnum;
private DocsAndPositionsEnum posEnum;
private int enumFlags;
private final RAMOutputStream buffer = new RAMOutputStream();
private IndexOptions indexOptions;
  // Per-term metadata layout of the wrapped postings format, for the
  // current field:
private int longsSize;
private long[] longs;
private boolean fieldHasFreqs;
private boolean fieldHasPositions;
private boolean fieldHasOffsets;
private boolean fieldHasPayloads;
  // True when the next term written through to the wrapped writer must use
  // absolute (rather than delta) metadata encoding:
  boolean absolute;
private static class PulsingTermState extends BlockTermState {
    private byte[] bytes;                 // inlined postings, or null if this term was not pulsed
    private BlockTermState wrappedState;  // the wrapped writer's state, used when not pulsed
@Override
public String toString() {
if (bytes != null) {
return "inlined";
} else {
return "not inlined wrapped=" + wrappedState;
}
}
}
private static final class FieldMetaData {
int fieldNumber;
int longsSize;
FieldMetaData(int number, int size) {
fieldNumber = number;
longsSize = size;
}
}
  // TODO: -- lazy init this?  ie, if every single term
  // was inlined (eg for a "primary key" field) then we
  // never need to use this fallback?
  //
  // Fallback writer for non-inlined terms:
final PostingsWriterBase wrappedPostingsWriter;
final int maxPositions;
  /** If the total number of positions (summed across all docs
   * for this term) is &lt;= maxPositions, then the postings are
   * inlined into the terms dict. */
public PulsingPostingsWriter(SegmentWriteState state, int maxPositions, PostingsWriterBase wrappedPostingsWriter) {
fields = new ArrayList<>();
this.maxPositions = maxPositions;
    // We simply wrap another postings writer; it is only invoked
    // when a term's total position count exceeds the cutoff:
this.wrappedPostingsWriter = wrappedPostingsWriter;
this.segmentState = state;
}
@Override
public void init(IndexOutput termsOut) throws IOException {
CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
termsOut.writeVInt(maxPositions); // encode maxPositions in header
wrappedPostingsWriter.init(termsOut);
}
@Override
public BlockTermState writeTerm(BytesRef term, TermsEnum termsEnum, FixedBitSet docsSeen) throws IOException {
// First pass: figure out whether we should pulse this term
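    // Count matching docs (or total positions, when positions are indexed)
    // and stop as soon as the count exceeds maxPositions: at that point we
    // already know the term will not be pulsed.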
long posCount = 0;
if (fieldHasPositions == false) {
// No positions:
docsEnum = termsEnum.docs(null, docsEnum, enumFlags);
assert docsEnum != null;
while (posCount <= maxPositions) {
if (docsEnum.nextDoc() == DocsEnum.NO_MORE_DOCS) {
break;
}
posCount++;
}
} else {
posEnum = termsEnum.docsAndPositions(null, posEnum, enumFlags);
assert posEnum != null;
while (posCount <= maxPositions) {
if (posEnum.nextDoc() == DocsEnum.NO_MORE_DOCS) {
break;
}
posCount += posEnum.freq();
}
}
if (posCount == 0) {
// All docs were deleted
return null;
}
// Second pass: write postings
if (posCount > maxPositions) {
      // Too many positions; do not pulse.  Just let the
      // wrapped postings writer encode the postings:
PulsingTermState state = new PulsingTermState();
state.wrappedState = wrappedPostingsWriter.writeTerm(term, termsEnum, docsSeen);
state.docFreq = state.wrappedState.docFreq;
state.totalTermFreq = state.wrappedState.totalTermFreq;
return state;
} else {
// Pulsed:
if (fieldHasPositions == false) {
docsEnum = termsEnum.docs(null, docsEnum, enumFlags);
} else {
posEnum = termsEnum.docsAndPositions(null, posEnum, enumFlags);
docsEnum = posEnum;
}
assert docsEnum != null;
      // There were few enough total occurrences for this
      // term, so we fully inline our postings data into
      // the terms dict now:
// TODO: it'd be better to share this encoding logic
// in some inner codec that knows how to write a
// single doc / single position, etc. This way if a
// given codec wants to store other interesting
// stuff, it could use this pulsing codec to do so
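      // Inlined encoding (all ints are vInts), per document:
      //   docDelta             if freqs are not indexed
      //   docDelta<<1 | 1      if freq == 1
      //   docDelta<<1, freq    otherwise
      // Then, per position (when positions are indexed): posDelta if
      // payloads are not indexed; otherwise posDelta<<1, with the low bit
      // set (followed by the new payload length) whenever the payload
      // length changes.  Offsets use the same low-bit-on-change scheme on
      // the startOffset delta, and any payload bytes are written last.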
int lastDocID = 0;
int lastPayloadLength = -1;
int lastOffsetLength = -1;
int docFreq = 0;
long totalTermFreq = 0;
while (true) {
int docID = docsEnum.nextDoc();
if (docID == DocsEnum.NO_MORE_DOCS) {
break;
}
docsSeen.set(docID);
int delta = docID - lastDocID;
lastDocID = docID;
docFreq++;
if (fieldHasFreqs) {
int freq = docsEnum.freq();
totalTermFreq += freq;
if (freq == 1) {
buffer.writeVInt((delta << 1) | 1);
} else {
buffer.writeVInt(delta << 1);
buffer.writeVInt(freq);
}
if (fieldHasPositions) {
int lastPos = 0;
int lastOffset = 0;
for(int posIDX=0;posIDX<freq;posIDX++) {
int pos = posEnum.nextPosition();
int posDelta = pos - lastPos;
lastPos = pos;
int payloadLength;
BytesRef payload;
if (fieldHasPayloads) {
payload = posEnum.getPayload();
payloadLength = payload == null ? 0 : payload.length;
if (payloadLength != lastPayloadLength) {
buffer.writeVInt((posDelta << 1)|1);
buffer.writeVInt(payloadLength);
lastPayloadLength = payloadLength;
} else {
buffer.writeVInt(posDelta << 1);
}
} else {
payloadLength = 0;
payload = null;
buffer.writeVInt(posDelta);
}
if (fieldHasOffsets) {
int startOffset = posEnum.startOffset();
int endOffset = posEnum.endOffset();
int offsetDelta = startOffset - lastOffset;
int offsetLength = endOffset - startOffset;
if (offsetLength != lastOffsetLength) {
buffer.writeVInt(offsetDelta << 1 | 1);
buffer.writeVInt(offsetLength);
} else {
buffer.writeVInt(offsetDelta << 1);
}
lastOffset = startOffset;
lastOffsetLength = offsetLength;
}
if (payloadLength > 0) {
assert fieldHasPayloads;
assert payload != null;
buffer.writeBytes(payload.bytes, payload.offset, payload.length);
}
}
}
} else {
buffer.writeVInt(delta);
}
}
PulsingTermState state = new PulsingTermState();
state.bytes = new byte[(int) buffer.getFilePointer()];
state.docFreq = docFreq;
state.totalTermFreq = fieldHasFreqs ? totalTermFreq : -1;
buffer.writeTo(state.bytes, 0);
buffer.reset();
return state;
}
}
// TODO: -- should we NOT reuse across fields? would
// be cleaner
// Currently, this instance is re-used across fields, so
// our parent calls setField whenever the field changes
@Override
public int setField(FieldInfo fieldInfo) {
this.indexOptions = fieldInfo.getIndexOptions();
fieldHasPayloads = fieldInfo.hasPayloads();
absolute = false;
longsSize = wrappedPostingsWriter.setField(fieldInfo);
longs = new long[longsSize];
fields.add(new FieldMetaData(fieldInfo.number, longsSize));
fieldHasFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
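    // Request from the enums only what writeTerm will actually consume for
    // this field (freqs, plus payloads/offsets when positions are indexed):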
if (fieldHasFreqs == false) {
enumFlags = 0;
} else if (fieldHasPositions == false) {
enumFlags = DocsEnum.FLAG_FREQS;
} else if (fieldHasOffsets == false) {
if (fieldHasPayloads) {
enumFlags = DocsAndPositionsEnum.FLAG_PAYLOADS;
} else {
enumFlags = 0;
}
} else {
if (fieldHasPayloads) {
enumFlags = DocsAndPositionsEnum.FLAG_PAYLOADS | DocsAndPositionsEnum.FLAG_OFFSETS;
} else {
enumFlags = DocsAndPositionsEnum.FLAG_OFFSETS;
}
}
    // We expose no metadata longs of our own; the wrapped writer's longs
    // are folded into our opaque byte blob by encodeTerm:
    return 0;
}
@Override
public void encodeTerm(long[] empty, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException {
PulsingTermState state = (PulsingTermState)_state;
assert empty.length == 0;
this.absolute = this.absolute || absolute;
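    // If this term was pulsed, the wrapped writer emits no metadata for it,
    // so a pending request for absolute encoding (now remembered in
    // this.absolute) has to carry over to the next non-pulsed term: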
if (state.bytes == null) {
wrappedPostingsWriter.encodeTerm(longs, buffer, fieldInfo, state.wrappedState, this.absolute);
for (int i = 0; i < longsSize; i++) {
out.writeVLong(longs[i]);
}
buffer.writeTo(out);
buffer.reset();
this.absolute = false;
} else {
out.writeVInt(state.bytes.length);
out.writeBytes(state.bytes, 0, state.bytes.length);
}
}
@Override
public void close() throws IOException {
wrappedPostingsWriter.close();
if (wrappedPostingsWriter instanceof PulsingPostingsWriter ||
VERSION_CURRENT < VERSION_META_ARRAY) {
return;
}
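    // Write the field summary (.smy) file: codec header, field count, then
    // a (fieldNumber, longsSize) pair per field.  The reader needs longsSize
    // to decode the wrapped writer's per-term metadata.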
String summaryFileName = IndexFileNames.segmentFileName(segmentState.segmentInfo.name, segmentState.segmentSuffix, SUMMARY_EXTENSION);
IndexOutput out = null;
try {
out = segmentState.directory.createOutput(summaryFileName, segmentState.context);
CodecUtil.writeHeader(out, CODEC, VERSION_CURRENT);
out.writeVInt(fields.size());
for (FieldMetaData field : fields) {
out.writeVInt(field.fieldNumber);
out.writeVInt(field.longsSize);
}
out.close();
} finally {
IOUtils.closeWhileHandlingException(out);
}
}
}