package org.apache.lucene.index.codecs.pulsing;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.codecs.PostingsReaderBase;
import org.apache.lucene.index.codecs.BlockTermState;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
/** Concrete class that reads the current doc/freq/skip
* postings format
* @lucene.experimental */
// TODO: -- should we switch "hasProx" higher up? and
// create two separate docs readers, one that also reads
// prox and one that doesn't?
public class PulsingPostingsReaderImpl extends PostingsReaderBase {
// Fallback reader for non-pulsed terms:
final PostingsReaderBase wrappedPostingsReader;
int maxPositions;
/** Sole constructor.
 *  @param wrappedPostingsReader fallback reader used for terms whose
 *         postings were not inlined into the terms dict */
public PulsingPostingsReaderImpl(PostingsReaderBase wrappedPostingsReader) throws IOException {
this.wrappedPostingsReader = wrappedPostingsReader;
}
@Override
public void init(IndexInput termsIn) throws IOException {
// Verify the header written by PulsingPostingsWriterImpl, then read the
// pulsing cutoff (terms at or below this count are inlined; see nextTerm),
// and finally let the wrapped reader consume its own header.
CodecUtil.checkHeader(termsIn, PulsingPostingsWriterImpl.CODEC,
PulsingPostingsWriterImpl.VERSION_START, PulsingPostingsWriterImpl.VERSION_START);
maxPositions = termsIn.readVInt();
wrappedPostingsReader.init(termsIn);
}
/** Term state for pulsing: either holds the term's postings inlined as a
 *  byte[] blob (postingsSize != -1), or defers to the wrapped reader's
 *  term state (postingsSize == -1). */
private static class PulsingTermState extends BlockTermState {
// Inlined postings blob for this term; only valid when postingsSize != -1
private byte[] postings;
private int postingsSize; // -1 if this term was not inlined
// State for the wrapped reader, used when the term was not inlined
private BlockTermState wrappedTermState;
// Reader over inlinedBytes; lives only on the "primary" term state
// (not copied by clone/copyFrom -- see note in copyFrom below)
ByteArrayDataInput inlinedBytesReader;
// Concatenated inlined blobs for all terms in the current terms block
private byte[] inlinedBytes;
@Override
public Object clone() {
PulsingTermState clone;
clone = (PulsingTermState) super.clone();
if (postingsSize != -1) {
// Inlined: deep-copy the blob so the clone is independent
clone.postings = new byte[postingsSize];
System.arraycopy(postings, 0, clone.postings, 0, postingsSize);
} else {
assert wrappedTermState != null;
clone.wrappedTermState = (BlockTermState) wrappedTermState.clone();
}
// NOTE(review): when inlined, the clone shares this.wrappedTermState by
// reference (super.clone() shallow copy) -- appears intentional since the
// wrapped state is unused for inlined terms, but verify against callers.
return clone;
}
@Override
public void copyFrom(TermState _other) {
super.copyFrom(_other);
PulsingTermState other = (PulsingTermState) _other;
postingsSize = other.postingsSize;
if (other.postingsSize != -1) {
// Inlined: copy the blob, growing our buffer only if needed
if (postings == null || postings.length < other.postingsSize) {
postings = new byte[ArrayUtil.oversize(other.postingsSize, 1)];
}
System.arraycopy(other.postings, 0, postings, 0, other.postingsSize);
} else {
wrappedTermState.copyFrom(other.wrappedTermState);
}
// NOTE: we do not copy the
// inlinedBytes/inlinedBytesReader; these are only
// stored on the "primary" TermState. They are
// "transient" to cloned term states.
}
@Override
public String toString() {
if (postingsSize == -1) {
return "PulsingTermState: not inlined: wrapped=" + wrappedTermState;
} else {
return "PulsingTermState: inlined size=" + postingsSize + " " + super.toString();
}
}
}
@Override
public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState _termState) throws IOException {
  final PulsingTermState termState = (PulsingTermState) _termState;

  // Lazily allocate the inlined-bytes buffer and its reader; these live only
  // on the "primary" term state (clones do not carry them).
  if (termState.inlinedBytes == null) {
    termState.inlinedBytes = new byte[128];
    termState.inlinedBytesReader = new ByteArrayDataInput(null);
  }

  // One blob holds the concatenated inlined postings of every term in this
  // block; nextTerm() later slices out each term's portion sequentially.
  int len = termsIn.readVInt();
  if (termState.inlinedBytes.length < len) {
    termState.inlinedBytes = new byte[ArrayUtil.oversize(len, 1)];
  }
  termsIn.readBytes(termState.inlinedBytes, 0, len);

  // Bound the reader to the bytes actually read: the buffer may be oversized
  // from a previous block, and resetting over the whole array would silently
  // expose stale garbage past 'len' instead of hitting EOF.
  termState.inlinedBytesReader.reset(termState.inlinedBytes, 0, len);

  // Non-inlined terms in this block are decoded by the wrapped reader;
  // restart its per-block term counter.
  termState.wrappedTermState.termCount = 0;
  wrappedPostingsReader.readTermsBlock(termsIn, fieldInfo, termState.wrappedTermState);
}
@Override
public BlockTermState newTermState() throws IOException {
  // Every pulsing term state carries a nested state for the wrapped reader,
  // used whenever a term is too frequent to be inlined.
  final PulsingTermState state = new PulsingTermState();
  state.wrappedTermState = wrappedPostingsReader.newTermState();
  return state;
}
@Override
public void nextTerm(FieldInfo fieldInfo, BlockTermState _termState) throws IOException {
//System.out.println("PR nextTerm");
PulsingTermState termState = (PulsingTermState) _termState;
// total TF, but in the omitTFAP case its computed based on docFreq.
long count = fieldInfo.omitTermFreqAndPositions ? termState.docFreq : termState.totalTermFreq;
//System.out.println(" count=" + count + " threshold=" + maxPositions);
if (count <= maxPositions) {
//System.out.println(" inlined pos=" + termState.inlinedBytesReader.getPosition());
// Inlined into terms dict -- just read the byte[] blob in,
// but don't decode it now (we only decode when a DocsEnum
// or D&PEnum is pulled):
termState.postingsSize = termState.inlinedBytesReader.readVInt();
if (termState.postings == null || termState.postings.length < termState.postingsSize) {
termState.postings = new byte[ArrayUtil.oversize(termState.postingsSize, 1)];
}
// TODO: sort of silly to copy from one big byte[]
// (the blob holding all inlined terms' blobs for
// current term block) into another byte[] (just the
// blob for this term)...
termState.inlinedBytesReader.readBytes(termState.postings, 0, termState.postingsSize);
} else {
//System.out.println(" not inlined");
// Term was too frequent to be inlined; delegate decode to the wrapped
// reader, seeding only the stats it needs.
termState.postingsSize = -1;
// TODO: should we do full copyFrom? much heavier...?
termState.wrappedTermState.docFreq = termState.docFreq;
termState.wrappedTermState.totalTermFreq = termState.totalTermFreq;
wrappedPostingsReader.nextTerm(fieldInfo, termState.wrappedTermState);
// Track how many non-inlined terms the wrapped reader has consumed in
// this block (inlined terms do not advance the wrapped reader).
termState.wrappedTermState.termCount++;
}
}
// TODO: we could actually reuse, by having TL that
// holds the last wrapped reuse, and vice-versa
// TODO: we could actually reuse, by having TL that
// holds the last wrapped reuse, and vice-versa
@Override
public DocsEnum docs(FieldInfo field, BlockTermState _termState, Bits skipDocs, DocsEnum reuse) throws IOException {
  final PulsingTermState termState = (PulsingTermState) _termState;

  if (termState.postingsSize == -1) {
    // Term was not inlined: delegate to the wrapped reader.  A pulsing enum
    // cannot serve as the wrapped reader's reuse candidate, so drop it.
    // TODO: not great that we lose reuse of PulsingDocsEnum in this case:
    final DocsEnum wrappedReuse = (reuse instanceof PulsingDocsEnum) ? null : reuse;
    return wrappedPostingsReader.docs(field, termState.wrappedTermState, skipDocs, wrappedReuse);
  }

  // Inlined term: decode straight from the in-memory blob, reusing the
  // incoming enum when it matches this field's options.
  PulsingDocsEnum postings = null;
  if (reuse instanceof PulsingDocsEnum) {
    final PulsingDocsEnum candidate = (PulsingDocsEnum) reuse;
    if (candidate.canReuse(field)) {
      postings = candidate;
    }
  }
  if (postings == null) {
    postings = new PulsingDocsEnum(field);
  }
  return postings.reset(skipDocs, termState);
}
// TODO: -- not great that we can't always reuse
@Override
public DocsAndPositionsEnum docsAndPositions(FieldInfo field, BlockTermState _termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
  // No positions were indexed for this field:
  if (field.omitTermFreqAndPositions) {
    return null;
  }
  final PulsingTermState termState = (PulsingTermState) _termState;

  if (termState.postingsSize == -1) {
    // Not inlined: delegate to the wrapped reader; a pulsing enum cannot
    // serve as its reuse candidate, so drop it.
    final DocsAndPositionsEnum wrappedReuse =
      (reuse instanceof PulsingDocsAndPositionsEnum) ? null : reuse;
    return wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, skipDocs, wrappedReuse);
  }

  // Inlined term: decode from the in-memory blob, reusing the incoming enum
  // when it matches this field's options.
  PulsingDocsAndPositionsEnum postings = null;
  if (reuse instanceof PulsingDocsAndPositionsEnum) {
    final PulsingDocsAndPositionsEnum candidate = (PulsingDocsAndPositionsEnum) reuse;
    if (candidate.canReuse(field)) {
      postings = candidate;
    }
  }
  if (postings == null) {
    postings = new PulsingDocsAndPositionsEnum(field);
  }
  return postings.reset(skipDocs, termState);
}
/** DocsEnum over a term whose postings were inlined into the terms dict.
 *  Decodes doc deltas (and, unless omitTF, freqs plus skipped-over
 *  position/payload data) from an in-memory byte[] copy of the blob. */
private static class PulsingDocsEnum extends DocsEnum {
private final ByteArrayDataInput postings = new ByteArrayDataInput(null);
private final boolean omitTF;
private final boolean storePayloads;
private Bits skipDocs;
private int docID;
private int freq;
public PulsingDocsEnum(FieldInfo fieldInfo) {
omitTF = fieldInfo.omitTermFreqAndPositions;
storePayloads = fieldInfo.storePayloads;
}
public PulsingDocsEnum reset(Bits skipDocs, PulsingTermState termState) {
//System.out.println("PR docsEnum termState=" + termState + " docFreq=" + termState.docFreq);
assert termState.postingsSize != -1;
// Copy the term's blob so this enum is independent of the mutable term state:
final byte[] bytes = new byte[termState.postingsSize];
System.arraycopy(termState.postings, 0, bytes, 0, termState.postingsSize);
postings.reset(bytes);
docID = 0;
freq = 1;
this.skipDocs = skipDocs;
return this;
}
// Reuse is only safe when the field's indexing options match how this
// enum was configured, since they change the on-disk encoding.
boolean canReuse(FieldInfo fieldInfo) {
return omitTF == fieldInfo.omitTermFreqAndPositions && storePayloads == fieldInfo.storePayloads;
}
@Override
public int nextDoc() throws IOException {
//System.out.println("PR nextDoc this= "+ this);
while(true) {
if (postings.eof()) {
//System.out.println("PR END");
return docID = NO_MORE_DOCS;
}
final int code = postings.readVInt();
if (omitTF) {
// Code is just the doc delta
docID += code;
} else {
docID += code >>> 1; // shift off low bit
if ((code & 1) != 0) { // if low bit is set
freq = 1; // freq is one
} else {
freq = postings.readVInt(); // else read freq
}
// Skip positions
if (storePayloads) {
// NOTE(review): payloadLength starts at -1 per doc; this assumes the
// writer always sets the low bit (new payload length) on the first
// payloaded position of each doc -- verify against the writer.
int payloadLength = -1;
for(int pos=0;pos<freq;pos++) {
final int posCode = postings.readVInt();
if ((posCode & 1) != 0) {
payloadLength = postings.readVInt();
}
if (payloadLength != 0) {
postings.skipBytes(payloadLength);
}
}
} else {
for(int pos=0;pos<freq;pos++) {
// TODO: skipVInt
postings.readVInt();
}
}
}
// Return the doc unless it is deleted/filtered out:
if (skipDocs == null || !skipDocs.get(docID)) {
//System.out.println(" return docID=" + docID + " freq=" + freq);
return docID;
}
}
}
@Override
public int freq() {
return freq;
}
@Override
public int docID() {
return docID;
}
@Override
public int advance(int target) throws IOException {
// Postings are tiny (pulsed), so a linear scan is fine here.
int doc;
while((doc=nextDoc()) != NO_MORE_DOCS) {
if (doc >= target)
return doc;
}
return docID = NO_MORE_DOCS;
}
}
/** DocsAndPositionsEnum over a term whose postings were inlined into the
 *  terms dict.  Decodes doc deltas, freqs, position deltas and (optionally)
 *  payloads from an in-memory byte[] copy of the blob. */
private static class PulsingDocsAndPositionsEnum extends DocsAndPositionsEnum {
  private final ByteArrayDataInput postings = new ByteArrayDataInput(null);
  private final boolean storePayloads;

  private Bits skipDocs;
  private int docID;
  private int freq;
  private int posPending;           // positions of the current doc not yet consumed
  private int position;
  private int payloadLength;        // length of the current (pending) payload
  private BytesRef payload;
  private boolean payloadRetrieved; // true once current position's payload was read/skipped

  public PulsingDocsAndPositionsEnum(FieldInfo fieldInfo) {
    storePayloads = fieldInfo.storePayloads;
  }

  // Reuse is only safe when the payload option matches; it changes the encoding.
  boolean canReuse(FieldInfo fieldInfo) {
    return storePayloads == fieldInfo.storePayloads;
  }

  public PulsingDocsAndPositionsEnum reset(Bits skipDocs, PulsingTermState termState) {
    assert termState.postingsSize != -1;
    // Copy the term's blob so this enum is independent of the mutable term state:
    final byte[] bytes = new byte[termState.postingsSize];
    System.arraycopy(termState.postings, 0, bytes, 0, termState.postingsSize);
    postings.reset(bytes);
    this.skipDocs = skipDocs;
    payloadLength = 0;
    // Clear leftover per-doc state from a previous term: a reused enum that
    // was not fully consumed would otherwise "skip" its stale pending
    // positions by reading vInts from the NEW term's stream in skipPositions().
    posPending = 0;
    payloadRetrieved = false;
    docID = 0;
    return this;
  }

  @Override
  public int nextDoc() throws IOException {
    while(true) {
      // Consume any positions (and payload bytes) left over from the
      // current doc before reading the next doc's entry:
      skipPositions();
      if (postings.eof()) {
        return docID = NO_MORE_DOCS;
      }
      final int code = postings.readVInt();
      docID += code >>> 1;          // shift off low bit
      if ((code & 1) != 0) {        // if low bit is set
        freq = 1;                   // freq is one
      } else {
        freq = postings.readVInt(); // else read freq
      }
      posPending = freq;
      // Return the doc unless it is deleted/filtered out:
      if (skipDocs == null || !skipDocs.get(docID)) {
        position = 0;
        return docID;
      }
    }
  }

  @Override
  public int freq() {
    return freq;
  }

  @Override
  public int docID() {
    return docID;
  }

  @Override
  public int advance(int target) throws IOException {
    // Postings are tiny (pulsed), so a linear scan is fine here.
    int doc;
    while((doc=nextDoc()) != NO_MORE_DOCS) {
      if (doc >= target) {
        return doc;
      }
    }
    return docID = NO_MORE_DOCS;
  }

  @Override
  public int nextPosition() throws IOException {
    assert posPending > 0;
    posPending--;
    if (storePayloads) {
      // If the previous position's payload was never fetched, step over it:
      if (!payloadRetrieved) {
        postings.skipBytes(payloadLength);
      }
      final int code = postings.readVInt();
      if ((code & 1) != 0) {
        // Low bit set means a new payload length follows
        payloadLength = postings.readVInt();
      }
      position += code >> 1;
      payloadRetrieved = false;
    } else {
      position += postings.readVInt();
    }
    return position;
  }

  /** Advances past all unconsumed positions of the current doc, including
   *  the trailing payload of the last position if it was never fetched. */
  private void skipPositions() throws IOException {
    while(posPending != 0) {
      nextPosition();
    }
    if (storePayloads && !payloadRetrieved) {
      postings.skipBytes(payloadLength);
      payloadRetrieved = true;
    }
  }

  @Override
  public boolean hasPayload() {
    return storePayloads && !payloadRetrieved && payloadLength > 0;
  }

  @Override
  public BytesRef getPayload() throws IOException {
    if (payloadRetrieved) {
      throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once.");
    }
    payloadRetrieved = true;
    if (payloadLength > 0) {
      if (payload == null) {
        payload = new BytesRef(payloadLength);
      } else {
        payload.grow(payloadLength);
      }
      postings.readBytes(payload.bytes, 0, payloadLength);
      payload.length = payloadLength;
      return payload;
    } else {
      return null;
    }
  }
}
@Override
public void close() throws IOException {
// Delegate: this class itself holds no closeable resources (inlined
// postings live entirely in byte[] buffers in memory).
wrappedPostingsReader.close();
}
}