package org.apache.lucene.index.codecs.pulsing; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.codecs.standard.TermState; import org.apache.lucene.index.codecs.standard.StandardPostingsReader; import org.apache.lucene.index.codecs.pulsing.PulsingPostingsWriterImpl.Document; import org.apache.lucene.index.codecs.pulsing.PulsingPostingsWriterImpl.Position; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CodecUtil; /** Concrete class that reads the current doc/freq/skip * postings format * @lucene.experimental */ // TODO: -- should we switch "hasProx" higher up? and // create two separate docs readers, one that also reads // prox and one that doesn't? public class PulsingPostingsReaderImpl extends StandardPostingsReader { // Fallback reader for non-pulsed terms: final StandardPostingsReader wrappedPostingsReader; int maxPulsingDocFreq; public PulsingPostingsReaderImpl(StandardPostingsReader wrappedPostingsReader) throws IOException { this.wrappedPostingsReader = wrappedPostingsReader; } @Override public void init(IndexInput termsIn) throws IOException { CodecUtil.checkHeader(termsIn, PulsingPostingsWriterImpl.CODEC, PulsingPostingsWriterImpl.VERSION_START, PulsingPostingsWriterImpl.VERSION_START); maxPulsingDocFreq = termsIn.readVInt(); wrappedPostingsReader.init(termsIn); } private static class PulsingTermState extends TermState { private Document docs[]; private TermState wrappedTermState; private boolean pendingIndexTerm; public Object clone() { PulsingTermState clone; clone = (PulsingTermState) super.clone(); clone.docs = (Document[]) docs.clone(); for(int i=0;i<clone.docs.length;i++) { final Document doc = clone.docs[i]; if (doc != null) { clone.docs[i] = (Document) doc.clone(); } } clone.wrappedTermState = (TermState) wrappedTermState.clone(); return clone; } public void copy(TermState _other) { super.copy(_other); PulsingTermState other = (PulsingTermState) _other; pendingIndexTerm = other.pendingIndexTerm; wrappedTermState.copy(other.wrappedTermState); for(int i=0;i<docs.length;i++) { if (other.docs[i] != null) { docs[i] = (Document) other.docs[i].clone(); } } } } @Override public TermState newTermState() throws IOException { PulsingTermState state = new PulsingTermState(); state.wrappedTermState = wrappedPostingsReader.newTermState(); state.docs = new Document[maxPulsingDocFreq]; return state; } @Override public void readTerm(IndexInput termsIn, FieldInfo fieldInfo, TermState _termState, boolean isIndexTerm) throws IOException { PulsingTermState termState = (PulsingTermState) _termState; termState.pendingIndexTerm |= isIndexTerm; if (termState.docFreq <= maxPulsingDocFreq) { // Inlined into terms dict -- read everything in // TODO: maybe only read everything in lazily? But // then we'd need to store length so we could seek // over it when docs/pos enum was not requested // TODO: it'd be better to share this encoding logic // in some inner codec that knows how to write a // single doc / single position, etc. This way if a // given codec wants to store other interesting // stuff, it could use this pulsing codec to do so int docID = 0; for(int i=0;i<termState.docFreq;i++) { Document doc = termState.docs[i]; if (doc == null) { doc = termState.docs[i] = new Document(); } final int code = termsIn.readVInt(); if (fieldInfo.omitTermFreqAndPositions) { docID += code; doc.numPositions = 1; } else { docID += code>>>1; if ((code & 1) != 0) { doc.numPositions = 1; } else { doc.numPositions = termsIn.readVInt(); } if (doc.numPositions > doc.positions.length) { doc.reallocPositions(doc.numPositions); } int position = 0; int payloadLength = -1; for(int j=0;j<doc.numPositions;j++) { final Position pos = doc.positions[j]; final int code2 = termsIn.readVInt(); if (fieldInfo.storePayloads) { position += code2 >>> 1; if ((code2 & 1) != 0) { payloadLength = termsIn.readVInt(); } if (payloadLength > 0) { if (pos.payload == null) { pos.payload = new BytesRef(); pos.payload.bytes = new byte[payloadLength]; } else if (payloadLength > pos.payload.bytes.length) { pos.payload.grow(payloadLength); } pos.payload.length = payloadLength; termsIn.readBytes(pos.payload.bytes, 0, payloadLength); } else if (pos.payload != null) { pos.payload.length = 0; } } else { position += code2; } pos.pos = position; } } doc.docID = docID; } } else { termState.wrappedTermState.docFreq = termState.docFreq; wrappedPostingsReader.readTerm(termsIn, fieldInfo, termState.wrappedTermState, termState.pendingIndexTerm); termState.pendingIndexTerm = false; } } // TODO: we could actually reuse, by having TL that // holds the last wrapped reuse, and vice-versa @Override public DocsEnum docs(FieldInfo field, TermState _termState, Bits skipDocs, DocsEnum reuse) throws IOException { PulsingTermState termState = (PulsingTermState) _termState; if (termState.docFreq <= maxPulsingDocFreq) { if (reuse instanceof PulsingDocsEnum) { return ((PulsingDocsEnum) reuse).reset(skipDocs, termState); } else { PulsingDocsEnum docsEnum = new PulsingDocsEnum(); return docsEnum.reset(skipDocs, termState); } } else { if (reuse instanceof PulsingDocsEnum) { return wrappedPostingsReader.docs(field, termState.wrappedTermState, skipDocs, null); } else { return wrappedPostingsReader.docs(field, termState.wrappedTermState, skipDocs, reuse); } } } // TODO: -- not great that we can't always reuse @Override public DocsAndPositionsEnum docsAndPositions(FieldInfo field, TermState _termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException { PulsingTermState termState = (PulsingTermState) _termState; if (termState.docFreq <= maxPulsingDocFreq) { if (reuse instanceof PulsingDocsAndPositionsEnum) { return ((PulsingDocsAndPositionsEnum) reuse).reset(skipDocs, termState); } else { PulsingDocsAndPositionsEnum postingsEnum = new PulsingDocsAndPositionsEnum(); return postingsEnum.reset(skipDocs, termState); } } else { if (reuse instanceof PulsingDocsAndPositionsEnum) { return wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, skipDocs, null); } else { return wrappedPostingsReader.docsAndPositions(field, termState.wrappedTermState, skipDocs, reuse); } } } static class PulsingDocsEnum extends DocsEnum { private int nextRead; private Bits skipDocs; private Document doc; private PulsingTermState state; public void close() {} PulsingDocsEnum reset(Bits skipDocs, PulsingTermState termState) { // TODO: -- not great we have to clone here -- // merging is wasteful; TermRangeQuery too state = (PulsingTermState) termState.clone(); this.skipDocs = skipDocs; nextRead = 0; return this; } @Override public int nextDoc() { while(true) { if (nextRead >= state.docFreq) { return NO_MORE_DOCS; } else { doc = state.docs[nextRead++]; if (skipDocs == null || !skipDocs.get(doc.docID)) { return doc.docID; } } } } @Override public int read() { int i=0; // TODO: -- ob1? initBulkResult(); final int[] docs = bulkResult.docs.ints; final int[] freqs = bulkResult.freqs.ints; while(nextRead < state.docFreq) { doc = state.docs[nextRead++]; if (skipDocs == null || !skipDocs.get(doc.docID)) { docs[i] = doc.docID; freqs[i] = doc.numPositions; i++; } } return i; } @Override public int freq() { return doc.numPositions; } @Override public int docID() { return doc.docID; } @Override public int advance(int target) throws IOException { int doc; while((doc=nextDoc()) != NO_MORE_DOCS) { if (doc >= target) return doc; } return NO_MORE_DOCS; } } static class PulsingDocsAndPositionsEnum extends DocsAndPositionsEnum { private int nextRead; private int nextPosRead; private Bits skipDocs; private Document doc; private Position pos; private PulsingTermState state; // Only here to emulate limitation of standard codec, // which only allows retrieving payload more than once private boolean payloadRetrieved; public void close() {} PulsingDocsAndPositionsEnum reset(Bits skipDocs, PulsingTermState termState) { // TODO: -- not great we have to clone here -- // merging is wasteful; TermRangeQuery too state = (PulsingTermState) termState.clone(); this.skipDocs = skipDocs; nextRead = 0; nextPosRead = 0; return this; } @Override public int nextDoc() { while(true) { if (nextRead >= state.docFreq) { return NO_MORE_DOCS; } else { doc = state.docs[nextRead++]; if (skipDocs == null || !skipDocs.get(doc.docID)) { nextPosRead = 0; return doc.docID; } } } } @Override public int freq() { return doc.numPositions; } @Override public int docID() { return doc.docID; } @Override public int advance(int target) throws IOException { int doc; while((doc=nextDoc()) != NO_MORE_DOCS) { if (doc >= target) { return doc; } } return NO_MORE_DOCS; } @Override public int nextPosition() { assert nextPosRead < doc.numPositions; pos = doc.positions[nextPosRead++]; payloadRetrieved = false; return pos.pos; } @Override public boolean hasPayload() { return !payloadRetrieved && pos.payload != null && pos.payload.length > 0; } @Override public BytesRef getPayload() { payloadRetrieved = true; return pos.payload; } } @Override public void close() throws IOException { wrappedPostingsReader.close(); } }