package org.apache.lucene.index.codecs.sep;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Collection;

import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.codecs.BlockTermState;
import org.apache.lucene.index.codecs.PostingsReaderBase;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;

/** Concrete class that reads the current doc/freq/skip
 *  postings format.
 *
 * @lucene.experimental
 */

// TODO: -- should we switch "hasProx" higher up?  and
// create two separate docs readers, one that also reads
// prox and one that doesn't?
public class SepPostingsReaderImpl extends PostingsReaderBase { final IntIndexInput freqIn; final IntIndexInput docIn; final IntIndexInput posIn; final IndexInput payloadIn; final IndexInput skipIn; int skipInterval; int maxSkipLevels; int skipMinimum; public SepPostingsReaderImpl(Directory dir, SegmentInfo segmentInfo, int readBufferSize, IntStreamFactory intFactory, String codecId) throws IOException { boolean success = false; try { final String docFileName = IndexFileNames.segmentFileName(, codecId, SepPostingsWriterImpl.DOC_EXTENSION); docIn = intFactory.openInput(dir, docFileName); skipIn = dir.openInput(IndexFileNames.segmentFileName(, codecId, SepPostingsWriterImpl.SKIP_EXTENSION), readBufferSize); if (segmentInfo.getHasProx()) { freqIn = intFactory.openInput(dir, IndexFileNames.segmentFileName(, codecId, SepPostingsWriterImpl.FREQ_EXTENSION)); posIn = intFactory.openInput(dir, IndexFileNames.segmentFileName(, codecId, SepPostingsWriterImpl.POS_EXTENSION), readBufferSize); payloadIn = dir.openInput(IndexFileNames.segmentFileName(, codecId, SepPostingsWriterImpl.PAYLOAD_EXTENSION), readBufferSize); } else { posIn = null; payloadIn = null; freqIn = null; } success = true; } finally { if (!success) { close(); } } } public static void files(SegmentInfo segmentInfo, String codecId, Collection<String> files) { files.add(IndexFileNames.segmentFileName(, codecId, SepPostingsWriterImpl.DOC_EXTENSION)); files.add(IndexFileNames.segmentFileName(, codecId, SepPostingsWriterImpl.SKIP_EXTENSION)); if (segmentInfo.getHasProx()) { files.add(IndexFileNames.segmentFileName(, codecId, SepPostingsWriterImpl.FREQ_EXTENSION)); files.add(IndexFileNames.segmentFileName(, codecId, SepPostingsWriterImpl.POS_EXTENSION)); files.add(IndexFileNames.segmentFileName(, codecId, SepPostingsWriterImpl.PAYLOAD_EXTENSION)); } } @Override public void init(IndexInput termsIn) throws IOException { // Make sure we are talking to the matching past writer CodecUtil.checkHeader(termsIn, 
SepPostingsWriterImpl.CODEC, SepPostingsWriterImpl.VERSION_START, SepPostingsWriterImpl.VERSION_START); skipInterval = termsIn.readInt(); maxSkipLevels = termsIn.readInt(); skipMinimum = termsIn.readInt(); } @Override public void close() throws IOException { try { if (freqIn != null) freqIn.close(); } finally { try { if (docIn != null) docIn.close(); } finally { try { if (skipIn != null) skipIn.close(); } finally { try { if (posIn != null) { posIn.close(); } } finally { if (payloadIn != null) { payloadIn.close(); } } } } } } private static final class SepTermState extends BlockTermState { // We store only the seek point to the docs file because // the rest of the info (freqIndex, posIndex, etc.) is // stored in the docs file: IntIndexInput.Index docIndex; IntIndexInput.Index posIndex; IntIndexInput.Index freqIndex; long payloadFP; long skipFP; // Only used for "primary" term state; these are never // copied on clone: byte[] bytes; ByteArrayDataInput bytesReader; @Override public Object clone() { SepTermState other = (SepTermState) super.clone(); other.docIndex = (IntIndexInput.Index) docIndex.clone(); if (freqIndex != null) { other.freqIndex = (IntIndexInput.Index) freqIndex.clone(); } if (posIndex != null) { other.posIndex = (IntIndexInput.Index) posIndex.clone(); } return other; } @Override public void copyFrom(TermState _other) { super.copyFrom(_other); SepTermState other = (SepTermState) _other; docIndex.set(other.docIndex); if (freqIndex != null && other.freqIndex != null) { freqIndex.set(other.freqIndex); } if (posIndex != null && other.posIndex != null) { posIndex.set(other.posIndex); } payloadFP = other.payloadFP; skipFP = other.skipFP; } @Override public String toString() { return super.toString() + " docIndex=" + docIndex + " freqIndex=" + freqIndex + " posIndex=" + posIndex + " payloadFP=" + payloadFP + " skipFP=" + skipFP; } } @Override public BlockTermState newTermState() throws IOException { final SepTermState state = new SepTermState(); 
state.docIndex = docIn.index(); if (freqIn != null) { state.freqIndex = freqIn.index(); } if (posIn != null) { state.posIndex = posIn.index(); } return state; } @Override public void readTermsBlock(IndexInput termsIn, FieldInfo fieldInfo, BlockTermState _termState) throws IOException { final SepTermState termState = (SepTermState) _termState; final int len = termsIn.readVInt(); //System.out.println("SepR.readTermsBlock len=" + len); if (termState.bytes == null) { termState.bytes = new byte[ArrayUtil.oversize(len, 1)]; termState.bytesReader = new ByteArrayDataInput(termState.bytes); } else if (termState.bytes.length < len) { termState.bytes = new byte[ArrayUtil.oversize(len, 1)]; } termState.bytesReader.reset(termState.bytes, 0, len); termsIn.readBytes(termState.bytes, 0, len); } @Override public void nextTerm(FieldInfo fieldInfo, BlockTermState _termState) throws IOException { final SepTermState termState = (SepTermState) _termState; //System.out.println("SepR.nextTerm termCount=" + termState.termCount); //System.out.println(" docFreq=" + termState.docFreq); final boolean isFirstTerm = termState.termCount == 0;, isFirstTerm); //System.out.println(" docIndex=" + termState.docIndex); if (!fieldInfo.omitTermFreqAndPositions) {, isFirstTerm); //System.out.println(" freqIndex=" + termState.freqIndex);, isFirstTerm); //System.out.println(" posIndex=" + termState.posIndex); if (fieldInfo.storePayloads) { if (isFirstTerm) { termState.payloadFP = termState.bytesReader.readVLong(); } else { termState.payloadFP += termState.bytesReader.readVLong(); } //System.out.println(" payloadFP=" + termState.payloadFP); } } if (termState.docFreq >= skipMinimum) { //System.out.println(" readSkip @ " + termState.bytesReader.pos); if (isFirstTerm) { termState.skipFP = termState.bytesReader.readVLong(); } else { termState.skipFP += termState.bytesReader.readVLong(); } //System.out.println(" skipFP=" + termState.skipFP); } else if (isFirstTerm) { termState.skipFP = 0; } } @Override public 
DocsEnum docs(FieldInfo fieldInfo, BlockTermState _termState, Bits skipDocs, DocsEnum reuse) throws IOException { final SepTermState termState = (SepTermState) _termState; SepDocsEnum docsEnum; if (reuse == null || !(reuse instanceof SepDocsEnum)) { docsEnum = new SepDocsEnum(); } else { docsEnum = (SepDocsEnum) reuse; if (docsEnum.startDocIn != docIn) { // If you are using ParellelReader, and pass in a // reused DocsAndPositionsEnum, it could have come // from another reader also using sep codec docsEnum = new SepDocsEnum(); } } return docsEnum.init(fieldInfo, termState, skipDocs); } @Override public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState _termState, Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException { assert !fieldInfo.omitTermFreqAndPositions; final SepTermState termState = (SepTermState) _termState; SepDocsAndPositionsEnum postingsEnum; if (reuse == null || !(reuse instanceof SepDocsAndPositionsEnum)) { postingsEnum = new SepDocsAndPositionsEnum(); } else { postingsEnum = (SepDocsAndPositionsEnum) reuse; if (postingsEnum.startDocIn != docIn) { // If you are using ParellelReader, and pass in a // reused DocsAndPositionsEnum, it could have come // from another reader also using sep codec postingsEnum = new SepDocsAndPositionsEnum(); } } return postingsEnum.init(fieldInfo, termState, skipDocs); } class SepDocsEnum extends DocsEnum { int docFreq; int doc; int count; int freq; long freqStart; // TODO: -- should we do omitTF with 2 different enum classes? private boolean omitTF; private boolean storePayloads; private Bits skipDocs; private final IntIndexInput.Reader docReader; private final IntIndexInput.Reader freqReader; private long skipFP; private final IntIndexInput.Index docIndex; private final IntIndexInput.Index freqIndex; private final IntIndexInput.Index posIndex; private final IntIndexInput startDocIn; // TODO: -- should we do hasProx with 2 different enum classes? 
boolean skipped; SepSkipListReader skipper; SepDocsEnum() throws IOException { startDocIn = docIn; docReader = docIn.reader(); docIndex = docIn.index(); if (freqIn != null) { freqReader = freqIn.reader(); freqIndex = freqIn.index(); } else { freqReader = null; freqIndex = null; } if (posIn != null) { posIndex = posIn.index(); // only init this so skipper can read it } else { posIndex = null; } } SepDocsEnum init(FieldInfo fieldInfo, SepTermState termState, Bits skipDocs) throws IOException { this.skipDocs = skipDocs; omitTF = fieldInfo.omitTermFreqAndPositions; storePayloads = fieldInfo.storePayloads; // TODO: can't we only do this if consumer // skipped consuming the previous docs? docIndex.set(termState.docIndex);; if (!omitTF) { freqIndex.set(termState.freqIndex);; } else { freq = 1; } docFreq = termState.docFreq; // NOTE: unused if docFreq < skipMinimum: skipFP = termState.skipFP; count = 0; doc = 0; skipped = false; return this; } @Override public int nextDoc() throws IOException { while(true) { if (count == docFreq) { return doc = NO_MORE_DOCS; } count++; // Decode next doc //System.out.println("decode docDelta:"); doc +=; if (!omitTF) { //System.out.println("decode freq:"); freq =; } if (skipDocs == null || !skipDocs.get(doc)) { break; } } return doc; } @Override public int read() throws IOException { // TODO: -- switch to bulk read api in IntIndexInput //System.out.println("sepdocs read"); final int[] docs =; final int[] freqs = bulkResult.freqs.ints; int i = 0; final int length = docs.length; while (i < length && count < docFreq) { count++; // manually inlined call to next() for speed //System.out.println("decode doc"); doc +=; if (!omitTF) { //System.out.println("decode freq"); freq =; } if (skipDocs == null || !skipDocs.get(doc)) { docs[i] = doc; freqs[i] = freq; //System.out.println(" docs[" + i + "]=" + doc + " count=" + count + " dF=" + docFreq); i++; } } return i; } @Override public int freq() { return freq; } @Override public int docID() { return 
doc; } @Override public int advance(int target) throws IOException { if ((target - skipInterval) >= doc && docFreq >= skipMinimum) { // There are enough docs in the posting to have // skip data, and its not too close if (skipper == null) { // This DocsEnum has never done any skipping skipper = new SepSkipListReader((IndexInput) skipIn.clone(), freqIn, docIn, posIn, maxSkipLevels, skipInterval); } if (!skipped) { // We haven't yet skipped for this posting skipper.init(skipFP, docIndex, freqIndex, posIndex, 0, docFreq, storePayloads); skipper.setOmitTF(omitTF); skipped = true; } final int newCount = skipper.skipTo(target); if (newCount > count) { // Skipper did move if (!omitTF) { skipper.getFreqIndex().seek(freqReader); } skipper.getDocIndex().seek(docReader); count = newCount; doc = skipper.getDoc(); } } // Now, linear scan for the rest: do { if (nextDoc() == NO_MORE_DOCS) { return NO_MORE_DOCS; } } while (target > doc); return doc; } } class SepDocsAndPositionsEnum extends DocsAndPositionsEnum { int docFreq; int doc; int count; int freq; long freqStart; private boolean storePayloads; private Bits skipDocs; private final IntIndexInput.Reader docReader; private final IntIndexInput.Reader freqReader; private final IntIndexInput.Reader posReader; private final IndexInput payloadIn; private long skipFP; private final IntIndexInput.Index docIndex; private final IntIndexInput.Index freqIndex; private final IntIndexInput.Index posIndex; private final IntIndexInput startDocIn; private long payloadFP; private int pendingPosCount; private int position; private int payloadLength; private long pendingPayloadBytes; private boolean skipped; private SepSkipListReader skipper; private boolean payloadPending; private boolean posSeekPending; SepDocsAndPositionsEnum() throws IOException { startDocIn = docIn; docReader = docIn.reader(); docIndex = docIn.index(); freqReader = freqIn.reader(); freqIndex = freqIn.index(); posReader = posIn.reader(); posIndex = posIn.index(); payloadIn = 
(IndexInput) SepPostingsReaderImpl.this.payloadIn.clone(); } SepDocsAndPositionsEnum init(FieldInfo fieldInfo, SepTermState termState, Bits skipDocs) throws IOException { this.skipDocs = skipDocs; storePayloads = fieldInfo.storePayloads; //System.out.println("Sep D&P init"); // TODO: can't we only do this if consumer // skipped consuming the previous docs? docIndex.set(termState.docIndex);; //System.out.println(" docIndex=" + docIndex); freqIndex.set(termState.freqIndex);; //System.out.println(" freqIndex=" + freqIndex); posIndex.set(termState.posIndex); //System.out.println(" posIndex=" + posIndex); posSeekPending = true; payloadPending = false; payloadFP = termState.payloadFP; skipFP = termState.skipFP; //System.out.println(" skipFP=" + skipFP); docFreq = termState.docFreq; count = 0; doc = 0; pendingPosCount = 0; pendingPayloadBytes = 0; skipped = false; return this; } @Override public int nextDoc() throws IOException { while(true) { if (count == docFreq) { return doc = NO_MORE_DOCS; } count++; // TODO: maybe we should do the 1-bit trick for encoding // freq=1 case? 
// Decode next doc //System.out.println(" sep d&p read doc"); doc +=; //System.out.println(" sep d&p read freq"); freq =; pendingPosCount += freq; if (skipDocs == null || !skipDocs.get(doc)) { break; } } position = 0; return doc; } @Override public int freq() { return freq; } @Override public int docID() { return doc; } @Override public int advance(int target) throws IOException { //System.out.println("SepD&P advance target=" + target + " vs current=" + doc + " this=" + this); if ((target - skipInterval) >= doc && docFreq >= skipMinimum) { // There are enough docs in the posting to have // skip data, and its not too close if (skipper == null) { //System.out.println(" create skipper"); // This DocsEnum has never done any skipping skipper = new SepSkipListReader((IndexInput) skipIn.clone(), freqIn, docIn, posIn, maxSkipLevels, skipInterval); } if (!skipped) { //System.out.println(" init skip data skipFP=" + skipFP); // We haven't yet skipped for this posting skipper.init(skipFP, docIndex, freqIndex, posIndex, payloadFP, docFreq, storePayloads); skipped = true; } final int newCount = skipper.skipTo(target); //System.out.println(" skip newCount=" + newCount + " vs " + count); if (newCount > count) { // Skipper did move skipper.getFreqIndex().seek(freqReader); skipper.getDocIndex().seek(docReader); // NOTE: don't seek pos here; do it lazily // instead. Eg a PhraseQuery may skip to many // docs before finally asking for positions... 
posIndex.set(skipper.getPosIndex()); posSeekPending = true; count = newCount; doc = skipper.getDoc(); //System.out.println(" moved to doc=" + doc); //; payloadFP = skipper.getPayloadPointer(); pendingPosCount = 0; pendingPayloadBytes = 0; payloadPending = false; payloadLength = skipper.getPayloadLength(); //System.out.println(" move payloadLen=" + payloadLength); } } // Now, linear scan for the rest: do { if (nextDoc() == NO_MORE_DOCS) { //System.out.println(" advance nextDoc=END"); return NO_MORE_DOCS; } //System.out.println(" advance nextDoc=" + doc); } while (target > doc); //System.out.println(" return doc=" + doc); return doc; } @Override public int nextPosition() throws IOException { if (posSeekPending) {;; posSeekPending = false; } // scan over any docs that were iterated without their // positions while (pendingPosCount > freq) { final int code =; if (storePayloads && (code & 1) != 0) { // Payload length has changed payloadLength =; assert payloadLength >= 0; } pendingPosCount--; position = 0; pendingPayloadBytes += payloadLength; } final int code =; assert code >= 0; if (storePayloads) { if ((code & 1) != 0) { // Payload length has changed payloadLength =; assert payloadLength >= 0; } position += code >> 1; pendingPayloadBytes += payloadLength; payloadPending = payloadLength > 0; } else { position += code; } pendingPosCount--; assert pendingPosCount >= 0; return position; } private BytesRef payload; @Override public BytesRef getPayload() throws IOException { if (!payloadPending) { throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once."); } assert pendingPayloadBytes >= payloadLength; if (pendingPayloadBytes > payloadLength) { + (pendingPayloadBytes - payloadLength)); } if (payload == null) { payload = new BytesRef(); payload.bytes = new byte[payloadLength]; } else if (payload.bytes.length < payloadLength) { payload.grow(payloadLength); } payloadIn.readBytes(payload.bytes, 0, payloadLength); 
payloadPending = false; payload.length = payloadLength; pendingPayloadBytes = 0; return payload; } @Override public boolean hasPayload() { return payloadPending && payloadLength > 0; } } }