package org.apache.lucene.index.codecs.pulsing;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.index.codecs.standard.StandardPostingsWriter;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;

// TODO: we now pulse entirely according to docFreq of the
// term; it might be better to eg pulse by "net bytes used"
// so that a term that has only 1 doc but zillions of
// positions would not be inlined.  Though this is
// presumably rare in practice...

/** @lucene.experimental */
public final class PulsingPostingsWriterImpl extends StandardPostingsWriter {

  final static String CODEC = "PulsedPostings";

  // To add a new version, increment from the last one, and
  // change VERSION_CURRENT to point to your new version:
  final static int VERSION_START = 0;

  final static int VERSION_CURRENT = VERSION_START;

  IndexOutput termsOut;

  boolean omitTF;
  boolean storePayloads;

  // Currently-written field; set via setField
  FieldInfo fieldInfo;

  /** @lucene.experimental */
  public static class Document {
    int docID;
    int termDocFreq;
    int numPositions;
    Position[] positions;

    Document() {
      positions = new Position[1];
      positions[0] = new Position();
    }

    @Override
    public Object clone() {
      Document doc = new Document();
      doc.docID = docID;
      doc.termDocFreq = termDocFreq;
      doc.numPositions = numPositions;
      doc.positions = new Position[positions.length];
      for(int i = 0; i < positions.length; i++) {
        doc.positions[i] = (Position) positions[i].clone();
      }
      return doc;
    }

    void reallocPositions(int minSize) {
      final Position[] newArray = new Position[ArrayUtil.oversize(minSize, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
      System.arraycopy(positions, 0, newArray, 0, positions.length);
      for(int i=positions.length;i<newArray.length;i++) {
        newArray[i] = new Position();
      }
      positions = newArray;
    }
  }

  final Document[] pendingDocs;
  int pendingDocCount = 0;
  Document currentDoc;
  boolean pulsed;     // true once we've seen > maxPulsingDocFreq docs for the current term

  static class Position {
    BytesRef payload;
    int pos;

    @Override
    public Object clone() {
      Position position = new Position();
      position.pos = pos;
      if (payload != null) {
        position.payload = new BytesRef(payload);
      }
      return position;
    }
  }
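  // Worked example of the cutoff (illustrative numbers, not from the code):
  // with maxPulsingDocFreq == 2, a term seen only in docs {5, 9} stays
  // buffered in pendingDocs and is inlined into the terms dict by
  // finishTerm; the moment a third doc arrives, startDoc replays the two
  // buffered docs through the wrapped writer below, and every later posting
  // for that term bypasses the buffer entirely.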
  // TODO: -- lazy init this?  ie, if every single term
  // was pulsed then we never need to use this fallback?

  // Fallback writer for non-pulsed terms:
  final StandardPostingsWriter wrappedPostingsWriter;

  /** If docFreq <= maxPulsingDocFreq, its postings are
   *  inlined into terms dict */
  public PulsingPostingsWriterImpl(int maxPulsingDocFreq, StandardPostingsWriter wrappedPostingsWriter) throws IOException {
    super();

    pendingDocs = new Document[maxPulsingDocFreq];
    for(int i=0;i<maxPulsingDocFreq;i++) {
      pendingDocs[i] = new Document();
    }

    // We simply wrap another postings writer, but only call
    // on it when doc freq is higher than our cutoff
    this.wrappedPostingsWriter = wrappedPostingsWriter;
  }

  @Override
  public void start(IndexOutput termsOut) throws IOException {
    this.termsOut = termsOut;
    CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
    termsOut.writeVInt(pendingDocs.length);
    wrappedPostingsWriter.start(termsOut);
  }

  @Override
  public void startTerm() {
    assert pendingDocCount == 0;
    pulsed = false;
  }

  // TODO: -- should we NOT reuse across fields?  would
  // be cleaner

  // Currently, this instance is re-used across fields, so
  // our parent calls setField whenever the field changes
  @Override
  public void setField(FieldInfo fieldInfo) {
    this.fieldInfo = fieldInfo;
    omitTF = fieldInfo.omitTermFreqAndPositions;
    storePayloads = fieldInfo.storePayloads;
    wrappedPostingsWriter.setField(fieldInfo);
  }

  @Override
  public void startDoc(int docID, int termDocFreq) throws IOException {

    assert docID >= 0: "got docID=" + docID;

    if (!pulsed && pendingDocCount == pendingDocs.length) {

      // OK we just crossed the threshold, this term should
      // now be written with our wrapped codec:
      wrappedPostingsWriter.startTerm();

      // Flush all buffered docs
      for(int i=0;i<pendingDocCount;i++) {
        final Document doc = pendingDocs[i];

        wrappedPostingsWriter.startDoc(doc.docID, doc.termDocFreq);

        if (!omitTF) {
          assert doc.termDocFreq == doc.numPositions;
          for(int j=0;j<doc.termDocFreq;j++) {
            final Position pos = doc.positions[j];
            if (pos.payload != null && pos.payload.length > 0) {
              assert storePayloads;
              wrappedPostingsWriter.addPosition(pos.pos, pos.payload);
            } else {
              wrappedPostingsWriter.addPosition(pos.pos, null);
            }
          }
          wrappedPostingsWriter.finishDoc();
        }
      }

      pendingDocCount = 0;

      pulsed = true;
    }

    if (pulsed) {
      // We've already seen too many docs for this term --
      // just forward to our fallback writer
      wrappedPostingsWriter.startDoc(docID, termDocFreq);
    } else {
      currentDoc = pendingDocs[pendingDocCount++];
      currentDoc.docID = docID;
      // TODO: -- need not store in doc?  only used for alloc & assert
      currentDoc.termDocFreq = termDocFreq;
      if (termDocFreq > currentDoc.positions.length) {
        currentDoc.reallocPositions(termDocFreq);
      }
      currentDoc.numPositions = 0;
    }
  }
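  // Buffering note: addPosition below deep-copies each payload into the
  // per-doc buffer (new BytesRef / copy), so callers may safely reuse their
  // BytesRef between calls; a null or zero-length payload clears any
  // previously buffered bytes so reused Position slots don't leak stale
  // payloads into finishTerm's inlined output.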
  @Override
  public void addPosition(int position, BytesRef payload) throws IOException {
    if (pulsed) {
      wrappedPostingsWriter.addPosition(position, payload);
    } else {
      // just buffer up
      Position pos = currentDoc.positions[currentDoc.numPositions++];
      pos.pos = position;
      if (payload != null && payload.length > 0) {
        if (pos.payload == null) {
          pos.payload = new BytesRef(payload);
        } else {
          pos.payload.copy(payload);
        }
      } else if (pos.payload != null) {
        pos.payload.length = 0;
      }
    }
  }

  @Override
  public void finishDoc() {
    assert omitTF || currentDoc.numPositions == currentDoc.termDocFreq;
  }

  boolean pendingIsIndexTerm;

  int pulsedCount;
  int nonPulsedCount;

  /** Called when we are done adding docs to this term */
  @Override
  public void finishTerm(int docCount, boolean isIndexTerm) throws IOException {

    assert docCount > 0;

    pendingIsIndexTerm |= isIndexTerm;

    if (pulsed) {
      wrappedPostingsWriter.finishTerm(docCount, pendingIsIndexTerm);
      pendingIsIndexTerm = false;
      pulsedCount++;
    } else {
      nonPulsedCount++;

      // OK, there were few enough occurrences for this
      // term, so we fully inline our postings data into
      // terms dict, now:

      int lastDocID = 0;
      for(int i=0;i<pendingDocCount;i++) {
        final Document doc = pendingDocs[i];
        final int delta = doc.docID - lastDocID;
        lastDocID = doc.docID;

        if (omitTF) {
          termsOut.writeVInt(delta);
        } else {
          assert doc.numPositions == doc.termDocFreq;
          if (doc.numPositions == 1) {
            termsOut.writeVInt((delta<<1)|1);
          } else {
            termsOut.writeVInt(delta<<1);
            termsOut.writeVInt(doc.numPositions);
          }

          // TODO: we could do better in encoding
          // payloadLength, eg, if it's always the same
          // across all terms
          int lastPosition = 0;
          int lastPayloadLength = -1;

          for(int j=0;j<doc.numPositions;j++) {
            final Position pos = doc.positions[j];
            final int delta2 = pos.pos - lastPosition;
            lastPosition = pos.pos;
            if (storePayloads) {
              final int payloadLength = pos.payload == null ? 0 : pos.payload.length;
              if (payloadLength != lastPayloadLength) {
                termsOut.writeVInt((delta2 << 1)|1);
                termsOut.writeVInt(payloadLength);
                lastPayloadLength = payloadLength;
              } else {
                termsOut.writeVInt(delta2 << 1);
              }

              if (payloadLength > 0) {
                termsOut.writeBytes(pos.payload.bytes, 0, pos.payload.length);
              }
            } else {
              termsOut.writeVInt(delta2);
            }
          }
        }
      }
    }

    pendingDocCount = 0;
  }

  @Override
  public void close() throws IOException {
    wrappedPostingsWriter.close();
  }
}
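// An illustrative trace of the inlined format written by finishTerm (example
// values only): a term in docs {3, 7}, one position per doc, freqs enabled
// (omitTF == false), no payloads stored --
//
//   doc 3, freq 1  ->  writeVInt((3<<1)|1) = 7    // low bit set: freq == 1
//   pos 12         ->  writeVInt(12)              // plain position delta
//   doc 7, freq 1  ->  writeVInt((4<<1)|1) = 9    // docID delta = 7 - 3
//   pos 4          ->  writeVInt(4)               // position deltas reset per doc
//
// A minimal wiring sketch (hypothetical; assumes a concrete
// StandardPostingsWriter such as StandardPostingsWriterImpl and a
// SegmentWriteState `state` supplied by the surrounding codec plumbing):
//
//   StandardPostingsWriter docsWriter = new StandardPostingsWriterImpl(state);
//   StandardPostingsWriter pulsingWriter =
//       new PulsingPostingsWriterImpl(1, docsWriter); // inline docFreq <= 1 terms
//   pulsingWriter.start(termsOut);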