/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.codecs.compressing; import static org.apache.lucene.util.BitUtil.zigZagEncode; import java.io.Closeable; import java.io.IOException; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.packed.PackedInts; /** * Efficient index format for block-based {@link Codec}s. * <p> This writer generates a file which can be loaded into memory using * memory-efficient data structures to quickly locate the block that contains * any document. * <p>In order to have a compact in-memory representation, for every block of * 1024 chunks, this index computes the average number of bytes per * chunk and for every chunk, only stores the difference between<ul> * <li>${chunk number} * ${average length of a chunk}</li> * <li>and the actual start offset of the chunk</li></ul> * <p>Data is written as follows: * <ul> * <li>PackedIntsVersion, <Block><sup>BlockCount</sup>, BlocksEndMarker</li> * <li>PackedIntsVersion --> {@link PackedInts#VERSION_CURRENT} as a {@link DataOutput#writeVInt VInt}</li> * <li>BlocksEndMarker --> <tt>0</tt> as a {@link DataOutput#writeVInt VInt}, this marks the end of blocks since blocks are not allowed to start with <tt>0</tt></li> * <li>Block --> BlockChunks, <DocBases>, <StartPointers></li> * <li>BlockChunks --> a {@link DataOutput#writeVInt VInt} which is the number of chunks encoded in the block</li> * <li>DocBases --> DocBase, AvgChunkDocs, BitsPerDocBaseDelta, DocBaseDeltas</li> * <li>DocBase --> first document ID of the block of chunks, as a {@link DataOutput#writeVInt VInt}</li> * <li>AvgChunkDocs --> average number of documents in a single chunk, as a {@link DataOutput#writeVInt VInt}</li> * <li>BitsPerDocBaseDelta --> number of bits required to represent a delta from the average using <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">ZigZag encoding</a></li> * <li>DocBaseDeltas --> {@link PackedInts packed} array of BlockChunks elements of BitsPerDocBaseDelta bits each, representing the deltas from the average doc base using <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">ZigZag encoding</a>.</li> * <li>StartPointers --> StartPointerBase, AvgChunkSize, BitsPerStartPointerDelta, StartPointerDeltas</li> * <li>StartPointerBase --> the first start pointer of the block, as a {@link DataOutput#writeVLong VLong}</li> * <li>AvgChunkSize --> the average size of a chunk of compressed documents, as a {@link DataOutput#writeVLong VLong}</li> * <li>BitsPerStartPointerDelta --> number of bits required to represent a delta from the average using <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">ZigZag encoding</a></li> * <li>StartPointerDeltas --> {@link PackedInts packed} array of BlockChunks elements of BitsPerStartPointerDelta bits each, representing the deltas from the average start pointer using <a href="https://developers.google.com/protocol-buffers/docs/encoding#types">ZigZag encoding</a></li> * <li>Footer --> {@link CodecUtil#writeFooter CodecFooter}</li> * </ul> * <p>Notes * <ul> * <li>For any block, the doc base of the n-th chunk can be restored with * <code>DocBase + AvgChunkDocs * n + DocBaseDeltas[n]</code>.</li> * <li>For any block, the start pointer of the n-th chunk can be restored with * <code>StartPointerBase + AvgChunkSize * n + StartPointerDeltas[n]</code>.</li> * <li>Once data is loaded into memory, you can lookup the start pointer of any * document by performing two binary searches: a first one based on the values * of DocBase in order to find the right block, and then inside the block based * on DocBaseDeltas (by reconstructing the doc bases for every chunk).</li> * </ul> * @lucene.internal */ public final class CompressingStoredFieldsIndexWriter implements Closeable { final IndexOutput fieldsIndexOut; final int blockSize; int totalDocs; int blockDocs; int blockChunks; long firstStartPointer; long maxStartPointer; final int[] docBaseDeltas; final long[] startPointerDeltas; CompressingStoredFieldsIndexWriter(IndexOutput indexOutput, int blockSize) throws IOException { if (blockSize <= 0) { throw new IllegalArgumentException("blockSize must be positive"); } this.blockSize = blockSize; this.fieldsIndexOut = indexOutput; reset(); totalDocs = 0; docBaseDeltas = new int[blockSize]; startPointerDeltas = new long[blockSize]; fieldsIndexOut.writeVInt(PackedInts.VERSION_CURRENT); } private void reset() { blockChunks = 0; blockDocs = 0; firstStartPointer = -1; // means unset } private void writeBlock() throws IOException { assert blockChunks > 0; fieldsIndexOut.writeVInt(blockChunks); // The trick here is that we only store the difference from the average start // pointer or doc base, this helps save bits per value. // And in order to prevent a few chunks that would be far from the average to // raise the number of bits per value for all of them, we only encode blocks // of 1024 chunks at once // See LUCENE-4512 // doc bases final int avgChunkDocs; if (blockChunks == 1) { avgChunkDocs = 0; } else { avgChunkDocs = Math.round((float) (blockDocs - docBaseDeltas[blockChunks - 1]) / (blockChunks - 1)); } fieldsIndexOut.writeVInt(totalDocs - blockDocs); // docBase fieldsIndexOut.writeVInt(avgChunkDocs); int docBase = 0; long maxDelta = 0; for (int i = 0; i < blockChunks; ++i) { final int delta = docBase - avgChunkDocs * i; maxDelta |= zigZagEncode(delta); docBase += docBaseDeltas[i]; } final int bitsPerDocBase = PackedInts.bitsRequired(maxDelta); fieldsIndexOut.writeVInt(bitsPerDocBase); PackedInts.Writer writer = PackedInts.getWriterNoHeader(fieldsIndexOut, PackedInts.Format.PACKED, blockChunks, bitsPerDocBase, 1); docBase = 0; for (int i = 0; i < blockChunks; ++i) { final long delta = docBase - avgChunkDocs * i; assert PackedInts.bitsRequired(zigZagEncode(delta)) <= writer.bitsPerValue(); writer.add(zigZagEncode(delta)); docBase += docBaseDeltas[i]; } writer.finish(); // start pointers fieldsIndexOut.writeVLong(firstStartPointer); final long avgChunkSize; if (blockChunks == 1) { avgChunkSize = 0; } else { avgChunkSize = (maxStartPointer - firstStartPointer) / (blockChunks - 1); } fieldsIndexOut.writeVLong(avgChunkSize); long startPointer = 0; maxDelta = 0; for (int i = 0; i < blockChunks; ++i) { startPointer += startPointerDeltas[i]; final long delta = startPointer - avgChunkSize * i; maxDelta |= zigZagEncode(delta); } final int bitsPerStartPointer = PackedInts.bitsRequired(maxDelta); fieldsIndexOut.writeVInt(bitsPerStartPointer); writer = PackedInts.getWriterNoHeader(fieldsIndexOut, PackedInts.Format.PACKED, blockChunks, bitsPerStartPointer, 1); startPointer = 0; for (int i = 0; i < blockChunks; ++i) { startPointer += startPointerDeltas[i]; final long delta = startPointer - avgChunkSize * i; assert PackedInts.bitsRequired(zigZagEncode(delta)) <= writer.bitsPerValue(); writer.add(zigZagEncode(delta)); } writer.finish(); } void writeIndex(int numDocs, long startPointer) throws IOException { if (blockChunks == blockSize) { writeBlock(); reset(); } if (firstStartPointer == -1) { firstStartPointer = maxStartPointer = startPointer; } assert firstStartPointer > 0 && startPointer >= firstStartPointer; docBaseDeltas[blockChunks] = numDocs; startPointerDeltas[blockChunks] = startPointer - maxStartPointer; ++blockChunks; blockDocs += numDocs; totalDocs += numDocs; maxStartPointer = startPointer; } void finish(int numDocs, long maxPointer) throws IOException { if (numDocs != totalDocs) { throw new IllegalStateException("Expected " + numDocs + " docs, but got " + totalDocs); } if (blockChunks > 0) { writeBlock(); } fieldsIndexOut.writeVInt(0); // end marker fieldsIndexOut.writeVLong(maxPointer); CodecUtil.writeFooter(fieldsIndexOut); } @Override public void close() throws IOException { fieldsIndexOut.close(); } }