package org.apache.lucene.codecs.compressing;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.BYTE_ARR;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.CODEC_SFX_DAT;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.CODEC_SFX_IDX;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_DOUBLE;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_FLOAT;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_INT;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_LONG;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.STRING;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.TYPE_BITS;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.TYPE_MASK;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_BIG_CHUNKS;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_CURRENT;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_START;
import static org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsWriter.FIELDS_EXTENSION;
import static org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsWriter.FIELDS_INDEX_EXTENSION;

import java.io.EOFException;
import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;

/**
 * {@link StoredFieldsReader} impl for {@link CompressingStoredFieldsFormat}.
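 * <p>
 * A minimal usage sketch (hypothetical caller; a reader is normally obtained
 * through the codec's {@link org.apache.lucene.codecs.StoredFieldsFormat}):
 * <pre class="prettyprint">
 * StoredFieldsReader reader = codec.storedFieldsFormat()
 *     .fieldsReader(directory, segmentInfo, fieldInfos, IOContext.READ);
 * reader.visitDocument(docID, visitor); // pushes stored fields to the visitor
 * reader.close();
 * </pre>
 *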
 * @lucene.experimental
 */
public final class CompressingStoredFieldsReader extends StoredFieldsReader {

  // Do not reuse the decompression buffer when there is more than 32kb to decompress
  private static final int BUFFER_REUSE_THRESHOLD = 1 << 15;

  // write-only scratch buffer: its contents are never read back
  private static final byte[] SKIP_BUFFER = new byte[1024];

  // TODO: should this be a method on DataInput?
  private static void skipBytes(DataInput in, long numBytes) throws IOException {
    assert numBytes >= 0;
    for (long skipped = 0; skipped < numBytes; ) {
      final int toRead = (int) Math.min(numBytes - skipped, SKIP_BUFFER.length);
      in.readBytes(SKIP_BUFFER, 0, toRead);
      skipped += toRead;
    }
  }

  private final int version;
  private final FieldInfos fieldInfos;
  private final CompressingStoredFieldsIndexReader indexReader;
  private final IndexInput fieldsStream;
  private final int chunkSize;
  private final int packedIntsVersion;
  private final CompressionMode compressionMode;
  private final Decompressor decompressor;
  private final BytesRef bytes;
  private final int numDocs;
  private boolean closed;

  // used by clone
  private CompressingStoredFieldsReader(CompressingStoredFieldsReader reader) {
    this.version = reader.version;
    this.fieldInfos = reader.fieldInfos;
    this.fieldsStream = reader.fieldsStream.clone();
    this.indexReader = reader.indexReader.clone();
    this.chunkSize = reader.chunkSize;
    this.packedIntsVersion = reader.packedIntsVersion;
    this.compressionMode = reader.compressionMode;
    this.decompressor = reader.decompressor.clone();
    this.numDocs = reader.numDocs;
    this.bytes = new BytesRef(reader.bytes.bytes.length);
    this.closed = false;
  }

  /** Sole constructor. */
  public CompressingStoredFieldsReader(Directory d, SegmentInfo si, String segmentSuffix, FieldInfos fn,
      IOContext context, String formatName, CompressionMode compressionMode) throws IOException {
    this.compressionMode = compressionMode;
    final String segment = si.name;
    boolean success = false;
    fieldInfos = fn;
    numDocs = si.getDocCount();
    IndexInput indexStream = null;
    try {
      // Load the index into memory
      final String indexStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_INDEX_EXTENSION);
      indexStream = d.openInput(indexStreamFN, context);
      final String codecNameIdx = formatName + CODEC_SFX_IDX;
      version = CodecUtil.checkHeader(indexStream, codecNameIdx, VERSION_START, VERSION_CURRENT);
      assert CodecUtil.headerLength(codecNameIdx) == indexStream.getFilePointer();
      indexReader = new CompressingStoredFieldsIndexReader(indexStream, si);
      indexStream.close();
      indexStream = null;

      // Open the data file and read metadata
      final String fieldsStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_EXTENSION);
      fieldsStream = d.openInput(fieldsStreamFN, context);
      final String codecNameDat = formatName + CODEC_SFX_DAT;
      final int fieldsVersion = CodecUtil.checkHeader(fieldsStream, codecNameDat, VERSION_START, VERSION_CURRENT);
      if (version != fieldsVersion) {
        throw new CorruptIndexException("Version mismatch between stored fields index and data: "
            + version + " != " + fieldsVersion);
      }
      assert CodecUtil.headerLength(codecNameDat) == fieldsStream.getFilePointer();

      if (version >= VERSION_BIG_CHUNKS) {
        chunkSize = fieldsStream.readVInt();
      } else {
        chunkSize = -1;
      }
      packedIntsVersion = fieldsStream.readVInt();
      decompressor = compressionMode.newDecompressor();
      this.bytes = new BytesRef();

      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(this, indexStream);
      }
    }
  }
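
  /*
   * NOTE: per-chunk layout of the data file, as decoded by visitDocument and
   * ChunkIterator below (derived from the reading code in this class; the
   * writing side lives in CompressingStoredFieldsWriter):
   *  - docBase, chunkDocs: VInt
   *  - numStoredFields, lengths: one VInt each when chunkDocs == 1; otherwise
   *    a bit width (VInt) followed by either a single shared VInt (width 0)
   *    or a packed ints block of chunkDocs values
   *  - the chunkDocs documents, concatenated and compressed (sliced into
   *    chunkSize blocks when the version supports big chunks)
   */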

  /**
   * @throws AlreadyClosedException if this FieldsReader is closed
   */
  private void ensureOpen() throws AlreadyClosedException {
    if (closed) {
      throw new AlreadyClosedException("this FieldsReader is closed");
    }
  }

  /**
   * Close the underlying {@link IndexInput}s.
   */
  @Override
  public void close() throws IOException {
    if (!closed) {
      IOUtils.close(fieldsStream);
      closed = true;
    }
  }

  private static void readField(DataInput in, StoredFieldVisitor visitor, FieldInfo info, int bits) throws IOException {
    switch (bits & TYPE_MASK) {
      case BYTE_ARR:
        int length = in.readVInt();
        byte[] data = new byte[length];
        in.readBytes(data, 0, length);
        visitor.binaryField(info, data);
        break;
      case STRING:
        length = in.readVInt();
        data = new byte[length];
        in.readBytes(data, 0, length);
        visitor.stringField(info, new String(data, IOUtils.CHARSET_UTF_8));
        break;
      case NUMERIC_INT:
        visitor.intField(info, in.readInt());
        break;
      case NUMERIC_FLOAT:
        visitor.floatField(info, Float.intBitsToFloat(in.readInt()));
        break;
      case NUMERIC_LONG:
        visitor.longField(info, in.readLong());
        break;
      case NUMERIC_DOUBLE:
        visitor.doubleField(info, Double.longBitsToDouble(in.readLong()));
        break;
      default:
        throw new AssertionError("Unknown type flag: " + Integer.toHexString(bits));
    }
  }

  private static void skipField(DataInput in, int bits) throws IOException {
    switch (bits & TYPE_MASK) {
      case BYTE_ARR:
      case STRING:
        final int length = in.readVInt();
        skipBytes(in, length);
        break;
      case NUMERIC_INT:
      case NUMERIC_FLOAT:
        in.readInt();
        break;
      case NUMERIC_LONG:
      case NUMERIC_DOUBLE:
        in.readLong();
        break;
      default:
        throw new AssertionError("Unknown type flag: " + Integer.toHexString(bits));
    }
  }

  @Override
  public void visitDocument(int docID, StoredFieldVisitor visitor) throws IOException {
    fieldsStream.seek(indexReader.getStartPointer(docID));

    final int docBase = fieldsStream.readVInt();
    final int chunkDocs = fieldsStream.readVInt();
    if (docID < docBase
        || docID >= docBase + chunkDocs
        || docBase + chunkDocs > numDocs) {
      throw new CorruptIndexException("Corrupted: docID=" + docID
          + ", docBase=" + docBase + ", chunkDocs=" + chunkDocs
          + ", numDocs=" + numDocs + " (resource=" + fieldsStream + ")");
    }

    final int numStoredFields, offset, length, totalLength;
    if (chunkDocs == 1) {
      numStoredFields = fieldsStream.readVInt();
      offset = 0;
      length = fieldsStream.readVInt();
      totalLength = length;
    } else {
      final int bitsPerStoredFields = fieldsStream.readVInt();
      if (bitsPerStoredFields == 0) {
        // all docs of the chunk share the same number of stored fields
        numStoredFields = fieldsStream.readVInt();
      } else if (bitsPerStoredFields > 31) {
        throw new CorruptIndexException("bitsPerStoredFields=" + bitsPerStoredFields
            + " (resource=" + fieldsStream + ")");
      } else {
        final long filePointer = fieldsStream.getFilePointer();
        final PackedInts.Reader reader = PackedInts.getDirectReaderNoHeader(fieldsStream,
            PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerStoredFields);
        numStoredFields = (int) (reader.get(docID - docBase));
        fieldsStream.seek(filePointer
            + PackedInts.Format.PACKED.byteCount(packedIntsVersion, chunkDocs, bitsPerStoredFields));
      }

      final int bitsPerLength = fieldsStream.readVInt();
      if (bitsPerLength == 0) {
        length = fieldsStream.readVInt();
        offset = (docID - docBase) * length;
        totalLength = chunkDocs * length;
      } else if (bitsPerLength > 31) {
        throw new CorruptIndexException("bitsPerLength=" + bitsPerLength
            + " (resource=" + fieldsStream + ")");
      } else {
        // the offset of this doc is the sum of the lengths of all previous
        // docs in the chunk
        final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream,
            PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerLength, 1);
        int off = 0;
        for (int i = 0; i < docID - docBase; ++i) {
          off += it.next();
        }
        offset = off;
        length = (int) it.next();
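        // keep summing the remaining per-doc lengths so that totalLength
        // reflects the decompressed size of the whole chunk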
        off += length;
        for (int i = docID - docBase + 1; i < chunkDocs; ++i) {
          off += it.next();
        }
        totalLength = off;
      }
    }

    if ((length == 0) != (numStoredFields == 0)) {
      throw new CorruptIndexException("length=" + length + ", numStoredFields=" + numStoredFields
          + " (resource=" + fieldsStream + ")");
    }
    if (numStoredFields == 0) {
      // nothing to do
      return;
    }

    final DataInput documentInput;
    if (version >= VERSION_BIG_CHUNKS && totalLength >= 2 * chunkSize) {
      assert chunkSize > 0;
      assert offset < chunkSize;

      // big chunk: it was compressed as successive blocks of chunkSize bytes,
      // so decompress the first block eagerly and the following ones lazily
      decompressor.decompress(fieldsStream, chunkSize, offset, Math.min(length, chunkSize - offset), bytes);
      documentInput = new DataInput() {

        int decompressed = bytes.length;

        // decompress the next block of the current document
        void fillBuffer() throws IOException {
          assert decompressed <= length;
          if (decompressed == length) {
            throw new EOFException();
          }
          final int toDecompress = Math.min(length - decompressed, chunkSize);
          decompressor.decompress(fieldsStream, toDecompress, 0, toDecompress, bytes);
          decompressed += toDecompress;
        }

        @Override
        public byte readByte() throws IOException {
          if (bytes.length == 0) {
            fillBuffer();
          }
          --bytes.length;
          return bytes.bytes[bytes.offset++];
        }

        @Override
        public void readBytes(byte[] b, int offset, int len) throws IOException {
          while (len > bytes.length) {
            System.arraycopy(bytes.bytes, bytes.offset, b, offset, bytes.length);
            len -= bytes.length;
            offset += bytes.length;
            fillBuffer();
          }
          System.arraycopy(bytes.bytes, bytes.offset, b, offset, len);
          bytes.offset += len;
          bytes.length -= len;
        }

      };
    } else {
      final BytesRef bytes = totalLength <= BUFFER_REUSE_THRESHOLD ? this.bytes : new BytesRef();
      // decompress the chunk and expose only this document's slice of it
      decompressor.decompress(fieldsStream, totalLength, offset, length, bytes);
      assert bytes.length == length;
      documentInput = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length);
    }

    for (int fieldIDX = 0; fieldIDX < numStoredFields; fieldIDX++) {
      final long infoAndBits = documentInput.readVLong();
      final int fieldNumber = (int) (infoAndBits >>> TYPE_BITS);
      final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);

      final int bits = (int) (infoAndBits & TYPE_MASK);
      assert bits <= NUMERIC_DOUBLE : "bits=" + Integer.toHexString(bits);

      switch (visitor.needsField(fieldInfo)) {
        case YES:
          readField(documentInput, visitor, fieldInfo, bits);
          break;
        case NO:
          skipField(documentInput, bits);
          break;
        case STOP:
          return;
      }
    }
  }

  @Override
  public StoredFieldsReader clone() {
    ensureOpen();
    return new CompressingStoredFieldsReader(this);
  }

  int getVersion() {
    return version;
  }

  CompressionMode getCompressionMode() {
    return compressionMode;
  }

  int getChunkSize() {
    return chunkSize;
  }

  ChunkIterator chunkIterator(int startDocID) throws IOException {
    ensureOpen();
    fieldsStream.seek(indexReader.getStartPointer(startDocID));
    return new ChunkIterator();
  }

  final class ChunkIterator {

    BytesRef spare;
    BytesRef bytes;
    int docBase;
    int chunkDocs;
    int[] numStoredFields;
    int[] lengths;

    private ChunkIterator() {
      this.docBase = -1;
      bytes = new BytesRef();
      spare = new BytesRef();
      numStoredFields = new int[1];
      lengths = new int[1];
    }

    /**
     * Return the decompressed size of the chunk.
     */
    int chunkSize() {
      int sum = 0;
      for (int i = 0; i < chunkDocs; ++i) {
        sum += lengths[i];
      }
      return sum;
    }

    /**
     * Go to the chunk containing the provided doc ID.
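     * Only forward iteration is supported: {@code doc} must be at least
     * {@code docBase + chunkDocs} of the current chunk.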
     */
    void next(int doc) throws IOException {
      assert doc >= docBase + chunkDocs : doc + " " + docBase + " " + chunkDocs;
      fieldsStream.seek(indexReader.getStartPointer(doc));

      final int docBase = fieldsStream.readVInt();
      final int chunkDocs = fieldsStream.readVInt();
      if (docBase < this.docBase + this.chunkDocs
          || docBase + chunkDocs > numDocs) {
        throw new CorruptIndexException("Corrupted: current docBase=" + this.docBase
            + ", current chunkDocs=" + this.chunkDocs + ", new docBase=" + docBase
            + ", new chunkDocs=" + chunkDocs + " (resource=" + fieldsStream + ")");
      }
      this.docBase = docBase;
      this.chunkDocs = chunkDocs;

      if (chunkDocs > numStoredFields.length) {
        final int newLength = ArrayUtil.oversize(chunkDocs, 4);
        numStoredFields = new int[newLength];
        lengths = new int[newLength];
      }

      if (chunkDocs == 1) {
        numStoredFields[0] = fieldsStream.readVInt();
        lengths[0] = fieldsStream.readVInt();
      } else {
        final int bitsPerStoredFields = fieldsStream.readVInt();
        if (bitsPerStoredFields == 0) {
          Arrays.fill(numStoredFields, 0, chunkDocs, fieldsStream.readVInt());
        } else if (bitsPerStoredFields > 31) {
          throw new CorruptIndexException("bitsPerStoredFields=" + bitsPerStoredFields
              + " (resource=" + fieldsStream + ")");
        } else {
          final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream,
              PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerStoredFields, 1);
          for (int i = 0; i < chunkDocs; ++i) {
            numStoredFields[i] = (int) it.next();
          }
        }

        final int bitsPerLength = fieldsStream.readVInt();
        if (bitsPerLength == 0) {
          Arrays.fill(lengths, 0, chunkDocs, fieldsStream.readVInt());
        } else if (bitsPerLength > 31) {
          throw new CorruptIndexException("bitsPerLength=" + bitsPerLength
              + " (resource=" + fieldsStream + ")");
        } else {
          final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream,
              PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerLength, 1);
          for (int i = 0; i < chunkDocs; ++i) {
            lengths[i] = (int) it.next();
          }
        }
      }
    }

    /**
     * Decompress the chunk.
     */
    void decompress() throws IOException {
      // decompress data
      final int chunkSize = chunkSize();
      if (version >= VERSION_BIG_CHUNKS && chunkSize >= 2 * CompressingStoredFieldsReader.this.chunkSize) {
        // big chunk: it was sliced into blocks at write time, so decompress
        // block by block and accumulate the result into bytes
        bytes.offset = bytes.length = 0;
        for (int decompressed = 0; decompressed < chunkSize; ) {
          final int toDecompress = Math.min(chunkSize - decompressed, CompressingStoredFieldsReader.this.chunkSize);
          decompressor.decompress(fieldsStream, toDecompress, 0, toDecompress, spare);
          bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length + spare.length);
          System.arraycopy(spare.bytes, spare.offset, bytes.bytes, bytes.length, spare.length);
          bytes.length += spare.length;
          decompressed += toDecompress;
        }
      } else {
        decompressor.decompress(fieldsStream, chunkSize, 0, chunkSize, bytes);
      }
      if (bytes.length != chunkSize) {
        throw new CorruptIndexException("Corrupted: expected chunk size = " + chunkSize()
            + ", got " + bytes.length + " (resource=" + fieldsStream + ")");
      }
    }

    /**
     * Copy compressed data.
     */
    void copyCompressedData(DataOutput out) throws IOException {
      final long chunkEnd = docBase + chunkDocs == numDocs
          ? fieldsStream.length()
          : indexReader.getStartPointer(docBase + chunkDocs);
      out.copyBytes(fieldsStream, chunkEnd - fieldsStream.getFilePointer());
    }

  }

  @Override
  public long ramBytesUsed() {
    return indexReader.ramBytesUsed();
  }

}