/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.compressing;

import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.BYTE_ARR;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.CODEC_SFX_DAT;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.CODEC_SFX_IDX;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.DAY;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.DAY_ENCODING;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.FIELDS_EXTENSION;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.FIELDS_INDEX_EXTENSION;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.HOUR;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.HOUR_ENCODING;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_DOUBLE;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_FLOAT;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_INT;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.NUMERIC_LONG;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.SECOND;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.SECOND_ENCODING;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.STRING;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.TYPE_BITS;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.TYPE_MASK;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_CURRENT;
import static org.apache.lucene.codecs.compressing.CompressingStoredFieldsWriter.VERSION_START;

import java.io.EOFException;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.packed.PackedInts;

/**
 * {@link StoredFieldsReader} impl for {@link CompressingStoredFieldsFormat}.
 * @lucene.experimental
 */
public final class CompressingStoredFieldsReader extends StoredFieldsReader {

  private final int version;
  private final FieldInfos fieldInfos;
  private final CompressingStoredFieldsIndexReader indexReader;
  private final long maxPointer;
  private final IndexInput fieldsStream;
  private final int chunkSize;
  private final int packedIntsVersion;
  private final CompressionMode compressionMode;
  private final Decompressor decompressor;
  private final int numDocs;
  private final boolean merging;
  private final BlockState state;
  private final long numChunks; // number of compressed blocks written
  private final long numDirtyChunks; // number of incomplete compressed blocks written
  private boolean closed;

  // used by clone
  private CompressingStoredFieldsReader(CompressingStoredFieldsReader reader, boolean merging) {
    this.version = reader.version;
    this.fieldInfos = reader.fieldInfos;
    this.fieldsStream = reader.fieldsStream.clone();
    this.indexReader = reader.indexReader.clone();
    this.maxPointer = reader.maxPointer;
    this.chunkSize = reader.chunkSize;
    this.packedIntsVersion = reader.packedIntsVersion;
    this.compressionMode = reader.compressionMode;
    this.decompressor = reader.decompressor.clone();
    this.numDocs = reader.numDocs;
    this.numChunks = reader.numChunks;
    this.numDirtyChunks = reader.numDirtyChunks;
    this.merging = merging;
    this.state = new BlockState();
    this.closed = false;
  }
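
  // On-disk layout read by the sole constructor below (a descriptive sketch inferred
  // from the reads performed in this class; the authoritative format is defined by
  // CompressingStoredFieldsWriter):
  //  - index file (FIELDS_INDEX_EXTENSION): codec header, the chunk index that maps
  //    doc IDs to start pointers in the data file, maxPointer, checksum footer.
  //  - data file (FIELDS_EXTENSION): codec header, chunkSize, packedIntsVersion, the
  //    compressed chunks, then (at maxPointer) numChunks and numDirtyChunks, footer.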

  /** Sole constructor. */
  public CompressingStoredFieldsReader(Directory d, SegmentInfo si, String segmentSuffix, FieldInfos fn,
      IOContext context, String formatName, CompressionMode compressionMode) throws IOException {
    this.compressionMode = compressionMode;
    final String segment = si.name;
    boolean success = false;
    fieldInfos = fn;
    numDocs = si.maxDoc();

    int version = -1;
    long maxPointer = -1;
    CompressingStoredFieldsIndexReader indexReader = null;

    // Load the index into memory
    final String indexName = IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_INDEX_EXTENSION);
    try (ChecksumIndexInput indexStream = d.openChecksumInput(indexName, context)) {
      Throwable priorE = null;
      try {
        final String codecNameIdx = formatName + CODEC_SFX_IDX;
        version = CodecUtil.checkIndexHeader(indexStream, codecNameIdx, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
        assert CodecUtil.indexHeaderLength(codecNameIdx, segmentSuffix) == indexStream.getFilePointer();
        indexReader = new CompressingStoredFieldsIndexReader(indexStream, si);
        maxPointer = indexStream.readVLong();
      } catch (Throwable exception) {
        priorE = exception;
      } finally {
        CodecUtil.checkFooter(indexStream, priorE);
      }
    }

    this.version = version;
    this.maxPointer = maxPointer;
    this.indexReader = indexReader;

    final String fieldsStreamFN = IndexFileNames.segmentFileName(segment, segmentSuffix, FIELDS_EXTENSION);
    try {
      // Open the data file and read metadata
      fieldsStream = d.openInput(fieldsStreamFN, context);
      final String codecNameDat = formatName + CODEC_SFX_DAT;
      final int fieldsVersion = CodecUtil.checkIndexHeader(fieldsStream, codecNameDat, VERSION_START, VERSION_CURRENT, si.getId(), segmentSuffix);
      if (version != fieldsVersion) {
        throw new CorruptIndexException("Version mismatch between stored fields index and data: " + version + " != " + fieldsVersion, fieldsStream);
      }
      assert CodecUtil.indexHeaderLength(codecNameDat, segmentSuffix) == fieldsStream.getFilePointer();

      chunkSize = fieldsStream.readVInt();
      packedIntsVersion = fieldsStream.readVInt();
      decompressor = compressionMode.newDecompressor();
      this.merging = false;
      this.state = new BlockState();

      fieldsStream.seek(maxPointer);
      numChunks = fieldsStream.readVLong();
      numDirtyChunks = fieldsStream.readVLong();
      if (numDirtyChunks > numChunks) {
        throw new CorruptIndexException("invalid chunk counts: dirty=" + numDirtyChunks + ", total=" + numChunks, fieldsStream);
      }

      // NOTE: data file is too costly to verify checksum against all the bytes on open,
      // but for now we at least verify proper structure of the checksum footer: which looks
      // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
      // such as file truncation.
      CodecUtil.retrieveChecksum(fieldsStream);

      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(this);
      }
    }
  }

  /**
   * @throws AlreadyClosedException if this FieldsReader is closed
   */
  private void ensureOpen() throws AlreadyClosedException {
    if (closed) {
      throw new AlreadyClosedException("this FieldsReader is closed");
    }
  }

  /** Close the underlying {@link IndexInput}s. */
  @Override
  public void close() throws IOException {
    if (!closed) {
      IOUtils.close(fieldsStream);
      closed = true;
    }
  }

  private static void readField(DataInput in, StoredFieldVisitor visitor, FieldInfo info, int bits) throws IOException {
    switch (bits & TYPE_MASK) {
      case BYTE_ARR:
        int length = in.readVInt();
        byte[] data = new byte[length];
        in.readBytes(data, 0, length);
        visitor.binaryField(info, data);
        break;
      case STRING:
        length = in.readVInt();
        data = new byte[length];
        in.readBytes(data, 0, length);
        visitor.stringField(info, data);
        break;
      case NUMERIC_INT:
        visitor.intField(info, in.readZInt());
        break;
      case NUMERIC_FLOAT:
        visitor.floatField(info, readZFloat(in));
        break;
      case NUMERIC_LONG:
        visitor.longField(info, readTLong(in));
        break;
      case NUMERIC_DOUBLE:
        visitor.doubleField(info, readZDouble(in));
        break;
      default:
        throw new AssertionError("Unknown type flag: " + Integer.toHexString(bits));
    }
  }

  private static void skipField(DataInput in, int bits) throws IOException {
    switch (bits & TYPE_MASK) {
      case BYTE_ARR:
      case STRING:
        final int length = in.readVInt();
        in.skipBytes(length);
        break;
      case NUMERIC_INT:
        in.readZInt();
        break;
      case NUMERIC_FLOAT:
        readZFloat(in);
        break;
      case NUMERIC_LONG:
        readTLong(in);
        break;
      case NUMERIC_DOUBLE:
        readZDouble(in);
        break;
      default:
        throw new AssertionError("Unknown type flag: " + Integer.toHexString(bits));
    }
  }

  /**
   * Reads a float in a variable-length format. Reads between one and
   * five bytes. Small integral values typically take fewer bytes.
   */
  static float readZFloat(DataInput in) throws IOException {
    int b = in.readByte() & 0xFF;
    if (b == 0xFF) {
      // negative value
      return Float.intBitsToFloat(in.readInt());
    } else if ((b & 0x80) != 0) {
      // small integer [-1..125]
      return (b & 0x7f) - 1;
    } else {
      // positive float
      int bits = b << 24 | ((in.readShort() & 0xFFFF) << 8) | (in.readByte() & 0xFF);
      return Float.intBitsToFloat(bits);
    }
  }

  /**
   * Reads a double in a variable-length format. Reads between one and
   * nine bytes. Small integral values typically take fewer bytes.
   */
  static double readZDouble(DataInput in) throws IOException {
    int b = in.readByte() & 0xFF;
    if (b == 0xFF) {
      // negative value
      return Double.longBitsToDouble(in.readLong());
    } else if (b == 0xFE) {
      // float
      return Float.intBitsToFloat(in.readInt());
    } else if ((b & 0x80) != 0) {
      // small integer [-1..124]
      return (b & 0x7f) - 1;
    } else {
      // positive double
      long bits = ((long) b) << 56 | ((in.readInt() & 0xFFFFFFFFL) << 24) | ((in.readShort() & 0xFFFFL) << 8) | (in.readByte() & 0xFFL);
      return Double.longBitsToDouble(bits);
    }
  }

  /**
   * Reads a long in a variable-length format. Reads between one and
   * nine bytes. Small values typically take fewer bytes.
   */
  static long readTLong(DataInput in) throws IOException {
    int header = in.readByte() & 0xFF;
    long bits = header & 0x1F;
    if ((header & 0x20) != 0) {
      // continuation bit
      bits |= in.readVLong() << 5;
    }
    long l = BitUtil.zigZagDecode(bits);
    switch (header & DAY_ENCODING) {
      case SECOND_ENCODING:
        l *= SECOND;
        break;
      case HOUR_ENCODING:
        l *= HOUR;
        break;
      case DAY_ENCODING:
        l *= DAY;
        break;
      case 0:
        // uncompressed
        break;
      default:
        throw new AssertionError();
    }
    return l;
  }
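
  // Wire formats of the three variable-length numeric encodings above (a descriptive
  // sketch inferred from the readers; the encoders live in CompressingStoredFieldsWriter):
  //  - ZFloat:  a single byte with the high bit set encodes a small integer in [-1..125]
  //             (e.g. 0x80 decodes to -1f, 0x81 to 0f); 0xFF is followed by 4 raw bytes
  //             of float bits (negative values); otherwise the byte is the first of 4
  //             big-endian bytes of float bits.
  //  - ZDouble: same idea, with 0xFF introducing 8 raw bytes of double bits and 0xFE
  //             introducing a float-precision value.
  //  - TLong:   a header byte carries the low 5 bits of a zig-zag encoded value, a
  //             continuation bit (0x20) that pulls in a trailing vLong, and two bits
  //             selecting a SECOND/HOUR/DAY multiplier for timestamp-like values.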

  /**
   * A serialized document; you need to decode its input in order to get an actual
   * {@link Document}.
   */
  static class SerializedDocument {

    // the serialized data
    final DataInput in;

    // the number of bytes on which the document is encoded
    final int length;

    // the number of stored fields
    final int numStoredFields;

    private SerializedDocument(DataInput in, int length, int numStoredFields) {
      this.in = in;
      this.length = length;
      this.numStoredFields = numStoredFields;
    }

  }

  /**
   * Keeps state about the current block of documents.
   */
  private class BlockState {

    private int docBase, chunkDocs;

    // whether the block has been sliced, this happens for large documents
    private boolean sliced;

    private int[] offsets = IntsRef.EMPTY_INTS;
    private int[] numStoredFields = IntsRef.EMPTY_INTS;

    // the start pointer at which you can read the compressed documents
    private long startPointer;

    private final BytesRef spare = new BytesRef();
    private final BytesRef bytes = new BytesRef();

    boolean contains(int docID) {
      return docID >= docBase && docID < docBase + chunkDocs;
    }

    /**
     * Reset this block so that it stores state for the block
     * that contains the given doc id.
     */
    void reset(int docID) throws IOException {
      boolean success = false;
      try {
        doReset(docID);
        success = true;
      } finally {
        if (success == false) {
          // if the read failed, set chunkDocs to 0 so that it does not
          // contain any docs anymore and is not reused. This should help
          // get consistent exceptions when trying to get several
          // documents which are in the same corrupted block since it will
          // force the header to be decoded again
          chunkDocs = 0;
        }
      }
    }

    private void doReset(int docID) throws IOException {
      docBase = fieldsStream.readVInt();
      final int token = fieldsStream.readVInt();
      chunkDocs = token >>> 1;
      if (contains(docID) == false || docBase + chunkDocs > numDocs) {
        throw new CorruptIndexException("Corrupted: docID=" + docID + ", docBase=" + docBase + ", chunkDocs=" + chunkDocs + ", numDocs=" + numDocs, fieldsStream);
      }

      sliced = (token & 1) != 0;

      offsets = ArrayUtil.grow(offsets, chunkDocs + 1);
      numStoredFields = ArrayUtil.grow(numStoredFields, chunkDocs);

      if (chunkDocs == 1) {
        numStoredFields[0] = fieldsStream.readVInt();
        offsets[1] = fieldsStream.readVInt();
      } else {
        // Number of stored fields per document
        final int bitsPerStoredFields = fieldsStream.readVInt();
        if (bitsPerStoredFields == 0) {
          Arrays.fill(numStoredFields, 0, chunkDocs, fieldsStream.readVInt());
        } else if (bitsPerStoredFields > 31) {
          throw new CorruptIndexException("bitsPerStoredFields=" + bitsPerStoredFields, fieldsStream);
        } else {
          final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerStoredFields, 1);
          for (int i = 0; i < chunkDocs; ++i) {
            numStoredFields[i] = (int) it.next();
          }
        }

        // The stream encodes the length of each document and we decode
        // it into a list of monotonically increasing offsets
        final int bitsPerLength = fieldsStream.readVInt();
        if (bitsPerLength == 0) {
          final int length = fieldsStream.readVInt();
          for (int i = 0; i < chunkDocs; ++i) {
            offsets[1 + i] = (1 + i) * length;
          }
        } else if (bitsPerLength > 31) {
          throw new CorruptIndexException("bitsPerLength=" + bitsPerLength, fieldsStream);
        } else {
          final PackedInts.ReaderIterator it = PackedInts.getReaderIteratorNoHeader(fieldsStream, PackedInts.Format.PACKED, packedIntsVersion, chunkDocs, bitsPerLength, 1);
          for (int i = 0; i < chunkDocs; ++i) {
            offsets[i + 1] = (int) it.next();
          }
          for (int i = 0; i < chunkDocs; ++i) {
            offsets[i + 1] += offsets[i];
          }
        }

        // Additional validation: only the empty document has a serialized length of 0
        for (int i = 0; i < chunkDocs; ++i) {
          final int len = offsets[i + 1] - offsets[i];
          final int storedFields = numStoredFields[i];
          if ((len == 0) != (storedFields == 0)) {
            throw new CorruptIndexException("length=" + len + ", numStoredFields=" + storedFields, fieldsStream);
          }
        }

      }

      startPointer = fieldsStream.getFilePointer();

      if (merging) {
        final int totalLength = offsets[chunkDocs];
        // decompress eagerly
        if (sliced) {
          bytes.offset = bytes.length = 0;
          for (int decompressed = 0; decompressed < totalLength; ) {
            final int toDecompress = Math.min(totalLength - decompressed, chunkSize);
            decompressor.decompress(fieldsStream, toDecompress, 0, toDecompress, spare);
            bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length + spare.length);
            System.arraycopy(spare.bytes, spare.offset, bytes.bytes, bytes.length, spare.length);
            bytes.length += spare.length;
            decompressed += toDecompress;
          }
        } else {
          decompressor.decompress(fieldsStream, totalLength, 0, totalLength, bytes);
        }
        if (bytes.length != totalLength) {
          throw new CorruptIndexException("Corrupted: expected chunk size = " + totalLength + ", got " + bytes.length, fieldsStream);
        }
      }
    }

    /**
     * Get the serialized representation of the given docID. This docID has
     * to be contained in the current block.
     */
    SerializedDocument document(int docID) throws IOException {
      if (contains(docID) == false) {
        throw new IllegalArgumentException();
      }

      final int index = docID - docBase;
      final int offset = offsets[index];
      final int length = offsets[index + 1] - offset;
      final int totalLength = offsets[chunkDocs];
      final int numStoredFields = this.numStoredFields[index];

      final DataInput documentInput;
      if (length == 0) {
        // empty
        documentInput = new ByteArrayDataInput();
      } else if (merging) {
        // already decompressed
        documentInput = new ByteArrayDataInput(bytes.bytes, bytes.offset + offset, length);
      } else if (sliced) {
        fieldsStream.seek(startPointer);
        decompressor.decompress(fieldsStream, chunkSize, offset, Math.min(length, chunkSize - offset), bytes);
        documentInput = new DataInput() {

          int decompressed = bytes.length;

          void fillBuffer() throws IOException {
            assert decompressed <= length;
            if (decompressed == length) {
              throw new EOFException();
            }
            final int toDecompress = Math.min(length - decompressed, chunkSize);
            decompressor.decompress(fieldsStream, toDecompress, 0, toDecompress, bytes);
            decompressed += toDecompress;
          }

          @Override
          public byte readByte() throws IOException {
            if (bytes.length == 0) {
              fillBuffer();
            }
            --bytes.length;
            return bytes.bytes[bytes.offset++];
          }

          @Override
          public void readBytes(byte[] b, int offset, int len) throws IOException {
            while (len > bytes.length) {
              System.arraycopy(bytes.bytes, bytes.offset, b, offset, bytes.length);
              len -= bytes.length;
              offset += bytes.length;
              fillBuffer();
            }
            System.arraycopy(bytes.bytes, bytes.offset, b, offset, len);
            bytes.offset += len;
            bytes.length -= len;
          }

        };
      } else {
        fieldsStream.seek(startPointer);
        decompressor.decompress(fieldsStream, totalLength, offset, length, bytes);
        assert bytes.length == length;
        documentInput = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length);
      }

      return new SerializedDocument(documentInput, length, numStoredFields);
    }

  }

  SerializedDocument document(int docID) throws IOException {
    if (state.contains(docID) == false) {
      fieldsStream.seek(indexReader.getStartPointer(docID));
      state.reset(docID);
    }
    assert state.contains(docID);
    return state.document(docID);
  }
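
  // Typical access pattern for this reader (a usage sketch; DocumentStoredFieldVisitor
  // is assumed from org.apache.lucene.document and accumulates every stored field of
  // the visited document into a Document):
  //
  //   DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor();
  //   reader.visitDocument(docID, visitor);
  //   Document doc = visitor.getDocument();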

  @Override
  public void visitDocument(int docID, StoredFieldVisitor visitor) throws IOException {

    final SerializedDocument doc = document(docID);

    for (int fieldIDX = 0; fieldIDX < doc.numStoredFields; fieldIDX++) {
      final long infoAndBits = doc.in.readVLong();
      final int fieldNumber = (int) (infoAndBits >>> TYPE_BITS);
      final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);

      final int bits = (int) (infoAndBits & TYPE_MASK);
      assert bits <= NUMERIC_DOUBLE : "bits=" + Integer.toHexString(bits);

      switch (visitor.needsField(fieldInfo)) {
        case YES:
          readField(doc.in, visitor, fieldInfo, bits);
          break;
        case NO:
          if (fieldIDX == doc.numStoredFields - 1) {
            // don't skipField on last field value; treat like STOP
            return;
          }
          skipField(doc.in, bits);
          break;
        case STOP:
          return;
      }
    }
  }

  @Override
  public StoredFieldsReader clone() {
    ensureOpen();
    return new CompressingStoredFieldsReader(this, false);
  }

  @Override
  public StoredFieldsReader getMergeInstance() {
    ensureOpen();
    return new CompressingStoredFieldsReader(this, true);
  }

  int getVersion() {
    return version;
  }

  CompressionMode getCompressionMode() {
    return compressionMode;
  }

  CompressingStoredFieldsIndexReader getIndexReader() {
    return indexReader;
  }

  long getMaxPointer() {
    return maxPointer;
  }

  IndexInput getFieldsStream() {
    return fieldsStream;
  }

  int getChunkSize() {
    return chunkSize;
  }

  long getNumChunks() {
    return numChunks;
  }

  long getNumDirtyChunks() {
    return numDirtyChunks;
  }

  int getPackedIntsVersion() {
    return packedIntsVersion;
  }

  @Override
  public long ramBytesUsed() {
    return indexReader.ramBytesUsed();
  }

  @Override
  public Collection<Accountable> getChildResources() {
    return Collections.singleton(Accountables.namedAccountable("stored field index", indexReader));
  }

  @Override
  public void checkIntegrity() throws IOException {
    CodecUtil.checksumEntireFile(fieldsStream);
  }

  @Override
  public String toString() {
    return getClass().getSimpleName() + "(mode=" + compressionMode + ",chunksize=" + chunkSize + ")";
  }

}