/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.codecs.lucene54; import java.io.IOException; import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.codecs.lucene70.Lucene70DocValuesFormat; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.util.SmallFloat; import org.apache.lucene.util.packed.DirectWriter; /** * Lucene 5.4 DocValues format. * <p> * Encodes the five per-document value types (Numeric,Binary,Sorted,SortedSet,SortedNumeric) with these strategies: * <p> * {@link DocValuesType#NUMERIC NUMERIC}: * <ul> * <li>Delta-compressed: per-document integers written as deltas from the minimum value, * compressed with bitpacking. For more information, see {@link DirectWriter}. * <li>Table-compressed: when the number of unique values is very small (< 256), and * when there are unused "gaps" in the range of values used (such as {@link SmallFloat}), * a lookup table is written instead. Each per-document entry is instead the ordinal * to this table, and those ordinals are compressed with bitpacking ({@link DirectWriter}). * <li>GCD-compressed: when all numbers share a common divisor, such as dates, the greatest * common denominator (GCD) is computed, and quotients are stored using Delta-compressed Numerics. * <li>Monotonic-compressed: when all numbers are monotonically increasing offsets, they are written * as blocks of bitpacked integers, encoding the deviation from the expected delta. * <li>Const-compressed: when there is only one possible non-missing value, only the missing * bitset is encoded. * <li>Sparse-compressed: only documents with a value are stored, and lookups are performed * using binary search. * </ul> * <p> * {@link DocValuesType#BINARY BINARY}: * <ul> * <li>Fixed-width Binary: one large concatenated byte[] is written, along with the fixed length. * Each document's value can be addressed directly with multiplication ({@code docID * length}). * <li>Variable-width Binary: one large concatenated byte[] is written, along with end addresses * for each document. The addresses are written as Monotonic-compressed numerics. * <li>Prefix-compressed Binary: values are written in chunks of 16, with the first value written * completely and other values sharing prefixes. chunk addresses are written as Monotonic-compressed * numerics. A reverse lookup index is written from a portion of every 1024th term. * </ul> * <p> * {@link DocValuesType#SORTED SORTED}: * <ul> * <li>Sorted: a mapping of ordinals to deduplicated terms is written as Binary, * along with the per-document ordinals written using one of the numeric strategies above. * </ul> * <p> * {@link DocValuesType#SORTED_SET SORTED_SET}: * <ul> * <li>Single: if all documents have 0 or 1 value, then data are written like SORTED. * <li>SortedSet table: when there are few unique sets of values (< 256) then each set is assigned * an id, a lookup table is written and the mapping from document to set id is written using the * numeric strategies above. * <li>SortedSet: a mapping of ordinals to deduplicated terms is written as Binary, * an ordinal list and per-document index into this list are written using the numeric strategies * above. * </ul> * <p> * {@link DocValuesType#SORTED_NUMERIC SORTED_NUMERIC}: * <ul> * <li>Single: if all documents have 0 or 1 value, then data are written like NUMERIC. * <li>SortedSet table: when there are few unique sets of values (< 256) then each set is assigned * an id, a lookup table is written and the mapping from document to set id is written using the * numeric strategies above. * <li>SortedNumeric: a value list and per-document index into this list are written using the numeric * strategies above. * </ul> * <p> * Files: * <ol> * <li><tt>.dvd</tt>: DocValues data</li> * <li><tt>.dvm</tt>: DocValues metadata</li> * </ol> * @lucene.experimental * @deprecated Use {@link Lucene70DocValuesFormat}. */ @Deprecated public final class Lucene54DocValuesFormat extends DocValuesFormat { /** Sole Constructor */ public Lucene54DocValuesFormat() { super("Lucene54"); } @Override public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException { return new Lucene54DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); } @Override public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException { return new Lucene54DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION); } static final String DATA_CODEC = "Lucene54DocValuesData"; static final String DATA_EXTENSION = "dvd"; static final String META_CODEC = "Lucene54DocValuesMetadata"; static final String META_EXTENSION = "dvm"; static final int VERSION_START = 0; static final int VERSION_CURRENT = VERSION_START; // indicates docvalues type static final byte NUMERIC = 0; static final byte BINARY = 1; static final byte SORTED = 2; static final byte SORTED_SET = 3; static final byte SORTED_NUMERIC = 4; // address terms in blocks of 16 terms static final int INTERVAL_SHIFT = 4; static final int INTERVAL_COUNT = 1 << INTERVAL_SHIFT; static final int INTERVAL_MASK = INTERVAL_COUNT - 1; // build reverse index from every 1024th term static final int REVERSE_INTERVAL_SHIFT = 10; static final int REVERSE_INTERVAL_COUNT = 1 << REVERSE_INTERVAL_SHIFT; static final int REVERSE_INTERVAL_MASK = REVERSE_INTERVAL_COUNT - 1; // for conversion from reverse index to block static final int BLOCK_INTERVAL_SHIFT = REVERSE_INTERVAL_SHIFT - INTERVAL_SHIFT; static final int BLOCK_INTERVAL_COUNT = 1 << BLOCK_INTERVAL_SHIFT; static final int BLOCK_INTERVAL_MASK = BLOCK_INTERVAL_COUNT - 1; /** Compressed using packed blocks of ints. */ static final int DELTA_COMPRESSED = 0; /** Compressed by computing the GCD. */ static final int GCD_COMPRESSED = 1; /** Compressed by giving IDs to unique values. */ static final int TABLE_COMPRESSED = 2; /** Compressed with monotonically increasing values */ static final int MONOTONIC_COMPRESSED = 3; /** Compressed with constant value (uses only missing bitset) */ static final int CONST_COMPRESSED = 4; /** Compressed with sparse arrays. */ static final int SPARSE_COMPRESSED = 5; /** Uncompressed binary, written directly (fixed length). */ static final int BINARY_FIXED_UNCOMPRESSED = 0; /** Uncompressed binary, written directly (variable length). */ static final int BINARY_VARIABLE_UNCOMPRESSED = 1; /** Compressed binary with shared prefixes */ static final int BINARY_PREFIX_COMPRESSED = 2; /** Standard storage for sorted set values with 1 level of indirection: * {@code docId -> address -> ord}. */ static final int SORTED_WITH_ADDRESSES = 0; /** Single-valued sorted set values, encoded as sorted values, so no level * of indirection: {@code docId -> ord}. */ static final int SORTED_SINGLE_VALUED = 1; /** Compressed giving IDs to unique sets of values: * {@code docId -> setId -> ords} */ static final int SORTED_SET_TABLE = 2; /** placeholder for missing offset that means there are no missing values */ static final int ALL_LIVE = -1; /** placeholder for missing offset that means all values are missing */ static final int ALL_MISSING = -2; // addressing uses 16k blocks static final int MONOTONIC_BLOCK_SIZE = 16384; static final int DIRECT_MONOTONIC_BLOCK_SHIFT = 16; }