package org.apache.lucene.index; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Closeable; import java.io.IOException; import java.util.Arrays; import java.util.Comparator; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.document.ByteDocValuesField; // javadocs import org.apache.lucene.document.DerefBytesDocValuesField; // javadocs import org.apache.lucene.document.DoubleDocValuesField; // javadocs import org.apache.lucene.document.Field; // javadocs import org.apache.lucene.document.FloatDocValuesField; // javadocs import org.apache.lucene.document.IntDocValuesField; // javadocs import org.apache.lucene.document.LongDocValuesField; // javadocs import org.apache.lucene.document.PackedLongDocValuesField; // javadocs import org.apache.lucene.document.ShortDocValuesField; // javadocs import org.apache.lucene.document.SortedBytesDocValuesField; // javadocs import org.apache.lucene.document.StraightBytesDocValuesField; // javadocs import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.packed.PackedInts; /** * {@link DocValues} provides a dense per-document typed storage for fast * value access based on the lucene internal document id. {@link DocValues} * exposes two distinct APIs: * <ul> * <li>via {@link #getSource()} providing RAM resident random access</li> * <li>via {@link #getDirectSource()} providing on disk random access</li> * </ul> {@link DocValues} are exposed via * {@link AtomicReader#docValues(String)} on a per-segment basis. For best * performance {@link DocValues} should be consumed per-segment just like * IndexReader. * <p> * {@link DocValues} are fully integrated into the {@link DocValuesFormat} API. * <p> * NOTE: DocValues is a strongly typed per-field API. Type changes within an * indexing session can result in exceptions if the type has changed in a way that * the previously give type for a field can't promote the value without losing * information. For instance a field initially indexed with {@link Type#FIXED_INTS_32} * can promote a value with {@link Type#FIXED_INTS_8} but can't promote * {@link Type#FIXED_INTS_64}. During segment merging type-promotion exceptions are suppressed. * Fields will be promoted to their common denominator or automatically transformed * into a 3rd type like {@link Type#BYTES_VAR_STRAIGHT} to prevent data loss and merge exceptions. * This behavior is considered <i>best-effort</i> might change in future releases. * </p> * <p> * DocValues are exposed via the {@link Field} API with type safe * specializations for each type variant: * <ul> * <li> {@link ByteDocValuesField} - for adding byte values to the index</li> * <li> {@link ShortDocValuesField} - for adding short values to the index</li> * <li> {@link IntDocValuesField} - for adding int values to the index</li> * <li> {@link LongDocValuesField} - for adding long values to the index</li> * <li> {@link FloatDocValuesField} - for adding float values to the index</li> * <li> {@link DoubleDocValuesField} - for adding double values to the index</li> * <li> {@link PackedLongDocValuesField} - for adding packed long values to the * index</li> * <li> {@link SortedBytesDocValuesField} - for adding sorted {@link BytesRef} * values to the index</li> * <li> {@link StraightBytesDocValuesField} - for adding straight * {@link BytesRef} values to the index</li> * <li> {@link DerefBytesDocValuesField} - for adding deref {@link BytesRef} * values to the index</li> * </ul> * See {@link Type} for limitations of each type variant. * <p> * <p> * * @see DocValuesFormat#docsConsumer(org.apache.lucene.index.PerDocWriteState) * * @lucene.experimental */ public abstract class DocValues implements Closeable { /** Zero length DocValues array. */ public static final DocValues[] EMPTY_ARRAY = new DocValues[0]; private volatile SourceCache cache = new SourceCache.DirectSourceCache(); private final Object cacheLock = new Object(); /** Sole constructor. (For invocation by subclass * constructors, typically implicit.) */ protected DocValues() { } /** * Loads a new {@link Source} instance for this {@link DocValues} field * instance. Source instances returned from this method are not cached. It is * the callers responsibility to maintain the instance and release its * resources once the source is not needed anymore. * <p> * For managed {@link Source} instances see {@link #getSource()}. * * @see #getSource() * @see #setCache(SourceCache) */ public abstract Source load() throws IOException; /** * Returns a {@link Source} instance through the current {@link SourceCache}. * Iff no {@link Source} has been loaded into the cache so far the source will * be loaded through {@link #load()} and passed to the {@link SourceCache}. * The caller of this method should not close the obtained {@link Source} * instance unless it is not needed for the rest of its life time. * <p> * {@link Source} instances obtained from this method are closed / released * from the cache once this {@link DocValues} instance is closed by the * {@link IndexReader}, {@link Fields} or the * {@link DocValues} was created from. */ public Source getSource() throws IOException { return cache.load(this); } /** * Returns a disk resident {@link Source} instance. Direct Sources are not * cached in the {@link SourceCache} and should not be shared between threads. */ public abstract Source getDirectSource() throws IOException; /** * Returns the {@link Type} of this {@link DocValues} instance */ public abstract Type getType(); /** * Closes this {@link DocValues} instance. This method should only be called * by the creator of this {@link DocValues} instance. API users should not * close {@link DocValues} instances. */ public void close() throws IOException { cache.close(this); } /** * Returns the size per value in bytes or <code>-1</code> iff size per value * is variable. * * @return the size per value in bytes or <code>-1</code> iff size per value * is variable. */ public int getValueSize() { return -1; } /** * Sets the {@link SourceCache} used by this {@link DocValues} instance. This * method should be called before {@link #load()} is called. All {@link Source} instances in the currently used cache will be closed * before the new cache is installed. * <p> * Note: All instances previously obtained from {@link #load()} will be lost. * * @throws IllegalArgumentException * if the given cache is <code>null</code> * */ public void setCache(SourceCache cache) { if (cache == null) throw new IllegalArgumentException("cache must not be null"); synchronized (cacheLock) { SourceCache toClose = this.cache; this.cache = cache; toClose.close(this); } } /** * Source of per document values like long, double or {@link BytesRef} * depending on the {@link DocValues} fields {@link Type}. Source * implementations provide random access semantics similar to array lookups * <p> * @see DocValues#getSource() * @see DocValues#getDirectSource() */ public static abstract class Source { /** {@link Type} of this {@code Source}. */ protected final Type type; /** Sole constructor. (For invocation by subclass * constructors, typically implicit.) */ protected Source(Type type) { this.type = type; } /** * Returns a <tt>long</tt> for the given document id or throws an * {@link UnsupportedOperationException} if this source doesn't support * <tt>long</tt> values. * * @throws UnsupportedOperationException * if this source doesn't support <tt>long</tt> values. */ public long getInt(int docID) { throw new UnsupportedOperationException("ints are not supported"); } /** * Returns a <tt>double</tt> for the given document id or throws an * {@link UnsupportedOperationException} if this source doesn't support * <tt>double</tt> values. * * @throws UnsupportedOperationException * if this source doesn't support <tt>double</tt> values. */ public double getFloat(int docID) { throw new UnsupportedOperationException("floats are not supported"); } /** * Returns a {@link BytesRef} for the given document id or throws an * {@link UnsupportedOperationException} if this source doesn't support * <tt>byte[]</tt> values. * * @throws UnsupportedOperationException * if this source doesn't support <tt>byte[]</tt> values. */ public BytesRef getBytes(int docID, BytesRef ref) { throw new UnsupportedOperationException("bytes are not supported"); } /** * Returns the {@link Type} of this source. * * @return the {@link Type} of this source. */ public Type getType() { return type; } /** * Returns <code>true</code> iff this {@link Source} exposes an array via * {@link #getArray()} otherwise <code>false</code>. * * @return <code>true</code> iff this {@link Source} exposes an array via * {@link #getArray()} otherwise <code>false</code>. */ public boolean hasArray() { return false; } /** * Returns the internal array representation iff this {@link Source} uses an * array as its inner representation, otherwise <code>UOE</code>. */ public Object getArray() { throw new UnsupportedOperationException("getArray is not supported"); } /** * If this {@link Source} is sorted this method will return an instance of * {@link SortedSource} otherwise <code>UOE</code> */ public SortedSource asSortedSource() { throw new UnsupportedOperationException("asSortedSource is not supported"); } } /** * A sorted variant of {@link Source} for <tt>byte[]</tt> values per document. * <p> */ public static abstract class SortedSource extends Source { private final Comparator<BytesRef> comparator; /** Sole constructor. (For invocation by subclass * constructors, typically implicit.) */ protected SortedSource(Type type, Comparator<BytesRef> comparator) { super(type); this.comparator = comparator; } @Override public BytesRef getBytes(int docID, BytesRef bytesRef) { final int ord = ord(docID); if (ord < 0) { // Negative ord means doc was missing? bytesRef.length = 0; } else { getByOrd(ord, bytesRef); } return bytesRef; } /** * Returns ord for specified docID. Ord is dense, ie, starts at 0, then increments by 1 * for the next (as defined by {@link Comparator} value. */ public abstract int ord(int docID); /** Returns value for specified ord. */ public abstract BytesRef getByOrd(int ord, BytesRef result); /** Return true if it's safe to call {@link * #getDocToOrd}. */ public boolean hasPackedDocToOrd() { return false; } /** * Returns the PackedInts.Reader impl that maps document to ord. */ public abstract PackedInts.Reader getDocToOrd(); /** * Returns the comparator used to order the BytesRefs. */ public Comparator<BytesRef> getComparator() { return comparator; } /** * Lookup ord by value. * * @param value * the value to look up * @param spare * a spare {@link BytesRef} instance used to compare internal * values to the given value. Must not be <code>null</code> * @return the given values ordinal if found or otherwise * <code>(-(ord)-1)</code>, defined as the ordinal of the first * element that is greater than the given value (the insertion * point). This guarantees that the return value will always be * >= 0 if the given value is found. */ public int getOrdByValue(BytesRef value, BytesRef spare) { return binarySearch(value, spare, 0, getValueCount() - 1); } private int binarySearch(BytesRef b, BytesRef bytesRef, int low, int high) { int mid = 0; while (low <= high) { mid = (low + high) >>> 1; getByOrd(mid, bytesRef); final int cmp = comparator.compare(bytesRef, b); if (cmp < 0) { low = mid + 1; } else if (cmp > 0) { high = mid - 1; } else { return mid; } } assert comparator.compare(bytesRef, b) != 0; return -(low + 1); } @Override public SortedSource asSortedSource() { return this; } /** * Returns the number of unique values in this sorted source */ public abstract int getValueCount(); } /** Returns a Source that always returns default (missing) * values for all documents. */ public static Source getDefaultSource(final Type type) { return new Source(type) { @Override public long getInt(int docID) { return 0; } @Override public double getFloat(int docID) { return 0.0; } @Override public BytesRef getBytes(int docID, BytesRef ref) { ref.length = 0; return ref; } }; } /** Returns a SortedSource that always returns default (missing) * values for all documents. */ public static SortedSource getDefaultSortedSource(final Type type, final int size) { final PackedInts.Reader docToOrd = new PackedInts.Reader() { @Override public long get(int index) { return 0; } @Override public int getBitsPerValue() { return 0; } @Override public int size() { return size; } @Override public boolean hasArray() { return false; } @Override public Object getArray() { return null; } @Override public int get(int index, long[] arr, int off, int len) { len = Math.min(len, size() - index); Arrays.fill(arr, off, off+len, 0); return len; } @Override public long ramBytesUsed() { return 0; } }; return new SortedSource(type, BytesRef.getUTF8SortedAsUnicodeComparator()) { @Override public BytesRef getBytes(int docID, BytesRef ref) { ref.length = 0; return ref; } @Override public int ord(int docID) { return 0; } @Override public BytesRef getByOrd(int ord, BytesRef bytesRef) { assert ord == 0; bytesRef.length = 0; return bytesRef; } @Override public boolean hasPackedDocToOrd() { return true; } @Override public PackedInts.Reader getDocToOrd() { return docToOrd; } @Override public int getOrdByValue(BytesRef value, BytesRef spare) { if (value.length == 0) { return 0; } else { return -1; } } @Override public int getValueCount() { return 1; } }; } /** * <code>Type</code> specifies the {@link DocValues} type for a * certain field. A <code>Type</code> only defines the data type for a field * while the actual implementation used to encode and decode the values depends * on the the {@link DocValuesFormat#docsConsumer} and {@link DocValuesFormat#docsProducer} methods. * * @lucene.experimental */ public static enum Type { /** * A variable bit signed integer value. By default this type uses * {@link PackedInts} to compress the values, as an offset * from the minimum value, as long as the value range * fits into 2<sup>63</sup>-1. Otherwise, * the default implementation falls back to fixed size 64bit * integers ({@link #FIXED_INTS_64}). * <p> * NOTE: this type uses <tt>0</tt> as the default value without any * distinction between provided <tt>0</tt> values during indexing. All * documents without an explicit value will use <tt>0</tt> instead. * Custom default values must be assigned explicitly. * </p> */ VAR_INTS, /** * A 8 bit signed integer value. {@link Source} instances of * this type return a <tt>byte</tt> array from {@link Source#getArray()} * <p> * NOTE: this type uses <tt>0</tt> as the default value without any * distinction between provided <tt>0</tt> values during indexing. All * documents without an explicit value will use <tt>0</tt> instead. * Custom default values must be assigned explicitly. * </p> */ FIXED_INTS_8, /** * A 16 bit signed integer value. {@link Source} instances of * this type return a <tt>short</tt> array from {@link Source#getArray()} * <p> * NOTE: this type uses <tt>0</tt> as the default value without any * distinction between provided <tt>0</tt> values during indexing. All * documents without an explicit value will use <tt>0</tt> instead. * Custom default values must be assigned explicitly. * </p> */ FIXED_INTS_16, /** * A 32 bit signed integer value. {@link Source} instances of * this type return a <tt>int</tt> array from {@link Source#getArray()} * <p> * NOTE: this type uses <tt>0</tt> as the default value without any * distinction between provided <tt>0</tt> values during indexing. All * documents without an explicit value will use <tt>0</tt> instead. * Custom default values must be assigned explicitly. * </p> */ FIXED_INTS_32, /** * A 64 bit signed integer value. {@link Source} instances of * this type return a <tt>long</tt> array from {@link Source#getArray()} * <p> * NOTE: this type uses <tt>0</tt> as the default value without any * distinction between provided <tt>0</tt> values during indexing. All * documents without an explicit value will use <tt>0</tt> instead. * Custom default values must be assigned explicitly. * </p> */ FIXED_INTS_64, /** * A 32 bit floating point value. By default there is no compression * applied. To fit custom float values into less than 32bit either a custom * implementation is needed or values must be encoded into a * {@link #BYTES_FIXED_STRAIGHT} type. {@link Source} instances of * this type return a <tt>float</tt> array from {@link Source#getArray()} * <p> * NOTE: this type uses <tt>0.0f</tt> as the default value without any * distinction between provided <tt>0.0f</tt> values during indexing. All * documents without an explicit value will use <tt>0.0f</tt> instead. * Custom default values must be assigned explicitly. * </p> */ FLOAT_32, /** * * A 64 bit floating point value. By default there is no compression * applied. To fit custom float values into less than 64bit either a custom * implementation is needed or values must be encoded into a * {@link #BYTES_FIXED_STRAIGHT} type. {@link Source} instances of * this type return a <tt>double</tt> array from {@link Source#getArray()} * <p> * NOTE: this type uses <tt>0.0d</tt> as the default value without any * distinction between provided <tt>0.0d</tt> values during indexing. All * documents without an explicit value will use <tt>0.0d</tt> instead. * Custom default values must be assigned explicitly. * </p> */ FLOAT_64, // TODO(simonw): -- shouldn't lucene decide/detect straight vs // deref, as well fixed vs var? /** * A fixed length straight byte[]. All values added to * such a field must be of the same length. All bytes are stored sequentially * for fast offset access. * <p> * NOTE: this type uses <tt>0 byte</tt> filled byte[] based on the length of the first seen * value as the default value without any distinction between explicitly * provided values during indexing. All documents without an explicit value * will use the default instead.Custom default values must be assigned explicitly. * </p> */ BYTES_FIXED_STRAIGHT, /** * A fixed length dereferenced byte[] variant. Fields with * this type only store distinct byte values and store an additional offset * pointer per document to dereference the shared byte[]. * Use this type if your documents may share the same byte[]. * <p> * NOTE: Fields of this type will not store values for documents without an * explicitly provided value. If a documents value is accessed while no * explicit value is stored the returned {@link BytesRef} will be a 0-length * reference. Custom default values must be assigned explicitly. * </p> */ BYTES_FIXED_DEREF, /** * Variable length straight stored byte[] variant. All bytes are * stored sequentially for compactness. Usage of this type via the * disk-resident API might yield performance degradation since no additional * index is used to advance by more than one document value at a time. * <p> * NOTE: Fields of this type will not store values for documents without an * explicitly provided value. If a documents value is accessed while no * explicit value is stored the returned {@link BytesRef} will be a 0-length * byte[] reference. Custom default values must be assigned explicitly. * </p> */ BYTES_VAR_STRAIGHT, /** * A variable length dereferenced byte[]. Just like * {@link #BYTES_FIXED_DEREF}, but allowing each * document's value to be a different length. * <p> * NOTE: Fields of this type will not store values for documents without an * explicitly provided value. If a documents value is accessed while no * explicit value is stored the returned {@link BytesRef} will be a 0-length * reference. Custom default values must be assigned explicitly. * </p> */ BYTES_VAR_DEREF, /** * A variable length pre-sorted byte[] variant. Just like * {@link #BYTES_FIXED_SORTED}, but allowing each * document's value to be a different length. * <p> * NOTE: Fields of this type will not store values for documents without an * explicitly provided value. If a documents value is accessed while no * explicit value is stored the returned {@link BytesRef} will be a 0-length * reference.Custom default values must be assigned explicitly. * </p> * * @see SortedSource */ BYTES_VAR_SORTED, /** * A fixed length pre-sorted byte[] variant. Fields with this type only * store distinct byte values and store an additional offset pointer per * document to dereference the shared byte[]. The stored * byte[] is presorted, by default by unsigned byte order, * and allows access via document id, ordinal and by-value. * Use this type if your documents may share the same byte[]. * <p> * NOTE: Fields of this type will not store values for documents without an * explicitly provided value. If a documents value is accessed while no * explicit value is stored the returned {@link BytesRef} will be a 0-length * reference. Custom default values must be assigned * explicitly. * </p> * * @see SortedSource */ BYTES_FIXED_SORTED } /** * Abstract base class for {@link DocValues} {@link Source} cache. * <p> * {@link Source} instances loaded via {@link DocValues#load()} are entirely memory resident * and need to be maintained by the caller. Each call to * {@link DocValues#load()} will cause an entire reload of * the underlying data. Source instances obtained from * {@link DocValues#getSource()} and {@link DocValues#getSource()} * respectively are maintained by a {@link SourceCache} that is closed ( * {@link #close(DocValues)}) once the {@link IndexReader} that created the * {@link DocValues} instance is closed. * <p> * Unless {@link Source} instances are managed by another entity it is * recommended to use the cached variants to obtain a source instance. * <p> * Implementation of this API must be thread-safe. * * @see DocValues#setCache(SourceCache) * @see DocValues#getSource() * * @lucene.experimental */ public static abstract class SourceCache { /** Sole constructor. (For invocation by subclass * constructors, typically implicit.) */ protected SourceCache() { } /** * Atomically loads a {@link Source} into the cache from the given * {@link DocValues} and returns it iff no other {@link Source} has already * been cached. Otherwise the cached source is returned. * <p> * This method will not return <code>null</code> */ public abstract Source load(DocValues values) throws IOException; /** * Atomically invalidates the cached {@link Source} * instances if any and empties the cache. */ public abstract void invalidate(DocValues values); /** * Atomically closes the cache and frees all resources. */ public synchronized void close(DocValues values) { invalidate(values); } /** * Simple per {@link DocValues} instance cache implementation that holds a * {@link Source} a member variable. * <p> * If a {@link DirectSourceCache} instance is closed or invalidated the cached * reference are simply set to <code>null</code> */ public static final class DirectSourceCache extends SourceCache { private Source ref; /** Sole constructor. */ public DirectSourceCache() { } public synchronized Source load(DocValues values) throws IOException { if (ref == null) { ref = values.load(); } return ref; } public synchronized void invalidate(DocValues values) { ref = null; } } } }