package org.apache.lucene.index;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Closeable;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.document.ByteDocValuesField; // javadocs
import org.apache.lucene.document.DerefBytesDocValuesField; // javadocs
import org.apache.lucene.document.DoubleDocValuesField; // javadocs
import org.apache.lucene.document.Field; // javadocs
import org.apache.lucene.document.FloatDocValuesField; // javadocs
import org.apache.lucene.document.IntDocValuesField; // javadocs
import org.apache.lucene.document.LongDocValuesField; // javadocs
import org.apache.lucene.document.PackedLongDocValuesField; // javadocs
import org.apache.lucene.document.ShortDocValuesField; // javadocs
import org.apache.lucene.document.SortedBytesDocValuesField; // javadocs
import org.apache.lucene.document.StraightBytesDocValuesField; // javadocs
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.packed.PackedInts;
/**
* {@link DocValues} provides a dense per-document typed storage for fast
* value access based on the lucene internal document id. {@link DocValues}
* exposes two distinct APIs:
* <ul>
* <li>via {@link #getSource()} providing RAM resident random access</li>
* <li>via {@link #getDirectSource()} providing on disk random access</li>
* </ul> {@link DocValues} are exposed via
* {@link AtomicReader#docValues(String)} on a per-segment basis. For best
* performance {@link DocValues} should be consumed per-segment just like
* IndexReader.
* <p>
* {@link DocValues} are fully integrated into the {@link DocValuesFormat} API.
* <p>
* NOTE: DocValues is a strongly typed per-field API. Type changes within an
* indexing session can result in exceptions if the type has changed in a way that
* the previously give type for a field can't promote the value without losing
* information. For instance a field initially indexed with {@link Type#FIXED_INTS_32}
* can promote a value with {@link Type#FIXED_INTS_8} but can't promote
* {@link Type#FIXED_INTS_64}. During segment merging type-promotion exceptions are suppressed.
* Fields will be promoted to their common denominator or automatically transformed
* into a 3rd type like {@link Type#BYTES_VAR_STRAIGHT} to prevent data loss and merge exceptions.
* This behavior is considered <i>best-effort</i> might change in future releases.
* </p>
* <p>
* DocValues are exposed via the {@link Field} API with type safe
* specializations for each type variant:
* <ul>
* <li> {@link ByteDocValuesField} - for adding byte values to the index</li>
* <li> {@link ShortDocValuesField} - for adding short values to the index</li>
* <li> {@link IntDocValuesField} - for adding int values to the index</li>
* <li> {@link LongDocValuesField} - for adding long values to the index</li>
* <li> {@link FloatDocValuesField} - for adding float values to the index</li>
* <li> {@link DoubleDocValuesField} - for adding double values to the index</li>
* <li> {@link PackedLongDocValuesField} - for adding packed long values to the
* index</li>
* <li> {@link SortedBytesDocValuesField} - for adding sorted {@link BytesRef}
* values to the index</li>
* <li> {@link StraightBytesDocValuesField} - for adding straight
* {@link BytesRef} values to the index</li>
* <li> {@link DerefBytesDocValuesField} - for adding deref {@link BytesRef}
* values to the index</li>
* </ul>
* See {@link Type} for limitations of each type variant.
* <p>
* <p>
*
* @see DocValuesFormat#docsConsumer(org.apache.lucene.index.PerDocWriteState)
*
* @lucene.experimental
*/
public abstract class DocValues implements Closeable {
/** Zero length DocValues array. */
public static final DocValues[] EMPTY_ARRAY = new DocValues[0];
private volatile SourceCache cache = new SourceCache.DirectSourceCache();
private final Object cacheLock = new Object();
/** Sole constructor. (For invocation by subclass
* constructors, typically implicit.) */
protected DocValues() {
}
/**
* Loads a new {@link Source} instance for this {@link DocValues} field
* instance. Source instances returned from this method are not cached. It is
* the callers responsibility to maintain the instance and release its
* resources once the source is not needed anymore.
* <p>
* For managed {@link Source} instances see {@link #getSource()}.
*
* @see #getSource()
* @see #setCache(SourceCache)
*/
public abstract Source load() throws IOException;
/**
* Returns a {@link Source} instance through the current {@link SourceCache}.
* Iff no {@link Source} has been loaded into the cache so far the source will
* be loaded through {@link #load()} and passed to the {@link SourceCache}.
* The caller of this method should not close the obtained {@link Source}
* instance unless it is not needed for the rest of its life time.
* <p>
* {@link Source} instances obtained from this method are closed / released
* from the cache once this {@link DocValues} instance is closed by the
* {@link IndexReader}, {@link Fields} or the
* {@link DocValues} was created from.
*/
public Source getSource() throws IOException {
return cache.load(this);
}
/**
* Returns a disk resident {@link Source} instance. Direct Sources are not
* cached in the {@link SourceCache} and should not be shared between threads.
*/
public abstract Source getDirectSource() throws IOException;
/**
* Returns the {@link Type} of this {@link DocValues} instance
*/
public abstract Type getType();
/**
* Closes this {@link DocValues} instance. This method should only be called
* by the creator of this {@link DocValues} instance. API users should not
* close {@link DocValues} instances.
*/
public void close() throws IOException {
cache.close(this);
}
/**
* Returns the size per value in bytes or <code>-1</code> iff size per value
* is variable.
*
* @return the size per value in bytes or <code>-1</code> iff size per value
* is variable.
*/
public int getValueSize() {
return -1;
}
/**
* Sets the {@link SourceCache} used by this {@link DocValues} instance. This
* method should be called before {@link #load()} is called. All {@link Source} instances in the currently used cache will be closed
* before the new cache is installed.
* <p>
* Note: All instances previously obtained from {@link #load()} will be lost.
*
* @throws IllegalArgumentException
* if the given cache is <code>null</code>
*
*/
public void setCache(SourceCache cache) {
if (cache == null)
throw new IllegalArgumentException("cache must not be null");
synchronized (cacheLock) {
SourceCache toClose = this.cache;
this.cache = cache;
toClose.close(this);
}
}
/**
* Source of per document values like long, double or {@link BytesRef}
* depending on the {@link DocValues} fields {@link Type}. Source
* implementations provide random access semantics similar to array lookups
* <p>
* @see DocValues#getSource()
* @see DocValues#getDirectSource()
*/
public static abstract class Source {
/** {@link Type} of this {@code Source}. */
protected final Type type;
/** Sole constructor. (For invocation by subclass
* constructors, typically implicit.) */
protected Source(Type type) {
this.type = type;
}
/**
* Returns a <tt>long</tt> for the given document id or throws an
* {@link UnsupportedOperationException} if this source doesn't support
* <tt>long</tt> values.
*
* @throws UnsupportedOperationException
* if this source doesn't support <tt>long</tt> values.
*/
public long getInt(int docID) {
throw new UnsupportedOperationException("ints are not supported");
}
/**
* Returns a <tt>double</tt> for the given document id or throws an
* {@link UnsupportedOperationException} if this source doesn't support
* <tt>double</tt> values.
*
* @throws UnsupportedOperationException
* if this source doesn't support <tt>double</tt> values.
*/
public double getFloat(int docID) {
throw new UnsupportedOperationException("floats are not supported");
}
/**
* Returns a {@link BytesRef} for the given document id or throws an
* {@link UnsupportedOperationException} if this source doesn't support
* <tt>byte[]</tt> values.
*
* @throws UnsupportedOperationException
* if this source doesn't support <tt>byte[]</tt> values.
*/
public BytesRef getBytes(int docID, BytesRef ref) {
throw new UnsupportedOperationException("bytes are not supported");
}
/**
* Returns the {@link Type} of this source.
*
* @return the {@link Type} of this source.
*/
public Type getType() {
return type;
}
/**
* Returns <code>true</code> iff this {@link Source} exposes an array via
* {@link #getArray()} otherwise <code>false</code>.
*
* @return <code>true</code> iff this {@link Source} exposes an array via
* {@link #getArray()} otherwise <code>false</code>.
*/
public boolean hasArray() {
return false;
}
/**
* Returns the internal array representation iff this {@link Source} uses an
* array as its inner representation, otherwise <code>UOE</code>.
*/
public Object getArray() {
throw new UnsupportedOperationException("getArray is not supported");
}
/**
* If this {@link Source} is sorted this method will return an instance of
* {@link SortedSource} otherwise <code>UOE</code>
*/
public SortedSource asSortedSource() {
throw new UnsupportedOperationException("asSortedSource is not supported");
}
}
/**
* A sorted variant of {@link Source} for <tt>byte[]</tt> values per document.
* <p>
*/
public static abstract class SortedSource extends Source {
private final Comparator<BytesRef> comparator;
/** Sole constructor. (For invocation by subclass
* constructors, typically implicit.) */
protected SortedSource(Type type, Comparator<BytesRef> comparator) {
super(type);
this.comparator = comparator;
}
@Override
public BytesRef getBytes(int docID, BytesRef bytesRef) {
final int ord = ord(docID);
if (ord < 0) {
// Negative ord means doc was missing?
bytesRef.length = 0;
} else {
getByOrd(ord, bytesRef);
}
return bytesRef;
}
/**
* Returns ord for specified docID. Ord is dense, ie, starts at 0, then increments by 1
* for the next (as defined by {@link Comparator} value.
*/
public abstract int ord(int docID);
/** Returns value for specified ord. */
public abstract BytesRef getByOrd(int ord, BytesRef result);
/** Return true if it's safe to call {@link
* #getDocToOrd}. */
public boolean hasPackedDocToOrd() {
return false;
}
/**
* Returns the PackedInts.Reader impl that maps document to ord.
*/
public abstract PackedInts.Reader getDocToOrd();
/**
* Returns the comparator used to order the BytesRefs.
*/
public Comparator<BytesRef> getComparator() {
return comparator;
}
/**
* Lookup ord by value.
*
* @param value
* the value to look up
* @param spare
* a spare {@link BytesRef} instance used to compare internal
* values to the given value. Must not be <code>null</code>
* @return the given values ordinal if found or otherwise
* <code>(-(ord)-1)</code>, defined as the ordinal of the first
* element that is greater than the given value (the insertion
* point). This guarantees that the return value will always be
* >= 0 if the given value is found.
*/
public int getOrdByValue(BytesRef value, BytesRef spare) {
return binarySearch(value, spare, 0, getValueCount() - 1);
}
private int binarySearch(BytesRef b, BytesRef bytesRef, int low,
int high) {
int mid = 0;
while (low <= high) {
mid = (low + high) >>> 1;
getByOrd(mid, bytesRef);
final int cmp = comparator.compare(bytesRef, b);
if (cmp < 0) {
low = mid + 1;
} else if (cmp > 0) {
high = mid - 1;
} else {
return mid;
}
}
assert comparator.compare(bytesRef, b) != 0;
return -(low + 1);
}
@Override
public SortedSource asSortedSource() {
return this;
}
/**
* Returns the number of unique values in this sorted source
*/
public abstract int getValueCount();
}
/** Returns a Source that always returns default (missing)
* values for all documents. */
public static Source getDefaultSource(final Type type) {
return new Source(type) {
@Override
public long getInt(int docID) {
return 0;
}
@Override
public double getFloat(int docID) {
return 0.0;
}
@Override
public BytesRef getBytes(int docID, BytesRef ref) {
ref.length = 0;
return ref;
}
};
}
/** Returns a SortedSource that always returns default (missing)
* values for all documents. */
public static SortedSource getDefaultSortedSource(final Type type, final int size) {
final PackedInts.Reader docToOrd = new PackedInts.Reader() {
@Override
public long get(int index) {
return 0;
}
@Override
public int getBitsPerValue() {
return 0;
}
@Override
public int size() {
return size;
}
@Override
public boolean hasArray() {
return false;
}
@Override
public Object getArray() {
return null;
}
@Override
public int get(int index, long[] arr, int off, int len) {
len = Math.min(len, size() - index);
Arrays.fill(arr, off, off+len, 0);
return len;
}
@Override
public long ramBytesUsed() {
return 0;
}
};
return new SortedSource(type, BytesRef.getUTF8SortedAsUnicodeComparator()) {
@Override
public BytesRef getBytes(int docID, BytesRef ref) {
ref.length = 0;
return ref;
}
@Override
public int ord(int docID) {
return 0;
}
@Override
public BytesRef getByOrd(int ord, BytesRef bytesRef) {
assert ord == 0;
bytesRef.length = 0;
return bytesRef;
}
@Override
public boolean hasPackedDocToOrd() {
return true;
}
@Override
public PackedInts.Reader getDocToOrd() {
return docToOrd;
}
@Override
public int getOrdByValue(BytesRef value, BytesRef spare) {
if (value.length == 0) {
return 0;
} else {
return -1;
}
}
@Override
public int getValueCount() {
return 1;
}
};
}
/**
* <code>Type</code> specifies the {@link DocValues} type for a
* certain field. A <code>Type</code> only defines the data type for a field
* while the actual implementation used to encode and decode the values depends
* on the the {@link DocValuesFormat#docsConsumer} and {@link DocValuesFormat#docsProducer} methods.
*
* @lucene.experimental
*/
public static enum Type {
/**
* A variable bit signed integer value. By default this type uses
* {@link PackedInts} to compress the values, as an offset
* from the minimum value, as long as the value range
* fits into 2<sup>63</sup>-1. Otherwise,
* the default implementation falls back to fixed size 64bit
* integers ({@link #FIXED_INTS_64}).
* <p>
* NOTE: this type uses <tt>0</tt> as the default value without any
* distinction between provided <tt>0</tt> values during indexing. All
* documents without an explicit value will use <tt>0</tt> instead.
* Custom default values must be assigned explicitly.
* </p>
*/
VAR_INTS,
/**
* A 8 bit signed integer value. {@link Source} instances of
* this type return a <tt>byte</tt> array from {@link Source#getArray()}
* <p>
* NOTE: this type uses <tt>0</tt> as the default value without any
* distinction between provided <tt>0</tt> values during indexing. All
* documents without an explicit value will use <tt>0</tt> instead.
* Custom default values must be assigned explicitly.
* </p>
*/
FIXED_INTS_8,
/**
* A 16 bit signed integer value. {@link Source} instances of
* this type return a <tt>short</tt> array from {@link Source#getArray()}
* <p>
* NOTE: this type uses <tt>0</tt> as the default value without any
* distinction between provided <tt>0</tt> values during indexing. All
* documents without an explicit value will use <tt>0</tt> instead.
* Custom default values must be assigned explicitly.
* </p>
*/
FIXED_INTS_16,
/**
* A 32 bit signed integer value. {@link Source} instances of
* this type return a <tt>int</tt> array from {@link Source#getArray()}
* <p>
* NOTE: this type uses <tt>0</tt> as the default value without any
* distinction between provided <tt>0</tt> values during indexing. All
* documents without an explicit value will use <tt>0</tt> instead.
* Custom default values must be assigned explicitly.
* </p>
*/
FIXED_INTS_32,
/**
* A 64 bit signed integer value. {@link Source} instances of
* this type return a <tt>long</tt> array from {@link Source#getArray()}
* <p>
* NOTE: this type uses <tt>0</tt> as the default value without any
* distinction between provided <tt>0</tt> values during indexing. All
* documents without an explicit value will use <tt>0</tt> instead.
* Custom default values must be assigned explicitly.
* </p>
*/
FIXED_INTS_64,
/**
* A 32 bit floating point value. By default there is no compression
* applied. To fit custom float values into less than 32bit either a custom
* implementation is needed or values must be encoded into a
* {@link #BYTES_FIXED_STRAIGHT} type. {@link Source} instances of
* this type return a <tt>float</tt> array from {@link Source#getArray()}
* <p>
* NOTE: this type uses <tt>0.0f</tt> as the default value without any
* distinction between provided <tt>0.0f</tt> values during indexing. All
* documents without an explicit value will use <tt>0.0f</tt> instead.
* Custom default values must be assigned explicitly.
* </p>
*/
FLOAT_32,
/**
*
* A 64 bit floating point value. By default there is no compression
* applied. To fit custom float values into less than 64bit either a custom
* implementation is needed or values must be encoded into a
* {@link #BYTES_FIXED_STRAIGHT} type. {@link Source} instances of
* this type return a <tt>double</tt> array from {@link Source#getArray()}
* <p>
* NOTE: this type uses <tt>0.0d</tt> as the default value without any
* distinction between provided <tt>0.0d</tt> values during indexing. All
* documents without an explicit value will use <tt>0.0d</tt> instead.
* Custom default values must be assigned explicitly.
* </p>
*/
FLOAT_64,
// TODO(simonw): -- shouldn't lucene decide/detect straight vs
// deref, as well fixed vs var?
/**
* A fixed length straight byte[]. All values added to
* such a field must be of the same length. All bytes are stored sequentially
* for fast offset access.
* <p>
* NOTE: this type uses <tt>0 byte</tt> filled byte[] based on the length of the first seen
* value as the default value without any distinction between explicitly
* provided values during indexing. All documents without an explicit value
* will use the default instead.Custom default values must be assigned explicitly.
* </p>
*/
BYTES_FIXED_STRAIGHT,
/**
* A fixed length dereferenced byte[] variant. Fields with
* this type only store distinct byte values and store an additional offset
* pointer per document to dereference the shared byte[].
* Use this type if your documents may share the same byte[].
* <p>
* NOTE: Fields of this type will not store values for documents without an
* explicitly provided value. If a documents value is accessed while no
* explicit value is stored the returned {@link BytesRef} will be a 0-length
* reference. Custom default values must be assigned explicitly.
* </p>
*/
BYTES_FIXED_DEREF,
/**
* Variable length straight stored byte[] variant. All bytes are
* stored sequentially for compactness. Usage of this type via the
* disk-resident API might yield performance degradation since no additional
* index is used to advance by more than one document value at a time.
* <p>
* NOTE: Fields of this type will not store values for documents without an
* explicitly provided value. If a documents value is accessed while no
* explicit value is stored the returned {@link BytesRef} will be a 0-length
* byte[] reference. Custom default values must be assigned explicitly.
* </p>
*/
BYTES_VAR_STRAIGHT,
/**
* A variable length dereferenced byte[]. Just like
* {@link #BYTES_FIXED_DEREF}, but allowing each
* document's value to be a different length.
* <p>
* NOTE: Fields of this type will not store values for documents without an
* explicitly provided value. If a documents value is accessed while no
* explicit value is stored the returned {@link BytesRef} will be a 0-length
* reference. Custom default values must be assigned explicitly.
* </p>
*/
BYTES_VAR_DEREF,
/**
* A variable length pre-sorted byte[] variant. Just like
* {@link #BYTES_FIXED_SORTED}, but allowing each
* document's value to be a different length.
* <p>
* NOTE: Fields of this type will not store values for documents without an
* explicitly provided value. If a documents value is accessed while no
* explicit value is stored the returned {@link BytesRef} will be a 0-length
* reference.Custom default values must be assigned explicitly.
* </p>
*
* @see SortedSource
*/
BYTES_VAR_SORTED,
/**
* A fixed length pre-sorted byte[] variant. Fields with this type only
* store distinct byte values and store an additional offset pointer per
* document to dereference the shared byte[]. The stored
* byte[] is presorted, by default by unsigned byte order,
* and allows access via document id, ordinal and by-value.
* Use this type if your documents may share the same byte[].
* <p>
* NOTE: Fields of this type will not store values for documents without an
* explicitly provided value. If a documents value is accessed while no
* explicit value is stored the returned {@link BytesRef} will be a 0-length
* reference. Custom default values must be assigned
* explicitly.
* </p>
*
* @see SortedSource
*/
BYTES_FIXED_SORTED
}
/**
* Abstract base class for {@link DocValues} {@link Source} cache.
* <p>
* {@link Source} instances loaded via {@link DocValues#load()} are entirely memory resident
* and need to be maintained by the caller. Each call to
* {@link DocValues#load()} will cause an entire reload of
* the underlying data. Source instances obtained from
* {@link DocValues#getSource()} and {@link DocValues#getSource()}
* respectively are maintained by a {@link SourceCache} that is closed (
* {@link #close(DocValues)}) once the {@link IndexReader} that created the
* {@link DocValues} instance is closed.
* <p>
* Unless {@link Source} instances are managed by another entity it is
* recommended to use the cached variants to obtain a source instance.
* <p>
* Implementation of this API must be thread-safe.
*
* @see DocValues#setCache(SourceCache)
* @see DocValues#getSource()
*
* @lucene.experimental
*/
public static abstract class SourceCache {
/** Sole constructor. (For invocation by subclass
* constructors, typically implicit.) */
protected SourceCache() {
}
/**
* Atomically loads a {@link Source} into the cache from the given
* {@link DocValues} and returns it iff no other {@link Source} has already
* been cached. Otherwise the cached source is returned.
* <p>
* This method will not return <code>null</code>
*/
public abstract Source load(DocValues values) throws IOException;
/**
* Atomically invalidates the cached {@link Source}
* instances if any and empties the cache.
*/
public abstract void invalidate(DocValues values);
/**
* Atomically closes the cache and frees all resources.
*/
public synchronized void close(DocValues values) {
invalidate(values);
}
/**
* Simple per {@link DocValues} instance cache implementation that holds a
* {@link Source} a member variable.
* <p>
* If a {@link DirectSourceCache} instance is closed or invalidated the cached
* reference are simply set to <code>null</code>
*/
public static final class DirectSourceCache extends SourceCache {
private Source ref;
/** Sole constructor. */
public DirectSourceCache() {
}
public synchronized Source load(DocValues values) throws IOException {
if (ref == null) {
ref = values.load();
}
return ref;
}
public synchronized void invalidate(DocValues values) {
ref = null;
}
}
}
}