package org.apache.lucene.index.codecs.standard;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.packed.PackedInts;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Collection;
import java.util.Comparator;
import java.io.IOException;
/**
* Uses a simplistic format to record terms dict index
* information. Limitations:
*
* - The index for all fields is loaded entirely into RAM up
* front.
* - The index is stored in RAM using a shared byte[] that
* wastefully expands every term. Using an FST to share
* common prefixes & suffixes would save RAM.
* - The index is sampled at a regular term count (every 128
* terms by default); it might be better to sample by "net
* docFreq" encountered, so that for spans of low-freq terms
* we index less often.
*
* A better approach might be something similar to how
* postings are encoded, w/ multi-level skips. Ie, load all
* terms index data into memory as a single large, compactly
* encoded stream (eg delta bytes + delta offset). Index
* that w/ a multi-level skipper. Looking up a term then
* becomes the equivalent binary search, using the skipper
* instead, while the data remains compressed in memory.
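*
* <p>A minimal usage sketch (hypothetical caller; the segment
* name "_0", an indexDivisor of 1, and the comparator choice
* are assumptions for illustration, not values this class
* mandates):
*
* <pre>
* StandardTermsIndexReader index = new SimpleStandardTermsIndexReader(
*     dir, fieldInfos, "_0", 1, BytesRef.getUTF8SortedAsUnicodeComparator());
* StandardTermsIndexReader.FieldReader field = index.getField(fieldInfo);
* </pre>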
*
* @lucene.experimental
*/
public class SimpleStandardTermsIndexReader extends StandardTermsIndexReader {
// NOTE: long is overkill here, since this number is 128
// by default and only indexDivisor * 128 if you change
// the indexDivisor at search time. But we use this in a
// number of places to multiply out the actual ord, and we
// will overflow int during those multiplies. So, to avoid
// having to cast each multiply to long in multiple
// places (error-prone), we use long here:
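// (Eg indexInterval=128 loaded w/ indexDivisor=2 gives
// totalIndexInterval=256.)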
private long totalIndexInterval;
private int indexDivisor;
private final int indexInterval;
// Closed if indexLoaded is true:
private final IndexInput in;
private volatile boolean indexLoaded;
private final Comparator<BytesRef> termComp;
private final static int PAGED_BYTES_BITS = 15;
// all fields share this single logical byte[]
private final PagedBytes termBytes = new PagedBytes(PAGED_BYTES_BITS);
private PagedBytes.Reader termBytesReader;
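// Per-field index state, keyed by FieldInfo: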
final HashMap<FieldInfo,FieldIndexReader> fields = new HashMap<FieldInfo,FieldIndexReader>();
// start of the field info data
protected long dirOffset;
public SimpleStandardTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, int indexDivisor, Comparator<BytesRef> termComp)
throws IOException {
this.termComp = termComp;
IndexInput in = dir.openInput(IndexFileNames.segmentFileName(segment, "", StandardCodec.TERMS_INDEX_EXTENSION));
boolean success = false;
try {
readHeader(in);
indexInterval = in.readInt();
this.indexDivisor = indexDivisor;
if (indexDivisor < 0) {
// Terms index will be loaded later, on demand;
// loadTermsIndex recomputes totalIndexInterval then:
totalIndexInterval = indexInterval;
} else {
totalIndexInterval = indexInterval * indexDivisor;
}
assert totalIndexInterval > 0;
seekDir(in, dirOffset);
// Read directory
final int numFields = in.readInt();
for(int i=0;i<numFields;i++) {
final int field = in.readInt();
final int numIndexTerms = in.readInt();
final long termsStart = in.readLong();
final long indexStart = in.readLong();
final long packedIndexStart = in.readLong();
final long packedOffsetsStart = in.readLong();
assert packedIndexStart >= indexStart: "packedStart=" + packedIndexStart + " indexStart=" + indexStart + " numIndexTerms=" + numIndexTerms + " seg=" + segment;
if (numIndexTerms > 0) {
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
fields.put(fieldInfo, new FieldIndexReader(in, fieldInfo, numIndexTerms, indexStart, termsStart, packedIndexStart, packedOffsetsStart));
}
}
success = true;
} finally {
if (indexDivisor > 0) {
in.close();
this.in = null;
if (success) {
indexLoaded = true;
}
termBytesReader = termBytes.freeze(true);
} else {
this.in = in;
}
}
}
protected void readHeader(IndexInput input) throws IOException {
CodecUtil.checkHeader(input, SimpleStandardTermsIndexWriter.CODEC_NAME,
SimpleStandardTermsIndexWriter.VERSION_START, SimpleStandardTermsIndexWriter.VERSION_START);
dirOffset = input.readLong();
}
private final class FieldIndexReader extends FieldReader {
private final FieldInfo fieldInfo;
private volatile CoreFieldIndex coreIndex;
private final IndexInput in;
private final long indexStart;
private final long termsStart;
private final long packedIndexStart;
private final long packedOffsetsStart;
private final int numIndexTerms;
public FieldIndexReader(IndexInput in, FieldInfo fieldInfo, int numIndexTerms, long indexStart, long termsStart, long packedIndexStart,
long packedOffsetsStart) throws IOException {
this.fieldInfo = fieldInfo;
this.in = in;
this.termsStart = termsStart;
this.indexStart = indexStart;
this.packedIndexStart = packedIndexStart;
this.packedOffsetsStart = packedOffsetsStart;
this.numIndexTerms = numIndexTerms;
// We still create the indexReader when indexDivisor
// is -1, so that StandardTermsDictReader can call
// isIndexTerm for each field:
if (indexDivisor > 0) {
coreIndex = new CoreFieldIndex(indexStart,
termsStart,
packedIndexStart,
packedOffsetsStart,
numIndexTerms);
}
}
public void loadTermsIndex() throws IOException {
if (coreIndex == null) {
coreIndex = new CoreFieldIndex(indexStart, termsStart, packedIndexStart, packedOffsetsStart, numIndexTerms);
}
}
@Override
public boolean isIndexTerm(long ord, int docFreq, boolean onlyLoaded) {
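// onlyLoaded asks w/ respect to the (possibly subsampled)
// index actually resident in RAM; otherwise we test against
// the on-disk index interval: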
if (onlyLoaded) {
return ord % totalIndexInterval == 0;
} else {
return ord % indexInterval == 0;
}
}
@Override
public boolean nextIndexTerm(long ord, TermsIndexResult result) throws IOException {
if (coreIndex == null) {
throw new IllegalStateException("terms index was not loaded");
} else {
return coreIndex.nextIndexTerm(ord, result);
}
}
@Override
public void getIndexOffset(BytesRef term, TermsIndexResult result) throws IOException {
// You must call loadTermsIndex if you specified a negative indexDivisor at construction:
if (coreIndex == null) {
throw new IllegalStateException("terms index was not loaded");
}
coreIndex.getIndexOffset(term, result);
}
@Override
public void getIndexOffset(long ord, TermsIndexResult result) throws IOException {
// You must call loadTermsIndex if you passed
// indexDivisor < 0 to the constructor:
if (coreIndex == null) {
throw new IllegalStateException("terms index was not loaded");
}
coreIndex.getIndexOffset(ord, result);
}
private final class CoreFieldIndex {
private final long termBytesStart;
// offset into index termBytes
final PackedInts.Reader termOffsets;
// index pointers into main terms dict
final PackedInts.Reader termsDictOffsets;
final int numIndexTerms;
final long termsStart;
public CoreFieldIndex(long indexStart, long termsStart, long packedIndexStart, long packedOffsetsStart, int numIndexTerms) throws IOException {
this.termsStart = termsStart;
termBytesStart = termBytes.getPointer();
IndexInput clone = (IndexInput) in.clone();
clone.seek(indexStart);
// -1 is passed to mean "don't load the terms index", but
// if the index is later loaded on demand, indexDivisor is
// overwritten with a real value
assert indexDivisor > 0;
this.numIndexTerms = 1 + (numIndexTerms-1) / indexDivisor;
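// Ceiling division: eg 1000 on-disk index terms w/
// indexDivisor=4 gives 1 + (999/4) = 250 sampled terms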
assert this.numIndexTerms > 0: "numIndexTerms=" + numIndexTerms + " indexDivisor=" + indexDivisor;
if (indexDivisor == 1) {
// Default (load all index terms) is fast -- slurp in the images from disk:
try {
final long numTermBytes = packedIndexStart - indexStart;
termBytes.copy(clone, numTermBytes);
// records offsets into main terms dict file
termsDictOffsets = PackedInts.getReader(clone);
assert termsDictOffsets.size() == numIndexTerms;
// records offsets into byte[] term data
termOffsets = PackedInts.getReader(clone);
assert termOffsets.size() == 1+numIndexTerms;
} finally {
clone.close();
}
} else {
// Get packed iterators
final IndexInput clone1 = (IndexInput) in.clone();
final IndexInput clone2 = (IndexInput) in.clone();
try {
// Subsample the index terms
clone1.seek(packedIndexStart);
final PackedInts.ReaderIterator termsDictOffsetsIter = PackedInts.getReaderIterator(clone1);
clone2.seek(packedOffsetsStart);
final PackedInts.ReaderIterator termOffsetsIter = PackedInts.getReaderIterator(clone2);
// TODO: often we could get by w/ fewer bits per
// value, below... but this'd be more complex:
// we'd have to try at fewer bits and then grow
// if we overflowed it.
PackedInts.Mutable termsDictOffsetsM = PackedInts.getMutable(this.numIndexTerms, termsDictOffsetsIter.getBitsPerValue());
PackedInts.Mutable termOffsetsM = PackedInts.getMutable(this.numIndexTerms+1, termOffsetsIter.getBitsPerValue());
termsDictOffsets = termsDictOffsetsM;
termOffsets = termOffsetsM;
int upto = 0;
long termOffsetUpto = 0;
while(upto < this.numIndexTerms) {
// main file offset copies straight over
termsDictOffsetsM.set(upto, termsDictOffsetsIter.next());
termOffsetsM.set(upto, termOffsetUpto);
upto++;
long termOffset = termOffsetsIter.next();
long nextTermOffset = termOffsetsIter.next();
final int numTermBytes = (int) (nextTermOffset - termOffset);
clone.seek(indexStart + termOffset);
assert indexStart + termOffset < clone.length() : "indexStart=" + indexStart + " termOffset=" + termOffset + " len=" + clone.length();
assert indexStart + termOffset + numTermBytes < clone.length();
termBytes.copy(clone, numTermBytes);
termOffsetUpto += numTermBytes;
// skip the non-sampled terms:
termsDictOffsetsIter.next();
for(int i=0;i<indexDivisor-2;i++) {
termOffsetsIter.next();
termsDictOffsetsIter.next();
}
}
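// Write one final offset so the length of the last sampled term can be computed: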
termOffsetsM.set(upto, termOffsetUpto);
} finally {
clone1.close();
clone2.close();
clone.close();
}
}
}
public boolean nextIndexTerm(long ord, TermsIndexResult result) throws IOException {
int idx = 1 + (int) (ord / totalIndexInterval);
if (idx < numIndexTerms) {
fillResult(idx, result);
return true;
} else {
return false;
}
}
private void fillResult(int idx, TermsIndexResult result) {
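// Resolve the idx'th sampled term: its bytes, its ord
// (position) and its offset into the main terms dict file: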
final long offset = termOffsets.get(idx);
final int length = (int) (termOffsets.get(1+idx) - offset);
termBytesReader.fill(result.term, termBytesStart + offset, length);
result.position = idx * totalIndexInterval;
result.offset = termsStart + termsDictOffsets.get(idx);
}
public void getIndexOffset(BytesRef term, TermsIndexResult result) throws IOException {
int lo = 0; // binary search
int hi = numIndexTerms - 1;
assert totalIndexInterval > 0 : "totalIndexInterval=" + totalIndexInterval;
while (hi >= lo) {
int mid = (lo + hi) >>> 1;
final long offset = termOffsets.get(mid);
final int length = (int) (termOffsets.get(1+mid) - offset);
termBytesReader.fill(result.term, termBytesStart + offset, length);
int delta = termComp.compare(term, result.term);
if (delta < 0) {
hi = mid - 1;
} else if (delta > 0) {
lo = mid + 1;
} else {
assert mid >= 0;
result.position = mid*totalIndexInterval;
result.offset = termsStart + termsDictOffsets.get(mid);
return;
}
}
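// Not an exact index term: fall back to the floor entry, ie
// the largest indexed term that precedes the requested term: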
if (hi < 0) {
assert hi == -1;
hi = 0;
}
final long offset = termOffsets.get(hi);
final int length = (int) (termOffsets.get(1+hi) - offset);
termBytesReader.fill(result.term, termBytesStart + offset, length);
result.position = hi*totalIndexInterval;
result.offset = termsStart + termsDictOffsets.get(hi);
}
public void getIndexOffset(long ord, TermsIndexResult result) throws IOException {
int idx = (int) (ord / totalIndexInterval);
// caller must ensure ord is in bounds
assert idx < numIndexTerms;
fillResult(idx, result);
}
}
}
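// Loads the terms index on demand when the reader was opened w/
// indexDivisor < 0; a negative value passed here is treated as
// abs(indexDivisor):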
@Override
public void loadTermsIndex(int indexDivisor) throws IOException {
if (!indexLoaded) {
if (indexDivisor < 0) {
this.indexDivisor = -indexDivisor;
} else {
this.indexDivisor = indexDivisor;
}
this.totalIndexInterval = indexInterval * this.indexDivisor;
Iterator<FieldIndexReader> it = fields.values().iterator();
while(it.hasNext()) {
it.next().loadTermsIndex();
}
indexLoaded = true;
in.close();
termBytesReader = termBytes.freeze(true);
}
}
@Override
public FieldReader getField(FieldInfo fieldInfo) {
return fields.get(fieldInfo);
}
public static void files(Directory dir, SegmentInfo info, Collection<String> files) {
files.add(IndexFileNames.segmentFileName(info.name, "", StandardCodec.TERMS_INDEX_EXTENSION));
}
public static void getIndexExtensions(Collection<String> extensions) {
extensions.add(StandardCodec.TERMS_INDEX_EXTENSION);
}
@Override
public void getExtensions(Collection<String> extensions) {
getIndexExtensions(extensions);
}
@Override
public void close() throws IOException {
if (in != null && !indexLoaded) {
in.close();
}
if (termBytesReader != null) {
termBytesReader.close();
}
}
protected void seekDir(IndexInput input, long dirOffset) throws IOException {
input.seek(dirOffset);
}
}