package org.apache.lucene.index.codecs.standard;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Closeable;
import java.util.Collection;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.Comparator;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.codecs.FieldsProducer;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.DoubleBarrelLRUCache;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
/** Handles a terms dict, but decouples all details of
* doc/freqs/positions reading to an instance of {@link
* StandardPostingsReader}. This class is reusable for
* codecs that use a different format for
* docs/freqs/positions (though codecs are also free to
* make their own terms dict impl).
*
* <p>This class also interacts with an instance of {@link
* StandardTermsIndexReader}, to abstract away the specific
* implementation of the terms dict index.
* @lucene.experimental */
public class StandardTermsDictReader extends FieldsProducer {
// Open input to the main terms dict file (_X.tis)
private final IndexInput in;
// Reads the terms dict entries, to gather state to
// produce DocsEnum on demand
private final StandardPostingsReader postingsReader;
private final TreeMap<String,FieldReader> fields = new TreeMap<String,FieldReader>();
// Comparator that orders our terms
private final Comparator<BytesRef> termComp;
// Caches the most recently looked-up field + terms:
private final DoubleBarrelLRUCache<FieldAndTerm,TermState> termsCache;
// Reads the terms index
private StandardTermsIndexReader indexReader;
// File offset at which the per-field directory starts; set by readHeader
protected long dirOffset;
/**
 * Key for the terms cache: a (field name, term) pair.
 *
 * <p>Instances are mutable; the terms enum keeps one scratch instance
 * and creates a fresh key via the copy constructor before storing an
 * entry into the cache.
 */
private static class FieldAndTerm extends DoubleBarrelLRUCache.CloneableKey {
  String field;
  BytesRef term;

  /** Creates an empty key; {@code field} and {@code term} are filled in by the caller. */
  public FieldAndTerm() {
  }

  /**
   * Copy constructor: reuses the field name reference and builds a new
   * {@link BytesRef} from the other key's term.
   *
   * @param other key to copy
   */
  public FieldAndTerm(FieldAndTerm other) {
    field = other.field;
    term = new BytesRef(other.term);
  }

  /**
   * Two keys are equal when they name the same field and their terms
   * have equal bytes. Returns {@code false} (rather than throwing) for
   * {@code null} or for objects of another type, as required by the
   * {@link Object#equals} contract.
   */
  @Override
  public boolean equals(Object _other) {
    if (!(_other instanceof FieldAndTerm)) {
      return false;
    }
    final FieldAndTerm other = (FieldAndTerm) _other;
    // Identity check first (cheap, and the common case); fall back to a
    // content comparison so equals agrees with hashCode, which hashes
    // the field name's content.
    final boolean sameField = field == other.field || (field != null && field.equals(other.field));
    return sameField && term.bytesEquals(other.term);
  }

  /** Returns an independent copy of this key, made with the copy constructor. */
  @Override
  public Object clone() {
    return new FieldAndTerm(this);
  }

  /** Combines the hash of the field name with the hash of the term. */
  @Override
  public int hashCode() {
    return field.hashCode() * 31 + term.hashCode();
  }
}
/**
 * Opens the terms dict file of a segment and loads its per-field directory.
 *
 * <p>The header is read via {@link #readHeader}, the postings reader then
 * initializes itself from the same input, and finally the directory
 * (located via {@link #seekDir}) is read: one entry per field, holding the
 * field number, the number of terms and the file pointer at which the
 * field's terms start. Only fields with at least one term are registered.
 *
 * <p>If anything fails after the file has been opened, the input is closed
 * before the exception propagates.
 *
 * @param indexReader terms index reader; supplies the per-field index readers
 * @param dir directory the terms dict file is opened from
 * @param fieldInfos resolves field numbers to {@link FieldInfo} instances
 * @param segment segment name used to build the file name
 * @param postingsReader reader for the docs/freqs/positions data
 * @param readBufferSize buffer size handed to {@code dir.openInput}
 * @param termComp comparator that orders the terms
 * @param termsCacheSize size handed to the terms cache
 * @throws IOException if opening or reading the terms dict fails
 */
public StandardTermsDictReader(StandardTermsIndexReader indexReader, Directory dir, FieldInfos fieldInfos, String segment, StandardPostingsReader postingsReader, int readBufferSize,
                               Comparator<BytesRef> termComp, int termsCacheSize)
  throws IOException {

  this.postingsReader = postingsReader;
  this.termComp = termComp;
  this.termsCache = new DoubleBarrelLRUCache<FieldAndTerm,TermState>(termsCacheSize);

  final String termsFileName = IndexFileNames.segmentFileName(segment, "", StandardCodec.TERMS_EXTENSION);
  this.in = dir.openInput(termsFileName, readBufferSize);

  boolean success = false;
  try {
    readHeader(in);

    // Let the postings reader consume its own part of the header
    postingsReader.init(in);

    // Walk the per-field directory
    seekDir(in, dirOffset);
    final int fieldCount = in.readInt();
    for (int upto = 0; upto < fieldCount; upto++) {
      final int fieldNumber = in.readInt();
      final long termCount = in.readLong();
      assert termCount >= 0;
      final long startPointer = in.readLong();
      final FieldInfo info = fieldInfos.fieldInfo(fieldNumber);
      // The index reader is asked for every field, even empty ones
      final StandardTermsIndexReader.FieldReader indexFieldReader = indexReader.getField(info);
      if (termCount > 0) {
        assert !fields.containsKey(info.name);
        fields.put(info.name, new FieldReader(indexFieldReader, info, termCount, startPointer));
      }
    }
    success = true;
  } finally {
    if (!success) {
      in.close();
    }
  }

  this.indexReader = indexReader;
}
/**
 * Validates the codec header of the terms dict and records the offset of
 * the per-field directory in {@link #dirOffset}.
 *
 * <p>All reads go through the supplied {@code input}, so a subclass (or
 * any other caller) may pass an input other than this reader's own.
 *
 * @param input input to read the header and the directory offset from
 * @throws IOException if the header check or the read fails
 */
protected void readHeader(IndexInput input) throws IOException {
  CodecUtil.checkHeader(input, StandardTermsDictWriter.CODEC_NAME,
                        StandardTermsDictWriter.VERSION_START, StandardTermsDictWriter.VERSION_CURRENT);
  dirOffset = input.readLong();
}
/**
 * Positions {@code input} at the start of the per-field directory.
 *
 * <p>This implementation simply seeks to {@code dirOffset}. The
 * constructor calls it right before reading the number of fields.
 *
 * @param input terms dict input to position
 * @param dirOffset file offset to seek to
 * @throws IOException if the seek fails
 */
protected void seekDir(IndexInput input, long dirOffset)
throws IOException {
input.seek(dirOffset);
}
/**
 * Forwards the request to the terms index reader.
 *
 * <p>NOTE: {@link #close} sets the index reader to null, so calling this
 * method after close fails with a NullPointerException.
 *
 * @param indexDivisor passed through unchanged to the index reader
 * @throws IOException if the index reader fails to load the index
 */
@Override
public void loadTermsIndex(int indexDivisor) throws IOException {
indexReader.loadTermsIndex(indexDivisor);
}
/**
 * Releases everything this reader holds, in this order: the terms index
 * reader, the terms dict input, the postings reader, and finally every
 * per-field reader.
 *
 * <p>The nested try/finally blocks make sure each later step is still
 * attempted when an earlier one throws.
 *
 * @throws IOException if closing one of the resources fails
 */
@Override
public void close() throws IOException {
try {
try {
if (indexReader != null) {
indexReader.close();
}
} finally {
// null so if an app hangs on to us (ie, we are not
// GCable, despite being closed) we still free most
// ram
indexReader = null;
if (in != null) {
in.close();
}
}
} finally {
try {
if (postingsReader != null) {
postingsReader.close();
}
} finally {
// Runs even if closing the postings reader failed
for(FieldReader field : fields.values()) {
field.close();
}
}
}
}
/**
 * Adds the name of the terms dict file of the given segment to
 * {@code files}.
 *
 * @param dir not used by this method
 * @param segmentInfo segment whose name is used to build the file name
 * @param files collection that receives the file name
 */
public static void files(Directory dir, SegmentInfo segmentInfo, Collection<String> files) {
files.add(IndexFileNames.segmentFileName(segmentInfo.name, "", StandardCodec.TERMS_EXTENSION));
}
/**
 * Adds the file extension of the terms dict
 * ({@code StandardCodec.TERMS_EXTENSION}) to {@code extensions}.
 *
 * @param extensions collection that receives the extension
 */
public static void getExtensions(Collection<String> extensions) {
extensions.add(StandardCodec.TERMS_EXTENSION);
}
/**
 * Returns a new enum over the fields registered in this reader. The
 * fields are kept in a TreeMap keyed by field name, so they are visited
 * in the natural String order of their names.
 */
@Override
public FieldsEnum iterator() {
return new TermFieldsEnum();
}
/**
 * Returns the terms of the given field, or {@code null} if this reader
 * has no entry for it. The constructor only registers fields that have
 * at least one term.
 *
 * @param field name of the field to look up
 */
@Override
public Terms terms(String field) throws IOException {
return fields.get(field);
}
/**
 * Enumerates all fields of the enclosing reader, in the iteration order
 * of its field map.
 */
private class TermFieldsEnum extends FieldsEnum {
  final Iterator<FieldReader> it;
  FieldReader current;

  TermFieldsEnum() {
    it = fields.values().iterator();
  }

  /**
   * Advances to the next field.
   *
   * @return the name of the next field, or {@code null} once all fields
   *         have been visited
   */
  @Override
  public String next() {
    if (!it.hasNext()) {
      // Exhausted: forget the last field as well
      current = null;
      return null;
    }
    current = it.next();
    return current.fieldInfo.name;
  }

  /**
   * Returns a terms enum for the current field. Only valid after
   * {@link #next} returned a field name; before that, or after
   * exhaustion, there is no current field to delegate to.
   */
  @Override
  public TermsEnum terms() throws IOException {
    return current.iterator();
  }
}
private class FieldReader extends Terms implements Closeable {
final long numTerms;
final FieldInfo fieldInfo;
final long termsStartPointer;
final StandardTermsIndexReader.FieldReader fieldIndexReader;
/**
 * Creates the terms reader of a single field.
 *
 * @param fieldIndexReader terms index reader of this field
 * @param fieldInfo the field this reader belongs to
 * @param numTerms number of terms in the field; must be positive
 * @param termsStartPointer file pointer in the terms dict at which the
 *        field's terms start
 */
FieldReader(StandardTermsIndexReader.FieldReader fieldIndexReader, FieldInfo fieldInfo, long numTerms, long termsStartPointer) {
  // Empty fields are never given a reader
  assert numTerms > 0;
  this.fieldIndexReader = fieldIndexReader;
  this.termsStartPointer = termsStartPointer;
  this.numTerms = numTerms;
  this.fieldInfo = fieldInfo;
}
/** Returns the term comparator the enclosing reader was created with. */
@Override
public Comparator<BytesRef> getComparator() {
return termComp;
}
/**
 * Delegates to the superclass; this class adds no cleanup of its own.
 * The enclosing reader's close() calls this for every field.
 */
@Override
public void close() {
super.close();
}
/**
 * Returns a new enum over the terms of this field. Every enum works on
 * its own clone of the terms dict input.
 */
@Override
public TermsEnum iterator() throws IOException {
return new SegmentTermsEnum();
}
/** Returns the number of terms in this field, as given to the constructor. */
@Override
public long getUniqueTermCount() {
return numTerms;
}
// Iterates through terms in this field
private class SegmentTermsEnum extends TermsEnum {
private final IndexInput in;
private final DeltaBytesReader bytesReader;
private final TermState state;
private boolean seekPending;
private final StandardTermsIndexReader.TermsIndexResult indexResult = new StandardTermsIndexReader.TermsIndexResult();
private final FieldAndTerm fieldTerm = new FieldAndTerm();
SegmentTermsEnum() throws IOException {
in = (IndexInput) StandardTermsDictReader.this.in.clone();
in.seek(termsStartPointer);
bytesReader = new DeltaBytesReader(in);
fieldTerm.field = fieldInfo.name;
state = postingsReader.newTermState();
state.ord = -1;
}
/** Returns the term comparator the enclosing reader was created with. */
@Override
public Comparator<BytesRef> getComparator() {
return termComp;
}
/** Seeks until the first term that's >= the provided
 * text; returns SeekStatus.FOUND if the exact term
 * is found, SeekStatus.NOT_FOUND if a different term
 * was found, SeekStatus.END if we hit EOF.
 *
 * <p>When {@code useCache} is true the terms cache is
 * consulted first, and a term that is found after a real
 * index seek is stored into it.
 *
 * @param term term to seek to
 * @param useCache whether the terms cache may be read and populated
 * @return FOUND, NOT_FOUND or END as described above
 * @throws IOException if reading the terms dict fails */
@Override
public SeekStatus seek(BytesRef term, boolean useCache) throws IOException {
// Check cache
// (the scratch key references the caller's term; it is
// copied only when an entry is stored into the cache)
fieldTerm.term = term;
TermState cachedState;
if (useCache) {
cachedState = termsCache.get(fieldTerm);
if (cachedState != null) {
// Cache hit: restore the cached state, but defer the
// file seek until next() actually needs it
state.copy(cachedState);
seekPending = true;
bytesReader.term.copy(term);
return SeekStatus.FOUND;
}
} else {
cachedState = null;
}
boolean doSeek = true;
if (state.ord != -1) {
// we are positioned
final int cmp = termComp.compare(bytesReader.term, term);
if (cmp == 0) {
// already at the requested term
return SeekStatus.FOUND;
}
if (cmp < 0 &&
fieldIndexReader.nextIndexTerm(state.ord, indexResult) &&
termComp.compare(indexResult.term, term) > 0) {
// Optimization: requested term is within the
// same index block we are now in; skip seeking
// (but do scanning):
doSeek = false;
}
}
// Used only for assert:
final long startOrd;
if (doSeek) {
// Ask the index to find the biggest index term that's <=
// our text:
fieldIndexReader.getIndexOffset(term, indexResult);
in.seek(indexResult.offset);
seekPending = false;
// NOTE: the first next() after an index seek is
// wasteful, since it redundantly reads the same
// bytes into the buffer. We could avoid storing
// those bytes in the primary file, but then when
// scanning over an index term we'd have to
// special case it:
bytesReader.reset(indexResult.term);
state.ord = indexResult.position-1;
assert state.ord >= -1: "ord=" + state.ord + " pos=" + indexResult.position;
startOrd = indexResult.position;
} else {
startOrd = -1;
}
// Now scan:
while(next() != null) {
final int cmp = termComp.compare(bytesReader.term, term);
if (cmp == 0) {
// Only terms reached through a real index seek are cached
if (doSeek && useCache) {
// Store in cache
FieldAndTerm entryKey = new FieldAndTerm(fieldTerm);
cachedState = (TermState) state.clone();
// this is fp after current term
cachedState.filePointer = in.getFilePointer();
termsCache.put(entryKey, cachedState);
}
return SeekStatus.FOUND;
} else if (cmp > 0) {
return SeekStatus.NOT_FOUND;
}
// The purpose of the terms dict index is to seek
// the enum to the closest index term before the
// term we are looking for. So, we should never
// cross another index term (besides the first
// one) while we are scanning:
assert state.ord == startOrd || !fieldIndexReader.isIndexTerm(state.ord, state.docFreq, true): "state.ord=" + state.ord + " startOrd=" + startOrd + " ir.isIndexTerm=" + fieldIndexReader.isIndexTerm(state.ord, state.docFreq, true) + " state.docFreq=" + state.docFreq;
}
return SeekStatus.END;
}
/**
 * Positions this enum on the term with the given ordinal.
 *
 * <p>The terms index is used to jump to an index entry, from which the
 * enum scans forward with {@link #next} until the requested ord is
 * reached.
 *
 * @param ord ordinal of the term to seek to
 * @return {@code SeekStatus.END} if {@code ord} is not smaller than the
 *         number of terms (ord is then set to the last ordinal),
 *         otherwise {@code SeekStatus.FOUND}
 * @throws IOException if reading the terms dict fails
 */
@Override
public SeekStatus seek(long ord) throws IOException {
  // TODO: should we cache term lookup by ord as well...?

  final boolean beyondLastTerm = ord >= numTerms;
  if (beyondLastTerm) {
    state.ord = numTerms-1;
    return SeekStatus.END;
  }

  fieldIndexReader.getIndexOffset(ord, indexResult);
  in.seek(indexResult.offset);
  // A real seek supersedes any seek deferred by a cache hit
  seekPending = false;

  // NOTE: the first next() after an index seek is
  // wasteful, since it redundantly reads the same
  // bytes into the buffer
  bytesReader.reset(indexResult.term);
  state.ord = indexResult.position-1;
  assert state.ord >= -1: "ord=" + state.ord;

  // Scan forward from the index entry to the requested ord
  for (int remaining = (int) (ord - state.ord); remaining > 0; remaining--) {
    final BytesRef scanned = next();
    assert scanned != null;
  }

  // always found
  return SeekStatus.FOUND;
}
/**
 * Returns the current term. This is the bytes reader's own term
 * instance, not a copy.
 */
@Override
public BytesRef term() {
return bytesReader.term;
}
/**
 * Returns the ordinal held in the term state; it is -1 on a freshly
 * created enum.
 */
@Override
public long ord() {
return state.ord;
}
/**
 * Advances to the next term of this field.
 *
 * <p>If a previous cache hit in seek deferred the file seek, the input
 * is first moved to the file pointer recorded in the term state.
 *
 * @return the term now current, or {@code null} if the enum already
 *         stands on the last term of the field
 * @throws IOException if reading the terms dict fails
 */
@Override
public BytesRef next() throws IOException {
  if (seekPending) {
    seekPending = false;
    in.seek(state.filePointer);
  }

  final long lastOrd = numTerms-1;
  if (state.ord >= lastOrd) {
    return null;
  }

  bytesReader.read();
  state.docFreq = in.readVInt();

  // TODO: would be cleaner, but space-wasting, to
  // simply record a bit into each index entry as to
  // whether it's an index entry or not, rather than
  // re-compute that information... or, possibly store
  // a "how many terms until next index entry" in each
  // index entry, but that'd require some tricky
  // lookahead work when writing the index
  final boolean isIndexTerm = fieldIndexReader.isIndexTerm(1+state.ord, state.docFreq, false);
  postingsReader.readTerm(in, fieldInfo, state, isIndexTerm);

  state.ord++;
  return bytesReader.term;
}
/** Returns the doc freq held in the term state, as last read by next() or restored from the terms cache. */
@Override
public int docFreq() {
return state.docFreq;
}
/**
 * Asks the postings reader for a docs enum for the current term state.
 *
 * @param skipDocs passed through to the postings reader
 * @param reuse passed through to the postings reader
 * @return the enum produced by the postings reader; asserted to be non-null
 * @throws IOException if the postings reader fails
 */
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
DocsEnum docsEnum = postingsReader.docs(fieldInfo, state, skipDocs, reuse);
assert docsEnum != null;
return docsEnum;
}
/**
 * Asks the postings reader for a docs-and-positions enum for the current
 * term state.
 *
 * @param skipDocs passed through to the postings reader
 * @param reuse passed through to the postings reader
 * @return {@code null} if the field omits term freqs and positions,
 *         otherwise the enum produced by the postings reader
 * @throws IOException if the postings reader fails
 */
@Override
public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
  if (!fieldInfo.omitTermFreqAndPositions) {
    return postingsReader.docsAndPositions(fieldInfo, state, skipDocs, reuse);
  }
  // No positions were indexed for this field
  return null;
}
}
}
}