package org.apache.lucene.index;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.CloseableThreadLocal;
import org.apache.lucene.util.cache.Cache;
import org.apache.lucene.util.cache.SimpleLRUCache;
import com.nearinfinity.bloomfilter.BloomFilter;
/** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
* Directory. Pairs are accessed either by Term or by ordinal position in the
* set. */
final class TermInfosReader {
  /** Name of the boolean system property that enables reading a segment's bloom filter file. */
  private static final String COM_NEARINFINITY_LUCENE_BLOOMFILTER_READ = "com.nearinfinity.lucene.bloomfilter.read";

  private final Directory directory;
  private final String segment;
  private final FieldInfos fieldInfos;

  /** Holds one {@link ThreadResources} per calling thread; released in {@link #close()}. */
  private final CloseableThreadLocal<ThreadResources> threadResources = new CloseableThreadLocal<ThreadResources>();

  /** Enumerator over the segment's terms file; cloned for every caller and never handed out directly. */
  private final SegmentTermEnum origEnum;

  /** Number of terms, as reported by {@link #origEnum}. */
  private final long size;

  /** In-memory terms index, or null when the reader was created with indexDivisor == -1. */
  private TermInfosReaderIndex index;
  private int indexLength;

  /** origEnum.indexInterval * indexDivisor, or -1 when the terms index was not loaded. */
  private final int totalIndexInterval;

  /** Capacity of each thread's TermInfo LRU cache. */
  private final static int DEFAULT_CACHE_SIZE = 1024;

  /** Bloom filter consulted by {@link #get(Term)}; null when none was loaded. */
  private BloomFilter bloomFilter;

  /** Value of the {@link #COM_NEARINFINITY_LUCENE_BLOOMFILTER_READ} system property at construction time. */
  private final boolean bloomFilterEnabled = Boolean.getBoolean(COM_NEARINFINITY_LUCENE_BLOOMFILTER_READ);

  /**
   * Per-thread resources managed by ThreadLocal
   */
  private static final class ThreadResources {
    /** This thread's private clone of the terms enumerator. */
    SegmentTermEnum termEnum;

    /** LRU cache of recently looked-up terms; only ever touched by the owning thread. */
    Cache<Term,TermInfo> termInfoCache;
  }

  /**
   * Opens the terms dictionary of one segment and, optionally, its bloom filter
   * and terms index.
   *
   * @param dir directory holding the segment files
   * @param seg segment name, used as the file-name prefix
   * @param fis field infos of the segment
   * @param readBufferSize buffer size passed to {@code Directory.openInput}
   * @param indexDivisor -1 to skip loading the terms index, otherwise a value
   *        greater than 0 that is multiplied into the index interval
   * @throws IllegalArgumentException if indexDivisor is neither -1 nor greater than 0
   * @throws CorruptIndexException declared for callers; raised by the underlying readers
   * @throws IOException if any of the segment files cannot be opened or read
   */
  TermInfosReader(Directory dir, String seg, FieldInfos fis, int readBufferSize, int indexDivisor)
      throws CorruptIndexException, IOException {
    boolean success = false;

    if (indexDivisor < 1 && indexDivisor != -1) {
      throw new IllegalArgumentException("indexDivisor must be -1 (don't load terms index) or greater than 0: got " + indexDivisor);
    }

    try {
      directory = dir;
      segment = seg;
      fieldInfos = fis;

      origEnum = new SegmentTermEnum(directory.openInput(segment + "." + IndexFileNames.TERMS_EXTENSION,
          readBufferSize), fieldInfos, false);
      size = origEnum.size;

      // Read in the bloom filter here, if it exists.
      // NOTE(review): the cached filter is only tested for null and never assigned to
      // bloomFilter, so a cache hit leaves the bloom filter disabled for this reader.
      // This looks unintended, but the cache contents are not visible from this file;
      // confirm against TermInfosCache before wiring it in.
      BloomFilter filter = TermInfosCache.getFromCache(getRealDir(directory), segment);
      if (filter == null && bloomFilterEnabled) {
        final String bloomFileName = segment + "." + IndexFileNames.BLOOM_FILTER_EXTENSION;
        if (directory.fileExists(bloomFileName)) {
          final IndexInput bloomInput = directory.openInput(bloomFileName);
          try {
            // A leading long of -1 means "do not load"; presumably a marker written
            // when no usable filter was produced (confirm against TermInfosWriter).
            if (bloomInput.readLong() != -1L) {
              bloomInput.seek(0);
              BloomFilter loaded = new BloomFilter();
              loaded.read(bloomInput);
              bloomFilter = loaded;
            }
          } finally {
            // Always release the input, including on the -1 marker and on read
            // failures, where it was previously left open.
            bloomInput.close();
          }
        }
      }

      if (indexDivisor != -1) {
        // Load terms index
        totalIndexInterval = origEnum.indexInterval * indexDivisor;
        final SegmentTermEnum indexEnum = new SegmentTermEnum(directory.openInput(segment + "." + IndexFileNames.TERMS_INDEX_EXTENSION,
            readBufferSize), fieldInfos, true);
        try {
          index = new TermInfosReaderIndex(directory,segment);
          index.build(indexEnum, indexDivisor, (int) dir.fileLength(segment + "." + IndexFileNames.TERMS_INDEX_EXTENSION));
          indexLength = index.length();
        } finally {
          indexEnum.close();
        }
      } else {
        // Do not load terms index:
        totalIndexInterval = -1;
        index = null;
      }
      success = true;
    } finally {
      // With lock-less commits, it's entirely possible (and
      // fine) to hit a FileNotFound exception above. In
      // this case, we want to explicitly close any subset
      // of things that were opened so that we don't have to
      // wait for a GC to do so.
      if (!success) {
        close();
      }
    }
  }

  /**
   * Unwraps nested {@link CompoundFileReader}s until a directory that is not a
   * compound-file reader is reached, and returns it.
   */
  private Directory getRealDir(Directory directory) {
    if (directory instanceof CompoundFileReader) {
      CompoundFileReader compoundFileReader = (CompoundFileReader) directory;
      return getRealDir(compoundFileReader.getDirectory());
    }
    return directory;
  }

  /** Returns the skip interval recorded in the terms file enumerator. */
  public int getSkipInterval() {
    return origEnum.skipInterval;
  }

  /** Returns the maximum number of skip levels recorded in the terms file enumerator. */
  public int getMaxSkipLevels() {
    return origEnum.maxSkipLevels;
  }

  /**
   * Closes the terms enumerator (if it was opened) and releases the per-thread
   * resources. The per-thread resources are released even if closing the
   * enumerator fails.
   *
   * @throws IOException if closing the underlying enumerator fails
   */
  final void close() throws IOException {
    try {
      if (origEnum != null) {
        origEnum.close();
      }
    } finally {
      threadResources.close();
    }
  }

  /** Returns the number of term/value pairs in the set. */
  final long size() {
    return size;
  }

  /**
   * Returns the calling thread's resources, creating them on first use: a fresh
   * clone of the terms enumerator plus an empty LRU cache.
   */
  private ThreadResources getThreadResources() {
    ThreadResources resources = threadResources.get();
    if (resources == null) {
      resources = new ThreadResources();
      resources.termEnum = terms();
      // Cache does not have to be thread-safe, it is only used by one thread at the same time
      resources.termInfoCache = new SimpleLRUCache<Term,TermInfo>(DEFAULT_CACHE_SIZE);
      threadResources.set(resources);
    }
    return resources;
  }

  /** Returns the offset of the greatest index entry which is less than or equal to term.*/
  private final int getIndexOffset(Term term) {
    return index.getIndexOffset(term);
  }

  /** Positions the enumerator at the given terms-index entry. */
  private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
    index.seekEnum(enumerator, indexOffset, totalIndexInterval);
  }

  /**
   * Returns the TermInfo for a Term in the set, or null.
   *
   * <p>When a bloom filter is loaded it is consulted first, and a negative
   * answer returns null without touching the terms dictionary.
   *
   * @throws IOException if reading the terms dictionary fails
   */
  TermInfo get(Term term) throws IOException {
    if (bloomFilter != null) {
      byte[] key = getTermKey(term);
      if (!bloomFilter.testBytes(key, 0, key.length)) {
        return null;
      }
    }
    return get(term, true);
  }

  /**
   * Builds the bloom filter key for a term: the 4-byte field number followed by
   * the UTF-8 bytes of the term text, cut off after
   * {@code TermInfosWriter.BLOOM_BUFFER_SIZE} text bytes.
   */
  private byte[] getTermKey(Term term) {
    int fieldNumber = fieldInfos.fieldNumber(term.field);
    try {
      byte[] bs = term.text.getBytes("UTF-8");
      int length = bs.length;
      if (length > TermInfosWriter.BLOOM_BUFFER_SIZE) {
        length = TermInfosWriter.BLOOM_BUFFER_SIZE;
      }
      // The buffer is sized exactly, so array() returns precisely the key bytes.
      ByteBuffer buffer = ByteBuffer.allocate(length + 4);
      return buffer.putInt(fieldNumber).put(bs,0,length).array();
    } catch (UnsupportedEncodingException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Returns the TermInfo for a Term in the set, or null.
   *
   * <p>As a side effect the calling thread's enumerator is left positioned by
   * the scan, which {@link #terms(Term)} relies on.
   *
   * @param useCache whether to consult and populate the per-thread LRU cache
   * @throws IllegalStateException if the terms index was not loaded
   * @throws IOException if reading the terms dictionary fails
   */
  private TermInfo get(Term term, boolean useCache) throws IOException {
    if (size == 0) return null;

    ensureIndexIsRead();

    TermInfo ti;
    ThreadResources resources = getThreadResources();
    Cache<Term,TermInfo> cache = null;

    if (useCache) {
      cache = resources.termInfoCache;
      // check the cache first if the term was recently looked up
      ti = cache.get(term);
      if (ti != null) {
        return ti;
      }
    }

    // optimize sequential access: first try scanning cached enum w/o seeking
    SegmentTermEnum enumerator = resources.termEnum;
    if (enumerator.term() != null // term is at or past current
        && ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0)
            || term.compareTo(enumerator.term()) >= 0)) {
      int enumOffset = (int)(enumerator.position/totalIndexInterval)+1;
      if (indexLength == enumOffset // but before end of block
          || index.compareTo(term,enumOffset) < 0) {
        // no need to seek
        int numScans = enumerator.scanTo(term);
        if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
          ti = enumerator.termInfo();
          if (cache != null && numScans > 1) {
            // we only want to put this TermInfo into the cache if
            // scanEnum skipped more than one dictionary entry.
            // This prevents RangeQueries or WildcardQueries to
            // wipe out the cache when they iterate over a large numbers
            // of terms in order
            cache.put(term, ti);
          }
        } else {
          ti = null;
        }
        return ti;
      }
    }

    // random-access: must seek
    seekEnum(enumerator, getIndexOffset(term));
    enumerator.scanTo(term);
    if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
      ti = enumerator.termInfo();
      if (cache != null) {
        cache.put(term, ti);
      }
    } else {
      ti = null;
    }
    return ti;
  }

  /**
   * Fails fast when the reader was created without a terms index.
   *
   * @throws IllegalStateException if the terms index was not loaded
   */
  private void ensureIndexIsRead() {
    if (index == null) {
      throw new IllegalStateException("terms index was not loaded when this reader was created");
    }
  }

  /**
   * Returns the position of a Term in the set or -1.
   *
   * @throws IllegalStateException if the terms index was not loaded
   * @throws IOException if reading the terms dictionary fails
   */
  final long getPosition(Term term) throws IOException {
    if (size == 0) return -1;

    ensureIndexIsRead();
    int indexOffset = getIndexOffset(term);

    SegmentTermEnum enumerator = getThreadResources().termEnum;
    seekEnum(enumerator, indexOffset);

    // Advance until the enumerator reaches or passes the term. The null checks
    // mirror the guards in get(Term, boolean), so an enumerator with no current
    // term yields -1 rather than a NullPointerException.
    while (enumerator.term() != null
        && term.compareTo(enumerator.term()) > 0
        && enumerator.next()) {
      // scanning only; the loop condition does the work
    }

    Term current = enumerator.term();
    if (current != null && term.compareTo(current) == 0) {
      return enumerator.position;
    }
    return -1;
  }

  /** Returns an enumeration of all the Terms and TermInfos in the set. */
  public SegmentTermEnum terms() {
    return (SegmentTermEnum)origEnum.clone();
  }

  /** Returns an enumeration of terms starting at or after the named term. */
  public SegmentTermEnum terms(Term term) throws IOException {
    // don't use the cache in this call because we want to reposition the
    // enumeration
    get(term, false);
    return (SegmentTermEnum)getThreadResources().termEnum.clone();
  }
}