package org.apache.lucene.index.codecs.preflex; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.store.Directory; import org.apache.lucene.util.CloseableThreadLocal; import org.apache.lucene.util.DoubleBarrelLRUCache; /** This stores a monotonically increasing set of <Term, TermInfo> pairs in a * Directory. Pairs are accessed either by Term or by ordinal position in the * set. * @deprecated This class has been replaced by * FormatPostingsTermsDictReader, except for reading old segments. 
* @lucene.experimental */
@Deprecated
public final class TermInfosReader {
  private final Directory directory;
  private final String segment;
  private final FieldInfos fieldInfos;

  /** Holds one {@link ThreadResources} (and thus one private enum clone) per thread. */
  private final CloseableThreadLocal<ThreadResources> threadResources = new CloseableThreadLocal<ThreadResources>();

  /** Enum over the terms file; cloned by {@link #terms()} and closed by {@link #close()}. */
  private final SegmentTermEnum origEnum;

  /** Number of terms, as reported by {@link #origEnum}. */
  private final long size;

  // The three arrays below are parallel and hold the in-memory terms index.
  // All three are null when the reader was created with indexDivisor == -1.
  private final Term[] indexTerms;
  private final TermInfo[] indexInfos;
  private final long[] indexPointers;

  /** origEnum.indexInterval * indexDivisor, or -1 when the terms index was not loaded. */
  private final int totalIndexInterval;

  private final static int DEFAULT_CACHE_SIZE = 1024;

  /**
   * Just adds term's ord to TermInfo.
   *
   * The ordinal is kept as a long so that it is never truncated; it is only
   * narrowed after being divided by the index interval.
   */
  private final static class TermInfoAndOrd extends TermInfo {
    final long termOrd;

    public TermInfoAndOrd(TermInfo ti, long termOrd) {
      super(ti);
      this.termOrd = termOrd;
    }
  }

  /** Cache key wrapping a {@link Term}; equality and hashing delegate to the wrapped term. */
  private static class CloneableTerm extends DoubleBarrelLRUCache.CloneableKey {
    private final Term term;

    public CloneableTerm(Term t) {
      this.term = t;
    }

    /**
     * Two keys are equal when their wrapped terms are equal.
     *
     * The wrapped term must be compared against the other key's wrapped term,
     * not against the key object itself, so that equals() agrees with
     * hashCode() and cache lookups can match.
     */
    @Override
    public boolean equals(Object other) {
      if (this == other) {
        return true;
      }
      if (!(other instanceof CloneableTerm)) {
        return false;
      }
      return term.equals(((CloneableTerm) other).term);
    }

    @Override
    public int hashCode() {
      return term.hashCode();
    }

    @Override
    public Object clone() {
      return new CloneableTerm(term);
    }
  }

  /** Cache from term to its TermInfo and ordinal, consulted by the lookup methods below. */
  private final DoubleBarrelLRUCache<CloneableTerm,TermInfoAndOrd> termsCache = new DoubleBarrelLRUCache<CloneableTerm,TermInfoAndOrd>(DEFAULT_CACHE_SIZE);

  /**
   * Per-thread resources managed by ThreadLocal
   */
  private static final class ThreadResources {
    SegmentTermEnum termEnum;
  }

  /**
   * Opens the terms file of a segment and, unless disabled, loads its terms index.
   *
   * @param dir directory the segment files are opened from
   * @param seg segment name used to build the file names
   * @param fis field infos handed to the term enums
   * @param readBufferSize buffer size passed to {@code Directory.openInput}
   * @param indexDivisor -1 to skip loading the terms index, otherwise a value
   *        &gt;= 1; only every indexDivisor'th index entry is kept in memory
   * @throws IllegalArgumentException if indexDivisor is neither -1 nor &gt;= 1
   */
  TermInfosReader(Directory dir, String seg, FieldInfos fis, int readBufferSize, int indexDivisor)
       throws CorruptIndexException, IOException {
    boolean success = false;

    if (indexDivisor < 1 && indexDivisor != -1) {
      throw new IllegalArgumentException("indexDivisor must be -1 (don't load terms index) or greater than 0: got " + indexDivisor);
    }

    try {
      directory = dir;
      segment = seg;
      fieldInfos = fis;

      origEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, "", PreFlexCodec.TERMS_EXTENSION),
          readBufferSize), fieldInfos, false);
      size = origEnum.size;

      if (indexDivisor != -1) {
        // Load terms index
        totalIndexInterval = origEnum.indexInterval * indexDivisor;
        final SegmentTermEnum indexEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, "", PreFlexCodec.TERMS_INDEX_EXTENSION),
            readBufferSize), fieldInfos, true);

        try {
          // Number of entries kept after sub-sampling by indexDivisor
          int indexSize = 1+((int)indexEnum.size-1)/indexDivisor;

          indexTerms = new Term[indexSize];
          indexInfos = new TermInfo[indexSize];
          indexPointers = new long[indexSize];

          for (int i=0;indexEnum.next(); i++) {
            indexTerms[i] = indexEnum.term();
            assert indexTerms[i] != null;
            assert indexTerms[i].text() != null;
            assert indexTerms[i].field() != null;
            indexInfos[i] = indexEnum.termInfo();
            indexPointers[i] = indexEnum.indexPointer;

            // Skip the entries that the divisor drops
            for (int j = 1; j < indexDivisor; j++)
              if (!indexEnum.next())
                break;
          }
        } finally {
          indexEnum.close();
        }
      } else {
        // Do not load terms index:
        totalIndexInterval = -1;
        indexTerms = null;
        indexInfos = null;
        indexPointers = null;
      }
      success = true;
    } finally {
      // With lock-less commits, it's entirely possible (and
      // fine) to hit a FileNotFound exception above. In
      // this case, we want to explicitly close any subset
      // of things that were opened so that we don't have to
      // wait for a GC to do so.
      if (!success) {
        close();
      }
    }
  }

  /** Returns the skip interval recorded by the underlying term enum. */
  public int getSkipInterval() {
    return origEnum.skipInterval;
  }

  /** Returns the maximum number of skip levels recorded by the underlying term enum. */
  public int getMaxSkipLevels() {
    return origEnum.maxSkipLevels;
  }

  /** Closes the underlying term enum (if it was opened) and the per-thread resources. */
  void close() throws IOException {
    if (origEnum != null)
      origEnum.close();
    threadResources.close();
  }

  /** Returns the number of term/value pairs in the set. */
  long size() {
    return size;
  }

  /** Returns the calling thread's resources, creating them (with a fresh enum clone) on first use. */
  private ThreadResources getThreadResources() {
    ThreadResources resources = threadResources.get();
    if (resources == null) {
      resources = new ThreadResources();
      resources.termEnum = terms();
      threadResources.set(resources);
    }
    return resources;
  }

  /**
   * Returns the offset of the greatest index entry which is less than or equal to term.
   * The result is -1 if term sorts before every index entry.
   */
  private int getIndexOffset(Term term) {
    int lo = 0;                                   // binary search indexTerms[]
    int hi = indexTerms.length - 1;

    while (hi >= lo) {
      int mid = (lo + hi) >>> 1;
      assert indexTerms[mid] != null : "indexTerms = " + indexTerms.length + " mid=" + mid;
      int delta = term.compareToUTF16(indexTerms[mid]);
      if (delta < 0)
        hi = mid - 1;
      else if (delta > 0)
        lo = mid + 1;
      else
        return mid;
    }
    return hi;
  }

  /** Seeks the enumerator to the index entry at indexOffset. */
  private void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
    enumerator.seek(indexPointers[indexOffset],
                    ((long) indexOffset * totalIndexInterval) - 1,
                    indexTerms[indexOffset], indexInfos[indexOffset]);
  }

  /** Returns the TermInfo for a Term in the set, or null. */
  TermInfo get(Term term) throws IOException {
    return get(term, false);
  }

  /**
   * Returns the TermInfo for a Term in the set, or null.
   *
   * @param mustSeekEnum when true, the calling thread's enum is always
   *        positioned via seekEnum even if the term is found in the cache
   * @throws IllegalStateException if the set is non-empty and the terms index was not loaded
   */
  private TermInfo get(Term term, boolean mustSeekEnum) throws IOException {
    if (size == 0) return null;

    ensureIndexIsRead();

    TermInfoAndOrd tiOrd = termsCache.get(new CloneableTerm(term));
    ThreadResources resources = getThreadResources();

    if (!mustSeekEnum && tiOrd != null) {
      return tiOrd;
    }

    return seekEnum(resources.termEnum, term, tiOrd);
  }

  /** Looks the term up in the cache and then positions the enumerator on it; returns its TermInfo, or null. */
  TermInfo seekEnum(SegmentTermEnum enumerator, Term term) throws IOException {
    return seekEnum(enumerator, term, termsCache.get(new CloneableTerm(term)));
  }

  /**
   * Positions the enumerator at term and returns the term's TermInfo, or null
   * if the term is not in the set.
   *
   * @param tiOrd cached entry for term, or null if there is none; when present
   *        its ordinal is used to pick the index block instead of a binary search
   */
  TermInfo seekEnum(SegmentTermEnum enumerator, Term term, TermInfoAndOrd tiOrd) throws IOException {
    if (size == 0) {
      return null;
    }

    // optimize sequential access: first try scanning cached enum w/o seeking
    if (enumerator.term() != null                 // term is at or past current
        && ((enumerator.prev() != null && term.compareToUTF16(enumerator.prev())> 0)
            || term.compareToUTF16(enumerator.term()) >= 0)) {
      int enumOffset = (int)(enumerator.position/totalIndexInterval)+1;
      if (indexTerms.length == enumOffset         // but before end of block
          || term.compareToUTF16(indexTerms[enumOffset]) < 0) {
        // no need to seek

        final TermInfo ti;
        int numScans = enumerator.scanTo(term);
        if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) {
          ti = enumerator.termInfo();
          if (numScans > 1) {
            // we only want to put this TermInfo into the cache if
            // scanEnum skipped more than one dictionary entry.
            // This prevents RangeQueries or WildcardQueries to
            // wipe out the cache when they iterate over a large numbers
            // of terms in order
            if (tiOrd == null) {
              termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, enumerator.position));
            } else {
              assert sameTermInfo(ti, tiOrd, enumerator);
              assert enumerator.position == tiOrd.termOrd;
            }
          }
        } else {
          ti = null;
        }

        return ti;
      }
    }

    // random-access: must seek
    final int indexPos;
    if (tiOrd != null) {
      // Divide as long first so a large ordinal is not truncated before the division
      indexPos = (int) (tiOrd.termOrd / totalIndexInterval);
    } else {
      // Must do binary search:
      indexPos = getIndexOffset(term);
    }

    seekEnum(enumerator, indexPos);
    enumerator.scanTo(term);
    final TermInfo ti;
    if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) {
      ti = enumerator.termInfo();
      if (tiOrd == null) {
        termsCache.put(new CloneableTerm(term), new TermInfoAndOrd(ti, enumerator.position));
      } else {
        assert sameTermInfo(ti, tiOrd, enumerator);
        assert enumerator.position == tiOrd.termOrd;
      }
    } else {
      ti = null;
    }
    return ti;
  }

  // called only from asserts
  private boolean sameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator) {
    if (ti1.docFreq != ti2.docFreq) {
      return false;
    }
    if (ti1.freqPointer != ti2.freqPointer) {
      return false;
    }
    if (ti1.proxPointer != ti2.proxPointer) {
      return false;
    }
    // skipOffset is only valid when docFreq >= skipInterval:
    if (ti1.docFreq >= enumerator.skipInterval &&
        ti1.skipOffset != ti2.skipOffset) {
      return false;
    }
    return true;
  }

  /** Throws IllegalStateException if the terms index was not loaded at construction time. */
  private void ensureIndexIsRead() {
    if (indexTerms == null) {
      throw new IllegalStateException("terms index was not loaded when this reader was created");
    }
  }

  /** Returns the position of a Term in the set or -1. */
  long getPosition(Term term) throws IOException {
    if (size == 0) return -1;

    ensureIndexIsRead();
    int indexOffset = getIndexOffset(term);

    SegmentTermEnum enumerator = getThreadResources().termEnum;
    seekEnum(enumerator, indexOffset);

    // Advance until the enum is at or past term, or is exhausted
    while(term.compareToUTF16(enumerator.term()) > 0 && enumerator.next()) {}

    if (term.compareToUTF16(enumerator.term()) == 0)
      return enumerator.position;
    else
      return -1;
  }

  /** Returns an enumeration of all the Terms and TermInfos in the set. */
  public SegmentTermEnum terms() {
    return (SegmentTermEnum)origEnum.clone();
  }

  /** Returns an enumeration of terms starting at or after the named term. */
  public SegmentTermEnum terms(Term term) throws IOException {
    // Position this thread's enum on term, then hand out a clone of it
    get(term, true);
    return (SegmentTermEnum)getThreadResources().termEnum.clone();
  }
}