package org.apache.lucene.search.cache;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Comparator;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.OrdTermState;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache.DocTermsIndex;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.packed.GrowableWriter;
import org.apache.lucene.util.packed.PackedInts;
public class DocTermsIndexCreator extends EntryCreatorWithOptions<DocTermsIndex>
{
public static final int FASTER_BUT_MORE_RAM = 2;
public String field;
public DocTermsIndexCreator( String field )
{
super( FASTER_BUT_MORE_RAM ); // By default turn on FASTER_BUT_MORE_RAM
if( field == null ) {
throw new IllegalArgumentException( "field can not be null" );
}
this.field = field;
}
public DocTermsIndexCreator( String field, int flags )
{
super( flags );
if( field == null ) {
throw new IllegalArgumentException( "field can not be null" );
}
this.field = field;
}
@Override
public EntryKey getCacheKey() {
return new SimpleEntryKey( DocTermsIndexCreator.class, field );
}
@Override
public DocTermsIndex create(IndexReader reader) throws IOException
{
String field = StringHelper.intern(this.field); // TODO?? necessary?
Terms terms = MultiFields.getTerms(reader, field);
final boolean fasterButMoreRAM = hasOption(FASTER_BUT_MORE_RAM);
final PagedBytes bytes = new PagedBytes(15);
int startBytesBPV;
int startTermsBPV;
int startNumUniqueTerms;
int maxDoc = reader.maxDoc();
final int termCountHardLimit;
if (maxDoc == Integer.MAX_VALUE) {
termCountHardLimit = Integer.MAX_VALUE;
} else {
termCountHardLimit = maxDoc+1;
}
if (terms != null) {
// Try for coarse estimate for number of bits; this
// should be an underestimate most of the time, which
// is fine -- GrowableWriter will reallocate as needed
long numUniqueTerms = 0;
try {
numUniqueTerms = terms.getUniqueTermCount();
} catch (UnsupportedOperationException uoe) {
numUniqueTerms = -1;
}
if (numUniqueTerms != -1) {
if (numUniqueTerms > termCountHardLimit) {
// app is misusing the API (there is more than
// one term per doc); in this case we make best
// effort to load what we can (see LUCENE-2142)
numUniqueTerms = termCountHardLimit;
}
startBytesBPV = PackedInts.bitsRequired(numUniqueTerms*4);
startTermsBPV = PackedInts.bitsRequired(numUniqueTerms);
startNumUniqueTerms = (int) numUniqueTerms;
} else {
startBytesBPV = 1;
startTermsBPV = 1;
startNumUniqueTerms = 1;
}
} else {
startBytesBPV = 1;
startTermsBPV = 1;
startNumUniqueTerms = 1;
}
GrowableWriter termOrdToBytesOffset = new GrowableWriter(startBytesBPV, 1+startNumUniqueTerms, fasterButMoreRAM);
final GrowableWriter docToTermOrd = new GrowableWriter(startTermsBPV, reader.maxDoc(), fasterButMoreRAM);
// 0 is reserved for "unset"
bytes.copyUsingLengthPrefix(new BytesRef());
int termOrd = 1;
if (terms != null) {
final TermsEnum termsEnum = terms.iterator();
DocsEnum docs = null;
while(true) {
final BytesRef term = termsEnum.next();
if (term == null) {
break;
}
if (termOrd >= termCountHardLimit) {
break;
}
if (termOrd == termOrdToBytesOffset.size()) {
// NOTE: this code only runs if the incoming
// reader impl doesn't implement
// getUniqueTermCount (which should be uncommon)
termOrdToBytesOffset = termOrdToBytesOffset.resize(ArrayUtil.oversize(1+termOrd, 1));
}
termOrdToBytesOffset.set(termOrd, bytes.copyUsingLengthPrefix(term));
docs = termsEnum.docs(null, docs);
while (true) {
final int docID = docs.nextDoc();
if (docID == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
docToTermOrd.set(docID, termOrd);
}
termOrd++;
}
if (termOrdToBytesOffset.size() > termOrd) {
termOrdToBytesOffset = termOrdToBytesOffset.resize(termOrd);
}
}
// maybe an int-only impl?
return new DocTermsIndexImpl(bytes.freeze(true), termOrdToBytesOffset.getMutable(), docToTermOrd.getMutable(), termOrd);
}
@Override
public DocTermsIndex validate(DocTermsIndex entry, IndexReader reader) throws IOException {
// TODO? nothing? perhaps subsequent call with FASTER_BUT_MORE_RAM?
return entry;
}
//-----------------------------------------------------------------------------
//-----------------------------------------------------------------------------
public static class DocTermsIndexImpl extends DocTermsIndex {
private final PagedBytes.Reader bytes;
private final PackedInts.Reader termOrdToBytesOffset;
private final PackedInts.Reader docToTermOrd;
private final int numOrd;
public DocTermsIndexImpl(PagedBytes.Reader bytes, PackedInts.Reader termOrdToBytesOffset, PackedInts.Reader docToTermOrd, int numOrd) {
this.bytes = bytes;
this.docToTermOrd = docToTermOrd;
this.termOrdToBytesOffset = termOrdToBytesOffset;
this.numOrd = numOrd;
}
@Override
public PackedInts.Reader getDocToOrd() {
return docToTermOrd;
}
@Override
public int numOrd() {
return numOrd;
}
@Override
public int getOrd(int docID) {
return (int) docToTermOrd.get(docID);
}
@Override
public int size() {
return docToTermOrd.size();
}
@Override
public BytesRef lookup(int ord, BytesRef ret) {
return bytes.fill(ret, termOrdToBytesOffset.get(ord));
}
@Override
public TermsEnum getTermsEnum() {
return this.new DocTermsIndexEnum();
}
class DocTermsIndexEnum extends TermsEnum {
int currentOrd;
int currentBlockNumber;
int end; // end position in the current block
final byte[][] blocks;
final int[] blockEnds;
final BytesRef term = new BytesRef();
public DocTermsIndexEnum() {
currentOrd = 0;
currentBlockNumber = 0;
blocks = bytes.getBlocks();
blockEnds = bytes.getBlockEnds();
currentBlockNumber = bytes.fillAndGetIndex(term, termOrdToBytesOffset.get(0));
end = blockEnds[currentBlockNumber];
}
@Override
public SeekStatus seek(BytesRef text, boolean useCache) throws IOException {
int low = 1;
int high = numOrd-1;
while (low <= high) {
int mid = (low + high) >>> 1;
seek(mid);
int cmp = term.compareTo(text);
if (cmp < 0)
low = mid + 1;
else if (cmp > 0)
high = mid - 1;
else
return SeekStatus.FOUND; // key found
}
if (low == numOrd) {
return SeekStatus.END;
} else {
seek(low);
return SeekStatus.NOT_FOUND;
}
}
@Override
public SeekStatus seek(long ord) throws IOException {
assert(ord >= 0 && ord <= numOrd);
// TODO: if gap is small, could iterate from current position? Or let user decide that?
currentBlockNumber = bytes.fillAndGetIndex(term, termOrdToBytesOffset.get((int)ord));
end = blockEnds[currentBlockNumber];
currentOrd = (int)ord;
return SeekStatus.FOUND;
}
@Override
public BytesRef next() throws IOException {
int start = term.offset + term.length;
if (start >= end) {
// switch byte blocks
if (currentBlockNumber +1 >= blocks.length) {
return null;
}
currentBlockNumber++;
term.bytes = blocks[currentBlockNumber];
end = blockEnds[currentBlockNumber];
start = 0;
if (end<=0) return null; // special case of empty last array
}
currentOrd++;
byte[] block = term.bytes;
if ((block[start] & 128) == 0) {
term.length = block[start];
term.offset = start+1;
} else {
term.length = (((block[start] & 0x7f)) << 8) | (block[1+start] & 0xff);
term.offset = start+2;
}
return term;
}
@Override
public BytesRef term() throws IOException {
return term;
}
@Override
public long ord() throws IOException {
return currentOrd;
}
@Override
public int docFreq() {
throw new UnsupportedOperationException();
}
@Override
public long totalTermFreq() {
return -1;
}
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public Comparator<BytesRef> getComparator() throws IOException {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
public void seek(BytesRef term, TermState state) throws IOException {
assert state != null && state instanceof OrdTermState;
this.seek(((OrdTermState)state).ord);
}
@Override
public TermState termState() throws IOException {
OrdTermState state = new OrdTermState();
state.ord = currentOrd;
return state;
}
}
}
}