package org.apache.lucene.codecs.blockterms;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.packed.MonotonicBlockPackedReader;

import java.util.HashMap;
import java.util.Comparator;
import java.io.IOException;

import org.apache.lucene.index.IndexFileNames;

/**
 * TermsIndexReader for simple every Nth terms indexes.
 *
 * @see FixedGapTermsIndexWriter
 * @lucene.experimental
 */
public class FixedGapTermsIndexReader extends TermsIndexReaderBase {

  // NOTE: long is overkill here, but we use this in a
  // number of places to multiply out the actual ord, and we
  // will overflow int during those multiplies.  So to avoid
  // having to upgrade each multiple to long in multiple
  // places (error prone), we use long here:
  private final long indexInterval;

  private final int packedIntsVersion;
  private final int blocksize;

  private final Comparator<BytesRef> termComp;

  private final static int PAGED_BYTES_BITS = 15;

  // all fields share this single logical byte[]
  private final PagedBytes termBytes = new PagedBytes(PAGED_BYTES_BITS);
  private PagedBytes.Reader termBytesReader;

  final HashMap<FieldInfo,FieldIndexData> fields = new HashMap<>();

  // start of the field info data
  private long dirOffset;
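  /**
   * Opens the terms index file for {@code segment}, checks the codec header,
   * reads the index interval and packed-ints settings, then loads each field's
   * index data (see {@link FieldIndexData}) fully into RAM.  The underlying
   * file is closed before the constructor returns.
   */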
  public FixedGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment,
                                  Comparator<BytesRef> termComp, String segmentSuffix, IOContext context)
    throws IOException {

    this.termComp = termComp;

    final IndexInput in = dir.openInput(IndexFileNames.segmentFileName(segment, segmentSuffix, FixedGapTermsIndexWriter.TERMS_INDEX_EXTENSION), context);

    boolean success = false;

    try {

      readHeader(in);

      indexInterval = in.readVInt();
      if (indexInterval < 1) {
        throw new CorruptIndexException("invalid indexInterval: " + indexInterval + " (resource=" + in + ")");
      }
      packedIntsVersion = in.readVInt();
      blocksize = in.readVInt();

      seekDir(in, dirOffset);

      // Read directory
      final int numFields = in.readVInt();
      if (numFields < 0) {
        throw new CorruptIndexException("invalid numFields: " + numFields + " (resource=" + in + ")");
      }
      //System.out.println("FGR: init seg=" + segment + " div=" + indexDivisor + " nF=" + numFields);

      for(int i=0;i<numFields;i++) {
        final int field = in.readVInt();
        final long numIndexTerms = in.readVInt(); // TODO: change this to a vLong if we fix writer to support > 2B index terms
        if (numIndexTerms < 0) {
          throw new CorruptIndexException("invalid numIndexTerms: " + numIndexTerms + " (resource=" + in + ")");
        }
        final long termsStart = in.readVLong();
        final long indexStart = in.readVLong();
        final long packedIndexStart = in.readVLong();
        final long packedOffsetsStart = in.readVLong();
        if (packedIndexStart < indexStart) {
          throw new CorruptIndexException("invalid packedIndexStart: " + packedIndexStart + " indexStart: " + indexStart + " numIndexTerms: " + numIndexTerms + " (resource=" + in + ")");
        }
        final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
        FieldIndexData previous = fields.put(fieldInfo, new FieldIndexData(in, indexStart, termsStart, packedIndexStart, packedOffsetsStart, numIndexTerms));
        if (previous != null) {
          throw new CorruptIndexException("duplicate field: " + fieldInfo.name + " (resource=" + in + ")");
        }
      }
      success = true;
    } finally {
      if (success) {
        IOUtils.close(in);
      } else {
        IOUtils.closeWhileHandlingException(in);
      }
      termBytesReader = termBytes.freeze(true);
    }
  }

  private void readHeader(IndexInput input) throws IOException {
    CodecUtil.checkHeader(input, FixedGapTermsIndexWriter.CODEC_NAME,
                          FixedGapTermsIndexWriter.VERSION_CURRENT, FixedGapTermsIndexWriter.VERSION_CURRENT);
  }

  private class IndexEnum extends FieldIndexEnum {
    private final FieldIndexData fieldIndex;
    private final BytesRef term = new BytesRef();
    private long ord;

    public IndexEnum(FieldIndexData fieldIndex) {
      this.fieldIndex = fieldIndex;
    }

    @Override
    public BytesRef term() {
      return term;
    }
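    /**
     * Binary-searches the in-memory index terms for {@code target}.  On return,
     * {@link #term()} is the largest index term that is less than or equal to
     * {@code target} (or the first index term if {@code target} sorts before
     * all of them), {@link #ord()} is that term's ordinal in the main terms
     * dict, and the returned value is the corresponding file pointer into the
     * terms dict.
     */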
    @Override
    public long seek(BytesRef target) {
      long lo = 0;          // binary search
      long hi = fieldIndex.numIndexTerms - 1;

      while (hi >= lo) {
        long mid = (lo + hi) >>> 1;

        final long offset = fieldIndex.termOffsets.get(mid);
        final int length = (int) (fieldIndex.termOffsets.get(1+mid) - offset);
        termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);

        int delta = termComp.compare(target, term);
        if (delta < 0) {
          hi = mid - 1;
        } else if (delta > 0) {
          lo = mid + 1;
        } else {
          assert mid >= 0;
          ord = mid*indexInterval;
          return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(mid);
        }
      }

      if (hi < 0) {
        assert hi == -1;
        hi = 0;
      }

      final long offset = fieldIndex.termOffsets.get(hi);
      final int length = (int) (fieldIndex.termOffsets.get(1+hi) - offset);
      termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);

      ord = hi*indexInterval;
      return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(hi);
    }

    @Override
    public long next() {
      final long idx = 1 + (ord / indexInterval);
      if (idx >= fieldIndex.numIndexTerms) {
        return -1;
      }
      ord += indexInterval;

      final long offset = fieldIndex.termOffsets.get(idx);
      final int length = (int) (fieldIndex.termOffsets.get(1+idx) - offset);
      termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
      return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
    }

    @Override
    public long ord() {
      return ord;
    }

    @Override
    public long seek(long ord) {
      long idx = ord / indexInterval;
      // caller must ensure ord is in bounds
      assert idx < fieldIndex.numIndexTerms;
      final long offset = fieldIndex.termOffsets.get(idx);
      final int length = (int) (fieldIndex.termOffsets.get(1+idx) - offset);
      termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
      this.ord = idx * indexInterval;
      return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
    }
  }

  @Override
  public boolean supportsOrd() {
    return true;
  }

  private final class FieldIndexData {
    // where this field's terms begin in the packed byte[] data
    final long termBytesStart;

    // offset into index termBytes
    final MonotonicBlockPackedReader termOffsets;

    // index pointers into main terms dict
    final MonotonicBlockPackedReader termsDictOffsets;

    final long numIndexTerms;
    final long termsStart;

    public FieldIndexData(IndexInput in, long indexStart, long termsStart,
                          long packedIndexStart, long packedOffsetsStart, long numIndexTerms) throws IOException {

      this.termsStart = termsStart;
      termBytesStart = termBytes.getPointer();

      IndexInput clone = in.clone();
      clone.seek(indexStart);

      this.numIndexTerms = numIndexTerms;
      assert this.numIndexTerms > 0: "numIndexTerms=" + numIndexTerms;

      // slurp in the images from disk:
      try {
        final long numTermBytes = packedIndexStart - indexStart;
        termBytes.copy(clone, numTermBytes);

        // records offsets into main terms dict file
        termsDictOffsets = new MonotonicBlockPackedReader(clone, packedIntsVersion, blocksize, numIndexTerms, false);

        // records offsets into byte[] term data
        termOffsets = new MonotonicBlockPackedReader(clone, packedIntsVersion, blocksize, 1+numIndexTerms, false);
      } finally {
        clone.close();
      }
    }

    /** Returns approximate RAM bytes used */
    public long ramBytesUsed() {
      return ((termOffsets!=null)? termOffsets.ramBytesUsed() : 0) +
             ((termsDictOffsets!=null)? termsDictOffsets.ramBytesUsed() : 0);
    }
  }

  @Override
  public FieldIndexEnum getFieldEnum(FieldInfo fieldInfo) {
    return new IndexEnum(fields.get(fieldInfo));
  }

  @Override
  public void close() throws IOException {}
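  /** Seeks {@code input} to the start of the field directory: its file pointer
   *  is stored as a fixed-width long in the last 8 bytes of the file.  The
   *  {@code dirOffset} parameter is overwritten by the value read from disk. */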
  private void seekDir(IndexInput input, long dirOffset) throws IOException {
    input.seek(input.length() - 8);
    dirOffset = input.readLong();
    input.seek(dirOffset);
  }

  @Override
  public long ramBytesUsed() {
    long sizeInBytes = ((termBytes!=null) ? termBytes.ramBytesUsed() : 0) +
                       ((termBytesReader!=null)? termBytesReader.ramBytesUsed() : 0);
    for(FieldIndexData entry : fields.values()) {
      sizeInBytes += entry.ramBytesUsed();
    }
    return sizeInBytes;
  }
}