/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.blockterms;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
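// On-disk layout of the terms index file, reconstructed here from the read
// logic below for orientation (FixedGapTermsIndexWriter is the authoritative
// description of the format):
//
//   IndexHeader
//   indexInterval     (vint)
//   packedIntsVersion (vint)
//   blocksize         (vint)
//   per field: raw index term bytes, then two monotonic block-packed arrays
//              (file-pointer deltas into the main terms dict, and offsets
//              into the term bytes)
//   directory: numFields (vint), then per field: field number, numIndexTerms,
//              termsStart, indexStart, packedIndexStart, packedOffsetsStart
//   dirOffset (long; fixed width so seekDir can find it just before the footer)
//   Footer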
/**
 * TermsIndexReader for simple every Nth terms indexes.
 *
 * @see FixedGapTermsIndexWriter
 * @lucene.experimental
 */
public class FixedGapTermsIndexReader extends TermsIndexReaderBase {

  // NOTE: long is overkill here, but we use this in a
  // number of places to multiply out the actual ord, and we
  // will overflow int during those multiplies.  So to avoid
  // having to upgrade each multiply to long in multiple
  // places (error prone), we use long here:
  private final long indexInterval;

  private final int packedIntsVersion;
  private final int blocksize;

  private final static int PAGED_BYTES_BITS = 15;

  // all fields share this single logical byte[]
  private final PagedBytes.Reader termBytesReader;

  final HashMap<String,FieldIndexData> fields = new HashMap<>();

  public FixedGapTermsIndexReader(SegmentReadState state) throws IOException {
    final PagedBytes termBytes = new PagedBytes(PAGED_BYTES_BITS);

    String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix,
                                                     FixedGapTermsIndexWriter.TERMS_INDEX_EXTENSION);
    final IndexInput in = state.directory.openInput(fileName, state.context);

    boolean success = false;

    try {
      CodecUtil.checkIndexHeader(in, FixedGapTermsIndexWriter.CODEC_NAME,
                                 FixedGapTermsIndexWriter.VERSION_CURRENT, FixedGapTermsIndexWriter.VERSION_CURRENT,
                                 state.segmentInfo.getId(), state.segmentSuffix);

      CodecUtil.checksumEntireFile(in);

      indexInterval = in.readVInt();
      if (indexInterval < 1) {
        throw new CorruptIndexException("invalid indexInterval: " + indexInterval, in);
      }
      packedIntsVersion = in.readVInt();
      blocksize = in.readVInt();

      seekDir(in);

      // Read directory
      final int numFields = in.readVInt();
      if (numFields < 0) {
        throw new CorruptIndexException("invalid numFields: " + numFields, in);
      }
      //System.out.println("FGR: init seg=" + segment + " div=" + indexDivisor + " nF=" + numFields);

      for(int i=0;i<numFields;i++) {
        final int field = in.readVInt();
        final long numIndexTerms = in.readVInt(); // TODO: change this to a vLong if we fix writer to support > 2B index terms
        if (numIndexTerms < 0) {
          throw new CorruptIndexException("invalid numIndexTerms: " + numIndexTerms, in);
        }
        final long termsStart = in.readVLong();
        final long indexStart = in.readVLong();
        final long packedIndexStart = in.readVLong();
        final long packedOffsetsStart = in.readVLong();
        if (packedIndexStart < indexStart) {
          throw new CorruptIndexException("invalid packedIndexStart: " + packedIndexStart + " indexStart: " + indexStart + " numIndexTerms: " + numIndexTerms, in);
        }
        final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
        FieldIndexData previous = fields.put(fieldInfo.name,
                                             new FieldIndexData(in, termBytes, indexStart, termsStart, packedIndexStart, packedOffsetsStart, numIndexTerms));
        if (previous != null) {
          throw new CorruptIndexException("duplicate field: " + fieldInfo.name, in);
        }
      }
      success = true;
    } finally {
      if (success) {
        IOUtils.close(in);
      } else {
        IOUtils.closeWhileHandlingException(in);
      }
      termBytesReader = termBytes.freeze(true);
    }
  }
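  // Illustrative note (the values here are assumptions, not read from the
  // file): with indexInterval=32, index slot n holds every 32nd term of the
  // field, so an enum positioned on slot n is at ord n*32.  seek(target)
  // binary-searches the index terms, lands on the greatest indexed term <=
  // target (clamped to slot 0 when target sorts before all indexed terms),
  // and returns that term's file pointer in the main terms dict; the caller
  // (e.g. a block terms dictionary) then scans forward from there, at most
  // indexInterval-1 terms, to find the exact target.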
  private class IndexEnum extends FieldIndexEnum {
    private final FieldIndexData fieldIndex;
    private final BytesRef term = new BytesRef();
    private long ord;

    public IndexEnum(FieldIndexData fieldIndex) {
      this.fieldIndex = fieldIndex;
    }

    @Override
    public BytesRef term() {
      return term;
    }

    @Override
    public long seek(BytesRef target) {
      long lo = 0;          // binary search
      long hi = fieldIndex.numIndexTerms - 1;

      while (hi >= lo) {
        long mid = (lo + hi) >>> 1;

        final long offset = fieldIndex.termOffsets.get(mid);
        final int length = (int) (fieldIndex.termOffsets.get(1+mid) - offset);
        termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);

        int delta = target.compareTo(term);
        if (delta < 0) {
          hi = mid - 1;
        } else if (delta > 0) {
          lo = mid + 1;
        } else {
          assert mid >= 0;
          ord = mid*indexInterval;
          return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(mid);
        }
      }

      if (hi < 0) {
        assert hi == -1;
        hi = 0;
      }

      final long offset = fieldIndex.termOffsets.get(hi);
      final int length = (int) (fieldIndex.termOffsets.get(1+hi) - offset);
      termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);

      ord = hi*indexInterval;
      return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(hi);
    }

    @Override
    public long next() {
      final long idx = 1 + (ord / indexInterval);
      if (idx >= fieldIndex.numIndexTerms) {
        return -1;
      }
      ord += indexInterval;

      final long offset = fieldIndex.termOffsets.get(idx);
      final int length = (int) (fieldIndex.termOffsets.get(1+idx) - offset);
      termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
      return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
    }

    @Override
    public long ord() {
      return ord;
    }

    @Override
    public long seek(long ord) {
      long idx = ord / indexInterval;
      // caller must ensure ord is in bounds
      assert idx < fieldIndex.numIndexTerms;
      final long offset = fieldIndex.termOffsets.get(idx);
      final int length = (int) (fieldIndex.termOffsets.get(1+idx) - offset);
      termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
      this.ord = idx * indexInterval;
      return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
    }
  }

  @Override
  public boolean supportsOrd() {
    return true;
  }
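  // Per-field index data, slurped into RAM below.  termOffsets holds
  // numIndexTerms+1 monotonically increasing offsets into the shared term
  // byte[], so the bytes of index term i span [termOffsets(i), termOffsets(i+1));
  // the parallel termsDictOffsets holds, per index term, the file-pointer
  // delta into the main terms dict (added to termsStart when seeking).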
  private final class FieldIndexData implements Accountable {
    // where this field's terms begin in the packed byte[] data
    final long termBytesStart;

    // offset into index termBytes
    final MonotonicBlockPackedReader termOffsets;

    // index pointers into main terms dict
    final MonotonicBlockPackedReader termsDictOffsets;

    final long numIndexTerms;
    final long termsStart;

    public FieldIndexData(IndexInput in, PagedBytes termBytes, long indexStart, long termsStart,
                          long packedIndexStart, long packedOffsetsStart, long numIndexTerms) throws IOException {

      this.termsStart = termsStart;
      termBytesStart = termBytes.getPointer();

      IndexInput clone = in.clone();
      clone.seek(indexStart);

      this.numIndexTerms = numIndexTerms;
      assert this.numIndexTerms > 0: "numIndexTerms=" + numIndexTerms;

      // slurp in the images from disk:
      try {
        final long numTermBytes = packedIndexStart - indexStart;
        termBytes.copy(clone, numTermBytes);

        // records offsets into main terms dict file
        termsDictOffsets = MonotonicBlockPackedReader.of(clone, packedIntsVersion, blocksize, numIndexTerms, false);

        // records offsets into byte[] term data
        termOffsets = MonotonicBlockPackedReader.of(clone, packedIntsVersion, blocksize, 1+numIndexTerms, false);
      } finally {
        clone.close();
      }
    }

    @Override
    public long ramBytesUsed() {
      return ((termOffsets!=null)? termOffsets.ramBytesUsed() : 0) +
             ((termsDictOffsets!=null)? termsDictOffsets.ramBytesUsed() : 0);
    }

    @Override
    public Collection<Accountable> getChildResources() {
      List<Accountable> resources = new ArrayList<>();
      if (termOffsets != null) {
        resources.add(Accountables.namedAccountable("term lengths", termOffsets));
      }
      if (termsDictOffsets != null) {
        resources.add(Accountables.namedAccountable("offsets", termsDictOffsets));
      }
      return Collections.unmodifiableList(resources);
    }

    @Override
    public String toString() {
      return "FixedGapTermIndex(indexterms=" + numIndexTerms + ")";
    }
  }

  @Override
  public FieldIndexEnum getFieldEnum(FieldInfo fieldInfo) {
    return new IndexEnum(fields.get(fieldInfo.name));
  }

  @Override
  public void close() throws IOException {}

  private void seekDir(IndexInput input) throws IOException {
    input.seek(input.length() - CodecUtil.footerLength() - 8);
    long dirOffset = input.readLong();
    input.seek(dirOffset);
  }

  @Override
  public long ramBytesUsed() {
    long sizeInBytes = ((termBytesReader!=null)? termBytesReader.ramBytesUsed() : 0);
    for(FieldIndexData entry : fields.values()) {
      sizeInBytes += entry.ramBytesUsed();
    }
    return sizeInBytes;
  }

  @Override
  public Collection<Accountable> getChildResources() {
    return Accountables.namedAccountables("field", fields);
  }

  @Override
  public String toString() {
    return getClass().getSimpleName() + "(fields=" + fields.size() + ",interval=" + indexInterval + ")";
  }
}