/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.uninverting;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.WeakHashMap;

import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.GrowableWriter;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues;

/**
 * Expert: The default cache implementation, storing all values in memory.
 * A WeakHashMap is used for storage.
 *
 * @lucene.internal
 */
public class FieldCacheImpl implements FieldCache {

  private Map<Class<?>,Cache> caches;

  FieldCacheImpl() {
    init();
  }

  private synchronized void init() {
    caches = new HashMap<>(6);
    caches.put(Long.TYPE, new LongCache(this));
    caches.put(BinaryDocValues.class, new BinaryDocValuesCache(this));
    caches.put(SortedDocValues.class, new SortedDocValuesCache(this));
    caches.put(DocTermOrds.class, new DocTermOrdsCache(this));
    caches.put(DocsWithFieldCache.class, new DocsWithFieldCache(this));
  }

  @Override
  public synchronized void purgeAllCaches() {
    init();
  }

  @Override
  public synchronized void purgeByCacheKey(IndexReader.CacheKey coreCacheKey) {
    for (Cache c : caches.values()) {
      c.purgeByCacheKey(coreCacheKey);
    }
  }

  @Override
  public synchronized CacheEntry[] getCacheEntries() {
    List<CacheEntry> result = new ArrayList<>(17);
    for (final Map.Entry<Class<?>,Cache> cacheEntry : caches.entrySet()) {
      final Cache cache = cacheEntry.getValue();
      final Class<?> cacheType = cacheEntry.getKey();
      synchronized (cache.readerCache) {
        for (final Map.Entry<IndexReader.CacheKey,Map<CacheKey,Accountable>> readerCacheEntry : cache.readerCache.entrySet()) {
          final IndexReader.CacheKey readerKey = readerCacheEntry.getKey();
          if (readerKey == null) continue;
          final Map<CacheKey,Accountable> innerCache = readerCacheEntry.getValue();
          for (final Map.Entry<CacheKey,Accountable> mapEntry : innerCache.entrySet()) {
            CacheKey entry = mapEntry.getKey();
            result.add(new CacheEntry(readerKey, entry.field, cacheType, entry.custom, mapEntry.getValue()));
          }
        }
      }
    }
    return result.toArray(new CacheEntry[result.size()]);
  }

  // per-segment fieldcaches don't purge until the shared core closes.
  final IndexReader.ClosedListener purgeCore = FieldCacheImpl.this::purgeByCacheKey;

  private void initReader(LeafReader reader) {
    IndexReader.CacheHelper cacheHelper = reader.getCoreCacheHelper();
    if (cacheHelper == null) {
      throw new IllegalStateException("Cannot cache on " + reader);
    }
    cacheHelper.addClosedListener(purgeCore);
  }

  /** Expert: Internal cache. */
  abstract static class Cache {

    Cache(FieldCacheImpl wrapper) {
      this.wrapper = wrapper;
    }

    final FieldCacheImpl wrapper;

    final Map<IndexReader.CacheKey,Map<CacheKey,Accountable>> readerCache = new WeakHashMap<>();

    protected abstract Accountable createValue(LeafReader reader, CacheKey key) throws IOException;

    /** Remove this reader from the cache, if present. */
    public void purgeByCacheKey(IndexReader.CacheKey coreCacheKey) {
      synchronized (readerCache) {
        readerCache.remove(coreCacheKey);
      }
    }

    /** Sets the key to the value for the provided reader;
     *  if the key is already set then this doesn't change it.
     */
    public void put(LeafReader reader, CacheKey key, Accountable value) {
      IndexReader.CacheHelper cacheHelper = reader.getCoreCacheHelper();
      if (cacheHelper == null) {
        throw new IllegalStateException("Cannot cache on " + reader);
      }
      final IndexReader.CacheKey readerKey = cacheHelper.getKey();
      synchronized (readerCache) {
        Map<CacheKey,Accountable> innerCache = readerCache.get(readerKey);
        if (innerCache == null) {
          // First time this reader is using FieldCache
          innerCache = new HashMap<>();
          readerCache.put(readerKey, innerCache);
          wrapper.initReader(reader);
        }
        if (innerCache.get(key) == null) {
          innerCache.put(key, value);
        } else {
          // Another thread beat us to it; leave the current value
        }
      }
    }

    public Object get(LeafReader reader, CacheKey key) throws IOException {
      Map<CacheKey,Accountable> innerCache;
      Accountable value;
      IndexReader.CacheHelper cacheHelper = reader.getCoreCacheHelper();
      if (cacheHelper == null) {
        throw new IllegalStateException("Cannot cache on " + reader);
      }
      final IndexReader.CacheKey readerKey = cacheHelper.getKey();
      synchronized (readerCache) {
        innerCache = readerCache.get(readerKey);
        if (innerCache == null) {
          // First time this reader is using FieldCache
          innerCache = new HashMap<>();
          readerCache.put(readerKey, innerCache);
          wrapper.initReader(reader);
          value = null;
        } else {
          value = innerCache.get(key);
        }
        if (value == null) {
          value = new CreationPlaceholder();
          innerCache.put(key, value);
        }
      }
      if (value instanceof CreationPlaceholder) {
        synchronized (value) {
          CreationPlaceholder progress = (CreationPlaceholder) value;
          if (progress.value == null) {
            progress.value = createValue(reader, key);
            synchronized (readerCache) {
              innerCache.put(key, progress.value);
            }
          }
          return progress.value;
        }
      }
      return value;
    }
  }

  /** Expert: Every composite-key in the internal cache is of this type. */
  static class CacheKey {
    final String field;  // which Field
    final Object custom; // which custom comparator or parser

    /** Creates one of these objects for a custom comparator/parser. */
    CacheKey(String field, Object custom) {
      this.field = field;
      this.custom = custom;
    }

    /** Two of these are equal iff they reference the same field and type. */
    @Override
    public boolean equals(Object o) {
      if (o instanceof CacheKey) {
        CacheKey other = (CacheKey) o;
        if (other.field.equals(field)) {
          if (other.custom == null) {
            if (custom == null) return true;
          } else if (other.custom.equals(custom)) {
            return true;
          }
        }
      }
      return false;
    }

    /** Composes a hashcode based on the field and type. */
    @Override
    public int hashCode() {
      return field.hashCode() ^ (custom == null ? 0 : custom.hashCode());
    }
  }

  private static abstract class Uninvert {

    public Bits docsWithField;
    final boolean points; // pass true to pull from points, otherwise postings.
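
    // Template method: uninvert() scans either the points or the postings of a
    // field and calls back into visitTerm()/visitDoc() for each value/document
    // pair, tracking along the way which documents have a value at all.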

    Uninvert(boolean points) {
      this.points = points;
    }

    final void uninvert(LeafReader reader, String field) throws IOException {
      if (points) {
        uninvertPoints(reader, field);
      } else {
        uninvertPostings(reader, field);
      }
    }

    final void uninvertPoints(LeafReader reader, String field) throws IOException {
      final int maxDoc = reader.maxDoc();
      PointValues values = reader.getPointValues(field);
      assert values != null;
      assert values.size() > 0;

      final boolean setDocsWithField;
      final int docCount = values.getDocCount();
      assert docCount <= maxDoc;
      if (docCount == maxDoc) {
        // Fast case: all docs have this field:
        this.docsWithField = new Bits.MatchAllBits(maxDoc);
        setDocsWithField = false;
      } else {
        setDocsWithField = true;
      }

      BytesRef scratch = new BytesRef();
      values.intersect(new IntersectVisitor() {
        @Override
        public void visit(int docID) throws IOException {
          throw new AssertionError();
        }

        @Override
        public void visit(int docID, byte[] packedValue) throws IOException {
          scratch.bytes = packedValue;
          scratch.length = packedValue.length;
          visitTerm(scratch);
          visitDoc(docID);
          if (setDocsWithField) {
            if (docsWithField == null) {
              // Lazy init
              docsWithField = new FixedBitSet(maxDoc);
            }
            ((FixedBitSet) docsWithField).set(docID);
          }
        }

        @Override
        public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
          return Relation.CELL_CROSSES_QUERY; // inspect all byte-docid pairs
        }
      });
    }

    final void uninvertPostings(LeafReader reader, String field) throws IOException {
      final int maxDoc = reader.maxDoc();
      Terms terms = reader.terms(field);
      if (terms != null) {
        final boolean setDocsWithField;
        final int termsDocCount = terms.getDocCount();
        assert termsDocCount <= maxDoc;
        if (termsDocCount == maxDoc) {
          // Fast case: all docs have this field:
          this.docsWithField = new Bits.MatchAllBits(maxDoc);
          setDocsWithField = false;
        } else {
          setDocsWithField = true;
        }

        final TermsEnum termsEnum = termsEnum(terms);
        PostingsEnum docs = null;
        FixedBitSet docsWithField = null;
        while (true) {
          final BytesRef term = termsEnum.next();
          if (term == null) {
            break;
          }
          visitTerm(term);
          docs = termsEnum.postings(docs, PostingsEnum.NONE);
          while (true) {
            final int docID = docs.nextDoc();
            if (docID == DocIdSetIterator.NO_MORE_DOCS) {
              break;
            }
            visitDoc(docID);
            if (setDocsWithField) {
              if (docsWithField == null) {
                // Lazy init
                this.docsWithField = docsWithField = new FixedBitSet(maxDoc);
              }
              docsWithField.set(docID);
            }
          }
        }
      }
    }

    protected abstract TermsEnum termsEnum(Terms terms) throws IOException;
    protected abstract void visitTerm(BytesRef term);
    protected abstract void visitDoc(int docID);
  }

  // null Bits means no docs matched
  void setDocsWithField(LeafReader reader, String field, Bits docsWithField, Parser parser) {
    final int maxDoc = reader.maxDoc();
    final Bits bits;
    if (docsWithField == null) {
      bits = new Bits.MatchNoBits(maxDoc);
    } else if (docsWithField instanceof FixedBitSet) {
      final int numSet = ((FixedBitSet) docsWithField).cardinality();
      if (numSet >= maxDoc) {
        // The cardinality of the BitSet is maxDoc if all documents have a value.
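        // (cardinality() can never exceed the set's length, maxDoc here, so
        // numSet >= maxDoc in fact implies numSet == maxDoc.)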
        assert numSet == maxDoc;
        bits = new Bits.MatchAllBits(maxDoc);
      } else {
        bits = docsWithField;
      }
    } else {
      bits = docsWithField;
    }
    caches.get(DocsWithFieldCache.class).put(reader, new CacheKey(field, parser), new BitsEntry(bits));
  }

  private static class HoldsOneThing<T> {
    private T it;

    public void set(T it) {
      this.it = it;
    }

    public T get() {
      return it;
    }
  }

  private static class GrowableWriterAndMinValue {
    GrowableWriterAndMinValue(GrowableWriter array, long minValue) {
      this.writer = array;
      this.minValue = minValue;
    }
    public GrowableWriter writer;
    public long minValue;
  }

  @Override
  public Bits getDocsWithField(LeafReader reader, String field, Parser parser) throws IOException {
    final FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
    if (fieldInfo == null) {
      // field does not exist or has no value
      return new Bits.MatchNoBits(reader.maxDoc());
    }

    if (fieldInfo.getDocValuesType() != DocValuesType.NONE) {
      // doc values case
    } else if (parser instanceof PointParser) {
      // points case
    } else {
      // postings case
      if (fieldInfo.getIndexOptions() == IndexOptions.NONE) {
        return new Bits.MatchNoBits(reader.maxDoc());
      }
    }
    BitsEntry bitsEntry = (BitsEntry) caches.get(DocsWithFieldCache.class).get(reader, new CacheKey(field, parser));
    return bitsEntry.bits;
  }

  static class BitsEntry implements Accountable {
    final Bits bits;

    BitsEntry(Bits bits) {
      this.bits = bits;
    }

    @Override
    public long ramBytesUsed() {
      long base = RamUsageEstimator.NUM_BYTES_OBJECT_REF;
      if (bits instanceof Bits.MatchAllBits || bits instanceof Bits.MatchNoBits) {
        return base;
      } else {
        return base + (bits.length() >>> 3);
      }
    }
  }

  static final class DocsWithFieldCache extends Cache {
    DocsWithFieldCache(FieldCacheImpl wrapper) {
      super(wrapper);
    }

    @Override
    protected BitsEntry createValue(LeafReader reader, CacheKey key) throws IOException {
      final String field = key.field;
      final Parser parser = (Parser) key.custom;
      FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
      if (fieldInfo.getDocValuesType() != DocValuesType.NONE) {
        return createValueDocValues(reader, field);
      } else if (parser instanceof PointParser) {
        return createValuePoints(reader, field);
      } else {
        return createValuePostings(reader, field);
      }
    }

    private BitsEntry createValueDocValues(LeafReader reader, String field) throws IOException {
      FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);

      DocValuesType dvType = fieldInfo.getDocValuesType();
      DocIdSetIterator iterator;
      switch (dvType) {
        case NUMERIC:
          iterator = reader.getNumericDocValues(field);
          break;
        case BINARY:
          iterator = reader.getBinaryDocValues(field);
          break;
        case SORTED:
          iterator = reader.getSortedDocValues(field);
          break;
        case SORTED_NUMERIC:
          iterator = reader.getSortedNumericDocValues(field);
          break;
        case SORTED_SET:
          iterator = reader.getSortedSetDocValues(field);
          break;
        default:
          throw new AssertionError();
      }

      FixedBitSet bits = new FixedBitSet(reader.maxDoc());
      while (true) {
        int docID = iterator.nextDoc();
        if (docID == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        bits.set(docID);
      }
      return new BitsEntry(bits);
    }

    private BitsEntry createValuePoints(LeafReader reader, String field) throws IOException {
      final int maxDoc = reader.maxDoc();
      PointValues values = reader.getPointValues(field);
      assert values != null;
      assert values.size() > 0;

      final int docCount = values.getDocCount();
      assert docCount <= maxDoc;
      if (docCount == maxDoc) {
        // Fast case: all docs have this field:
        return new BitsEntry(new Bits.MatchAllBits(maxDoc));
      }

      // otherwise a no-op uninvert!
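      // The visitor callbacks below are intentionally empty: we run the
      // uninversion only for its side effect of filling Uninvert.docsWithField.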
      Uninvert u = new Uninvert(true) {
        @Override
        protected TermsEnum termsEnum(Terms terms) throws IOException {
          throw new AssertionError();
        }

        @Override
        protected void visitTerm(BytesRef term) {}

        @Override
        protected void visitDoc(int docID) {}
      };
      u.uninvert(reader, field);
      return new BitsEntry(u.docsWithField);
    }

    // TODO: it is dumb that uninverting code is duplicated here in this method!!
    private BitsEntry createValuePostings(LeafReader reader, String field) throws IOException {
      final int maxDoc = reader.maxDoc();

      // Visit all docs that have terms for this field
      FixedBitSet res = null;
      Terms terms = reader.terms(field);
      if (terms != null) {
        final int termsDocCount = terms.getDocCount();
        assert termsDocCount <= maxDoc;
        if (termsDocCount == maxDoc) {
          // Fast case: all docs have this field:
          return new BitsEntry(new Bits.MatchAllBits(maxDoc));
        }
        final TermsEnum termsEnum = terms.iterator();
        PostingsEnum docs = null;
        while (true) {
          final BytesRef term = termsEnum.next();
          if (term == null) {
            break;
          }
          if (res == null) {
            // lazy init
            res = new FixedBitSet(maxDoc);
          }
          docs = termsEnum.postings(docs, PostingsEnum.NONE);
          // TODO: use bulk API
          while (true) {
            final int docID = docs.nextDoc();
            if (docID == DocIdSetIterator.NO_MORE_DOCS) {
              break;
            }
            res.set(docID);
          }
        }
      }
      if (res == null) {
        return new BitsEntry(new Bits.MatchNoBits(maxDoc));
      }
      final int numSet = res.cardinality();
      if (numSet >= maxDoc) {
        // The cardinality of the BitSet is maxDoc if all documents have a value.
        assert numSet == maxDoc;
        return new BitsEntry(new Bits.MatchAllBits(maxDoc));
      }
      return new BitsEntry(res);
    }
  }

  @Override
  public NumericDocValues getNumerics(LeafReader reader, String field, Parser parser) throws IOException {
    if (parser == null) {
      throw new NullPointerException();
    }
    final NumericDocValues valuesIn = reader.getNumericDocValues(field);
    if (valuesIn != null) {
      return valuesIn;
    } else {
      final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
      if (info == null) {
        return DocValues.emptyNumeric();
      } else if (info.getDocValuesType() != DocValuesType.NONE) {
        throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
      }

      if (parser instanceof PointParser) {
        // points case
        // no points in this segment
        if (info.getPointDimensionCount() == 0) {
          return DocValues.emptyNumeric();
        }
        if (info.getPointDimensionCount() != 1) {
          throw new IllegalStateException("Type mismatch: " + field + " was indexed with dimensions=" + info.getPointDimensionCount());
        }
        PointValues values = reader.getPointValues(field);
        // no actual points for this field (e.g. all points deleted)
        if (values == null || values.size() == 0) {
          return DocValues.emptyNumeric();
        }
        // not single-valued
        if (values.size() != values.getDocCount()) {
          throw new IllegalStateException("Type mismatch: " + field + " was indexed with multiple values, numValues=" + values.size() + ",numDocs=" + values.getDocCount());
        }
      } else {
        // postings case
        // not indexed
        if (info.getIndexOptions() == IndexOptions.NONE) {
          return DocValues.emptyNumeric();
        }
      }

      return ((LongsFromArray) caches.get(Long.TYPE).get(reader, new CacheKey(field, parser))).iterator();
    }
  }

  public static class LongsFromArray implements Accountable {
    private final PackedInts.Reader values;
    private final long minValue;
    private final Bits docsWithField;
    private final String field;

    public LongsFromArray(String field, PackedInts.Reader values, long minValue, Bits docsWithField) { // TODO: accept null docsWithField?
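      // values holds per-document deltas above minValue; iterator().longValue()
      // adds minValue back when reading.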
      this.field = field;
      this.values = values;
      this.minValue = minValue;
      this.docsWithField = docsWithField;
    }

    @Override
    public long ramBytesUsed() {
      return values.ramBytesUsed() + RamUsageEstimator.NUM_BYTES_OBJECT_REF + Long.BYTES;
    }

    public NumericDocValues iterator() {
      return new NumericDocValues() {
        int docID = -1;

        @Override
        public int docID() {
          return docID;
        }

        @Override
        public int nextDoc() {
          while (true) {
            docID++;
            if (docID >= values.size()) {
              docID = NO_MORE_DOCS;
              return docID;
            }
            if (docsWithField.get(docID)) {
              return docID;
            }
          }
        }

        @Override
        public int advance(int target) {
          if (target < values.size()) {
            docID = target;
            if (docsWithField.get(docID)) {
              return docID;
            } else {
              return nextDoc();
            }
          } else {
            docID = NO_MORE_DOCS;
            return docID;
          }
        }

        @Override
        public boolean advanceExact(int target) throws IOException {
          docID = target;
          return docsWithField.get(docID);
        }

        @Override
        public long cost() {
          return values.size();
        }

        @Override
        public long longValue() {
          return minValue + values.get(docID);
        }
      };
    }
  }

  static final class LongCache extends Cache {
    LongCache(FieldCacheImpl wrapper) {
      super(wrapper);
    }

    @Override
    protected Accountable createValue(final LeafReader reader, CacheKey key) throws IOException {
      final Parser parser = (Parser) key.custom;
      final HoldsOneThing<GrowableWriterAndMinValue> valuesRef = new HoldsOneThing<>();
      Uninvert u = new Uninvert(parser instanceof PointParser) {
        private long minValue;
        private long currentValue;
        private GrowableWriter values;

        @Override
        public void visitTerm(BytesRef term) {
          currentValue = parser.parseValue(term);
          if (values == null) {
            // Lazy alloc so for the numeric field case
            // (which will hit a NumberFormatException
            // when we first try the DEFAULT_INT_PARSER),
            // we don't double-alloc:
            int startBitsPerValue;
            // Make sure that missing values (0) can be stored without resizing
            if (currentValue < 0) {
              minValue = currentValue;
              startBitsPerValue = minValue == Long.MIN_VALUE ? 64 : PackedInts.bitsRequired(-minValue);
            } else {
              minValue = 0;
              startBitsPerValue = PackedInts.bitsRequired(currentValue);
            }
            values = new GrowableWriter(startBitsPerValue, reader.maxDoc(), PackedInts.FAST);
            if (minValue != 0) {
              values.fill(0, values.size(), -minValue); // default value must be 0
            }
            valuesRef.set(new GrowableWriterAndMinValue(values, minValue));
          }
        }

        @Override
        public void visitDoc(int docID) {
          values.set(docID, currentValue - minValue);
        }

        @Override
        protected TermsEnum termsEnum(Terms terms) throws IOException {
          return parser.termsEnum(terms);
        }
      };

      u.uninvert(reader, key.field);
      wrapper.setDocsWithField(reader, key.field, u.docsWithField, parser);
      GrowableWriterAndMinValue values = valuesRef.get();
      Bits docsWithField = u.docsWithField == null ?
          new Bits.MatchNoBits(reader.maxDoc()) : u.docsWithField;
      if (values == null) {
        return new LongsFromArray(key.field, new PackedInts.NullReader(reader.maxDoc()), 0L, docsWithField);
      }
      return new LongsFromArray(key.field, values.writer.getMutable(), values.minValue, docsWithField);
    }
  }

  public static class SortedDocValuesImpl implements Accountable {
    private final PagedBytes.Reader bytes;
    private final PackedLongValues termOrdToBytesOffset;
    private final PackedInts.Reader docToTermOrd;
    private final int numOrd;

    public SortedDocValuesImpl(PagedBytes.Reader bytes, PackedLongValues termOrdToBytesOffset, PackedInts.Reader docToTermOrd, int numOrd) {
      this.bytes = bytes;
      this.docToTermOrd = docToTermOrd;
      this.termOrdToBytesOffset = termOrdToBytesOffset;
      this.numOrd = numOrd;
    }

    public SortedDocValues iterator() {
      return new Iter();
    }

    public class Iter extends SortedDocValues {
      private int docID = -1;
      private final BytesRef term = new BytesRef();

      /** @lucene.internal Specific to this implementation and subject to change. For internal optimization only. */
      public int getOrd(int docID) {
        // Subtract 1, matching the 1+ord we did when
        // storing, so that missing values, which are 0 in the
        // packed ints, are returned as -1 ord:
        return (int) docToTermOrd.get(docID) - 1;
      }

      @Override
      public int docID() {
        return docID;
      }

      @Override
      public int nextDoc() {
        while (true) {
          docID++;
          if (docID >= docToTermOrd.size()) {
            docID = NO_MORE_DOCS;
            return docID;
          }
          if (docToTermOrd.get(docID) != 0) {
            return docID;
          }
        }
      }

      @Override
      public int advance(int target) {
        if (target < docToTermOrd.size()) {
          docID = target;
          if (docToTermOrd.get(docID) != 0) {
            return docID;
          } else {
            return nextDoc();
          }
        } else {
          docID = NO_MORE_DOCS;
          return docID;
        }
      }

      @Override
      public boolean advanceExact(int target) throws IOException {
        docID = target;
        return docToTermOrd.get(docID) != 0;
      }

      @Override
      public long cost() {
        return 0;
      }

      @Override
      public int ordValue() {
        // Subtract 1, matching the 1+ord we did when
        // storing, so that missing values, which are 0 in the
        // packed ints, are returned as -1 ord:
        return (int) docToTermOrd.get(docID) - 1;
      }

      @Override
      public int getValueCount() {
        return numOrd;
      }

      @Override
      public BytesRef lookupOrd(int ord) {
        if (ord < 0) {
          throw new IllegalArgumentException("ord must be >=0 (got ord=" + ord + ")");
        }
        bytes.fill(term, termOrdToBytesOffset.get(ord));
        return term;
      }
    }

    @Override
    public long ramBytesUsed() {
      return bytes.ramBytesUsed() +
             termOrdToBytesOffset.ramBytesUsed() +
             docToTermOrd.ramBytesUsed() +
             3 * RamUsageEstimator.NUM_BYTES_OBJECT_REF +
             Integer.BYTES;
    }

    @Override
    public Collection<Accountable> getChildResources() {
      List<Accountable> resources = new ArrayList<>(3);
      resources.add(Accountables.namedAccountable("term bytes", bytes));
      resources.add(Accountables.namedAccountable("ord -> term", termOrdToBytesOffset));
      resources.add(Accountables.namedAccountable("doc -> ord", docToTermOrd));
      return Collections.unmodifiableList(resources);
    }
  }

  public SortedDocValues getTermsIndex(LeafReader reader, String field) throws IOException {
    return getTermsIndex(reader, field, PackedInts.FAST);
  }

  public SortedDocValues getTermsIndex(LeafReader reader, String field, float acceptableOverheadRatio) throws IOException {
    SortedDocValues valuesIn = reader.getSortedDocValues(field);
    if (valuesIn != null) {
      // Not cached here by FieldCacheImpl (cached instead
      // per-thread by SegmentReader):
      return valuesIn;
    } else {
      final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
      if (info == null) {
        return DocValues.emptySorted();
      } else
      if (info.getDocValuesType() != DocValuesType.NONE) {
        // we don't try to build a sorted instance from numeric/binary doc
        // values because dedup can be very costly
        throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
      } else if (info.getIndexOptions() == IndexOptions.NONE) {
        return DocValues.emptySorted();
      }
      SortedDocValuesImpl impl = (SortedDocValuesImpl) caches.get(SortedDocValues.class).get(reader, new CacheKey(field, acceptableOverheadRatio));
      return impl.iterator();
    }
  }

  static class SortedDocValuesCache extends Cache {
    SortedDocValuesCache(FieldCacheImpl wrapper) {
      super(wrapper);
    }

    @Override
    protected Accountable createValue(LeafReader reader, CacheKey key) throws IOException {
      final int maxDoc = reader.maxDoc();

      Terms terms = reader.terms(key.field);

      final float acceptableOverheadRatio = ((Float) key.custom).floatValue();

      final PagedBytes bytes = new PagedBytes(15);

      int startTermsBPV;

      // TODO: use Uninvert?
      if (terms != null) {
        // Try for coarse estimate for number of bits; this
        // should be an underestimate most of the time, which
        // is fine -- GrowableWriter will reallocate as needed
        long numUniqueTerms = terms.size();
        if (numUniqueTerms != -1L) {
          if (numUniqueTerms > maxDoc) {
            throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
          }
          startTermsBPV = PackedInts.bitsRequired(numUniqueTerms);
        } else {
          startTermsBPV = 1;
        }
      } else {
        startTermsBPV = 1;
      }

      PackedLongValues.Builder termOrdToBytesOffset = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
      final GrowableWriter docToTermOrd = new GrowableWriter(startTermsBPV, maxDoc, acceptableOverheadRatio);

      int termOrd = 0;

      // TODO: use Uninvert?
      if (terms != null) {
        final TermsEnum termsEnum = terms.iterator();
        PostingsEnum docs = null;

        while (true) {
          final BytesRef term = termsEnum.next();
          if (term == null) {
            break;
          }
          if (termOrd >= maxDoc) {
            throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
          }

          termOrdToBytesOffset.add(bytes.copyUsingLengthPrefix(term));
          docs = termsEnum.postings(docs, PostingsEnum.NONE);
          while (true) {
            final int docID = docs.nextDoc();
            if (docID == DocIdSetIterator.NO_MORE_DOCS) {
              break;
            }
            // Store 1+ ord into packed bits
            docToTermOrd.set(docID, 1 + termOrd);
          }
          termOrd++;
        }
      }

      // maybe an int-only impl?
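      // docToTermOrd stores 1+ord so that 0 can mean "no value"; the returned
      // iterator subtracts 1 again, reporting ord -1 for missing documents.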
      return new SortedDocValuesImpl(bytes.freeze(true), termOrdToBytesOffset.build(), docToTermOrd.getMutable(), termOrd);
    }
  }

  public static class BinaryDocValuesImpl implements Accountable {
    private final PagedBytes.Reader bytes;
    private final PackedInts.Reader docToOffset;
    private final Bits docsWithField;

    public BinaryDocValuesImpl(PagedBytes.Reader bytes, PackedInts.Reader docToOffset, Bits docsWithField) {
      this.bytes = bytes;
      this.docToOffset = docToOffset;
      this.docsWithField = docsWithField;
    }

    public BinaryDocValues iterator() {
      return new BinaryDocValues() {

        final BytesRef term = new BytesRef();

        int docID = -1;

        @Override
        public int docID() {
          return docID;
        }

        @Override
        public int nextDoc() {
          while (true) {
            docID++;
            if (docID >= docToOffset.size()) {
              docID = NO_MORE_DOCS;
              return docID;
            }
            if (docsWithField.get(docID)) {
              return docID;
            }
          }
        }

        @Override
        public int advance(int target) {
          if (target < docToOffset.size()) {
            docID = target;
            if (docsWithField.get(docID)) {
              return docID;
            } else {
              return nextDoc();
            }
          } else {
            docID = NO_MORE_DOCS;
            return docID;
          }
        }

        @Override
        public boolean advanceExact(int target) throws IOException {
          docID = target;
          return docsWithField.get(docID);
        }

        @Override
        public long cost() {
          return 0;
        }

        @Override
        public BytesRef binaryValue() {
          final long pointer = docToOffset.get(docID);
          if (pointer == 0) {
            term.length = 0;
          } else {
            bytes.fill(term, pointer);
          }
          return term;
        }
      };
    }

    @Override
    public long ramBytesUsed() {
      return bytes.ramBytesUsed() + docToOffset.ramBytesUsed() + 2 * RamUsageEstimator.NUM_BYTES_OBJECT_REF;
    }

    @Override
    public Collection<Accountable> getChildResources() {
      List<Accountable> resources = new ArrayList<>(2);
      resources.add(Accountables.namedAccountable("term bytes", bytes));
      resources.add(Accountables.namedAccountable("addresses", docToOffset));
      return Collections.unmodifiableList(resources);
    }
  }

  // TODO: if a DocTermsIndex was already created, we
  // should share it...
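
  // Usage sketch (hypothetical caller; assumes the shared FieldCache.DEFAULT
  // instance declared on the FieldCache interface):
  //
  //   BinaryDocValues terms = FieldCache.DEFAULT.getTerms(leafReader, "title");
  //   if (terms.advanceExact(docID)) {
  //     BytesRef value = terms.binaryValue();
  //   }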
  public BinaryDocValues getTerms(LeafReader reader, String field) throws IOException {
    return getTerms(reader, field, PackedInts.FAST);
  }

  public BinaryDocValues getTerms(LeafReader reader, String field, float acceptableOverheadRatio) throws IOException {
    BinaryDocValues valuesIn = reader.getBinaryDocValues(field);
    if (valuesIn == null) {
      valuesIn = reader.getSortedDocValues(field);
    }

    if (valuesIn != null) {
      // Not cached here by FieldCacheImpl (cached instead
      // per-thread by SegmentReader):
      return valuesIn;
    }

    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (info == null) {
      return DocValues.emptyBinary();
    } else if (info.getDocValuesType() != DocValuesType.NONE) {
      throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    } else if (info.getIndexOptions() == IndexOptions.NONE) {
      return DocValues.emptyBinary();
    }

    BinaryDocValuesImpl impl = (BinaryDocValuesImpl) caches.get(BinaryDocValues.class).get(reader, new CacheKey(field, acceptableOverheadRatio));
    return impl.iterator();
  }

  static final class BinaryDocValuesCache extends Cache {
    BinaryDocValuesCache(FieldCacheImpl wrapper) {
      super(wrapper);
    }

    @Override
    protected Accountable createValue(LeafReader reader, CacheKey key) throws IOException {
      // TODO: would be nice to first check if DocTermsIndex
      // was already cached for this field and then return
      // that instead, to avoid insanity
      final int maxDoc = reader.maxDoc();
      Terms terms = reader.terms(key.field);

      final float acceptableOverheadRatio = ((Float) key.custom).floatValue();

      final int termCountHardLimit = maxDoc;

      // Holds the actual term data, expanded.
      final PagedBytes bytes = new PagedBytes(15);

      int startBPV;

      if (terms != null) {
        // Try for coarse estimate for number of bits; this
        // should be an underestimate most of the time, which
        // is fine -- GrowableWriter will reallocate as needed
        long numUniqueTerms = terms.size();
        if (numUniqueTerms != -1L) {
          if (numUniqueTerms > termCountHardLimit) {
            numUniqueTerms = termCountHardLimit;
          }
          startBPV = PackedInts.bitsRequired(numUniqueTerms * 4);
        } else {
          startBPV = 1;
        }
      } else {
        startBPV = 1;
      }

      final GrowableWriter docToOffset = new GrowableWriter(startBPV, maxDoc, acceptableOverheadRatio);

      // pointer==0 means not set
      bytes.copyUsingLengthPrefix(new BytesRef());

      if (terms != null) {
        int termCount = 0;
        final TermsEnum termsEnum = terms.iterator();
        PostingsEnum docs = null;
        while (true) {
          if (termCount++ == termCountHardLimit) {
            // app is misusing the API (there is more than
            // one term per doc); in this case we make best
            // effort to load what we can (see LUCENE-2142)
            break;
          }

          final BytesRef term = termsEnum.next();
          if (term == null) {
            break;
          }
          final long pointer = bytes.copyUsingLengthPrefix(term);
          docs = termsEnum.postings(docs, PostingsEnum.NONE);
          while (true) {
            final int docID = docs.nextDoc();
            if (docID == DocIdSetIterator.NO_MORE_DOCS) {
              break;
            }
            docToOffset.set(docID, pointer);
          }
        }
      }

      final PackedInts.Reader offsetReader = docToOffset.getMutable();
      Bits docsWithField = new Bits() {
        @Override
        public boolean get(int index) {
          return offsetReader.get(index) != 0;
        }

        @Override
        public int length() {
          return maxDoc;
        }
      };

      wrapper.setDocsWithField(reader, key.field, docsWithField, null);
      // maybe an int-only impl?
      return new BinaryDocValuesImpl(bytes.freeze(true), offsetReader, docsWithField);
    }
  }

  // TODO: if a DocTermsIndex was already created, we
  // should share it...
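
  // Usage sketch (hypothetical caller): a multi-valued string field is
  // uninverted through DocTermOrds unless the postings statistics prove the
  // field single-valued:
  //
  //   SortedSetDocValues ords = FieldCache.DEFAULT.getDocTermOrds(leafReader, "tags", null);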
  public SortedSetDocValues getDocTermOrds(LeafReader reader, String field, BytesRef prefix) throws IOException {
    // not a general purpose filtering mechanism...
    assert prefix == null || prefix == INT32_TERM_PREFIX || prefix == INT64_TERM_PREFIX;

    SortedSetDocValues dv = reader.getSortedSetDocValues(field);
    if (dv != null) {
      return dv;
    }

    SortedDocValues sdv = reader.getSortedDocValues(field);
    if (sdv != null) {
      return DocValues.singleton(sdv);
    }

    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (info == null) {
      return DocValues.emptySortedSet();
    } else if (info.getDocValuesType() != DocValuesType.NONE) {
      throw new IllegalStateException("Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    } else if (info.getIndexOptions() == IndexOptions.NONE) {
      return DocValues.emptySortedSet();
    }

    // ok we need to uninvert. check if we can optimize a bit.
    Terms terms = reader.terms(field);
    if (terms == null) {
      return DocValues.emptySortedSet();
    } else {
      // if #postings = #docswithfield we know that the field is "single valued enough".
      // it's possible the same term might appear twice in the same document, but SORTED_SET discards frequency.
      // it's still ok with filtering (which we limit to numerics), it just means precisionStep = Inf
      long numPostings = terms.getSumDocFreq();
      if (numPostings != -1 && numPostings == terms.getDocCount()) {
        return DocValues.singleton(getTermsIndex(reader, field));
      }
    }

    DocTermOrds dto = (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new CacheKey(field, prefix));
    return dto.iterator(reader);
  }

  static final class DocTermOrdsCache extends Cache {
    DocTermOrdsCache(FieldCacheImpl wrapper) {
      super(wrapper);
    }

    @Override
    protected Accountable createValue(LeafReader reader, CacheKey key) throws IOException {
      BytesRef prefix = (BytesRef) key.custom;
      return new DocTermOrds(reader, null, key.field, prefix);
    }
  }
}