/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.solr.search; import java.io.IOException; import java.io.Reader; import java.lang.invoke.MethodHandles; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.Date; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.Document; import org.apache.lucene.document.DocumentStoredFieldVisitor; import org.apache.lucene.document.LazyDocument; import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.IndexableFieldType; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.index.SortedDocValues; import org.apache.lucene.index.SortedNumericDocValues; import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.NumericUtils; import org.apache.solr.common.SolrDocumentBase; import org.apache.solr.core.SolrConfig; import org.apache.solr.schema.BoolField; import org.apache.solr.schema.EnumField; import org.apache.solr.schema.NumberType; import org.apache.solr.schema.SchemaField; import org.apache.solr.schema.TrieDateField; import org.apache.solr.schema.TrieDoubleField; import org.apache.solr.schema.TrieFloatField; import org.apache.solr.schema.TrieIntField; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * A helper class of {@link org.apache.solr.search.SolrIndexSearcher} for stored Document related matters * including DocValue substitutions. */ public class SolrDocumentFetcher { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); private final SolrIndexSearcher searcher; private final boolean enableLazyFieldLoading; private final SolrCache<Integer,Document> documentCache; /** Contains the names/patterns of all docValues=true,stored=false fields in the schema. */ private final Set<String> allNonStoredDVs; /** Contains the names/patterns of all docValues=true,stored=false,useDocValuesAsStored=true fields in the schema. */ private final Set<String> nonStoredDVsUsedAsStored; /** Contains the names/patterns of all docValues=true,stored=false fields, excluding those that are copyField targets in the schema. */ private final Set<String> nonStoredDVsWithoutCopyTargets; private static int largeValueLengthCacheThreshold = Integer.getInteger("solr.largeField.cacheThreshold", 512 * 1024); // internal setting private final Set<String> largeFields; private Collection<String> storedHighlightFieldNames; // lazy populated; use getter SolrDocumentFetcher(SolrIndexSearcher searcher, SolrConfig solrConfig, boolean cachingEnabled) { this.searcher = searcher; this.enableLazyFieldLoading = solrConfig.enableLazyFieldLoading; if (cachingEnabled) { documentCache = solrConfig.documentCacheConfig == null ? null : solrConfig.documentCacheConfig.newInstance(); } else { documentCache = null; } final Set<String> nonStoredDVsUsedAsStored = new HashSet<>(); final Set<String> allNonStoredDVs = new HashSet<>(); final Set<String> nonStoredDVsWithoutCopyTargets = new HashSet<>(); final Set<String> storedLargeFields = new HashSet<>(); for (FieldInfo fieldInfo : searcher.getFieldInfos()) { // can find materialized dynamic fields, unlike using the Solr IndexSchema. final SchemaField schemaField = searcher.getSchema().getFieldOrNull(fieldInfo.name); if (schemaField == null) { continue; } if (!schemaField.stored() && schemaField.hasDocValues()) { if (schemaField.useDocValuesAsStored()) { nonStoredDVsUsedAsStored.add(fieldInfo.name); } allNonStoredDVs.add(fieldInfo.name); if (!searcher.getSchema().isCopyFieldTarget(schemaField)) { nonStoredDVsWithoutCopyTargets.add(fieldInfo.name); } } if (schemaField.stored() && schemaField.isLarge()) { storedLargeFields.add(schemaField.getName()); } } this.nonStoredDVsUsedAsStored = Collections.unmodifiableSet(nonStoredDVsUsedAsStored); this.allNonStoredDVs = Collections.unmodifiableSet(allNonStoredDVs); this.nonStoredDVsWithoutCopyTargets = Collections.unmodifiableSet(nonStoredDVsWithoutCopyTargets); this.largeFields = Collections.unmodifiableSet(storedLargeFields); } public boolean isLazyFieldLoadingEnabled() { return enableLazyFieldLoading; } public SolrCache<Integer, Document> getDocumentCache() { return documentCache; } /** * Returns a collection of the names of all stored fields which can be highlighted the index reader knows about. */ public Collection<String> getStoredHighlightFieldNames() { synchronized (this) { if (storedHighlightFieldNames == null) { storedHighlightFieldNames = new LinkedList<>(); for (FieldInfo fieldInfo : searcher.getFieldInfos()) { final String fieldName = fieldInfo.name; try { SchemaField field = searcher.getSchema().getField(fieldName); if (field.stored() && ((field.getType() instanceof org.apache.solr.schema.TextField) || (field.getType() instanceof org.apache.solr.schema.StrField))) { storedHighlightFieldNames.add(fieldName); } } catch (RuntimeException e) { // getField() throws a SolrException, but it arrives as a RuntimeException log.warn("Field [{}] found in index, but not defined in schema.", fieldName); } } } return storedHighlightFieldNames; } } /** @see SolrIndexSearcher#doc(int) */ public Document doc(int docId) throws IOException { return doc(docId, (Set<String>) null); } /** * Retrieve the {@link Document} instance corresponding to the document id. * <p> * <b>NOTE</b>: the document will have all fields accessible, but if a field filter is provided, only the provided * fields will be loaded (the remainder will be available lazily). * * @see SolrIndexSearcher#doc(int, Set) */ public Document doc(int i, Set<String> fields) throws IOException { Document d; if (documentCache != null) { d = documentCache.get(i); if (d != null) return d; } final DirectoryReader reader = searcher.getIndexReader(); if (documentCache != null && !enableLazyFieldLoading) { // we do not filter the fields in this case because that would return an incomplete document which would // be eventually cached. The alternative would be to read the stored fields twice; once with the fields // and then without for caching leading to a performance hit // see SOLR-8858 for related discussion fields = null; } final SolrDocumentStoredFieldVisitor visitor = new SolrDocumentStoredFieldVisitor(fields, reader, i); reader.document(i, visitor); d = visitor.getDocument(); if (documentCache != null) { documentCache.put(i, d); } return d; } /** {@link StoredFieldVisitor} which loads the specified fields eagerly (or all if null). * If {@link #enableLazyFieldLoading} then the rest get special lazy field entries. Designated "large" * fields will always get a special field entry. */ private class SolrDocumentStoredFieldVisitor extends DocumentStoredFieldVisitor { private final Document doc; private final LazyDocument lazyFieldProducer; // arguably a better name than LazyDocument; at least how we use it here private final int docId; private final boolean addLargeFieldsLazily; SolrDocumentStoredFieldVisitor(Set<String> toLoad, IndexReader reader, int docId) { super(toLoad); this.docId = docId; this.doc = getDocument(); this.lazyFieldProducer = toLoad != null && enableLazyFieldLoading ? new LazyDocument(reader, docId) : null; this.addLargeFieldsLazily = (documentCache != null && !largeFields.isEmpty()); //TODO can we return Status.STOP after a val is loaded and we know there are no other fields of interest? // When: toLoad is one single-valued field, no lazyFieldProducer } @Override public Status needsField(FieldInfo fieldInfo) throws IOException { Status status = super.needsField(fieldInfo); assert status != Status.STOP : "Status.STOP not supported or expected"; if (addLargeFieldsLazily && largeFields.contains(fieldInfo.name)) { // load "large" fields using this lazy mechanism if (lazyFieldProducer != null || status == Status.YES) { doc.add(new LargeLazyField(fieldInfo.name, docId)); } return Status.NO; } if (status == Status.NO && lazyFieldProducer != null) { // lazy doc.add(lazyFieldProducer.getField(fieldInfo)); } return status; } } /** @see SolrIndexSearcher#doc(int, StoredFieldVisitor) */ public void doc(int docId, StoredFieldVisitor visitor) throws IOException { if (documentCache != null) { Document cached = documentCache.get(docId); if (cached != null) { visitFromCached(cached, visitor); return; } } searcher.getIndexReader().document(docId, visitor); } /** Executes a stored field visitor against a hit from the document cache */ private void visitFromCached(Document document, StoredFieldVisitor visitor) throws IOException { for (IndexableField f : document) { final FieldInfo info = searcher.getFieldInfos().fieldInfo(f.name()); final StoredFieldVisitor.Status needsField = visitor.needsField(info); if (needsField == StoredFieldVisitor.Status.STOP) return; if (needsField == StoredFieldVisitor.Status.NO) continue; BytesRef binaryValue = f.binaryValue(); if (binaryValue != null) { visitor.binaryField(info, toByteArrayUnwrapIfPossible(binaryValue)); continue; } Number numericValue = f.numericValue(); if (numericValue != null) { if (numericValue instanceof Double) { visitor.doubleField(info, numericValue.doubleValue()); } else if (numericValue instanceof Integer) { visitor.intField(info, numericValue.intValue()); } else if (numericValue instanceof Float) { visitor.floatField(info, numericValue.floatValue()); } else if (numericValue instanceof Long) { visitor.longField(info, numericValue.longValue()); } else { throw new AssertionError(); } continue; } // must be String if (f instanceof LargeLazyField) { // optimization to avoid premature string conversion visitor.stringField(info, toByteArrayUnwrapIfPossible(((LargeLazyField) f).readBytes())); } else { visitor.stringField(info, f.stringValue().getBytes(StandardCharsets.UTF_8)); } } } private byte[] toByteArrayUnwrapIfPossible(BytesRef bytesRef) { if (bytesRef.offset == 0 && bytesRef.bytes.length == bytesRef.length) { return bytesRef.bytes; } else { return Arrays.copyOfRange(bytesRef.bytes, bytesRef.offset, bytesRef.offset + bytesRef.length); } } /** Unlike LazyDocument.LazyField, we (a) don't cache large values, and (b) provide access to the byte[]. */ class LargeLazyField implements IndexableField { final String name; final int docId; // synchronize on 'this' to access: BytesRef cachedBytes; // we only conditionally populate this if it's big enough private LargeLazyField(String name, int docId) { this.name = name; this.docId = docId; } @Override public String toString() { return fieldType().toString() + "<" + name() + ">"; // mimic Field.java } @Override public String name() { return name; } @Override public IndexableFieldType fieldType() { return searcher.getSchema().getField(name()); } @Override public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) { return analyzer.tokenStream(name(), stringValue()); // or we could throw unsupported exception? } /** (for tests) */ synchronized boolean hasBeenLoaded() { return cachedBytes != null; } @Override public synchronized String stringValue() { try { return readBytes().utf8ToString(); } catch (IOException e) { throw new RuntimeException(e); } } synchronized BytesRef readBytes() throws IOException { if (cachedBytes != null) { return cachedBytes; } else { BytesRef bytesRef = new BytesRef(); searcher.getIndexReader().document(docId, new StoredFieldVisitor() { boolean done = false; @Override public Status needsField(FieldInfo fieldInfo) throws IOException { if (done) { return Status.STOP; } return fieldInfo.name.equals(name()) ? Status.YES : Status.NO; } @Override public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException { bytesRef.bytes = value; bytesRef.length = value.length; done = true; } @Override public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException { throw new UnsupportedOperationException("'large' binary fields are not (yet) supported"); } }); if (bytesRef.length < largeValueLengthCacheThreshold) { return cachedBytes = bytesRef; } else { return bytesRef; } } } @Override public BytesRef binaryValue() { return null; } @Override public Reader readerValue() { return null; } @Override public Number numericValue() { return null; } } /** * This will fetch and add the docValues fields to a given SolrDocument/SolrInputDocument * * @param doc * A SolrDocument or SolrInputDocument instance where docValues will be added * @param docid * The lucene docid of the document to be populated * @param fields * The list of docValues fields to be decorated */ public void decorateDocValueFields(@SuppressWarnings("rawtypes") SolrDocumentBase doc, int docid, Set<String> fields) throws IOException { final List<LeafReaderContext> leafContexts = searcher.getLeafContexts(); final int subIndex = ReaderUtil.subIndex(docid, leafContexts); final int localId = docid - leafContexts.get(subIndex).docBase; final LeafReader leafReader = leafContexts.get(subIndex).reader(); for (String fieldName : fields) { final SchemaField schemaField = searcher.getSchema().getFieldOrNull(fieldName); if (schemaField == null || !schemaField.hasDocValues() || doc.containsKey(fieldName)) { log.warn("Couldn't decorate docValues for field: [{}], schemaField: [{}]", fieldName, schemaField); continue; } FieldInfo fi = searcher.getFieldInfos().fieldInfo(fieldName); if (fi == null) { continue; // Searcher doesn't have info about this field, hence ignore it. } final DocValuesType dvType = fi.getDocValuesType(); switch (dvType) { case NUMERIC: final NumericDocValues ndv = leafReader.getNumericDocValues(fieldName); if (ndv == null) { continue; } Long val; if (ndv.advanceExact(localId)) { val = ndv.longValue(); } else { continue; } Object newVal = val; if (schemaField.getType().isPointField()) { // TODO: Maybe merge PointField with TrieFields here NumberType type = schemaField.getType().getNumberType(); switch (type) { case INTEGER: newVal = val.intValue(); break; case LONG: newVal = val.longValue(); break; case FLOAT: newVal = Float.intBitsToFloat(val.intValue()); break; case DOUBLE: newVal = Double.longBitsToDouble(val); break; case DATE: newVal = new Date(val); break; default: throw new AssertionError("Unexpected PointType: " + type); } } else { if (schemaField.getType() instanceof TrieIntField) { newVal = val.intValue(); } else if (schemaField.getType() instanceof TrieFloatField) { newVal = Float.intBitsToFloat(val.intValue()); } else if (schemaField.getType() instanceof TrieDoubleField) { newVal = Double.longBitsToDouble(val); } else if (schemaField.getType() instanceof TrieDateField) { newVal = new Date(val); } else if (schemaField.getType() instanceof EnumField) { newVal = ((EnumField) schemaField.getType()).intValueToStringValue(val.intValue()); } } doc.addField(fieldName, newVal); break; case BINARY: BinaryDocValues bdv = leafReader.getBinaryDocValues(fieldName); if (bdv == null) { continue; } BytesRef value; if (bdv.advanceExact(localId)) { value = BytesRef.deepCopyOf(bdv.binaryValue()); } else { continue; } doc.addField(fieldName, value); break; case SORTED: SortedDocValues sdv = leafReader.getSortedDocValues(fieldName); if (sdv == null) { continue; } if (sdv.advanceExact(localId)) { final BytesRef bRef = sdv.binaryValue(); // Special handling for Boolean fields since they're stored as 'T' and 'F'. if (schemaField.getType() instanceof BoolField) { doc.addField(fieldName, schemaField.getType().toObject(schemaField, bRef)); } else { doc.addField(fieldName, bRef.utf8ToString()); } } break; case SORTED_NUMERIC: final SortedNumericDocValues numericDv = leafReader.getSortedNumericDocValues(fieldName); NumberType type = schemaField.getType().getNumberType(); if (numericDv != null) { if (numericDv.advance(localId) == localId) { final List<Object> outValues = new ArrayList<Object>(numericDv.docValueCount()); for (int i = 0; i < numericDv.docValueCount(); i++) { long number = numericDv.nextValue(); switch (type) { case INTEGER: outValues.add((int)number); break; case LONG: outValues.add(number); break; case FLOAT: outValues.add(NumericUtils.sortableIntToFloat((int)number)); break; case DOUBLE: outValues.add(NumericUtils.sortableLongToDouble(number)); break; case DATE: outValues.add(new Date(number)); break; default: throw new AssertionError("Unexpected PointType: " + type); } } assert outValues.size() > 0; doc.addField(fieldName, outValues); } } case SORTED_SET: final SortedSetDocValues values = leafReader.getSortedSetDocValues(fieldName); if (values != null && values.getValueCount() > 0) { if (values.advance(localId) == localId) { final List<Object> outValues = new LinkedList<>(); for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) { value = values.lookupOrd(ord); outValues.add(schemaField.getType().toObject(schemaField, value)); } assert outValues.size() > 0; doc.addField(fieldName, outValues); } } case NONE: break; } } } /** * Returns an unmodifiable set of non-stored docValues field names. * * @param onlyUseDocValuesAsStored * If false, returns all non-stored docValues. If true, returns only those non-stored docValues which have * the {@link SchemaField#useDocValuesAsStored()} flag true. */ public Set<String> getNonStoredDVs(boolean onlyUseDocValuesAsStored) { return onlyUseDocValuesAsStored ? nonStoredDVsUsedAsStored : allNonStoredDVs; } /** * Returns an unmodifiable set of names of non-stored docValues fields, except those that are targets of a copy field. */ public Set<String> getNonStoredDVsWithoutCopyTargets() { return nonStoredDVsWithoutCopyTargets; } }