SolrDocumentFetcher.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.search;

import java.io.IOException;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DocumentStoredFieldVisitor;
import org.apache.lucene.document.LazyDocument;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils;
import org.apache.solr.common.SolrDocumentBase;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.schema.BoolField;
import org.apache.solr.schema.EnumField;
import org.apache.solr.schema.NumberType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.TrieDateField;
import org.apache.solr.schema.TrieDoubleField;
import org.apache.solr.schema.TrieFloatField;
import org.apache.solr.schema.TrieIntField;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A helper class of {@link org.apache.solr.search.SolrIndexSearcher} for stored Document related matters
 * including DocValue substitutions.
 */
public class SolrDocumentFetcher {

  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  private final SolrIndexSearcher searcher;

  private final boolean enableLazyFieldLoading;

  private final SolrCache<Integer,Document> documentCache;

  /** Contains the names/patterns of all docValues=true,stored=false fields in the schema. */
  private final Set<String> allNonStoredDVs;

  /** Contains the names/patterns of all docValues=true,stored=false,useDocValuesAsStored=true fields in the schema. */
  private final Set<String> nonStoredDVsUsedAsStored;

  /** Contains the names/patterns of all docValues=true,stored=false fields, excluding those that are copyField targets in the schema. */
  private final Set<String> nonStoredDVsWithoutCopyTargets;

  private static int largeValueLengthCacheThreshold = Integer.getInteger("solr.largeField.cacheThreshold", 512 * 1024); // internal setting

  private final Set<String> largeFields;

  private Collection<String> storedHighlightFieldNames; // lazy populated; use getter

  SolrDocumentFetcher(SolrIndexSearcher searcher, SolrConfig solrConfig, boolean cachingEnabled) {
    this.searcher = searcher;
    this.enableLazyFieldLoading = solrConfig.enableLazyFieldLoading;
    if (cachingEnabled) {
      documentCache = solrConfig.documentCacheConfig == null ? null : solrConfig.documentCacheConfig.newInstance();
    } else {
      documentCache = null;
    }

    final Set<String> nonStoredDVsUsedAsStored = new HashSet<>();
    final Set<String> allNonStoredDVs = new HashSet<>();
    final Set<String> nonStoredDVsWithoutCopyTargets = new HashSet<>();
    final Set<String> storedLargeFields = new HashSet<>();

    for (FieldInfo fieldInfo : searcher.getFieldInfos()) { // can find materialized dynamic fields, unlike using the Solr IndexSchema.
      final SchemaField schemaField = searcher.getSchema().getFieldOrNull(fieldInfo.name);
      if (schemaField == null) {
        continue;
      }
      if (!schemaField.stored() && schemaField.hasDocValues()) {
        if (schemaField.useDocValuesAsStored()) {
          nonStoredDVsUsedAsStored.add(fieldInfo.name);
        }
        allNonStoredDVs.add(fieldInfo.name);
        if (!searcher.getSchema().isCopyFieldTarget(schemaField)) {
          nonStoredDVsWithoutCopyTargets.add(fieldInfo.name);
        }
      }
      if (schemaField.stored() && schemaField.isLarge()) {
        storedLargeFields.add(schemaField.getName());
      }
    }

    this.nonStoredDVsUsedAsStored = Collections.unmodifiableSet(nonStoredDVsUsedAsStored);
    this.allNonStoredDVs = Collections.unmodifiableSet(allNonStoredDVs);
    this.nonStoredDVsWithoutCopyTargets = Collections.unmodifiableSet(nonStoredDVsWithoutCopyTargets);
    this.largeFields = Collections.unmodifiableSet(storedLargeFields);
  }

  public boolean isLazyFieldLoadingEnabled() {
    return enableLazyFieldLoading;
  }

  public SolrCache<Integer, Document> getDocumentCache() {
    return documentCache;
  }

  /**
   * Returns a collection of the names of all stored fields which can be highlighted the index reader knows about.
   */
  public Collection<String> getStoredHighlightFieldNames() {
    synchronized (this) {
      if (storedHighlightFieldNames == null) {
        storedHighlightFieldNames = new LinkedList<>();
        for (FieldInfo fieldInfo : searcher.getFieldInfos()) {
          final String fieldName = fieldInfo.name;
          try {
            SchemaField field = searcher.getSchema().getField(fieldName);
            if (field.stored() && ((field.getType() instanceof org.apache.solr.schema.TextField)
                || (field.getType() instanceof org.apache.solr.schema.StrField))) {
              storedHighlightFieldNames.add(fieldName);
            }
          } catch (RuntimeException e) { // getField() throws a SolrException, but it arrives as a RuntimeException
            log.warn("Field [{}] found in index, but not defined in schema.", fieldName);
          }
        }
      }
      return storedHighlightFieldNames;
    }
  }

  /** @see SolrIndexSearcher#doc(int) */
  public Document doc(int docId) throws IOException {
    return doc(docId, (Set<String>) null);
  }

  /**
   * Retrieve the {@link Document} instance corresponding to the document id.
   * <p>
   * <b>NOTE</b>: the document will have all fields accessible, but if a field filter is provided, only the provided
   * fields will be loaded (the remainder will be available lazily).
   *
   * @see SolrIndexSearcher#doc(int, Set)
   */
  public Document doc(int i, Set<String> fields) throws IOException {
    Document d;
    if (documentCache != null) {
      d = documentCache.get(i);
      if (d != null) return d;
    }

    final DirectoryReader reader = searcher.getIndexReader();
    if (documentCache != null && !enableLazyFieldLoading) {
      // we do not filter the fields in this case because that would return an incomplete document which would
      // be eventually cached. The alternative would be to read the stored fields twice; once with the fields
      // and then without for caching leading to a performance hit
      // see SOLR-8858 for related discussion
      fields = null;
    }
    final SolrDocumentStoredFieldVisitor visitor = new SolrDocumentStoredFieldVisitor(fields, reader, i);
    reader.document(i, visitor);
    d = visitor.getDocument();

    if (documentCache != null) {
      documentCache.put(i, d);
    }

    return d;
  }

  /** {@link StoredFieldVisitor} which loads the specified fields eagerly (or all if null).
   * If {@link #enableLazyFieldLoading} then the rest get special lazy field entries.  Designated "large"
   * fields will always get a special field entry. */
  private class SolrDocumentStoredFieldVisitor extends DocumentStoredFieldVisitor {
    private final Document doc;
    private final LazyDocument lazyFieldProducer; // arguably a better name than LazyDocument; at least how we use it here
    private final int docId;
    private final boolean addLargeFieldsLazily;

    SolrDocumentStoredFieldVisitor(Set<String> toLoad, IndexReader reader, int docId) {
      super(toLoad);
      this.docId = docId;
      this.doc = getDocument();
      this.lazyFieldProducer = toLoad != null && enableLazyFieldLoading ? new LazyDocument(reader, docId) : null;
      this.addLargeFieldsLazily = (documentCache != null && !largeFields.isEmpty());
      //TODO can we return Status.STOP after a val is loaded and we know there are no other fields of interest?
      //    When: toLoad is one single-valued field, no lazyFieldProducer
    }

    @Override
    public Status needsField(FieldInfo fieldInfo) throws IOException {
      Status status = super.needsField(fieldInfo);
      assert status != Status.STOP : "Status.STOP not supported or expected";
      if (addLargeFieldsLazily && largeFields.contains(fieldInfo.name)) { // load "large" fields using this lazy mechanism
        if (lazyFieldProducer != null || status == Status.YES) {
          doc.add(new LargeLazyField(fieldInfo.name, docId));
        }
        return Status.NO;
      }
      if (status == Status.NO && lazyFieldProducer != null) { // lazy
        doc.add(lazyFieldProducer.getField(fieldInfo));
      }
      return status;
    }
  }

  /** @see SolrIndexSearcher#doc(int, StoredFieldVisitor) */
  public void doc(int docId, StoredFieldVisitor visitor) throws IOException {
    if (documentCache != null) {
      Document cached = documentCache.get(docId);
      if (cached != null) {
        visitFromCached(cached, visitor);
        return;
      }
    }
    searcher.getIndexReader().document(docId, visitor);
  }

  /** Executes a stored field visitor against a hit from the document cache */
  private void visitFromCached(Document document, StoredFieldVisitor visitor) throws IOException {
    for (IndexableField f : document) {
      final FieldInfo info = searcher.getFieldInfos().fieldInfo(f.name());
      final StoredFieldVisitor.Status needsField = visitor.needsField(info);
      if (needsField == StoredFieldVisitor.Status.STOP) return;
      if (needsField == StoredFieldVisitor.Status.NO) continue;
      BytesRef binaryValue = f.binaryValue();
      if (binaryValue != null) {
        visitor.binaryField(info, toByteArrayUnwrapIfPossible(binaryValue));
        continue;
      }
      Number numericValue = f.numericValue();
      if (numericValue != null) {
        if (numericValue instanceof Double) {
          visitor.doubleField(info, numericValue.doubleValue());
        } else if (numericValue instanceof Integer) {
          visitor.intField(info, numericValue.intValue());
        } else if (numericValue instanceof Float) {
          visitor.floatField(info, numericValue.floatValue());
        } else if (numericValue instanceof Long) {
          visitor.longField(info, numericValue.longValue());
        } else {
          throw new AssertionError();
        }
        continue;
      }
      // must be String
      if (f instanceof LargeLazyField) { // optimization to avoid premature string conversion
        visitor.stringField(info, toByteArrayUnwrapIfPossible(((LargeLazyField) f).readBytes()));
      } else {
        visitor.stringField(info, f.stringValue().getBytes(StandardCharsets.UTF_8));
      }
    }
  }

  private byte[] toByteArrayUnwrapIfPossible(BytesRef bytesRef) {
    if (bytesRef.offset == 0 && bytesRef.bytes.length == bytesRef.length) {
      return bytesRef.bytes;
    } else {
      return Arrays.copyOfRange(bytesRef.bytes, bytesRef.offset, bytesRef.offset + bytesRef.length);
    }
  }

  /** Unlike LazyDocument.LazyField, we (a) don't cache large values, and (b) provide access to the byte[]. */
  class LargeLazyField implements IndexableField {

    final String name;
    final int docId;
    // synchronize on 'this' to access:
    BytesRef cachedBytes; // we only conditionally populate this if it's big enough

    private LargeLazyField(String name, int docId) {
      this.name = name;
      this.docId = docId;
    }

    @Override
    public String toString() {
      return fieldType().toString() + "<" + name() + ">"; // mimic Field.java
    }

    @Override
    public String name() {
      return name;
    }

    @Override
    public IndexableFieldType fieldType() {
      return searcher.getSchema().getField(name());
    }

    @Override
    public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
      return analyzer.tokenStream(name(), stringValue()); // or we could throw unsupported exception?
    }
    /** (for tests) */
    synchronized boolean hasBeenLoaded() {
      return cachedBytes != null;
    }

    @Override
    public synchronized String stringValue() {
      try {
        return readBytes().utf8ToString();
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    synchronized BytesRef readBytes() throws IOException {
      if (cachedBytes != null) {
        return cachedBytes;
      } else {
        BytesRef bytesRef = new BytesRef();
        searcher.getIndexReader().document(docId, new StoredFieldVisitor() {
          boolean done = false;
          @Override
          public Status needsField(FieldInfo fieldInfo) throws IOException {
            if (done) {
              return Status.STOP;
            }
            return fieldInfo.name.equals(name()) ? Status.YES : Status.NO;
          }

          @Override
          public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException {
            bytesRef.bytes = value;
            bytesRef.length = value.length;
            done = true;
          }

          @Override
          public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
            throw new UnsupportedOperationException("'large' binary fields are not (yet) supported");
          }
        });
        if (bytesRef.length < largeValueLengthCacheThreshold) {
          return cachedBytes = bytesRef;
        } else {
          return bytesRef;
        }
      }
    }

    @Override
    public BytesRef binaryValue() {
      return null;
    }

    @Override
    public Reader readerValue() {
      return null;
    }

    @Override
    public Number numericValue() {
      return null;
    }
  }

  /**
   * This will fetch and add the docValues fields to a given SolrDocument/SolrInputDocument
   *
   * @param doc
   *          A SolrDocument or SolrInputDocument instance where docValues will be added
   * @param docid
   *          The lucene docid of the document to be populated
   * @param fields
   *          The list of docValues fields to be decorated
   */
  public void decorateDocValueFields(@SuppressWarnings("rawtypes") SolrDocumentBase doc, int docid, Set<String> fields)
      throws IOException {
    final List<LeafReaderContext> leafContexts = searcher.getLeafContexts();
    final int subIndex = ReaderUtil.subIndex(docid, leafContexts);
    final int localId = docid - leafContexts.get(subIndex).docBase;
    final LeafReader leafReader = leafContexts.get(subIndex).reader();
    for (String fieldName : fields) {
      final SchemaField schemaField = searcher.getSchema().getFieldOrNull(fieldName);
      if (schemaField == null || !schemaField.hasDocValues() || doc.containsKey(fieldName)) {
        log.warn("Couldn't decorate docValues for field: [{}], schemaField: [{}]", fieldName, schemaField);
        continue;
      }
      FieldInfo fi = searcher.getFieldInfos().fieldInfo(fieldName);
      if (fi == null) {
        continue; // Searcher doesn't have info about this field, hence ignore it.
      }
      final DocValuesType dvType = fi.getDocValuesType();
      switch (dvType) {
        case NUMERIC:
          final NumericDocValues ndv = leafReader.getNumericDocValues(fieldName);
          if (ndv == null) {
            continue;
          }
          Long val;
          if (ndv.advanceExact(localId)) {
            val = ndv.longValue();
          } else {
            continue;
          }
          Object newVal = val;
          if (schemaField.getType().isPointField()) {
            // TODO: Maybe merge PointField with TrieFields here
            NumberType type = schemaField.getType().getNumberType();
            switch (type) {
              case INTEGER:
                newVal = val.intValue();
                break;
              case LONG:
                newVal = val.longValue();
                break;
              case FLOAT:
                newVal = Float.intBitsToFloat(val.intValue());
                break;
              case DOUBLE:
                newVal = Double.longBitsToDouble(val);
                break;
              case DATE:
                newVal = new Date(val);
                break;
              default:
                throw new AssertionError("Unexpected PointType: " + type);
            }
          } else {
            if (schemaField.getType() instanceof TrieIntField) {
              newVal = val.intValue();
            } else if (schemaField.getType() instanceof TrieFloatField) {
              newVal = Float.intBitsToFloat(val.intValue());
            } else if (schemaField.getType() instanceof TrieDoubleField) {
              newVal = Double.longBitsToDouble(val);
            } else if (schemaField.getType() instanceof TrieDateField) {
              newVal = new Date(val);
            } else if (schemaField.getType() instanceof EnumField) {
              newVal = ((EnumField) schemaField.getType()).intValueToStringValue(val.intValue());
            }
          }
          doc.addField(fieldName, newVal);
          break;
        case BINARY:
          BinaryDocValues bdv = leafReader.getBinaryDocValues(fieldName);
          if (bdv == null) {
            continue;
          }
          BytesRef value;
          if (bdv.advanceExact(localId)) {
            value = BytesRef.deepCopyOf(bdv.binaryValue());
          } else {
            continue;
          }
          doc.addField(fieldName, value);
          break;
        case SORTED:
          SortedDocValues sdv = leafReader.getSortedDocValues(fieldName);
          if (sdv == null) {
            continue;
          }
          if (sdv.advanceExact(localId)) {
            final BytesRef bRef = sdv.binaryValue();
            // Special handling for Boolean fields since they're stored as 'T' and 'F'.
            if (schemaField.getType() instanceof BoolField) {
              doc.addField(fieldName, schemaField.getType().toObject(schemaField, bRef));
            } else {
              doc.addField(fieldName, bRef.utf8ToString());
            }
          }
          break;
        case SORTED_NUMERIC:
          final SortedNumericDocValues numericDv = leafReader.getSortedNumericDocValues(fieldName);
          NumberType type = schemaField.getType().getNumberType();
          if (numericDv != null) {
            if (numericDv.advance(localId) == localId) {
              final List<Object> outValues = new ArrayList<Object>(numericDv.docValueCount());
              for (int i = 0; i < numericDv.docValueCount(); i++) {
                long number = numericDv.nextValue();
                switch (type) {
                  case INTEGER:
                    outValues.add((int)number);
                    break;
                  case LONG:
                    outValues.add(number);
                    break;
                  case FLOAT:
                    outValues.add(NumericUtils.sortableIntToFloat((int)number));
                    break;
                  case DOUBLE:
                    outValues.add(NumericUtils.sortableLongToDouble(number));
                    break;
                  case DATE:
                    outValues.add(new Date(number));
                    break;
                  default:
                    throw new AssertionError("Unexpected PointType: " + type);
                }
              }
              assert outValues.size() > 0;
              doc.addField(fieldName, outValues);
            }
          }
        case SORTED_SET:
          final SortedSetDocValues values = leafReader.getSortedSetDocValues(fieldName);
          if (values != null && values.getValueCount() > 0) {
            if (values.advance(localId) == localId) {
              final List<Object> outValues = new LinkedList<>();
              for (long ord = values.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = values.nextOrd()) {
                value = values.lookupOrd(ord);
                outValues.add(schemaField.getType().toObject(schemaField, value));
              }
              assert outValues.size() > 0;
              doc.addField(fieldName, outValues);
            }
          }
        case NONE:
          break;
      }
    }
  }

  /**
   * Returns an unmodifiable set of non-stored docValues field names.
   *
   * @param onlyUseDocValuesAsStored
   *          If false, returns all non-stored docValues. If true, returns only those non-stored docValues which have
   *          the {@link SchemaField#useDocValuesAsStored()} flag true.
   */
  public Set<String> getNonStoredDVs(boolean onlyUseDocValuesAsStored) {
    return onlyUseDocValuesAsStored ? nonStoredDVsUsedAsStored : allNonStoredDVs;
  }

  /**
   * Returns an unmodifiable set of names of non-stored docValues fields, except those that are targets of a copy field.
   */
  public Set<String> getNonStoredDVsWithoutCopyTargets() {
    return nonStoredDVsWithoutCopyTargets;
  }

}