/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.request;

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Collections;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.document.FieldType.NumericType;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.StringHelper;
import org.apache.solr.common.params.FacetParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.TrieField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.QueryContext;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.function.FuncValues;
import org.apache.solr.search.function.ValueSource;

/**
 * Utility class to compute facets on numeric fields.
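 *
 * <p>Counts are accumulated per segment from the {@link FieldCache}, so each matching
 * document is read once; the terms dictionary is only consulted when zero counts are
 * requested (facet.mincount=0).
 *
 * <p>Illustrative caller sketch only: the parameter values below are made-up examples,
 * not defaults, and in Solr this class is normally invoked by the faceting code rather
 * than by user code.
 *
 * <pre>
 * // NamedList&lt;Integer&gt; counts =
 * //     NumericFacets.getCounts(searcher, docs, "price", 0, 10, 1, false, "count");
 * </pre>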
 */
final class NumericFacets {

  NumericFacets() {}

  /**
   * Minimal open-addressing hash table that maps the sortable bit pattern of a numeric
   * value to its document count. One docID is kept per value so that the value can later
   * be resolved to a readable string through the field's {@link ValueSource}.
   */
  static class HashTable {

    static final float LOAD_FACTOR = 0.7f;

    long[] bits; // bits identifying a value
    int[] counts; // number of matching docs for the value stored in the same slot
    int[] docIDs; // one (top-level) docID per slot, used to resolve the value later
    int mask;
    int size;
    int threshold;

    HashTable() {
      final int capacity = 64; // must be a power of 2
      bits = new long[capacity];
      counts = new int[capacity];
      docIDs = new int[capacity];
      mask = capacity - 1;
      size = 0;
      threshold = (int) (capacity * LOAD_FACTOR);
    }

    private int hash(long v) {
      int h = (int) (v ^ (v >>> 32));
      h = (31 * h) & mask; // * 31 to try to use the whole table, even if values are dense
      return h;
    }

    void add(int docID, long value, int count) {
      if (size >= threshold) {
        rehash();
      }
      final int h = hash(value);
      // linear probing: find the slot holding this value, or the first empty slot
      for (int slot = h; ; slot = (slot + 1) & mask) {
        if (counts[slot] == 0) {
          bits[slot] = value;
          docIDs[slot] = docID;
          ++size;
        } else if (bits[slot] != value) {
          continue;
        }
        counts[slot] += count;
        break;
      }
    }

    private void rehash() {
      final long[] oldBits = bits;
      final int[] oldCounts = counts;
      final int[] oldDocIDs = docIDs;

      final int newCapacity = bits.length * 2;
      bits = new long[newCapacity];
      counts = new int[newCapacity];
      docIDs = new int[newCapacity];
      mask = newCapacity - 1;
      threshold = (int) (LOAD_FACTOR * newCapacity);
      size = 0;

      for (int i = 0; i < oldBits.length; ++i) {
        if (oldCounts[i] > 0) {
          add(oldDocIDs[i], oldBits[i], oldCounts[i]);
        }
      }
    }

  }

  private static class Entry {
    int docID;
    int count;
    long bits;
  }

  public static NamedList<Integer> getCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName,
                                             int offset, int limit, int mincount, boolean missing, String sort) throws IOException {
    QueryContext context = new QueryContext(searcher);
    final boolean zeros = mincount <= 0;
    mincount = Math.max(mincount, 1);
    final SchemaField sf = searcher.getSchema().getField(fieldName);
    final FieldType ft = sf.getType();
    final NumericType numericType = ft.getNumericType();
    if (numericType == null) {
      throw new IllegalStateException();
    }
    final List<AtomicReaderContext> leaves = searcher.getIndexReader().leaves();

    // 1. accumulate
    final HashTable hashTable = new HashTable();
    final Iterator<AtomicReaderContext> ctxIt = leaves.iterator();
    AtomicReaderContext ctx = null;
    FieldCache.Longs longs = null;
    Bits docsWithField = null;
    int missingCount = 0;
    for (DocIterator docsIt = docs.iterator(); docsIt.hasNext(); ) {
      final int doc = docsIt.nextDoc();
      if (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc()) {
        // advance to the segment containing this doc and (re)load its field cache values
        do {
          ctx = ctxIt.next();
        } while (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc());
        assert doc >= ctx.docBase;
        switch (numericType) {
          case LONG:
            longs = FieldCache.DEFAULT.getLongs(ctx.reader(), fieldName, true);
            break;
          case INT:
            final FieldCache.Ints ints = FieldCache.DEFAULT.getInts(ctx.reader(), fieldName, true);
            longs = new FieldCache.Longs() {
              @Override
              public long get(int docID) {
                return ints.get(docID);
              }
            };
            break;
          case FLOAT:
            final FieldCache.Floats floats = FieldCache.DEFAULT.getFloats(ctx.reader(), fieldName, true);
            longs = new FieldCache.Longs() {
              @Override
              public long get(int docID) {
                return NumericUtils.floatToSortableInt(floats.get(docID));
              }
            };
            break;
          case DOUBLE:
            final FieldCache.Doubles doubles = FieldCache.DEFAULT.getDoubles(ctx.reader(), fieldName, true);
            longs = new FieldCache.Longs() {
              @Override
              public long get(int docID) {
                return NumericUtils.doubleToSortableLong(doubles.get(docID));
              }
            };
            break;
          default:
            throw new AssertionError();
        }
        docsWithField = FieldCache.DEFAULT.getDocsWithField(ctx.reader(), fieldName);
      }
      long v = longs.get(doc - ctx.docBase);
      // 0 is returned both for an actual 0 value and for a missing value, so disambiguate
      // with docsWithField
      if (v != 0 || docsWithField.get(doc - ctx.docBase)) {
        hashTable.add(doc, v, 1);
      } else {
        ++missingCount;
      }
    }

    // 2. select top-k facet values
    final int pqSize = limit < 0 ? hashTable.size : Math.min(offset + limit, hashTable.size);
    final PriorityQueue<Entry> pq;
    if (FacetParams.FACET_SORT_COUNT.equals(sort) || FacetParams.FACET_SORT_COUNT_LEGACY.equals(sort)) {
      pq = new PriorityQueue<Entry>(pqSize) {
        @Override
        protected boolean lessThan(Entry a, Entry b) {
          return a.count < b.count || (a.count == b.count && a.bits > b.bits);
        }
      };
    } else {
      pq = new PriorityQueue<Entry>(pqSize) {
        @Override
        protected boolean lessThan(Entry a, Entry b) {
          return a.bits > b.bits;
        }
      };
    }
    Entry e = null;
    for (int i = 0; i < hashTable.bits.length; ++i) {
      if (hashTable.counts[i] >= mincount) {
        if (e == null) {
          e = new Entry();
        }
        e.bits = hashTable.bits[i];
        e.count = hashTable.counts[i];
        e.docID = hashTable.docIDs[i];
        e = pq.insertWithOverflow(e);
      }
    }

    // 3. build the NamedList
    final ValueSource vs = ft.getValueSource(sf, null);
    final NamedList<Integer> result = new NamedList<>();

    // This is complicated because if facet.mincount=0, the counts need to be merged with
    // terms from the terms dict
    if (!zeros || FacetParams.FACET_SORT_COUNT.equals(sort) || FacetParams.FACET_SORT_COUNT_LEGACY.equals(sort)) {
      // Only keep items we're interested in
      final Deque<Entry> counts = new ArrayDeque<>();
      while (pq.size() > offset) {
        counts.addFirst(pq.pop());
      }

      // Entries from the PQ first, then using the terms dictionary
      for (Entry entry : counts) {
        final int readerIdx = ReaderUtil.subIndex(entry.docID, leaves);
        final FuncValues values = vs.getValues(context, leaves.get(readerIdx));
        result.add(values.strVal(entry.docID - leaves.get(readerIdx).docBase), entry.count);
      }

      if (zeros && (limit < 0 || result.size() < limit)) { // need to merge with the term dict
        if (!sf.indexed()) {
          throw new IllegalStateException("Cannot use " + FacetParams.FACET_MINCOUNT + "=0 on field " + sf.getName() + " which is not indexed");
        }
        // Add zeros until there are limit results
        final Set<String> alreadySeen = new HashSet<>();
        while (pq.size() > 0) {
          Entry entry = pq.pop();
          final int readerIdx = ReaderUtil.subIndex(entry.docID, leaves);
          final FuncValues values = vs.getValues(context, leaves.get(readerIdx));
          alreadySeen.add(values.strVal(entry.docID - leaves.get(readerIdx).docBase));
        }
        for (int i = 0; i < result.size(); ++i) {
          alreadySeen.add(result.getName(i));
        }
        final Terms terms = searcher.getAtomicReader().terms(fieldName);
        if (terms != null) {
          final String prefixStr = TrieField.getMainValuePrefix(ft);
          final BytesRef prefix;
          if (prefixStr != null) {
            prefix = new BytesRef(prefixStr);
          } else {
            prefix = new BytesRef();
          }
          final TermsEnum termsEnum = terms.iterator(null);
          BytesRef term;
          switch (termsEnum.seekCeil(prefix)) {
            case FOUND:
            case NOT_FOUND:
              term = termsEnum.term();
              break;
            case END:
              term = null;
              break;
            default:
              throw new AssertionError();
          }
          final CharsRef spare = new CharsRef();
          for (int skipped = hashTable.size; skipped < offset && term != null && StringHelper.startsWith(term, prefix); ) {
            ft.indexedToReadable(term, spare);
            final String termStr = spare.toString();
            if (!alreadySeen.contains(termStr)) {
              ++skipped;
            }
            term = termsEnum.next();
          }
          for ( ; term != null && StringHelper.startsWith(term, prefix) && (limit < 0 || result.size() < limit); term = termsEnum.next()) {
            ft.indexedToReadable(term, spare);
            final String termStr = spare.toString();
            if (!alreadySeen.contains(termStr)) {
              result.add(termStr, 0);
            }
          }
        }
      }
    } else {
      // sort=index, mincount=0 and we have fewer than limit items
      // => Merge the PQ and the terms dictionary on the fly
      if (!sf.indexed()) {
        throw new IllegalStateException("Cannot use " + FacetParams.FACET_SORT + "=" + FacetParams.FACET_SORT_INDEX + " on a field which is not indexed");
      }
      final Map<String, Integer> counts = new HashMap<>();
      while (pq.size() > 0) {
        final Entry entry = pq.pop();
        final int readerIdx = ReaderUtil.subIndex(entry.docID, leaves);
        final FuncValues values = vs.getValues(context, leaves.get(readerIdx));
        counts.put(values.strVal(entry.docID - leaves.get(readerIdx).docBase), entry.count);
      }
      final Terms terms = searcher.getAtomicReader().terms(fieldName);
      if (terms != null) {
        final String prefixStr = TrieField.getMainValuePrefix(ft);
        final BytesRef prefix;
        if (prefixStr != null) {
          prefix = new BytesRef(prefixStr);
        } else {
          prefix = new BytesRef();
        }
        final TermsEnum termsEnum = terms.iterator(null);
        BytesRef term;
        switch (termsEnum.seekCeil(prefix)) {
          case FOUND:
          case NOT_FOUND:
            term = termsEnum.term();
            break;
          case END:
            term = null;
            break;
          default:
            throw new AssertionError();
        }
        final CharsRef spare = new CharsRef();
        for (int i = 0; i < offset && term != null && StringHelper.startsWith(term, prefix); ++i) {
          term = termsEnum.next();
        }
        for ( ; term != null && StringHelper.startsWith(term, prefix) && (limit < 0 || result.size() < limit); term = termsEnum.next()) {
          ft.indexedToReadable(term, spare);
          final String termStr = spare.toString();
          Integer count = counts.get(termStr);
          if (count == null) {
            count = 0;
          }
          result.add(termStr, count);
        }
      }
    }

    if (missing) {
      result.add(null, missingCount);
    }
    return result;
  }

}