/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.request;

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Collections;
import java.util.Date;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.FilterNumericDocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queries.function.FunctionValues;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.StringHelper;
import org.apache.solr.common.params.FacetParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.NumberType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.TrieField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrIndexSearcher;

/**
 * Utility class to compute facets on numeric fields.
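 *
 * <p>A minimal usage sketch, assuming a {@link SolrIndexSearcher} and a {@link DocSet} of matching
 * documents are already in hand; the field name and parameter values below are illustrative only:
 *
 * <pre>{@code
 * NamedList<Integer> counts = NumericFacets.getCounts(
 *     searcher, docs, "price",       // searcher, matching docs, field to facet on
 *     0, 10,                         // offset, limit
 *     1,                             // mincount
 *     false,                         // missing
 *     FacetParams.FACET_SORT_COUNT); // sort
 * }</pre>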
 */
final class NumericFacets {

  NumericFacets() {}

  static class HashTable {

    static final float LOAD_FACTOR = 0.7f;

    long[] bits; // bits identifying a value
    int[] counts;
    int[] docIDs; // Will be null if HashTable is created with needsDocId=false
    int mask;
    int size;
    int threshold;

    HashTable(boolean needsDocId) {
      final int capacity = 64; // must be a power of 2
      bits = new long[capacity];
      counts = new int[capacity];
      if (needsDocId) {
        docIDs = new int[capacity];
      }
      mask = capacity - 1;
      size = 0;
      threshold = (int) (capacity * LOAD_FACTOR);
    }

    private int hash(long v) {
      int h = (int) (v ^ (v >>> 32));
      h = (31 * h) & mask; // * 31 to try to use the whole table, even if values are dense
      return h;
    }

    void add(int docID, long value, int count) {
      if (size >= threshold) {
        rehash();
      }
      final int h = hash(value);
      for (int slot = h; ; slot = (slot + 1) & mask) {
        if (counts[slot] == 0) {
          bits[slot] = value;
          docIDs[slot] = docID;
          ++size;
        } else if (bits[slot] != value) {
          continue;
        }
        counts[slot] += count;
        break;
      }
    }

    void add(long value, int count) {
      if (size >= threshold) {
        rehash();
      }
      final int h = hash(value);
      for (int slot = h; ; slot = (slot + 1) & mask) {
        if (counts[slot] == 0) {
          bits[slot] = value;
          ++size;
        } else if (bits[slot] != value) {
          continue;
        }
        counts[slot] += count;
        break;
      }
    }

    private void rehash() {
      final long[] oldBits = bits;
      final int[] oldCounts = counts;
      final int[] oldDocIDs = docIDs;

      final int newCapacity = bits.length * 2;
      bits = new long[newCapacity];
      counts = new int[newCapacity];
      if (oldDocIDs != null) {
        docIDs = new int[newCapacity];
      }
      mask = newCapacity - 1;
      threshold = (int) (LOAD_FACTOR * newCapacity);
      size = 0;

      if (oldDocIDs != null) {
        for (int i = 0; i < oldBits.length; ++i) {
          if (oldCounts[i] > 0) {
            add(oldDocIDs[i], oldBits[i], oldCounts[i]);
          }
        }
      } else {
        for (int i = 0; i < oldBits.length; ++i) {
          if (oldCounts[i] > 0) {
            add(oldBits[i], oldCounts[i]);
          }
        }
      }
    }
  }

  private static class Entry {
    int docID;
    int count;
    long bits;
  }

  public static NamedList<Integer> getCounts(SolrIndexSearcher searcher, DocSet docs, String fieldName,
      int offset, int limit, int mincount, boolean missing, String sort) throws IOException {
    final SchemaField sf = searcher.getSchema().getField(fieldName);
    if (sf.multiValued()) {
      // TODO: evaluate using getCountsMultiValued for singleValued numerics with SingletonSortedNumericDocValues
      return getCountsMultiValued(searcher, docs, fieldName, offset, limit, mincount, missing, sort);
    }
    return getCountsSingleValue(searcher, docs, fieldName, offset, limit, mincount, missing, sort);
  }

  private static NamedList<Integer> getCountsSingleValue(SolrIndexSearcher searcher, DocSet docs, String fieldName,
      int offset, int limit, int mincount, boolean missing, String sort) throws IOException {
    boolean zeros = mincount <= 0;
    mincount = Math.max(mincount, 1);
    final SchemaField sf = searcher.getSchema().getField(fieldName);
    final FieldType ft = sf.getType();
    final NumberType numericType = ft.getNumberType();
    if (numericType == null) {
      throw new IllegalStateException();
    }
    // We don't return zeros when using PointFields or when index=false
    zeros = zeros && !ft.isPointField() && sf.indexed();
    final List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();

    // 1. accumulate
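    // Walk the matching documents in doc ID order, advancing per-segment NumericDocValues as we
    // cross segment boundaries; each doc's value (as raw long bits) is counted in an open-addressing
    // hash table, and docs with no value for the field are tallied separately as missing.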
    final HashTable hashTable = new HashTable(true);
    final Iterator<LeafReaderContext> ctxIt = leaves.iterator();
    LeafReaderContext ctx = null;
    NumericDocValues longs = null;
    int missingCount = 0;
    for (DocIterator docsIt = docs.iterator(); docsIt.hasNext(); ) {
      final int doc = docsIt.nextDoc();
      if (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc()) {
        do {
          ctx = ctxIt.next();
        } while (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc());
        assert doc >= ctx.docBase;
        switch (numericType) {
          case LONG:
          case DATE:
          case INTEGER:
            // Long, Date and Integer
            longs = DocValues.getNumeric(ctx.reader(), fieldName);
            break;
          case FLOAT:
            // TODO: this bit flipping should probably be moved to tie-break in the PQ comparator
            longs = new FilterNumericDocValues(DocValues.getNumeric(ctx.reader(), fieldName)) {
              @Override
              public long longValue() throws IOException {
                long bits = super.longValue();
                if (bits < 0) bits ^= 0x7fffffffffffffffL;
                return bits;
              }
            };
            break;
          case DOUBLE:
            // TODO: this bit flipping should probably be moved to tie-break in the PQ comparator
            longs = new FilterNumericDocValues(DocValues.getNumeric(ctx.reader(), fieldName)) {
              @Override
              public long longValue() throws IOException {
                long bits = super.longValue();
                if (bits < 0) bits ^= 0x7fffffffffffffffL;
                return bits;
              }
            };
            break;
          default:
            throw new AssertionError("Unexpected type: " + numericType);
        }
      }
      int valuesDocID = longs.docID();
      if (valuesDocID < doc - ctx.docBase) {
        valuesDocID = longs.advance(doc - ctx.docBase);
      }
      if (valuesDocID == doc - ctx.docBase) {
        hashTable.add(doc, longs.longValue(), 1);
      } else {
        ++missingCount;
      }
    }

    // 2. select top-k facet values
    final int pqSize = limit < 0 ? hashTable.size : Math.min(offset + limit, hashTable.size);
    final PriorityQueue<Entry> pq;
    if (FacetParams.FACET_SORT_COUNT.equals(sort) || FacetParams.FACET_SORT_COUNT_LEGACY.equals(sort)) {
      pq = new PriorityQueue<Entry>(pqSize) {
        @Override
        protected boolean lessThan(Entry a, Entry b) {
          if (a.count < b.count || (a.count == b.count && a.bits > b.bits)) {
            return true;
          } else {
            return false;
          }
        }
      };
    } else {
      pq = new PriorityQueue<Entry>(pqSize) {
        @Override
        protected boolean lessThan(Entry a, Entry b) {
          return a.bits > b.bits;
        }
      };
    }
    Entry e = null;
    for (int i = 0; i < hashTable.bits.length; ++i) {
      if (hashTable.counts[i] >= mincount) {
        if (e == null) {
          e = new Entry();
        }
        e.bits = hashTable.bits[i];
        e.count = hashTable.counts[i];
        e.docID = hashTable.docIDs[i];
        e = pq.insertWithOverflow(e);
      }
    }

    // 3. build the NamedList
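    // With facet.mincount=0 (and an indexed, non-points field), buckets that matched no documents
    // are not in the hash table, so they are merged in from the terms dictionary below while
    // honoring offset, limit and the requested sort.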
    final ValueSource vs = ft.getValueSource(sf, null);
    final NamedList<Integer> result = new NamedList<>();

    // This stuff is complicated because if facet.mincount=0, the counts need
    // to be merged with terms from the terms dict
    if (!zeros || FacetParams.FACET_SORT_COUNT.equals(sort) || FacetParams.FACET_SORT_COUNT_LEGACY.equals(sort)) {
      // Only keep items we're interested in
      final Deque<Entry> counts = new ArrayDeque<>();
      while (pq.size() > offset) {
        counts.addFirst(pq.pop());
      }

      // Entries from the PQ first, then using the terms dictionary
      for (Entry entry : counts) {
        final int readerIdx = ReaderUtil.subIndex(entry.docID, leaves);
        final FunctionValues values = vs.getValues(Collections.emptyMap(), leaves.get(readerIdx));
        result.add(values.strVal(entry.docID - leaves.get(readerIdx).docBase), entry.count);
      }

      if (zeros && (limit < 0 || result.size() < limit)) { // need to merge with the term dict
        if (!sf.indexed() && !sf.hasDocValues()) {
          throw new IllegalStateException("Cannot use " + FacetParams.FACET_MINCOUNT + "=0 on field "
              + sf.getName() + " which is neither indexed nor docValues");
        }
        // Add zeros until there are limit results
        final Set<String> alreadySeen = new HashSet<>();
        while (pq.size() > 0) {
          Entry entry = pq.pop();
          final int readerIdx = ReaderUtil.subIndex(entry.docID, leaves);
          final FunctionValues values = vs.getValues(Collections.emptyMap(), leaves.get(readerIdx));
          alreadySeen.add(values.strVal(entry.docID - leaves.get(readerIdx).docBase));
        }
        for (int i = 0; i < result.size(); ++i) {
          alreadySeen.add(result.getName(i));
        }
        final Terms terms = searcher.getSlowAtomicReader().terms(fieldName);
        if (terms != null) {
          final String prefixStr = TrieField.getMainValuePrefix(ft);
          final BytesRef prefix;
          if (prefixStr != null) {
            prefix = new BytesRef(prefixStr);
          } else {
            prefix = new BytesRef();
          }
          final TermsEnum termsEnum = terms.iterator();
          BytesRef term;
          switch (termsEnum.seekCeil(prefix)) {
            case FOUND:
            case NOT_FOUND:
              term = termsEnum.term();
              break;
            case END:
              term = null;
              break;
            default:
              throw new AssertionError();
          }
          final CharsRefBuilder spare = new CharsRefBuilder();
          for (int skipped = hashTable.size; skipped < offset && term != null && StringHelper.startsWith(term, prefix); ) {
            ft.indexedToReadable(term, spare);
            final String termStr = spare.toString();
            if (!alreadySeen.contains(termStr)) {
              ++skipped;
            }
            term = termsEnum.next();
          }
          for ( ; term != null && StringHelper.startsWith(term, prefix) && (limit < 0 || result.size() < limit); term = termsEnum.next()) {
            ft.indexedToReadable(term, spare);
            final String termStr = spare.toString();
            if (!alreadySeen.contains(termStr)) {
              result.add(termStr, 0);
            }
          }
        }
      }
    } else {
      // sort=index, mincount=0 and we have fewer than limit items
      // => Merge the PQ and the terms dictionary on the fly
      if (!sf.indexed()) {
        throw new IllegalStateException("Cannot use " + FacetParams.FACET_SORT + "=" + FacetParams.FACET_SORT_INDEX
            + " on a field which is not indexed");
      }
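      // Drain the PQ into a map keyed by the readable value, then walk the terms dictionary in
      // index order, emitting each term with its counted value, or 0 if it matched no documents.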
      final Map<String, Integer> counts = new HashMap<>();
      while (pq.size() > 0) {
        final Entry entry = pq.pop();
        final int readerIdx = ReaderUtil.subIndex(entry.docID, leaves);
        final FunctionValues values = vs.getValues(Collections.emptyMap(), leaves.get(readerIdx));
        counts.put(values.strVal(entry.docID - leaves.get(readerIdx).docBase), entry.count);
      }
      final Terms terms = searcher.getSlowAtomicReader().terms(fieldName);
      if (terms != null) {
        final String prefixStr = TrieField.getMainValuePrefix(ft);
        final BytesRef prefix;
        if (prefixStr != null) {
          prefix = new BytesRef(prefixStr);
        } else {
          prefix = new BytesRef();
        }
        final TermsEnum termsEnum = terms.iterator();
        BytesRef term;
        switch (termsEnum.seekCeil(prefix)) {
          case FOUND:
          case NOT_FOUND:
            term = termsEnum.term();
            break;
          case END:
            term = null;
            break;
          default:
            throw new AssertionError();
        }
        final CharsRefBuilder spare = new CharsRefBuilder();
        for (int i = 0; i < offset && term != null && StringHelper.startsWith(term, prefix); ++i) {
          term = termsEnum.next();
        }
        for ( ; term != null && StringHelper.startsWith(term, prefix) && (limit < 0 || result.size() < limit); term = termsEnum.next()) {
          ft.indexedToReadable(term, spare);
          final String termStr = spare.toString();
          Integer count = counts.get(termStr);
          if (count == null) {
            count = 0;
          }
          result.add(termStr, count);
        }
      }
    }
    if (missing) {
      result.add(null, missingCount);
    }
    return result;
  }

  private static NamedList<Integer> getCountsMultiValued(SolrIndexSearcher searcher, DocSet docs, String fieldName,
      int offset, int limit, int mincount, boolean missing, String sort) throws IOException {
    // If facet.mincount=0 with PointFields, the only option is to get the values from DocValues,
    // which is not currently supported. See SOLR-10033
    mincount = Math.max(mincount, 1);
    final SchemaField sf = searcher.getSchema().getField(fieldName);
    final FieldType ft = sf.getType();
    assert sf.multiValued();
    final List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();

    // 1. accumulate
    final HashTable hashTable = new HashTable(false);
    final Iterator<LeafReaderContext> ctxIt = leaves.iterator();
    LeafReaderContext ctx = null;
    SortedNumericDocValues longs = null;
    int missingCount = 0;
    for (DocIterator docsIt = docs.iterator(); docsIt.hasNext(); ) {
      final int doc = docsIt.nextDoc();
      if (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc()) {
        do {
          ctx = ctxIt.next();
        } while (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc());
        assert doc >= ctx.docBase;
        longs = DocValues.getSortedNumeric(ctx.reader(), fieldName);
      }
      int valuesDocID = longs.docID();
      if (valuesDocID < doc - ctx.docBase) {
        valuesDocID = longs.advance(doc - ctx.docBase);
      }
      if (valuesDocID == doc - ctx.docBase) {
        long l = longs.nextValue(); // This document must have at least one value
        hashTable.add(l, 1);
        for (int i = 1; i < longs.docValueCount(); i++) {
          long lnew = longs.nextValue();
          if (lnew > l) { // Skip the value if it's equal to the last one; we don't want to double-count it
            hashTable.add(lnew, 1);
          }
          l = lnew;
        }
      } else {
        ++missingCount;
      }
    }

    // 2. select top-k facet values
    final int pqSize = limit < 0 ? hashTable.size : Math.min(offset + limit, hashTable.size);
    final PriorityQueue<Entry> pq;
    if (FacetParams.FACET_SORT_COUNT.equals(sort) || FacetParams.FACET_SORT_COUNT_LEGACY.equals(sort)) {
      pq = new PriorityQueue<Entry>(pqSize) {
        @Override
        protected boolean lessThan(Entry a, Entry b) {
          if (a.count < b.count || (a.count == b.count && a.bits > b.bits)) {
            return true;
          } else {
            return false;
          }
        }
      };
    } else { // sort=index
      pq = new PriorityQueue<Entry>(pqSize) {
        @Override
        protected boolean lessThan(Entry a, Entry b) {
          return a.bits > b.bits;
        }
      };
    }
    Entry e = null;
    for (int i = 0; i < hashTable.bits.length; ++i) {
      if (hashTable.counts[i] >= mincount) {
        if (e == null) {
          e = new Entry();
        }
        e.bits = hashTable.bits[i];
        e.count = hashTable.counts[i];
        e = pq.insertWithOverflow(e);
      }
    }

    // 3. build the NamedList
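    // Pop the surviving entries past the offset into a deque to restore ranked order, converting
    // each entry's sortable long bits back to the field's readable string value.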
    final NamedList<Integer> result = new NamedList<>(Math.max(pq.size() - offset + 1, 1));
    final Deque<Entry> counts = new ArrayDeque<>(pq.size() - offset);
    while (pq.size() > offset) {
      counts.addFirst(pq.pop());
    }
    for (Entry entry : counts) {
      result.add(bitsToStringValue(ft, entry.bits), entry.count); // TODO: convert to correct value
    }
    // Once facet.mincount=0 is supported we'll need to add logic similar to the SingleValue case,
    // but obtaining values with count 0 from DocValues
    if (missing) {
      result.add(null, missingCount);
    }
    return result;
  }

  private static String bitsToStringValue(FieldType fieldType, long bits) {
    switch (fieldType.getNumberType()) {
      case LONG:
      case INTEGER:
        return String.valueOf(bits);
      case FLOAT:
        return String.valueOf(NumericUtils.sortableIntToFloat((int) bits));
      case DOUBLE:
        return String.valueOf(NumericUtils.sortableLongToDouble(bits));
      case DATE:
        return new Date(bits).toInstant().toString();
      default:
        throw new AssertionError("Unsupported NumberType: " + fieldType.getNumberType());
    }
  }
}