/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.search;

import java.util.Collection;
import java.util.Collections;

import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.RamUsageEstimator;

/**
 * <code>HashDocSet</code> represents an unordered set of Lucene Document Ids
 * using a primitive int hash table. It can be a better choice when there are
 * few docs in the set, because it takes up less memory and is faster to
 * iterate and to take set intersections.
 *
 * @since solr 0.9
 */
public final class HashDocSet extends DocSetBase {

  private static final long BASE_RAM_BYTES_USED =
      RamUsageEstimator.shallowSizeOfInstance(HashDocSet.class)
      + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER;

  /** Default load factor to use for HashDocSets.  We keep track of the inverse
   *  since multiplication is so much faster than division.  The default is
   *  1.0f / 0.75f
   */
  static float DEFAULT_INVERSE_LOAD_FACTOR = 1.0f / 0.75f;

  // public final static int MAX_SIZE = SolrConfig.config.getInt("//HashDocSet/@maxSize",-1);

  // Lucene docs are numbered from 0, so a negative number must be used for
  // missing. (An alternative to having to init the array to EMPTY at the
  // start would be possible, but the explicit fill below is kept for now.)
  private final static int EMPTY = -1;

  private final int[] table;
  private final int size;
  private final int mask;

  public HashDocSet(HashDocSet set) {
    this.table = set.table.clone();
    this.size = set.size;
    this.mask = set.mask;
  }

  /** Create a HashDocSet from a list of *unique* ids */
  public HashDocSet(int[] docs, int offset, int len) {
    this(docs, offset, len, DEFAULT_INVERSE_LOAD_FACTOR);
  }

  /** Create a HashDocSet from a list of *unique* ids */
  public HashDocSet(int[] docs, int offset, int len, float inverseLoadFactor) {
    int tsize = Math.max(BitUtil.nextHighestPowerOfTwo(len), 1);
    if (tsize < len * inverseLoadFactor) {
      tsize <<= 1;
    }

    mask = tsize - 1;

    table = new int[tsize];
    // (for now) better than: Arrays.fill(table, EMPTY);
    // https://issues.apache.org/jira/browse/SOLR-390
    for (int i = tsize - 1; i >= 0; i--) table[i] = EMPTY;

    int end = offset + len;
    for (int i = offset; i < end; i++) {
      put(docs[i]);
    }

    size = len;
  }

  void put(int doc) {
    int s = doc & mask;
    while (table[s] != EMPTY) {
      // Adding an odd number to this power-of-two hash table is
      // guaranteed to do a full traversal, so instead of re-hashing
      // we jump straight to a "linear" traversal.
      // The key is that we provide many different ways to do the
      // traversal (tablesize/2) based on the last hash code (the doc).
      // Rely on loop-invariant code motion to evaluate ((doc>>7)|1) only once;
      // otherwise, we would need to pull the first case out of the loop.
      s = (s + ((doc >> 7) | 1)) & mask;
    }
    table[s] = doc;
  }
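  // Worked example of the sizing and probing above (illustrative numbers,
  // not from the original source): building from len=7 unique ids,
  // nextHighestPowerOfTwo(7) = 8, and 8 < 7 * (1.0f/0.75f) ~= 9.33, so the
  // table doubles to tsize=16 (mask=15), keeping occupancy under 75%.
  // For doc=300, the start slot is 300 & 15 = 12 and the collision step is
  // ((300>>7)|1) = (2|1) = 3; any odd step is coprime with a power-of-two
  // table size, so the probe sequence 12, 15, 2, 5, ... visits all 16 slots
  // exactly once before wrapping.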
  @Override
  public boolean exists(int doc) {
    int s = doc & mask;
    for (;;) {
      int v = table[s];
      if (v == EMPTY) return false;
      if (v == doc) return true;
      // see put() for algorithm details.
      s = (s + ((doc >> 7) | 1)) & mask;
    }
  }

  @Override
  public int size() {
    return size;
  }

  @Override
  public DocIterator iterator() {
    return new DocIterator() {
      int pos = 0;

      { goNext(); }

      @Override
      public boolean hasNext() {
        return pos < table.length;
      }

      @Override
      public Integer next() {
        return nextDoc();
      }

      @Override
      public void remove() {
        // removal is not supported; intentionally a no-op.
      }

      void goNext() {
        while (pos < table.length && table[pos] == EMPTY) pos++;
      }

      // modify to return -1 at end of iteration?
      @Override
      public int nextDoc() {
        int doc = table[pos];
        pos++;
        goNext();
        return doc;
      }

      @Override
      public float score() {
        return 0.0f;
      }
    };
  }

  @Override
  public DocSet intersection(DocSet other) {
    if (other instanceof HashDocSet) {
      // set "a" to the smallest doc set for the most efficient
      // intersection.
      final HashDocSet a = size() <= other.size() ? this : (HashDocSet) other;
      final HashDocSet b = size() <= other.size() ? (HashDocSet) other : this;

      int[] result = new int[a.size()];
      int resultCount = 0;
      for (int i = 0; i < a.table.length; i++) {
        int id = a.table[i];
        if (id >= 0 && b.exists(id)) {
          result[resultCount++] = id;
        }
      }
      return new HashDocSet(result, 0, resultCount);
    } else {
      int[] result = new int[size()];
      int resultCount = 0;
      for (int i = 0; i < table.length; i++) {
        int id = table[i];
        if (id >= 0 && other.exists(id)) {
          result[resultCount++] = id;
        }
      }
      return new HashDocSet(result, 0, resultCount);
    }
  }

  @Override
  public int intersectionSize(DocSet other) {
    if (other instanceof HashDocSet) {
      // set "a" to the smallest doc set for the most efficient
      // intersection.
      final HashDocSet a = size() <= other.size() ? this : (HashDocSet) other;
      final HashDocSet b = size() <= other.size() ? (HashDocSet) other : this;

      int resultCount = 0;
      for (int i = 0; i < a.table.length; i++) {
        int id = a.table[i];
        if (id >= 0 && b.exists(id)) {
          resultCount++;
        }
      }
      return resultCount;
    } else {
      int resultCount = 0;
      for (int i = 0; i < table.length; i++) {
        int id = table[i];
        if (id >= 0 && other.exists(id)) {
          resultCount++;
        }
      }
      return resultCount;
    }
  }

  @Override
  public boolean intersects(DocSet other) {
    if (other instanceof HashDocSet) {
      // set "a" to the smallest doc set for the most efficient
      // intersection.
      final HashDocSet a = size() <= other.size() ? this : (HashDocSet) other;
      final HashDocSet b = size() <= other.size() ? (HashDocSet) other : this;

      for (int i = 0; i < a.table.length; i++) {
        int id = a.table[i];
        if (id >= 0 && b.exists(id)) {
          return true;
        }
      }
      return false;
    } else {
      for (int i = 0; i < table.length; i++) {
        int id = table[i];
        if (id >= 0 && other.exists(id)) {
          return true;
        }
      }
      return false;
    }
  }

  @Override
  public DocSet andNot(DocSet other) {
    int[] result = new int[size()];
    int resultCount = 0;
    for (int i = 0; i < table.length; i++) {
      int id = table[i];
      if (id >= 0 && !other.exists(id)) {
        result[resultCount++] = id;
      }
    }
    return new HashDocSet(result, 0, resultCount);
  }
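  // Illustrative use of the set-algebra methods above (a sketch, not part of
  // the original file); intersection/intersectionSize/intersects all scan
  // the smaller set's table and probe the larger one:
  //
  //   HashDocSet a = new HashDocSet(new int[]{1, 2, 3}, 0, 3);
  //   HashDocSet b = new HashDocSet(new int[]{2, 3, 4, 5}, 0, 4);
  //   DocSet inter = a.intersection(b);   // {2, 3}, built by scanning "a"
  //   int n = a.intersectionSize(b);      // 2
  //   DocSet diff = a.andNot(b);          // {1}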
  @Override
  public DocSet union(DocSet other) {
    if (other instanceof HashDocSet) {
      // set "a" to the smallest doc set
      final HashDocSet a = size() <= other.size() ? this : (HashDocSet) other;
      final HashDocSet b = size() <= other.size() ? (HashDocSet) other : this;

      int[] result = new int[a.size() + b.size()];
      int resultCount = 0;
      // iterate over the largest table first, adding w/o checking.
      for (int i = 0; i < b.table.length; i++) {
        int id = b.table[i];
        if (id >= 0) result[resultCount++] = id;
      }
      // now iterate over the smaller set, adding all docs not already in the
      // larger set.
      for (int i = 0; i < a.table.length; i++) {
        int id = a.table[i];
        if (id >= 0 && !b.exists(id)) result[resultCount++] = id;
      }
      return new HashDocSet(result, 0, resultCount);
    } else {
      return other.union(this);
    }
  }

  @Override
  public HashDocSet clone() {
    return new HashDocSet(this);
  }

  // don't implement andNotSize() and unionSize() on purpose... they are
  // implemented in DocSetBase in terms of intersectionSize().

  @Override
  public long ramBytesUsed() {
    // 4 bytes per int slot; widen before shifting to avoid int overflow for
    // very large tables.
    return BASE_RAM_BYTES_USED + ((long) table.length << 2);
  }

  @Override
  public Collection<Accountable> getChildResources() {
    return Collections.emptyList();
  }
}
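// End-to-end usage sketch (illustrative, not part of the original file):
//
//   int[] ids = {7, 42, 311};                        // unique Lucene doc ids
//   HashDocSet docs = new HashDocSet(ids, 0, ids.length);
//   boolean hit = docs.exists(42);                   // true
//   long bytes = docs.ramBytesUsed();                // header + 4 bytes/slot
//   DocIterator it = docs.iterator();                // unordered iteration
//   while (it.hasNext()) {
//     int doc = it.nextDoc();
//     // ... per-document work ...
//   }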