/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.facet.taxonomy.directory;

import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.InconsistentTaxonomyException;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.Consts.LoadFullPathOnly;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.collections.LRUHashMap;

/**
 * A {@link TaxonomyReader} which retrieves stored taxonomy information from a
 * {@link Directory}.
 * <P>
 * Reading from the on-disk index on every method call is too slow, so this
 * implementation employs caching: Some methods cache recent requests and their
 * results, while other methods prefetch all the data into memory and then
 * provide answers directly from in-memory tables. See the documentation of
 * individual methods for comments on their performance.
 * 
 * @lucene.experimental
 */
public class DirectoryTaxonomyReader implements TaxonomyReader {
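
  // Usage sketch (not part of the original class; variable names are
  // illustrative): a typical lookup round-trip, assuming the taxonomy in
  // 'dir' was previously written by a DirectoryTaxonomyWriter.
  //
  //   DirectoryTaxonomyReader tr = new DirectoryTaxonomyReader(dir);
  //   try {
  //     int ord = tr.getOrdinal(new CategoryPath("Author", "Mark Twain"));
  //     if (ord != TaxonomyReader.INVALID_ORDINAL) {
  //       CategoryPath path = tr.getPath(ord); // recovers the same path
  //     }
  //   } finally {
  //     tr.close();
  //   }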

  private static final Logger logger = Logger.getLogger(DirectoryTaxonomyReader.class.getName());

  private DirectoryReader indexReader;

  // The following lock is used to allow multiple threads to read from the
  // index concurrently, while having them block during the very short
  // critical moment of refresh() (see comments below). Note, however, that
  // we only read from the index when we don't have the entry in our cache,
  // and the caches are locked separately.
  private ReadWriteLock indexReaderLock = new ReentrantReadWriteLock();

  // The following are the limited-size LRU caches used to cache the latest
  // results from getOrdinal() and getLabel().
  // Because LRUHashMap is not thread-safe, we need to synchronize on this
  // object when using it. Unfortunately, this is not optimal under heavy
  // contention because it means that while one thread is using the cache
  // (reading or modifying) others are blocked from using it - or even
  // starting to do benign things like calculating the hash function. A more
  // efficient approach would be to use a non-locking (as much as possible)
  // concurrent solution, along the lines of
  // java.util.concurrent.ConcurrentHashMap, but with LRU semantics.
  // However, even in the current sub-optimal implementation we do not make
  // the mistake of locking out readers while waiting for disk in a cache
  // miss - below, we do not hold the cache lock while reading missing data
  // from disk.
  private final LRUHashMap<String, Integer> ordinalCache;
  private final LRUHashMap<Integer, String> categoryCache;

  // getParent() needs to be extremely efficient, to the point that we need
  // to fetch all the data in advance into memory, and answer these calls
  // from memory. Currently we use a large integer array, which is
  // initialized when the taxonomy is opened, and potentially enlarged
  // when it is refresh()ed.
  // These arrays are not synchronized. Rather, the reference to the array
  // is volatile, and the only writing operation (ParentArray.refresh())
  // simply creates a new array and replaces the reference. The volatility
  // of the reference ensures the correct atomic replacement and its
  // visibility properties (the content of the array is visible when the
  // new reference is visible).
  private ParentArray parentArray;

  private char delimiter = Consts.DEFAULT_DELIMITER;

  private volatile boolean closed = false;

  // set refCount to 1 at start
  private final AtomicInteger refCount = new AtomicInteger(1);

  /**
   * Open for reading a taxonomy stored in a given {@link Directory}.
   * @param directory
   *    The {@link Directory} in which the taxonomy lives. Note that
   *    the taxonomy is read directly from that directory (not from a
   *    subdirectory of it).
   * @throws CorruptIndexException if the taxonomy is corrupted.
   * @throws IOException if another error occurred.
   */
  public DirectoryTaxonomyReader(Directory directory) throws IOException {
    this.indexReader = openIndexReader(directory);

    // These are the default cache sizes; they can be configured after
    // construction with the cache's setMaxSize() method
    ordinalCache = new LRUHashMap<String, Integer>(4000);
    categoryCache = new LRUHashMap<Integer, String>(4000);

    // TODO (Facet): consider lazily creating the parent array when first
    // asked for, not in the constructor
    parentArray = new ParentArray();
    parentArray.refresh(indexReader);
  }

  protected DirectoryReader openIndexReader(Directory directory) throws IOException {
    return DirectoryReader.open(directory);
  }

  /**
   * @throws AlreadyClosedException if this TaxonomyReader is closed
   */
  protected final void ensureOpen() throws AlreadyClosedException {
    if (getRefCount() <= 0) {
      throw new AlreadyClosedException("this TaxonomyReader is closed");
    }
  }

  /**
   * setCacheSize controls the maximum allowed size of each of the caches
   * used by {@link #getPath(int)} and {@link #getOrdinal(CategoryPath)}.
   * <P>
   * Currently, if the given size is smaller than the current size of
   * a cache, it will not shrink, but rather remain limited to its current
   * size.
   * @param size the new maximum cache size, in number of entries.
   */
  public void setCacheSize(int size) {
    ensureOpen();
    synchronized (categoryCache) {
      categoryCache.setMaxSize(size);
    }
    synchronized (ordinalCache) {
      ordinalCache.setMaxSize(size);
    }
  }
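
  // Sizing sketch (illustrative numbers, not a recommendation): when the
  // hot set of categories is larger than the default of 4000 entries per
  // cache, raising the limit avoids repeated disk lookups:
  //
  //   DirectoryTaxonomyReader tr = new DirectoryTaxonomyReader(dir);
  //   tr.setCacheSize(50000); // applies to both the ordinal and category caches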

  /**
   * setDelimiter changes the character that the taxonomy uses in its
   * internal storage as a delimiter between category components. Do not
   * use this method unless you really know what you are doing.
   * <P>
   * If you do use this method, make sure you call it before any other
   * methods that actually query the taxonomy. Moreover, make sure you
   * always pass the same delimiter for all LuceneTaxonomyWriter and
   * LuceneTaxonomyReader objects you create.
   */
  public void setDelimiter(char delimiter) {
    ensureOpen();
    this.delimiter = delimiter;
  }

  public int getOrdinal(CategoryPath categoryPath) throws IOException {
    ensureOpen();
    if (categoryPath.length() == 0) {
      return ROOT_ORDINAL;
    }
    String path = categoryPath.toString(delimiter);

    // First try to find the answer in the LRU cache:
    synchronized (ordinalCache) {
      Integer res = ordinalCache.get(path);
      if (res != null) {
        return res.intValue();
      }
    }

    // If we're still here, we have a cache miss. We need to fetch the
    // value from disk, and then also put it in the cache:
    int ret = TaxonomyReader.INVALID_ORDINAL;
    try {
      indexReaderLock.readLock().lock();
      // TODO (Facet): avoid Multi*?
      Bits liveDocs = MultiFields.getLiveDocs(indexReader);
      DocsEnum docs = MultiFields.getTermDocsEnum(indexReader, liveDocs, Consts.FULL,
          new BytesRef(path), 0);
      if (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        ret = docs.docID();
      }
    } finally {
      indexReaderLock.readLock().unlock();
    }

    // Put the new value in the cache. Note that it is possible that while
    // we were doing the above fetching (without the cache locked), some
    // other thread already added the same category to the cache. We do
    // not care about this possibility, as LRUCache replaces previous values
    // of the same keys (it doesn't store duplicates).
    synchronized (ordinalCache) {
      // GB: new Integer(int) creates a new object each and every time.
      // Integer.valueOf(int) might not (see its JavaDoc).
      ordinalCache.put(path, Integer.valueOf(ret));
    }

    return ret;
  }

  public CategoryPath getPath(int ordinal) throws IOException {
    ensureOpen();
    // TODO (Facet): Currently, the LRU cache we use (getCategoryCache) holds
    // strings with delimiters, not CategoryPath objects, so even if
    // we have a cache hit, we need to process the string and build a new
    // CategoryPath object every time. What is preventing us from putting
    // the actual CategoryPath object in the cache is the fact that these
    // objects are mutable. So we should create an immutable (read-only)
    // interface that CategoryPath implements, and this method should
    // return this interface, not the writable CategoryPath.
    String label = getLabel(ordinal);
    if (label == null) {
      return null;
    }
    return new CategoryPath(label, delimiter);
  }

  public boolean getPath(int ordinal, CategoryPath result) throws IOException {
    ensureOpen();
    String label = getLabel(ordinal);
    if (label == null) {
      return false;
    }
    result.clear();
    result.add(label, delimiter);
    return true;
  }
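
  // Allocation-avoidance sketch: the two-argument getPath() lets a caller
  // reuse a single CategoryPath instance across many lookups instead of
  // allocating one per call ('tr' is a hypothetical reader instance):
  //
  //   CategoryPath reusable = new CategoryPath();
  //   for (int ord = 0; ord < tr.getSize(); ord++) {
  //     if (tr.getPath(ord, reusable)) {
  //       // ... consume 'reusable' before the next iteration overwrites it
  //     }
  //   }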

  private String getLabel(int catID) throws IOException {
    ensureOpen();
    // First try to find the answer in the LRU cache. It is very
    // unfortunate that we need to allocate an Integer object here -
    // it would have been better if we used a hash table specifically
    // designed for int keys...
    // GB: new Integer(int) creates a new object each and every time.
    // Integer.valueOf(int) might not (see its JavaDoc).
    Integer catIDInteger = Integer.valueOf(catID);

    synchronized (categoryCache) {
      String res = categoryCache.get(catIDInteger);
      if (res != null) {
        return res;
      }
    }

    // If we're still here, we have a cache miss. We need to fetch the
    // value from disk, and then also put it in the cache:
    String ret;
    try {
      indexReaderLock.readLock().lock();
      // The taxonomy API dictates that if we get an invalid category
      // ID, we should return null. If we don't check this here, we
      // can get some sort of exception from the document() call below.
      // NOTE: Currently, we *do not* cache this return value; there
      // isn't much point in doing so, because checking the validity of
      // the docid doesn't require disk access - just a comparison with
      // the number indexReader.maxDoc().
      if (catID < 0 || catID >= indexReader.maxDoc()) {
        return null;
      }
      final LoadFullPathOnly loader = new LoadFullPathOnly();
      indexReader.document(catID, loader);
      ret = loader.getFullPath();
    } finally {
      indexReaderLock.readLock().unlock();
    }

    // Put the new value in the cache. Note that it is possible that while
    // we were doing the above fetching (without the cache locked), some
    // other thread already added the same category to the cache. We do
    // not care about this possibility, as LRUCache replaces previous
    // values of the same keys (it doesn't store duplicates).
    synchronized (categoryCache) {
      categoryCache.put(catIDInteger, ret);
    }

    return ret;
  }

  public int getParent(int ordinal) {
    ensureOpen();
    // Note how we don't need to hold the read lock to do the following,
    // because the array reference is volatile, ensuring the correct
    // visibility and ordering: if we get the new reference, the new
    // data is also visible to this thread.
    return getParentArray()[ordinal];
  }

  /**
   * getParentArray() returns an int array of size getSize() listing the
   * ordinal of the parent category of each category in the taxonomy.
   * <P>
   * The caller can hold on to the array it got indefinitely - it is
   * guaranteed that no one else will modify it. The other side of the
   * same coin is that the caller must treat the array it got as read-only
   * and <B>not modify it</B>, because other callers might have gotten the
   * same array too, and getParent() calls are also answered from the
   * same array.
   * <P>
   * The getParentArray() call is extremely efficient, merely returning
   * a reference to an array that already exists. For a caller that plans
   * to call getParent() for many categories, using getParentArray() and
   * the array it returns is a somewhat faster approach because it avoids
   * the overhead of method calls and volatile dereferencing.
   * <P>
   * If you use getParentArray() instead of getParent(), remember that
   * the array you got is (naturally) not modified after a refresh(),
   * so you should always call getParentArray() again after a refresh().
   */
  public int[] getParentArray() {
    ensureOpen();
    // Note how we don't need to hold the read lock to do the following,
    // because the array reference is volatile, ensuring the correct
    // visibility and ordering: if we get the new reference, the new
    // data is also visible to this thread.
    return parentArray.getArray();
  }
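
  // Traversal sketch: computing the depth of a valid ordinal 'ord' by
  // walking the parent array up to the root (per the TaxonomyReader
  // contract, the root's parent is INVALID_ORDINAL; 'tr' is hypothetical):
  //
  //   int[] parents = tr.getParentArray();
  //   int depth = 0;
  //   for (int p = parents[ord]; p != TaxonomyReader.INVALID_ORDINAL; p = parents[p]) {
  //     depth++;
  //   }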

  // Note that refresh() is synchronized (it is the only synchronized
  // method in this class) to ensure that it never gets called concurrently
  // with itself.
  public synchronized boolean refresh() throws IOException, InconsistentTaxonomyException {
    ensureOpen();
    /*
     * Since refresh() can be a lengthy operation, it is very important that we
     * avoid locking out all readers for its duration. This is why we don't hold
     * the indexReaderLock write lock for the entire duration of this method. In
     * fact, it is enough to hold it only during a single assignment! Other
     * comments in this method will explain this.
     */

    // Note that the lengthy operation DirectoryReader.openIfChanged() does
    // not modify the reader, so we can do it without holding a lock. We can
    // safely read indexReader without holding the write lock, because
    // no other thread can be writing at this time (this method is the
    // only possible writer, and it is "synchronized" to avoid this case).
    DirectoryReader r2 = DirectoryReader.openIfChanged(indexReader);
    if (r2 == null) {
      return false; // no changes, nothing to do
    }

    // validate that a refresh is valid at this point, i.e. that the taxonomy
    // was not recreated since this reader was last opened or refreshed.
    String t1 = indexReader.getIndexCommit().getUserData().get(DirectoryTaxonomyWriter.INDEX_CREATE_TIME);
    String t2 = r2.getIndexCommit().getUserData().get(DirectoryTaxonomyWriter.INDEX_CREATE_TIME);
    if (t1 == null) {
      if (t2 != null) {
        r2.close();
        throw new InconsistentTaxonomyException("Taxonomy was recreated at: " + t2);
      }
    } else if (!t1.equals(t2)) {
      r2.close();
      throw new InconsistentTaxonomyException("Taxonomy was recreated at: " + t2 + " != " + t1);
    }

    IndexReader oldreader = indexReader;
    // we can close the old reader, but need to synchronize this
    // so that we don't close it while another routine is reading from it.
    indexReaderLock.writeLock().lock();
    indexReader = r2;
    indexReaderLock.writeLock().unlock();
    // We can close the old reader, but need to be certain that we
    // don't close it while another method is reading from it.
    // Luckily, we can be certain of that even without putting the
    // oldreader.close() in the locked section. The reason is that
    // after lock() succeeded above, we know that all existing readers
    // had finished (this is what a read-write lock ensures). New
    // readers, starting after the unlock() we just did, already got
    // the new indexReader we set above. So nobody can possibly be
    // using the old indexReader, and we can close it:
    oldreader.close();

    // We prefetch some of the arrays to make requests much faster.
    // Let's refresh these prefetched arrays; this refresh is made more
    // efficient by assuming that it is enough to read the values for new
    // categories (old categories could not have been changed or deleted).
    // Note that this is done without the write lock being held,
    // which means that it is possible that during a refresh(), a
    // reader will have some methods (like getOrdinal and getPath)
    // return fresh information, while getParent()
    // (only to be prefetched now) still returns older information.
    // We consider this to be acceptable. The important thing,
    // however, is that ParentArray.refresh() itself writes to
    // the arrays in a correct manner (see discussion there).
    parentArray.refresh(indexReader);

    // Remove any INVALID_ORDINAL values from the ordinal cache,
    // because it is possible those are now answered by the new data!
    // (We synchronize on the cache, since getOrdinal() may be reading or
    // writing it concurrently and LRUHashMap is not thread-safe.)
    synchronized (ordinalCache) {
      Iterator<Entry<String, Integer>> i = ordinalCache.entrySet().iterator();
      while (i.hasNext()) {
        Entry<String, Integer> e = i.next();
        if (e.getValue().intValue() == INVALID_ORDINAL) {
          i.remove();
        }
      }
    }

    return true;
  }
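
  // Caller-side sketch (illustrative; 'taxoReader' and 'dir' are
  // hypothetical): a typical periodic refresh, falling back to reopening
  // from scratch if the taxonomy was recreated:
  //
  //   try {
  //     if (taxoReader.refresh()) {
  //       // re-fetch getParentArray()/getChildrenArrays() results here
  //     }
  //   } catch (InconsistentTaxonomyException e) {
  //     taxoReader.close();
  //     taxoReader = new DirectoryTaxonomyReader(dir);
  //   }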

  public void close() throws IOException {
    if (!closed) {
      synchronized (this) {
        if (!closed) {
          decRef();
          closed = true;
        }
      }
    }
  }

  /** Do the actual closing, free up resources */
  private void doClose() throws IOException {
    indexReader.close();
    closed = true;

    parentArray = null;
    childrenArrays = null;
    categoryCache.clear();
    ordinalCache.clear();
  }

  public int getSize() {
    ensureOpen();
    indexReaderLock.readLock().lock();
    try {
      return indexReader.numDocs();
    } finally {
      indexReaderLock.readLock().unlock();
    }
  }

  public Map<String, String> getCommitUserData() throws IOException {
    ensureOpen();
    return indexReader.getIndexCommit().getUserData();
  }

  private ChildrenArrays childrenArrays;
  Object childrenArraysRebuild = new Object();

  public ChildrenArrays getChildrenArrays() {
    ensureOpen();
    // Check if the taxonomy grew since we built the array, and if it
    // did, create new (and larger) arrays and fill them as required.
    // We do all this under a lock, to prevent two concurrent calls from
    // needlessly doing the same array building at the same time.
    synchronized (childrenArraysRebuild) {
      int num = getSize();
      int first;
      if (childrenArrays == null) {
        first = 0;
      } else {
        first = childrenArrays.getYoungestChildArray().length;
      }
      // If the taxonomy hasn't grown, we can return the existing object
      // immediately
      if (first == num) {
        return childrenArrays;
      }
      // Otherwise, build new arrays for a new ChildrenArrays object.
      // These arrays start with an enlarged copy of the previous arrays,
      // and then are modified to take into account the new categories:
      int[] newYoungestChildArray = new int[num];
      int[] newOlderSiblingArray = new int[num];
      // In Java 6, we could just do Arrays.copyOf()...
      if (childrenArrays != null) {
        System.arraycopy(childrenArrays.getYoungestChildArray(), 0,
            newYoungestChildArray, 0, childrenArrays.getYoungestChildArray().length);
        System.arraycopy(childrenArrays.getOlderSiblingArray(), 0,
            newOlderSiblingArray, 0, childrenArrays.getOlderSiblingArray().length);
      }
      int[] parents = getParentArray();
      for (int i = first; i < num; i++) {
        newYoungestChildArray[i] = INVALID_ORDINAL;
      }
      // In the loop below we can ignore the root category (0) because
      // it has no parent
      if (first == 0) {
        first = 1;
        newOlderSiblingArray[0] = INVALID_ORDINAL;
      }
      for (int i = first; i < num; i++) {
        // Note that parents[i] is always < i, so the right-hand side of
        // the following line is already set when we get here.
        newOlderSiblingArray[i] = newYoungestChildArray[parents[i]];
        newYoungestChildArray[parents[i]] = i;
      }
      // Finally switch to the new arrays
      childrenArrays = new ChildrenArraysImpl(newYoungestChildArray, newOlderSiblingArray);
      return childrenArrays;
    }
  }
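
  // Iteration sketch: enumerating the direct children of an ordinal 'ord'
  // using the youngest-child / older-sibling encoding built above; children
  // come out from the most recently added to the oldest ('tr' hypothetical):
  //
  //   ChildrenArrays ca = tr.getChildrenArrays();
  //   int[] youngest = ca.getYoungestChildArray();
  //   int[] olderSibling = ca.getOlderSiblingArray();
  //   for (int child = youngest[ord]; child != TaxonomyReader.INVALID_ORDINAL;
  //        child = olderSibling[child]) {
  //     // ... process 'child' ...
  //   }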

  public String toString(int max) {
    ensureOpen();
    StringBuilder sb = new StringBuilder();
    int upperl = Math.min(max, this.indexReader.maxDoc());
    for (int i = 0; i < upperl; i++) {
      try {
        CategoryPath category = this.getPath(i);
        if (category == null) {
          sb.append(i + ": NULL!!\n");
          continue;
        }
        if (category.length() == 0) {
          sb.append(i + ": EMPTY STRING!!\n");
          continue;
        }
        sb.append(i + ": " + category.toString() + "\n");
      } catch (IOException e) {
        if (logger.isLoggable(Level.FINEST)) {
          logger.log(Level.FINEST, e.getMessage(), e);
        }
      }
    }
    return sb.toString();
  }

  private static final class ChildrenArraysImpl implements ChildrenArrays {
    private int[] youngestChildArray, olderSiblingArray;

    public ChildrenArraysImpl(int[] youngestChildArray, int[] olderSiblingArray) {
      this.youngestChildArray = youngestChildArray;
      this.olderSiblingArray = olderSiblingArray;
    }

    public int[] getOlderSiblingArray() {
      return olderSiblingArray;
    }

    public int[] getYoungestChildArray() {
      return youngestChildArray;
    }
  }

  /**
   * Expert: This method is only for expert use.
   * Note also that any call to refresh() will invalidate the returned reader,
   * so the caller needs to take care of appropriate locking.
   * 
   * @return lucene indexReader
   */
  DirectoryReader getInternalIndexReader() {
    ensureOpen();
    return this.indexReader;
  }

  /**
   * Expert: decreases the refCount of this TaxonomyReader instance. If the
   * refCount drops to 0, then this reader is closed.
   */
  public void decRef() throws IOException {
    ensureOpen();
    final int rc = refCount.decrementAndGet();
    if (rc == 0) {
      boolean success = false;
      try {
        doClose();
        success = true;
      } finally {
        if (!success) {
          // Put reference back on failure
          refCount.incrementAndGet();
        }
      }
    } else if (rc < 0) {
      throw new IllegalStateException("too many decRef calls: refCount is " + rc + " after decrement");
    }
  }

  /** Expert: returns the current refCount for this taxonomy reader */
  public int getRefCount() {
    return refCount.get();
  }

  /**
   * Expert: increments the refCount of this TaxonomyReader instance.
   * RefCounts are used to determine when a taxonomy reader can be closed
   * safely, i.e. as soon as there are no more references.
   * Be sure to always call a corresponding decRef(), in a finally clause;
   * otherwise the reader may never be closed.
   */
  public void incRef() {
    ensureOpen();
    refCount.incrementAndGet();
  }
}
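
// Ref-counting sketch (illustrative, not part of the class): sharing one
// reader instance across threads, where each user brackets its work with
// incRef()/decRef() so the reader is only closed once nobody holds it:
//
//   reader.incRef();
//   try {
//     // ... use the reader ...
//   } finally {
//     reader.decRef(); // when the count reaches zero the reader is closed
//   }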