/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.facet.taxonomy.directory;

import java.io.IOException;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.document.Document;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.LRUHashMap;
import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.CorruptIndexException; // javadocs
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;

/**
 * A {@link TaxonomyReader} which retrieves stored taxonomy information from a
 * {@link Directory}.
 * <P>
 * Reading from the on-disk index on every method call is too slow, so this
 * implementation employs caching: some methods cache recent requests and their
 * results, while other methods prefetch all the data into memory and then
 * provide answers directly from in-memory tables. See the documentation of
 * individual methods for comments on their performance.
 * 
 * @lucene.experimental
 */
public class DirectoryTaxonomyReader extends TaxonomyReader {

  private static final Logger logger = Logger.getLogger(DirectoryTaxonomyReader.class.getName());

  private static final int DEFAULT_CACHE_VALUE = 4000;

  private final DirectoryTaxonomyWriter taxoWriter;
  private final long taxoEpoch; // used in doOpenIfChanged
  private final DirectoryReader indexReader;

  // TODO: test DoubleBarrelLRUCache and consider using it instead
  private LRUHashMap<FacetLabel, Integer> ordinalCache;
  private LRUHashMap<Integer, FacetLabel> categoryCache;

  private volatile TaxonomyIndexArrays taxoArrays;

  /**
   * Called only from {@link #doOpenIfChanged()}. If the taxonomy has been
   * recreated, you should pass {@code null} as the caches and parent/children
   * arrays.
   */
  DirectoryTaxonomyReader(DirectoryReader indexReader, DirectoryTaxonomyWriter taxoWriter,
      LRUHashMap<FacetLabel,Integer> ordinalCache, LRUHashMap<Integer,FacetLabel> categoryCache,
      TaxonomyIndexArrays taxoArrays) throws IOException {
    this.indexReader = indexReader;
    this.taxoWriter = taxoWriter;
    this.taxoEpoch = taxoWriter == null ? -1 : taxoWriter.getTaxonomyEpoch();
    
    // use the same instance of the cache, note the protective code in getOrdinal and getPath
    this.ordinalCache = ordinalCache == null ? new LRUHashMap<FacetLabel,Integer>(DEFAULT_CACHE_VALUE) : ordinalCache;
    this.categoryCache = categoryCache == null ? new LRUHashMap<Integer,FacetLabel>(DEFAULT_CACHE_VALUE) : categoryCache;
    
    this.taxoArrays = taxoArrays != null ? new TaxonomyIndexArrays(indexReader, taxoArrays) : null;
  }
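  // A minimal read-side usage sketch (illustrative only; the variable names,
  // the path and the labels below are hypothetical, not taken from this file).
  // A typical flow opens a reader over the Directory a DirectoryTaxonomyWriter
  // committed to, resolves a label to an ordinal and back, and closes the
  // reader when done:
  //
  //   Directory taxoDir = FSDirectory.open(Paths.get("/path/to/taxo")); // hypothetical path
  //   DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
  //   int ordinal = taxoReader.getOrdinal(new FacetLabel("Author", "Mark Twain"));
  //   if (ordinal != TaxonomyReader.INVALID_ORDINAL) {
  //     FacetLabel label = taxoReader.getPath(ordinal); // round-trips to the same label
  //   }
  //   taxoReader.close();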
  /**
   * Open for reading a taxonomy stored in a given {@link Directory}.
   * 
   * @param directory
   *          The {@link Directory} in which the taxonomy resides.
   * @throws CorruptIndexException
   *           if the taxonomy is corrupt.
   * @throws IOException
   *           if another error occurred.
   */
  public DirectoryTaxonomyReader(Directory directory) throws IOException {
    indexReader = openIndexReader(directory);
    taxoWriter = null;
    taxoEpoch = -1;

    // These are the default cache sizes; they can be configured after
    // construction with the cache's setMaxSize() method
    ordinalCache = new LRUHashMap<>(DEFAULT_CACHE_VALUE);
    categoryCache = new LRUHashMap<>(DEFAULT_CACHE_VALUE);
  }
  
  /**
   * Opens a {@link DirectoryTaxonomyReader} over the given
   * {@link DirectoryTaxonomyWriter} (for NRT).
   * 
   * @param taxoWriter
   *          The {@link DirectoryTaxonomyWriter} from which to obtain newly
   *          added categories, in real-time.
   */
  public DirectoryTaxonomyReader(DirectoryTaxonomyWriter taxoWriter) throws IOException {
    this.taxoWriter = taxoWriter;
    taxoEpoch = taxoWriter.getTaxonomyEpoch();
    indexReader = openIndexReader(taxoWriter.getInternalIndexWriter());
    
    // These are the default cache sizes; they can be configured after
    // construction with the cache's setMaxSize() method
    ordinalCache = new LRUHashMap<>(DEFAULT_CACHE_VALUE);
    categoryCache = new LRUHashMap<>(DEFAULT_CACHE_VALUE);
  }

  private synchronized void initTaxoArrays() throws IOException {
    if (taxoArrays == null) {
      // according to Java Concurrency in Practice, this might perform better on
      // some JVMs, because the array initialization doesn't happen on the
      // volatile member.
      TaxonomyIndexArrays tmpArrays = new TaxonomyIndexArrays(indexReader);
      taxoArrays = tmpArrays;
    }
  }

  @Override
  protected void doClose() throws IOException {
    indexReader.close();
    taxoArrays = null;
    // do not clear() the caches, as they may be used by other DTR instances.
    ordinalCache = null;
    categoryCache = null;
  }

  /**
   * Implements the opening of a new {@link DirectoryTaxonomyReader} instance if
   * the taxonomy has changed.
   * 
   * <p>
   * <b>NOTE:</b> the returned {@link DirectoryTaxonomyReader} shares the
   * ordinal and category caches with this reader. This is not expected to cause
   * any issues, unless both instances are kept alive: the reader guarantees
   * that the two instances cannot affect each other's cache correctness;
   * however, if the cache size is changed through {@link #setCacheSize(int)},
   * it will affect both reader instances.
   */
  @Override
  protected DirectoryTaxonomyReader doOpenIfChanged() throws IOException {
    ensureOpen();
    
    // This works for both NRT and non-NRT readers (i.e. an NRT reader remains NRT).
    final DirectoryReader r2 = DirectoryReader.openIfChanged(indexReader);
    if (r2 == null) {
      return null; // no changes, nothing to do
    }

    // check if the taxonomy was recreated
    boolean success = false;
    try {
      boolean recreated = false;
      if (taxoWriter == null) {
        // not NRT, check epoch from commit data
        String t1 = indexReader.getIndexCommit().getUserData().get(DirectoryTaxonomyWriter.INDEX_EPOCH);
        String t2 = r2.getIndexCommit().getUserData().get(DirectoryTaxonomyWriter.INDEX_EPOCH);
        if (t1 == null) {
          if (t2 != null) {
            recreated = true;
          }
        } else if (!t1.equals(t2)) {
          // t1 != null and t2 must not be null b/c DirTaxoWriter always puts the commit data.
          // it's ok to use String.equals because we require the two epoch values to be the same.
          recreated = true;
        }
      } else {
        // NRT, compare current taxoWriter.epoch() vs the one that was given at construction
        if (taxoEpoch != taxoWriter.getTaxonomyEpoch()) {
          recreated = true;
        }
      }

      final DirectoryTaxonomyReader newtr;
      if (recreated) {
        // if recreated, do not reuse anything from this instance. The information
        // will be lazily computed by the new instance when needed.
        newtr = new DirectoryTaxonomyReader(r2, taxoWriter, null, null, null);
      } else {
        newtr = new DirectoryTaxonomyReader(r2, taxoWriter, ordinalCache, categoryCache, taxoArrays);
      }
      
      success = true;
      return newtr;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(r2);
      }
    }
  }
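  // A sketch of the refresh cycle doOpenIfChanged() supports (illustrative
  // only). Callers go through the static TaxonomyReader.openIfChanged, which
  // delegates to this method and returns null when nothing has changed:
  //
  //   DirectoryTaxonomyReader newReader = TaxonomyReader.openIfChanged(taxoReader);
  //   if (newReader != null) {
  //     taxoReader.close(); // release the old reader once it is no longer in use
  //     taxoReader = newReader;
  //   }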
  /** Open the {@link DirectoryReader} from this {@link Directory}. */
  protected DirectoryReader openIndexReader(Directory directory) throws IOException {
    return DirectoryReader.open(directory);
  }

  /** Open the {@link DirectoryReader} from this {@link IndexWriter}. */
  protected DirectoryReader openIndexReader(IndexWriter writer) throws IOException {
    return DirectoryReader.open(writer);
  }

  /**
   * Expert: returns the underlying {@link DirectoryReader} instance that is
   * used by this {@link TaxonomyReader}.
   */
  DirectoryReader getInternalIndexReader() {
    ensureOpen();
    return indexReader;
  }

  @Override
  public ParallelTaxonomyArrays getParallelTaxonomyArrays() throws IOException {
    ensureOpen();
    if (taxoArrays == null) {
      initTaxoArrays();
    }
    return taxoArrays;
  }

  @Override
  public Map<String, String> getCommitUserData() throws IOException {
    ensureOpen();
    return indexReader.getIndexCommit().getUserData();
  }
  
  @Override
  public int getOrdinal(FacetLabel cp) throws IOException {
    ensureOpen();
    if (cp.length == 0) {
      return ROOT_ORDINAL;
    }

    // First try to find the answer in the LRU cache:
    synchronized (ordinalCache) {
      Integer res = ordinalCache.get(cp);
      if (res != null) {
        if (res.intValue() < indexReader.maxDoc()) {
          // Since the cache is shared with DTR instances allocated from
          // doOpenIfChanged, we need to ensure that the ordinal is one that
          // this DTR instance recognizes.
          return res.intValue();
        } else {
          // if we get here, it means that the category was found in the cache,
          // but is not recognized by this TR instance. Therefore there's no
          // need to continue the search for the path on disk, because we won't
          // find it there either.
          return TaxonomyReader.INVALID_ORDINAL;
        }
      }
    }

    // If we're still here, we have a cache miss. We need to fetch the
    // value from disk, and then also put it in the cache:
    int ret = TaxonomyReader.INVALID_ORDINAL;
    PostingsEnum docs = MultiFields.getTermDocsEnum(indexReader, Consts.FULL,
        new BytesRef(FacetsConfig.pathToString(cp.components, cp.length)), 0);
    if (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      ret = docs.docID();
      // we only store the fact that a category exists, not its nonexistence.
      // This is required because the caches are shared with new DTR instances
      // that are allocated from doOpenIfChanged. Therefore, if we only store
      // information about found categories, we cannot accidentally tell a new
      // generation of DTR that a category does not exist.
      synchronized (ordinalCache) {
        ordinalCache.put(cp, Integer.valueOf(ret));
      }
    }

    return ret;
  }
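  // A sketch of traversing the taxonomy tree via the prefetched arrays from
  // getParallelTaxonomyArrays() above (illustrative only; "ordinal" stands in
  // for any valid category ordinal). children() maps an ordinal to its
  // youngest child and siblings() maps an ordinal to its next older sibling,
  // so iterating a node's children looks like:
  //
  //   ParallelTaxonomyArrays arrays = taxoReader.getParallelTaxonomyArrays();
  //   int[] children = arrays.children();
  //   int[] siblings = arrays.siblings();
  //   for (int child = children[ordinal]; child != TaxonomyReader.INVALID_ORDINAL; child = siblings[child]) {
  //     // process child ...
  //   }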
  @Override
  public FacetLabel getPath(int ordinal) throws IOException {
    ensureOpen();
    
    // Since the cache is shared with DTR instances allocated from
    // doOpenIfChanged, we need to ensure that the ordinal is one that this DTR
    // instance recognizes. Therefore we do this check up front, before we hit
    // the cache.
    if (ordinal < 0 || ordinal >= indexReader.maxDoc()) {
      return null;
    }

    // TODO: can we use an int-based hash impl, such as IntToObjectMap,
    // wrapped as LRU?
    Integer catIDInteger = Integer.valueOf(ordinal);
    synchronized (categoryCache) {
      FacetLabel res = categoryCache.get(catIDInteger);
      if (res != null) {
        return res;
      }
    }
    
    Document doc = indexReader.document(ordinal);
    FacetLabel ret = new FacetLabel(FacetsConfig.stringToPath(doc.get(Consts.FULL)));
    synchronized (categoryCache) {
      categoryCache.put(catIDInteger, ret);
    }
    
    return ret;
  }

  @Override
  public int getSize() {
    ensureOpen();
    return indexReader.numDocs();
  }

  /**
   * setCacheSize controls the maximum allowed size of each of the caches
   * used by {@link #getPath(int)} and {@link #getOrdinal(FacetLabel)}.
   * <P>
   * Currently, if the given size is smaller than the current size of a cache,
   * the cache will not shrink; rather, it remains limited to its current size.
   * @param size the new maximum cache size, in number of entries.
   */
  public void setCacheSize(int size) {
    ensureOpen();
    synchronized (categoryCache) {
      categoryCache.setMaxSize(size);
    }
    synchronized (ordinalCache) {
      ordinalCache.setMaxSize(size);
    }
  }

  /** Returns the ordinal -> label mapping, up to the provided
   *  max ordinal or the number of ordinals, whichever is
   *  smaller. */
  public String toString(int max) {
    ensureOpen();
    StringBuilder sb = new StringBuilder();
    int upperl = Math.min(max, indexReader.maxDoc());
    for (int i = 0; i < upperl; i++) {
      try {
        FacetLabel category = this.getPath(i);
        if (category == null) {
          sb.append(i + ": NULL!! \n");
          continue;
        }
        if (category.length == 0) {
          sb.append(i + ": EMPTY STRING!! \n");
          continue;
        }
        sb.append(i + ": " + category.toString() + "\n");
      } catch (IOException e) {
        if (logger.isLoggable(Level.FINEST)) {
          logger.log(Level.FINEST, e.getMessage(), e);
        }
      }
    }
    return sb.toString();
  }
  
}