/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.exoplatform.services.jcr.impl.core.query.lucene; import java.io.FileNotFoundException; import java.util.Arrays; import java.util.concurrent.ConcurrentHashMap; import org.apache.commons.collections.map.LRUMap; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.FilterIndexReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.TermEnum; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.text.NumberFormat; import java.util.BitSet; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.Map; /** * Implements an <code>IndexReader</code> that maintains caches to resolve * {@link #getParent(int, BitSet)} calls efficiently. * <br> */ class CachingIndexReader extends FilterIndexReader { /** * The logger instance for this class. 
*/
   private static final Logger log = LoggerFactory.getLogger("exo.jcr.component.core.CachingIndexReader");

   /**
    * The current value of the global creation tick counter.
    * Only read/written while holding the class lock in {@link #getNextCreationTick()}.
    */
   private static long currentTick;

   /**
    * BitSet where bits that correspond to document numbers are set for
    * sharable nodes.
    */
   private final BitSet shareableNodes;

   /**
    * Cache of nodes parent relation. If an entry in the array is &gt;= 0,
    * that means the node with the document number = array-index has the
    * node with the value at that position as parent. A value of -1 means
    * the parent is unknown or not located in this index segment.
    */
   private final int[] inSegmentParents;

   /**
    * Cache of nodes parent relation that point to a foreign index segment.
    * Keyed by document number; concurrent because readers and the
    * {@link CacheInitializer} may populate it at the same time.
    */
   private final Map<Integer, DocId> foreignParentDocIds = new ConcurrentHashMap<Integer, DocId>();

   /**
    * Initializes the {@link #inSegmentParents} and {@link #foreignParentDocIds}
    * caches.
    */
   private CacheInitializer cacheInitializer;

   /**
    * Tick when this index reader was created. Used to detect stale entries
    * in the shared {@link DocNumberCache}.
    */
   private final long creationTick = getNextCreationTick();

   /**
    * Document number cache if available. May be <code>null</code>.
    */
   private final DocNumberCache cache;

   /**
    * Maps document number to node UUID. Bounded LRU map, see the
    * constructor for the sizing policy.
    */
   private final Map<Integer, String> docNumber2uuid;

   /**
    * A cache of TermDocs that are regularly read from the index.
    */
   private final TermDocsCache termDocsCache;

   /**
    * Creates a new <code>CachingIndexReader</code> based on
    * <code>delegate</code>
    *
    * @param delegatee the base <code>IndexReader</code>.
    * @param cache a document number cache, or <code>null</code> if not
    *        available to this reader.
    * @param initCache if the {@link #inSegmentParents} cache should be initialized
    *        when this index reader is constructed.
    * @throws IOException if an error occurs while reading from the index.
*/ @SuppressWarnings("unchecked") CachingIndexReader(IndexReader delegatee, DocNumberCache cache, boolean initCache) throws IOException { super(delegatee); this.cache = cache; this.inSegmentParents = new int[delegatee.maxDoc()]; Arrays.fill(this.inSegmentParents, -1); this.shareableNodes = initShareableNodes(delegatee); this.cacheInitializer = new CacheInitializer(delegatee); if (initCache) { cacheInitializer.run(); } // limit cache to 1% of maxDoc(), but at least 10. this.docNumber2uuid = (Map<Integer, String>)Collections.synchronizedMap(new LRUMap(Math.max(10, delegatee.maxDoc() / 100))); this.termDocsCache = new TermDocsCache(delegatee, FieldNames.PROPERTIES); } private BitSet initShareableNodes(IndexReader delegatee) throws IOException { BitSet shareableNodes = new BitSet(); TermDocs tDocs = delegatee.termDocs(new Term(FieldNames.SHAREABLE_NODE, "")); try { while (tDocs.next()) { shareableNodes.set(tDocs.doc()); } } finally { tDocs.close(); } return shareableNodes; } /** * Returns the <code>DocId</code> of the parent of <code>n</code> or * {@link DocId#NULL} if <code>n</code> does not have a parent * (<code>n</code> is the root node). * * @param n the document number. * @param deleted the documents that should be regarded as deleted. * @return the <code>DocId</code> of <code>n</code>'s parent. * @throws IOException if an error occurs while reading from the index. 
*/
   DocId getParent(int n, BitSet deleted) throws IOException
   {
      DocId parent;
      // tracks whether a cached (but possibly invalid) parent entry existed
      boolean existing = false;
      int parentDocNum = inSegmentParents[n];
      if (parentDocNum != -1)
      {
         // parent is in this segment, cached as a plain document number
         parent = DocId.create(parentDocNum);
      }
      else
      {
         // fall back to the foreign-segment cache (may be null)
         parent = foreignParentDocIds.get(n);
      }

      if (parent != null)
      {
         existing = true;

         // check if valid and reset if necessary
         if (!parent.isValid(deleted))
         {
            if (log.isDebugEnabled())
            {
               log.debug(parent + " not valid anymore.");
            }
            parent = null;
         }
      }

      if (parent == null)
      {
         // -1 means: no in-segment parent doc number resolved (yet)
         int plainDocId = -1;
         Document doc = document(n, FieldSelectors.UUID_AND_PARENT);
         String[] parentUUIDs = doc.getValues(FieldNames.PARENT);
         if (parentUUIDs.length == 0 || parentUUIDs[0].length() == 0)
         {
            // root node has no parent
            parent = DocId.NULL;
         }
         else
         {
            if (shareableNodes.get(n))
            {
               // shareable nodes may have multiple parents
               parent = DocId.create(parentUUIDs);
            }
            else
            {
               if (!existing)
               {
                  // first lookup for this doc: try to resolve the parent UUID
                  // to a document number within this segment
                  Term id = new Term(FieldNames.UUID, parentUUIDs[0]);
                  TermDocs docs = termDocs(id);
                  try
                  {
                     while (docs.next())
                     {
                        if (!deleted.get(docs.doc()))
                        {
                           plainDocId = docs.doc();
                           parent = DocId.create(plainDocId);
                           break;
                        }
                     }
                  }
                  finally
                  {
                     docs.close();
                  }
               }

               // if still null, then parent is not in this index, or existing
               // DocId was invalid. thus, only allowed to create DocId from uuid
               if (parent == null)
               {
                  parent = DocId.create(parentUUIDs[0]);
               }
            }
         }

         // finally put to cache
         if (plainDocId != -1)
         {
            // plain document number: cache in the in-segment array
            inSegmentParents[n] = plainDocId;
         }
         else
         {
            // UUID-based DocId: cache in the foreign-parent map
            foreignParentDocIds.put(n, parent);
            if (existing)
            {
               // there was an existing parent reference in
               // inSegmentParents, which was invalid and is replaced;
               // mark the in-segment entry as unknown
               inSegmentParents[n] = -1;
            }
         }
      }
      return parent;
   }

   /**
    * Returns the tick value when this reader was created.
    *
    * @return the creation tick for this reader.
*/ public long getCreationTick() { return creationTick; } //--------------------< FilterIndexReader overwrites >---------------------- /** * Uses the {@link #docNumber2uuid} cache for document lookups that are only * interested in the {@link FieldSelectors#UUID}. * * @param n the document number. * @param fieldSelector the field selector. * @return the document. * @throws CorruptIndexException if the index is corrupt. * @throws IOException if an error occurs while reading from the index. */ public Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException { if (fieldSelector == FieldSelectors.UUID) { Integer docNum = new Integer(n); Document doc; String uuid = docNumber2uuid.get(docNum); if (uuid == null) { doc = super.document(n, fieldSelector); uuid = doc.get(FieldNames.UUID); docNumber2uuid.put(docNum, uuid); } else { doc = new Document(); doc.add(new Field(FieldNames.UUID, uuid.toString(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); } return doc; } else { return super.document(n, fieldSelector); } } /** * If the field of <code>term</code> is {@link FieldNames#UUID} this * <code>CachingIndexReader</code> returns a <code>TermDocs</code> instance * with a cached document id. If <code>term</code> has any other field * the call is delegated to the base <code>IndexReader</code>.<br> * If <code>term</code> is for a {@link FieldNames#UUID} field and this * <code>CachingIndexReader</code> does not have such a document, * {@link EmptyTermDocs#INSTANCE} is returned. * * @param term the term to start the <code>TermDocs</code> enumeration. * @return a TermDocs instance. * @throws IOException if an error occurs while reading from the index. 
*/ public TermDocs termDocs(Term term) throws IOException { if (term!=null && term.field() == FieldNames.UUID) { // check cache if we have one if (cache != null) { DocNumberCache.Entry e = cache.get(term.text()); if (e != null) { // check if valid // the cache may contain entries from a different reader // with the same uuid. that happens when a node is updated // and is reindexed. the node 'travels' from an older index // to a newer one. the cache will still contain a cache // entry from the old until it is overwritten by the // newer index. if (e.creationTick == creationTick && !isDeleted(e.doc)) { return new SingleTermDocs(e.doc); } } // not in cache or invalid TermDocs docs = in.termDocs(term); try { if (docs.next()) { // put to cache cache.put(term.text(), this, docs.doc()); // and return return new SingleTermDocs(docs.doc()); } else { return EmptyTermDocs.INSTANCE; } } finally { docs.close(); } } } return termDocsCache.termDocs(term); } /** * {@inheritDoc} */ protected void doClose() throws IOException { try { cacheInitializer.waitUntilStopped(); } catch (InterruptedException e) { // ignore } super.doClose(); } //----------------------< internal >---------------------------------------- /** * Returns the next creation tick value. * * @return the next creation tick value. */ private static long getNextCreationTick() { synchronized (CachingIndexReader.class) { return currentTick++; } } /** * Initializes the {@link CachingIndexReader#inSegmentParents} cache. */ private final class CacheInitializer implements Runnable { /** * From where to read. */ private final IndexReader reader; /** * Set to <code>true</code> while this initializer does its work. */ private boolean running = false; /** * Set to <code>true</code> when this index reader is about to be closed. */ private volatile boolean stopRequested = false; /** * The {@link #inSegmentParents} is persisted using this filename. 
*/
      private static final String FILE_CACHE_NAME_ARRAY = "cache.inSegmentParents";

      /**
       * Creates a new initializer with the given <code>reader</code>.
       *
       * @param reader an index reader.
       */
      public CacheInitializer(IndexReader reader)
      {
         this.reader = reader;
      }

      /**
       * Initializes the cache. First tries to load a previously persisted
       * cache file; if none is available, scans the repository index.
       * Cooperates with {@link #waitUntilStopped()} via the
       * {@code running}/{@code stopRequested} flags.
       */
      public void run()
      {
         synchronized (this)
         {
            running = true;
         }
         try
         {
            if (stopRequested)
            {
               // immediately return when stop is requested
               return;
            }
            boolean initCacheFromFile = loadCacheFromFile();
            if (!initCacheFromFile)
            {
               // file-based cache is not available, load from the
               // repository
               log.debug("persisted cache is not available, will load directly from the repository.");
               initializeParents(reader);
            }
         }
         catch (IOException e)
         {
            // only log warn message during regular operation; an IOException
            // while stopping is expected and not worth a warning
            if (!stopRequested)
            {
               log.warn("Error initializing parents cache.", e);
            }
         }
         finally
         {
            synchronized (this)
            {
               // wake up any thread blocked in waitUntilStopped()
               running = false;
               notifyAll();
            }
         }
      }

      /**
       * Waits until this cache initializer is stopped. Sets the stop flag
       * first so a running initialization bails out at its next checkpoint.
       *
       * @throws InterruptedException if the current thread is interrupted.
       */
      public void waitUntilStopped() throws InterruptedException
      {
         stopRequested = true;
         synchronized (this)
         {
            while (running)
            {
               wait();
            }
         }
      }

      /**
       * Initializes the {@link CachingIndexReader#inSegmentParents} <code>DocId</code>
       * array.
       *
       * @param reader the underlying index reader.
       * @throws IOException if an error occurs while reading from the index.
*/
      private void initializeParents(IndexReader reader) throws IOException
      {
         long time = 0;
         if (log.isDebugEnabled())
         {
            time = System.currentTimeMillis();
         }
         // keyed first by Integer doc number, later re-keyed by String uuid
         // (second pass below), hence the Object key type
         final Map<Object, NodeInfo> docs = new HashMap<Object, NodeInfo>();
         // read UUIDs
         collectTermDocs(reader, new Term(FieldNames.UUID, ""), new TermDocsCollector()
         {
            public void collect(Term term, TermDocs tDocs) throws IOException
            {
               String uuid = term.text();
               while (tDocs.next())
               {
                  int doc = tDocs.doc();
                  // skip sharable nodes
                  if (!shareableNodes.get(doc))
                  {
                     NodeInfo info = new NodeInfo(doc, uuid);
                     docs.put(new Integer(doc), info);
                  }
               }
            }
         });
         // read PARENTs
         collectTermDocs(reader, new Term(FieldNames.PARENT, "0"), new TermDocsCollector()
         {
            public void collect(Term term, TermDocs tDocs) throws IOException
            {
               String uuid = term.text();
               while (tDocs.next())
               {
                  Integer docId = new Integer(tDocs.doc());
                  NodeInfo info = docs.get(docId);
                  if (info == null)
                  {
                     // sharable node, see above
                  }
                  else
                  {
                     // re-key the entry by its uuid so parents can be looked
                     // up by uuid in the resolution loop below
                     info.parent = uuid;
                     docs.remove(docId);
                     docs.put(info.uuid, info);
                  }
               }
            }
         });
         if (stopRequested)
         {
            return;
         }
         // counts parents living in other index segments (double: used as a
         // ratio in the debug output at the end)
         double foreignParents = 0;
         Iterator<NodeInfo> it = docs.values().iterator();
         while (it.hasNext())
         {
            NodeInfo info = it.next();
            NodeInfo parent = docs.get(info.parent);
            if (parent != null)
            {
               // parent resolved within this segment
               inSegmentParents[info.docId] = parent.docId;
            }
            else if (info.parent != null)
            {
               // parent exists but lives in a foreign index segment
               foreignParents++;
               foreignParentDocIds.put(info.docId, DocId.create(info.parent));
            }
            else if (shareableNodes.get(info.docId))
            {
               // NOTE(review): shareable nodes are skipped in the UUID pass
               // above, so this branch looks unreachable — kept for safety
               Document doc = reader.document(info.docId, FieldSelectors.UUID_AND_PARENT);
               foreignParentDocIds.put(info.docId, DocId.create(doc.getValues(FieldNames.PARENT)));
            }
            else
            {
               // no parent -> root node
               foreignParentDocIds.put(info.docId, DocId.NULL);
            }
         }
         // persist cache to file for faster init next time
         saveCacheToFile();
         if (log.isDebugEnabled())
         {
            NumberFormat nf = NumberFormat.getPercentInstance();
            nf.setMaximumFractionDigits(1);
            time = System.currentTimeMillis() - time;
            if (inSegmentParents.length > 0)
            {
               foreignParents /= inSegmentParents.length;
            }
            log.debug("initialized {} DocIds in {} ms, {} foreign parents",
               new Object[]{new Integer(inSegmentParents.length), new Long(time), nf.format(foreignParents)});
         }
      }

      /**
       * Collects term docs for a given start term. All terms with the same
       * field as <code>start</code> are enumerated.
       *
       * @param reader the index reader.
       * @param start the term where to start the term enumeration.
       * @param collector collects the term docs for each term.
       * @throws IOException if an error occurs while reading from the index.
       */
      private void collectTermDocs(IndexReader reader, Term start, TermDocsCollector collector) throws IOException
      {
         TermDocs tDocs = reader.termDocs();
         try
         {
            TermEnum terms = reader.terms(start);
            try
            {
               int count = 0;
               do
               {
                  Term t = terms.term();
                  // reference comparison on field name: Lucene interns fields
                  if (t != null && t.field() == start.field())
                  {
                     tDocs.seek(terms);
                     collector.collect(t, tDocs);
                  }
                  else
                  {
                     break;
                  }
                  // once in a while check if we should quit
                  if (++count % 10000 == 0)
                  {
                     if (stopRequested)
                     {
                        break;
                     }
                  }
               }
               while (terms.next());
            }
            finally
            {
               terms.close();
            }
         }
         finally
         {
            tDocs.close();
         }
      }

      /**
       * Persists the cache info {@link #inSegmentParents} to a file:
       * {@link #FILE_CACHE_NAME_ARRAY}, for faster init times on startup.
       * Failures are logged and swallowed on purpose: persisting the cache
       * is best-effort and must not fail the initialization.
       **/
      public void saveCacheToFile() throws IOException
      {
         try (IndexOutput io = reader.directory().createOutput(FILE_CACHE_NAME_ARRAY))
         {
            for (int parent : inSegmentParents)
            {
               io.writeInt(parent);
            }
         }
         catch (Exception e)
         {
            log.error("Error saving " + FILE_CACHE_NAME_ARRAY + ": " + e.getMessage(), e);
         }
      }

      /**
       * Loads the cache info {@link #inSegmentParents} from the file
       * {@link #FILE_CACHE_NAME_ARRAY}.
       *
       * @return true if the cache has been initialized or false if the cache
       *         file does not exist yet, or an error happened
       */
      private boolean loadCacheFromFile() throws IOException
      {
         try (IndexInput ii = reader.directory().openInput(FILE_CACHE_NAME_ARRAY);)
         {
            long time = System.currentTimeMillis();
            for (int i = 0; i < inSegmentParents.length; i++)
            {
               inSegmentParents[i] = ii.readInt();
            }
            if (log.isDebugEnabled())
            {
               log.debug("persisted cache initialized {} DocIds in {} ms",
                  new Object[]{inSegmentParents.length, System.currentTimeMillis() - time});
            }
            return true;
         }
         catch (FileNotFoundException ignore)
         {
            if (log.isDebugEnabled())
            {
               // expected in the case where the file-based cache has not been
               // initialized yet
               log.debug("Saved state (file-based) of CachingIndexReader has not been initialized yet", ignore);
            }
         }
         catch (IOException ignore)
         {
            log.warn("Saved state of CachingIndexReader is corrupt, will try to remove offending file "
               + FILE_CACHE_NAME_ARRAY, ignore);
            // In the case where is a read error, the cache file is removed
            // so it can be recreated after
            // the cache loads the data from the repository directly
            reader.directory().deleteFile(FILE_CACHE_NAME_ARRAY);
         }
         return false;
      }
   }

   /**
    * Simple interface to collect a term and its term docs.
    */
   private interface TermDocsCollector
   {
      /**
       * Called for each term encountered.
       *
       * @param term the term.
       * @param tDocs the term docs of <code>term</code>.
       * @throws IOException if an error occurs while reading from the index.
       */
      void collect(Term term, TermDocs tDocs) throws IOException;
   }

   /**
    * Value object pairing a document number with its node uuid; the parent
    * uuid is filled in during the second pass of initializeParents().
    */
   private final static class NodeInfo
   {
      // document number of this node in the segment
      final int docId;

      // uuid of this node
      final String uuid;

      // uuid of the parent node; null until resolved (null for the root)
      String parent;

      public NodeInfo(int docId, String uuid)
      {
         this.docId = docId;
         this.uuid = uuid;
      }
   }
}