/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.exoplatform.services.jcr.impl.core.query.lucene; import java.io.FileNotFoundException; import java.util.Arrays; import java.util.concurrent.ConcurrentHashMap; import org.apache.commons.collections.map.LRUMap; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.FilterIndexReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.TermEnum; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.text.NumberFormat; import java.util.BitSet; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.Map; /** * Implements an <code>IndexReader</code> that maintains caches to resolve * {@link #getParent(int, BitSet)} calls efficiently. * <br> */ class CachingIndexReader extends FilterIndexReader { /** * The logger instance for this class. 
*/
   private static final Logger log = LoggerFactory.getLogger("exo.jcr.component.core.CachingIndexReader");

   /**
    * The current value of the global creation tick counter.
    * Only read/written while holding the class lock in {@link #getNextCreationTick()}.
    */
   private static long currentTick;

   /**
    * BitSet where bits that correspond to document numbers are set for
    * sharable nodes.
    */
   private final BitSet shareableNodes;

   /**
    * Cache of nodes parent relation. If an entry in the array is &gt;= 0,
    * that means the node with the document number = array-index has the
    * node with the value at that position as parent. A value of -1 means
    * the parent is unknown or not located in this index segment.
    */
   private final int[] inSegmentParents;

   /**
    * Cache of nodes parent relation that point to a foreign index segment.
    * Keyed by document number; concurrent because readers and the
    * {@link CacheInitializer} may populate it at the same time.
    */
   private final Map<Integer, DocId> foreignParentDocIds = new ConcurrentHashMap<Integer, DocId>();

   /**
    * Initializes the {@link #inSegmentParents} and {@link #foreignParentDocIds}
    * caches.
    */
   private CacheInitializer cacheInitializer;

   /**
    * Tick when this index reader was created. Used to detect stale entries
    * in the shared {@link DocNumberCache}.
    */
   private final long creationTick = getNextCreationTick();

   /**
    * Document number cache if available. May be <code>null</code>.
    */
   private final DocNumberCache cache;

   /**
    * Maps document number to node UUID. Bounded LRU map, see the
    * constructor for the sizing policy.
    */
   private final Map<Integer, String> docNumber2uuid;

   /**
    * A cache of TermDocs that are regularly read from the index.
    */
   private final TermDocsCache termDocsCache;

   /**
    * Creates a new <code>CachingIndexReader</code> based on
    * <code>delegate</code>
    *
    * @param delegatee the base <code>IndexReader</code>.
    * @param cache a document number cache, or <code>null</code> if not
    *        available to this reader.
    * @param initCache if the {@link #inSegmentParents} cache should be initialized
    *        when this index reader is constructed.
    * @throws IOException if an error occurs while reading from the index.
*/ @SuppressWarnings("unchecked") CachingIndexReader(IndexReader delegatee, DocNumberCache cache, boolean initCache) throws IOException { super(delegatee); this.cache = cache; this.inSegmentParents = new int[delegatee.maxDoc()]; Arrays.fill(this.inSegmentParents, -1); this.shareableNodes = initShareableNodes(delegatee); this.cacheInitializer = new CacheInitializer(delegatee); if (initCache) { cacheInitializer.run(); } // limit cache to 1% of maxDoc(), but at least 10. this.docNumber2uuid = (Map<Integer, String>)Collections.synchronizedMap(new LRUMap(Math.max(10, delegatee.maxDoc() / 100))); this.termDocsCache = new TermDocsCache(delegatee, FieldNames.PROPERTIES); } private BitSet initShareableNodes(IndexReader delegatee) throws IOException { BitSet shareableNodes = new BitSet(); TermDocs tDocs = delegatee.termDocs(new Term(FieldNames.SHAREABLE_NODE, "")); try { while (tDocs.next()) { shareableNodes.set(tDocs.doc()); } } finally { tDocs.close(); } return shareableNodes; } /** * Returns the <code>DocId</code> of the parent of <code>n</code> or * {@link DocId#NULL} if <code>n</code> does not have a parent * (<code>n</code> is the root node). * * @param n the document number. * @param deleted the documents that should be regarded as deleted. * @return the <code>DocId</code> of <code>n</code>'s parent. * @throws IOException if an error occurs while reading from the index. 
*/
   DocId getParent(int n, BitSet deleted) throws IOException
   {
      DocId parent;
      // tracks whether a cached (but possibly invalid) parent entry existed
      boolean existing = false;
      int parentDocNum = inSegmentParents[n];
      if (parentDocNum != -1)
      {
         // parent is in this segment, cached as a plain document number
         parent = DocId.create(parentDocNum);
      }
      else
      {
         // fall back to the foreign-segment cache (may be null)
         parent = foreignParentDocIds.get(n);
      }

      if (parent != null)
      {
         existing = true;

         // check if valid and reset if necessary
         if (!parent.isValid(deleted))
         {
            if (log.isDebugEnabled())
            {
               log.debug(parent + " not valid anymore.");
            }
            parent = null;
         }
      }

      if (parent == null)
      {
         // -1 means: no in-segment parent doc number resolved (yet)
         int plainDocId = -1;
         Document doc = document(n, FieldSelectors.UUID_AND_PARENT);
         String[] parentUUIDs = doc.getValues(FieldNames.PARENT);
         if (parentUUIDs.length == 0 || parentUUIDs[0].length() == 0)
         {
            // root node has no parent
            parent = DocId.NULL;
         }
         else
         {
            if (shareableNodes.get(n))
            {
               // shareable nodes may have multiple parents
               parent = DocId.create(parentUUIDs);
            }
            else
            {
               if (!existing)
               {
                  // first lookup for this doc: try to resolve the parent UUID
                  // to a document number within this segment
                  Term id = new Term(FieldNames.UUID, parentUUIDs[0]);
                  TermDocs docs = termDocs(id);
                  try
                  {
                     while (docs.next())
                     {
                        if (!deleted.get(docs.doc()))
                        {
                           plainDocId = docs.doc();
                           parent = DocId.create(plainDocId);
                           break;
                        }
                     }
                  }
                  finally
                  {
                     docs.close();
                  }
               }

               // if still null, then parent is not in this index, or existing
               // DocId was invalid. thus, only allowed to create DocId from uuid
               if (parent == null)
               {
                  parent = DocId.create(parentUUIDs[0]);
               }
            }
         }

         // finally put to cache
         if (plainDocId != -1)
         {
            // plain document number: cache in the in-segment array
            inSegmentParents[n] = plainDocId;
         }
         else
         {
            // UUID-based DocId: cache in the foreign-parent map
            foreignParentDocIds.put(n, parent);
            if (existing)
            {
               // there was an existing parent reference in
               // inSegmentParents, which was invalid and is replaced;
               // mark the in-segment entry as unknown
               inSegmentParents[n] = -1;
            }
         }
      }
      return parent;
   }

   /**
    * Returns the tick value when this reader was created.
    *
    * @return the creation tick for this reader.
*/ public long getCreationTick() { return creationTick; } //--------------------< FilterIndexReader overwrites >---------------------- /** * Uses the {@link #docNumber2uuid} cache for document lookups that are only * interested in the {@link FieldSelectors#UUID}. * * @param n the document number. * @param fieldSelector the field selector. * @return the document. * @throws CorruptIndexException if the index is corrupt. * @throws IOException if an error occurs while reading from the index. */ public Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException { if (fieldSelector == FieldSelectors.UUID) { Integer docNum = new Integer(n); Document doc; String uuid = docNumber2uuid.get(docNum); if (uuid == null) { doc = super.document(n, fieldSelector); uuid = doc.get(FieldNames.UUID); docNumber2uuid.put(docNum, uuid); } else { doc = new Document(); doc.add(new Field(FieldNames.UUID, uuid.toString(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); } return doc; } else { return super.document(n, fieldSelector); } } /** * If the field of <code>term</code> is {@link FieldNames#UUID} this * <code>CachingIndexReader</code> returns a <code>TermDocs</code> instance * with a cached document id. If <code>term</code> has any other field * the call is delegated to the base <code>IndexReader</code>.<br> * If <code>term</code> is for a {@link FieldNames#UUID} field and this * <code>CachingIndexReader</code> does not have such a document, * {@link EmptyTermDocs#INSTANCE} is returned. * * @param term the term to start the <code>TermDocs</code> enumeration. * @return a TermDocs instance. * @throws IOException if an error occurs while reading from the index. 
*/ public TermDocs termDocs(Term term) throws IOException { if (term!=null && term.field() == FieldNames.UUID) { // check cache if we have one if (cache != null) { DocNumberCache.Entry e = cache.get(term.text()); if (e != null) { // check if valid // the cache may contain entries from a different reader // with the same uuid. that happens when a node is updated // and is reindexed. the node 'travels' from an older index // to a newer one. the cache will still contain a cache // entry from the old until it is overwritten by the // newer index. if (e.creationTick == creationTick && !isDeleted(e.doc)) { return new SingleTermDocs(e.doc); } } // not in cache or invalid TermDocs docs = in.termDocs(term); try { if (docs.next()) { // put to cache cache.put(term.text(), this, docs.doc()); // and return return new SingleTermDocs(docs.doc()); } else { return EmptyTermDocs.INSTANCE; } } finally { docs.close(); } } } return termDocsCache.termDocs(term); } /** * {@inheritDoc} */ protected void doClose() throws IOException { try { cacheInitializer.waitUntilStopped(); } catch (InterruptedException e) { // ignore } super.doClose(); } //----------------------< internal >---------------------------------------- /** * Returns the next creation tick value. * * @return the next creation tick value. */ private static long getNextCreationTick() { synchronized (CachingIndexReader.class) { return currentTick++; } } /** * Initializes the {@link CachingIndexReader#inSegmentParents} cache. */ private final class CacheInitializer implements Runnable { /** * From where to read. */ private final IndexReader reader; /** * Set to <code>true</code> while this initializer does its work. */ private boolean running = false; /** * Set to <code>true</code> when this index reader is about to be closed. */ private volatile boolean stopRequested = false; /** * The {@link #inSegmentParents} is persisted using this filename. 
*/
      private static final String FILE_CACHE_NAME_ARRAY = "cache.inSegmentParents";

      /**
       * Creates a new initializer with the given <code>reader</code>.
       *
       * @param reader an index reader.
       */
      public CacheInitializer(IndexReader reader)
      {
         this.reader = reader;
      }

      /**
       * Initializes the cache. First tries to load a previously persisted
       * cache file; if none is available, scans the repository index.
       * Cooperates with {@link #waitUntilStopped()} via the
       * {@code running}/{@code stopRequested} flags.
       */
      public void run()
      {
         synchronized (this)
         {
            running = true;
         }
         try
         {
            if (stopRequested)
            {
               // immediately return when stop is requested
               return;
            }
            boolean initCacheFromFile = loadCacheFromFile();
            if (!initCacheFromFile)
            {
               // file-based cache is not available, load from the
               // repository
               log.debug("persisted cache is not available, will load directly from the repository.");
               initializeParents(reader);
            }
         }
         catch (IOException e)
         {
            // only log warn message during regular operation; an IOException
            // while stopping is expected and not worth a warning
            if (!stopRequested)
            {
               log.warn("Error initializing parents cache.", e);
            }
         }
         finally
         {
            synchronized (this)
            {
               // wake up any thread blocked in waitUntilStopped()
               running = false;
               notifyAll();
            }
         }
      }

      /**
       * Waits until this cache initializer is stopped. Sets the stop flag
       * first so a running initialization bails out at its next checkpoint.
       *
       * @throws InterruptedException if the current thread is interrupted.
       */
      public void waitUntilStopped() throws InterruptedException
      {
         stopRequested = true;
         synchronized (this)
         {
            while (running)
            {
               wait();
            }
         }
      }

      /**
       * Initializes the {@link CachingIndexReader#inSegmentParents} <code>DocId</code>
       * array.
       *
       * @param reader the underlying index reader.
       * @throws IOException if an error occurs while reading from the index.
*/
      private void initializeParents(IndexReader reader) throws IOException
      {
         long time = 0;
         if (log.isDebugEnabled())
         {
            time = System.currentTimeMillis();
         }
         // keyed first by Integer doc number, later re-keyed by String uuid
         // (second pass below), hence the Object key type
         final Map<Object, NodeInfo> docs = new HashMap<Object, NodeInfo>();
         // read UUIDs
         collectTermDocs(reader, new Term(FieldNames.UUID, ""), new TermDocsCollector()
         {
            public void collect(Term term, TermDocs tDocs) throws IOException
            {
               String uuid = term.text();
               while (tDocs.next())
               {
                  int doc = tDocs.doc();
                  // skip sharable nodes
                  if (!shareableNodes.get(doc))
                  {
                     NodeInfo info = new NodeInfo(doc, uuid);
                     docs.put(new Integer(doc), info);
                  }
               }
            }
         });
         // read PARENTs
         collectTermDocs(reader, new Term(FieldNames.PARENT, "0"), new TermDocsCollector()
         {
            public void collect(Term term, TermDocs tDocs) throws IOException
            {
               String uuid = term.text();
               while (tDocs.next())
               {
                  Integer docId = new Integer(tDocs.doc());
                  NodeInfo info = docs.get(docId);
                  if (info == null)
                  {
                     // sharable node, see above
                  }
                  else
                  {
                     // re-key the entry by its uuid so parents can be looked
                     // up by uuid in the resolution loop below
                     info.parent = uuid;
                     docs.remove(docId);
                     docs.put(info.uuid, info);
                  }
               }
            }
         });
         if (stopRequested)
         {
            return;
         }
         // counts parents living in other index segments (double: used as a
         // ratio in the debug output at the end)
         double foreignParents = 0;
         Iterator<NodeInfo> it = docs.values().iterator();
         while (it.hasNext())
         {
            NodeInfo info = it.next();
            NodeInfo parent = docs.get(info.parent);
            if (parent != null)
            {
               // parent resolved within this segment
               inSegmentParents[info.docId] = parent.docId;
            }
            else if (info.parent != null)
            {
               // parent exists but lives in a foreign index segment
               foreignParents++;
               foreignParentDocIds.put(info.docId, DocId.create(info.parent));
            }
            else if (shareableNodes.get(info.docId))
            {
               // NOTE(review): shareable nodes are skipped in the UUID pass
               // above, so this branch looks unreachable — kept for safety
               Document doc = reader.document(info.docId, FieldSelectors.UUID_AND_PARENT);
               foreignParentDocIds.put(info.docId, DocId.create(doc.getValues(FieldNames.PARENT)));
            }
            else
            {
               // no parent -> root node
               foreignParentDocIds.put(info.docId, DocId.NULL);
            }
         }
         // persist cache to file for faster init next time
         saveCacheToFile();
         if (log.isDebugEnabled())
         {
            NumberFormat nf = NumberFormat.getPercentInstance();
            nf.setMaximumFractionDigits(1);
            time = System.currentTimeMillis() - time;
            if (inSegmentParents.length > 0)
            {
               foreignParents /= inSegmentParents.length;
            }
            log.debug("initialized {} DocIds in {} ms, {} foreign parents",
               new Object[]{new Integer(inSegmentParents.length), new Long(time), nf.format(foreignParents)});
         }
      }

      /**
       * Collects term docs for a given start term. All terms with the same
       * field as <code>start</code> are enumerated.
       *
       * @param reader the index reader.
       * @param start the term where to start the term enumeration.
       * @param collector collects the term docs for each term.
       * @throws IOException if an error occurs while reading from the index.
       */
      private void collectTermDocs(IndexReader reader, Term start, TermDocsCollector collector) throws IOException
      {
         TermDocs tDocs = reader.termDocs();
         try
         {
            TermEnum terms = reader.terms(start);
            try
            {
               int count = 0;
               do
               {
                  Term t = terms.term();
                  // reference comparison on field name: Lucene interns fields
                  if (t != null && t.field() == start.field())
                  {
                     tDocs.seek(terms);
                     collector.collect(t, tDocs);
                  }
                  else
                  {
                     break;
                  }
                  // once in a while check if we should quit
                  if (++count % 10000 == 0)
                  {
                     if (stopRequested)
                     {
                        break;
                     }
                  }
               }
               while (terms.next());
            }
            finally
            {
               terms.close();
            }
         }
         finally
         {
            tDocs.close();
         }
      }

      /**
       * Persists the cache info {@link #inSegmentParents} to a file:
       * {@link #FILE_CACHE_NAME_ARRAY}, for faster init times on startup.
       * Failures are logged and swallowed on purpose: persisting the cache
       * is best-effort and must not fail the initialization.
       **/
      public void saveCacheToFile() throws IOException
      {
         try (IndexOutput io = reader.directory().createOutput(FILE_CACHE_NAME_ARRAY))
         {
            for (int parent : inSegmentParents)
            {
               io.writeInt(parent);
            }
         }
         catch (Exception e)
         {
            log.error("Error saving " + FILE_CACHE_NAME_ARRAY + ": " + e.getMessage(), e);
         }
      }

      /**
       * Loads the cache info {@link #inSegmentParents} from the file
       * {@link #FILE_CACHE_NAME_ARRAY}.
       *
       * @return true if the cache has been initialized or false if the cache
       *         file does not exist yet, or an error happened
       */
      private boolean loadCacheFromFile() throws IOException
      {
         try (IndexInput ii = reader.directory().openInput(FILE_CACHE_NAME_ARRAY);)
         {
            long time = System.currentTimeMillis();
            for (int i = 0; i < inSegmentParents.length; i++)
            {
               inSegmentParents[i] = ii.readInt();
            }
            if (log.isDebugEnabled())
            {
               log.debug("persisted cache initialized {} DocIds in {} ms",
                  new Object[]{inSegmentParents.length, System.currentTimeMillis() - time});
            }
            return true;
         }
         catch (FileNotFoundException ignore)
         {
            if (log.isDebugEnabled())
            {
               // expected in the case where the file-based cache has not been
               // initialized yet
               log.debug("Saved state (file-based) of CachingIndexReader has not been initialized yet", ignore);
            }
         }
         catch (IOException ignore)
         {
            log.warn("Saved state of CachingIndexReader is corrupt, will try to remove offending file "
               + FILE_CACHE_NAME_ARRAY, ignore);
            // In the case where is a read error, the cache file is removed
            // so it can be recreated after
            // the cache loads the data from the repository directly
            reader.directory().deleteFile(FILE_CACHE_NAME_ARRAY);
         }
         return false;
      }
   }

   /**
    * Simple interface to collect a term and its term docs.
    */
   private interface TermDocsCollector
   {
      /**
       * Called for each term encountered.
       *
       * @param term the term.
       * @param tDocs the term docs of <code>term</code>.
       * @throws IOException if an error occurs while reading from the index.
       */
      void collect(Term term, TermDocs tDocs) throws IOException;
   }

   /**
    * Value object pairing a document number with its node uuid; the parent
    * uuid is filled in during the second pass of initializeParents().
    */
   private final static class NodeInfo
   {
      // document number of this node in the segment
      final int docId;

      // uuid of this node
      final String uuid;

      // uuid of the parent node; null until resolved (null for the root)
      String parent;

      public NodeInfo(int docId, String uuid)
      {
         this.docId = docId;
         this.uuid = uuid;
      }
   }
}