package org.apache.lucene.facet.taxonomy.directory;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.InconsistentTaxonomyException;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.Consts.LoadFullPathOnly;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.collections.LRUHashMap;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A {@link TaxonomyReader} which retrieves stored taxonomy information from a
* {@link Directory}.
* <P>
* Reading from the on-disk index on every method call is too slow, so this
* implementation employs caching: Some methods cache recent requests and their
* results, while other methods prefetch all the data into memory and then
* provide answers directly from in-memory tables. See the documentation of
* individual methods for comments on their performance.
*
* @lucene.experimental
*/
public class DirectoryTaxonomyReader implements TaxonomyReader {
private static final Logger logger = Logger.getLogger(DirectoryTaxonomyReader.class.getName());

// The Lucene reader over the taxonomy index. refresh() replaces this
// reference with a newer reader while holding the write side of
// indexReaderLock.
private DirectoryReader indexReader;

// The following lock is used to allow multiple threads to read from the
// index concurrently, while having them block during the very short
// critical moment of refresh() (see comments below). Note, however, that
// we only read from the index when we don't have the entry in our cache,
// and the caches are locked separately.
// The field is final: it is never reassigned, and a lock that could be
// swapped would no longer guarantee that all threads use the same lock.
private final ReadWriteLock indexReaderLock = new ReentrantReadWriteLock();

// The following are the limited-size LRU caches used to cache the latest
// results from getOrdinal() and getLabel().
// Because LRUHashMap is not thread-safe, we need to synchronize on the
// cache object when using it. Unfortunately, this is not optimal under
// heavy contention because it means that while one thread is using the
// cache (reading or modifying) others are blocked from using it - or even
// starting to do benign things like calculating the hash function. A more
// efficient approach would be to use a non-locking (as much as possible)
// concurrent solution, along the lines of java.util.concurrent.ConcurrentHashMap
// but with LRU semantics.
// However, even in the current sub-optimal implementation we do not make
// the mistake of locking out readers while waiting for disk in a cache
// miss - below, we do not hold the cache lock while reading missing data
// from disk.
private final LRUHashMap<String, Integer> ordinalCache;
private final LRUHashMap<Integer, String> categoryCache;

// getParent() needs to be extremely efficient, to the point that we need
// to fetch all the data in advance into memory, and answer these calls
// from memory. Currently we use a large integer array, which is
// initialized when the taxonomy is opened, and potentially enlarged
// when it is refresh()ed.
// The array is not synchronized. Rather, the reference to the array
// is described as volatile, and the only writing operation simply creates
// a new array and replaces the reference, so that the content of the
// array is visible when the new reference is visible.
// NOTE(review): this field itself is not declared volatile; presumably the
// volatile reference lives inside ParentArray - confirm against that class.
private ParentArray parentArray;

// Delimiter used between category components in the stored path strings.
private char delimiter = Consts.DEFAULT_DELIMITER;

// Set once close() has released its reference, and again by doClose().
private volatile boolean closed = false;

// set refCount to 1 at start
private final AtomicInteger refCount = new AtomicInteger(1);
/**
* Open for reading a taxonomy stored in a given {@link Directory}.
* @param directory
* The {@link Directory} in which to the taxonomy lives. Note that
* the taxonomy is read directly to that directory (not from a
* subdirectory of it).
* @throws CorruptIndexException if the Taxonomy is corrupted.
* @throws IOException if another error occurred.
*/
public DirectoryTaxonomyReader(Directory directory) throws IOException {
this.indexReader = openIndexReader(directory);
// These are the default cache sizes; they can be configured after
// construction with the cache's setMaxSize() method
ordinalCache = new LRUHashMap<String, Integer>(4000);
categoryCache = new LRUHashMap<Integer, String>(4000);
// TODO (Facet): consider lazily create parent array when asked, not in the constructor
parentArray = new ParentArray();
parentArray.refresh(indexReader);
}
/**
 * Opens the {@link DirectoryReader} used to read the taxonomy. Called by
 * the constructor; subclasses may override it to open the reader differently.
 *
 * @param directory the directory holding the taxonomy index
 * @return a reader opened on the given directory
 * @throws IOException if the reader cannot be opened
 */
protected DirectoryReader openIndexReader(Directory directory) throws IOException {
  final DirectoryReader reader = DirectoryReader.open(directory);
  return reader;
}
/**
 * Verifies that this reader still has at least one reference.
 *
 * @throws AlreadyClosedException if the reference count is zero or negative
 */
protected final void ensureOpen() throws AlreadyClosedException {
  final boolean stillReferenced = getRefCount() > 0;
  if (!stillReferenced) {
    throw new AlreadyClosedException("this TaxonomyReader is closed");
  }
}
/**
 * Sets the maximum allowed size of each of the two caches used by
 * {@link #getPath(int)} and {@link #getOrdinal(CategoryPath)}.
 * <P>
 * Currently, if the given size is smaller than the current size of
 * a cache, the cache will not shrink; rather, it will be limited to its
 * current size.
 *
 * @param size the new maximum cache size, in number of entries.
 */
public void setCacheSize(int size) {
  ensureOpen();

  // Each cache is guarded by its own monitor; the label cache is resized first.
  final LRUHashMap<Integer, String> labels = categoryCache;
  synchronized (labels) {
    labels.setMaxSize(size);
  }

  final LRUHashMap<String, Integer> ordinals = ordinalCache;
  synchronized (ordinals) {
    ordinals.setMaxSize(size);
  }
}
/**
 * Changes the character that the taxonomy uses in its internal storage as
 * a delimiter between category components. Do not use this method unless
 * you really know what you are doing.
 * <P>
 * If you do use this method, call it before any other method that actually
 * queries the taxonomy, and always pass the same delimiter to every
 * taxonomy writer and taxonomy reader object you create.
 *
 * @param delimiter the delimiter character to use from now on
 */
public void setDelimiter(char delimiter) {
  ensureOpen();
  final char newDelimiter = delimiter;
  this.delimiter = newDelimiter;
}
/**
 * Returns the ordinal of the given category, or
 * {@link TaxonomyReader#INVALID_ORDINAL} if no document for it is found in
 * the index. The empty path maps to {@code ROOT_ORDINAL}.
 * <P>
 * Answers come from the ordinal LRU cache when possible. On a cache miss
 * the index is consulted under the read lock, and the result (including
 * a "not found" result) is stored in the cache.
 *
 * @param categoryPath the category to look up
 * @return the category's ordinal, or INVALID_ORDINAL if it was not found
 * @throws IOException if reading the index fails
 */
public int getOrdinal(CategoryPath categoryPath) throws IOException {
  ensureOpen();
  if (categoryPath.length() == 0) {
    return ROOT_ORDINAL;
  }
  String path = categoryPath.toString(delimiter);

  // First try to find the answer in the LRU cache:
  synchronized (ordinalCache) {
    Integer res = ordinalCache.get(path);
    if (res != null) {
      return res.intValue();
    }
  }

  // If we're still here, we have a cache miss. We need to fetch the
  // value from disk, and then also put it in the cache.
  int ret = TaxonomyReader.INVALID_ORDINAL;
  // Acquire the lock before entering the try block, so that the finally
  // clause only ever unlocks a lock that was actually acquired.
  indexReaderLock.readLock().lock();
  try {
    // TODO (Facet): avoid Multi*?
    Bits liveDocs = MultiFields.getLiveDocs(indexReader);
    DocsEnum docs = MultiFields.getTermDocsEnum(indexReader, liveDocs, Consts.FULL, new BytesRef(path), 0);
    if (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      ret = docs.docID();
    }
  } finally {
    indexReaderLock.readLock().unlock();
  }

  // Put the new value in the cache. Note that it is possible that while
  // we were doing the above fetching (without the cache locked), some
  // other thread already added the same category to the cache. We do
  // not care about this possibility, as the cache replaces previous values
  // of the same keys (it doesn't store duplicates).
  synchronized (ordinalCache) {
    // Integer.valueOf(int) may reuse instances, unlike new Integer(int).
    ordinalCache.put(path, Integer.valueOf(ret));
  }
  return ret;
}
/**
 * Returns a new {@link CategoryPath} for the category with the given
 * ordinal, or {@code null} if no label is available for that ordinal.
 *
 * @param ordinal the ordinal of the category to look up
 * @return a newly built path, or null
 * @throws IOException if reading the label fails
 */
public CategoryPath getPath(int ordinal) throws IOException {
  ensureOpen();
  // TODO (Facet): The label cache holds delimited strings rather than
  // CategoryPath objects, so even on a cache hit a new CategoryPath has to
  // be built from the string every time. CategoryPath objects are not
  // cached because they are mutable; an immutable (read-only) interface
  // implemented by CategoryPath, returned from this method, would allow it.
  final String label = getLabel(ordinal);
  return (label == null) ? null : new CategoryPath(label, delimiter);
}
/**
 * Fills the given {@link CategoryPath} with the path of the category that
 * has the given ordinal. The result object is left untouched when no label
 * is available for the ordinal.
 *
 * @param ordinal the ordinal of the category to look up
 * @param result the object that receives the path; cleared before filling
 * @return true if the path was found and stored in result, false otherwise
 * @throws IOException if reading the label fails
 */
public boolean getPath(int ordinal, CategoryPath result) throws IOException {
  ensureOpen();
  final String label = getLabel(ordinal);
  final boolean found = (label != null);
  if (found) {
    result.clear();
    result.add(label, delimiter);
  }
  return found;
}
/**
 * Returns the delimited full-path string stored for the given category
 * ordinal, or {@code null} if the ordinal is negative or not smaller than
 * the reader's maxDoc().
 * <P>
 * Answers come from the label LRU cache when possible. On a cache miss the
 * stored document is loaded under the read lock, and the loaded value is
 * then put in the cache.
 *
 * @param catID the category ordinal (the document id in the taxonomy index)
 * @return the stored full path, or null for an invalid ordinal
 * @throws IOException if reading the index fails
 */
private String getLabel(int catID) throws IOException {
  ensureOpen();
  // First try to find the answer in the LRU cache. It is very
  // unfortunate that we need an Integer object here - it would have been
  // better if we used a hash table specifically designed for int keys...
  // Integer.valueOf(int) may reuse instances, unlike new Integer(int).
  Integer catIDInteger = Integer.valueOf(catID);

  synchronized (categoryCache) {
    String res = categoryCache.get(catIDInteger);
    if (res != null) {
      return res;
    }
  }

  // If we're still here, we have a cache miss. We need to fetch the
  // value from disk, and then also put it in the cache.
  String ret;
  // Acquire the lock before entering the try block, so that the finally
  // clause only ever unlocks a lock that was actually acquired.
  indexReaderLock.readLock().lock();
  try {
    // The taxonomy API dictates that if we get an invalid category
    // ID, we should return null. If we don't check this here, we
    // can get some sort of an exception from the document() call below.
    // NOTE: Currently, we *do not* cache this return value; there
    // isn't much point in doing so, because checking the validity of
    // the docid doesn't require disk access - just comparing with
    // the number indexReader.maxDoc().
    if (catID < 0 || catID >= indexReader.maxDoc()) {
      return null;
    }
    final LoadFullPathOnly loader = new LoadFullPathOnly();
    indexReader.document(catID, loader);
    ret = loader.getFullPath();
  } finally {
    indexReaderLock.readLock().unlock();
  }

  // Put the new value in the cache. Note that it is possible that while
  // we were doing the above fetching (without the cache locked), some
  // other thread already added the same category to the cache. We do
  // not care about this possibility, as the cache replaces previous
  // values of the same keys (it doesn't store duplicates).
  synchronized (categoryCache) {
    categoryCache.put(catIDInteger, ret);
  }
  return ret;
}
/**
 * Returns the ordinal of the parent of the given category, read from the
 * prefetched parent array. The index read lock is not taken here.
 *
 * @param ordinal the ordinal of the category whose parent is wanted
 * @return the value stored for that ordinal in the parent array
 */
public int getParent(int ordinal) {
  ensureOpen();
  // No read lock is needed: the lookup goes through the in-memory array
  // obtained from getParentArray(), not through the index reader.
  final int[] parents = getParentArray();
  return parents[ordinal];
}
/**
 * Returns an int array of size getSize() listing, for each category in the
 * taxonomy, the ordinal of its parent category.
 * <P>
 * The caller can hold on to the returned array indefinitely - it is
 * guaranteed that no-one else will modify it. Conversely, the caller must
 * treat the array as read-only and <B>not modify it</B>, because other
 * callers might have gotten the same array too, and getParent() calls are
 * answered from the same array.
 * <P>
 * This call is extremely efficient, merely returning a reference to an
 * array that already exists. A caller that plans to look up the parent of
 * many categories may prefer to fetch the array once and index into it,
 * which avoids the per-call overhead of getParent().
 * <P>
 * The array you got is (naturally) not modified after a refresh(), so
 * always call getParentArray() again after a refresh().
 *
 * @return the current parent array
 */
public int[] getParentArray() {
  ensureOpen();
  // No read lock is needed: the array is obtained from the in-memory
  // ParentArray holder rather than from the index reader.
  final ParentArray holder = parentArray;
  return holder.getArray();
}
/**
 * Reopens the underlying index reader if the taxonomy index has changed,
 * then refreshes the prefetched parent array and drops cached "not found"
 * ordinals.
 * <P>
 * This method is synchronized so that it never runs concurrently with
 * itself.
 *
 * @return false if the index has not changed (nothing was done), true if
 *         the reader was switched to the newer index
 * @throws IOException if reading the index fails
 * @throws InconsistentTaxonomyException if the creation time recorded in
 *         the new commit differs from the one this reader was opened on,
 *         i.e. the taxonomy was recreated
 */
public synchronized boolean refresh() throws IOException, InconsistentTaxonomyException {
  ensureOpen();
  /*
   * Since refresh() can be a lengthy operation, it is very important that we
   * avoid locking out all readers for its duration. This is why we don't hold
   * the indexReaderLock write lock for the entire duration of this method. In
   * fact, it is enough to hold it only during a single assignment! Other
   * comments in this method will explain this.
   */

  // Note that the lengthy reopen operation does not modify the current
  // reader, so we can do it without holding a lock. We can safely read
  // indexReader without holding the write lock, because no other thread
  // can be writing at this time (this method is the only writer, and it
  // is "synchronized" to avoid this case).
  DirectoryReader r2 = DirectoryReader.openIfChanged(indexReader);
  if (r2 == null) {
    return false; // no changes, nothing to do
  }

  // Validate that a refresh is valid at this point, i.e. that the taxonomy
  // was not recreated since this reader was last opened or refreshed.
  // Whatever goes wrong during validation, the new reader must not leak.
  boolean valid = false;
  try {
    String t1 = indexReader.getIndexCommit().getUserData().get(DirectoryTaxonomyWriter.INDEX_CREATE_TIME);
    String t2 = r2.getIndexCommit().getUserData().get(DirectoryTaxonomyWriter.INDEX_CREATE_TIME);
    if (t1 == null) {
      if (t2 != null) {
        throw new InconsistentTaxonomyException("Taxonomy was recreated at: "+t2);
      }
    } else if (!t1.equals(t2)) {
      throw new InconsistentTaxonomyException("Taxonomy was recreated at: "+t2+" != "+t1);
    }
    valid = true;
  } finally {
    if (!valid) {
      r2.close();
    }
  }

  IndexReader oldreader = indexReader;
  // Switch to the new reader under the write lock, so that the switch
  // never happens while another method is reading under the read lock.
  indexReaderLock.writeLock().lock();
  try {
    indexReader = r2;
  } finally {
    indexReaderLock.writeLock().unlock();
  }

  // We can close the old reader, but need to be certain that we
  // don't close it while another method is reading from it.
  // We can be certain of that even without putting the close() call in
  // the locked section: once the write lock was acquired above, all
  // readers holding the read lock had finished (this is what a
  // read-write lock ensures). Readers that take the read lock after the
  // unlock() already see the new indexReader. So no method that reads
  // under the read lock can still be using the old reader:
  oldreader.close();

  // We prefetch some of the arrays to make requests much faster.
  // Let's refresh these prefetched arrays; this refresh is made more
  // efficient by assuming that it is enough to read the values for new
  // categories (old categories could not have been changed or deleted).
  // Note that this is done without the write lock being held, which
  // means that it is possible that during a refresh(), a reader will have
  // some methods (like getOrdinal and getPath) return fresh information,
  // while getParent() (only to be prefetched now) still returns older
  // information. We consider this to be acceptable.
  parentArray.refresh(indexReader);

  // Remove any INVALID_ORDINAL values from the ordinal cache,
  // because it is possible those are now answered by the new data!
  // The cache is not thread-safe and getOrdinal() accesses it under its
  // monitor, so the purge must hold the same monitor.
  synchronized (ordinalCache) {
    Iterator<Entry<String, Integer>> i = ordinalCache.entrySet().iterator();
    while (i.hasNext()) {
      Entry<String, Integer> e = i.next();
      if (e.getValue().intValue() == INVALID_ORDINAL) {
        i.remove();
      }
    }
  }
  return true;
}
/**
 * Releases the reference held by this reader's owner. Only the first call
 * decrements the reference count; later calls return without doing
 * anything.
 *
 * @throws IOException if decRef() fails
 */
public void close() throws IOException {
  // Cheap check without the monitor; repeated under the monitor below.
  if (closed) {
    return;
  }
  synchronized (this) {
    if (closed) {
      return;
    }
    decRef();
    closed = true;
  }
}
/**
 * Does the actual closing and frees up resources: closes the index reader,
 * marks this reader closed, drops the prefetched arrays and empties both
 * caches.
 *
 * @throws IOException if closing the index reader fails; in that case the
 *         remaining cleanup steps are not performed
 */
private void doClose() throws IOException {
  indexReader.close();
  closed = true;

  parentArray = null;
  childrenArrays = null;

  // The LRU caches are not thread-safe and every other access in this
  // class happens under the cache's own monitor, so clear them the same way.
  synchronized (categoryCache) {
    categoryCache.clear();
  }
  synchronized (ordinalCache) {
    ordinalCache.clear();
  }
}
/**
 * Returns the number of categories in the taxonomy, taken from the index
 * reader's numDocs() while holding the read lock.
 *
 * @return the current number of documents in the taxonomy index
 */
public int getSize() {
  ensureOpen();
  final int size;
  indexReaderLock.readLock().lock();
  try {
    size = indexReader.numDocs();
  } finally {
    indexReaderLock.readLock().unlock();
  }
  return size;
}
/**
 * Returns the user data stored with the index commit that the current
 * reader is opened on.
 * <P>
 * The reader is accessed under the read lock, like every other method that
 * reads the index, so that refresh() cannot replace and close the reader
 * while the commit is being read.
 *
 * @return the commit's user data map
 * @throws IOException if reading the commit fails
 */
public Map<String, String> getCommitUserData() throws IOException {
  ensureOpen();
  indexReaderLock.readLock().lock();
  try {
    return indexReader.getIndexCommit().getUserData();
  } finally {
    indexReaderLock.readLock().unlock();
  }
}
// The children arrays, built lazily and replaced by getChildrenArrays()
// while holding the childrenArraysRebuild monitor; reset to null on close.
private ChildrenArrays childrenArrays;

// Monitor used by getChildrenArrays() so that concurrent callers do not
// rebuild the arrays at the same time. Final, so that every thread is
// guaranteed to synchronize on the same object.
final Object childrenArraysRebuild = new Object();
/**
 * Returns the {@link ChildrenArrays} of this taxonomy, building or
 * enlarging them first if the taxonomy has grown since they were last built.
 * <P>
 * All the work is done under a dedicated monitor, to prevent two concurrent
 * calls from needlessly doing the same array building at the same time.
 *
 * @return the existing arrays object if its size already equals getSize()
 *         (this is null when nothing was built yet and the size is zero),
 *         otherwise a newly built object
 */
public ChildrenArrays getChildrenArrays() {
  ensureOpen();
  synchronized (childrenArraysRebuild) {
    final int size = getSize();

    // Number of categories already covered by the existing arrays.
    final int covered;
    if (childrenArrays != null) {
      covered = childrenArrays.getYoungestChildArray().length;
    } else {
      covered = 0;
    }

    // The taxonomy hasn't grown: hand out the existing object as is.
    if (covered == size) {
      return childrenArrays;
    }

    // Otherwise build new arrays, starting from an enlarged copy of the
    // previous ones, and then account for the new categories.
    final int[] youngestChild = new int[size];
    final int[] olderSibling = new int[size];
    if (childrenArrays != null) {
      final int[] prevYoungest = childrenArrays.getYoungestChildArray();
      final int[] prevOlder = childrenArrays.getOlderSiblingArray();
      System.arraycopy(prevYoungest, 0, youngestChild, 0, prevYoungest.length);
      System.arraycopy(prevOlder, 0, olderSibling, 0, prevOlder.length);
    }

    final int[] parents = getParentArray();

    // New categories start out without any children.
    for (int ord = covered; ord < size; ord++) {
      youngestChild[ord] = INVALID_ORDINAL;
    }

    // The root category (0) has no parent, so linking starts after it.
    int start = covered;
    if (start == 0) {
      start = 1;
      olderSibling[0] = INVALID_ORDINAL;
    }

    // Link every new category in front of its parent's child list. The
    // parent's entry is expected to be set already, as a parent's ordinal
    // is expected to be smaller than its child's.
    for (int ord = start; ord < size; ord++) {
      final int parent = parents[ord];
      olderSibling[ord] = youngestChild[parent];
      youngestChild[parent] = ord;
    }

    // Finally switch to the new arrays
    childrenArrays = new ChildrenArraysImpl(youngestChild, olderSibling);
    return childrenArrays;
  }
}
/**
 * Returns a textual listing of the first categories of the taxonomy, one
 * line per ordinal in the form "ordinal: path".
 * <P>
 * An ordinal whose path cannot be read because of an IOException is
 * omitted from the output; the exception is logged at FINEST level.
 *
 * @param max the maximum number of categories to list
 * @return the listing, covering ordinals 0 up to the smaller of max and
 *         the reader's maxDoc()
 */
public String toString(int max) {
  ensureOpen();
  StringBuilder sb = new StringBuilder();

  // Read maxDoc() under the read lock, like every other access to the
  // index reader, so that refresh() cannot close the reader meanwhile.
  final int upperl;
  indexReaderLock.readLock().lock();
  try {
    upperl = Math.min(max, this.indexReader.maxDoc());
  } finally {
    indexReaderLock.readLock().unlock();
  }

  for (int i = 0; i < upperl; i++) {
    try {
      CategoryPath category = this.getPath(i);
      if (category == null) {
        sb.append(i).append(": NULL!! \n");
        continue;
      }
      if (category.length() == 0) {
        sb.append(i).append(": EMPTY STRING!! \n");
        continue;
      }
      sb.append(i).append(": ").append(category.toString()).append("\n");
    } catch (IOException e) {
      // Best effort: skip this ordinal and keep listing the others.
      if (logger.isLoggable(Level.FINEST)) {
        logger.log(Level.FINEST, e.getMessage(), e);
      }
    }
  }
  return sb.toString();
}
/**
 * Simple holder for the two arrays exposed through {@link ChildrenArrays}.
 * The array references are final; the arrays themselves are handed out as
 * is, without copying.
 */
private static final class ChildrenArraysImpl implements ChildrenArrays {
  private final int[] youngestChildArray;
  private final int[] olderSiblingArray;

  /**
   * @param youngestChildArray the array returned by getYoungestChildArray()
   * @param olderSiblingArray the array returned by getOlderSiblingArray()
   */
  public ChildrenArraysImpl(int[] youngestChildArray, int[] olderSiblingArray) {
    this.youngestChildArray = youngestChildArray;
    this.olderSiblingArray = olderSiblingArray;
  }

  public int[] getOlderSiblingArray() {
    return olderSiblingArray;
  }

  public int[] getYoungestChildArray() {
    return youngestChildArray;
  }
}
/**
 * Expert: returns the index reader this taxonomy reader currently uses.
 * Note that any call to refresh() will invalidate the returned reader,
 * so the caller needs to take care of appropriate locking.
 *
 * @return the internal Lucene index reader
 */
DirectoryReader getInternalIndexReader() {
  ensureOpen();
  final DirectoryReader current = this.indexReader;
  return current;
}
/**
 * Expert: decreases the refCount of this TaxonomyReader instance. If the
 * refCount drops to 0, then this reader is closed. If closing fails, the
 * reference is put back before the failure is propagated.
 *
 * @throws IOException if closing the reader fails
 * @throws IllegalStateException if the count is negative after the decrement
 */
public void decRef() throws IOException {
  ensureOpen();
  final int remaining = refCount.decrementAndGet();
  if (remaining < 0) {
    throw new IllegalStateException("too many decRef calls: refCount is " + remaining + " after decrement");
  }
  if (remaining > 0) {
    // Other references are still outstanding; nothing to close yet.
    return;
  }

  // That was the last reference: release the resources.
  boolean closedCleanly = false;
  try {
    doClose();
    closedCleanly = true;
  } finally {
    if (!closedCleanly) {
      // Put reference back on failure
      refCount.incrementAndGet();
    }
  }
}
/**
 * Expert: returns the current refCount for this taxonomy reader.
 *
 * @return the value of the reference counter at the time of the call
 */
public int getRefCount() {
  final int current = refCount.get();
  return current;
}
/**
 * Expert: increments the refCount of this TaxonomyReader instance.
 * RefCounts are used to determine when a taxonomy reader can be closed
 * safely, i.e. as soon as there are no more references.
 * Be sure to always call a corresponding decRef(), in a finally clause;
 * otherwise the reader may never be closed.
 *
 * @throws AlreadyClosedException if this reader is already closed
 */
public void incRef() {
  // Taking a new reference on a closed reader is refused.
  ensureOpen();
  refCount.incrementAndGet();
}
}