package org.apache.lucene.facet.taxonomy.directory;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.NativeFSLockFactory;
import org.apache.lucene.store.SimpleFSLockFactory;
import org.apache.lucene.util.Version;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.cl2o.Cl2oTaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.lru.LruTaxonomyWriterCache;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* {@link TaxonomyWriter} which uses a {@link Directory} to store the taxonomy
* information on disk, and keeps an additional in-memory cache of some or all
* categories.
* <p>
* In addition to the permanently-stored information in the {@link Directory},
* efficiency dictates that we also keep an in-memory cache of <B>recently
* seen</B> or <B>all</B> categories, so that we do not need to go back to disk
* for every category addition to see which ordinal this category already has,
* if any. A {@link TaxonomyWriterCache} object determines the specific caching
* algorithm used.
* <p>
* This class offers some hooks for extending classes to control the
* {@link IndexWriter} instance that is used. See {@link #openIndexWriter}.
*
* @lucene.experimental
*/
public class DirectoryTaxonomyWriter implements TaxonomyWriter {
/**
 * Property name of user commit data that contains the creation time of a taxonomy index.
 * <p>
 * Applications making use of {@link TaxonomyWriter#commit(Map)} should not use this
 * particular property name.
 */
public static final String INDEX_CREATE_TIME = "index.create.time";
// The underlying index writer; null once this taxonomy writer is closed
// (ensureOpen() relies on this).
private IndexWriter indexWriter;
// Ordinal that will be assigned to the next category added; categories are
// numbered consecutively, so this equals the current number of categories.
private int nextID;
// Character separating path components in the on-disk representation of a
// category; see setDelimiter().
private char delimiter = Consts.DEFAULT_DELIMITER;
// Reusable single-token stream whose position increment encodes the parent
// ordinal of the category being written (see addCategoryDocument()).
private SinglePositionTokenStream parentStream = new SinglePositionTokenStream(Consts.PAYLOAD_PARENT);
// Reusable field objects, re-filled for every added category document.
private Field parentStreamField;
private Field fullPathField;
// In-memory category-to-ordinal cache; the caching policy is pluggable.
private TaxonomyWriterCache cache;
/**
 * We call the cache "complete" if we know that every category in our
 * taxonomy is in the cache. When the cache is <B>not</B> complete, and
 * we can't find a category in the cache, we still need to look for it
 * in the on-disk index; Therefore when the cache is not complete, we
 * need to open a "reader" to the taxonomy index.
 * The cache becomes incomplete if it was never filled with the existing
 * categories, or if a put() to the cache ever returned true (meaning
 * that some of the cached data was cleared).
 */
private boolean cacheIsComplete;
// Reader over the taxonomy index, opened lazily and only while the cache
// is incomplete (see findCategory()).
private IndexReader reader;
// Counts cache misses since opening; once it reaches cacheMissesUntilFill
// the whole taxonomy is read into the cache (see perhapsFillCache()).
private int cacheMisses;
/**
 * When a taxonomy is created, we mark that its create time should be committed in the
 * next commit.
 */
private String taxoIndexCreateTime = null;
/**
 * setDelimiter changes the character that the taxonomy uses in its internal
 * storage as a delimiter between category components. Do not use this
 * method unless you really know what you are doing. It has nothing to do
 * with whatever character the application may be using to represent
 * categories for its own use.
 * <P>
 * If you do use this method, make sure you call it before any other methods
 * that actually queries the taxonomy. Moreover, make sure you always pass
 * the same delimiter for all LuceneTaxonomyWriter and LuceneTaxonomyReader
 * objects you create for the same directory.
 *
 * @param delimiter the component-separator character to use from now on
 * @throws AlreadyClosedException if this writer has been closed
 */
public void setDelimiter(char delimiter) {
  ensureOpen();
  this.delimiter = delimiter;
}
/**
 * Forcibly unlocks the taxonomy in the named directory.
 * <P>
 * Caution: this should only be used by failure recovery code, when it is
 * known that no other process nor thread is in fact currently accessing
 * this taxonomy.
 * <P>
 * This method is unnecessary if your {@link Directory} uses a
 * {@link NativeFSLockFactory} instead of the default
 * {@link SimpleFSLockFactory}. When the "native" lock is used, a lock
 * does not stay behind forever when the process using it dies.
 *
 * @param directory the directory whose write lock should be released
 * @throws IOException if releasing the lock fails
 */
public static void unlock(Directory directory) throws IOException {
  // Delegates to IndexWriter's lock-breaking; the taxonomy index is a
  // plain Lucene index.
  IndexWriter.unlock(directory);
}
/**
 * Construct a Taxonomy writer.
 *
 * @param directory
 *    The {@link Directory} in which to store the taxonomy. Note that
 *    the taxonomy is written directly to that directory (not to a
 *    subdirectory of it).
 * @param openMode
 *    Specifies how to open a taxonomy for writing: <code>APPEND</code>
 *    means open an existing index for append (failing if the index does
 *    not yet exist). <code>CREATE</code> means create a new index (first
 *    deleting the old one if it already existed).
 *    <code>APPEND_OR_CREATE</code> appends to an existing index if there
 *    is one, otherwise it creates a new index.
 * @param cache
 *    A {@link TaxonomyWriterCache} implementation which determines
 *    the in-memory caching policy. See for example
 *    {@link LruTaxonomyWriterCache} and {@link Cl2oTaxonomyWriterCache}.
 *    If null or missing, {@link #defaultTaxonomyWriterCache()} is used.
 * @throws CorruptIndexException
 *     if the taxonomy is corrupted.
 * @throws LockObtainFailedException
 *     if the taxonomy is locked by another writer. If it is known
 *     that no other concurrent writer is active, the lock might
 *     have been left around by an old dead process, and should be
 *     removed using {@link #unlock(Directory)}.
 * @throws IOException
 *     if another error occurred.
 */
public DirectoryTaxonomyWriter(Directory directory, OpenMode openMode,
    TaxonomyWriterCache cache)
    throws CorruptIndexException, LockObtainFailedException,
    IOException {
  // A brand-new (or recreated) taxonomy records a creation stamp, to be
  // written out with the next commit. NOTE: nanoTime() is used as an
  // opaque uniqueness stamp here, not as a wall-clock timestamp.
  if (!IndexReader.indexExists(directory) || openMode==OpenMode.CREATE) {
    taxoIndexCreateTime = Long.toString(System.nanoTime());
  }
  indexWriter = openIndexWriter(directory, openMode);
  reader = null;
  // Prepare the two reusable fields written for every category document:
  // the parent-ordinal payload stream and the stored full category path.
  parentStreamField = new Field(Consts.FIELD_PAYLOADS, parentStream);
  parentStreamField.setOmitNorms(true);
  fullPathField = new Field(Consts.FULL, "", Store.YES, Index.NOT_ANALYZED_NO_NORMS);
  fullPathField.setIndexOptions(IndexOptions.DOCS_ONLY);
  // Ordinals are doc IDs, so the next free ordinal is the doc count.
  this.nextID = indexWriter.maxDoc();
  if (cache==null) {
    cache = defaultTaxonomyWriterCache();
  }
  this.cache = cache;
  if (nextID == 0) {
    cacheIsComplete = true;
    // Make sure that the taxonomy always contain the root category
    // with category id 0.
    addCategory(new CategoryPath());
    refreshReader();
  } else {
    // There are some categories on the disk, which we have not yet
    // read into the cache, and therefore the cache is incomplete.
    // We chose not to read all the categories into the cache now,
    // to avoid terrible performance when a taxonomy index is opened
    // to add just a single category. We will do it later, after we
    // notice a few cache misses.
    cacheIsComplete = false;
  }
  cacheMisses = 0;
}
/**
 * A hook for extensions of this class to provide their own
 * {@link IndexWriter} implementation or instance. Extending classes can
 * instantiate and configure the {@link IndexWriter} as they see fit,
 * including setting a {@link org.apache.lucene.index.MergeScheduler}, or
 * {@link org.apache.lucene.index.IndexDeletionPolicy}, different RAM size
 * etc.<br>
 * <b>NOTE:</b> the instance this method returns will be closed upon calling
 * to {@link #close()}.
 *
 * @param directory
 *    the {@link Directory} on top of which an {@link IndexWriter}
 *    should be opened.
 * @param openMode
 *    see {@link OpenMode}
 * @return a new {@link IndexWriter} over the given directory
 * @throws IOException if the writer cannot be opened
 */
protected IndexWriter openIndexWriter(Directory directory, OpenMode openMode)
    throws IOException {
  // Make sure we use a MergePolicy which merges segments in-order and thus
  // keeps the doc IDs ordered as well (this is crucial for the taxonomy
  // index, whose ordinals ARE the doc IDs).
  IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_30,
      new KeywordAnalyzer()).setOpenMode(openMode).setMergePolicy(
      new LogByteSizeMergePolicy());
  return new IndexWriter(directory, config);
}
// Currently overridden by a unit test that verifies that every index we open
// is close()ed.
/**
 * Open an {@link IndexReader} from the {@link #indexWriter} member, by
 * calling {@link IndexWriter#getReader()}. Extending classes can override
 * this method to return their own {@link IndexReader}.
 *
 * @return a (near-real-time) reader over the taxonomy index
 * @throws IOException if the reader cannot be opened
 */
protected IndexReader openReader() throws IOException {
  // 'true' requests that deletions be applied; the taxonomy never deletes,
  // but this matches IndexReader.open's NRT contract.
  return IndexReader.open(indexWriter, true);
}
/**
 * Creates a new instance with a default cached as defined by
 * {@link #defaultTaxonomyWriterCache()}.
 *
 * @see #DirectoryTaxonomyWriter(Directory, OpenMode, TaxonomyWriterCache)
 */
public DirectoryTaxonomyWriter(Directory directory, OpenMode openMode)
    throws CorruptIndexException, LockObtainFailedException, IOException {
  this(directory, openMode, defaultTaxonomyWriterCache());
}
/**
 * Defines the default {@link TaxonomyWriterCache} to use in constructors
 * which do not specify one.
 * <P>
 * The current default is {@link Cl2oTaxonomyWriterCache} constructed
 * with the parameters (1024, 0.15f, 3), i.e., the entire taxonomy is
 * cached in memory while building it.
 *
 * @return a freshly constructed default cache instance
 */
public static TaxonomyWriterCache defaultTaxonomyWriterCache() {
  return new Cl2oTaxonomyWriterCache(1024, 0.15f, 3);
}
// convenience constructors:
/**
 * Creates a new instance in {@link OpenMode#CREATE_OR_APPEND} mode with the
 * default cache.
 */
public DirectoryTaxonomyWriter(Directory d)
    throws CorruptIndexException, LockObtainFailedException,
    IOException {
  this(d, OpenMode.CREATE_OR_APPEND);
}
/**
 * Frees used resources as well as closes the underlying {@link IndexWriter},
 * which commits whatever changes made to it to the underlying
 * {@link Directory}.
 * <p>
 * Closing an already-closed writer is a no-op. If the taxonomy was newly
 * created and its creation stamp has not yet been committed, it is
 * committed here first.
 */
public synchronized void close() throws CorruptIndexException, IOException {
  if (indexWriter == null) {
    return; // already closed
  }
  if (taxoIndexCreateTime != null) {
    // Persist the pending creation stamp before shutting down.
    indexWriter.commit(combinedCommitData(null));
    taxoIndexCreateTime = null;
  }
  doClose();
}
// Closes the IndexWriter (which commits pending changes) and then releases
// all remaining resources. Nulling indexWriter makes ensureOpen() throw
// from here on.
private void doClose() throws CorruptIndexException, IOException {
  indexWriter.close();
  indexWriter = null;
  closeResources();
}
/**
 * Returns the number of memory bytes used by the cache.
 * @return Number of cache bytes in memory, for CL2O only; zero otherwise.
 */
public int getCacheMemoryUsage() {
  ensureOpen();
  // instanceof is null-safe, so no separate null check is needed: a null
  // cache (e.g. after closeResources()) simply reports 0.
  if (cache instanceof Cl2oTaxonomyWriterCache) {
    return ((Cl2oTaxonomyWriterCache) cache).getMemoryUsage();
  }
  return 0;
}
/**
 * A hook for extending classes to close additional resources that were used.
 * The default implementation closes the {@link IndexReader} as well as the
 * {@link TaxonomyWriterCache} instances that were used. <br>
 * <b>NOTE:</b> if you override this method, you should include a
 * <code>super.closeResources()</code> call in your implementation.
 *
 * @throws IOException if closing the reader fails; the cache is closed
 *     regardless, so it can no longer leak.
 */
protected synchronized void closeResources() throws IOException {
  try {
    if (reader != null) {
      reader.close();
      reader = null;
    }
  } finally {
    // Close the cache even if reader.close() threw; previously an
    // exception from the reader would leak the cache.
    if (cache != null) {
      cache.close();
      cache = null;
    }
  }
}
/**
 * Look up the given category in the cache and/or the on-disk storage,
 * returning the category's ordinal, or a negative number in case the
 * category does not yet exist in the taxonomy.
 *
 * @param categoryPath the category to look up
 * @return the category's ordinal, or -1 if it is not in the taxonomy
 * @throws IOException if reading from the on-disk index fails
 */
protected int findCategory(CategoryPath categoryPath) throws IOException {
  // If we can find the category in our cache, we can return the
  // response directly from it:
  int res = cache.get(categoryPath);
  if (res >= 0) {
    return res;
  }
  // If we know that the cache is complete, i.e., contains every category
  // which exists, we can return -1 immediately. However, if the cache is
  // not complete, we need to check the disk.
  if (cacheIsComplete) {
    return -1;
  }
  cacheMisses++;
  // After a few cache misses, it makes sense to read all the categories
  // from disk and into the cache. The reason not to do this on the first
  // cache miss (or even when opening the writer) is that it will
  // significantly slow down the case when a taxonomy is opened just to
  // add one category. The idea of only spending a long time on reading
  // after enough time was spent on cache misses is known as an "online
  // algorithm".
  if (perhapsFillCache()) {
    return cache.get(categoryPath);
  }
  // We need to get an answer from the on-disk index. If a reader
  // is not yet open, do it now:
  if (reader == null) {
    reader = openReader();
  }
  TermDocs docs = reader.termDocs(new Term(Consts.FULL, categoryPath
      .toString(delimiter)));
  try {
    if (!docs.next()) {
      return -1; // category does not exist in taxonomy
    }
    // Note: we do NOT add to the cache the fact that the category
    // does not exist. The reason is that our only use for this
    // method is just before we actually add this category. If
    // in the future this usage changes, we should consider caching
    // the fact that the category is not in the taxonomy.
    int ordinal = docs.doc();
    addToCache(categoryPath, ordinal);
    return ordinal;
  } finally {
    docs.close(); // previously leaked on every path
  }
}
/**
 * Look up the given prefix of the given category in the cache and/or the
 * on-disk storage, returning that prefix's ordinal, or a negative number in
 * case the category does not yet exist in the taxonomy.
 *
 * @param categoryPath the category whose prefix is looked up
 * @param prefixLen number of leading components of the path to consider
 * @return the prefix category's ordinal, or -1 if not in the taxonomy
 * @throws IOException if reading from the on-disk index fails
 */
private int findCategory(CategoryPath categoryPath, int prefixLen)
    throws IOException {
  // Same lookup strategy as findCategory(CategoryPath): cache first, then
  // (if the cache is incomplete) the on-disk index.
  int res = cache.get(categoryPath, prefixLen);
  if (res >= 0) {
    return res;
  }
  if (cacheIsComplete) {
    return -1;
  }
  cacheMisses++;
  if (perhapsFillCache()) {
    return cache.get(categoryPath, prefixLen);
  }
  if (reader == null) {
    reader = openReader();
  }
  TermDocs docs = reader.termDocs(new Term(Consts.FULL, categoryPath
      .toString(delimiter, prefixLen)));
  try {
    if (!docs.next()) {
      return -1; // category does not exist in taxonomy
    }
    int ordinal = docs.doc();
    addToCache(categoryPath, prefixLen, ordinal);
    return ordinal;
  } finally {
    docs.close(); // previously leaked on every path
  }
}
// TODO (Facet): addCategory() is synchronized. This means that if indexing is
// multi-threaded, a new category that needs to be written to disk (and
// potentially even trigger a lengthy merge) locks out other addCategory()
// calls - even those which could immediately return a cached value.
// We definitely need to fix this situation!
/**
 * Adds the given category to the taxonomy if it is not already present,
 * and returns its ordinal (either the existing one, or the newly assigned
 * one). Missing ancestors are added as well, parents before children.
 */
public synchronized int addCategory(CategoryPath categoryPath) throws IOException {
  ensureOpen();
  // A category already known to the cache and/or the on-disk taxonomy
  // keeps its existing ordinal.
  int ordinal = findCategory(categoryPath);
  if (ordinal >= 0) {
    return ordinal;
  }
  // New category: internalAddCategory() recursively inserts it and any
  // missing ancestors into the index (and the cache), preserving the
  // invariant that a parent is always added before its child.
  return internalAddCategory(categoryPath, categoryPath.length());
}
/**
 * Add a new category into the index (and the cache), and return its new
 * ordinal.
 * <P>
 * Any ancestors of the category that are missing are added first, by
 * recursion, so that a parent is always added to the taxonomy before its
 * child.
 */
private int internalAddCategory(CategoryPath categoryPath, int length)
    throws CorruptIndexException, IOException {
  // Determine the parent's ordinal (recursively adding the parent category
  // to the taxonomy if it's not already there). The parent ordinal is
  // later written as a payload rather than a stored field, because
  // payloads can be bulk-read efficiently by LuceneTaxonomyReader.
  int parentOrdinal;
  if (length <= 0) {
    // The empty path (the root itself) has no parent.
    parentOrdinal = TaxonomyReader.INVALID_ORDINAL;
  } else if (length == 1) {
    // Top-level categories hang directly off the root.
    parentOrdinal = TaxonomyReader.ROOT_ORDINAL;
  } else {
    parentOrdinal = findCategory(categoryPath, length - 1);
    if (parentOrdinal < 0) {
      parentOrdinal = internalAddCategory(categoryPath, length - 1);
    }
  }
  return addCategoryDocument(categoryPath, length, parentOrdinal);
}
/**
 * Verifies that this instance wasn't closed, or throws
 * {@link AlreadyClosedException} if it is.
 */
protected final void ensureOpen() {
  // indexWriter is nulled out by doClose(); a non-null writer means we
  // are still open.
  if (indexWriter != null) {
    return;
  }
  throw new AlreadyClosedException("The taxonomy writer has already been closed");
}
// Note that the methods calling addCategoryDocument() are synchronized,
// so this method is effectively synchronized as well, but we'll add
// synchronized to be on the safe side, and we can reuse class-local objects
// instead of allocating them every time
/**
 * Writes one category document to the underlying index and returns the
 * ordinal (doc ID) assigned to it. The parent ordinal is encoded as the
 * position increment of the reusable parentStream; the full path is stored
 * in the FULL field. Also records the new category in the cache and in the
 * in-memory parent array.
 */
protected synchronized int addCategoryDocument(CategoryPath categoryPath,
    int length, int parent)
    throws CorruptIndexException, IOException {
  // Before Lucene 2.9, position increments >=0 were supported, so we
  // added 1 to parent to allow the parent -1 (the parent of the root).
  // Unfortunately, starting with Lucene 2.9, after LUCENE-1542, this is
  // no longer enough, since 0 is not encoded consistently either (see
  // comment in SinglePositionTokenStream). But because we must be
  // backward-compatible with existing indexes, we can't just fix what
  // we write here (e.g., to write parent+2), and need to do a workaround
  // in the reader (which knows that anyway only category 0 has a parent
  // -1).
  parentStream.set(parent+1);
  Document d = new Document();
  d.add(parentStreamField);
  fullPathField.setValue(categoryPath.toString(delimiter, length));
  d.add(fullPathField);
  // Note that we do not pass an Analyzer here because the fields that are
  // added to the Document are untokenized or contain their own TokenStream.
  // Therefore the IndexWriter's Analyzer has no effect.
  indexWriter.addDocument(d);
  // The doc ID Lucene assigned is exactly nextID, since doc IDs are
  // consecutive and merges keep them in order (see openIndexWriter()).
  int id = nextID++;
  addToCache(categoryPath, length, id);
  // also add to the parent array
  getParentArray().add(id, parent);
  return id;
}
/**
 * A {@link TokenStream} that emits exactly one token (with a fixed term
 * text) each time {@link #set(int)} is called, using the position
 * increment of that token to carry a caller-supplied integer value.
 */
private static class SinglePositionTokenStream extends TokenStream {
  private final CharTermAttribute termAtt;
  private final PositionIncrementAttribute posIncrAtt;
  // true once the single pending token has been consumed (or before any
  // set() call), i.e. the stream is currently exhausted.
  private boolean returned;
  public SinglePositionTokenStream(String word) {
    termAtt = addAttribute(CharTermAttribute.class);
    posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    termAtt.setEmpty().append(word);
    // Start exhausted - nothing is emitted until set() is called.
    returned = true;
  }
  /**
   * Set the value we want to keep, as the position increment.
   * Note that when TermPositions.nextPosition() is later used to
   * retrieve this value, val-1 will be returned, not val.
   * <P>
   * IMPORTANT NOTE: Before Lucene 2.9, val&gt;=0 were safe (for val==0,
   * the retrieved position would be -1). But starting with Lucene 2.9,
   * this unfortunately changed, and only val&gt;0 are safe. val=0 can
   * still be used, but don't count on the value you retrieve later
   * (it could be 0 or -1, depending on circumstances or versions).
   * This change is described in Lucene's JIRA: LUCENE-1542.
   */
  public void set(int val) {
    posIncrAtt.setPositionIncrement(val);
    returned = false;
  }
  @Override
  public boolean incrementToken() throws IOException {
    if (!returned) {
      // Emit the one pending token, then go back to exhausted.
      returned = true;
      return true;
    }
    return false;
  }
}
/**
 * Records a category/ordinal pair in the cache. If the put caused the
 * cache to evict entries, the cache is marked incomplete and the reader
 * is refreshed (see comment below for why).
 */
private void addToCache(CategoryPath categoryPath, int id)
    throws CorruptIndexException, IOException {
  if (!cache.put(categoryPath, id)) {
    return;
  }
  // cache.put() returning true means the cache was limited in size,
  // became full, and parts of it had to be cleared. We don't know which
  // part was cleared - possibly a relatively-new category that hasn't yet
  // been committed to disk (and so isn't visible in our "reader") was
  // evicted, which forces us to refresh the reader now.
  // Because this is a slow operation, cache implementations are expected
  // not to delete entries one-by-one but rather in bulk
  // (LruTaxonomyWriterCache removes the 2/3rd oldest entries).
  refreshReader();
  cacheIsComplete = false;
}
/**
 * Prefix variant of {@link #addToCache(CategoryPath, int)}: caches the
 * ordinal of the first {@code prefixLen} components of the path. Same
 * eviction handling as the two-argument overload.
 */
private void addToCache(CategoryPath categoryPath, int prefixLen, int id)
    throws CorruptIndexException, IOException {
  if (!cache.put(categoryPath, prefixLen, id)) {
    return;
  }
  // The cache evicted entries; see addToCache(CategoryPath, int) for the
  // rationale behind refreshing the reader here.
  refreshReader();
  cacheIsComplete = false;
}
/**
 * Re-opens the internal reader if one is open and the index has changed
 * since it was opened; a no-op when no reader is currently open.
 */
private synchronized void refreshReader() throws IOException {
  if (reader == null) {
    return; // nothing to refresh
  }
  IndexReader newReader = IndexReader.openIfChanged(reader);
  if (newReader != null) {
    // A newer view of the index exists; swap it in and release the old one.
    reader.close();
    reader = newReader;
  }
}
/**
 * Calling commit() ensures that all the categories written so far are
 * visible to a reader that is opened (or reopened) after that call.
 * When the index is closed(), commit() is also implicitly done.
 * See {@link TaxonomyWriter#commit()}
 */
public synchronized void commit() throws CorruptIndexException, IOException {
  ensureOpen();
  if (taxoIndexCreateTime == null) {
    indexWriter.commit();
  } else {
    // First commit after creation: persist the creation stamp as commit
    // user data, then clear it so it is written only once.
    indexWriter.commit(combinedCommitData(null));
    taxoIndexCreateTime = null;
  }
  refreshReader();
}
/**
 * Combine original user data with that of the taxonomy creation time.
 * The caller's map is never modified; a fresh map is returned.
 */
private Map<String,String> combinedCommitData(Map<String,String> userData) {
  Map<String,String> combined = new HashMap<String, String>();
  if (userData != null) {
    combined.putAll(userData);
  }
  combined.put(INDEX_CREATE_TIME, taxoIndexCreateTime);
  return combined;
}
/**
 * Like commit(), but also store properties with the index. These properties
 * are retrievable by {@link DirectoryTaxonomyReader#getCommitUserData}.
 * See {@link TaxonomyWriter#commit(Map)}.
 */
public synchronized void commit(Map<String,String> commitUserData) throws CorruptIndexException, IOException {
  ensureOpen();
  if (taxoIndexCreateTime == null) {
    indexWriter.commit(commitUserData);
  } else {
    // First commit after creation: merge the creation stamp into the
    // user data, then clear it so it is written only once.
    indexWriter.commit(combinedCommitData(commitUserData));
    taxoIndexCreateTime = null;
  }
  refreshReader();
}
/**
 * prepare most of the work needed for a two-phase commit.
 * See {@link IndexWriter#prepareCommit}.
 */
public synchronized void prepareCommit() throws CorruptIndexException, IOException {
  ensureOpen();
  if (taxoIndexCreateTime == null) {
    indexWriter.prepareCommit();
  } else {
    // Include the pending creation stamp in the prepared commit, and
    // clear it so it is only ever written once.
    indexWriter.prepareCommit(combinedCommitData(null));
    taxoIndexCreateTime = null;
  }
}
/**
 * Like above, and also prepares to store user data with the index.
 * See {@link IndexWriter#prepareCommit(Map)}
 */
public synchronized void prepareCommit(Map<String,String> commitUserData) throws CorruptIndexException, IOException {
  ensureOpen();
  if (taxoIndexCreateTime == null) {
    indexWriter.prepareCommit(commitUserData);
  } else {
    // Include the pending creation stamp alongside the caller's user
    // data, and clear it so it is only ever written once.
    indexWriter.prepareCommit(combinedCommitData(commitUserData));
    taxoIndexCreateTime = null;
  }
}
/**
 * getSize() returns the number of categories in the taxonomy.
 * <P>
 * Because categories are numbered consecutively starting with 0, it means
 * the taxonomy contains ordinals 0 through getSize()-1.
 * <P>
 * Note that the number returned by getSize() is often slightly higher than
 * the number of categories inserted into the taxonomy; This is because when
 * a category is added to the taxonomy, its ancestors are also added
 * automatically (including the root, which always get ordinal 0).
 */
synchronized public int getSize() {
  ensureOpen();
  // One document per category, and categories are never deleted, so the
  // doc count equals the category count.
  return indexWriter.maxDoc();
}
// Guards perhapsFillCache() so the expensive full read of the taxonomy is
// attempted at most once per writer instance.
private boolean alreadyCalledFillCache = false;
/**
 * Set the number of cache misses before an attempt is made to read the
 * entire taxonomy into the in-memory cache.
 * <P>
 * LuceneTaxonomyWriter holds an in-memory cache of recently seen
 * categories to speed up operation. On each cache-miss, the on-disk index
 * needs to be consulted. When an existing taxonomy is opened, a lot of
 * slow disk reads like that are needed until the cache is filled, so it
 * is more efficient to read the entire taxonomy into memory at once.
 * We do this complete read after a certain number (defined by this method)
 * of cache misses.
 * <P>
 * If the number is set to <CODE>0</CODE>, the entire taxonomy is read
 * into the cache on first use, without fetching individual categories
 * first.
 * <P>
 * Note that if the memory cache of choice is limited in size, and cannot
 * hold the entire content of the on-disk taxonomy, then it is never
 * read in its entirety into the cache, regardless of the setting of this
 * method.
 */
public void setCacheMissesUntilFill(int i) {
  ensureOpen();
  cacheMissesUntilFill = i;
}
// Default threshold of cache misses before the full-cache fill is tried.
private int cacheMissesUntilFill = 11;
/**
 * If enough cache misses have accumulated, reads the entire taxonomy into
 * the cache in one pass and closes the internal reader (it is no longer
 * needed once the cache is complete).
 *
 * @return true if the cache was filled (and is now complete), false if
 *     filling was skipped (not enough misses, already attempted, or the
 *     cache has no room for the whole taxonomy).
 * @throws IOException if reading the index fails
 */
private boolean perhapsFillCache() throws IOException {
  // Note: we assume that we're only called when cacheIsComplete==false.
  // TODO (Facet): parametrize this criterion:
  if (cacheMisses < cacheMissesUntilFill) {
    return false;
  }
  // If the cache was already filled (or we decided not to fill it because
  // there was no room), there is no sense in trying it again.
  if (alreadyCalledFillCache) {
    return false;
  }
  alreadyCalledFillCache = true;
  // TODO (Facet): we should probably completely clear the cache before starting
  // to read it?
  if (reader == null) {
    reader = openReader();
  }
  if (!cache.hasRoom(reader.numDocs())) {
    return false;
  }
  CategoryPath cp = new CategoryPath();
  TermDocs td = reader.termDocs();
  Term fullPathTerm = new Term(Consts.FULL);
  String field = fullPathTerm.field(); // needed so we can later use !=
  TermEnum terms = reader.terms(fullPathTerm);
  try {
    // The null check is done here to avoid checking it on every iteration
    // of the below loop. A null term will be returned if there are no
    // terms in the lexicon, or after the Consts.FULL term. However while
    // the loop is executed we're safe, because we only iterate as long as
    // there are next() terms.
    if (terms.term() != null) {
      do {
        Term t = terms.term();
        // Intentional reference comparison: term fields are interned,
        // and 'field' was taken from a Term for exactly this purpose.
        if (t.field() != field) break;
        // Since we guarantee uniqueness of categories, each term has exactly
        // one document. Also, since we do not allow removing categories (and
        // hence documents), there are no deletions in the index. Therefore, it
        // is sufficient to call next(), and then doc(), exactly once with no
        // 'validation' checks.
        td.seek(t);
        td.next();
        cp.clear();
        cp.add(t.text(), delimiter);
        cache.put(cp, td.doc());
      } while (terms.next());
    }
  } finally {
    // Both enumerators were previously leaked; close them even if the
    // loop throws (nested so a failure in one still closes the other).
    try {
      td.close();
    } finally {
      terms.close();
    }
  }
  cacheIsComplete = true;
  // No sense to keep the reader open - we will not need to read from it
  // if everything is in the cache.
  reader.close();
  reader = null;
  return true;
}
// Lazily-built in-memory mapping from each ordinal to its parent ordinal.
private ParentArray parentArray;
/**
 * Returns the parent-ordinal array, creating and filling it from the
 * index on first use (which may require opening the internal reader).
 */
private synchronized ParentArray getParentArray() throws IOException {
  if (parentArray != null) {
    return parentArray;
  }
  if (reader == null) {
    reader = openReader();
  }
  parentArray = new ParentArray();
  parentArray.refresh(reader);
  return parentArray;
}
/**
 * Returns the ordinal of the parent of the given category ordinal.
 *
 * @param ordinal an existing category ordinal (0 &lt;= ordinal &lt; getSize())
 * @return the parent's ordinal
 * @throws ArrayIndexOutOfBoundsException if the ordinal is out of range
 * @throws IOException if the parent array has to be read from the index
 *     and that read fails
 */
public int getParent(int ordinal) throws IOException {
  ensureOpen();
  // Note: the following if() just enforces that a user can never ask
  // for the parent of a nonexistent category - even if the parent array
  // was allocated bigger than it really needs to be.
  if (ordinal >= getSize()) {
    // Previously thrown without a message, which made failures hard to
    // diagnose; include the offending ordinal and the valid range.
    throw new ArrayIndexOutOfBoundsException("ordinal " + ordinal
        + " is out of range (taxonomy size is " + getSize() + ")");
  }
  return getParentArray().getArray()[ordinal];
}
/**
 * Take all the categories of one or more given taxonomies, and add them to
 * the main taxonomy (this), if they are not already there.
 * <P>
 * Additionally, fill a <I>mapping</I> for each of the added taxonomies,
 * mapping its ordinals to the ordinals in the enlarged main taxonomy.
 * These mapping are saved into an array of OrdinalMap objects given by the
 * user, one for each of the given taxonomies (not including "this", the main
 * taxonomy). Often the first of these will be a MemoryOrdinalMap and the
 * others will be a DiskOrdinalMap - see discussion in {OrdinalMap}.
 * <P>
 * Note that the taxonomies to be added are given as Directory objects,
 * not opened TaxonomyReader/TaxonomyWriter objects, so if any of them are
 * currently managed by an open TaxonomyWriter, make sure to commit() (or
 * close()) it first. The main taxonomy (this) is an open TaxonomyWriter,
 * and does not need to be commit()ed before this call.
 */
public void addTaxonomies(Directory[] taxonomies, OrdinalMap[] ordinalMaps) throws IOException {
  ensureOpen();
  // To prevent us stepping on the rest of this class's decisions on when
  // to open a reader, and when not, we'll be opening a new reader instead
  // of using the existing "reader" object:
  IndexReader mainreader = openReader();
  TermEnum mainte = mainreader.terms(new Term(Consts.FULL));
  // Open a reader and a term cursor over each of the taxonomies to merge in.
  IndexReader[] otherreaders = new IndexReader[taxonomies.length];
  TermEnum[] othertes = new TermEnum[taxonomies.length];
  for (int i=0; i<taxonomies.length; i++) {
    otherreaders[i] = IndexReader.open(taxonomies[i]);
    othertes[i] = otherreaders[i].terms(new Term(Consts.FULL));
    // Also tell the ordinal maps their expected sizes:
    ordinalMaps[i].setSize(otherreaders[i].numDocs());
  }
  CategoryPath cp = new CategoryPath();
  // We keep a "current" cursor over the alphabetically-ordered list of
  // categories in each taxonomy. We start the cursor on the first
  // (alphabetically) category of each taxonomy:
  String currentMain;
  String[] currentOthers = new String[taxonomies.length];
  currentMain = nextTE(mainte);
  int otherTaxonomiesLeft = 0;
  for (int i=0; i<taxonomies.length; i++) {
    currentOthers[i] = nextTE(othertes[i]);
    if (currentOthers[i]!=null) {
      otherTaxonomiesLeft++;
    }
  }
  // And then, at each step look at the first (alphabetically) of the
  // current taxonomies.
  // NOTE: The most efficient way we could have done this is using a
  // PriorityQueue. But for simplicity, and assuming that usually we'll
  // have a very small number of other taxonomies (often just 1), we use
  // a more naive algorithm (o(ntaxonomies) instead of o(ln ntaxonomies)
  // per step)
  while (otherTaxonomiesLeft>0) {
    // 'first' is the alphabetically-smallest category among the current
    // cursors of the other taxonomies.
    String first=null;
    for (int i=0; i<taxonomies.length; i++) {
      if (currentOthers[i]==null) continue;
      if (first==null || first.compareTo(currentOthers[i])>0) {
        first = currentOthers[i];
      }
    }
    int comp = 0;
    if (currentMain==null || (comp = currentMain.compareTo(first))>0) {
      // If 'first' is before currentMain, or currentMain is null,
      // then 'first' is a new category and we need to add it to the
      // main taxonomy. Then for all taxonomies with this 'first'
      // category, we need to add the new category number to their
      // map, and move to the next category in all of them.
      cp.clear();
      cp.add(first, delimiter);
      // We can call internalAddCategory() instead of addCategory()
      // because we know the category hasn't been seen yet.
      int newordinal = internalAddCategory(cp, cp.length());
      // TODO (Facet): we already had this term in our hands before, in nextTE...
      Term t = new Term(Consts.FULL, first);
      for (int i=0; i<taxonomies.length; i++) {
        if (first.equals(currentOthers[i])) {
          // remember the remapping of this ordinal. Note how
          // this requires reading a posting list from the index -
          // but since we do this in lexical order of terms, just
          // like Lucene's merge works, we hope there are few seeks.
          // TODO (Facet): is there a quicker way? E.g., not specifying the
          // next term by name every time?
          TermDocs td = otherreaders[i].termDocs(t);
          td.next(); // TODO (Facet): check?
          int origordinal = td.doc();
          ordinalMaps[i].addMapping(origordinal, newordinal);
          // and move to the next category in the i'th taxonomy
          currentOthers[i] = nextTE(othertes[i]);
          if (currentOthers[i]==null) {
            otherTaxonomiesLeft--;
          }
        }
      }
    } else if (comp==0) {
      // 'first' and currentMain are the same, so both the main and some
      // other taxonomies need to be moved, but a category doesn't need
      // to be added because it already existed in the main taxonomy.
      // TODO (Facet): Again, is there a quicker way?
      Term t = new Term(Consts.FULL, first);
      TermDocs td = mainreader.termDocs(t);
      td.next(); // TODO (Facet): check?
      int newordinal = td.doc();
      currentMain = nextTE(mainte);
      for (int i=0; i<taxonomies.length; i++) {
        if (first.equals(currentOthers[i])) {
          // TODO (Facet): again, is there a quicker way?
          td = otherreaders[i].termDocs(t);
          td.next(); // TODO (Facet): check?
          int origordinal = td.doc();
          ordinalMaps[i].addMapping(origordinal, newordinal);
          // and move to the next category
          currentOthers[i] = nextTE(othertes[i]);
          if (currentOthers[i]==null) {
            otherTaxonomiesLeft--;
          }
        }
      }
    } else /* comp > 0 */ {
      // The currentMain doesn't appear in any of the other taxonomies -
      // we don't need to do anything, just continue to the next one
      currentMain = nextTE(mainte);
    }
  }
  // Close all the readers we've opened, and also tell the ordinal maps
  // we're done adding to them
  // NOTE(review): these closes are not in a finally block, so an exception
  // above leaks the readers - consider hardening this.
  mainreader.close();
  for (int i=0; i<taxonomies.length; i++) {
    otherreaders[i].close();
    // We never actually added a mapping for the root ordinal - let's do
    // it now, just so that the map is complete (every ordinal between 0
    // and size-1 is remapped)
    ordinalMaps[i].addMapping(0, 0);
    ordinalMaps[i].addDone();
  }
}
/**
* Mapping from old ordinal to new ordinals, used when merging indexes
* with separate taxonomies.
* <p>
* addToTaxonomies() merges one or more taxonomies into the given taxonomy
* (this). An OrdinalMap is filled for each of the added taxonomies,
* containing the new ordinal (in the merged taxonomy) of each of the
* categories in the old taxonomy.
* <P>
* There exist two implementations of OrdinalMap: MemoryOrdinalMap and
* DiskOrdinalMap. As their names suggest, the former keeps the map in
* memory and the latter in a temporary disk file. Because these maps will
* later be needed one by one (to remap the counting lists), not all at the
* same time, it is recommended to put the first taxonomy's map in memory,
* and all the rest on disk (later to be automatically read into memory one
* by one, when needed).
*/
public static interface OrdinalMap {
/**
* Set the size of the map. This MUST be called before addMapping().
* It is assumed (but not verified) that addMapping() will then be
* called exactly 'size' times, with different origOrdinals between 0
* and size-1.
*
* @param size number of ordinals (categories) in the source taxonomy
* @throws IOException if the implementation's backing storage fails
*/
public void setSize(int size) throws IOException;
/**
* Record that the category numbered {@code origOrdinal} in the source
* taxonomy has ordinal {@code newOrdinal} in the merged taxonomy.
*
* @throws IOException if the implementation's backing storage fails
*/
public void addMapping(int origOrdinal, int newOrdinal) throws IOException;
/**
* Call addDone() to say that all addMapping() have been done.
* In some implementations this might free some resources.
*
* @throws IOException if releasing the implementation's resources fails
*/
public void addDone() throws IOException;
/**
* Return the map from the taxonomy's original (consecutive) ordinals
* to the new taxonomy's ordinals. If the map has to be read from disk
* and ordered appropriately, it is done when getMap() is called.
* getMap() should only be called once, and only when the map is actually
* needed. Calling it will also free all resources that the map might
* be holding (such as temporary disk space), other than the returned int[].
*
* @return array indexed by original ordinal, holding the new ordinal
* @throws IOException if reading the stored mapping fails
*/
public int[] getMap() throws IOException;
}
/**
* {@link OrdinalMap} backed by a plain in-memory int array.
*/
public static final class MemoryOrdinalMap implements OrdinalMap {
int[] map;
/** Allocates the backing array; must be called before any addMapping(). */
public void setSize(int taxonomySize) {
map = new int[taxonomySize];
}
/** Stores the new ordinal in the slot of the original ordinal. */
public void addMapping(int origOrdinal, int newOrdinal) {
map[origOrdinal] = newOrdinal;
}
/** Nothing to release for the in-memory implementation. */
public void addDone() {
// nothing to do
}
/** Returns the mapping array directly; it is already complete. */
public int[] getMap() {
return map;
}
}
/**
* {@link OrdinalMap} maintained on the file system. Pairs of
* (origOrdinal, newOrdinal) are appended to a temporary file as they are
* added, and read back into an int[] when {@link #getMap()} is called.
*/
public static final class DiskOrdinalMap implements OrdinalMap {
File tmpfile;
DataOutputStream out;
int[] map = null;
/**
* Opens the temporary file for writing the mapping pairs.
*
* @param tmpfile scratch file; deleted once getMap() has read it back
* @throws FileNotFoundException if the file cannot be created/opened
*/
public DiskOrdinalMap(File tmpfile) throws FileNotFoundException {
this.tmpfile = tmpfile;
out = new DataOutputStream(new BufferedOutputStream(
new FileOutputStream(tmpfile)));
}
/** Writes one (origOrdinal, newOrdinal) pair to the temporary file. */
public void addMapping(int origOrdinal, int newOrdinal) throws IOException {
out.writeInt(origOrdinal);
out.writeInt(newOrdinal);
}
/** Writes the map size as the file header; MUST precede addMapping(). */
public void setSize(int taxonomySize) throws IOException {
out.writeInt(taxonomySize);
}
/** Flushes and closes the output file; safe to call more than once. */
public void addDone() throws IOException {
if (out!=null) {
out.close();
out = null;
}
}
/**
* Reads the mapping back from the temporary file into an int[] and
* deletes the file. Subsequent calls return the cached array.
*/
public int[] getMap() throws IOException {
if (map!=null) {
return map;
}
addDone(); // in case this wasn't previously called
DataInputStream in = new DataInputStream(new BufferedInputStream(
new FileInputStream(tmpfile)));
// Read into a local first, so a partially-filled array is never
// published through 'map' if reading fails midway.
int[] result;
try {
result = new int[in.readInt()];
// NOTE: The current code assumes here that the map is complete,
// i.e., every ordinal gets one and exactly one value. Otherwise,
// we may run into an EOF here, or vice versa, not read everything.
for (int i=0; i<result.length; i++) {
int origordinal = in.readInt();
int newordinal = in.readInt();
result[origordinal] = newordinal;
}
} finally {
// Close even when reading throws (e.g. EOF on an incomplete map),
// so the file handle isn't leaked and the temp file stays deletable.
in.close();
}
map = result;
// Delete the temporary file, which is no longer needed.
if (!tmpfile.delete()) {
tmpfile.deleteOnExit();
}
return map;
}
}
/**
* Advances the given term enumeration and returns the text of the next
* term in the {@link Consts#FULL} field, or null when the enumeration is
* exhausted or has moved past that field.
*/
private static final String nextTE(TermEnum te) throws IOException {
if (!te.next()) {
return null;
}
Term term = te.term();
// If our enumeration reached a different field, we're done. Note
// how we're allowed compare string references, rather than the
// actual string's contents.
return term.field()==Consts.FULL ? term.text() : null;
}
/**
* Rollback changes to the taxonomy writer and closes the instance. Following
* this method the instance becomes unusable (calling any of its API methods
* will yield an {@link AlreadyClosedException}).
*
* @throws IOException if rolling back the underlying IndexWriter fails
*/
public void rollback() throws IOException {
// Fail fast if this writer was already closed.
ensureOpen();
indexWriter.rollback();
// since IndexWriter.rollback() closes the IW instance, we should close too.
doClose();
}
}