/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.jackrabbit.core.query.lucene;

import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.CountDownLatch;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexDeletionPolicy;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;
import org.apache.tika.io.IOExceptionWithCause;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Implements common functionality for a Lucene index.
 * <p>
 * Note on synchronization: This class is not entirely thread-safe. Certain
 * concurrent access is however allowed. Read-only access to this index using
 * {@link #getReadOnlyIndexReader()} is thread-safe. That is, multiple threads
 * may call that method concurrently and use the returned IndexReader at the
 * same time.<br/>
 * Modifying threads must be synchronized externally so that only one thread
 * at a time uses the IndexReader and IndexWriter instances returned by
 * {@link #getIndexReader()} and {@link #getIndexWriter()}.<br/>
 * Concurrent access by <b>one</b> modifying thread and multiple read-only
 * threads is safe!
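 * <p>
 * A minimal sketch of that contract (illustrative only; <code>index</code>
 * stands for an instance of a concrete subclass):
 * <pre>
 * // single modifying thread, synchronized externally
 * index.addDocuments(docs);
 *
 * // any number of concurrent search threads
 * ReadOnlyIndexReader reader = index.getReadOnlyIndexReader();
 * try {
 *     // ... run queries against the reader ...
 * } finally {
 *     reader.release();
 * }
 * </pre>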
 */
abstract class AbstractIndex {

    /** The logger instance for this class */
    private static final Logger log = LoggerFactory.getLogger(AbstractIndex.class);

    /** PrintStream that pipes all calls to println(String) into log.debug() */
    private static final LoggingPrintStream STREAM_LOGGER = new LoggingPrintStream();

    /** Executor with a pool size equal to the number of available processors */
    private final DynamicPooledExecutor executor = new DynamicPooledExecutor();

    /** The currently set IndexWriter or <code>null</code> if none is set */
    private IndexWriter indexWriter;

    /** The currently set IndexReader or <code>null</code> if none is set */
    private CommittableIndexReader indexReader;

    /** The underlying Directory where the index is stored */
    private Directory directory;

    /** Analyzer we use to tokenize text */
    private Analyzer analyzer;

    /** The similarity in use for indexing and searching. */
    private final Similarity similarity;

    /** Compound file flag */
    private boolean useCompoundFile = true;

    /** termInfosIndexDivisor config parameter */
    private int termInfosIndexDivisor = SearchIndex.DEFAULT_TERM_INFOS_INDEX_DIVISOR;

    /**
     * The document number cache if this index may use one.
     */
    private DocNumberCache cache;

    /** The shared IndexReader for all read-only IndexReaders */
    private SharedIndexReader sharedReader;

    /**
     * The most recent read-only reader if there is any.
     */
    private ReadOnlyIndexReader readOnlyReader;

    /**
     * The indexing queue.
     */
    private IndexingQueue indexingQueue;

    /**
     * Flag that indicates whether there was an index present in the directory
     * when this AbstractIndex was created.
     */
    private boolean isExisting;

    /**
     * Constructs an index with an <code>analyzer</code> and a
     * <code>directory</code>.
     *
     * @param analyzer      the analyzer for text tokenizing.
     * @param similarity    the similarity implementation.
     * @param directory     the underlying directory.
     * @param cache         the document number cache if this index should use
     *                      one; otherwise <code>cache</code> is
     *                      <code>null</code>.
     * @param indexingQueue the indexing queue.
     * @throws IOException if the index cannot be initialized.
     */
    AbstractIndex(Analyzer analyzer, Similarity similarity, Directory directory,
                  DocNumberCache cache, IndexingQueue indexingQueue)
            throws IOException {
        this.analyzer = analyzer;
        this.similarity = similarity;
        this.directory = directory;
        this.cache = cache;
        this.indexingQueue = indexingQueue;
        this.isExisting = IndexReader.indexExists(directory);

        if (!isExisting) {
            indexWriter = new IndexWriter(directory,
                    new IndexWriterConfig(Version.LUCENE_36, analyzer));
            // immediately close, now that the index has been created
            indexWriter.close();
            indexWriter = null;
        }
    }

    /**
     * Default implementation returns the same instance as passed
     * in the constructor.
     *
     * @return the directory instance passed in the constructor
     */
    Directory getDirectory() {
        return directory;
    }

    /**
     * Returns <code>true</code> if this index was opened on a directory with
     * an existing index in it; <code>false</code> otherwise.
     *
     * @return <code>true</code> if there was an index present when this index
     *         was created; <code>false</code> otherwise.
     */
    boolean isExisting() {
        return isExisting;
    }

    /**
     * Adds documents to this index and invalidates the shared reader.
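     * <p>
     * Documents are inverted in parallel on the internal executor and the
     * call blocks until all of them have been written. A minimal call-site
     * sketch (illustrative only; <code>createDocument</code> is a
     * hypothetical helper, not part of this class):
     * <pre>
     * Document[] docs = new Document[] { createDocument(node) };
     * index.addDocuments(docs);   // returns once all documents are inverted
     * </pre>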
     *
     * @param docs the documents to add.
     * @throws IOException if an error occurs while writing to the index.
     */
    void addDocuments(Document[] docs) throws IOException {
        final List<IOException> exceptions =
                Collections.synchronizedList(new ArrayList<IOException>());
        final CountDownLatch latch = new CountDownLatch(docs.length);
        final IndexWriter writer = getIndexWriter();
        for (final Document doc : docs) {
            executor.execute(new Runnable() {
                public void run() {
                    try {
                        // check if the text extractor completed its work
                        Document document = getFinishedDocument(doc);
                        if (log.isDebugEnabled()) {
                            long start = System.nanoTime();
                            writer.addDocument(document);
                            log.debug("Inverted a document in {}us",
                                    (System.nanoTime() - start) / 1000);
                        } else {
                            writer.addDocument(document);
                        }
                    } catch (IOException e) {
                        log.warn("Exception while inverting a document", e);
                        exceptions.add(e);
                    } finally {
                        latch.countDown();
                    }
                }
            });
        }
        for (;;) {
            try {
                latch.await();
                break;
            } catch (InterruptedException e) {
                // retry
            }
        }
        invalidateSharedReader();
        if (!exceptions.isEmpty()) {
            throw new IOExceptionWithCause(
                    exceptions.size() + " of " + docs.length
                    + " background indexer tasks failed", exceptions.get(0));
        }
    }

    /**
     * Removes the document from this index. This call will not invalidate
     * the shared reader. If a subclass wishes to do so, it should override
     * this method and call {@link #invalidateSharedReader()}.
     *
     * @param idTerm the id term of the document to remove.
     * @return number of documents deleted
     * @throws IOException if an error occurs while removing the document.
     */
    int removeDocument(Term idTerm) throws IOException {
        return getIndexReader().deleteDocuments(idTerm);
    }

    /**
     * Returns an <code>IndexReader</code> on this index. This index reader
     * may be used to delete documents.
     *
     * @return an <code>IndexReader</code> on this index.
     * @throws IOException if the reader cannot be obtained.
     */
    protected synchronized CommittableIndexReader getIndexReader()
            throws IOException {
        if (indexWriter != null) {
            indexWriter.close();
            log.debug("closing IndexWriter.");
            indexWriter = null;
        }

        if (indexReader == null) {
            IndexDeletionPolicy idp = getIndexDeletionPolicy();
            IndexReader reader = IndexReader.open(
                    getDirectory(), idp, false, termInfosIndexDivisor);
            indexReader = new CommittableIndexReader(reader);
        }
        return indexReader;
    }

    /**
     * Returns the index deletion policy for this index. This implementation
     * always returns <code>null</code>.
     *
     * @return the index deletion policy for this index or <code>null</code>
     *         if none is present.
     */
    protected IndexDeletionPolicy getIndexDeletionPolicy() {
        return null;
    }

    /**
     * Returns a read-only index reader that can be used concurrently with
     * other threads writing to this index. The returned index reader is
     * read-only, that is, any attempt to delete a document from the index
     * will throw an <code>UnsupportedOperationException</code>.
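     * <p>
     * The returned reader is acquired and must be released by the caller
     * when no longer needed. A minimal sketch (illustrative only):
     * <pre>
     * ReadOnlyIndexReader reader = getReadOnlyIndexReader(true);
     * try {
     *     // ... execute queries against the reader ...
     * } finally {
     *     reader.release();
     * }
     * </pre>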
     *
     * @param initCache if the caches in the index reader should be initialized
     *                  before the index reader is returned.
     * @return a read-only index reader.
     * @throws IOException if an error occurs while obtaining the index reader.
     */
    synchronized ReadOnlyIndexReader getReadOnlyIndexReader(boolean initCache)
            throws IOException {
        // get the current modifiable index reader
        CommittableIndexReader modifiableReader = getIndexReader();
        long modCount = modifiableReader.getModificationCount();
        if (readOnlyReader != null) {
            if (readOnlyReader.getDeletedDocsVersion() == modCount) {
                // reader up-to-date
                readOnlyReader.acquire();
                return readOnlyReader;
            } else {
                // reader outdated
                if (readOnlyReader.getRefCountJr() == 1) {
                    // not in use, except by this index
                    // -> update the reader
                    readOnlyReader.updateDeletedDocs(modifiableReader);
                    readOnlyReader.acquire();
                    return readOnlyReader;
                } else {
                    // cannot update the reader, it is still in use
                    // -> need to create a new instance
                    readOnlyReader.release();
                    readOnlyReader = null;
                }
            }
        }
        // if we get here there is no up-to-date read-only reader
        if (sharedReader == null) {
            // create a new shared reader
            IndexReader reader = IndexReader.open(
                    getDirectory(), termInfosIndexDivisor);
            CachingIndexReader cr = new CachingIndexReader(
                    reader, cache, initCache);
            sharedReader = new SharedIndexReader(cr);
        }
        readOnlyReader = new ReadOnlyIndexReader(
                sharedReader, modifiableReader.getDeletedDocs(), modCount);
        readOnlyReader.acquire();
        return readOnlyReader;
    }

    /**
     * Returns a read-only index reader that can be used concurrently with
     * other threads writing to this index. The returned index reader is
     * read-only, that is, any attempt to delete a document from the index
     * will throw an <code>UnsupportedOperationException</code>.
     *
     * @return a read-only index reader.
     * @throws IOException if an error occurs while obtaining the index reader.
     */
    protected ReadOnlyIndexReader getReadOnlyIndexReader() throws IOException {
        return getReadOnlyIndexReader(false);
    }

    /**
     * Returns an <code>IndexWriter</code> on this index.
     *
     * @return an <code>IndexWriter</code> on this index.
     * @throws IOException if the writer cannot be obtained.
     */
    protected synchronized IndexWriter getIndexWriter() throws IOException {
        if (indexReader != null) {
            indexReader.close();
            log.debug("closing IndexReader.");
            indexReader = null;
        }
        if (indexWriter == null) {
            IndexWriterConfig config =
                    new IndexWriterConfig(Version.LUCENE_36, analyzer);
            config.setSimilarity(similarity);
            LogMergePolicy mergePolicy = new LogByteSizeMergePolicy();
            mergePolicy.setUseCompoundFile(useCompoundFile);
            mergePolicy.setNoCFSRatio(1.0);
            config.setMergePolicy(mergePolicy);
            indexWriter = new IndexWriter(getDirectory(), config);
            indexWriter.setInfoStream(STREAM_LOGGER);
        }
        return indexWriter;
    }

    /**
     * Commits all pending changes to the underlying <code>Directory</code>.
     *
     * @throws IOException if an error occurs while committing changes.
     */
    protected void commit() throws IOException {
        commit(false);
    }

    /**
     * Commits all pending changes to the underlying <code>Directory</code>.
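     * <p>
     * With <code>optimize</code> set to <code>true</code>, the commit is
     * followed by a <code>forceMerge(1)</code>, i.e. the index is merged down
     * to a single segment. A minimal sketch (illustrative only):
     * <pre>
     * commit(true);   // flush pending changes, then merge to one segment
     * </pre>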
     *
     * @param optimize if <code>true</code> the index is optimized after the
     *                 commit.
     * @throws IOException if an error occurs while committing changes.
     */
    protected synchronized void commit(boolean optimize) throws IOException {
        if (indexReader != null) {
            log.debug("committing IndexReader.");
            indexReader.flush();
        }
        if (indexWriter != null) {
            log.debug("committing IndexWriter.");
            indexWriter.commit();
        }
        // optimize if requested
        if (optimize) {
            IndexWriter writer = getIndexWriter();
            writer.forceMerge(1, true);
            writer.close();
            indexWriter = null;
        }
    }

    /**
     * Closes this index, releasing all held resources.
     */
    synchronized void close() {
        releaseWriterAndReaders();
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                directory = null;
            }
        }
        executor.close();
    }

    /**
     * Releases all potentially held index writer and readers.
     */
    protected void releaseWriterAndReaders() {
        if (indexWriter != null) {
            try {
                indexWriter.close();
            } catch (IOException e) {
                log.warn("Exception closing index writer: " + e.toString());
            }
            indexWriter = null;
        }
        if (indexReader != null) {
            try {
                indexReader.close();
            } catch (IOException e) {
                log.warn("Exception closing index reader: " + e.toString());
            }
            indexReader = null;
        }
        if (readOnlyReader != null) {
            try {
                readOnlyReader.release();
            } catch (IOException e) {
                log.warn("Exception closing index reader: " + e.toString());
            }
            readOnlyReader = null;
        }
        if (sharedReader != null) {
            try {
                sharedReader.release();
            } catch (IOException e) {
                log.warn("Exception closing index reader: " + e.toString());
            }
            sharedReader = null;
        }
    }

    /**
     * @return the number of bytes this index occupies in memory.
     */
    synchronized long getRamSizeInBytes() {
        if (indexWriter != null) {
            return indexWriter.ramSizeInBytes();
        } else {
            return 0;
        }
    }

    /**
     * Closes the shared reader.
     *
     * @throws IOException if an error occurs while closing the reader.
     */
    protected synchronized void invalidateSharedReader() throws IOException {
        // also close the read-only reader
        if (readOnlyReader != null) {
            readOnlyReader.release();
            readOnlyReader = null;
        }
        // invalidate the shared reader
        if (sharedReader != null) {
            sharedReader.release();
            sharedReader = null;
        }
    }

    /**
     * Returns a document that is finished with text extraction and is ready
     * to be added to the index.
     *
     * @param doc the document to check.
     * @return <code>doc</code> if it is finished already or a stripped down
     *         copy of <code>doc</code> without text extractors.
     * @throws IOException if the document cannot be added to the indexing
     *                     queue.
     */
    private Document getFinishedDocument(Document doc) throws IOException {
        if (!Util.isDocumentReady(doc)) {
            Document copy = new Document();
            // mark the document that re-indexing is required
            copy.add(new Field(FieldNames.REINDEXING_REQUIRED, false, "",
                    Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS,
                    Field.TermVector.NO));
            for (Fieldable f : doc.getFields()) {
                Fieldable field = null;
                Field.TermVector tv = getTermVectorParameter(f);
                Field.Store stored = f.isStored() ?
                        Field.Store.YES : Field.Store.NO;
                Field.Index indexed = getIndexParameter(f);
                if (f instanceof LazyTextExtractorField || f.readerValue() != null) {
                    // replace all readers with an empty string reader
                    field = new Field(f.name(), new StringReader(""), tv);
                } else if (f.stringValue() != null) {
                    field = new Field(f.name(), false,
                            f.stringValue(), stored, indexed, tv);
                } else if (f.isBinary()) {
                    field = new Field(f.name(), f.getBinaryValue(), stored);
                } else if (f.tokenStreamValue() != null
                        && f.tokenStreamValue() instanceof SingletonTokenStream) {
                    TokenStream tokenStream = f.tokenStreamValue();
                    TermAttribute termAttribute =
                            tokenStream.addAttribute(TermAttribute.class);
                    PayloadAttribute payloadAttribute =
                            tokenStream.addAttribute(PayloadAttribute.class);
                    tokenStream.incrementToken();
                    String value = new String(termAttribute.termBuffer(),
                            0, termAttribute.termLength());
                    tokenStream.reset();
                    field = new Field(f.name(), new SingletonTokenStream(
                            value, (Payload) payloadAttribute.getPayload().clone()));
                }
                if (field != null) {
                    field.setOmitNorms(f.getOmitNorms());
                    copy.add(field);
                }
            }
            // schedule the original document for later indexing
            Document existing = indexingQueue.addDocument(doc);
            if (existing != null) {
                // the queue already contained a pending document for this
                // node -> dispose the document
                Util.disposeDocument(existing);
            }
            // use the stripped down copy for now
            doc = copy;
        }
        return doc;
    }

    //-------------------------< properties >----------------------------------

    /**
     * Whether the index writer should use the compound file format
     */
    void setUseCompoundFile(boolean b) {
        useCompoundFile = b;
    }

    /**
     * @return the current value for termInfosIndexDivisor.
     */
    public int getTermInfosIndexDivisor() {
        return termInfosIndexDivisor;
    }

    /**
     * Sets a new value for termInfosIndexDivisor.
     *
     * @param termInfosIndexDivisor the new value.
     */
    public void setTermInfosIndexDivisor(int termInfosIndexDivisor) {
        this.termInfosIndexDivisor = termInfosIndexDivisor;
    }

    //------------------------------< internal >-------------------------------

    /**
     * Returns the index parameter set on <code>f</code>.
     *
     * @param f a lucene field.
     * @return the index parameter on <code>f</code>.
     */
    private static Field.Index getIndexParameter(Fieldable f) {
        if (!f.isIndexed()) {
            return Field.Index.NO;
        } else if (f.isTokenized()) {
            return Field.Index.ANALYZED;
        } else {
            return Field.Index.NOT_ANALYZED;
        }
    }

    /**
     * Returns the term vector parameter set on <code>f</code>.
     *
     * @param f a lucene field.
     * @return the term vector parameter on <code>f</code>.
     */
    private static Field.TermVector getTermVectorParameter(Fieldable f) {
        if (f.isStorePositionWithTermVector() && f.isStoreOffsetWithTermVector()) {
            return Field.TermVector.WITH_POSITIONS_OFFSETS;
        } else if (f.isStorePositionWithTermVector()) {
            return Field.TermVector.WITH_POSITIONS;
        } else if (f.isStoreOffsetWithTermVector()) {
            return Field.TermVector.WITH_OFFSETS;
        } else if (f.isTermVectorStored()) {
            return Field.TermVector.YES;
        } else {
            return Field.TermVector.NO;
        }
    }

    /**
     * Adapter to pipe info messages from lucene into log messages.
     */
    private static final class LoggingPrintStream extends PrintStream {

        /** Buffer print calls until a newline is written */
        private StringBuffer buffer = new StringBuffer();

        public LoggingPrintStream() {
            super(new OutputStream() {
                public void write(int b) {
                    // do nothing
                }
            });
        }

        public void print(String s) {
            buffer.append(s);
        }

        public void println(String s) {
            buffer.append(s);
            log.debug(buffer.toString());
            buffer.setLength(0);
        }
    }
}