/******************************************************************************* * Copyright (c) 2004, 2007 IBM Corporation and Cambridge Semantics Incorporated. * All rights reserved. This program and the accompanying materials * are made available under the terms of the Eclipse Public License v1.0 * which accompanies this distribution, and is available at * http://www.eclipse.org/legal/epl-v10.html * * File: $Source: /cvsroot/slrp/common/com.ibm.adtech.indexer.lucene/src/com/ibm/adtech/indexer/lucene/LuceneIndexerBase.java,v $ * Created by: Wing Yung ( <a href="mailto:wingyung@us.ibm.com">wingyung@us.ibm.com </a>) * Created on: 10/11/2005 * Revision: $Id: LuceneIndexerBase.java 169 2007-07-31 14:11:15Z mroy $ * * Contributors: * IBM Corporation - initial API and implementation * Cambridge Semantics Incorporated - Fork to Anzo *******************************************************************************/ package org.openanzo.indexer.lucene; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.util.Dictionary; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.MergeScheduler; import org.apache.lucene.index.Term; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.util.Version; import org.openanzo.exceptions.ExceptionConstants; import org.openanzo.exceptions.LogUtils; import org.openanzo.indexer.IIndexer; import org.openanzo.indexer.IndexerDictionary; import org.openanzo.indexer.IndexerException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Configures lucene directory from provided anzo properties file and provides basic lucene IndexWriter lifecycle management. * * @param <T> * Type of objects that will be indexed * @param <F> * Type of object that will be source of rebuild * * @author Wing Yung (<a href="mailto:wingyung@us.ibm.com">wingyung@us.ibm.com</a>) */ public abstract class LuceneIndexerBase<T, F> implements IIndexer<T, F> { private static final Logger log = LoggerFactory.getLogger(LuceneIndexerBase.class); protected String location = null; protected final StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); protected IndexWriter indexWriter; protected boolean needsIndexRebuild = false; /** * Initializes the indexer. * * The properties object should contain the location of the directory containing the index (org.openanzo.indexer.lucene.indexLocation) and whether or not * the index should be cleared upon initialization (org.openanzo.indexer.indexClear). * * Note that there are additional Lucene properties that can be set via Java system properties. For example, org.apache.lucene.writeLockTimeout (in ms) * * @throws IndexerException * {@link ExceptionConstants.INDEX#INDEX_CONFIG_PARAM_MISSING} if the {@link LuceneProperties#KEY_LUCENE_INDEX_LOCATION} property is missing */ public void initialize(Dictionary<? extends Object, ? extends Object> configProperties) throws IndexerException { location = LuceneDictionary.getIndexLocation(configProperties); if (location == null) { throw new IndexerException(ExceptionConstants.INDEX.INDEX_CONFIG_PARAM_MISSING, LuceneProperties.KEY_LUCENE_INDEX_LOCATION); } String indexerHome = LuceneDictionary.getIndexerHome(configProperties); location = getAbsoluteIndexLocation(location, indexerHome); boolean clear = IndexerDictionary.getIndexClear(configProperties); boolean removeLock = LuceneDictionary.getRemoveLockFile(configProperties); needsIndexRebuild = initialize(location, clear, removeLock); } /** * * @param rebuildIndex * @param removeLock * @param location * @param indexHome * @throws IndexerException */ public void initialize(boolean rebuildIndex, boolean removeLock, String location, String indexHome) throws IndexerException { this.location = location; if (location == null) { throw new IndexerException(ExceptionConstants.INDEX.INDEX_CONFIG_PARAM_MISSING, LuceneProperties.KEY_LUCENE_INDEX_LOCATION); } location = getAbsoluteIndexLocation(location, indexHome); needsIndexRebuild = initialize(location, rebuildIndex, removeLock); } public boolean needsIndexRebuild() { return needsIndexRebuild; } /** * If an relative index path is provided, attempt to use the ANZO_HOME property to create an absolute file path. * * @param indexDirectoryPath * @param properties * @return the absolute path to the index directory */ protected static String getAbsoluteIndexLocation(String indexDirectoryPath, String indexerHome) { if (indexDirectoryPath == null) return null; if (indexerHome != null) { File indexerDir = new File(indexerHome); File indexDirectoryFile = new File(indexDirectoryPath); if (!indexDirectoryFile.isAbsolute() && indexerDir.exists() && indexerDir.isDirectory()) { File absoluteLocation = new File(indexerDir, indexDirectoryPath); return absoluteLocation.getAbsolutePath(); } } return indexDirectoryPath; } /** * Initializes the indexer. * * @param location * the location of the directory containing the index * @param indexClear * whether or not the index should be cleared upon initialization * @param removeLock * whether or not the index's lock should be removed upon initialization * @throws IndexerException * {@link ExceptionConstants.INDEX#LOCK_FILE_NO_EXIST} if there was an exception trying to remove a lock file that didn't exist * @throws IndexerException * {@link ExceptionConstants.INDEX#FAILED_DELETE_LOCK_FILE} if there was an exception trying to remove a lock file * @throws IndexerException * {@link ExceptionConstants.INDEX#NO_DELETE_LOCK_FILE} if there was a lock file, but removeLock was false * @throws IndexerException * {@link ExceptionConstants.INDEX#FAILED_INDEX_INIT} if there was an exception initializing the index * @return true if the index has to be rebuilt */ public boolean initialize(String location, boolean indexClear, boolean removeLock) throws IndexerException { boolean clearTheIndex = indexClear; try { File f = new File(location); log.info(LogUtils.DATASOURCE_MARKER, "lucene index location:{}", location); if (!f.exists() || f.list().length == 0) { // Create a new index if there isn't one there already. log.info(LogUtils.DATASOURCE_MARKER, "Creating new index at location:{}", new String[] { location }); clearTheIndex = getIndexWriter(location, true); } else { clearTheIndex = getIndexWriter(location, indexClear); } } catch (IOException e) { Exception currException = e; while (currException != null && currException.getMessage().contains(LuceneConstants.LockExceptionMessage)) { Pattern p = Pattern.compile(LuceneConstants.LockFileExpression); Matcher m = p.matcher(e.getMessage()); String lockfilename = null; if (m.find()) { lockfilename = m.group(1); } // Depending on whether it's possible to have multiple locks (probably would // require some strange sequence of events), may want to put this into a // while loop. if (removeLock) { if (lockfilename != null) { File lockfile = new File(lockfilename); if (!lockfile.exists()) { throw new IndexerException(ExceptionConstants.INDEX.LOCK_FILE_NO_EXIST, lockfilename); } if (!lockfile.delete()) { throw new IndexerException(ExceptionConstants.INDEX.FAILED_DELETE_LOCK_FILE, lockfilename); } log.info(LogUtils.DATASOURCE_MARKER, "Deleted indexer file:{} ", lockfilename); try { clearTheIndex = getIndexWriter(location, indexClear); currException = null; } catch (IOException e1) { currException = e1; } } else { throw new IndexerException(ExceptionConstants.INDEX.FAILED_DELETE_LOCK_FILE, e); } } else { throw new IndexerException(ExceptionConstants.INDEX.NO_DELETE_LOCK_FILE, lockfilename); } } if (currException != null) { throw new IndexerException(ExceptionConstants.INDEX.FAILED_INDEX_INIT, currException); } } if (log.isInfoEnabled()) { log.info(LogUtils.DATASOURCE_MARKER, "Index Location:{}", location); if (clearTheIndex) { log.info(LogUtils.DATASOURCE_MARKER, "Index Clear"); } else { log.info(LogUtils.DATASOURCE_MARKER, "Index Don't clear"); } } return clearTheIndex; } private boolean getIndexWriter(String location, boolean indexClear) throws CorruptIndexException, LockObtainFailedException, IOException { boolean indexCleared = false; try { indexWriter = new IndexWriter(FSDirectory.open(new File(location)), analyzer, indexClear, new IndexWriter.MaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH)); indexCleared = indexClear; } catch (CorruptIndexException cie) { log.error(LogUtils.DATASOURCE_MARKER, "Corrupt index error:" + cie); indexWriter = new IndexWriter(FSDirectory.open(new File(location)), analyzer, true, new IndexWriter.MaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH)); indexCleared = true; } catch (FileNotFoundException cie) { log.error(LogUtils.DATASOURCE_MARKER, "Corrupt index error:" + cie); indexWriter = new IndexWriter(FSDirectory.open(new File(location)), analyzer, true, new IndexWriter.MaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH)); indexCleared = true; } indexWriter.setMaxBufferedDocs(20000); MergeScheduler mergeScheduler = indexWriter.getMergeScheduler(); if (mergeScheduler instanceof ConcurrentMergeScheduler) { ConcurrentMergeScheduler cScheduler = (ConcurrentMergeScheduler) mergeScheduler; cScheduler.setMergeThreadPriority(Thread.MIN_PRIORITY); } return indexCleared; } /** * A no-op. IndexWriter takes care of all of the reading/writing in a threadsafe manner. * * @throws IndexerException */ public void preIndex() throws IndexerException { } /** * Flushes and optimizes the index. * * @throws IndexerException */ public void postIndex() throws IndexerException { try { if (indexWriter != null) indexWriter.commit(); } catch (IOException e) { log.error(LogUtils.DATASOURCE_MARKER, "Couldn't flush the index.", e); } } /** * A no-op. IndexWriter takes care of all of the reading/writing in a thread safe manner. * * @throws IndexerException */ public void preRemove() throws IndexerException { } /** * Flushes and optimizes the index. * * @throws IndexerException */ public void postRemove() throws IndexerException { try { indexWriter.commit(); } catch (IOException e) { log.error(LogUtils.DATASOURCE_MARKER, "Couldn't flush the indexs:", e); } } /** * @throws IndexerException * {@link ExceptionConstants.INDEX#FAILED_INDEX_CLEAR} if there was an exception clearing the index */ public void clear() throws IndexerException { try { if (indexWriter != null) { indexWriter.close(); indexWriter = null; } indexWriter = new IndexWriter(FSDirectory.open(new File(location)), analyzer, true, new IndexWriter.MaxFieldLength(IndexWriter.DEFAULT_MAX_FIELD_LENGTH)); log.trace(LogUtils.DATASOURCE_MARKER, "Index cleared"); } catch (IOException ioe) { throw new IndexerException(ExceptionConstants.INDEX.FAILED_INDEX_CLEAR, ioe); } } /** * Closes the resources used by the indexer. * * Closes the IndexWriter. * * @throws IndexerException * {@link ExceptionConstants.INDEX#FAILED_INDEX_CLOSE} if there was an exception closing the index */ public void close() throws IndexerException { try { indexWriter.close(); } catch (IOException e) { throw new IndexerException(ExceptionConstants.INDEX.FAILED_INDEX_CLOSE, e); } } /** * Add document to index * * @param doc * document to index * @throws IndexerException */ public void addDocument(Document doc) throws IndexerException { try { indexWriter.addDocument(doc); log.trace(LogUtils.DATASOURCE_MARKER, "statements added to index"); } catch (IOException e) { throw new IndexerException(ExceptionConstants.INDEX.FAILED_CREATE_DOC, e); } } /** * Delete document from index * * @param term * term to delete * @throws IndexerException */ public void deleteDocuments(Term term) throws IndexerException { try { indexWriter.deleteDocuments(term); log.trace(LogUtils.DATASOURCE_MARKER, "statements deleted from index"); } catch (IOException e) { throw new IndexerException(ExceptionConstants.INDEX.FAILED_DELETE_DOC, e); } } }