/* * Carrot2 project. * * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński. * All rights reserved. * * Refer to the full license file "carrot2.LICENSE" * in the root folder of the repository checkout or at: * http://www.carrot2.org/carrot2.LICENSE */ package org.carrot2.source.lucene; import java.io.IOException; import java.util.Collection; import java.util.IdentityHashMap; import java.util.Map; import org.apache.commons.lang.StringUtils; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.queryparser.classic.MultiFieldQueryParser; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.RAMDirectory; import org.carrot2.core.Document; import org.carrot2.core.Document.IDocumentSerializationListener; import org.carrot2.core.IControllerContext; import org.carrot2.core.IControllerContextListener; import org.carrot2.core.IDocumentSource; import org.carrot2.core.ProcessingComponentBase; import org.carrot2.core.ProcessingException; import org.carrot2.core.attribute.AttributeNames; import org.carrot2.core.attribute.CommonAttributes; import org.carrot2.core.attribute.Init; import org.carrot2.core.attribute.Internal; import org.carrot2.core.attribute.Processing; import org.carrot2.source.SearchEngineResponse; import org.carrot2.util.ExceptionUtils; import org.carrot2.util.attribute.Attribute; import org.carrot2.util.attribute.AttributeLevel; import org.carrot2.util.attribute.AttributeUtils; import org.carrot2.util.attribute.Bindable; import org.carrot2.util.attribute.DefaultGroups; import org.carrot2.util.attribute.Group; import org.carrot2.util.attribute.Input; import org.carrot2.util.attribute.Label; import org.carrot2.util.attribute.Level; import org.carrot2.util.attribute.Output; import org.carrot2.util.attribute.Required; import org.carrot2.util.attribute.constraint.ImplementingClasses; import org.carrot2.util.attribute.constraint.IntRange; import org.carrot2.util.attribute.constraint.NotBlank; import org.carrot2.util.simplexml.SimpleXmlWrappers; import org.slf4j.Logger; import org.carrot2.shaded.guava.common.collect.Maps; /** * A {@link IDocumentSource} fetching {@link Document}s from a local Apache Lucene index. * The index should be binary-compatible with the Lucene version actually imported by this * plugin. */ @Bindable(prefix = "LuceneDocumentSource", inherit = CommonAttributes.class) public final class LuceneDocumentSource extends ProcessingComponentBase implements IDocumentSource { protected final static String INDEX_PROPERTIES = "Index properties"; /** Logger for this class. */ private final static Logger logger = org.slf4j.LoggerFactory .getLogger(LuceneDocumentSource.class); /* * Register selected SimpleXML wrappers for Lucene data types. */ static { SimpleXmlWrappers.addWrapper( FSDirectory.class, FSDirectoryWrapper.class, false); } @Processing @Input @Attribute(key = AttributeNames.RESULTS, inherit = true) @IntRange(min = 1) public int results = 100; @Processing @Output @Attribute(key = AttributeNames.RESULTS_TOTAL, inherit = true) public long resultsTotal; @Processing @Output @Attribute(key = AttributeNames.DOCUMENTS, inherit = true) @Internal public Collection<Document> documents; /** * Search index {@link org.apache.lucene.store.Directory}. Must be unlocked for * reading. */ @Input @Attribute @Init @Processing @Required @Internal(configuration = true) @ImplementingClasses(classes = { RAMDirectory.class, FSDirectory.class }, strict = false) @Label("Index directory") @Level(AttributeLevel.BASIC) @Group(INDEX_PROPERTIES) public Directory directory; /** * {@link org.apache.lucene.analysis.Analyzer} used at indexing time. The same * analyzer should be used for querying. */ @Input @Init @Processing @Required @Attribute @Internal(configuration = false) @ImplementingClasses(classes = { /* No suggestions for default implementations. */ }, strict = false) @Label("Analyzer") @Level(AttributeLevel.MEDIUM) @Group(INDEX_PROPERTIES) public Analyzer analyzer = new StandardAnalyzer(); /** * {@link IFieldMapper} provides the link between Carrot2 * {@link org.carrot2.core.Document} fields and Lucene index fields. */ @Input @Init @Processing @Required @Attribute @Internal @ImplementingClasses(classes = { SimpleFieldMapper.class }, strict = false) @Label("Field mapper") @Level(AttributeLevel.ADVANCED) @Group(SimpleFieldMapper.INDEX_FIELD_MAPPING) public IFieldMapper fieldMapper = new SimpleFieldMapper(); /** * A pre-parsed {@link org.apache.lucene.search.Query} object or a {@link String} * parsed using the built-in classic QueryParser over a * set of search fields returned from the {@link #fieldMapper}. */ @Input @Processing @Attribute(key = AttributeNames.QUERY, inherit = false) // false intentional! @Required @ImplementingClasses(classes = { Query.class, String.class }, strict = false) @NotBlank @Label("Query") @Level(AttributeLevel.BASIC) @Group(DefaultGroups.QUERY) public Object query; /** * Keeps references to Lucene document instances in Carrot2 documents. Please bear in * mind two limitations: * <ul> * <li><strong>Lucene documents will not be serialized to XML/JSON.</strong> * Therefore, they can only be accessed when invoking clustering through Carrot2 Java * API. To pass some of the fields of Lucene documents to Carrot2 XML/JSON output, * implement a custom {@link IFieldMapper} that will store those fields as regular * Carrot2 fields.</li> * <li><strong>Increased memory usage</strong> when using a {@link org.carrot2.core.Controller} * {@link org.carrot2.core.ControllerFactory#createCachingPooling(Class...) configured to cache} the * output from {@link LuceneDocumentSource}.</li> * </ul> */ @Input @Processing @Attribute @Internal @Label("Keep Lucene documents") @Level(AttributeLevel.ADVANCED) @Group(DefaultGroups.RESULT_INFO) public boolean keepLuceneDocuments = false; /** * Carrot2 {@link Document} field that stores the original Lucene document instance. * Keeping of Lucene document instances is disabled by default. Enable it using the * {@link #keepLuceneDocuments} attribute. */ public final static String LUCENE_DOCUMENT_FIELD = "luceneDocument"; /** * A context-shared map between {@link org.apache.lucene.store.Directory} objects and * any opened {@link org.apache.lucene.search.IndexSearcher}s. */ private IdentityHashMap<Directory, IndexSearcher> openIndexes; /** * Controller context serving as the synchronization monitor when opening indices. */ private IControllerContext context; /** * A serialization listener that prevents Lucene documents from appearing in the * Carrot2 documents serialized to XML/JSON. */ private static final IDocumentSerializationListener removeLuceneDocument = new IDocumentSerializationListener() { @Override public void beforeSerialization(Document document, Map<String, ?> otherFieldsForSerialization) { otherFieldsForSerialization.remove(LUCENE_DOCUMENT_FIELD); } }; /* * */ @SuppressWarnings("unchecked") @Override public void init(IControllerContext context) { super.init(context); this.context = context; synchronized (context) { final String key = AttributeUtils.getKey(getClass(), "openIndexes"); if (context.getAttribute(key) == null) { context.setAttribute(key, Maps.newIdentityHashMap()); context.addListener(new IControllerContextListener() { public void beforeDisposal(IControllerContext context) { closeAllIndexes(); } }); } this.openIndexes = (IdentityHashMap<Directory, IndexSearcher>) context .getAttribute(key); } } /* * */ public void process() throws ProcessingException { try { final SearchEngineResponse response = fetchSearchResponse(); documents = response.results; resultsTotal = response.getResultsTotal(); } catch (Exception e) { throw ExceptionUtils.wrapAs(ProcessingException.class, e); } } /** * Fetch search engine response. */ protected SearchEngineResponse fetchSearchResponse() throws Exception { if (directory == null) { throw new ProcessingException("Directory attribute must not be empty."); } if (this.query instanceof String) { final String [] searchFields = fieldMapper.getSearchFields(); if (searchFields == null || searchFields.length == 0) { throw new ProcessingException( "At least one search field must be given for a plain text query. " + "Alternatively, use a Lucene Query object."); } final String textQuery = (String) query; if (StringUtils.isEmpty(textQuery)) { throw new ProcessingException( "An instantiated Lucene Query object or a non-empty " + "plain text query is required."); } if (searchFields.length == 1) { query = new QueryParser(searchFields[0], analyzer) .parse(textQuery); } else { query = new MultiFieldQueryParser(searchFields, analyzer).parse(textQuery); } } final SearchEngineResponse response = new SearchEngineResponse(); final IndexSearcher searcher = indexOpen(directory); final TopDocs docs = searcher.search((Query) query, results); response.metadata.put(SearchEngineResponse.RESULTS_TOTAL_KEY, docs.totalHits); for (ScoreDoc scoreDoc : docs.scoreDocs) { final Document doc = new Document(); final org.apache.lucene.document.Document luceneDoc = searcher .doc(scoreDoc.doc); // Set score before mapping to give the mapper a chance to override it doc.setScore((double) scoreDoc.score); if (keepLuceneDocuments) { doc.setField(LUCENE_DOCUMENT_FIELD, luceneDoc); doc.addSerializationListener(removeLuceneDocument); } this.fieldMapper.map((Query) query, analyzer, luceneDoc, doc); response.results.add(doc); } return response; } /** * Close all opened indexes in the shared context. */ private void closeAllIndexes() { synchronized (context) { for (IndexSearcher searcher : openIndexes.values()) { try { searcher.getIndexReader().close(); } catch (IOException e) { logger.warn("Could not close search index: " + searcher, e); } } } } /** * Open or retrieve an open handle to an {@link IndexSearcher}. */ private IndexSearcher indexOpen(Directory directory) throws ProcessingException { synchronized (context) { IndexSearcher searcher = openIndexes.get(directory); if (searcher == null) { try { searcher = new IndexSearcher(DirectoryReader.open(directory)); openIndexes.put(directory, searcher); } catch (IOException e) { throw ExceptionUtils.wrapAs(ProcessingException.class, e); } } return searcher; } } }