/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.examples.clustering;
import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.FSDirectory;
import org.carrot2.clustering.lingo.LingoClusteringAlgorithm;
import org.carrot2.core.Controller;
import org.carrot2.core.ControllerFactory;
import org.carrot2.core.ProcessingComponentConfiguration;
import org.carrot2.core.ProcessingResult;
import org.carrot2.core.attribute.CommonAttributesDescriptor;
import org.carrot2.examples.ConsoleFormatter;
import org.carrot2.examples.CreateLuceneIndex;
import org.carrot2.source.lucene.IFieldMapper;
import org.carrot2.source.lucene.LuceneDocumentSource;
import org.carrot2.source.lucene.LuceneDocumentSourceDescriptor;
import org.carrot2.source.lucene.SimpleFieldMapper;
import org.carrot2.util.annotations.ThreadSafe;
import org.carrot2.util.attribute.IObjectFactory;
import org.carrot2.shaded.guava.common.collect.Maps;
/**
 * This example shows how to apply custom processing to documents returned by the
 * {@link LuceneDocumentSource}.
 * <p>
 * It is assumed that you are familiar with {@link ClusteringDocumentList},
 * {@link UsingCachingController} and {@link ClusteringDataFromLucene} examples.
 *
 * @see CreateLuceneIndex
 * @see ClusteringDataFromLucene
 * @see ClusteringDocumentList
 * @see UsingCachingController
 */
public class ClusteringDataFromLuceneWithCustomFields
{
    /**
     * Entry point. Clusters the documents matching a fixed query in a Lucene index,
     * using a custom field mapper, and prints the clusters to the console.
     *
     * @param args optional; when at least one argument is given, the first one is used
     *        as the path to the Lucene index directory
     * @throws IOException if opening or closing the Lucene index directory fails
     */
    public static void main(String [] args) throws IOException
    {
        String indexPath = "put your index path here or pass as the first argument";
        if (args.length > 0)
        {
            indexPath = args[0];
        }

        // Sanity check. Done before any resources are created so that an early
        // return does not leave anything behind.
        if (!new File(indexPath).isDirectory())
        {
            System.err.println("Index directory does not exist: " + indexPath);
            return;
        }

        // The directory is opened by this method, so this method closes it as well.
        try (FSDirectory indexDirectory = FSDirectory.open(Paths.get(indexPath)))
        {
            /*
             * We will use a pooling controller for this example. Running
             * LuceneDocumentSource within a pooling controller will let us open the
             * index once per component initialization and not once per query, which
             * would be the case with a simple controller. We will also use this
             * opportunity to show how component-specific attribute values can be
             * passed during component initialization.
             *
             * The controller created below will reuse processing component instances,
             * but will not perform any caching of results produced by components. We
             * will leave caching of documents from Lucene index to Lucene and the
             * operating system caches.
             */
            final Controller controller = ControllerFactory.createPooling();

            /*
             * Prepare a map with component-specific attributes. Here, this map will
             * contain the index location, the field mapper and the analyzer factory.
             */
            final Map<String, Object> luceneGlobalAttributes = new HashMap<>();

            LuceneDocumentSourceDescriptor
                .attributeBuilder(luceneGlobalAttributes)
                .directory(indexDirectory);

            /*
             * In ClusteringDataFromLucene we used a simple configuration of
             * LuceneDocumentSource whereby we only provided the names of Lucene fields
             * to be used for titles and summaries. If more advanced mapping of Lucene
             * documents is required, you can implement your own version of IFieldMapper
             * as below.
             *
             * Here we provide an instance of the mapper. Note that we could also
             * provide the mapper's class instead. The differences are summarized below:
             *
             * > Class: Class has to have a no-parameter constructor. Instances of the
             *   class will not be shared between processing threads, which means the
             *   implementation does not have to be thread-safe. Recommended in most
             *   situations unless the instances are expensive to create.
             *
             * > Instance: The provided instance will be shared across processing
             *   threads, which means the implementation MUST be thread-safe (our
             *   CustomFieldMapper keeps no state and is marked @ThreadSafe).
             */
            LuceneDocumentSourceDescriptor
                .attributeBuilder(luceneGlobalAttributes)
                .fieldMapper(new CustomFieldMapper());

            /*
             * The Analyzer used by Lucene while searching can also be provided via
             * factory because it does not have a parameterless constructor.
             */
            LuceneDocumentSourceDescriptor
                .attributeBuilder(luceneGlobalAttributes)
                .analyzer(StandardAnalyzerFactory.class);

            /*
             * Initialize the controller passing the above attributes as
             * component-specific for Lucene. The global attributes map will be empty.
             * Note that we've provided an identifier for our specially-configured
             * Lucene component, we'll need to use this identifier when performing
             * processing.
             */
            controller.init(
                new HashMap<String, Object>(),
                new ProcessingComponentConfiguration(
                    LuceneDocumentSource.class, "lucene", luceneGlobalAttributes));

            try
            {
                /*
                 * Perform processing.
                 */
                final String query = "mining";
                final Map<String, Object> processingAttributes = Maps.newHashMap();
                CommonAttributesDescriptor.attributeBuilder(processingAttributes)
                    .query(query);

                /*
                 * We need to refer to the Lucene component by its identifier we set
                 * during initialization. As we've not assigned any identifier to the
                 * LingoClusteringAlgorithm we want to use, we can use its fully
                 * qualified class name.
                 */
                final ProcessingResult result = controller.process(
                    processingAttributes, "lucene",
                    LingoClusteringAlgorithm.class.getName());

                ConsoleFormatter.displayResults(result);
            }
            finally
            {
                // Release the initialized controller before the index directory
                // it was configured with is closed.
                controller.dispose();
            }
        }
    }

    /**
     * A wrapper class producing {@link StandardAnalyzer} instances.
     */
    public static final class StandardAnalyzerFactory implements IObjectFactory<Analyzer>
    {
        /**
         * @return a new {@link StandardAnalyzer} on every call
         */
        @Override
        public Analyzer create()
        {
            return new StandardAnalyzer();
        }
    }

    /**
     * Our custom Lucene -> Carrot2 content mapper. See the {@link SimpleFieldMapper}
     * source code for the default implementation.
     */
    @ThreadSafe
    public static final class CustomFieldMapper implements IFieldMapper
    {
        /**
         * Copies content from the provided Lucene document to the provided Carrot2
         * document. The query and analyzer are not used by this mapper.
         *
         * @param luceneQuery not used by this implementation
         * @param analyzer not used by this implementation
         * @param luceneDoc the Lucene document to read field values from
         * @param carrot2Doc the Carrot2 document to populate
         */
        @Override
        public void map(Query luceneQuery, Analyzer analyzer, Document luceneDoc,
            org.carrot2.core.Document carrot2Doc)
        {
            /*
             * Here we need to transfer the desired content from the provided Lucene
             * document to the provided Carrot2 document.
             */
            carrot2Doc.setContentUrl(luceneDoc.get("url"));
            carrot2Doc.setTitle(luceneDoc.get("title"));
            carrot2Doc.setSummary(luceneDoc.get("snippet"));

            // The Lucene "rating" field is exposed as a custom Carrot2 field
            // named "category".
            carrot2Doc.setField("category", luceneDoc.get("rating"));
        }

        /**
         * @return the names of Lucene fields that should be searched
         */
        @Override
        public String [] getSearchFields()
        {
            /*
             * Here we need to return the names of Lucene fields that should be searched.
             * Note that these fields don't necessarily have to be the same as the fields
             * used in the map() method.
             */
            return new String []
            {
                "fullContent"
            };
        }
    }
}