/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexableField;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.handler.clustering.carrot2.CarrotClusteringEngine;
import org.apache.solr.handler.component.ResponseBuilder;
import org.apache.solr.handler.component.SearchComponent;
import org.apache.solr.handler.component.ShardRequest;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocListAndSet;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.plugin.SolrCoreAware;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Provides a plugin for performing cluster analysis. This can either be applied to
* search results (e.g., via <a href="http://project.carrot2.org">Carrot<sup>2</sup></a>) or for
* clustering documents (e.g., via <a href="http://mahout.apache.org/">Mahout</a>).
* <p>
* See Solr example for configuration examples.</p>
*
* @lucene.experimental
*/
public class ClusteringComponent extends SearchComponent implements SolrCoreAware {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
/**
* Base name for all component parameters. This name is also used to
* register this component with SearchHandler.
*/
public static final String COMPONENT_NAME = "clustering";
/**
* Declaration-order list of search clustering engines.
*/
private final LinkedHashMap<String, SearchClusteringEngine> searchClusteringEngines = new LinkedHashMap<>();
/**
* Declaration order list of document clustering engines.
*/
private final LinkedHashMap<String, DocumentClusteringEngine> documentClusteringEngines = new LinkedHashMap<>();
/**
* An unmodifiable view of {@link #searchClusteringEngines}.
*/
private final Map<String, SearchClusteringEngine> searchClusteringEnginesView = Collections.unmodifiableMap(searchClusteringEngines);
/**
* Initialization parameters temporarily saved here, the component
* is initialized in {@link #inform(SolrCore)} because we need to know
* the core's {@link SolrResourceLoader}.
*
* @see #init(NamedList)
*/
private NamedList<Object> initParams;
/**
* Convert a DocList to a SolrDocumentList
*
* The optional param "ids" is populated with the lucene document id
* for each SolrDocument.
*
* @param docs The {@link org.apache.solr.search.DocList} to convert
* @param searcher The {@link org.apache.solr.search.SolrIndexSearcher} to use to load the docs from the Lucene index
* @param fields The names of the Fields to load
* @param ids A map to store the ids of the docs
* @return The new {@link SolrDocumentList} containing all the loaded docs
* @throws IOException if there was a problem loading the docs
* @since solr 1.4
*/
public static SolrDocumentList docListToSolrDocumentList(
DocList docs,
SolrIndexSearcher searcher,
Set<String> fields,
Map<SolrDocument, Integer> ids ) throws IOException
{
IndexSchema schema = searcher.getSchema();
SolrDocumentList list = new SolrDocumentList();
list.setNumFound(docs.matches());
list.setMaxScore(docs.maxScore());
list.setStart(docs.offset());
DocIterator dit = docs.iterator();
while (dit.hasNext()) {
int docid = dit.nextDoc();
Document luceneDoc = searcher.doc(docid, fields);
SolrDocument doc = new SolrDocument();
for( IndexableField field : luceneDoc) {
if (null == fields || fields.contains(field.name())) {
SchemaField sf = schema.getField( field.name() );
doc.addField( field.name(), sf.getType().toObject( field ) );
}
}
if (docs.hasScores() && (null == fields || fields.contains("score"))) {
doc.addField("score", dit.score());
}
list.add( doc );
if( ids != null ) {
ids.put( doc, new Integer(docid) );
}
}
return list;
}
@Override
@SuppressWarnings({"rawtypes", "unchecked"})
public void init(NamedList args) {
this.initParams = args;
super.init(args);
}
@SuppressWarnings("unchecked")
@Override
public void inform(SolrCore core) {
if (initParams != null) {
log.info("Initializing Clustering Engines");
// Our target list of engines, split into search-results and document clustering.
SolrResourceLoader loader = core.getResourceLoader();
for (Map.Entry<String,Object> entry : initParams) {
if ("engine".equals(entry.getKey())) {
NamedList<Object> engineInitParams = (NamedList<Object>) entry.getValue();
Boolean optional = engineInitParams.getBooleanArg("optional");
optional = (optional == null ? Boolean.FALSE : optional);
String engineClassName = StringUtils.defaultIfBlank(
(String) engineInitParams.get("classname"),
CarrotClusteringEngine.class.getName());
// Instantiate the clustering engine and split to appropriate map.
final ClusteringEngine engine = loader.newInstance(engineClassName, ClusteringEngine.class);
final String name = StringUtils.defaultIfBlank(engine.init(engineInitParams, core), "");
if (!engine.isAvailable()) {
if (optional) {
log.info("Optional clustering engine not available: " + name);
} else {
throw new SolrException(ErrorCode.SERVER_ERROR,
"A required clustering engine failed to initialize, check the logs: " + name);
}
}
final ClusteringEngine previousEntry;
if (engine instanceof SearchClusteringEngine) {
previousEntry = searchClusteringEngines.put(name, (SearchClusteringEngine) engine);
} else if (engine instanceof DocumentClusteringEngine) {
previousEntry = documentClusteringEngines.put(name, (DocumentClusteringEngine) engine);
} else {
log.warn("Unknown type of a clustering engine for class: " + engineClassName);
continue;
}
if (previousEntry != null) {
log.warn("Duplicate clustering engine component named '" + name + "'.");
}
}
}
// Set up the default engine key for both types of engines.
setupDefaultEngine("search results clustering", searchClusteringEngines);
setupDefaultEngine("document clustering", documentClusteringEngines);
log.info("Finished Initializing Clustering Engines");
}
}
@Override
public void prepare(ResponseBuilder rb) throws IOException {
SolrParams params = rb.req.getParams();
if (!params.getBool(COMPONENT_NAME, false)) {
return;
}
}
@Override
public void process(ResponseBuilder rb) throws IOException {
SolrParams params = rb.req.getParams();
if (!params.getBool(COMPONENT_NAME, false)) {
return;
}
final String name = getClusteringEngineName(rb);
boolean useResults = params.getBool(ClusteringParams.USE_SEARCH_RESULTS, false);
if (useResults == true) {
SearchClusteringEngine engine = searchClusteringEngines.get(name);
if (engine != null) {
checkAvailable(name, engine);
DocListAndSet results = rb.getResults();
Map<SolrDocument,Integer> docIds = new HashMap<>(results.docList.size());
SolrDocumentList solrDocList = docListToSolrDocumentList(
results.docList, rb.req.getSearcher(), engine.getFieldsToLoad(rb.req), docIds);
Object clusters = engine.cluster(rb.getQuery(), solrDocList, docIds, rb.req);
rb.rsp.add("clusters", clusters);
} else {
log.warn("No engine named: " + name);
}
}
boolean useCollection = params.getBool(ClusteringParams.USE_COLLECTION, false);
if (useCollection == true) {
DocumentClusteringEngine engine = documentClusteringEngines.get(name);
if (engine != null) {
checkAvailable(name, engine);
boolean useDocSet = params.getBool(ClusteringParams.USE_DOC_SET, false);
NamedList<?> nl = null;
// TODO: This likely needs to be made into a background task that runs in an executor
if (useDocSet == true) {
nl = engine.cluster(rb.getResults().docSet, params);
} else {
nl = engine.cluster(params);
}
rb.rsp.add("clusters", nl);
} else {
log.warn("No engine named: " + name);
}
}
}
private void checkAvailable(String name, ClusteringEngine engine) {
if (!engine.isAvailable()) {
throw new SolrException(ErrorCode.SERVER_ERROR,
"Clustering engine declared, but not available, check the logs: " + name);
}
}
private String getClusteringEngineName(ResponseBuilder rb){
return rb.req.getParams().get(ClusteringParams.ENGINE_NAME, ClusteringEngine.DEFAULT_ENGINE_NAME);
}
@Override
public void modifyRequest(ResponseBuilder rb, SearchComponent who, ShardRequest sreq) {
SolrParams params = rb.req.getParams();
if (!params.getBool(COMPONENT_NAME, false) || !params.getBool(ClusteringParams.USE_SEARCH_RESULTS, false)) {
return;
}
sreq.params.remove(COMPONENT_NAME);
if( ( sreq.purpose & ShardRequest.PURPOSE_GET_FIELDS ) != 0 ){
String fl = sreq.params.get(CommonParams.FL,"*");
// if fl=* then we don't need to check.
if (fl.indexOf('*') >= 0) {
return;
}
String name = getClusteringEngineName(rb);
SearchClusteringEngine engine = searchClusteringEngines.get(name);
if (engine != null) {
checkAvailable(name, engine);
Set<String> fields = engine.getFieldsToLoad(rb.req);
if (fields == null || fields.size() == 0) {
return;
}
StringBuilder sb = new StringBuilder();
String[] flparams = fl.split( "[,\\s]+" );
Set<String> flParamSet = new HashSet<>(flparams.length);
for (String flparam : flparams) {
// no need trim() because of split() by \s+
flParamSet.add(flparam);
}
for (String aFieldToLoad : fields) {
if (!flParamSet.contains(aFieldToLoad )) {
sb.append(',').append(aFieldToLoad);
}
}
if (sb.length() > 0) {
sreq.params.set(CommonParams.FL, fl + sb.toString());
}
} else {
log.warn("No engine named: " + name);
}
}
}
@Override
public void finishStage(ResponseBuilder rb) {
SolrParams params = rb.req.getParams();
if (!params.getBool(COMPONENT_NAME, false) ||
!params.getBool(ClusteringParams.USE_SEARCH_RESULTS, false)) {
return;
}
if (rb.stage == ResponseBuilder.STAGE_GET_FIELDS) {
String name = getClusteringEngineName(rb);
SearchClusteringEngine engine = searchClusteringEngines.get(name);
if (engine != null) {
checkAvailable(name, engine);
SolrDocumentList solrDocList = (SolrDocumentList) rb.rsp.getResponse();
// TODO: Currently, docIds is set to null in distributed environment.
// This causes CarrotParams.PRODUCE_SUMMARY doesn't work.
// To work CarrotParams.PRODUCE_SUMMARY under distributed mode, we can choose either one of:
// (a) In each shard, ClusteringComponent produces summary and finishStage()
// merges these summaries.
// (b) Adding doHighlighting(SolrDocumentList, ...) method to SolrHighlighter and
// making SolrHighlighter uses "external text" rather than stored values to produce snippets.
Map<SolrDocument,Integer> docIds = null;
Object clusters = engine.cluster(rb.getQuery(), solrDocList, docIds, rb.req);
rb.rsp.add("clusters", clusters);
} else {
log.warn("No engine named: " + name);
}
}
}
/**
* @return Expose for tests.
*/
Map<String, SearchClusteringEngine> getSearchClusteringEngines() {
return searchClusteringEnginesView;
}
@Override
public String getDescription() {
return "A Clustering component";
}
/**
* Setup the default clustering engine.
* @see "https://issues.apache.org/jira/browse/SOLR-5219"
*/
private static <T extends ClusteringEngine> void setupDefaultEngine(String type, LinkedHashMap<String,T> map) {
// If there's already a default algorithm, leave it as is.
String engineName = ClusteringEngine.DEFAULT_ENGINE_NAME;
T defaultEngine = map.get(engineName);
if (defaultEngine == null ||
!defaultEngine.isAvailable()) {
// If there's no default algorithm, and there are any algorithms available,
// the first definition becomes the default algorithm.
for (Map.Entry<String, T> e : map.entrySet()) {
if (e.getValue().isAvailable()) {
engineName = e.getKey();
defaultEngine = e.getValue();
map.put(ClusteringEngine.DEFAULT_ENGINE_NAME, defaultEngine);
break;
}
}
}
if (defaultEngine != null) {
log.info("Default engine for " + type + ": " + engineName + " [" + defaultEngine.getClass().getSimpleName() + "]");
} else {
log.warn("No default engine for " + type + ".");
}
}
}