package org.apache.solr.handler.clustering; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.util.NamedList; import org.apache.solr.core.SolrCore; import org.apache.solr.core.SolrResourceLoader; import org.apache.solr.handler.clustering.carrot2.CarrotClusteringEngine; import org.apache.solr.handler.component.ResponseBuilder; import org.apache.solr.handler.component.SearchComponent; import org.apache.solr.handler.component.ShardRequest; import org.apache.solr.search.DocListAndSet; import org.apache.solr.util.plugin.SolrCoreAware; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; /** * Provide a plugin for clustering results. Can either be for search results (i.e. via Carrot2) or for * clustering documents (i.e. via Mahout) * <p/> * This engine is experimental. Output from this engine is subject to change in future releases. * * <pre class="prettyprint" > * <searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="clustering"> * <lst name="engine"> * <str name="name">default</str> * <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str> * </lst> * </searchComponent></pre> */ public class ClusteringComponent extends SearchComponent implements SolrCoreAware { private transient static Logger log = LoggerFactory.getLogger(ClusteringComponent.class); private Map<String, SearchClusteringEngine> searchClusteringEngines = new HashMap<String, SearchClusteringEngine>(); private Map<String, DocumentClusteringEngine> documentClusteringEngines = new HashMap<String, DocumentClusteringEngine>(); /** * Base name for all spell checker query parameters. This name is also used to * register this component with SearchHandler. */ public static final String COMPONENT_NAME = "clustering"; private NamedList initParams; @Override public void prepare(ResponseBuilder rb) throws IOException { SolrParams params = rb.req.getParams(); if (!params.getBool(COMPONENT_NAME, false)) { return; } } @Override public void process(ResponseBuilder rb) throws IOException { SolrParams params = rb.req.getParams(); if (!params.getBool(COMPONENT_NAME, false)) { return; } String name = getClusteringEngineName(rb); boolean useResults = params.getBool(ClusteringParams.USE_SEARCH_RESULTS, false); if (useResults == true) { SearchClusteringEngine engine = getSearchClusteringEngine(rb); if (engine != null) { DocListAndSet results = rb.getResults(); Map<SolrDocument,Integer> docIds = new HashMap<SolrDocument, Integer>(results.docList.size()); SolrDocumentList solrDocList = engine.getSolrDocumentList(results.docList, rb.req, docIds); Object clusters = engine.cluster(rb.getQuery(), solrDocList, docIds, rb.req); rb.rsp.add("clusters", clusters); } else { log.warn("No engine for: " + name); } } boolean useCollection = params.getBool(ClusteringParams.USE_COLLECTION, false); if (useCollection == true) { DocumentClusteringEngine engine = documentClusteringEngines.get(name); if (engine != null) { boolean useDocSet = params.getBool(ClusteringParams.USE_DOC_SET, false); NamedList nl = null; //TODO: This likely needs to be made into a background task that runs in an executor if (useDocSet == true) { nl = engine.cluster(rb.getResults().docSet, params); } else { nl = engine.cluster(params); } rb.rsp.add("clusters", nl); } else { log.warn("No engine for " + name); } } } private SearchClusteringEngine getSearchClusteringEngine(ResponseBuilder rb){ return searchClusteringEngines.get(getClusteringEngineName(rb)); } private String getClusteringEngineName(ResponseBuilder rb){ return rb.req.getParams().get(ClusteringParams.ENGINE_NAME, ClusteringEngine.DEFAULT_ENGINE_NAME); } @Override public void modifyRequest(ResponseBuilder rb, SearchComponent who, ShardRequest sreq) { SolrParams params = rb.req.getParams(); if (!params.getBool(COMPONENT_NAME, false) || !params.getBool(ClusteringParams.USE_SEARCH_RESULTS, false)) { return; } sreq.params.remove(COMPONENT_NAME); if( ( sreq.purpose & ShardRequest.PURPOSE_GET_FIELDS ) != 0 ){ String fl = sreq.params.get(CommonParams.FL,"*"); // if fl=* then we don't need check if( fl.indexOf( '*' ) >= 0 ) return; Set<String> fields = getSearchClusteringEngine(rb).getFieldsToLoad(rb.req); if( fields == null || fields.size() == 0 ) return; StringBuilder sb = new StringBuilder(); String[] flparams = fl.split( "[,\\s]+" ); Set<String> flParamSet = new HashSet<String>(flparams.length); for( String flparam : flparams ){ // no need trim() because of split() by \s+ flParamSet.add(flparam); } for( String aFieldToLoad : fields ){ if( !flParamSet.contains( aFieldToLoad ) ){ sb.append( ',' ).append( aFieldToLoad ); } } if( sb.length() > 0 ){ sreq.params.set( CommonParams.FL, fl + sb.toString() ); } } } @Override public void finishStage(ResponseBuilder rb) { SolrParams params = rb.req.getParams(); if (!params.getBool(COMPONENT_NAME, false) || !params.getBool(ClusteringParams.USE_SEARCH_RESULTS, false)) { return; } if (rb.stage == ResponseBuilder.STAGE_GET_FIELDS) { SearchClusteringEngine engine = getSearchClusteringEngine(rb); if (engine != null) { SolrDocumentList solrDocList = (SolrDocumentList)rb.rsp.getValues().get("response"); // TODO: Currently, docIds is set to null in distributed environment. // This causes CarrotParams.PRODUCE_SUMMARY doesn't work. // To work CarrotParams.PRODUCE_SUMMARY under distributed mode, we can choose either one of: // (a) In each shard, ClusteringComponent produces summary and finishStage() // merges these summaries. // (b) Adding doHighlighting(SolrDocumentList, ...) method to SolrHighlighter and // making SolrHighlighter uses "external text" rather than stored values to produce snippets. Map<SolrDocument,Integer> docIds = null; Object clusters = engine.cluster(rb.getQuery(), solrDocList, docIds, rb.req); rb.rsp.add("clusters", clusters); } else { String name = getClusteringEngineName(rb); log.warn("No engine for: " + name); } } } @Override @SuppressWarnings("unchecked") public void init(NamedList args) { super.init(args); this.initParams = args; } public void inform(SolrCore core) { if (initParams != null) { log.info("Initializing Clustering Engines"); boolean searchHasDefault = false; boolean documentHasDefault = false; for (int i = 0; i < initParams.size(); i++) { if (initParams.getName(i).equals("engine")) { NamedList engineNL = (NamedList) initParams.getVal(i); String className = (String) engineNL.get("classname"); if (className == null) { className = CarrotClusteringEngine.class.getName(); } SolrResourceLoader loader = core.getResourceLoader(); ClusteringEngine clusterer = loader.newInstance(className, ClusteringEngine.class); if (clusterer != null) { String name = clusterer.init(engineNL, core); if (name != null) { boolean isDefault = name.equals(ClusteringEngine.DEFAULT_ENGINE_NAME); if (clusterer instanceof SearchClusteringEngine) { if (isDefault == true && searchHasDefault == false) { searchHasDefault = true; } else if (isDefault == true && searchHasDefault == true) { throw new RuntimeException("More than one engine is missing name: " + engineNL); } searchClusteringEngines.put(name, (SearchClusteringEngine) clusterer); } else if (clusterer instanceof DocumentClusteringEngine) { if (isDefault == true && documentHasDefault == false) { searchHasDefault = true; } else if (isDefault == true && documentHasDefault == true) { throw new RuntimeException("More than one engine is missing name: " + engineNL); } documentClusteringEngines.put(name, (DocumentClusteringEngine) clusterer); } } else { if (clusterer instanceof SearchClusteringEngine && searchHasDefault == false) { searchClusteringEngines.put(ClusteringEngine.DEFAULT_ENGINE_NAME, (SearchClusteringEngine) clusterer); searchHasDefault = true; } else if (clusterer instanceof DocumentClusteringEngine && documentHasDefault == false) { documentClusteringEngines.put(ClusteringEngine.DEFAULT_ENGINE_NAME, (DocumentClusteringEngine) clusterer); documentHasDefault = true; } else { throw new RuntimeException("More than one engine is missing name: " + engineNL); } } } } } log.info("Finished Initializing Clustering Engines"); } } /* * @return Unmodifiable Map of the engines, key is the name from the config, value is the engine * */ public Map<String, SearchClusteringEngine> getSearchClusteringEngines() { return Collections.unmodifiableMap(searchClusteringEngines); } // /////////////////////////////////////////// // / SolrInfoMBean // ////////////////////////////////////////// @Override public String getDescription() { return "A Clustering component"; } @Override public String getSource() { return "$URL: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene_solr_4_0/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/ClusteringComponent.java $"; } }