/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering.carrot2;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.commons.lang.ObjectUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.search.Query;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.clustering.ClusteringEngine;
import org.apache.solr.handler.clustering.SearchClusteringEngine;
import org.apache.solr.handler.component.HighlightComponent;
import org.apache.solr.highlight.SolrHighlighter;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocSlice;
import org.apache.solr.search.SolrIndexSearcher;

import org.carrot2.core.Cluster;
import org.carrot2.core.Controller;
import org.carrot2.core.ControllerFactory;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.shaded.guava.common.base.MoreObjects;
import org.carrot2.shaded.guava.common.base.Strings;
import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor.AttributeBuilder;
import org.carrot2.util.attribute.AttributeValueSet;
import org.carrot2.util.attribute.AttributeValueSets;
import org.carrot2.util.resource.ClassLoaderLocator;
import org.carrot2.util.resource.IResource;
import org.carrot2.util.resource.ResourceLookup;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Search results clustering engine based on Carrot2 clustering algorithms.
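 *
 * <p>A minimal engine declaration in solrconfig.xml might look roughly like the
 * sketch below. The engine name and algorithm class are illustrative examples
 * (not defaults), and the parameter keys correspond to the constants defined in
 * {@link CarrotParams}; consult the Solr Reference Guide for the authoritative
 * syntax of your Solr version.</p>
 *
 * <pre>{@code
 * <searchComponent name="clustering"
 *                  class="org.apache.solr.handler.clustering.ClusteringComponent">
 *   <lst name="engine">
 *     <str name="name">lingo</str>
 *     <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
 *   </lst>
 * </searchComponent>
 * }</pre>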
 *
 * @see "http://project.carrot2.org"
 * @lucene.experimental
 */
public class CarrotClusteringEngine extends SearchClusteringEngine {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /**
   * The subdirectory in Solr config dir to read customized Carrot2 resources from.
   */
  static final String CARROT_RESOURCES_PREFIX = "clustering/carrot2";

  /**
   * Name of Carrot2 document's field containing Solr document's identifier.
   */
  private static final String SOLR_DOCUMENT_ID = "solrId";

  /**
   * Name of Solr document's field containing the document's identifier. To avoid
   * repeating the content of documents in clusters on output, each cluster contains
   * identifiers of documents it contains.
   */
  private String idFieldName;

  /**
   * Carrot2 controller that manages instances of clustering algorithms
   */
  private Controller controller = ControllerFactory.createPooling();

  /**
   * {@link IClusteringAlgorithm} class used for actual clustering.
   */
  private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;

  /** Solr core we're bound to. */
  private SolrCore core;

  @Override
  public boolean isAvailable() {
    return clusteringAlgorithmClass != null;
  }

  @Override
  @SuppressWarnings("rawtypes")
  public String init(NamedList config, final SolrCore core) {
    this.core = core;
    String result = super.init(config, core);
    final SolrParams initParams = SolrParams.toSolrParams(config);

    // Initialization attributes for Carrot2 controller.
    HashMap<String, Object> initAttributes = new HashMap<>();

    // Customize Carrot2's resource lookup to first look for resources
    // using Solr's resource loader. If that fails, try loading from the classpath.
    ResourceLookup resourceLookup = new ResourceLookup(
        // Solr-specific resource loading.
        new SolrResourceLocator(core, initParams),
        // Using the class loader directly because this time we want to omit the prefix
        new ClassLoaderLocator(core.getResourceLoader().getClassLoader()));

    DefaultLexicalDataFactoryDescriptor.attributeBuilder(initAttributes)
        .resourceLookup(resourceLookup);

    // Make sure the requested Carrot2 clustering algorithm class is available
    String carrotAlgorithmClassName = initParams.get(CarrotParams.ALGORITHM);
    try {
      this.clusteringAlgorithmClass = core.getResourceLoader().findClass(
          carrotAlgorithmClassName, IClusteringAlgorithm.class);
    } catch (SolrException s) {
      if (!(s.getCause() instanceof ClassNotFoundException)) {
        throw s;
      }
    }

    // Load Carrot2-Workbench exported attribute XMLs based on the 'name' attribute
    // of this component. This by-name convention lookup is used to simplify
    // configuring algorithms.
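    // For illustration: an engine declared with <str name="name">lingo</str> (the name
    // "lingo" is only an example) would have its default attribute value set read from
    // a resource named "lingo-attributes.xml", resolved through the resourceLookup
    // configured above (Solr's resource loader first, then the classpath).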
    String componentName = initParams.get(ClusteringEngine.ENGINE_NAME);
    log.info("Initializing Clustering Engine '"
        + MoreObjects.firstNonNull(componentName, "<no 'name' attribute>") + "'");

    if (!Strings.isNullOrEmpty(componentName)) {
      IResource[] attributeXmls = resourceLookup.getAll(componentName + "-attributes.xml");
      if (attributeXmls.length > 0) {
        if (attributeXmls.length > 1) {
          log.warn("More than one attribute file found, first one will be used: "
              + Arrays.toString(attributeXmls));
        }

        Thread ct = Thread.currentThread();
        ClassLoader prev = ct.getContextClassLoader();
        try {
          ct.setContextClassLoader(core.getResourceLoader().getClassLoader());

          AttributeValueSets avs = AttributeValueSets.deserialize(attributeXmls[0].open());
          AttributeValueSet defaultSet = avs.getDefaultAttributeValueSet();
          initAttributes.putAll(defaultSet.getAttributeValues());
        } catch (Exception e) {
          throw new SolrException(ErrorCode.SERVER_ERROR,
              "Could not read attributes XML for clustering component: " + componentName, e);
        } finally {
          ct.setContextClassLoader(prev);
        }
      }
    }

    // Extract solrconfig attributes; they take precedence.
    extractCarrotAttributes(initParams, initAttributes);

    // Customize the stemmer and tokenizer factories. The implementations we provide here
    // are included in the code base of Solr, so that it's possible to refactor
    // the Lucene APIs the factories rely on if needed.
    // Additionally, we set a custom lexical resource factory for Carrot2 that
    // will use both Carrot2 default stop words as well as stop words from
    // the StopFilter defined on the field.
    final AttributeBuilder attributeBuilder =
        BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes);
    attributeBuilder.lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
    if (!initAttributes.containsKey(BasicPreprocessingPipelineDescriptor.Keys.TOKENIZER_FACTORY)) {
      attributeBuilder.tokenizerFactory(LuceneCarrot2TokenizerFactory.class);
    }
    if (!initAttributes.containsKey(BasicPreprocessingPipelineDescriptor.Keys.STEMMER_FACTORY)) {
      attributeBuilder.stemmerFactory(LuceneCarrot2StemmerFactory.class);
    }

    // Pass the schema (via the core) to SolrStopwordsCarrot2LexicalDataFactory.
    initAttributes.put("solrCore", core);

    // Carrot2 uses current thread's context class loader to get
    // certain classes (e.g. custom tokenizer/stemmer) at initialization time.
    // To make sure classes from contrib JARs are available,
    // we swap the context class loader for the time of clustering.
    Thread ct = Thread.currentThread();
    ClassLoader prev = ct.getContextClassLoader();
    try {
      ct.setContextClassLoader(core.getResourceLoader().getClassLoader());
      this.controller.init(initAttributes);
    } finally {
      ct.setContextClassLoader(prev);
    }

    SchemaField uniqueField = core.getLatestSchema().getUniqueKeyField();
    if (uniqueField == null) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
          CarrotClusteringEngine.class.getSimpleName() + " requires the schema to have a uniqueKeyField");
    }
    this.idFieldName = uniqueField.getName();

    return result;
  }

  @Override
  public Object cluster(Query query, SolrDocumentList solrDocList,
      Map<SolrDocument, Integer> docIds, SolrQueryRequest sreq) {
    try {
      // Prepare attributes for Carrot2 clustering call
      Map<String, Object> attributes = new HashMap<>();
      List<Document> documents = getDocuments(solrDocList, docIds, query, sreq);
      attributes.put(AttributeNames.DOCUMENTS, documents);
      attributes.put(AttributeNames.QUERY, query.toString());

      // Pass the fields on which clustering runs.
      attributes.put("solrFieldNames", getFieldsForClustering(sreq));

      // Pass extra overriding attributes from the request, if any
      extractCarrotAttributes(sreq.getParams(), attributes);
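      // Illustration only: extractCarrotAttributes() forwards every request parameter
      // that is not one of the predefined CarrotParams names, so a request parameter
      // such as LingoClusteringAlgorithm.desiredClusterCountBase=20 (attribute key and
      // value shown purely as an example) would reach the algorithm unchanged.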
      // Perform clustering and convert to an output structure of clusters.
      //
      // Carrot2 uses current thread's context class loader to get
      // certain classes (e.g. custom tokenizer/stemmer) at runtime.
      // To make sure classes from contrib JARs are available,
      // we swap the context class loader for the time of clustering.
      Thread ct = Thread.currentThread();
      ClassLoader prev = ct.getContextClassLoader();
      try {
        ct.setContextClassLoader(core.getResourceLoader().getClassLoader());
        return clustersToNamedList(controller.process(attributes,
            clusteringAlgorithmClass).getClusters(), sreq.getParams());
      } finally {
        ct.setContextClassLoader(prev);
      }
    } catch (Exception e) {
      log.error("Carrot2 clustering failed", e);
      throw new SolrException(ErrorCode.SERVER_ERROR, "Carrot2 clustering failed", e);
    }
  }

  @Override
  protected Set<String> getFieldsToLoad(SolrQueryRequest sreq) {
    SolrParams solrParams = sreq.getParams();

    HashSet<String> fields = new HashSet<>(getFieldsForClustering(sreq));
    fields.add(idFieldName);
    fields.add(solrParams.get(CarrotParams.URL_FIELD_NAME, "url"));
    fields.addAll(getCustomFieldsMap(solrParams).keySet());
    String languageField = solrParams.get(CarrotParams.LANGUAGE_FIELD_NAME);
    if (StringUtils.isNotBlank(languageField)) {
      fields.add(languageField);
    }
    return fields;
  }

  /**
   * Returns the names of fields that will be delivering the actual
   * content for clustering. Currently, there are two such fields: document
   * title and document content.
   */
  private Set<String> getFieldsForClustering(SolrQueryRequest sreq) {
    SolrParams solrParams = sreq.getParams();

    String titleFieldSpec = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
    String snippetFieldSpec = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleFieldSpec);
    if (StringUtils.isBlank(snippetFieldSpec)) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
          + " must not be blank.");
    }

    final Set<String> fields = new HashSet<>();
    fields.addAll(Arrays.asList(titleFieldSpec.split("[, ]")));
    fields.addAll(Arrays.asList(snippetFieldSpec.split("[, ]")));
    return fields;
  }

  /**
   * Prepares Carrot2 documents for clustering.
   */
  private List<Document> getDocuments(SolrDocumentList solrDocList, Map<SolrDocument, Integer> docIds,
      Query query, final SolrQueryRequest sreq) throws IOException {
    SolrHighlighter highlighter = null;
    SolrParams solrParams = sreq.getParams();
    SolrCore core = sreq.getCore();

    String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
    String titleFieldSpec = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
    String snippetFieldSpec = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleFieldSpec);
    String languageField = solrParams.get(CarrotParams.LANGUAGE_FIELD_NAME, null);

    // Maps Solr field names to Carrot2 custom field names
    Map<String, String> customFields = getCustomFieldsMap(solrParams);

    // Parse language code map string into a map
    Map<String, String> languageCodeMap = new HashMap<>();
    if (StringUtils.isNotBlank(languageField)) {
      for (String pair : solrParams.get(CarrotParams.LANGUAGE_CODE_MAP, "").split("[, ]")) {
        final String[] split = pair.split(":");
        if (split.length == 2 && StringUtils.isNotBlank(split[0]) && StringUtils.isNotBlank(split[1])) {
          languageCodeMap.put(split[0], split[1]);
        } else {
          log.warn("Unsupported format for " + CarrotParams.LANGUAGE_CODE_MAP
              + ": '" + pair + "'. Skipping this mapping.");
        }
      }
    }
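    // Example (values illustrative): with CarrotParams.LANGUAGE_CODE_MAP set to
    // "english:en polish:pl", a document whose language field contains "english"
    // is mapped to the ISO code "en" before the Carrot2 language lookup below.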
    // Get the documents
    boolean produceSummary = solrParams.getBool(CarrotParams.PRODUCE_SUMMARY, false);

    SolrQueryRequest req = null;
    String[] snippetFieldAry = null;
    if (produceSummary) {
      highlighter = HighlightComponent.getHighlighter(core);
      if (highlighter != null) {
        Map<String, Object> args = new HashMap<>();
        snippetFieldAry = snippetFieldSpec.split("[, ]");
        args.put(HighlightParams.FIELDS, snippetFieldAry);
        args.put(HighlightParams.HIGHLIGHT, "true");
        args.put(HighlightParams.SIMPLE_PRE, ""); // we don't care about actually highlighting the area
        args.put(HighlightParams.SIMPLE_POST, "");
        args.put(HighlightParams.FRAGSIZE, solrParams.getInt(CarrotParams.SUMMARY_FRAGSIZE,
            solrParams.getInt(HighlightParams.FRAGSIZE, 100)));
        args.put(HighlightParams.SNIPPETS, solrParams.getInt(CarrotParams.SUMMARY_SNIPPETS,
            solrParams.getInt(HighlightParams.SNIPPETS, 1)));
        req = new LocalSolrQueryRequest(core, query.toString(), "", 0, 1, args) {
          @Override
          public SolrIndexSearcher getSearcher() {
            return sreq.getSearcher();
          }
        };
      } else {
        log.warn("No highlighter configured, cannot produce summary");
        produceSummary = false;
      }
    }

    Iterator<SolrDocument> docsIter = solrDocList.iterator();
    List<Document> result = new ArrayList<>(solrDocList.size());

    float[] scores = {1.0f};
    int[] docsHolder = new int[1];
    Query theQuery = query;

    while (docsIter.hasNext()) {
      SolrDocument sdoc = docsIter.next();
      String snippet = null;

      // TODO: docIds will be null when running distributed search.
      // See comment in ClusteringComponent#finishStage().
      if (produceSummary && docIds != null) {
        docsHolder[0] = docIds.get(sdoc).intValue();
        DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
        NamedList<Object> highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
        if (highlights != null && highlights.size() == 1) {
          // should only be one value given our setup
          // should only be one document
          @SuppressWarnings("unchecked")
          NamedList<String[]> tmp = (NamedList<String[]>) highlights.getVal(0);

          final StringBuilder sb = new StringBuilder();
          for (int j = 0; j < snippetFieldAry.length; j++) {
            // Join fragments with a period, so that Carrot2 does not create
            // cross-fragment phrases; such phrases rarely make sense.
            String[] highlt = tmp.get(snippetFieldAry[j]);
            if (highlt != null && highlt.length > 0) {
              for (int i = 0; i < highlt.length; i++) {
                sb.append(highlt[i]);
                sb.append(" . ");
              }
            }
          }
          snippet = sb.toString();
        }
      }

      // If summaries are not enabled or summary generation failed, use full content.
      if (snippet == null) {
        snippet = getConcatenated(sdoc, snippetFieldSpec);
      }

      // Create a Carrot2 document
      Document carrotDocument = new Document(getConcatenated(sdoc, titleFieldSpec),
          snippet, ObjectUtils.toString(sdoc.getFieldValue(urlField), ""));

      // Store Solr id of the document; we need it to map document instances
      // found in clusters back to identifiers.
      carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));

      // Set language
      if (StringUtils.isNotBlank(languageField)) {
        Collection<Object> languages = sdoc.getFieldValues(languageField);
        if (languages != null) {
          // Use the first Carrot2-supported language
          for (Object l : languages) {
            String lang = ObjectUtils.toString(l, "");

            if (languageCodeMap.containsKey(lang)) {
              lang = languageCodeMap.get(lang);
            }

            // Language Detection Library for Java uses dashes to separate
            // language variants, such as 'zh-cn', but Carrot2 uses underscores.
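            // For example, a value of 'zh-cn' is rewritten to 'zh_cn' below so that
            // the LanguageCode lookup can recognize it.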
            if (lang.indexOf('-') > 0) {
              lang = lang.replace('-', '_');
            }

            // If the language is supported by Carrot2, we'll get a non-null value
            final LanguageCode carrot2Language = LanguageCode.forISOCode(lang);
            if (carrot2Language != null) {
              carrotDocument.setLanguage(carrot2Language);
              break;
            }
          }
        }
      }

      // Add custom fields
      if (customFields != null) {
        for (Entry<String, String> entry : customFields.entrySet()) {
          carrotDocument.setField(entry.getValue(), sdoc.getFieldValue(entry.getKey()));
        }
      }

      result.add(carrotDocument);
    }

    return result;
  }

  /**
   * Expose clustering algorithm class for tests.
   */
  Class<? extends IClusteringAlgorithm> getClusteringAlgorithmClass() {
    return clusteringAlgorithmClass;
  }

  /**
   * Prepares a map of Solr field names (keys) to the corresponding Carrot2
   * custom field names.
   */
  private Map<String, String> getCustomFieldsMap(SolrParams solrParams) {
    Map<String, String> customFields = new HashMap<>();
    String[] customFieldsSpec = solrParams.getParams(CarrotParams.CUSTOM_FIELD_NAME);
    if (customFieldsSpec != null) {
      customFields = new HashMap<>();
      for (String customFieldSpec : customFieldsSpec) {
        String[] split = customFieldSpec.split(":");
        if (split.length == 2 && StringUtils.isNotBlank(split[0]) && StringUtils.isNotBlank(split[1])) {
          customFields.put(split[0], split[1]);
        } else {
          log.warn("Unsupported format for " + CarrotParams.CUSTOM_FIELD_NAME
              + ": '" + customFieldSpec + "'. Skipping this field definition.");
        }
      }
    }
    return customFields;
  }

  private String getConcatenated(SolrDocument sdoc, String fieldsSpec) {
    StringBuilder result = new StringBuilder();
    for (String field : fieldsSpec.split("[, ]")) {
      Collection<Object> vals = sdoc.getFieldValues(field);
      if (vals == null) continue;
      Iterator<Object> ite = vals.iterator();
      while (ite.hasNext()) {
        // Join multiple values with a period so that Carrot2 does not pick up
        // phrases that cross field value boundaries (in most cases it would
        // create useless phrases).
        result.append(ObjectUtils.toString(ite.next())).append(" . ");
      }
    }
    return result.toString().trim();
  }
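  /*
   * The conversion below produces, for each cluster, a SimpleOrderedMap that,
   * rendered as JSON, looks roughly like the following sketch (labels, score
   * and document identifiers are illustrative):
   *
   *   { "labels": ["Solar Energy"], "score": 0.42, "docs": ["doc-1", "doc-7"],
   *     "clusters": [ ...optional nested sub-clusters... ] }
   */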
  private List<NamedList<Object>> clustersToNamedList(List<Cluster> carrotClusters,
      SolrParams solrParams) {
    List<NamedList<Object>> result = new ArrayList<>();
    clustersToNamedList(carrotClusters, result, solrParams.getBool(
        CarrotParams.OUTPUT_SUB_CLUSTERS, true), solrParams.getInt(
        CarrotParams.NUM_DESCRIPTIONS, Integer.MAX_VALUE));
    return result;
  }

  private void clustersToNamedList(List<Cluster> outputClusters,
      List<NamedList<Object>> parent, boolean outputSubClusters, int maxLabels) {
    for (Cluster outCluster : outputClusters) {
      NamedList<Object> cluster = new SimpleOrderedMap<>();
      parent.add(cluster);

      // Add labels
      List<String> labels = outCluster.getPhrases();
      if (labels.size() > maxLabels) {
        labels = labels.subList(0, maxLabels);
      }
      cluster.add("labels", labels);

      // Add cluster score
      final Double score = outCluster.getScore();
      if (score != null) {
        cluster.add("score", score);
      }

      // Add other topics marker
      if (outCluster.isOtherTopics()) {
        cluster.add("other-topics", outCluster.isOtherTopics());
      }

      // Add documents
      List<Document> docs = outputSubClusters ? outCluster.getDocuments() : outCluster.getAllDocuments();
      List<Object> docList = new ArrayList<>();
      cluster.add("docs", docList);
      for (Document doc : docs) {
        docList.add(doc.getField(SOLR_DOCUMENT_ID));
      }

      // Add subclusters
      if (outputSubClusters && !outCluster.getSubclusters().isEmpty()) {
        List<NamedList<Object>> subclusters = new ArrayList<>();
        cluster.add("clusters", subclusters);
        clustersToNamedList(outCluster.getSubclusters(), subclusters,
            outputSubClusters, maxLabels);
      }
    }
  }

  /**
   * Extracts parameters that can possibly match some attributes of Carrot2 algorithms.
   */
  private void extractCarrotAttributes(SolrParams solrParams, Map<String, Object> attributes) {
    // Extract all non-predefined parameters. This way, we'll be able to set all
    // parameters of Carrot2 algorithms without defining their names as constants.
    for (Iterator<String> paramNames = solrParams.getParameterNamesIterator(); paramNames.hasNext();) {
      String paramName = paramNames.next();
      if (!CarrotParams.CARROT_PARAM_NAMES.contains(paramName)) {
        attributes.put(paramName, solrParams.get(paramName));
      }
    }
  }
}