/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering.carrot2;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.handler.clustering.AbstractClusteringTestCase;
import org.apache.solr.handler.clustering.ClusteringComponent;
import org.apache.solr.handler.clustering.ClusteringEngine;
import org.apache.solr.handler.clustering.SearchClusteringEngine;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.RefCounted;
import org.carrot2.clustering.lingo.LingoClusteringAlgorithm;
import org.carrot2.core.LanguageCode;
import org.carrot2.util.attribute.AttributeUtils;
import org.junit.Test;

/**
 * Tests for {@link CarrotClusteringEngine}.
 *
 * <p>Each test looks up a named engine on the {@code "clustering"} search
 * component (or inspects the engine map of another named component), runs it
 * over documents fetched from the test index, and verifies the shape and
 * content of the returned cluster list.
 */
public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {

  @Test
  public void testCarrotLingo() throws Exception {
    // Note: the expected number of clusters may change after upgrading Carrot2
    // due to e.g. internal improvements or tuning of Carrot2 clustering.
    final int expectedNumClusters = 10;
    checkEngine(getClusteringEngine("default"), expectedNumClusters);
  }

  @Test
  public void testProduceSummary() throws Exception {
    // We'll make two queries, one with- and another one without summary
    // and assert that documents are shorter when highlighter is in use.
    final List<NamedList<Object>> noSummaryClusters = clusterWithHighlighting(false, 80);
    final List<NamedList<Object>> summaryClusters = clusterWithHighlighting(true, 80);

    assertEquals("Equal number of clusters", noSummaryClusters.size(), summaryClusters.size());
    for (int i = 0; i < noSummaryClusters.size(); i++) {
      final int fullLength = getLabels(noSummaryClusters.get(i)).get(1).length();
      final int summaryLength = getLabels(summaryClusters.get(i)).get(1).length();
      assertTrue("Summary shorter than original document", fullLength > summaryLength);
    }
  }

  @Test
  public void testSummaryFragSize() throws Exception {
    // We'll make two queries, one with short summaries and another one with
    // longer summaries and will check that the results differ.
    final List<NamedList<Object>> shortSummaryClusters = clusterWithHighlighting(true, 30);
    final List<NamedList<Object>> longSummaryClusters = clusterWithHighlighting(true, 80);

    assertEquals("Equal number of clusters", shortSummaryClusters.size(), longSummaryClusters.size());
    for (int i = 0; i < shortSummaryClusters.size(); i++) {
      final int shortLength = getLabels(shortSummaryClusters.get(i)).get(1).length();
      final int longLength = getLabels(longSummaryClusters.get(i)).get(1).length();
      assertTrue("Short summary shorter than long summary", shortLength < longLength);
    }
  }

  /**
   * Clusters the documents whose {@code snippet} field matches the term
   * {@code "mine"} with the {@code "echo"} engine, requesting one summary snippet.
   *
   * @param enableHighlighting value passed as {@link CarrotParams#PRODUCE_SUMMARY}
   * @param fragSize value passed as {@link CarrotParams#SUMMARY_FRAGSIZE}
   * @return the clusters returned by the engine
   */
  private List<NamedList<Object>> clusterWithHighlighting(
      boolean enableHighlighting, int fragSize) throws IOException {
    // Some documents don't have mining in the snippet
    return clusterWithHighlighting(enableHighlighting, fragSize, 1, "mine", numberOfDocs - 7);
  }

  /**
   * Clusters the documents whose {@code snippet} field matches {@code term}
   * with the {@code "echo"} engine, using the given summary settings.
   *
   * @param enableHighlighting value passed as {@link CarrotParams#PRODUCE_SUMMARY}
   * @param fragSize value passed as {@link CarrotParams#SUMMARY_FRAGSIZE}
   * @param summarySnippets value passed as {@link CarrotParams#SUMMARY_SNIPPETS}
   * @param term term to look for in the {@code snippet} field
   * @param expectedNumDocuments expected number of matching documents, which is
   *        also used as the expected number of clusters
   * @return the clusters returned by the engine
   */
  private List<NamedList<Object>> clusterWithHighlighting(
      boolean enableHighlighting, int fragSize, int summarySnippets,
      String term, int expectedNumDocuments) throws IOException {
    final TermQuery query = new TermQuery(new Term("snippet", term));

    final ModifiableSolrParams summaryParams = new ModifiableSolrParams();
    summaryParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
    summaryParams.add(CarrotParams.PRODUCE_SUMMARY, Boolean.toString(enableHighlighting));
    summaryParams.add(CarrotParams.SUMMARY_FRAGSIZE, Integer.toString(fragSize));
    summaryParams.add(CarrotParams.SUMMARY_SNIPPETS, Integer.toString(summarySnippets));

    return checkEngine(getClusteringEngine("echo"),
        expectedNumDocuments, expectedNumDocuments, query, summaryParams);
  }

  @Test
  public void testCarrotStc() throws Exception {
    checkEngine(getClusteringEngine("stc"), 3);
  }

  @Test
  public void testWithoutSubclusters() throws Exception {
    checkClusters(
        checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs),
        1, 1, 0);
  }

  @Test
  public void testExternalXmlAttributesFile() throws Exception {
    checkClusters(
        checkEngine(getClusteringEngine("mock-external-attrs"), 13),
        1, 4, 0);
  }

  @Test
  public void testWithSubclusters() throws Exception {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set(CarrotParams.OUTPUT_SUB_CLUSTERS, true);
    // The params must actually reach the engine; previously they were built
    // and then silently dropped.
    checkClusters(
        checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs, params),
        1, 1, 2);
  }

  @Test
  public void testNumDescriptions() throws Exception {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 5);
    params.set(CarrotParams.NUM_DESCRIPTIONS, 3);
    checkClusters(
        checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs, params),
        1, 3, 0);
  }

  @Test
  public void testClusterScores() throws Exception {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
    List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
        AbstractClusteringTestCase.numberOfDocs, params);
    int i = 1;
    for (NamedList<Object> cluster : clusters) {
      final Double score = getScore(cluster);
      assertNotNull(score);
      assertEquals(0.25 * i++, score, 0);
    }
  }

  @Test
  public void testOtherTopics() throws Exception {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "otherTopicsModulo"), 2);
    List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
        AbstractClusteringTestCase.numberOfDocs, params);
    int i = 1;
    for (NamedList<Object> cluster : clusters) {
      // Every second cluster is expected to carry the flag; the others have no entry at all.
      assertEquals(i++ % 2 == 0 ? true : null, isOtherTopics(cluster));
    }
  }

  @Test
  public void testCarrotAttributePassing() throws Exception {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 3);
    checkClusters(
        checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs, params),
        1, 3, 0);
  }

  @Test
  public void testLexicalResourcesFromSolrConfigDefaultDir() throws Exception {
    checkLexicalResourcesFromSolrConfig("lexical-resource-check",
        "online,customsolrstopword,customsolrstoplabel");
  }

  @Test
  public void testLexicalResourcesFromSolrConfigCustomDir() throws Exception {
    checkLexicalResourcesFromSolrConfig("lexical-resource-check-custom-resource-dir",
        "online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir");
  }

  /**
   * Runs the named engine with resource merging disabled and the given
   * {@code wordsToCheck} attribute, expecting exactly one cluster labelled
   * {@code "online"}.
   *
   * @param engineName name of the engine registered on the clustering component
   * @param wordsToCheck comma-separated words passed to
   *        {@link LexicalResourcesCheckClusteringAlgorithm}
   */
  private void checkLexicalResourcesFromSolrConfig(String engineName, String wordsToCheck)
      throws IOException {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set("merge-resources", false);
    params.set(AttributeUtils.getKey(
        LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
        wordsToCheck);

    // "customsolrstopword" is in stopwords.en, "customsolrstoplabel" is in
    // stoplabels.mt, so we're expecting only one cluster with label "online".
    final List<NamedList<Object>> clusters = checkEngine(
        getClusteringEngine(engineName), 1, params);
    assertEquals(Collections.singletonList("online"), getLabels(clusters.get(0)));
  }

  @Test
  public void testSolrStopWordsUsedInCarrot2Clustering() throws Exception {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set("merge-resources", false);
    params.set(AttributeUtils.getKey(
        LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
        "online,solrownstopword");

    // "solrownstopword" is in stopwords.txt, so we're expecting
    // only one cluster with label "online".
    final List<NamedList<Object>> clusters = checkEngine(
        getClusteringEngine("lexical-resource-check"), 1, params);
    assertEquals(Collections.singletonList("online"), getLabels(clusters.get(0)));
  }

  @Test
  public void testSolrStopWordsNotDefinedOnAFieldForClustering() throws Exception {
    ModifiableSolrParams params = new ModifiableSolrParams();
    // Force string fields to be used for clustering. Does not make sense
    // in the real world, but does the job in the test.
    params.set(CarrotParams.TITLE_FIELD_NAME, "url");
    params.set(CarrotParams.SNIPPET_FIELD_NAME, "url");
    params.set("merge-resources", false);
    params.set(AttributeUtils.getKey(
        LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
        "online,solrownstopword");

    final List<NamedList<Object>> clusters = checkEngine(
        getClusteringEngine("lexical-resource-check"), 2, params);
    assertEquals(Collections.singletonList("online"), getLabels(clusters.get(0)));
    assertEquals(Collections.singletonList("solrownstopword"), getLabels(clusters.get(1)));
  }

  @Test
  public void testHighlightingOfMultiValueField() throws Exception {
    final String snippetWithoutSummary = getLabels(clusterWithHighlighting(
        false, 30, 3, "multi", 1).get(0)).get(1);
    assertTrue("Snippet contains first value", snippetWithoutSummary.contains("First"));
    assertTrue("Snippet contains second value", snippetWithoutSummary.contains("Second"));
    assertTrue("Snippet contains third value", snippetWithoutSummary.contains("Third"));

    final String snippetWithSummary = getLabels(clusterWithHighlighting(
        true, 30, 3, "multi", 1).get(0)).get(1);
    assertTrue("Snippet with summary shorter than full snippet",
        snippetWithoutSummary.length() > snippetWithSummary.length());
    assertTrue("Summary covers first value", snippetWithSummary.contains("First"));
    assertTrue("Summary covers second value", snippetWithSummary.contains("Second"));
    assertTrue("Summary covers third value", snippetWithSummary.contains("Third"));
  }

  @Test
  public void testConcatenatingMultipleFields() throws Exception {
    final ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CarrotParams.TITLE_FIELD_NAME, "title,heading");
    params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet,body");

    final List<String> labels = getLabels(checkEngine(
        getClusteringEngine("echo"), 1, 1,
        new TermQuery(new Term("body", "snippet")), params).get(0));
    assertTrue("Title field concatenated", labels.get(0).contains("Title field"));
    assertTrue("Heading field concatenated", labels.get(0).contains("Heading field"));
    assertTrue("Snippet field concatenated", labels.get(1).contains("Snippet field"));
    assertTrue("Body field concatenated", labels.get(1).contains("Body field"));
  }

  @Test
  public void testHighlightingMultipleFields() throws Exception {
    final TermQuery query = new TermQuery(new Term("snippet", "content"));

    final ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CarrotParams.TITLE_FIELD_NAME, "title,heading");
    params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet,body");
    params.add(CarrotParams.PRODUCE_SUMMARY, Boolean.toString(false));

    final String snippetWithoutSummary = getLabels(checkEngine(
        getClusteringEngine("echo"), 1, 1, query, params).get(0)).get(1);
    assertTrue("Snippet covers snippet field", snippetWithoutSummary.contains("snippet field"));
    assertTrue("Snippet covers body field", snippetWithoutSummary.contains("body field"));

    params.set(CarrotParams.PRODUCE_SUMMARY, Boolean.toString(true));
    params.add(CarrotParams.SUMMARY_FRAGSIZE, Integer.toString(30));
    params.add(CarrotParams.SUMMARY_SNIPPETS, Integer.toString(2));

    final String snippetWithSummary = getLabels(checkEngine(
        getClusteringEngine("echo"), 1, 1, query, params).get(0)).get(1);
    assertTrue("Snippet with summary shorter than full snippet",
        snippetWithoutSummary.length() > snippetWithSummary.length());
    assertTrue("Snippet covers snippet field", snippetWithSummary.contains("snippet field"));
    assertTrue("Snippet covers body field", snippetWithSummary.contains("body field"));
  }

  @Test
  public void testOneCarrot2SupportedLanguage() throws Exception {
    final ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");

    final List<String> labels = getLabels(checkEngine(
        getClusteringEngine("echo"), 1, 1,
        new TermQuery(new Term("url", "one_supported_language")), params).get(0));
    assertEquals(3, labels.size());
    assertEquals("Correct Carrot2 language",
        LanguageCode.CHINESE_SIMPLIFIED.name(), labels.get(2));
  }

  @Test
  public void testOneCarrot2SupportedLanguageOfMany() throws Exception {
    final ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");

    final List<String> labels = getLabels(checkEngine(
        getClusteringEngine("echo"), 1, 1,
        new TermQuery(new Term("url", "one_supported_language_of_many")), params).get(0));
    assertEquals(3, labels.size());
    assertEquals("Correct Carrot2 language", LanguageCode.GERMAN.name(), labels.get(2));
  }

  @Test
  public void testLanguageCodeMapping() throws Exception {
    final ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");
    params.add(CarrotParams.LANGUAGE_CODE_MAP, "POLISH:pl");

    final List<String> labels = getLabels(checkEngine(
        getClusteringEngine("echo"), 1, 1,
        new TermQuery(new Term("url", "one_supported_language_of_many")), params).get(0));
    assertEquals(3, labels.size());
    assertEquals("Correct Carrot2 language", LanguageCode.POLISH.name(), labels.get(2));
  }

  @Test
  public void testPassingOfCustomFields() throws Exception {
    final ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CarrotParams.CUSTOM_FIELD_NAME, "intfield_i:intfield");
    params.add(CarrotParams.CUSTOM_FIELD_NAME, "floatfield_f:floatfield");
    params.add(CarrotParams.CUSTOM_FIELD_NAME, "heading:multi");

    // Let the echo mock clustering algorithm know which custom field to echo
    params.add("custom-fields", "intfield,floatfield,multi");

    final List<String> labels = getLabels(checkEngine(
        getClusteringEngine("echo"), 1, 1,
        new TermQuery(new Term("url", "custom_fields")), params).get(0));
    assertEquals(5, labels.size());
    assertEquals("Integer field", "10", labels.get(2));
    assertEquals("Float field", "10.5", labels.get(3));
    assertEquals("List field", "[first, second]", labels.get(4));
  }

  @Test
  public void testCustomTokenizer() throws Exception {
    final ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CarrotParams.TITLE_FIELD_NAME, "title");
    params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");

    final List<String> labels = getLabels(checkEngine(
        getClusteringEngine("custom-duplicating-tokenizer"), 1, 15,
        new TermQuery(new Term("title", "field")), params).get(0));

    // The custom test tokenizer duplicates each token's text
    assertTrue("First token", labels.get(0).contains("TitleTitle"));
  }

  @Test
  public void testCustomStemmer() throws Exception {
    final ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CarrotParams.TITLE_FIELD_NAME, "title");
    params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");

    final List<String> labels = getLabels(checkEngine(
        getClusteringEngine("custom-duplicating-stemmer"), 1, 12,
        new TermQuery(new Term("title", "field")), params).get(0));

    // The custom test stemmer duplicates and lowercases each token's text
    assertTrue("First token", labels.get(0).contains("titletitle"));
  }

  @Test
  public void testDefaultEngineOrder() throws Exception {
    ClusteringComponent comp = (ClusteringComponent)
        h.getCore().getSearchComponent("clustering-name-default");
    Map<String,SearchClusteringEngine> engines = getSearchClusteringEngines(comp);
    assertEquals(
        Arrays.asList("stc", "default", "mock"),
        new ArrayList<>(engines.keySet()));
    assertEquals(
        LingoClusteringAlgorithm.class,
        ((CarrotClusteringEngine) engines.get(ClusteringEngine.DEFAULT_ENGINE_NAME))
            .getClusteringAlgorithmClass());
  }

  @Test
  public void testDeclarationEngineOrder() throws Exception {
    ClusteringComponent comp = (ClusteringComponent)
        h.getCore().getSearchComponent("clustering-name-decl-order");
    Map<String,SearchClusteringEngine> engines = getSearchClusteringEngines(comp);
    assertEquals(
        Arrays.asList("unavailable", "lingo", "stc", "mock", "default"),
        new ArrayList<>(engines.keySet()));
    assertEquals(
        LingoClusteringAlgorithm.class,
        ((CarrotClusteringEngine) engines.get(ClusteringEngine.DEFAULT_ENGINE_NAME))
            .getClusteringAlgorithmClass());
  }

  @Test
  public void testDeclarationNameDuplicates() throws Exception {
    ClusteringComponent comp = (ClusteringComponent)
        h.getCore().getSearchComponent("clustering-name-dups");
    Map<String,SearchClusteringEngine> engines = getSearchClusteringEngines(comp);
    assertEquals(
        Arrays.asList("", "default"),
        new ArrayList<>(engines.keySet()));
    assertEquals(
        MockClusteringAlgorithm.class,
        ((CarrotClusteringEngine) engines.get(ClusteringEngine.DEFAULT_ENGINE_NAME))
            .getClusteringAlgorithmClass());
  }

  /**
   * Looks up an engine by name on the {@code "clustering"} search component,
   * failing the test if either the component or the engine is missing.
   *
   * @param engineName name the engine is registered under
   * @return the engine, never {@code null}
   */
  private CarrotClusteringEngine getClusteringEngine(String engineName) {
    ClusteringComponent comp = (ClusteringComponent) h.getCore()
        .getSearchComponent("clustering");
    assertNotNull("clustering component should not be null", comp);
    CarrotClusteringEngine engine =
        (CarrotClusteringEngine) getSearchClusteringEngines(comp).get(engineName);
    assertNotNull("clustering engine for name: " + engineName
        + " should not be null", engine);
    return engine;
  }

  /**
   * Clusters all documents (match-all query, no extra parameters) and checks
   * the number of clusters produced.
   */
  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
      int expectedNumClusters) throws IOException {
    return checkEngine(engine, numberOfDocs, expectedNumClusters,
        new MatchAllDocsQuery(), new ModifiableSolrParams());
  }

  /**
   * Clusters all documents (match-all query) with the given clustering
   * parameters and checks the number of clusters produced.
   */
  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
      int expectedNumClusters, SolrParams clusteringParams) throws IOException {
    return checkEngine(engine, numberOfDocs, expectedNumClusters,
        new MatchAllDocsQuery(), clusteringParams);
  }

  /**
   * Fetches the documents matching {@code query}, clusters them with
   * {@code engine} and performs basic sanity checks on the result.
   *
   * @param engine engine under test
   * @param expectedNumDocs expected number of documents matching {@code query}
   * @param expectedNumClusters expected number of top-level clusters
   * @param query query selecting the documents to cluster
   * @param clusteringParams request parameters handed to the engine
   * @return the clusters returned by the engine
   * @throws IOException if the searcher or the engine fails
   */
  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
      int expectedNumDocs, int expectedNumClusters, Query query,
      SolrParams clusteringParams) throws IOException {
    // Get all documents to cluster
    RefCounted<SolrIndexSearcher> ref = h.getCore().getSearcher();
    try {
      SolrIndexSearcher searcher = ref.get();
      DocList docList = searcher.getDocList(query, (Query) null, new Sort(), 0, numberOfDocs);
      assertEquals("docList size", expectedNumDocs, docList.matches());

      ModifiableSolrParams solrParams = new ModifiableSolrParams();
      solrParams.add(clusteringParams);

      // Perform clustering
      LocalSolrQueryRequest req = new LocalSolrQueryRequest(h.getCore(), solrParams);
      try {
        Map<SolrDocument,Integer> docIds = new HashMap<>(docList.size());
        SolrDocumentList solrDocList = ClusteringComponent.docListToSolrDocumentList(
            docList, searcher, engine.getFieldsToLoad(req), docIds);

        @SuppressWarnings("unchecked")
        List<NamedList<Object>> results =
            (List<NamedList<Object>>) engine.cluster(query, solrDocList, docIds, req);

        assertEquals("number of clusters: " + results, expectedNumClusters, results.size());
        checkClusters(results, false);
        return results;
      } finally {
        // Close the request even when clustering or an assertion fails.
        req.close();
      }
    } finally {
      ref.decref();
    }
  }

  /**
   * Verifies that every cluster has the expected number of documents, labels
   * and (when {@code expectedSubclusterCount > 0}) subclusters.
   */
  private void checkClusters(List<NamedList<Object>> results, int expectedDocCount,
      int expectedLabelCount, int expectedSubclusterCount) {
    for (NamedList<Object> cluster : results) {
      checkCluster(cluster, expectedDocCount, expectedLabelCount, expectedSubclusterCount);
    }
  }

  /** Runs the structural {@link #checkCluster(NamedList, boolean)} check on every cluster. */
  private void checkClusters(List<NamedList<Object>> results, boolean hasSubclusters) {
    for (NamedList<Object> cluster : results) {
      checkCluster(cluster, hasSubclusters);
    }
  }

  /**
   * Verifies that a cluster has a non-null document list without null ids,
   * a non-null label list, and, if requested, a non-null subcluster list.
   */
  private void checkCluster(NamedList<Object> cluster, boolean hasSubclusters) {
    List<Object> docs = getDocs(cluster);
    assertNotNull("docs is null and it shouldn't be", docs);
    for (Object id : docs) {
      assertNotNull("id is null and it shouldn't be", id);
    }

    List<String> labels = getLabels(cluster);
    assertNotNull("labels is null but it shouldn't be", labels);

    if (hasSubclusters) {
      List<NamedList<Object>> subclusters = getSubclusters(cluster);
      assertNotNull("subclusters is null but it shouldn't be", subclusters);
    }
  }

  /**
   * Verifies the structure of a cluster and the exact number of documents and
   * labels it holds. The subcluster count is only checked when
   * {@code expectedSubclusterCount} is positive.
   */
  private void checkCluster(NamedList<Object> cluster, int expectedDocCount,
      int expectedLabelCount, int expectedSubclusterCount) {
    checkCluster(cluster, expectedSubclusterCount > 0);
    assertEquals("number of docs in cluster", expectedDocCount,
        getDocs(cluster).size());
    assertEquals("number of labels in cluster", expectedLabelCount,
        getLabels(cluster).size());

    if (expectedSubclusterCount > 0) {
      List<NamedList<Object>> subclusters = getSubclusters(cluster);
      assertEquals("number of subclusters in cluster",
          expectedSubclusterCount, subclusters.size());
    }
  }

  /** Returns the value stored under {@code "clusters"}, or {@code null} if absent. */
  @SuppressWarnings("unchecked")
  private List<NamedList<Object>> getSubclusters(NamedList<Object> cluster) {
    return (List<NamedList<Object>>) cluster.get("clusters");
  }

  /** Returns the value stored under {@code "labels"}, or {@code null} if absent. */
  @SuppressWarnings("unchecked")
  private List<String> getLabels(NamedList<Object> cluster) {
    return (List<String>) cluster.get("labels");
  }

  /** Returns the value stored under {@code "score"}, or {@code null} if absent. */
  private Double getScore(NamedList<Object> cluster) {
    return (Double) cluster.get("score");
  }

  /** Returns the value stored under {@code "other-topics"}, or {@code null} if absent. */
  private Boolean isOtherTopics(NamedList<Object> cluster) {
    return (Boolean) cluster.get("other-topics");
  }

  /** Returns the value stored under {@code "docs"}, or {@code null} if absent. */
  @SuppressWarnings("unchecked")
  private List<Object> getDocs(NamedList<Object> cluster) {
    return (List<Object>) cluster.get("docs");
  }
}