package org.carrot2.elasticsearch;

import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.assertj.core.api.Assertions;
import org.carrot2.clustering.lingo.LingoClusteringAlgorithmDescriptor;
import org.carrot2.clustering.stc.STCClusteringAlgorithmDescriptor;
import org.carrot2.core.LanguageCode;
import org.carrot2.elasticsearch.ClusteringAction.ClusteringActionRequestBuilder;
import org.carrot2.elasticsearch.ClusteringAction.ClusteringActionResponse;
import org.carrot2.elasticsearch.ListAlgorithmsAction.ListAlgorithmsActionRequestBuilder;
import org.carrot2.elasticsearch.ListAlgorithmsAction.ListAlgorithmsActionResponse;
import org.carrot2.text.clustering.MultilingualClustering.LanguageAggregationStrategy;
import org.carrot2.text.clustering.MultilingualClusteringDescriptor;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.search.SearchRequestBuilder;
import org.elasticsearch.common.xcontent.ToXContent;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.json.JSONObject;

/**
 * API tests for {@link ClusteringAction}.
 */
public class ClusteringActionIT extends SampleIndexTestCase {
    /**
     * Clusters a term query whose title comes from the document source and whose
     * content comes from highlighted fragments.
     */
    public void testComplexQuery() throws IOException {
        ClusteringActionResponse result = new ClusteringActionRequestBuilder(client)
            .setQueryHint("data mining")
            .addSourceFieldMapping("title", LogicalField.TITLE)
            .addHighlightedFieldMapping("content", LogicalField.CONTENT)
            .setSearchRequest(
              client.prepareSearch()
                    .setIndices(INDEX_NAME)
                    .setTypes("test")
                    .setSize(100)
                    .setQuery(QueryBuilders.termQuery("content", "data"))
                    .setFetchSource(new String[] {"title"}, null)
                    // One builder carries both the empty pre/post tags and the highlighted
                    // field. Setting the highlighter twice would keep only the last builder
                    // and silently drop the tag configuration.
                    .highlighter(new HighlightBuilder()
                        .preTags("")
                        .postTags("")
                        .field("content")))
            .execute().actionGet();

        checkValid(result);
        checkJsonSerialization(result);
    }

    /**
     * Passes an algorithm attribute (Lingo's desired cluster count base) with the
     * request and checks the number of returned groups stays within the expected bound.
     */
    public void testAttributes() throws IOException {
        Map<String,Object> attrs = new HashMap<>();
        LingoClusteringAlgorithmDescriptor.attributeBuilder(attrs)
            .desiredClusterCountBase(5);

        ClusteringActionResponse result = new ClusteringActionRequestBuilder(client)
            .setQueryHint("data mining")
            .addSourceFieldMapping("title", LogicalField.TITLE)
            .addSourceFieldMapping("content", LogicalField.CONTENT)
            .addAttributes(attrs)
            .setSearchRequest(
              client.prepareSearch()
                    .setIndices(INDEX_NAME)
                    .setTypes("test")
                    .setSize(100)
                    .setQuery(QueryBuilders.matchAllQuery())
                    .setFetchSource(new String[] {"title", "content"}, null))
            .execute().actionGet();

        checkValid(result);
        checkJsonSerialization(result);

        Assertions.assertThat(result.getDocumentGroups().length)
            .isBetween(0, 5 + /* other topics */ 1);
    }

    /**
     * Maps the {@code rndlang} field to the logical language field and checks that
     * many different languages show up as top-level group labels.
     */
    public void testLanguageField() throws IOException {
        Map<String,Object> attrs = new HashMap<>();

        // We can't serialize enum attributes via ES infrastructure so use string
        // constants from the descriptor.
        attrs.put(
            MultilingualClusteringDescriptor.Keys.LANGUAGE_AGGREGATION_STRATEGY,
            LanguageAggregationStrategy.FLATTEN_NONE.name());

        ClusteringActionResponse result = new ClusteringActionRequestBuilder(client)
            .setQueryHint("data mining")
            .addSourceFieldMapping("title", LogicalField.TITLE)
            .addSourceFieldMapping("content", LogicalField.CONTENT)
            .addSourceFieldMapping("rndlang", LogicalField.LANGUAGE)
            .addAttributes(attrs)
            .setSearchRequest(
              client.prepareSearch()
                    .setIndices(INDEX_NAME)
                    .setTypes("test")
                    .setSize(100)
                    .setQuery(QueryBuilders.termQuery("content", "data"))
                    .setFetchSource(new String[] {"title", "content", "rndlang"}, null))
            .get();

        checkValid(result);
        checkJsonSerialization(result);

        // Top level groups should be input documents' languages (aggregation strategy above).
        DocumentGroup[] documentGroups = result.getDocumentGroups();

        // Start from every known language and remove those seen as a group label;
        // whatever remains did not appear in the top-level groups.
        Set<String> allLanguages = new HashSet<>();
        for (LanguageCode code : LanguageCode.values()) {
            allLanguages.add(code.toString());
        }

        for (DocumentGroup group : documentGroups) {
            if (!group.isOtherTopics()) {
                allLanguages.remove(group.getLabel());
            }
        }

        Assertions.assertThat(allLanguages.size())
            .describedAs("Expected a lot of languages to appear in top groups: " + allLanguages)
            .isLessThan(LanguageCode.values().length / 2);
    }

    /**
     * Checks that the algorithm listing action reports the expected algorithm identifiers.
     */
    public void testListAlgorithms() throws IOException {
        ListAlgorithmsActionResponse response =
            new ListAlgorithmsActionRequestBuilder(client).get();

        List<String> algorithms = response.getAlgorithms();
        Assertions.assertThat(algorithms)
            .isNotEmpty()
            .contains("stc", "lingo", "kmeans");
    }

    /**
     * Maps logical fields to a field name that does not exist in the documents; the
     * request should succeed and produce no clusters other than "other topics".
     */
    public void testNonexistentFields() throws IOException {
        ClusteringActionResponse result = new ClusteringActionRequestBuilder(client)
            .setQueryHint("data mining")
            .addSourceFieldMapping("_nonexistent_", LogicalField.TITLE)
            .addSourceFieldMapping("_nonexistent_", LogicalField.CONTENT)
            .setSearchRequest(
              client.prepareSearch()
                    .setIndices(INDEX_NAME)
                    .setTypes("test")
                    .setSize(100)
                    .setQuery(QueryBuilders.termQuery("content", "data"))
                    .setFetchSource(new String[] {"title", "content"}, null))
            .execute().actionGet();

        // There should be no clusters, but no errors.
        checkValid(result);
        checkJsonSerialization(result);

        // Only the "other topics" group is acceptable here; any real cluster is a failure.
        DocumentGroup[] documentGroups = result.getDocumentGroups();
        for (DocumentGroup group : documentGroups) {
            if (!group.isOtherTopics()) {
                Assertions.fail("Expected no clusters for non-existent fields.");
            }
        }
    }

    /**
     * Requests an algorithm identifier that does not exist and expects an
     * {@link IllegalArgumentException} naming the problem.
     */
    public void testNonexistentAlgorithmId() throws IOException {
        // The query should result in an error.
        try {
            new ClusteringActionRequestBuilder(client)
                .setQueryHint("")
                .addSourceFieldMapping("_nonexistent_", LogicalField.TITLE)
                .setAlgorithm("_nonexistent_")
                .setSearchRequest(
                  client.prepareSearch()
                        .setIndices(INDEX_NAME)
                        .setTypes("test")
                        .setSize(100)
                        .setQuery(QueryBuilders.termQuery("content", "data"))
                        .setFetchSource(new String[] {"title", "content"}, null))
                .execute().actionGet();
            throw Preconditions.unreachable();
        } catch (IllegalArgumentException e) {
            Assertions.assertThat(e)
                .hasMessageContaining("No such algorithm:");
        }
    }

    /**
     * Passes an out-of-range STC attribute and expects the resulting failure to
     * surface as an {@link ElasticsearchException} that mentions the attribute key.
     */
    public void testPropagatingAlgorithmException() throws IOException {
        // The query should result in an error.
        try {
            Map<String,Object> attrs = new HashMap<>();
            // Out of allowed range (should cause an exception).
            STCClusteringAlgorithmDescriptor.attributeBuilder(attrs)
                .ignoreWordIfInHigherDocsPercent(Double.MAX_VALUE);

            new ClusteringActionRequestBuilder(client)
                .setQueryHint("")
                .addSourceFieldMapping("title", LogicalField.TITLE)
                .addSourceFieldMapping("content", LogicalField.CONTENT)
                .setAlgorithm("stc")
                .addAttributes(attrs)
                .setSearchRequest(
                  client.prepareSearch()
                        .setIndices(INDEX_NAME)
                        .setTypes("test")
                        .setSize(100)
                        .setQuery(QueryBuilders.termQuery("content", "data"))
                        .setFetchSource(new String[] {"title", "content"}, null))
                .execute().actionGet();
            throw Preconditions.unreachable();
        } catch (ElasticsearchException e) {
            Assertions.assertThat(e)
                .hasMessageContaining("Search results clustering error:")
                .hasMessageContaining(STCClusteringAlgorithmDescriptor.Keys.IGNORE_WORD_IF_IN_HIGHER_DOCS_PERCENT);
        }
    }

    /**
     * Runs the same clustering request with the default hit output and with
     * {@code maxHits = 0}, then checks that the two JSON responses differ only in
     * the hits (ignoring the {@code took}, {@code info} and {@code profile} entries).
     */
    public void testIncludeHits() throws IOException {
        // same search with and without hits
        SearchRequestBuilder req = client.prepareSearch()
                .setIndices(INDEX_NAME)
                .setTypes("test")
                .setSize(2)
                .setQuery(QueryBuilders.termQuery("content", "data"))
                .setFetchSource(new String[] {"content"}, null);

        // with hits (default)
        ClusteringActionResponse resultWithHits = new ClusteringActionRequestBuilder(client)
            .setQueryHint("data mining")
            .setAlgorithm("stc")
            .addSourceFieldMapping("title", LogicalField.TITLE)
            .setSearchRequest(req)
            .execute().actionGet();
        checkValid(resultWithHits);
        checkJsonSerialization(resultWithHits);

        // get JSON output
        JSONObject jsonWithHits = renderAsJson(resultWithHits);
        Assertions.assertThat(jsonWithHits.has("hits")).isTrue();

        // without hits
        ClusteringActionResponse resultWithoutHits = new ClusteringActionRequestBuilder(client)
            .setQueryHint("data mining")
            .setMaxHits(0)
            .setAlgorithm("stc")
            .addSourceFieldMapping("title", LogicalField.TITLE)
            .setSearchRequest(req)
            .execute().actionGet();
        checkValid(resultWithoutHits);
        checkJsonSerialization(resultWithoutHits);

        // get JSON output
        JSONObject jsonWithoutHits = renderAsJson(resultWithoutHits);
        Assertions.assertThat(
            jsonWithoutHits
                .getJSONObject("hits")
                .getJSONArray("hits").length()).isEqualTo(0);

        // insert hits into jsonWithoutHits
        JSONObject jsonHits = (JSONObject) jsonWithHits.get("hits");
        jsonWithoutHits.put("hits", jsonHits);

        // took can vary, so ignore it
        jsonWithoutHits.remove("took");
        jsonWithHits.remove("took");

        // info can vary (clustering-millis, output_hits), so ignore it
        jsonWithoutHits.remove("info");
        jsonWithHits.remove("info");

        // profile can vary
        jsonWithoutHits.remove("profile");
        jsonWithHits.remove("profile");

        // now they should match
        logger.debug("--> with:\n" + jsonWithHits.toString());
        logger.debug("--> without:\n" + jsonWithoutHits.toString());
        Assertions.assertThat(jsonWithHits.toString()).isEqualTo(jsonWithoutHits.toString());
    }

    /**
     * Sets {@code maxHits = 2} on the clustering request and checks that both the
     * response object and its JSON rendering contain exactly two hits.
     */
    public void testMaxHits() throws IOException {
        // Search request wrapped by the clustering request below.
        SearchRequestBuilder req = client.prepareSearch()
                .setIndices(INDEX_NAME)
                .setTypes("test")
                .setSize(2)
                .setQuery(QueryBuilders.termQuery("content", "data"))
                .setFetchSource(new String[] {"content"}, null);

        // Limit the set of hits to just top 2.
        ClusteringActionResponse limitedHits = new ClusteringActionRequestBuilder(client)
            .setQueryHint("data mining")
            .setMaxHits(2)
            .setAlgorithm("stc")
            .addSourceFieldMapping("title", LogicalField.TITLE)
            .setSearchRequest(req)
            .execute().actionGet();
        checkValid(limitedHits);
        checkJsonSerialization(limitedHits);

        Assertions.assertThat(limitedHits.getSearchResponse().getHits().getHits())
            .hasSize(2);

        // get JSON output
        JSONObject json = renderAsJson(limitedHits);
        Assertions.assertThat(json
            .getJSONObject("hits")
            .getJSONArray("hits").length()).isEqualTo(2);
    }

    /**
     * Renders a clustering response into a JSON object by wrapping its
     * {@code toXContent} output in a single enclosing object.
     *
     * @param response the clustering response to render
     * @return the parsed JSON representation of the response
     * @throws IOException if building the JSON content fails
     */
    private static JSONObject renderAsJson(ClusteringActionResponse response) throws IOException {
        XContentBuilder builder = XContentFactory.jsonBuilder().prettyPrint();
        builder.startObject();
        response.toXContent(builder, ToXContent.EMPTY_PARAMS);
        builder.endObject();
        return new JSONObject(builder.string());
    }
}