/* * Licensed to Elasticsearch under one or more contributor * license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright * ownership. Elasticsearch licenses this file to you under * the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.elasticsearch.search.aggregations.bucket.significant; import org.apache.lucene.util.BytesRef; import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.Version; import org.elasticsearch.common.io.stream.InputStreamStreamInput; import org.elasticsearch.common.io.stream.OutputStreamStreamOutput; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentFactory; import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.common.xcontent.json.JsonXContent; import org.elasticsearch.search.SearchShardTarget; import org.elasticsearch.search.aggregations.InternalAggregation; import org.elasticsearch.search.aggregations.InternalAggregations; import org.elasticsearch.search.aggregations.bucket.significant.heuristics.*; import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; import org.elasticsearch.search.aggregations.support.format.ValueFormatter; import org.elasticsearch.search.internal.SearchContext; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.TestSearchContext; import org.junit.Test; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.*; import static org.elasticsearch.test.VersionUtils.randomVersion; import static org.hamcrest.Matchers.*; /** * */ public class SignificanceHeuristicTests extends ESTestCase { static class SignificantTermsTestSearchContext extends TestSearchContext { @Override public int numberOfShards() { return 1; } @Override public SearchShardTarget shardTarget() { return new SearchShardTarget("no node, this is a unit test", "no index, this is a unit test", 0); } } // test that stream output can actually be read - does not replace bwc test @Test public void testStreamResponse() throws Exception { Version version = randomVersion(random()); InternalSignificantTerms[] sigTerms = getRandomSignificantTerms(getRandomSignificanceheuristic()); // write ByteArrayOutputStream outBuffer = new ByteArrayOutputStream(); OutputStreamStreamOutput out = new OutputStreamStreamOutput(outBuffer); out.setVersion(version); sigTerms[0].writeTo(out); // read ByteArrayInputStream inBuffer = new ByteArrayInputStream(outBuffer.toByteArray()); InputStreamStreamInput in = new InputStreamStreamInput(inBuffer); in.setVersion(version); sigTerms[1].readFrom(in); assertTrue(sigTerms[1].significanceHeuristic.equals(sigTerms[0].significanceHeuristic)); InternalSignificantTerms.Bucket originalBucket = (InternalSignificantTerms.Bucket) sigTerms[0].buckets.get(0); InternalSignificantTerms.Bucket streamedBucket = (InternalSignificantTerms.Bucket) sigTerms[1].buckets.get(0); assertThat(originalBucket.getKeyAsString(), equalTo(streamedBucket.getKeyAsString())); assertThat(originalBucket.getSupersetDf(), equalTo(streamedBucket.getSupersetDf())); assertThat(originalBucket.getSubsetDf(), equalTo(streamedBucket.getSubsetDf())); assertThat(streamedBucket.getSubsetSize(), equalTo(10l)); assertThat(streamedBucket.getSupersetSize(), equalTo(20l)); } InternalSignificantTerms[] getRandomSignificantTerms(SignificanceHeuristic heuristic) { InternalSignificantTerms[] sTerms = new InternalSignificantTerms[2]; ArrayList<InternalSignificantTerms.Bucket> buckets = new ArrayList<>(); if (randomBoolean()) { buckets.add(new SignificantLongTerms.Bucket(1, 2, 3, 4, 123, InternalAggregations.EMPTY, null)); sTerms[0] = new SignificantLongTerms(10, 20, "some_name", null, 1, 1, heuristic, buckets, (List<PipelineAggregator>) Collections.EMPTY_LIST, null); sTerms[1] = new SignificantLongTerms(); } else { BytesRef term = new BytesRef("someterm"); buckets.add(new SignificantStringTerms.Bucket(term, 1, 2, 3, 4, InternalAggregations.EMPTY)); sTerms[0] = new SignificantStringTerms(10, 20, "some_name", 1, 1, heuristic, buckets, (List<PipelineAggregator>) Collections.EMPTY_LIST, null); sTerms[1] = new SignificantStringTerms(); } return sTerms; } SignificanceHeuristic getRandomSignificanceheuristic() { List<SignificanceHeuristic> heuristics = new ArrayList<>(); heuristics.add(JLHScore.INSTANCE); heuristics.add(new MutualInformation(randomBoolean(), randomBoolean())); heuristics.add(new GND(randomBoolean())); heuristics.add(new ChiSquare(randomBoolean(), randomBoolean())); return heuristics.get(randomInt(3)); } @Test public void testReduce() { List<InternalAggregation> aggs = createInternalAggregations(); SignificantTerms reducedAgg = (SignificantTerms) aggs.get(0).doReduce(aggs, null); assertThat(reducedAgg.getBuckets().size(), equalTo(2)); assertThat(reducedAgg.getBuckets().get(0).getSubsetDf(), equalTo(8l)); assertThat(reducedAgg.getBuckets().get(0).getSubsetSize(), equalTo(16l)); assertThat(reducedAgg.getBuckets().get(0).getSupersetDf(), equalTo(10l)); assertThat(reducedAgg.getBuckets().get(0).getSupersetSize(), equalTo(30l)); assertThat(reducedAgg.getBuckets().get(1).getSubsetDf(), equalTo(8l)); assertThat(reducedAgg.getBuckets().get(1).getSubsetSize(), equalTo(16l)); assertThat(reducedAgg.getBuckets().get(1).getSupersetDf(), equalTo(10l)); assertThat(reducedAgg.getBuckets().get(1).getSupersetSize(), equalTo(30l)); } // Create aggregations as they might come from three different shards and return as list. private List<InternalAggregation> createInternalAggregations() { String type = randomBoolean() ? "long" : "string"; SignificanceHeuristic significanceHeuristic = getRandomSignificanceheuristic(); List<InternalAggregation> aggs = new ArrayList<>(); List<InternalSignificantTerms.Bucket> terms0Buckets = new ArrayList<>(); terms0Buckets.add(createBucket(type, 4, 4, 5, 10, 0)); aggs.add(createAggregation(type, significanceHeuristic, terms0Buckets, 4, 10)); List<InternalSignificantTerms.Bucket> terms1Buckets = new ArrayList<>(); terms0Buckets.add(createBucket(type, 4, 4, 5, 10, 1)); aggs.add(createAggregation(type, significanceHeuristic, terms1Buckets, 4, 10)); List<InternalSignificantTerms.Bucket> terms01Buckets = new ArrayList<>(); terms0Buckets.add(createBucket(type, 4, 8, 5, 10, 0)); terms0Buckets.add(createBucket(type, 4, 8, 5, 10, 1)); aggs.add(createAggregation(type, significanceHeuristic, terms01Buckets, 8, 10)); return aggs; } private InternalSignificantTerms createAggregation(String type, SignificanceHeuristic significanceHeuristic, List<InternalSignificantTerms.Bucket> buckets, long subsetSize, long supersetSize) { if (type.equals("string")) { return new SignificantStringTerms(subsetSize, supersetSize, "sig_terms", 2, -1, significanceHeuristic, buckets, new ArrayList<PipelineAggregator>(), new HashMap<String, Object>()); } else { return new SignificantLongTerms(subsetSize, supersetSize, "sig_terms", ValueFormatter.RAW, 2, -1, significanceHeuristic, buckets, new ArrayList<PipelineAggregator>(), new HashMap<String, Object>()); } } private InternalSignificantTerms.Bucket createBucket(String type, long subsetDF, long subsetSize, long supersetDF, long supersetSize, long label) { if (type.equals("string")) { return new SignificantStringTerms.Bucket(new BytesRef(Long.toString(label).getBytes(StandardCharsets.UTF_8)), subsetDF, subsetSize, supersetDF, supersetSize, InternalAggregations.EMPTY); } else { return new SignificantLongTerms.Bucket(subsetDF, subsetSize, supersetDF, supersetSize, label, InternalAggregations.EMPTY, ValueFormatter.RAW); } } // test that // 1. The output of the builders can actually be parsed // 2. The parser does not swallow parameters after a significance heuristic was defined @Test public void testBuilderAndParser() throws Exception { Set<SignificanceHeuristicParser> parsers = new HashSet<>(); SignificanceHeuristicParserMapper heuristicParserMapper = new SignificanceHeuristicParserMapper(parsers, null); SearchContext searchContext = new SignificantTermsTestSearchContext(); // test jlh with string assertTrue(parseFromString(heuristicParserMapper, searchContext, "\"jlh\":{}") instanceof JLHScore); // test gnd with string assertTrue(parseFromString(heuristicParserMapper, searchContext, "\"gnd\":{}") instanceof GND); // test mutual information with string boolean includeNegatives = randomBoolean(); boolean backgroundIsSuperset = randomBoolean(); assertThat(parseFromString(heuristicParserMapper, searchContext, "\"mutual_information\":{\"include_negatives\": " + includeNegatives + ", \"background_is_superset\":" + backgroundIsSuperset + "}"), equalTo((SignificanceHeuristic) (new MutualInformation(includeNegatives, backgroundIsSuperset)))); assertThat(parseFromString(heuristicParserMapper, searchContext, "\"chi_square\":{\"include_negatives\": " + includeNegatives + ", \"background_is_superset\":" + backgroundIsSuperset + "}"), equalTo((SignificanceHeuristic) (new ChiSquare(includeNegatives, backgroundIsSuperset)))); // test with builders assertTrue(parseFromBuilder(heuristicParserMapper, searchContext, new JLHScore.JLHScoreBuilder()) instanceof JLHScore); assertTrue(parseFromBuilder(heuristicParserMapper, searchContext, new GND.GNDBuilder(backgroundIsSuperset)) instanceof GND); assertThat(parseFromBuilder(heuristicParserMapper, searchContext, new MutualInformation.MutualInformationBuilder(includeNegatives, backgroundIsSuperset)), equalTo((SignificanceHeuristic) new MutualInformation(includeNegatives, backgroundIsSuperset))); assertThat(parseFromBuilder(heuristicParserMapper, searchContext, new ChiSquare.ChiSquareBuilder(includeNegatives, backgroundIsSuperset)), equalTo((SignificanceHeuristic) new ChiSquare(includeNegatives, backgroundIsSuperset))); // test exceptions String faultyHeuristicdefinition = "\"mutual_information\":{\"include_negatives\": false, \"some_unknown_field\": false}"; String expectedError = "unknown field [some_unknown_field]"; checkParseException(heuristicParserMapper, searchContext, faultyHeuristicdefinition, expectedError); faultyHeuristicdefinition = "\"chi_square\":{\"unknown_field\": true}"; expectedError = "unknown field [unknown_field]"; checkParseException(heuristicParserMapper, searchContext, faultyHeuristicdefinition, expectedError); faultyHeuristicdefinition = "\"jlh\":{\"unknown_field\": true}"; expectedError = "expected an empty object, but found "; checkParseException(heuristicParserMapper, searchContext, faultyHeuristicdefinition, expectedError); faultyHeuristicdefinition = "\"gnd\":{\"unknown_field\": true}"; expectedError = "unknown field [unknown_field]"; checkParseException(heuristicParserMapper, searchContext, faultyHeuristicdefinition, expectedError); } protected void checkParseException(SignificanceHeuristicParserMapper heuristicParserMapper, SearchContext searchContext, String faultyHeuristicDefinition, String expectedError) throws IOException { try { XContentParser stParser = JsonXContent.jsonXContent.createParser("{\"field\":\"text\", " + faultyHeuristicDefinition + ",\"min_doc_count\":200}"); stParser.nextToken(); new SignificantTermsParser(heuristicParserMapper).parse("testagg", stParser, searchContext); fail(); } catch (ElasticsearchParseException e) { assertTrue(e.getMessage().contains(expectedError)); } } protected SignificanceHeuristic parseFromBuilder(SignificanceHeuristicParserMapper heuristicParserMapper, SearchContext searchContext, SignificanceHeuristicBuilder significanceHeuristicBuilder) throws IOException { SignificantTermsBuilder stBuilder = new SignificantTermsBuilder("testagg"); stBuilder.significanceHeuristic(significanceHeuristicBuilder).field("text").minDocCount(200); XContentBuilder stXContentBuilder = XContentFactory.jsonBuilder(); stBuilder.internalXContent(stXContentBuilder, null); XContentParser stParser = JsonXContent.jsonXContent.createParser(stXContentBuilder.string()); return parseSignificanceHeuristic(heuristicParserMapper, searchContext, stParser); } private SignificanceHeuristic parseSignificanceHeuristic(SignificanceHeuristicParserMapper heuristicParserMapper, SearchContext searchContext, XContentParser stParser) throws IOException { stParser.nextToken(); SignificantTermsAggregatorFactory aggregatorFactory = (SignificantTermsAggregatorFactory) new SignificantTermsParser(heuristicParserMapper).parse("testagg", stParser, searchContext); stParser.nextToken(); assertThat(aggregatorFactory.getBucketCountThresholds().getMinDocCount(), equalTo(200l)); assertThat(stParser.currentToken(), equalTo(null)); stParser.close(); return aggregatorFactory.getSignificanceHeuristic(); } protected SignificanceHeuristic parseFromString(SignificanceHeuristicParserMapper heuristicParserMapper, SearchContext searchContext, String heuristicString) throws IOException { XContentParser stParser = JsonXContent.jsonXContent.createParser("{\"field\":\"text\", " + heuristicString + ", \"min_doc_count\":200}"); return parseSignificanceHeuristic(heuristicParserMapper, searchContext, stParser); } void testBackgroundAssertions(SignificanceHeuristic heuristicIsSuperset, SignificanceHeuristic heuristicNotSuperset) { try { heuristicIsSuperset.getScore(2, 3, 1, 4); fail(); } catch (IllegalArgumentException illegalArgumentException) { assertNotNull(illegalArgumentException.getMessage()); assertTrue(illegalArgumentException.getMessage().contains("subsetFreq > supersetFreq")); } try { heuristicIsSuperset.getScore(1, 4, 2, 3); fail(); } catch (IllegalArgumentException illegalArgumentException) { assertNotNull(illegalArgumentException.getMessage()); assertTrue(illegalArgumentException.getMessage().contains("subsetSize > supersetSize")); } try { heuristicIsSuperset.getScore(2, 1, 3, 4); fail(); } catch (IllegalArgumentException illegalArgumentException) { assertNotNull(illegalArgumentException.getMessage()); assertTrue(illegalArgumentException.getMessage().contains("subsetFreq > subsetSize")); } try { heuristicIsSuperset.getScore(1, 2, 4, 3); fail(); } catch (IllegalArgumentException illegalArgumentException) { assertNotNull(illegalArgumentException.getMessage()); assertTrue(illegalArgumentException.getMessage().contains("supersetFreq > supersetSize")); } try { heuristicIsSuperset.getScore(1, 3, 4, 4); fail(); } catch (IllegalArgumentException assertionError) { assertNotNull(assertionError.getMessage()); assertTrue(assertionError.getMessage().contains("supersetFreq - subsetFreq > supersetSize - subsetSize")); } try { int idx = randomInt(3); long[] values = {1, 2, 3, 4}; values[idx] *= -1; heuristicIsSuperset.getScore(values[0], values[1], values[2], values[3]); fail(); } catch (IllegalArgumentException illegalArgumentException) { assertNotNull(illegalArgumentException.getMessage()); assertTrue(illegalArgumentException.getMessage().contains("Frequencies of subset and superset must be positive")); } try { heuristicNotSuperset.getScore(2, 1, 3, 4); fail(); } catch (IllegalArgumentException illegalArgumentException) { assertNotNull(illegalArgumentException.getMessage()); assertTrue(illegalArgumentException.getMessage().contains("subsetFreq > subsetSize")); } try { heuristicNotSuperset.getScore(1, 2, 4, 3); fail(); } catch (IllegalArgumentException illegalArgumentException) { assertNotNull(illegalArgumentException.getMessage()); assertTrue(illegalArgumentException.getMessage().contains("supersetFreq > supersetSize")); } try { int idx = randomInt(3); long[] values = {1, 2, 3, 4}; values[idx] *= -1; heuristicNotSuperset.getScore(values[0], values[1], values[2], values[3]); fail(); } catch (IllegalArgumentException illegalArgumentException) { assertNotNull(illegalArgumentException.getMessage()); assertTrue(illegalArgumentException.getMessage().contains("Frequencies of subset and superset must be positive")); } } void testAssertions(SignificanceHeuristic heuristic) { try { int idx = randomInt(3); long[] values = {1, 2, 3, 4}; values[idx] *= -1; heuristic.getScore(values[0], values[1], values[2], values[3]); fail(); } catch (IllegalArgumentException illegalArgumentException) { assertNotNull(illegalArgumentException.getMessage()); assertTrue(illegalArgumentException.getMessage().contains("Frequencies of subset and superset must be positive")); } try { heuristic.getScore(1, 2, 4, 3); fail(); } catch (IllegalArgumentException illegalArgumentException) { assertNotNull(illegalArgumentException.getMessage()); assertTrue(illegalArgumentException.getMessage().contains("supersetFreq > supersetSize")); } try { heuristic.getScore(2, 1, 3, 4); fail(); } catch (IllegalArgumentException illegalArgumentException) { assertNotNull(illegalArgumentException.getMessage()); assertTrue(illegalArgumentException.getMessage().contains("subsetFreq > subsetSize")); } } @Test public void testAssertions() throws Exception { testBackgroundAssertions(new MutualInformation(true, true), new MutualInformation(true, false)); testBackgroundAssertions(new ChiSquare(true, true), new ChiSquare(true, false)); testBackgroundAssertions(new GND(true), new GND(false)); testAssertions(PercentageScore.INSTANCE); testAssertions(JLHScore.INSTANCE); } @Test public void basicScoreProperties() { basicScoreProperties(JLHScore.INSTANCE, true); basicScoreProperties(new GND(true), true); basicScoreProperties(PercentageScore.INSTANCE, true); basicScoreProperties(new MutualInformation(true, true), false); basicScoreProperties(new ChiSquare(true, true), false); } public void basicScoreProperties(SignificanceHeuristic heuristic, boolean test0) { assertThat(heuristic.getScore(1, 1, 1, 3), greaterThan(0.0)); assertThat(heuristic.getScore(1, 1, 2, 3), lessThan(heuristic.getScore(1, 1, 1, 3))); assertThat(heuristic.getScore(1, 1, 3, 4), lessThan(heuristic.getScore(1, 1, 2, 4))); if (test0) { assertThat(heuristic.getScore(0, 1, 2, 3), equalTo(0.0)); } double score = 0.0; try { long a = randomLong(); long b = randomLong(); long c = randomLong(); long d = randomLong(); score = heuristic.getScore(a, b, c, d); } catch (IllegalArgumentException e) { } assertThat(score, greaterThanOrEqualTo(0.0)); } @Test public void scoreMutual() throws Exception { SignificanceHeuristic heuristic = new MutualInformation(true, true); assertThat(heuristic.getScore(1, 1, 1, 3), greaterThan(0.0)); assertThat(heuristic.getScore(1, 1, 2, 3), lessThan(heuristic.getScore(1, 1, 1, 3))); assertThat(heuristic.getScore(2, 2, 2, 4), equalTo(1.0)); assertThat(heuristic.getScore(0, 2, 2, 4), equalTo(1.0)); assertThat(heuristic.getScore(2, 2, 4, 4), equalTo(0.0)); assertThat(heuristic.getScore(1, 2, 2, 4), equalTo(0.0)); assertThat(heuristic.getScore(3, 6, 9, 18), equalTo(0.0)); double score = 0.0; try { long a = randomLong(); long b = randomLong(); long c = randomLong(); long d = randomLong(); score = heuristic.getScore(a, b, c, d); } catch (IllegalArgumentException e) { } assertThat(score, lessThanOrEqualTo(1.0)); assertThat(score, greaterThanOrEqualTo(0.0)); heuristic = new MutualInformation(false, true); assertThat(heuristic.getScore(0, 1, 2, 3), equalTo(Double.NEGATIVE_INFINITY)); heuristic = new MutualInformation(true, false); score = heuristic.getScore(2, 3, 1, 4); assertThat(score, greaterThanOrEqualTo(0.0)); assertThat(score, lessThanOrEqualTo(1.0)); score = heuristic.getScore(1, 4, 2, 3); assertThat(score, greaterThanOrEqualTo(0.0)); assertThat(score, lessThanOrEqualTo(1.0)); score = heuristic.getScore(1, 3, 4, 4); assertThat(score, greaterThanOrEqualTo(0.0)); assertThat(score, lessThanOrEqualTo(1.0)); } @Test public void testGNDCornerCases() throws Exception { GND gnd = new GND(true); //term is only in the subset, not at all in the other set but that is because the other set is empty. // this should actually not happen because only terms that are in the subset are considered now, // however, in this case the score should be 0 because a term that does not exist cannot be relevant... assertThat(gnd.getScore(0, randomIntBetween(1, 2), 0, randomIntBetween(2,3)), equalTo(0.0)); // the terms do not co-occur at all - should be 0 assertThat(gnd.getScore(0, randomIntBetween(1, 2), randomIntBetween(2, 3), randomIntBetween(5,6)), equalTo(0.0)); // comparison between two terms that do not exist - probably not relevant assertThat(gnd.getScore(0, 0, 0, randomIntBetween(1,2)), equalTo(0.0)); // terms co-occur perfectly - should be 1 assertThat(gnd.getScore(1, 1, 1, 1), equalTo(1.0)); gnd = new GND(false); assertThat(gnd.getScore(0, 0, 0, 0), equalTo(0.0)); } }