/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.search.aggregations.bucket.significant.heuristics;

import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.query.QueryParseContext;
import org.elasticsearch.index.query.QueryShardException;

import java.io.IOException;

/**
 * Base class for significance heuristics that are computed from the counts of a 2x2
 * contingency table (term present / absent versus document in class / not in class).
 * It holds the two options shared by those heuristics ({@code include_negatives} and
 * {@code background_is_superset}), their stream and x-content serialization, and the
 * derivation of the table counts from the raw subset/superset frequencies.
 */
public abstract class NXYSignificanceHeuristic extends SignificanceHeuristic {

    protected static final ParseField BACKGROUND_IS_SUPERSET = new ParseField("background_is_superset");

    protected static final ParseField INCLUDE_NEGATIVES_FIELD = new ParseField("include_negatives");

    /** Suffix appended to the error messages raised by {@link #checkFrequencies}. */
    protected static final String SCORE_ERROR_MESSAGE = ", does your background filter not include all documents in the bucket? " +
            "If so and it is intentional, set \"" + BACKGROUND_IS_SUPERSET.getPreferredName() + "\": false";

    /**
     * Whether the background (superset) counts already contain the subset counts. Selects
     * which of the two count derivations {@link #computeNxys} uses and whether
     * {@link #checkFrequencies} enforces the subset/superset consistency checks.
     */
    protected final boolean backgroundIsSuperset;

    /**
     * Some heuristics do not differentiate between terms that are descriptive for subset or for
     * the background without the subset. We might want to filter out the terms that appear much less often
     * in the subset than in the background without the subset.
     */
    protected final boolean includeNegatives;

    /**
     * @param includeNegatives     value for the {@code include_negatives} option
     * @param backgroundIsSuperset value for the {@code background_is_superset} option
     */
    protected NXYSignificanceHeuristic(boolean includeNegatives, boolean backgroundIsSuperset) {
        this.includeNegatives = includeNegatives;
        this.backgroundIsSuperset = backgroundIsSuperset;
    }

    /**
     * Read from a stream. The two flags are read in the same order in which
     * {@link #writeTo(StreamOutput)} writes them.
     */
    protected NXYSignificanceHeuristic(StreamInput in) throws IOException {
        includeNegatives = in.readBoolean();
        backgroundIsSuperset = in.readBoolean();
    }

    /**
     * Writes {@code includeNegatives} followed by {@code backgroundIsSuperset} to the stream.
     */
    @Override
    public void writeTo(StreamOutput out) throws IOException {
        out.writeBoolean(includeNegatives);
        out.writeBoolean(backgroundIsSuperset);
    }

    /**
     * Two heuristics are equal when both option flags match.
     *
     * @param other the object to compare with; may be {@code null} or of an unrelated type,
     *              in which case {@code false} is returned instead of throwing
     */
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        // Guard the cast: an unconditional cast would throw on null or on a foreign type,
        // which the Object.equals contract does not allow.
        if (!(other instanceof NXYSignificanceHeuristic)) {
            return false;
        }
        NXYSignificanceHeuristic that = (NXYSignificanceHeuristic) other;
        return that.includeNegatives == includeNegatives
                && that.backgroundIsSuperset == backgroundIsSuperset;
    }

    /**
     * Hash over the same two flags that {@link #equals(Object)} compares.
     */
    @Override
    public int hashCode() {
        int result = (includeNegatives ? 1 : 0);
        result = 31 * result + (backgroundIsSuperset ? 1 : 0);
        return result;
    }

    /**
     * Plain holder for the contingency table counts produced by {@link #computeNxys}.
     * In {@code Nxy}, {@code x} is 1 when the term is present and {@code y} is 1 when the
     * document is in the class; {@code _} marks a marginal sum over that position and
     * {@code N} is the total.
     */
    protected static class Frequencies {
        double N00, N01, N10, N11, N0_, N1_, N_0, N_1, N;
    }

    /**
     * Validates the raw frequencies and derives the contingency table counts from them.
     *
     * @param subsetFreq        number of subset documents containing the term
     * @param subsetSize        number of documents in the subset
     * @param supersetFreq      number of background documents containing the term
     * @param supersetSize      number of documents in the background
     * @param scoreFunctionName name of the calling heuristic, forwarded to the validity checks
     * @return the populated counts
     * @throws IllegalArgumentException if {@link #checkFrequencies} rejects the input
     */
    protected Frequencies computeNxys(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize,
                                      String scoreFunctionName) {
        checkFrequencies(subsetFreq, subsetSize, supersetFreq, supersetSize, scoreFunctionName);
        Frequencies frequencies = new Frequencies();
        if (backgroundIsSuperset) {
            // The background contains the subset, so the "not in class" counts are obtained
            // by subtracting the subset counts from the background counts.
            //documents not in class and do not contain term
            frequencies.N00 = supersetSize - supersetFreq - (subsetSize - subsetFreq);
            //documents in class and do not contain term
            frequencies.N01 = (subsetSize - subsetFreq);
            // documents not in class and do contain term
            frequencies.N10 = supersetFreq - subsetFreq;
            // documents in class and do contain term
            frequencies.N11 = subsetFreq;
            //documents that do not contain term
            frequencies.N0_ = supersetSize - supersetFreq;
            //documents that contain term
            frequencies.N1_ = supersetFreq;
            //documents that are not in class
            frequencies.N_0 = supersetSize - subsetSize;
            //documents that are in class
            frequencies.N_1 = subsetSize;
            //all docs
            frequencies.N = supersetSize;
        } else {
            // The background is treated as disjoint from the subset, so the background counts
            // are the "not in class" counts and the totals are sums of both sets.
            //documents not in class and do not contain term
            frequencies.N00 = supersetSize - supersetFreq;
            //documents in class and do not contain term
            frequencies.N01 = subsetSize - subsetFreq;
            // documents not in class and do contain term
            frequencies.N10 = supersetFreq;
            // documents in class and do contain term
            frequencies.N11 = subsetFreq;
            //documents that do not contain term
            frequencies.N0_ = supersetSize - supersetFreq + subsetSize - subsetFreq;
            //documents that contain term
            frequencies.N1_ = supersetFreq + subsetFreq;
            //documents that are not in class
            frequencies.N_0 = supersetSize;
            //documents that are in class
            frequencies.N_1 = subsetSize;
            //all docs
            frequencies.N = supersetSize + subsetSize;
        }
        return frequencies;
    }

    /**
     * Checks that the raw frequencies are usable. Always delegates to
     * {@code checkFrequencyValidity} (not defined in this file; presumably inherited from
     * {@link SignificanceHeuristic}). When the background is declared to be a superset it
     * additionally verifies that the subset counts do not exceed the background counts.
     *
     * @throws IllegalArgumentException if the background is declared a superset but the
     *                                  subset counts are inconsistent with it
     */
    protected void checkFrequencies(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize,
                                    String scoreFunctionName) {
        checkFrequencyValidity(subsetFreq, subsetSize, supersetFreq, supersetSize, scoreFunctionName);
        if (backgroundIsSuperset) {
            if (subsetFreq > supersetFreq) {
                throw new IllegalArgumentException("subsetFreq > supersetFreq" + SCORE_ERROR_MESSAGE);
            }
            if (subsetSize > supersetSize) {
                throw new IllegalArgumentException("subsetSize > supersetSize" + SCORE_ERROR_MESSAGE);
            }
            if (supersetFreq - subsetFreq > supersetSize - subsetSize) {
                throw new IllegalArgumentException("supersetFreq - subsetFreq > supersetSize - subsetSize" + SCORE_ERROR_MESSAGE);
            }
        }
    }

    /**
     * Writes the {@code include_negatives} and {@code background_is_superset} fields to the builder.
     */
    protected void build(XContentBuilder builder) throws IOException {
        builder.field(INCLUDE_NEGATIVES_FIELD.getPreferredName(), includeNegatives)
                .field(BACKGROUND_IS_SUPERSET.getPreferredName(), backgroundIsSuperset);
    }

    /**
     * Shared parser for heuristics based on this class. Subclasses only supply the
     * concrete heuristic through {@link #newHeuristic(boolean, boolean)}.
     */
    public abstract static class NXYParser implements SignificanceHeuristicParser {

        /**
         * Reads the two optional boolean fields until the end of the current object.
         * When a field is absent, {@code include_negatives} is {@code false} and
         * {@code background_is_superset} is {@code true}.
         *
         * @throws ElasticsearchParseException if any other field name is encountered
         */
        @Override
        public SignificanceHeuristic parse(QueryParseContext context)
                throws IOException, QueryShardException {
            XContentParser parser = context.parser();
            // Used only in the error message; assumes the parser is positioned on the
            // heuristic's own field name when parse is called -- TODO confirm against callers.
            String givenName = parser.currentName();
            boolean includeNegatives = false;
            boolean backgroundIsSuperset = true;
            XContentParser.Token token = parser.nextToken();
            while (!token.equals(XContentParser.Token.END_OBJECT)) {
                if (INCLUDE_NEGATIVES_FIELD.match(parser.currentName())) {
                    // advance from the field name to its value
                    parser.nextToken();
                    includeNegatives = parser.booleanValue();
                } else if (BACKGROUND_IS_SUPERSET.match(parser.currentName())) {
                    // advance from the field name to its value
                    parser.nextToken();
                    backgroundIsSuperset = parser.booleanValue();
                } else {
                    throw new ElasticsearchParseException("failed to parse [{}] significance heuristic. unknown field [{}]",
                            givenName, parser.currentName());
                }
                token = parser.nextToken();
            }
            return newHeuristic(includeNegatives, backgroundIsSuperset);
        }

        /**
         * Creates the concrete heuristic for the parsed option values.
         */
        protected abstract SignificanceHeuristic newHeuristic(boolean includeNegatives, boolean backgroundIsSuperset);
    }

    /**
     * Shared builder base that carries the two option flags and renders them as x-content.
     */
    protected abstract static class NXYBuilder implements SignificanceHeuristicBuilder {
        // No field initialisers: the only constructor always assigns both values.
        protected boolean includeNegatives;
        protected boolean backgroundIsSuperset;

        /**
         * @param includeNegatives     value written for {@code include_negatives}
         * @param backgroundIsSuperset value written for {@code background_is_superset}
         */
        public NXYBuilder(boolean includeNegatives, boolean backgroundIsSuperset) {
            this.includeNegatives = includeNegatives;
            this.backgroundIsSuperset = backgroundIsSuperset;
        }

        /**
         * Writes the {@code include_negatives} and {@code background_is_superset} fields to the builder.
         */
        protected void build(XContentBuilder builder) throws IOException {
            builder.field(INCLUDE_NEGATIVES_FIELD.getPreferredName(), includeNegatives)
                    .field(BACKGROUND_IS_SUPERSET.getPreferredName(), backgroundIsSuperset);
        }
    }
}