/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.search.aggregations.bucket.significant.heuristics;

import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.ParseFieldMatcher;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.query.QueryParsingException;
import org.elasticsearch.search.internal.SearchContext;

import java.io.IOException;

/**
 * Significance heuristic that scores terms with the Google Normalized Distance (GND).
 * The heuristic is registered under the name {@code "gnd"} and always passes {@code true}
 * as the first argument of its superclass constructor.
 */
public class GND extends NXYSignificanceHeuristic {

    /** Name under which this heuristic is parsed, rendered and streamed. */
    protected static final ParseField NAMES_FIELD = new ParseField("gnd");

    /** Stream reader that rebuilds a {@link GND} from the single boolean written by {@link #writeTo(StreamOutput)}. */
    public static final SignificanceHeuristicStreams.Stream STREAM = new SignificanceHeuristicStreams.Stream() {
        @Override
        public String getName() {
            return NAMES_FIELD.getPreferredName();
        }

        @Override
        public SignificanceHeuristic readResult(StreamInput in) throws IOException {
            boolean backgroundIsSuperset = in.readBoolean();
            return new GND(backgroundIsSuperset);
        }
    };

    /**
     * @param backgroundIsSuperset forwarded unchanged to the superclass constructor
     */
    public GND(boolean backgroundIsSuperset) {
        super(true, backgroundIsSuperset);
    }

    /**
     * Two heuristics are equal when the other one is also a {@link GND} and the superclass
     * considers them equal.
     */
    @Override
    public boolean equals(Object other) {
        return other instanceof GND && super.equals(other);
    }

    /** Combines the hash of the heuristic name with the superclass hash. */
    @Override
    public int hashCode() {
        return 31 * NAMES_FIELD.getPreferredName().hashCode() + super.hashCode();
    }

    /**
     * Calculates Google Normalized Distance, as described in "The Google Similarity Distance", Cilibrasi and Vitanyi, 2007
     * link: http://arxiv.org/pdf/cs/0412098v3.pdf
     *
     * @return {@code 0.0} when the joint frequency is zero, {@code 1.0} when both marginal
     *         frequencies equal the joint frequency, otherwise {@code exp(-GND)}
     */
    @Override
    public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) {
        Frequencies counts = computeNxys(subsetFreq, subsetSize, supersetFreq, supersetSize, "GND");
        double fx = counts.N1_;
        double fy = counts.N_1;
        double fxy = counts.N11;

        if (fxy == 0) {
            // no co-occurrence
            return 0.0;
        }
        if (fx == fy && fx == fxy) {
            // perfect co-occurrence
            return 1.0;
        }

        double total = counts.N;
        double logFx = Math.log(fx);
        double logFy = Math.log(fy);
        double numerator = Math.max(logFx, logFy) - Math.log(fxy);
        double denominator = Math.log(total) - Math.min(logFx, logFy);
        double distance = numerator / denominator;

        // GND gives relevant terms a low distance, so the value is inverted via exp(-distance)
        return Math.exp(-distance);
    }

    /** Writes the stream name followed by the {@code backgroundIsSuperset} flag. */
    @Override
    public void writeTo(StreamOutput out) throws IOException {
        out.writeString(STREAM.getName());
        out.writeBoolean(backgroundIsSuperset);
    }

    /** Parser for the {@code gnd} heuristic definition. */
    public static class GNDParser extends NXYParser {

        @Override
        public String[] getNames() {
            return NAMES_FIELD.getAllNamesIncludedDeprecated();
        }

        /**
         * Creates a {@link GND}; the {@code includeNegatives} argument is not used.
         */
        @Override
        protected SignificanceHeuristic newHeuristic(boolean includeNegatives, boolean backgroundIsSuperset) {
            return new GND(backgroundIsSuperset);
        }

        /**
         * Consumes tokens until {@code END_OBJECT}. The only accepted field is
         * {@code BACKGROUND_IS_SUPERSET}, which defaults to {@code true} when absent.
         *
         * @throws ElasticsearchParseException if any other field is encountered
         */
        @Override
        public SignificanceHeuristic parse(XContentParser parser, ParseFieldMatcher parseFieldMatcher, SearchContext context)
                throws IOException, QueryParsingException {
            final String heuristicName = parser.currentName();
            boolean backgroundIsSuperset = true;

            for (XContentParser.Token current = parser.nextToken();
                 !current.equals(XContentParser.Token.END_OBJECT);
                 current = parser.nextToken()) {
                if (!parseFieldMatcher.match(parser.currentName(), BACKGROUND_IS_SUPERSET)) {
                    throw new ElasticsearchParseException("failed to parse [{}] significance heuristic. unknown field [{}]",
                            heuristicName, parser.currentName());
                }
                // step from the field name onto its value before reading it
                parser.nextToken();
                backgroundIsSuperset = parser.booleanValue();
            }
            return newHeuristic(true, backgroundIsSuperset);
        }
    }

    /** Builder that renders the {@code gnd} heuristic as an object holding the {@code BACKGROUND_IS_SUPERSET} field. */
    public static class GNDBuilder extends NXYBuilder {

        /**
         * @param backgroundIsSuperset forwarded unchanged to the superclass constructor
         */
        public GNDBuilder(boolean backgroundIsSuperset) {
            super(true, backgroundIsSuperset);
        }

        @Override
        public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
            final String heuristicName = STREAM.getName();
            builder.startObject(heuristicName);
            builder.field(BACKGROUND_IS_SUPERSET.getPreferredName(), backgroundIsSuperset);
            builder.endObject();
            return builder;
        }
    }
}