/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.aggregations.bucket.significant.heuristics;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.xcontent.XContentBuilder;
import java.io.IOException;
public class MutualInformation extends NXYSignificanceHeuristic {
public static final String NAME = "mutual_information";
private static final double log2 = Math.log(2.0);
public MutualInformation(boolean includeNegatives, boolean backgroundIsSuperset) {
super(includeNegatives, backgroundIsSuperset);
}
/**
* Read from a stream.
*/
public MutualInformation(StreamInput in) throws IOException {
super(in);
}
@Override
public boolean equals(Object other) {
if (!(other instanceof MutualInformation)) {
return false;
}
return super.equals(other);
}
@Override
public int hashCode() {
int result = NAME.hashCode();
result = 31 * result + super.hashCode();
return result;
}
/**
* Calculates mutual information
* see "Information Retrieval", Manning et al., Eq. 13.17
*/
@Override
public double getScore(long subsetFreq, long subsetSize, long supersetFreq, long supersetSize) {
Frequencies frequencies = computeNxys(subsetFreq, subsetSize, supersetFreq, supersetSize, "MutualInformation");
double score = (getMITerm(frequencies.N00, frequencies.N0_, frequencies.N_0, frequencies.N) +
getMITerm(frequencies.N01, frequencies.N0_, frequencies.N_1, frequencies.N) +
getMITerm(frequencies.N10, frequencies.N1_, frequencies.N_0, frequencies.N) +
getMITerm(frequencies.N11, frequencies.N1_, frequencies.N_1, frequencies.N))
/ log2;
if (Double.isNaN(score)) {
score = Double.NEGATIVE_INFINITY;
}
// here we check if the term appears more often in subset than in background without subset.
if (!includeNegatives && frequencies.N11 / frequencies.N_1 < frequencies.N10 / frequencies.N_0) {
score = Double.NEGATIVE_INFINITY;
}
return score;
}
/* make sure that
0 * log(0/0) = 0
0 * log(0) = 0
Else, this would be the score:
double score =
N11 / N * Math.log((N * N11) / (N1_ * N_1))
+ N01 / N * Math.log((N * N01) / (N0_ * N_1))
+ N10 / N * Math.log((N * N10) / (N1_ * N_0))
+ N00 / N * Math.log((N * N00) / (N0_ * N_0));
but we get many NaN if we do not take case of the 0s */
double getMITerm(double Nxy, double Nx_, double N_y, double N) {
double numerator = Math.abs(N * Nxy);
double denominator = Math.abs(Nx_ * N_y);
double factor = Math.abs(Nxy / N);
if (numerator < 1.e-7 && factor < 1.e-7) {
return 0.0;
} else {
return factor * Math.log(numerator / denominator);
}
}
@Override
public String getWriteableName() {
return NAME;
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject(NAME);
super.build(builder);
builder.endObject();
return builder;
}
public static final SignificanceHeuristicParser PARSER = new NXYParser() {
@Override
protected SignificanceHeuristic newHeuristic(boolean includeNegatives, boolean backgroundIsSuperset) {
return new MutualInformation(includeNegatives, backgroundIsSuperset);
}
};
public static class MutualInformationBuilder extends NXYBuilder {
public MutualInformationBuilder(boolean includeNegatives, boolean backgroundIsSuperset) {
super(includeNegatives, backgroundIsSuperset);
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject(NAME);
super.build(builder);
builder.endObject();
return builder;
}
}
}