/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.math.stats;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.MahoutTestCase;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.function.DoubleFunction;
import org.apache.mahout.math.function.Functions;
import org.junit.Test;
import java.util.List;
import java.util.Random;
/**
 * Unit tests for {@link LogLikelihood}: Shannon entropy, the G² log-likelihood
 * ratio test, its signed square root, and frequency comparison between two
 * multisets of samples.
 */
public final class LogLikelihoodTest extends MahoutTestCase {

  @Test
  public void testEntropy() throws Exception {
    // H([1, 1]) = 2 * ln 2 ≈ 1.386294 (Mahout's entropy is unnormalized, in nats)
    assertEquals(1.386294, LogLikelihood.entropy(1, 1), 0.0001);
    // A single-category distribution has zero entropy.
    assertEquals(0.0, LogLikelihood.entropy(1), 0.0);
    //TODO: more tests here
    try {
      LogLikelihood.entropy(-1, -1);
      fail("entropy should reject negative counts with IllegalArgumentException");
    } catch (IllegalArgumentException ignored) {
      // expected: counts must be non-negative
    }
  }

  @Test
  public void testLogLikelihood() throws Exception {
    //TODO: check the epsilons
    // Reference values computed externally for the 2x2 contingency tables
    // (k11, k12, k21, k22).
    assertEquals(2.772589, LogLikelihood.logLikelihoodRatio(1, 0, 0, 1), 0.000001);
    assertEquals(27.72589, LogLikelihood.logLikelihoodRatio(10, 0, 0, 10), 0.00001);
    assertEquals(39.33052, LogLikelihood.logLikelihoodRatio(5, 1995, 0, 100000), 0.00001);
    assertEquals(4730.737, LogLikelihood.logLikelihoodRatio(1000, 1995, 1000, 100000), 0.001);
    assertEquals(5734.343, LogLikelihood.logLikelihoodRatio(1000, 1000, 1000, 100000), 0.001);
    assertEquals(5714.932, LogLikelihood.logLikelihoodRatio(1000, 1000, 1000, 99000), 0.001);
  }

  @Test
  public void testRootLogLikelihood() throws Exception {
    // positive where k11 is bigger than expected.
    assertTrue(LogLikelihood.rootLogLikelihoodRatio(904, 21060, 1144, 283012) > 0.0);
    // negative because k11 is lower than expected
    assertTrue(LogLikelihood.rootLogLikelihoodRatio(36, 21928, 60280, 623876) < 0.0);
  }

  @Test
  public void testRootNegativeLLR() {
    // Regression test: this table once produced a spurious negative root-LLR
    // due to rounding; k11 is above expectation so the result must be positive.
    assertTrue(LogLikelihood.rootLogLikelihoodRatio(6, 7567, 1924, 2426487) > 0.0);
  }

  @Test
  public void testFrequencyComparison() {
    final Random rand = RandomUtils.getRandom();
    // build a vector full of sample from exponential distribution
    // this will have lots of little positive values and a few big ones
    Vector p1 = new DenseVector(25)
      .assign(new DoubleFunction() {
        @Override
        public double apply(double arg1) {
          // inverse-CDF sampling of Exp(1); log1p(-u) keeps precision near u = 0
          return -Math.log1p(-rand.nextDouble());
        }
      });
    // make a copy
    Vector p2 = p1.like().assign(p1);
    // nuke elements 0..4
    p1.viewPart(0, 5).assign(0);
    // and boost elements 5..7
    p1.viewPart(5, 3).assign(Functions.mult(4));
    // then normalize to turn it into a probability distribution
    p1.assign(Functions.div(p1.norm(1)));
    // likewise normalize p2
    p2.assign(Functions.div(p2.norm(1)));
    // sample 100 times from p1
    Multiset<Integer> w1 = HashMultiset.create();
    for (int i = 0; i < 100; i++) {
      w1.add(sample(p1, rand));
    }
    // and 1000 times from p2
    Multiset<Integer> w2 = HashMultiset.create();
    for (int i = 0; i < 1000; i++) {
      w2.add(sample(p2, rand));
    }
    // comparing frequencies, we should be able to find 8 items with score > 0
    List<LogLikelihood.ScoredItem<Integer>> r = LogLikelihood.compareFrequencies(w1, w2, 8, 0);
    assertTrue(r.size() <= 8);
    assertFalse(r.isEmpty());
    for (LogLikelihood.ScoredItem<Integer> item : r) {
      assertTrue(item.getScore() >= 0);
    }
    // the most impressive should be 7
    assertEquals(7, (int) r.get(0).getItem());
    // make sure scores are descending
    double lastScore = r.get(0).getScore();
    for (LogLikelihood.ScoredItem<Integer> item : r) {
      assertTrue(item.getScore() <= lastScore);
      lastScore = item.getScore();
    }
    // now as many as have score >= 1
    r = LogLikelihood.compareFrequencies(w1, w2, 40, 1);
    // only the boosted items should make the cut
    assertEquals(3, r.size());
    assertEquals(7, (int) r.get(0).getItem());
    assertEquals(5, (int) r.get(1).getItem());
    assertEquals(6, (int) r.get(2).getItem());
    r = LogLikelihood.compareFrequencies(w1, w2, 1000, -100);
    Multiset<Integer> k = HashMultiset.create();
    for (LogLikelihood.ScoredItem<Integer> item : r) {
      k.add(item.getItem());
    }
    // every index either appears exactly once in the results or was never
    // observed in the larger sample
    for (int i = 0; i < 25; i++) {
      assertTrue("i = " + i, k.count(i) == 1 || w2.count(i) == 0);
    }
    // all values that had non-zero counts in larger set should have result scores
    assertEquals(w2.elementSet().size(), r.size());
    assertEquals(7, (int) r.get(0).getItem());
    assertEquals(5, (int) r.get(1).getItem());
    assertEquals(6, (int) r.get(2).getItem());
    // the last item should definitely have negative score
    assertTrue(r.get(r.size() - 1).getScore() < 0);
    // make sure scores are descending
    lastScore = r.get(0).getScore();
    for (LogLikelihood.ScoredItem<Integer> item : r) {
      assertTrue(item.getScore() <= lastScore);
      lastScore = item.getScore();
    }
  }

  /**
   * Samples from a multinomial distribution with parameters p and random generator rand.
   *
   * @param p A vector describing the distribution. Should sum to 1.
   * @param rand A random number generator.
   * @return A single sample from the multinomial distribution.
   */
  private static int sample(Vector p, Random rand) {
    double u = rand.nextDouble();
    // simple sequential algorithm. Not the fastest, but we don't care
    for (int i = 0; i < p.size(); i++) {
      if (u <= p.get(i)) {
        return i;
      }
      u -= p.get(i);
    }
    // fall-through guard: floating-point round-off can leave u slightly > 0
    return p.size() - 1;
  }
}