// LogLikelihoodTest.java example
//
// Explorer
// mahout-rbmClassifier-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.math.stats;

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.MahoutTestCase;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.function.DoubleFunction;
import org.apache.mahout.math.function.Functions;
import org.junit.Test;

import java.util.List;
import java.util.Random;

/**
 * Unit tests for {@link LogLikelihood}: entropy, the log-likelihood ratio, the signed root
 * log-likelihood ratio, and frequency comparison between two multisets of samples.
 */
public final class LogLikelihoodTest extends MahoutTestCase {

  /** Checks two known entropy values and that negative counts are rejected. */
  @Test
  public void testEntropy() throws Exception {

    assertEquals(1.386294, LogLikelihood.entropy(1, 1), 0.0001);
    assertEquals(0.0, LogLikelihood.entropy(1), 0.0);
    //TODO: more tests here
    try {
      LogLikelihood.entropy(-1, -1);//exception
      fail("entropy(-1, -1) should have thrown IllegalArgumentException");
    } catch (IllegalArgumentException expected) {
      // expected: this is the outcome the test is looking for, nothing more to check
    }
  }

  /** Compares the log-likelihood ratio against reference values for several count tables. */
  @Test
  public void testLogLikelihood() throws Exception {
    //TODO: check the epsilons
    assertEquals(2.772589, LogLikelihood.logLikelihoodRatio(1, 0, 0, 1), 0.000001);
    assertEquals(27.72589, LogLikelihood.logLikelihoodRatio(10, 0, 0, 10), 0.00001);
    assertEquals(39.33052, LogLikelihood.logLikelihoodRatio(5, 1995, 0, 100000), 0.00001);
    assertEquals(4730.737, LogLikelihood.logLikelihoodRatio(1000, 1995, 1000, 100000), 0.001);
    assertEquals(5734.343, LogLikelihood.logLikelihoodRatio(1000, 1000, 1000, 100000), 0.001);
    assertEquals(5714.932, LogLikelihood.logLikelihoodRatio(1000, 1000, 1000, 99000), 0.001);
  }

  /** Checks the sign of the root log-likelihood ratio for an over- and an under-represented k11. */
  @Test
  public void testRootLogLikelihood() throws Exception {
    // positive where k11 is bigger than expected.
    assertTrue(LogLikelihood.rootLogLikelihoodRatio(904, 21060, 1144, 283012) > 0.0);

    // negative because k11 is lower than expected
    assertTrue(LogLikelihood.rootLogLikelihoodRatio(36, 21928, 60280, 623876) < 0.0);
  }

  /**
   * Checks that a near-degenerate table still yields a strictly positive root LLR.
   *
   * <p>With these counts the first cell (6) differs from (6 + 7567) * (6 + 1924) / 2435984 by
   * only about 6e-6. NOTE(review): presumably this guards against round-off driving the raw
   * ratio below zero -- confirm against the LogLikelihood implementation.
   */
  @Test
  public void testRootNegativeLLR() {
    assertTrue(LogLikelihood.rootLogLikelihoodRatio(6, 7567, 1924, 2426487) > 0.0);
  }

  /**
   * Builds two related distributions, samples from both, and checks that
   * {@code compareFrequencies} ranks the artificially boosted items first.
   */
  @Test
  public void testFrequencyComparison() {
    // final so that the anonymous DoubleFunction below can capture it
    final Random rand = RandomUtils.getRandom();

    // build a vector full of samples from an exponential distribution
    // this will have lots of little positive values and a few big ones
    Vector p1 = new DenseVector(25)
      .assign(new DoubleFunction() {
        @Override
        public double apply(double arg1) {
          return -Math.log1p(-rand.nextDouble());
        }
      });

    // make a copy
    Vector p2 = p1.like().assign(p1);

    // nuke elements 0..4
    p1.viewPart(0, 5).assign(0);

    // and boost elements 5..7
    p1.viewPart(5, 3).assign(Functions.mult(4));

    // then normalize to turn it into a probability distribution
    p1.assign(Functions.div(p1.norm(1)));

    // likewise normalize p2
    p2.assign(Functions.div(p2.norm(1)));

    // sample 100 times from p1
    Multiset<Integer> w1 = HashMultiset.create();
    for (int i = 0; i < 100; i++) {
      w1.add(sample(p1, rand));
    }

    // and 1000 times from p2
    Multiset<Integer> w2 = HashMultiset.create();
    for (int i = 0; i < 1000; i++) {
      w2.add(sample(p2, rand));
    }

    // comparing frequencies, we should be able to find 8 items with score > 0
    List<LogLikelihood.ScoredItem<Integer>> r = LogLikelihood.compareFrequencies(w1, w2, 8, 0);
    assertTrue(r.size() <= 8);
    assertFalse(r.isEmpty());
    for (LogLikelihood.ScoredItem<Integer> item : r) {
      assertTrue(item.getScore() >= 0);
    }

    // the most impressive should be 7
    assertEquals(7, (int) r.get(0).getItem());

    // make sure scores are descending
    assertScoresDescending(r);

    // now as many as have score >= 1
    r = LogLikelihood.compareFrequencies(w1, w2, 40, 1);

    // only the boosted items should make the cut
    assertEquals(3, r.size());
    assertEquals(7, (int) r.get(0).getItem());
    assertEquals(5, (int) r.get(1).getItem());
    assertEquals(6, (int) r.get(2).getItem());

    r = LogLikelihood.compareFrequencies(w1, w2, 1000, -100);
    Multiset<Integer> k = HashMultiset.create();
    for (LogLikelihood.ScoredItem<Integer> item : r) {
      k.add(item.getItem());
    }
    // every index seen in the larger sample must appear exactly once in the result
    for (int i = 0; i < 25; i++) {
      assertTrue("i = " + i, k.count(i) == 1 || w2.count(i) == 0);
    }

    // all values that had non-zero counts in larger set should have result scores
    assertEquals(w2.elementSet().size(), r.size());
    assertEquals(7, (int) r.get(0).getItem());
    assertEquals(5, (int) r.get(1).getItem());
    assertEquals(6, (int) r.get(2).getItem());

    // the last item should definitely have negative score
    assertTrue(r.get(r.size() - 1).getScore() < 0);

    // make sure scores are descending
    assertScoresDescending(r);
  }

  /**
   * Asserts that the scores of the given items never increase from one element to the next.
   * @param items  Scored items in the order they were returned.
   */
  private void assertScoresDescending(List<LogLikelihood.ScoredItem<Integer>> items) {
    // start above any finite score so the first element only fails if its score is NaN
    double previous = Double.POSITIVE_INFINITY;
    for (LogLikelihood.ScoredItem<Integer> item : items) {
      assertTrue("scores must be in descending order", item.getScore() <= previous);
      previous = item.getScore();
    }
  }

  /**
   * Samples from a multinomial distribution with parameters p and random generator rand.
   * @param p      A vector describing the distribution.  Should sum to 1.
   * @param rand   A random number generator.
   * @return  A single sample from the multinomial distribution.  If the cumulative sum of p
   *          never exceeds the uniform draw, the last index {@code p.size() - 1} is returned.
   */
  private static int sample(Vector p, Random rand) {
    double u = rand.nextDouble();

    // simple sequential algorithm.  Not the fastest, but we don't care
    for (int i = 0; i < p.size(); i++) {
      double weight = p.get(i);
      // strict comparison: nextDouble() may return exactly 0.0, and an element with
      // zero probability must never be selected
      if (u < weight) {
        return i;
      }
      u -= weight;
    }
    return p.size() - 1;
  }
}