DataTest.java example

Explorer
mahout-rbmClassifier-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.classifier.df.data;

import java.util.Arrays;
import java.util.Random;

import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.classifier.df.data.conditions.Condition;
import org.junit.Test;

public class DataTest extends MahoutTestCase {

  private static final int ATTRIBUTE_COUNT = 10;

  private static final int DATA_SIZE = 100;

  private Random rng;

  private Data classifierData;

  private Data regressionData;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    rng = RandomUtils.getRandom();
    classifierData = Utils.randomData(rng, ATTRIBUTE_COUNT, false, DATA_SIZE);
    regressionData = Utils.randomData(rng, ATTRIBUTE_COUNT, true, DATA_SIZE);
  }

  /**
   * Test method for
   * {@link org.apache.mahout.classifier.df.data.Data#subset(org.apache.mahout.classifier.df.data.conditions.Condition)}.
   */
  @Test
  public void testSubset() {
    int n = 10;

    for (int nloop = 0; nloop < n; nloop++) {
      int attr = rng.nextInt(classifierData.getDataset().nbAttributes());

      double[] values = classifierData.values(attr);
      double value = values[rng.nextInt(values.length)];

      Data eSubset = classifierData.subset(Condition.equals(attr, value));
      Data lSubset = classifierData.subset(Condition.lesser(attr, value));
      Data gSubset = classifierData.subset(Condition.greaterOrEquals(attr, value));

      for (int index = 0; index < DATA_SIZE; index++) {
        Instance instance = classifierData.get(index);

        if (instance.get(attr) < value) {
          assertTrue(lSubset.contains(instance));
          assertFalse(eSubset.contains(instance));
          assertFalse(gSubset.contains(instance));
        } else if (instance.get(attr) == value) {
          assertFalse(lSubset.contains(instance));
          assertTrue(eSubset.contains(instance));
          assertTrue(gSubset.contains(instance));
        } else {
          assertFalse(lSubset.contains(instance));
          assertFalse(eSubset.contains(instance));
          assertTrue(gSubset.contains(instance));
        }
      }

      // regression
      attr = rng.nextInt(regressionData.getDataset().nbAttributes());

      values = regressionData.values(attr);
      value = values[rng.nextInt(values.length)];

      eSubset = regressionData.subset(Condition.equals(attr, value));
      lSubset = regressionData.subset(Condition.lesser(attr, value));
      gSubset = regressionData.subset(Condition.greaterOrEquals(attr, value));

      for (int index = 0; index < DATA_SIZE; index++) {
        Instance instance = regressionData.get(index);

        if (instance.get(attr) < value) {
          assertTrue(lSubset.contains(instance));
          assertFalse(eSubset.contains(instance));
          assertFalse(gSubset.contains(instance));
        } else if (instance.get(attr) == value) {
          assertFalse(lSubset.contains(instance));
          assertTrue(eSubset.contains(instance));
          assertTrue(gSubset.contains(instance));
        } else {
          assertFalse(lSubset.contains(instance));
          assertFalse(eSubset.contains(instance));
          assertTrue(gSubset.contains(instance));
        }
      }
    }
  }

  @Test
  public void testValues() throws Exception {
    for (int attr = 0; attr < classifierData.getDataset().nbAttributes(); attr++) {
      double[] values = classifierData.values(attr);

      // each value of the attribute should appear exactly one time in values
      for (int index = 0; index < DATA_SIZE; index++) {
        assertEquals(1, count(values, classifierData.get(index).get(attr)));
      }
    }

    for (int attr = 0; attr < regressionData.getDataset().nbAttributes(); attr++) {
      double[] values = regressionData.values(attr);

      // each value of the attribute should appear exactly one time in values
      for (int index = 0; index < DATA_SIZE; index++) {
        assertEquals(1, count(values, regressionData.get(index).get(attr)));
      }
    }
  }

  private static int count(double[] values, double value) {
    int count = 0;
    for (double v : values) {
      if (v == value) {
        count++;
      }
    }
    return count;
  }

  @Test
  public void testIdenticalTrue() throws Exception {
    // generate a small data, only to get the dataset
    Dataset dataset = Utils.randomData(rng, ATTRIBUTE_COUNT, false, 1).getDataset();
    
    // test empty data
    Data empty = new Data(dataset);
    assertTrue(empty.isIdentical());

    // test identical data, except for the labels
    Data identical = Utils.randomData(rng, ATTRIBUTE_COUNT, false, DATA_SIZE);
    Instance model = identical.get(0);
    for (int index = 1; index < DATA_SIZE; index++) {
      for (int attr = 0; attr < identical.getDataset().nbAttributes(); attr++) {
        identical.get(index).set(attr, model.get(attr));
      }
    }

    assertTrue(identical.isIdentical());
  }

  @Test
  public void testIdenticalFalse() throws Exception {
    int n = 10;

    for (int nloop = 0; nloop < n; nloop++) {
      Data data = Utils.randomData(rng, ATTRIBUTE_COUNT, false, DATA_SIZE);

      // choose a random instance
      int index = rng.nextInt(DATA_SIZE);
      Instance instance = data.get(index);

      // change a random attribute
      int attr = rng.nextInt(data.getDataset().nbAttributes());
      instance.set(attr, instance.get(attr) + 1);

      assertFalse(data.isIdentical());
    }
  }

  @Test
  public void testIdenticalLabelTrue() throws Exception {
    // generate a small data, only to get a dataset
    Dataset dataset = Utils.randomData(rng, ATTRIBUTE_COUNT, false, 1).getDataset();
    
    // test empty data
    Data empty = new Data(dataset);
    assertTrue(empty.identicalLabel());

    // test identical labels
    String descriptor = Utils.randomDescriptor(rng, ATTRIBUTE_COUNT);
    double[][] source = Utils.randomDoublesWithSameLabel(rng, descriptor, false,
            DATA_SIZE, rng.nextInt());
    String[] sData = Utils.double2String(source);
    
    dataset = DataLoader.generateDataset(descriptor, false, sData);
    Data data = DataLoader.loadData(dataset, sData);
    
    assertTrue(data.identicalLabel());
  }

  @Test
  public void testIdenticalLabelFalse() throws Exception {
    int n = 10;

    for (int nloop = 0; nloop < n; nloop++) {
      String descriptor = Utils.randomDescriptor(rng, ATTRIBUTE_COUNT);
      int label = Utils.findLabel(descriptor);
      double[][] source = Utils.randomDoublesWithSameLabel(rng, descriptor, false,
              DATA_SIZE, rng.nextInt());
      // choose a random vector and change its label
      int index = rng.nextInt(DATA_SIZE);
      source[index][label]++;

      String[] sData = Utils.double2String(source);
      
      Dataset dataset = DataLoader.generateDataset(descriptor, false, sData);
      Data data = DataLoader.loadData(dataset, sData);

      assertFalse(data.identicalLabel());
    }
  }

  /**
   * Test method for
   * {@link org.apache.mahout.classifier.df.data.Data#bagging(java.util.Random)}.
   */
  @Test
  public void testBagging() {
    Data bag = classifierData.bagging(rng);

    // the bag should have the same size as the data
    assertEquals(classifierData.size(), bag.size());

    // at least one element from the data should not be in the bag
    boolean found = false;
    for (int index = 0; index < classifierData.size() && !found; index++) {
      found = !bag.contains(classifierData.get(index));
    }
    
    assertTrue("some instances from data should not be in the bag", found);

    // regression
    bag = regressionData.bagging(rng);

    // the bag should have the same size as the data
    assertEquals(regressionData.size(), bag.size());

    // at least one element from the data should not be in the bag
    found = false;
    for (int index = 0; index < regressionData.size() && !found; index++) {
      found = !bag.contains(regressionData.get(index));
    }
    
    assertTrue("some instances from data should not be in the bag", found);
}

  /**
   * Test method for
   * {@link org.apache.mahout.classifier.df.data.Data#rsplit(java.util.Random, int)}.
   */
  @Test
  public void testRsplit() {

    // rsplit should handle empty subsets
    Data source = classifierData.clone();
    Data subset = source.rsplit(rng, 0);
    assertTrue("subset should be empty", subset.isEmpty());
    assertEquals("source.size is incorrect", DATA_SIZE, source.size());

    // rsplit should handle full size subsets
    source = classifierData.clone();
    subset = source.rsplit(rng, DATA_SIZE);
    assertEquals("subset.size is incorrect", DATA_SIZE, subset.size());
    assertTrue("source should be empty", source.isEmpty());

    // random case
    int subsize = rng.nextInt(DATA_SIZE);
    source = classifierData.clone();
    subset = source.rsplit(rng, subsize);
    assertEquals("subset.size is incorrect", subsize, subset.size());
    assertEquals("source.size is incorrect", DATA_SIZE - subsize, source.size());

    // regression
    // rsplit should handle empty subsets
    source = regressionData.clone();
    subset = source.rsplit(rng, 0);
    assertTrue("subset should be empty", subset.isEmpty());
    assertEquals("source.size is incorrect", DATA_SIZE, source.size());

    // rsplit should handle full size subsets
    source = regressionData.clone();
    subset = source.rsplit(rng, DATA_SIZE);
    assertEquals("subset.size is incorrect", DATA_SIZE, subset.size());
    assertTrue("source should be empty", source.isEmpty());

    // random case
    subsize = rng.nextInt(DATA_SIZE);
    source = regressionData.clone();
    subset = source.rsplit(rng, subsize);
    assertEquals("subset.size is incorrect", subsize, subset.size());
    assertEquals("source.size is incorrect", DATA_SIZE - subsize, source.size());
}

  @Test
  public void testCountLabel() throws Exception {
    Dataset dataset = classifierData.getDataset();
    int[] counts = new int[dataset.nblabels()];

    int n = 10;

    for (int nloop = 0; nloop < n; nloop++) {
      Arrays.fill(counts, 0);
      classifierData.countLabels(counts);
      
      for (int index = 0; index < classifierData.size(); index++) {
        counts[(int) dataset.getLabel(classifierData.get(index))]--;
      }
      
      for (int label = 0; label < classifierData.getDataset().nblabels(); label++) {
        assertEquals("Wrong label 'equals' count", 0, counts[0]);
      }
    }
  }

  @Test
  public void testMajorityLabel() throws Exception {

    // all instances have the same label
    String descriptor = Utils.randomDescriptor(rng, ATTRIBUTE_COUNT);
    int label = Utils.findLabel(descriptor);

    int label1 = rng.nextInt();
    double[][] source = Utils.randomDoublesWithSameLabel(rng, descriptor, false, 100,
        label1);
    String[] sData = Utils.double2String(source);
    
    Dataset dataset = DataLoader.generateDataset(descriptor, false, sData);
    Data data = DataLoader.loadData(dataset, sData);

    int code1 = dataset.labelCode(Double.toString(label1));

    assertEquals(code1, data.majorityLabel(rng));

    // 51/100 vectors have label2
    int label2 = label1 + 1;
    int nblabel2 = 51;
    while (nblabel2 > 0) {
      double[] vector = source[rng.nextInt(100)];
      if (vector[label] != label2) {
        vector[label] = label2;
        nblabel2--;
      }
    }
    sData = Utils.double2String(source);
    dataset = DataLoader.generateDataset(descriptor, false, sData);
    data = DataLoader.loadData(dataset, sData);
    int code2 = dataset.labelCode(Double.toString(label2));

    // label2 should be the majority label
    assertEquals(code2, data.majorityLabel(rng));

    // 50 vectors with label1 and 50 vectors with label2
    do {
      double[] vector = source[rng.nextInt(100)];
      if (vector[label] == label2) {
        vector[label] = label1;
        break;
      }
    } while (true);
    sData = Utils.double2String(source);
    
    data = DataLoader.loadData(dataset, sData);
    code1 = dataset.labelCode(Double.toString(label1));
    code2 = dataset.labelCode(Double.toString(label2));

    // majorityLabel should return label1 and label2 at random
    boolean found1 = false;
    boolean found2 = false;
    for (int index = 0; index < 10 && (!found1 || !found2); index++) {
      int major = data.majorityLabel(rng);
      if (major == code1) {
        found1 = true;
      }
      if (major == code2) {
        found2 = true;
      }
    }
    assertTrue(found1 && found2);
  }

}