/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.classifier.df.data;
import java.util.Arrays;
import java.util.Random;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.classifier.df.data.conditions.Condition;
import org.junit.Test;
/**
 * Unit tests for {@link Data}.
 *
 * <p>Most tests run twice: once against {@code classifierData} and once against
 * {@code regressionData}, both generated at random in {@link #setUp()}. The shared
 * checks live in private helpers that take the data set under test as a parameter.
 * All randomness comes from the single {@code rng} field, so the order in which the
 * helpers are invoked determines the random sequence each of them observes.</p>
 */
public class DataTest extends MahoutTestCase {

  /** Number of attributes requested from {@code Utils} when generating random data. */
  private static final int ATTRIBUTE_COUNT = 10;

  /** Number of instances in every generated data set. */
  private static final int DATA_SIZE = 100;

  private Random rng;

  /** Random data generated with the third argument of {@code Utils.randomData} set to {@code false}. */
  private Data classifierData;

  /** Random data generated with the third argument of {@code Utils.randomData} set to {@code true}. */
  private Data regressionData;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    rng = RandomUtils.getRandom();
    classifierData = Utils.randomData(rng, ATTRIBUTE_COUNT, false, DATA_SIZE);
    regressionData = Utils.randomData(rng, ATTRIBUTE_COUNT, true, DATA_SIZE);
  }

  /**
   * Test method for
   * {@link org.apache.mahout.classifier.df.data.Data#subset(org.apache.mahout.classifier.df.data.conditions.Condition)}.
   */
  @Test
  public void testSubset() {
    int n = 10;
    for (int nloop = 0; nloop < n; nloop++) {
      checkSubset(classifierData);
      // regression
      checkSubset(regressionData);
    }
  }

  /**
   * Picks a random attribute and one of its values, builds the "equals", "lesser" and
   * "greaterOrEquals" subsets of {@code data} for that pair, and verifies that each of
   * the first {@code DATA_SIZE} instances of {@code data} belongs to exactly the subsets
   * its attribute value calls for.
   *
   * @param data the data set to split and check
   */
  private void checkSubset(Data data) {
    int attr = rng.nextInt(data.getDataset().nbAttributes());
    double[] values = data.values(attr);
    double value = values[rng.nextInt(values.length)];

    Data eSubset = data.subset(Condition.equals(attr, value));
    Data lSubset = data.subset(Condition.lesser(attr, value));
    Data gSubset = data.subset(Condition.greaterOrEquals(attr, value));

    for (int index = 0; index < DATA_SIZE; index++) {
      Instance instance = data.get(index);

      if (instance.get(attr) < value) {
        // strictly smaller: only the "lesser" subset may hold it
        assertTrue(lSubset.contains(instance));
        assertFalse(eSubset.contains(instance));
        assertFalse(gSubset.contains(instance));
      } else if (instance.get(attr) == value) {
        // exact match: both "equals" and "greaterOrEquals" must hold it
        assertFalse(lSubset.contains(instance));
        assertTrue(eSubset.contains(instance));
        assertTrue(gSubset.contains(instance));
      } else {
        // strictly greater: only "greaterOrEquals" may hold it
        assertFalse(lSubset.contains(instance));
        assertFalse(eSubset.contains(instance));
        assertTrue(gSubset.contains(instance));
      }
    }
  }

  /**
   * Checks, for both data sets, that {@code Data.values(attr)} lists every value taken
   * by the attribute exactly once.
   */
  @Test
  public void testValues() throws Exception {
    checkValues(classifierData);
    checkValues(regressionData);
  }

  /**
   * For every attribute of {@code data}, asserts that the value held by each of the
   * first {@code DATA_SIZE} instances occurs exactly once in {@code data.values(attr)}.
   *
   * @param data the data set whose distinct values are checked
   */
  private void checkValues(Data data) {
    for (int attr = 0; attr < data.getDataset().nbAttributes(); attr++) {
      double[] values = data.values(attr);

      // each value of the attribute should appear exactly one time in values
      for (int index = 0; index < DATA_SIZE; index++) {
        assertEquals(1, count(values, data.get(index).get(attr)));
      }
    }
  }

  /**
   * Counts the entries of {@code values} that are exactly equal ({@code ==}) to {@code value}.
   *
   * @param values the array to scan
   * @param value the value to look for
   * @return the number of matching entries
   */
  private static int count(double[] values, double value) {
    int count = 0;
    for (double v : values) {
      if (v == value) {
        count++;
      }
    }
    return count;
  }

  /**
   * {@code isIdentical()} must be true for an empty data set and for a data set whose
   * instances all carry the attribute values of the first instance.
   */
  @Test
  public void testIdenticalTrue() throws Exception {
    // generate a small data, only to get the dataset
    Dataset dataset = Utils.randomData(rng, ATTRIBUTE_COUNT, false, 1).getDataset();

    // test empty data
    Data empty = new Data(dataset);
    assertTrue(empty.isIdentical());

    // test identical data, except for the labels
    Data identical = Utils.randomData(rng, ATTRIBUTE_COUNT, false, DATA_SIZE);
    Instance model = identical.get(0);
    for (int index = 1; index < DATA_SIZE; index++) {
      // copy every attribute of the first instance onto the current one
      for (int attr = 0; attr < identical.getDataset().nbAttributes(); attr++) {
        identical.get(index).set(attr, model.get(attr));
      }
    }

    assertTrue(identical.isIdentical());
  }

  /**
   * {@code isIdentical()} must be false for random data in which one randomly chosen
   * attribute of one randomly chosen instance has been incremented.
   */
  @Test
  public void testIdenticalFalse() throws Exception {
    int n = 10;

    for (int nloop = 0; nloop < n; nloop++) {
      Data data = Utils.randomData(rng, ATTRIBUTE_COUNT, false, DATA_SIZE);

      // choose a random instance
      int index = rng.nextInt(DATA_SIZE);
      Instance instance = data.get(index);

      // change a random attribute
      int attr = rng.nextInt(data.getDataset().nbAttributes());
      instance.set(attr, instance.get(attr) + 1);

      assertFalse(data.isIdentical());
    }
  }

  /**
   * {@code identicalLabel()} must be true for an empty data set and for data generated
   * by {@code Utils.randomDoublesWithSameLabel}.
   */
  @Test
  public void testIdenticalLabelTrue() throws Exception {
    // generate a small data, only to get a dataset
    Dataset dataset = Utils.randomData(rng, ATTRIBUTE_COUNT, false, 1).getDataset();

    // test empty data
    Data empty = new Data(dataset);
    assertTrue(empty.identicalLabel());

    // test identical labels
    String descriptor = Utils.randomDescriptor(rng, ATTRIBUTE_COUNT);
    double[][] source = Utils.randomDoublesWithSameLabel(rng, descriptor, false,
        DATA_SIZE, rng.nextInt());
    String[] sData = Utils.double2String(source);

    dataset = DataLoader.generateDataset(descriptor, false, sData);
    Data data = DataLoader.loadData(dataset, sData);

    assertTrue(data.identicalLabel());
  }

  /**
   * {@code identicalLabel()} must be false once the label of a single, randomly chosen
   * vector has been incremented in otherwise same-label data.
   */
  @Test
  public void testIdenticalLabelFalse() throws Exception {
    int n = 10;

    for (int nloop = 0; nloop < n; nloop++) {
      String descriptor = Utils.randomDescriptor(rng, ATTRIBUTE_COUNT);
      int label = Utils.findLabel(descriptor);
      double[][] source = Utils.randomDoublesWithSameLabel(rng, descriptor, false,
          DATA_SIZE, rng.nextInt());

      // choose a random vector and change its label
      int index = rng.nextInt(DATA_SIZE);
      source[index][label]++;

      String[] sData = Utils.double2String(source);

      Dataset dataset = DataLoader.generateDataset(descriptor, false, sData);
      Data data = DataLoader.loadData(dataset, sData);

      assertFalse(data.identicalLabel());
    }
  }

  /**
   * Test method for
   * {@link org.apache.mahout.classifier.df.data.Data#bagging(java.util.Random)}.
   */
  @Test
  public void testBagging() {
    checkBagging(classifierData);
    // regression
    checkBagging(regressionData);
  }

  /**
   * Draws a bag from {@code data} and asserts that it has the same size as {@code data}
   * while missing at least one of its instances.
   *
   * @param data the data set to bag
   */
  private void checkBagging(Data data) {
    Data bag = data.bagging(rng);

    // the bag should have the same size as the data
    assertEquals(data.size(), bag.size());

    // at least one element from the data should not be in the bag
    boolean found = false;
    for (int index = 0; index < data.size() && !found; index++) {
      found = !bag.contains(data.get(index));
    }

    assertTrue("some instances from data should not be in the bag", found);
  }

  /**
   * Test method for
   * {@link org.apache.mahout.classifier.df.data.Data#rsplit(java.util.Random, int)}.
   */
  @Test
  public void testRsplit() {
    checkRsplit(classifierData);
    // regression
    checkRsplit(regressionData);
  }

  /**
   * Exercises {@code rsplit} on clones of {@code data} with a subset size of zero, of
   * {@code DATA_SIZE}, and of a random value in {@code [0, DATA_SIZE)}, asserting each
   * time that the returned subset has the requested size and that the source keeps the
   * remaining instances.
   *
   * @param data the data set to clone and split; it is never split itself
   */
  private void checkRsplit(Data data) {
    // rsplit should handle empty subsets
    Data source = data.clone();
    Data subset = source.rsplit(rng, 0);
    assertTrue("subset should be empty", subset.isEmpty());
    assertEquals("source.size is incorrect", DATA_SIZE, source.size());

    // rsplit should handle full size subsets
    source = data.clone();
    subset = source.rsplit(rng, DATA_SIZE);
    assertEquals("subset.size is incorrect", DATA_SIZE, subset.size());
    assertTrue("source should be empty", source.isEmpty());

    // random case
    int subsize = rng.nextInt(DATA_SIZE);
    source = data.clone();
    subset = source.rsplit(rng, subsize);
    assertEquals("subset.size is incorrect", subsize, subset.size());
    assertEquals("source.size is incorrect", DATA_SIZE - subsize, source.size());
  }

  /**
   * Lets {@code countLabels} fill a zeroed array, then decrements the entry matching the
   * label of every instance; every entry of the array must be back to zero afterwards.
   */
  @Test
  public void testCountLabel() throws Exception {
    Dataset dataset = classifierData.getDataset();
    int[] counts = new int[dataset.nblabels()];

    int n = 10;

    for (int nloop = 0; nloop < n; nloop++) {
      Arrays.fill(counts, 0);
      classifierData.countLabels(counts);

      for (int index = 0; index < classifierData.size(); index++) {
        counts[(int) dataset.getLabel(classifierData.get(index))]--;
      }

      // check the balance of every label, not only the first one
      for (int label = 0; label < classifierData.getDataset().nblabels(); label++) {
        assertEquals("Wrong label 'equals' count", 0, counts[label]);
      }
    }
  }

  /**
   * Checks {@code majorityLabel} in three situations: a single label everywhere, a
   * second label held by a strict majority of the vectors, and an exact tie between the
   * two labels, where both are expected to be returned within ten calls.
   */
  @Test
  public void testMajorityLabel() throws Exception {

    // all instances have the same label
    String descriptor = Utils.randomDescriptor(rng, ATTRIBUTE_COUNT);
    int label = Utils.findLabel(descriptor);

    int label1 = rng.nextInt();
    double[][] source = Utils.randomDoublesWithSameLabel(rng, descriptor, false, DATA_SIZE,
        label1);
    String[] sData = Utils.double2String(source);

    Dataset dataset = DataLoader.generateDataset(descriptor, false, sData);
    Data data = DataLoader.loadData(dataset, sData);

    int code1 = dataset.labelCode(Double.toString(label1));

    assertEquals(code1, data.majorityLabel(rng));

    // 51/100 vectors have label2
    int label2 = label1 + 1;
    int nblabel2 = DATA_SIZE / 2 + 1;
    while (nblabel2 > 0) {
      double[] vector = source[rng.nextInt(DATA_SIZE)];
      // only relabel vectors that still carry label1, so each hit counts once
      if (vector[label] != label2) {
        vector[label] = label2;
        nblabel2--;
      }
    }
    sData = Utils.double2String(source);
    dataset = DataLoader.generateDataset(descriptor, false, sData);
    data = DataLoader.loadData(dataset, sData);
    int code2 = dataset.labelCode(Double.toString(label2));

    // label2 should be the majority label
    assertEquals(code2, data.majorityLabel(rng));

    // 50 vectors with label1 and 50 vectors with label2:
    // give label1 back to one randomly chosen label2 vector
    boolean relabeled = false;
    while (!relabeled) {
      double[] vector = source[rng.nextInt(DATA_SIZE)];
      if (vector[label] == label2) {
        vector[label] = label1;
        relabeled = true;
      }
    }
    sData = Utils.double2String(source);

    data = DataLoader.loadData(dataset, sData);
    code1 = dataset.labelCode(Double.toString(label1));
    code2 = dataset.labelCode(Double.toString(label2));

    // majorityLabel should return label1 and label2 at random
    boolean found1 = false;
    boolean found2 = false;
    for (int index = 0; index < 10 && (!found1 || !found2); index++) {
      int major = data.majorityLabel(rng);
      if (major == code1) {
        found1 = true;
      }
      if (major == code2) {
        found2 = true;
      }
    }
    assertTrue("majorityLabel should return both tied labels", found1 && found2);
  }
}