/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.ga.watchmaker.cd.utils;
import com.google.common.base.Preconditions;
import org.apache.mahout.ga.watchmaker.cd.DataSet;
import org.easymock.EasyMock;
import java.util.Random;
/**
* Generate a mock dataset using EasyMock. The dataset contains a random number
* of attributes. Each attribute can be numerical or categorical (choosen
* randomly).
*/
public final class MockDataSet {
private final Random rng;
private final int maxnba;
private final DataSet dataset;
/**
*
* @param maxnba max number of attributes
*/
public MockDataSet(Random rng, int maxnba) {
Preconditions.checkArgument(maxnba > 0, "maxnba must be positive");
this.rng = rng;
this.maxnba = maxnba;
dataset = EasyMock.createMock(DataSet.class);
DataSet.initialize(dataset);
}
/**
* Generate a new dataset.
*
* @param numRate numerical attributes rate.<br>
* 0.0 : all attributes are categorical<br>
* 1.0 : all attributes are numerical<br>
* otherwise : both numerical an categorical attributes are probable
*/
public void randomDataset(double numRate) {
EasyMock.reset(dataset);
int nba = rng.nextInt(maxnba) + 1;
EasyMock.expect(dataset.getNbAttributes()).andReturn(nba).anyTimes();
// label at random position
int labelpos = rng.nextInt(nba);
EasyMock.expect(dataset.getLabelIndex()).andReturn(labelpos).anyTimes();
for (int index = 0; index < nba; index++) {
if (index == labelpos) {
// two-classes
prepareCategoricalAttribute(index, 2);
} else if (rng.nextDouble() < numRate) {
prepareNumericalAttribute(index);
} else {
prepareCategoricalAttribute(index, rng.nextInt(100) + 1);
}
}
EasyMock.replay(dataset);
}
/**
* Generate a new dataset. The attributes can be both numerical or
* categorical.
*/
public void randomDataset() {
randomDataset(0.5);
}
/**
* Generate a new dataset. All the attributes are numerical.
*/
public void numericalDataset() {
randomDataset(1.0);
}
/**
* Generate a new dataset. All the attributes are categorical.
*/
public void categoricalDataset() {
randomDataset(0.0);
}
/**
* Verifies the dataset mock object.
*
* @see org.easymock.EasyMock#verify(Object...)
*/
public void verify() {
EasyMock.verify(dataset);
}
private void prepareNumericalAttribute(int index) {
// srowen: I 'fixed' this to not use Double.{MAX,MIN}_VALUE since
// it does not seem like that has the desired effect
double max = rng.nextDouble() * ((long) Integer.MAX_VALUE - Integer.MIN_VALUE) + Integer.MIN_VALUE;
double min = rng.nextDouble() * (max - Integer.MIN_VALUE) + Integer.MIN_VALUE;
EasyMock.expect(dataset.isNumerical(index)).andReturn(true).anyTimes();
EasyMock.expect(dataset.getMax(index)).andReturn(max).anyTimes();
EasyMock.expect(dataset.getMin(index)).andReturn(min).anyTimes();
}
private void prepareCategoricalAttribute(int index, int nbcats) {
EasyMock.expect(dataset.isNumerical(index)).andReturn(false).anyTimes();
EasyMock.expect(dataset.getNbValues(index)).andReturn(nbcats).anyTimes();
}
}