package com.datascience.gal.dataGenerator;
import com.datascience.core.base.AssignedLabel;
import com.datascience.core.base.LObject;
import com.datascience.datastoring.datamodels.memory.InMemoryNominalData;
import com.datascience.gal.BatchDawidSkene;
import com.datascience.utils.CostMatrix;
import org.apache.log4j.Logger;
import java.util.*;
/**
* This class is used to create test data for Troia client tests.
*
* @author piotr.gnys@10clouds.com
*/
public class DataGenerator {
/**
* Generate collection of test objects.
*
* @see TroiaObjectCollection
* @param objectCount
* Numbers of objects to generate
* @param categories
* Map that associates class names with probability of object
* belonging to that class ( form 0 to 1 )
* @return TestObjectCollection object.
*/
public TroiaObjectCollection generateTestObjects(int objectCount,
Map<String, Double> categories) {
Collection<Double> percentages = categories.values();
double totalPercentage = 0;
for (Double percentage : percentages) {
totalPercentage += percentage.doubleValue();
}
if (Math.abs(1 - totalPercentage) > 0.0001) {
throw new ArithmeticException("Percentage values sum up to "
+ totalPercentage + " instead of 1.");
} else {
int[] borders = new int[percentages.size()];
int index = 0;
for (Double percentage : percentages) {
borders[index] = (int) (percentage * objectCount);
index++;
}
Map<String, String> objects = new HashMap<String, String>();
index = 0;
int categortySwitchCounter = 0;
int categoryIndex = 0;
Collection<String> categoryNames = categories.keySet();
Iterator<String> categoryNameIterator = categoryNames.iterator();
String categoryName = categoryNameIterator.next();
for (index = 0; index < objectCount; index++) {
if (categortySwitchCounter < borders[categoryIndex]) {
if (categoryIndex < categoryNames.size()) {
categortySwitchCounter++;
}
} else {
categortySwitchCounter = 0;
categoryIndex++;
categoryName = categoryNameIterator.next();
}
String objectName = "Object-" + index;
objects.put(objectName, categoryName);
}
return new TroiaObjectCollection(objects);
}
}
/**
* Generates test object collection with auto generated categories that have
* equal distribution among the objects.
*
* @see TroiaObjectCollection
* @param objectCount
* Numbers of objects to generate
* @param categoryCount
* Number of categories to generate
* @return TestObjectCollection object
*/
public TroiaObjectCollection generateTestObjects(int objectCount,
int categoryCount) {
Collection<String> categoryNames = this
.generateCategoryNames(categoryCount);
return this.generateTestObjects(objectCount, categoryNames);
}
public TroiaObjectCollection generateTestObjects(int objectCount,
Collection<String> categoryNames) {
double p = 1.0 / (double) categoryNames.size();
Double percentage = new Double(p);
Map<String, Double> categories = new HashMap<String, Double>();
for (String category : categoryNames) {
categories.put(category, percentage);
}
return this.generateTestObjects(objectCount, categories);
}
/**
* Generates category names
*
* @param categoryCount
* Number of categories to generate
* @return Collection of category names.
*/
public Collection<String> generateCategoryNames(int categoryCount) {
ArrayList<String> categories = new ArrayList<String>();
for (int i = 0; i < categoryCount; i++) {
categories.add("Category-" + i);
}
return categories;
}
/**
* Creates artificial worker that will be working in environment with
* categories given as a parameter
*
* @param name
* Worker name
* @param quality
* Quality of the worker (probability that he will label object
* correctly)
* @param categories
* Categories that exist in task executed by this worker
* @return Artificial worker
*/
public ArtificialWorker generateArtificialWorker(String name,
double quality, Collection<String> categories) {
ArtificialWorker worker = new ArtificialWorker();
Map<String, Map<String, Double>> confMatrix = new HashMap<String, Map<String, Double>>();
Map<String, Double> confVector;
double wrongProb = 1 - quality;
Object []array = categories.toArray();
int length = array.length;
Random random = new Random();
for (String correctCategory : categories) {
double restProb = wrongProb;
confVector = new HashMap<String, Double>();
for (String category : categories) {
confVector.put(category, 0.0);
}
confVector.put(correctCategory, quality);
while (restProb > 0) {
double prob = Math.random() * (wrongProb / 2.0);
String category = (String) array[random.nextInt(length)];
if (!category.equals(correctCategory)) {
Double actualProb = confVector.get(category);
if (actualProb == null) {
actualProb = 0.0;
}
actualProb += Math.min(restProb, prob);
confVector.put(category, actualProb);
restProb -= prob;
}
}
confMatrix.put(correctCategory, confVector);
}
worker.setName(name);
worker.setConfusionMatrix(new ConfusionMatrix(confMatrix));
logger.debug("Generated artifical worker with quality " + quality);
return worker;
}
/**
* Creates artificial worker, with random work quality, that will be working
* in environment with categories given as a parameter
*
* @param name
* Worker name
* @param categories
* Categories that exist in task executed by this worker
* @return Artificial worker
*/
public ArtificialWorker generateArtificialWorker(String name,
Collection<String> categories) {
return this.generateArtificialWorker(name, Math.random(), categories);
}
/**
* Creates collection of workers, with qualities form given range, that will
* operate in environment that contains categories given as a parameter.
*
* @param workerCount
* Number of workers that will be generated
* @param categories
* Collection of categories that workers will be assigning
* @param minQuality
* Minimal quality of generated worker (from 0 to 1)
* @param maxQuality
* Maximal quality of generated worker (from 0 to 1)
* @return Collection of artifical workers
*/
public Collection<ArtificialWorker> generateArtificialWorkers(
int workerCount, Collection<String> categories, double minQuality,
double maxQuality) {
Collection<ArtificialWorker> workers = new ArrayList<ArtificialWorker>();
if (minQuality > maxQuality)
minQuality = 0;
double qualityRange = maxQuality - minQuality;
for (int i = 0; i < workerCount; i++) {
double quality = Math.random() * qualityRange + minQuality;
ArtificialWorker worker = this.generateArtificialWorker("Worker-"
+ i, quality, categories);
workers.add(worker);
}
return workers;
}
/**
* Generates labels assigned by artificial workers.
*
* @param workers
* Collection of artificial workers
* @param objects
* Test objects collection
* @param workersPerObject
* How many workers will assign label to same object
* @return Collection of worker assigned labels
*/
public Collection<AssignedLabel<String>> generateLabels(Collection<ArtificialWorker> workers, TroiaObjectCollection objects, int workersPerObject) {
Collection<AssignedLabel<String>> labels = new ArrayList<AssignedLabel<String>>();
Map<ArtificialWorker, NoisedLabelGenerator> generators = NoisedLabelGeneratorFactory
.getInstance().getRouletteGeneratorsForWorkers(workers);
Iterator<ArtificialWorker> workersIterator = workers.iterator();
for (String object : objects) {
String correctCat = objects.getCategory(object);
ArtificialWorker worker;
for (int labelsForObject = 0; labelsForObject < workersPerObject; labelsForObject++) {
String assignedLabel;
if (!workersIterator.hasNext()) {
workersIterator = workers.iterator();
}
worker = workersIterator.next();
assignedLabel = generators.get(worker).getCategoryWithNoise(
correctCat);
labels.add(new AssignedLabel<String>(worker, new LObject<String>(object), assignedLabel));
}
}
return labels;
}
/**
* Generates gold labels from collection of test objects
*
* @param objects
* Test objects
* @param goldCoverage
* Fraction of objects that will have gold label
* @return Collection of gold labels.
*/
public Collection<LObject<String>> generateGoldLabels(
TroiaObjectCollection objects, double goldCoverage) {
int goldCount = (int) (objects.size() * goldCoverage);
Collection<LObject<String>> goldLabels = new ArrayList<LObject<String>>();
Iterator<String> objectsIterator = objects.iterator();
for (int i = 0; i < goldCount; i++) {
String objectName;
if (objectsIterator.hasNext()) {
objectName = objectsIterator.next();
LObject<String> goldObject = new LObject<String>(objectName);
goldObject.setGoldLabel(objects.getCategory(objectName));
goldLabels.add(goldObject);
} else {
break;
}
}
return goldLabels;
}
public Collection<Map<String, Object>> computeArtificialWorkerQualities(
Collection<String> categories,
TroiaObjectCollection objects,
Collection<ArtificialWorker> workers,
Collection<AssignedLabel<String>> labels,
Collection<LObject<String>> goldLabels) {
// Objects conversion.
Collection<AssignedLabel<String>> tsLabels =
new ArrayList<AssignedLabel<String>>();
for (AssignedLabel<String> label : labels) {
tsLabels.add(new AssignedLabel<String>(label.getWorker(), label.getLobject(), label.getLabel()));
}
Collection<LObject<String>> tsCorrectLabels =
new ArrayList<LObject<String>>();
for (String objectName : objects.testObject.keySet()) {
LObject<String> goldObject = new LObject<String>(objectName);
goldObject.setGoldLabel(objects.getCategory(objectName));
tsCorrectLabels.add(goldObject);
}
// com.datascience.gal.DawidSkene dawidSkene =
// new com.datascience.gal.BatchDawidSkene("data-generation", tsCategories);
// dawidSkene.addAssignedLabels(tsLabels);
// dawidSkene.addCorrectLabels(tsCorrectLabels);
BatchDawidSkene dawidSkene = new BatchDawidSkene();
InMemoryNominalData data = new InMemoryNominalData();
for (AssignedLabel<String> assign : tsLabels) {
data.addAssign(assign);
}
dawidSkene.setData(data);
return null;
}
public Data generateTestData(String requestId, int objectCount,
int categoryCount, int workerCount, double minQuality,
double maxQuality, double goldRatio, int workersPerObject) {
Data data = new Data();
Collection<String> categories = this.generateCategoryNames(
categoryCount);
TroiaObjectCollection objects = this.generateTestObjects(objectCount,
categories);
CostMatrix<String> cm = CategoryFactory.getInstance().createMatrix(categories);
Collection<LObject<String>> goldLabels = this.generateGoldLabels(objects,
goldRatio);
Collection<ArtificialWorker> workers = null;
Collection<AssignedLabel<String>> labels = null;
Collection<String> workerNames = null;
if(workerCount>0) {
workers = this.generateArtificialWorkers(workerCount, categories, minQuality, maxQuality);
labels = this.generateLabels(workers, objects, workersPerObject);
workerNames = new ArrayList<String>();
for (ArtificialWorker worker : workers) {
workerNames.add(worker.getName());
}
}
data.setCategories(categories);
data.setGoldLabels(goldLabels);
data.setLabels(labels);
data.setCostMatrix(cm);
data.setObjectCollection(objects);
data.setRequestId(requestId);
data.setWorkers(workerNames);
data.setArtificialWorkers(workers);
data.setArtificialWorkerQualities(computeArtificialWorkerQualities(categories, objects, workers, labels, goldLabels));
return data;
}
public static DataGenerator getInstance() {
return instance;
}
private static DataGenerator instance = new DataGenerator();
private DataGenerator() {
}
public final static double CONFUSION_VECTOR_SUM_EPSILON = 1E-6;
/**
* Logger for this class
*/
private static Logger logger = Logger.getLogger(DataGenerator.class);
}