package quickml.supervised.dataProcessing;
import com.google.common.collect.Maps;
import quickml.data.instances.InstanceWithAttributesMap;
import quickml.supervised.tree.constants.AttributeType;
import java.io.Serializable;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* Created by alexanderhawk on 3/19/15.
*/
public class BasicTrainingDataSurveyor<T extends InstanceWithAttributesMap<?>> {
private boolean considerBooleanAttributes = false;
public BasicTrainingDataSurveyor(boolean considerBooleanAttributes) {
this.considerBooleanAttributes = considerBooleanAttributes;
}
public Map<AttributeType, Set<String>> groupAttributesByType(final List<T> trainingData) {
Map<String, AttributeCharacteristics> attributeCharacteristics = getMapOfAttributesToAttributeCharacteristics(trainingData);
Map<AttributeType, Set<String>> attributesByType = groupByType(attributeCharacteristics);
return attributesByType;
}
public Map<String, AttributeCharacteristics> getMapOfAttributesToAttributeCharacteristics(List<T> trainingData) {
Map<String, AttributeCharacteristics> attributeCharacteristics = Maps.newHashMap();
for (T instance : trainingData) {
for (Map.Entry<String, Serializable> e : instance.getAttributes().entrySet()) {
AttributeCharacteristics attributeCharacteristic = attributeCharacteristics.get(e.getKey());
if (attributeCharacteristic == null) {
attributeCharacteristic = new AttributeCharacteristics();
attributeCharacteristics.put(e.getKey(), attributeCharacteristic);
}
if (!(e.getValue() instanceof Number)) {
attributeCharacteristic.isNumber = false;
}
attributeCharacteristic.updateBooleanStatus(e.getValue());
}
}
return attributeCharacteristics;
}
private Map<AttributeType, Set<String>> groupByType(Map<String, AttributeCharacteristics> attributeCharacteristics) {
Map<AttributeType, Set<String>> attributesByType = Maps.newHashMap();
attributesByType.put(AttributeType.CATEGORICAL, new HashSet<String>());
attributesByType.put(AttributeType.NUMERIC, new HashSet<String>());
if (considerBooleanAttributes)
attributesByType.put(AttributeType.BOOLEAN, new HashSet<String>());
for (String attribute : attributeCharacteristics.keySet()) {
if (attributeCharacteristics.get(attribute).isNumber) {
attributesByType.get(AttributeType.NUMERIC).add(attribute);
} else if (considerBooleanAttributes && attributeCharacteristics.get(attribute).isBoolean) {
attributesByType.get(AttributeType.BOOLEAN).add(attribute);
} else {
attributesByType.get(AttributeType.CATEGORICAL).add(attribute);
}
}
return attributesByType;
}
}