/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.learner.bayes;
import java.util.ArrayList;
import java.util.Collection;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.set.ExampleSetUtilities;
import com.rapidminer.operator.OperatorProgress;
import com.rapidminer.operator.ProcessStoppedException;
import com.rapidminer.tools.Tools;
import com.rapidminer.tools.math.VectorMath;
import com.rapidminer.tools.math.distribution.DiscreteDistribution;
import com.rapidminer.tools.math.distribution.Distribution;
import com.rapidminer.tools.math.distribution.NormalDistribution;
/**
* DistributionModel is a model for learners which estimate distributions of attribute values from
* example sets like NaiveBayes.
*
* Predictions are calculated as product of the conditional probabilities for all attributes times
* the class probability.
*
* The basic learning concept is to simply count occurrences of classes and attribute values. This
* means no probabilities are calculated during the learning step. This is only done before output.
* Optionally, this calculation can apply a Laplace correction which means in particular that zero
* probabilities are avoided which would hide information in distributions of other attributes.
*
* @author Tobias Malbrecht
*/
public class SimpleDistributionModel extends DistributionModel {
private static final long serialVersionUID = -402827845291958569L;
private static final String UNKNOWN_VALUE_NAME = "unknown";
public static final int INDEX_VALUE_SUM = 0;
public static final int INDEX_SQUARED_VALUE_SUM = 1;
public static final int INDEX_MISSING_WEIGHTS = 2;
public static final int INDEX_MEAN = 0;
public static final int INDEX_STANDARD_DEVIATION = 1;
public static final int INDEX_LOG_FACTOR = 2;
private static final int OPERATOR_PROGRESS_STEPS = 200;
/** The number of classes. */
private int numberOfClasses;
/** The number of attributes. */
private int numberOfAttributes;
/** Flags indicating which attribute is nominal. */
private boolean[] nominal;
/** Class name (used for result displaying). */
private String className;
/** Class values (used for result displaying). */
private String[] classValues;
/** Attribute names (used for result displaying). */
private String[] attributeNames;
/** Nominal attribute values (used for result displaying). */
private String[][] attributeValues;
/** Total weight (or number) of examples used to build the model. */
private double totalWeight;
/** Total weight of examples belonging to the separate classes. */
private double[] classWeights;
/**
* Specifies the total weight of examples in which the different combinations of classes and
* (nominal) attribute values co-occur. In the case of numeric attributes the (weighted) sum and
* the (weighted) sum of the squared attribute values are stored which are needed to calculate
* the mean and the standard deviation/variance of the resulting (assumed) normal distribution.
*
* Array dimensions: 1st: attributes 2nd: classes 3nd: nominal values or value sum (index=0) and
* squared value sum (index=1)
*/
private double[][][] weightSums;
/** Class log (!) a-priori probabilities. */
private double[] priors;
/**
* Specifies the a-postiori distributions. Contains the log (!) a-postiori probabilities that
* certain values occur given the class value for nominal values. Contains the means and
* standard deviations for numerical attributes.
*
* Array dimensions: 1st: attributes 2nd: classes 3nd: nominal values or mean (index=0) and
* standard deviation (index=1)
*/
private double[][][] distributionProperties;
/**
* Captures if laplace correction should be applied when calculating probabilities.
*/
boolean laplaceCorrectionEnabled;
/**
* Indicates if the model has recently been updated and the actual probabilities have to be
* calculated.
*/
private boolean modelRecentlyUpdated;
/**
* This constructor allows to build a distribution model from the given data characteristics. It
* is fully updateable. For details on weightsSums, please take a look at the member variable
* weightSums. The ExampleSet is only used for storing the header. The attributes and their
* values, including the class values, must be in the same order in the headerSet as they are in
* the encoded in the weight sums.
*/
public SimpleDistributionModel(ExampleSet headerSet, double classWeights[], double[][][] weightSums) {
super(headerSet, ExampleSetUtilities.SetsCompareOption.ALLOW_SUPERSET,
ExampleSetUtilities.TypesCompareOption.ALLOW_SAME_PARENTS);
Attributes attributes = headerSet.getAttributes();
// label
Attribute labelAttribute = attributes.getLabel();
this.className = labelAttribute.getName();
this.numberOfClasses = labelAttribute.getMapping().size();
this.classValues = new String[numberOfClasses];
int i = 0;
for (String value : labelAttribute.getMapping().getValues()) {
classValues[i] = value;
i++;
}
// attributes
this.numberOfAttributes = attributes.size();
this.attributeNames = new String[numberOfAttributes];
this.attributeValues = new String[numberOfAttributes][];
this.nominal = new boolean[numberOfAttributes];
i = 0;
for (Attribute attribute : attributes) {
attributeNames[i] = attribute.getName();
if (attribute.isNominal()) {
nominal[i] = true;
attributeValues[i] = new String[attribute.getMapping().size()];
int j = 0;
for (String value : attribute.getMapping().getValues()) {
attributeValues[i][j] = value;
j++;
}
}
i++;
}
// distribution properties
this.weightSums = weightSums;
this.classWeights = classWeights;
this.totalWeight = VectorMath.sum(classWeights);
// initializing other arrays
this.distributionProperties = new double[numberOfAttributes][numberOfClasses][];
for (i = 0; i < numberOfAttributes; i++) {
for (int j = 0; j < numberOfClasses; j++) {
if (nominal[i]) {
distributionProperties[i][j] = new double[attributeValues[i].length];
} else {
distributionProperties[i][j] = new double[3];
}
}
}
this.priors = new double[numberOfClasses];
// finally derive properties from data
updateDistributionProperties();
}
/**
* This constructor will derive a complete distribution model on basis of the given trainings
* data with Laplace correcture enabled.
*/
public SimpleDistributionModel(ExampleSet trainExampleSet) {
this(trainExampleSet, true);
}
/**
* This constructor will derive a complete distribution model on basis of the given trainings
* data with Laplace correcture depending on the parameter.
*/
public SimpleDistributionModel(ExampleSet trainExampleSet, boolean laplaceCorrectionEnabled) {
super(trainExampleSet, ExampleSetUtilities.SetsCompareOption.ALLOW_SUPERSET,
ExampleSetUtilities.TypesCompareOption.ALLOW_SAME_PARENTS);
createSimpleDistributionModel(trainExampleSet, laplaceCorrectionEnabled);
// update the model
update(trainExampleSet);
// calculate the probabilities
updateDistributionProperties();
}
/**
* This constructor will derive a complete distribution model on basis of the given trainings
* data with Laplace correcture depending on the parameter.
*
* The OperatorProgress is used to update it.
*
* @throws ProcessStoppedException
*/
public SimpleDistributionModel(ExampleSet trainExampleSet, boolean laplaceCorrectionEnabled, OperatorProgress opProg)
throws ProcessStoppedException {
super(trainExampleSet, ExampleSetUtilities.SetsCompareOption.ALLOW_SUPERSET,
ExampleSetUtilities.TypesCompareOption.ALLOW_SAME_PARENTS);
createSimpleDistributionModel(trainExampleSet, laplaceCorrectionEnabled);
// update the model
update(trainExampleSet, opProg);
// calculate the probabilities
updateDistributionProperties();
}
/**
* Helper method for the constructor
*/
private void createSimpleDistributionModel(ExampleSet trainExampleSet, boolean laplaceCorrectionEnabled) {
this.laplaceCorrectionEnabled = laplaceCorrectionEnabled;
Attribute labelAttribute = trainExampleSet.getAttributes().getLabel();
numberOfClasses = labelAttribute.getMapping().size();
numberOfAttributes = trainExampleSet.getAttributes().size();
nominal = new boolean[numberOfAttributes];
attributeNames = new String[numberOfAttributes];
attributeValues = new String[numberOfAttributes][];
className = labelAttribute.getName();
classValues = new String[numberOfClasses];
for (int i = 0; i < numberOfClasses; i++) {
classValues[i] = labelAttribute.getMapping().mapIndex(i);
}
int attributeIndex = 0;
weightSums = new double[numberOfAttributes][numberOfClasses][];
distributionProperties = new double[numberOfAttributes][numberOfClasses][];
for (Attribute attribute : trainExampleSet.getAttributes()) {
attributeNames[attributeIndex] = attribute.getName();
if (attribute.isNominal()) {
nominal[attributeIndex] = true;
int mappingSize = attribute.getMapping().size() + 1;
attributeValues[attributeIndex] = new String[mappingSize];
for (int i = 0; i < mappingSize - 1; i++) {
attributeValues[attributeIndex][i] = attribute.getMapping().mapIndex(i);
}
attributeValues[attributeIndex][mappingSize - 1] = UNKNOWN_VALUE_NAME;
for (int i = 0; i < numberOfClasses; i++) {
weightSums[attributeIndex][i] = new double[mappingSize];
distributionProperties[attributeIndex][i] = new double[mappingSize];
}
} else {
nominal[attributeIndex] = false;
for (int i = 0; i < numberOfClasses; i++) {
weightSums[attributeIndex][i] = new double[3];
distributionProperties[attributeIndex][i] = new double[3];
}
}
attributeIndex++;
}
// initialization of total and a priori weight counters
totalWeight = 0.0d;
classWeights = new double[numberOfClasses];
priors = new double[numberOfClasses];
}
@Override
public String[] getAttributeNames() {
return this.attributeNames;
}
@Override
public int getNumberOfAttributes() {
return this.attributeNames.length;
}
/**
* Updates the model by counting the occurrences of classes and attribute values in combination
* with the class values.
*
* ATTENTION: only updates the weight counters, distribution properties are not updated, call
* updateDistributionProperties() to accomplish this task.
*
* The OperatorProgress is used to update it.
*
* @throws ProcessStoppedException
*/
public void update(ExampleSet exampleSet, OperatorProgress opProg) throws ProcessStoppedException {
Attribute weightAttribute = exampleSet.getAttributes().getWeight();
if (opProg != null) {
opProg.setTotal(exampleSet.size());
}
Attribute[] regularAttributes = exampleSet.getAttributes().createRegularAttributeArray();
int progressCounter = 0;
for (Example example : exampleSet) {
double weight = weightAttribute == null ? 1.0d : example.getWeight();
totalWeight += weight;
double labelValue = example.getLabel();
if (!Double.isNaN(labelValue)) {
int classIndex = (int) example.getLabel();
classWeights[classIndex] += weight;
int attributeIndex = 0;
for (Attribute attribute : regularAttributes) {
double attributeValue = example.getValue(attribute);
if (nominal[attributeIndex]) {
// the check of the value is needed because the mapping returns -1 for
// missing values:
if (!Double.isNaN(attributeValue) & attributeValue >= 0) {
if ((int) attributeValue < weightSums[attributeIndex][classIndex].length - 1) {
weightSums[attributeIndex][classIndex][(int) attributeValue] += weight;
} else {
// extend weight array if attribute value is not in mapping
for (int i = 0; i < numberOfClasses; i++) {
double[] newWeightSums = new double[(int) attributeValue + 2];
newWeightSums[newWeightSums.length
- 1] = weightSums[attributeIndex][i][weightSums[attributeIndex][i].length - 1];
for (int j = 0; j < weightSums[attributeIndex][i].length - 1; j++) {
newWeightSums[j] = weightSums[attributeIndex][i][j];
}
weightSums[attributeIndex][i] = newWeightSums;
distributionProperties[attributeIndex][i] = new double[(int) attributeValue + 2];
}
weightSums[attributeIndex][classIndex][(int) attributeValue] += weight;
// recreate internal attribute value mapping
attributeValues[attributeIndex] = new String[(int) attributeValue + 2];
for (int i = 0; i < attributeValues[attributeIndex].length - 1; i++) {
attributeValues[attributeIndex][i] = attribute.getMapping().mapIndex(i);
}
attributeValues[attributeIndex][attributeValues[attributeIndex].length
- 1] = UNKNOWN_VALUE_NAME;
}
} else {
weightSums[attributeIndex][classIndex][weightSums[attributeIndex][classIndex].length
- 1] += weight;
}
} else if (attribute.isNumerical() || attribute.isDateTime()) {
// numerical or date attribute
if (!Double.isNaN(attributeValue)) {
weightSums[attributeIndex][classIndex][INDEX_VALUE_SUM] += weight * attributeValue;
weightSums[attributeIndex][classIndex][INDEX_SQUARED_VALUE_SUM] += weight * attributeValue
* attributeValue;
} else {
// these are used to distinguish between total class weights and the
// current attribute's weights
weightSums[attributeIndex][classIndex][INDEX_MISSING_WEIGHTS] += weight;
}
}
attributeIndex++;
}
}
if (opProg != null && ++progressCounter % 100 == 0) {
opProg.setCompleted(progressCounter);
}
}
modelRecentlyUpdated = true;
}
/**
* Updates the model by counting the occurrences of classes and attribute values in combination
* with the class values.
*
* ATTENTION: only updates the weight counters, distribution properties are not updated, call
* updateDistributionProperties() to accomplish this task.
*/
@Override
public void update(ExampleSet exampleSet) {
try {
this.update(exampleSet, null);
} catch (ProcessStoppedException e) {
// Cannot happen, because operator is null
}
}
/**
* Updates the distribution properties by calculating the logged probabilities and distribution
* parameters on the basis of the weight counters.
*/
private void updateDistributionProperties() {
double f = laplaceCorrectionEnabled ? 1 / totalWeight : Double.MIN_VALUE;
double logFactorCoefficient = Math.sqrt(2 * Math.PI);
for (int i = 0; i < numberOfClasses; i++) {
priors[i] = Math.log(classWeights[i] / totalWeight);
}
for (int i = 0; i < numberOfAttributes; i++) {
if (nominal[i]) {
for (int j = 0; j < numberOfClasses; j++) {
for (int k = 0; k < weightSums[i][j].length; k++) {
distributionProperties[i][j][k] = Math
.log((weightSums[i][j][k] + f) / (classWeights[j] + f * weightSums[i][j].length));
}
}
} else {
for (int j = 0; j < numberOfClasses; j++) {
double classWeight = classWeights[j] - weightSums[i][j][INDEX_MISSING_WEIGHTS];
distributionProperties[i][j][INDEX_MEAN] = weightSums[i][j][INDEX_VALUE_SUM] / classWeight;
double standardDeviationSquared = (weightSums[i][j][INDEX_SQUARED_VALUE_SUM]
- weightSums[i][j][INDEX_VALUE_SUM] * weightSums[i][j][INDEX_VALUE_SUM] / classWeight)
/ (classWeight - 1);
double standardDeviation = 1e-3;
if (standardDeviationSquared > 0) {
standardDeviation = Math.sqrt(standardDeviationSquared);
if (Double.isNaN(standardDeviation) || standardDeviation <= 1e-3) {
standardDeviation = 1e-3;
}
}
distributionProperties[i][j][INDEX_STANDARD_DEVIATION] = standardDeviation;
distributionProperties[i][j][INDEX_LOG_FACTOR] = Math
.log(distributionProperties[i][j][INDEX_STANDARD_DEVIATION] * logFactorCoefficient);
}
}
}
modelRecentlyUpdated = false;
}
@Override
public ExampleSet performPrediction(ExampleSet exampleSet, Attribute predictedLabel) throws ProcessStoppedException {
OperatorProgress progress = null;
if (getShowProgress() && getOperator() != null && getOperator().getProgress() != null) {
progress = getOperator().getProgress();
progress.setTotal(exampleSet.size());
}
int progressCounter = 0;
if (modelRecentlyUpdated) {
updateDistributionProperties();
}
double[] probabilities = new double[numberOfClasses];
Attribute[] regularAttributes = exampleSet.getAttributes().createRegularAttributeArray();
for (Example example : exampleSet) {
double maxLogProbability = Double.NEGATIVE_INFINITY;
double probabilitySum = 0;
int mostProbableClass = 0;
int j = 0;
for (int i = 0; i < numberOfClasses; i++) {
probabilities[i] = priors[i];
}
for (Attribute attribute : regularAttributes) {
double value = example.getValue(attribute);
if (nominal[j]) {
if (!Double.isNaN(value)) {
int intValue = (int) value;
for (int i = 0; i < numberOfClasses; i++) {
if (intValue < distributionProperties[j][i].length) {
probabilities[i] += distributionProperties[j][i][intValue];
}
}
} else {
for (int i = 0; i < numberOfClasses; i++) {
probabilities[i] += distributionProperties[j][i][distributionProperties[j][i].length - 1];
}
}
} else {
if (!Double.isNaN(value)) {
for (int i = 0; i < numberOfClasses; i++) {
double base = (value - distributionProperties[j][i][INDEX_MEAN])
/ distributionProperties[j][i][INDEX_STANDARD_DEVIATION];
probabilities[i] -= distributionProperties[j][i][INDEX_LOG_FACTOR] + 0.5 * base * base;
}
}
}
j++;
}
for (int i = 0; i < numberOfClasses; i++) {
if (!Double.isNaN(probabilities[i]) && probabilities[i] > maxLogProbability) {
maxLogProbability = probabilities[i];
mostProbableClass = i;
}
}
for (int i = 0; i < numberOfClasses; i++) {
if (!Double.isNaN(probabilities[i])) {
probabilities[i] = Math.exp(probabilities[i] - maxLogProbability);
probabilitySum += probabilities[i];
} else {
probabilities[i] = 0;
}
}
if (maxLogProbability == Double.NEGATIVE_INFINITY) {
example.setPredictedLabel(Double.NaN);
for (int i = 0; i < numberOfClasses; i++) {
example.setConfidence(classValues[i], Double.NaN);
}
} else {
example.setPredictedLabel(mostProbableClass);
for (int i = 0; i < numberOfClasses; i++) {
example.setConfidence(classValues[i], probabilities[i] / probabilitySum);
}
}
// trigger progress
if (progress != null && ++progressCounter % OPERATOR_PROGRESS_STEPS == 0) {
progress.setCompleted(progressCounter);
}
}
return exampleSet;
}
public void setLaplaceCorrectionEnabled(boolean laplaceCorrectionEnabled) {
this.laplaceCorrectionEnabled = laplaceCorrectionEnabled;
}
public boolean getLaplaceCorrectionEnabled() {
return laplaceCorrectionEnabled;
}
@Override
public double getLowerBound(int attributeIndex) {
if (!nominal[attributeIndex]) {
double lowerBound = Double.POSITIVE_INFINITY;
for (int i = 0; i < numberOfClasses; i++) {
double currentLowerBound = NormalDistribution.getLowerBound(
distributionProperties[attributeIndex][i][INDEX_MEAN],
distributionProperties[attributeIndex][i][INDEX_STANDARD_DEVIATION]);
if (!Double.isNaN(currentLowerBound)) {
lowerBound = Math.min(lowerBound, currentLowerBound);
}
}
return lowerBound;
} else {
return Double.NaN;
}
}
@Override
public double getUpperBound(int attributeIndex) {
if (!nominal[attributeIndex]) {
double upperBound = Double.NEGATIVE_INFINITY;
for (int i = 0; i < numberOfClasses; i++) {
double currentUpperBound = NormalDistribution.getUpperBound(
distributionProperties[attributeIndex][i][INDEX_MEAN],
distributionProperties[attributeIndex][i][INDEX_STANDARD_DEVIATION]);
if (!Double.isNaN(currentUpperBound)) {
upperBound = Math.max(upperBound, currentUpperBound);
}
}
return upperBound;
} else {
return Double.NaN;
}
}
@Override
public boolean isDiscrete(int attributeIndex) {
if (attributeIndex >= 0 && attributeIndex < nominal.length) {
return nominal[attributeIndex];
}
return false;
}
@Override
public Collection<Integer> getClassIndices() {
Collection<Integer> classValueIndices = new ArrayList<Integer>(numberOfClasses);
for (int i = 0; i < numberOfClasses; i++) {
classValueIndices.add(i);
}
return classValueIndices;
}
@Override
public int getNumberOfClasses() {
return numberOfClasses;
}
@Override
public String getClassName(int index) {
return classValues[index];
}
/**
* This returns the raw numerical parameters of the distribution. Depends on the attribute value
* type! Use with caution.
*/
public double[] getRawDistributionParameter(int classIndex, int attributeIndex) {
return distributionProperties[attributeIndex][classIndex];
}
@Override
public Distribution getDistribution(int classIndex, int attributeIndex) {
if (nominal[attributeIndex]) {
double[] probabilities = new double[distributionProperties[attributeIndex][classIndex].length];
for (int i = 0; i < probabilities.length; i++) {
probabilities[i] = Math.exp(distributionProperties[attributeIndex][classIndex][i]);
}
return new DiscreteDistribution(attributeNames[attributeIndex], probabilities, attributeValues[attributeIndex]);
} else {
return new NormalDistribution(distributionProperties[attributeIndex][classIndex][INDEX_MEAN],
distributionProperties[attributeIndex][classIndex][INDEX_STANDARD_DEVIATION]);
}
}
public double getTotalWeight() {
return totalWeight;
}
public double[] getClassWeights() {
return classWeights;
}
public double[] getAprioriProbabilities() {
return priors;
}
@Override
public String toString() {
if (modelRecentlyUpdated) {
updateDistributionProperties();
}
StringBuffer buffer = new StringBuffer();
buffer.append("Distribution model for label attribute " + className);
buffer.append(Tools.getLineSeparators(2));
for (int i = 0; i < numberOfClasses; i++) {
String classTitle = "Class " + classValues[i] + " (" + Tools.formatNumber(Math.exp(priors[i])) + ")";
buffer.append(Tools.getLineSeparator());
buffer.append(classTitle);
buffer.append(Tools.getLineSeparator());
buffer.append(attributeNames.length + " distributions");
buffer.append(Tools.getLineSeparator());
}
return buffer.toString();
}
}