/*
* RapidMiner
*
* Copyright (C) 2001-2008 by Rapid-I and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapid-i.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.visualization.dependencies;
import java.util.Iterator;
import java.util.List;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.AttributeWeights;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
/**
* <p>This operator calculates the correlation matrix between all attributes of the
* input example set. Furthermore, attribute weights based on the correlations
* can be returned. This allows the deselection of highly correlated attributes
* with the help of an
* {@link com.rapidminer.operator.features.selection.AttributeWeightSelection}
* operator. If no weights should be created, this operator produces simply a
* correlation matrix which up to now cannot be used by other operators but can
* be displayed to the user in the result tab.</p>
*
* <p>Please note that this simple implementation
* performs a data scan for each attribute combination and might therefore take
* some time for non-memory example tables.</p>
*
* @author Ingo Mierswa
* @version $Id: CorrelationMatrixOperator.java,v 1.8 2006/04/14 15:04:22
* ingomierswa Exp $
*/
public class CorrelationMatrixOperator extends Operator {
public static final String PARAMETER_CREATE_WEIGHTS = "create_weights";
public static final String PARAMETER_NORMALIZE_WEIGHTS = "normalize_weights";
public static final String PARAMETER_SQUARED_CORRELATION = "squared_correlation";
public CorrelationMatrixOperator(OperatorDescription description) {
super(description);
}
public IOObject[] apply() throws OperatorException {
ExampleSet exampleSet = getInput(ExampleSet.class);
SymmetricalMatrix matrix = new SymmetricalMatrix("Correlation", exampleSet);
int numberOfAttributes = exampleSet.getAttributes().size();
boolean squared = getParameterAsBoolean(PARAMETER_SQUARED_CORRELATION);
boolean createWeights = getParameterAsBoolean(PARAMETER_CREATE_WEIGHTS);
boolean normalizeWeights = getParameterAsBoolean(PARAMETER_NORMALIZE_WEIGHTS);
int k = 0;
for (Attribute firstAttribute : exampleSet.getAttributes()) {
int l = 0;
for (Attribute secondAttribute : exampleSet.getAttributes()) {
matrix.setValue(k, l, getCorrelation(exampleSet, firstAttribute, secondAttribute, squared || createWeights));
checkForStop();
l++;
}
k++;
}
if (createWeights) {
AttributeWeights weights = new AttributeWeights();
// use squared correlations for weights --> learning schemes should
// be able to use both positively and negatively high correlated
// values
int i = 0;
for (Attribute attribute : exampleSet.getAttributes()) {
double sum = 0.0d;
for (int j = 0; j < numberOfAttributes; j++) {
sum += (1.0d - matrix.getValue(i, j)); // actually the
// squared value
}
weights.setWeight(attribute.getName(), sum / numberOfAttributes);
i++;
}
if (normalizeWeights) {
weights.normalize();
}
return new IOObject[] { exampleSet, weights };
} else {
return new IOObject[] { exampleSet, matrix };
}
}
/** Updates all sums needed to compute the correlation coefficient. */
private double getCorrelation(ExampleSet exampleSet, Attribute firstAttribute, Attribute secondAttribute, boolean squared) {
double sumProd = 0.0d;
double sumFirst = 0.0d;
double sumSecond = 0.0d;
double sumFirstSquared = 0.0d;
double sumSecondSquared = 0.0d;
int counter = 0;
Iterator<Example> reader = exampleSet.iterator();
while (reader.hasNext()) {
Example example = reader.next();
double first = example.getValue(firstAttribute);
double second = example.getValue(secondAttribute);
double prod = first * second;
if (!Double.isNaN(prod)) {
sumProd += prod;
sumFirst += first;
sumFirstSquared += first * first;
sumSecond += second;
sumSecondSquared += second * second;
counter++;
}
}
double r = (counter * sumProd - sumFirst * sumSecond) / (Math.sqrt((counter * sumFirstSquared - sumFirst * sumFirst) * (counter * sumSecondSquared - sumSecond * sumSecond)));
if (squared)
return r * r;
else
return r;
}
public Class<?>[] getInputClasses() {
return new Class[] { ExampleSet.class };
}
public Class<?>[] getOutputClasses() {
return getParameterAsBoolean(PARAMETER_CREATE_WEIGHTS) ? new Class[] { ExampleSet.class, AttributeWeights.class } : new Class[] { ExampleSet.class, SymmetricalMatrix.class };
}
public List<ParameterType> getParameterTypes() {
List<ParameterType> types = super.getParameterTypes();
ParameterType type = new ParameterTypeBoolean(PARAMETER_CREATE_WEIGHTS, "Indicates if attribute weights based on correlation should be calculated or if the complete matrix should be returned.", false);
type.setExpert(false);
types.add(type);
types.add(new ParameterTypeBoolean(PARAMETER_NORMALIZE_WEIGHTS, "Indicates if the attributes weights should be normalized.", true));
types.add(new ParameterTypeBoolean(PARAMETER_SQUARED_CORRELATION, "Indicates if the squared correlation should be calculated.", false));
return types;
}
}