/*
* Encog(tm) Core v3.4 - Java Version
* http://www.heatonresearch.com/encog/
* https://github.com/encog/encog-java-core
* Copyright 2008-2016 Heaton Research, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* For more information on Heaton Research copyrights, licenses
* and trademarks visit:
* http://www.heatonresearch.com/copyright
*/
package org.encog.ml.importance;
import org.encog.EncogError;
import org.encog.ml.data.MLDataPair;
import org.encog.ml.data.MLDataSet;
import org.encog.ml.importance.AbstractFeatureImportance;
/**
* Compute feature importance using correlation between each of the features and the target. This ranker should only
* be used with datasets that are either regression or binary classifiers. If there are multiple outputs, this ranker
* will throw an error. Also, if there are are more than 2 classes encoded into a single output neuron (e.g. a
* SVM designed for more than two classes), this ranker should not be used.
*
* This ranker does not require a trained model, it simply looks at the training data and determines how closely each
* of the inputs correlates to the target.
*
* https://en.wikipedia.org/wiki/Correlation_and_dependence
*/
public class CorrelationFeatureImportanceCalc extends AbstractFeatureImportance {
/**
* {@inheritDoc}
*/
@Override
public void performRanking() {
throw new EncogError("This algorithm requires a dataset to measure performance against, please call performRanking with a dataset.");
}
/**
* {@inheritDoc}
*/
@Override
public void performRanking(MLDataSet theDataset) {
if( getModel() !=null ) {
throw new EncogError("This algorithm does not use a model, please pass null to init for the model.");
}
if( theDataset.getIdealSize() !=1 ) {
throw new EncogError("This algorithm can only be used with a single-output dataset.");
}
// Calculate sums for the mean values
double[] mean = new double[1+theDataset.getInputSize()];
for(MLDataPair pair: theDataset) {
mean[0] += pair.getIdeal().getData(0);
for(int i=0;i<pair.getInput().size();i++) {
mean[1+i] += pair.getInput().getData(i);
}
}
// Calculate the means
for(int i=0;i<mean.length;i++) {
mean[i]/=theDataset.size();
}
// Calculate the variance
double[] sd = new double[1+theDataset.getInputSize()];
for(MLDataPair pair: theDataset) {
double d = pair.getIdeal().getData(0) - mean[0];
sd[0] += d*d;
for(int i=0;i<pair.getInput().size();i++) {
d = pair.getInput().getData(i) - mean[1+i];
sd[1+i] += d*d;
}
}
// Turn variance to SD
for(int i=0;i<mean.length;i++) {
sd[i]=Math.sqrt(sd[i]/theDataset.size());
}
// Compute correlation
for(int i=0;i<theDataset.getInputSize();i++) {
FeatureRank rank = this.getFeatures().get(i);
double acc = 0;
for(MLDataPair pair: theDataset) {
double a = pair.getIdeal().getData(0) - mean[0];
double b = pair.getInput().getData(i) - mean[i+1];
acc+=a*b;
}
double cov = (1.0/(1.0-theDataset.size()))*acc;
double cor = cov / (sd[0]*sd[i+1]);
rank.setImportancePercent(Math.abs(cor));
rank.setTotalWeight(cor);
}
}
}