/*
* RapidMiner
*
* Copyright (C) 2001-2008 by Rapid-I and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapid-i.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.performance;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.TreeMap;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.set.ConditionedExampleSet;
import com.rapidminer.example.set.NoMissingAttributesCondition;
import com.rapidminer.operator.OperatorException;
/**
* Provides methods to compute ranks for a single attribute and
* rank correlations for two attributes. When computing rank
* correlations, examples containing missing values for either
* attribute are skipped. When computing ranks, missing values
* are given missing ranks. All methods include an option to
* specify an imprecision tolerance when comparing values.
*
* @author Paul Rubin
* @version $Id: RankStatistics.java,v 1.5 2008/05/09 19:22:43 ingomierswa Exp $
*/
public class RankStatistics {
/**
 * Calculates the Spearman rank correlation between two attributes.
 *
 * The example set is first reduced via extract(...) to the two attributes
 * and to examples without missing values. Both attributes are then ranked
 * with rank(...) and rho is computed as the Pearson correlation of the two
 * rank vectors. If either rank vector has no variance (which includes the
 * empty example set), 0 is returned.
 *
 * @param eSet the example set
 * @param a the first attribute to correlate
 * @param b the second attribute to correlate
 * @param f a fuzz factor (allowance for imprecision) when ranking
 * @return the rank correlation, or 0 if it is undefined
 * @throws OperatorException declared by the signature; nothing in this
 *         method's own code throws it explicitly
 */
public static double rho(ExampleSet eSet, Attribute a, Attribute b, double f) throws OperatorException {
    // reduce the data to attributes a and b without missing values
    ExampleSet e = extract(eSet, a, b);
    double[] ranka = rank(e, a, null, f);
    // nominal values of b are translated into a's mapping before ranking
    double[] rankb = rank(e, b, a, f);
    int nObs = ranka.length;

    // Tied values share the average of the positions they occupy, so the
    // ranks of nObs observations always sum to nObs * (nObs + 1) / 2 and
    // their mean is known exactly.
    double meanRank = (nObs + 1.0d) / 2.0d;

    // Accumulate centered sums. The uncentered form (sum of squares minus
    // nObs * mean^2) subtracts two huge, nearly equal numbers; for large
    // example sets the rounding error could make a "variance" slightly
    // negative and turn the square root below into NaN.
    double covariance = 0;
    double varianceA = 0;
    double varianceB = 0;
    for (int i = 0; i < nObs; i++) {
        double devA = ranka[i] - meanRank;
        double devB = rankb[i] - meanRank;
        covariance += devA * devB;
        varianceA += devA * devA;
        varianceB += devB * devB;
    }

    // return Spearman's rho; a zero denominator means at least one
    // attribute is constant, in which case 0 is reported
    double value = Math.sqrt(varianceA * varianceB);
    if (value != 0)
        return covariance / value;
    else
        return 0;
}
/**
 * Calculates the Spearman rank correlation between two attributes,
 * ranking with exact comparisons (a fuzz factor of zero).
 *
 * @param eSet the example set
 * @param a the first attribute to correlate
 * @param b the second attribute to correlate
 * @return the rank correlation
 * @throws OperatorException
 */
public static double rho(ExampleSet eSet, Attribute a, Attribute b) throws OperatorException {
    // a fuzz factor of zero means values are tied only if they are equal
    final double noFuzz = 0.0;
    return rho(eSet, a, b, noFuzz);
}
/**
 * Calculates ranks for an attribute.
 *
 * Ranks are returned as double precision values, with 1 as the
 * rank of the smallest value. Values within +/- fuzz of each other
 * may be considered tied. Tied values receive identical ranks, namely
 * the average of the positions they occupy.
 * Missing values receive rank NaN.
 *
 * Note that application of the "fuzz" factor is dependent on the
 * order of the observations in the example set. For instance, if
 * the first three values encountered are x, x+fuzz and x+2*fuzz,
 * the first two will be considered tied but the third will not,
 * since x+2*fuzz is not within +/- fuzz of x.
 *
 * @param eSet the example set
 * @param att the attribute to rank
 * @param mappingAtt if not null and att is nominal, every non-missing
 *        nominal value of att is translated to its index in the mapping
 *        of this attribute before ranking; may be null
 * @param fuzz values within +/- fuzz may be considered tied
 * @return a double precision array of ranks, one entry per example in
 *         iteration order
 */
public static double[] rank(ExampleSet eSet, Attribute att, Attribute mappingAtt, double fuzz) {
    // Buckets of example indices keyed by value, kept in ascending order.
    // With a fuzz factor the comparator merges every value into the bucket
    // of the first key it is found to lie within +/- fuzz of.
    TreeMap<Double, ArrayList<Integer>> buckets;
    if (fuzz == 0.0) {
        buckets = new TreeMap<Double, ArrayList<Integer>>();
    } else {
        buckets = new TreeMap<Double, ArrayList<Integer>>(new FuzzyComp(fuzz));
    }

    double[] rank = new double[eSet.size()];
    int i = 0; // example index
    Iterator<Example> reader = eSet.iterator();
    while (reader.hasNext()) {
        Example e = reader.next();
        double x = e.getValue(att);

        // Missing values must be detected BEFORE the nominal remapping:
        // casting NaN to int yields 0, so remapping first would silently
        // turn a missing value into the nominal value with index 0.
        if (Double.isNaN(x)) {
            rank[i++] = Double.NaN;
            continue;
        }

        if (att.isNominal() && mappingAtt != null) {
            String xString = att.getMapping().mapIndex((int) x);
            x = mappingAtt.getMapping().getIndex(xString);
        }

        // add the example index to the bucket of its value, creating the
        // bucket on first sight of the value
        ArrayList<Integer> tied = buckets.get(x);
        if (tied == null) {
            tied = new ArrayList<Integer>();
            buckets.put(x, tied);
        }
        tied.add(i++);
    }

    // convert the buckets to ranks: a bucket of k tied values that follows
    // r already ranked values occupies positions r+1 .. r+k, whose average
    // is r + (1 + k) / 2
    double alreadyRanked = 0;
    for (ArrayList<Integer> tied : buckets.values()) {
        double sharedRank = alreadyRanked + (1.0 + tied.size()) / 2.0;
        for (int index : tied)
            rank[index] = sharedRank;
        alreadyRanked += tied.size();
    }
    return rank;
}
/**
 * Calculates ranks for an attribute using exact comparisons (a fuzz
 * factor of zero).
 *
 * Ranks are returned as double precision values, with 1 as the
 * rank of the smallest value. Tied values receive identical ranks.
 * Missing values receive rank NaN.
 *
 * @param eSet the example set
 * @param att the attribute to rank
 * @param mappingAtt the attribute which might be used for remapping the values
 * @return a double precision array of ranks
 */
public static double[] rank(ExampleSet eSet, Attribute att, Attribute mappingAtt) {
    // a fuzz factor of zero means values are tied only if they are equal
    final double noFuzz = 0.0;
    return rank(eSet, att, mappingAtt, noFuzz);
}
/**
 * Comparator for doubles using a fuzz factor: two values are reported as
 * equal unless one exceeds the other by more than the fuzz factor.
 */
static class FuzzyComp implements Comparator<Double>, Serializable {
    private static final long serialVersionUID = -7752907616633799595L;

    /** Comparison fuzz factor, stored as the absolute value of the constructor argument. */
    private double fuzz;

    /**
     * Creates a comparator with the given tolerance.
     *
     * @param f the fuzz factor; its sign is ignored
     */
    FuzzyComp(double f) {
        this.fuzz = Math.abs(f);
    }

    /**
     * Compares two values with tolerance.
     *
     * @return 1 if x is greater than y + fuzz, -1 if x is less than
     *         y - fuzz, and 0 otherwise
     */
    public int compare(Double x, Double y) {
        if (x > y + fuzz) {
            return 1;
        }
        if (x < y - fuzz) {
            return -1;
        }
        return 0;
    }
}
/**
 * Extracts an example set containing just the two specified
 * attributes and no missing values.
 *
 * The source example set is cloned, so its own attribute list is not
 * modified by this method.
 *
 * @param eSet the source example set
 * @param a the first attribute to extract
 * @param b the second attribute to extract
 * @return the reduced example set
 */
private static ExampleSet extract(ExampleSet eSet, Attribute a, Attribute b) {
    // work on a copy whose only regular attributes are a and b
    ExampleSet reduced = (ExampleSet) eSet.clone();
    reduced.getAttributes().clearRegular();
    reduced.getAttributes().addRegular(a);
    reduced.getAttributes().addRegular(b);

    // wrap the copy so that the no-missing-attributes condition is applied
    NoMissingAttributesCondition noMissing = new NoMissingAttributesCondition(reduced, null);
    return new ConditionedExampleSet(reduced, noMissing);
}
/**
 * Computes Kendall's tau-b rank correlation statistic, ignoring
 * examples containing missing values.
 *
 * Every pair of examples is classified as concordant (c), discordant (d),
 * tied on a only (ta), tied on b only (tb) or tied on both. The statistic
 * is (c - d) / sqrt((c + d + ta) * (c + d + tb)); pairs tied on both
 * attributes contribute to neither the numerator nor the denominator.
 * If the denominator is 0, 0 is returned.
 *
 * If b is nominal, its values are translated to their index in the
 * mapping of a before they are compared.
 *
 * @param eSet the example set
 * @param a the first attribute to correlate
 * @param b the second attribute to correlate
 * @return Kendall's tau-b rank correlation, or 0 if it is undefined
 * @throws OperatorException declared by the signature; nothing in this
 *         method's own code throws it explicitly
 */
public static double tau_b(ExampleSet eSet, Attribute a, Attribute b) throws OperatorException {
    ExampleSet e = extract(eSet, a, b); // reduced example set

    // Read (and, for nominal b, remap) every value exactly once. Comparing
    // pairs by array index avoids re-creating and fast-forwarding a second
    // iterator for every example and redoing the remapping for every pair.
    double[] xs = new double[e.size()];
    double[] ys = new double[e.size()];
    int n = 0; // number of examples read
    Iterator<Example> reader = e.iterator();
    while (reader.hasNext()) {
        Example z = reader.next();
        double x = z.getValue(a);
        double y = z.getValue(b);
        if (b.isNominal() && a != null) {
            String yString = b.getMapping().mapIndex((int) y);
            y = a.getMapping().getIndex(yString);
        }
        xs[n] = x;
        ys[n] = y;
        n++;
    }

    long c = 0; // concordant pairs
    long d = 0; // discordant pairs
    long ta = 0; // pairs tied on a (only)
    long tb = 0; // pairs tied on b (only)
    // iterate through all unordered pairs (p, q) with p before q
    for (int p = 0; p < n; p++) {
        double x = xs[p];
        double y = ys[p];
        for (int q = p + 1; q < n; q++) {
            double xx = xs[q];
            double yy = ys[q];
            if (x == xx) {
                if (y != yy)
                    ta++; // tied only on a
                // pairs tied on both attributes are not counted anywhere
            } else if (y == yy)
                tb++; // tied only on b
            else if ((x > xx && y > yy) || (x < xx && y < yy))
                c++; // concordant pair
            else
                d++; // discordant pair
        }
    }

    double num = c - d;
    // convert to double before multiplying so the product cannot overflow
    double f1 = c + d + ta;
    double f2 = c + d + tb;
    double den = Math.sqrt(f1 * f2);
    if (den != 0)
        return num / den;
    else
        return 0;
}
/**
 * Computes Kendall's tau-b rank correlation statistic, ignoring
 * examples containing missing values, with approximate comparisons.
 *
 * Every pair of examples is classified as concordant (c), discordant (d),
 * tied on a only (ta), tied on b only (tb) or tied on both, where two
 * values count as tied when they lie within +/- fuzz of each other. The
 * statistic is (c - d) / sqrt((c + d + ta) * (c + d + tb)); pairs tied on
 * both attributes contribute to neither the numerator nor the denominator.
 * If the denominator is 0, 0 is returned.
 *
 * If b is nominal, its values are translated to their index in the
 * mapping of a before they are compared.
 *
 * @param eSet the example set
 * @param a the first attribute to correlate
 * @param b the second attribute to correlate
 * @param fuzz values within +/- fuzz may be considered tied
 * @return Kendall's tau-b rank correlation, or 0 if it is undefined
 * @throws OperatorException declared by the signature; nothing in this
 *         method's own code throws it explicitly
 */
public static double tau_b(ExampleSet eSet, Attribute a, Attribute b, double fuzz) throws OperatorException {
    ExampleSet e = extract(eSet, a, b); // reduced example set
    FuzzyComp fc = new FuzzyComp(fuzz);

    // Read (and, for nominal b, remap) every value exactly once. Comparing
    // pairs by array index avoids re-creating and fast-forwarding a second
    // iterator for every example and redoing the remapping for every pair.
    double[] xs = new double[e.size()];
    double[] ys = new double[e.size()];
    int n = 0; // number of examples read
    Iterator<Example> reader = e.iterator();
    while (reader.hasNext()) {
        Example z = reader.next();
        double x = z.getValue(a);
        double y = z.getValue(b);
        if (b.isNominal() && a != null) {
            String yString = b.getMapping().mapIndex((int) y);
            y = a.getMapping().getIndex(yString);
        }
        xs[n] = x;
        ys[n] = y;
        n++;
    }

    // The number of pairs grows with the square of the number of examples,
    // so the counters are long rather than int.
    long c = 0; // concordant pairs
    long d = 0; // discordant pairs
    long ta = 0; // pairs tied on a (only)
    long tb = 0; // pairs tied on b (only)
    // iterate through all unordered pairs (p, q) with p before q
    for (int p = 0; p < n; p++) {
        for (int q = p + 1; q < n; q++) {
            int xc = fc.compare(xs[p], xs[q]);
            int yc = fc.compare(ys[p], ys[q]);
            if (xc == 0) {
                if (yc != 0)
                    ta++; // tied only on a
                // pairs tied on both attributes are not counted anywhere
            } else if (yc == 0)
                tb++; // tied only on b
            else if (xc == yc)
                c++; // concordant pair
            else
                d++; // discordant pair
        }
    }

    double num = c - d;
    // Convert each factor to double BEFORE multiplying: the product of the
    // two pair counts exceeds the int range for as few as a few hundred
    // examples, which previously corrupted the denominator.
    double f1 = c + d + ta;
    double f2 = c + d + tb;
    double den = Math.sqrt(f1 * f2);
    if (den != 0)
        return num / den;
    else
        return 0;
}
}