/* * Created on 18.05.2005 for PIROL * * SVN header information: * $Author: LBST-PF-3\orahn $ * $Rev: 2509 $ * $Date: 2006-10-06 12:01:50 +0200 (Fr, 06 Okt 2006) $ * $Id: CorrelationCoefficients.java 2509 2006-10-06 10:01:50Z LBST-PF-3\orahn $ */ package org.openjump.core.attributeoperations.statistics; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; import org.openjump.core.apitools.FeatureCollectionTools; import org.openjump.core.apitools.comparisonandsorting.ObjectComparator; import com.vividsolutions.jump.feature.Feature; import com.vividsolutions.jump.feature.FeatureSchema; /** * Class that calculates various correlation coefficients. * * @author Ole Rahn * <br> * <br>FH Osnabrück - University of Applied Sciences Osnabrück, * <br>Project: PIROL (2005), * <br>Subproject: Daten- und Wissensmanagement * * @version $Rev: 2509 $ * modified: [sstein]: 16.Feb.2009 changed logger-entries to comments */ public class CorrelationCoefficients { protected Object[] dataArray = null; // TODO: use array or list instead of two, three, etc. variables protected String attrName1 = null, attrName2 = null; protected double[] means = null; protected Feature[] rawFeatures = null; //protected PersonalLogger logger = new PersonalLogger(DebugUserIds.OLE); protected class SpearmanRankNumberPair { protected double a, b; public SpearmanRankNumberPair(double a, double b) { super(); this.a = a; this.b = b; } public double getDifference(){ return this.a - this.b; } public double getA() { return a; } public double getB() { return b; } } public class CorrelationInformation { protected double coefficient = 0; public CorrelationInformation(double coefficient) { super(); this.coefficient = coefficient; } public double getCoefficient() { return coefficient; } } public class RankCorrelationInformation extends CorrelationInformation { double secondCoefficient = Double.NaN; int numberOfConcordantPairs, numberOfDiscordantPairs; public RankCorrelationInformation(double coefficient, double secondCoefficient, int numberOfConcordantPairs, int numberOfDiscordantPairs) { super(coefficient); this.secondCoefficient = secondCoefficient; this.numberOfConcordantPairs = numberOfConcordantPairs; this.numberOfDiscordantPairs = numberOfDiscordantPairs; } public RankCorrelationInformation(double coefficient, int numberOfConcordantPairs, int numberOfDiscordantPairs) { super(coefficient); this.numberOfConcordantPairs = numberOfConcordantPairs; this.numberOfDiscordantPairs = numberOfDiscordantPairs; } public int getNumberOfConcordantPairs() { return numberOfConcordantPairs; } public int getNumberOfDiscordantPairs() { return numberOfDiscordantPairs; } public double getSecondCoefficient() { return secondCoefficient; } } public CorrelationCoefficients(Feature[] features, String attr1, String attr2) { super(); this.means = new double[2]; this.attrName1 = attr1; this.attrName2 = attr2; this.rawFeatures = features; this.dataArray = this.initializeDataStorage(features); } protected CorrelationDataPair[] initializeDataStorage(Feature[] features){ List<CorrelationDataPair> points = new ArrayList<CorrelationDataPair>(); Feature feature = features[0]; FeatureSchema fs = feature.getSchema(); double x, y; int attrIndX = fs.getAttributeIndex(this.attrName1); int attrIndY = fs.getAttributeIndex(this.attrName2); this.means[0] = this.aritmeticMiddle(features, attrIndX); this.means[1] = this.aritmeticMiddle(features, attrIndY); for (int i=0; i<features.length; i++){ feature = features[i]; x = ObjectComparator.getDoubleValue(feature.getAttribute(attrIndX)); y = ObjectComparator.getDoubleValue(feature.getAttribute(attrIndY)); points.add(new CorrelationDataPair(new double[]{x,y}, i)); } return points.toArray(new CorrelationDataPair[0]); } /** * Returns the deviation of the values of the given attribute. Uses a given * mean to avoid multiple calculation of the mean. To get the mean take a look * at the FeatureCollectionTools class. This class is also used by aritmeticMiddle(). *@param features array containing the features we want the deviation for *@param attr name of the attribute to calculate the deviation for *@param mean the mean for the given features *@return the deviation *@throws IllegalArgumentException if the attribute is not of a numerical type *@see FeatureCollectionTools */ public static double getDeviation(Feature[] features, String attr, double mean){ //PersonalLogger logger = new PersonalLogger(DebugUserIds.OLE); Feature feat = features[0]; int attrIndex = feat.getSchema().getAttributeIndex(attr); if (!FeatureCollectionTools.isAttributeTypeNumeric(feat.getSchema().getAttributeType(attrIndex))) throw new IllegalArgumentException("attribute is not numeric!"); double squareSum = 0; for (int i=0; i<features.length; i++){ feat = features[i]; if (feat.getAttribute(attrIndex) != null) squareSum += Math.pow( ( ObjectComparator.getDoubleValue(feat.getAttribute(attrIndex)) - mean) ,2); else{ //logger.printWarning("skipping value (NULL), when calculating deviation"); } } return Math.sqrt( 1d/(features.length-1)*squareSum ); } protected double getVariance(String attr){ // not needed, yet // TODO: implement variance, use deviation (or the other way round) return 0.0; } protected double aritmeticMiddle(Feature[] features, int attr){ return FeatureCollectionTools.getAritmeticMiddleForAttribute(features, attr); } /** * Get the aritmetic middle for the nr-th attribut given *@param nr index number of attribut to calculate the mean for *@return the mean for the attribute or Double.NaN, if errors occured */ public double getMean(int nr){ return this.means[nr]; } /** * get Pearson's correlation coefficient (good, dimension-less measure, if there is a linear relation between the attributes) * <br>see: <a href="http://www.netzwelt.de/lexikon/Korrelationskoeffizient.html">http://www.netzwelt.de/lexikon/Korrelationskoeffizient.html</a> *@return Pearson's correlation coefficient */ public CorrelationInformation getPearsonCoefficient(){ double coefficient = 0; CorrelationDataPair pkt; double x, y, errorX, errorY; double sumErrorProducts = 0; double sumXErrorSquares = 0, sumYErrorSquares = 0; for (int i=0; i<this.dataArray.length; i++){ pkt = (CorrelationDataPair)this.dataArray[i]; try { x = pkt.getX(); y = pkt.getY(); } catch (Exception e) { //logger.printWarning(e.getMessage()); continue; } errorX = x - this.means[0]; errorY = y - this.means[1]; // zaehler sumErrorProducts += (errorX*errorY); // nenner sumXErrorSquares += (errorX*errorX); sumYErrorSquares += (errorY*errorY); } coefficient = sumErrorProducts / ( Math.sqrt(sumXErrorSquares) * Math.sqrt(sumYErrorSquares)); return new CorrelationInformation(coefficient); } protected HashMap<Integer,Double> getRank2SpearmanRankMap(Object[] sortedValues, HashMap<Object, Integer> value2NumAppearances){ HashMap<Integer,Double> rank2SpearmanRank = new HashMap<Integer, Double>(); int sumOfRanks = 0; int numAppearances; double meanSum; for (int i=0; i<sortedValues.length; i++){ numAppearances = value2NumAppearances.get(sortedValues[i]); meanSum = 0; for (int sr=0; sr<numAppearances; sr++){ meanSum += (sr + sumOfRanks + 1); } rank2SpearmanRank.put(i, (double)(meanSum/numAppearances) ); // logger.printDebug("putting: " + i + " -> " + (double)(meanSum/numAppearances) + " (" + numAppearances + ")"); sumOfRanks += numAppearances; } return rank2SpearmanRank; } /** * get Pearson's correlation coefficient (good, dimension-less measure, if there is a linear relation between the attributes) * <br>see: <a href="http://www.netzwelt.de/lexikon/Korrelationskoeffizient.html">http://www.netzwelt.de/lexikon/Korrelationskoeffizient.html</a> *@return Pearson's correlation coefficient */ public RankCorrelationInformation getSpearmansRhoCoefficient(){ double coefficient = 0; // ------------- FeatureSchema fs = this.rawFeatures[0].getSchema(); int attributeIndex1 = fs.getAttributeIndex(this.attrName1), attributeIndex2 = fs.getAttributeIndex(this.attrName2); List valuesA = Collections.list( Collections.enumeration( FeatureCollectionTools.getSetOfDifferentAttributeValues(this.rawFeatures, attributeIndex1 ))); List valuesB = Collections.list( Collections.enumeration( FeatureCollectionTools.getSetOfDifferentAttributeValues(this.rawFeatures, attributeIndex2 ))); Collections.sort( valuesA ); Collections.sort( valuesB ); Object[] valuesArrayA = valuesA.toArray(), valuesArrayB = valuesB.toArray(); HashMap<Object, Integer> value2NumAppearances1 = null, value2NumAppearances2 = null; HashMap<Object, Integer>[] maps = FeatureCollectionTools.getValueAppearancesCount(this.rawFeatures, new int[]{attributeIndex1,attributeIndex2}); value2NumAppearances1 = maps[0]; value2NumAppearances2 = maps[1]; HashMap<Integer,Double> rank2SpearmanRankA = new HashMap<Integer, Double>(), rank2SpearmanRankB = new HashMap<Integer, Double>(); rank2SpearmanRankA = this.getRank2SpearmanRankMap(valuesArrayA, value2NumAppearances1); rank2SpearmanRankB = this.getRank2SpearmanRankMap(valuesArrayB, value2NumAppearances2); CorrelationCoefficients.SpearmanRankNumberPair[] spearmanPairs = new CorrelationCoefficients.SpearmanRankNumberPair[this.rawFeatures.length]; int rankA, rankB; for (int i=0; i<this.rawFeatures.length; i++){ rankA = valuesA.indexOf(this.rawFeatures[i].getAttribute(attributeIndex1)); rankB = valuesB.indexOf(this.rawFeatures[i].getAttribute(attributeIndex2)); spearmanPairs[i] = new SpearmanRankNumberPair(rank2SpearmanRankA.get(rankA), rank2SpearmanRankB.get(rankB)); } double sumOfSpearmanDifferences = 0, difference; int numCon = 0, numDis = 0; int numPairs = spearmanPairs.length; for (int i=0; i<numPairs; i++){ difference = spearmanPairs[i].getDifference(); sumOfSpearmanDifferences += Math.pow(difference, 2); if (difference>0) numCon++; else if (difference<0) numDis++; } return new RankCorrelationInformation( ( 1 - ( (6*sumOfSpearmanDifferences)/(Math.pow(numPairs, 3)-numPairs) ) ), numCon, numDis); } /** * "Spearman Rank Order Correlations (or "rho") and Kendall's Tau-b (or "tau") Correlations are used when the variables are measured as ranks (from highest-to-lowest or lowest-to-highest)" * <br><a href="http://www.themeasurementgroup.com/datamining/definitions/correlation.htm">http://www.themeasurementgroup.com/datamining/definitions/correlation.htm</a> *@return RankCorrelationInformation */ public RankCorrelationInformation getKendalsTauRankCoefficient(){ CorrelationDataPair currentPair, referncePair; double currX, currY, refX, refY; int numConcordant = 0, numDiscordant = 0, numTiesX = 0, numTiesY = 0, numVergleiche = 0; //logger.printDebug("starting calculation"); for (int i=0; i<this.dataArray.length; i++){ currentPair = (CorrelationDataPair)this.dataArray[i]; try { currX = currentPair.getCoordinate(0); currY = currentPair.getCoordinate(1); } catch (Exception e1) { //logger.printMinorError(e1.getMessage()); continue; } for (int j=i+1; j<this.dataArray.length; j++, numVergleiche++){ referncePair = (CorrelationDataPair)this.dataArray[j]; try { refX = referncePair.getCoordinate(0); refY = referncePair.getCoordinate(1); } catch (Exception e2) { //logger.printMinorError(e2.getMessage()); continue; } if ( (currX < refX && currY < refY) || (currX > refX && currY > refY) ) { numConcordant ++; } else if ( (currX < refX && currY > refY) || (currX > refX && currY < refY) ) { numDiscordant ++; } try { if (currX == refX){ numTiesX ++; } if (currY == refY){ numTiesY ++; } } catch (Exception e) { //logger.printWarning(e.getMessage()); } } } //logger.printDebug("finishing calculation"); //logger.printDebug("Vergleiche: " + numVergleiche); //logger.printDebug("numConcordant: " + numConcordant); //logger.printDebug("numDiscordant: " + numDiscordant); if (numTiesX > 0){ //logger.printWarning("numTiesX: " + numTiesX); } if (numTiesY > 0){ //logger.printWarning("numTiesY: " + numTiesY); } // according to the formula: //double kendallsTau_alpha = (double)(numConcordant-numDiscordant) / (((double)this.dataArray.length*(double)(this.dataArray.length-1.0))/2.0); // less calculation, but equivalent (?) double kendallsTau_alpha = (double)(numConcordant-numDiscordant) / (double)(numVergleiche); double kendallsTau_beta = (double)(numConcordant-numDiscordant) / Math.sqrt( (double)(numConcordant+numDiscordant+numTiesX) * (double)(numConcordant+numDiscordant+numTiesY) ); // third kendalls tau coefficient should be equal to the first one, since we allways have the same number of x and y values //double kendallsTau_gamma = 2.0*this.dataArray.length * (numConcordant-numDiscordant) / ((this.dataArray.length-1)*(this.dataArray.length*this.dataArray.length)); return new RankCorrelationInformation(kendallsTau_alpha, kendallsTau_beta, numConcordant, numDiscordant); } }