/*
* The Unified Mapping Platform (JUMP) is an extensible, interactive GUI
* for visualizing and manipulating spatial features with geometry and attributes.
*
* JUMP is Copyright (C) 2003 Vivid Solutions
*
* This program implements extensions to JUMP and is
* Copyright (C) Stefan Steiniger.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* For more information, contact:
* Stefan Steiniger
* perriger@gmx.de
*/
/***********************************************
* created on 05.11.2007
* last modified: 09.11.2007 - improved equal#/quantiles method for chained similar values
*
* author: sstein
*
* description:
* provides several 1-D classification methods for
* arrays of double values.
*
*
***********************************************/
package org.openjump.core.attributeoperations;
import java.util.ArrayList;
import java.util.List;
import org.math.array.DoubleArray;
import org.math.array.StatisticSample;
import com.vividsolutions.jump.I18N;
/**
 * Collection of static 1-D classification helpers for arrays of double values.
 * <p>
 * All <code>classify...</code> methods return the <i>break values</i> between
 * classes: for <code>n</code> classes, <code>n - 1</code> breaks are returned;
 * the minimum and maximum of the data are never part of the result.
 */
public class Classifier1D {

    private static String pluginname = "classifyplot";

    //-- note: add these strings to the List in Classifier1D.getAvailableClassificationMethods()
    //         to make them available for the JUMP GUI.
    //         The fields are intentionally not final: they are overwritten with the
    //         translated names by getAvailableClassificationMethods().
    public static String EQUAL_RANGE = "Equal Range";
    public static String EQUAL_NUMBER = "Equal Number/Quantiles";
    public static String MEAN_STDEV = "Mean Standard Deviation";
    public static String MAX_BREAKS = "Maximal Breaks";
    public static String JENKS_BREAKS = "Jenks Optimization";
    public static String KMEANS_OPTIMIZE = "Optimization with k-means";

    /**
     * Replaces the method-name fields of this class by their I18N translations
     * and returns them as a list for the GUI.
     * <p>
     * Side effect: {@link #EQUAL_RANGE}, {@link #EQUAL_NUMBER}, {@link #MEAN_STDEV},
     * {@link #MAX_BREAKS} and {@link #JENKS_BREAKS} are reassigned. This method should
     * therefore be called before those fields are used for comparisons.
     * {@link #KMEANS_OPTIMIZE} is neither translated nor part of the returned list.
     *
     * @return the (translated) names of the available classification methods
     */
    public static List<String> getAvailableClassificationMethods() {
        //-- assign i18N strings
        Classifier1D.EQUAL_RANGE = I18N.get("ui.renderer.style.ColorThemingStylePanel.Equal-Interval");
        Classifier1D.EQUAL_NUMBER = I18N.get("ui.renderer.style.ColorThemingStylePanel.Quantile-Equal-Number");
        Classifier1D.MEAN_STDEV = I18N.get("ui.renderer.style.ColorThemingStylePanel.Mean-Standard-Deviation");
        Classifier1D.MAX_BREAKS = I18N.get("ui.renderer.style.ColorThemingStylePanel.Maximal-Breaks");
        Classifier1D.JENKS_BREAKS = I18N.get("ui.renderer.style.ColorThemingStylePanel.Jenks-Optimal-Method");
        //-- make a list for the GUI
        List<String> classifierList = new ArrayList<String>();
        classifierList.add(Classifier1D.EQUAL_RANGE);
        classifierList.add(Classifier1D.EQUAL_NUMBER);
        classifierList.add(Classifier1D.MEAN_STDEV);
        classifierList.add(Classifier1D.MAX_BREAKS);
        classifierList.add(Classifier1D.JENKS_BREAKS);
        return classifierList;
    }

    /**
     * Rejects class counts for which no break array can be built.
     *
     * @param numberClasses the requested number of classes
     * @throws IllegalArgumentException if numberClasses is smaller than 1
     */
    private static void checkNumberClasses(int numberClasses) {
        if (numberClasses < 1) {
            throw new IllegalArgumentException(
                    "numberClasses must be at least 1 but was " + numberClasses);
        }
    }

    /**
     * Calculates class limits with equal range.
     *
     * @param data the values to classify
     * @param numberClasses the number of classes (at least 1)
     * @return break values for classes. E.g. for 4 ranges 3 breaks are returned.
     *         Min and Max values are not returned.
     * @throws IllegalArgumentException if numberClasses is smaller than 1
     */
    public static double[] classifyEqualRange(double[] data, int numberClasses) {
        checkNumberClasses(numberClasses);
        double[] limits = new double[numberClasses - 1];
        double min = DoubleArray.min(data);
        double max = DoubleArray.max(data);
        double delta = (max - min) / numberClasses;
        for (int i = 0; i < limits.length; i++) {
            limits[i] = min + (delta * (i + 1));
        }
        return limits;
    }

    /**
     * Calculates class limits with equal number, which is equal to the "quantiles" method.
     * Note that differences in the number of items per class occur if items have the same
     * value and need to be grouped into the same class.
     * <p>
     * A break is placed half way between two neighbouring distinct values. If the values
     * on both sides of a quantile position are equal, the break is moved upwards to the end
     * of that run of equal values; if the run reaches the end of the data it is moved
     * downwards to the start of the run instead. Consecutive breaks may therefore coincide.
     *
     * @param data the values to classify
     * @param numberClasses the number of classes (at least 1)
     * @return break values for classes. E.g. for 4 ranges 3 breaks are returned.
     *         Min and Max values are not returned.
     * @throws IllegalArgumentException if numberClasses is smaller than 1, or if breaks are
     *         requested and there are fewer data values than classes
     */
    public static double[] classifyEqualNumber(double[] data, int numberClasses) {
        checkNumberClasses(numberClasses);
        double[] limits = new double[numberClasses - 1];
        if (limits.length > 0 && data.length < numberClasses) {
            // itemsPerClass would be 0 and the first break position would be before the data
            throw new IllegalArgumentException("numberClasses (" + numberClasses
                    + ") must not exceed the number of data values (" + data.length + ")");
        }
        int itemsPerClass = data.length / numberClasses;
        double[] orderedItems = DoubleArray.sort(data);
        for (int i = 0; i < limits.length; i++) {
            // pos is the index of the first item of the next class; 1 <= pos < length here
            int pos = itemsPerClass * (i + 1);
            limits[i] = breakAtPosition(orderedItems, pos);
        }
        return limits;
    }

    /**
     * Finds a break value separating <code>sorted[pos - 1]</code> from the next different value.
     *
     * @param sorted values in ascending order
     * @param pos index with <code>1 <= pos < sorted.length</code>
     * @return the mid point between <code>sorted[pos - 1]</code> and the nearest different
     *         value (searched upwards first, then downwards), or the value itself if all
     *         values are equal
     */
    private static double breakAtPosition(double[] sorted, int pos) {
        double value = sorted[pos - 1];
        //-- move on, until values are different (but never beyond the end of the array)
        int upper = pos;
        while (upper < sorted.length && sorted[upper] == value) {
            upper++;
        }
        if (upper < sorted.length) {
            return 0.5 * (value + sorted[upper]);
        }
        //-- the run of equal values reaches the end: place the break below the run
        int lower = pos - 2;
        while (lower >= 0 && sorted[lower] == value) {
            lower--;
        }
        if (lower >= 0) {
            return 0.5 * (sorted[lower] + value);
        }
        //-- all values are equal; there is nothing to separate
        return value;
    }

    /**
     * Calculates class limits using mean value and standard deviation, i.e. for 5 classes:
     * c1: values < m - 2std, c2: m - 2std < values < m - 1std,
     * c3: m - 1std < values < m + 1std, c4: m + 1std < values < m + 2std,
     * c5: values > m + 2std.
     * <p>
     * For an even number of classes the mean itself is a break; for an odd number
     * of classes the middle class spans from m - 1std to m + 1std.
     *
     * @param data the values to classify
     * @param numberClasses the number of classes (at least 1)
     * @return break values for classes. E.g. for 4 ranges 3 breaks are returned.
     *         Min and Max values are not returned.
     * @throws IllegalArgumentException if numberClasses is smaller than 1
     */
    public static double[] classifyMeanStandardDeviation(double[] data, int numberClasses) {
        checkNumberClasses(numberClasses);
        double[] limits = new double[numberClasses - 1];
        double mean = StatisticSample.mean(data);
        double std = StatisticSample.stddeviation(data);
        boolean evenNumber = (numberClasses % 2 == 0);
        int multiplier = -(numberClasses / 2);
        if (evenNumber) { // adjust for an even number of classes
            multiplier = multiplier + 1;
        }
        for (int i = 0; i < limits.length; i++) {
            limits[i] = mean + (multiplier * std);
            multiplier = multiplier + 1;
            //-- ensure that middle class is around mean, for an uneven number of classes
            if ((multiplier == 0) && !evenNumber) {
                multiplier = 1;
            }
        }
        return limits;
    }

    /**
     * Calculates class limits using the Maximum Breaks method (see e.g. T. A. Slocum:
     * "Thematic Cartography and Visualization", 1999): breaks are placed in the middle
     * of the largest gaps between neighbouring sorted values.
     * <p>
     * Every gap is used at most once. If more breaks are requested than there are gaps,
     * the surplus breaks repeat the position of the first gap.
     *
     * @param data the values to classify
     * @param numberClasses the number of classes (at least 1)
     * @return break values for classes in ascending order. E.g. for 4 ranges 3 breaks are
     *         returned. Min and Max values are not returned.
     * @throws IllegalArgumentException if numberClasses is smaller than 1, or if breaks are
     *         requested but fewer than two data values are given
     */
    public static double[] classifyMaxBreaks(double[] data, int numberClasses) {
        checkNumberClasses(numberClasses);
        int numBreaks = numberClasses - 1;
        if (numBreaks == 0) {
            return new double[0];
        }
        double[] sortData = DoubleArray.sort(data);
        if (sortData.length < 2) {
            throw new IllegalArgumentException(
                    "at least two data values are needed to calculate breaks, but got "
                    + sortData.length);
        }
        //-- calc differences (distance between neighbouring values)
        double[] gaps = new double[sortData.length - 1];
        for (int i = 0; i < gaps.length; i++) {
            gaps[i] = sortData[i + 1] - sortData[i];
        }
        //-- find largest differences; a flag (instead of overwriting the gap value)
        //   guarantees that a gap that was already used is not selected again
        boolean[] used = new boolean[gaps.length];
        double[] unSortedLimits = new double[numBreaks];
        for (int i = 0; i < numBreaks; i++) {
            int widest = -1;
            for (int j = 0; j < gaps.length; j++) {
                // strict comparison: on ties the first gap wins
                if (!used[j] && (widest < 0 || gaps[j] > gaps[widest])) {
                    widest = j;
                }
            }
            if (widest < 0) {
                //-- all gaps are used up: repeat the first gap
                widest = 0;
            } else {
                used[widest] = true;
            }
            unSortedLimits[i] = 0.5 * (sortData[widest] + sortData[widest + 1]);
        }
        //-- sort limits from min to max
        return DoubleArray.sort(unSortedLimits);
    }

    /**
     * Calculates class limits using Jenks's Optimisation Method (Natural Breaks).
     * <p>
     * Breaks are placed half way between the last item of a class and the first
     * item of the next class. If the optimisation ends with classes that contain no
     * item (possible for data with many equal values, or with fewer values than
     * classes), the corresponding breaks are set to the smallest data value.
     * If data is empty, an array of zeros is returned.
     *
     * @param data the values to classify
     * @param numberClasses the number of classes (at least 1)
     * @return break values for classes. E.g. for 4 ranges 3 breaks are
     *         returned. Min and Max values are not returned.
     * @throws IllegalArgumentException if numberClasses is smaller than 1
     */
    public static double[] classifyNaturalBreaks(double[] data, int numberClasses) {
        checkNumberClasses(numberClasses);
        double[] limits = new double[numberClasses - 1];
        if (limits.length == 0) {
            return limits;
        }
        double[] orderedItems = DoubleArray.sort(data);
        int numData = data.length;
        if (numData == 0) {
            return limits;
        }
        // Both matrices are used with 1-based indices [item count][class count].
        // classStart[l][j]: 1-based index of the first item of class j when the first
        //                   l items are split into j classes.
        // minCost[l][j]:    the summed squared deviation of that split.
        int[][] classStart = new int[numData + 1][numberClasses + 1];
        double[][] minCost = new double[numData + 1][numberClasses + 1];
        for (int i = 1; i <= numberClasses; i++) {
            classStart[1][i] = 1;
            minCost[1][i] = 0;
            for (int j = 2; j <= numData; j++) {
                minCost[j][i] = Double.MAX_VALUE;
            }
        }
        double v = 0;
        for (int l = 2; l <= numData; l++) {
            double s1 = 0; // sum of the values of the candidate last class
            double s2 = 0; // sum of the squared values of the candidate last class
            double w = 0;  // number of values in the candidate last class
            for (int m = 1; m <= l; m++) {
                int i3 = l - m + 1; // 1-based index of the first item of the candidate class
                double val = orderedItems[i3 - 1];
                s2 += val * val;
                s1 += val;
                w++;
                v = s2 - (s1 * s1) / w; // squared deviation of items i3..l from their mean
                int i4 = i3 - 1;
                if (i4 != 0) {
                    for (int j = 2; j <= numberClasses; j++) {
                        double cost = v + minCost[i4][j - 1];
                        if (minCost[l][j] >= cost) {
                            classStart[l][j] = i3;
                            minCost[l][j] = cost;
                        }
                    }
                }
            }
            classStart[l][1] = 1;
            minCost[l][1] = v;
        }
        //-- walk back from the last class to read the class boundaries
        int k = numData;
        for (int j = numberClasses; j >= 2; j--) {
            int start = classStart[k][j];
            if (start < 2) {
                //-- no item is left below this class: the lower classes are empty.
                //   Use the smallest value as break instead of reading index -1.
                limits[j - 2] = orderedItems[0];
                k = 1;
            } else {
                //-- [sstein] modified version from Hisaji,
                //   otherwise breaks will be "on" one item
                int id = start - 2; // 0-based index of the last item of the class below
                limits[j - 2] = 0.5 * (orderedItems[id] + orderedItems[id + 1]);
                k = start - 1;
            }
        }
        return limits;
    }

    /**
     * Calculates class limits using the optimal breaks method (see e.g. T. A. Slocum:
     * "Thematic Cartography and Visualization", 1999, p.73, or B.D. Dent: "Cartography:
     * Thematic Map Design", 1999, p.146): starting from the breaks of another method,
     * the breaks are moved with {@link #adjustLimitsKMeans(double[], double[])} as long
     * as the goodness of variance fit (GVF) increases, for at most 50 rounds.
     * <p>
     * Note: limits should not be equal to values, since values that are equal to bounds
     * can be classified into 2 classes.
     *
     * @param data the values to classify
     * @param numberClasses the number of classes (at least 1)
     * @param initialLimitAlgorithm 1: maxBreaks, 2: equalRange, 3: quantiles,
     *        4: MeanStd-Dev, 5: Jenks; any other value: maxBreaks
     * @return break values for classes. E.g. for 4 ranges 3 breaks are returned.
     *         Min and Max values are not returned.
     * @throws IllegalArgumentException if the selected initial method rejects its arguments
     */
    public static double[] classifyKMeansOnExistingBreaks(double[] data, int numberClasses,
            int initialLimitAlgorithm) {
        final int maxRuns = 50;
        //-- sort data (to make movement of limits easier)
        double[] sortedData = DoubleArray.sort(data);

        //========== first round ==============
        //-- calc initial SDAM (squared deviation, array mean)
        double SDAM = Classifier1D.calcSDAM(sortedData);
        //-- develop initial class boundaries
        double[] limits;
        switch (initialLimitAlgorithm) {
            case 2:
                limits = Classifier1D.classifyEqualRange(sortedData, numberClasses);
                break;
            case 3:
                limits = Classifier1D.classifyEqualNumber(sortedData, numberClasses);
                break;
            case 4:
                limits = Classifier1D.classifyMeanStandardDeviation(sortedData, numberClasses);
                break;
            case 5:
                limits = Classifier1D.classifyNaturalBreaks(sortedData, numberClasses);
                break;
            case 1:
            default:
                //TODO: change the default to create arbitrary ones???
                limits = Classifier1D.classifyMaxBreaks(sortedData, numberClasses);
                break;
        }
        double GVF = Classifier1D.calcGVF(sortedData, limits, SDAM);

        //========== optimize ==============
        for (int runs = 0; runs < maxRuns; runs++) {
            //-- move/adjust class boundaries
            double[] candidate = Classifier1D.adjustLimitsKMeans(sortedData, limits);
            if (candidate == limits) {
                //-- nothing was moved, so the fit can not change any more
                break;
            }
            //-- calc fit; GVF should move towards 1 (i.e. newGVF should be larger).
            //   The comparison is false for NaN, which also ends the optimisation.
            double newGVF = Classifier1D.calcGVF(sortedData, candidate, SDAM);
            if (newGVF - GVF > 0) {
                GVF = newGVF;
                limits = candidate;
            } else {
                break;
            }
        }
        return limits;
    }

    /**
     * Moves the limits, by assigning data points to the closest class mean value.
     * This approach is equal to the k-means procedure (see e.g. Duda, Hart and
     * Stork 2000, p. 526).
     * <p>
     * Classes without items (their mean is NaN) are ignored when looking for the
     * closest mean. The given <code>oldLimits</code> array itself is returned if no
     * item changes its class, or if the reassignment does not produce exactly
     * <code>oldLimits.length</code> class changes along the sorted data (i.e. a class
     * would vanish), because no complete set of limits can be derived then.
     *
     * @param data sorted data from min to max, e.g. use jmathtools DoubleArray.sort()
     * @param oldLimits the current break values
     * @return a double array of adjusted limits, or <code>oldLimits</code> if the
     *         limits could not be adjusted
     */
    public static double[] adjustLimitsKMeans(double[] data, double[] oldLimits) {
        int numberClasses = oldLimits.length + 1;
        int[] oldClasses = Classifier1D.classifyData(data, oldLimits);
        //-- calc class means
        double[] means = Classifier1D.calcClassMeans(data, oldClasses, numberClasses);

        //========== reclassify by assigning to closest mean ===========
        int[] newClasses = new int[data.length];
        boolean modified = false;
        for (int i = 0; i < data.length; i++) {
            int assignedClass = -1;
            double smallestDist = 0;
            for (int j = 0; j < means.length; j++) {
                if (Double.isNaN(means[j])) {
                    continue; // empty class: there is no mean to compare with
                }
                double dist = Math.abs(data[i] - means[j]);
                // strict comparison: on ties the lower class wins
                if (assignedClass < 0 || dist < smallestDist) {
                    assignedClass = j;
                    smallestDist = dist;
                }
            }
            newClasses[i] = assignedClass;
            //-- record changes
            if (assignedClass != oldClasses[i]) {
                modified = true;
            }
        }
        if (!modified) {
            return oldLimits;
        }

        //========= calc limits by observing changes in newClasses =========
        //-- this works because data items are ordered
        double[] newLimits = new double[oldLimits.length];
        int limitIdx = 0;
        for (int i = 1; i < data.length; i++) {
            if (newClasses[i - 1] != newClasses[i]) {
                if (limitIdx == newLimits.length) {
                    //-- more class changes than limits: input was not as expected
                    return oldLimits;
                }
                //-- change occurred => get limit
                newLimits[limitIdx] = 0.5 * (data[i - 1] + data[i]);
                limitIdx++;
            }
        }
        if (limitIdx != newLimits.length) {
            //-- a class vanished; the remaining entries would be meaningless zeros
            return oldLimits;
        }
        return newLimits;
    }

    /**
     * Classifies the given data according to the given limits.
     * <p>
     * The class bounds are inclusive on both sides, with the minimum and maximum of
     * the data as outermost bounds. A value that is equal to a limit is assigned to the
     * higher of the two classes. A value that fits no class gets the class ID -1 and a
     * message is printed to standard out.
     *
     * @param data the values to classify
     * @param limits The break/decision values between the classes. Highest and lowest values
     *        are not delivered. Example limits are for instance delivered by the
     *        Classifier1D.classifyEqualNumber() method.
     * @return array containing a class ID (0-based, or -1) for every item
     */
    public static int[] classifyData(double[] data, double[] limits) {
        int[] classes = new int[data.length];
        int nClasses = limits.length + 1;
        //============ get limits ===================
        //-- add min and max values as outer limits
        double[] finalLimits = new double[limits.length + 2];
        System.arraycopy(limits, 0, finalLimits, 1, limits.length);
        finalLimits[0] = DoubleArray.min(data);
        finalLimits[finalLimits.length - 1] = DoubleArray.max(data);
        //============ assign data to classes =============
        // Note: lowest and highest needs to be equal to the limit/break value
        for (int i = 0; i < data.length; i++) {
            //-- check from the highest class downwards and stop at the first hit,
            //   so that a value on a limit ends up in the higher class
            int assignedClass = -1;
            for (int j = nClasses - 1; j >= 0; j--) {
                if (Classifier1D.isInClass(data[i], finalLimits[j], finalLimits[j + 1])) {
                    assignedClass = j;
                    break;
                }
            }
            classes[i] = assignedClass;
            if (assignedClass == -1) {
                System.out.println("Classifier1D: could not classify point: " + i + " value:" + data[i] + " -- set class to -1");
            }
        }
        return classes;
    }

    /**
     * Checks if value is within limits.
     * Note: values equal to the bound values return "true"
     * (query: lowerBound <= val <= upperBound).
     *
     * @param val the value to test
     * @param lowerBound the lower bound (included)
     * @param upperBound the upper bound (included)
     * @return true if val is included between lowerBound (included) and upperBound (included)
     */
    public static boolean isInClass(double val, double lowerBound, double upperBound) {
        return (val <= upperBound) && (val >= lowerBound);
    }

    /**
     * SDAM (squared deviation [from] array mean): see B.D. Dent (1999, p. 148)
     * alternatively look for T.A. Slocum (1999, p. 73).
     * Used for Optimal Breaks Method.
     *
     * @param data the values
     * @return the sum of the squared deviations of all values from the array mean
     */
    public static double calcSDAM(double[] data) {
        double meanAll = StatisticSample.mean(data);
        double sum = 0;
        for (int i = 0; i < data.length; i++) {
            double deviation = data[i] - meanAll;
            sum = sum + (deviation * deviation);
        }
        return sum;
    }

    /**
     * SDCM (squared deviations [from] class means): see B.D. Dent (1999, p. 148)
     * alternatively look for T.A. Slocum (1999, p. 73).
     * Used for Optimal Breaks Method.
     * Items with the class ID -1 (unclassified) are ignored.
     * TODO : definition of SDCM (relative to SDAM)
     *
     * @param data the values
     * @param classes the classes for every item of the data array
     * @param classMeans the mean value of every class
     * @param numClasses the number of classes
     * @return the sum over all classes of the squared deviations of the class
     *         members from their class mean
     */
    public static double calcSDCM(double[] data, int[] classes, double[] classMeans, int numClasses) {
        double[] classSum = new double[numClasses];
        for (int i = 0; i < data.length; i++) {
            int z = classes[i];
            if (z == -1) {
                continue; // unclassified item, see classifyData()
            }
            double deviation = data[i] - classMeans[z];
            classSum[z] = classSum[z] + (deviation * deviation);
        }
        double sum = 0;
        for (int i = 0; i < classSum.length; i++) {
            sum = sum + classSum[i];
        }
        return sum;
    }

    /**
     * GVF (goodness of variance fit): see B.D. Dent (1999, p. 148)
     * alternatively look for T.A. Slocum (1999, p. 73).
     * Used for Optimal Breaks Method.
     *
     * @param SDAM squared deviation [from] array mean
     * @param SDCM squared deviation [from] class mean
     * @return the Goodness of Variance Fit, i.e. (SDAM - SDCM) / SDAM. The result
     *         is not a finite number if SDAM is 0.
     */
    public static double calcGVF(double SDAM, double SDCM) {
        return (SDAM - SDCM) / SDAM;
    }

    /**
     * GVF (goodness of variance fit): see B.D. Dent (1999, p. 148)
     * alternatively look for T.A. Slocum (1999, p. 73).
     * Used for Optimal Breaks Method.
     *
     * @param data the values
     * @param limits The break/decision values between the classes. Highest and lowest values
     *        are not delivered. Example limits are for instance delivered by the
     *        Classifier1D.classifyEqualNumber() method.
     * @param SDAM squared deviation [from] array mean
     * @return the Goodness of Variance Fit of the classification described by limits
     */
    public static double calcGVF(double[] data, double[] limits, double SDAM) {
        int numberClasses = limits.length + 1;
        //-- assign to class with the given limits
        int[] classes = Classifier1D.classifyData(data, limits);
        //-- calc class mean values
        double[] means = Classifier1D.calcClassMeans(data, classes, numberClasses);
        //-- calc SDCM (squared deviations, class means)
        double SDCM = Classifier1D.calcSDCM(data, classes, means, numberClasses);
        //-- calc Goodness of Variance fit (GVF)
        return Classifier1D.calcGVF(SDAM, SDCM);
    }

    /**
     * Calculates the mean value of every class.
     * Items with the class ID -1 (unclassified) are ignored.
     *
     * @param data input data
     * @param classes the vector containing the information on the class for an item
     * @param numClasses the number of classes
     * @return the mean of the members of every class; NaN for a class without members
     */
    public static double[] calcClassMeans(double[] data, int[] classes, int numClasses) {
        double[] means = new double[numClasses];
        double[] sumC = new double[numClasses];
        int[] countCMembers = new int[numClasses];
        for (int i = 0; i < data.length; i++) {
            int c = classes[i];
            if (c != -1) {
                sumC[c] = sumC[c] + data[i];
                countCMembers[c] = countCMembers[c] + 1;
            }
        }
        for (int i = 0; i < means.length; i++) {
            // floating point division: an empty class yields 0.0 / 0 = NaN
            means[i] = sumC[i] / countCMembers[i];
        }
        return means;
    }
}