package ca.pfv.spmf.algorithms.clustering.optics;
/* This file is copyright (c) 2008-2013 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.PriorityQueue;
import ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceEuclidian;
import ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceFunction;
import ca.pfv.spmf.datastructures.kdtree.KDTree;
import ca.pfv.spmf.datastructures.kdtree.KNNPoint;
import ca.pfv.spmf.patterns.cluster.Cluster;
import ca.pfv.spmf.patterns.cluster.DoubleArray;
import ca.pfv.spmf.tools.MemoryLogger;
/* This file is copyright (c) 2008-2015 Philippe Fournier-Viger
*
* This file is part of the SPMF DATA MINING SOFTWARE
* (http://www.philippe-fournier-viger.com/spmf).
*
* SPMF is free software: you can redistribute it and/or modify it under the
* terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
* SPMF is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
* A PARTICULAR PURPOSE. See the GNU General Public License for more details.
* You should have received a copy of the GNU General Public License along with
* SPMF. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* An implementation of the OPTICS algorithm (Ester et al., 1996). Note that
* original algorithm suggested using a R*-tree to index points to have a log(n)
* complexity instead of a O(n^2) complexity. In this implementation, we instead
* used a KD-Tree, which also have a log(n) complexity but may perform
* less well than a R*-tree. The OPTICS algorithm was
* originally published in: <br/>
* <br/>
*
* Mihael Ankerst, Markus M. Breunig, Hans-Peter Kriegel, J�rg Sander (1999).
* OPTICS: Ordering Points To Identify the Clustering Structure. ACM SIGMOD
* international conference on Management of data. ACM Press. pp. 49�60.
*
* @author Philippe Fournier-Viger, 2015
*/
public class AlgoOPTICS {
/** the time for extracting the cluster ordering */
protected long timeExtractClusterOrdering;
/* The distance function to be used for clustering */
DistanceFunction distanceFunction = new DistanceEuclidian();
/*
* This KD-Tree is used to index the data points for fast access to points
* in the epsilon radius
*/
KDTree kdtree;
/** Variable to store the cluster-ordering found by OPTICS */
List<DoubleArrayOPTICS> clusterOrdering = null;
/** The clusters found by the OPTICS algorithm */
List<Cluster> clusters = null;
/**
* Default constructor
*/
public AlgoOPTICS() {
}
/**
* Run the OPTICS algorithm
*
* @param inputFile
* an input file path containing a list of vectors of double
* values
* @param minPts
* the minimum number of points (see DBScan article)
* @param epsilon
* the epsilon distance (see DBScan article)
* @param seaparator
* the string that is used to separate double values on each line
* of the input file (default: single space)
* @return a list of clusters (some of them may be empty)
* @throws IOException
* exception if an error while writing the file occurs
*/
public List<DoubleArrayOPTICS> computerClusterOrdering(String inputFile,
int minPts, double epsilon, String separator)
throws NumberFormatException, IOException {
// record the start time
timeExtractClusterOrdering = 0;
long startTimestampClusterOrdering = System.currentTimeMillis();
// Structure to store the vectors from the file
List<DoubleArray> points = new ArrayList<DoubleArray>();
// read the vectors from the input file
BufferedReader reader = new BufferedReader(new FileReader(inputFile));
String line;
// for each line until the end of the file
while (((line = reader.readLine()) != null)) {
// if the line is a comment, is empty or is a
// kind of metadata
if (line.isEmpty() == true || line.charAt(0) == '#'
|| line.charAt(0) == '%' || line.charAt(0) == '@') {
continue;
}
line = line.trim();
// split the line by spaces
String[] lineSplited = line.split(separator);
// create a vector of double
double[] vector = new double[lineSplited.length];
// for each value of the current line
for (int i = 0; i < lineSplited.length; i++) {
// convert to double
double value = Double.parseDouble(lineSplited[i]);
// add the value to the current vector
vector[i] = value;
}
// add the vector to the list of vectors
points.add(new DoubleArrayOPTICS(vector));
}
// close the file
reader.close();
// build kd-tree
kdtree = new KDTree();
kdtree.buildtree(points);
// For debugging, you can print the KD-Tree by uncommenting the
// following line:
// System.out.println(kdtree.toString());
// Variable to store the order of points generated by OPTICS
clusterOrdering = new ArrayList<DoubleArrayOPTICS>();
// For each point in the dataset
for (DoubleArray point : points) {
// if the node is already visited, we skip it
DoubleArrayOPTICS pointDBS = (DoubleArrayOPTICS) point;
if (pointDBS.visited == false) {
// process this point
expandClusterOrder(pointDBS, clusterOrdering, epsilon, minPts);
}
}
// check memory usage
MemoryLogger.getInstance().checkMemory();
// record end time
timeExtractClusterOrdering = System.currentTimeMillis()
- startTimestampClusterOrdering;
kdtree = null;
// return the clusters
return clusterOrdering;
}
/**
* This method extract cluster from the cluster-ordering using a DBScan
* based approach, as proposed in the OPTICS paper. However, as explained in
* the OPTICS paper, it is better to use the extractClusters() method to
* find the clusters than this method. But I have implemented it.
*
* @return a list of clusters (some of them may be empty)
*/
public List<Cluster> extractDBScan(int minPts, double epsilonPrime)
throws IOException {
clusters = new ArrayList<Cluster>();
Cluster currentCluster = new Cluster();
// for each object (point) of the cluster ordering
for (DoubleArrayOPTICS objectOP : clusterOrdering) {
// if object is not densiy-reachable with respect to e' and minPts
// from a preceding object in the order, it means that the current
// object is not part of the current cluster
if (objectOP.reachabilityDistance > epsilonPrime) {
// we look a the core distance of object and start a new cluster
// if object is a core object w.r.t e' and minPTs
if (objectOP.core_distance <= epsilonPrime) {
// add previous cluster to the list of clusters if not empty
if (currentCluster.getVectors().size() > 0) {
clusters.add(currentCluster);
}
// create a new cluster for the current object
currentCluster = new Cluster();
currentCluster.addVector(objectOP);
}// else, it is noise
} else {
// the object belongs to the current cluster, so we add it
currentCluster.addVector(objectOP);
}
}
// add the current cluster to the list of clusters if not empty
if (currentCluster.getVectors().size() > 0) {
clusters.add(currentCluster);
}
return clusters;
}
// /**
// * This method used the ordered points (objects) generated by OPTICS to
// * produce cluster. It is the ExtractClusters() method as based on the
// * OPTICS article.
// *
// * @param orderedFile
// * the ordered points generated by OPTICS
// * @return a list of clusters
// */
// public List<Cluster> extractClusters(double xi, int minPts) {
//
// // Initialize set of clusters
// clusters = new ArrayList<Cluster>();
//
// // Initialize set of steep down areas
// List<SteepDownArea> steepDownAreas = new ArrayList<SteepDownArea>();
//
// // Initialize index and mib variables to 0
// int index = 0;
// double mib = 0;
//
// // For each point in the ordering
// while (index < clusterOrdering.size()) {
// double rIndex = clusterOrdering.get(index).reachabilityDistance;
// // Update Mib value
// mib = Math.max(mib, rIndex);
//
// // Try to get a steep down area starting from this point
// SteepDownArea downArea = getSteepDownArea(xi,index, rIndex, minPts);
//
// // IF(start of a steep down area D at index)
// if (downArea != null) {
// // update mib-values and filter SetOfSteepDownAreas(*)
//
// // set D.mib = 0
// downArea.mib = 0;
//
// // add D to the SetOfSteepDownAreas
// steepDownAreas.add(downArea);
//
// // index = end of D + 1;
// index = downArea.endIndex+1;
//
// // mib = r(index)
// mib = clusterOrdering.get(index).reachabilityDistance;
// } else {
//
// // Try to get a steep down area starting from this point
// SteepUpArea upArea = getSteepUpArea(xi,index, rIndex, minPts);
//
//
// // ELSE IF(start of steep up area U at index)
// if (upArea != null) {
// // update mib-values and filter SetOfSteepDownAreas
//
// // index = end of U + 1;
// index = upArea.endIndex +1;
//
// mib = rIndex;
//
// // FOR EACH D in SetOfSteepDownAreas DO
// for (SteepDownArea D : steepDownAreas) {
// // IF(combination of D and U is valid AND(**)
// // satisfies cluster conditions 1, 2, 3a)
// // compute [s, e] add cluster to SetOfClusters
//
// }
// } else {
// index++;
// }
// }
//
// }
// System.out.println("Nb of down areas :" + steepDownAreas.size());
// return clusters;
// }
//
//
// /**
// * Get the steep up area starting at a given point
// * @param xi the xi parameter
// * @param index the current index
// * @param rIndex the reachability distance at position index
// * @param minPts the minPts parameter
// * @return a steep up area if there is one, otherwise null
// */
// private SteepUpArea getSteepUpArea(double xi, int index, double rIndex, int minPts) {
//
// // First, check if it is the current index is a steep up point
// if(isSteepUpPoint(xi, index, rIndex) == false) {
// return null;
// }
//
// // we will start from the next point
// int lastlySeenSteepPoint = index;
// int currentIndex = index+1;
// int nbOfConsecutiveNonSteepPoints = 0;
// double rPredecessor = rIndex;
//
//
// // for each point until the end
// while(currentIndex < clusterOrdering.size()) {
//
// // if the current point is larger than its predecessor,
// // then it is not part of a down area
// double rCurrent = clusterOrdering.get(currentIndex).reachabilityDistance;
// if(rCurrent < rPredecessor) {
// break;
// }
//
// // now we check if the current point is a steep down point
// boolean isSteepUpPoint = isSteepUpPoint(xi, currentIndex, rCurrent);
// // if not
// if(isSteepUpPoint == false) {
// // then we increment the number of non Steep up point
// nbOfConsecutiveNonSteepPoints++;
// // if we reach enough, we stop and set
// // the end index to the last steep up point position
// if(nbOfConsecutiveNonSteepPoints > minPts) {
// break;
// }
// }else {
// // if it is a steep up point, we remember it
// // and reset the number of consecutive steep up point
// lastlySeenSteepPoint = currentIndex;
// nbOfConsecutiveNonSteepPoints = 0;
// }
// // we move to the next point
// currentIndex++;
// rPredecessor = rCurrent;
// }
//
// // this area is not big enough
// if((lastlySeenSteepPoint - index)+1 < minPts) {
// return null;
// }
//
// return new SteepUpArea(index, lastlySeenSteepPoint);
// }
//
// /**
// * Get the steep down area starting at a given point
// * @param xi the xi parameter
// * @param index the current index
// * @param rIndex the reachability distance at position index
// * @param minPts the minPts parameter
// * @return a steep down area if there is one, otherwise null
// */
// private SteepDownArea getSteepDownArea(double xi, int index, double rIndex, int minPts) {
//
// // First, check if it is the current index is a steep down point
// if( isSteepDownPoint(xi, index, rIndex) == false) {
// return null;
// }
//
// // we will start from the next point
// int lastlySeenSteepDownPoint = index;
// int currentIndex = index+1;
// int nbOfConsecutiveNonSteepDownPoints = 0;
// double rPredecessor = rIndex;
//
//
// // for each point until the end
// while(currentIndex < clusterOrdering.size()) {
//
// // if the current point is larger than its predecessor,
// // then it is not part of a down area
// double rCurrent = clusterOrdering.get(currentIndex).reachabilityDistance;
// if(rCurrent > rPredecessor) {
// break;
// }
//
// // now we check if the current point is a steep down point
// boolean isSteepDownPoint = isSteepDownPoint(xi, currentIndex, rCurrent);
// // if not
// if(isSteepDownPoint == false) {
// // then we increment the number of non Steep down point
// nbOfConsecutiveNonSteepDownPoints++;
// // if we reach enough, we stop and set
// // the end index to the last steep down point position
// if(nbOfConsecutiveNonSteepDownPoints > minPts) {
// break;
// }
// }else {
// // if it is a steep down point, we remember it
// // and reset the number of consecutive steep down point
// lastlySeenSteepDownPoint = currentIndex;
// nbOfConsecutiveNonSteepDownPoints = 0;
// }
// // we move to the next point
// currentIndex++;
// rPredecessor = rCurrent;
// }
//
// // this area is not big enough
// if((lastlySeenSteepDownPoint - index)+1 < minPts) {
// return null;
// }
//
// return new SteepDownArea(index, lastlySeenSteepDownPoint, 0);
// }
//
// /**
// * This method checks if a point at position "index" is a steep up point
// * @param xi the xi parameter
// * @param index the index position
// * @param rIndex the reachability distance of point at position "index"
// * @return true if it is a steep up point
// */
// private boolean isSteepUpPoint(double xi, int index, double rIndex) {
// // if not the last point
// if (index != clusterOrdering.size()-1) {
// double rSuccessor = clusterOrdering.get(index + 1).reachabilityDistance;
// // we check
// return rIndex <= (1 - xi) * rSuccessor;
// }
// return false;
// }
//
// /**
// * This method checks if a point at position "index" is a steep down point
// * @param xi the xi parameter
// * @param index the index position
// * @param rIndex the reachability distance of point at position "index"
// * @return true if it is a steep down point
// */
// private boolean isSteepDownPoint(double xi, int index, double rIndex) {
// // if not the last point
// if (index != clusterOrdering.size()-1) {
// double rSuccessor = clusterOrdering.get(index + 1).reachabilityDistance;
// return rIndex * (1 - xi) >= rSuccessor;
// }
// return false;
// }
//
// /**
// * A steep down area as used by OPTICS
// *
// * @author Philippe Fournier-Viger
// */
// public class SteepDownArea {
// /** start index */
// public int startIndex;
// /** end index */
// public int endIndex;
// /** mib value */
// public double mib;
//
// /**
// * Constructor of a steep down area
// *
// * @param startIndex
// * start index
// * @param endIndex
// * end index
// * @param mibValue
// * mib value
// */
// public SteepDownArea(int startIndex, int endIndex, double mibValue) {
// this.startIndex = startIndex;
// this.endIndex = endIndex;
// this.mib = mibValue;
// }
// }
//
// /**
// * A steep down area as used by OPTICS
// *
// * @author Philippe Fournier-Viger
// */
// public class SteepUpArea {
// /** start index */
// public int startIndex;
// /** end index */
// public int endIndex;
//
// /**
// * Constructor of a steep down area
// *
// * @param startIndex
// * start index
// * @param endIndex
// * end index
// * @param mibValue
// * mib value
// */
// public SteepUpArea(int startIndex, int endIndex) {
// this.startIndex = startIndex;
// this.endIndex = endIndex;
// }
// }
/**
* The DBScan expandCluster() method
*
* @param object
* the current point
* @param orderedFile
* the current order of points generated by OPTICS
* @param epsilon
* the epsilon parameter
* @param minPts
* the minPts parameter
*/
private void expandClusterOrder(DoubleArrayOPTICS pointDBS,
List<DoubleArrayOPTICS> orderedFile, double epsilon, int minPts) {
// find the neighboors of this point with their distance
List<KNNPoint> neighbors = kdtree.pointsWithinRadiusOfWithDistance(
pointDBS, epsilon);
// mark the point as visited
pointDBS.visited = true;
// ********** NEXT LINE, WE USE EPSILON AS MAX DISTANCE******* ///
pointDBS.reachabilityDistance = Double.POSITIVE_INFINITY; // /
// Double.POSITIVE_INFINITY
pointDBS.setCoreDistance(neighbors, epsilon, minPts);
// add the current point to the order
orderedFile.add(pointDBS);
if (pointDBS.core_distance != Double.POSITIVE_INFINITY) {
// Create the orderSeeds structure to store points ordered by
// increasing reachability-distances
PriorityQueue<DoubleArrayOPTICS> orderSeeds = new PriorityQueue<DoubleArrayOPTICS>();
update(neighbors, pointDBS, orderSeeds);
while (orderSeeds.isEmpty() == false) {
DoubleArrayOPTICS currentObject = (DoubleArrayOPTICS) orderSeeds
.poll();
List<KNNPoint> neighborsCurrent = kdtree
.pointsWithinRadiusOfWithDistance(pointDBS, epsilon);
// mark the point as visited
currentObject.visited = true;
currentObject.setCoreDistance(neighborsCurrent, epsilon, minPts);
// add the current point to the order
orderedFile.add(currentObject);
if (currentObject.core_distance != Double.POSITIVE_INFINITY) {
update(neighborsCurrent, currentObject, orderSeeds);
}
}
}
// check memory usage
MemoryLogger.getInstance().checkMemory();
}
/**
* Update the orderSeeds w.r.t to the current object
*
* @param neighbors
* the neighbors of the current object
* @param centerObject
* the current object
* @param orderSeeds
* the orderSeeds structure
*/
private void update(List<KNNPoint> neighbors,
DoubleArrayOPTICS centerObject, PriorityQueue<DoubleArrayOPTICS> orderSeeds) {
double cDist = centerObject.core_distance;
// FOR all object from neighbors DO:
for (KNNPoint object : neighbors) {
// if the object has not been visited yet
DoubleArrayOPTICS objectOP = (DoubleArrayOPTICS) object.values;
if (objectOP.visited == false) {
double newRDistance = Math.max(cDist, distanceFunction
.calculateDistance(objectOP, centerObject));
// if not already in orderSeeds
if (objectOP.reachabilityDistance == Double.POSITIVE_INFINITY) {
objectOP.reachabilityDistance = newRDistance;
orderSeeds.add(objectOP);
} else {
// the object was already in orderSeeds
if (newRDistance < objectOP.reachabilityDistance) {
objectOP.reachabilityDistance = newRDistance;
// ******** THE FOLLOWING CODE MIGHT BE OPTIMIZED IN A
// BETTER WAY.... *****\\\\\
// Currently, we just remove and insert again...
orderSeeds.remove(objectOP);
orderSeeds.add(objectOP);
}
}
}
}
// check memory usage
MemoryLogger.getInstance().checkMemory();
}
/**
* Save the clusters to an output file
*
* @param output
* the output file path
* @throws IOException
* exception if there is some writing error.
*/
public void saveToFile(String output) throws IOException {
BufferedWriter writer = new BufferedWriter(new FileWriter(output));
// for each cluster
for (int i = 0; i < clusters.size(); i++) {
// if the cluster is not empty
if (clusters.get(i).getVectors().size() >= 1) {
// write the cluster
writer.write(clusters.get(i).toString());
// if not the last cluster, add a line return
if (i < clusters.size() - 1) {
writer.newLine();
}
}
}
// close the file
writer.close();
}
/**
* Save the cluster ordering to a file
* @param output the output file path
* @throws IOException if error while writting to file
*/
public void saveClusterOrderingToFile(String output) throws IOException {
BufferedWriter writer = new BufferedWriter(new FileWriter(output));
// for each cluster
// Print the cluster-ordering of points to the console (for debugging)
for(DoubleArrayOPTICS arrayOP : clusterOrdering) {
writer.write(" ");
writer.write(Arrays.toString(arrayOP.data));
writer.write(" - ");
writer.write(Double.toString(arrayOP.reachabilityDistance));
writer.newLine();
}
// close the file
writer.close();
}
/**
* Print statistics of the latest execution to System.out.
*/
public void printStatistics() {
System.out.println("========== OPTICS - STATS ============");
System.out.println(" Time ExtractClusterOrdering() ~: "
+ timeExtractClusterOrdering + " ms");
System.out.println(" Max memory:"
+ MemoryLogger.getInstance().getMaxMemory() + " mb ");
// System.out.println(" Number of noise points: " +
// numberOfNoisePoints);
System.out.println("=====================================");
}
}