package ca.pfv.spmf.algorithms.clustering.dbscan; /* This file is copyright (c) 2008-2013 Philippe Fournier-Viger * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). * * SPMF is free software: you can redistribute it and/or modify it under the * terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU General Public License for more details. * You should have received a copy of the GNU General Public License along with * SPMF. If not, see <http://www.gnu.org/licenses/>. */ import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.List; import ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceEuclidian; import ca.pfv.spmf.algorithms.clustering.distanceFunctions.DistanceFunction; import ca.pfv.spmf.datastructures.kdtree.KDTree; import ca.pfv.spmf.patterns.cluster.Cluster; import ca.pfv.spmf.patterns.cluster.DoubleArray; import ca.pfv.spmf.tools.MemoryLogger; /* This file is copyright (c) 2008-2015 Philippe Fournier-Viger * * This file is part of the SPMF DATA MINING SOFTWARE * (http://www.philippe-fournier-viger.com/spmf). * * SPMF is free software: you can redistribute it and/or modify it under the * terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * SPMF is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR * A PARTICULAR PURPOSE. See the GNU General Public License for more details. * You should have received a copy of the GNU General Public License along with * SPMF. If not, see <http://www.gnu.org/licenses/>. */ /** * An implementation of the DBSCAN algorithm (Ester et al., 1996). * Note that original algorithm suggested using a R*-tree to index points * to avoid having a O(n^2) complexity, but we instead used a KD-Tree. * The DBScan algorithm was originally published in: * <br/><br/> * * Ester, Martin; Kriegel, Hans-Peter; Sander, J�rg; Xu, Xiaowei (1996). Simoudis, Evangelos; * Han, Jiawei; Fayyad, Usama M., eds. A density-based algorithm for discovering clusters in * large spatial databases with noise. Proceedings of the Second International Conference on Knowledge * Discovery and Data Mining (KDD-96). AAAI Press. pp. 226�231. * * @author Philippe Fournier-Viger */ public class AlgoDBSCAN { // The list of clusters generated protected List<Cluster> clusters = null; // For statistics protected long startTimestamp; // the start time of the latest execution protected long endTimestamp; // the end time of the latest execution long numberOfNoisePoints; // the number of iterations that was performed /* The distance function to be used for clustering */ DistanceFunction distanceFunction = new DistanceEuclidian(); /* This KD-Tree is used to index the data points for fast access to points in the epsilon radius*/ KDTree kdtree; /** * Default constructor */ public AlgoDBSCAN() { } /** * Run the DBSCAN algorithm * @param inputFile an input file path containing a list of vectors of double values * @param minPts the minimum number of points (see DBScan article) * @param epsilon the epsilon distance (see DBScan article) * @param seaparator the string that is used to separate double values on each line of the input file (default: single space) * @return a list of clusters (some of them may be empty) * @throws IOException exception if an error while writing the file occurs */ public List<Cluster> runAlgorithm(String inputFile, int minPts, double epsilon, String separator) throws NumberFormatException, IOException { // record the start time startTimestamp = System.currentTimeMillis(); // reset the number of noise points to 0 numberOfNoisePoints =0; // Structure to store the vectors from the file List<DoubleArray> points = new ArrayList<DoubleArray>(); // read the vectors from the input file BufferedReader reader = new BufferedReader(new FileReader(inputFile)); String line; // for each line until the end of the file while (((line = reader.readLine()) != null)) { // if the line is a comment, is empty or is a // kind of metadata if (line.isEmpty() == true || line.charAt(0) == '#' || line.charAt(0) == '%' || line.charAt(0) == '@') { continue; } // split the line by spaces String[] lineSplited = line.split(separator); // create a vector of double double [] vector = new double[lineSplited.length]; // for each value of the current line for (int i=0; i< lineSplited.length; i++) { // convert to double double value = Double.parseDouble(lineSplited[i]); // add the value to the current vector vector[i] = value; } // add the vector to the list of vectors points.add(new DoubleArrayDBS(vector)); } // close the file reader.close(); // build kd-tree kdtree = new KDTree(); kdtree.buildtree(points); // For debugging, you can print the KD-Tree by uncommenting the following line: // System.out.println(kdtree.toString()); // Create a single cluster and return it clusters = new ArrayList<Cluster>(); // For each point in the dataset for(DoubleArray point: points) { // if the node is already visited, we skip it DoubleArrayDBS pointDBS = (DoubleArrayDBS) point; if(pointDBS.visited == true) { continue; } // mark the point as visited pointDBS.visited = true; // find the neighboors of this point List<DoubleArray> neighboors = kdtree.pointsWithinRadiusOf(pointDBS, epsilon); // if it is not noise if(neighboors.size() >= minPts -1) { // - 1 because we don't count the point itself in its neighborood // create a new cluster Cluster cluster = new Cluster(); clusters.add(cluster); // transitively add all points that can be reached expandCluster(pointDBS, neighboors, cluster, epsilon, minPts); }else { // it is noise numberOfNoisePoints++; } } kdtree = null; // check memory usage MemoryLogger.getInstance().checkMemory(); // record end time endTimestamp = System.currentTimeMillis(); // return the clusters return clusters; } /** * The DBScan expandCluster() method * @param currentPoint the current point * @param neighboors the neighboors of the current point * @param cluster the current cluster * @param epsilon the epsilon parameter * @param minPts the minPts parameter */ private void expandCluster(DoubleArrayDBS currentPoint, List<DoubleArray> neighboors, Cluster cluster, double epsilon, int minPts) { // add the current point to the cluster cluster.addVector(currentPoint); // for each neighboor for(DoubleArray newPoint: neighboors) { DoubleArrayDBS newPointDBS = (DoubleArrayDBS) newPoint; // if this point has not been visited yet if(newPointDBS.visited == false) { // mark the point as visited newPointDBS.visited = true; // find the neighboors of this point List<DoubleArray> newNeighboors = kdtree.pointsWithinRadiusOf(newPointDBS, epsilon); // if this point is not noise if(newNeighboors.size() >= minPts - 1) { // - 1 because we don't count the point itself in its neighborood expandCluster(newPointDBS, newNeighboors, cluster, epsilon, minPts); }else { // it is noise numberOfNoisePoints++; } } } // check memory usage MemoryLogger.getInstance().checkMemory(); } /** * Save the clusters to an output file * @param output the output file path * @throws IOException exception if there is some writing error. */ public void saveToFile(String output) throws IOException { BufferedWriter writer = new BufferedWriter(new FileWriter(output)); // for each cluster for(int i=0; i< clusters.size(); i++){ // if the cluster is not empty if(clusters.get(i).getVectors().size() >= 1){ // write the cluster writer.write(clusters.get(i).toString()); // if not the last cluster, add a line return if(i < clusters.size()-1){ writer.newLine(); } } } // close the file writer.close(); } /** * Print statistics of the latest execution to System.out. */ public void printStatistics() { System.out.println("========== DBSCAN - STATS ============"); System.out.println(" Total time ~: " + (endTimestamp - startTimestamp) + " ms"); System.out.println(" Max memory:" + MemoryLogger.getInstance().getMaxMemory() + " mb "); System.out.println(" Number of noise points: " + numberOfNoisePoints); System.out.println("====================================="); } }