SpectralClustering.java example

Explorer
smile-master
/*******************************************************************************
 * Copyright (c) 2010 Haifeng Li
 *   
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *  
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package smile.clustering;

import java.io.Serializable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import smile.math.Math;
import smile.math.matrix.ColumnMajorMatrix;
import smile.math.matrix.DenseMatrix;
import smile.math.matrix.EigenValueDecomposition;
import smile.math.matrix.Lanczos;

/**
 * Spectral Clustering. Given a set of data points, the similarity matrix may
 * be defined as a matrix S where S<sub>ij</sub> represents a measure of the
 * similarity between points. Spectral clustering techniques make use of the
 * spectrum of the similarity matrix of the data to perform dimensionality
 * reduction for clustering in fewer dimensions. Then the clustering will
 * be performed in the dimension-reduce space, in which clusters of non-convex
 * shape may become tight. There are some intriguing similarities between
 * spectral clustering methods and kernel PCA, which has been empirically
 * observed to perform clustering.
 *
 * <h2>References</h2>
 * <ol>
 * <li> A.Y. Ng, M.I. Jordan, and Y. Weiss. On Spectral Clustering: Analysis and an algorithm. NIPS, 2001. </li>
 * <li> Marina Maila and Jianbo Shi. Learning segmentation by random walks. NIPS, 2000. </li>
 * <li> Deepak Verma and Marina Meila. A Comparison of Spectral Clustering Algorithms. 2003. </li>
 * </ol>
 * 
 * @author Haifeng Li
 */
public class SpectralClustering implements Serializable {
    private static final long serialVersionUID = 1L;
    private static final Logger logger = LoggerFactory.getLogger(SpectralClustering.class);

    /**
     * The number of clusters.
     */
    private int k;
    /**
     * The cluster labels of data.
     */
    private int[] y;
    /**
     * The number of samples in each cluster.
     */
    private int[] size;
    /**
     * The width of Gaussian kernel.
     */
    private double sigma;
    /**
     * The distortion in feature space.
     */
    private double distortion;

    /**
     * Constructor. Spectral graph clustering.
     * @param W the adjacency matrix of graph.
     * @param k the number of clusters.
     */
    public SpectralClustering(double[][] W, int k) {
        if (k < 2) {
            throw new IllegalArgumentException("Invalid number of clusters: " + k);
        }

        this.k = k;        
        int n = W.length;
        
        for (int i = 0; i < n; i++) {
            if (W[i].length != n) {
                throw new IllegalArgumentException("The adjacency matrix is not square.");
            }
            
            if (W[i][i] != 0.0) {
                throw new IllegalArgumentException(String.format("Vertex %d has self loop: ", i));
            }
            
            for (int j = 0; j < i; j++) {
                if (W[i][j] != W[j][i]) {
                    throw new IllegalArgumentException("The adjacency matrix is not symmetric.");                    
                }
                
                if (W[i][j] < 0.0) {
                    throw new IllegalArgumentException("Negative entry of adjacency matrix: " + W[i][j]);                    
                }
            }
        }
        
        double[] D = new double[n];
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < n; j++) {
                D[i] += W[i][j];
            }
            
            if (D[i] == 0.0) {
                throw new IllegalArgumentException("Isolated vertex: " + i);                    
            }
            
            D[i] = 1.0 / Math.sqrt(D[i]);
        }

        DenseMatrix L = new ColumnMajorMatrix(n, n);
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < i; j++) {
                double l = D[i] * W[i][j] * D[j];
                L.set(i, j, l);
                L.set(j, i, l);
            }
        }
        
        EigenValueDecomposition eigen = Lanczos.eigen(L, k);
        double[][] Y = eigen.getEigenVectors().array();
        for (int i = 0; i < n; i++) {
            Math.unitize2(Y[i]);
        }

        KMeans kmeans = new KMeans(Y, k);
        distortion = kmeans.distortion;
        y = kmeans.getClusterLabel();
        size = kmeans.getClusterSize();
    }

    /**
     * Constructor. Spectral clustering the data.
     * @param data the dataset for clustering.
     * @param k the number of clusters.
     * @param sigma the smooth/width parameter of Gaussian kernel, which
     * is a somewhat sensitive parameter. To search for the best setting,
     * one may pick the value that gives the tightest clusters (smallest
     * distortion, see {@link #distortion()}) in feature space.
     */
    public SpectralClustering(double[][] data, int k, double sigma) {
        if (k < 2) {
            throw new IllegalArgumentException("Invalid number of clusters: " + k);
        }

        if (sigma <= 0.0) {
            throw new IllegalArgumentException("Invalid standard deviation of Gaussian kernel: " + sigma);
        }

        this.k = k;
        this.sigma = sigma;

        int n = data.length;
        double gamma = -0.5 / (sigma * sigma);

        DenseMatrix W = new ColumnMajorMatrix(n, n);
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < i; j++) {
                double w = Math.exp(gamma * Math.squaredDistance(data[i], data[j]));
                W.set(i, j, w);
                W.set(j, i, w);
            }
        }
        
        double[] D = new double[n];
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < n; j++) {
                D[i] += W.get(i, j);
            }

            if (D[i] < 1E-5) {
                logger.error(String.format("Small D[%d] = %f. The data may contain outliers.", i, D[i]));
            }
            
            D[i] = 1.0 / Math.sqrt(D[i]);
        }

        DenseMatrix L = W;
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < i; j++) {
                double l = D[i] * W.get(i, j) * D[j];
                L.set(i, j, l);
                L.set(j, i, l);
            }
        }

        EigenValueDecomposition eigen = Lanczos.eigen(L, k);
        double[][] Y = eigen.getEigenVectors().array();
        for (int i = 0; i < n; i++) {
            Math.unitize2(Y[i]);
        }

        KMeans kmeans = new KMeans(Y, k);
        distortion = kmeans.distortion;
        y = kmeans.getClusterLabel();
        size = kmeans.getClusterSize();
    }

    /**
     * Constructor. Spectral clustering with Nystrom approximation.
     * @param data the dataset for clustering.
     * @param l the number of random samples for Nystrom approximation.
     * @param k the number of clusters.
     * @param sigma the smooth/width parameter of Gaussian kernel, which
     * is a somewhat sensitive parameter. To search for the best setting,
     * one may pick the value that gives the tightest clusters (smallest
     * distortion, see {@link #distortion()}) in feature space.
     */
    public SpectralClustering(double[][] data, int k, int l, double sigma) {
        if (l < k || l >= data.length) {
            throw new IllegalArgumentException("Invalid number of random samples: " + l);
        }
        
        if (k < 2) {
            throw new IllegalArgumentException("Invalid number of clusters: " + k);
        }

        if (sigma <= 0.0) {
            throw new IllegalArgumentException("Invalid standard deviation of Gaussian kernel: " + sigma);
        }
        
        this.k = k;
        this.sigma = sigma;

        int n = data.length;
        double gamma = -0.5 / (sigma * sigma);

        int[] index = Math.permutate(n);
        double[][] x = new double[n][];
        for (int i = 0; i < n; i++) {
            x[i] = data[index[i]];
        }
        data = x;
        
        DenseMatrix C = new ColumnMajorMatrix(n, l);
        double[] D = new double[n];
        for (int i = 0; i < n; i++) {
            double sum = 0.0;
            for (int j = 0; j < n; j++) {
                if (i != j) {
                    double w = Math.exp(gamma * Math.squaredDistance(data[i], data[j]));
                    sum += w;
                    if (j < l) {
                        C.set(i, j, w);
                    }
                }
            }
            
            if (sum < 1E-5) {
                logger.error(String.format("Small D[%d] = %f. The data may contain outliers.", i, sum));
            }
            
            D[i] = 1.0 / Math.sqrt(sum);
        }
        
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < l; j++) {
                C.set(i, j, D[i] * C.get(i, j) * D[j]);
            }
        }

        DenseMatrix W = new ColumnMajorMatrix(l, l);
        for (int i = 0; i < l; i++) {
            for (int j = 0; j < l; j++) {
                W.set(i, j, C.get(i, j));
            }
        }
        
        EigenValueDecomposition eigen = Lanczos.eigen(W, k);
        double[] e = eigen.getEigenValues();
        double scale = Math.sqrt((double)l / n);
        for (int i = 0; i < k; i++) {
            if (e[i] <= 0.0) {
                throw new IllegalStateException("Non-positive eigen value: " + e[i]);
            }
            
            e[i] = scale / e[i];
        }
        
        DenseMatrix U = eigen.getEigenVectors();
        for (int i = 0; i < l; i++) {
            for (int j = 0; j < k; j++) {
                U.mul(i, j, e[j]);
            }
        }
        
        double[][] Y = C.abmm(U).array();
        for (int i = 0; i < n; i++) {
            Math.unitize2(Y[i]);
        }

        KMeans kmeans = new KMeans(Y, k);
        distortion = kmeans.distortion;
        size = kmeans.getClusterSize();

        int[] label = kmeans.getClusterLabel();
        y = new int[n];
        for (int i = 0; i < n; i++) {
            y[index[i]] = label[i];
        }
    }
    
    /**
     * Returns the number of clusters.
     */
    public int getNumClusters() {
        return k;
    }

    /**
     * Returns the cluster labels of data.
     */
    public int[] getClusterLabel() {
        return y;
    }

    /**
     * Returns the size of clusters.
     */
    public int[] getClusterSize() {
        return size;
    }

    /**
     * Returns the width of Gaussian kernel.
     */
    public double getGaussianKernelWidth() {
        return sigma;
    }

    /**
     * Returns the distortion in feature space.
     */
    public double distortion() {
        return distortion;
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();

        sb.append(String.format("Spectral Clustering distortion in feature space: %.5f%n", distortion));
        sb.append(String.format("Clusters of %d data points:%n", y.length));
        for (int i = 0; i < k; i++) {
            int r = (int) Math.round(1000.0 * size[i] / y.length);
            sb.append(String.format("%3d\t%5d (%2d.%1d%%)%n", i, size[i], r / 10, r % 10));
        }

        return sb.toString();
    }
}