IsoMap.java example

Explorer
smile-master
/*******************************************************************************
 * Copyright (c) 2010 Haifeng Li
 *   
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *  
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package smile.manifold;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import smile.graph.AdjacencyList;
import smile.graph.Graph;
import smile.graph.Graph.Edge;
import smile.math.distance.EuclideanDistance;
import smile.math.Math;
import smile.math.matrix.ColumnMajorMatrix;
import smile.math.matrix.DenseMatrix;
import smile.math.matrix.EigenValueDecomposition;
import smile.math.matrix.Lanczos;
import smile.neighbor.CoverTree;
import smile.neighbor.KDTree;
import smile.neighbor.KNNSearch;
import smile.neighbor.Neighbor;

/**
 * Isometric feature mapping. Isomap is a widely used low-dimensional embedding methods,
 * where geodesic distances on a weighted graph are incorporated with the
 * classical multidimensional scaling. Isomap is used for computing a
 * quasi-isometric, low-dimensional embedding of a set of high-dimensional
 * data points. Isomap is highly efficient and generally applicable to a broad
 * range of data sources and dimensionalities.
 * <p>
 * To be specific, the classical MDS performs low-dimensional embedding based
 * on the pairwise distance between data points, which is generally measured
 * using straight-line Euclidean distance. Isomap is distinguished by
 * its use of the geodesic distance induced by a neighborhood graph
 * embedded in the classical scaling. This is done to incorporate manifold
 * structure in the resulting embedding. Isomap defines the geodesic distance
 * to be the sum of edge weights along the shortest path between two nodes.
 * The top n eigenvectors of the geodesic distance matrix, represent the
 * coordinates in the new n-dimensional Euclidean space.
 * <p>
 * The connectivity of each data point in the neighborhood graph is defined
 * as its nearest k Euclidean neighbors in the high-dimensional space. This
 * step is vulnerable to "short-circuit errors" if k is too large with
 * respect to the manifold structure or if noise in the data moves the
 * points slightly off the manifold. Even a single short-circuit error
 * can alter many entries in the geodesic distance matrix, which in turn
 * can lead to a drastically different (and incorrect) low-dimensional
 * embedding. Conversely, if k is too small, the neighborhood graph may
 * become too sparse to approximate geodesic paths accurately.
 * <p>
 * This class implements C-Isomap that involves magnifying the regions
 * of high density and shrink the regions of low density of data points
 * in the manifold. Edge weights that are maximized in Multi-Dimensional
 * Scaling(MDS) are modified, with everything else remaining unaffected.
 * 
 * @see LLE
 * @see LaplacianEigenmap
 * 
 * <h2>References</h2>
 * <ol>
 * <li> J. B. Tenenbaum, V. de Silva and J. C. Langford  A Global Geometric Framework for Nonlinear Dimensionality Reduction. Science 290(5500):2319-2323, 2000. </li> 
 * </ol>
 * 
 * @author Haifeng Li
 */
public class IsoMap {
    private static final Logger logger = LoggerFactory.getLogger(IsoMap.class);

    /**
     * The original sample index.
     */
    private int[] index;
    /**
     * Coordinate matrix.
     */
    private double[][] coordinates;
    /**
     * Nearest neighbor graph.
     */
    private Graph graph;

    /**
     * Constructor. C-Isomap algorithm by default.
     * @param data the dataset.
     * @param d the dimension of the manifold.
     * @param k k-nearest neighbor.
     */
    public IsoMap(double[][] data, int d, int k) {
        this(data, d, k, true);
    }
    
    /**
     * Constructor.
     * @param data the dataset.
     * @param d the dimension of the manifold.
     * @param k k-nearest neighbor.
     * @param CIsomap C-Isomap algorithm if true, otherwise standard algorithm.
     */
    public IsoMap(double[][] data, int d, int k, boolean CIsomap) {
        int n = data.length;

        KNNSearch<double[], double[]> knn = null;
        if (data[0].length < 10) {
            knn = new KDTree<>(data, data);
        } else {
            knn = new CoverTree<>(data, new EuclideanDistance());
        }

        graph = new AdjacencyList(n);
        double[] M = new double[n];
        for (int i = 0; i < n; i++) {
            Neighbor<double[], double[]>[] neighbors = knn.knn(data[i], k);

            for (int j = 0; j < neighbors.length; j++) {
                graph.setWeight(i, neighbors[j].index, neighbors[j].distance);
                M[i] += neighbors[j].distance;
            }
            M[i] = Math.sqrt(M[i] / neighbors.length);
        }

        // C-Isomap
        if (CIsomap) {
            for (Edge edge : graph.getEdges()) {
                edge.weight /= (M[edge.v1] * M[edge.v2]);
            }
        }
        
        // Use largest connected component.
        int[][] cc = graph.bfs();
        if (cc.length == 1) {
            index = new int[n];
            for (int i = 0; i < n; i++) {
                index[i] = i;
            }
        } else {
            n = 0;
            int component = 0;
            for (int i = 0; i < cc.length; i++) {
                if (cc[i].length > n) {
                    component = i;
                    n = cc[i].length;
                }
            }

            logger.info("IsoMap: {} connected components, largest one has {} samples.", cc.length, n);

            index = cc[component];
            graph = graph.subgraph(index);
        }

        double[][] D = graph.dijkstra();

        for (int i = 0; i < n; i++) {
            for (int j = 0; j < i; j++) {
                D[i][j] = -0.5 * D[i][j] * D[i][j];
                D[j][i] = D[i][j];
            }
        }

        double[] mean = Math.rowMean(D);
        double mu = Math.mean(mean);

        DenseMatrix B = new ColumnMajorMatrix(n, n);
        for (int i = 0; i < n; i++) {
            for (int j = 0; j <= i; j++) {
                double b = D[i][j] - mean[i] - mean[j] + mu;
                B.set(i, j, b);
                B.set(j, i, b);
            }
        }

        EigenValueDecomposition eigen = Lanczos.eigen(B, d);
        coordinates = new double[n][d];
        for (int j = 0; j < d; j++) {
            if (eigen.getEigenValues()[j] < 0) {
                throw new IllegalArgumentException(String.format("Some of the first %d eigenvalues are < 0.", d));
            }

            double scale = Math.sqrt(eigen.getEigenValues()[j]);
            for (int i = 0; i < n; i++) {
                coordinates[i][j] = eigen.getEigenVectors().get(i, j) * scale;
            }
        }        
    }

    /**
     * Returns the original sample index. Because IsoMap is applied to the largest
     * connected component of k-nearest neighbor graph, we record the the original
     * indices of samples in the largest component.
     */
    public int[] getIndex() {
        return index;
    }

    /**
     * Returns the coordinates of projected data.
     */
    public double[][] getCoordinates() {
        return coordinates;
    }

    /**
     * Returns the nearest neighbor graph.
     */
    public Graph getNearestNeighborGraph() {
        return graph;
    }
}