package ch.akuhn.isomap; import ch.akuhn.graph.DijkstraAlgorithm2; import ch.akuhn.graph.Graph; import ch.akuhn.matrix.DenseMatrix; import ch.akuhn.matrix.Function; import ch.akuhn.matrix.SymmetricMatrix; import ch.akuhn.matrix.eigenvalues.Eigenvalues; import ch.akuhn.org.ggobi.plugins.ggvis.Points; /** Embeds n-dimensional data in two dimensions. *<P> * The Isomap algorithm takes as input the distances d_x(i,j) between all * pairs i, j from N data points in the high-dimensional input space X, * measured either in the standard Euclidean metric or in some domain-specific * metric. The algorithm outputs coordinate vectors y_i in a d-dimensional * Euclidean space Y that best represent the intrinsic geometry of the data. * The only free parameter (e or k) appears in Step 1. * * @see http://waldron.stanford.edu/~isomap/ * @see http://www.sciencemag.org/cgi/content/full/290/5500/2319 * */ public abstract class Isomap { public double e = 0.2; public int k = 3; public boolean useKNN = true; public final int n; private DenseMatrix graph; private Points points; public Isomap(int size) { this.n = size; } protected abstract double dist(int i, int j); public Points getPoints() { return points; } /** Step 1: Construct neighborhood graph. Define the graph G over all data * points by connecting points i and j if [as measured by d_x(i,j)] they * are closer than e (e-Isomap), or if i is one of the k nearest neighbors * of j (k-Isomap). Set edge lengths equal to d_x(i,j). * */ public void constructNeighborhoodGraph() { if (useKNN) { constructKayNearestNeighborhoodGraph(); } else { constructCloserThanEpsilonNeighborhoodGraph(); } } private void constructCloserThanEpsilonNeighborhoodGraph() { graph = new SymmetricMatrix(n); graph.fill(Double.POSITIVE_INFINITY); for (int i = 0; i < n; i++) { for (int j = 0; j < i; j++) { if (dist(i,j) < e) graph.put(i, j, e); } } } private void constructKayNearestNeighborhoodGraph() { graph = new SymmetricMatrix(n); graph.fill(Double.POSITIVE_INFINITY); for (int i = 0; i < n; i++) { double[] data = new double[n]; for (int j = 0; j < n; j++) data[j] = dist(i,j); int[] index = indicesOfMinima(data, k+1); for (int k = 0; k < index.length; k++) { if (i == index[k]) continue; graph.put(i,index[k],dist(i,index[k])); } } } static int[] indicesOfMinima(double[] ds, int k) { if (ds.length <= k) return range(ds.length); int[] index = new int[k]; for (int n = 0; n < index.length; n++) { int i = indexOfMinimum(ds); index[n] = i; ds[i] = Double.POSITIVE_INFINITY; } return index; } private static int[] range(int k) { int[] range = new int[k]; for (int n = 0; n < range.length; n++) range[n] = n; return range; } static int indexOfMinimum(double[] ds) { assert ds.length > 0; int index = 0; double min = ds[0]; for (int n = 0; n < ds.length; n++) { if (ds[n] < min) { index = n; min = ds[n]; } } return index; } /** Step 2: Compute shortest path. Initialize d_G(i,j) = d_X(i,j) if i, j are * linked by an edge; d_G(i,j) = infinity otherwise. Then for each value of * k in 1, 2, ..., N in turn, replace all entries d_G(i,j) by min { d_G(i,j), * d_G(i,k) + d_G(k,j)}. The matrix of final values D_G = { d_G(i,j) } will * contain the shortest path distances between all pairs of points in G. * */ public void computeShortestPathWithBruteForce() { for (int k0 = 0; k0 < n; k0++) { for (int i = 0; i < n; i++) { double path_i_k = graph.get(i,k0); for (int j = 0; j < n; j++) { graph.put(i,j, Math.min(graph.get(i,j), path_i_k + graph.get(k0,j))); } } } } public void computeShortestPathWithDijkstra() { Graph g = new Graph(graph); DijkstraAlgorithm2 dijsktra = new DijkstraAlgorithm2(); for (int i = 0; i < n; i++) { /* NB: it is safe to update graph and use it for distances * in the same time since we only lookup the distances of * directly connected nodes, which are not updated. */ double[] cost = dijsktra.apply(g, g.nodes[i], graph); for (int j = 0; j < n; j++) graph.put(i,j, cost[j]); } } public boolean[][] getEdges() { boolean[][] edges = new boolean[graph.rowCount()][]; for (int i = 0; i < edges.length; i++) edges[i] = new boolean[i]; for (int i = 0; i < edges.length; i++) { for (int j = 0; j < i; j++) { edges[i][j] = !Double.isInfinite(graph.get(i,j)); } } return edges; } /** Step 3: Construct d-dimensional embedding. Let λ_p be the p-th eigenvalue * (in decreasing order) of the matrix tau(D_G), and v^i_p be the i-th component * of the p-th eigenvector. Then set the p-th component of the d-dimensional * coordinate vector y_i equal to sqrt of λ_p * v^i_p. * */ public void constructDeeDimensionalEmbedding() { this.applyTauOperator(); Eigenvalues eigen = Eigenvalues.of(graph).largest(2); eigen.run(); points = new Points(n); for (int i = 0; i < n; i++) { points.x[i] = Math.sqrt(eigen.value[0]) * eigen.vector[0].get(i); points.y[i] = Math.sqrt(eigen.value[1]) * eigen.vector[1].get(i); } } /** Where the operator tau is defined by tau(D) = -HSH/2, where S is the matrix * of squared distances { s_ij = d_ij^2 }, and H is the "centering matrix" * { H_ij = d_ij - 1/N }. *<P> * This leaves open whether the centering matrix of D or S is used, and why they * refer to a delta in the centering matrix instead of using unit matrix minus * reciprocal N. However, in their matlab script, the isomap authors use *<PRE>-.5*(D.^2 - sum(D.^2)'*ones(1,N)/N - ones(N,1)*sum(D.^2)/N + sum(sum(D.^2))/(N^2))</PRE> * so I went with the same: first squaring the matrix, then subtracting column- * and row-wise mean and adding global mean, and eventually divide by minus two. */ private void applyTauOperator() { graph.apply(Function.SQUARE); double mean = graph.mean(); double[] columnMean = graph.rowwiseMean(); int N = graph.columnCount(); for (int i = 0; i < N; i++) { for (int j = 0; j < i; j++) { graph.add(i,j,mean - columnMean[i] - columnMean[j]); } } graph.applyMultiplication(-0.5); } public void run() { if (n > 1) { this.constructNeighborhoodGraph(); this.computeShortestPathWithDijkstra(); this.constructDeeDimensionalEmbedding(); } else { points = new Points(n); } graph = null; } }