/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mahout.clustering.meanshift; import java.util.Collection; import java.util.List; import com.google.common.collect.Lists; import org.apache.hadoop.conf.Configuration; import org.apache.mahout.common.ClassUtils; import org.apache.mahout.common.distance.DistanceMeasure; import org.apache.mahout.common.kernel.IKernelProfile; import org.apache.mahout.math.Vector; public class MeanShiftCanopyClusterer { private final double convergenceDelta; // the T1 distance threshold private final double t1; // the T2 distance threshold private final double t2; // the distance measure private final DistanceMeasure measure; private final IKernelProfile kernelProfile; // if true accumulate clusters during merge so clusters can be produced later private final boolean runClustering; public MeanShiftCanopyClusterer(Configuration configuration) { measure = ClassUtils.instantiateAs(configuration.get(MeanShiftCanopyConfigKeys.DISTANCE_MEASURE_KEY), DistanceMeasure.class); measure.configure(configuration); runClustering = configuration.getBoolean(MeanShiftCanopyConfigKeys.CLUSTER_POINTS_KEY, true); kernelProfile = ClassUtils.instantiateAs(configuration.get(MeanShiftCanopyConfigKeys.KERNEL_PROFILE_KEY), IKernelProfile.class); // nextCanopyId = 0; // never read? t1 = Double .parseDouble(configuration.get(MeanShiftCanopyConfigKeys.T1_KEY)); t2 = Double .parseDouble(configuration.get(MeanShiftCanopyConfigKeys.T2_KEY)); convergenceDelta = Double.parseDouble(configuration .get(MeanShiftCanopyConfigKeys.CLUSTER_CONVERGENCE_KEY)); } public MeanShiftCanopyClusterer(DistanceMeasure aMeasure, IKernelProfile aKernelProfileDerivative, double aT1, double aT2, double aDelta, boolean runClustering) { // nextCanopyId = 100; // so canopyIds will sort properly // never read? measure = aMeasure; t1 = aT1; t2 = aT2; convergenceDelta = aDelta; kernelProfile = aKernelProfileDerivative; this.runClustering = runClustering; } public double getT1() { return t1; } public double getT2() { return t2; } /** * Merge the given canopy into the canopies list. If it touches any existing * canopy (norm<T1) then add the center of each to the other. If it covers any * other canopies (norm<T2), then merge the given canopy with the closest * covering canopy. If the given canopy does not cover any other canopies, add * it to the canopies list. * * @param aCanopy * a MeanShiftCanopy to be merged * @param canopies * the List<Canopy> to be appended */ public void mergeCanopy(MeanShiftCanopy aCanopy, Collection<MeanShiftCanopy> canopies) { MeanShiftCanopy closestCoveringCanopy = null; double closestNorm = Double.MAX_VALUE; for (MeanShiftCanopy canopy : canopies) { double norm = measure.distance(canopy.getCenter(), aCanopy.getCenter()); double weight = kernelProfile.calculateDerivativeValue(norm, t1); if (weight > 0.0) { aCanopy.touch(canopy, weight); } if (norm < t2 && (closestCoveringCanopy == null || norm < closestNorm)) { closestNorm = norm; closestCoveringCanopy = canopy; } } if (closestCoveringCanopy == null) { canopies.add(aCanopy); } else { closestCoveringCanopy.merge(aCanopy, runClustering); } } /** * Shift the center to the new centroid of the cluster * * @param canopy * the canopy to shift. * @return if the cluster is converged */ public boolean shiftToMean(MeanShiftCanopy canopy) { canopy.observe(canopy.getCenter(), canopy.getMass()); canopy.computeConvergence(measure, convergenceDelta); canopy.computeParameters(); return canopy.isConverged(); } /** * Return if the point is covered by this canopy * * @param canopy * a canopy. * @param point * a Vector point * @return if the point is covered */ boolean covers(MeanShiftCanopy canopy, Vector point) { return measure.distance(canopy.getCenter(), point) < t1; } /** * Return if the point is closely covered by the canopy * * @param canopy * a canopy. * @param point * a Vector point * @return if the point is covered */ public boolean closelyBound(MeanShiftCanopy canopy, Vector point) { return measure.distance(canopy.getCenter(), point) < t2; } /** * This is the reference mean-shift implementation. Given its inputs it * iterates over the points and clusters until their centers converge or until * the maximum number of iterations is exceeded. * * @param points * the input List<Vector> of points * @param measure * the DistanceMeasure to use * @param numIter * the maximum number of iterations */ public static List<MeanShiftCanopy> clusterPoints(Iterable<Vector> points, DistanceMeasure measure, IKernelProfile aKernelProfileDerivative, double convergenceThreshold, double t1, double t2, int numIter) { MeanShiftCanopyClusterer clusterer = new MeanShiftCanopyClusterer(measure, aKernelProfileDerivative, t1, t2, convergenceThreshold, true); int nextCanopyId = 0; List<MeanShiftCanopy> canopies = Lists.newArrayList(); for (Vector point : points) { clusterer.mergeCanopy( new MeanShiftCanopy(point, nextCanopyId++, measure), canopies); } List<MeanShiftCanopy> newCanopies = canopies; boolean[] converged = { false }; for (int iter = 0; !converged[0] && iter < numIter; iter++) { newCanopies = clusterer.iterate(newCanopies, converged); } return newCanopies; } protected List<MeanShiftCanopy> iterate(Iterable<MeanShiftCanopy> canopies, boolean[] converged) { converged[0] = true; List<MeanShiftCanopy> migratedCanopies = Lists.newArrayList(); for (MeanShiftCanopy canopy : canopies) { converged[0] = shiftToMean(canopy) && converged[0]; mergeCanopy(canopy, migratedCanopies); } return migratedCanopies; } protected static MeanShiftCanopy findCoveringCanopy(MeanShiftCanopy canopy, Iterable<MeanShiftCanopy> clusters) { // canopies use canopyIds assigned when input vectors are processed as // vectorIds too int vectorId = canopy.getId(); for (MeanShiftCanopy msc : clusters) { for (int containedId : msc.getBoundPoints().toList()) { if (vectorId == containedId) { return msc; } } } return null; } }