/* * File: MedoidClusterCreator.java * Authors: Kevin R. Dixon * Company: Sandia National Laboratories * Project: Cognitive Foundry * * Copyright June 21, 2007, Sandia Corporation. Under the terms of Contract * DE-AC04-94AL85000, there is a non-exclusive license for use of this work by * or on behalf of the U.S. Government. Export of this program may require a * license from the United States Government. See CopyrightHistory.txt for * complete details. * */ package gov.sandia.cognition.learning.algorithm.clustering.cluster; import gov.sandia.cognition.annotation.CodeReview; import gov.sandia.cognition.learning.function.distance.DefaultDivergenceFunctionContainer; import gov.sandia.cognition.math.DivergenceFunction; import java.util.Collection; /** * The <code>MedoidClusterCreator</codE> class creates a * <code>CentroidCluster</code> at the sample that minimizes the sum * of the divergence to the objects assigned to the cluster. * * @param <DataType> The algorithm operates on a Collection of DataType, so * DataType will be something like Vector or String * @author Justin Basilico * @author Kevin R. Dixon * @since 2.0 */ @CodeReview( reviewer="Kevin R. Dixon", date="2008-07-22", changesNeeded=false, comments="Code generally looks fine." ) public class MedoidClusterCreator<DataType> extends DefaultDivergenceFunctionContainer<DataType,DataType> implements ClusterCreator<CentroidCluster<DataType>, DataType> { /** * Creates a new instance of MedoidClusterCreator */ public MedoidClusterCreator() { this(null); } /** * Creates a new instance of MedoidClusterCreator * * @param divergenceFunction * Divergence function used to evaluate the dissimilarity between * two data points */ public MedoidClusterCreator( final DivergenceFunction<? super DataType, ? super DataType> divergenceFunction) { super( divergenceFunction ); } /** * Creates a CentroidCluster at the member that minimizes the sum of * divergence between all members * * @param members * Data points that have been assigned to the cluster * @return * CentroidCluster that minimizes the sum of divergence between all * assigned members */ public CentroidCluster<DataType> createCluster( final Collection<? extends DataType> members) { double minTotalDivergence = Double.POSITIVE_INFINITY; DataType medoid = null; // TODO: This code could be made faster by caching divergence function // values. Since divergence functions must be symmetric, there's no need // to compute both f(x,y) and f(y,x). However, this could use a // large amount of memory and may not be worth it. for ( DataType candidate : members ) { double totalDivergence = 0.0; for ( DataType member : members ) { // Divergence functions must obey f(x,x) = 0, so we don't // need to compute the divergence between two identical objects if ( candidate != member ) { totalDivergence += this.divergenceFunction.evaluate(candidate, member); } // Divergence functions are nonnegative, so if we're already // above the minimum, we can just stop counting if ( minTotalDivergence < totalDivergence ) { break; } } if ( medoid == null || minTotalDivergence > totalDivergence ) { minTotalDivergence = totalDivergence; medoid = candidate; } } return new CentroidCluster<DataType>(medoid, members); } }