// DBScan.java example
//
// Explorer
// xxl-master
/* XXL: The eXtensible and fleXible Library for data processing

Copyright (C) 2000-2011 Prof. Dr. Bernhard Seeger
                        Head of the Database Research Group
                        Department of Mathematics and Computer Science
                        University of Marburg
                        Germany

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library;  If not, see <http://www.gnu.org/licenses/>. 

    http://code.google.com/p/xxl/

*/

package xxl.core.cursors.groupers;

import java.util.Iterator;
import java.util.LinkedList;
import java.util.NoSuchElementException;

import xxl.core.collections.queues.FIFOQueue;
import xxl.core.collections.queues.ListQueue;
import xxl.core.collections.queues.Queue;
import xxl.core.collections.queues.Queues;
import xxl.core.collections.queues.StackQueue;
import xxl.core.cursors.AbstractCursor;
import xxl.core.cursors.Cursor;
import xxl.core.cursors.Cursors;
import xxl.core.cursors.filters.Filter;
import xxl.core.cursors.mappers.Mapper;
import xxl.core.cursors.unions.Sequentializer;
import xxl.core.cursors.wrappers.QueueCursor;
import xxl.core.functions.AbstractFunction;
import xxl.core.functions.Function;
import xxl.core.predicates.AbstractPredicate;
import xxl.core.predicates.And;
import xxl.core.predicates.EqualReference;
import xxl.core.predicates.LeftBind;
import xxl.core.predicates.Not;
import xxl.core.predicates.Predicate;
import xxl.core.util.Classifiable;

/**
 * A grouper that generates clusters from a given input iteration. So, each call
 * to the <tt>next</tt> method returns a new cursor representing a new cluster.
 * This cursor is based on the DBScan algorithm published in <i>"A Density-Based
 * Algorithm for Discovering Clusters in Large Spatial Databases with Noise"</i>
 * ([EKS+96]) and <i>"Incremental Clustering for Mining in a Data Warehousing
 * Environment"</i> ([EKS+98]). However, the algorithm is modified such that
 * the computation of the clusters and their elements is performed
 * <i>completely lazily</i>.
 * 
 * <p>"The key idea is that for each point of a cluster the neighborhood
 * concerning a given radius has to contain at least a minimum number of points,
 * i.e. the density in the neighborhood has to exceed some threshold."
 * ([EKS+96])</p>
 * 
 * <p><b>IMPORTANT:</b> All objects of the given input iterator have to implement
 * the {@link xxl.core.util.Classifiable classifiable} interface.</p>
 * 
 * <p><b>Note:</b> If the input iteration is given by an object of the class
 * {@link java.util.Iterator Iterator}, i.e., it does not support the
 * <tt>peek</tt> operation, it is internally wrapped to a cursor.</p>
 * 
 * <p><b>Example usage:</b>
 * <pre>
 *     // get the data; here the data is contained in a list-bag
 *     
 *     Iterator input = data.cursor();
 * 
 *     // using an <i>Euclidean</i> metric as distance function
 *     // eps = 1.6
 *     // minPts = 3
 * 
 * 		DBScan clusterCursor = new DBScan(
 *			input,
 * 			1.6,
 *			3,
 *			new AbstractFunction() {
 *				public Object invoke(Object descriptor) {
 *					final Sphere sphere = (Sphere)descriptor;
 *					return data.query(  // define a simple range query: iterating over all elements
 *						new AbstractPredicate() { // and checking if the given point is contained in the search sphere
 *							public boolean invoke(Object o) {
 *								return sphere.contains(
 *								        new Sphere(((ClassifiableObject)o).getObject(), 0d, null)
 *								);
 *							}
 * 						}
 *					);
 *				}
 *			},
 *			new AbstractFunction() {
 *				public Object invoke(Object object, Object eps) {
 *					return new Sphere(((ClassifiableObject)object).getObject(), ((Double)eps).doubleValue(), null);
 *				}
 *			}
 *		);
 *		
 *		clusterCursor.open();
 *
 *		for (int i = 0; clusterCursor.hasNext(); i++) {
 *			Cursor next = (Cursor)clusterCursor.next(); // each element of the clusterCursor is a new cursor, namely a new cluster
 * 			System.out.println("cluster " + i + ": ");
 * 			Cursors.println(next);
 * 		}
 *		
 *		clusterCursor.close();
 * </pre>
 * To perform a more efficient clustering, the range queries have to use an
 * index-structure, e.g. an {@link xxl.core.indexStructures.MTree M-tree}, where
 * the data, i.e. the classifiable objects, are located in.
 *
 * @see java.util.Iterator
 * @see xxl.core.cursors.Cursor
 * @see xxl.core.util.Classifiable
 * @see xxl.core.util.Distance
 */
public class DBScan extends AbstractCursor {

	/**
	 * The search 'radius' used for the range queries. It is handed, wrapped in
	 * a <tt>Double</tt>, to the <tt>getDescriptor</tt> function together with
	 * the element the range query is performed for.
	 */
	protected double eps;

	/**
	 * The minimum number of elements that have to be positioned in the
	 * Eps-neighborhood of an element. An element is treated as a core point
	 * if its range query delivers at least <tt>minPts</tt> results. Whether
	 * the element itself is counted depends on the user-supplied range query.
	 */
	protected int minPts;

	/**
	 * The input iteration providing the data to be grouped. The constructor
	 * wraps the given iteration in a filter that only lets unclassified
	 * elements pass.
	 */
	protected Cursor input;

	/**
	 * This queue contains all elements that already belong to a cluster, but
	 * for which a range query still has to be performed.
	 */
	protected Queue queue;

	/**
	 * This queue contains all elements that have been marked as noise. For
	 * performance reasons this implementation of the DBScan algorithm does not
	 * delete elements that were inserted into the noise-cluster but are later
	 * assigned to a regular cluster. Therefore this queue may contain elements
	 * that have already been returned to the user as part of another cluster.
	 * When returning the noise, i.e., the last cluster, all elements whose
	 * clusterID differs from {@link #NOISE} are filtered out.
	 */
	protected Queue noiseQueue;

	/**
	 * This queue contains all elements that belonged to the noise-cluster, but
	 * were later assigned to a regular cluster, i.e., elements whose clusterID
	 * changed. Its content is appended to the cluster cursor that is currently
	 * delivered.
	 */
	protected Queue changedCIDQueue;

	/**
	 * A unary function that internally holds a data structure, e.g. an index,
	 * to perform range queries efficiently. The argument of this function is
	 * the descriptor delivered by <tt>getDescriptor</tt> and its result is cast
	 * to a cursor whose elements have to implement the
	 * {@link xxl.core.util.Classifiable classifiable} interface.
	 * <pre>
	 *     f : Object (Descriptor) --> Cursor of ClassifiableObjects
	 * </pre>
	 */
	protected Function rangeQuery;

	/**
	 * A binary function returning a kind of descriptor used for the range
	 * query. To determine this descriptor the function gets two arguments,
	 * namely the search object and the user-defined radius <tt>eps</tt>
	 * (passed as a <tt>Double</tt>).
	 * <pre>
	 *     f : Object x eps --> Object (Descriptor)
	 * </pre>
	 */
	protected Function getDescriptor;

	/**
	 * An internally used cursor representing the next cluster to be returned
	 * to the user by a call to the <tt>next</tt> or <tt>peek</tt> method. It
	 * is <tt>null</tt> until the first cluster has been determined.
	 */
	protected Cursor nextCluster;

	/**
	 * The cluster number with which the elements will be marked during the
	 * algorithm's execution. It starts as {@link #UNDEFINED}, becomes
	 * <tt>0</tt> for the first cluster and is incremented for every further
	 * cluster.
	 */
	protected long CLUSTER_NO = UNDEFINED;

	/**
	 * A constant cluster number for noise.
	 */
	public static final int NOISE = -1;

	/**
	 * A constant cluster number for a cluster that is undefined, e.g., at the
	 * start of the algorithm.
	 */
	public static final int UNDEFINED = -2;

	/**
	 * A unary predicate that holds for every classifiable object that has
	 * <b>not</b> been classified yet, i.e., whose <tt>isClassified</tt> method
	 * returns <tt>false</tt>. The argument has to implement the
	 * {@link xxl.core.util.Classifiable classifiable} interface.
	 */
	public static Predicate isUnclassified = new AbstractPredicate() {
		public boolean invoke(Object o) {
			Classifiable candidate = (Classifiable)o;
			boolean classified = candidate.isClassified();
			return !classified;
		}
	};

	/**
	 * A unary predicate that holds for every classifiable object whose
	 * clusterID equals {@link #NOISE}. The argument has to implement the
	 * {@link xxl.core.util.Classifiable classifiable} interface.
	 */
	public static Predicate isNoise = new AbstractPredicate() {
		public boolean invoke(Object o) {
			Classifiable candidate = (Classifiable)o;
			if (candidate.getClusterID() == NOISE)
				return true;
			return false;
		}
	};

	/**
	 * Creates a unary function that assigns classifiable objects to the
	 * cluster with the given ID. When invoked, the function sets the clusterID
	 * of its argument and returns that very same argument.
	 *
	 * @param clusterID the ID of the cluster the objects passed to the
	 *        returned function are assigned to.
	 * @return a unary function marking each classifiable object it is invoked
	 *         with as a member of the cluster <tt>clusterID</tt>.
	 */
	public static Function ClusterID_FACTORY(final long clusterID) {
		Function marker = new AbstractFunction() {
			public Object invoke(Object o) {
				Classifiable target = (Classifiable)o;
				target.setClusterID(clusterID);
				return target;
			}
		};
		return marker;
	}

	/**
	 * A helper class for classifiable objects that implements the
	 * {@link xxl.core.util.Classifiable classifiable} interface. It is a
	 * standard implementation for classifiable objects and decorates each
	 * object given to a constructor with classification criteria, namely a
	 * clusterID and a flag telling whether the object has been classified.
	 */
	public static class ClassifiableObject implements Classifiable {

		/**
		 * The object to be classified.
		 */
		protected Object object;

		/**
		 * The clusterID assigned to this object. It is {@link #UNDEFINED}
		 * until a clusterID has been set.
		 */
		protected long CLUSTER_ID = UNDEFINED;

		/**
		 * A flag that signals if this object has already been classified.
		 */
		protected boolean isClassified = false;

		/**
		 * Creates a new, not yet classified object.
		 *
		 * @param object the object to be classified.
		 */
		public ClassifiableObject(Object object) {
			this.object = object;
		}

		/**
		 * Creates a new classifiable object that is already classified with
		 * the given clusterID.
		 *
		 * @param object the object to be classified.
		 * @param CLUSTER_ID the clusterID for this object.
		 */
		public ClassifiableObject(Object object, int CLUSTER_ID) {
			this.object = object;
			this.CLUSTER_ID = CLUSTER_ID;
			this.isClassified = true;
		}

		/**
		 * Returns <tt>true</tt>, if the object has already been classified.
		 *
		 * @return <tt>true</tt> if the object has already been classified,
		 *         <tt>false</tt> otherwise.
		 */
		public boolean isClassified() {
			return isClassified;
		}

		/**
		 * Returns the cluster ID of this object.
		 *
		 * @return the cluster ID of this object.
		 */
		public long getClusterID() {
			return CLUSTER_ID;
		}

		/**
		 * Sets the cluster ID of this object and marks it as classified.
		 *
		 * @param CLUSTER_ID the new cluster ID of this object.
		 */
		public void setClusterID(long CLUSTER_ID) {
			this.CLUSTER_ID = CLUSTER_ID;
			this.isClassified = true;
		}

		/**
		 * Returns the object specified in the constructor.
		 *
		 * @return the object.
		 */
		public Object getObject() {
			return object;
		}

		/**
		 * Returns <tt>true</tt> if the given object is a classifiable object
		 * that wraps the very same object (reference identity), carries the
		 * same classification flag and the same clusterID. For <tt>null</tt>
		 * or an object of a different type <tt>false</tt> is returned instead
		 * of throwing an exception.
		 *
		 * @param o the reference object with which to compare.
		 * @return <tt>true</tt> if two classifiable objects are equal,
		 *         <tt>false</tt> otherwise.
		 */
		public boolean equals(Object o) {
			if (this == o)
				return true;
			// instanceof also rejects null, so the cast below is always safe
			if (!(o instanceof ClassifiableObject))
				return false;
			ClassifiableObject co = (ClassifiableObject)o;
			return object == co.object && isClassified == co.isClassified && CLUSTER_ID == co.CLUSTER_ID;
		}

		/**
		 * Returns a hash code value for this object that is consistent with
		 * {@link #equals(Object)}: it is computed from the hash code of the
		 * wrapped object (<tt>0</tt> if the wrapped object is <tt>null</tt>)
		 * and the clusterID.
		 *
		 * <p><b>Note:</b> The clusterID is mutable, so the hash code changes
		 * whenever {@link #setClusterID(long)} assigns a different ID. An
		 * instance stored in a hash-based collection must not be re-classified
		 * while it is contained in that collection.</p>
		 *
		 * @return a hash code value for this object.
		 */
		public int hashCode() {
			int objectHash = object == null ? 0 : object.hashCode();
			return objectHash + (int)CLUSTER_ID;
		}

		/**
		 * The string representation of a classifiable object. A wrapped
		 * <tt>null</tt> object is printed as <tt>null</tt>.
		 *
		 * @return the string representation of a classifiable object.
		 */
		public String toString() {
			// string concatenation converts a null reference to "null"
			return "object: " + object + "; classified: " + isClassified + "; clusterID: " + CLUSTER_ID;
		}
	}

	/**
	 * Creates a new lazy DBScan cluster operator. The given input is wrapped
	 * in a filter that only delivers unclassified elements, and the three
	 * queues used internally are obtained by invoking the given factory
	 * functions without arguments.
	 *
	 * @param input the input iteration providing the data to be grouped.
	 * @param eps the eps-radius used for range queries.
	 * @param minPts the minimum number of elements that have to be located in
	 *        the eps-neighborhood of a core point.
	 * @param rangeQuery a function performing a range query based on a given
	 *        descriptor.
	 * @param getDescriptor a function delivering a kind of descriptor for a
	 *        range query.
	 * @param newQueue a parameterless function returning a new queue used for
	 *        storing elements with which a range query will be performed.
	 * @param newNoiseQueue a parameterless function delivering a new queue
	 *        holding the noise.
	 * @param newChangedCIDQueue a parameterless function returning a queue that
	 *        gets the elements that belonged to noise, but later are assigned
	 *        to a regular cluster.
	 * @throws IllegalArgumentException if <tt>eps</tt> is negative or
	 *         <tt>minPts</tt> is not positive (i.e., zero or negative).
	 */
	public DBScan(Iterator input, double eps, int minPts, Function rangeQuery, Function getDescriptor, Function newQueue, Function newNoiseQueue, Function newChangedCIDQueue) throws IllegalArgumentException {
		// validate the numeric parameters before touching any state
		if (eps < 0) {
			throw new IllegalArgumentException("cannot compute Eps-Neighborhood with negative eps-distance!");
		}
		if (minPts < 1) {
			throw new IllegalArgumentException("a cluster must contain at least one element.");
		}

		// plain parameters
		this.eps = eps;
		this.minPts = minPts;
		this.rangeQuery = rangeQuery;
		this.getDescriptor = getDescriptor;

		// only unclassified elements are candidates for starting a cluster
		this.input = new Filter(input, isUnclassified);

		// working queues, created by the user-supplied factories
		this.queue = (Queue)newQueue.invoke();
		this.noiseQueue = (Queue)newNoiseQueue.invoke();
		this.changedCIDQueue = (Queue)newChangedCIDQueue.invoke();
	}

	/**
	 * Creates a new lazy DBScan cluster operator with default queues: a
	 * {@link xxl.core.collections.queues.StackQueue stack-queue} for the
	 * elements range queries still have to be performed with, a
	 * {@link xxl.core.collections.queues.ListQueue list-queue} for the noise
	 * and a {@link xxl.core.collections.queues.FIFOQueue FIFO-queue} for the
	 * elements that moved from the noise cluster to another cluster.
	 *
	 * @param input the input iteration providing the data to be grouped.
	 * @param eps the eps-radius used for range queries.
	 * @param minPts the minimum number of elements that have to be located in
	 *        the eps-neighborhood of a core point.
	 * @param rangeQuery a function performing a range query based on a given
	 *        descriptor.
	 * @param getDescriptor a function delivering a kind of descriptor for a
	 *        range query.
	 * @throws IllegalArgumentException if <tt>eps</tt> is negative or
	 *         <tt>minPts</tt> is not positive (i.e., zero or negative).
	 */
	public DBScan(Iterator input, double eps, int minPts, Function rangeQuery, Function getDescriptor) throws IllegalArgumentException {
		this(
			input,
			eps,
			minPts,
			rangeQuery,
			getDescriptor,
			StackQueue.FACTORY_METHOD, // elements awaiting their range query
			ListQueue.FACTORY_METHOD,  // noise
			FIFOQueue.FACTORY_METHOD   // former noise that joined a cluster
		);
	}

	/**
	 * Opens the cursor, i.e., opens this cursor itself, the input iteration
	 * and the three internally used queues. <tt>open</tt> must be called
	 * before the cursor's data can be processed; before that, calls to
	 * methods like <tt>next</tt> or <tt>peek</tt> are not guaranteed to yield
	 * proper results. Calling <tt>open</tt> on an already opened cursor has
	 * no effect.
	 *
	 * <p>Note, that a call to the <tt>open</tt> method of a closed cursor
	 * usually does not open it again because of the fact that its state
	 * generally cannot be restored when resources are released respectively
	 * files are closed.</p>
	 */
	public void open() {
		if (!isOpened) {
			super.open();
			input.open();
			queue.open();
			noiseQueue.open();
			changedCIDQueue.open();
		}
	}
	
	/**
	 * Closes the cursor, i.e., closes this cursor itself, the input iteration
	 * and the three internally used queues. After a cursor has been closed,
	 * calls to methods like <tt>next</tt> or <tt>peek</tt> are not guaranteed
	 * to yield proper results. Calling <tt>close</tt> on an already closed
	 * cursor has no effect.
	 *
	 * <p>Note, that a closed cursor usually cannot be opened again because of
	 * the fact that its state generally cannot be restored when resources are
	 * released respectively files are closed.</p>
	 */
	public void close () {
		if (!isClosed) {
			super.close();
			input.close();
			queue.close();
			noiseQueue.close();
			changedCIDQueue.close();
		}
	}

	/**
	 * Determines whether a further cluster exists and, if so, prepares the
	 * cursor representing it in <tt>nextCluster</tt>.
	 *
	 * <p>The cluster delivered before is consumed completely first, because
	 * the elements of a cluster are classified lazily while its cursor is
	 * traversed. As long as the input delivers unclassified elements, a new
	 * clusterID is selected and a new lazy cluster cursor is created. This
	 * cursor takes elements from the input until it finds a core point, i.e.,
	 * an element whose range query delivers at least <tt>minPts</tt> results;
	 * elements that fail this test are marked as noise and inserted into the
	 * noise queue. Afterwards the cluster is expanded using the elements
	 * stored in the queue. Elements that were noise before and are reached
	 * from a core point taken from the queue are re-labelled and appended to
	 * the cluster via the changed-clusterID queue.</p>
	 *
	 * <p>If no further regular cluster exists, the last cluster consists of
	 * all elements of the noise queue that are still marked as noise.</p>
	 *
	 * @return <tt>true</tt> if the cursor has more elements, i.e., a further
	 *         non-empty cluster (or the non-empty noise cluster) exists.
	 */
	protected boolean hasNextObject() {
		if (nextCluster != null) // consume last processed cursor completely
			Cursors.consume(nextCluster);
		if (input.hasNext()) {
			// select new clusterID: 0 for the first cluster, then count upwards
			CLUSTER_NO = CLUSTER_NO == UNDEFINED ? 0 : CLUSTER_NO + 1;
			nextCluster = new AbstractCursor() {
				// the element the current range query is performed for
				protected Classifiable x;
				// true while a first core point is still searched in the input
				protected boolean inputMode = input.hasNext();

				public boolean hasNextObject() {
					while (!queue.isEmpty() || inputMode) {
						x = !queue.isEmpty() ?
							(Classifiable)queue.dequeue() :
							(Classifiable)input.next();
						// range query
						Cursor results = (Cursor)rangeQuery.invoke(getDescriptor.invoke(x, new Double(eps)));
						// check number of elements: buffer at most minPts-1
						// results, so a further result proves at least minPts
						LinkedList list = new LinkedList();
						while (results.hasNext() && list.size() < minPts-1)
							list.add(results.next());
						// x is a core point
						if (results.hasNext()) {
							if (inputMode) {
								x.setClusterID(CLUSTER_NO);
								inputMode = false;
								// insert all objects\{x} into the queue
								// NOTE(review): only unclassified neighbors pass
								// this filter, so neighbors already marked as
								// noise are not re-labelled here - confirm this
								// is intended.
								Queues.enqueueAll(
									queue,
									// mark all objects in the eps-neighborhood of x with current clusterID
									new Mapper(
										ClusterID_FACTORY(CLUSTER_NO),
										new Filter(
											new Sequentializer(
												list.iterator(),
												results
											),
											new And(
												new LeftBind(
													new Not(
														EqualReference.DEFAULT_INSTANCE
													),
													x
												),
												isUnclassified
											)
										)
									)
								);
							}
							else {
								Cursors.consume(
									new Mapper(
										// mark all objects with current clusterID
										ClusterID_FACTORY(CLUSTER_NO),
										// select all objects in the eps-neighborhood of x that
										new Filter(
											// are not yet classified or
											new Sequentializer(
												list.iterator(),
												results
											),
											// marked as noise;
											new AbstractPredicate() {
												public boolean invoke(Object o) {
													Classifiable c = (Classifiable)o;
													boolean unClassified = !c.isClassified();
													boolean isNoise = c.getClusterID() == NOISE;
													// insert the unclassified objects into the queue
													if (unClassified)
														queue.enqueue(o);
													else
														// changed clusterID
														if (isNoise)
															changedCIDQueue.enqueue(o);
													return unClassified || isNoise;
												}
											}
										)
									)
								);
							}
							next = x;
							results.close();
							return true;
						}
						// x is not a core point
						if (inputMode) {
							// x comes from the input: mark x as noise and
							// proceed with the input
							x.setClusterID(NOISE);
							noiseQueue.enqueue(x);
							// release the range query here as well, like the
							// other two branches do
							results.close();
							inputMode = input.hasNext();
						}
						else {
							// x is a border point:
							// set x as next result, x has been located in queue
							next = x;
							results.close();
							return true;
						}
					}
					return false;
				}

				public Object nextObject() throws NoSuchElementException {
					return next;
				}
			};
			// former noise elements that joined this cluster are delivered
			// after the elements found by the expansion
			nextCluster = new Sequentializer(
				nextCluster,
				new QueueCursor(changedCIDQueue)
			);
			if (nextCluster.hasNext())
				return true;
		}
		// return all elements of the noiseQueue that are noise
		nextCluster = new Filter(new QueueCursor(noiseQueue), isNoise);
		return nextCluster.hasNext();
	}

	/**
	 * Returns the next element in the iteration, i.e., the cursor stored in
	 * <tt>nextCluster</tt>. This cursor is assigned by
	 * <tt>hasNextObject</tt> and represents either a regular cluster or the
	 * final cluster holding the noise. The cluster's elements are determined
	 * lazily while the returned cursor is traversed.
	 * 
	 * @return the next element in the iteration, a cursor over the elements
	 *         of the next cluster.
	 */
	protected Object nextObject() {
		return nextCluster;
	}

	/**
	 * Resets the DBScan-cursor to its initial state such that the caller is
	 * able to traverse the underlying data structure again without constructing
	 * a new cursor (optional operation). The input is reset, the three queues
	 * are cleared, the cluster prepared last is discarded and the cluster
	 * number is set back to {@link #UNDEFINED}. The modifications, removes and
	 * updates concerning the underlying data structure, are still persistent.
	 * 
	 * <p>Note, that this operation is optional and might not work for all
	 * cursors.</p>
	 *
	 * @throws UnsupportedOperationException if the <tt>reset</tt> operation is
	 *         not supported by the cursor.
	 */
	public void reset () throws UnsupportedOperationException{
		// reset the input first: if it does not support the operation, the
		// exception is thrown before any internal state has been changed
		input.reset();
		queue.clear();
		noiseQueue.clear();
		changedCIDQueue.clear();
		// forget the cluster of the previous traversal, otherwise the next
		// hasNextObject call would consume this outdated cursor
		nextCluster = null;
		CLUSTER_NO = UNDEFINED;
		// NOTE(review): the superclass reset() is not invoked here - confirm
		// whether AbstractCursor keeps hasNext state that must be reset, too.
	}

	/**
	 * Returns <tt>true</tt> if the <tt>reset</tt> operation is supported by
	 * the DBScan-cursor. Otherwise it returns <tt>false</tt>. The answer is
	 * delegated to the input iteration, because resetting the DBScan-cursor
	 * requires resetting its input.
	 *
	 * @return <tt>true</tt> if the <tt>reset</tt> operation is supported by
	 *         the DBScan-cursor, otherwise <tt>false</tt>.
	 */
	public boolean supportsReset() {
		return input.supportsReset();
	}
}