/* XXL: The eXtensible and fleXible Library for data processing
Copyright (C) 2000-2011 Prof. Dr. Bernhard Seeger
Head of the Database Research Group
Department of Mathematics and Computer Science
University of Marburg
Germany
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; If not, see <http://www.gnu.org/licenses/>.
http://code.google.com/p/xxl/
*/
package xxl.core.cursors.groupers;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.NoSuchElementException;
import xxl.core.collections.queues.FIFOQueue;
import xxl.core.collections.queues.ListQueue;
import xxl.core.collections.queues.Queue;
import xxl.core.collections.queues.Queues;
import xxl.core.collections.queues.StackQueue;
import xxl.core.cursors.AbstractCursor;
import xxl.core.cursors.Cursor;
import xxl.core.cursors.Cursors;
import xxl.core.cursors.filters.Filter;
import xxl.core.cursors.mappers.Mapper;
import xxl.core.cursors.unions.Sequentializer;
import xxl.core.cursors.wrappers.QueueCursor;
import xxl.core.functions.AbstractFunction;
import xxl.core.functions.Function;
import xxl.core.predicates.AbstractPredicate;
import xxl.core.predicates.And;
import xxl.core.predicates.EqualReference;
import xxl.core.predicates.LeftBind;
import xxl.core.predicates.Not;
import xxl.core.predicates.Predicate;
import xxl.core.util.Classifiable;
/**
* A grouper that generates clusters from a given input iteration. So, each call
* to the <tt>next</tt> method returns a new cursor representing a new cluster.
* This cursor is based on the DBScan algorithm published in <i>"A Density-Based
* Algorithm for Discovering Clusters in Large Spatial Databases with Noise"</i>
* ([EKS+96]) and <i>"Incremental Clustering for Mining in a Data Warehousing
* Environment"</i> ([EKS+98]). However, the algorithm has been modified so that
* the clusters and their elements are computed <i>completely lazily</i>.
*
* <p>"The key idea is that for each point of a cluster the neighborhood
* concerning a given radius has to contain at least a minimum number of points,
* i.e. the density in the neighborhood has to exceed some threshold."
* ([EKS+96])</p>
*
* <p><b>IMPORTANT:</b> All objects of the given input iterator have to implement
* the {@link xxl.core.util.Classifiable classifiable} interface.</p>
*
* <p><b>Note:</b> If the input iteration is given by an object of the class
* {@link java.util.Iterator Iterator}, i.e., it does not support the
* <tt>peek</tt> operation, it is internally wrapped to a cursor.</p>
*
* <p><b>Example usage:</b>
* <pre>
* // get the data; here the data is contained in a list-bag
*
* Iterator input = data.cursor();
*
* // using an <i>Euclidean</i> metric as distance function
* // eps = 1.6
* // minPts = 3
*
* DBScan clusterCursor = new DBScan(
* input,
* 1.6,
* 3,
* new AbstractFunction() {
* public Object invoke(Object descriptor) {
* final Sphere sphere = (Sphere)descriptor;
* return data.query( // define a simple range query: iterating over all elements
* new AbstractPredicate() { // and checking if the given point is contained in the search sphere
* public boolean invoke(Object o) {
* return sphere.contains(
* new Sphere(((ClassifiableObject)o).getObject(), 0d, null)
* );
* }
* }
* );
* }
* },
* new AbstractFunction() {
* public Object invoke(Object object, Object eps) {
* return new Sphere(((ClassifiableObject)object).getObject(), ((Double)eps).doubleValue(), null);
* }
* }
* );
*
* clusterCursor.open();
*
* for (int i = 0; clusterCursor.hasNext(); i++) {
* Cursor next = (Cursor)clusterCursor.next(); // each element of the clusterCursor is a new cursor, namely a new cluster
* System.out.println("cluster " + i + ": ");
* Cursors.println(next);
* }
*
* clusterCursor.close();
* </pre>
* To cluster more efficiently, the range queries should be backed by an
* index structure, e.g. an {@link xxl.core.indexStructures.MTree M-tree}, that
* stores the data, i.e. the classifiable objects.
*
* @see java.util.Iterator
* @see xxl.core.cursors.Cursor
* @see xxl.core.util.Classifiable
* @see xxl.core.util.Distance
*/
public class DBScan extends AbstractCursor {
/**
* The search 'radius' used for the range queries. It is passed, together
* with the element under examination, to <tt>getDescriptor</tt>. The
* constructor rejects negative values.
*/
protected double eps;
/**
* The minimum number of elements that have to be positioned in the
* Eps-neighborhood of an element. An element is treated as a core point if
* its range query delivers at least <tt>minPts</tt> elements. The
* constructor rejects values less than or equal to zero.
*/
protected int minPts;
/**
* The input iteration providing the data to be grouped. The constructor
* wraps the given iterator in a filter using <tt>isUnclassified</tt>, so
* this cursor only delivers elements that are not classified yet.
*/
protected Cursor input;
/**
* This queue contains all elements that already belong to the current
* cluster, but for which a range query still has to be performed.
*/
protected Queue queue;
/**
* This queue contains all elements that have been marked as noise. For
* performance reasons the implementation of this DBScan algorithm does not
* delete elements that were inserted into the noise-cluster but are later
* assigned to a regular cluster. Therefore this queue may contain elements
* that have already been returned to the user as part of another cluster.
* When returning the noise, i.e. the last cluster, all elements that have a
* clusterID different from <tt>NOISE</tt> are filtered out.
*/
protected Queue noiseQueue;
/**
* This queue contains all elements that belonged to the noise-cluster, but
* which were later assigned to a regular cluster, i.e. the clusterID of
* these elements changed. Its contents are appended to the cluster cursor
* that is handed out to the user.
*/
protected Queue changedCIDQueue;
/**
* A unary function that internally holds a data structure, e.g. an index,
* to perform range queries efficiently. The argument of this function is
* the descriptor produced by <tt>getDescriptor</tt> and its result is cast
* to a cursor whose elements are cast to
* {@link xxl.core.util.Classifiable classifiable} objects.
* <pre>
* f : Object (Descriptor) --> Cursor of ClassifiableObjects
* </pre>
*/
protected Function rangeQuery;
/**
* A binary function returning a kind of descriptor used for the range query.
* To determine this descriptor the function gets two arguments, namely the
* search object and the user-defined radius <tt>eps</tt> wrapped in a
* <tt>Double</tt>.
* <pre>
* f : Object x eps --> Object (Descriptor)
* </pre>
*/
protected Function getDescriptor;
/**
* An internally used cursor representing the next cluster to be returned to
* the user by a call to the <tt>next</tt> or <tt>peek</tt> method. It is
* <tt>null</tt> until the first cluster has been computed.
*/
protected Cursor nextCluster;
/**
* The cluster number with which the elements will be marked during the
* algorithm's execution. It starts as <tt>UNDEFINED</tt>, becomes 0 for the
* first cluster and is incremented for every further cluster.
*/
protected long CLUSTER_NO = UNDEFINED;
/**
* A constant cluster number for noise.
*/
public static final int NOISE = -1;
/**
* A constant cluster number for a cluster that is undefined, e.g., at the
* start of the algorithm.
*/
public static final int UNDEFINED = -2;
/**
 * A unary predicate that is satisfied by every classifiable object which
 * has <i>not</i> been classified yet, i.e. whose <tt>isClassified</tt>
 * method returns <tt>false</tt>. The argument is cast to
 * {@link xxl.core.util.Classifiable}.
 */
public static Predicate isUnclassified = new AbstractPredicate() {
	public boolean invoke(Object o) {
		final Classifiable candidate = (Classifiable)o;
		final boolean classified = candidate.isClassified();
		return !classified;
	}
};
/**
 * A unary predicate that is satisfied by every classifiable object whose
 * clusterID equals {@link #NOISE}. The argument is cast to
 * {@link xxl.core.util.Classifiable}.
 */
public static Predicate isNoise = new AbstractPredicate() {
	public boolean invoke(Object o) {
		final Classifiable candidate = (Classifiable)o;
		if (candidate.getClusterID() == NOISE)
			return true;
		return false;
	}
};
/**
 * Creates a unary function that assigns the given clusterID to every
 * classifiable object it is invoked with and hands the very same object
 * back, so it can be used inside a mapper to mark elements on the fly.
 *
 * @param clusterID the clusterID the created function assigns to its
 *        arguments.
 * @return a unary function marking classifiable objects with
 *         <tt>clusterID</tt> and returning them.
 */
public static Function ClusterID_FACTORY(final long clusterID) {
	final Function marker = new AbstractFunction() {
		public Object invoke(Object o) {
			final Classifiable target = (Classifiable)o;
			target.setClusterID(clusterID);
			// same reference as the argument, now carrying the clusterID
			return target;
		}
	};
	return marker;
}
/**
 * A helper class for classifiable objects that implements the
 * {@link xxl.core.util.Classifiable classifiable} interface. It is a standard
 * implementation that decorates the object given to a constructor with a
 * clusterID and a flag telling whether the object has been classified.
 */
public static class ClassifiableObject implements Classifiable {

	/**
	 * The object to be classified.
	 */
	protected Object object;

	/**
	 * The clusterID assigned to this object; {@link DBScan#UNDEFINED} as
	 * long as no clusterID has been set.
	 */
	protected long CLUSTER_ID = UNDEFINED;

	/**
	 * A flag that signals if this object has already been classified.
	 */
	protected boolean isClassified = false;

	/**
	 * Creates a new, still unclassified, classifiable object.
	 *
	 * @param object the object to be classified.
	 */
	public ClassifiableObject(Object object) {
		this.object = object;
	}

	/**
	 * Creates a new classifiable object that is immediately marked as
	 * classified with the given clusterID.
	 *
	 * @param object the object to be classified.
	 * @param CLUSTER_ID the clusterID for this object.
	 */
	public ClassifiableObject(Object object, int CLUSTER_ID) {
		this.object = object;
		this.CLUSTER_ID = CLUSTER_ID;
		this.isClassified = true;
	}

	/**
	 * Returns <tt>true</tt>, if the object has already been classified.
	 *
	 * @return <tt>true</tt> if the object has already been classified,
	 *         <tt>false</tt> otherwise.
	 */
	public boolean isClassified() {
		return isClassified;
	}

	/**
	 * Returns the cluster ID of this object.
	 *
	 * @return the cluster ID of this object.
	 */
	public long getClusterID() {
		return CLUSTER_ID;
	}

	/**
	 * Sets the cluster ID of this object and marks it as classified.
	 *
	 * @param CLUSTER_ID the new cluster ID of this object.
	 */
	public void setClusterID(long CLUSTER_ID) {
		this.CLUSTER_ID = CLUSTER_ID;
		this.isClassified = true;
	}

	/**
	 * Returns the object specified in the constructor.
	 *
	 * @return the object.
	 */
	public Object getObject() {
		return object;
	}

	/**
	 * Compares this classifiable object with the given one. Two
	 * classifiable objects are equal if they wrap the <i>same</i> object
	 * (reference comparison), agree on the classified flag and carry the
	 * same clusterID. For <tt>null</tt> or an argument that is no
	 * <tt>ClassifiableObject</tt> the method returns <tt>false</tt> instead
	 * of throwing an exception, as required by the contract of
	 * {@link java.lang.Object#equals(Object)}.
	 *
	 * @param o the reference object with which to compare.
	 * @return <tt>true</tt> if two classifiable objects are equal,
	 *         <tt>false</tt> otherwise.
	 */
	public boolean equals(Object o) {
		if (this == o)
			return true;
		// instanceof is false for null as well
		if (!(o instanceof ClassifiableObject))
			return false;
		ClassifiableObject co = (ClassifiableObject)o;
		return object == co.object && isClassified == co.isClassified && CLUSTER_ID == co.CLUSTER_ID;
	}

	/**
	 * Returns a hash code value that is consistent with
	 * {@link #equals(Object)}: it is derived from the wrapped object's hash
	 * code (0 if the wrapped object is <tt>null</tt>) and the clusterID.
	 *
	 * <p>Note that the value changes whenever the clusterID changes, so
	 * instances should not be re-classified while they are stored in a
	 * hash-based collection.</p>
	 *
	 * @return a hash code value for this object.
	 */
	public int hashCode() {
		int objectHash = object == null ? 0 : object.hashCode();
		return objectHash + (int)CLUSTER_ID;
	}

	/**
	 * The string representation of a classifiable object, consisting of the
	 * wrapped object, the classified flag and the clusterID.
	 *
	 * @return the string representation of a classifiable object.
	 */
	public String toString() {
		// plain concatenation renders a null object as "null" instead of failing
		return "object: " + object + "; classified: " + isClassified + "; clusterID: " + CLUSTER_ID;
	}
}
/**
 * Creates a new lazy DBScan cluster operator.
 *
 * <p>The given input is wrapped in a filter that only delivers elements
 * which are not classified yet. The three queue factories are each invoked
 * exactly once, in the order <tt>newQueue</tt>, <tt>newNoiseQueue</tt>,
 * <tt>newChangedCIDQueue</tt>.</p>
 *
 * @param input the input iteration providing the data to be grouped.
 * @param eps the eps-radius used for range queries; must be a number
 *        greater than or equal to zero.
 * @param minPts the minimum number of elements that have to be located in
 *        the eps-neighborhood of a core point; must be greater than zero.
 * @param rangeQuery a function performing a range query based on a given
 *        descriptor.
 * @param getDescriptor a function delivering a kind of descriptor for a
 *        range query.
 * @param newQueue a parameterless function returning a new queue used for
 *        storing elements with which a range query will be performed.
 * @param newNoiseQueue a parameterless function delivering a new queue
 *        holding the noise.
 * @param newChangedCIDQueue a parameterless function returning a queue that
 *        gets the elements that belonged to noise, but later are assigned to
 *        a special cluster.
 * @throws IllegalArgumentException if <tt>eps</tt> is negative or NaN, or
 *         if <tt>minPts</tt> is less than or equal to zero.
 */
public DBScan(Iterator input, double eps, int minPts, Function rangeQuery, Function getDescriptor, Function newQueue, Function newNoiseQueue, Function newChangedCIDQueue) throws IllegalArgumentException {
	// NaN compares false against everything, so it would slip through the "< 0" test below
	if (Double.isNaN(eps))
		throw new IllegalArgumentException("cannot compute Eps-Neighborhood with an undefined (NaN) eps-distance!");
	if (eps < 0)
		throw new IllegalArgumentException("cannot compute Eps-Neighborhood with negative eps-distance!");
	if (minPts <= 0)
		throw new IllegalArgumentException("a cluster must contain at least one element.");
	// only elements that have not been classified so far take part in the clustering
	this.input = new Filter(input, isUnclassified);
	this.eps = eps;
	this.minPts = minPts;
	this.queue = (Queue)newQueue.invoke();
	this.noiseQueue = (Queue)newNoiseQueue.invoke();
	this.changedCIDQueue = (Queue)newChangedCIDQueue.invoke();
	this.rangeQuery = rangeQuery;
	this.getDescriptor = getDescriptor;
}
/**
* Creates a new lazy DBScan cluster operator by delegating to the
* constructor that takes explicit queue factories. It passes
* {@link xxl.core.collections.queues.StackQueue#FACTORY_METHOD} for the
* queue storing the elements range queries will be performed with,
* {@link xxl.core.collections.queues.ListQueue#FACTORY_METHOD} for the noise
* queue and {@link xxl.core.collections.queues.FIFOQueue#FACTORY_METHOD} for
* the queue of elements that changed from the noise cluster to another
* cluster.
*
* @param input the input iteration providing the data to be grouped.
* @param eps the eps-radius used for range queries.
* @param minPts the minimum number of elements that have to be located in
* the eps-neighborhood of a core point.
* @param rangeQuery a function performing a range query based on a given
* descriptor.
* @param getDescriptor a function delivering a kind of descriptor for a
* range query.
* @throws IllegalArgumentException if <tt>eps</tt> is negative or if
* <tt>minPts</tt> is less than or equal to zero.
*/
public DBScan(Iterator input, double eps, int minPts, Function rangeQuery, Function getDescriptor) throws IllegalArgumentException {
this(input, eps, minPts, rangeQuery, getDescriptor, StackQueue.FACTORY_METHOD, ListQueue.FACTORY_METHOD, FIFOQueue.FACTORY_METHOD);
}
/**
 * Opens the DBScan-cursor. If the cursor has not been opened yet, the
 * superclass is opened first, followed by the input iteration and the three
 * internal queues (work queue, noise queue and changed-clusterID queue).
 * Calling <tt>open</tt> on a cursor that is already open has no effect.
 *
 * <p>Note, that a call to the <tt>open</tt> method of a closed cursor
 * usually does not open it again because of the fact that its state
 * generally cannot be restored when resources are released respectively
 * files are closed.</p>
 */
public void open() {
	if (!isOpened) {
		super.open();
		input.open();
		queue.open();
		noiseQueue.open();
		changedCIDQueue.open();
	}
}
/**
 * Closes the DBScan-cursor. If the cursor has not been closed yet, the
 * superclass is closed first, followed by the input iteration and the three
 * internal queues (work queue, noise queue and changed-clusterID queue).
 * Calling <tt>close</tt> on a cursor that is already closed has no effect.
 *
 * <p>Note, that a closed cursor usually cannot be opened again because of
 * the fact that its state generally cannot be restored when resources are
 * released respectively files are closed.</p>
 */
public void close () {
	if (!isClosed) {
		super.close();
		input.close();
		queue.close();
		noiseQueue.close();
		changedCIDQueue.close();
	}
}
/**
 * Determines whether a further cluster exists and prepares it in
 * <tt>nextCluster</tt>.
 *
 * <p>The cluster handed out before (if any) is consumed completely first,
 * because the computation of a cluster is lazy and all clusters share the
 * same queues. If the input still delivers elements, a new cluster number is
 * chosen and a lazy cluster cursor is built; it is followed by the elements
 * of the changed-clusterID queue, i.e. former noise that has been assigned
 * to this cluster. If no input is left, or the cluster cursor turns out to
 * be empty, the remaining noise is returned as the last cluster; elements
 * of the noise queue whose clusterID is no longer <tt>NOISE</tt> are
 * filtered out.</p>
 *
 * @return <tt>true</tt> if the cursor has more elements, i.e. a non-empty
 *         cluster (or noise cluster) is available.
 */
protected boolean hasNextObject() {
	if (nextCluster != null) // consume last processed cursor completely
		Cursors.consume(nextCluster);
	if (input.hasNext()) {
		// select new clusterID: 0 for the first cluster, the successor afterwards
		CLUSTER_NO = CLUSTER_NO == UNDEFINED ? 0 : CLUSTER_NO + 1;
		nextCluster = new AbstractCursor() {
			// the element whose eps-neighborhood is examined at the moment
			protected Classifiable x;
			// true as long as elements are taken from the input in search of a first core point
			protected boolean inputMode = input.hasNext();

			public boolean hasNextObject() {
				while (!queue.isEmpty() || inputMode) {
					x = !queue.isEmpty() ?
						(Classifiable)queue.dequeue() :
						(Classifiable)input.next();
					// range query
					Cursor results = (Cursor)rangeQuery.invoke(getDescriptor.invoke(x, Double.valueOf(eps)));
					// check number of elements: fetch at most minPts-1 results, so that a
					// further available result proves that at least minPts elements exist
					LinkedList list = new LinkedList();
					while (results.hasNext() && list.size() < minPts-1)
						list.add(results.next());
					// x is a core point
					if (results.hasNext()) {
						if (inputMode) {
							x.setClusterID(CLUSTER_NO);
							inputMode = false;
							// insert all objects\{x} into the queue
							// NOTE(review): neighbors already marked as noise are skipped here,
							// unlike in the branch below - confirm that this is intended
							Queues.enqueueAll(
								queue,
								// mark all unclassified objects in the eps-neighborhood of x with current clusterID
								new Mapper(
									ClusterID_FACTORY(CLUSTER_NO),
									new Filter(
										new Sequentializer(
											list.iterator(),
											results
										),
										new And(
											new LeftBind(
												new Not(
													EqualReference.DEFAULT_INSTANCE
												),
												x
											),
											isUnclassified
										)
									)
								)
							);
						}
						else {
							Cursors.consume(
								new Mapper(
									// mark all objects with current clusterID
									ClusterID_FACTORY(CLUSTER_NO),
									// select all objects in the eps-neighborhood of x that
									new Filter(
										// are not yet classified or
										new Sequentializer(
											list.iterator(),
											results
										),
										// marked as noise;
										new AbstractPredicate() {
											public boolean invoke(Object o) {
												Classifiable c = (Classifiable)o;
												boolean unClassified = !c.isClassified();
												boolean isNoise = c.getClusterID() == NOISE;
												// insert the unclassified objects into the queue
												if (unClassified)
													queue.enqueue(o);
												else
													// changed clusterID
													if (isNoise)
														changedCIDQueue.enqueue(o);
												return unClassified || isNoise;
											}
										}
									)
								)
							);
						}
						next = x;
						results.close();
						return true;
					}
					// x is no core point
					if (inputMode) {
						// mark x as noise and proceed with the input
						x.setClusterID(NOISE);
						noiseQueue.enqueue(x);
						// release the range query; the other paths close it as well
						results.close();
						inputMode = input.hasNext();
					}
					else {
						// x is a border point: set x as next result, x has been located in queue
						next = x;
						results.close();
						return true;
					}
				}
				return false;
			}

			public Object nextObject() throws NoSuchElementException {
				return next;
			}
		};
		// former noise that changed to this cluster is delivered after the cluster's own elements
		nextCluster = new Sequentializer(
			nextCluster,
			new QueueCursor(changedCIDQueue)
		);
		if (nextCluster.hasNext())
			return true;
	}
	// return all elements of the noiseQueue that are (still) noise
	nextCluster = new Filter(new QueueCursor(noiseQueue), isNoise);
	return nextCluster.hasNext();
}
/**
* Returns the next element in the iteration, which is the cursor stored in
* <tt>nextCluster</tt>. That cursor is prepared by the preceding call to
* <tt>hasNextObject</tt> and represents either a cluster or, at the end, the
* noise. The returned cursor is evaluated lazily; the next call to
* <tt>hasNextObject</tt> consumes whatever the caller left of it.
*
* @return the cursor representing the next cluster.
*/
protected Object nextObject() {
return nextCluster;
}
/**
 * Resets the DBScan-cursor to its initial state such that the caller is
 * able to traverse the underlying data structure again without constructing
 * a new cursor (optional operation). The input is reset, the three internal
 * queues are cleared, the cluster handed out last is discarded and the
 * cluster numbering starts over. Modifications of the underlying data,
 * e.g. clusterIDs already assigned to elements, remain persistent.
 *
 * <p>Note, that this operation is optional and might not work for all
 * cursors.</p>
 *
 * @throws UnsupportedOperationException if the <tt>reset</tt> operation is
 *         not supported by the cursor.
 */
public void reset () throws UnsupportedOperationException{
	input.reset();
	queue.clear();
	noiseQueue.clear();
	changedCIDQueue.clear();
	// drop the cluster cursor of the previous run; otherwise the next hasNextObject
	// call would consume it against the reset input while CLUSTER_NO is UNDEFINED
	nextCluster = null;
	CLUSTER_NO = UNDEFINED;
	// NOTE(review): any hasNext state cached by AbstractCursor is not reset here;
	// confirm whether super.reset() should be invoked as well
}
/**
* Returns <tt>true</tt> if the <tt>reset</tt> operation is supported by
* the DBScan-cursor. Otherwise it returns <tt>false</tt>. The answer is
* delegated to the (filtered) input cursor, because <tt>reset</tt> has to
* reset that input.
*
* @return <tt>true</tt> if the <tt>reset</tt> operation is supported by
* the DBScan-cursor, otherwise <tt>false</tt>.
*/
public boolean supportsReset() {
return input.supportsReset();
}
}