package it.unito.geosummly.clustering.subspace;
import it.unito.geosummly.utils.Pair;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;
import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.DoubleVector;
import de.lmu.ifi.dbs.elki.data.model.ClusterModel;
import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.QueryUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDMIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList;
import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceLPNormDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
/**
 * Variant of the classic DBSCAN algorithm (adapted from ELKI's implementation)
 * in which the core-point test is based on a custom subspace "density" measure
 * (see {@link #getDensity}) instead of the plain neighborhood cardinality.
 * The commented-out size checks preserved below show the original tests that
 * the density-based ones replaced.
 *
 * <p>NOTE(review): unlike textbook DBSCAN, the core tests use
 * {@code size <= minpts} / {@code size > minpts} rather than
 * {@code < minpts} / {@code >= minpts}; this looks intentional for this
 * project but should be confirmed against the callers.</p>
 *
 * @param <O> object type handled by the distance function
 * @param <D> distance type
 */
public class DBSCAN<O, D extends Distance<D>> extends AbstractDistanceBasedAlgorithm<O, D, Clustering<Model>> implements ClusteringAlgorithm<Clustering<Model>> {
  /**
   * The logger for this class.
   */
  private static final Logging LOG = Logging.getLogger(DBSCAN.class);

  /**
   * Parameter to specify the maximum radius of the neighborhood to be
   * considered, must be suitable to the distance function specified.
   */
  public static final OptionID EPSILON_ID = new OptionID("dbscan.epsilon", "The maximum radius of the neighborhood to be considered.");

  /**
   * Holds the value of {@link #EPSILON_ID}.
   */
  private D epsilon;

  /**
   * Per-dimension (min, max) value bounds used by {@link #getDensity} to
   * de-normalize feature values. Must be set via {@link #setBoundaries}
   * before {@link #run} is invoked, otherwise getDensity throws NPE.
   */
  private ArrayList<Pair<Double, Double>> boundaries;

  /**
   * Parameter to specify the threshold for minimum number of points in the
   * epsilon-neighborhood of a point, must be an integer greater than 0.
   */
  public static final OptionID MINPTS_ID = new OptionID("dbscan.minpts", "Threshold for minimum number of points in the epsilon-neighborhood of a point.");

  /**
   * Holds the value of {@link #MINPTS_ID}.
   */
  protected int minpts;

  /**
   * Holds a list of clusters found.
   */
  protected List<ModifiableDBIDs> resultList;

  /**
   * Holds a set of noise.
   */
  protected ModifiableDBIDs noise;

  /**
   * Holds a set of processed ids.
   */
  protected ModifiableDBIDs processedIDs;

  /**
   * Constructor with parameters.
   *
   * @param distanceFunction Distance function
   * @param epsilon Epsilon value
   * @param minpts Minpts parameter
   */
  public DBSCAN(DistanceFunction<? super O, D> distanceFunction, D epsilon, int minpts) {
    super(distanceFunction);
    this.epsilon = epsilon;
    this.minpts = minpts;
  }

  /**
   * Performs the DBSCAN algorithm on the given database.
   *
   * @param relation the relation to cluster
   * @return clustering result with one top-level cluster per dense region
   *         plus one noise cluster (always appended, even if empty)
   */
  public Clustering<Model> run(Relation<O> relation) {
    RangeQuery<O, D> rangeQuery = QueryUtil.getRangeQuery(relation, getDistanceFunction());
    int size = relation.size();
    //final int size = getDensity(null, relation, null, (FirstSubspaceEuclideanDistanceFunction) getDistanceFunction());
    FiniteProgress objprog = LOG.isVerbose() ? new FiniteProgress("Processing objects", size, LOG) : null;
    IndefiniteProgress clusprog = LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;
    resultList = new ArrayList<>();
    noise = DBIDUtil.newHashSet();
    processedIDs = DBIDUtil.newHashSet(size);
    if(size < minpts) {
      // There can't be any clusters: fewer points than minpts means no
      // point can ever be a core point, so everything is noise.
      noise.addDBIDs(relation.getDBIDs());
      //objprog.setProcessed(noise.size(), LOG); //FIXME raises a bug
    }
    else {
      for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
        if(!processedIDs.contains(iditer)) {
          expandCluster(relation, rangeQuery, iditer, objprog, clusprog);
        }
        if(objprog != null && clusprog != null) {
          objprog.setProcessed(processedIDs.size(), LOG);
          clusprog.setProcessed(resultList.size(), LOG);
        }
        // Early exit once every object has been assigned.
        if(processedIDs.size() == size) {
          break;
        }
      }
    }
    // Finish progress logging
    if(objprog != null) {
      objprog.ensureCompleted(LOG);
    }
    if(clusprog != null) {
      clusprog.setCompleted(LOG);
    }
    Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
    for(ModifiableDBIDs res : resultList) {
      Cluster<Model> c = new Cluster<Model>(res, ClusterModel.CLUSTER);
      result.addToplevelCluster(c);
    }
    // The noise set is exported as a flagged noise cluster.
    Cluster<Model> n = new Cluster<Model>(noise, true, ClusterModel.CLUSTER);
    result.addToplevelCluster(n);
    return result;
  }

  /**
   * DBSCAN-function expandCluster.
   * <p/>
   * Border-Objects become members of the first possible cluster.
   *
   * @param relation Database relation to run on
   * @param rangeQuery Range query to use
   * @param startObjectID potential seed of a new potential cluster
   * @param objprog the progress object for logging the current status
   * @param clusprog progress object for the number of clusters (may be null)
   */
  protected void expandCluster(
      Relation<O> relation,
      RangeQuery<O, D> rangeQuery,
      DBIDRef startObjectID,
      FiniteProgress objprog,
      IndefiniteProgress clusprog
  ) {
    DistanceDBIDList<D> neighbors = rangeQuery.getRangeForDBID(startObjectID, epsilon);
    //int size = neighbors.size();
    int size = getDensity(relation, neighbors, (SubspaceLPNormDistanceFunction) getDistanceFunction());
    // startObject is no core-object
    // if(neighbors.size() < minpts) {
    if(size <= minpts) {
      noise.add(startObjectID);
      processedIDs.add(startObjectID);
      if(objprog != null && clusprog != null) {
        objprog.setProcessed(processedIDs.size(), LOG);
        clusprog.setProcessed(resultList.size(), LOG);
      }
      return;
    }
    // try to expand the cluster
    HashSetModifiableDBIDs seeds = DBIDUtil.newHashSet();
    ModifiableDBIDs currentCluster = DBIDUtil.newArray();
    // this allows to consider single objects as cluster items
    if(neighbors.size() == 0) {
      currentCluster.add(startObjectID);
    }
    for(DBIDIter seed = neighbors.iter(); seed.valid(); seed.advance()) {
      if(!processedIDs.contains(seed)) {
        currentCluster.add(seed);
        processedIDs.add(seed);
        seeds.add(seed);
      }
      else if(noise.contains(seed)) {
        // Reclaim border objects previously classified as noise.
        currentCluster.add(seed);
        noise.remove(seed);
      }
    }
    seeds.remove(startObjectID);
    while(seeds.size() > 0) {
      DBIDMIter o = seeds.iter();
      DistanceDBIDList<D> neighborhood = rangeQuery.getRangeForDBID(o, epsilon);
      o.remove();
      // BUGFIX: the original passed "neighbors" (the start object's neighbor
      // list) here, so every expansion step re-evaluated the density of the
      // same id set instead of the current seed's neighborhood. The
      // commented-out size check below shows the intended operand.
      size = getDensity(relation, neighborhood, (SubspaceLPNormDistanceFunction) getDistanceFunction());
      //if(neighborhood.size() >= minpts) {
      if(size > minpts) {
        for(DBIDIter neighbor = neighborhood.iter(); neighbor.valid(); neighbor.advance()) {
          boolean inNoise = noise.contains(neighbor);
          boolean unclassified = !processedIDs.contains(neighbor);
          if(inNoise || unclassified) {
            if(unclassified) {
              seeds.add(neighbor);
            }
            currentCluster.add(neighbor);
            processedIDs.add(neighbor);
            if(inNoise) {
              noise.remove(neighbor);
            }
          }
        }
      }
      // Everything classified and nothing left in noise: nothing more to do.
      if(processedIDs.size() == relation.size() && noise.size() == 0) {
        break;
      }
      if(objprog != null && clusprog != null) {
        objprog.setProcessed(processedIDs.size(), LOG);
        int numClusters = currentCluster.size() > minpts ? resultList.size() + 1 : resultList.size();
        clusprog.setProcessed(numClusters, LOG);
      }
    }
    // Final acceptance test for the grown cluster, again density-based.
    size = getDensity(relation, currentCluster, (SubspaceLPNormDistanceFunction) getDistanceFunction());
    //if(currentCluster.size() >= minpts) {
    if(size > minpts) {
      resultList.add(currentCluster);
    }
    else {
      noise.addDBIDs(currentCluster);
      noise.add(startObjectID);
      processedIDs.add(startObjectID);
    }
  }

  /**
   * Computes the project-specific "density" of a set of objects: for each id,
   * the product of its de-normalized feature values over the distance
   * function's selected dimensions (excluding dimensions 0 and 1, which hold
   * latitude and longitude) is accumulated, and the sum is normalized by
   * {@code cardinality ^ (#selected dimensions)}.
   *
   * <p>NOTE(review): the normalization by cardinality raised to the number of
   * dimensions looks unusual (the commented-out alternative was a plain
   * average) — confirm against the intended density model. When {@code ids}
   * is empty the result is {@code (int) Math.floor(0.0 / 0.0)} = 0.</p>
   *
   * @param relation relation whose database supplies the raw double vectors;
   *        if {@code null}, no objects are accumulated
   * @param ids the object ids whose density is measured
   * @param distanceFunction subspace distance function providing the
   *        selected-dimension mask
   * @return the floored normalized density
   */
  private int getDensity(
      Relation<O> relation,
      DBIDs ids,
      SubspaceLPNormDistanceFunction distanceFunction
  ) {
    // Clone the mask so clearing lat/lng does not mutate the function's state.
    BitSet bs = (BitSet) distanceFunction.getSelectedDimensions().clone();
    // not consider lat and lng
    bs.set(0, 2, false);
    // consider the density of the core element
    double density = 0.0;
    // then the cardinality of the set is 1
    int cardinality = 0;
    //relation contains also the startObject
    if (relation != null) {
      // BUGFIX: the vector relation lookup originally happened before this
      // null guard, so a null relation caused an NPE and the guard was dead.
      Relation<DoubleVector> vectors = relation.getDatabase().getRelation(TypeUtil.DOUBLE_VECTOR_FIELD);
      for(DBIDIter seed = ids.iter(); seed.valid(); seed.advance()) {
        double cellDensity = 1.0;
        for(int i = bs.nextSetBit(0); i >= 0; i = bs.nextSetBit(i + 1)) {
          // De-normalize the stored value back to its original range using
          // the per-dimension (min, max) boundaries.
          double dn = vectors.get(seed).doubleValue(i);
          double dmax = boundaries.get(i).getSecond();
          double dmin = boundaries.get(i).getFirst();
          double d = dn * (dmax - dmin) + dmin;
          cellDensity *= d;
        }
        density += cellDensity;
        cardinality++;
      }
    }
    return (int) Math.floor(density / (Math.pow(cardinality, bs.cardinality()))); //(int) (density/cardinality);
  }

  @Override
  public TypeInformation[] getInputTypeRestriction() {
    return TypeUtil.array(getDistanceFunction().getInputTypeRestriction());
  }

  @Override
  protected Logging getLogger() {
    return LOG;
  }

  /**
   * @return the per-dimension (min, max) boundaries used by getDensity
   */
  public ArrayList<Pair<Double, Double>> getBoundaries() {
    return boundaries;
  }

  /**
   * Sets the per-dimension (min, max) boundaries; must be called before
   * {@link #run}, since {@link #getDensity} dereferences them.
   *
   * @param boundaries per-dimension (min, max) value bounds
   */
  public void setBoundaries(ArrayList<Pair<Double, Double>> boundaries) {
    this.boundaries = boundaries;
  }

  /**
   * Parameterization class, wiring epsilon and minpts from the ELKI config.
   *
   * @param <O> object type
   * @param <D> distance type
   */
  public static class Parameterizer<O, D extends Distance<D>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> {
    protected D epsilon = null;

    protected int minpts = 0;

    @Override
    protected void makeOptions(Parameterization config) {
      super.makeOptions(config);
      DistanceParameter<D> epsilonP = new DistanceParameter<>(EPSILON_ID, distanceFunction);
      if(config.grab(epsilonP)) {
        epsilon = epsilonP.getValue();
      }
      IntParameter minptsP = new IntParameter(MINPTS_ID);
      // minpts must be a positive integer.
      minptsP.addConstraint(new GreaterConstraint(0));
      if(config.grab(minptsP)) {
        minpts = minptsP.getValue();
      }
    }

    @Override
    protected DBSCAN<O, D> makeInstance() {
      return new DBSCAN<>(distanceFunction, epsilon, minpts);
    }
  }
}