package it.unito.geosummly.clustering.subspace;
import it.unito.geosummly.utils.Pair;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.SubspaceClusteringAlgorithm;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.Subspace;
import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.data.model.SubspaceModel;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.ProxyDatabase;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.DimensionSelectingSubspaceDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceLPNormDistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.StepProgress;
import de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid;
import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
/**
* <p>
* TODO rephrase it
*
* @author Giuseppe Rizzo
*
* @apiviz.uses DBSCAN
* @apiviz.uses AbstractDimensionsSelectingDoubleDistanceFunction
* @apiviz.has SubspaceModel
*
* @param <V> the type of FeatureVector handled by this Algorithm
*/
@Title("GEOSUBCLU: Density connected Subspace Clustering on seed Geospatial Data")
@Description("Largely inspired by SUBCLU, GEOSUBCLU applies subspace clustering on a geospatial data set")
@Reference(authors = "Giuseppe Rizzo", title = "", booktitle = "")
public class GEOSUBCLU<V extends NumberVector<?>>
extends AbstractAlgorithm<Clustering<SubspaceModel<V>>>
implements SubspaceClusteringAlgorithm<SubspaceModel<V>>
{
/**
* The logger for this class.
*/
private static final Logging LOG = Logging.getLogger(GEOSUBCLU.class);
/**
* Holds the result;
*/
private ArrayList<Pair<Double, Double>> boundaries;
private Clustering<SubspaceModel<V>> result;
private Map<Integer,String> FEATUREMAPPER ;
private Map<String,Double> DELTAD ;
private Map<String,Double> EPS ;
private int DENSITY;
private double epsValue;
private StringBuilder sbLog;
/**
* Constructor.
*
* @param distanceFunction Distance function
* @param epsilon Epsilon value
* @param minpts Minpts value
*/
public GEOSUBCLU( DimensionSelectingSubspaceDistanceFunction<V, DoubleDistance> distanceFunction)
{
super();
EPS = new HashMap<>();
}
public void setFeatureMapper(HashMap<Integer, String> featuremapper) {
this.FEATUREMAPPER=featuremapper;
}
public void setDeltad(HashMap<String, Double> deltad) {
this.DELTAD=deltad;
}
public void setDensity(int density) {
this.DENSITY=density;
}
public void setEpsValue(double epsValue) {
this.epsValue=epsValue;
}
public void setSbLog(StringBuilder sb) {
this.sbLog=sb;
}
public StringBuilder getSbLog() {
return sbLog;
}
/**
* Performs the GEOSUBCLU algorithm on the given database.
*
* @param relation Relation to process
* @return Clustering result
*/
public Clustering<SubspaceModel<V>> run(Relation<V> relation)
{
final int dimensionality = RelationUtil.dimensionality(relation);
StepProgress stepprog = LOG.isVerbose() ? new StepProgress(dimensionality) : null;
// Generate all 3-dimensional clusters
if (stepprog != null) {
stepprog.beginStep(1, "Generate all 3-dimensional clusters.", LOG);
}
// mapping of dimensionality to set of subspaces
HashMap<Integer, List<Subspace>> subspaceMap = new HashMap<>();
// list of 3-dimensional subspaces containing clusters
List<Subspace> s_3 = new ArrayList<>();
subspaceMap.put(0, s_3);
// mapping of subspaces to list of clusters
TreeMap<Subspace, List<Cluster<Model>>> clusterMap = new TreeMap<>(new Subspace.DimensionComparator());
//the first two dimensions are used as seeds for each cluster generation
for (int d = 2; d < dimensionality; d++)
{
Subspace currentSubspace = new Subspace(d);
//List<Cluster<Model>> clusters = runDBSCAN(relation, null, currentSubspace);
List<Cluster<Model>> clusters = runDBSCAN(relation, null, currentSubspace, new FirstSubspaceEuclideanDistanceFunction(new BitSet()));
if (LOG.isDebuggingFiner()) {
StringBuilder msg = new StringBuilder();
msg.append('\n').append(clusters.size()).append(" clusters in subspace ").append(currentSubspace.dimensonsToString()).append(": \n");
for (Cluster<Model> cluster : clusters) {
msg.append(" " + cluster.getIDs() + "\n");
}
LOG.debugFiner(msg.toString());
}
if (!clusters.isEmpty()) {
s_3.add(currentSubspace);
clusterMap.put(currentSubspace, clusters);
}
}
// Generate (d+1)-dimensional clusters from d-dimensional clusters
for (int d = 0; d < dimensionality - 1; d++) {
if (stepprog != null) {
stepprog.beginStep(d + 2, "Generate " + (d + 2) + "-dimensional clusters from " + (d + 1) + "-dimensional clusters.", LOG);
}
List<Subspace> subspaces = subspaceMap.get(d);
if (subspaces == null || subspaces.isEmpty()) {
if (stepprog != null) {
for (int dim = d + 1; dim < dimensionality - 1; dim++) {
stepprog.beginStep(dim + 2, "Generation of" + (dim + 2) + "-dimensional clusters not applicable, because no more " + (d + 2) + "-dimensional subspaces found.", LOG);
}
}
break;
}
List<Subspace> candidates = generateSubspaceCandidates(subspaces);
List<Subspace> s_d = new ArrayList<>();
for (Subspace candidate : candidates) {
Subspace bestSubspace = bestSubspace(subspaces, candidate, clusterMap);
if (LOG.isDebuggingFine()) {
LOG.debugFine("best subspace of " + candidate.dimensonsToString() + ": " + bestSubspace.dimensonsToString());
}
List<Cluster<Model>> bestSubspaceClusters = clusterMap.get(bestSubspace);
List<Cluster<Model>> clusters = new ArrayList<>();
for (Cluster<Model> cluster : bestSubspaceClusters) {
List<Cluster<Model>> candidateClusters = runDBSCAN(relation, cluster.getIDs(), candidate, new SubspaceEuclideanDistanceFunction(new BitSet()));
if (!candidateClusters.isEmpty()) {
clusters.addAll(candidateClusters);
}
}
if (LOG.isDebuggingFine()) {
StringBuilder msg = new StringBuilder();
msg.append(clusters.size() + " cluster(s) in subspace " + candidate + ": \n");
for (Cluster<Model> c : clusters) {
msg.append(" " + c.getIDs() + "\n");
}
LOG.debugFine(msg.toString());
}
if (!clusters.isEmpty()) {
s_d.add(candidate);
clusterMap.put(candidate, clusters);
}
}
if (!s_d.isEmpty()) {
subspaceMap.put(d + 1, s_d);
}
}
result = new Clustering<>("GEOSUBCLU clustering", "geosubclu");
for (Subspace subspace : clusterMap.descendingKeySet())
{
List<Cluster<Model>> clusters = clusterMap.get(subspace);
for (Cluster<Model> cluster : clusters)
{
Cluster<SubspaceModel<V>> newCluster = new Cluster<>(cluster.getIDs());
newCluster.setModel(new SubspaceModel<>(subspace, Centroid.make(relation, cluster.getIDs()).toVector(relation)));
//newCluster.setName("geocluster_" + numClusters++);
String name = "c(";
BitSet bs = subspace.getDimensions();
int iteration = 1;
if(bs.cardinality()>1) {
for (int i = bs.nextSetBit(0); i >= 0; i = bs.nextSetBit(i+1)) {
//feature += FEATUREMAPPER.get(bs.nextSetBit(i)+1);
name+= FEATUREMAPPER.get(i);
if(iteration < bs.cardinality()) name += ",";
++iteration;
}
}
else name += FEATUREMAPPER.get(bs.nextSetBit(0));
name += ")";
newCluster.setName(name);
result.addToplevelCluster(newCluster);
sbLog.append("\n"+newCluster.getName() + " has been generated from " + subspace.toString());
}
}
return result;
}
/**
* Returns the result of the algorithm.
*
* @return the result of the algorithm
*/
public Clustering<SubspaceModel<V>> getResult() {
return result;
}
/**
* Runs the DBSCAN algorithm on the specified partition of the database in the
* given subspace. If parameter {@code ids} is null DBSCAN will be applied to
* the whole database.
*
* @param relation the database holding the objects to run DBSCAN on
* @param ids the IDs of the database defining the partition to run DBSCAN on
* - if this parameter is null DBSCAN will be applied to the whole
* database
* @param subspace the subspace to run DBSCAN on
* @param firstSubspaceEuclideanDistanceFunction
* @return the clustering result of the DBSCAN run
*/
private List<Cluster<Model>> runDBSCAN(
Relation<V> relation,
DBIDs ids,
Subspace subspace,
SubspaceLPNormDistanceFunction dist
)
{
BitSet bs = (BitSet) subspace.getDimensions().clone();
String feature= "";
int iteration = 1;
if(bs.cardinality()>1) {
for (int i = bs.nextSetBit(0); i >= 0; i = bs.nextSetBit(i+1)) {
//feature += FEATUREMAPPER.get(bs.nextSetBit(i)+1);
feature += FEATUREMAPPER.get(i);
if(iteration < bs.cardinality()) feature += " AND ";
++iteration;
}
}
else {
feature = FEATUREMAPPER.get(bs.nextSetBit(0));
}
if (!DELTAD.containsKey(feature) || DELTAD.get(feature) == 0 )
return new LinkedList<Cluster<Model>>();
int minpts = DELTAD.get(feature).intValue();
DoubleDistance epsilon = (EPS.containsKey(feature)) ?
new DoubleDistance (EPS.get(feature).doubleValue()) :
new DoubleDistance (epsValue);
sbLog.append(bs.toString() + "." + feature + ".minpts=" + minpts+"\n");
// set the 2 dimensions lat and lng as seeds
bs.set(0, 2, true);
// distance function
//distanceFunction.setSelectedDimensions(bs);
dist.setSelectedDimensions(bs);
ProxyDatabase proxy;
if (ids == null) {
// TODO: in this case, we might want to use an index - the proxy below
// will prevent this!
ids = relation.getDBIDs();
}
proxy = new ProxyDatabase(ids, relation);
DBSCAN<V, DoubleDistance> dbscan = new DBSCAN<>(dist, epsilon, minpts);
dbscan.setBoundaries(boundaries);
// run DBSCAN
if (LOG.isVerbose()) {
LOG.verbose("\nRun DBSCAN on subspace " + subspace.dimensonsToString());
}
Clustering<Model> dbsres = dbscan.run(proxy);
// separate cluster and noise
List<Cluster<Model>> clusterAndNoise = dbsres.getAllClusters();
List<Cluster<Model>> clusters = new ArrayList<>();
for (Cluster<Model> c : clusterAndNoise) {
if (!c.isNoise()) {
DBIDs objects = c.getIDs();
//if ( objects.size()>1 ) { //&& objects.size()<DENSITY ) {
clusters.add(c);
sbLog.append("\tnumber of objects ci=" + objects.size()+";ids=");
for (DBIDIter iter = c.getIDs().iter(); iter.valid(); iter.advance()) {
sbLog.append(DBIDUtil.toString(iter)+" ");
}
sbLog.append("\n");
}
}
return clusters;
}
/**
* Generates {@code d+1}-dimensional subspace candidates from the specified
* {@code d}-dimensional subspaces.
*
* @param subspaces the {@code d}-dimensional subspaces
* @return the {@code d+1}-dimensional subspace candidates
*/
private List<Subspace> generateSubspaceCandidates(List<Subspace> subspaces)
{
List<Subspace> candidates = new ArrayList<>();
if (subspaces.isEmpty()) {
return candidates;
}
// Generate (d+1)-dimensional candidate subspaces
int d = subspaces.get(0).dimensionality();
StringBuilder msgFine = new StringBuilder("\n");
if (LOG.isDebuggingFiner()) {
msgFine.append("subspaces ").append(subspaces).append('\n');
}
for (int i = 0; i < subspaces.size(); i++) {
Subspace s1 = subspaces.get(i);
for (int j = i + 1; j < subspaces.size(); j++) {
Subspace s2 = subspaces.get(j);
Subspace candidate = s1.join(s2);
if (candidate != null) {
if (LOG.isDebuggingFiner()) {
msgFine.append("candidate: ").append(candidate.dimensonsToString()).append('\n');
}
// prune irrelevant candidate subspaces
List<Subspace> lowerSubspaces = lowerSubspaces(candidate);
if (LOG.isDebuggingFiner()) {
msgFine.append("lowerSubspaces: ").append(lowerSubspaces).append('\n');
}
boolean irrelevantCandidate = false;
for (Subspace s : lowerSubspaces) {
if (!subspaces.contains(s)) {
irrelevantCandidate = true;
break;
}
}
if (!irrelevantCandidate) {
candidates.add(candidate);
}
}
}
}
if (LOG.isDebuggingFiner()) {
LOG.debugFiner(msgFine.toString());
}
if (LOG.isDebugging()) {
StringBuilder msg = new StringBuilder();
msg.append(d + 1).append("-dimensional candidate subspaces: ");
for (Subspace candidate : candidates) {
msg.append(candidate.dimensonsToString()).append(' ');
}
LOG.debug(msg.toString());
}
return candidates;
}
/**
* Returns the list of all {@code (d-1)}-dimensional subspaces of the
* specified {@code d}-dimensional subspace.
*
* @param subspace the {@code d}-dimensional subspace
* @return a list of all {@code (d-1)}-dimensional subspaces
*/
private List<Subspace> lowerSubspaces(Subspace subspace) {
int dimensionality = subspace.dimensionality();
if (dimensionality <= 1) {
return null;
}
// order result according to the dimensions
List<Subspace> result = new ArrayList<>();
BitSet dimensions = subspace.getDimensions();
for (int dim = dimensions.nextSetBit(0); dim >= 0; dim = dimensions.nextSetBit(dim + 1)) {
BitSet newDimensions = (BitSet) dimensions.clone();
newDimensions.set(dim, false);
result.add(new Subspace(newDimensions));
}
return result;
}
/**
* Determines the {@code d}-dimensional subspace of the {@code (d+1)}
* -dimensional candidate with minimal number of objects in the cluster.
*
* @param subspaces the list of {@code d}-dimensional subspaces containing
* clusters
* @param candidate the {@code (d+1)}-dimensional candidate subspace
* @param clusterMap the mapping of subspaces to clusters
* @return the {@code d}-dimensional subspace of the {@code (d+1)}
* -dimensional candidate with minimal number of objects in the
* cluster
*/
private Subspace bestSubspace(List<Subspace> subspaces, Subspace candidate, TreeMap<Subspace, List<Cluster<Model>>> clusterMap) {
Subspace bestSubspace = null;
for (Subspace subspace : subspaces) {
int min = Integer.MAX_VALUE;
if (subspace.isSubspace(candidate)) {
List<Cluster<Model>> clusters = clusterMap.get(subspace);
for (Cluster<Model> cluster : clusters) {
int clusterSize = cluster.size();
if (clusterSize < min) {
min = clusterSize;
bestSubspace = subspace;
}
}
}
}
return bestSubspace;
}
@Override
public TypeInformation[] getInputTypeRestriction() {
return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
}
@Override
protected Logging getLogger() {
return LOG;
}
public ArrayList<Pair<Double, Double>> getBoundaries() {
return boundaries;
}
public void setBoundaries(ArrayList<Pair<Double, Double>> boundaries) {
this.boundaries = boundaries;
}
/**
- * Parameterization class.
- *
- * @author Erich Schubert
- *
- * @apiviz.exclude
- */
public static class Parameterizer<V extends NumberVector<?>> extends AbstractParameterizer {
protected int minpts = 0;
protected DoubleDistance epsilon = null;
protected DimensionSelectingSubspaceDistanceFunction<V, DoubleDistance> distance = null;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
ObjectParameter<DimensionSelectingSubspaceDistanceFunction<V, DoubleDistance>> param =
new ObjectParameter<>(new OptionID("geosubclu.distancefunction", "Distance function to determine the distance between database objects."),
DimensionSelectingSubspaceDistanceFunction.class,
SubspaceEuclideanDistanceFunction.class);
if (config.grab(param)) {
distance = param.instantiateClass(config);
}
}
@Override
protected GEOSUBCLU<V> makeInstance() {
return new GEOSUBCLU<>(distance);
}
}
}