package com.rapidminer.operator.learner.clustering.clusterer;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Vector;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.operator.IOContainer;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.OperatorCreationException;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.learner.clustering.ClusterModel;
import com.rapidminer.operator.learner.clustering.DefaultCluster;
import com.rapidminer.operator.learner.clustering.FlatClusterModel;
import com.rapidminer.operator.learner.clustering.FlatCrispClusterModel;
import com.rapidminer.operator.learner.clustering.IdUtils;
import com.rapidminer.operator.uncertain.AbstractPDFSampler;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.tools.OperatorService;
/**
 * Aggregates several clusterings into one. The strategy of this operator is
 * based on correlation clustering.
 *
 * <p>
 * The input example set is handed to an internal PDF sampler; the sampled
 * example sets are distributed over a configurable number of
 * {@link ClusteringThread}s, each of which runs its own
 * {@link DBScanClustering}. For every pair of examples the fraction of
 * resulting cluster models that put both examples into the same cluster is
 * computed; pairs whose fraction exceeds 0.5 end up in the same cluster of
 * the aggregated model.
 *
 * @author Peter B. Volk
 * @see com.rapidminer.operator.learner.clustering.clusterer.DBScanClustering
 * @see com.rapidminer.operator.learner.clustering.clusterer.uncertain.DBScanEAClustering
 * @see com.rapidminer.operator.learner.clustering.clusterer.uncertain.FDBScanClustering
 */
public class ClusteringAggregationWithUnvertainSampledElements extends
        AbstractFlatClusterer {

    /** Cluster models produced by the worker threads; input of the aggregation. */
    private Vector<FlatCrispClusterModel> clusteringModels;

    /** Internal operator that samples the uncertain input; created lazily. */
    private AbstractPDFSampler sampler;

    /** Raw outputs of all worker threads, in join order. */
    private IOContainer results;

    /** Internal clustering operator, only used to expose its parameter types. */
    private AbstractClustering clust;

    /**
     * Parameter key for the number of worker threads. The spelling is part
     * of the key and is kept as-is so that existing parameter settings keep
     * resolving.
     */
    private static final String NUM_THREADS = "Numer of cuncurrent threads";

    /**
     * Returns the parameters of the super class, of the internal clustering
     * operator, of the internal sampler, and the thread-count parameter.
     * Internal operators that could not be created contribute no parameters.
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        if (sampler == null || clust == null) {
            loadInternalOperators();
        }
        List<ParameterType> params = super.getParameterTypes();
        if (clust != null) {
            params.addAll(clust.getParameterTypes());
        }
        if (sampler != null) {
            params.addAll(sampler.getParameterTypes());
        }
        ParameterType param = new ParameterTypeInt(NUM_THREADS,
                "Specifies how many threads may run concurrently.", 1, 300, 1);
        params.add(param);
        return params;
    }

    public ClusteringAggregationWithUnvertainSampledElements(
            OperatorDescription description) {
        super(description);
    }

    /**
     * Creates the internal sampler and clustering operators that are still
     * missing. A creation failure is reported on stderr and leaves the
     * corresponding field {@code null}; callers must check for that.
     */
    private void loadInternalOperators() {
        try {
            if (sampler == null) {
                sampler = (AbstractPDFSampler) OperatorService
                        .createOperator("PDFSampling");
            }
            if (clust == null) {
                clust = (AbstractClustering) OperatorService
                        .createOperator("DBScanClustering");
            }
        } catch (OperatorCreationException e) {
            // Best effort: the callers handle a missing operator explicitly.
            e.printStackTrace();
        }
    }

    /**
     * Samples the given example set, clusters the samples in parallel and
     * aggregates the resulting clusterings into a single flat cluster model.
     *
     * @param es the example set to cluster; must not be {@code null}
     * @return the aggregated cluster model (empty if no clustering was
     *         produced)
     * @throws OperatorException if {@code es} is {@code null}, if the
     *         internal sampler cannot be created, or if an internal operator
     *         fails
     */
    public ClusterModel createClusterModel(ExampleSet es)
            throws OperatorException {
        if (es == null) {
            throw new OperatorException("Example set may not be null"
                    + Thread.currentThread().getName());
        }
        if (sampler == null) {
            loadInternalOperators();
        }
        if (sampler == null) {
            throw new OperatorException(
                    "The internal PDF sampling operator could not be created");
        }

        // initialize all components
        sampler.setParameters(getParameters());
        System.err.println("Starting sampling");
        IOContainer io = sampler.apply(new IOContainer(es));
        System.err.println("Completed sampling");

        // The parameter is declared with a minimum of 1; the guard only
        // protects the division in createWorkers against a broken setting.
        int numThreads = Math.max(1, getParameterAsInt(NUM_THREADS));
        List<ClusteringThread> workers = createWorkers(io.getIOObjects(), numThreads);
        runWorkers(workers);
        System.err.println("completed clustering");

        return aggregate(es);
    }

    /**
     * Splits the sampled example sets into contiguous chunks and creates one
     * clustering thread per non-empty chunk. Every thread but the last gets
     * {@code sampled.length / numThreads} example sets; the last one gets
     * all that remain.
     */
    private List<ClusteringThread> createWorkers(IOObject[] sampled, int numThreads)
            throws OperatorException {
        List<ClusteringThread> workers = new LinkedList<ClusteringThread>();
        int perThread = sampled.length / numThreads;
        int next = 0;
        System.err.println("Distributing "+sampled.length+" of es to "+numThreads+" threads");
        for (int i = 0; i < numThreads; i++) {
            // create the workload for the thread
            int share = (i != numThreads - 1)
                    ? perThread
                    : sampled.length - perThread * i;
            ExampleSet[] workload = new ExampleSet[share];
            for (int k = 0; k < share; k++, next++) {
                System.err.println("Thread "+i+" getting "+next);
                workload[k] = (ExampleSet) sampled[next];
            }
            // create the thread
            if (share > 0) {
                DBScanClustering clusterer = new DBScanClustering(
                        OperatorService.getOperatorDescriptions(DBScanClustering.class)[0]);
                clusterer.setParameters(getParameters());
                workers.add(new ClusteringThread(workload, clusterer));
            }
        }
        return workers;
    }

    /**
     * Starts all workers, waits for them and collects their output: the raw
     * objects go into {@link #results}, every {@link FlatCrispClusterModel}
     * among them additionally into {@link #clusteringModels}.
     *
     * <p>
     * If the calling thread is interrupted while waiting, the interrupt flag
     * is restored and waiting stops; only the models collected so far are
     * aggregated.
     */
    private void runWorkers(List<ClusteringThread> workers) throws OperatorException {
        results = new IOContainer();
        clusteringModels = new Vector<FlatCrispClusterModel>();

        System.err.println("Starting all threads");
        for (ClusteringThread ct : workers) {
            ct.start();
        }
        System.err.println("all Clusteringthreads started.");

        // block until everyone is done
        for (ClusteringThread ct : workers) {
            try {
                ct.join();
                System.err.println("Joining");
                IOObject[] produced = ct.getResult().getIOObjects();
                results = results.append(produced);
                for (IOObject object : produced) {
                    if (object instanceof FlatCrispClusterModel) {
                        clusteringModels.add((FlatCrispClusterModel) object);
                    }
                }
            } catch (InterruptedException e) {
                logError("Error: "+e.getMessage());
                // Restore the flag so that callers can see the interruption;
                // any further join() would fail immediately anyway.
                Thread.currentThread().interrupt();
                break;
            }
        }
    }

    /**
     * Builds the aggregated model from {@link #clusteringModels}. All ordered
     * pairs (u, v) of examples are visited, including u == v, so an example
     * that agrees with no other example still forms a cluster of its own.
     */
    private FlatCrispClusterModel aggregate(ExampleSet es) throws OperatorException {
        FlatCrispClusterModel result = new FlatCrispClusterModel();
        int modelCount = clusteringModels.size();
        if (modelCount == 0) {
            // Nothing to vote with; avoids a 0/0 edge weight below.
            return result;
        }

        // Names of new clusters only ever count upwards so that they stay
        // unique even after two clusters have been merged.
        int nextClusterName = 0;

        // iterate over all pair combinations of the objects u, v
        for (Example u : es) {
            String uid = IdUtils.getIdFromExample(u);
            for (Example v : es) {
                String vid = IdUtils.getIdFromExample(v);

                // fraction of clusterings that put u and v into the same
                // cluster
                int agreeing = 0;
                for (FlatCrispClusterModel model : clusteringModels) {
                    if (getClusterId(uid, model) == getClusterId(vid, model)) {
                        agreeing++;
                    }
                }
                double edge = (double) agreeing / modelCount;
                if (!(edge > 0.5)) {
                    continue;
                }

                // Positions are looked up by membership, not by cluster name:
                // after a merge the names no longer match the list positions.
                int uIndex = findClusterIndex(uid, result);
                int vIndex = findClusterIndex(vid, result);

                if (uIndex < 0 && vIndex < 0) {
                    // neither is assigned yet: put both into a new cluster
                    result.addCluster(new DefaultCluster(String.valueOf(nextClusterName++)));
                    DefaultCluster fresh = (DefaultCluster) result
                            .getClusterAt(result.getNumberOfClusters() - 1);
                    fresh.addObject(uid);
                    boolean sameObject = (uid == null) ? (vid == null) : uid.equals(vid);
                    if (!sameObject) {
                        fresh.addObject(vid);
                    }
                } else if (uIndex < 0) {
                    // only v is assigned: add u to the cluster of v
                    ((DefaultCluster) result.getClusterAt(vIndex)).addObject(uid);
                } else if (vIndex < 0) {
                    // only u is assigned: add v to the cluster of u
                    ((DefaultCluster) result.getClusterAt(uIndex)).addObject(vid);
                } else if (uIndex != vIndex) {
                    // both are assigned to different clusters: merge them
                    DefaultCluster uCluster = (DefaultCluster) result.getClusterAt(uIndex);
                    DefaultCluster vCluster = (DefaultCluster) result.getClusterAt(vIndex);
                    uCluster.addAll(vCluster);
                    result.removeCluster(vCluster);
                }
            }
        }
        return result;
    }

    /**
     * Returns the numeric id of the cluster that contains the given object in
     * one of the input clusterings, or 0 if it cannot be determined (object
     * not found, non-numeric cluster id, ...).
     *
     * NOTE(review): the fallback makes unassigned objects indistinguishable
     * from members of a cluster with id 0; an original comment states that
     * noise is deliberately treated as a cluster -- confirm that id 0 is the
     * noise cluster of the clusterer in use.
     */
    private int getClusterId(String id, FlatCrispClusterModel cm) {
        try{
            return Integer.valueOf(cm.getClusterById(id).getId());
        }catch(Exception e){
            return 0;
        }
    }

    /**
     * Returns the position of the first cluster of the model that contains
     * the given object id, or -1 if no cluster contains it.
     */
    private int findClusterIndex(String id, FlatClusterModel cm) {
        for (int i = 0; i < cm.getNumberOfClusters(); i++) {
            if (cm.getClusterAt(i).contains(id)) {
                return i;
            }
        }
        return -1;
    }

    /** Tells whether any cluster of the model contains the given object id. */
    private boolean containsId(String id, FlatClusterModel cm) {
        return findClusterIndex(id, cm) >= 0;
    }

    /** This operator consumes a single example set. */
    public Class[] getInputClasses() {
        return new Class[] { ExampleSet.class };
    }
}