package it.unito.geosummly;
import it.unito.geosummly.clustering.subspace.AGNES;
import it.unito.geosummly.io.CSVDataIO;
import it.unito.geosummly.io.GeoJSONWriter;
import it.unito.geosummly.io.GeoTurtleWriter;
import it.unito.geosummly.io.LogDataIO;
import it.unito.geosummly.tools.ClusteringTools;
import it.unito.geosummly.utils.Pair;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashMap;
import java.util.List;
import java.util.TreeSet;
import java.util.Vector;
import java.util.logging.Logger;
import org.apache.commons.csv.CSVRecord;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.result.ResultUtil;
/**
 * Runs the GEOSUBCLU clustering pipeline (optionally refined with AGNES) over the
 * grid-cell feature matrices produced by the sampling/transformation steps, and
 * serializes the resulting clusters to log, GeoJSON and Turtle outputs.
 */
public class ClusteringOperator {

    // Kept public static for backward compatibility with any external readers of this field.
    public static Logger logger = Logger.getLogger(ClusteringOperator.class.getName());

    /**
     * Executes the full clustering pipeline: reads the density/normalized/deltad/singles
     * CSV files, runs GEOSUBCLU (and AGNES refinement), computes per-cluster statistics
     * and writes the log, GeoJSON and Turtle outputs.
     *
     * @param coord     bounding box coordinates in the order [north, east, south, west]
     * @param inDens    path of the density CSV file
     * @param inNorm    path of the normalized-values CSV file
     * @param inDeltad  path of the deltad CSV file
     * @param inSingles path of the single-venues CSV file
     * @param out       output directory/prefix for the serialized results
     * @param eps       GEOSUBCLU eps; values &lt;= 0 mean "estimate it from the data"
     * @param method    clustering method name (currently unused; kept for API compatibility)
     * @throws IOException if any of the CSV files cannot be read or an output cannot be written
     */
    public void execute(
        ArrayList<Double> coord,
        String inDens,
        String inNorm,
        String inDeltad,
        String inSingles,
        String out,
        double eps,
        String method
    )
        throws IOException
    {
        // Read all the input CSV files.
        CSVDataIO dataIO = new CSVDataIO();
        List<CSVRecord> listDens = dataIO.readCSVFile(inDens);
        List<CSVRecord> listNorm = dataIO.readCSVFile(inNorm);
        List<CSVRecord> listDeltad = dataIO.readCSVFile(inDeltad);
        List<CSVRecord> listSingles = dataIO.readCSVFile(inSingles);

        // Bounding box of the analyzed area: coord = [north, east, south, west].
        double north = coord.get(0);
        double east = coord.get(1);
        double south = coord.get(2);
        double west = coord.get(3);
        BoundingBox bbox = new BoundingBox(north, east, south, west);
        double area = bbox.getArea();

        // Number of grid cells; -1 because of the CSV header row.
        int cellNum = listDens.size() - 1;

        ClusteringTools tools = new ClusteringTools();

        // Matrix of normalized values; the timestamp column is not included.
        ArrayList<ArrayList<Double>> normMatrix = tools.buildNormalizedFromCSV(listNorm);
        ArrayList<Pair<Double, Double>> boundaries = tools.getFeatureBoundariesFromCSV(listDens);

        // Build the ELKI database from the normalized matrix.
        Database db = tools.buildDatabaseFromMatrix(normMatrix);

        // Feature map filled only with single features.
        HashMap<Integer, String> featuresMap = tools.getFeaturesMapFromDeltad(listDeltad);

        // Deltad values >= 0 whose feature appears in the features map.
        HashMap<String, Double> deltadMap = tools.getValuesMapFromDeltad(listDeltad);

        // Timestamp of the analyzed data, used by the serializers.
        Calendar cal = tools.getCalendar(listNorm);

        // Minimum density threshold: 90% of the cells.
        int density = (int) (normMatrix.size() * 0.9);

        // eps <= 0 means no eps was specified on the CLI (or it was negative):
        // estimate a suitable value from the normalized matrix.
        if (eps <= 0.0) {
            eps = tools.getEps(normMatrix);
        }
        logger.info("Eps: " + eps);

        // Run the GEOSUBCLU algorithm and collect the clustering results.
        Clustering<?> result = tools
            .runGEOSUBCLU(db, boundaries, featuresMap, deltadMap, density, eps, new StringBuilder());
        ArrayList<Clustering<?>> cs = ResultUtil.filterResults(result, Clustering.class);

        // Refine every clustering with AGNES (agglomerative merging).
        ArrayList<Clustering<?>> csAgnes = new ArrayList<>();
        for (Clustering<?> c : cs) {
            // NOTE(review): merge threshold heuristically set to 5% of the cluster
            // count — how to set this parameter is still an open question.
            int threshold = (int) (c.getAllClusters().size() * 0.05);
            AGNES agnes = new AGNES(c, threshold);
            csAgnes.add(agnes.run());
        }
        cs = csAgnes;

        // Per-cluster data, keyed by a progressive cluster index.
        HashMap<Integer, String> clustersName = new HashMap<>(); // index -> cluster name
        HashMap<Integer, ArrayList<ArrayList<Double>>> cellsOfCluster = new HashMap<>(); // index -> cell_id + lat + lng
        HashMap<Integer, ArrayList<ArrayList<String>>> venuesOfCell = new HashMap<>(); // cell_id -> venue records
        HashMap<Integer, Double> cDistance = new HashMap<>();
        HashMap<Integer, Double> cSSE = new HashMap<>();
        HashMap<Integer, Double> cSurface = new HashMap<>();
        HashMap<Integer, Double> cHeterogeneity = new HashMap<>();
        HashMap<Integer, Double> cDensity = new HashMap<>();

        int count = 0;
        for (Clustering<?> c : cs) {
            for (Cluster<?> cluster : c.getAllClusters()) {
                count++;
                logger.info("Processing cluster #" + count); // progress: running cluster count
                int index = clustersName.size(); // progressive 0-based index
                clustersName.put(index, cluster.getName());
                // All the cells (cell_id, lat, lng) belonging to the selected cluster.
                cellsOfCluster = tools.putCompleteCellsOfCluster(cellsOfCluster, cluster, index, listDens);
                ArrayList<ArrayList<Double>> cells = cellsOfCluster.get(index);
                venuesOfCell = tools.putVenuesOfCells(cluster.getName(), index, venuesOfCell, cells, listSingles);
                cDistance.put(index, tools.getDistance(db, cluster, featuresMap, listDens));
                cSSE.put(index, tools.getClusterSSE(db, cluster, featuresMap));
                cSurface.put(index, tools.getClusterSurface(cluster, normMatrix.size()));
                cHeterogeneity.put(index, tools.getClusterHeterogeneity(cluster, featuresMap));
                // NOTE(review): venuesOfCell.size() is the size of the whole map, not the
                // number of venues of THIS cluster — confirm this is the intended count.
                cDensity.put(index, tools.getClusterDensity(venuesOfCell.size(), area, cSurface.get(index)));
            }
        }

        // Overall clustering SSE.
        double sse = tools.getClusteringSSE(db, cs, featuresMap);

        // Serialize the clustering log.
        LogDataIO lWriter = new LogDataIO();
        StringBuilder sb = tools.getLog();
        lWriter.writeClusteringLog(sb, eps, sse, north, east, south, west, cellNum, out);

        // Serialize the clustering output to GeoJSON and Turtle files.
        GeoJSONWriter jWriter = new GeoJSONWriter();
        jWriter.writeStream(bbox,
                clustersName,
                cellsOfCluster,
                venuesOfCell,
                cDistance,
                cSSE,
                cSurface,
                cHeterogeneity,
                cDensity,
                eps,
                out,
                cal);
        GeoTurtleWriter tWriter = new GeoTurtleWriter();
        tWriter.writeStream(bbox,
                clustersName,
                cellsOfCluster,
                venuesOfCell,
                cDistance,
                cSSE,
                cSurface,
                cHeterogeneity,
                cDensity,
                eps,
                out,
                cal);
    }

    /**
     * Runs GEOSUBCLU on an already-normalized matrix and builds the holdout map used
     * by the validation step (cluster label -> cell ids).
     *
     * @param inDens     path of the density CSV file (used for feature boundaries)
     * @param normalized normalized feature matrix (timestamp column excluded downstream)
     * @param length     number of cells of the original dataset, used to size the holdout
     * @param labels     feature labels
     * @param minpts     per-feature minpts values, parallel to {@code labels}
     * @param eps        GEOSUBCLU eps value
     * @return map from cluster label to the cell ids assigned to that cluster
     * @throws IOException if the density CSV file cannot be read
     */
    public HashMap<String, Vector<Integer>> executeForValidation(
        String inDens,
        ArrayList<ArrayList<Double>> normalized,
        int length, ArrayList<String> labels,
        ArrayList<String> minpts,
        double eps
    )
        throws IOException
    {
        ClusteringTools tools = new ClusteringTools();

        // Build the database from the normalized matrix without timestamp values.
        ArrayList<ArrayList<Double>> normMatrix = tools.buildNormalizedFromList(normalized);
        Database db = tools.buildDatabaseFromMatrix(normMatrix);

        // Feature map filled only with single features.
        HashMap<Integer, String> featuresMap = tools.getFeaturesMap(labels);

        // Deltad map from the given labels/minpts.
        HashMap<String, Double> deltadMap = tools.getDeltadMap(labels, minpts);

        // Minimum density threshold: 90% of the cells.
        int density = (int) (normMatrix.size() * 0.9);

        CSVDataIO dataIO = new CSVDataIO();
        List<CSVRecord> listDens = dataIO.readCSVFile(inDens);
        ArrayList<Pair<Double, Double>> boundaries = tools.getFeatureBoundariesFromCSV(listDens);

        // Run the GEOSUBCLU algorithm and collect the clustering results.
        Clustering<?> result = tools.runGEOSUBCLU(db, boundaries, featuresMap, deltadMap, density, eps, new StringBuilder());
        ArrayList<Clustering<?>> cs = ResultUtil.filterResults(result, Clustering.class);

        HashMap<Integer, String> clustersName = new HashMap<>(); // index -> cluster name
        HashMap<Integer, ArrayList<Integer>> cellsOfCluster = new HashMap<>(); // index -> cell_ids
        for (Clustering<?> c : cs) {
            for (Cluster<?> cluster : c.getAllClusters()) {
                int index = clustersName.size(); // progressive 0-based index
                clustersName.put(index, cluster.getName());
                // All the cell_ids for the selected cluster.
                cellsOfCluster = tools.putIdCellsOfCluster(cellsOfCluster, cluster, index);
            }
        }

        // Distinct cluster labels.
        TreeSet<String> distinctLabels = tools.getClusterLabels(clustersName);
        // Associate cells to clusters.
        ArrayList<TreeSet<Integer>> allCells = tools.getCellsOfClusters(clustersName, cellsOfCluster, distinctLabels);
        // Build and return the holdout map.
        return tools.buildHoldoutMap(distinctLabels, allCells, length);
    }

    /**
     * Runs GEOSUBCLU on an already-normalized matrix (timestamp column already removed)
     * and returns the clustering SSE, used as a correctness measure.
     *
     * @param inDens     path of the density CSV file (used for feature boundaries)
     * @param normalized normalized feature matrix without the timestamp column
     * @param labels     feature labels
     * @param minpts     per-feature minpts values, parallel to {@code labels}
     * @param eps        GEOSUBCLU eps value
     * @return the sum of squared errors of the resulting clustering
     * @throws IOException if the density CSV file cannot be read
     */
    public double executeForCorrectness(
        String inDens,
        ArrayList<ArrayList<Double>> normalized,
        ArrayList<String> labels,
        ArrayList<String> minpts,
        double eps
    )
        throws IOException
    {
        ClusteringTools tools = new ClusteringTools();

        // Build the database from the normalized matrix
        // (the timestamp column is not present in "normalized").
        Database db = tools.buildDatabaseFromMatrix(normalized);

        // Feature map filled only with single features.
        HashMap<Integer, String> featuresMap = tools.getFeaturesMap(labels);

        // Deltad map from the given labels/minpts.
        HashMap<String, Double> deltadMap = tools.getDeltadMap(labels, minpts);

        // Minimum density threshold: 90% of the cells.
        int density = (int) (normalized.size() * 0.9);

        CSVDataIO dataIO = new CSVDataIO();
        List<CSVRecord> listDens = dataIO.readCSVFile(inDens);
        ArrayList<Pair<Double, Double>> boundaries = tools.getFeatureBoundariesFromCSV(listDens);

        // Run the GEOSUBCLU algorithm and collect the clustering results.
        Clustering<?> result = tools.runGEOSUBCLU(db, boundaries, featuresMap, deltadMap, density, eps, new StringBuilder());
        ArrayList<Clustering<?>> cs = ResultUtil.filterResults(result, Clustering.class);

        // Return the clustering SSE.
        return tools.getClusteringSSE(db, cs, featuresMap);
    }
}