package it.unito.geosummly.tools;

import it.unito.geosummly.clustering.subspace.GEOSUBCLU;
import it.unito.geosummly.clustering.subspace.InMemoryDatabase;
import it.unito.geosummly.utils.Pair;

import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeSet;
import java.util.Vector;

import org.apache.commons.csv.CSVRecord;

import de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.SUBCLU;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.DoubleVector;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.SubspaceModel;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.datasource.ArrayAdapterDatabaseConnection;
import de.lmu.ifi.dbs.elki.datasource.filter.FixedDBIDsFilter;
import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;

public class ClusteringTools {

    private StringBuilder log;

    /**
     * Constructor method
     */
    public ClusteringTools() {
        log = new StringBuilder();
    }

    /**
     * Set the log StringBuilder
     */
    public void setLog(StringBuilder log) {
        this.log = log;
    }

    /**
     * Get the log StringBuilder
     */
    public StringBuilder getLog() {
        return log;
    }

    /**
     * Fill in the matrix of normalized values from a list of CSV records.
     * The header and the column of timestamp values are not considered.
     */
    public ArrayList<ArrayList<Double>> buildNormalizedFromCSV(List<CSVRecord> list) {
        ArrayList<ArrayList<Double>> matrix = new ArrayList<ArrayList<Double>>();
        for(CSVRecord r : list) {
            //exclude the header
            if(!r.get(0).contains("Timestamp")) {
                ArrayList<Double> record = new ArrayList<Double>();
                //timestamp values are not considered, so i starts from 1
                for(int i = 1; i < r.size(); i++)
                    record.add(Double.parseDouble(r.get(i)));
                matrix.add(record);
            }
        }
        return matrix;
    }

    /**
     * Fill in the matrix of normalized values from a list of lists of double values.
     * The column of timestamp values is not considered.
     */
    public ArrayList<ArrayList<Double>> buildNormalizedFromList(ArrayList<ArrayList<Double>> inputMatrix) {
        ArrayList<ArrayList<Double>> matrix = new ArrayList<ArrayList<Double>>();
        ArrayList<Double> record;
        for(ArrayList<Double> array : inputMatrix) {
            record = new ArrayList<Double>();
            for(int j = 0; j < array.size(); j++) {
                record.add(array.get(j));
            }
            matrix.add(record);
        }
        return matrix;
    }

    /**
     * Fill in the feature hashmap from a list of CSV records.
     * Only single features are considered; feature combinations (i.e. with 'AND') are excluded.
     */
    public HashMap<Integer, String> getFeaturesMapFromDeltad(List<CSVRecord> list) {
        HashMap<Integer, String> features = new HashMap<Integer, String>();
        for(CSVRecord r : list) {
            //keep only the feature name
            String f = r.get(0).replace("deltad", "").replaceAll("\\(", "").replaceAll("\\)", "");
            if(!f.contains("AND")) {
                int mSize = features.size();
                features.put(mSize + 2, f); //keys start from 2
            }
        }
        return features;
    }
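    /*
     * Illustrative usage sketch, not part of the original class: loading the input
     * CSV with Apache Commons CSV and feeding it to the builders above. The file
     * names and the CSV format are assumptions made for the example.
     *
     *   CSVParser parser = new CSVParser(new FileReader("density.csv"), CSVFormat.DEFAULT);
     *   List<CSVRecord> densityRecords = parser.getRecords();
     *   parser.close();
     *
     *   ClusteringTools tools = new ClusteringTools();
     *   ArrayList<ArrayList<Double>> normalized = tools.buildNormalizedFromCSV(densityRecords);
     *   //the features map is built from the deltad CSV, loaded the same way
     *   HashMap<Integer, String> features = tools.getFeaturesMapFromDeltad(deltadRecords);
     */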
    /**
     * Fill in the feature hashmap from a list of labels.
     * Only single features are considered; feature combinations (i.e. with 'AND') are excluded.
     */
    public HashMap<Integer, String> getFeaturesMap(ArrayList<String> labels) {
        HashMap<Integer, String> features = new HashMap<Integer, String>();
        for(String s : labels) {
            if(!s.contains("AND")) {
                int mSize = features.size();
                features.put(mSize + 2, s); //keys start from 2
            }
        }
        return features;
    }

    /**
     * Fill in the deltad hashmap from a list of CSV records.
     * Only values whose floor is greater than or equal to 0 and whose feature has not
     * been excluded are considered.
     */
    public HashMap<String, Double> getValuesMapFromDeltad(List<CSVRecord> list) {
        HashMap<String, Double> map = new HashMap<String, Double>();
        ArrayList<String> toExclude = new ArrayList<String>(); //list of excluded features
        boolean excluded = false;
        boolean isFound = false;
        for(CSVRecord r : list) {
            //keep only the feature name
            String feature = r.get(0).replace("deltad", "").replaceAll("\\(", "").replaceAll("\\)", "");
            excluded = false;
            isFound = false;
            for(int i = 0; i < toExclude.size() && !isFound; i++) {
                if(feature.contains(toExclude.get(i))) {
                    toExclude.add(feature);
                    excluded = true;
                    isFound = true;
                }
            }
            double d = Math.floor(Double.parseDouble(r.get(1))); //floor of the deltad value
            if(d >= 0 && !excluded) {
                map.put(feature, d);
            }
            else if(!excluded)
                toExclude.add(feature);
        }
        return map;
    }

    /**
     * Fill in the deltad hashmap from a list of minpts values taken as strings.
     */
    public HashMap<String, Double> getDeltadMap(ArrayList<String> labels, ArrayList<String> minpts) {
        HashMap<String, Double> map = new HashMap<String, Double>();
        for(int i = 0; i < labels.size(); i++)
            map.put(labels.get(i), Double.parseDouble(minpts.get(i)));
        return map;
    }

    /**
     * Fill in the cells hashmap with the cells of a cluster.
     * Each entry of the map is a couple: key, list_of_triple (cell_id, lat, lng).
     */
    public HashMap<Integer, ArrayList<ArrayList<Double>>> putCompleteCellsOfCluster(
            HashMap<Integer, ArrayList<ArrayList<Double>>> cellsOfCluster,
            Cluster<?> cluster,
            int index,
            List<CSVRecord> listDens) {
        int cellId = 0;
        String cellLat = "";
        String cellLng = "";
        ArrayList<Double> cellRecord;
        ArrayList<ArrayList<Double>> cells = new ArrayList<ArrayList<Double>>();
        for(DBIDIter iter = cluster.getIDs().iter(); iter.valid(); iter.advance()) {
            cellId = Integer.parseInt(DBIDUtil.toString(iter));
            cellLat = listDens.get(cellId).get(1); //latitude
            cellLng = listDens.get(cellId).get(2); //longitude
            cellRecord = new ArrayList<Double>();
            cellRecord.add((double) cellId);
            cellRecord.add(Double.parseDouble(cellLat));
            cellRecord.add(Double.parseDouble(cellLng));
            cells.add(cellRecord); //add a triple: id, lat, lng
        }
        cellsOfCluster.put(index, cells);
        return cellsOfCluster;
    }

    /**
     * Fill in the cells hashmap with the cell_ids of a cluster.
     * Each entry of the map is a couple: key, list of cell_ids.
     */
    public HashMap<Integer, ArrayList<Integer>> putIdCellsOfCluster(
            HashMap<Integer, ArrayList<Integer>> cellsOfCluster,
            Cluster<?> cluster,
            int index) {
        int cellId = 0;
        ArrayList<Integer> cells = new ArrayList<Integer>();
        for(DBIDIter iter = cluster.getIDs().iter(); iter.valid(); iter.advance()) {
            cellId = Integer.parseInt(DBIDUtil.toString(iter));
            cells.add(cellId); //add the cell id
        }
        cellsOfCluster.put(index, cells);
        return cellsOfCluster;
    }
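    /*
     * Illustrative usage sketch, an assumption rather than original code: filling the
     * two cells maps from the clusters of a Clustering result, one entry per cluster.
     * Here clustering and listDens are assumed to come from a previous run and from
     * the density CSV respectively.
     *
     *   ClusteringTools tools = new ClusteringTools();
     *   HashMap<Integer, ArrayList<ArrayList<Double>>> complete = new HashMap<>();
     *   HashMap<Integer, ArrayList<Integer>> ids = new HashMap<>();
     *   int index = 0;
     *   for(Cluster<?> cluster : clustering.getAllClusters()) {
     *       complete = tools.putCompleteCellsOfCluster(complete, cluster, index, listDens);
     *       ids = tools.putIdCellsOfCluster(ids, cluster, index);
     *       index++;
     *   }
     */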
    /**
     * Fill in the venues hashmap with the venues of the cells of a cluster.
     * Each entry of the map is a couple: key=clusterId, list of lists of venue_info.
     * Only venues with the same label as the cluster are included.
     */
    public HashMap<Integer, ArrayList<ArrayList<String>>> putVenuesOfCells(
            String clusterName,
            int index,
            HashMap<Integer, ArrayList<ArrayList<String>>> venuesOfCell,
            ArrayList<ArrayList<Double>> cells,
            List<CSVRecord> listSingles) {
        double cellLat = 0.0;
        double cellLng = 0.0;
        ArrayList<ArrayList<String>> venuesInfo = new ArrayList<ArrayList<String>>();
        ArrayList<String> venueRecord;

        //clean the cluster name: keep only the category names
        String str = clusterName.substring(2, clusterName.length() - 1);
        String[] str_array = str.split(","); //all labels of the cluster

        //get the header of singles in order to get the correct venue category name
        ArrayList<String> features = new ArrayList<String>();
        for(int i = 7; i < listSingles.get(0).size(); i++)
            features.add(listSingles.get(0).get(i));

        for(ArrayList<Double> array : cells) {
            cellLat = array.get(1);
            cellLng = array.get(2);
            //go through the venue dataset; i starts from 1 to skip the header
            for(int i = 1; i < listSingles.size(); i++) {
                //check whether the venue belongs to the cell
                CSVRecord r = listSingles.get(i);
                double f_lat = Double.parseDouble(r.get(5));
                double f_lng = Double.parseDouble(r.get(6));
                if(f_lat == cellLat && f_lng == cellLng) {
                    venueRecord = getVenueRecord(str_array, features, r);
                    if(venueRecord.size() > 0) {
                        venuesInfo.add(venueRecord);
                    }
                }
            }
        }

        //add the entry only if at least one venue exists in the cells
        if(venuesInfo.size() > 0)
            venuesOfCell.put(index, venuesInfo);

        return venuesOfCell;
    }

    /**
     * Get the information of a venue record.
     * This method is used by putVenuesOfCells.
     */
    public ArrayList<String> getVenueRecord(String[] str_array, ArrayList<String> features, CSVRecord r) {
        ArrayList<String> venueRecord = new ArrayList<String>();
        String venueLabel = checkVenueLabel(r, str_array, features);
        if(venueLabel != null) {
            venueRecord.add(r.get(0)); //timestamp
            venueRecord.add(r.get(1)); //beenHere
            venueRecord.add(r.get(2)); //venue id
            venueRecord.add(r.get(3)); //venue lat
            venueRecord.add(r.get(4)); //venue lng
            venueRecord.add(r.get(5)); //focal lat
            venueRecord.add(r.get(6)); //focal lng
            venueRecord.add(venueLabel); //category of the venue
        }
        return venueRecord;
    }
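    /*
     * Worked example of the cluster-name cleaning used above (the label values are
     * hypothetical): cluster names have the form "c(label1,label2,...)".
     *
     *   String clusterName = "c(Food,Arts & Entertainment)";
     *   String str = clusterName.substring(2, clusterName.length() - 1);
     *   //str is now "Food,Arts & Entertainment"
     *   String[] labels = str.split(",");
     *   //labels is ["Food", "Arts & Entertainment"]
     */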
    /**
     * Check whether a venue has the same label as the cluster.
     * This method is used by getVenueRecord.
     */
    public String checkVenueLabel(CSVRecord r, String[] str_array, ArrayList<String> features) {
        boolean found = false;
        boolean matched = false;
        String venueLabel = null;
        for(int h = 7; h < r.size() && !matched; h++) {
            if(r.get(h).equals("1")) {
                matched = true;
                //keep only venues with the same labels as the cluster
                for(int k = 0; k < str_array.length && !found; k++) {
                    if(features.get(h - 7).equals(str_array[k].trim())) {
                        venueLabel = features.get(h - 7);
                        found = true;
                    }
                }
            }
        }
        return venueLabel;
    }

    /*public HashMap<Integer, ArrayList<ArrayList<String>>> putVenuesOfCells(String clusterName,
            HashMap<Integer, ArrayList<ArrayList<String>>> venuesOfCell,
            ArrayList<ArrayList<Double>> cells,
            List<CSVRecord> listSingles) {
        int cellId = 0;
        String cellLat = "";
        String cellLng = "";
        String venueLabel = "";
        ArrayList<ArrayList<String>> venuesInfo;
        ArrayList<String> venueRecord;
        boolean found = false;
        boolean added = false;
        boolean catFound = false;

        //clean the cluster name: keep only the category names
        String str = clusterName.substring(2, clusterName.length() - 1);
        String[] str_array = str.split(",");

        //get the header of singles in order to get the correct venue category name
        ArrayList<String> features = new ArrayList<String>();
        for(int i = 7; i < listSingles.get(0).size(); i++)
            features.add(listSingles.get(0).get(i));

        //get the venues
        for(ArrayList<Double> array : cells) {
            cellId = array.get(0).intValue();
            cellLat = array.get(1) + "";
            cellLng = array.get(2) + "";
            venuesInfo = new ArrayList<ArrayList<String>>();
            found = false;
            added = false;
            for(int i = 0; i < listSingles.size() && !found; i++) {
                CSVRecord r = listSingles.get(i); //venue information
                //skip the header
                if(!r.get(0).contains("Timestamp")) {
                    venueLabel = "";
                    catFound = false;
                    //check whether the venue has the same label as the cluster
                    for(int h = 7; h < r.size() && !catFound; h++) {
                        if(r.get(h).equals("1.0")) {
                            //keep only venues with the same labels as the cluster
                            for(String s : str_array) {
                                if(features.get(h - 7).equals(s.trim())) {
                                    venueLabel = features.get(h - 7);
                                    catFound = true;
                                }
                            }
                        }
                    }
                    //check whether the venue belongs to the cell
                    if(r.get(5).equals(cellLat) && r.get(6).equals(cellLng) && catFound) {
                        venueRecord = new ArrayList<String>();
                        venueRecord.add(r.get(0)); //timestamp
                        venueRecord.add(r.get(1)); //beenHere
                        venueRecord.add(r.get(2)); //venue id
                        venueRecord.add(r.get(3)); //venue lat
                        venueRecord.add(r.get(4)); //venue lng
                        venueRecord.add(r.get(5)); //focal lat
                        venueRecord.add(r.get(6)); //focal lng
                        venueRecord.add(venueLabel); //category of the venue
                        venuesInfo.add(venueRecord); //add the venue information
                        added = true;
                    }
                    else if(added)
                        found = true; //venues of the same cell are consecutive, so stop once the focal coordinates change
                }
            }
            //add the entry only if the venue exists in the cell
            if(added)
                venuesOfCell.put(cellId, venuesInfo);
        }
        return venuesOfCell;
    }*/

    /**
     * Get the Calendar of a timestamp value from a list of CSV records.
     */
    public Calendar getCalendar(List<CSVRecord> list) {
        Calendar cal = GregorianCalendar.getInstance();
        long timestamp = Long.parseLong(list.get(1).get(0)); //get the timestamp
        Date d = new Date(timestamp);
        cal.setTime(d);
        return cal;
    }

    /**
     * Get a proper eps value for the given dataset.
     */
    public double getEps(ArrayList<ArrayList<Double>> dataset) {
        double side = Math.sqrt(dataset.size());
        double eps = Math.sqrt(2) * (1 / side);
        //eps = eps/Math.sqrt(2); //for the higher zoom level
        //eps = eps*Math.sqrt(2); //for the lower zoom level
        return eps;
    }
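    /*
     * Worked example for getEps (the dataset size is hypothetical): with 400 cells
     * the grid side is sqrt(400) = 20, so eps = sqrt(2) * (1/20) ~= 0.0707, i.e. the
     * diagonal of a single grid cell in normalized coordinates.
     *
     *   double eps = new ClusteringTools().getEps(matrixOf400Cells); //~0.0707
     */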
    /**
     * Get all the distinct cluster labels.
     */
    public TreeSet<String> getClusterLabels(HashMap<Integer, String> clusters) {
        ArrayList<Integer> keys = new ArrayList<Integer>(clusters.keySet());
        TreeSet<String> tree = new TreeSet<String>();
        for(Integer i : keys) {
            tree.add(clusters.get(i));
        }
        return tree;
    }

    /**
     * Get all the cells corresponding to each cluster label.
     */
    public ArrayList<TreeSet<Integer>> getCellsOfClusters(HashMap<Integer, String> clusters,
            HashMap<Integer, ArrayList<Integer>> cells, TreeSet<String> tree) {
        Iterator<String> iter = tree.iterator();
        ArrayList<Integer> keys = new ArrayList<Integer>(clusters.keySet());
        ArrayList<TreeSet<Integer>> allCells = new ArrayList<TreeSet<Integer>>();
        TreeSet<Integer> cellIndex;
        String label = "";
        while(iter.hasNext()) {
            label = iter.next();
            cellIndex = new TreeSet<Integer>();
            for(Integer i : keys) {
                if(clusters.get(i).equals(label)) {
                    cellIndex.addAll(cells.get(i));
                }
            }
            allCells.add(cellIndex);
        }
        return allCells;
    }

    /**
     * Create a hashmap for the holdout used to compute the Jaccard evaluation.
     * Each entry of the map is a couple: cluster_name, list_of_cells.
     */
    public HashMap<String, Vector<Integer>> buildHoldoutMap(TreeSet<String> tree,
            ArrayList<TreeSet<Integer>> cells, int length) {
        HashMap<String, Vector<Integer>> holdout = new HashMap<String, Vector<Integer>>();
        Vector<Integer> vector;
        Iterator<String> distinctIter = tree.iterator();
        Iterator<TreeSet<Integer>> cellsIter = cells.iterator();
        while(distinctIter.hasNext() && cellsIter.hasNext()) {
            String label = distinctIter.next();
            Iterator<Integer> treeIter = cellsIter.next().iterator();
            vector = new Vector<Integer>();
            while(treeIter.hasNext()) {
                vector.add(treeIter.next() - length);
            }
            holdout.put(label, vector);
        }
        return holdout;
    }

    /**
     * Set the SUBCLU parameters and run the algorithm.
     */
    public Clustering<?> runSUBCLU(Database db, double eps, int minpts) {
        ListParameterization params = new ListParameterization();
        params.addParameter(SUBCLU.EPSILON_ID, eps);
        params.addParameter(SUBCLU.MINPTS_ID, minpts);

        //setup the algorithm
        SUBCLU<DoubleVector> subclu = ClassGenericsUtil.parameterizeOrAbort(SUBCLU.class, params);

        //run SUBCLU on the database
        Clustering<SubspaceModel<DoubleVector>> result = subclu.run(db);
        return result;
    }

    /**
     * Set the GEOSUBCLU parameters and run the algorithm.
     */
    public Clustering<?> runGEOSUBCLU(Database db, ArrayList<Pair<Double, Double>> boundaries,
            HashMap<Integer, String> map, HashMap<String, Double> deltad, int density,
            double eps, StringBuilder sb) {
        ListParameterization params = new ListParameterization();

        //setup the algorithm
        GEOSUBCLU<DoubleVector> geosubclu = ClassGenericsUtil.parameterizeOrAbort(GEOSUBCLU.class, params);
        geosubclu.setFeatureMapper(map);
        geosubclu.setDeltad(deltad);
        geosubclu.setDensity(density);
        geosubclu.setEpsValue(eps);
        geosubclu.setSbLog(sb);
        geosubclu.setBoundaries(boundaries);

        //run GEOSUBCLU on the database
        Clustering<SubspaceModel<DoubleVector>> result = geosubclu.run(db);
        this.log = geosubclu.getSbLog();
        return result;
    }

    /**
     * Build a Database from the matrix of normalized density values.
     */
    public <T> Database buildDatabaseFromMatrix(ArrayList<ArrayList<Double>> matrix) {
        double[][] data = new double[matrix.size()][];
        for(int i = 0; i < matrix.size(); i++) {
            data[i] = new double[matrix.get(i).size()];
            for(int j = 0; j < matrix.get(i).size(); j++) {
                data[i][j] = matrix.get(i).get(j);
            }
        }
        //note: this filter list is built but never passed to the database connection
        List<Class<?>> filterlist = new ArrayList<>();
        filterlist.add(FixedDBIDsFilter.class);
        Database db = new InMemoryDatabase(new ArrayAdapterDatabaseConnection(data), null);
        db.initialize();
        return db;
    }
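    /*
     * Illustrative end-to-end sketch; the input records and the density value of 1
     * are assumptions, not taken from the original code base. Build the database from
     * the normalized matrix, derive eps from the grid size, and run GEOSUBCLU.
     *
     *   ClusteringTools tools = new ClusteringTools();
     *   ArrayList<ArrayList<Double>> matrix = tools.buildNormalizedFromCSV(densityRecords);
     *   Database db = tools.buildDatabaseFromMatrix(matrix);
     *   double eps = tools.getEps(matrix);
     *   ArrayList<Pair<Double, Double>> boundaries = tools.getFeatureBoundariesFromCSV(densityRecords);
     *   Clustering<?> result = tools.runGEOSUBCLU(db, boundaries, featuresMap,
     *           deltadMap, 1, eps, new StringBuilder());
     */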
    @SuppressWarnings("unchecked")
    public <V extends NumberVector<?>> double getDistance(Database db, Cluster<?> cluster,
            HashMap<Integer, String> featuresMap, List<CSVRecord> listDens) {
        double distance = 0.0;
        Iterator<Relation<?>> iter = db.getRelations().iterator();
        iter.next();
        Relation<V> relation = (Relation<V>) iter.next();
        double sum_distance = 0.0;
        double total_number = 0.0;
        Vector<Integer> dimensions = getDimensions(cluster.getName(), featuresMap);
        ImportTools tools = new ImportTools();
        for(DBIDIter i1 = cluster.getIDs().iter(); i1.valid(); i1.advance()) {
            V o1 = relation.get(i1);
            int cellId1 = Integer.parseInt(DBIDUtil.toString(i1));
            double lat1 = Double.parseDouble(listDens.get(cellId1).get(1)); //latitude
            double lng1 = Double.parseDouble(listDens.get(cellId1).get(2)); //longitude
            for(DBIDIter i2 = cluster.getIDs().iter(); i2.valid(); i2.advance()) {
                V o2 = relation.get(i2);
                int cellId2 = Integer.parseInt(DBIDUtil.toString(i2));
                double lat2 = Double.parseDouble(listDens.get(cellId2).get(1)); //latitude
                double lng2 = Double.parseDouble(listDens.get(cellId2).get(2)); //longitude
                double sum = 0.0;
                for(int i = 0; i < dimensions.size(); i++) {
                    double d1 = o1.doubleValue(dimensions.get(i));
                    double d2 = o2.doubleValue(dimensions.get(i));
                    //sum += d1*d2*( tools.getDistance(lat1, lng1, lat2, lng2) ); //convert to Km
                    sum += tools.getDistance(lat1, lng1, lat2, lng2);
                }
                sum_distance += sum;
            }
            total_number++; //total number of points in the cluster
        }
        distance += sum_distance * 1 / (2 * total_number);
        return distance;
    }

    /**
     * Get the SSE of a cluster.
     */
    @SuppressWarnings("unchecked")
    public <V extends NumberVector<?>> double getClusterSSE(Database db, Cluster<?> cluster,
            HashMap<Integer, String> featuresMap) {
        double eps = 0.0000001;
        double sse = 0.0;
        Iterator<Relation<?>> iter = db.getRelations().iterator();
        iter.next();
        Relation<V> relation = (Relation<V>) iter.next();
        double sum_distance = 0.0;
        double total_number = 0.0;
        Vector<Integer> dimensions = getDimensions(cluster.getName(), featuresMap);
        for(DBIDIter i1 = cluster.getIDs().iter(); i1.valid(); i1.advance()) {
            V o1 = relation.get(i1);
            for(DBIDIter i2 = cluster.getIDs().iter(); i2.valid(); i2.advance()) {
                V o2 = relation.get(i2);
                double sum_squared = 0.0;
                for(int i = 0; i < dimensions.size(); i++) {
                    double d1 = o1.doubleValue(dimensions.get(i));
                    double d2 = o2.doubleValue(dimensions.get(i));
                    double dist_lat_pow2 = Math.pow(o1.doubleValue(0) - o2.doubleValue(0), 2);
                    double dist_lng_pow2 = Math.pow(o1.doubleValue(1) - o2.doubleValue(1), 2);
                    sum_squared += d1 * d2 * dist_lat_pow2 + d1 * d2 * dist_lng_pow2;
                }
                //int dimension = o1.getDimensionality();
                //for(int i = 0; i < dimension; i++) {
                //    double d1 = o1.doubleValue(i);
                //    double d2 = o2.doubleValue(i);
                //    sum_squared += (d1 - d2) * (d1 - d2);
                //}
                //for(Integer i : dimensions) {
                //    double d1 = o1.doubleValue(i);
                //    double d2 = o2.doubleValue(i);
                //    sum_squared += (d1 - d2) * (d1 - d2);
                //}
                sum_distance += sum_squared;
            }
            //for(int i = 2; i < dimensions.size(); i++) {
            //    total_number += o1.doubleValue(dimensions.get(i));
            //}
            total_number++; //total number of points in the cluster
        }
        sse += (sum_distance * 1 / (2 * total_number)) + eps;
        return sse;
    }

    /**
     * Get the SSE value of the clustering.
     */
    public <V extends NumberVector<?>> double getClusteringSSE(Database db,
            ArrayList<Clustering<?>> cs, HashMap<Integer, String> featuresMap) {
        double sse = 0.0;
        //int nClusters = 0;
        for(Clustering<?> c : cs)
            for(Cluster<?> cluster : c.getAllClusters()) {
                //nClusters++;
                sse += getClusterSSE(db, cluster, featuresMap);
            }
        //return sse/nClusters;
        return sse;
    }
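    /*
     * Illustrative usage sketch, an assumption rather than original code: evaluating
     * one or more clustering results with the SSE of the whole clustering.
     *
     *   ArrayList<Clustering<?>> results = new ArrayList<>();
     *   results.add(clustering);
     *   double sse = tools.getClusteringSSE(db, results, featuresMap);
     */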
    /**
     * Invert a map: values become keys and keys become values.
     */
    private static <V, K> Map<V, K> invert(Map<K, V> map) {
        Map<V, K> inv = new HashMap<V, K>();
        for(Entry<K, V> entry : map.entrySet())
            inv.put(entry.getValue(), entry.getKey());
        return inv;
    }

    /**
     * Get the dimension indexes of the features named in the cluster label.
     */
    private Vector<Integer> getDimensions(String clusterName, HashMap<Integer, String> featuresMap) {
        Vector<Integer> dimensions = new Vector<>();
        Map<String, Integer> inv = invert(featuresMap);
        String[] features = clusterName.split(",");
        for(String feature : features) {
            String feat = feature.replace("c(", "").replace(")", "").trim();
            dimensions.add(inv.get(feat));
        }
        return dimensions;
    }

    /**
     * Get the surface of a cluster as the fraction of its cells over the total number of cells.
     */
    public Double getClusterSurface(Cluster<?> cluster, int size) {
        int objects = 0;
        for(DBIDIter i1 = cluster.getIDs().iter(); i1.valid(); i1.advance())
            objects++;
        return objects / (1.0 * size);
    }

    /**
     * Get the heterogeneity of a cluster as the fraction of categories named in its label.
     */
    public Double getClusterHeterogeneity(Cluster<?> cluster, HashMap<Integer, String> featuresMap) {
        int cCategories = cluster.getName().split(",").length;
        return cCategories / (1.0 * featuresMap.size());
    }

    /**
     * Get the density of a cluster as the number of objects over the cluster surface.
     */
    public Double getClusterDensity(int i, double surfaceTotal, Double surfaceClusterPercentage) {
        return i / (1.0 * surfaceClusterPercentage * surfaceTotal);
    }

    /**
     * Get, for each feature, the minimum and maximum values found in the CSV records.
     */
    public ArrayList<Pair<Double, Double>> getFeatureBoundariesFromCSV(List<CSVRecord> list) {
        ArrayList<Pair<Double, Double>> result = new ArrayList<Pair<Double, Double>>();

        //get the size of the first element
        int featureNumber = list.get(0).size();
        boolean header = list.get(0).get(0).contains("Timestamp");

        //discard the first feature: the timestamp is not used yet
        for(int j = 1; j < featureNumber; j++) {
            Vector<Double> v = new Vector<>();
            int i = (header) ? 1 : 0;
            for(; i < list.size(); i++)
                v.add(Double.parseDouble(list.get(i).get(j)));
            Collections.sort(v);
            result.add(new Pair<Double, Double>(v.get(0), v.get(v.size() - 1)));
        }
        return result;
    }
}