package edu.uc.cssl.tacit.cluster.kmeans.services;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;

import weka.clusterers.SimpleKMeans;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.StringToWordVector;
import edu.usc.cssl.tacit.common.TacitUtility;
import edu.usc.cssl.tacit.common.ui.views.ConsoleView;

/**
 * Clusters a list of text files with Weka's {@link SimpleKMeans} and writes
 * the resulting file-to-cluster grouping to the console and to a text file.
 */
public class KmeansClusterAnalysis {

	/**
	 * Reads the complete content of a file into one string.
	 *
	 * @param file
	 *            the file to read
	 * @return the file content, or an empty string when the scanner finds no
	 *         token (e.g. an empty file)
	 * @throws IOException
	 *             if the file cannot be opened
	 */
	private static String readWholeFile(File file) throws IOException {
		Scanner scanner = new Scanner(file);
		try {
			// "\\Z" matches the end of input, so the first token is the
			// whole file.
			scanner.useDelimiter("\\Z");
			// An empty file has no token; next() would throw
			// NoSuchElementException and abort the whole run.
			return scanner.hasNext() ? scanner.next() : "";
		} finally {
			// Always release the file handle, also when reading fails.
			scanner.close();
		}
	}

	/**
	 * Builds one string instance per input file, converts the instances with
	 * {@link StringToWordVector} and clusters them with {@link SimpleKMeans}.
	 *
	 * @param inputFiles
	 *            the files to cluster; each file becomes one instance
	 * @param numOfClusters
	 *            the number of clusters passed to the clusterer
	 * @return the cluster assignment of every instance, in the order of
	 *         {@code inputFiles}, or {@code null} if reading a file or the
	 *         clustering itself failed
	 */
	private static int[] doClustering(List<File> inputFiles, int numOfClusters) {
		try {
			// A single string attribute holds the raw text of each document.
			FastVector atts = new FastVector(1);
			atts.addElement(new Attribute("text", (FastVector) null));
			Instances docs = new Instances("text_files", atts, 0);

			for (File inputFile : inputFiles) {
				try {
					double[] newInst = new double[1];
					newInst[0] = (double) docs.attribute(0).addStringValue(
							readWholeFile(inputFile));
					docs.add(new Instance(1.0, newInst));
				} catch (Exception e) {
					ConsoleView.printlInConsoleln("Exception occurred in reading files" + e);
					return null;
				}
			}

			StringToWordVector filter = new StringToWordVector();
			filter.setInputFormat(docs);
			Instances filteredData = Filter.useFilter(docs, filter);

			SimpleKMeans kmeans = new SimpleKMeans();
			// Must be enabled before building so that getAssignments() can
			// be mapped back to the input files by position.
			kmeans.setPreserveInstancesOrder(true);
			kmeans.setNumClusters(numOfClusters);
			kmeans.buildClusterer(filteredData);

			int[] assignments = kmeans.getAssignments();
			for (int idx = 0; idx < assignments.length; idx++) {
				System.out.printf("Instance %d -> Cluster %d \n", idx,
						assignments[idx]);
			}
			return assignments;
		} catch (Exception e) {
			ConsoleView.printlInConsoleln("Exception occurred in K means " + e);
		}
		return null;
	}

	/**
	 * Runs K-Means clustering on the given files and reports the clusters.
	 * <p>
	 * Files whose absolute path contains {@code "DS_Store"} are skipped. The
	 * clusters are printed to the console and written to
	 * {@code KMeansClusters-<MM-dd-yy-HH-mm-ss>.txt} inside
	 * {@code fOutputDir}; afterwards a run report is created through
	 * {@link TacitUtility#createRunReport}. Failures are reported on the
	 * console instead of being thrown.
	 *
	 * @param fNumClusters
	 *            the number of clusters to form
	 * @param listOfFiles
	 *            the candidate input files
	 * @param fOutputDir
	 *            the directory that receives the output file
	 * @param dateObj
	 *            the timestamp used in the output file name and run report
	 */
	public static void runClustering(int fNumClusters, List<File> listOfFiles,
			String fOutputDir, Date dateObj) {
		List<File> inputFiles = new ArrayList<File>();
		for (File f : listOfFiles) {
			if (f.getAbsolutePath().contains("DS_Store"))
				continue;
			inputFiles.add(f);
		}

		DateFormat df = new SimpleDateFormat("MM-dd-yy-HH-mm-ss");
		ConsoleView.printlInConsoleln("Running KMeans Clustering...");
		int[] clusters = doClustering(inputFiles, fNumClusters);
		if (clusters == null) {
			ConsoleView.printlInConsoleln("Sorry. Something went wrong with KMeans Clustering. Please check your input and try again.\n");
			return;
		}

		ConsoleView.printlInConsoleln("Output for KMeans Clustering");
		ConsoleView.printlInConsoleln("Clusters formed: \n");

		// Pre-create a bucket per cluster so that empty clusters are
		// reported as well.
		Map<Integer, List<String>> outputClusters = new HashMap<Integer, List<String>>();
		for (int c = 0; c < fNumClusters; c++) {
			outputClusters.put(c, new ArrayList<String>());
		}
		// clusters[i] is the cluster of inputFiles.get(i).
		for (int i = 0; i < clusters.length; i++) {
			outputClusters.get(clusters[i]).add(inputFiles.get(i).getName());
		}

		try {
			String op = fOutputDir + File.separator + "KMeansClusters-"
					+ df.format(dateObj) + ".txt";
			ConsoleView.printlInConsoleln("Saving the output for Kmeans clustering in " + op);
			FileWriter fw = new FileWriter(new File(op));
			try {
				// Iterate by cluster index: HashMap.keySet() gives no
				// ordering guarantee.
				for (int c = 0; c < fNumClusters; c++) {
					System.out.printf("Cluster %d \n", c);
					ConsoleView.printlInConsoleln("Cluster " + c + ": \n");
					fw.write("Cluster " + c + ": \n");
					for (String fileName : outputClusters.get(c)) {
						ConsoleView.printlInConsoleln("File " + fileName);
						// Same "File <name>" text as on the console.
						fw.write("File " + fileName + "\n");
					}
					fw.write("\n");
					ConsoleView.printlInConsoleln("");
				}
			} finally {
				// Close the writer even when a write fails midway.
				fw.close();
			}
			TacitUtility.createRunReport(fOutputDir, "K Means Clustering", dateObj);
		} catch (IOException e) {
			ConsoleView.printlInConsoleln("Error writing output to files" + e);
		}
	}
}