/** * @author Aswin Rajkumar <aswin.rajkumar@usc.edu> */ package edu.usc.cssl.tacit.topicmodel.zlda.services; import java.io.File; import java.io.FileWriter; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.AbstractMap; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.Date; import java.util.List; import java.util.Map; import java.util.Map.Entry; import org.eclipse.core.runtime.OperationCanceledException; import org.eclipse.core.runtime.SubProgressMonitor; import edu.usc.cssl.tacit.common.TacitUtility; import edu.usc.cssl.tacit.common.ui.views.ConsoleView; public class ZlabelTopicModelAnalysis { private SubProgressMonitor monitor; public ZlabelTopicModelAnalysis(SubProgressMonitor monitor) { this.monitor = monitor; } private void runLDA(File dir, File preSeedFile, int numTopics, int noOfSamples, double alphaval, double betaval, double confidenceValue, String outputdir, Date dateObj) { DateFormat df = new SimpleDateFormat("MM-dd-yy-HH-mm-ss"); File[] listOfFiles = dir.listFiles(); List<File> inputFiles = new ArrayList<File>(); monitor.subTask("Collecting files from the directory..."); for (File f : listOfFiles) { if (monitor.isCanceled()) { throw new OperationCanceledException(); } if (f.getAbsolutePath().contains("DS_Store")) continue; inputFiles.add(f); } monitor.worked(5); ConsoleView.printlInConsoleln("running zlabel LDA..."); DTWC dtwc = new DTWC(inputFiles, preSeedFile,this.monitor); dtwc.computeDocumentVectors(); if (monitor.isCanceled()) { throw new OperationCanceledException(); } int[][][] zlabels = dtwc.getTopicSeedsAsInt(); int[][] docs = dtwc.getDocVectorsAsInt(); int T = numTopics; int W = dtwc.getVocabSize(); double[][] alpha = new double[1][T]; for (int i = 0; i < T; i++) { alpha[0][i] = alphaval; } double[][] beta = new double[T][W]; for (int i = 0; i < T; i++) { for (int j = 0; j < W; j++) { beta[i][j] = betaval; } } ZlabelLDA zelda = new ZlabelLDA(docs, zlabels, confidenceValue, alpha, beta, noOfSamples); this.monitor.subTask("Calculating Z label ..."); boolean retVal = zelda.zLDA(); if (!retVal) { System.out .println("Sorry, something is wrong with the input - please check format and try again"); return; } this.monitor.worked(15); double[][] theta, phi; theta = zelda.getTheta(); phi = zelda.getPhi(); Map<String, Integer> dictionary = dtwc.getTermIndex(); Map<Integer, String> revDict = dtwc.getIndexTerm(); this.monitor.subTask("Processing Topic Words ..."); List<List<Map.Entry<String, Double>>> topicWords = new ArrayList<List<Map.Entry<String, Double>>>(); for (int i = 0; i < T; i++) { if (monitor.isCanceled()) { throw new OperationCanceledException(); } topicWords.add(new ArrayList<Map.Entry<String, Double>>()); } for (int i = 0; i < T; i++) { if (monitor.isCanceled()) { throw new OperationCanceledException(); } for (int j = 0; j < W; j++) { if (monitor.isCanceled()) { throw new OperationCanceledException(); } if (phi[i][j] > 0.001) { topicWords.get(i).add( new AbstractMap.SimpleEntry<String, Double>(revDict .get(j), new Double(phi[i][j]))); } } } this.monitor.worked(15); this.monitor.subTask("writing corresponding words and phi values in topicwords-"+df.format(dateObj)+".csv"); System.out .println("\nTopic and its corresponding words and phi values stored in " + outputdir + File.separator + "topicwords-"+df.format(dateObj)+".csv"); try { FileWriter fw = new FileWriter(new File(outputdir + File.separator + "topicwords-"+df.format(dateObj)+".csv")); for (int i = 0; i < T; i++) { if (monitor.isCanceled()) { throw new OperationCanceledException(); } fw.write("Topic" + i + ","); Collections.sort(topicWords.get(i), new Comparator<Map.Entry<String, Double>>() { @Override public int compare(Entry<String, Double> arg0, Entry<String, Double> arg1) { return -(arg0.getValue()).compareTo(arg1 .getValue()); } }); for (int j = 0; (j < topicWords.get(i).size() && j < 50); j++) { if (monitor.isCanceled()) { throw new OperationCanceledException(); } fw.write(topicWords.get(i).get(j).getKey() + "," + topicWords.get(i).get(j).getValue() + ","); } fw.write("\n"); fw.flush(); } fw.flush(); fw.close(); this.monitor.worked(15); this.monitor.subTask("writing Phi values for each stopic in phi-"+df.format(dateObj)+".csv"); ConsoleView.printlInConsoleln("\nPhi values for each stopic stored in " + outputdir + File.separator + "phi-"+df.format(dateObj)+".csv"); fw = new FileWriter( new File(outputdir + File.separator + "phi-"+df.format(dateObj)+".csv")); for (int i = 0; i < T; i++) { if (monitor.isCanceled()) { throw new OperationCanceledException(); } fw.write("Topic" + i + ","); for (int j = 0; j < phi[i].length; j++) { if (monitor.isCanceled()) { throw new OperationCanceledException(); } if (phi[i][j] > 0.001) { fw.write(phi[i][j] + ","); } } fw.write("\n"); fw.flush(); } fw.flush(); fw.close(); this.monitor.worked(15); this.monitor.subTask("writing Theta values for each stopic in theta-"+df.format(dateObj)+".csv"); ConsoleView.printlInConsoleln("\nTheta values for each document stored in " + outputdir + File.separator + "theta-"+df.format(dateObj)+".csv"); fw = new FileWriter(new File(outputdir + File.separator + "theta-"+df.format(dateObj)+".csv")); for (int i = 0; i < docs.length; i++) { if (monitor.isCanceled()) { throw new OperationCanceledException(); } fw.write("Document" + i + ","); for (int j = 0; j < theta[i].length; j++) { fw.write(theta[i][j] + ","); } fw.write("\n"); fw.flush(); } fw.flush(); fw.close(); TacitUtility.createRunReport(outputdir, "Z-Label LDA",dateObj); } catch (Exception e) { ConsoleView.printlInConsoleln("Error writing output to files " + e); } ConsoleView.printlInConsoleln("\nDone zlabel LDA..."); this.monitor.worked(15); this.monitor.done(); } public void invokeLDA(String inputDir, String seedFileName, int numTopics, String outputDir, Date dateObj) { File dir = new File(inputDir); File seedFile = new File(seedFileName); double alphaval = 0.5; double betaval = 0.1; int noOfSamples = 2000; double confidenceValue = 1; runLDA(dir, seedFile, numTopics, noOfSamples, alphaval, betaval, confidenceValue, outputDir, dateObj); } }