package edu.usc.cssl.tacit.topicmodel.lda.services;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import org.eclipse.core.runtime.IProgressMonitor;
import edu.usc.cssl.tacit.common.TacitUtility;
import edu.usc.cssl.tacit.common.ui.views.ConsoleView;
/**
 * Runs an LDA topic-model analysis over a directory of text files and converts
 * the tab-separated result files into timestamped CSV files.
 *
 * <p>The heavy lifting is delegated to {@code Text2Vectors} and
 * {@code Vectors2Topics} (resolved from this package); their command-line style
 * arguments appear to mirror MALLET's import / train-topics tools.
 *
 * <p>Typical use: call {@link #initialize} once, then {@link #doLDA}.
 */
public class LdaAnalysis {
    /** Timestamp pattern embedded in every generated CSV file name. */
    private static final String TIMESTAMP_PATTERN = "MM-dd-yy-HH-mm-ss";

    // Suffixes appended to "<outputDir>/<label>". They are shared by the
    // argument lists, the converters and the clean-up step so the names can
    // never drift apart again.
    private static final String VECTORS_SUFFIX = ".mallet";
    private static final String STATE_SUFFIX = ".topic-state.gz";
    private static final String KEYS_STEM = ".topic-keys";
    private static final String COMPOSITION_STEM = ".topic-composition";
    private static final String WEIGHTS_STEM = ".word-weights";
    private static final String COUNTS_STEM = ".word-counts";
    private static final String TXT = ".txt";

    private String sourceDir;
    private int numTopics;
    private String outputDir;
    private String label;
    private boolean wordWeights;

    /**
     * Stores the settings used by the next {@link #doLDA} call.
     *
     * @param sourceDir   directory passed as {@code --input} to the vector conversion
     * @param numTopics   number of topics to train; also the number of probability
     *                    columns in the composition CSV
     * @param outputDir   directory in which all output files are created
     * @param label       file-name prefix for every output file
     * @param wordWeights whether the word-weights file is converted to CSV
     */
    public void initialize(String sourceDir, int numTopics, String outputDir,
            String label, boolean wordWeights) {
        this.sourceDir = sourceDir;
        this.numTopics = numTopics;
        this.outputDir = outputDir;
        this.label = label;
        this.wordWeights = wordWeights;
    }

    /**
     * Runs the full pipeline: text-to-vector conversion, topic training, CSV
     * conversion of the results, removal of the intermediate files and creation
     * of the run report.
     *
     * @param monitor progress monitor that receives sub-task names and work units
     * @param dateObj timestamp embedded in the CSV file names and handed to the
     *                run report
     * @throws FileNotFoundException declared for the delegated tool invocations
     * @throws IOException           declared for the delegated tool invocations
     */
    public void doLDA(IProgressMonitor monitor, Date dateObj)
            throws FileNotFoundException, IOException {
        String outputPath = outputDir + File.separator + label;

        // Stop-word removal and case folding are not configurable yet, so the
        // flags are fixed here.
        String keepSeq = "TRUE";
        String stopWords = "FALSE";
        String preserveCase = "TRUE";

        String[] t2vArgs = { "--input", sourceDir, "--output",
                outputPath + VECTORS_SUFFIX, "--keep-sequence", keepSeq,
                "--remove-stopwords", stopWords, "--preserve-case",
                preserveCase };
        String[] v2tArgs = { "--input", outputPath + VECTORS_SUFFIX,
                "--num-topics", String.valueOf(numTopics),
                "--optimize-interval", "20",
                "--output-state", outputPath + STATE_SUFFIX,
                "--output-topic-keys", outputPath + KEYS_STEM + TXT,
                "--output-doc-topics", outputPath + COMPOSITION_STEM + TXT,
                "--topic-word-weights-file", outputPath + WEIGHTS_STEM + TXT,
                "--word-topic-counts-file", outputPath + COUNTS_STEM + TXT };

        monitor.subTask("Performing text to vector conversion");
        Text2Vectors.main(t2vArgs);
        monitor.worked(15);

        monitor.subTask("Performing vector to topics conversion");
        Vectors2Topics.main(v2tArgs);
        monitor.worked(5);

        monitor.subTask("Created complete state file " + outputPath
                + STATE_SUFFIX);

        monitor.subTask("Convert " + outputPath + KEYS_STEM + " to csv");
        convertKeys2csv(outputPath + KEYS_STEM, dateObj);
        monitor.worked(5);

        monitor.subTask("Convert " + outputPath + COMPOSITION_STEM + " to csv");
        convertComposition2csv(outputPath + COMPOSITION_STEM, dateObj);
        monitor.worked(5);

        if (wordWeights) {
            // The label names the file that is actually converted (word
            // weights, not word counts) and is only shown when it happens.
            monitor.subTask("Convert " + outputPath + WEIGHTS_STEM + " to csv");
            convertWeights2csv(outputPath + WEIGHTS_STEM, dateObj);
        }
        monitor.worked(5);

        deleteFiles(outputPath);
        TacitUtility.createRunReport(outputDir, "LDA Analysis", dateObj);
        monitor.worked(5);
    }

    /**
     * Removes the intermediate files produced by the two tool invocations,
     * leaving only the generated CSV files.
     *
     * @param outputPath common path prefix ({@code outputDir/label}) of the files
     */
    private void deleteFiles(String outputPath) {
        // These must match the names handed to the tools in doLDA; the
        // hyphenated spellings are the ones that actually exist on disk.
        String[] intermediates = { STATE_SUFFIX, COUNTS_STEM + TXT,
                KEYS_STEM + TXT, COMPOSITION_STEM + TXT, WEIGHTS_STEM + TXT,
                VECTORS_SUFFIX };
        for (String suffix : intermediates) {
            File toDel = new File(outputPath + suffix);
            // A missing file is fine; only an existing file that cannot be
            // removed is worth reporting.
            if (toDel.exists() && !toDel.delete()) {
                ConsoleView.printlInConsoleln("Could not delete intermediate file "
                        + toDel.getPath());
            }
        }
    }

    /**
     * Builds the timestamped CSV path for a result file.
     *
     * @param fileName result file path without the {@code .txt} extension
     * @param dateObj  timestamp to embed in the name
     * @return {@code fileName-<timestamp>.csv}
     */
    private static String csvPathFor(String fileName, Date dateObj) {
        // SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
        DateFormat df = new SimpleDateFormat(TIMESTAMP_PATTERN);
        return fileName + "-" + df.format(dateObj) + ".csv";
    }

    /**
     * Converts {@code fileName.txt} (tab-separated topic, word, weight) into a
     * CSV file with the header {@code Topic,Word,Weight}. I/O failures are
     * reported via the stack trace and do not abort the analysis.
     *
     * @param fileName input path without the {@code .txt} extension
     * @param dateObj  timestamp embedded in the CSV file name
     */
    private void convertWeights2csv(String fileName, Date dateObj) {
        String csvPath = csvPathFor(fileName, dateObj);
        try {
            BufferedReader reader = new BufferedReader(new FileReader(new File(
                    fileName + TXT)));
            try {
                BufferedWriter writer = new BufferedWriter(new FileWriter(
                        new File(csvPath)));
                try {
                    writer.write("Topic,Word,Weight");
                    writer.newLine();
                    String line;
                    while ((line = reader.readLine()) != null) {
                        String[] fields = line.replace('\t', ',').split(",");
                        if (fields.length < 3) {
                            continue; // blank or truncated line: nothing to emit
                        }
                        writer.write(fields[0] + "," + fields[1] + ","
                                + fields[2]);
                        writer.newLine();
                    }
                } finally {
                    writer.close();
                }
            } finally {
                reader.close();
            }
            ConsoleView.printlInConsoleln(csvPath);
        } catch (IOException e) {
            // Also covers FileNotFoundException; conversion stays best-effort.
            e.printStackTrace();
        }
    }

    /**
     * Converts {@code fileName.txt} into a CSV file with the header
     * {@code Topic,Keywords}, keeping the first and third field of every line.
     * I/O failures are reported via the stack trace and do not abort the
     * analysis.
     *
     * @param fileName input path without the {@code .txt} extension
     * @param dateObj  timestamp embedded in the CSV file name
     */
    private void convertKeys2csv(String fileName, Date dateObj) {
        String csvPath = csvPathFor(fileName, dateObj);
        try {
            BufferedReader reader = new BufferedReader(new FileReader(new File(
                    fileName + TXT)));
            try {
                BufferedWriter writer = new BufferedWriter(new FileWriter(
                        new File(csvPath)));
                try {
                    writer.write("Topic,Keywords");
                    writer.newLine();
                    String line;
                    while ((line = reader.readLine()) != null) {
                        String[] fields = line.replace('\t', ',').split(",");
                        if (fields.length < 3) {
                            continue; // blank or truncated line: nothing to emit
                        }
                        // The second field is dropped on purpose; only the
                        // topic id and its keyword list are exported.
                        writer.write(fields[0] + "," + fields[2]);
                        writer.newLine();
                    }
                } finally {
                    writer.close();
                }
            } finally {
                reader.close();
            }
            ConsoleView.printlInConsoleln("Created topic keys file " + csvPath);
        } catch (IOException e) {
            // Also covers FileNotFoundException; conversion stays best-effort.
            e.printStackTrace();
        }
    }

    /**
     * Converts {@code fileName.txt} into a CSV file with one row per document
     * and one probability column per topic (ordered by topic number).
     *
     * <p>Each input line is read as: number, file name, then (topic,
     * probability) pairs in any order. A topic that does not occur on a line is
     * written as the text {@code null}. I/O failures are reported via the stack
     * trace and do not abort the analysis.
     *
     * @param fileName input path without the {@code .txt} extension
     * @param dateObj  timestamp embedded in the CSV file name
     */
    private void convertComposition2csv(String fileName, Date dateObj) {
        String csvPath = csvPathFor(fileName, dateObj);
        try {
            BufferedReader reader = new BufferedReader(new FileReader(new File(
                    fileName + TXT)));
            try {
                BufferedWriter writer = new BufferedWriter(new FileWriter(
                        new File(csvPath)));
                try {
                    // The first input line is discarded and replaced by the
                    // header below (presumably it is the tool's own header
                    // line; confirm against the tool version in use).
                    reader.readLine();

                    StringBuilder header = new StringBuilder("Number,File Name");
                    for (int i = 0; i < numTopics; i++) {
                        header.append(",Topic ").append(i).append(" Probability");
                    }
                    writer.write(header.toString());
                    writer.newLine();

                    String line;
                    while ((line = reader.readLine()) != null) {
                        String[] fields = line.replace('\t', ',').split(",");
                        if (fields.length < 2) {
                            continue; // blank or truncated line: nothing to emit
                        }
                        HashMap<String, String> probabilities = new HashMap<String, String>();
                        // "i + 1 < length" ignores a dangling topic id that
                        // has no probability after it instead of throwing.
                        for (int i = 2; i + 1 < fields.length; i += 2) {
                            probabilities.put(fields[i], fields[i + 1]);
                        }
                        StringBuilder row = new StringBuilder(fields[0])
                                .append(',').append(fields[1]);
                        for (int i = 0; i < numTopics; i++) {
                            // append(String) renders a missing topic as "null".
                            row.append(',').append(
                                    probabilities.get(Integer.toString(i)));
                        }
                        writer.write(row.toString());
                        writer.newLine();
                    }
                } finally {
                    writer.close();
                }
            } finally {
                reader.close();
            }
            // Announce the file once, after it is complete, rather than once
            // per document row.
            ConsoleView.printlInConsoleln(csvPath);
        } catch (IOException e) {
            // Also covers FileNotFoundException; conversion stays best-effort.
            e.printStackTrace();
        }
    }
}