package edu.usc.cssl.tacit.wordcount.cooccurrence.services;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.SortedSet;
import java.util.TreeSet;
import org.eclipse.core.runtime.IProgressMonitor;
import edu.usc.cssl.tacit.common.TacitUtility;
import edu.usc.cssl.tacit.common.ui.views.ConsoleView;
/**
 * Co-occurrence analysis over a set of plain-text input files.
 *
 * <p>
 * Two independent outputs can be produced by one run:
 * <ul>
 * <li><b>phrases.txt</b> - sliding-window matches of seed words. Words are
 * pushed through a window of at most {@code windowSize} tokens; whenever the
 * window holds at least {@code threshold} distinct seed words (or every seed
 * word), the window content is recorded as a phrase, seed words being
 * prefixed with {@code *}.</li>
 * <li><b>word-to-word-matrix.csv</b> - a symmetric count matrix in which
 * every word of a file is paired with every word that appeared before it
 * (and with itself) in the same file.</li>
 * </ul>
 *
 * <p>
 * Tokens are obtained by splitting each line on a single space character.
 */
public class CooccurrenceAnalysis {
    /**
     * Regular expression of characters stripped from every token. Empty
     * means "strip nothing", which is the current behaviour.
     */
    private static final String delimiters = "";// " .,;\"!-()\\[\\]{}:?'/\\`~$%#@&*_=+<>";

    /** True while the current run has seed words and a positive window. */
    private boolean doPhrases;

    /**
     * Seed word -> flag. The flag is 1 while the seed word is NOT counted in
     * the current window and 0 once it has been counted, so that repeated
     * occurrences inside one window are counted only once.
     */
    private Map<String, Integer> seedWords;

    private String outputPath;

    /** word -> (other word -> co-occurrence count). */
    private Map<String, Map<String, Integer>> wordMat;

    public CooccurrenceAnalysis() {
        seedWords = new HashMap<String, Integer>();
        wordMat = new HashMap<String, Map<String, Integer>>();
        doPhrases = false;
    }

    /**
     * Populates the seedWords map with the space-separated seed words found
     * in the given file. Empty tokens (blank lines, repeated spaces) are
     * ignored so they cannot inflate the number of seed words.
     *
     * @param seedFile
     *            - absolute filepath of seedFile.
     * @return true if at least one seed word is known afterwards
     * @throws IOException
     *             if the file cannot be opened or read
     */
    private boolean setSeedWords(String seedFile) throws IOException {
        BufferedReader br = new BufferedReader(new FileReader(
                new File(seedFile)));
        try {
            String currentLine;
            while ((currentLine = br.readLine()) != null) {
                for (String seed : currentLine.split(" ")) {
                    if (seed.isEmpty()) {
                        continue;
                    }
                    if (!seedWords.containsKey(seed)) {
                        seedWords.put(seed, 1);
                    }
                }
            }
        } finally {
            // close the reader even when reading fails
            br.close();
        }
        return !seedWords.isEmpty();
    }

    /** Remembers the output directory and creates it when it is missing. */
    private void setOutputPath(String outputPath) {
        this.outputPath = outputPath;
        createIfMissing(outputPath);
    }

    /**
     * Runs the analysis over the selected files and writes the requested
     * outputs into {@code outputPath}.
     *
     * @param selectedFiles
     *            paths of the input files; directories, missing files and
     *            paths containing "DS_Store" are skipped
     * @param seedFile
     *            path of the seed word file; when null or empty no phrases
     *            are extracted
     * @param windowSize
     *            maximum number of tokens kept in the sliding window; phrases
     *            are only extracted when this is positive
     * @param outputPath
     *            directory receiving the output files (created if missing)
     * @param threshold
     *            number of distinct seed words that must be inside the window
     *            for it to be recorded as a phrase
     * @param buildMatrix
     *            whether to build and write the word-to-word matrix
     * @param monitor
     *            progress monitor that receives sub-task names and work ticks
     * @return true on success, false if any exception occurred (the exception
     *         is reported on the console)
     */
    public boolean calculateCooccurrences(List<String> selectedFiles,
            String seedFile, int windowSize, String outputPath, int threshold,
            boolean buildMatrix, IProgressMonitor monitor) {
        Queue<String> q = new LinkedList<String>();
        List<String> phrase = new ArrayList<String>();
        setOutputPath(outputPath);
        try {
            // Start from a clean slate so that a reused instance does not
            // carry seed words or matrix counts over from an earlier run.
            seedWords.clear();
            wordMat.clear();

            // build the seed word dictionary
            boolean ret = false;
            if (windowSize > 0 && seedFile != null && !seedFile.isEmpty()) { // TODO : prevent from GUI
                ret = setSeedWords(seedFile);
            }
            doPhrases = ret;

            List<File> listOfFiles = toFiles(selectedFiles);
            int seedWordCount = seedWords.size();
            for (File f : listOfFiles) {
                monitor.subTask("Processing inout file " + f.getName());
                appendLog("Processing inout file " + f.getName());
                if (f.getAbsolutePath().contains("DS_Store"))
                    continue;
                if (!f.exists() || f.isDirectory())
                    continue;

                // The sliding window never spans two files: reset the
                // counter, the window and the per-seed flags together so
                // they stay consistent with each other.
                int count = 0;
                q.clear();
                resetSeedFlags();

                // every word seen so far in this file (matrix only)
                List<String> words = new ArrayList<String>();
                // NOTE: counts non-empty lines only, so it differs from the
                // physical line number when the file contains blank lines.
                int lineNo = 0;
                BufferedReader br = new BufferedReader(new FileReader(f));
                try {
                    String currentLine;
                    while ((currentLine = br.readLine()) != null) {
                        if (currentLine.isEmpty())
                            continue;
                        lineNo++;
                        for (String token : currentLine.split(" ")) {
                            if (token.isEmpty())
                                continue;
                            String word = token;
                            if (!delimiters.isEmpty()) {
                                // Strings are immutable: keep the result.
                                word = token.replaceAll(delimiters, "");
                                if (word.isEmpty())
                                    continue;
                            }
                            if (doPhrases) {
                                if (count >= threshold
                                        || count >= seedWordCount) {
                                    // window matched: record it, start over
                                    phrase.add(buildPhrase(f.getName(),
                                            lineNo, q));
                                    q.clear();
                                    count = 0;
                                    resetSeedFlags();
                                } else if (q.size() >= windowSize) {
                                    // window full: slide by one token
                                    String first = q.remove();
                                    Integer flag = seedWords.get(first);
                                    if (flag != null && flag == 0) {
                                        count--;
                                        seedWords.put(first, 1);
                                    }
                                }
                                q.add(word);
                                Integer flag = seedWords.get(word);
                                if (flag != null && flag != 0) {
                                    count++;
                                    seedWords.put(word, 0);
                                }
                            }
                            if (buildMatrix) {
                                words.add(word);
                                recordCooccurrences(word, words);
                            }
                        }
                    }
                    // A match is normally detected when the NEXT word
                    // arrives; at end of file there is none, so flush a
                    // pending match here instead of silently dropping it.
                    if (doPhrases && !q.isEmpty()
                            && (count >= threshold || count >= seedWordCount)) {
                        phrase.add(buildPhrase(f.getName(), lineNo, q));
                    }
                } finally {
                    br.close();
                }
                monitor.worked(1);
            }
            if (buildMatrix) {
                monitor.subTask("Writing Word Matrix");
                writeWordMatrix();
            }
            monitor.worked(10);
            if (ret && phrase.size() > 0) {
                monitor.subTask("Writing Phrases");
                writePhrases(phrase);
            }
            monitor.worked(10);
            ConsoleView.printlInConsoleln(String.valueOf(phrase.size()));
            Date dateObj = new Date();
            TacitUtility.createRunReport(outputPath, "Cooccurrence Analysis", dateObj);
            return true;
        } catch (Exception e) {
            ConsoleView.printlInConsoleln("Exception occurred in Cooccurrence Analysis "
                    + e);
        }
        return false;
    }

    /**
     * Converts the selected entries into File objects. Entries are normally
     * path strings; File instances handed over through a raw-typed list are
     * accepted as they are. Null entries are skipped.
     */
    private static List<File> toFiles(List<String> selectedFiles) {
        List<?> entries = selectedFiles;
        List<File> files = new ArrayList<File>(entries.size());
        for (Object entry : entries) {
            if (entry == null) {
                continue;
            }
            if (entry instanceof File) {
                files.add((File) entry);
            } else {
                files.add(new File(entry.toString()));
            }
        }
        return files;
    }

    /** Marks every seed word as "not counted in the current window". */
    private void resetSeedFlags() {
        for (Map.Entry<String, Integer> entry : seedWords.entrySet()) {
            entry.setValue(1);
        }
    }

    /**
     * Renders the window as one output line: file name, line counter and the
     * window's words, each followed by a space and seed words prefixed '*'.
     */
    private String buildPhrase(String fileName, int lineNo,
            Queue<String> window) {
        StringBuilder match = new StringBuilder();
        for (String str : window) {
            if (seedWords.containsKey(str))
                match.append('*');
            match.append(str).append(' ');
        }
        return fileName + " " + lineNo + " " + match.toString();
    }

    /**
     * Counts {@code word} against every entry of {@code words} (which already
     * contains {@code word} itself) in both directions of the matrix.
     */
    private void recordCooccurrences(String word, List<String> words) {
        Map<String, Integer> vec = wordMat.get(word);
        if (vec == null) {
            vec = new HashMap<String, Integer>();
            wordMat.put(word, vec);
        }
        for (String second : words) {
            increment(vec, second);
            // every entry of words was registered in wordMat when it was
            // processed, so this lookup cannot miss
            increment(wordMat.get(second), word);
        }
    }

    /** Adds one to the counter stored under key, starting at 1. */
    private static void increment(Map<String, Integer> counts, String key) {
        Integer current = counts.get(key);
        counts.put(key, current == null ? 1 : current + 1);
    }

    /**
     * write the phrases into file phrases.txt
     *
     * @param phrases
     *            - phrases to be written
     */
    private void writePhrases(List<String> phrases) {
        String target = outputPath + File.separator + "phrases.txt";
        try {
            FileWriter fw = new FileWriter(new File(target));
            try {
                for (String p : phrases) {
                    fw.write(p + "\n");
                }
            } finally {
                fw.close();
            }
            ConsoleView.printlInConsoleln("Writing phrases at " + target);
        } catch (IOException e) {
            ConsoleView.printlInConsoleln("Error writing output to file phrases.txt " + e);
        }
    }

    /**
     * Creates a directory in the file system if it does not already exists
     *
     * @param folder
     *            : full path of the directory which has to be created.
     */
    private void createIfMissing(String folder) {
        File path = new File(folder);
        if (!path.exists()) {
            path.mkdirs();
        }
    }

    /**
     * write the word matrix into the file word-to-word-matrix.csv. Rows and
     * columns are the known words in sorted order; missing pairs are written
     * as 0.
     */
    private void writeWordMatrix() {
        SortedSet<String> keys = new TreeSet<String>(wordMat.keySet());
        try {
            FileWriter fw = new FileWriter(new File(outputPath + File.separator
                    + "word-to-word-matrix.csv"));
            try {
                // header row
                fw.write(" ,");
                for (String key : keys) {
                    fw.write(key + ",");
                }
                fw.write("\n");
                // one row per word
                for (String key : keys) {
                    fw.write(key + ",");
                    Map<String, Integer> vec = wordMat.get(key);
                    for (String value : keys) {
                        Integer cell = vec.get(value);
                        if (cell != null) {
                            fw.write(cell + ",");
                        } else {
                            fw.write("0,");
                        }
                    }
                    fw.write("\n");
                }
            } finally {
                fw.close();
            }
            appendLog("Writng Word Matrix into word-to-word-matrix.csv");
        } catch (IOException e) {
            ConsoleView.printlInConsoleln("Error writing output to files" + e);
        }
    }

    /**
     * Entry point used with raw text-field values: parses the numeric
     * options, runs the analysis and reports the produced outputs on the
     * console.
     *
     * @param selectedFiles
     *            paths of the input files
     * @param seedFileLocation
     *            path of the seed word file, may be empty
     * @param fOutputDir
     *            output directory
     * @param numTopics
     *            window size as text; empty means 0 (no phrase extraction)
     * @param ftxtThreshold
     *            threshold as text; empty means 0
     * @param fOption
     *            whether to build the word-to-word matrix
     * @param monitor
     *            progress monitor
     * @return true if the analysis completed, false otherwise
     * @throws NumberFormatException
     *             if a non-empty numeric option is not a valid integer
     */
    public boolean invokeCooccurrence(List<String> selectedFiles,
            String seedFileLocation, String fOutputDir, String numTopics,
            String ftxtThreshold, boolean fOption, IProgressMonitor monitor) {
        int windowSize = 0;
        if (!numTopics.equals(""))
            windowSize = Integer.parseInt(numTopics);
        int threshold = 0;
        if (!ftxtThreshold.equals(""))
            threshold = Integer.parseInt(ftxtThreshold);
        boolean buildMatrix = fOption;
        // ConsoleView.writeInConsole("Running Co-occurrence Analysis...");
        appendLog("Running Co-occurrence Analysis...");
        boolean isSuccess = calculateCooccurrences(selectedFiles,
                seedFileLocation, windowSize, fOutputDir, threshold,
                buildMatrix, monitor);
        if (!isSuccess) {
            appendLog("Sorry. Something went wrong with Co-occurrence Analysis. Please check your input and try again.\n");
            return false;
        }
        appendLog("Output for Co-occurrence Analysis");
        // only announce outputs that were actually requested
        if (buildMatrix)
            appendLog("Word to word matrix stored in " + fOutputDir
                    + File.separator + "word-to-word-matrix.csv");
        if (seedFileLocation != null && !seedFileLocation.isEmpty()
                && windowSize > 0)
            appendLog("Phrases stored in " + fOutputDir + File.separator
                    + "phrases.txt");
        return true;
    }

    private void appendLog(String string) {
        ConsoleView.printlInConsoleln(string);
    }
}