package edu.usc.cssl.tacit.classify.naivebayes.services;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.eclipse.core.runtime.IProgressMonitor;
import bsh.EvalError;
import edu.usc.cssl.tacit.common.TacitUtility;
import edu.usc.cssl.tacit.common.ui.views.ConsoleView;
/**
 * Service that drives a Naive Bayes train/test/classify run.
 *
 * <p>Training and test directories are converted to vector files with the
 * package-local {@code Text2Vectors} entry point, and the resulting files are
 * handed to {@code Vectors2Classify}. Intermediate files are written under
 * {@code <user.dir>/tacit_temp_files}, using the construction timestamp as the
 * file-name prefix. The class also offers helpers to stage training files in
 * per-class temporary directories and to clean those directories up again.
 */
public class NaiveBayesClassifier {
    /** Timestamp pattern appended to the prediction CSV file name. */
    private static final String TIMESTAMP_PATTERN = "MM-dd-yy-HH-mm-ss";

    /** Root directory in which per-class training directories are staged. */
    private final String tmpLocation;
    /** Fallback output directory used when the caller supplies an empty one. */
    private final String outputDir;
    /** Directory holding the intermediate .train/.test/.out files. */
    private final String tempoutputDir;
    /** Construction time in milliseconds; used as the intermediate file prefix. */
    private final long currTime;

    /**
     * Computes the working locations relative to the {@code user.dir} system
     * property and creates the intermediate-file and default output
     * directories if they do not exist yet.
     */
    public NaiveBayesClassifier() {
        String workingDir = System.getProperty("user.dir");
        this.tmpLocation = workingDir + File.separator + "NB_Classifier";
        this.outputDir = this.tmpLocation + File.separator + "Output";
        this.tempoutputDir = workingDir + File.separator + "tacit_temp_files";
        this.currTime = System.currentTimeMillis();
        ensureDirectory(this.tempoutputDir);
        ensureDirectory(this.outputDir);
    }

    /**
     * @return the root directory under which training data is staged
     */
    public String getTmpLocation() {
        return this.tmpLocation;
    }

    /**
     * Trains on the given class directories, evaluates on the test
     * directories and returns the first entry of the classifier result.
     *
     * @param trainingClasses directories holding the training data, one per class
     * @param testingClasses directories holding the test data; empty strings are skipped
     * @param outputDirectory directory for the run report; when empty, the
     *        default output directory of this instance is used
     * @param removeStopwords whether stop words are removed during vectorization
     * @param doLowercase whether case is folded ({@code --preserve-case FALSE})
     * @param dateObj date stamped on the run report
     * @return the first entry of the classifier result, or {@code null} when
     *         no usable training or test directory was supplied
     * @throws FileNotFoundException propagated from the vectorize/classify steps
     * @throws IOException propagated from the vectorize/classify steps
     * @throws EvalError propagated from the vectorize/classify steps
     */
    public String predict(ArrayList<String> trainingClasses,
            ArrayList<String> testingClasses, String outputDirectory,
            boolean removeStopwords, boolean doLowercase, Date dateObj)
            throws FileNotFoundException, IOException, EvalError {
        // Paths are passed on as a list; joining them with "," and splitting
        // again would corrupt any directory whose name contains a comma.
        List<String> trainDirs = nonEmptyPaths(trainingClasses);
        List<String> testDirs = nonEmptyPaths(testingClasses);
        if (trainDirs.isEmpty() || testDirs.isEmpty()) {
            return null;
        }

        String reportDir = outputDirectory.isEmpty() ? this.outputDir : outputDirectory;
        String tempoutputPath = tempFilePrefix();
        String trainFile = tempoutputPath + ".train";
        String testFile = tempoutputPath + ".test";
        String classifierFile = tempoutputPath + ".out";

        Text2Vectors.main(buildImportArgs(trainDirs, trainFile,
                removeStopwords, doLowercase, null));
        System.out.println("Created training file " + trainFile);

        // The test vectors reuse the pipe of the training file.
        Text2Vectors.main(buildImportArgs(testDirs, testFile,
                removeStopwords, doLowercase, trainFile));
        System.out.println("Created test file " + testFile);

        String[] v2cArgs = { "--training-file", trainFile,
                "--testing-file", testFile,
                "--output-classifier", classifierFile };
        ArrayList<String> result = Vectors2Classify.main(v2cArgs);
        ConsoleView.printlInConsoleln("\nCreated classifier output file "
                + classifierFile);

        String summary = result.get(0);
        System.out.println(summary);
        // Report goes to the resolved directory so the empty-argument
        // fallback is honoured.
        TacitUtility.createRunReport(reportDir, "Naive Bayes's", dateObj);
        return summary;
    }

    /**
     * Trains on the given class directories, classifies the files of the
     * input directory and writes every result line to
     * {@code <classNames>-<timestamp>-output.csv} in the output directory,
     * where {@code classNames} are the training directory names joined by
     * {@code _}.
     *
     * <p>Returns without doing any work when there is no usable training
     * directory or when either directory argument is empty.
     *
     * @param trainingClasses directories holding the training data, one per class
     * @param classificationInputDir directory with the files to classify
     * @param classificationOutputDir directory receiving the CSV and run report
     * @param removeStopwords whether stop words are removed during vectorization
     * @param doLowercase whether case is folded ({@code --preserve-case FALSE})
     * @param dateObj date used for the CSV file name and the run report
     * @throws FileNotFoundException propagated from the vectorize/classify steps
     * @throws IOException if the CSV cannot be written, or propagated from the
     *         vectorize/classify steps
     * @throws EvalError propagated from the vectorize/classify steps
     */
    public void classify(ArrayList<String> trainingClasses,
            String classificationInputDir, String classificationOutputDir,
            boolean removeStopwords, boolean doLowercase, Date dateObj)
            throws FileNotFoundException, IOException, EvalError {
        ConsoleView.printlInConsoleln("Classification starts ..");
        if (trainingClasses.isEmpty() || classificationInputDir.isEmpty()
                || classificationOutputDir.isEmpty()) {
            return;
        }
        List<String> trainDirs = nonEmptyPaths(trainingClasses);
        if (trainDirs.isEmpty()) {
            return;
        }

        // File.getName() matches how createTempDirectories derives class
        // names and copes with either separator style.
        StringBuilder classNames = new StringBuilder();
        for (int i = 0; i < trainDirs.size(); i++) {
            if (i > 0) {
                classNames.append('_');
            }
            classNames.append(new File(trainDirs.get(i)).getName());
        }
        DateFormat df = new SimpleDateFormat(TIMESTAMP_PATTERN);
        String csvPath = classificationOutputDir + File.separator
                + classNames + "-" + df.format(dateObj) + "-output.csv";

        String tempoutputPath = tempFilePrefix();
        String trainFile = tempoutputPath + ".train";
        String testFile = tempoutputPath + ".test";
        String classifierFile = tempoutputPath + ".out";

        String[] t2vArgs = buildImportArgs(trainDirs, trainFile,
                removeStopwords, doLowercase, null);
        String[] t2vArgsTest = buildImportArgs(
                Arrays.asList(classificationInputDir), testFile,
                removeStopwords, doLowercase, trainFile);
        String[] v2cArgs = { "--training-file", trainFile,
                "--testing-file", testFile,
                "--output-classifier", classifierFile,
                "--report", "test:raw" };
        System.out.println("Args :" + Arrays.toString(t2vArgs));
        System.out.println("Args test :" + Arrays.toString(t2vArgsTest));
        System.out.println("Command args :" + Arrays.toString(v2cArgs));

        Text2Vectors.main(t2vArgs);
        System.out.println("Created training file " + trainFile);
        Text2Vectors.main(t2vArgsTest);
        System.out.println("Created validation file " + testFile);
        ArrayList<String> result = Vectors2Classify.main(v2cArgs);
        ConsoleView.printlInConsoleln("\nCreated classifier output file "
                + classifierFile);

        BufferedWriter bw = new BufferedWriter(new FileWriter(new File(csvPath)));
        try {
            bw.write("File,Predicted Class,Predicted Class Probability,Other Class Probabilities\n");
            for (String line : result) {
                bw.write(line + "\n");
            }
        } finally {
            // Close even when a write fails so the file handle is not leaked.
            bw.close();
        }
        // Log the path that was actually written.
        ConsoleView.printlInConsoleln("Created prediction CSV file " + csvPath);
        TacitUtility.createRunReport(classificationOutputDir, "Naive Bayes's",
                dateObj);
    }

    /**
     * Recursively deletes everything inside {@code dir}; the directory itself
     * is kept. Deletion is best effort: failures to delete an entry are
     * ignored. Does nothing when {@code dir} is {@code null}, is not a
     * directory, or cannot be listed.
     *
     * @param dir directory to empty
     */
    public void purgeDirectory(File dir) {
        if (dir == null || !dir.isDirectory()) {
            return;
        }
        // listFiles() returns null on an I/O error or if the directory
        // vanished in the meantime; list once and guard against that.
        File[] children = dir.listFiles();
        if (children == null) {
            return;
        }
        for (File child : children) {
            if (child.isDirectory()) {
                purgeDirectory(child);
            }
            child.delete();
        }
    }

    /**
     * Copies the files of every class into a fresh directory named after the
     * class under the staging root, and appends the absolute path of each
     * such directory to {@code trainingDataPaths}.
     *
     * <p>NOTE: the directory name is the last path segment of the map key, so
     * two keys sharing that segment are staged in the same directory and the
     * later one purges the files copied for the earlier one.
     *
     * @param classPaths maps a class path to the files belonging to that class
     * @param trainingDataPaths receives the absolute staged directory paths
     * @param monitor progress monitor; advanced by one unit per class
     * @throws IOException if a file cannot be copied
     */
    public void createTempDirectories(HashMap<String, List<String>> classPaths,
            ArrayList<String> trainingDataPaths, IProgressMonitor monitor)
            throws IOException {
        ensureDirectory(this.tmpLocation);
        for (java.util.Map.Entry<String, List<String>> entry : classPaths.entrySet()) {
            String className = new File(entry.getKey()).getName();
            String stagedPath = this.tmpLocation + File.separator + className;
            System.out.println("Training data dir :" + stagedPath);

            File stagedDir = new File(stagedPath);
            if (stagedDir.exists()) {
                // Drop leftovers from an earlier run before copying.
                purgeDirectory(stagedDir);
            }
            stagedDir.mkdirs();
            for (String sourceFile : entry.getValue()) {
                FileUtils.copyFileToDirectory(new File(sourceFile), stagedDir);
            }
            trainingDataPaths.add(stagedDir.getAbsolutePath());
            monitor.worked(1); // processing of each directory/class
        }
    }

    /**
     * Removes the given staged training directories and then the staging
     * root itself, including anything left inside it.
     *
     * @param trainingDataPaths directories to remove
     */
    public void deleteTempDirectories(ArrayList<String> trainingDataPaths) {
        for (String path : trainingDataPaths) {
            System.out.println("Cleaning directory " + path);
            File staged = new File(path);
            purgeDirectory(staged);
            staged.delete();
        }
        File root = new File(this.tmpLocation);
        purgeDirectory(root);
        root.delete();
    }

    /**
     * Recursively collects the absolute paths of all non-directory entries
     * below {@code DirPath} into {@code files}. Nothing is added when the
     * path cannot be listed (for example, it is missing or not a directory).
     *
     * @param DirPath directory to walk
     * @param files receives the absolute paths found
     */
    public void selectAllFiles(String DirPath, ArrayList<String> files) {
        File[] entries = new File(DirPath).listFiles();
        if (entries == null) {
            return;
        }
        for (File entry : entries) {
            if (entry.isDirectory()) {
                selectAllFiles(entry.getAbsolutePath(), files);
            } else {
                files.add(entry.getAbsolutePath());
            }
        }
    }

    /** Creates the directory (and missing parents) unless it already exists. */
    private static void ensureDirectory(String path) {
        File dir = new File(path);
        if (!dir.exists()) {
            dir.mkdirs();
        }
    }

    /** Returns the common path prefix of this instance's intermediate files. */
    private String tempFilePrefix() {
        return this.tempoutputDir + File.separator + this.currTime;
    }

    /** Returns a copy of {@code paths} without the empty strings. */
    private static List<String> nonEmptyPaths(List<String> paths) {
        List<String> kept = new ArrayList<String>(paths.size());
        for (String path : paths) {
            if (!path.isEmpty()) {
                kept.add(path);
            }
        }
        return kept;
    }

    /**
     * Builds the Text2Vectors argument vector: the input directories, the
     * output file, the fixed {@code --keep-sequence FALSE} flag, the stop-word
     * and case flags and, when {@code pipeFile} is not {@code null}, a
     * trailing {@code --use-pipe-from} option.
     */
    private static String[] buildImportArgs(List<String> inputDirs,
            String outputFile, boolean removeStopwords, boolean doLowercase,
            String pipeFile) {
        List<String> args = new ArrayList<String>();
        args.add("--input");
        args.addAll(inputDirs);
        args.addAll(Arrays.asList(
                "--output", outputFile,
                "--keep-sequence", "FALSE",
                "--remove-stopwords", removeStopwords ? "TRUE" : "FALSE",
                "--preserve-case", doLowercase ? "FALSE" : "TRUE"));
        if (pipeFile != null) {
            args.add("--use-pipe-from");
            args.add(pipeFile);
        }
        return args.toArray(new String[args.size()]);
    }
}