package edu.usc.cssl.tacit.common; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.net.URL; import java.util.HashSet; import java.util.List; import org.apache.commons.io.FileUtils; import org.eclipse.core.runtime.FileLocator; import org.eclipse.core.runtime.Path; import org.eclipse.core.runtime.Platform; import org.osgi.framework.Bundle; import com.cybozu.labs.langdetect.Detector; import com.cybozu.labs.langdetect.DetectorFactory; import com.cybozu.labs.langdetect.LangDetectException; import edu.usc.cssl.tacit.common.snowballstemmer.DanishStemmer; import edu.usc.cssl.tacit.common.snowballstemmer.DutchStemmer; import edu.usc.cssl.tacit.common.snowballstemmer.EnglishStemmer; import edu.usc.cssl.tacit.common.snowballstemmer.FinnishStemmer; import edu.usc.cssl.tacit.common.snowballstemmer.FrenchStemmer; import edu.usc.cssl.tacit.common.snowballstemmer.GermanStemmer; import edu.usc.cssl.tacit.common.snowballstemmer.HungarianStemmer; import edu.usc.cssl.tacit.common.snowballstemmer.ItalianStemmer; import edu.usc.cssl.tacit.common.snowballstemmer.NorwegianStemmer; import edu.usc.cssl.tacit.common.snowballstemmer.SnowballStemmer; import edu.usc.cssl.tacit.common.snowballstemmer.TurkishStemmer; import edu.usc.cssl.tacit.common.ui.CommonUiActivator; import edu.usc.cssl.tacit.common.ui.IPreprocessorSettingsConstant; import edu.usc.cssl.tacit.common.ui.views.ConsoleView; public class Preprocess { private boolean doLowercase = false; private boolean doStemming = false; private boolean doStopWords = false; private boolean doLangDetect = false; private boolean doCleanUp = false; private String delimiters = " .,;'\"!-()[]{}:?"; // private String[] inputFiles; private String outputPath; private String stopwordsFile; private HashSet<String> stopWordsSet = new HashSet<String>(); SnowballStemmer stemmer = null; private String stemLang; private String callingPlugin; private String currTime; private String preprocessingParentFolder; public Preprocess(String caller) { this.stopwordsFile = CommonUiActivator.getDefault() .getPreferenceStore().getString("stop_words_path"); this.delimiters = CommonUiActivator.getDefault().getPreferenceStore() .getString("delimeters"); this.stemLang = CommonUiActivator.getDefault().getPreferenceStore() .getString("language"); this.doLowercase = Boolean.parseBoolean(CommonUiActivator.getDefault() .getPreferenceStore().getString("islower_case")); this.doStemming = Boolean.parseBoolean(CommonUiActivator.getDefault() .getPreferenceStore().getString("isStemming")); this.doCleanUp = Boolean.parseBoolean(CommonUiActivator.getDefault() .getPreferenceStore().getString("ispreprocessed")); this.outputPath = CommonUiActivator.getDefault().getPreferenceStore() .getString("pp_output_path"); this.callingPlugin = caller; this.currTime = String.valueOf(System.currentTimeMillis()); } // for File as well as Directory public String doPreprocessing(List<String> inputFiles, String subFolder) throws IOException { File[] files; files = new File[inputFiles.size()]; int i = 0; boolean outputPathNotSet = false; for (String filepath : inputFiles) { if ((new File(filepath).isDirectory())) continue; if (new File(filepath).getAbsolutePath().contains("DS_Store")) continue; files[i] = new File(filepath); i = i + 1; } if (this.outputPath == null || this.outputPath.trim().length() == 0) { this.outputPath = System.getProperty("user.dir"); // this.outputPath = (new File(inputFiles.get(0)).getParent()); outputPathNotSet = true; } preprocessingParentFolder = this.outputPath + File.separator + callingPlugin + "_" + currTime; if (outputPathNotSet) this.outputPath = ""; if (!(new File(preprocessingParentFolder).exists())) { new File(preprocessingParentFolder).mkdir(); ConsoleView.printlInConsoleln("Folder " + preprocessingParentFolder + " created successfully."); } if (subFolder.trim().length() != 0) { preprocessingParentFolder = preprocessingParentFolder + File.separator + subFolder; if (new File(preprocessingParentFolder).mkdir()) { ConsoleView.printlInConsoleln("Folder " + preprocessingParentFolder + " created successfully."); } } if (stopwordsFile.trim().length() != 0) { doStopWords = true; String currentLine; BufferedReader br = new BufferedReader(new FileReader(new File( stopwordsFile))); while ((currentLine = br.readLine()) != null) { stopWordsSet.add(currentLine.trim().toLowerCase()); } br.close(); } if (doStemming) { // If stemming has to be performed, find the // appropriate stemmer. if (stemLang.equals("AUTODETECT")) { doLangDetect = true; Bundle bundle = Platform .getBundle("edu.usc.cssl.tacit.common"); URL url = FileLocator.find(bundle, new Path("profiles"), null); URL fileURL = FileLocator.toFileURL(url); //ConsoleView.printlInConsoleln(fileURL.getPath()); try { DetectorFactory.loadProfile(fileURL.getPath()); } catch (com.cybozu.labs.langdetect.LangDetectException ex) { // ex.printStackTrace(); ConsoleView.printlInConsoleln("Exception code - " + ex.getCode()); // ex.getCode().toString() -> is not visible! } } else { doLangDetect = false; stemmer = stemSelect(stemLang); } } int currentCount = 0; int adder = files.length/10; int breakPoint = adder; int statusPoint = 0; ConsoleView.printlInConsoleln("Preprocessing Status: 0% completed"); for (File f : files) { currentCount++; if (currentCount >= breakPoint){ if (statusPoint != 9){ statusPoint++; ConsoleView.printlInConsoleln("Preprocessing Status: " + statusPoint*10+"% completed"); breakPoint = breakPoint+adder; } } if (f == null) break; // Mac cache file filtering if (f.getAbsolutePath().contains("DS_Store")) continue; if ("_preprocessed".equals(f.getName())) continue; String inputFile = f.getAbsolutePath(); //ConsoleView.printlInConsoleln("Preprocessing " + inputFile); // doLangDetect only if doStemming is true if (doLangDetect) { try { stemmer = findLangStemmer(f); } catch (LangDetectException e) { e.printStackTrace(); } } File iFile = new File(inputFile); if (!iFile.exists() || iFile.isDirectory()) { ConsoleView.printlInConsoleln("Error in input file path " + iFile.getAbsolutePath()); continue; } File oFile = new File(preprocessingParentFolder + System.getProperty("file.separator") + f.getName()); BufferedReader br = new BufferedReader(new InputStreamReader( new FileInputStream(iFile), "UTF8")); BufferedWriter bw = new BufferedWriter(new OutputStreamWriter( new FileOutputStream(oFile), "UTF-8")); String linear; while ((linear = br.readLine()) != null) { if (linear != "") { if (doLowercase) linear = linear.toLowerCase(); for (char c : delimiters.toCharArray()) linear = linear.replace(c, ' '); if (doStopWords) linear = removeStopWords(linear); if (doStemming && stemmer != null) linear = stem(linear); bw.write(linear + "\n"); } } //ConsoleView.printlInConsoleln(preprocessingParentFolder // + System.getProperty("file.separator") + f.getName()); br.close(); bw.close(); } ConsoleView.printlInConsoleln("Preprocessing Status: 100% completed"); //ConsoleView.printlInConsoleln("Preprocessing Status: 100% completed"); ConsoleView.printlInConsoleln("Preprocessed files stored in " + preprocessingParentFolder); return preprocessingParentFolder; } private SnowballStemmer findLangStemmer(File iFile) throws IOException, LangDetectException { BufferedReader br = new BufferedReader(new FileReader(iFile)); String sampleText = ""; for (int i = 0; i < 10; i++) { String currentLine = br.readLine(); if (currentLine == null) break; sampleText = sampleText + currentLine.trim().replace('\n', ' '); } Detector detector = DetectorFactory.create(); detector.append(sampleText); String lang = detector.detect(); br.close(); return stemSelect(lang); } private String stem(String linear) { if (linear.isEmpty()) return ""; StringBuilder returnString = new StringBuilder(); String[] wordArray = linear.split("\\s+"); for (String word : wordArray) { stemmer.setCurrent(word); String stemmedWord = ""; if (stemmer.stem()) stemmedWord = stemmer.getCurrent(); if (!stemmedWord.equals("")) word = stemmedWord; returnString.append(word); returnString.append(' '); } return returnString.toString(); } private SnowballStemmer stemSelect(String stemLang) { if (stemLang.toUpperCase().equals("EN")) { return new EnglishStemmer(); } else if (stemLang.toUpperCase().equals("DE")) { return new GermanStemmer(); } else if (stemLang.toUpperCase().equals("FR")) { return new FrenchStemmer(); } else if (stemLang.toUpperCase().equals("IT")) { return new ItalianStemmer(); } else if (stemLang.toUpperCase().equals("DA")) { return new DanishStemmer(); } else if (stemLang.toUpperCase().equals("NL")) { return new DutchStemmer(); } else if (stemLang.toUpperCase().equals("FI")) { return new FinnishStemmer(); } else if (stemLang.toUpperCase().equals("HU")) { return new HungarianStemmer(); } else if (stemLang.toUpperCase().equals("NO")) { return new NorwegianStemmer(); } else if (stemLang.toUpperCase().equals("TR")) { return new TurkishStemmer(); } return null; } private String removeStopWords(String linear) { StringBuilder returnString = new StringBuilder(); String[] wordArray = linear.split("\\s+"); for (String word : wordArray) { if (!stopWordsSet.contains(word.toLowerCase())) { returnString.append(word); returnString.append(' '); } } return returnString.toString(); } public void clean() { final Boolean cleanUp = Boolean.valueOf(CommonUiActivator.getDefault() .getPreferenceStore().getString(IPreprocessorSettingsConstant.PRE_PROCESSED)); if(!cleanUp){ return; } File toDel = new File(this.outputPath+System.getProperty("file.separator")+this.callingPlugin+"_"+this.currTime); try { if (toDel.exists()) FileUtils.deleteDirectory(toDel); } catch (IOException e) { e.printStackTrace(); } } public boolean doCleanUp() { return doCleanUp; } }