package context.core.task.keyword;
import context.core.entity.CorpusData;
import context.core.entity.FileData;
import context.core.entity.TabularData;
import context.core.util.JavaIO;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.FilenameUtils;
/**
 * Keyword-in-context (KWIC) task.
 *
 * <p>For every file of the input corpus this class looks up the words listed in
 * the task's keyword file, counts how often each keyword occurs, and rewrites
 * the text with the non-keyword words in a window around each occurrence
 * replaced by a placeholder token. The rewritten texts are added to the text
 * output corpus and the keyword frequencies are written as CSV to the path of
 * the tabular output at index 0.
 *
 * @author Kiumars Soltani
 */
public class KeywordInContext {

    /** Token written in place of every masked word. */
    private static final String MASK_TOKEN = "```";

    private CorpusData input;
    private CorpusData output;
    private TabularData tabular;
    private final KeywordTaskInstance instance;
    private String replaceString;

    /**
     * Creates the task and immediately binds it to the inputs and outputs of
     * {@code instance} via {@link #init()}.
     *
     * @param instance task configuration providing the input corpus, the
     *                 keyword file, the window bounds and the outputs
     */
    public KeywordInContext(KeywordTaskInstance instance) {
        this.instance = instance;
        init();
    }

    /**
     * (Re)reads the input corpus, the text output corpus and the tabular output
     * (index 0) from the task instance and resets the mask token.
     */
    public void init() {
        this.replaceString = MASK_TOKEN;
        this.input = (CorpusData) instance.getInput();
        this.output = (CorpusData) instance.getTextOutput();
        this.tabular = instance.getTabularOutput(0);
    }

    /**
     * Runs the task over every input file.
     *
     * <p>Each file is split on runs of non-word characters ({@code \W+}). For
     * every word that is a keyword, its occurrence count is incremented and the
     * non-keyword words at offsets {@code 1 .. bound - 1} to the left and right
     * are replaced by the mask token; all other words are kept. The words are
     * then joined with single spaces (including a trailing one) and written to
     * a new output file named {@code <base>-KWIC.<extension>}. Finally the
     * keyword counts are written as CSV.
     *
     * <p>NOTE(review): the method name suggests that everything <em>outside</em>
     * the keyword windows is removed, but the code masks the words
     * <em>inside</em> the windows. Likewise the window covers {@code bound - 1}
     * words per side, not {@code bound}. Both behaviours are preserved here;
     * confirm the intent with the task's callers.
     *
     * @return {@code false} if reading the keyword file reports 0 (presumably
     *         no keywords were read) or an {@link IOException} occurs;
     *         {@code true} otherwise
     */
    public boolean removeOthers() {
        List<String> keywordList = new ArrayList<String>();
        if (JavaIO.readCSVFileIntoList(keywordList, instance.getKeywordFile().getFile(), "\n", true) == 0) {
            return false;
        }
        System.out.println("Read all the key words");

        // Membership is tested for every word and every window neighbour, so use
        // a hash set instead of scanning the list each time.
        Set<String> keywords = new HashSet<String>(keywordList);
        Map<String, Integer> keywordMap = new HashMap<String, Integer>();
        try {
            for (FileData f : input.getFiles()) {
                String content = JavaIO.readFile(f.getFile());
                String[] words = content.split("\\W+");
                boolean[] mark = markContext(words, keywords, keywordMap);

                String sourceName = f.getFile().getName();
                String inputNameWithoutExtension = FilenameUtils.getBaseName(sourceName);
                String inputExtension = FilenameUtils.getExtension(sourceName);
                int index = output.addFile(inputNameWithoutExtension + "-KWIC." + inputExtension);
                output.writeFile(index, renderWords(words, mark));
            }
            this.writeCsv(keywordMap, tabular.getPath().get());
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }

    /**
     * Counts keyword occurrences in {@code words} and flags the non-keyword
     * words surrounding each occurrence.
     *
     * @param words    tokens of one input file
     * @param keywords keywords to look for
     * @param counts   running keyword frequencies, updated in place
     * @return array parallel to {@code words}; {@code true} marks a word that
     *         must be replaced by the mask token
     */
    private boolean[] markContext(String[] words, Set<String> keywords, Map<String, Integer> counts) {
        // A freshly allocated boolean[] is already all false.
        boolean[] mark = new boolean[words.length];
        for (int j = 0; j < words.length; j++) {
            String word = words[j];
            if (!keywords.contains(word)) {
                continue;
            }
            Integer seen = counts.get(word);
            counts.put(word, seen == null ? 1 : seen + 1);

            // Offsets start at 1 and stop before the bound, so at most
            // (bound - 1) neighbours are visited on each side; the second
            // condition stops at the edges of the text.
            for (int i = 1; i < instance.getLeftBound() && j - i >= 0; i++) {
                if (!keywords.contains(words[j - i])) {
                    mark[j - i] = true;
                }
            }
            for (int i = 1; i < instance.getRightBound() && j + i < words.length; i++) {
                if (!keywords.contains(words[j + i])) {
                    mark[j + i] = true;
                }
            }
        }
        return mark;
    }

    /**
     * Joins {@code words} into one string, substituting the mask token for
     * flagged words. Every word, including the last, is followed by a space.
     */
    private String renderWords(String[] words, boolean[] mark) {
        StringBuilder text = new StringBuilder();
        for (int j = 0; j < words.length; j++) {
            text.append(mark[j] ? replaceString : words[j]).append(" ");
        }
        return text.toString();
    }

    /**
     * Writes the keyword frequencies as CSV (header plus one
     * {@code keyword,count} line per entry) to {@code filePath}, deleting any
     * file that already exists there first.
     *
     * @param keywordsMap keyword to occurrence count
     * @param filePath    destination of the CSV file
     */
    private void writeCsv(Map<String, Integer> keywordsMap, String filePath) {
        System.out.println("keywords#=" + keywordsMap.size());
        StringBuilder sb = new StringBuilder();
        sb.append("Keyword, Frequency\n");
        for (Map.Entry<String, Integer> entry : keywordsMap.entrySet()) {
            sb.append(entry.getKey()).append(",").append(entry.getValue()).append("\n");
        }
        // 2016.03: remove a stale file from a previous run before writing.
        File toDelete = new File(filePath);
        if (toDelete.exists() && !toDelete.delete()) {
            // Best effort: report the failure but still attempt the write.
            System.err.println("Could not delete existing file: " + filePath);
        }
        FileData.writeDataIntoFile(sb.toString(), filePath);
    }
}