package context.core.task.codebook;
import context.core.entity.CorpusData;
import context.core.entity.FileData;
import context.core.util.JavaIO;
import context.core.util.comparators.StringLengthComparator;
import gnu.trove.map.hash.TObjectIntHashMap;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
/**
*
* @author Kiumars Soltani
*
*/
public class Codebook {
private CodebookApplicationTaskInstance instance;
private TObjectIntHashMap<String> cbMap;
private Vector<Pair<String, String>> cbInfo;
private CorpusData input;
private CorpusData textOutput;
/**
*
* @param instance
*/
public Codebook(CodebookApplicationTaskInstance instance) {
this.cbMap = new TObjectIntHashMap<String>();
this.cbInfo = new Vector<Pair<String, String>>();
this.instance = instance;
init();
}
private void init() {
this.input = (CorpusData) instance.getInput();
this.textOutput = (CorpusData) instance.getTextOutput();
}
/**
*
* @return
*/
public boolean loadCodebook() {
File nn = new File(instance.getCodebookFile());
List<String> inn = new ArrayList<String>();
if (JavaIO.readCSVFileIntoList(inn, nn, "[\n\r]", false) == 0) {
return false;
}
String[] entities;
int i = 0;
for (String s : inn) {
entities = s.split(",");
if (entities.length != 3) {
break;
}
cbMap.put(entities[0].toLowerCase().trim(), i);
cbInfo.add(i, new ImmutablePair<String, String>(entities[1], entities[2]));
i++;
}
return true;
}
private String findReplacement(String stemp) {
if (instance.getIsDrop() == 1) {
stemp = stemp.replaceAll("[^.,:;()?!\"\\s]", "");
stemp = stemp.replaceAll("[ ]+", " ");
} else if (instance.getIsDrop() == 2) {
stemp = stemp.replaceAll("[^.,:;()?!\"\\s]", "`");
}
return stemp;
}
/**
*
* @return
*/
public boolean applyCodebook() {
List<FileData> files = input.getFiles();
//Make the patterns
List<String> words = new ArrayList<>(this.cbMap.keySet());
Collections.sort(words, new StringLengthComparator());
Collections.reverse(words);
StringBuffer sb = new StringBuffer();
for (String word : words) {
sb.append(("\\b(?i)" + word + "\\b|"));
}
String regex = sb.substring(0, sb.length() - 1).toLowerCase();
Pattern p = Pattern.compile(regex);
try {
for (FileData ff : files) {
File f = ff.getFile();
StringBuffer s = new StringBuffer();
String content = JavaIO.readFile(f);
Matcher m = p.matcher(content);
String replc = "";
StringBuffer stempb = new StringBuffer();
String stemp = "";
while (m.find()) {
stempb.setLength(0);
stemp = "";
int index = this.cbMap.get(m.group().toLowerCase());
//System.out.println(m.group() + "," + index);
if (instance.getIsNormal() == 0) {
replc = this.cbInfo.get(index).getLeft();
} else {
replc = this.cbInfo.get(index).getRight();
}
m.appendReplacement(stempb, "");
stemp = this.findReplacement(stempb.toString());
s.append(stemp);
s.append(replc);
}
stempb.setLength(0);
m.appendTail(stempb);
s.append(this.findReplacement(stempb.toString()));
String nameInputFileWithoutExtension = FilenameUtils.getBaseName(f.getName());
String inputFileExtension = FilenameUtils.getExtension(f.getName());
final String name = nameInputFileWithoutExtension + "-Cb." + inputFileExtension;
int ii = textOutput.addFile(name);
System.out.println("write codebook applied file " + name);
textOutput.writeFile(ii, s.toString());
}
} catch (IOException e) {
e.printStackTrace();
return false;
}
return true;
}
/**
*
* @return
*/
public Vector<Pair<String, String>> getCbInfo() {
return this.cbInfo;
}
}