package context.core.task.removestopword; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.io.FilenameUtils; import context.core.entity.CorpusData; import context.core.entity.FileData; import context.core.util.JavaIO; /** * * @author Kiumars Soltani This class will remove stop words defined by the user */ public class RemoveStopWord { private CorpusData input; private CorpusData output; private RemoveStopwordsTaskInstance instance; private String replaceString; /** * */ public RemoveStopWord() { this.replaceString = ""; } /** * * @param instance */ public RemoveStopWord(RemoveStopwordsTaskInstance instance) { this.instance = instance; this.replaceString = ""; init(); } /** * */ public void init() { if (instance.getType() == 1) { this.replaceString = ""; } else { this.replaceString = "```"; } this.input = (CorpusData) instance.getInput(); this.output = (CorpusData) instance.getTextOutput(); } /** * * @return */ public boolean remove_stop_words() { List<String> stopwords = new ArrayList<String>(); if (JavaIO.readCSVFileIntoList(stopwords, instance.getStopwordFile().getFile(), "\n", true) == 0) { return false; } Pattern p = generateStopwordsPattern(stopwords); System.out.println("Read all the stop words"); try { for (FileData f : input.getFiles()) { String content = JavaIO.readFile(f.getFile()); StringBuffer s = removeStringPattern(p, content); /* StringBuffer sb = new StringBuffer(); PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<CoreLabel>(new FileReader(f.getFile()), new CoreLabelTokenFactory(), "invertible=true"); for (String word; ptbt.hasNext();) { CoreLabel label = (CoreLabel) ptbt.next(); //word = label.originalText(); word = label.word(); String word_lowercase = word.toLowerCase(); String before = label.getString(CoreAnnotations.BeforeAnnotation.class); System.out.println(before); if (stopwords.contains(word_lowercase)) { sb.append(before + this.replaceString); } else { sb.append(before + word); } } */ String inputNameWithoutExtension = FilenameUtils.getBaseName(f.getFile().getName()); String inputExtension = FilenameUtils.getExtension(f.getFile().getName()); int index = output.addFile(inputNameWithoutExtension + "-RS." + inputExtension); output.writeFile(index, s.toString()); } } catch (IOException e) { e.printStackTrace(); return false; } return true; } /** * * @param stopwords * @return */ public Pattern generateStopwordsPattern(List<String> stopwords) { StringBuffer sb = new StringBuffer(); for (String stopword : stopwords) { sb.append(("\\b(?i)" + stopword + "\\b|")); } String regex = sb.substring(0, sb.length() - 1).toLowerCase(); Pattern p = Pattern.compile(regex); return p; } /** * * @param p * @param content * @return */ public StringBuffer removeStringPattern(Pattern p, String content) { StringBuffer s = new StringBuffer(); Matcher m = p.matcher(content); while (m.find()) { m.appendReplacement(s, replaceString); } m.appendTail(s); return s; } }