package i5.las2peer.services.ocd.preprocessing;
import java.util.Locale;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.en.PorterStemFilter;
import org.tartarus.snowball.ext.PorterStemmer;
/**
 * Cleans up raw thread text: strips markup and punctuation, lower-cases the
 * result and (optionally, currently disabled) stems the remaining words.
 *
 * <p>None of the methods accept {@code null}; passing {@code null} results in
 * a {@link NullPointerException}.
 */
public class TextProcessor {

    /** Anything that looks like a markup tag: a '&lt;' up to the next '&gt;'. */
    private static final Pattern HTML_TAG = Pattern.compile("<[^>]*>");

    /** A single character of the {@code \p{Punct}} class. */
    private static final Pattern PUNCTUATION = Pattern.compile("\\p{Punct}");

    /** A single whitespace character. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s");

    /** One or more consecutive whitespace characters; used to split text into words. */
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

    /**
     * Runs the preprocessing pipeline on a thread's text.
     *
     * @param thread the raw text; must not be {@code null}
     * @return the text without markup tags and punctuation, in lower case
     */
    public String preprocText(String thread) {
        thread = deleteNonWords(thread);
        // Stemming is currently switched off; uncomment the next line to enable it.
        //thread = stemming(thread);
        return thread;
    }

    /**
     * Removes every whitespace character from the given text.
     *
     * <p>The misspelt method name is kept as-is so existing callers keep compiling.
     *
     * @param thread the text to strip; must not be {@code null}
     * @return the text with all whitespace characters removed
     */
    public String deletWhiteSpace(String thread) {
        return WHITESPACE.matcher(thread).replaceAll("");
    }

    /**
     * Replaces markup tags and punctuation by spaces and lower-cases the text.
     *
     * @param thread the raw text; must not be {@code null}
     * @return the cleaned, lower-cased text
     */
    private String deleteNonWords(String thread) {
        // Tags must go before punctuation, otherwise '<' and '>' would be
        // blanked out first and the tag names would survive as words.
        // A tag becomes a space (not nothing) so that words separated only by
        // markup, e.g. "<p>first</p><p>second</p>", do not fuse into one word.
        String withoutTags = HTML_TAG.matcher(thread).replaceAll(" ");
        String withoutPunctuation = PUNCTUATION.matcher(withoutTags).replaceAll(" ");
        return withoutPunctuation.toLowerCase(Locale.ROOT);
    }

    /**
     * Stems every whitespace-separated word of the text with the Porter stemmer.
     *
     * <p>The stemmer works on one word at a time, so the text is split into
     * words first and each word is stemmed on its own. Words in the result are
     * separated by a single space; leading and trailing whitespace is dropped.
     *
     * @param thread the text to stem; must not be {@code null}
     * @return the stemmed words joined by single spaces
     */
    private String stemming(String thread) {
        PorterStemmer stemmer = new PorterStemmer();
        StringBuilder stemmed = new StringBuilder(thread.length());
        for (String word : WHITESPACE_RUN.split(thread)) {
            if (word.isEmpty()) {
                // split() yields an empty first element when the text starts with whitespace
                continue;
            }
            stemmer.setCurrent(word);
            stemmer.stem();
            if (stemmed.length() > 0) {
                stemmed.append(' ');
            }
            stemmed.append(stemmer.getCurrent());
        }
        return stemmed.toString();
    }
}