package jhazm.tokenizer;
import jhazm.utility.RegexPattern;
import java.util.Arrays;
import java.util.List;
/**
 * Splits a text into sentences.
 *
 * <p>A sentence boundary is a run of one or more terminator characters
 * ({@code !}, {@code .}, {@code ?}, {@code ⸮}, {@code ؟}) followed by at least
 * one space or newline.
 *
 * @author Mojtaba Khallash
 */
public class SentenceTokenizer {
    /**
     * Lazily created shared instance; prefer {@link #i()} over reading this
     * field directly, since it is {@code null} until {@code i()} is first called.
     */
    public static SentenceTokenizer instance;

    /**
     * Boundary rule: terminator run followed by spaces/newlines, paired with a
     * replacement that keeps the terminators and appends a blank-line marker.
     */
    private final RegexPattern pattern;

    /**
     * Creates a tokenizer with the default sentence-boundary rule.
     */
    public SentenceTokenizer() {
        this.pattern = new RegexPattern("([!\\.\\?⸮؟]+)[ \\n]+", "$1\n\n");
    }

    /**
     * Returns the shared tokenizer, creating it on first use.
     *
     * <p>The lazy initialization is not synchronized.
     *
     * @return the shared {@code SentenceTokenizer} instance
     */
    public static SentenceTokenizer i() {
        if (instance == null) {
            instance = new SentenceTokenizer();
        }
        return instance;
    }

    /**
     * Splits {@code text} into sentences.
     *
     * <p>The boundary rule is applied first (NOTE(review): presumably
     * {@code RegexPattern.apply} replaces every match with the replacement
     * string; confirm against {@code RegexPattern}), then the text is split on
     * the {@code "\n\n"} marker. In each piece any remaining single newline is
     * turned into a space and surrounding whitespace is trimmed.
     *
     * @param text the text to split
     * @return a fixed-size list of cleaned sentences, in order of appearance
     */
    public List<String> tokenize(String text) {
        String separated = this.pattern.apply(text);
        String[] sentences = separated.split("\n\n");
        // Write the cleaned value back by index: assigning to an enhanced-for
        // loop variable only changes a local copy and leaves the element as is.
        for (int index = 0; index < sentences.length; index++) {
            sentences[index] = sentences[index].replace("\n", " ").trim();
        }
        return Arrays.asList(sentences);
    }
}