package jhazm;
import jhazm.utility.MakeTrans;
import jhazm.utility.RegexPattern;
import java.util.ArrayList;
import java.util.List;
/**
 * Normalizes Persian text in up to three optional passes, applied in this
 * order by {@link #run(String)}: character refinement, punctuation spacing
 * and affix spacing.
 *
 * <p>Every pass is a fixed list of {@link RegexPattern} rewrites that is built
 * once in the constructor and applied sequentially, so the output of one rule
 * is the input of the next. Character refinement additionally runs a
 * character-for-character translation table ({@link MakeTrans}) first.
 *
 * <p>Invisible characters used by the rules (zero-width non-joiner,
 * non-breaking space) are written as Unicode escapes so they cannot be lost
 * by editors or copy/paste.
 *
 * @author Mojtaba Khallash
 */
public class Normalizer {
    /**
     * Shared instance returned by {@link #i()}; {@code null} until the first
     * call. Kept public and mutable for backward compatibility.
     */
    public static Normalizer instance;

    /** Zero-width non-joiner (U+200C), inserted between a word and its affix. */
    private static final String ZWNJ = "\u200c";

    /** Punctuation that attaches to the preceding word; pre-escaped for use inside a regex character class. */
    private static final String PUNC_AFTER = "!:\\.،؛؟»\\]\\)\\}";

    /** Punctuation that attaches to the following word; pre-escaped for use inside a regex character class. */
    private static final String PUNC_BEFORE = "«\\[\\(\\{";

    private final boolean characterRefinement;
    private final List<RegexPattern> characterRefinementPatterns;
    private final boolean punctuationSpacing;
    private final List<RegexPattern> punctuationSpacingPatterns;
    private final boolean affixSpacing;
    private final List<RegexPattern> affixSpacingPatterns;
    private final MakeTrans translations;

    /** Creates a normalizer with all three passes enabled. */
    public Normalizer() {
        this(true, true, true);
    }

    /**
     * Creates a normalizer with the selected passes enabled.
     *
     * @param characterRefinement whether {@link #run(String)} translates characters and
     *                            cleans up keshide, carriage returns, repeated spaces,
     *                            repeated newlines and triple dots
     * @param punctuationSpacing  whether {@link #run(String)} fixes spacing around punctuation
     * @param affixSpacing        whether {@link #run(String)} attaches detached prefixes and suffixes
     */
    public Normalizer(boolean characterRefinement, boolean punctuationSpacing, boolean affixSpacing) {
        this.characterRefinement = characterRefinement;
        this.punctuationSpacing = punctuationSpacing;
        this.affixSpacing = affixSpacing;

        // Maps Arabic kaf/yeh, ';', '%' and ASCII digits to their Persian forms.
        // NOTE(review): the first source character is written as U+00A0 (non-breaking
        // space) because a plain space -> space entry would be a no-op; confirm
        // against the upstream table.
        this.translations = new MakeTrans("\u00a0كي;%1234567890", " کی؛٪۱۲۳۴۵۶۷۸۹۰");

        // Lists are always non-null; a disabled pass simply has no rules.
        this.characterRefinementPatterns = characterRefinement
                ? buildCharacterRefinementPatterns()
                : new ArrayList<RegexPattern>();
        this.punctuationSpacingPatterns = punctuationSpacing
                ? buildPunctuationSpacingPatterns()
                : new ArrayList<RegexPattern>();
        this.affixSpacingPatterns = affixSpacing
                ? buildAffixSpacingPatterns()
                : new ArrayList<RegexPattern>();
    }

    /**
     * Returns the shared default-configured instance, creating it on first use.
     * The lazy initialisation is not synchronized.
     *
     * @return the shared {@code Normalizer}
     */
    public static Normalizer i() {
        if (instance == null) {
            instance = new Normalizer();
        }
        return instance;
    }

    /**
     * Runs every enabled pass over {@code text}, in the order character
     * refinement, punctuation spacing, affix spacing.
     *
     * @param text the text to normalize
     * @return the normalized text
     */
    public String run(String text) {
        if (this.characterRefinement) {
            text = characterRefinement(text);
        }
        if (this.punctuationSpacing) {
            text = punctuationSpacing(text);
        }
        if (this.affixSpacing) {
            text = affixSpacing(text);
        }
        return text;
    }

    /** Builds the clean-up rules applied after character translation. */
    private static List<RegexPattern> buildCharacterRefinementPatterns() {
        List<RegexPattern> patterns = new ArrayList<>();
        // remove "keshide" and "carriage return" characters
        patterns.add(new RegexPattern("[ـ\\r]", ""));
        // remove extra spaces
        patterns.add(new RegexPattern(" +", " "));
        // remove extra newlines (collapse runs of two or more to exactly two)
        patterns.add(new RegexPattern("\n\n+", "\n\n"));
        // replace 3 dots (and an optional leading space) with space + ellipsis
        patterns.add(new RegexPattern(" ?\\.\\.\\.", " …"));
        return patterns;
    }

    /** Builds the rules that fix spacing around opening and closing punctuation. */
    private static List<RegexPattern> buildPunctuationSpacingPatterns() {
        List<RegexPattern> patterns = new ArrayList<>();
        // remove space before punctuation
        patterns.add(new RegexPattern(" ([" + PUNC_AFTER + "])", "$1"));
        // remove space after punctuation
        patterns.add(new RegexPattern("([" + PUNC_BEFORE + "]) ", "$1"));
        // put space after
        patterns.add(new RegexPattern("([" + PUNC_AFTER + "])([^ " + PUNC_AFTER + "])", "$1 $2"));
        // put space before
        patterns.add(new RegexPattern("([^ " + PUNC_BEFORE + "])([" + PUNC_BEFORE + "])", "$1 $2"));
        return patterns;
    }

    /**
     * Builds the rules that attach detached affixes to their word. Each rule
     * replaces the separating space with a zero-width non-joiner so the affix
     * is attached without the letters being joined.
     */
    private static List<RegexPattern> buildAffixSpacingPatterns() {
        List<RegexPattern> patterns = new ArrayList<>();
        // fix ی space: attach a standalone ی to a preceding word ending in ه
        patterns.add(new RegexPattern("([^ ]ه) ی ", "$1" + ZWNJ + "ی "));
        // put zwnj after می, نمی
        patterns.add(new RegexPattern("(^| )(ن?می) ", "$1$2" + ZWNJ));
        // put zwnj before تر, ترین, ها, های
        patterns.add(new RegexPattern(
                " (تر(ی(ن)?)?|ها(ی)?)(?=[ \n" + PUNC_AFTER + PUNC_BEFORE + "]|$)", ZWNJ + "$1"));
        // join ام, ات, اش, ای to a preceding word ending in ه
        patterns.add(new RegexPattern(
                "([^ ]ه) (ا(م|ت|ش|ی))(?=[ \n" + PUNC_AFTER + "]|$)", "$1" + ZWNJ + "$2"));
        return patterns;
    }

    /** Translates characters, then applies the character-refinement rules in order. */
    private String characterRefinement(String text) {
        text = this.translations.translate(text);
        for (RegexPattern pattern : this.characterRefinementPatterns) {
            text = pattern.apply(text);
        }
        return text;
    }

    /** Applies the punctuation-spacing rules in order. */
    private String punctuationSpacing(String text) {
        // TODO: don't put space inside time and float numbers
        for (RegexPattern pattern : this.punctuationSpacingPatterns) {
            text = pattern.apply(text);
        }
        return text;
    }

    /** Applies the affix-spacing rules in order. */
    private String affixSpacing(String text) {
        for (RegexPattern pattern : this.affixSpacingPatterns) {
            text = pattern.apply(text);
        }
        return text;
    }
}