package bg.bozho.ikratko;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.annotation.PostConstruct;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.BooleanUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.commons.collections4.trie.PatriciaTrie;
import org.apache.commons.collections4.Trie;
import org.springframework.stereotype.Component;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
@Component
public class Checker {
// Base dictionary built by load(): lower-cased word -> its inflection class flags
// (an empty set when the entry carries no flags). Set to null by initialize(true).
public static Trie<String, Set<String>> dictionary;
// Built by loadFormsDictionary(): every known word form -> its InflectedFormType.
public static Trie<String, InflectedFormType> formsDictionary;
// Guards the one-time loading done by initialize(boolean); checked by process().
private static boolean initialized = false;
// Filled by loadInflections() from /bg_BG.aff:
// inflection class -> (base-form ending -> suffixes that may replace that ending).
public static Map<String, Multimap<String, String>> inflectionClasses = Maps.newHashMap();
// Same layout, filled from /plurals.aff; loadFormsDictionary() uses it to decide
// which generated suffixes denote a plural form.
public static Map<String, Multimap<String, String>> pluralInflectionClasses = Maps.newHashMap();
// A whole word containing one of the listed vowels immediately followed by и or й.
private static final String POTENTIAL_MISTAKE_REGEX = "(\\p{L}*[аъоуеиюя][ий]\\p{L}*)";
// Same shape, but capturing the text before/after the и (resp. й) so that
// replaceAll can swap the letter for its counterpart.
private static final String POTENTIAL_MISTAKE_REGEX_I = "(\\p{L}*[аъоуеиюя])и(\\p{L}*)";
private static final String POTENTIAL_MISTAKE_REGEX_Y = "(\\p{L}*[аъоуеиюя])й(\\p{L}*)";
// Character class of the marks treated as sentence boundaries.
private static final String END_OF_SENTENCE = "[\\.!?]";
// Forms of the verb "to be"; loadFormsDictionary() forces them to the singular
// (REGULAR_FORM_VERB) and plural (PLURAL_FORM_VERB) verb types respectively.
public static final List<String> toBeFormsSg = Arrays.asList("съм", "си", "е", "бях", "беше", "бъда", "бъдеш", "бъде");
public static final List<String> toBeFormsPl = Arrays.asList("сме", "сте", "са", "бяхме", "бяхте", "бяха", "бъдат");
// Singular/plural pronoun pairs handed to agreesOnPlurality() by handleWrongPronounForm().
private static final Set<String> pronounsSgSet = Sets.newHashSet("някой", "никой", "кой", "чий");
private static final Set<String> pronounsPlSet = Sets.newHashSet("някои", "никои", "кои", "чии");
// Singular/plural linking pronoun pairs handed to agreesOnPlurality() by
// handleWrongLinkingPronounForm().
private static final Set<String> linkingPronounsSgSet = Sets.newHashSet("който", "чийто");
private static final Set<String> linkingPronounsPlSet = Sets.newHashSet("които", "чиито");
// Preceding words that handleWrongPluralForms() takes as requiring a plural noun.
// NOTE(review): the identifier is misspelled ("Identfiers"); kept because it is referenced elsewhere.
private static final Set<String> pluralIdentfiers = Sets.newHashSet("няколко", "николко", "много", "малко", "доста", "брой", "безброй", "тези", "онези");
// Preceding words that handleWrongPluralForms() takes as requiring a singular noun.
private static final Set<String> singularIdentfiers = Sets.newHashSet("един", "този", "онзи");
// Inflection class flags that loadFormsDictionary() treats as verb paradigms.
public static final Set<String> verbClasses = Sets.newHashSet("P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z");
/**
 * Lifecycle callback: loads the dictionaries once and then releases the
 * intermediate base dictionary. Equivalent to {@code initialize(true)}.
 */
@PostConstruct
public synchronized void initialize() {
    initialize(true);
}

/**
 * Loads the base dictionary, the inflection classes and the derived forms
 * dictionary. Subsequent calls are no-ops once loading has succeeded.
 *
 * <p>This overload is intentionally NOT annotated with {@code @PostConstruct}:
 * lifecycle callbacks must not declare parameters, and annotating a method
 * that takes one makes the container reject the bean.
 *
 * @param cleanupDictionary when {@code true}, the base {@link #dictionary} is
 *        set to {@code null} after the forms dictionary has been built
 */
public synchronized void initialize(boolean cleanupDictionary) {
    if (initialized) {
        return;
    }
    load();
    loadInflections();
    loadFormsDictionary();
    if (cleanupDictionary) {
        // eligible for GC. TODO can merge these two load methods, but it's easier not to, for now
        dictionary = null;
    }
    initialized = true;
}
/**
 * Checks the given text for и/й mistakes and, optionally, for words that are
 * not present in the forms dictionary.
 *
 * @param input the text to check
 * @param spellcheckAll when {@code true}, every word that is not an и/й
 *        candidate is also looked up and reported as a spelling mistake if unknown
 * @return the и/й mistakes, the other spelling mistakes and the words that
 *         were skipped as proper names
 * @throws IllegalStateException if the checker has not been initialized
 */
public Result process(String input, boolean spellcheckAll) {
    if (!initialized) {
        throw new IllegalStateException("The checker must be initialized first");
    }
    // getting all words + the punctuation marks for end of sentence
    // in order to be able to identify sentence boundaries
    String[] rawTokens = input.split("(?<=" + END_OF_SENTENCE + ")|(?=" + END_OF_SENTENCE + ")|(\\p{Punct}*\\p{Space}+)");

    // Tokens may carry surrounding whitespace (e.g. the space following a
    // sentence-ending mark) and may be empty. Normalise them once, so that
    // array positions, dictionary look-ups and regex matches all see clean words.
    List<String> tokens = new ArrayList<String>(rawTokens.length);
    for (String rawToken : rawTokens) {
        String token = StringUtils.trimToEmpty(rawToken);
        if (token.length() > 0) {
            tokens.add(token);
        }
    }
    // Re-attach a detached "ма" to the number before it ("32 ма" -> "32-ма").
    // Starts at 1: the very first pair of tokens is a valid candidate as well.
    for (int i = 1; i < tokens.size(); i++) {
        if (tokens.get(i).equals("ма") && NumberUtils.isNumber(tokens.get(i - 1))) {
            tokens.set(i - 1, tokens.get(i - 1) + "-ма");
            tokens.remove(i);
        }
    }
    String[] words = tokens.toArray(new String[tokens.size()]);

    List<Mistake> potentialMistakes = new ArrayList<Mistake>();
    List<Mistake> mistakes = new ArrayList<Mistake>();
    List<Mistake> otherMistakes = new ArrayList<Mistake>();
    List<String> properNames = new ArrayList<String>();

    // Position in the input right after the previously located token. Searching
    // from here (instead of from a sum of word lengths that ignores separators)
    // prevents matching an earlier occurrence of the same text.
    int searchFrom = 0;
    for (int idx = 0; idx < words.length; idx++) {
        String word = words[idx];
        int indexInText = input.indexOf(word, searchFrom);
        if (indexInText >= 0) {
            searchFrom = indexInText + word.length();
        }

        // proper names are not checked. They are those starting with
        // capital letter and are not in the beginning of a sentence
        if (idx > 0 && Character.isUpperCase(word.charAt(0)) && !words[idx - 1].matches(END_OF_SENTENCE)) {
            properNames.add(word);
            continue;
        }

        String lowerCaseWord = word.toLowerCase();
        if (lowerCaseWord.matches(POTENTIAL_MISTAKE_REGEX)) {
            Mistake pm = new Mistake();
            pm.setWord(lowerCaseWord);

            // Look at up to four following words (stopping at the sentence end) and
            // remember the first one that is not a known non-inflectable word.
            // NOTE(review): a word missing from the dictionary is accepted here as well,
            // although the original comment said it should be "set as empty" - confirm intent.
            for (int nextIdx = idx + 1; nextIdx < words.length && nextIdx - idx < 5; nextIdx++) {
                String nextWord = words[nextIdx].toLowerCase();
                if (nextWord.matches(END_OF_SENTENCE)) {
                    break;
                }
                if (formsDictionary.get(nextWord) != InflectedFormType.NOT_INFLECTABLE) {
                    pm.setNextInflectableWord(nextWord);
                    break;
                }
            }

            // Look at up to four preceding words (stopping at the sentence start).
            for (int prevIdx = idx - 1; prevIdx > -1 && idx - prevIdx < 5; prevIdx--) {
                String previousWord = words[prevIdx].toLowerCase();
                if (previousWord.matches(END_OF_SENTENCE)) {
                    break;
                }
                if (prevIdx == idx - 1) {
                    // the immediately preceding word is recorded regardless of its type
                    pm.setPreviousWord(previousWord);
                }
                InflectedFormType previousType = formsDictionary.get(previousWord);
                if (previousType != null && previousType != InflectedFormType.NOT_INFLECTABLE) {
                    pm.setPreviousInflectableWord(previousWord);
                    break;
                }
                // unknown or non-inflectable: a numeral still counts as an agreement
                // candidate, but the scan goes on and a later hit may replace it
                if (NumeralDetector.isNumeral(previousWord)) {
                    pm.setPreviousInflectableWord(previousWord);
                }
            }

            pm.setIndexInText(indexInText);
            potentialMistakes.add(pm);
        } else if (spellcheckAll) {
            if (formsDictionary.get(lowerCaseWord) == null) {
                otherMistakes.add(new Mistake(word));
            }
        }
    }
    System.out.println(potentialMistakes);

    for (Mistake potentialMistake : potentialMistakes) {
        String candidate = potentialMistake.getWord();
        // if this word is an inexistent word form, but its alternative in terms of и/й exists -
        // it's a и/й mistake, otherwise - it's a regular spelling mistake
        if (!isExistingWordForm(candidate)) {
            String alternative = null;
            if (candidate.matches(POTENTIAL_MISTAKE_REGEX_I)) {
                alternative = candidate.replaceAll(POTENTIAL_MISTAKE_REGEX_I, "$1й$2");
            } else if (candidate.matches(POTENTIAL_MISTAKE_REGEX_Y)) {
                alternative = candidate.replaceAll(POTENTIAL_MISTAKE_REGEX_Y, "$1и$2");
            }
            if (isExistingWordForm(alternative)) {
                mistakes.add(potentialMistake);
            } else {
                otherMistakes.add(potentialMistake);
            }
            continue;
        }
        // the word exists - check whether it is the right form for its context
        handleWrongLinkingPronounForm(potentialMistake, mistakes);
        handleWrongPronounForm(potentialMistake, mistakes);
        handleWrongPluralForms(potentialMistake, mistakes);
        //handle wrong imperatives: пеЙ, пеи
    }

    Result result = new Result();
    result.setMistakes(mistakes);
    result.setOtherSpellingMistakes(otherMistakes);
    result.setProperNames(properNames);
    return result;
}
/**
 * Tells whether the given word is a key of the forms dictionary.
 *
 * @param word the (lower-cased) word form to look up
 * @return {@code true} if the forms dictionary contains the word
 */
private boolean isExistingWordForm(String word) {
    return formsDictionary.containsKey(word);
}
/**
 * Flags the pronouns някой/някои, никой/никои and кой/кои when their number
 * does not agree with the next inflectable word.
 *
 * @param potentialMistake the candidate word with its context
 * @param mistakes the list the candidate is appended to when it is wrong
 */
private void handleWrongPronounForm(Mistake potentialMistake, List<Mistake> mistakes) {
    // only "някой", "никой", "кой" and their plural counterparts are handled here
    if (!potentialMistake.matches("няко[ий]|нико[ий]|ко[ий]")) {
        return;
    }
    boolean agrees = agreesOnPlurality(potentialMistake.getWord(),
            potentialMistake.getNextInflectableWord(), pronounsSgSet, pronounsPlSet);
    if (!agrees) {
        mistakes.add(potentialMistake);
    }
}
/**
 * Flags the linking pronouns чийто/чиито and който/които when their number
 * does not agree with the word they should agree with.
 *
 * @param potentialMistake the candidate word with its context
 * @param mistakes the list the candidate is appended to when it is wrong
 */
private void handleWrongLinkingPronounForm(Mistake potentialMistake, List<Mistake> mistakes) {
    String word = potentialMistake.getWord();
    String nextWord = potentialMistake.getNextInflectableWord();

    if (potentialMistake.matches("чи[ий]то")
            && !agreesOnPlurality(word, nextWord, linkingPronounsSgSet, linkingPronounsPlSet)) {
        mistakes.add(potentialMistake);
    }

    // The pattern previously ended with a stray '|', which made it match the
    // empty string as well; only the two real forms are meant.
    if (potentialMistake.matches("ко[ий]то")) {
        boolean isNextWordVerb = false;
        if (StringUtils.isNotEmpty(nextWord)) {
            InflectedFormType nextWordFormType = formsDictionary.get(nextWord);
            isNextWordVerb = nextWordFormType != null && nextWordFormType.isVerb();
        }
        // if the next word is a verb, check for agreement with the previous (the verb does not
        // necessarily agree with the pronoun); if there is no previous word (i.e. if it's the
        // start of the sentence), or the next word is not a verb, use the next word
        String agreeingWord = nextWord;
        String previousWord = potentialMistake.getPreviousInflectableWord();
        if (isNextWordVerb && StringUtils.isNotEmpty(previousWord)) {
            agreeingWord = previousWord;
        }
        if (!agreesOnPlurality(word, agreeingWord, linkingPronounsSgSet, linkingPronounsPlSet)) {
            mistakes.add(potentialMistake);
        }
    }
}
/**
 * Flags special-case nouns (cases like полицай/полицаи, трамвай/трамваи) whose
 * number does not fit the preceding context.
 *
 * @param potentialMistake the candidate word with its context; the word must
 *        be present in the forms dictionary
 * @param mistakes the list the candidate is appended to when it is wrong
 */
private void handleWrongPluralForms(Mistake potentialMistake, List<Mistake> mistakes) {
    InflectedFormType formType = formsDictionary.get(potentialMistake.getWord());
    if (!formType.isSpecialCaseNoun()) {
        return;
    }

    String previousWord = potentialMistake.getPreviousWord();
    InflectedFormType previousFormType = formsDictionary.get(previousWord);

    // The directly preceding word is inflectable: the two must simply agree.
    if (previousFormType != null && previousFormType != InflectedFormType.NOT_INFLECTABLE) {
        if (disagreesOnPlurality(formType, previousFormType)) {
            mistakes.add(potentialMistake);
        }
        return;
    }

    // The directly preceding word is unknown or not inflectable: fall back to
    // numerals, quantity words and, finally, the nearest inflectable word.
    if (!formType.isPlural() && NumeralDetector.isNumeral(potentialMistake.getPreviousInflectableWord())) {
        mistakes.add(potentialMistake);
    } else if (pluralIdentfiers.contains(previousWord) || NumeralDetector.isNumeral(previousWord)) {
        if (!formType.isPlural()) {
            mistakes.add(potentialMistake);
        }
    } else if (singularIdentfiers.contains(previousWord)) {
        if (formType.isPlural()) {
            mistakes.add(potentialMistake);
        }
    } else {
        InflectedFormType nearestInflectableType =
                formsDictionary.get(potentialMistake.getPreviousInflectableWord());
        if (nearestInflectableType != null && disagreesOnPlurality(formType, nearestInflectableType)) {
            mistakes.add(potentialMistake);
        }
    }
}
/**
 * Tells whether exactly one of the two form types is plural.
 *
 * @param formType the type of the checked word
 * @param previousFormType the type of the word it is compared with
 * @return {@code true} if the two differ in number, which is treated as a mistake
 */
private boolean disagreesOnPlurality(InflectedFormType formType, InflectedFormType previousFormType) {
    return previousFormType.isPlural() != formType.isPlural();
}
/**
 * Older, simpler plural check: a word ending in "й" is flagged when the same
 * stem ending in "я" is a base form of inflection class "M" or "K".
 *
 * @param potentialMistake the candidate word
 * @param mistakes the list the candidate is appended to when it is flagged
 * @return {@code true} if the candidate was flagged, {@code false} otherwise
 *         (including when the base dictionary has already been released)
 * @deprecated superseded by {@code handleWrongPluralForms}
 */
@Deprecated
private boolean handleWrongPluralSimple(Mistake potentialMistake, List<Mistake> mistakes) {
    String word = potentialMistake.getWord();
    // the base dictionary is set to null by initialize(true)
    if (dictionary == null || !word.endsWith("й")) {
        return false;
    }
    String baseForm = word.substring(0, word.length() - 1) + "я";
    Set<String> types = dictionary.get(baseForm);
    if (types == null) {
        return false;
    }
    for (String type : types) {
        // constant-first equals is null-safe; the former "a && b || c" form
        // dereferenced a null type in its last operand
        if ("M".equals(type) || "K".equals(type)) {
            mistakes.add(potentialMistake);
            return true;
        }
    }
    return false;
}
/**
 * Checks whether a pronoun form matches the number of the word it should agree with.
 *
 * @param word the pronoun form being checked
 * @param agreeingWord the word it has to agree with; may be empty
 * @param singularWords the singular pronoun forms
 * @param pluralWord the plural pronoun forms
 * @return {@code true} if there is nothing to agree with or the numbers match;
 *         {@code false} if they differ or {@code agreeingWord} is not in the forms dictionary
 */
private boolean agreesOnPlurality(String word, String agreeingWord, Set<String> singularWords, Set<String> pluralWord) {
    // if there is nothing to agree with assume the form is correct
    if (StringUtils.isEmpty(agreeingWord)) {
        return true;
    }
    InflectedFormType agreeingType = formsDictionary.get(agreeingWord);
    if (agreeingType == null) {
        return false;
    }
    // a plural neighbour needs a plural pronoun, a singular one a singular pronoun
    return agreeingType.isPlural() ? pluralWord.contains(word) : singularWords.contains(word);
}
/**
 * Reads the classpath resource {@code /bg_BG.dic} into {@link #dictionary}.
 * Every line is stored lower-cased; for lines of the form {@code word/FLAGS}
 * each character after the slash becomes one inflection class of the word.
 *
 * @throws IllegalStateException if the resource is missing or cannot be read
 */
public static void load() {
    InputStream is = Checker.class.getResourceAsStream("/bg_BG.dic");
    if (is == null) {
        // fail with a clear message instead of a NullPointerException while reading
        throw new IllegalStateException("Dictionary resource /bg_BG.dic not found on the classpath");
    }
    List<String> lines;
    try {
        lines = IOUtils.readLines(is, "utf-8");
    } catch (IOException ex) {
        throw new IllegalStateException(ex);
    } finally {
        IOUtils.closeQuietly(is);
    }

    // build locally and publish the finished trie
    Trie<String, Set<String>> loaded = new PatriciaTrie<Set<String>>();
    for (String line : lines) {
        int paradigmIdx = line.indexOf("/");
        if (paradigmIdx != -1) {
            // /AK is possible, i.e. multiple infl. classes per word
            String classFlags = line.substring(paradigmIdx + 1);
            loaded.put(line.substring(0, paradigmIdx).toLowerCase(),
                    Sets.newHashSet(charToStringArray(classFlags.toCharArray())));
        } else {
            loaded.put(line.toLowerCase(), Collections.<String>emptySet());
        }
    }
    dictionary = loaded;
}
/**
 * Builds {@link #formsDictionary} from the base dictionary and the loaded
 * inflection classes: every base form and every form generated from it is
 * mapped to its {@link InflectedFormType}. Requires {@link #load()} and
 * {@link #loadInflections()} to have run.
 *
 * @throws IllegalStateException if the base dictionary is not loaded
 */
public static void loadFormsDictionary() {
    if (dictionary == null) {
        throw new IllegalStateException("The base dictionary must be loaded before the forms dictionary is built");
    }
    formsDictionary = new PatriciaTrie<InflectedFormType>();
    for (Map.Entry<String, Set<String>> word : dictionary.entrySet()) {
        String baseForm = word.getKey();
        if (word.getValue().isEmpty()) {
            formsDictionary.put(baseForm, InflectedFormType.NOT_INFLECTABLE);
            continue;
        }
        for (String inflectionClass : word.getValue()) {
            Multimap<String, String> inflections = inflectionClasses.get(inflectionClass);
            if (inflections == null) {
                formsDictionary.put(baseForm, InflectedFormType.NOT_INFLECTABLE);
                continue;
            }
            boolean specialCaseNoun = baseForm.endsWith("й")
                    && (inflectionClass.equals("O") || inflectionClass.equals("M"));
            boolean verb = verbClasses.contains(inflectionClass);
            // may be absent when the plurals file does not list this class;
            // in that case none of the generated forms is treated as plural
            Multimap<String, String> pluralInflections = pluralInflectionClasses.get(inflectionClass);

            for (String ending : inflections.keySet()) {
                if (!baseForm.endsWith(ending)) {
                    continue;
                }
                // position where the base-form ending starts (the stem is everything before it)
                int endingIdx = baseForm.length() - ending.length();
                formsDictionary.put(baseForm, getInflectedFormType(specialCaseNoun, verb, false));

                Collection<String> pluralSuffixes;
                if (pluralInflections == null) {
                    pluralSuffixes = Collections.<String>emptySet();
                } else {
                    pluralSuffixes = pluralInflections.get(ending);
                }
                String stem = baseForm.substring(0, endingIdx);
                for (String suffix : inflections.get(ending)) {
                    boolean isPlural = pluralSuffixes.contains(suffix);
                    formsDictionary.put(stem + suffix, getInflectedFormType(specialCaseNoun, verb, isPlural));
                }
            }
        }
    }
    // override the forms of the verb "to be"
    for (String sgForm : toBeFormsSg) {
        formsDictionary.put(sgForm, InflectedFormType.REGULAR_FORM_VERB);
    }
    for (String plForm : toBeFormsPl) {
        formsDictionary.put(plForm, InflectedFormType.PLURAL_FORM_VERB);
    }
}
/**
 * Maps the three flags to the matching enum constant. The special-case-noun
 * flag takes precedence over the verb flag.
 *
 * @param specialCaseNoun whether the form belongs to a special-case noun
 * @param verb whether the form belongs to a verb
 * @param plural whether the form is plural
 * @return the corresponding {@link InflectedFormType}
 */
public static InflectedFormType getInflectedFormType(boolean specialCaseNoun, boolean verb, boolean plural) {
    if (specialCaseNoun) {
        return plural ? InflectedFormType.PLURAL_FORM_SPECIAL : InflectedFormType.REGULAR_FORM_SPECIAL;
    }
    if (verb) {
        return plural ? InflectedFormType.PLURAL_FORM_VERB : InflectedFormType.REGULAR_FORM_VERB;
    }
    return plural ? InflectedFormType.PLURAL_FORM : InflectedFormType.REGULAR_FORM;
}
/**
 * Fills {@link #inflectionClasses} from {@code /bg_BG.aff} and then
 * {@link #pluralInflectionClasses} from {@code /plurals.aff}.
 */
public static void loadInflections() {
    fillInflectionClasses(inflectionClasses, Checker.class.getResourceAsStream("/bg_BG.aff"));
    fillInflectionClasses(pluralInflectionClasses, Checker.class.getResourceAsStream("/plurals.aff"));
}
/**
 * Parses the suffix ({@code SFX}) rules of an affix file into the given map.
 *
 * <p>The first {@code SFX} line after a blank line is taken as the header of a
 * new inflection class (the class is the single character at offset 4); every
 * other {@code SFX} line is a rule whose third column is the base-form ending
 * and whose fourth column is the replacing suffix ("0" meaning empty). Lines
 * that do not start with {@code SFX} are ignored.
 *
 * @param map the target: inflection class -> (base-form ending -> suffixes)
 * @param is the affix file contents; closed by this method
 * @throws IllegalStateException if the stream is {@code null}, cannot be read,
 *         or contains a malformed or out-of-order {@code SFX} line
 */
private static void fillInflectionClasses(Map<String, Multimap<String, String>> map, InputStream is) {
    if (is == null) {
        // a missing classpath resource would otherwise fail with a bare NullPointerException
        throw new IllegalStateException("Affix resource not found on the classpath");
    }
    List<String> lines;
    try {
        lines = IOUtils.readLines(is, "utf-8");
    } catch (IOException ex) {
        throw new IllegalStateException(ex);
    } finally {
        IOUtils.closeQuietly(is);
    }

    boolean newInflectionClass = false;
    for (String line : lines) {
        if (line.trim().isEmpty()) {
            // a blank line announces that the next SFX line is a class header
            newInflectionClass = true;
            continue;
        }
        if (!line.startsWith("SFX")) {
            continue;
        }
        if (line.length() < 5) {
            throw new IllegalStateException("Malformed affix line: " + line);
        }
        String inflectionClass = line.substring(4, 5);
        if (newInflectionClass) {
            map.put(inflectionClass, HashMultimap.<String, String>create());
        } else {
            String[] parts = line.split("\\p{Space}+");
            if (parts.length < 4) {
                throw new IllegalStateException("Malformed affix rule: " + line);
            }
            Multimap<String, String> endings = map.get(inflectionClass);
            if (endings == null) {
                throw new IllegalStateException("Affix rule without a preceding class header: " + line);
            }
            String suffix = parts[3].equals("0") ? "" : parts[3];
            String baseFormEnding = parts[2].equals("0") ? "" : parts[2];
            // the inflection suffixes are the values of the multimap, with key=the base form ending
            endings.put(baseFormEnding, suffix);
        }
        newInflectionClass = false;
    }
}
/**
 * Converts every character of the array into a one-character string.
 *
 * @param array the characters to convert
 * @return a new array of the same length holding the characters as strings
 */
private static String[] charToStringArray(char[] array) {
    String[] strings = new String[array.length];
    int position = 0;
    for (char c : array) {
        strings[position++] = Character.toString(c);
    }
    return strings;
}
/**
 * Manual smoke test: loads the dictionaries, checks one sample sentence and
 * prints the three result lists.
 */
public static void main(String args[]) throws Exception {
    // Sample inputs kept for manual experiments; only the uncommented one is checked.
    //input = "Който управлява и провежда изборите, невинаги ги печели, но почти винаги върши неща извън позволените, защото средствата за разкриването им са под негов контрол. Нарушения вършат и чуждите, но вероятността те да бъдат посочени, а нашите покрити, е докъм 100-процентова.";
    //input = "Децата, чийто обувки продадоха";
    //input = "Човекът, който пее";
    //input = "Кои ще дойде с мен";
    //input = "Човекът, които дойде, ще ни донесе филий";
    //input = "В София има много автобуси и трамваи.";
    //input = "Качих се на един трамвай";
    //input = "Качих 32-ма полицаи. Тъпи ченгета! Край.";
    //input = "Които пее, зло не мисли";
    //input = "видях бастуна, които те купиха";
    //input = "двамата полицай, който видях";
    //input = "хората, които дойдоха, ядат филий";
    //input = "трима полицай";
    //input = "Има ли хора, които да си хапват пълнени чушки точно в този момент?";
    //input = "Господин полицаи, оставете ме намира.";
    //input = "Двамата полицай, който видях";
    //input = "някой";
    //input = "Общината планува да закупи 10 тролей";
    //input = "Някой, който идва";
    //input = "пеней на гошо, гошовата майка";
    //input = "кой";
    //input = "баба й дядо";
    String input = "Дойде Антиохий и падна зад мъжа, който те изядоха";

    Checker checker = new Checker();
    // process() throws IllegalStateException until the dictionaries have been loaded
    checker.initialize();
    Result result = checker.process(input, true);
    System.out.println(result.getMistakes());
    System.out.println(result.getOtherSpellingMistakes());
    System.out.println(result.getProperNames());
}
/**
 * A word reported by the checker together with the context that was used to
 * judge it. All context fields default to the empty string.
 */
public static class Mistake {
    private String word;
    private String clause = "";
    private String nextInflectableWord = "";
    private String previousInflectableWord = "";
    private String previousWord = "";
    private int indexInText;

    /** Creates a mistake without a word; set it with {@link #setWord(String)}. */
    public Mistake() {
    }

    /** Creates a mistake for the given word. */
    public Mistake(String word) {
        this.word = word;
    }

    // --- the word itself ---

    public String getWord() {
        return this.word;
    }

    public void setWord(String value) {
        this.word = value;
    }

    /** Tells whether the whole word matches the given regular expression. */
    public boolean matches(String regex) {
        return this.word.matches(regex);
    }

    public int getIndexInText() {
        return this.indexInText;
    }

    public void setIndexInText(int value) {
        this.indexInText = value;
    }

    // --- context ---

    public String getClause() {
        return this.clause;
    }

    public void setClause(String value) {
        this.clause = value;
    }

    public String getNextInflectableWord() {
        return this.nextInflectableWord;
    }

    public void setNextInflectableWord(String value) {
        this.nextInflectableWord = value;
    }

    public String getPreviousInflectableWord() {
        return this.previousInflectableWord;
    }

    public void setPreviousInflectableWord(String value) {
        this.previousInflectableWord = value;
    }

    public String getPreviousWord() {
        return this.previousWord;
    }

    public void setPreviousWord(String value) {
        this.previousWord = value;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Mistake [word=");
        text.append(this.word);
        text.append(", nextInflectableWord=").append(this.nextInflectableWord);
        text.append(", previousInflectableWord=").append(this.previousInflectableWord);
        text.append(", previousWord=").append(this.previousWord);
        text.append("]");
        return text.toString();
    }
}
/**
 * The outcome of one {@code process} call: the и/й mistakes, the remaining
 * spelling mistakes and the words that were skipped as proper names.
 */
public static class Result {
    private List<Mistake> mistakes;
    private List<Mistake> otherSpellingMistakes;
    private List<String> properNames;

    /** @return the и/й mistakes */
    public List<Mistake> getMistakes() {
        return this.mistakes;
    }

    public void setMistakes(List<Mistake> value) {
        this.mistakes = value;
    }

    /** @return the spelling mistakes that are not и/й mistakes */
    public List<Mistake> getOtherSpellingMistakes() {
        return this.otherSpellingMistakes;
    }

    public void setOtherSpellingMistakes(List<Mistake> value) {
        this.otherSpellingMistakes = value;
    }

    /** @return the words that were treated as proper names and not checked */
    public List<String> getProperNames() {
        return this.properNames;
    }

    public void setProperNames(List<String> value) {
        this.properNames = value;
    }
}
/**
 * Classification of a word form along three axes: number, special-case noun
 * and verb.
 *
 * <p>An enum is used for the value in the trie to save memory - otherwise
 * there would be a different instance for each form.
 */
public static enum InflectedFormType {
    // plural forms
    PLURAL_FORM(true, false, false),
    PLURAL_FORM_SPECIAL(true, true, false),
    PLURAL_FORM_VERB(true, false, true),
    // non-plural forms
    REGULAR_FORM(false, false, false),
    REGULAR_FORM_SPECIAL(false, true, false),
    REGULAR_FORM_VERB(false, false, true),
    // words without any inflection class
    NOT_INFLECTABLE(false, false, false);

    private final boolean plural;
    private final boolean specialCaseNoun;
    private final boolean verb;

    private InflectedFormType(boolean isPlural, boolean isSpecialCaseNoun, boolean isVerb) {
        this.plural = isPlural;
        this.specialCaseNoun = isSpecialCaseNoun;
        this.verb = isVerb;
    }

    /** @return whether this is a plural form */
    public boolean isPlural() {
        return this.plural;
    }

    /** @return whether the form belongs to a special-case noun */
    public boolean isSpecialCaseNoun() {
        return this.specialCaseNoun;
    }

    /** @return whether the form belongs to a verb */
    public boolean isVerb() {
        return this.verb;
    }
}
}