package bg.bozho.ikratko;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.annotation.PostConstruct;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.BooleanUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.commons.collections4.trie.PatriciaTrie;
import org.apache.commons.collections4.Trie;
import org.springframework.stereotype.Component;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;

/**
 * Detects mistakes in the use of the Bulgarian letters "и" and "й" (for example a singular
 * pronoun or noun form used where the plural one is required), and optionally reports every
 * word that is not a known word form.
 *
 * <p>The checker works on two tries that are built from classpath resources: the base
 * dictionary ({@code /bg_BG.dic}) and the dictionary of all inflected forms, derived from the
 * base dictionary and the affix files ({@code /bg_BG.aff}, {@code /plurals.aff}). All of this
 * state is static and shared by every instance; it is built once by {@link #initialize()}.
 */
@Component
public class Checker {

    /** Base forms mapped to their inflection class flags; set to {@code null} after cleanup. */
    public static Trie<String, Set<String>> dictionary;

    /** Every known word form mapped to its type. */
    public static Trie<String, InflectedFormType> formsDictionary;

    // volatile: written under the class lock in initialize(boolean), read without it in process()
    private static volatile boolean initialized = false;

    /** Inflection class flag -> (base form ending -> suffixes), read from {@code /bg_BG.aff}. */
    public static Map<String, Multimap<String, String>> inflectionClasses = Maps.newHashMap();

    /** Same shape as {@link #inflectionClasses}, read from {@code /plurals.aff}. */
    public static Map<String, Multimap<String, String>> pluralInflectionClasses = Maps.newHashMap();

    private static final String POTENTIAL_MISTAKE_REGEX = "(\\p{L}*[аъоуеиюя][ий]\\p{L}*)";
    private static final String POTENTIAL_MISTAKE_REGEX_I = "(\\p{L}*[аъоуеиюя])и(\\p{L}*)";
    private static final String POTENTIAL_MISTAKE_REGEX_Y = "(\\p{L}*[аъоуеиюя])й(\\p{L}*)";
    private static final String END_OF_SENTENCE = "[\\.!?]";

    /** A neighbouring word is looked for among at most (this value - 1) tokens on each side. */
    private static final int CONTEXT_WINDOW = 5;

    public static final List<String> toBeFormsSg = Arrays.asList("съм", "си", "е", "бях", "беше", "бъда", "бъдеш", "бъде");
    public static final List<String> toBeFormsPl = Arrays.asList("сме", "сте", "са", "бяхме", "бяхте", "бяха", "бъдат");

    private static final Set<String> pronounsSgSet = Sets.newHashSet("някой", "никой", "кой", "чий");
    private static final Set<String> pronounsPlSet = Sets.newHashSet("някои", "никои", "кои", "чии");

    private static final Set<String> linkingPronounsSgSet = Sets.newHashSet("който", "чийто");
    private static final Set<String> linkingPronounsPlSet = Sets.newHashSet("които", "чиито");

    private static final Set<String> pluralIdentifiers = Sets.newHashSet("няколко", "николко", "много", "малко", "доста", "брой", "безброй", "тези", "онези");
    private static final Set<String> singularIdentifiers = Sets.newHashSet("един", "този", "онзи");

    public static final Set<String> verbClasses = Sets.newHashSet("P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z");

    /**
     * Builds the dictionaries and releases the base dictionary afterwards.
     * Equivalent to {@code initialize(true)}.
     */
    @PostConstruct
    public void initialize() {
        initialize(true);
    }

    /**
     * Builds the dictionaries if that has not been done yet; later calls are no-ops.
     *
     * <p>This overload deliberately carries no {@code @PostConstruct}: lifecycle callbacks must
     * be parameterless.
     *
     * @param cleanupDictionary whether to drop the base {@link #dictionary} once the forms
     *        dictionary has been derived from it
     */
    public void initialize(boolean cleanupDictionary) {
        // the guarded state is static, so the lock has to be class-wide, not per instance
        synchronized (Checker.class) {
            if (initialized) {
                return;
            }
            load();
            loadInflections();
            loadFormsDictionary();
            if (cleanupDictionary) {
                dictionary = null; // eligible for GC. TODO can merge these two load methods, but it's easier not to, for now
            }
            initialized = true;
        }
    }

    /**
     * Checks a text for и/й mistakes.
     *
     * @param input the text to check; {@code null} is treated like an empty text
     * @param spellcheckAll whether words that are not candidates for an и/й mistake should be
     *        reported as "other" spelling mistakes when they are not known word forms
     * @return the и/й mistakes, the other spelling mistakes and the skipped proper names
     * @throws IllegalStateException if the checker has not been initialized
     */
    public Result process(String input, boolean spellcheckAll) {
        if (!initialized) {
            throw new IllegalStateException("The checker must be initialized first");
        }

        String[] words = tokenize(input);

        List<Mistake> potentialMistakes = new ArrayList<Mistake>();
        List<Mistake> mistakes = new ArrayList<Mistake>();
        List<Mistake> otherMistakes = new ArrayList<Mistake>();
        List<String> properNames = new ArrayList<String>();

        // tokens come out of the text in order, so searching from the end of the previous
        // token yields the offset of this very occurrence rather than of an earlier one
        int searchFrom = 0;
        for (int idx = 0; idx < words.length; idx++) {
            String word = words[idx];
            int wordStart = input.indexOf(word, searchFrom);
            if (wordStart >= 0) {
                searchFrom = wordStart + word.length();
            }

            // proper names are not checked. They are those starting with
            // capital letter and are not in the beginning of a sentence
            if (idx > 0 && Character.isUpperCase(word.charAt(0)) && !isEndOfSentence(words[idx - 1])) {
                properNames.add(word);
                continue;
            }

            String lowerCased = word.toLowerCase();
            if (lowerCased.matches(POTENTIAL_MISTAKE_REGEX)) {
                Mistake pm = new Mistake();
                pm.setWord(lowerCased);
                fillNextInflectableWord(pm, words, idx);
                fillPreviousWords(pm, words, idx);
                // -1 when the token text does not occur in the input (a merged "-ма" numeral)
                pm.setIndexInText(wordStart);
                potentialMistakes.add(pm);
            } else if (spellcheckAll && !isEndOfSentence(word) && formsDictionary.get(lowerCased) == null) {
                // sentence-ending marks are tokens too, but they are not words to spellcheck
                otherMistakes.add(new Mistake(word));
            }
        }

        for (Mistake potentialMistake : potentialMistakes) {
            String word = potentialMistake.getWord();

            // if this word is an inexistent word form, but its alternative in terms of и/й exists -
            // it's a и/й mistake, otherwise - it's a regular spelling mistake
            if (!isExistingWordForm(word)) {
                String alternative = null;
                if (word.matches(POTENTIAL_MISTAKE_REGEX_I)) {
                    alternative = word.replaceAll(POTENTIAL_MISTAKE_REGEX_I, "$1й$2");
                } else if (word.matches(POTENTIAL_MISTAKE_REGEX_Y)) {
                    alternative = word.replaceAll(POTENTIAL_MISTAKE_REGEX_Y, "$1и$2");
                }
                if (isExistingWordForm(alternative)) {
                    mistakes.add(potentialMistake);
                } else {
                    otherMistakes.add(potentialMistake);
                }
                continue;
            }

            handleWrongLinkingPronounForm(potentialMistake, mistakes);
            handleWrongPronounForm(potentialMistake, mistakes);
            handleWrongPluralForms(potentialMistake, mistakes);
            //handle wrong imperatives: пеЙ, пеи
        }

        Result result = new Result();
        result.setMistakes(mistakes);
        result.setOtherSpellingMistakes(otherMistakes);
        result.setProperNames(properNames);
        return result;
    }

    /**
     * Splits the text into words plus the sentence-ending marks, which are kept as tokens of
     * their own so that sentence boundaries can be identified.
     *
     * <p>Tokens are trimmed and empty ones are dropped: the zero-width split right after a
     * sentence-ending mark leaves the whitespace that follows it attached to the next token.
     * A token "ма" that follows a number is merged into the number as "&lt;number&gt;-ма".
     */
    private static String[] tokenize(String input) {
        if (StringUtils.isEmpty(input)) {
            return new String[0];
        }
        String[] rawTokens = input.split("(?<=" + END_OF_SENTENCE + ")|(?=" + END_OF_SENTENCE + ")|(\\p{Punct}*\\p{Space}+)");

        List<String> tokens = new ArrayList<String>(rawTokens.length);
        for (String rawToken : rawTokens) {
            String token = rawToken.trim();
            if (token.isEmpty()) {
                continue;
            }
            int lastIdx = tokens.size() - 1;
            if (token.equals("ма") && lastIdx >= 0 && NumberUtils.isNumber(tokens.get(lastIdx))) {
                tokens.set(lastIdx, tokens.get(lastIdx) + "-ма");
            } else {
                tokens.add(token);
            }
        }
        return tokens.toArray(new String[tokens.size()]);
    }

    /** Tells whether the token is a single sentence-ending punctuation mark. */
    private static boolean isEndOfSentence(String token) {
        return token.matches(END_OF_SENTENCE);
    }

    /**
     * Sets the "next inflectable word" of the mistake: the first following token, within the
     * context window and the current sentence, that is not registered as non-inflectable.
     */
    private void fillNextInflectableWord(Mistake pm, String[] words, int idx) {
        for (int nextIdx = idx + 1; nextIdx < words.length && nextIdx - idx < CONTEXT_WINDOW; nextIdx++) {
            String nextWord = words[nextIdx].toLowerCase();
            if (isEndOfSentence(nextWord)) {
                return;
            }
            // NOTE(review): a word missing from the forms dictionary (null type) is accepted
            // here as well, unlike in the backwards scan - confirm that this is intended
            if (formsDictionary.get(nextWord) != InflectedFormType.NOT_INFLECTABLE) {
                pm.setNextInflectableWord(StringUtils.trimToEmpty(nextWord));
                return;
            }
        }
    }

    /**
     * Sets the immediately preceding word of the mistake and its "previous inflectable word":
     * the closest preceding known inflectable form within the context window and the current
     * sentence. A numeral met on the way is recorded too, but the scan goes on and a known
     * inflectable form found further back replaces it.
     */
    private void fillPreviousWords(Mistake pm, String[] words, int idx) {
        for (int previousIdx = idx - 1; previousIdx > -1 && idx - previousIdx < CONTEXT_WINDOW; previousIdx--) {
            String previousWord = words[previousIdx].toLowerCase();
            if (isEndOfSentence(previousWord)) {
                return;
            }
            if (previousIdx == idx - 1) {
                pm.setPreviousWord(previousWord);
            }
            InflectedFormType formType = formsDictionary.get(previousWord);
            if (formType != null && formType != InflectedFormType.NOT_INFLECTABLE) {
                pm.setPreviousInflectableWord(StringUtils.trimToEmpty(previousWord));
                return;
            }
            if (NumeralDetector.isNumeral(previousWord)) {
                pm.setPreviousInflectableWord(previousWord);
            }
        }
    }

    /** Tells whether the word is a key of the forms dictionary; {@code null} never is. */
    private boolean isExistingWordForm(String word) {
        return word != null && formsDictionary.containsKey(word);
    }

    /** Flags някой/някои, никой/никои and кой/кои when they disagree in number with the next inflectable word. */
    private void handleWrongPronounForm(Mistake potentialMistake, List<Mistake> mistakes) {
        //"някой", "никой", "кой"
        if (!potentialMistake.matches("няко[ий]|нико[ий]|ко[ий]")) {
            return;
        }
        if (!agreesOnPlurality(potentialMistake.getWord(), potentialMistake.getNextInflectableWord(), pronounsSgSet, pronounsPlSet)) {
            mistakes.add(potentialMistake);
        }
    }

    /** Flags чийто/чиито and който/които when they disagree in number with the word they refer to. */
    private void handleWrongLinkingPronounForm(Mistake potentialMistake, List<Mistake> mistakes) {
        String word = potentialMistake.getWord();
        String nextWord = potentialMistake.getNextInflectableWord();

        if (potentialMistake.matches("чи[ий]то")) {
            if (!agreesOnPlurality(word, nextWord, linkingPronounsSgSet, linkingPronounsPlSet)) {
                mistakes.add(potentialMistake);
            }
        }

        if (potentialMistake.matches("ко[ий]то")) {
            boolean isNextWordVerb = false;
            if (StringUtils.isNotEmpty(nextWord)) {
                InflectedFormType nextWordFormType = formsDictionary.get(nextWord);
                isNextWordVerb = nextWordFormType != null && nextWordFormType.isVerb();
            }

            // if the next word is a verb, check for agreement with the previous (the verb does not necessarily agree with the pronoun)
            // if there is no previous word (i.e. if it's the start of the sentence), use the next word
            String agreeingWord = nextWord;
            if (isNextWordVerb && StringUtils.isNotEmpty(potentialMistake.getPreviousInflectableWord())) {
                agreeingWord = potentialMistake.getPreviousInflectableWord();
            }
            if (!agreesOnPlurality(word, agreeingWord, linkingPronounsSgSet, linkingPronounsPlSet)) {
                mistakes.add(potentialMistake);
            }
        }
    }

    /** Flags special-case nouns whose number disagrees with the preceding word. */
    private void handleWrongPluralForms(Mistake potentialMistake, List<Mistake> mistakes) {
        // cases like полицай/полицаи, трамвай/трамваи
        InflectedFormType formType = formsDictionary.get(potentialMistake.getWord());
        if (formType == null || !formType.isSpecialCaseNoun()) {
            return;
        }

        String previousWord = potentialMistake.getPreviousWord();
        InflectedFormType previousFormType = formsDictionary.get(previousWord);

        if (previousFormType != null && previousFormType != InflectedFormType.NOT_INFLECTABLE) {
            // the word right before is an inflectable form: the two must agree directly
            if (disagreesOnPlurality(formType, previousFormType)) {
                mistakes.add(potentialMistake);
            }
            return;
        }

        if (!formType.isPlural() && NumeralDetector.isNumeral(potentialMistake.getPreviousInflectableWord())) {
            mistakes.add(potentialMistake);
        } else if (pluralIdentifiers.contains(previousWord) || NumeralDetector.isNumeral(previousWord)) {
            if (!formType.isPlural()) {
                mistakes.add(potentialMistake);
            }
        } else if (singularIdentifiers.contains(previousWord)) {
            if (formType.isPlural()) {
                mistakes.add(potentialMistake);
            }
        } else {
            InflectedFormType previousInflectedWordFormType = formsDictionary.get(potentialMistake.getPreviousInflectableWord());
            if (previousInflectedWordFormType != null && disagreesOnPlurality(formType, previousInflectedWordFormType)) {
                mistakes.add(potentialMistake);
            }
        }
    }

    /** Tells whether exactly one of the two forms is plural. */
    private boolean disagreesOnPlurality(InflectedFormType formType, InflectedFormType previousFormType) {
        // if they don't agree on plurality, it's a mistake
        return previousFormType.isPlural() != formType.isPlural();
    }

    /**
     * Flags a word ending in "й" whose "я"-ending counterpart is a base form of class M or K.
     * Returns {@code false} when the base dictionary is unavailable (it is dropped by the
     * default initialization) or does not contain the counterpart.
     *
     * @deprecated not called from anywhere in this class
     */
    @Deprecated
    private boolean handleWrongPluralSimple(Mistake potentialMistake, List<Mistake> mistakes) {
        String word = potentialMistake.getWord();
        if (dictionary == null || !word.endsWith("й")) {
            return false;
        }
        String baseForm = word.substring(0, word.length() - 1) + "я";
        Set<String> types = dictionary.get(baseForm);
        if (types == null) {
            return false;
        }
        for (String type : types) {
            if ("M".equals(type) || "K".equals(type)) {
                mistakes.add(potentialMistake);
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether the singular/plural variant of {@code word} matches the number of
     * {@code agreeingWord}.
     *
     * @return {@code true} if there is nothing to agree with or the numbers match;
     *         {@code false} if they differ or {@code agreeingWord} is not a known word form
     */
    private boolean agreesOnPlurality(String word, String agreeingWord, Set<String> singularWords, Set<String> pluralWord) {
        // if there is nothing to agree with assume the form is correct
        if (StringUtils.isEmpty(agreeingWord)) {
            return true;
        }

        InflectedFormType formType = formsDictionary.get(agreeingWord);
        if (formType == null) {
            return false;
        }
        if (formType.isPlural()) {
            return pluralWord.contains(word);
        }
        return singularWords.contains(word);
    }

    /**
     * Reads all lines of a UTF-8 classpath resource.
     *
     * @throws IllegalStateException if the resource is missing or cannot be read
     */
    private static List<String> readResourceLines(String resourcePath) {
        InputStream is = Checker.class.getResourceAsStream(resourcePath);
        if (is == null) {
            throw new IllegalStateException("Classpath resource not found: " + resourcePath);
        }
        try {
            return IOUtils.readLines(is, "utf-8");
        } catch (IOException ex) {
            throw new IllegalStateException(ex);
        } finally {
            IOUtils.closeQuietly(is);
        }
    }

    /**
     * Loads the base dictionary from {@code /bg_BG.dic} into {@link #dictionary}. Each line is
     * a base form, optionally followed by "/" and one character per inflection class.
     *
     * @throws IllegalStateException if the resource is missing or cannot be read
     */
    public static void load() {
        List<String> lines = readResourceLines("/bg_BG.dic");

        Trie<String, Set<String>> loaded = new PatriciaTrie<Set<String>>();
        for (String line : lines) {
            int paradigmIdx = line.indexOf("/");
            if (paradigmIdx == -1) {
                loaded.put(line.toLowerCase(), Collections.<String>emptySet());
                continue;
            }
            // /AK is possible, i.e. multiple infl. classes per word
            String classFlags = line.substring(paradigmIdx + 1);
            loaded.put(line.substring(0, paradigmIdx).toLowerCase(),
                    Sets.newHashSet(charToStringArray(classFlags.toCharArray())));
        }
        dictionary = loaded;
    }

    /**
     * Derives {@link #formsDictionary} from {@link #dictionary} and the inflection classes:
     * every base form and every form produced by a matching suffix rule is registered with
     * its type. The forms of the verb "to be" are registered explicitly at the end.
     *
     * @throws IllegalStateException if the base dictionary is not loaded
     */
    public static void loadFormsDictionary() {
        if (dictionary == null) {
            throw new IllegalStateException("The base dictionary must be loaded before the forms dictionary");
        }

        Trie<String, InflectedFormType> forms = new PatriciaTrie<InflectedFormType>();
        for (Map.Entry<String, Set<String>> word : dictionary.entrySet()) {
            String baseForm = word.getKey();
            if (word.getValue().isEmpty()) {
                forms.put(baseForm, InflectedFormType.NOT_INFLECTABLE);
                continue;
            }

            for (String inflectionClass : word.getValue()) {
                Multimap<String, String> inflections = inflectionClasses.get(inflectionClass);
                if (inflections == null) {
                    forms.put(baseForm, InflectedFormType.NOT_INFLECTABLE);
                    continue;
                }

                boolean specialCaseNoun = baseForm.endsWith("й")
                        && (inflectionClass.equals("O") || inflectionClass.equals("M"));
                boolean verb = verbClasses.contains(inflectionClass);
                // a class may be absent from the plurals file; then none of its suffixes is plural
                Multimap<String, String> pluralInflections = pluralInflectionClasses.get(inflectionClass);

                for (String ending : inflections.keySet()) {
                    if (!baseForm.endsWith(ending)) {
                        continue;
                    }
                    String stem = baseForm.substring(0, baseForm.length() - ending.length());
                    forms.put(baseForm, getInflectedFormType(specialCaseNoun, verb, false));

                    Collection<String> pluralSuffixes = pluralInflections == null
                            ? Collections.<String>emptySet()
                            : pluralInflections.get(ending);
                    for (String suffix : inflections.get(ending)) {
                        boolean isPlural = pluralSuffixes.contains(suffix);
                        forms.put(stem + suffix, getInflectedFormType(specialCaseNoun, verb, isPlural));
                    }
                }
            }
        }

        // override the forms of the verb "to be"
        for (String sgForm : toBeFormsSg) {
            forms.put(sgForm, InflectedFormType.REGULAR_FORM_VERB);
        }
        for (String plForm : toBeFormsPl) {
            forms.put(plForm, InflectedFormType.PLURAL_FORM_VERB);
        }
        formsDictionary = forms;
    }

    /**
     * Maps the three flags to a form type. The special-case noun flag takes precedence over
     * the verb flag.
     */
    public static InflectedFormType getInflectedFormType(boolean specialCaseNoun, boolean verb, boolean plural) {
        if (specialCaseNoun) {
            return plural ? InflectedFormType.PLURAL_FORM_SPECIAL : InflectedFormType.REGULAR_FORM_SPECIAL;
        }
        if (verb) {
            return plural ? InflectedFormType.PLURAL_FORM_VERB : InflectedFormType.REGULAR_FORM_VERB;
        }
        return plural ? InflectedFormType.PLURAL_FORM : InflectedFormType.REGULAR_FORM;
    }

    /**
     * Fills {@link #inflectionClasses} from {@code /bg_BG.aff} and
     * {@link #pluralInflectionClasses} from {@code /plurals.aff}.
     *
     * @throws IllegalStateException if a resource is missing or cannot be read
     */
    public static void loadInflections() {
        fillInflectionClasses(inflectionClasses, "/bg_BG.aff");
        fillInflectionClasses(pluralInflectionClasses, "/plurals.aff");
    }

    /**
     * Parses the "SFX" lines of an affix resource into the given map. The first "SFX" line of
     * the file and the first one after each blank line are taken as the header of a class and
     * start a fresh entry for it; every other "SFX" line is a rule whose third and fourth
     * whitespace-separated fields are the base form ending and the suffix ("0" meaning empty).
     * The class flag is the fifth character of the line.
     */
    private static void fillInflectionClasses(Map<String, Multimap<String, String>> map, String resourcePath) {
        List<String> lines = readResourceLines(resourcePath);

        // starts as true so that a file whose first SFX line has no blank line before it
        // still gets that line treated as a header
        boolean newInflectionClass = true;
        for (String line : lines) {
            if (line.trim().isEmpty()) {
                newInflectionClass = true;
                continue;
            }
            if (!line.startsWith("SFX") || line.length() < 5) {
                continue;
            }

            String inflectionClass = line.substring(4, 5);
            if (newInflectionClass) {
                map.put(inflectionClass, HashMultimap.<String, String>create());
            } else {
                String[] parts = line.split("\\p{Space}+");
                // a rule line too short to carry both fields is ignored
                if (parts.length >= 4) {
                    String baseFormEnding = parts[2].equals("0") ? "" : parts[2];
                    String suffix = parts[3].equals("0") ? "" : parts[3];

                    Multimap<String, String> rules = map.get(inflectionClass);
                    if (rules == null) {
                        // rule for a class whose header was not seen
                        rules = HashMultimap.<String, String>create();
                        map.put(inflectionClass, rules);
                    }
                    // the inflection suffixes are the values of the multimap, with key=the base form ending
                    rules.put(baseFormEnding, suffix);
                }
            }
            newInflectionClass = false;
        }
    }

    /** Converts each character to a one-character string. */
    private static String[] charToStringArray(char[] array) {
        String[] result = new String[array.length];
        for (int i = 0; i < array.length; i++) {
            result[i] = String.valueOf(array[i]);
        }
        return result;
    }

    /**
     * Manual smoke test: initializes a checker, processes a sample sentence and prints the
     * three result lists.
     */
    public static void main(String args[]) throws Exception {
        // Other sample inputs that were used for manual testing:
        //   "Който управлява и провежда изборите, невинаги ги печели, но почти винаги върши неща извън позволените, защото средствата за разкриването им са под негов контрол. Нарушения вършат и чуждите, но вероятността те да бъдат посочени, а нашите покрити, е докъм 100-процентова."
        //   "Децата, чийто обувки продадоха"
        //   "Човекът, който пее"
        //   "Кои ще дойде с мен"
        //   "Човекът, които дойде, ще ни донесе филий"
        //   "В София има много автобуси и трамваи."
        //   "Качих се на един трамвай"
        //   "Качих 32-ма полицаи. Тъпи ченгета! Край."
        //   "Които пее, зло не мисли"
        //   "видях бастуна, които те купиха"
        //   "двамата полицай, който видях"
        //   "хората, които дойдоха, ядат филий"
        //   "трима полицай"
        //   "Има ли хора, които да си хапват пълнени чушки точно в този момент?"
        //   "Господин полицаи, оставете ме намира."
        //   "Двамата полицай, който видях"
        //   "някой"
        //   "Общината планува да закупи 10 тролей"
        //   "Някой, който идва"
        //   "пеней на гошо, гошовата майка"
        //   "кой"
        //   "баба й дядо"
        String input = "Дойде Антиохий и падна зад мъжа, който те изядоха";

        Checker checker = new Checker();
        // process() refuses to run on an uninitialized checker
        checker.initialize();

        Result result = checker.process(input, true);
        System.out.println(result.getMistakes());
        System.out.println(result.getOtherSpellingMistakes());
        System.out.println(result.getProperNames());
    }

    /** A word reported by the checker, together with the neighbouring words used to judge it. */
    public static class Mistake {
        private String word;
        private String clause = "";
        private String nextInflectableWord = "";
        private String previousInflectableWord = "";
        private String previousWord = "";
        private int indexInText;

        public Mistake() {
        }

        public Mistake(String word) {
            this.word = word;
        }

        public String getWord() {
            return word;
        }

        public void setWord(String word) {
            this.word = word;
        }

        public String getClause() {
            return clause;
        }

        public void setClause(String sentence) {
            this.clause = sentence;
        }

        public String getNextInflectableWord() {
            return nextInflectableWord;
        }

        public void setNextInflectableWord(String nextWord) {
            this.nextInflectableWord = nextWord;
        }

        public int getIndexInText() {
            return indexInText;
        }

        public void setIndexInText(int indexInText) {
            this.indexInText = indexInText;
        }

        public String getPreviousInflectableWord() {
            return previousInflectableWord;
        }

        public void setPreviousInflectableWord(String previousWord) {
            this.previousInflectableWord = previousWord;
        }

        public String getPreviousWord() {
            return previousWord;
        }

        public void setPreviousWord(String previousWord) {
            this.previousWord = previousWord;
        }

        /** Tells whether the whole word matches the regex; {@code false} if no word is set. */
        public boolean matches(String regex) {
            return word != null && word.matches(regex);
        }

        @Override
        public String toString() {
            return "Mistake [word=" + word + ", nextInflectableWord=" + nextInflectableWord
                    + ", previousInflectableWord=" + previousInflectableWord + ", previousWord=" + previousWord + "]";
        }
    }

    /** The outcome of {@link Checker#process(String, boolean)}; the lists are empty until set. */
    public static class Result {
        private List<Mistake> mistakes = new ArrayList<Mistake>();
        private List<Mistake> otherSpellingMistakes = new ArrayList<Mistake>();
        private List<String> properNames = new ArrayList<String>();

        public List<Mistake> getMistakes() {
            return mistakes;
        }

        public void setMistakes(List<Mistake> mistakes) {
            this.mistakes = mistakes;
        }

        public List<Mistake> getOtherSpellingMistakes() {
            return otherSpellingMistakes;
        }

        public void setOtherSpellingMistakes(List<Mistake> spellingMistakes) {
            this.otherSpellingMistakes = spellingMistakes;
        }

        public List<String> getProperNames() {
            return properNames;
        }

        public void setProperNames(List<String> properNames) {
            this.properNames = properNames;
        }
    }

    // using enum for value in the trie to save memory - otherwise there will be different instance for each form
    public static enum InflectedFormType {
        PLURAL_FORM(true, false, false),
        PLURAL_FORM_SPECIAL(true, true, false),
        PLURAL_FORM_VERB(true, false, true),
        REGULAR_FORM(false, false, false),
        REGULAR_FORM_SPECIAL(false, true, false),
        REGULAR_FORM_VERB(false, false, true),
        NOT_INFLECTABLE(false, false, false);

        private final boolean plural;
        private final boolean specialCaseNoun;
        private final boolean verb;

        private InflectedFormType(boolean plural, boolean specialCaseNoun, boolean verb) {
            this.plural = plural;
            this.specialCaseNoun = specialCaseNoun;
            this.verb = verb;
        }

        public boolean isPlural() {
            return plural;
        }

        public boolean isSpecialCaseNoun() {
            return specialCaseNoun;
        }

        public boolean isVerb() {
            return verb;
        }
    }
}