package bg.bozho.ikratko.other; import java.util.Collection; import java.util.Map; import java.util.Set; import org.apache.commons.collections4.trie.PatriciaTrie; import bg.bozho.ikratko.Checker; import bg.bozho.ikratko.Checker.InflectedFormType; import com.google.appengine.repackaged.com.google.common.collect.Maps; import com.google.appengine.repackaged.com.google.common.collect.Sets; import com.google.common.collect.Multimap; /** * Намиране на всички омоними, вкл. тези, които са омоними само в определена форма. * * @author bozhanov * */ public class Homonyms { public static void main(String[] args) { Checker c = new Checker(); c.initialize(false); Checker.formsDictionary.clear(); // we don't need it in memory Set<String> homonyms = Sets.newHashSet(); // this is a variation of the formsDictionary initialization method, which counts homonyms PatriciaTrie<InflectedFormType> formsDictionary = new PatriciaTrie<InflectedFormType>(); for (Map.Entry<String, Set<String>> word : Checker.dictionary.entrySet()) { String baseForm = word.getKey(); if (word.getValue().isEmpty()) { checkAndInsertHomonym(baseForm, formsDictionary, homonyms, InflectedFormType.NOT_INFLECTABLE, baseForm); formsDictionary.put(baseForm, InflectedFormType.NOT_INFLECTABLE); continue; } for (String inflectionClass : word.getValue()) { Multimap<String, String> inflections = Checker.inflectionClasses.get(inflectionClass); if (inflections == null) { checkAndInsertHomonym(baseForm, formsDictionary, homonyms, InflectedFormType.NOT_INFLECTABLE, baseForm); formsDictionary.put(baseForm, InflectedFormType.NOT_INFLECTABLE); continue; } if (inflectionClass.equals("O") && (baseForm.endsWith("ане") || baseForm.endsWith("яне"))) { continue; // отлаголни съществителни } boolean specialCaseNoun = false; if (baseForm.endsWith("й") && (inflectionClass.equals("O") || inflectionClass.equals("M"))) { specialCaseNoun = true; } boolean verb = Checker.verbClasses.contains(inflectionClass); for (String ending : inflections.keySet()) { int endingIdx = baseForm.lastIndexOf(ending); if (!baseForm.endsWith(ending) || endingIdx == -1) { continue; } InflectedFormType type = Checker.getInflectedFormType(specialCaseNoun, verb, false); checkAndInsertHomonym(baseForm, formsDictionary, homonyms, type, baseForm); formsDictionary.put(baseForm, type); Collection<String> pluralSuffixes = Checker.pluralInflectionClasses.get(inflectionClass).get(ending); for (String suffix : inflections.get(ending)) { String inflectedWord = baseForm.substring(0, endingIdx) + suffix; boolean isPlural = pluralSuffixes.contains(suffix); InflectedFormType formType = Checker.getInflectedFormType(specialCaseNoun, verb, isPlural); checkAndInsertHomonym(inflectedWord, formsDictionary, homonyms, formType, baseForm); formsDictionary.put(inflectedWord, formType); } } } } System.out.println(homonyms.size()); } private static Map<String, String> mapping = Maps.newHashMap(); private static void checkAndInsertHomonym(String word, PatriciaTrie<InflectedFormType> formsDictionary, Set<String> homonyms, InflectedFormType type, String baseForm) { if (formsDictionary.containsKey(word)) { String originalBase = mapping.get(word); InflectedFormType originalType = formsDictionary.get(word); if (baseForm != null && originalBase != null) { if (sameRoot(baseForm, originalBase) // heuristic based on length || ignore(baseForm, originalBase, "н", "м", 1, 1, type, originalType, false, true) // "шлифовам" и "шлифован", напр. || ignore(baseForm, originalBase, "я", "ен", 1, 2, type, originalType, true, false) // червя и червен || ignore(baseForm, originalBase, "ващ", "вам", 1, 1, type, originalType, false, true) || ignore(baseForm, originalBase, "вяне", "вям", 2, 1, type, originalType, false, true) || ignore(baseForm, originalBase, "ан", "а", 1, 0, type, originalType, false, true) || ignore(baseForm, originalBase, "ян", "а", 1, 0, type, originalType, false, true) || ignore(baseForm, originalBase, "ение", "а", 4, 1, type, originalType, false, true) || ignore(baseForm, originalBase, "я", "ение", 1, 4, type, originalType, true, false) || ignore(baseForm, originalBase, "ат", "а", 1, 0, type, originalType, false, true) || ignore(baseForm, originalBase, "ая", "ан", 1, 1, type, originalType, true, false) || ignore(baseForm, originalBase, "я", "ене", 1, 3, type, originalType, true, false) || ignore(baseForm, originalBase, "я", "ан", 1, 2, type, originalType, true, false) || ignore(baseForm, originalBase, "ен", "а", 2, 1, type, originalType, false, true) || ignore(baseForm, originalBase, "я", "ея", 1, 2, type, originalType, true, true) || ignore(baseForm, originalBase, "я", "ещ", 1, 2, type, originalType, true, false) || ignore(baseForm, originalBase, "ящ", "я", 2, 1, type, originalType, false, true) || ignore(baseForm, originalBase, "ещ", "а", 2, 1, type, originalType, false, true) || ignore(baseForm, originalBase, "ин", "", 2, 0, type, originalType, false, false) || ignore(baseForm, originalBase, "ия", "ил", 2, 2, type, originalType, true, false) || ignore(baseForm, originalBase, "ия", "ит", 2, 2, type, originalType, true, false) || ignore(baseForm, originalBase, "ял", "я", 1, 0, type, originalType, false, true) || ignore(baseForm, originalBase, "ям", "я", 1, 0, type, originalType, false, true)) { return; // омоними в основна форма - няма нужда от всичките им форми } else if (baseForm.equals(originalBase) && !word.equals(baseForm)) { return; } } System.out.println(word + " (" + type + "): " + originalType + " (base: " + baseForm + "), original base: " + mapping.get(word) + ")"); homonyms.add(word); } else { mapping.put(word, baseForm); } } private static boolean sameRoot(String baseForm, String originalBase) { // with sufficient length, we can ignore suffixes and return baseForm.length() > 6 && originalBase.length() > 6 && baseForm.substring(0, 5).equals(originalBase.substring(0, 5)); } private static boolean ignore(String baseForm, String originalBase, String suffixBase, String suffixOriginal, int comparisonCutBase, int comparisonCutOriginal, InflectedFormType type, InflectedFormType originalType, boolean baseFormShouldBeVerb, boolean originalShouldBeVerb) { boolean formComparison = originalBase.endsWith(suffixOriginal) && baseForm.endsWith(suffixBase) && originalBase.substring(0, originalBase.length() - comparisonCutOriginal) .equals(baseForm.substring(0, baseForm.length() - comparisonCutBase)); if (!formComparison) { return false; } else { // requirements for form types that would mean that the even though form comparison succeeds, the root is not the same boolean requirementsMet = false; if (!baseFormShouldBeVerb && !originalShouldBeVerb) { requirementsMet = true; } if (baseFormShouldBeVerb && type.isVerb()) { requirementsMet = true; } if (originalShouldBeVerb && originalType.isVerb()) { requirementsMet = true; } return requirementsMet; } } }