/* LanguageTool, a natural language style checker * Copyright (C) 2012 Marcin Miłkowski (http://www.languagetool.org) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 * USA */ package org.languagetool.rules.pl; import org.languagetool.AnalyzedTokenReadings; import org.languagetool.Language; import org.languagetool.rules.*; import org.languagetool.rules.spelling.morfologik.MorfologikSpellerRule; import java.io.IOException; import java.util.*; import java.util.regex.Pattern; public final class MorfologikPolishSpellerRule extends MorfologikSpellerRule { private static final String RESOURCE_FILENAME = "/pl/hunspell/pl_PL.dict"; private static final Pattern POLISH_TOKENIZING_CHARS = Pattern.compile("(?:[Qq]uasi|[Nn]iby)-"); /** * The set of prefixes that are not allowed to be split in the suggestions. */ private static final Set<String> prefixes; //Polish prefixes that should never be used to //split parts of words static { final Set<String> tempSet = new HashSet<>(); tempSet.add("arcy"); tempSet.add("neo"); tempSet.add("pre"); tempSet.add("anty"); tempSet.add("eks"); tempSet.add("bez"); tempSet.add("beze"); tempSet.add("ekstra"); tempSet.add("hiper"); tempSet.add("infra"); tempSet.add("kontr"); tempSet.add("maksi"); tempSet.add("midi"); tempSet.add("między"); tempSet.add("mini"); tempSet.add("nad"); tempSet.add("nade"); tempSet.add("około"); tempSet.add("ponad"); tempSet.add("post"); tempSet.add("pro"); tempSet.add("przeciw"); tempSet.add("pseudo"); tempSet.add("super"); tempSet.add("śród"); tempSet.add("ultra"); tempSet.add("wice"); tempSet.add("wokół"); tempSet.add("wokoło"); prefixes = Collections.unmodifiableSet(tempSet); } /** * non-word suffixes that should not be suggested (only morphological endings, never after a space) */ private static final Set<String> bannedSuffixes; static { final Set<String> tempSet = new HashSet<>(); tempSet.add("ami"); tempSet.add("ach"); tempSet.add("e"); tempSet.add("ego"); tempSet.add("em"); tempSet.add("emu"); tempSet.add("ie"); tempSet.add("im"); tempSet.add("m"); tempSet.add("om"); tempSet.add("owie"); tempSet.add("owi"); tempSet.add("ze"); bannedSuffixes = Collections.unmodifiableSet(tempSet); } public MorfologikPolishSpellerRule(ResourceBundle messages, Language language) throws IOException { super(messages, language); setCategory(Categories.TYPOS.getCategory(messages)); addExamplePair(Example.wrong("To jest zdanie z <marker>bledem</marker>"), Example.fixed("To jest zdanie z <marker>błędem</marker>.")); } @Override public String getFileName() { return RESOURCE_FILENAME; } @Override public String getId() { return "MORFOLOGIK_RULE_PL_PL"; } @Override public Pattern tokenizingPattern() { return POLISH_TOKENIZING_CHARS; } @Override protected List<RuleMatch> getRuleMatches(final String word, final int startPos) throws IOException { final List<RuleMatch> ruleMatches = new ArrayList<>(); if (isMisspelled(speller1, word) && isNotCompound(word)) { final RuleMatch ruleMatch = new RuleMatch(this, startPos, startPos + word.length(), messages.getString("spelling"), messages.getString("desc_spelling_short")); //If lower case word is not a misspelled word, return it as the only suggestion if (!isMisspelled(speller1, word.toLowerCase(conversionLocale))) { List<String> suggestion = Arrays.asList(word.toLowerCase(conversionLocale)); ruleMatch.setSuggestedReplacements(suggestion); ruleMatches.add(ruleMatch); return ruleMatches; } List<String> suggestions = speller1.getSuggestions(word); suggestions.addAll(0, getAdditionalTopSuggestions(suggestions, word)); suggestions.addAll(getAdditionalSuggestions(suggestions, word)); if (!suggestions.isEmpty()) { ruleMatch.setSuggestedReplacements(pruneSuggestions(orderSuggestions(suggestions,word))); } ruleMatches.add(ruleMatch); } return ruleMatches; } /** * Check whether the word is a compound adjective or contains a non-splitting prefix. * Used to suppress false positives. * * @param word Word to be checked. * @return True if the word is not a compound. * @since 2.5 */ private boolean isNotCompound(String word) throws IOException { List<String> probablyCorrectWords = new ArrayList<>(); List<String> testedTokens = new ArrayList<>(2); for (int i = 2; i < word.length(); i++) { // chop from left to right final String first = word.substring(0, i); final String second = word.substring(i, word.length()); if (prefixes.contains(first.toLowerCase(conversionLocale)) && !isMisspelled(speller1, second) && second.length() > first.length()) { // but not for short words such as "premoc" // ignore this match, it's fine probablyCorrectWords.add(word); // FIXME: some strange words are being accepted, like prekupa } else { testedTokens.clear(); testedTokens.add(first); testedTokens.add(second); List<AnalyzedTokenReadings> taggedToks = language.getTagger().tag(testedTokens); if (taggedToks.size() == 2 // "białozielony", trzynastobitowy && (taggedToks.get(0).hasPosTag("adja") || (taggedToks.get(0).hasPosTag("num:comp") && !taggedToks.get(0).hasPosTag("adv"))) && taggedToks.get(1).hasPartialPosTag("adj:")) { probablyCorrectWords.add(word); } } } if (!probablyCorrectWords.isEmpty()) { addIgnoreTokens(probablyCorrectWords); return false; } return true; } /** * Remove suggestions -- not really runon words using a list of non-word suffixes * @return A list of pruned suggestions. */ private List<String> pruneSuggestions(final List<String> suggestions) { List<String> prunedSuggestions = new ArrayList<>(suggestions.size()); for (final String suggestion : suggestions) { if (suggestion.indexOf(' ') == -1) { prunedSuggestions.add(suggestion); } else { String[] complexSug = suggestion.split(" "); if (!bannedSuffixes.contains(complexSug[1])) { prunedSuggestions.add(suggestion); } } } return prunedSuggestions; } }