/* LanguageTool, a natural language style checker
* Copyright (C) 2012 Marcin Miłkowski (http://www.languagetool.org)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.rules.uk;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.ResourceBundle;
import java.util.regex.Pattern;
import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.rules.spelling.morfologik.MorfologikMultiSpeller;
import org.languagetool.rules.spelling.morfologik.MorfologikSpellerRule;
import org.languagetool.tagging.uk.IPOSTag;
public final class MorfologikUkrainianSpellerRule extends MorfologikSpellerRule {
private static final String ABBREVIATION_CHAR = ".";
private static final String RESOURCE_FILENAME = "/uk/hunspell/uk_UA.dict";
private static final Pattern UKRAINIAN_LETTERS = Pattern.compile(".*[а-яіїєґА-ЯІЇЄҐ].*");
private static final Pattern DO_NOT_SUGGEST_SPACED_PATTERN = Pattern.compile(
"(авіа|авто|анти|аудіо|відео|водо|гідро|екстра|квазі|кіно|лже|мета|моно|мото|псевдо|пост|радіо|стерео|супер|ультра|фото) .*");
public MorfologikUkrainianSpellerRule(ResourceBundle messages,
Language language) throws IOException {
super(messages, language);
// setCheckCompound(true);
}
@Override
public String getFileName() {
return RESOURCE_FILENAME;
}
@Override
public String getId() {
return "MORFOLOGIK_RULE_UK_UA";
}
@Override
protected boolean isMisspelled(MorfologikMultiSpeller speller, String word) {
if( word.endsWith("²") || word.endsWith("³") ) {
word = word.substring(0, word.length() - 1);
}
return super.isMisspelled(speller, word);
}
@Override
protected boolean ignoreToken(AnalyzedTokenReadings[] tokens, int idx) throws IOException {
String word = tokens[idx].getToken();
// don't check words that don't have Ukrainian letters
if( ! UKRAINIAN_LETTERS.matcher(word).matches() )
return true;
if( super.ignoreToken(tokens, idx) )
return true;
if( idx < tokens.length - 1 && tokens[idx+1].getToken().equals(ABBREVIATION_CHAR) ) {
if( super.ignoreWord(word + ABBREVIATION_CHAR) ) {
return true;
}
if( word.matches("[А-ЯІЇЄҐ]") ) { //TODO: only do this for initials when last name is followed
return true;
}
}
if( word.contains("-") || word.contains("\u2011") || word.endsWith(".")
|| word.equalsIgnoreCase("раза") ) {
return hasGoodTag(tokens[idx]);
}
return false;
}
private boolean hasGoodTag(AnalyzedTokenReadings tokens) {
for (AnalyzedToken analyzedToken : tokens) {
String posTag = analyzedToken.getPOSTag();
if( posTag != null
&& ! posTag.equals(JLanguageTool.SENTENCE_START_TAGNAME)
&& ! posTag.equals(JLanguageTool.SENTENCE_END_TAGNAME)
&& ! posTag.contains(IPOSTag.bad.getText())
&& ! (posTag.contains(":inanim") && posTag.contains(":v_kly")) )
return true;
}
return false;
}
@Override
protected void filterSuggestions(List<String> suggestions) {
super.filterSuggestions(suggestions);
for (Iterator<String> iterator = suggestions.iterator(); iterator.hasNext();) {
String item = iterator.next();
if( item.contains(" ") && DO_NOT_SUGGEST_SPACED_PATTERN.matcher(item).matches() ) {
iterator.remove();
}
}
}
}