/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2013 Zoltan Bartko, Aaron Madlon-Kay
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.tokenizer;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.HunspellStemFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.omegat.util.Language;
import org.omegat.util.Log;
import org.omegat.util.OConsts;
import org.omegat.util.Preferences;

/**
 * Methods for tokenizing strings.
 *
 * @author Zoltan Bartko - bartkozoltan@bartkozoltan.com
 * @author Aaron Madlon-Kay
 */
@Tokenizer(languages = { Tokenizer.DISCOVER_AT_RUNTIME })
public class HunspellTokenizer extends BaseTokenizer {

    private static Map<Language, File> AFFIX_FILES;
    private static Map<Language, File> DICTIONARY_FILES;

    private Dictionary dict;

    /**
     * Lazily load and cache the Hunspell dictionary for the current language.
     * Returns null (after logging an error) if no matching affix/dictionary
     * pair is installed or if loading fails.
     */
    private Dictionary getDict() {
        if (dict != null) {
            return dict;
        }
        if (AFFIX_FILES == null || DICTIONARY_FILES == null) {
            populateInstalledDicts();
        }

        Language language = getLanguage();
        File affixFile = AFFIX_FILES.get(language);
        File dictionaryFile = DICTIONARY_FILES.get(language);
        if (affixFile == null || dictionaryFile == null
                || !affixFile.exists() || !dictionaryFile.exists()) {
            Log.logErrorRB("HUNSPELL_TOKENIZER_DICT_NOT_INSTALLED", language.getLocale());
            return null;
        }

        // Close the streams once the dictionary has been built from them.
        try (FileInputStream affixStream = new FileInputStream(affixFile);
                FileInputStream dictionaryStream = new FileInputStream(dictionaryFile)) {
            dict = new Dictionary(affixStream, dictionaryStream);
            return dict;
        } catch (Exception ex) {
            Log.log(ex);
        }
        return null;
    }

    @Override
    protected TokenStream getTokenStream(final String strOrig, final boolean stemsAllowed,
            final boolean stopWordsAllowed) throws IOException {
        StandardTokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader(strOrig));

        if (stemsAllowed) {
            Dictionary dictionary = getDict();
            if (dictionary == null) {
                return tokenizer;
            }
            // TODO: implement stop word checks (stopWordsAllowed is currently ignored)
            return new HunspellStemFilter(tokenizer, dictionary);
        } else {
            return tokenizer;
        }
    }

    @Override
    public String[] getSupportedLanguages() {
        populateInstalledDicts();

        // Copy the key set so that retainAll() does not remove entries
        // from AFFIX_FILES itself.
        Set<Language> commonLangs = new HashSet<Language>(AFFIX_FILES.keySet());
        commonLangs.retainAll(DICTIONARY_FILES.keySet());

        return langsToStrings(commonLangs);
    }

    /**
     * Scan the configured spellchecker directory and index the installed
     * affix/dictionary files by language. Each file is registered both under
     * its full locale and under the bare language code.
     */
    private static void populateInstalledDicts() {
        AFFIX_FILES = new HashMap<Language, File>();
        DICTIONARY_FILES = new HashMap<Language, File>();

        String dictionaryDirPath =
                Preferences.getPreference(Preferences.SPELLCHECKER_DICTIONARY_DIRECTORY);
        if (dictionaryDirPath.isEmpty()) {
            return;
        }
        File dictionaryDir = new File(dictionaryDirPath);
        if (!dictionaryDir.isDirectory()) {
            return;
        }

        // listFiles() returns null on I/O error.
        File[] files = dictionaryDir.listFiles();
        if (files == null) {
            return;
        }
        for (File file : files) {
            String name = file.getName();
            if (name.endsWith(OConsts.SC_AFFIX_EXTENSION)) {
                Language lang = new Language(
                        name.substring(0, name.lastIndexOf(OConsts.SC_AFFIX_EXTENSION)));
                AFFIX_FILES.put(lang, file);
                AFFIX_FILES.put(new Language(lang.getLanguageCode()), file);
            } else if (name.endsWith(OConsts.SC_DICTIONARY_EXTENSION)) {
                Language lang = new Language(
                        name.substring(0, name.lastIndexOf(OConsts.SC_DICTIONARY_EXTENSION)));
                DICTIONARY_FILES.put(lang, file);
                DICTIONARY_FILES.put(new Language(lang.getLanguageCode()), file);
            }
        }
    }

    /**
     * Convert a set of languages to lowercased locale strings plus bare
     * language codes, as expected by the tokenizer discovery mechanism.
     */
    private static String[] langsToStrings(Set<Language> langs) {
        List<String> result = new ArrayList<String>();
        for (Language lang : langs) {
            result.add(lang.getLanguage().toLowerCase());
            result.add(lang.getLanguageCode().toLowerCase());
        }
        return result.toArray(new String[result.size()]);
    }
}