HunspellTokenizer.java example

Explorer
OmegaT-master
/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2013 Zoltan Bartko, Aaron Madlon-Kay
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.tokenizer;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.HunspellStemFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.omegat.util.Language;
import org.omegat.util.Log;
import org.omegat.util.OConsts;
import org.omegat.util.Preferences;

/**
 * Tokenizer that stems tokens with a Hunspell dictionary taken from the
 * spellchecker dictionary directory configured in the preferences.
 * <p>
 * The supported languages are not fixed at compile time
 * ({@code Tokenizer.DISCOVER_AT_RUNTIME}); they are derived from the affix and
 * dictionary files found in that directory.
 *
 * @author Zoltan Bartko - bartkozoltan@bartkozoltan.com
 * @author Aaron Madlon-Kay
 */
@Tokenizer(languages = { Tokenizer.DISCOVER_AT_RUNTIME })
public class HunspellTokenizer extends BaseTokenizer {

    /** Affix file per language; {@code null} until {@link #populateInstalledDicts()} has run. */
    private static Map<Language, File> AFFIX_FILES;
    /** Dictionary file per language; {@code null} until {@link #populateInstalledDicts()} has run. */
    private static Map<Language, File> DICTIONARY_FILES;

    /** Dictionary for this tokenizer's language, loaded on first use by {@link #getDict()}. */
    private Dictionary dict;

    /**
     * Returns the Hunspell dictionary for the current language, loading it on
     * first use.
     *
     * @return the loaded dictionary, or {@code null} if no affix/dictionary
     *         file pair is installed for the language or loading failed
     */
    private Dictionary getDict() {
        if (dict != null) {
            return dict;
        }

        if (AFFIX_FILES == null || DICTIONARY_FILES == null) {
            populateInstalledDicts();
        }

        Language language = getLanguage();
        File affixFile = AFFIX_FILES.get(language);
        File dictionaryFile = DICTIONARY_FILES.get(language);

        if (affixFile == null || dictionaryFile == null || !affixFile.exists()
                || !dictionaryFile.exists()) {
            Log.logErrorRB("HUNSPELL_TOKENIZER_DICT_NOT_INSTALLED", language.getLocale());
            // Nothing to load: stop here instead of trying to open a null or
            // missing file and relying on the resulting exception.
            return null;
        }

        // The Dictionary constructor does not close the streams it is given,
        // so they are closed here once it has finished reading them.
        try (InputStream affixStream = new FileInputStream(affixFile);
                InputStream dictionaryStream = new FileInputStream(dictionaryFile)) {
            dict = new Dictionary(affixStream, dictionaryStream);
            return dict;
        } catch (Exception ex) {
            // Best effort: a dictionary that cannot be read or parsed must not
            // break tokenization, but the cause is recorded in the log.
            Log.log(ex);
            return null;
        }
    }

    /**
     * Creates the token stream for the given text.
     *
     * @param strOrig
     *            text to tokenize
     * @param stemsAllowed
     *            when {@code true}, tokens are stemmed with the Hunspell
     *            dictionary if one could be loaded
     * @param stopWordsAllowed
     *            currently ignored; stop words are never filtered
     * @return a Hunspell stemming stream, or the plain standard tokenizer when
     *         stemming is not requested or no dictionary is available
     * @throws IOException
     *             declared by the overridden method
     */
    @Override
    protected TokenStream getTokenStream(final String strOrig, final boolean stemsAllowed,
            final boolean stopWordsAllowed) throws IOException {
        StandardTokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader(strOrig));
        if (!stemsAllowed) {
            return tokenizer;
        }

        Dictionary dictionary = getDict();
        if (dictionary == null) {
            // No usable dictionary: fall back to unstemmed tokens.
            return tokenizer;
        }

        /// TODO: implement stop words checks
        return new HunspellStemFilter(tokenizer, dictionary);
    }

    /**
     * Rescans the dictionary directory and reports the languages for which
     * both an affix file and a dictionary file are present.
     *
     * @return lower-cased language identifiers, see
     *         {@link #langsToStrings(Set)}
     */
    @Override
    public String[] getSupportedLanguages() {

        populateInstalledDicts();

        // keySet() is a live view of the map: intersect on a copy so that
        // retainAll() does not delete entries from AFFIX_FILES itself.
        Set<Language> commonLangs = new HashSet<Language>(AFFIX_FILES.keySet());
        commonLangs.retainAll(DICTIONARY_FILES.keySet());

        return langsToStrings(commonLangs);
    }

    /**
     * Rebuilds {@link #AFFIX_FILES} and {@link #DICTIONARY_FILES} from the
     * files in the configured spellchecker dictionary directory.
     * <p>
     * Every file is registered twice: under the language named by the file
     * name without its extension, and under that language's bare language
     * code. Both maps are left empty when the directory is not configured,
     * is not a directory, or cannot be listed.
     */
    private static void populateInstalledDicts() {
        Map<Language, File> affixFiles = new HashMap<Language, File>();
        Map<Language, File> dictionaryFiles = new HashMap<Language, File>();

        String dictionaryDirPath = Preferences.getPreference(Preferences.SPELLCHECKER_DICTIONARY_DIRECTORY);
        File[] files = null;
        if (dictionaryDirPath != null && !dictionaryDirPath.isEmpty()) {
            // listFiles() returns null if the path is not a directory or an
            // I/O error occurs, which covers the "not a directory" case too.
            files = new File(dictionaryDirPath).listFiles();
        }

        if (files != null) {
            for (File file : files) {
                String name = file.getName();
                if (name.endsWith(OConsts.SC_AFFIX_EXTENSION)) {
                    Language lang = new Language(
                            name.substring(0, name.lastIndexOf(OConsts.SC_AFFIX_EXTENSION)));
                    affixFiles.put(lang, file);
                    affixFiles.put(new Language(lang.getLanguageCode()), file);
                } else if (name.endsWith(OConsts.SC_DICTIONARY_EXTENSION)) {
                    Language lang = new Language(
                            name.substring(0, name.lastIndexOf(OConsts.SC_DICTIONARY_EXTENSION)));
                    dictionaryFiles.put(lang, file);
                    dictionaryFiles.put(new Language(lang.getLanguageCode()), file);
                }
            }
        }

        // Publish the maps only once they are completely filled.
        AFFIX_FILES = affixFiles;
        DICTIONARY_FILES = dictionaryFiles;
    }

    /**
     * Converts languages to the string form returned by
     * {@link #getSupportedLanguages()}.
     *
     * @param langs
     *            languages to convert
     * @return for each language, its {@code getLanguage()} value followed by
     *         its {@code getLanguageCode()} value, both lower-cased
     */
    private static String[] langsToStrings(Set<Language> langs) {
        List<String> result = new ArrayList<String>();
        for (Language lang : langs) {
            // Locale.ROOT keeps the result independent of the default locale
            // (a Turkish default locale would turn "I" into a dotless "ı").
            result.add(lang.getLanguage().toLowerCase(Locale.ROOT));
            result.add(lang.getLanguageCode().toLowerCase(Locale.ROOT));
        }
        return result.toArray(new String[result.size()]);
    }
}