// StarDict.java example
//
// Explorer
// OmegaT-master
/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2009 Alex Buloichik
               2015-2016 Hiroshi Miura, Aaron Madlon-Kay
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.core.dictionaries;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.TreeMap;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.zip.GZIPInputStream;

import org.dict.zip.DictZipInputStream;
import org.dict.zip.RandomAccessInputStream;
import org.omegat.util.Language;
import org.omegat.util.Log;

/**
 * Dictionary implementation for StarDict format.
 * <p>
 * StarDict format described on http://code.google.com/p/babiloo/wiki/StarDict_format
 * <p>
 * <h1>Files</h1>
 * Every dictionary consists of these files:
 * <ol><li>somedict.ifo
 * <li>somedict.idx or somedict.idx.gz
 * <li>somedict.dict or somedict.dict.dz
 * <li>somedict.syn (optional)
 * </ol>
 *
 * @author Alex Buloichik <alex73mail@gmail.com>
 * @author Hiroshi Miura
 * @author Aaron Madlon-Kay
 */
public class StarDict implements IDictionaryFactory {

    /** Storage format of the article data file, chosen from its file name. */
    private enum DictType {
        /** Data file ending in {@code .dz}; read through {@link DictZipInputStream}. */
        DICTZIP,
        /** Plain data file; read directly from disk. */
        DICTFILE
    }

    /**
     * A file is considered a StarDict dictionary when its path ends with
     * {@code .ifo} (case-sensitive).
     */
    @Override
    public boolean isSupportedFile(File file) {
        return file.getPath().endsWith(".ifo");
    }

    /**
     * Load the dictionary using a {@link Language} built from the JVM default
     * locale.
     */
    @Override
    public IDictionary loadDict(File file) throws Exception {
        return loadDict(file, new Language(Locale.getDefault()));
    }

    /**
     * Load the dictionary described by the given {@code .ifo} file.
     *
     * @param file
     *            the {@code .ifo} file
     * @param language
     *            language handed to the in-memory index
     */
    @Override
    public IDictionary loadDict(File file, Language language) throws Exception {
        return new StarDictDict(file, language);
    }

    /**
     * A single StarDict dictionary. The index (.idx / .idx.gz) is loaded into
     * memory by the constructor; article bodies are read from the data file
     * (.dict / .dict.dz) on first access and then cached per entry.
     */
    static class StarDictDict implements IDictionary {

        private final Language language;

        /**
         * Field in StarDict .ifo file, added in version 3.0.0. This must be
         * retained in order to support idxoffsetbits=64 dictionaries (not yet
         * implemented).
         *
         * @see <a href="http://www.stardict.org/StarDictFileFormat">StarDict
         *      File Format</a>
         */
        private int idxoffsetbits = 32;

        /** Path of the .ifo file with the {@code .ifo} suffix removed. */
        private final String dictName;
        private final DictType dictType;
        /** Path of the article data file (.dict.dz or .dict). */
        private final String dataFile;

        protected final DictionaryData<Entry> data;

        /**
         * Validate the .ifo header, locate the data and index files next to
         * it, and load the index.
         *
         * @param ifoFile
         *            ifo file with dictionary
         * @param language
         *            language handed to the in-memory index
         * @throws FileNotFoundException
         *             if no data file or no index file exists beside the .ifo
         * @throws Exception
         *             if the header is malformed, or the version,
         *             sametypesequence or idxoffsetbits is not supported
         */
        public StarDictDict(File ifoFile, Language language) throws Exception {

            this.language = language;

            Map<String, String> header = readIFO(ifoFile);
            String version = header.get("version");
            if (!"2.4.2".equals(version) && !"3.0.0".equals(version)) {
                throw new Exception("Invalid version of dictionary: " + version);
            }
            String sametypesequence = header.get("sametypesequence");
            if (!"g".equals(sametypesequence)
                    && !"m".equals(sametypesequence)
                    && !"x".equals(sametypesequence)
                    && !"h".equals(sametypesequence)) {
                throw new Exception("Invalid type of dictionary: " + sametypesequence);
            }

            // idxoffsetbits is only looked at for 3.0.0 headers.
            if ("3.0.0".equals(version)) {
                String bitsString = header.get("idxoffsetbits");
                if (bitsString != null) {
                    idxoffsetbits = Integer.parseInt(bitsString);
                }
            }

            if (idxoffsetbits != 32) {
                throw new Exception("StarDict dictionaries with idxoffsetbits=64 are not supported.");
            }

            String ifoPath = ifoFile.getPath();
            // Effectively-final copy so it can be used in the lambdas below.
            String baseName = ifoPath.endsWith(".ifo")
                    ? ifoPath.substring(0, ifoPath.length() - ".ifo".length())
                    : ifoPath;
            dictName = baseName;

            // The compressed variant is preferred when both files exist.
            File dictFile = getFile(".dict.dz", ".dict").orElseThrow(
                    () -> new FileNotFoundException("No .dict.dz or .dict files were found for " + baseName));
            dataFile = dictFile.getPath();
            dictType = dataFile.endsWith(".dz") ? DictType.DICTZIP : DictType.DICTFILE;

            File idxFile = getFile(".idx.gz", ".idx").orElseThrow(
                    () -> new FileNotFoundException("No .idx file could be found"));
            data = loadData(idxFile);
        }

        /**
         * Find the first existing regular file named {@link #dictName} plus
         * one of the given suffixes, trying the suffixes in order.
         */
        private Optional<File> getFile(String... suffixes) {
            return Stream.of(suffixes)
                    .map(suffix -> new File(dictName + suffix))
                    .filter(File::isFile)
                    .findFirst();
        }

        /**
         * Load the index file into memory. The file is a sequence of records,
         * each a zero-terminated UTF-8 key followed by two 4-byte integers
         * (article offset and article length) read with
         * {@link DataInputStream#readInt()}. Bytes after the last terminator
         * that do not form a complete key are ignored.
         *
         * @param idxFile
         *            index file; decompressed on the fly when its name ends
         *            with {@code .gz}
         * @return the populated index
         * @throws IOException
         *             if the file cannot be read or ends in the middle of a
         *             record
         */
        private DictionaryData<Entry> loadData(File idxFile) throws IOException {
            DictionaryData<Entry> newData = new DictionaryData<>(language);
            boolean gzipped = idxFile.getName().endsWith(".gz");
            // The raw file stream is its own resource so that it is closed even
            // when the GZIP wrapper fails to construct (e.g. bad GZIP header).
            try (InputStream raw = new FileInputStream(idxFile);
                    InputStream plain = gzipped ? new GZIPInputStream(raw) : raw;
                    DataInputStream idx = new DataInputStream(new BufferedInputStream(plain));
                    ByteArrayOutputStream mem = new ByteArrayOutputStream()) {
                int b;
                while ((b = idx.read()) != -1) {
                    if (b == 0) {
                        // End of key: the two integers follow immediately.
                        String key = new String(mem.toByteArray(), StandardCharsets.UTF_8);
                        mem.reset();
                        int bodyOffset = idx.readInt();
                        int bodyLength = idx.readInt();
                        newData.add(key, new Entry(bodyOffset, bodyLength));
                    } else {
                        mem.write(b);
                    }
                }
            }
            newData.done();
            return newData;
        }

        /**
         * Look up the word in the index and return the matching entries with
         * their article text.
         */
        @Override
        public List<DictionaryEntry> readArticles(String word) throws Exception {
            return data.lookUp(word).stream()
                    .map(e -> new DictionaryEntry(e.getKey(), e.getValue().getArticle()))
                    .collect(Collectors.toList());
        }

        /**
         * Predictive variant of {@link #readArticles(String)}; delegates to
         * the index's predictive lookup.
         */
        @Override
        public List<DictionaryEntry> readArticlesPredictive(String word) {
            return data.lookUpPredictive(word).stream()
                    .map(e -> new DictionaryEntry(e.getKey(), e.getValue().getArticle()))
                    .collect(Collectors.toList());
        }

        /**
         * Read an article from the data file on disk. Convenience method that
         * dispatches on {@link #dictType} to call the appropriate
         * format-specific method.
         * <p>
         * Synchronized to prevent concurrent reading of the same file from
         * disk.
         *
         * @param start
         *            Start offset in data file
         * @param len
         *            Length of article data
         * @return Raw article text, or {@code null} if it could not be read
         *         (the failure is logged by the format-specific method)
         */
        private synchronized String readArticle(int start, int len) {
            switch (dictType) {
            case DICTFILE:
                return readDictArticleText(start, len);
            case DICTZIP:
                return readDictZipArticleText(start, len);
            default:
                throw new RuntimeException("Unknown StarDict dictionary type: " + dictType);
            }
        }

        /**
         * Read .dict file data and return article string. Intended to be called
         * only from {@link #readArticle(int, int)}.
         * <p>
         * The whole article must be readable: if the file ends before
         * {@code len} bytes are available the read fails, matching the
         * behavior of {@link #readDictZipArticleText(int, int)}.
         *
         * @param start
         *            Start offset in data file
         * @param len
         *            Length of article data
         * @return Raw article text, or {@code null} if an I/O error occurred
         *         (the error is logged)
         */
        private String readDictArticleText(int start, int len) {
            String result = null;
            try (DataInputStream in = new DataInputStream(new FileInputStream(dataFile))) {
                if (start < 0 || len < 0) {
                    // Index values are read as signed ints; reject instead of
                    // failing later with NegativeArraySizeException.
                    throw new IOException("Invalid article position: start=" + start + ", len=" + len);
                }
                // skip() may advance less than requested, so loop until done.
                long remaining = start;
                while (remaining > 0) {
                    long skipped = in.skip(remaining);
                    if (skipped <= 0) {
                        throw new IOException("Cannot seek dictionary.");
                    }
                    remaining -= skipped;
                }
                byte[] buf = new byte[len];
                // readFully either fills the buffer or throws EOFException; a
                // bare read() could return -1 or a partial count.
                in.readFully(buf);
                result = new String(buf, StandardCharsets.UTF_8);
            } catch (IOException e) {
                Log.log(e);
            }
            return result;
        }

        /**
         * Read .dict.dz file data. Intended to be called only from
         * {@link #readArticle(int, int)}.
         *
         * @param start
         *            Start offset in data file
         * @param len
         *            Length of article data
         * @return Raw article text, or {@code null} if an I/O error occurred
         *         (the error is logged)
         */
        private String readDictZipArticleText(int start, int len) {
            String result = null;
            try (DictZipInputStream din = new DictZipInputStream(new
                    RandomAccessInputStream(dataFile, "r"))) {
                if (start < 0 || len < 0) {
                    // Index values are read as signed ints; reject instead of
                    // failing later with NegativeArraySizeException.
                    throw new IOException("Invalid article position: start=" + start + ", len=" + len);
                }
                din.seek(start);
                byte[] buf = new byte[len];
                din.readFully(buf);
                result = new String(buf, StandardCharsets.UTF_8);
            } catch (IOException e) {
                Log.log(e);
            }
            return result;
        }

        /**
         * Read the .ifo header. The first line must be the fixed StarDict
         * marker; every following non-blank line must be {@code key=value}
         * and is split at the first {@code '='}.
         *
         * @param ifoFile
         *            header file, read as UTF-8
         * @return header fields keyed by name
         * @throws Exception
         *             if the marker line is wrong or a line has no
         *             {@code '='}
         */
        private Map<String, String> readIFO(File ifoFile) throws Exception {
            Map<String, String> result = new TreeMap<>();
            try (BufferedReader rd = Files.newBufferedReader(ifoFile.toPath(), StandardCharsets.UTF_8)) {
                String first = rd.readLine();
                if (!"StarDict's dict ifo file".equals(first)) {
                    throw new Exception("Invalid header of .ifo file: " + first);
                }
                String line;
                while ((line = rd.readLine()) != null) {
                    if (line.trim().isEmpty()) {
                        continue;
                    }
                    int pos = line.indexOf('=');
                    if (pos < 0) {
                        throw new Exception("Invalid format of .ifo file: " + line);
                    }
                    result.put(line.substring(0, pos), line.substring(pos + 1));
                }
            }
            return result;
        }

        /**
         * Location of one article inside the data file, with a lazily filled
         * cache of its text.
         */
        class Entry {
            /** Article text once successfully loaded; {@code null} until then. */
            private volatile String cache;
            private final int start;
            private final int len;

            Entry(int start, int len) {
                this.start = start;
                this.len = len;
            }

            /**
             * Return the article text, loading it from disk on first use. The
             * volatile field is checked before and after taking the lock so
             * the article is loaded at most once per successful read.
             *
             * @return the article with newlines replaced by {@code <br>}, or
             *         an empty string if it could not be read; a failed read
             *         is not cached, so the next call tries again
             */
            public String getArticle() {
                String article = cache;
                if (article == null) {
                    synchronized (this) {
                        article = cache;
                        if (article == null) {
                            article = loadArticle();
                            cache = article;
                        }
                    }
                }
                return article == null ? "" : article;
            }

            /**
             * Read the raw article and convert newlines to {@code <br>}.
             *
             * @return converted text, or {@code null} if the read failed
             */
            private String loadArticle() {
                String raw = readArticle(start, len);
                // readArticle yields null after a logged I/O failure; do not
                // turn that into a NullPointerException here.
                return raw == null ? null : raw.replace("\n", "<br>");
            }
        }
    }
}