/************************************************************************** OmegaT - Computer Assisted Translation (CAT) tool with fuzzy matching, translation memory, keyword search, glossaries, and translation leveraging into updated projects. Copyright (C) 2009 Alex Buloichik 2015-2016 Hiroshi Miura, Aaron Madlon-Kay Home page: http://www.omegat.org/ Support center: http://groups.yahoo.com/group/OmegaT/ This file is part of OmegaT. OmegaT is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OmegaT is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. **************************************************************************/ package org.omegat.core.dictionaries; import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.ByteArrayOutputStream; import java.io.DataInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.NoSuchElementException; import java.util.Optional; import java.util.TreeMap; import java.util.stream.Collectors; import java.util.stream.Stream; import java.util.zip.GZIPInputStream; import org.dict.zip.DictZipInputStream; import org.dict.zip.RandomAccessInputStream; import org.omegat.util.Language; import org.omegat.util.Log; /** * Dictionary implementation for StarDict format. * <p> * StarDict format described on http://code.google.com/p/babiloo/wiki/StarDict_format * <p> * <h1>Files</h1> * Every dictionary consists of these files: * <ol><li>somedict.ifo * <li>somedict.idx or somedict.idx.gz * <li>somedict.dict or somedict.dict.dz * <li>somedict.syn (optional) * </ol> * * @author Alex Buloichik <alex73mail@gmail.com> * @author Hiroshi Miura * @author Aaron Madlon-Kay */ public class StarDict implements IDictionaryFactory { private enum DictType { DICTZIP, DICTFILE } @Override public boolean isSupportedFile(File file) { return file.getPath().endsWith(".ifo"); } @Override public IDictionary loadDict(File file) throws Exception { return loadDict(file, new Language(Locale.getDefault())); } @Override public IDictionary loadDict(File file, Language language) throws Exception { return new StarDictDict(file, language); } static class StarDictDict implements IDictionary { private final Language language; /** * Field in StarDict .ifo file, added in version 3.0.0. This must be * retained in order to support idxoffsetbits=64 dictionaries (not yet * implemented). * * @see <a href="http://www.stardict.org/StarDictFileFormat">StarDict * File Format</a> */ private int idxoffsetbits = 32; private final String dictName; private final DictType dictType; private final String dataFile; protected final DictionaryData<Entry> data; /** * @param ifoFile * ifo file with dictionary */ public StarDictDict(File ifoFile, Language language) throws Exception { this.language = language; Map<String, String> header = readIFO(ifoFile); String version = header.get("version"); if (!"2.4.2".equals(version) && !"3.0.0".equals(version)) { throw new Exception("Invalid version of dictionary: " + version); } String sametypesequence = header.get("sametypesequence"); if (!"g".equals(sametypesequence) && !"m".equals(sametypesequence) && !"x".equals(sametypesequence) && !"h".equals(sametypesequence)) { throw new Exception("Invalid type of dictionary: " + sametypesequence); } if ("3.0.0".equals(version)) { String bitsString = header.get("idxoffsetbits"); if (bitsString != null) { idxoffsetbits = Integer.parseInt(bitsString); } } if (idxoffsetbits != 32) { throw new Exception("StarDict dictionaries with idxoffsetbits=64 are not supported."); } String f = ifoFile.getPath(); if (f.endsWith(".ifo")) { f = f.substring(0, f.length() - ".ifo".length()); } dictName = f; try { dataFile = getFile(".dict.dz", ".dict").get().getPath(); dictType = dataFile.endsWith(".dz") ? DictType.DICTZIP : DictType.DICTFILE; } catch (NoSuchElementException ex) { throw new FileNotFoundException("No .dict.dz or .dict files were found for " + dictName); } try { data = loadData(getFile(".idx.gz", ".idx").get()); } catch (NoSuchElementException ex) { throw new FileNotFoundException("No .idx file could be found"); } } private Optional<File> getFile(String... suffixes) { return Stream.of(suffixes).map(suff -> new File(dictName + suff)).filter(f -> f.isFile()) .findFirst(); } private DictionaryData<Entry> loadData(File idxFile) throws IOException { InputStream is = new FileInputStream(idxFile); if (idxFile.getName().endsWith(".gz")) { is = new GZIPInputStream(is); } DictionaryData<Entry> newData = new DictionaryData<>(language); try (DataInputStream idx = new DataInputStream(new BufferedInputStream(is)); ByteArrayOutputStream mem = new ByteArrayOutputStream()) { while (true) { int b = idx.read(); if (b == -1) { break; } if (b == 0) { String key = new String(mem.toByteArray(), 0, mem.size(), StandardCharsets.UTF_8); mem.reset(); int bodyOffset = idx.readInt(); int bodyLength = idx.readInt(); newData.add(key, new Entry(bodyOffset, bodyLength)); } else { mem.write(b); } } } is.close(); newData.done(); return newData; } @Override public List<DictionaryEntry> readArticles(String word) throws Exception { return data.lookUp(word).stream().map(e -> new DictionaryEntry(e.getKey(), e.getValue().getArticle())) .collect(Collectors.toList()); } @Override public List<DictionaryEntry> readArticlesPredictive(String word) { return data.lookUpPredictive(word).stream() .map(e -> new DictionaryEntry(e.getKey(), e.getValue().getArticle())).collect(Collectors.toList()); } /** * Read an article from the data file on disk. Convenience method that * dispatches on {@link #dictType} to call the appropriate * format-specific method. * <p> * Synchronized to prevent concurrent reading of the same file from * disk. * * @param start * Start offset in data file * @param len * Length of article data * @return Raw article text */ private synchronized String readArticle(int start, int len) { switch (dictType) { case DICTFILE: return readDictArticleText(start, len); case DICTZIP: return readDictZipArticleText(start, len); default: throw new RuntimeException("Unknown StarDict dictionary type: " + dictType); } } /** * Read .dict file data and return article string. Intended to be called * only from {@link #readArticle(int, int)}. * * @param start * Start offset in data file * @param len * Length of article data * @return Raw article text */ private String readDictArticleText(int start, int len) { String result = null; try (FileInputStream in = new FileInputStream(dataFile)) { byte[] data = new byte[len]; long moved = in.skip(start); if (moved < start) { long moved2 = in.skip(start - moved); if (moved2 < start - moved) { throw new IOException("Cannot seek dictionary."); } } int readLen = in.read(data); result = new String(data, 0, readLen, StandardCharsets.UTF_8); } catch (IOException e) { System.err.println(e.getMessage()); } return result; } /** * Read .dict.dz file data. Intended to be called only from * {@link #readArticle(int, int)}. * * @param start * Start offset in data file * @param len * Length of article data * @return Raw article text */ private String readDictZipArticleText(int start, int len) { String result = null; try (DictZipInputStream din = new DictZipInputStream(new RandomAccessInputStream(dataFile, "r"))) { din.seek(start); byte[] data = new byte[len]; din.readFully(data); result = new String(data, StandardCharsets.UTF_8); } catch (IOException e) { Log.log(e); } return result; } /** * Read header. */ private Map<String, String> readIFO(File ifoFile) throws Exception { Map<String, String> result = new TreeMap<>(); try (BufferedReader rd = Files.newBufferedReader(ifoFile.toPath(), StandardCharsets.UTF_8)) { String line; String first = rd.readLine(); if (!"StarDict's dict ifo file".equals(first)) { throw new Exception("Invalid header of .ifo file: " + first); } while ((line = rd.readLine()) != null) { if (line.trim().isEmpty()) { continue; } int pos = line.indexOf('='); if (pos < 0) { throw new Exception("Invalid format of .ifo file: " + line); } result.put(line.substring(0, pos), line.substring(pos + 1)); } } return result; } class Entry { private volatile String cache; private final int start; private final int len; Entry(int start, int len) { this.start = start; this.len = len; } public String getArticle() { String article = cache; if (article == null) { synchronized (this) { article = cache; if (article == null) { article = cache = loadArticle(); } } } return article; } private String loadArticle() { return readArticle(start, len).replace("\n", "<br>"); } } } }