/************************************************************************** OmegaT - Computer Assisted Translation (CAT) tool with fuzzy matching, translation memory, keyword search, glossaries, and translation leveraging into updated projects. Copyright (C) 2010 Alex Buloichik 2015 Aaron Madlon-Kay Home page: http://www.omegat.org/ Support center: http://groups.yahoo.com/group/OmegaT/ This file is part of OmegaT. OmegaT is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OmegaT is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. **************************************************************************/ package org.omegat.gui.glossary; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; import org.omegat.util.EncodingDetector; /** * Reader for comma separated glossaries. * * @author Keith Godfrey * @author Maxym Mykhalchuk * @author Alex Buloichik <alex73mail@gmail.com> * @author Aaron Madlon-Kay */ public final class GlossaryReaderCSV { /** Fields separator. Can be dependent of regional options. */ protected static final char SEPARATOR = ','; private GlossaryReaderCSV() { } public static List<GlossaryEntry> read(final File file, boolean priorityGlossary) throws IOException { String encoding = EncodingDetector.detectEncodingDefault(file, StandardCharsets.UTF_8.name()); List<GlossaryEntry> result = new ArrayList<GlossaryEntry>(); try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(file), encoding))) { // BOM (byte order mark) bugfix in.mark(1); int ch = in.read(); if (ch != 0xFEFF) { in.reset(); } for (String s = in.readLine(); s != null; s = in.readLine()) { // skip lines that start with '#' if (s.startsWith("#")) { continue; } // divide lines on tabs String[] tokens = parseLine(s); // check token list to see if it has a valid string if (tokens.length < 2 || tokens[0].isEmpty()) { continue; } // creating glossary entry and add it to the hash // (even if it's already there!) String comment = ""; if (tokens.length >= 3) { comment = tokens[2]; } result.add(new GlossaryEntry(tokens[0], tokens[1], comment, priorityGlossary)); } } return result; } private static String[] parseLine(String line) { List<String> result = new ArrayList<String>(); StringBuilder w = new StringBuilder(); boolean fopened = false; // field opened by " for (int cp, cpn, i = 0; i < line.length(); i += Character.charCount(cp)) { cp = line.codePointAt(i); try { cpn = line.codePointAt(i + Character.charCount(cp)); } catch (StringIndexOutOfBoundsException ex) { cpn = 0; } switch (cp) { case '"': if (w.length() == 0 && !fopened) { // first " in field fopened = true; } else if (cpn == '"') { // double " - add one w.appendCodePoint(cp); i++; } else { // last " in field fopened = false; } break; case SEPARATOR: if (fopened) { w.appendCodePoint(cp); } else { result.add(w.toString()); w.setLength(0); } break; default: w.appendCodePoint(cp); break; } } result.add(w.toString()); return result.toArray(new String[result.size()]); } }